On 25-May 15:12, Vincent Guittot wrote:
> schedutil governor relies on cfs_rq's util_avg to choose the OPP when cfs
                                                                       ^
                                                                     only
otherwise, while RT tasks are running we go to max.

> tasks are running.
> When the CPU is overloaded by cfs and rt tasks, cfs tasks
                  ^^^^^^^^^^
I would say we always have the provlem whenever an RT task preempt a
CFS one, even just for few ms, isn't it?

> are preempted by rt tasks and in this case util_avg reflects the remaining
> capacity but not what cfs want to use. In such case, schedutil can select a
> lower OPP whereas the CPU is overloaded. In order to have a more accurate
> view of the utilization of the CPU, we track the utilization that is
> "stolen" by rt tasks.
> 
> rt_rq uses rq_clock_task and cfs_rq uses cfs_rq_clock_task but they are
> the same at the root group level, so the PELT windows of the util_sum are
> aligned.
> 
> Signed-off-by: Vincent Guittot <vincent.guit...@linaro.org>
> ---
>  kernel/sched/fair.c  | 15 ++++++++++++++-
>  kernel/sched/pelt.c  | 23 +++++++++++++++++++++++
>  kernel/sched/pelt.h  |  7 +++++++
>  kernel/sched/rt.c    |  8 ++++++++
>  kernel/sched/sched.h |  7 +++++++
>  5 files changed, 59 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6390c66..fb18bcc 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7290,6 +7290,14 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq 
> *cfs_rq)
>       return false;
>  }
>  
> +static inline bool rt_rq_has_blocked(struct rq *rq)
> +{
> +     if (rq->avg_rt.util_avg)

Should use READ_ONCE?

> +             return true;
> +
> +     return false;

What about just:

       return READ_ONCE(rq->avg_rt.util_avg);

?

> +}
> +
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  
>  static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
> @@ -7349,6 +7357,10 @@ static void update_blocked_averages(int cpu)
>               if (cfs_rq_has_blocked(cfs_rq))
>                       done = false;
>       }
> +     update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
> +     /* Don't need periodic decay once load/util_avg are null */
> +     if (rt_rq_has_blocked(rq))
> +             done = false;
>  
>  #ifdef CONFIG_NO_HZ_COMMON
>       rq->last_blocked_load_update_tick = jiffies;
> @@ -7414,9 +7426,10 @@ static inline void update_blocked_averages(int cpu)
>       rq_lock_irqsave(rq, &rf);
>       update_rq_clock(rq);
>       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
> +     update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
>  #ifdef CONFIG_NO_HZ_COMMON
>       rq->last_blocked_load_update_tick = jiffies;
> -     if (!cfs_rq_has_blocked(cfs_rq))
> +     if (!cfs_rq_has_blocked(cfs_rq) && !rt_rq_has_blocked(rq))
>               rq->has_blocked_load = 0;
>  #endif
>       rq_unlock_irqrestore(rq, &rf);
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index e6ecbb2..213b922 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -309,3 +309,26 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct 
> cfs_rq *cfs_rq)
>  
>       return 0;
>  }
> +
> +/*
> + * rt_rq:
> + *
> + *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
> + *   util_sum = cpu_scale * load_sum
> + *   runnable_load_sum = load_sum
> + *
> + */
> +
> +int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
> +{
> +     if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
> +                             running,
> +                             running,
> +                             running)) {
> +

Not needed empty line?

> +             ___update_load_avg(&rq->avg_rt, 1, 1);
> +             return 1;
> +     }
> +
> +     return 0;
> +}
> diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
> index 9cac73e..b2983b7 100644
> --- a/kernel/sched/pelt.h
> +++ b/kernel/sched/pelt.h
> @@ -3,6 +3,7 @@
>  int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
>  int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct 
> sched_entity *se);
>  int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
> +int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
>  
>  /*
>   * When a task is dequeued, its estimated utilization should not be update if
> @@ -38,6 +39,12 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>       return 0;
>  }
>  
> +static inline int
> +update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
> +{
> +     return 0;
> +}
> +
>  #endif
>  
>  
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index ef3c4e6..b4148a9 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -5,6 +5,8 @@
>   */
>  #include "sched.h"
>  
> +#include "pelt.h"
> +
>  int sched_rr_timeslice = RR_TIMESLICE;
>  int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
>  
> @@ -1572,6 +1574,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct 
> *prev, struct rq_flags *rf)
>  
>       rt_queue_push_tasks(rq);
>  
> +     update_rt_rq_load_avg(rq_clock_task(rq), rq,
> +             rq->curr->sched_class == &rt_sched_class);
> +
>       return p;
>  }
>  
> @@ -1579,6 +1584,8 @@ static void put_prev_task_rt(struct rq *rq, struct 
> task_struct *p)
>  {
>       update_curr_rt(rq);
>  
> +     update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
> +
>       /*
>        * The previous task needs to be made eligible for pushing
>        * if it is still active
> @@ -2308,6 +2315,7 @@ static void task_tick_rt(struct rq *rq, struct 
> task_struct *p, int queued)
>       struct sched_rt_entity *rt_se = &p->rt;
>  
>       update_curr_rt(rq);
> +     update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);

Mmm... not entirely sure... can't we fold
   update_rt_rq_load_avg() into update_curr_rt() ?

Currently update_curr_rt() is used in:
   dequeue_task_rt
   pick_next_task_rt
   put_prev_task_rt
   task_tick_rt

while we update_rt_rq_load_avg() only in:
   pick_next_task_rt
   put_prev_task_rt
   task_tick_rt
and
   update_blocked_averages
 
Why we don't we need to update at dequeue_task_rt() time ?

>  
>       watchdog(rq, p);
>  
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 757a3ee..7a16de9 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -592,6 +592,7 @@ struct rt_rq {
>       unsigned long           rt_nr_total;
>       int                     overloaded;
>       struct plist_head       pushable_tasks;
> +
>  #endif /* CONFIG_SMP */
>       int                     rt_queued;
>  
> @@ -847,6 +848,7 @@ struct rq {
>  
>       u64                     rt_avg;
>       u64                     age_stamp;
> +     struct sched_avg        avg_rt;
>       u64                     idle_stamp;
>       u64                     avg_idle;
>  
> @@ -2205,4 +2207,9 @@ static inline unsigned long cpu_util_cfs(struct rq *rq)
>  
>       return util;
>  }
> +
> +static inline unsigned long cpu_util_rt(struct rq *rq)
> +{
> +     return rq->avg_rt.util_avg;

READ_ONCE?

> +}
>  #endif
> -- 
> 2.7.4
> 

-- 
#include <best/regards.h>

Patrick Bellasi

Reply via email to