On Fri, 18 Dec 2020 at 10:28, Xuewen Yan <xuewen.ya...@gmail.com> wrote:
>
> From: Xuewen Yan <xuewen....@unisoc.com>
>
> CPU (root cfs_rq) estimated utilization (util_est) is currently used in
> dequeue_task_fair() to drive frequency selection before it is updated.
>
> with:
>
> CPU_util        : rq->cfs.avg.util_avg
> CPU_util_est    : rq->cfs.avg.util_est
> CPU_utilization : max(CPU_util, CPU_util_est)
> task_util       : p->se.avg.util_avg
> task_util_est   : p->se.avg.util_est
>
> dequeue_task_fair():
>
>     /* (1) CPU_util and task_util update + inform schedutil about
>            CPU_utilization changes */
>     for_each_sched_entity() /* 2 loops */
>         (dequeue_entity() ->) update_load_avg() -> cfs_rq_util_change()
>          -> cpufreq_update_util() ->...-> sugov_update_[shared|single]
>          -> sugov_get_util() -> cpu_util_cfs()
>
>     /* (2) CPU_util_est and task_util_est update */
>     util_est_dequeue()
>
> cpu_util_cfs() uses CPU_utilization which could lead to a false (too
> high) utilization value for schedutil in task ramp-down or ramp-up
> scenarios during task dequeue.
>
> To mitigate the issue split the util_est update (2) into:
>
>  (A) CPU_util_est update in util_est_dequeue()
>  (B) task_util_est update in util_est_update()
>
> Place (A) before (1) and keep (B) where (2) is. The latter is necessary
> since (B) relies on task_util update in (1).
>
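For reference, the reason step (1) can pick up a stale value is the max()
in cpu_util_cfs(). At this point in the tree it looks roughly like this
(kernel/sched/sched.h, trimmed; comment added by me):

static inline unsigned long cpu_util_cfs(struct rq *rq)
{
        unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);

        /*
         * util_est.enqueued still contains the dequeuing task's
         * contribution until util_est_dequeue() has run, so this
         * max() can return the pre-dequeue estimate.
         */
        if (sched_feat(UTIL_EST)) {
                util = max_t(unsigned long, util,
                             READ_ONCE(rq->cfs.avg.util_est.enqueued));
        }

        return util;
}

So as long as util_est.enqueued is only decayed after the
update_load_avg()/cpufreq_update_util() calls in (1), schedutil keeps
seeing the estimated utilization of the task that is just leaving the CPU.
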
maybe add a Fixes: 7f65ea42eb00 ("sched/fair: Add util_est on top of PELT")

> Signed-off-by: Xuewen Yan <xuewen....@unisoc.com>
> Reviewed-by: Dietmar Eggemann <dietmar.eggem...@arm.com>

Reviewed-by: Vincent Guittot <vincent.guit...@linaro.org>

> ---
> Changes since v2:
> -modify the comment
> -move util_est_dequeue above within_margin()
> -modify the tab and space
>
> Changes since v1:
> -change the util_est_dequeue/update to inline type
> -use unsigned int enqueued rather than util_est in util_est_dequeue
> -remove "cpu" var
>
> ---
>  kernel/sched/fair.c | 43 ++++++++++++++++++++++++++++---------------
>  1 file changed, 28 insertions(+), 15 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ae7ceba..f3a1b7a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3932,6 +3932,22 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
>          trace_sched_util_est_cfs_tp(cfs_rq);
>  }
>
> +static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
> +                                    struct task_struct *p)
> +{
> +        unsigned int enqueued;
> +
> +        if (!sched_feat(UTIL_EST))
> +                return;
> +
> +        /* Update root cfs_rq's estimated utilization */
> +        enqueued  = cfs_rq->avg.util_est.enqueued;
> +        enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
> +        WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
> +
> +        trace_sched_util_est_cfs_tp(cfs_rq);
> +}
> +
>  /*
>   * Check if a (signed) value is within a specified (unsigned) margin,
>   * based on the observation that:
> @@ -3945,23 +3961,16 @@ static inline bool within_margin(int value, int margin)
>          return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
>  }
>
> -static void
> -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
> +static inline void util_est_update(struct cfs_rq *cfs_rq,
> +                                   struct task_struct *p,
> +                                   bool task_sleep)
>  {
>          long last_ewma_diff;
>          struct util_est ue;
> -        int cpu;
>
>          if (!sched_feat(UTIL_EST))
>                  return;
>
> -        /* Update root cfs_rq's estimated utilization */
> -        ue.enqueued  = cfs_rq->avg.util_est.enqueued;
> -        ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
> -        WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
> -
> -        trace_sched_util_est_cfs_tp(cfs_rq);
> -
>          /*
>           * Skip update of task's estimated utilization when the task has not
>           * yet completed an activation, e.g. being migrated.
> @@ -4001,8 +4010,7 @@ static inline bool within_margin(int value, int margin)
>           * To avoid overestimation of actual task utilization, skip updates if
>           * we cannot grant there is idle time in this CPU.
>           */
> -        cpu = cpu_of(rq_of(cfs_rq));
> -        if (task_util(p) > capacity_orig_of(cpu))
> +        if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
>                  return;
>
>          /*
> @@ -4085,8 +4093,11 @@ static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
>  util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
>
>  static inline void
> -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
> -                 bool task_sleep) {}
> +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
> +
> +static inline void
> +util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
> +                bool task_sleep) {}
>  static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
>
>  #endif /* CONFIG_SMP */
> @@ -5589,6 +5600,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>          int idle_h_nr_running = task_has_idle_policy(p);
>          bool was_sched_idle = sched_idle_rq(rq);
>
> +        util_est_dequeue(&rq->cfs, p);
> +
>          for_each_sched_entity(se) {
>                  cfs_rq = cfs_rq_of(se);
>                  dequeue_entity(cfs_rq, se, flags);
> @@ -5639,7 +5652,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>                  rq->next_balance = jiffies;
>
>  dequeue_throttle:
> -        util_est_dequeue(&rq->cfs, p, task_sleep);
> +        util_est_update(&rq->cfs, p, task_sleep);
>          hrtick_update(rq);
>  }
>
> --
> 1.9.1
>
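To make the impact concrete, here is a quick back-of-the-envelope I put
together. It is plain userspace C, not kernel code; the numbers are made
up and next_freq() only mirrors the shape of schedutil's get_next_freq()
(roughly 1.25 * max_freq * util / capacity):

/* Userspace illustration only, NOT kernel code. */
#include <stdio.h>

static unsigned long umax(unsigned long a, unsigned long b)
{
        return a > b ? a : b;
}

/* what cpu_util_cfs() hands to schedutil: max(util_avg, util_est.enqueued) */
static unsigned long cpu_utilization(unsigned long util_avg,
                                     unsigned long util_est_enqueued)
{
        return umax(util_avg, util_est_enqueued);
}

static unsigned long next_freq(unsigned long util, unsigned long cap,
                               unsigned long max_freq)
{
        return (max_freq + (max_freq >> 2)) * util / cap;
}

int main(void)
{
        unsigned long cap = 1024, max_freq = 2000000;   /* kHz, made-up CPU */
        unsigned long rq_util_avg = 150;        /* root cfs_rq util_avg after dequeue */
        unsigned long est_with_task = 550;      /* util_est.enqueued incl. the task */
        unsigned long est_without_task = 150;   /* after util_est_dequeue() */

        /* Old ordering: cfs_rq_util_change() fires while util_est still
         * contains the dequeued task's contribution. */
        unsigned long stale = cpu_utilization(rq_util_avg, est_with_task);
        /* New ordering: util_est_dequeue() already ran before step (1). */
        unsigned long fresh = cpu_utilization(rq_util_avg, est_without_task);

        printf("stale request: %lu kHz\n", next_freq(stale, cap, max_freq));
        printf("fresh request: %lu kHz\n", next_freq(fresh, cap, max_freq));
        return 0;
}

With these arbitrary numbers the old ordering asks for roughly 1.34 GHz
where roughly 0.37 GHz would do, until the next utilization update
corrects the request.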