From: Xuewen Yan <xuewen....@unisoc.com>

The CPU (root cfs_rq) estimated utilization (util_est) is currently used in
dequeue_task_fair() to drive frequency selection before util_est itself has
been updated for the task being dequeued.

With:

CPU_util        : rq->cfs.avg.util_avg
CPU_util_est    : rq->cfs.avg.util_est
CPU_utilization : max(CPU_util, CPU_util_est)
task_util       : p->se.avg.util_avg
task_util_est   : p->se.avg.util_est

dequeue_task_fair():

    /* (1) CPU_util and task_util update + inform schedutil about
           CPU_utilization changes */
    for_each_sched_entity() /* 2 loops */
        (dequeue_entity() ->) update_load_avg() -> cfs_rq_util_change()
         -> cpufreq_update_util() ->...-> sugov_update_[shared|single]
         -> sugov_get_util() -> cpu_util_cfs()

    /* (2) CPU_util_est and task_util_est update */
    util_est_dequeue()

cpu_util_cfs() uses CPU_utilization, which could lead to a false (too
high) utilization value for schedutil in task ramp-down or ramp-up
scenarios during a task dequeue.

To mitigate the issue, split the util_est update (2) into:

 (A) CPU_util_est update in util_est_dequeue()
 (B) task_util_est update in util_est_update()

Place (A) before (1) and keep (B) where (2) is. The latter is necessary
since (B) relies on the task_util update in (1).

Fixes: 7f65ea42eb00 ("sched/fair: Add util_est on top of PELT")

Signed-off-by: Xuewen Yan <xuewen....@unisoc.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggem...@arm.com>
Reviewed-by: Vincent Guittot <vincent.guit...@linaro.org>
---
Changes since v3:
-add reviewer
-add more comment details

Changes since v2:
-modify the comment
-move util_est_dequeue above within_margin()
-modify the tab and space

Changes since v1:
-change the util_est_dequeue/update to inline type
-use unsigned int enqueued rather than util_est in util_est_dequeue
-remove "cpu" var

---
 kernel/sched/fair.c | 43 ++++++++++++++++++++++++++++---------------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ae7ceba..f3a1b7a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3932,6 +3932,22 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
        trace_sched_util_est_cfs_tp(cfs_rq);
 }
 
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
+                                   struct task_struct *p)
+{
+       unsigned int enqueued;
+
+       if (!sched_feat(UTIL_EST))
+               return;
+
+       /* Update root cfs_rq's estimated utilization */
+       enqueued  = cfs_rq->avg.util_est.enqueued;
+       enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
+       WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+       trace_sched_util_est_cfs_tp(cfs_rq);
+}
+
 /*
  * Check if a (signed) value is within a specified (unsigned) margin,
  * based on the observation that:
@@ -3945,23 +3961,16 @@ static inline bool within_margin(int value, int margin)
        return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
 }
 
-static void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+static inline void util_est_update(struct cfs_rq *cfs_rq,
+                                  struct task_struct *p,
+                                  bool task_sleep)
 {
        long last_ewma_diff;
        struct util_est ue;
-       int cpu;
 
        if (!sched_feat(UTIL_EST))
                return;
 
-       /* Update root cfs_rq's estimated utilization */
-       ue.enqueued  = cfs_rq->avg.util_est.enqueued;
-       ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
-       WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
-
-       trace_sched_util_est_cfs_tp(cfs_rq);
-
        /*
         * Skip update of task's estimated utilization when the task has not
         * yet completed an activation, e.g. being migrated.
@@ -4001,8 +4010,7 @@ static inline bool within_margin(int value, int margin)
         * To avoid overestimation of actual task utilization, skip updates if
         * we cannot grant there is idle time in this CPU.
         */
-       cpu = cpu_of(rq_of(cfs_rq));
-       if (task_util(p) > capacity_orig_of(cpu))
+       if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
                return;
 
        /*
@@ -4085,8 +4093,11 @@ static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
 
 static inline void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
-                bool task_sleep) {}
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
+               bool task_sleep) {}
 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
 
 #endif /* CONFIG_SMP */
@@ -5589,6 +5600,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        int idle_h_nr_running = task_has_idle_policy(p);
        bool was_sched_idle = sched_idle_rq(rq);
 
+       util_est_dequeue(&rq->cfs, p);
+
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                dequeue_entity(cfs_rq, se, flags);
@@ -5639,7 +5652,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                rq->next_balance = jiffies;
 
 dequeue_throttle:
-       util_est_dequeue(&rq->cfs, p, task_sleep);
+       util_est_update(&rq->cfs, p, task_sleep);
        hrtick_update(rq);
 }
 
-- 
1.9.1

Reply via email to