Hi Yuyang,

On 05/04/16 01:15, Yuyang Du wrote:
> On Tue, Apr 05, 2016 at 08:51:13AM +0100, Morten Rasmussen wrote:
>> On Tue, Apr 05, 2016 at 02:30:03AM +0800, Yuyang Du wrote:
>>> On Mon, Apr 04, 2016 at 09:48:23AM +0100, Morten Rasmussen wrote:
>>>> On Sat, Apr 02, 2016 at 03:11:54PM +0800, Leo Yan wrote:
>>>>> On Fri, Apr 01, 2016 at 03:28:49PM -0700, Steve Muckle wrote:
>>>>>> I think I follow - Leo, please correct me if I mangle your intentions.
>>>>>> It's an issue that Morten and Dietmar had mentioned to me as well.
>>>>
>>>> Yes. We have been working on this issue for a while without getting to a
>>>> nice solution yet.
>>>
>>> So do you want a "flat hierarchy" for util_avg - just do util_avg for
>>> rq and task respectively? Seems that is what you want, and it is even easier?
>>
>> Pretty much, yes. I can't think of a good reason why we need the
>> utilization of groups as long as we have the task utilization and the
>> sum of those for the root cfs_rq.
>
> Sounds good to me too.
>
>> I'm not saying it can't be implemented, just saying that it will make
>> utilization tracking for groups redundant and possibly duplicate or hack
>> some of the existing code to implement the new root utilization sum.
>
> An initial evaluation of the implementation: it looks much easier to do (at
> least) than the current one. Let's wait for a day or two; if there is no
> objection, then let's do it.
>
I have been playing with the patch below to achieve this "flat hierarchy"
for util_avg, after I gave up on implementing the propagation of utilization
down the cfs_rq/se hierarchy for task groups. The patch has been created
without your 'sched/fair: Initiate a new task's util avg to a bounded value',
which recently went into tip/sched/core. Two short sketches for context
follow after the patch.

-- >8 --

Subject: [PATCH] sched/fair: Aggregate task utilization only on the root cfs_rq

cpu utilization is defined as the sched_avg.util_avg signal of the root
cfs_rq of a cpu, capped by that cpu's original capacity.

With the current PELT implementation, the utilization of a task which is
enqueued on or dequeued from a cfs_rq representing a task group other than
the root task group is not immediately propagated down to the root cfs_rq
of that cpu.

This makes decisions based on cpu_util() for scheduling or cpu frequency
settings less accurate when tasks are running in task groups.

This patch aggregates task utilization only on the root cfs_rq, essentially
bypassing cfs_rqs and ses which represent task groups
(&rq_of(cfs_rq)->cfs != cfs_rq and !entity_is_task(se)).

Signed-off-by: Dietmar Eggemann <[email protected]>
---
 kernel/sched/fair.c | 55 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 33130529e9b5..51d675715776 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -682,8 +682,10 @@ void init_entity_runnable_average(struct sched_entity *se)
 	sa->period_contrib = 1023;
 	sa->load_avg = scale_load_down(se->load.weight);
 	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
-	sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
-	sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+	if (entity_is_task(se)) {
+		sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
+		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+	}
 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 
@@ -2651,6 +2653,15 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 	u32 contrib;
 	unsigned int delta_w, scaled_delta_w, decayed = 0;
 	unsigned long scale_freq, scale_cpu;
+	int update_util = 0;
+
+	if (cfs_rq) {
+		if (&rq_of(cfs_rq)->cfs == cfs_rq)
+			update_util = 1;
+	} else {
+		if (entity_is_task(container_of(sa, struct sched_entity, avg)))
+			update_util = 1;
+	}
 
 	delta = now - sa->last_update_time;
 	/*
@@ -2696,7 +2707,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 					weight * scaled_delta_w;
 			}
 		}
-		if (running)
+		if (update_util && running)
 			sa->util_sum += scaled_delta_w * scale_cpu;
 
 		delta -= delta_w;
@@ -2720,7 +2731,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 			if (cfs_rq)
 				cfs_rq->runnable_load_sum += weight * contrib;
 		}
-		if (running)
+		if (update_util && running)
 			sa->util_sum += contrib * scale_cpu;
 	}
 
@@ -2731,7 +2742,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		if (cfs_rq)
 			cfs_rq->runnable_load_sum += weight * scaled_delta;
 	}
-	if (running)
+	if (update_util && running)
 		sa->util_sum += scaled_delta * scale_cpu;
 
 	sa->period_contrib += delta;
@@ -2742,7 +2753,8 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 			cfs_rq->runnable_load_avg =
 				div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
 		}
-		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+		if (update_util)
+			sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 	}
 
 	return decayed;
@@ -2834,7 +2846,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 		removed = 1;
 	}
 
-	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
+	if ((&rq_of(cfs_rq)->cfs == cfs_rq) &&
+	    atomic_long_read(&cfs_rq->removed_util_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
 		sa->util_avg = max_t(long, sa->util_avg - r, 0);
 		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
@@ -2893,8 +2906,12 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
 	cfs_rq->avg.load_avg += se->avg.load_avg;
 	cfs_rq->avg.load_sum += se->avg.load_sum;
-	cfs_rq->avg.util_avg += se->avg.util_avg;
-	cfs_rq->avg.util_sum += se->avg.util_sum;
+
+	if (!entity_is_task(se))
+		return;
+
+	rq_of(cfs_rq)->cfs.avg.util_avg += se->avg.util_avg;
+	rq_of(cfs_rq)->cfs.avg.util_sum += se->avg.util_sum;
 }
 
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2905,8 +2922,14 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 
 	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
 	cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-	cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+
+	if (!entity_is_task(se))
+		return;
+
+	rq_of(cfs_rq)->cfs.avg.util_avg =
+		max_t(long, rq_of(cfs_rq)->cfs.avg.util_avg - se->avg.util_avg, 0);
+	rq_of(cfs_rq)->cfs.avg.util_sum =
+		max_t(s32, rq_of(cfs_rq)->cfs.avg.util_sum - se->avg.util_sum, 0);
 }
 
 /* Add the load generated by se into cfs_rq's load average */
@@ -2989,7 +3012,11 @@ void remove_entity_load_avg(struct sched_entity *se)
 
 	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
 	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
-	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+
+	if (!entity_is_task(se))
+		return;
+
+	atomic_long_add(se->avg.util_avg, &rq_of(cfs_rq)->cfs.removed_util_avg);
 }
 
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
@@ -8268,7 +8295,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #endif
 #ifdef CONFIG_SMP
 	atomic_long_set(&cfs_rq->removed_load_avg, 0);
-	atomic_long_set(&cfs_rq->removed_util_avg, 0);
+
+	if (&rq_of(cfs_rq)->cfs == cfs_rq)
+		atomic_long_set(&cfs_rq->removed_util_avg, 0);
 #endif
 }
-- 
1.9.1
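For context, and not part of the patch above: cpu_util() takes the root
cfs_rq util_avg and caps it by the cpu's original capacity, roughly along
the lines of the sketch below (written from memory, so the exact shape and
the use of capacity_orig_of() are assumptions rather than a quote of the
current code). The patch does not touch this consumer side; it only changes
what ends up in rq->cfs.avg.util_avg.

/*
 * Sketch of the consumer side: cpu utilization is the root cfs_rq's
 * util_avg, capped by the cpu's original capacity. With the flat
 * aggregation above, rq->cfs.avg.util_avg already reflects the
 * utilization of all tasks on the cpu, regardless of the task group
 * they run in.
 */
static int cpu_util(int cpu)
{
	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
	unsigned long capacity = capacity_orig_of(cpu);

	return (util >= capacity) ? capacity : util;
}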

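One possible cleanup on top, just a hypothetical sketch and not something
the patch does: the root-cfs_rq/task-se condition added to
__update_load_avg() could be factored into a small helper (the name
update_util_avg() is made up here for illustration) so the intent is
documented in one place:

/*
 * Hypothetical helper: return 1 if this sched_avg carries utilization
 * under the flat scheme, i.e. it belongs either to a root cfs_rq or to
 * a sched_entity which represents a task.
 */
static inline int update_util_avg(struct cfs_rq *cfs_rq, struct sched_avg *sa)
{
	if (cfs_rq)
		return &rq_of(cfs_rq)->cfs == cfs_rq;

	return entity_is_task(container_of(sa, struct sched_entity, avg));
}

__update_load_avg() would then simply start with
'int update_util = update_util_avg(cfs_rq, sa);' instead of open-coding
the two branches.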
