The load balancer uses runnable_load_avg as load indicator. For
!cgroup this is:

  runnable_load_avg = \Sum se->avg.load_avg ; where se->on_rq

That is, a direct sum of all runnable tasks on that runqueue. As
opposed to load_avg, which is a sum of all tasks on the runqueue,
which includes a blocked component.

However, in the cgroup case, this comes apart since the group entities
are always runnable, even if most of their constituent entities are
blocked.

Therefore introduce a runnable_weight which for task entities is the
same as the regular weight, but for group entities is a fraction of
the entity weight and represents the runnable part of the group
runqueue.

Then propagate this load through the PELT hierarchy to arrive at an
effective runnable load avgerage -- which we should not confuse with
the canonical runnable load average.

Suggested-by: Tejun Heo <t...@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
---
 include/linux/sched.h |    3 
 kernel/sched/debug.c  |    8 ++
 kernel/sched/fair.c   |  173 ++++++++++++++++++++++++++++++++------------------
 kernel/sched/sched.h  |    3 
 4 files changed, 125 insertions(+), 62 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -332,9 +332,11 @@ struct load_weight {
 struct sched_avg {
        u64                             last_update_time;
        u64                             load_sum;
+       u64                             runnable_load_sum;
        u32                             util_sum;
        u32                             period_contrib;
        unsigned long                   load_avg;
+       unsigned long                   runnable_load_avg;
        unsigned long                   util_avg;
 };
 
@@ -377,6 +379,7 @@ struct sched_statistics {
 struct sched_entity {
        /* For load-balancing: */
        struct load_weight              load;
+       unsigned long                   runnable_weight;
        struct rb_node                  run_node;
        struct list_head                group_node;
        unsigned int                    on_rq;
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -436,9 +436,11 @@ static void print_cfs_group_stats(struct
                P_SCHEDSTAT(se->statistics.wait_count);
        }
        P(se->load.weight);
+       P(se->runnable_weight);
 #ifdef CONFIG_SMP
        P(se->avg.load_avg);
        P(se->avg.util_avg);
+       P(se->avg.runnable_load_avg);
 #endif
 
 #undef PN_SCHEDSTAT
@@ -555,10 +557,11 @@ void print_cfs_rq(struct seq_file *m, in
        SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
        SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_SMP
+       SEQ_printf(m, "  .%-30s: %ld\n", "runnable_weight", 
cfs_rq->runnable_weight);
        SEQ_printf(m, "  .%-30s: %lu\n", "load_avg",
                        cfs_rq->avg.load_avg);
        SEQ_printf(m, "  .%-30s: %lu\n", "runnable_load_avg",
-                       cfs_rq->runnable_load_avg);
+                       cfs_rq->avg.runnable_load_avg);
        SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
                        cfs_rq->avg.util_avg);
        SEQ_printf(m, "  .%-30s: %ld\n", "removed.load_avg",
@@ -1003,10 +1006,13 @@ void proc_sched_show_task(struct task_st
                   "nr_involuntary_switches", (long long)p->nivcsw);
 
        P(se.load.weight);
+       P(se.runnable_weight);
 #ifdef CONFIG_SMP
        P(se.avg.load_sum);
+       P(se.avg.runnable_load_sum);
        P(se.avg.util_sum);
        P(se.avg.load_avg);
+       P(se.avg.runnable_load_avg);
        P(se.avg.util_avg);
        P(se.avg.last_update_time);
 #endif
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -743,8 +743,9 @@ void init_entity_runnable_average(struct
         * nothing has been attached to the task group yet.
         */
        if (entity_is_task(se))
-               sa->load_avg = scale_load_down(se->load.weight);
-       sa->load_sum = LOAD_AVG_MAX;
+               sa->runnable_load_avg = sa->load_avg = 
scale_load_down(se->load.weight);
+       sa->runnable_load_sum = sa->load_sum = LOAD_AVG_MAX;
+
        /*
         * At this point, util_avg won't be used in select_task_rq_fair anyway
         */
@@ -2744,25 +2745,35 @@ account_entity_dequeue(struct cfs_rq *cf
 
 #ifdef CONFIG_SMP
 /*
- * XXX we want to get rid of this helper and use the full load resolution.
+ * XXX we want to get rid of these helpers and use the full load resolution.
  */
 static inline long se_weight(struct sched_entity *se)
 {
        return scale_load_down(se->load.weight);
 }
 
+static inline long se_runnable(struct sched_entity *se)
+{
+       return scale_load_down(se->runnable_weight);
+}
+
 static inline void
 enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       cfs_rq->runnable_load_avg += se->avg.load_avg;
-       cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
+       cfs_rq->runnable_weight += se->runnable_weight;
+
+       cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
+       cfs_rq->avg.runnable_load_sum += se_runnable(se) * 
se->avg.runnable_load_sum;
 }
 
 static inline void
 dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
-       sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * 
se->avg.load_sum);
+       cfs_rq->runnable_weight -= se->runnable_weight;
+
+       sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
+       sub_positive(&cfs_rq->avg.runnable_load_sum,
+                    se_runnable(se) * se->avg.runnable_load_sum);
 }
 
 static inline void
@@ -2790,7 +2802,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq,
 #endif
 
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
-                           unsigned long weight)
+                           unsigned long weight, unsigned long runnable)
 {
        if (se->on_rq) {
                /* commit outstanding execution time */
@@ -2801,11 +2813,17 @@ static void reweight_entity(struct cfs_r
        }
        dequeue_load_avg(cfs_rq, se);
 
+       se->runnable_weight = runnable;
        update_load_set(&se->load, weight);
 
 #ifdef CONFIG_SMP
-       se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum,
-                                  LOAD_AVG_MAX - 1024 + 
se->avg.period_contrib);
+       do {
+               u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+
+               se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, 
divider);
+               se->avg.runnable_load_avg =
+                       div_u64(se_runnable(se) * se->avg.runnable_load_sum, 
divider);
+       } while (0);
 #endif
 
        enqueue_load_avg(cfs_rq, se);
@@ -2822,7 +2840,7 @@ void reweight_task(struct task_struct *p
        struct load_weight *load = &se->load;
        unsigned long weight = scale_load(sched_prio_to_weight[prio]);
 
-       reweight_entity(cfs_rq, se, weight);
+       reweight_entity(cfs_rq, se, weight, weight);
        load->inv_weight = sched_prio_to_wmult[prio];
 }
 
@@ -2931,31 +2949,45 @@ static long calc_cfs_shares(struct cfs_r
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 
-static void update_cfs_shares(struct sched_entity *se)
+/*
+ * Recomputes the group entity based on the current state of its group
+ * runqueue.
+ */
+static void update_cfs_group(struct sched_entity *se)
 {
-       struct cfs_rq *cfs_rq = group_cfs_rq(se);
-       long shares;
+       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+       long shares, runnable;
 
-       if (!cfs_rq)
+       if (!gcfs_rq)
                return;
 
-       if (throttled_hierarchy(cfs_rq))
+       if (throttled_hierarchy(gcfs_rq))
                return;
 
 #ifndef CONFIG_SMP
-       shares = READ_ONCE(cfs_rq->tg->shares);
+       runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
 
        if (likely(se->load.weight == shares))
                return;
 #else
-       shares = calc_cfs_shares(cfs_rq);
+       shares = calc_cfs_shares(gcfs_rq);
+       /*
+        * The hierarchical runnable load metric is the proportional part
+        * of this group's runnable_load_avg / load_avg.
+        *
+        * Note: we need to deal with very sporadic 'runnable > load' cases
+        * due to numerical instability.
+        */
+       runnable = shares * gcfs_rq->avg.runnable_load_avg;
+       if (runnable)
+               runnable /= max(gcfs_rq->avg.load_avg, 
gcfs_rq->avg.runnable_load_avg);
 #endif
 
-       reweight_entity(cfs_rq_of(se), se, shares);
+       reweight_entity(cfs_rq_of(se), se, shares, runnable);
 }
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct sched_entity *se)
+static inline void update_cfs_group(struct sched_entity *se)
 {
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -3062,7 +3094,7 @@ static u32 __accumulate_pelt_segments(u6
  */
 static __always_inline u32
 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
-              unsigned long weight, int running, struct cfs_rq *cfs_rq)
+              unsigned long load, unsigned long runnable, int running)
 {
        unsigned long scale_freq, scale_cpu;
        u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
@@ -3079,10 +3111,8 @@ accumulate_sum(u64 delta, int cpu, struc
         */
        if (periods) {
                sa->load_sum = decay_load(sa->load_sum, periods);
-               if (cfs_rq) {
-                       cfs_rq->runnable_load_sum =
-                               decay_load(cfs_rq->runnable_load_sum, periods);
-               }
+               sa->runnable_load_sum =
+                       decay_load(sa->runnable_load_sum, periods);
                sa->util_sum = decay_load((u64)(sa->util_sum), periods);
 
                /*
@@ -3095,11 +3125,10 @@ accumulate_sum(u64 delta, int cpu, struc
        sa->period_contrib = delta;
 
        contrib = cap_scale(contrib, scale_freq);
-       if (weight) {
-               sa->load_sum += weight * contrib;
-               if (cfs_rq)
-                       cfs_rq->runnable_load_sum += weight * contrib;
-       }
+       if (load)
+               sa->load_sum += load * contrib;
+       if (runnable)
+               sa->runnable_load_sum += runnable * contrib;
        if (running)
                sa->util_sum += contrib * scale_cpu;
 
@@ -3136,7 +3165,7 @@ accumulate_sum(u64 delta, int cpu, struc
  */
 static __always_inline int
 ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
-                  unsigned long weight, int running, struct cfs_rq *cfs_rq)
+                 unsigned long load, unsigned long runnable, int running)
 {
        u64 delta;
 
@@ -3169,8 +3198,8 @@ ___update_load_sum(u64 now, int cpu, str
         * this happens during idle_balance() which calls
         * update_blocked_averages()
         */
-       if (!weight)
-               running = 0;
+       if (!load)
+               runnable = running = 0;
 
        /*
         * Now we know we crossed measurement unit boundaries. The *_avg
@@ -3179,45 +3208,60 @@ ___update_load_sum(u64 now, int cpu, str
         * Step 1: accumulate *_sum since last_update_time. If we haven't
         * crossed period boundaries, finish.
         */
-       if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+       if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
                return 0;
 
        return 1;
 }
 
 static __always_inline void
-___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq 
*cfs_rq)
+___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long 
runnable)
 {
        u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
 
        /*
         * Step 2: update *_avg.
         */
-       sa->load_avg = div_u64(weight * sa->load_sum, divider);
-       if (cfs_rq) {
-               cfs_rq->runnable_load_avg =
-                       div_u64(cfs_rq->runnable_load_sum, divider);
-       }
+       sa->load_avg = div_u64(load * sa->load_sum, divider);
+       sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, 
divider);
        sa->util_avg = sa->util_sum / divider;
 }
 
 /*
  * sched_entity:
  *
+ *   task:
+ *     se_runnable() == se_weight()
+ *
+ *   group: [ see update_cfs_group() ]
+ *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
+ *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ *
  *   load_sum := runnable_sum
  *   load_avg = se_weight(se) * runnable_avg
  *
+ *   runnable_load_sum := runnable_sum
+ *   runnable_load_avg = se_runnable(se) * runnable_avg
+ *
+ * XXX collapse load_sum and runnable_load_sum
+ *
  * cfq_rs:
  *
  *   load_sum = \Sum se_weight(se) * se->avg.load_sum
  *   load_avg = \Sum se->avg.load_avg
+ *
+ *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
+ *   runnable_load_avg = \Sum se->avg.runable_load_avg
  */
 
 static int
 __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 {
-       if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) {
-               ___update_load_avg(&se->avg, se_weight(se), NULL);
+       if (entity_is_task(se))
+               se->runnable_weight = se->load.weight;
+
+       if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+               ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
                return 1;
        }
 
@@ -3227,10 +3271,13 @@ __update_load_avg_blocked_se(u64 now, in
 static int
 __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 {
-       if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq,
-                               cfs_rq->curr == se, NULL)) {
+       if (entity_is_task(se))
+               se->runnable_weight = se->load.weight;
+
+       if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+                               cfs_rq->curr == se)) {
 
-               ___update_load_avg(&se->avg, se_weight(se), NULL);
+               ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
                return 1;
        }
 
@@ -3242,8 +3289,10 @@ __update_load_avg_cfs_rq(u64 now, int cp
 {
        if (___update_load_sum(now, cpu, &cfs_rq->avg,
                                scale_load_down(cfs_rq->load.weight),
-                               cfs_rq->curr != NULL, cfs_rq)) {
-               ___update_load_avg(&cfs_rq->avg, 1, cfs_rq);
+                               scale_load_down(cfs_rq->runnable_weight),
+                               cfs_rq->curr != NULL)) {
+
+               ___update_load_avg(&cfs_rq->avg, 1, 1);
                return 1;
        }
 
@@ -3421,8 +3470,8 @@ static inline void
 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct 
cfs_rq *gcfs_rq)
 {
        long runnable_sum = gcfs_rq->prop_runnable_sum;
-       long load_avg;
-       s64 load_sum;
+       long runnable_load_avg, load_avg;
+       s64 runnable_load_sum, load_sum;
 
        if (!runnable_sum)
                return;
@@ -3438,9 +3487,15 @@ update_tg_cfs_runnable(struct cfs_rq *cf
        add_positive(&cfs_rq->avg.load_avg, load_avg);
        add_positive(&cfs_rq->avg.load_sum, load_sum);
 
+       runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
+       runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
+
+       add_positive(&se->avg.runnable_load_sum, runnable_sum);
+       add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
+
        if (se->on_rq) {
-               add_positive(&cfs_rq->runnable_load_avg, load_avg);
-               add_positive(&cfs_rq->runnable_load_sum, load_sum);
+               add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
+               add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
        }
 }
 
@@ -3722,7 +3777,7 @@ void remove_entity_load_avg(struct sched
 
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
 {
-       return cfs_rq->runnable_load_avg;
+       return cfs_rq->avg.runnable_load_avg;
 }
 
 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
@@ -3892,8 +3947,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
         *   - Add its new weight to cfs_rq->load.weight
         */
        update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
+       update_cfs_group(se);
        enqueue_runnable_load_avg(cfs_rq, se);
-       update_cfs_shares(se);
        account_entity_enqueue(cfs_rq, se);
 
        if (flags & ENQUEUE_WAKEUP)
@@ -3999,7 +4054,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
        /* return excess runtime on last dequeue */
        return_cfs_rq_runtime(cfs_rq);
 
-       update_cfs_shares(se);
+       update_cfs_group(se);
 
        /*
         * Now advance min_vruntime if @se was the entity holding it back,
@@ -4182,7 +4237,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc
         * Ensure that runnable average is periodically updated.
         */
        update_load_avg(cfs_rq, curr, UPDATE_TG);
-       update_cfs_shares(curr);
+       update_cfs_group(curr);
 
 #ifdef CONFIG_SCHED_HRTICK
        /*
@@ -5100,7 +5155,7 @@ enqueue_task_fair(struct rq *rq, struct
                        break;
 
                update_load_avg(cfs_rq, se, UPDATE_TG);
-               update_cfs_shares(se);
+               update_cfs_group(se);
        }
 
        if (!se)
@@ -5159,7 +5214,7 @@ static void dequeue_task_fair(struct rq
                        break;
 
                update_load_avg(cfs_rq, se, UPDATE_TG);
-               update_cfs_shares(se);
+               update_cfs_group(se);
        }
 
        if (!se)
@@ -7184,7 +7239,7 @@ static inline bool cfs_rq_is_decayed(str
        if (cfs_rq->avg.util_sum)
                return false;
 
-       if (cfs_rq->runnable_load_sum)
+       if (cfs_rq->avg.runnable_load_sum)
                return false;
 
        return true;
@@ -9689,7 +9744,7 @@ int sched_group_set_shares(struct task_g
                update_rq_clock(rq);
                for_each_sched_entity(se) {
                        update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
-                       update_cfs_shares(se);
+                       update_cfs_group(se);
                }
                rq_unlock_irqrestore(rq, &rf);
        }
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -418,6 +418,7 @@ struct cfs_bandwidth { };
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
        struct load_weight load;
+       unsigned long runnable_weight;
        unsigned int nr_running, h_nr_running;
 
        u64 exec_clock;
@@ -444,8 +445,6 @@ struct cfs_rq {
         * CFS load tracking
         */
        struct sched_avg avg;
-       u64 runnable_load_sum;
-       unsigned long runnable_load_avg;
 #ifndef CONFIG_64BIT
        u64 load_last_update_time_copy;
 #endif


Reply via email to