On Wed, May 10, 2017 at 10:44:14AM -0400, Tejun Heo wrote:
> Hello,
> 
> On Wed, May 10, 2017 at 08:50:14AM +0200, Vincent Guittot wrote:
> > On 9 May 2017 at 18:18, Tejun Heo <t...@kernel.org> wrote:
> > > Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all
> > > live cfs_rqs which have ever been active on the CPU; unfortunately,
> > > this makes update_blocked_averages() O(# total cgroups) which isn't
> > > scalable at all.
> > 
> > Dietmar raised a similar optimization in the past. The only question
> > was: what is the impact of re-adding the cfs_rq to leaf_cfs_rq_list
> > on the wakeup path? Have you done some measurements?
> 
> I haven't done a perf test yet, but the cost is several more branches
> and a local list operation on the enqueue path, which is already
> pretty expensive, vs. load balancing being O(total number of cgroups
> on the system).
> 
> Anyways, I'll do some hackbench tests with several levels of layering.

Oh, BTW, do we still need bc4278987e38 ("sched/fair: Fix FTQ noise
bench regression") with this patch?  That patch looks like it's just
cutting down on the body of the O(#cgroups) loop, while this one takes
fully decayed cfs_rqs off the list so the loop no longer visits them
at all.
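
For context, here's the fully decayed case the two approaches handle,
side by side in a minimal user-space mock (the mock_* types and names
below are made-up stand-ins for illustration, not the kernel's types):

#include <stdbool.h>
#include <stdio.h>

/* mock types; only the fields the two checks look at */
struct mock_avg { unsigned long load_avg, util_avg, load_sum, util_sum; };
struct mock_cfs_rq {
	struct mock_avg avg;
	unsigned long load_weight, runnable_load_sum, propagate_avg;
};

/* bc4278987e38: keep the cfs_rq on the list, but skip the no-op update */
static bool mock_skip_blocked_update(const struct mock_cfs_rq *gcfs_rq,
				     const struct mock_avg *se_avg)
{
	if (se_avg->load_avg || se_avg->util_avg)
		return false;			/* still decaying */
	if (gcfs_rq->propagate_avg)
		return false;			/* pending propagation */
	return true;				/* nothing left to do */
}

/* this patch: drop such a cfs_rq from the list instead of skipping it */
static bool mock_fully_decayed(const struct mock_cfs_rq *cfs_rq)
{
	return !cfs_rq->load_weight && !cfs_rq->avg.load_sum &&
	       !cfs_rq->avg.util_sum && !cfs_rq->runnable_load_sum;
}

int main(void)
{
	struct mock_cfs_rq idle = { { 0 } };	/* a long-idle cgroup */
	struct mock_avg se_avg = { 0 };

	printf("skip: %d  prune: %d\n",
	       mock_skip_blocked_update(&idle, &se_avg),
	       mock_fully_decayed(&idle));
	return 0;
}

Both conditions fire on the same long-idle cgroup; the skip avoids the
update but leaves the cfs_rq on the list to be re-checked on every
walk, whereas the prune takes it off the list entirely.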

Ying, can you verify whether the following patch on top of the current
linus#master 56868a460b83 also fixes the regression that you saw?

Index: work/kernel/sched/fair.c
===================================================================
--- work.orig/kernel/sched/fair.c
+++ work/kernel/sched/fair.c
@@ -372,6 +372,10 @@ static inline void list_del_leaf_cfs_rq(
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
        list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+       list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
+                                leaf_cfs_rq_list)
+
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
@@ -466,6 +470,9 @@ static inline void list_del_leaf_cfs_rq(
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
                for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+               for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
        return NULL;
@@ -3170,36 +3177,6 @@ static inline int propagate_entity_load_
        return 1;
 }
 
-/*
- * Check if we need to update the load and the utilization of a blocked
- * group_entity:
- */
-static inline bool skip_blocked_update(struct sched_entity *se)
-{
-       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
-
-       /*
-        * If sched_entity still have not zero load or utilization, we have to
-        * decay it:
-        */
-       if (se->avg.load_avg || se->avg.util_avg)
-               return false;
-
-       /*
-        * If there is a pending propagation, we have to update the load and
-        * the utilization of the sched_entity:
-        */
-       if (gcfs_rq->propagate_avg)
-               return false;
-
-       /*
-        * Otherwise, the load and the utilization of the sched_entity is
-        * already zero and there is no pending propagation, so it will be a
-        * waste of time to try to decay it:
-        */
-       return true;
-}
-
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
@@ -4644,22 +4621,32 @@ static void destroy_cfs_bandwidth(struct
 
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-       struct cfs_rq *cfs_rq;
+       struct task_group *tg;
+
+       lockdep_assert_held(&rq->lock);
 
-       for_each_leaf_cfs_rq(rq, cfs_rq) {
-               struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+       rcu_read_lock();
+       list_for_each_entry_rcu(tg, &task_groups, list) {
+               struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+               struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
                raw_spin_lock(&cfs_b->lock);
                cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
                raw_spin_unlock(&cfs_b->lock);
        }
+       rcu_read_unlock();
 }
 
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
-       struct cfs_rq *cfs_rq;
+       struct task_group *tg;
+
+       lockdep_assert_held(&rq->lock);
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(tg, &task_groups, list) {
+               struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
-       for_each_leaf_cfs_rq(rq, cfs_rq) {
                if (!cfs_rq->runtime_enabled)
                        continue;
 
@@ -4677,6 +4664,7 @@ static void __maybe_unused unthrottle_of
                if (cfs_rq_throttled(cfs_rq))
                        unthrottle_cfs_rq(cfs_rq);
        }
+       rcu_read_unlock();
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
@@ -6973,7 +6961,7 @@ static void attach_tasks(struct lb_env *
 static void update_blocked_averages(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *pos;
        struct rq_flags rf;
 
        rq_lock_irqsave(rq, &rf);
@@ -6983,9 +6971,7 @@ static void update_blocked_averages(int
         * Iterates the task_group tree in a bottom up fashion, see
         * list_add_leaf_cfs_rq() for details.
         */
-       for_each_leaf_cfs_rq(rq, cfs_rq) {
-               struct sched_entity *se;
-
+       for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
                /* throttled entities do not contribute to load */
                if (throttled_hierarchy(cfs_rq))
                        continue;
@@ -6993,10 +6979,17 @@ static void update_blocked_averages(int
                if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
                        update_tg_load_avg(cfs_rq, 0);
 
-               /* Propagate pending load changes to the parent, if any: */
-               se = cfs_rq->tg->se[cpu];
-               if (se && !skip_blocked_update(se))
-                       update_load_avg(se, 0);
+               /* Propagate pending load changes to the parent */
+               if (cfs_rq->tg->se[cpu])
+                       update_load_avg(cfs_rq->tg->se[cpu], 0);
+
+               /*
+                * There can be a lot of idle CPU cgroups.  Don't let fully
+                * decayed cfs_rqs linger on the list.
+                */
+               if (!cfs_rq->load.weight && !cfs_rq->avg.load_sum &&
+                   !cfs_rq->avg.util_sum && !cfs_rq->runnable_load_sum)
+                       list_del_leaf_cfs_rq(cfs_rq);
        }
        rq_unlock_irqrestore(rq, &rf);
 }
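
To make the enqueue-vs-walk tradeoff Vincent asked about concrete,
here's a little user-space model of what the patch does (everything
below is an illustrative mock I wrote, not kernel code): enqueue
re-adds a cfs_rq with a single on_list branch, and the blocked-averages
walk prunes fully decayed entries, so the walk length tracks the number
of recently active cgroups rather than all cgroups that ever ran.

#include <stdbool.h>
#include <stdio.h>

#define NR_CGROUPS 8

struct model_cfs_rq {
	bool on_list;
	unsigned long load;		/* stand-in for the PELT sums */
	struct model_cfs_rq *next;
};

static struct model_cfs_rq *leaf_list;	/* rq->leaf_cfs_rq_list stand-in */

/* enqueue path: the re-add cost in question is this one branch + link */
static void model_enqueue(struct model_cfs_rq *cfs_rq, unsigned long load)
{
	cfs_rq->load += load;
	if (!cfs_rq->on_list) {		/* list_add_leaf_cfs_rq() analogue */
		cfs_rq->next = leaf_list;
		leaf_list = cfs_rq;
		cfs_rq->on_list = true;
	}
}

/* load-balance path: walk only listed cfs_rqs, decay, prune dead ones */
static void model_update_blocked_averages(void)
{
	struct model_cfs_rq **pos = &leaf_list;
	int walked = 0;

	while (*pos) {
		struct model_cfs_rq *cfs_rq = *pos;

		walked++;
		cfs_rq->load /= 2;	/* stand-in for decaying the averages */
		if (!cfs_rq->load) {	/* fully decayed: unlink, as the patch does */
			*pos = cfs_rq->next;
			cfs_rq->on_list = false;
		} else {
			pos = &cfs_rq->next;
		}
	}
	printf("walked %d cfs_rqs\n", walked);
}

int main(void)
{
	struct model_cfs_rq cgroups[NR_CGROUPS] = { { 0 } };
	int i;

	/* only two of the eight cgroups ever run anything */
	model_enqueue(&cgroups[3], 4);
	model_enqueue(&cgroups[5], 1);

	for (i = 0; i < 4; i++)
		model_update_blocked_averages();
	return 0;
}

Compiled and run, the walk count drops from 2 to 0 over the four
passes, and the six mock cgroups that never ran are never visited at
all, whereas the old for_each_leaf_cfs_rq() walk would have kept
visiting every cgroup that had ever been active.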
