The slow wakeup path computes per-sched_group statistics to select the
idlest group, which is quite similar to what load_balance() does when
selecting the busiest group. Rework find_idlest_group() to classify the
sched_groups and select the idlest one following the same steps as
load_balance().

Signed-off-by: Vincent Guittot <vincent.guit...@linaro.org>
---
 kernel/sched/fair.c | 384 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 256 insertions(+), 128 deletions(-)
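
For reference, the new path classifies each sched_group with
update_sg_wakeup_stats(), keeps the best candidate per group_type with
update_pick_idlest(), and then compares the local group against that
candidate per type. Below is a standalone sketch of the comparison order
only (stand-in types and a hypothetical pick_idlest() helper, not kernel
code; the misfit/imbalanced/asym tie-breaks are omitted):

	/*
	 * Standalone sketch: mirrors the ordering used by the patch's
	 * update_pick_idlest(), with simplified stand-in types.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	/* Lower value == "idler" classification */
	enum group_type {
		group_has_spare,
		group_fully_busy,
		group_misfit_task,
		group_asym_packing,
		group_imbalanced,
		group_overloaded,
	};

	struct sgs {
		enum group_type group_type;
		unsigned long avg_load;
		unsigned int idle_cpus;
	};

	/* Return true when the candidate should replace the current idlest */
	static bool pick_idlest(const struct sgs *idlest, const struct sgs *cand)
	{
		if (cand->group_type != idlest->group_type)
			return cand->group_type < idlest->group_type;

		switch (cand->group_type) {
		case group_overloaded:
		case group_fully_busy:
			/* Prefer the lowest average load */
			return cand->avg_load < idlest->avg_load;
		case group_has_spare:
			/* Prefer the group with the most idle CPUs */
			return cand->idle_cpus > idlest->idle_cpus;
		default:
			/* Other types are never picked on the slow wakeup path */
			return false;
		}
	}

	int main(void)
	{
		struct sgs a = { group_fully_busy, 900, 0 };
		struct sgs b = { group_has_spare, 300, 2 };

		/* b wins: group_has_spare is "idler" than group_fully_busy */
		printf("replace a with b: %s\n", pick_idlest(&a, &b) ? "yes" : "no");
		return 0;
	}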

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ed1800d..fbaafae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5541,127 +5541,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
        return target;
 }
 
-static unsigned long cpu_util_without(int cpu, struct task_struct *p);
-
-static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
-{
-       return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
-}
-
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- *
- * Assumes p is allowed on at least one CPU in sd.
- */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-                 int this_cpu, int sd_flag)
-{
-       struct sched_group *idlest = NULL, *group = sd->groups;
-       struct sched_group *most_spare_sg = NULL;
-       unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
-       unsigned long most_spare = 0, this_spare = 0;
-       int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
-       unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
-                               (sd->imbalance_pct-100) / 100;
-
-       do {
-               unsigned long load;
-               unsigned long spare_cap, max_spare_cap;
-               int local_group;
-               int i;
-
-               /* Skip over this group if it has no CPUs allowed */
-               if (!cpumask_intersects(sched_group_span(group),
-                                       p->cpus_ptr))
-                       continue;
-
-               local_group = cpumask_test_cpu(this_cpu,
-                                              sched_group_span(group));
-
-               /*
-                * Tally up the load of all CPUs in the group and find
-                * the group containing the CPU with most spare capacity.
-                */
-               load = 0;
-               max_spare_cap = 0;
-
-               for_each_cpu(i, sched_group_span(group)) {
-                       load += cpu_load(cpu_rq(i));
-
-                       spare_cap = capacity_spare_without(i, p);
-
-                       if (spare_cap > max_spare_cap)
-                               max_spare_cap = spare_cap;
-               }
-
-               /* Adjust by relative CPU capacity of the group */
-               load = (load * SCHED_CAPACITY_SCALE) /
-                                       group->sgc->capacity;
-
-               if (local_group) {
-                       this_load = load;
-                       this_spare = max_spare_cap;
-               } else {
-                       if (load < min_load) {
-                               min_load = load;
-                               idlest = group;
-                       }
-
-                       if (most_spare < max_spare_cap) {
-                               most_spare = max_spare_cap;
-                               most_spare_sg = group;
-                       }
-               }
-       } while (group = group->next, group != sd->groups);
-
-       /*
-        * The cross-over point between using spare capacity or least load
-        * is too conservative for high utilization tasks on partially
-        * utilized systems if we require spare_capacity > task_util(p),
-        * so we allow for some task stuffing by using
-        * spare_capacity > task_util(p)/2.
-        *
-        * Spare capacity can't be used for fork because the utilization has
-        * not been set yet, we must first select a rq to compute the initial
-        * utilization.
-        */
-       if (sd_flag & SD_BALANCE_FORK)
-               goto skip_spare;
-
-       if (this_spare > task_util(p) / 2 &&
-           imbalance_scale*this_spare > 100*most_spare)
-               return NULL;
-
-       if (most_spare > task_util(p) / 2)
-               return most_spare_sg;
-
-skip_spare:
-       if (!idlest)
-               return NULL;
-
-       /*
-        * When comparing groups across NUMA domains, it's possible for the
-        * local domain to be very lightly loaded relative to the remote
-        * domains but "imbalance" skews the comparison making remote CPUs
-        * look much more favourable. When considering cross-domain, add
-        * imbalance to the load on the remote node and consider staying
-        * local.
-        */
-       if ((sd->flags & SD_NUMA) &&
-            min_load + imbalance >= this_load)
-               return NULL;
-
-       if (min_load >= this_load + imbalance)
-               return NULL;
-
-       if ((this_load < (min_load + imbalance)) &&
-           (100*this_load < imbalance_scale*min_load))
-               return NULL;
-
-       return idlest;
-}
+                 int this_cpu, int sd_flag);
 
 /*
  * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
@@ -5734,7 +5616,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
                return prev_cpu;
 
        /*
-        * We need task's util for capacity_spare_without, sync it up to
+        * We need task's util for cpu_util_without, sync it up to
         * prev_cpu's last_update_time.
         */
        if (!(sd_flag & SD_BALANCE_FORK))
@@ -7915,13 +7797,13 @@ static inline int sg_imbalanced(struct sched_group *group)
  * any benefit for the load balance.
  */
 static inline bool
-group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
        if (sgs->sum_nr_running < sgs->group_weight)
                return true;
 
        if ((sgs->group_capacity * 100) >
-                       (sgs->group_util * env->sd->imbalance_pct))
+                       (sgs->group_util * imbalance_pct))
                return true;
 
        return false;
@@ -7936,13 +7818,13 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
  *  false.
  */
 static inline bool
-group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
        if (sgs->sum_nr_running <= sgs->group_weight)
                return false;
 
        if ((sgs->group_capacity * 100) <
-                       (sgs->group_util * env->sd->imbalance_pct))
+                       (sgs->group_util * imbalance_pct))
                return true;
 
        return false;
@@ -7969,11 +7851,11 @@ group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
 }
 
 static inline enum
-group_type group_classify(struct lb_env *env,
+group_type group_classify(unsigned int imbalance_pct,
                          struct sched_group *group,
                          struct sg_lb_stats *sgs)
 {
-       if (group_is_overloaded(env, sgs))
+       if (group_is_overloaded(imbalance_pct, sgs))
                return group_overloaded;
 
        if (sg_imbalanced(group))
@@ -7985,7 +7867,7 @@ group_type group_classify(struct lb_env *env,
        if (sgs->group_misfit_task_load)
                return group_misfit_task;
 
-       if (!group_has_capacity(env, sgs))
+       if (!group_has_capacity(imbalance_pct, sgs))
                return group_fully_busy;
 
        return group_has_spare;
@@ -8086,7 +7968,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
        sgs->group_weight = group->group_weight;
 
-       sgs->group_type = group_classify(env, group, sgs);
+       sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
 
        /* Computing avg_load makes sense only when group is overloaded */
        if (sgs->group_type == group_overloaded)
@@ -8241,6 +8123,252 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+
+struct sg_lb_stats;
+
+/*
+ * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
+ * @sd: The sched_domain level to look for the idlest group.
+ * @group: sched_group whose statistics are to be updated.
+ * @sgs: variable to hold the statistics for this group.
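+ * @p: The task for which we look for the idlest group/CPU.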
+ */
+static inline void update_sg_wakeup_stats(struct sched_domain *sd,
+                                         struct sched_group *group,
+                                         struct sg_lb_stats *sgs,
+                                         struct task_struct *p)
+{
+       int i, nr_running;
+
+       memset(sgs, 0, sizeof(*sgs));
+
+       for_each_cpu(i, sched_group_span(group)) {
+               struct rq *rq = cpu_rq(i);
+
+               sgs->group_load += cpu_load(rq);
+               sgs->group_util += cpu_util_without(i, p);
+               sgs->sum_h_nr_running += rq->cfs.h_nr_running;
+
+               nr_running = rq->nr_running;
+               sgs->sum_nr_running += nr_running;
+
+               /*
+                * No need to call idle_cpu() if nr_running is not 0
+                */
+               if (!nr_running && idle_cpu(i))
+                       sgs->idle_cpus++;
+       }
+
+       /* Check if task fits in the group */
+       if (sd->flags & SD_ASYM_CPUCAPACITY &&
+           !task_fits_capacity(p, group->sgc->max_capacity)) {
+               sgs->group_misfit_task_load = 1;
+       }
+
+       sgs->group_capacity = group->sgc->capacity;
+
+       sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
+
+       /*
+        * Computing avg_load makes sense only when group is fully busy or
+        * overloaded
+        */
+       if (sgs->group_type == group_fully_busy ||
+           sgs->group_type == group_overloaded)
+               sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+                               sgs->group_capacity;
+}
+
+static bool update_pick_idlest(struct sched_group *idlest,
+                              struct sg_lb_stats *idlest_sgs,
+                              struct sched_group *group,
+                              struct sg_lb_stats *sgs)
+{
+       if (sgs->group_type < idlest_sgs->group_type)
+               return true;
+
+       if (sgs->group_type > idlest_sgs->group_type)
+               return false;
+
+       /*
+        * The candidate and the current idlest group are the same type of
+        * group. Let's check which one is the idlest according to the type.
+        */
+
+       switch (sgs->group_type) {
+       case group_overloaded:
+       case group_fully_busy:
+               /* Select the group with lowest avg_load. */
+               if (idlest_sgs->avg_load <= sgs->avg_load)
+                       return false;
+               break;
+
+       case group_imbalanced:
+       case group_asym_packing:
+               /* Those types are not used in the slow wakeup path */
+               return false;
+
+       case group_misfit_task:
+               /* Select group with the highest max capacity */
+               if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
+                       return false;
+               break;
+
+       case group_has_spare:
+               /* Select group with most idle CPUs */
+               if (idlest_sgs->idle_cpus >= sgs->idle_cpus)
+                       return false;
+               break;
+       }
+
+       return true;
+}
+
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+                 int this_cpu, int sd_flag)
+{
+       struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
+       struct sg_lb_stats local_sgs, tmp_sgs;
+       struct sg_lb_stats *sgs;
+       unsigned long imbalance;
+       struct sg_lb_stats idlest_sgs = {
+                       .avg_load = UINT_MAX,
+                       .group_type = group_overloaded,
+       };
+
+       imbalance = scale_load_down(NICE_0_LOAD) *
+                               (sd->imbalance_pct-100) / 100;
+
+       do {
+               int local_group;
+
+               /* Skip over this group if it has no CPUs allowed */
+               if (!cpumask_intersects(sched_group_span(group),
+                                       p->cpus_ptr))
+                       continue;
+
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_span(group));
+
+               if (local_group) {
+                       sgs = &local_sgs;
+                       local = group;
+               } else {
+                       sgs = &tmp_sgs;
+               }
+
+               update_sg_wakeup_stats(sd, group, sgs, p);
+
+               if (!local_group && update_pick_idlest(idlest, &idlest_sgs,
+                                                      group, sgs)) {
+                       idlest = group;
+                       idlest_sgs = *sgs;
+               }
+
+       } while (group = group->next, group != sd->groups);
+
+       /* There is no idlest group to push tasks to */
+       if (!idlest)
+               return NULL;
+
+       /*
+        * If the local group is idler than the selected idlest group
+        * don't try and push the task.
+        */
+       if (local_sgs.group_type < idlest_sgs.group_type)
+               return NULL;
+
+       /*
+        * If the local group is busier than the selected idlest group
+        * try and push the task.
+        */
+       if (local_sgs.group_type > idlest_sgs.group_type)
+               return idlest;
+
+       switch (local_sgs.group_type) {
+       case group_overloaded:
+       case group_fully_busy:
+               /*
+                * When comparing groups across NUMA domains, it's possible for
+                * the local domain to be very lightly loaded relative to the
+                * remote domains but "imbalance" skews the comparison making
+                * remote CPUs look much more favourable. When considering
+                * cross-domain, add imbalance to the load on the remote node
+                * and consider staying local.
+                */
+
+               if ((sd->flags & SD_NUMA) &&
+                   ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
+                       return NULL;
+
+               /*
+                * If the local group is less loaded than the selected
+                * idlest group don't try and push any tasks.
+                */
+               if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
+                       return NULL;
+
+               if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
+                       return NULL;
+               break;
+
+       case group_imbalanced:
+       case group_asym_packing:
+               /* Those types are not used in the slow wakeup path */
+               return NULL;
+
+       case group_misfit_task:
+               /* Select group with the highest max capacity */
+               if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
+                       return NULL;
+               break;
+
+       case group_has_spare:
+               if (sd->flags & SD_NUMA) {
+#ifdef CONFIG_NUMA_BALANCING
+                       int idlest_cpu;
+                       /*
+                        * If there is spare capacity at NUMA, try to select
+                        * the preferred node
+                        */
+                       if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
+                               return NULL;
+
+                       idlest_cpu = cpumask_first(sched_group_span(idlest));
+                       if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
+                               return idlest;
+#endif
+                       /*
+                        * Otherwise, keep the task on this node to stay close
+                        * to its wakeup source and improve locality. If there
+                        * is a real need for migration, the periodic load
+                        * balance will take care of it.
+                        */
+                       if (local_sgs.idle_cpus)
+                               return NULL;
+               }
+
+               /*
+                * Select the group with the highest number of idle CPUs. We
+                * could also compare the utilization, which is more stable,
+                * but a group can end up with less spare capacity yet more
+                * idle CPUs, which means more opportunities to run a task.
+                */
+               if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
+                       return NULL;
+               break;
+       }
+
+       return idlest;
+}
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
-- 
2.7.4
