Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus

   # HEAD: 024c9d2faebdad3fb43fe49ad68e91a36190f1e2 sched/core: Ensure load_balance() respects the active_mask

Three fixes that address an SMP balancing performance regression.

 Thanks,

        Ingo

------------------>
Peter Zijlstra (3):
      sched/core: Fix wake_affine() performance regression
      sched/core: Address more wake_affine() regressions
      sched/core: Ensure load_balance() respects the active_mask


 include/linux/sched/topology.h |   8 ---
 kernel/sched/fair.c            | 140 ++++++++++++++---------------------------
 kernel/sched/features.h        |   3 +
 3 files changed, 49 insertions(+), 102 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index d7b6dab956ec..7d065abc7a47 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -71,14 +71,6 @@ struct sched_domain_shared {
        atomic_t        ref;
        atomic_t        nr_busy_cpus;
        int             has_idle_cores;
-
-       /*
-        * Some variables from the most recent sd_lb_stats for this domain,
-        * used by wake_affine().
-        */
-       unsigned long   nr_running;
-       unsigned long   load;
-       unsigned long   capacity;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..d3f3094856fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
        return 1;
 }
 
-struct llc_stats {
-       unsigned long   nr_running;
-       unsigned long   load;
-       unsigned long   capacity;
-       int             has_capacity;
-};
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it checks if the waking CPU is
+ *                     (or will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ *                       scheduling latency of the CPUs. This seems to work
+ *                       for the overloaded case.
+ */
 
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+                int this_cpu, int prev_cpu, int sync)
 {
-       struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-       if (!sds)
-               return false;
+       if (idle_cpu(this_cpu))
+               return true;
 
-       stats->nr_running       = READ_ONCE(sds->nr_running);
-       stats->load             = READ_ONCE(sds->load);
-       stats->capacity         = READ_ONCE(sds->capacity);
-       stats->has_capacity     = stats->nr_running < per_cpu(sd_llc_size, cpu);
+       if (sync && cpu_rq(this_cpu)->nr_running == 1)
+               return true;
 
-       return true;
+       return false;
 }
 
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-               int this_cpu, int prev_cpu, int sync)
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+                  int this_cpu, int prev_cpu, int sync)
 {
-       struct llc_stats prev_stats, this_stats;
        s64 this_eff_load, prev_eff_load;
        unsigned long task_load;
 
-       if (!get_llc_stats(&prev_stats, prev_cpu) ||
-           !get_llc_stats(&this_stats, this_cpu))
-               return false;
+       this_eff_load = target_load(this_cpu, sd->wake_idx);
+       prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 
-       /*
-        * If sync wakeup then subtract the (maximum possible)
-        * effect of the currently running task from the load
-        * of the current LLC.
-        */
        if (sync) {
                unsigned long current_load = task_h_load(current);
 
-               /* in this case load hits 0 and this LLC is considered 'idle' */
-               if (current_load > this_stats.load)
+               if (current_load > this_eff_load)
                        return true;
 
-               this_stats.load -= current_load;
+               this_eff_load -= current_load;
        }
 
-       /*
-        * The has_capacity stuff is not SMT aware, but by trying to balance
-        * the nr_running on both ends we try and fill the domain at equal
-        * rates, thereby first consuming cores before siblings.
-        */
-
-       /* if the old cache has capacity, stay there */
-       if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-               return false;
-
-       /* if this cache has capacity, come here */
-       if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
-               return true;
-
-       /*
-        * Check to see if we can move the load without causing too much
-        * imbalance.
-        */
        task_load = task_h_load(p);
 
-       this_eff_load = 100;
-       this_eff_load *= prev_stats.capacity;
-
-       prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-       prev_eff_load *= this_stats.capacity;
+       this_eff_load += task_load;
+       if (sched_feat(WA_BIAS))
+               this_eff_load *= 100;
+       this_eff_load *= capacity_of(prev_cpu);
 
-       this_eff_load *= this_stats.load + task_load;
-       prev_eff_load *= prev_stats.load - task_load;
+       prev_eff_load -= task_load;
+       if (sched_feat(WA_BIAS))
+               prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+       prev_eff_load *= capacity_of(this_cpu);
 
        return this_eff_load <= prev_eff_load;
 }
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
                       int prev_cpu, int sync)
 {
        int this_cpu = smp_processor_id();
-       bool affine;
+       bool affine = false;
 
-       /*
-        * Default to no affine wakeups; wake_affine() should not effect a task
-        * placement the load-balancer feels inclined to undo. The conservative
-        * option is therefore to not move tasks when they wake up.
-        */
-       affine = false;
+       if (sched_feat(WA_IDLE) && !affine)
+               affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
-       /*
-        * If the wakeup is across cache domains, try to evaluate if movement
-        * makes sense, otherwise rely on select_idle_siblings() to do
-        * placement inside the cache domain.
-        */
-       if (!cpus_share_cache(prev_cpu, this_cpu))
-               affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+       if (sched_feat(WA_WEIGHT) && !affine)
+               affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
        schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
        if (affine) {
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-       struct sched_domain_shared *shared = env->sd->shared;
        struct sched_domain *child = env->sd->child;
        struct sched_group *sg = env->sd->groups;
        struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7633,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                if (env->dst_rq->rd->overload != overload)
                        env->dst_rq->rd->overload = overload;
        }
-
-       if (!shared)
-               return;
-
-       /*
-        * Since these are sums over groups they can contain some CPUs
-        * multiple times for the NUMA domains.
-        *
-        * Currently only wake_affine_llc() and find_busiest_group()
-        * uses these numbers, only the last is affected by this problem.
-        *
-        * XXX fix that.
-        */
-       WRITE_ONCE(shared->nr_running,  sds->total_running);
-       WRITE_ONCE(shared->load,        sds->total_load);
-       WRITE_ONCE(shared->capacity,    sds->total_capacity);
 }
 
 /**
@@ -8098,6 +8043,13 @@ static int should_we_balance(struct lb_env *env)
        int cpu, balance_cpu = -1;
 
        /*
+        * Ensure the balancing environment is consistent; can happen
+        * when the softirq triggers 'during' hotplug.
+        */
+       if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+               return 0;
+
+       /*
         * In the newly idle case, we will allow all the cpu's
         * to do the newly idle load balance.
         */
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..319ed0e8a347 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
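
The central change in the wake_affine() rework above is the biased effective-load
comparison that wake_affine_weight() now performs directly on the waking and
previous CPU, rather than on the stale per-LLC aggregates that get removed. The
standalone sketch below is not part of the patch: wake_affine_weight_sketch() and
its constant inputs are made up for illustration, WA_BIAS is assumed enabled, and
the sync-wakeup adjustment is omitted.

#include <stdbool.h>
#include <stdio.h>

/*
 * Pull the task to the waking CPU only if its effective load, scaled by
 * the previous CPU's capacity, stays at or below the bias-inflated load
 * remaining on the previous CPU, scaled by the waking CPU's capacity.
 */
static bool wake_affine_weight_sketch(long long this_load, long long prev_load,
				      long long this_cap, long long prev_cap,
				      long long task_load, int imbalance_pct)
{
	long long this_eff_load = (this_load + task_load) * 100 * prev_cap;
	long long prev_eff_load = (prev_load - task_load)
				  * (100 + (imbalance_pct - 100) / 2) * this_cap;

	return this_eff_load <= prev_eff_load;	/* true => wake affine */
}

int main(void)
{
	/* lightly loaded waking CPU vs. a busier previous CPU, equal capacity */
	printf("affine: %d\n",
	       wake_affine_weight_sketch(512, 2048, 1024, 1024, 256, 117));
	return 0;
}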
