SUSE's regression testing noticed that...

0905f04eb21f sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task()

...introduced a hackbench regression, and indeed it did.  I think this
regression has more to do with randomness than anything else, but in
general...

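While averaging calms down load balancing, helping to keep migrations
down to a dull roar, it's not completely wonderful when it comes to
things that live in the here and now, hackbench being one such.

Add an LB_INSTANTANEOUS_LOAD scheduler feature (default off) which, when
enabled, makes find_idlest_group() and find_idlest_cpu() use the
instantaneous cfs_rq load weight instead of the runnable load average.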

time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done'

real    0m55.397s
user    0m8.320s
sys     5m40.789s

echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features

real    0m48.049s
user    0m6.510s
sys     5m6.291s

Signed-off-by: Mike Galbraith <[email protected]>
---
 kernel/sched/fair.c     |   54 ++++++++++++++++++++++++------------------------
 kernel/sched/features.h |    1 
 kernel/sched/sched.h    |    6 +++++
 3 files changed, 35 insertions(+), 26 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -738,7 +738,7 @@ void post_init_entity_util_avg(struct sc
        }
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq, int avg);
 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
 #else
 void init_entity_runnable_average(struct sched_entity *se)
@@ -1229,9 +1229,9 @@ bool should_numa_migrate_memory(struct t
               group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
-static unsigned long weighted_cpuload(const int cpu);
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+static unsigned long weighted_cpuload(const int cpu, int avg);
+static unsigned long source_load(int cpu, int type, int avg);
+static unsigned long target_load(int cpu, int type, int avg);
 static unsigned long capacity_of(int cpu);
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
@@ -1261,7 +1261,7 @@ static void update_numa_stats(struct num
                struct rq *rq = cpu_rq(cpu);
 
                ns->nr_running += rq->nr_running;
-               ns->load += weighted_cpuload(cpu);
+               ns->load += weighted_cpuload(cpu, LOAD_AVERAGE);
                ns->compute_capacity += capacity_of(cpu);
 
                cpus++;
@@ -3102,8 +3102,10 @@ void remove_entity_load_avg(struct sched
        atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq, int avg)
 {
+       if (sched_feat(LB_INSTANTANEOUS_LOAD) && avg == LOAD_INSTANT)
+               return cfs_rq->load.weight;
        return cfs_rq->runnable_load_avg;
 }
 
@@ -4701,9 +4703,9 @@ static void cpu_load_update(struct rq *t
 }
 
 /* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(const int cpu, int avg)
 {
-       return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+       return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs, avg);
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -4748,7 +4750,7 @@ static void cpu_load_update_idle(struct
        /*
         * bail if there's load or we're actually up-to-date.
         */
-       if (weighted_cpuload(cpu_of(this_rq)))
+       if (weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE))
                return;
 
        cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -4769,7 +4771,7 @@ void cpu_load_update_nohz_start(void)
         * concurrently we'll exit nohz. And cpu_load write can race with
         * cpu_load_update_idle() but both updater would be writing the same.
         */
-       this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+       this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE);
 }
 
 /*
@@ -4784,7 +4786,7 @@ void cpu_load_update_nohz_stop(void)
        if (curr_jiffies == this_rq->last_load_update_tick)
                return;
 
-       load = weighted_cpuload(cpu_of(this_rq));
+       load = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE);
        raw_spin_lock(&this_rq->lock);
        update_rq_clock(this_rq);
        cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -4810,7 +4812,7 @@ static void cpu_load_update_periodic(str
  */
 void cpu_load_update_active(struct rq *this_rq)
 {
-       unsigned long load = weighted_cpuload(cpu_of(this_rq));
+       unsigned long load = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE);
 
        if (tick_nohz_tick_stopped())
                cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -4825,10 +4827,10 @@ void cpu_load_update_active(struct rq *t
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static unsigned long source_load(int cpu, int type)
+static unsigned long source_load(int cpu, int type, int avg)
 {
        struct rq *rq = cpu_rq(cpu);
-       unsigned long total = weighted_cpuload(cpu);
+       unsigned long total = weighted_cpuload(cpu, avg);
 
        if (type == 0 || !sched_feat(LB_BIAS))
                return total;
@@ -4840,10 +4842,10 @@ static unsigned long source_load(int cpu
  * Return a high guess at the load of a migration-target cpu weighted
  * according to the scheduling class and "nice" value.
  */
-static unsigned long target_load(int cpu, int type)
+static unsigned long target_load(int cpu, int type, int avg)
 {
        struct rq *rq = cpu_rq(cpu);
-       unsigned long total = weighted_cpuload(cpu);
+       unsigned long total = weighted_cpuload(cpu, avg);
 
        if (type == 0 || !sched_feat(LB_BIAS))
                return total;
@@ -4865,7 +4867,7 @@ static unsigned long cpu_avg_load_per_ta
 {
        struct rq *rq = cpu_rq(cpu);
        unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
-       unsigned long load_avg = weighted_cpuload(cpu);
+       unsigned long load_avg = weighted_cpuload(cpu, LOAD_AVERAGE);
 
        if (nr_running)
                return load_avg / nr_running;
@@ -5047,8 +5049,8 @@ static int wake_affine(struct sched_doma
        idx       = sd->wake_idx;
        this_cpu  = smp_processor_id();
        prev_cpu  = task_cpu(p);
-       load      = source_load(prev_cpu, idx);
-       this_load = target_load(this_cpu, idx);
+       load      = source_load(prev_cpu, idx, LOAD_AVERAGE);
+       this_load = target_load(this_cpu, idx, LOAD_AVERAGE);
 
        /*
         * If sync wakeup then subtract the (maximum possible)
@@ -5136,9 +5138,9 @@ find_idlest_group(struct sched_domain *s
                for_each_cpu(i, sched_group_cpus(group)) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
-                               load = source_load(i, load_idx);
+                               load = source_load(i, load_idx, LOAD_INSTANT);
                        else
-                               load = target_load(i, load_idx);
+                               load = target_load(i, load_idx, LOAD_INSTANT);
 
                        avg_load += load;
                }
@@ -5197,7 +5199,7 @@ find_idlest_cpu(struct sched_group *grou
                                shallowest_idle_cpu = i;
                        }
                } else if (shallowest_idle_cpu == -1) {
-                       load = weighted_cpuload(i);
+                       load = weighted_cpuload(i, LOAD_INSTANT);
                        if (load < min_load || (load == min_load && i == this_cpu)) {
                                min_load = load;
                                least_loaded_cpu = i;
@@ -6982,9 +6984,9 @@ static inline void update_sg_lb_stats(st
 
                /* Bias balancing toward cpus of our domain */
                if (local_group)
-                       load = target_load(i, load_idx);
+                       load = target_load(i, load_idx, LOAD_AVERAGE);
                else
-                       load = source_load(i, load_idx);
+                       load = source_load(i, load_idx, LOAD_AVERAGE);
 
                sgs->group_load += load;
                sgs->group_util += cpu_util(i);
@@ -6998,7 +7000,7 @@ static inline void update_sg_lb_stats(st
                sgs->nr_numa_running += rq->nr_numa_running;
                sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
-               sgs->sum_weighted_load += weighted_cpuload(i);
+               sgs->sum_weighted_load += weighted_cpuload(i, LOAD_AVERAGE);
                /*
                 * No need to call idle_cpu() if nr_running is not 0
                 */
@@ -7510,7 +7512,7 @@ static struct rq *find_busiest_queue(str
 
                capacity = capacity_of(i);
 
-               wl = weighted_cpuload(i);
+               wl = weighted_cpuload(i, LOAD_AVERAGE);
 
                /*
                 * When comparing with imbalance, use weighted_cpuload()
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,6 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
 SCHED_FEAT(HRTICK, false)
 SCHED_FEAT(DOUBLE_TICK, false)
 SCHED_FEAT(LB_BIAS, true)
+SCHED_FEAT(LB_INSTANTANEOUS_LOAD, false)
 
 /*
  * Decrement CPU capacity based on time not spent running tasks
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1630,6 +1630,12 @@ static inline void double_rq_unlock(stru
                __release(rq2->lock);
 }
 
+/*
+ * Tell load balancing functions whether we want instant or average load
+ */
+#define LOAD_INSTANT   0
+#define LOAD_AVERAGE   1
+
 #else /* CONFIG_SMP */
 
 /*

Reply via email to