Commit 88b8dac0 makes load_balance() consider other cpus in its group as
candidate destinations. However, it has no code to prevent a dst cpu from
being re-selected, so the same dst cpu can be selected over and over.

This patch adds functionality to load_balance() to exclude a cpu from the
candidate destinations once it has been selected, so that each dst cpu is
tried at most once.
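
For illustration, the exclusion pattern being added is roughly the
following. This is a minimal userspace sketch, not kernel code: a
uint64_t stands in for the kernel cpumask, and pick_dst_cpu() is a
hypothetical helper, not a kernel API.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper: lowest set bit is the next candidate dst cpu. */
static int pick_dst_cpu(uint64_t mask)
{
	return mask ? __builtin_ctzll(mask) : -1;
}

int main(void)
{
	uint64_t dst_grp = 0xf;		/* CPUs 0-3 in this sched group */
	int this_cpu = 0;
	int dst;

	dst_grp &= ~(1ULL << this_cpu);	/* never balance onto ourselves */

	while ((dst = pick_dst_cpu(dst_grp)) >= 0) {
		printf("trying dst cpu %d\n", dst);
		/*
		 * If moving load to dst fails (e.g. tasks are pinned),
		 * clear it from the candidate mask so the loop terminates
		 * instead of re-selecting the same cpu forever.
		 */
		dst_grp &= ~(1ULL << dst);
	}
	return 0;
}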

Cc: Srivatsa Vaddagiri <va...@linux.vnet.ibm.com>
Signed-off-by: Joonsoo Kim <iamjoonsoo....@lge.com>

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e6f8783..d4c6ed0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6814,6 +6814,7 @@ struct task_group root_task_group;
 LIST_HEAD(task_groups);
 #endif
 
+DECLARE_PER_CPU(cpumask_var_t, load_balance_dst_grp);
 DECLARE_PER_CPU(cpumask_var_t, load_balance_cpu_active);
 
 void __init sched_init(void)
@@ -6828,7 +6829,7 @@ void __init sched_init(void)
        alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 #endif
 #ifdef CONFIG_CPUMASK_OFFSTACK
-       alloc_size += num_possible_cpus() * cpumask_size();
+       alloc_size += num_possible_cpus() * cpumask_size() * 2;
 #endif
        if (alloc_size) {
                ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
@@ -6851,6 +6852,8 @@ void __init sched_init(void)
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_CPUMASK_OFFSTACK
                for_each_possible_cpu(i) {
+                       per_cpu(load_balance_dst_grp, i) = (void *)ptr;
+                       ptr += cpumask_size();
                        per_cpu(load_balance_cpu_active, i) = (void *)ptr;
                        ptr += cpumask_size();
                }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7382fa5..70631e8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4974,6 +4974,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 #define MAX_PINNED_INTERVAL    512
 
 /* Working cpumask for load_balance and load_balance_newidle. */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_dst_grp);
 DEFINE_PER_CPU(cpumask_var_t, load_balance_cpu_active);
 
 static int need_active_balance(struct lb_env *env)
@@ -5005,17 +5006,17 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                        int *balance)
 {
        int ld_moved, cur_ld_moved, active_balance = 0;
-       int lb_iterations, max_lb_iterations;
        struct sched_group *group;
        struct rq *busiest;
        unsigned long flags;
+       struct cpumask *dst_grp = __get_cpu_var(load_balance_dst_grp);
        struct cpumask *cpus = __get_cpu_var(load_balance_cpu_active);
 
        struct lb_env env = {
                .sd             = sd,
                .dst_cpu        = this_cpu,
                .dst_rq         = this_rq,
-               .dst_grpmask    = sched_group_cpus(sd->groups),
+               .dst_grpmask    = dst_grp,
                .idle           = idle,
                .loop_break     = sched_nr_migrate_break,
                .cpus           = cpus,
@@ -5025,9 +5026,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
         * other cpus in our group */
        if (idle == CPU_NEWLY_IDLE) {
                env.dst_grpmask = NULL;
-               max_lb_iterations = 0;
        } else {
-               max_lb_iterations = cpumask_weight(env.dst_grpmask);
+               cpumask_copy(dst_grp, sched_group_cpus(sd->groups));
+               cpumask_clear_cpu(env.dst_cpu, env.dst_grpmask);
        }
        cpumask_copy(cpus, cpu_active_mask);
 
@@ -5055,7 +5056,6 @@ redo:
        schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
        ld_moved = 0;
-       lb_iterations = 1;
        if (busiest->nr_running > 1) {
                /*
                 * Attempt to move tasks. If find_busiest_group has found
@@ -5112,14 +5112,17 @@ more_balance:
                 * moreover subsequent load balance cycles should correct the
                 * excess load moved.
                 */
-               if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
-                               lb_iterations++ < max_lb_iterations) {
+               if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
 
                        env.dst_rq       = cpu_rq(env.new_dst_cpu);
                        env.dst_cpu      = env.new_dst_cpu;
                        env.flags       &= ~LBF_SOME_PINNED;
                        env.loop         = 0;
                        env.loop_break   = sched_nr_migrate_break;
+
+                       /* Prevent dst_cpu from being selected again */
+                       cpumask_clear_cpu(env.dst_cpu, env.dst_grpmask);
+
                        /*
                         * Go back to "more_balance" rather than "redo" since we
                         * need to continue with same src_cpu.
-- 
1.7.9.5
