While selecting a task to pull across nodes, try to choose a task that
improves locality, i.e. a task that has more affinity to the
destination node.

To achieve this, walk the list of tasks in multiple iterations; for now,
use just two.  In the first iteration, a task is chosen to move if and
only if moving it improves node locality.  In the second (and last)
iteration, fall back to the default behaviour, i.e. a task is chosen
irrespective of whether it improves node locality (the behaviour before
this change).  This iteration logic applies only to cross-node migration,
and only with CONFIG_NUMA_BALANCING enabled.

So if a runqueue has two tasks, both eligible to be migrated to a
runqueue belonging to a different node, this change tries to choose the
one that improves locality (a user-space sketch of the pick order
follows).
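
Purely as an illustration (not kernel code), here is a minimal,
self-contained sketch of the two-pass pick; struct task,
improves_locality() and the migratable flag are made-up stand-ins for
task_struct, the preferred_node()/can_numa_migrate_task() check and the
usual can_migrate_task() filters:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct task {
        int preferred_node;   /* node this task has most affinity to */
        bool migratable;      /* stand-in for the usual migration checks */
    };

    static bool improves_locality(const struct task *p, int dst_node)
    {
        return p->preferred_node == dst_node;
    }

    /* Pass 0 accepts only tasks whose locality improves; pass 1 falls
     * back to the default behaviour and accepts any migratable task. */
    static const struct task *pick_task(const struct task *tasks,
                                        size_t n, int dst_node)
    {
        for (int pass = 0; pass < 2; pass++) {
            for (size_t i = 0; i < n; i++) {
                if (pass == 0 && !improves_locality(&tasks[i], dst_node))
                    continue;
                if (tasks[i].migratable)
                    return &tasks[i];
            }
        }
        return NULL;
    }

    int main(void)
    {
        /* Two eligible tasks; only the second prefers destination node 1,
         * so the first pass picks it even though both are migratable. */
        const struct task tasks[] = {
            { .preferred_node = 0, .migratable = true },
            { .preferred_node = 1, .migratable = true },
        };
        const struct task *p = pick_task(tasks, 2, 1);

        printf("picked task preferring node %d\n",
               p ? p->preferred_node : -1);
        return 0;
    }

In the patch below, env->iterations plays the role of "pass": the task
list is re-walked once, without the locality filter, when the first,
locality-filtered walk moved nothing.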

Similar logic was first used in Peter Zijlstra's numa core.

Signed-off-by: Srikar Dronamraju <sri...@linux.vnet.ibm.com>
---
 kernel/sched/fair.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3df7f76..8fcbf96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3906,6 +3906,7 @@ struct lb_env {
        unsigned int            loop;
        unsigned int            loop_break;
        unsigned int            loop_max;
+       unsigned int            iterations;
 };
 
 /*
@@ -4030,6 +4031,21 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
        return 1;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static bool preferred_node(struct task_struct *p, struct lb_env *env)
+{
+       if (!(env->sd->flags & SD_NUMA))
+               return false;
+
+       return (can_numa_migrate_task(p, env->dst_rq, env->src_rq) == 1);
+}
+#else
+static bool preferred_node(struct task_struct *p, struct lb_env *env)
+{
+       return false;
+}
+#endif
+
 /*
  * move_one_task tries to move exactly one task from busiest to this_rq, as
  * part of active balancing operations within "domain".
@@ -4041,7 +4057,11 @@ static int move_one_task(struct lb_env *env)
 {
        struct task_struct *p, *n;
 
+again:
        list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+               if (!env->iterations && !preferred_node(p, env))
+                       continue;
+
                if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
                        continue;
 
@@ -4049,6 +4069,7 @@ static int move_one_task(struct lb_env *env)
                        continue;
 
                move_task(p, env);
+
                /*
                 * Right now, this is only the second place move_task()
                 * is called, so we can safely collect move_task()
@@ -4057,6 +4078,9 @@ static int move_one_task(struct lb_env *env)
                schedstat_inc(env->sd, lb_gained[env->idle]);
                return 1;
        }
+       if (!env->iterations++)
+               goto again;
+
        return 0;
 }
 
@@ -4096,6 +4120,9 @@ static int move_tasks(struct lb_env *env)
                        break;
                }
 
+               if (!env->iterations && !preferred_node(p, env))
+                       goto next;
+
                if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
                        goto next;
 
@@ -5099,6 +5126,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                .idle           = idle,
                .loop_break     = sched_nr_migrate_break,
                .cpus           = cpus,
+               .iterations     = 1,
        };
 
        cpumask_copy(cpus, cpu_active_mask);
@@ -5130,6 +5158,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        ld_moved = 0;
        lb_iterations = 1;
        if (busiest->nr_running > 1) {
+#ifdef CONFIG_NUMA_BALANCING
+               if (sd->flags & SD_NUMA) {
+                       if (cpu_to_node(env.dst_cpu) != cpu_to_node(env.src_cpu))
+                               env.iterations = 0;
+               }
+#endif
                /*
                 * Attempt to move tasks. If find_busiest_group has found
                 * an imbalance but busiest->nr_running <= 1, the group is
@@ -5160,6 +5194,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                        goto more_balance;
                }
 
+               if (!ld_moved && !env.iterations++) {
+                       env.loop         = 0;
+                       env.loop_break   = sched_nr_migrate_break;
+                       goto more_balance;
+               }
+
                /*
                 * some other cpu did the load balance for us.
                 */
@@ -5407,8 +5447,16 @@ static int active_load_balance_cpu_stop(void *data)
                        .src_cpu        = busiest_rq->cpu,
                        .src_rq         = busiest_rq,
                        .idle           = CPU_IDLE,
+                       .iterations     = 1,
                };
 
+#ifdef CONFIG_NUMA_BALANCING
+               if (sd->flags & SD_NUMA) {
+                       if (cpu_to_node(env.dst_cpu) != cpu_to_node(env.src_cpu))
+                               env.iterations = 0;
+               }
+#endif
+
                schedstat_inc(sd, alb_count);
 
                if (move_one_task(&env))
-- 
1.7.1
