Implement home node preference in the load-balancer.

This is done in four pieces:

 - task_numa_hot(); make it harder to migrate tasks away from their
   home-node, controlled using the NUMA_HOT feature flag.

 - select_task_rq_fair(); prefer placing the task on its home-node,
   controlled using the NUMA_BIAS feature flag.

 - load_balance(); during the regular pull load-balance pass, try
   pulling tasks that are on the wrong node first, preferring to move
   them nearer to their home-node through task_numa_hot(), controlled
   through the NUMA_PULL feature flag.

 - load_balance(); when the balancer finds no imbalance, introduce an
   artificial imbalance so that it still prefers to move tasks towards
   their home-node, using active load-balance if needed, controlled
   through the NUMA_PULL_BIAS feature flag.

In order to easily find off-node tasks, split the per-cpu task list in
two: tasks running on their home-node stay on rq->cfs_tasks, off-node
tasks go on the new rq->offnode_tasks list.
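
For illustration, a minimal sketch of the enqueue/dequeue accounting such a
split implies -- the account_offnode_*() helpers and their call sites are
illustrative only, not part of this patch:

	/*
	 * Illustrative sketch only; assumes hypothetical account_offnode_*()
	 * helpers invoked from the fair-class enqueue/dequeue paths.
	 */
	static void account_offnode_enqueue(struct rq *rq, struct task_struct *p)
	{
		int home = tsk_home_node(p);

		/* no home-node, or already on it: task stays on rq->cfs_tasks */
		if (home == -1 || home == cpu_to_node(cpu_of(rq)))
			return;

		/* off-node: keep it on a separate list so the balancer finds it cheaply */
		list_move_tail(&p->se.group_node, offnode_tasks(rq));
		rq->offnode_running++;
		rq->offnode_weight += p->se.load.weight;
	}

	static void account_offnode_dequeue(struct rq *rq, struct task_struct *p)
	{
		int home = tsk_home_node(p);

		if (home == -1 || home == cpu_to_node(cpu_of(rq)))
			return;

		rq->offnode_running--;
		rq->offnode_weight -= p->se.load.weight;
	}

move_one_task()/move_tasks() below then walk offnode_tasks() before falling
back to cfs_tasks, which is what makes the NUMA_PULL pass cheap.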

Cc: Paul Turner <[email protected]>
Cc: Lee Schermerhorn <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Linus Torvalds <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
---
 include/linux/sched.h   |    1 
 kernel/sched/core.c     |   21 +++-
 kernel/sched/debug.c    |    3 
 kernel/sched/fair.c     |  236 ++++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/features.h |    8 +
 kernel/sched/sched.h    |   19 +++
 6 files changed, 271 insertions(+), 17 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -862,6 +862,7 @@ enum cpu_idle_type {
 #define SD_ASYM_PACKING                0x0800  /* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING      0x1000  /* Prefer to place tasks in a sibling domain */
 #define SD_OVERLAP             0x2000  /* sched_domains of this level overlap */
+#define SD_NUMA                        0x4000  /* cross-node balancing */
 
 extern int __weak arch_sd_sibiling_asym_packing(void);
 
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6034,7 +6034,9 @@ static void destroy_sched_domains(struct
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_id);
 
-static void update_top_cache_domain(int cpu)
+DEFINE_PER_CPU(struct sched_domain *, sd_node);
+
+static void update_domain_cache(int cpu)
 {
        struct sched_domain *sd;
        int id = cpu;
@@ -6077,6 +6079,15 @@ static void update_top_cache_domain(int 
 
        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_id, cpu) = id;
+
+       for_each_domain(cpu, sd) {
+               if (cpumask_equal(sched_domain_span(sd),
+                                 cpumask_of_node(cpu_to_node(cpu))))
+                       goto got_node;
+       }
+       sd = NULL;
+got_node:
+       rcu_assign_pointer(per_cpu(sd_node, cpu), sd);
 }
 
 /*
@@ -6119,7 +6130,7 @@ cpu_attach_domain(struct sched_domain *s
        rcu_assign_pointer(rq->sd, sd);
        destroy_sched_domains(tmp, cpu);
 
-       update_top_cache_domain(cpu);
+       update_domain_cache(cpu);
 }
 
 /* cpus with isolated domains */
@@ -6619,6 +6630,7 @@ sd_numa_init(struct sched_domain_topolog
                                        | 0*SD_SHARE_PKG_RESOURCES
                                        | 1*SD_SERIALIZE
                                        | 0*SD_PREFER_SIBLING
+                                       | 1*SD_NUMA
                                        | sd_local_flags(level)
                                        ,
                .last_balance           = jiffies,
@@ -7410,6 +7422,11 @@ void __init sched_init(void)
                rq->avg_idle = 2*sysctl_sched_migration_cost;
 
                INIT_LIST_HEAD(&rq->cfs_tasks);
+#ifdef CONFIG_NUMA
+               INIT_LIST_HEAD(&rq->offnode_tasks);
+               rq->offnode_running = 0;
+               rq->offnode_weight = 0;
+#endif
 
                rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -132,6 +132,9 @@ print_task(struct seq_file *m, struct rq
        SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
                0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
 #endif
+#ifdef CONFIG_NUMA
+       SEQ_printf(m, " %d/%d", p->node, cpu_to_node(task_cpu(p)));
+#endif
 #ifdef CONFIG_CGROUP_SCHED
        SEQ_printf(m, " %s", task_group_path(task_group(p)));
 #endif
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/random.h>
 
 #include <trace/events/sched.h>
 
@@ -2688,6 +2693,7 @@ select_task_rq_fair(struct task_struct *
        int want_affine = 0;
        int want_sd = 1;
        int sync = wake_flags & WF_SYNC;
+       int node = tsk_home_node(p);
 
        if (p->nr_cpus_allowed == 1)
                return prev_cpu;
@@ -2699,6 +2705,29 @@ select_task_rq_fair(struct task_struct *
        }
 
        rcu_read_lock();
+       if (sched_feat_numa(NUMA_BIAS) && node != -1) {
+               int node_cpu;
+
+               node_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpumask_of_node(node));
+               if (node_cpu >= nr_cpu_ids)
+                       goto find_sd;
+
+               /*
+                * For fork,exec find the idlest cpu in the home-node.
+                */
+               if (sd_flag & (SD_BALANCE_FORK|SD_BALANCE_EXEC)) {
+                       new_cpu = cpu = node_cpu;
+                       sd = per_cpu(sd_node, cpu);
+                       goto pick_idlest;
+               }
+
+               /*
+                * For wake, pretend we were running in the home-node.
+                */
+               prev_cpu = node_cpu;
+       }
+
+find_sd:
        for_each_domain(cpu, tmp) {
                if (!(tmp->flags & SD_LOAD_BALANCE))
                        continue;
@@ -2752,6 +2781,7 @@ select_task_rq_fair(struct task_struct *
                goto unlock;
        }
 
+pick_idlest:
        while (sd) {
                int load_idx = sd->forkexec_idx;
                struct sched_group *group;
@@ -3071,6 +3101,8 @@ struct lb_env {
        long                    imbalance;
        unsigned int            flags;
 
+       struct list_head        *tasks;
+
        unsigned int            loop;
        unsigned int            loop_break;
        unsigned int            loop_max;
@@ -3092,6 +3124,23 @@ static void move_task(struct task_struct
        check_preempt_curr(env->dst_rq, p, 0);
 }
 
+static int task_numa_hot(struct task_struct *p, int from_cpu, int to_cpu)
+{
+       int from_dist, to_dist;
+       int node = tsk_home_node(p);
+
+       if (!sched_feat_numa(NUMA_HOT) || node == -1)
+               return 0; /* no node preference */
+
+       from_dist = node_distance(cpu_to_node(from_cpu), node);
+       to_dist = node_distance(cpu_to_node(to_cpu), node);
+
+       if (to_dist < from_dist)
+               return 0; /* getting closer is ok */
+
+       return 1; /* stick to where we are */
+}
+
 /*
  * Is this task likely cache-hot:
  */
@@ -3177,6 +3226,7 @@ int can_migrate_task(struct task_struct 
         */
 
        tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
+       tsk_cache_hot |= task_numa_hot(p, env->src_cpu, env->dst_cpu);
        if (!tsk_cache_hot ||
                env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
@@ -3202,11 +3252,11 @@ int can_migrate_task(struct task_struct 
  *
  * Called with both runqueues locked.
  */
-static int move_one_task(struct lb_env *env)
+static int __move_one_task(struct lb_env *env)
 {
        struct task_struct *p, *n;
 
-       list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+       list_for_each_entry_safe(p, n, env->tasks, se.group_node) {
                if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
                        continue;
 
@@ -3225,6 +3275,21 @@ static int move_one_task(struct lb_env *
        return 0;
 }
 
+static int move_one_task(struct lb_env *env)
+{
+       if (sched_feat_numa(NUMA_PULL)) {
+               env->tasks = offnode_tasks(env->src_rq);
+               if (__move_one_task(env))
+                       return 1;
+       }
+
+       env->tasks = &env->src_rq->cfs_tasks;
+       if (__move_one_task(env))
+               return 1;
+
+       return 0;
+}
+
 static unsigned long task_h_load(struct task_struct *p);
 
 static const unsigned int sched_nr_migrate_break = 32;
@@ -3238,7 +3303,6 @@ static const unsigned int sched_nr_migra
  */
 static int move_tasks(struct lb_env *env)
 {
-       struct list_head *tasks = &env->src_rq->cfs_tasks;
        struct task_struct *p;
        unsigned long load;
        int pulled = 0;
@@ -3246,8 +3310,9 @@ static int move_tasks(struct lb_env *env
        if (env->imbalance <= 0)
                return 0;
 
-       while (!list_empty(tasks)) {
-               p = list_first_entry(tasks, struct task_struct, se.group_node);
+again:
+       while (!list_empty(env->tasks)) {
+               p = list_first_entry(env->tasks, struct task_struct, se.group_node);
 
                env->loop++;
                /* We've more or less seen every task there is, call it quits */
@@ -3258,7 +3323,7 @@ static int move_tasks(struct lb_env *env
                if (env->loop > env->loop_break) {
                        env->loop_break += sched_nr_migrate_break;
                        env->flags |= LBF_NEED_BREAK;
-                       break;
+                       goto out;
                }
 
                if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
@@ -3286,7 +3351,7 @@ static int move_tasks(struct lb_env *env
                 * the critical section.
                 */
                if (env->idle == CPU_NEWLY_IDLE)
-                       break;
+                       goto out;
 #endif
 
                /*
@@ -3294,13 +3359,20 @@ static int move_tasks(struct lb_env *env
                 * weighted load.
                 */
                if (env->imbalance <= 0)
-                       break;
+                       goto out;
 
                continue;
 next:
-               list_move_tail(&p->se.group_node, tasks);
+               list_move_tail(&p->se.group_node, env->tasks);
        }
 
+       if (env->tasks == offnode_tasks(env->src_rq)) {
+               env->tasks = &env->src_rq->cfs_tasks;
+               env->loop = 0;
+               goto again;
+       }
+
+out:
        /*
         * Right now, this is one of only two places move_task() is called,
         * so we can safely collect move_task() stats here rather than
@@ -3447,6 +3519,11 @@ struct sd_lb_stats {
        unsigned int  busiest_group_weight;
 
        int group_imb; /* Is there imbalance in this sd */
+#ifdef CONFIG_NUMA
+       struct sched_group *numa_group; /* group which has offnode_tasks */
+       unsigned long numa_group_weight;
+       unsigned long numa_group_running;
+#endif
 };
 
 /*
@@ -3462,6 +3539,10 @@ struct sg_lb_stats {
        unsigned long group_weight;
        int group_imb; /* Is there an imbalance in the group ? */
        int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA
+       unsigned long numa_weight;
+       unsigned long numa_running;
+#endif
 };
 
 /**
@@ -3490,6 +3571,117 @@ static inline int get_sd_load_idx(struct
        return load_idx;
 }
 
+#ifdef CONFIG_NUMA
+static inline void update_sg_numa_stats(struct sg_lb_stats *sgs, struct rq *rq)
+{
+       sgs->numa_weight += rq->offnode_weight;
+       sgs->numa_running += rq->offnode_running;
+}
+
+/*
+ * Since the offnode lists are indiscriminate (they contain tasks for all other
+ * nodes) it is impossible to say if there's any task on there that wants to
+ * move towards the pulling cpu. Therefore select a random offnode list to pull
+ * from such that eventually we'll try them all.
+ */
+static inline bool pick_numa_rand(void)
+{
+       return get_random_int() & 1;
+}
+
+/*
+ * Select a random group that has offnode tasks as sds->numa_group
+ */
+static inline void update_sd_numa_stats(struct sched_domain *sd,
+               struct sched_group *group, struct sd_lb_stats *sds,
+               int local_group, struct sg_lb_stats *sgs)
+{
+       if (!(sd->flags & SD_NUMA))
+               return;
+
+       if (local_group)
+               return;
+
+       if (!sgs->numa_running)
+               return;
+
+       if (!sds->numa_group || pick_numa_rand()) {
+               sds->numa_group = group;
+               sds->numa_group_weight = sgs->numa_weight;
+               sds->numa_group_running = sgs->numa_running;
+       }
+}
+
+/*
+ * Pick a random queue from the group that has offnode tasks.
+ */
+static struct rq *find_busiest_numa_queue(struct lb_env *env,
+                                         struct sched_group *group,
+                                         const struct cpumask *cpus)
+{
+       struct rq *busiest = NULL, *rq;
+       int cpu;
+
+       for_each_cpu_and(cpu, sched_group_cpus(group), cpus) {
+               rq = cpu_rq(cpu);
+               if (!rq->offnode_running)
+                       continue;
+               if (!busiest || pick_numa_rand())
+                       busiest = rq;
+       }
+
+       return busiest;
+}
+
+/*
+ * Called when there is no other imbalance; if there is a queue running offnode
+ * tasks we'll say we're imbalanced anyway to nudge these tasks towards their
+ * proper node.
+ */
+static inline int check_numa_busiest_group(struct lb_env *env, struct sd_lb_stats *sds)
+{
+       if (!sched_feat(NUMA_PULL_BIAS))
+               return 0;
+
+       if (!sds->numa_group)
+               return 0;
+
+       env->imbalance = sds->numa_group_weight / sds->numa_group_running;
+       sds->busiest = sds->numa_group;
+       env->find_busiest_queue = find_busiest_numa_queue;
+       return 1;
+}
+
+static inline bool need_active_numa_balance(struct lb_env *env)
+{
+       return env->find_busiest_queue == find_busiest_numa_queue &&
+                       env->src_rq->offnode_running == 1 &&
+                       env->src_rq->nr_running == 1;
+}
+
+#else /* CONFIG_NUMA */
+
+static inline void update_sg_numa_stats(struct sg_lb_stats *sgs, struct rq *rq)
+{
+}
+
+static inline void update_sd_numa_stats(struct sched_domain *sd,
+               struct sched_group *group, struct sd_lb_stats *sds,
+               int local_group, struct sg_lb_stats *sgs)
+{
+}
+
+static inline int check_numa_busiest_group(struct lb_env *env, struct sd_lb_stats *sds)
+{
+       return 0;
+}
+
+static inline bool need_active_numa_balance(struct lb_env *env)
+{
+       return false;
+}
+#endif /* CONFIG_NUMA */
+
 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
 {
        return SCHED_POWER_SCALE;
@@ -3707,6 +3899,8 @@ static inline void update_sg_lb_stats(st
                sgs->sum_weighted_load += weighted_cpuload(i);
                if (idle_cpu(i))
                        sgs->idle_cpus++;
+
+               update_sg_numa_stats(sgs, rq);
        }
 
        /*
@@ -3863,6 +4057,8 @@ static inline void update_sd_lb_stats(st
                        sds->group_imb = sgs.group_imb;
                }
 
+               update_sd_numa_stats(env->sd, sg, sds, local_group, &sgs);
+
                sg = sg->next;
        } while (sg != env->sd->groups);
 }
@@ -4150,6 +4346,9 @@ find_busiest_group(struct lb_env *env, c
        return sds.busiest;
 
 out_balanced:
+       if (check_numa_busiest_group(env, &sds))
+               return sds.busiest;
+
 ret:
        env->imbalance = 0;
        return NULL;
@@ -4229,6 +4428,9 @@ static int need_active_balance(struct lb
                        return 1;
        }
 
+       if (need_active_numa_balance(env))
+               return 1;
+
        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
@@ -4280,6 +4482,8 @@ static int load_balance(int this_cpu, st
                schedstat_inc(sd, lb_nobusyq[idle]);
                goto out_balanced;
        }
+       env.src_rq  = busiest;
+       env.src_cpu = busiest->cpu;
 
        BUG_ON(busiest == this_rq);
 
@@ -4295,9 +4499,11 @@ static int load_balance(int this_cpu, st
                 * correctly treated as an imbalance.
                 */
                env.flags |= LBF_ALL_PINNED;
-               env.src_cpu   = busiest->cpu;
-               env.src_rq    = busiest;
-               env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
+               env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
+               if (sched_feat_numa(NUMA_PULL))
+                       env.tasks = offnode_tasks(busiest);
+               else
+                       env.tasks = &busiest->cfs_tasks;
 
 more_balance:
                local_irq_save(flags);
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -69,3 +69,11 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+
+#ifdef CONFIG_NUMA
+SCHED_FEAT(NUMA_HOT,       true)
+SCHED_FEAT(NUMA_BIAS,      true)
+SCHED_FEAT(NUMA_PULL,      true)
+SCHED_FEAT(NUMA_PULL_BIAS, true)
+#endif
+
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -414,6 +414,12 @@ struct rq {
 
        struct list_head cfs_tasks;
 
+#ifdef CONFIG_NUMA
+       unsigned long    offnode_running;
+       unsigned long    offnode_weight;
+       struct list_head offnode_tasks;
+#endif
+
        u64 rt_avg;
        u64 age_stamp;
        u64 idle_stamp;
@@ -465,6 +471,15 @@ struct rq {
 #endif
 };
 
+static inline struct list_head *offnode_tasks(struct rq *rq)
+{
+#ifdef CONFIG_NUMA
+       return &rq->offnode_tasks;
+#else
+       return NULL;
+#endif
+}
+
 static inline int cpu_of(struct rq *rq)
 {
 #ifdef CONFIG_SMP
@@ -525,6 +540,7 @@ static inline struct sched_domain *highe
 
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain *, sd_node);
 
 extern int group_balance_cpu(struct sched_group *sg);
 

