Re: [PATCH] numa,sched: only consider less busy nodes as numa balancing destination

Peter Zijlstra Tue, 12 May 2015 23:29:22 -0700

On Tue, May 12, 2015 at 11:45:09AM -0400, Rik van Riel wrote:
> I have a few poorly formed ideas on what could be done about that:
> 
> 1) have fbq_classify_rq take the current task on the rq into account,
>    and adjust the fbq classification if all the runnable-but-queued
>    tasks are on the right node


So while looking at this I came up with the below; it treats anything
inside ->active_nodes as a preferred node for balancing purposes.

Would that make sense?

I'll see what I can do about taking the current task into account in the
runqueue type classification.

> 2) ensure that rq->nr_numa_running and rq->nr_preferred_running also
>    get incremented for kernel threads that are bound to a particular
>    CPU - currently CPU-bound kernel threads will cause the NUMA
>    statistics to look like a CPU has tasks that do not belong on that
>    NUMA node

I'm thinking of accounting those to nr_pinned; let me see how that works
out.

---
 include/linux/sched.h |  1 +
 kernel/sched/fair.c   | 58 ++++++++++++++++++++++++++++++++-------------------
 2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cb734861123a..ffebc2e091ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1443,6 +1443,7 @@ struct task_struct {
        unsigned sched_reset_on_fork:1;
        unsigned sched_contributes_to_load:1;
        unsigned sched_migrated:1;
+       unsigned sched_preferred:1;
 
 #ifdef CONFIG_MEMCG_KMEM
        unsigned memcg_kmem_skip_account:1;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8c1510abeefa..d59adb8e8ef4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -856,18 +856,6 @@ static unsigned int task_scan_max(struct task_struct *p)
        return max(smin, smax);
 }
 
-static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
-{
-       rq->nr_numa_running += (p->numa_preferred_nid != -1);
-       rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
-}
-
-static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
-{
-       rq->nr_numa_running -= (p->numa_preferred_nid != -1);
-       rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
-}
-
 struct numa_group {
        atomic_t refcount;
 
@@ -887,6 +875,28 @@ struct numa_group {
        unsigned long faults[0];
 };
 
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+       int node = task_node(p);
+       bool local;
+
+       rq->nr_numa_running += (p->numa_preferred_nid != -1);
+
+       if (p->numa_group)
+               local = node_isset(node, p->numa_group->active_nodes);
+       else
+               local = (p->numa_preferred_nid == node);
+
+       p->sched_preferred = local;
+       rq->nr_preferred_running += local;
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+       rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+       rq->nr_preferred_running -= p->sched_preferred;
+}
+
 /* Shared or private faults. */
 #define NR_NUMA_HINT_FAULT_TYPES 2
 
@@ -1572,9 +1582,10 @@ static void numa_migrate_preferred(struct task_struct *p)
  * are added when they cause over 6/16 of the maximum number of faults, but
  * only removed when they drop below 3/16.
  */
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static bool update_numa_active_node_mask(struct numa_group *numa_group)
 {
        unsigned long faults, max_faults = 0;
+       bool update = false;
        int nid;
 
        for_each_online_node(nid) {
@@ -1586,11 +1597,17 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
        for_each_online_node(nid) {
                faults = group_faults_cpu(numa_group, nid);
                if (!node_isset(nid, numa_group->active_nodes)) {
-                       if (faults > max_faults * 6 / 16)
+                       if (faults > max_faults * 6 / 16) {
                                node_set(nid, numa_group->active_nodes);
-               } else if (faults < max_faults * 3 / 16)
+                               update = true;
+                       }
+               } else if (faults < max_faults * 3 / 16) {
                        node_clear(nid, numa_group->active_nodes);
+                       update = true;
+               }
        }
+
+       return update;
 }
 
 /*
@@ -1884,16 +1901,15 @@ static void task_numa_placement(struct task_struct *p)
                update_numa_active_node_mask(p->numa_group);
                spin_unlock_irq(group_lock);
                max_nid = preferred_group_nid(p, max_group_nid);
-       }
-
-       if (max_faults) {
+               sched_setnuma(p, max_nid);
+       } else if (max_faults) {
                /* Set the new preferred node */
                if (max_nid != p->numa_preferred_nid)
                        sched_setnuma(p, max_nid);
-
-               if (task_node(p) != p->numa_preferred_nid)
-                       numa_migrate_preferred(p);
        }
+
+       if (task_node(p) != p->numa_preferred_nid)
+               numa_migrate_preferred(p);
 }
 
 static inline int get_numa_group(struct numa_group *grp)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] numa,sched: only consider less busy nodes as numa balancing destination

Reply via email to