Commit-ID:  fa74ef9e42df04b671b77baf516012987da2034e
Gitweb:     http://git.kernel.org/tip/fa74ef9e42df04b671b77baf516012987da2034e
Author:     Peter Zijlstra <a.p.zijls...@chello.nl>
AuthorDate: Wed, 18 Jul 2012 22:06:47 +0200
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Thu, 27 Sep 2012 19:17:09 +0200
sched/numa: Implement per task memory placement for 'big' processes

Implement a per-task memory placement scheme for 'big' tasks (as per
the previous patch). It relies on a regular PROT_NONE 'migration' fault
to scan the memory space of the process and uses a two-stage migration
scheme to reduce the influence of unlikely usage relations.

It relies on the assumption that the compute part is tied to a
particular task and builds a task<->page relation set to model the
compute<->data relation. Probability says that the task faulting on a
page after we protect it is most likely the task that uses that page
most.

To decrease the likelihood of acting on a false relation, we only
migrate a page when two consecutive samples are from the same task.

I'm still not entirely convinced this scheme is sound; especially for
things like virtualization and n:m threading solutions, the
compute<->task relation is in general fundamentally untrue.

NOTES:

 - we don't actually sample the task, but the task's home-node as a
   migration target, so we're effectively building home-node<->page
   relations, not task<->page relations.

 - we migrate to the task's home-node, not the node the task is
   currently running on, since the home-node is the long-term target
   for the task to run on, irrespective of whatever node it might
   temporarily run on.

Suggested-by: Rik van Riel <r...@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijls...@chello.nl>
Cc: Paul Turner <p...@google.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Andrew Morton <a...@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-zeubc7ldf9w2d416e9fxn...@git.kernel.org
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
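[As an aside, the two-stage filter described above reduces to roughly
the following user-space sketch; fake_page and two_stage_target are
made-up names for illustration only, not kernel interfaces. The real
logic is the MPOL_F_HOME branch added to mpol_misplaced() below.]

#include <stdio.h>

struct fake_page {
	int nid_last;	/* home-node sampled at the previous fault */
};

/*
 * Return the migration target node, or -1 to leave the page alone.
 * The first sample from a new home-node only records it; only a
 * second consecutive sample from the same home-node migrates.
 */
static int two_stage_target(struct fake_page *page, int home_nid, int page_nid)
{
	if (page->nid_last != home_nid) {
		page->nid_last = home_nid;	/* stage one: record */
		return -1;
	}
	if (page_nid != home_nid)		/* stage two: act */
		return home_nid;
	return -1;				/* already placed right */
}

int main(void)
{
	struct fake_page page = { .nid_last = -1 };

	/* A task homed on node 1 faults twice on a node 0 page. */
	printf("%d\n", two_stage_target(&page, 1, 0));	/* -1: record only */
	printf("%d\n", two_stage_target(&page, 1, 0));	/*  1: migrate */
	return 0;
}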
 include/linux/mempolicy.h |    6 +++-
 include/linux/mm_types.h  |   15 +++++++++++++
 kernel/sched/fair.c       |   51 ++++++++++++++++++++++++++++++++++----------
 kernel/sched/features.h   |    1 +
 mm/huge_memory.c          |    3 +-
 mm/memory.c               |    2 +-
 mm/mempolicy.c            |   39 ++++++++++++++++++++++++++++++++-
 7 files changed, 99 insertions(+), 18 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 17cf9da..7f303d1 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -69,6 +69,7 @@ enum mpol_rebind_step {
 #define MPOL_F_LOCAL	(1 << 1)	/* preferred local allocation */
 #define MPOL_F_REBINDING (1 << 2)	/* identify policies in rebinding */
 #define MPOL_F_MOF	(1 << 3) /* this policy wants migrate on fault */
+#define MPOL_F_HOME	(1 << 4) /* this is the home-node policy */
 
 #ifdef __KERNEL__
 
@@ -263,7 +264,8 @@ static inline int vma_migratable(struct vm_area_struct *vma)
 	return 1;
 }
 
-extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+extern int mpol_misplaced(struct page *, struct vm_area_struct *,
+			  unsigned long, int);
 
 extern void lazy_migrate_process(struct mm_struct *mm);
 
@@ -393,7 +395,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,
 }
 
 static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
-				 unsigned long address)
+				 unsigned long address, int multi)
 {
 	return -1; /* no node preference */
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 930c006..f306cbb 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -176,6 +176,12 @@ struct page {
 	 */
 	void *shadow;
 #endif
+#ifdef CONFIG_SCHED_NUMA
+	/*
+	 * XXX fold this into flags for 64bit.. see next patch.
+	 */
+	int nid_last;
+#endif
 }
 /*
  * The struct page can be forced to be double word aligned so that atomic ops
@@ -411,6 +417,15 @@ struct mm_struct {
 	struct uprobes_state uprobes_state;
 };
 
+static inline bool mm_numa_big(struct mm_struct *mm)
+{
+#ifdef CONFIG_SCHED_NUMA
+	return mm->numa_big;
+#else
+	return false;
+#endif
+}
+
 static inline void mm_init_cpumask(struct mm_struct *mm)
 {
 #ifdef CONFIG_CPUMASK_OFFSTACK
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ea50ac..76a0920 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -786,6 +786,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Tasks start out with their home-node unset (-1) this effectively means
  * they act !NUMA until we've established the task is busy enough to bother
  * with placement.
+ *
+ * Once we start doing NUMA placement there are two modes: 'small' process-wide
+ * and 'big' per-task. For the small mode we have a process-wide home node
+ * and lazily migrate all memory only when this home-node changes.
+ *
+ * For big mode we keep a home-node per task and use periodic fault scans
+ * to try and establish a task<->page relation. This assumes the task<->page
+ * relation is a compute<->data relation; this is false for things like virt.
+ * and n:m threading solutions, but it's the best we can do given the
+ * information we have.
  */
 
 static unsigned long task_h_load(struct task_struct *p);
 
@@ -797,6 +807,7 @@ static void account_offnode_enqueue(struct rq *rq, struct task_struct *p)
 	rq->offnode_weight += p->numa_contrib;
 	rq->offnode_running++;
 }
+
 static void account_offnode_dequeue(struct rq *rq, struct task_struct *p)
 {
 	rq->offnode_weight -= p->numa_contrib;
@@ -822,6 +833,9 @@ static bool task_numa_big(struct task_struct *p)
 	u64 runtime = 0;
 	int weight = 0;
 
+	if (sched_feat(NUMA_FORCE_BIG))
+		return true;
+
 	rcu_read_lock();
 	t = p;
 	do {
@@ -875,6 +889,7 @@ void task_numa_work(struct callback_head *work)
 {
 	unsigned long migrate, next_scan, now = jiffies;
 	struct task_struct *p = current;
+	bool need_migration;
 	int big;
 
 	WARN_ON_ONCE(p != container_of(work, struct task_struct, rcu));
@@ -890,8 +905,19 @@ void task_numa_work(struct callback_head *work)
 	if (p->flags & PF_EXITING)
 		return;
 
+	big = p->mm->numa_big;
+	need_migration = need_numa_migration(p);
+
 	/*
-	 * Enforce maximal migration frequency..
+	 * Change per-task state before the process wide freq. throttle,
+	 * otherwise it might be a long while ere this task wins the
+	 * lottery and gets its home-node set.
+	 */
+	if (big && need_migration)
+		sched_setnode(p, p->node_curr);
+
+	/*
+	 * Enforce maximal scan/migration frequency..
 	 */
 	migrate = p->mm->numa_next_scan;
 	if (time_before(now, migrate))
@@ -901,19 +927,17 @@ void task_numa_work(struct callback_head *work)
 	if (cmpxchg(&p->mm->numa_next_scan, migrate, next_scan) != migrate)
 		return;
 
-	/*
-	 * If this task is too big, we bail on NUMA placement for the process.
-	 */
 	big = p->mm->numa_big = task_numa_big(p);
-	if (big || need_numa_migration(p)) {
-		int node = p->node_curr;
 
+	if (need_migration) {
 		if (big)
-			node = -1;
-		sched_setnode_process(p, node);
-		if (node != -1)
-			lazy_migrate_process(p->mm);
+			sched_setnode(p, p->node_curr);
+		else
+			sched_setnode_process(p, p->node_curr);
 	}
+
+	if (big || need_migration)
+		lazy_migrate_process(p->mm);
 }
 
 /*
@@ -928,9 +952,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 
 	/*
 	 * We don't care about NUMA placement if we don't have memory.
-	 * We also bail on placement if we're too big.
 	 */
-	if (!curr->mm || curr->mm->numa_big)
+	if (!curr->mm)
 		return;
 
 	/*
@@ -957,6 +980,10 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 		curr->node_last = curr->node_curr;
 		curr->node_curr = numa_node_id();
 
+		/*
+		 * We need to do expensive work to either migrate or
+		 * drive periodic state update or scanning for 'big' processes.
+		 */
 		if (need_numa_migration(curr) ||
 		    !time_before(jiffies, curr->mm->numa_next_scan)) {
 			/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index fa6a0ac..845b838 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -64,6 +64,7 @@ SCHED_FEAT(LB_MIN, false)
 
 #ifdef CONFIG_SCHED_NUMA
 SCHED_FEAT(NUMA,           true)
+SCHED_FEAT(NUMA_FORCE_BIG, false)
 SCHED_FEAT(NUMA_HOT,       true)
 SCHED_FEAT(NUMA_BIAS,      true)
 SCHED_FEAT(NUMA_PULL,      true)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a147d29..45b11f1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -791,7 +791,7 @@ void do_huge_pmd_prot_none(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * XXX should we serialize against split_huge_page ?
 	 */
 
-	node = mpol_misplaced(page, vma, haddr);
+	node = mpol_misplaced(page, vma, haddr, mm->numa_big);
 	if (node == -1)
 		goto do_fixup;
 
@@ -1377,6 +1377,7 @@ static void __split_huge_page_refcount(struct page *page)
 
 		page_tail->mapping = page->mapping;
 		page_tail->index = page->index + i;
+		page_tail->nid_last = page->nid_last;
 
 		BUG_ON(!PageAnon(page_tail));
 		BUG_ON(!PageUptodate(page_tail));
diff --git a/mm/memory.c b/mm/memory.c
index 965eeef..7dbdd85 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3448,7 +3448,7 @@ static void do_prot_none_numa(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * For NUMA systems we use the special PROT_NONE maps to drive
 	 * lazy page migration, see MPOL_MF_LAZY and related.
 	 */
-	node = mpol_misplaced(page, vma, address);
+	node = mpol_misplaced(page, vma, address, mm_numa_big(mm));
 	if (node != -1)
 		migrate_misplaced_page(mm, page, node);
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6c6490b..861190f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2174,6 +2174,7 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
  * @page   - page to be checked
  * @vma    - vm area where page mapped
  * @addr   - virtual address where page mapped
+ * @multi  - use multi-stage node binding
 *
 * Lookup current policy node id for vma,addr and "compare to" page's
 * node id.
@@ -2185,7 +2186,8 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 * Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 */
-int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+		   unsigned long addr, int multi)
 {
 	struct mempolicy *pol;
 	struct zone *zone;
@@ -2236,6 +2238,39 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 	default:
 		BUG();
 	}
+
+	/*
+	 * Multi-stage node selection is used in conjunction with a periodic
+	 * migration fault to build a temporal task<->page relation. By
+	 * using a two-stage filter we remove short/unlikely relations.
+	 *
+	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can
+	 * equate a task's usage of a particular page (n_p) per total usage
+	 * of this page (n_t) (in a given time-span) to a probability.
+	 *
+	 * Our periodic faults will then sample this probability and getting
+	 * the same result twice in a row, given these samples are fully
+	 * independent, is then given by P(p)^2, provided our sample period
+	 * is sufficiently short compared to the usage pattern.
+	 *
+	 * This quadratic squishes small probabilities, making it less likely
+	 * we act on an unlikely task<->page relation.
+	 *
+	 * NOTE: effectively we're using task-home-node<->page-node relations
+	 * since those are the only thing we can affect.
+	 *
+	 * NOTE: we're using task-home-node as opposed to the current node
+	 * the task might be running on, since the task-home-node is the
+	 * long-term node of this task, further reducing noise. Also see
+	 * task_tick_numa().
+	 */
+	if (multi && (pol->flags & MPOL_F_HOME)) {
+		if (page->nid_last != polnid) {
+			page->nid_last = polnid;
+			goto out;
+		}
+	}
+
 	if (curnid != polnid)
 		ret = polnid;
 out:
@@ -2427,7 +2462,7 @@ void __init numa_policy_init(void)
 		preferred_node_policy[nid] = (struct mempolicy) {
 			.refcnt = ATOMIC_INIT(1),
 			.mode = MPOL_PREFERRED,
-			.flags = MPOL_F_MOF,
+			.flags = MPOL_F_MOF | MPOL_F_HOME,
 			.v = { .preferred_node = nid, },
 		};
 	}
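[For reference, the small/big placement split that task_numa_work() ends
up with can be summarized by the sketch below; fake_task and place() are
illustrative stand-ins, not the kernel's types or functions.]

#include <stdbool.h>
#include <stdio.h>

struct fake_task {
	int node_curr;		/* node the task recently ran on */
	int home_node;		/* per-task home-node ('big' mode) */
	bool big;		/* process crossed the 'big' threshold */
	bool need_migration;	/* node_curr settled on a new node */
};

static void place(struct fake_task *p)
{
	if (!p->need_migration)
		return;

	if (p->big) {
		/* 'big': per-task home-node; pages follow via the
		 * two-stage fault filter. */
		p->home_node = p->node_curr;
		printf("task home-node -> %d\n", p->home_node);
	} else {
		/* 'small': one process-wide home-node; memory is
		 * migrated lazily when it changes. */
		printf("process home-node -> %d\n", p->node_curr);
	}
}

int main(void)
{
	struct fake_task small = { .node_curr = 2, .home_node = -1,
				   .big = false, .need_migration = true };
	struct fake_task big   = { .node_curr = 3, .home_node = -1,
				   .big = true,  .need_migration = true };

	place(&small);
	place(&big);
	return 0;
}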
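[And a quick numeric check of the 'quadratic squishes small
probabilities' argument in the mpol_misplaced() comment, assuming fully
independent samples:]

#include <stdio.h>

int main(void)
{
	double p;

	/* Chance of acting on a task<->page relation: a single sample
	 * (P) versus two consecutive matching samples (P^2). */
	for (p = 0.1; p < 1.0; p += 0.2)
		printf("P = %.1f -> P^2 = %.2f\n", p, p * p);
	return 0;
}

A 10% relation is acted on only 1% of the time, while a 90% relation is
still acted on 81% of the time, which is exactly the filtering behaviour
the two-stage scheme is after.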