NOTE: This is heavily based on "autonuma: CPU follows memory algorithm"
        and "autonuma: mm_autonuma and task_autonuma data structures"

At the most basic level, any placement policy is going to make some
sort of smart decision based on per-mm and per-task statistics. This
patch simply introduces the structures with basic fault statistics
that can be expanded upon or replaced later. It may be that a
placement policy can approximate well enough without needing both
structures, in which case the redundant one can be safely deleted
later while still having a comparison point to ensure the
approximation is accurate.
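
As a hypothetical illustration (not part of this patch), a placement
policy built on top of these statistics might select a preferred node
along the following lines; task_numa_preferred_node() is an invented
helper, while task_balancenuma and nr_node_ids are as used in the
diff below:

	/*
	 * Sketch only: pick the node that absorbed the most recent
	 * NUMA hinting faults for a task, or -1 if no statistics
	 * have been recorded yet.
	 */
	static int task_numa_preferred_node(struct task_struct *p)
	{
		struct task_balancenuma *tb = p->task_balancenuma;
		unsigned long max_faults = 0;
		int nid, preferred = -1;

		if (!tb || !tb->task_numa_fault_tot)
			return preferred;

		for (nid = 0; nid < nr_node_ids; nid++) {
			if (tb->task_numa_fault[nid] > max_faults) {
				max_faults = tb->task_numa_fault[nid];
				preferred = nid;
			}
		}

		return preferred;
	}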

[dhi...@gmail.com: Use @pages parameter for fault statistics]
Signed-off-by: Mel Gorman <mgor...@suse.de>
---
 include/linux/mm_types.h |   26 ++++++++++++++++++++++++++
 include/linux/sched.h    |   18 ++++++++++++++++++
 kernel/fork.c            |   18 ++++++++++++++++++
 kernel/sched/core.c      |    3 +++
 kernel/sched/fair.c      |   25 ++++++++++++++++++++++++-
 kernel/sched/sched.h     |   14 ++++++++++++++
 6 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6b478ff..9588a91 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -312,6 +312,29 @@ struct mm_rss_stat {
        atomic_long_t count[NR_MM_COUNTERS];
 };
 
+#ifdef CONFIG_BALANCE_NUMA
+/*
+ * Per-mm structure that contains the NUMA memory placement statistics
+ * generated by pte_numa faults.
+ */
+struct mm_balancenuma {
+       /*
+        * Number of pages that will trigger NUMA faults for this mm. The
+        * total decays each time the home node is reconsidered so that
+        * only recent events are tracked
+        */
+       unsigned long mm_numa_fault_tot;
+
+       /*
+        * Number of pages that will trigger NUMA faults for each [nid].
+        * Also decays.
+        */
+       unsigned long mm_numa_fault[0];
+
+       /* do not add more variables here, the above array size is dynamic */
+};
+#endif /* CONFIG_BALANCE_NUMA */
+
 struct mm_struct {
        struct vm_area_struct * mmap;           /* list of VMAs */
        struct rb_root mm_rb;
@@ -415,6 +438,9 @@ struct mm_struct {
 
        /* numa_scan_seq prevents two threads setting pte_numa */
        int numa_scan_seq;
+
+       /* this is used by the scheduler and the page allocator */
+       struct mm_balancenuma *mm_balancenuma;
 #endif
        struct uprobes_state uprobes_state;
 };
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1cccfc3..7b6625a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1188,6 +1188,23 @@ enum perf_event_task_context {
        perf_nr_task_contexts,
 };
 
+#ifdef CONFIG_BALANCE_NUMA
+/*
+ * Per-task structure that contains the NUMA memory placement statistics
+ * generated by pte_numa faults. This structure is dynamically allocated
+ * when the first pte_numa fault is handled.
+ */
+struct task_balancenuma {
+       /* Total number of eligible pages that triggered NUMA faults */
+       unsigned long task_numa_fault_tot;
+
+       /* Number of pages that triggered NUMA faults for each [nid] */
+       unsigned long task_numa_fault[0];
+
+       /* do not add more variables here, the above array size is dynamic */
+};
+#endif /* CONFIG_BALANCE_NUMA */
+
 struct task_struct {
        volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
        void *stack;
@@ -1488,6 +1505,7 @@ struct task_struct {
        unsigned int numa_scan_period;
        u64 node_stamp;                 /* migration stamp  */
        struct callback_head numa_work;
+       struct task_balancenuma *task_balancenuma;
 #endif /* CONFIG_BALANCE_NUMA */
 
        struct rcu_head rcu;
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b20ab7..c8752f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -525,6 +525,20 @@ static void mm_init_aio(struct mm_struct *mm)
 #endif
 }
 
+#ifdef CONFIG_BALANCE_NUMA
+static inline void free_mm_balancenuma(struct mm_struct *mm)
+{
+       /* kfree() handles NULL, so no check is needed */
+       kfree(mm->mm_balancenuma);
+
+       mm->mm_balancenuma = NULL;
+}
+#else
+static inline void free_mm_balancenuma(struct mm_struct *mm)
+{
+}
+#endif
+
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 {
        atomic_set(&mm->mm_users, 1);
@@ -539,6 +553,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
        spin_lock_init(&mm->page_table_lock);
        mm->free_area_cache = TASK_UNMAPPED_BASE;
        mm->cached_hole_size = ~0UL;
+       mm->mm_balancenuma = NULL;
        mm_init_aio(mm);
        mm_init_owner(mm, p);
 
@@ -548,6 +563,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
                return mm;
        }
 
+       free_mm_balancenuma(mm);
        free_mm(mm);
        return NULL;
 }
@@ -597,6 +613,7 @@ void __mmdrop(struct mm_struct *mm)
        destroy_context(mm);
        mmu_notifier_mm_destroy(mm);
        check_mm(mm);
+       free_mm_balancenuma(mm);
        free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -854,6 +871,7 @@ fail_nocontext:
         * If init_new_context() failed, we cannot use mmput() to free the mm
         * because it calls destroy_context()
         */
+       free_mm_balancenuma(mm);
        mm_free_pgd(mm);
        free_mm(mm);
        return NULL;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d9fc26..9472d5d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1543,6 +1543,7 @@ static void __sched_fork(struct task_struct *p)
        p->node_stamp = 0ULL;
        p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
        p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+       p->task_balancenuma = NULL;
        p->numa_scan_period = sysctl_balance_numa_scan_delay;
        p->numa_work.next = &p->numa_work;
 #endif /* CONFIG_BALANCE_NUMA */
@@ -1787,6 +1788,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        if (mm)
                mmdrop(mm);
        if (unlikely(prev_state == TASK_DEAD)) {
+               free_task_balancenuma(prev);
+
                /*
                 * Remove function-return probe instances associated with this
                 * task and put them back on the free list.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 462de9b..fc8f95d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -855,7 +855,30 @@ void task_numa_fault(int node, int pages)
 {
        struct task_struct *p = current;
 
-       /* FIXME: Allocate task-specific structure for placement policy here */
+       if (!p->task_balancenuma) {
+               int size = sizeof(struct task_balancenuma) +
+                               (sizeof(unsigned long) * nr_node_ids);
+               p->task_balancenuma = kzalloc(size, GFP_KERNEL);
+               if (!p->task_balancenuma)
+                       return;
+       }
+
+       if (!p->mm->mm_balancenuma) {
+               int size = sizeof(struct mm_balancenuma) +
+                               (sizeof(unsigned long) * nr_node_ids);
+               p->mm->mm_balancenuma = kzalloc(size, GFP_KERNEL);
+               if (!p->mm->mm_balancenuma) {
+                       kfree(p->task_balancenuma);
+                       p->task_balancenuma = NULL;
+                       return;
+               }
+       }
+
+       /* Record fault statistics */
+       p->task_balancenuma->task_numa_fault_tot        += pages;
+       p->task_balancenuma->task_numa_fault[node]      += pages;
+       p->mm->mm_balancenuma->mm_numa_fault_tot        += pages;
+       p->mm->mm_balancenuma->mm_numa_fault[node]      += pages;
 
        /*
         * Assume that as faults occur that pages are getting properly placed
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3f0e5a1..92df3d4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -502,6 +502,20 @@ DECLARE_PER_CPU(struct rq, runqueues);
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
 #define raw_rq()               (&__raw_get_cpu_var(runqueues))
 
+
+#ifdef CONFIG_BALANCE_NUMA
+static inline void free_task_balancenuma(struct task_struct *p)
+{
+       /* kfree() handles NULL, so no check is needed */
+       kfree(p->task_balancenuma);
+       p->task_balancenuma = NULL;
+}
+#else
+static inline void free_task_balancenuma(struct task_struct *p)
+{
+}
+#endif /* CONFIG_BALANCE_NUMA */
+
 #ifdef CONFIG_SMP
 
 #define rcu_dereference_check_sched_domain(p) \
-- 
1.7.9.2
