NOTE: This is heavily based on "autonuma: CPU follows memory algorithm" and "autonuma: mm_autonuma and task_autonuma data structures"
At the most basic level, any placement policy is going to make some sort of
smart decision based on per-mm and per-task statistics. This patch simply
introduces the structures with basic fault statistics that can be expanded
upon or replaced later. It may be that a placement policy can approximate
without needing both structures, in which case the redundant one can be
safely deleted later while still having a comparison point to ensure the
approximation is accurate.

[dhi...@gmail.com: Use @pages parameter for fault statistics]
Signed-off-by: Mel Gorman <mgor...@suse.de>
---
 include/linux/mm_types.h |   26 ++++++++++++++++++++++++++
 include/linux/sched.h    |   18 ++++++++++++++++++
 kernel/fork.c            |   18 ++++++++++++++++++
 kernel/sched/core.c      |    3 +++
 kernel/sched/fair.c      |   25 ++++++++++++++++++++++++-
 kernel/sched/sched.h     |   14 ++++++++++++++
 6 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6b478ff..9588a91 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -312,6 +312,29 @@ struct mm_rss_stat {
 	atomic_long_t count[NR_MM_COUNTERS];
 };
 
+#ifdef CONFIG_BALANCE_NUMA
+/*
+ * Per-mm structure that contains the NUMA memory placement statistics
+ * generated by pte_numa faults.
+ */
+struct mm_balancenuma {
+	/*
+	 * Number of pages that will trigger NUMA faults for this mm. The
+	 * total decays each time the home node is re-evaluated so that
+	 * only recent events are tracked.
+	 */
+	unsigned long mm_numa_fault_tot;
+
+	/*
+	 * Number of pages that will trigger NUMA faults for each [nid].
+	 * Also decays.
+	 */
+	unsigned long mm_numa_fault[0];
+
+	/* do not add more variables here, the above array size is dynamic */
+};
+#endif /* CONFIG_BALANCE_NUMA */
+
 struct mm_struct {
 	struct vm_area_struct * mmap;	/* list of VMAs */
 	struct rb_root mm_rb;
@@ -415,6 +438,9 @@ struct mm_struct {
 
 	/* numa_scan_seq prevents two threads setting pte_numa */
 	int numa_scan_seq;
+
+	/* this is used by the scheduler and the page allocator */
+	struct mm_balancenuma *mm_balancenuma;
 #endif
 	struct uprobes_state uprobes_state;
 };
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1cccfc3..7b6625a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1188,6 +1188,23 @@ enum perf_event_task_context {
 	perf_nr_task_contexts,
 };
 
+#ifdef CONFIG_BALANCE_NUMA
+/*
+ * Per-task structure that contains the NUMA memory placement statistics
+ * generated by pte_numa faults. This structure is dynamically allocated
+ * when the first pte_numa fault is handled.
+ */
+struct task_balancenuma {
+	/* Total number of eligible pages that triggered NUMA faults */
+	unsigned long task_numa_fault_tot;
+
+	/* Number of pages that triggered NUMA faults for each [nid] */
+	unsigned long task_numa_fault[0];
+
+	/* do not add more variables here, the above array size is dynamic */
+};
+#endif /* CONFIG_BALANCE_NUMA */
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1488,6 +1505,7 @@ struct task_struct {
 	unsigned int numa_scan_period;
 	u64 node_stamp;		/* migration stamp */
 	struct callback_head numa_work;
+	struct task_balancenuma *task_balancenuma;
 #endif /* CONFIG_BALANCE_NUMA */
 
 	struct rcu_head rcu;
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b20ab7..c8752f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -525,6 +525,20 @@ static void mm_init_aio(struct mm_struct *mm)
 #endif
 }
 
+#ifdef CONFIG_BALANCE_NUMA
+static inline void free_mm_balancenuma(struct mm_struct *mm)
+{
+	if (mm->mm_balancenuma)
+		kfree(mm->mm_balancenuma);
+
+	mm->mm_balancenuma = NULL;
+}
+#else
+static inline void free_mm_balancenuma(struct mm_struct *mm)
+{
+}
+#endif
+
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 {
 	atomic_set(&mm->mm_users, 1);
@@ -539,6 +553,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	spin_lock_init(&mm->page_table_lock);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
+	mm->mm_balancenuma = NULL;
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
 
@@ -548,6 +563,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 		return mm;
 	}
 
+	free_mm_balancenuma(mm);
 	free_mm(mm);
 	return NULL;
 }
@@ -597,6 +613,7 @@ void __mmdrop(struct mm_struct *mm)
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
 	check_mm(mm);
+	free_mm_balancenuma(mm);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -854,6 +871,7 @@ fail_nocontext:
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
 	 */
+	free_mm_balancenuma(mm);
 	mm_free_pgd(mm);
 	free_mm(mm);
 	return NULL;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d9fc26..9472d5d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1543,6 +1543,7 @@ static void __sched_fork(struct task_struct *p)
 	p->node_stamp = 0ULL;
 	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
 	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+	p->task_balancenuma = NULL;
 	p->numa_scan_period = sysctl_balance_numa_scan_delay;
 	p->numa_work.next = &p->numa_work;
 #endif /* CONFIG_BALANCE_NUMA */
@@ -1787,6 +1788,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	if (mm)
 		mmdrop(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {
+		free_task_balancenuma(prev);
+
 		/*
 		 * Remove function-return probe instances associated with this
 		 * task and put them back on the free list.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 462de9b..fc8f95d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -855,7 +855,30 @@ void task_numa_fault(int node, int pages)
 {
 	struct task_struct *p = current;
 
-	/* FIXME: Allocate task-specific structure for placement policy here */
+	if (!p->task_balancenuma) {
+		int size = sizeof(struct task_balancenuma) +
+				(sizeof(unsigned long) * nr_node_ids);
+		p->task_balancenuma = kzalloc(size, GFP_KERNEL);
+		if (!p->task_balancenuma)
+			return;
+	}
+
+	if (!p->mm->mm_balancenuma) {
+		int size = sizeof(struct mm_balancenuma) +
+				(sizeof(unsigned long) * nr_node_ids);
+		p->mm->mm_balancenuma = kzalloc(size, GFP_KERNEL);
+		if (!p->mm->mm_balancenuma) {
+			kfree(p->task_balancenuma);
+			p->task_balancenuma = NULL;
+			return;
+		}
+	}
+
+	/* Record fault statistics */
+	p->task_balancenuma->task_numa_fault_tot += pages;
+	p->task_balancenuma->task_numa_fault[node] += pages;
+	p->mm->mm_balancenuma->mm_numa_fault_tot += pages;
+	p->mm->mm_balancenuma->mm_numa_fault[node] += pages;
 
 	/*
 	 * Assume that as faults occur that pages are getting properly placed
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3f0e5a1..92df3d4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -502,6 +502,20 @@ DECLARE_PER_CPU(struct rq, runqueues);
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 #define raw_rq()		(&__raw_get_cpu_var(runqueues))
 
+
+#ifdef CONFIG_BALANCE_NUMA
+static inline void free_task_balancenuma(struct task_struct *p)
+{
+	if (p->task_balancenuma)
+		kfree(p->task_balancenuma);
+	p->task_balancenuma = NULL;
+}
+#else
+static inline void free_task_balancenuma(struct task_struct *p)
+{
+}
+#endif /* CONFIG_BALANCE_NUMA */
+
 #ifdef CONFIG_SMP
 
 #define rcu_dereference_check_sched_domain(p) \
-- 
1.7.9.2
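For illustration only, and not part of the patch: a minimal sketch of how a
future placement policy could consume the per-task fault array recorded by
task_numa_fault() above. The helper name task_numa_preferred_node() and the
minimum-sample threshold of 16 faults are invented for this example; only
struct task_balancenuma and its fields come from the patch itself.

	/*
	 * Illustrative sketch: pick the node that has absorbed the largest
	 * share of recent NUMA hinting faults for @p, or return -1 if too
	 * few faults have been recorded to make a meaningful decision.
	 */
	static int task_numa_preferred_node(struct task_struct *p)
	{
		struct task_balancenuma *tb = p->task_balancenuma;
		unsigned long best_faults = 0;
		int nid, best_nid = -1;

		/* No statistics yet, or not enough samples */
		if (!tb || tb->task_numa_fault_tot < 16)
			return -1;

		/* Scan the per-node counters for the busiest node */
		for_each_online_node(nid) {
			if (tb->task_numa_fault[nid] > best_faults) {
				best_faults = tb->task_numa_fault[nid];
				best_nid = nid;
			}
		}

		return best_nid;
	}

The mm-wide counters could be walked the same way; comparing the per-task and
per-mm answers is one way to check whether an approximation using only one of
the two structures, as mentioned in the changelog, would be accurate enough.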