On Fri, 2012-12-07 at 10:23 +0000, Mel Gorman wrote:
> From: Peter Zijlstra
>
> NOTE: This patch is based on "sched, numa, mm: Add fault driven
> placement and migration policy" but as it throws away all the policy
> to just leave a basic foundation I had to drop the signed-offs-by.
>
> This patch creates a bare-bones method for setting PTEs pte_numa in the
> context of the scheduler that when faulted later will be faulted onto the
> node the CPU is running on. In itself this does nothing useful but any
> placement policy will fundamentally depend on receiving hints on placement
> from fault context and doing something intelligent about it.
>
> Signed-off-by: Mel Gorman
> Acked-by: Rik van Riel
> ---
> arch/sh/mm/Kconfig |1 +
> arch/x86/Kconfig |2 +
> include/linux/mm_types.h | 11
> include/linux/sched.h| 20
> kernel/sched/core.c | 13 +
> kernel/sched/fair.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++
> kernel/sched/features.h |7 +++
> kernel/sched/sched.h |6 +++
> kernel/sysctl.c | 24 -
> mm/huge_memory.c |5 +-
> mm/memory.c | 14 +-
> 11 files changed, 224 insertions(+), 4 deletions(-)
>
> diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
> index cb8f992..0f7c852 100644
> --- a/arch/sh/mm/Kconfig
> +++ b/arch/sh/mm/Kconfig
> @@ -111,6 +111,7 @@ config VSYSCALL
> config NUMA
> bool "Non Uniform Memory Access (NUMA) Support"
> depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
> + select ARCH_WANT_NUMA_VARIABLE_LOCALITY
> default n
> help
> Some SH systems have many various memories scattered around
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 46c3bff..1137028 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -22,6 +22,8 @@ config X86
> def_bool y
> select HAVE_AOUT if X86_32
> select HAVE_UNSTABLE_SCHED_CLOCK
> + select ARCH_SUPPORTS_NUMA_BALANCING
> + select ARCH_WANTS_PROT_NUMA_PROT_NONE
> select HAVE_IDE
> select HAVE_OPROFILE
> select HAVE_PCSPKR_PLATFORM
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 31f8a3a..d82accb 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -398,6 +398,17 @@ struct mm_struct {
> #ifdef CONFIG_CPUMASK_OFFSTACK
> struct cpumask cpumask_allocation;
> #endif
> +#ifdef CONFIG_BALANCE_NUMA
> + /*
> + * numa_next_scan is the next time when the PTEs will me marked
s/me/be/
> + * pte_numa to gather statistics and migrate pages to new nodes
> + * if necessary
> + */
> + unsigned long numa_next_scan;
> +
> + /* numa_scan_seq prevents two threads setting pte_numa */
> + int numa_scan_seq;
> +#endif
> struct uprobes_state uprobes_state;
> };
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0dd42a0..ac71181 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1479,6 +1479,14 @@ struct task_struct {
> short il_next;
> short pref_node_fork;
> #endif
> +#ifdef CONFIG_BALANCE_NUMA
> + int numa_scan_seq;
> + int numa_migrate_seq;
> + unsigned int numa_scan_period;
> + u64 node_stamp; /* migration stamp */
> + struct callback_head numa_work;
> +#endif /* CONFIG_BALANCE_NUMA */
> +
> struct rcu_head rcu;
>
> /*
> @@ -1553,6 +1561,14 @@ struct task_struct {
> /* Future-safe accessor for struct task_struct's cpus_allowed. */
> #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
>
> +#ifdef CONFIG_BALANCE_NUMA
> +extern void task_numa_fault(int node, int pages);
> +#else
> +static inline void task_numa_fault(int node, int pages)
> +{
> +}
> +#endif
> +
> /*
> * Priority of a process goes from 0..MAX_PRIO-1, valid RT
> * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
> @@ -1990,6 +2006,10 @@ enum sched_tunable_scaling {
> };
> extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
>
> +extern unsigned int sysctl_balance_numa_scan_period_min;
> +extern unsigned int sysctl_balance_numa_scan_period_max;
> +extern unsigned int sysctl_balance_numa_settle_count;
> +
> #ifdef CONFIG_SCHED_DEBUG
> extern unsigned int sysctl_sched_migration_cost;
> extern unsigned int sysctl_sched_nr_migrate;
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 2d8927f..81fa185 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
> #ifdef CONFIG_PREEMPT_NOTIFIERS
> INIT_HLIST_HEAD(&p->preempt_notifiers);
> #endif
> +
> +#ifdef CONFIG_BALANCE_NUMA
> + if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
> + p->mm->numa_next_scan = jiffies;
> + p->mm->numa_scan_seq = 0;
> + }
> +
> + p->node_stamp = 0ULL;
> + p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
> +