Re: [PATCH 25/49] mm: numa: Add fault driven placement and migration

2013-01-04 Thread Simon Jeons
On Fri, 2012-12-07 at 10:23 +, Mel Gorman wrote:
> From: Peter Zijlstra 
> 
> NOTE: This patch is based on "sched, numa, mm: Add fault driven
>   placement and migration policy" but as it throws away all the policy
>   to just leave a basic foundation I had to drop the signed-offs-by.
> 
> This patch creates a bare-bones method for setting PTEs pte_numa in the
> context of the scheduler that when faulted later will be faulted onto the
> node the CPU is running on.  In itself this does nothing useful but any
> placement policy will fundamentally depend on receiving hints on placement
> from fault context and doing something intelligent about it.
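
The idea is simple enough to model outside the kernel: every hinting fault
reports "node N, M pages", and whatever placement policy later sits on top
only has to aggregate those reports per task and chase the dominant node.
A compilable toy illustration of that aggregation (mine, not kernel code and
not part of this patch):

#include <stdio.h>

#define MAX_NODES 4

/* Per-task fault statistics, roughly what a placement policy could keep
 * once task_numa_fault() starts feeding it hints. */
static unsigned long faults[MAX_NODES];

/* Record a hinting fault: "pages" pages were touched on "node". */
static void toy_task_numa_fault(int node, int pages)
{
	faults[node] += pages;
}

/* Pick the node that has absorbed the most faults so far. */
static int toy_preferred_node(void)
{
	int node, best = 0;

	for (node = 1; node < MAX_NODES; node++)
		if (faults[node] > faults[best])
			best = node;
	return best;
}

int main(void)
{
	/* Pretend the task faulted twice on node 1 and once on node 0. */
	toy_task_numa_fault(1, 1);
	toy_task_numa_fault(0, 1);
	toy_task_numa_fault(1, 1);
	printf("preferred node: %d\n", toy_preferred_node());
	return 0;
}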
> 
> Signed-off-by: Mel Gorman 
> Acked-by: Rik van Riel 
> ---
>  arch/sh/mm/Kconfig   |1 +
>  arch/x86/Kconfig |2 +
>  include/linux/mm_types.h |   11 
>  include/linux/sched.h|   20 
>  kernel/sched/core.c  |   13 +
>  kernel/sched/fair.c  |  125 ++
>  kernel/sched/features.h  |7 +++
>  kernel/sched/sched.h |6 +++
>  kernel/sysctl.c  |   24 -
>  mm/huge_memory.c |5 +-
>  mm/memory.c  |   14 +-
>  11 files changed, 224 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
> index cb8f992..0f7c852 100644
> --- a/arch/sh/mm/Kconfig
> +++ b/arch/sh/mm/Kconfig
> @@ -111,6 +111,7 @@ config VSYSCALL
>  config NUMA
>   bool "Non Uniform Memory Access (NUMA) Support"
>   depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
> + select ARCH_WANT_NUMA_VARIABLE_LOCALITY
>   default n
>   help
> Some SH systems have many various memories scattered around
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 46c3bff..1137028 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -22,6 +22,8 @@ config X86
>   def_bool y
>   select HAVE_AOUT if X86_32
>   select HAVE_UNSTABLE_SCHED_CLOCK
> + select ARCH_SUPPORTS_NUMA_BALANCING
> + select ARCH_WANTS_PROT_NUMA_PROT_NONE
>   select HAVE_IDE
>   select HAVE_OPROFILE
>   select HAVE_PCSPKR_PLATFORM
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 31f8a3a..d82accb 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -398,6 +398,17 @@ struct mm_struct {
>  #ifdef CONFIG_CPUMASK_OFFSTACK
>   struct cpumask cpumask_allocation;
>  #endif
> +#ifdef CONFIG_BALANCE_NUMA
> + /*
> +  * numa_next_scan is the next time when the PTEs will me marked

s/me/be

> +  * pte_numa to gather statistics and migrate pages to new nodes
> +  * if necessary
> +  */
> + unsigned long numa_next_scan;
> +
> + /* numa_scan_seq prevents two threads setting pte_numa */
> + int numa_scan_seq;
> +#endif
>   struct uprobes_state uprobes_state;
>  };
>  
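
As I read these two fields, numa_next_scan is a rate limiter and
numa_scan_seq is what keeps several threads of the same mm from all marking
PTEs at once. A rough sketch of the expected consumer, written by me rather
than quoted from the fair.c hunk further down:

/*
 * Sketch only: how a periodic scanner could use mm->numa_next_scan.
 * The cmpxchg ensures that of all threads that find the scan due,
 * exactly one re-arms the timestamp and does the work.
 */
static void numa_scan_sketch(struct mm_struct *mm)
{
	unsigned long due, next_scan;

	due = mm->numa_next_scan;
	if (time_before(jiffies, due))
		return;			/* not due yet */

	next_scan = jiffies +
		msecs_to_jiffies(sysctl_balance_numa_scan_period_min);
	if (cmpxchg(&mm->numa_next_scan, due, next_scan) != due)
		return;			/* another thread won the race */

	/* ... walk the VMAs and mark PTEs pte_numa here ... */
}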
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0dd42a0..ac71181 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1479,6 +1479,14 @@ struct task_struct {
>   short il_next;
>   short pref_node_fork;
>  #endif
> +#ifdef CONFIG_BALANCE_NUMA
> + int numa_scan_seq;
> + int numa_migrate_seq;
> + unsigned int numa_scan_period;
> + u64 node_stamp; /* migration stamp  */
> + struct callback_head numa_work;
> +#endif /* CONFIG_BALANCE_NUMA */
> +
>   struct rcu_head rcu;
>  
>   /*
> @@ -1553,6 +1561,14 @@ struct task_struct {
>  /* Future-safe accessor for struct task_struct's cpus_allowed. */
>  #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
>  
> +#ifdef CONFIG_BALANCE_NUMA
> +extern void task_numa_fault(int node, int pages);
> +#else
> +static inline void task_numa_fault(int node, int pages)
> +{
> +}
> +#endif
> +
>  /*
>   * Priority of a process goes from 0..MAX_PRIO-1, valid RT
>   * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
> @@ -1990,6 +2006,10 @@ enum sched_tunable_scaling {
>  };
>  extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
>  
> +extern unsigned int sysctl_balance_numa_scan_period_min;
> +extern unsigned int sysctl_balance_numa_scan_period_max;
> +extern unsigned int sysctl_balance_numa_settle_count;
> +
>  #ifdef CONFIG_SCHED_DEBUG
>  extern unsigned int sysctl_sched_migration_cost;
>  extern unsigned int sysctl_sched_nr_migrate;
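
The empty !CONFIG_BALANCE_NUMA stub lets fault paths call task_numa_fault()
unconditionally without #ifdefs. My guess at the calling convention (a
hand-rolled sketch, not a hunk from mm/memory.c):

/* Sketch only: report one locally-handled NUMA hinting fault. */
static void report_numa_hint_sketch(void)
{
	int nid = numa_node_id();	/* node the faulting CPU is running on */

	task_numa_fault(nid, 1);	/* one base page worth of placement hint */
}

The three sysctl externs above presumably become the knobs for how
aggressively that scanning and hinting happens.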
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 2d8927f..81fa185 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
>  #ifdef CONFIG_PREEMPT_NOTIFIERS
>   INIT_HLIST_HEAD(&p->preempt_notifiers);
>  #endif
> +
> +#ifdef CONFIG_BALANCE_NUMA
> + if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
> + p->mm->numa_next_scan = jiffies;
> + p->mm->numa_scan_seq = 0;
> + }
> +
> + p->node_stamp = 0ULL;
> + p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
> + p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
> + p->numa_scan_period =
