On 18/03/14 17:56, Vincent Guittot wrote: > We replace the old way to configure the scheduler topology with a new method > which enables a platform to declare an additional level (if needed). > > We still have a default topology table definition that can be used by platforms > that don't want more levels than the SMT, MC, CPU and NUMA ones. This table can > be overwritten by an arch which wants to add a new level where a load balance > makes sense, like the BOOK or powergating level. > > For each level, we need a function pointer that returns a cpumask for each cpu, > a function pointer that returns the flags for the level, and a name. Only flags > that describe topology can be set by an architecture. The current topology > flags are: > SD_SHARE_CPUPOWER > SD_SHARE_PKG_RESOURCES > SD_NUMA > SD_ASYM_PACKING > > Then, each level must be a subset of the next one. The build sequence of the > sched_domain will take care of removing useless levels like those with 1 CPU > and those with the same CPU span and the same relevant information for load > balancing as their child. 
> > Signed-off-by: Vincent Guittot <vincent.guit...@linaro.org> > --- > arch/ia64/include/asm/topology.h | 24 ---- > arch/s390/include/asm/topology.h | 2 - > arch/tile/include/asm/topology.h | 33 ------ > include/linux/sched.h | 48 ++++++++ > include/linux/topology.h | 128 +++------------------ > kernel/sched/core.c | 235 > ++++++++++++++++++++------------------- > 6 files changed, 183 insertions(+), 287 deletions(-) > > diff --git a/arch/ia64/include/asm/topology.h > b/arch/ia64/include/asm/topology.h > index 5cb55a1..3202aa7 100644 > --- a/arch/ia64/include/asm/topology.h > +++ b/arch/ia64/include/asm/topology.h > @@ -46,30 +46,6 @@ > > void build_cpu_to_node_map(void); > > -#define SD_CPU_INIT (struct sched_domain) { \ > - .parent = NULL, \ > - .child = NULL, \ > - .groups = NULL, \ > - .min_interval = 1, \ > - .max_interval = 4, \ > - .busy_factor = 64, \ > - .imbalance_pct = 125, \ > - .cache_nice_tries = 2, \ > - .busy_idx = 2, \ > - .idle_idx = 1, \ > - .newidle_idx = 0, \ > - .wake_idx = 0, \ > - .forkexec_idx = 0, \ > - .flags = SD_LOAD_BALANCE \ > - | SD_BALANCE_NEWIDLE \ > - | SD_BALANCE_EXEC \ > - | SD_BALANCE_FORK \ > - | SD_WAKE_AFFINE, \ > - .last_balance = jiffies, \ > - .balance_interval = 1, \ > - .nr_balance_failed = 0, \ > -} > - > #endif /* CONFIG_NUMA */ > > #ifdef CONFIG_SMP > diff --git a/arch/s390/include/asm/topology.h > b/arch/s390/include/asm/topology.h > index 05425b1..07763bd 100644 > --- a/arch/s390/include/asm/topology.h > +++ b/arch/s390/include/asm/topology.h > @@ -64,8 +64,6 @@ static inline void s390_init_cpu_topology(void) > }; > #endif > > -#define SD_BOOK_INIT SD_CPU_INIT > - > #include <asm-generic/topology.h> > > #endif /* _ASM_S390_TOPOLOGY_H */ > diff --git a/arch/tile/include/asm/topology.h > b/arch/tile/include/asm/topology.h > index d15c0d8..9383118 100644 > --- a/arch/tile/include/asm/topology.h > +++ b/arch/tile/include/asm/topology.h > @@ -44,39 +44,6 @@ static inline const struct cpumask *cpumask_of_node(int > node) 
> /* For now, use numa node -1 for global allocation. */ > #define pcibus_to_node(bus) ((void)(bus), -1) > > -/* > - * TILE architecture has many cores integrated in one processor, so we need > - * setup bigger balance_interval for both CPU/NODE scheduling domains to > - * reduce process scheduling costs. > - */ > - > -/* sched_domains SD_CPU_INIT for TILE architecture */ > -#define SD_CPU_INIT (struct sched_domain) { \ > - .min_interval = 4, \ > - .max_interval = 128, \ > - .busy_factor = 64, \ > - .imbalance_pct = 125, \ > - .cache_nice_tries = 1, \ > - .busy_idx = 2, \ > - .idle_idx = 1, \ > - .newidle_idx = 0, \ > - .wake_idx = 0, \ > - .forkexec_idx = 0, \ > - \ > - .flags = 1*SD_LOAD_BALANCE \ > - | 1*SD_BALANCE_NEWIDLE \ > - | 1*SD_BALANCE_EXEC \ > - | 1*SD_BALANCE_FORK \ > - | 0*SD_BALANCE_WAKE \ > - | 0*SD_WAKE_AFFINE \ > - | 0*SD_SHARE_CPUPOWER \ > - | 0*SD_SHARE_PKG_RESOURCES \ > - | 0*SD_SERIALIZE \ > - , \ > - .last_balance = jiffies, \ > - .balance_interval = 32, \ > -} > - > /* By definition, we create nodes based on online memory. 
*/ > #define node_has_online_mem(nid) 1 > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 825ed83..4db592a 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -870,6 +870,20 @@ enum cpu_idle_type { > > extern int __weak arch_sd_sibiling_asym_packing(void); > > +#ifdef CONFIG_SCHED_SMT > +static inline const int cpu_smt_flags(void) > +{ > + return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; > +} > +#endif > + > +#ifdef CONFIG_SCHED_MC > +static inline const int cpu_core_flags(void) > +{ > + return SD_SHARE_PKG_RESOURCES; > +} > +#endif > + > struct sched_domain_attr { > int relax_domain_level; > }; > @@ -976,6 +990,38 @@ void free_sched_domains(cpumask_var_t doms[], unsigned > int ndoms); > > bool cpus_share_cache(int this_cpu, int that_cpu); > > +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); > +typedef const int (*sched_domain_flags_f)(void); > + > +#define SDTL_OVERLAP 0x01 > + > +struct sd_data { > + struct sched_domain **__percpu sd; > + struct sched_group **__percpu sg; > + struct sched_group_power **__percpu sgp; > +}; > + > +struct sched_domain_topology_level { > + sched_domain_mask_f mask; > + sched_domain_flags_f sd_flags; > + int flags; > + int numa_level; > + struct sd_data data; > +#ifdef CONFIG_SCHED_DEBUG > + char *name; > +#endif > +}; > + > +extern struct sched_domain_topology_level *sched_domain_topology; > + > +extern void set_sched_topology(struct sched_domain_topology_level *tl); > + > +#ifdef CONFIG_SCHED_DEBUG > +# define SD_INIT_NAME(type) .name = #type > +#else > +# define SD_INIT_NAME(type) > +#endif > + > #else /* CONFIG_SMP */ > > struct sched_domain_attr; > @@ -991,6 +1037,8 @@ static inline bool cpus_share_cache(int this_cpu, int > that_cpu) > return true; > } > > +static inline void set_sched_topology(struct sched_domain_topology_level > *tl) { } > + > #endif /* !CONFIG_SMP */ > > > diff --git a/include/linux/topology.h b/include/linux/topology.h > index 12ae6ce..3a9db05 100644 > 
--- a/include/linux/topology.h > +++ b/include/linux/topology.h > @@ -66,121 +66,6 @@ int arch_update_cpu_topology(void); > #define PENALTY_FOR_NODE_WITH_CPUS (1) > #endif > > -/* > - * Below are the 3 major initializers used in building sched_domains: > - * SD_SIBLING_INIT, for SMT domains > - * SD_CPU_INIT, for SMP domains > - * > - * Any architecture that cares to do any tuning to these values should do so > - * by defining their own arch-specific initializer in include/asm/topology.h. > - * A definition there will automagically override these default initializers > - * and allow arch-specific performance tuning of sched_domains. > - * (Only non-zero and non-null fields need be specified.) > - */ > - > -#ifdef CONFIG_SCHED_SMT > -/* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is, > - * so can't we drop this in favor of CONFIG_SCHED_SMT? > - */ > -#define ARCH_HAS_SCHED_WAKE_IDLE > -/* Common values for SMT siblings */ > -#ifndef SD_SIBLING_INIT > -#define SD_SIBLING_INIT (struct sched_domain) { > \ > - .min_interval = 1, \ > - .max_interval = 2, \ > - .busy_factor = 64, \ > - .imbalance_pct = 110, \ > - \ > - .flags = 1*SD_LOAD_BALANCE \ > - | 1*SD_BALANCE_NEWIDLE \ > - | 1*SD_BALANCE_EXEC \ > - | 1*SD_BALANCE_FORK \ > - | 0*SD_BALANCE_WAKE \ > - | 1*SD_WAKE_AFFINE \ > - | 1*SD_SHARE_CPUPOWER \ > - | 1*SD_SHARE_PKG_RESOURCES \ > - | 0*SD_SERIALIZE \ > - | 0*SD_PREFER_SIBLING \ > - | arch_sd_sibling_asym_packing() \ > - , \ > - .last_balance = jiffies, \ > - .balance_interval = 1, \ > - .smt_gain = 1178, /* 15% */ \ > - .max_newidle_lb_cost = 0, \ > - .next_decay_max_lb_cost = jiffies, \ > -} > -#endif > -#endif /* CONFIG_SCHED_SMT */ > - > -#ifdef CONFIG_SCHED_MC > -/* Common values for MC siblings. 
for now mostly derived from SD_CPU_INIT */ > -#ifndef SD_MC_INIT > -#define SD_MC_INIT (struct sched_domain) { \ > - .min_interval = 1, \ > - .max_interval = 4, \ > - .busy_factor = 64, \ > - .imbalance_pct = 125, \ > - .cache_nice_tries = 1, \ > - .busy_idx = 2, \ > - .wake_idx = 0, \ > - .forkexec_idx = 0, \ > - \ > - .flags = 1*SD_LOAD_BALANCE \ > - | 1*SD_BALANCE_NEWIDLE \ > - | 1*SD_BALANCE_EXEC \ > - | 1*SD_BALANCE_FORK \ > - | 0*SD_BALANCE_WAKE \ > - | 1*SD_WAKE_AFFINE \ > - | 0*SD_SHARE_CPUPOWER \ > - | 1*SD_SHARE_PKG_RESOURCES \ > - | 0*SD_SERIALIZE \ > - , \ > - .last_balance = jiffies, \ > - .balance_interval = 1, \ > - .max_newidle_lb_cost = 0, \ > - .next_decay_max_lb_cost = jiffies, \ > -} > -#endif > -#endif /* CONFIG_SCHED_MC */ > - > -/* Common values for CPUs */ > -#ifndef SD_CPU_INIT > -#define SD_CPU_INIT (struct sched_domain) { \ > - .min_interval = 1, \ > - .max_interval = 4, \ > - .busy_factor = 64, \ > - .imbalance_pct = 125, \ > - .cache_nice_tries = 1, \ > - .busy_idx = 2, \ > - .idle_idx = 1, \ > - .newidle_idx = 0, \ > - .wake_idx = 0, \ > - .forkexec_idx = 0, \ > - \ > - .flags = 1*SD_LOAD_BALANCE \ > - | 1*SD_BALANCE_NEWIDLE \ > - | 1*SD_BALANCE_EXEC \ > - | 1*SD_BALANCE_FORK \ > - | 0*SD_BALANCE_WAKE \ > - | 1*SD_WAKE_AFFINE \ > - | 0*SD_SHARE_CPUPOWER \ > - | 0*SD_SHARE_PKG_RESOURCES \ > - | 0*SD_SERIALIZE \ > - | 1*SD_PREFER_SIBLING \ > - , \ > - .last_balance = jiffies, \ > - .balance_interval = 1, \ > - .max_newidle_lb_cost = 0, \ > - .next_decay_max_lb_cost = jiffies, \ > -} > -#endif > - > -#ifdef CONFIG_SCHED_BOOK > -#ifndef SD_BOOK_INIT > -#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!! 
> -#endif > -#endif /* CONFIG_SCHED_BOOK */ > - > #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID > DECLARE_PER_CPU(int, numa_node); > > @@ -295,4 +180,17 @@ static inline int cpu_to_mem(int cpu) > #define topology_core_cpumask(cpu) cpumask_of(cpu) > #endif > > +#ifdef CONFIG_SCHED_SMT > +static inline const struct cpumask *cpu_smt_mask(int cpu) > +{ > + return topology_thread_cpumask(cpu); > +} > +#endif > + > +static inline const struct cpumask *cpu_cpu_mask(int cpu) > +{ > + return cpumask_of_node(cpu_to_node(cpu)); > +} > + > + > #endif /* _LINUX_TOPOLOGY_H */ > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index ae365aa..3397bcb 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -5603,17 +5603,6 @@ static int __init isolated_cpu_setup(char *str) > > __setup("isolcpus=", isolated_cpu_setup); > > -static const struct cpumask *cpu_cpu_mask(int cpu) > -{ > - return cpumask_of_node(cpu_to_node(cpu)); > -} > - > -struct sd_data { > - struct sched_domain **__percpu sd; > - struct sched_group **__percpu sg; > - struct sched_group_power **__percpu sgp; > -}; > - > struct s_data { > struct sched_domain ** __percpu sd; > struct root_domain *rd; > @@ -5626,21 +5615,6 @@ enum s_alloc { > sa_none, > }; > > -struct sched_domain_topology_level; > - > -typedef struct sched_domain *(*sched_domain_init_f)(struct > sched_domain_topology_level *tl, int cpu); > -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); > - > -#define SDTL_OVERLAP 0x01 > - > -struct sched_domain_topology_level { > - sched_domain_init_f init; > - sched_domain_mask_f mask; > - int flags; > - int numa_level; > - struct sd_data data; > -}; > - > /* > * Build an iteration mask that can exclude certain CPUs from the upwards > * domain traversal. 
> @@ -5869,34 +5843,6 @@ int __weak arch_sd_sibling_asym_packing(void) > * Non-inlined to reduce accumulated stack pressure in build_sched_domains() > */ > > -#ifdef CONFIG_SCHED_DEBUG > -# define SD_INIT_NAME(sd, type) sd->name = #type > -#else > -# define SD_INIT_NAME(sd, type) do { } while (0) > -#endif > - > -#define SD_INIT_FUNC(type) \ > -static noinline struct sched_domain * \ > -sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ > -{ \ > - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ > - *sd = SD_##type##_INIT; \ > - SD_INIT_NAME(sd, type); \ > - sd->private = &tl->data; \ > - return sd; \ > -} > - > -SD_INIT_FUNC(CPU) > -#ifdef CONFIG_SCHED_SMT > - SD_INIT_FUNC(SIBLING) > -#endif > -#ifdef CONFIG_SCHED_MC > - SD_INIT_FUNC(MC) > -#endif > -#ifdef CONFIG_SCHED_BOOK > - SD_INIT_FUNC(BOOK) > -#endif > - > static int default_relax_domain_level = -1; > int sched_domain_level_max; > > @@ -5984,97 +5930,156 @@ static void claim_allocations(int cpu, struct > sched_domain *sd) > *per_cpu_ptr(sdd->sgp, cpu) = NULL; > } > > -#ifdef CONFIG_SCHED_SMT > -static const struct cpumask *cpu_smt_mask(int cpu) > -{ > - return topology_thread_cpumask(cpu); > -} > -#endif > - > -/* > - * Topology list, bottom-up. 
> - */ > -static struct sched_domain_topology_level default_topology[] = { > -#ifdef CONFIG_SCHED_SMT > - { sd_init_SIBLING, cpu_smt_mask, }, > -#endif > -#ifdef CONFIG_SCHED_MC > - { sd_init_MC, cpu_coregroup_mask, }, > -#endif > -#ifdef CONFIG_SCHED_BOOK > - { sd_init_BOOK, cpu_book_mask, }, > -#endif > - { sd_init_CPU, cpu_cpu_mask, }, > - { NULL, }, > -}; > - > -static struct sched_domain_topology_level *sched_domain_topology = > default_topology; > - > -#define for_each_sd_topology(tl) \ > - for (tl = sched_domain_topology; tl->init; tl++) > - > #ifdef CONFIG_NUMA > - > static int sched_domains_numa_levels; > static int *sched_domains_numa_distance; > static struct cpumask ***sched_domains_numa_masks; > static int sched_domains_curr_level; > +#endif > > -static inline int sd_local_flags(int level) > -{ > - if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) > - return 0; > - > - return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; > -} > +/* > + * SD_flags allowed in topology descriptions. > + * > + * SD_SHARE_CPUPOWER - describes SMT topologies > + * SD_SHARE_PKG_RESOURCES - describes shared caches > + * SD_NUMA - describes NUMA topologies > + * > + * Odd one out: > + * SD_ASYM_PACKING - describes SMT quirks > + */ > +#define TOPOLOGY_SD_FLAGS \ > + (SD_SHARE_CPUPOWER | \ > + SD_SHARE_PKG_RESOURCES | \ > + SD_NUMA | \ > + SD_ASYM_PACKING) > > static struct sched_domain * > -sd_numa_init(struct sched_domain_topology_level *tl, int cpu) > +sd_init(struct sched_domain_topology_level *tl, int cpu) > { > struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); > - int level = tl->numa_level; > - int sd_weight = cpumask_weight( > - sched_domains_numa_masks[level][cpu_to_node(cpu)]); > + int sd_weight, sd_flags = 0; > + > +#ifdef CONFIG_NUMA > + /* > + * Ugly hack to pass state to sd_numa_mask()... 
> + */ > + sched_domains_curr_level = tl->numa_level; > +#endif > + > + sd_weight = cpumask_weight(tl->mask(cpu)); > + > + if (tl->sd_flags) > + sd_flags = (*tl->sd_flags)(); > + if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, > + "wrong sd_flags in topology description\n")) > + sd_flags &= ~TOPOLOGY_SD_FLAGS; > > *sd = (struct sched_domain){ > .min_interval = sd_weight, > .max_interval = 2*sd_weight, > .busy_factor = 32, > .imbalance_pct = 125, > - .cache_nice_tries = 2, > - .busy_idx = 3, > - .idle_idx = 2, > + > + .cache_nice_tries = 0, > + .busy_idx = 0, > + .idle_idx = 0, > .newidle_idx = 0, > .wake_idx = 0, > .forkexec_idx = 0, > > .flags = 1*SD_LOAD_BALANCE > | 1*SD_BALANCE_NEWIDLE > - | 0*SD_BALANCE_EXEC > - | 0*SD_BALANCE_FORK > + | 1*SD_BALANCE_EXEC > + | 1*SD_BALANCE_FORK > | 0*SD_BALANCE_WAKE > - | 0*SD_WAKE_AFFINE > + | 1*SD_WAKE_AFFINE > | 0*SD_SHARE_CPUPOWER > | 0*SD_SHARE_PKG_RESOURCES > - | 1*SD_SERIALIZE > + | 0*SD_SERIALIZE > | 0*SD_PREFER_SIBLING > - | 1*SD_NUMA > - | sd_local_flags(level) > + | 0*SD_NUMA > + | sd_flags > , > + > .last_balance = jiffies, > .balance_interval = sd_weight, > + .smt_gain = 0, > + .max_newidle_lb_cost = 0, > + .next_decay_max_lb_cost = jiffies, > +#ifdef CONFIG_SCHED_DEBUG > + .name = tl->name, > +#endif > }; > - SD_INIT_NAME(sd, NUMA); > - sd->private = &tl->data; > > /* > - * Ugly hack to pass state to sd_numa_mask()... > + * Convert topological properties into behaviour. 
> */ > - sched_domains_curr_level = tl->numa_level; > + > + if (sd->flags & SD_SHARE_CPUPOWER) { > + sd->imbalance_pct = 110; > + sd->smt_gain = 1178; /* ~15% */ > + sd->flags |= arch_sd_sibling_asym_packing(); > + > + } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { > + sd->imbalance_pct = 117; > + sd->cache_nice_tries = 1; > + sd->busy_idx = 2; > + > +#ifdef CONFIG_NUMA > + } else if (sd->flags & SD_NUMA) { > + sd->cache_nice_tries = 2; > + sd->busy_idx = 3; > + sd->idle_idx = 2; > + > + sd->flags |= SD_SERIALIZE; > + if (sched_domains_numa_distance[tl->numa_level] > > RECLAIM_DISTANCE) { > + sd->flags &= ~(SD_BALANCE_EXEC | > + SD_BALANCE_FORK | > + SD_WAKE_AFFINE); > + } > + > +#endif > + } else { > + sd->flags |= SD_PREFER_SIBLING; > + sd->cache_nice_tries = 1; > + sd->busy_idx = 2; > + sd->idle_idx = 1; > + }
This 'if ... else statement' is still a weak point from the perspective of making the code robust: On TC2 w/ the following change in cpu_corepower_mask() const struct cpumask *cpu_corepower_mask(int cpu) { - return &cpu_topology[cpu].thread_sibling; + return cpu_topology[cpu].socket_id ? &cpu_topology[cpu].thread_sibling : + &cpu_topology[cpu].core_sibling; } I get a sane set-up: root@linaro-developer:~# cat /proc/sys/kernel/sched_domain/cpu*/domain*/name GMC DIE GMC DIE MC DIE MC DIE MC DIE root@linaro-developer:~# cat /proc/sys/kernel/sched_domain/cpu*/domain*/flags 815 4143 815 4143 559 4143 559 4143 559 4143 w/ 815 (0x32F : SD_LOAD_BALANCE SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE *SD_SHARE_POWERDOMAIN* SD_SHARE_PKG_RESOURCES) w/ 559 (0x22F : SD_LOAD_BALANCE SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_SHARE_PKG_RESOURCES) But when I introduce the following error into the arch specific cpu_corepower_flags() function static inline const int cpu_corepower_flags(void) { - return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN; + return SD_SHARE_POWERDOMAIN; } the GMC related sd's for CPU0,1 are initialized as DIE in sd_init() resulting in this wrong set-up w/o any warning/error message: root@linaro-developer:~# cat /proc/sys/kernel/sched_domain/cpu*/domain*/name GMC DIE GMC DIE MC DIE MC DIE MC DIE root@linaro-developer:~# cat /proc/sys/kernel/sched_domain/cpu*/domain*/flags 4399 4143 4399 4143 559 4143 559 4143 559 4143 w/ 4399 (0x112f : SD_LOAD_BALANCE SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE *SD_SHARE_POWERDOMAIN* SD_PREFER_SIBLING Is there a way to check that MC and GMC have to have SD_SHARE_PKG_RESOURCES set so that this can't happen unnoticed? -- Dietmar > + > + sd->private = &tl->data; > > return sd; > } > > +/* > + * Topology list, bottom-up. 
> + */ > +static struct sched_domain_topology_level default_topology[] = { > +#ifdef CONFIG_SCHED_SMT > + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, > +#endif > +#ifdef CONFIG_SCHED_MC > + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, > +#endif > +#ifdef CONFIG_SCHED_BOOK > + { cpu_book_mask, SD_INIT_NAME(BOOK) }, > +#endif > + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, > + { NULL, }, > +}; > + > +struct sched_domain_topology_level *sched_domain_topology = default_topology; > + > +#define for_each_sd_topology(tl) \ > + for (tl = sched_domain_topology; tl->mask; tl++) > + > +void set_sched_topology(struct sched_domain_topology_level *tl) > +{ > + sched_domain_topology = tl; > +} > + > +#ifdef CONFIG_NUMA > + > static const struct cpumask *sd_numa_mask(int cpu) > { > return > sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; > @@ -6218,7 +6223,10 @@ static void sched_init_numa(void) > } > } > > - tl = kzalloc((ARRAY_SIZE(default_topology) + level) * > + /* Compute default topology size */ > + for (i = 0; sched_domain_topology[i].mask; i++); > + > + tl = kzalloc((i + level) * > sizeof(struct sched_domain_topology_level), > GFP_KERNEL); > if (!tl) > return; > @@ -6226,18 +6234,19 @@ static void sched_init_numa(void) > /* > * Copy the default topology bits.. > */ > - for (i = 0; default_topology[i].init; i++) > - tl[i] = default_topology[i]; > + for (i = 0; sched_domain_topology[i].mask; i++) > + tl[i] = sched_domain_topology[i]; > > /* > * .. and append 'j' levels of NUMA goodness. 
> */ > for (j = 0; j < level; i++, j++) { > tl[i] = (struct sched_domain_topology_level){ > - .init = sd_numa_init, > .mask = sd_numa_mask, > + .sd_flags = SD_NUMA, > .flags = SDTL_OVERLAP, > .numa_level = j, > + SD_INIT_NAME(NUMA) > }; > } > > @@ -6395,7 +6404,7 @@ struct sched_domain *build_sched_domain(struct > sched_domain_topology_level *tl, > const struct cpumask *cpu_map, struct sched_domain_attr *attr, > struct sched_domain *child, int cpu) > { > - struct sched_domain *sd = tl->init(tl, cpu); > + struct sched_domain *sd = sd_init(tl, cpu); > if (!sd) > return child; > > -- > 1.9.0 > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/