Change the group_imb detection from the old 'load-spike' detector to an actual imbalance detector. We set it from the lower domain balance pass when it fails to create a balance in the presence of task affinities.
The advantage is that this should no longer generate the false positive group_imb conditions generated by transient load spikes from the normal balancing/bulk-wakeup etc. behaviour. While I haven't actually observed those they could happen. I'm not entirely happy with this patch; it somehow feels a little fragile. Nor does it solve the biggest issue I have with the group_imb code; it it still a fragile construct in that once we 'fixed' the imbalance we'll not detect the group_imb again and could end up re-creating it. That said, this patch does seem to preserve behaviour for the described degenerate case. In particular on my 2*6*2 wsm-ep: taskset -c 3-11 bash -c 'for ((i=0;i<9;i++)) do while :; do :; done & done' ends up with 9 spinners, each on their own CPU; whereas if you disable the group_imb code that typically doesn't happen (you'll get one pair sharing a CPU most of the time). Signed-off-by: Peter Zijlstra <pet...@infradead.org> --- kernel/sched/fair.c | 90 +++++++++++++++++---------------------------------- kernel/sched/sched.h | 1 2 files changed, 31 insertions(+), 60 deletions(-) --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3906,7 +3906,8 @@ static unsigned long __read_mostly max_l #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 -#define LBF_SOME_PINNED 0x04 +#define LBF_DST_PINNED 0x04 +#define LBF_SOME_PINNED 0x08 struct lb_env { struct sched_domain *sd; @@ -3997,6 +3998,8 @@ int can_migrate_task(struct task_struct schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + env->flags |= LBF_SOME_PINNED; + /* * Remember if this task can be migrated to any other cpu in * our sched_group. We may want to revisit it if we couldn't @@ -4005,13 +4008,13 @@ int can_migrate_task(struct task_struct * Also avoid computing new_dst_cpu if we have already computed * one in current iteration. */ - if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) + if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) return 0; /* Prevent to re-select dst_cpu via env's cpus */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { - env->flags |= LBF_SOME_PINNED; + env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; } @@ -4535,13 +4538,12 @@ fix_small_capacity(struct sched_domain * * cpu 3 and leave one of the cpus in the second group unused. * * The current solution to this issue is detecting the skew in the first group - * by noticing it has a cpu that is overloaded while the remaining cpus are - * idle -- or rather, there's a distinct imbalance in the cpus; see - * sg_imbalanced(). + * by noticing the lower domain failed to reach balance and had difficulty + * moving tasks due to affinity constraints. * * When this is so detected; this group becomes a candidate for busiest; see * update_sd_pick_busiest(). And calculcate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditional to allow it + * find_busiest_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. * * This is a somewhat tricky proposition since the next run might not find the @@ -4549,49 +4551,9 @@ fix_small_capacity(struct sched_domain * * subtle and fragile situation. */ -struct sg_imb_stats { - unsigned long max_nr_running, min_nr_running; - unsigned long max_cpu_load, min_cpu_load; -}; - -static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) +static inline int sg_imbalanced(struct sched_group *group) { - sgi->max_cpu_load = sgi->max_nr_running = 0UL; - sgi->min_cpu_load = sgi->min_nr_running = ~0UL; -} - -static inline void -update_sg_imb_stats(struct sg_imb_stats *sgi, - unsigned long load, unsigned long nr_running) -{ - if (load > sgi->max_cpu_load) - sgi->max_cpu_load = load; - if (sgi->min_cpu_load > load) - sgi->min_cpu_load = load; - - if (nr_running > sgi->max_nr_running) - sgi->max_nr_running = nr_running; - if (sgi->min_nr_running > nr_running) - sgi->min_nr_running = nr_running; -} - -static inline int -sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) -{ - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of a task. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? - */ - if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && - (sgi->max_nr_running - sgi->min_nr_running) > 1) - return 1; - - return 0; + return group->sgp->imbalance; } /** @@ -4606,25 +4568,20 @@ static inline void update_sg_lb_stats(st struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs) { - struct sg_imb_stats sgi; unsigned long nr_running; unsigned long load; int i; - init_sg_imb_stats(&sgi); - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); nr_running = rq->nr_running; /* Bias balancing toward cpus of our domain */ - if (local_group) { + if (local_group) load = target_load(i, load_idx); - } else { + else load = source_load(i, load_idx); - update_sg_imb_stats(&sgi, load, nr_running); - } sgs->group_load += load; sgs->sum_nr_running += nr_running; @@ -4644,7 +4601,7 @@ static inline void update_sg_lb_stats(st if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - sgs->group_imb = sg_imbalanced(sgs, &sgi); + sgs->group_imb = sg_imbalanced(group); sgs->group_capacity = DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); @@ -5167,6 +5124,7 @@ static int load_balance(int this_cpu, st int *should_balance) { int ld_moved, cur_ld_moved, active_balance = 0; + struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -5269,11 +5227,11 @@ static int load_balance(int this_cpu, st * moreover subsequent load balance cycles should correct the * excess load moved. */ - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; - env.flags &= ~LBF_SOME_PINNED; + env.flags &= ~LBF_DST_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; @@ -5287,6 +5245,18 @@ static int load_balance(int this_cpu, st goto more_balance; } + /* + * We failed to reach balance because of affinity. + */ + if (sd_parent) { + int *group_imbalance = &sd_parent->groups->sgp->imbalance; + + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + *group_imbalance = 1; + } else if (*group_imbalance) + *group_imbalance = 0; + } + /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); @@ -5690,7 +5660,7 @@ static void rebalance_domains(int cpu, e if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &should_balance)) { /* - * The LBF_SOME_PINNED logic could have changed + * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -605,6 +605,7 @@ struct sched_group_power { */ unsigned int power, power_orig; unsigned long next_update; + int imbalance; /* XXX unrelated to power but shared group state */ /* * Number of busy cpus in this group. */ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/