On Wed, 2011-08-24 at 15:26 +0200, Bjoern Boschman wrote:
> Hi,
> 
> I just wanted to ask if there's already some progress on this?
> I created a patch as suggested in bugzilla#13
> 
> Can anyone please review this patch?
> Would it make sense to create a patch that does the divide-by-zero
> workaround and also throws a WARN_OUT?

I think we should fix up *and* warn.  How about this:

From: Ben Hutchings <b...@decadent.org.uk>
Subject: sched: Work around sched_group::cpu_power = 0

#636797 and others report a division by zero in the scheduler due
to sched_group::cpu_power being 0.  Try to work out why this is
happening, and fix it up to something sane if it does happen.

Thanks to Bjoern Boschman <bjoern.bosch...@nfon.net> for part of this.

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a5387d5..7d10fbc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -919,6 +919,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
        return to_cpumask(sg->cpumask);
 }
 
+extern unsigned int sched_warn_zero_power(struct sched_group *group);
+
+static inline unsigned int sched_group_power(struct sched_group *group)
+{
+       unsigned int power = ACCESS_ONCE(group->cpu_power);
+
+       return likely(power > 0) ? power : sched_warn_zero_power(group);
+}
+
 enum sched_domain_level {
        SD_LV_NONE = 0,
        SD_LV_SIBLING,
diff --git a/kernel/sched.c b/kernel/sched.c
index 2829d09..93d147d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3804,6 +3804,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
        unsigned long weight = cpumask_weight(sched_domain_span(sd));
        unsigned long power = SCHED_LOAD_SCALE;
        struct sched_group *sdg = sd->groups;
+       unsigned long scale_rt;
 
        if (sched_feat(ARCH_POWER))
                power *= arch_scale_freq_power(sd, cpu);
@@ -3821,12 +3822,16 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
                power >>= SCHED_LOAD_SHIFT;
        }
 
-       power *= scale_rt_power(cpu);
+       scale_rt = scale_rt_power(cpu);
+       power *= scale_rt;
        power >>= SCHED_LOAD_SHIFT;
 
        if (!power)
                power = 1;
 
+       if (WARN_ON_ONCE((long)power <= 0))
+               printk(KERN_ERR "group %p cpu_power = %ld; scale_rt = %ld\n", sdg, power, scale_rt);
+
        cpu_rq(cpu)->cpu_power = power;
        sdg->cpu_power = power;
 }
@@ -3850,6 +3855,9 @@ static void update_group_power(struct sched_domain *sd, int cpu)
                group = group->next;
        } while (group != child->groups);
 
+       if (WARN_ON((long)power <= 0))
+               printk(KERN_ERR "cpu_power = %ld\n", power);
+
        sdg->cpu_power = power;
 }
 
@@ -3932,7 +3940,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
        }
 
        /* Adjust by relative CPU power of the group */
-       sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
+       sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) /
+               sched_group_power(group);
 
        /*
         * Consider the group unbalanced when the imbalance is larger
@@ -8104,7 +8113,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
                cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
 
-               printk(KERN_CONT " %s", str);
+               printk(KERN_CONT " group %p cpus %s", group, str);
                if (group->cpu_power != SCHED_LOAD_SCALE) {
                        printk(KERN_CONT " (cpu_power = %d)",
                                group->cpu_power);
@@ -11190,3 +11199,14 @@ void synchronize_sched_expedited(void)
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
 #endif /* #else #ifndef CONFIG_SMP */
+
+/* Fix up and warn about group with cpu_power = 0 */
+unsigned int sched_warn_zero_power(struct sched_group *group)
+{
+       static char str[256];
+
+       cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
+       WARN_ONCE(1, "group %p cpus %s cpu_power = 0", group, str);
+
+       return SCHED_LOAD_SCALE;
+}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d53c9c7..7119d8d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1354,7 +1354,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                }
 
                /* Adjust by relative CPU power of the group */
-               avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+               avg_load = (avg_load * SCHED_LOAD_SCALE) /
+                       sched_group_power(group);
 
                if (local_group) {
                        this_load = avg_load;
--- END ---
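
For anyone who wants to see the fix-up-and-warn idea outside the diff
context, here is a minimal userspace sketch of the pattern the
sched_group_power() helper uses: read the stored value, and if it is
bogus (zero) warn once and substitute a sane default instead of
letting a division by zero happen.  All names and constants below are
illustrative stand-ins, not the kernel's; the real code is in
include/linux/sched.h and kernel/sched.c in the patch above.

/* Illustrative sketch only; not the kernel implementation. */
#include <stdio.h>

#define DEMO_LOAD_SCALE 1024U   /* stand-in for SCHED_LOAD_SCALE */

struct demo_group {
	unsigned int cpu_power;     /* may erroneously be 0 */
};

/* Cold path: warn once and return a sane default. */
static unsigned int demo_warn_zero_power(struct demo_group *group)
{
	static int warned;

	if (!warned) {
		warned = 1;
		fprintf(stderr, "group %p cpu_power = 0, using %u\n",
			(void *)group, DEMO_LOAD_SCALE);
	}
	return DEMO_LOAD_SCALE;
}

/* Hot path: return the stored power unless it is zero. */
static unsigned int demo_group_power(struct demo_group *group)
{
	unsigned int power = group->cpu_power;

	return power > 0 ? power : demo_warn_zero_power(group);
}

int main(void)
{
	struct demo_group bad = { .cpu_power = 0 };
	unsigned long group_load = 4096;

	/* Dividing by bad.cpu_power directly would fault; with the
	 * helper we get one warning and a usable average load. */
	unsigned long avg_load =
		group_load * DEMO_LOAD_SCALE / demo_group_power(&bad);

	printf("avg_load = %lu\n", avg_load);
	return 0;
}

The design point is that the common case stays a single load and
compare, while the zero case is pushed into an out-of-line helper
that both warns and papers over the bad value until the root cause
is found.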
