Sleeping tasks have no utilization, so when they are woken up in a burst,
their zero utilization throws the scheduler out of balance, as seen with
the aim7 benchmark.

rq->avg_idle is 'used to accommodate bursty loads in a dirt simple
dirt cheap manner' -- Mike Galbraith.

With this cheap and smart burst indicator, we can detect a wakeup
burst and use nr_running as the instant utilization in that scenario.

For other scenarios, we still use the precise CPU utilization to
judge whether a domain is eligible for power scheduling.

Thanks to Mike Galbraith for the idea!

Signed-off-by: Alex Shi <alex....@intel.com>
---
 kernel/sched/fair.c | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 83b2c39..ae07190 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3371,12 +3371,19 @@ static unsigned int max_rq_util(int cpu)
  * Try to collect the task running number and capacity of the group.
  */
 static void get_sg_power_stats(struct sched_group *group,
-       struct sched_domain *sd, struct sg_lb_stats *sgs)
+       struct sched_domain *sd, struct sg_lb_stats *sgs, int burst)
 {
        int i;
 
-       for_each_cpu(i, sched_group_cpus(group))
-               sgs->group_util += max_rq_util(i);
+       for_each_cpu(i, sched_group_cpus(group)) {
+               struct rq *rq = cpu_rq(i);
+
+               if (burst && rq->nr_running > 1)
+                       /* use nr_running as instant utilization */
+                       sgs->group_util += rq->nr_running;
+               else
+                       sgs->group_util += max_rq_util(i);
+       }
 
        sgs->group_weight = group->group_weight;
 }
@@ -3390,6 +3397,8 @@ static int is_sd_full(struct sched_domain *sd,
        struct sched_group *group;
        struct sg_lb_stats sgs;
        long sd_min_delta = LONG_MAX;
+       int cpu = task_cpu(p);
+       int burst = 0;
        unsigned int putil;
 
        if (p->se.load.weight == p->se.avg.load_avg_contrib)
@@ -3399,15 +3408,21 @@ static int is_sd_full(struct sched_domain *sd,
                putil = (u64)(p->se.avg.runnable_avg_sum << SCHED_POWER_SHIFT)
                                / (p->se.avg.runnable_avg_period + 1);
 
+       if (cpu_rq(cpu)->avg_idle < sysctl_sched_burst_threshold)
+               burst = 1;
+
        /* Try to collect the domain's utilization */
        group = sd->groups;
        do {
                long g_delta;
 
                memset(&sgs, 0, sizeof(sgs));
-               get_sg_power_stats(group, sd, &sgs);
+               get_sg_power_stats(group, sd, &sgs, burst);
 
-               g_delta = sgs.group_weight * FULL_UTIL - sgs.group_util;
+               if (burst)
+                       g_delta = sgs.group_weight - sgs.group_util;
+               else
+                       g_delta = sgs.group_weight * FULL_UTIL - sgs.group_util;
 
                if (g_delta > 0 && g_delta < sd_min_delta) {
                        sd_min_delta = g_delta;
@@ -3417,8 +3432,12 @@ static int is_sd_full(struct sched_domain *sd,
                sds->sd_util += sgs.group_util;
        } while  (group = group->next, group != sd->groups);
 
-       if (sds->sd_util + putil < sd->span_weight * FULL_UTIL)
-               return 0;
+       if (burst) {
+               if (sds->sd_util < sd->span_weight)
+                       return 0;
+       } else
+               if (sds->sd_util + putil < sd->span_weight * FULL_UTIL)
+                       return 0;
 
        /* can not hold one more task in this domain */
        return 1;
-- 
1.7.12

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to