On Mon, Feb 12, 2018 at 09:07:54AM +0100, Vincent Guittot wrote:
> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>  
>       if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>           !this_rq->rd->overload) {
> +             unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> +             unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
> +
>               rcu_read_lock();
>               sd = rcu_dereference_check_sched_domain(this_rq->sd);
>               if (sd)
>                       update_next_balance(sd, &next_balance);
>               rcu_read_unlock();
>  
> +             /*
> +              * Update blocked idle load if it has not been done for a
> +              * while. Try to do it locally before entering idle but kick a
> +              * ilb if it takes too much time and/or might delay next local
> +              * wake up
> +              */
> +             if (has_blocked && time_after_eq(jiffies, next_blocked) &&
> +                             (this_rq->avg_idle < sysctl_sched_migration_cost ||
> +                             !_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
> +                     kick_ilb(NOHZ_STATS_KICK);
> +
>               goto out;
>       }

So I really hate this one, and I also suspect it's broken, because we do
this check before dropping rq->lock and _nohz_idle_balance() will take
rq->lock.
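
Sketch of the chain I'm worried about (assuming _nohz_idle_balance() still
takes each remote rq's lock to update its clock, like the earlier versions
did):

	idle_balance(this_rq)			<- this_rq->lock held here
	  _nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)
	    for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
		rq = cpu_rq(balance_cpu);
		rq_lock_irqsave(rq, &rf);	<- second rq->lock while still
						   holding this_rq->lock
		...
	    }

Nesting rq locks like that without the double_rq_lock() ordering is a
recipe for deadlock.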

Aside from the above being an unreadable mess, I dislike that it breaks
the various isolation crud; we should not touch CPUs outside of our
domain.
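
For reference, kick_ilb() ends up in find_new_ilb(), which is (roughly,
from memory) just:

	static inline int find_new_ilb(void)
	{
		int ilb = cpumask_first(nohz.idle_cpus_mask);

		if (ilb < nr_cpu_ids && idle_cpu(ilb))
			return ilb;

		return nr_cpu_ids;
	}

That picks the first nohz-idle CPU system-wide, without any regard for
sched domains, so a stats kick can end up prodding CPUs that isolcpus and
friends deliberately keep out of our domains.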

Maybe something like the below? (unfinished)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7839,7 +7839,7 @@ group_type group_classify(struct sched_g
        return group_other;
 }
 
-static bool update_nohz_stats(struct rq *rq)
+static bool update_nohz_stats(struct rq *rq, bool force)
 {
 #ifdef CONFIG_NO_HZ_COMMON
        unsigned int cpu = rq->cpu;
@@ -7850,7 +7850,7 @@ static bool update_nohz_stats(struct rq
        if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
                return false;
 
-       if (!time_after(jiffies, rq->last_blocked_load_update_tick))
+       if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
                return true;
 
        update_blocked_averages(cpu);
@@ -7883,7 +7883,7 @@ static inline void update_sg_lb_stats(st
        for_each_cpu_and(i, sched_group_span(group), env->cpus) {
                struct rq *rq = cpu_rq(i);
 
-               if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
+               if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
                        env->flags |= LBF_NOHZ_AGAIN;
 
                /* Bias balancing toward cpus of our domain */
@@ -8857,6 +8857,31 @@ update_next_balance(struct sched_domain
                *next_balance = next;
 }
 
+static void nohz_age(struct sched_domain *sd)
+{
+       struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
+       bool has_blocked_load;
+       int cpu;
+
+       WRITE_ONCE(nohz.has_blocked, 0);
+
+       smp_mb();
+
+       cpumask_and(cpus, sched_domain_span(sd), nohz.idle_cpus_mask);
+
+       /* idle CPUs outside our span are not aged; their load stays blocked */
+       has_blocked_load = !cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(sd));
+
+       for_each_cpu(cpu, cpus) {
+               struct rq *rq = cpu_rq(cpu);
+
+               has_blocked_load |= update_nohz_stats(rq, true);
+       }
+
+       if (has_blocked_load)
+               WRITE_ONCE(nohz.has_blocked, 1);
+}
+
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
@@ -8868,6 +8891,7 @@ static int idle_balance(struct rq *this_
        struct sched_domain *sd;
        int pulled_task = 0;
        u64 curr_cost = 0;
+       bool nohz_blocked = false;
 
        /*
         * We must set idle_stamp _before_ calling idle_balance(), such that we
@@ -8889,8 +8913,8 @@ static int idle_balance(struct rq *this_
         */
        rq_unpin_lock(this_rq, rf);
 
-       if (this_rq->avg_idle < sysctl_sched_migration_cost ||
-           !this_rq->rd->overload) {
+       if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+short_idle:
                rcu_read_lock();
                sd = rcu_dereference_check_sched_domain(this_rq->sd);
                if (sd)
@@ -8900,6 +8924,18 @@ static int idle_balance(struct rq *this_
                goto out;
        }
 
+       if (!this_rq->rd->overload) {
+#ifdef CONFIG_NO_HZ_COMMON
+               unsigned int has_blocked = READ_ONCE(nohz.has_blocked);
+               unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
+
+               if (has_blocked && time_after_eq(jiffies, next_blocked))
+                       nohz_blocked = true;
+               else
+#endif
+               goto short_idle;
+       }
+
        raw_spin_unlock(&this_rq->lock);
 
        update_blocked_averages(this_cpu);
@@ -8919,9 +8955,13 @@ static int idle_balance(struct rq *this_
                if (sd->flags & SD_BALANCE_NEWIDLE) {
                        t0 = sched_clock_cpu(this_cpu);
 
-                       pulled_task = load_balance(this_cpu, this_rq,
-                                                  sd, CPU_NEWLY_IDLE,
-                                                  &continue_balancing);
+                       if (nohz_blocked) {
+                               nohz_age(sd);
+                       } else {
+                               pulled_task = load_balance(this_cpu, this_rq,
+                                               sd, CPU_NEWLY_IDLE,
+                                               &continue_balancing);
+                       }
 
                        domain_cost = sched_clock_cpu(this_cpu) - t0;
                        if (domain_cost > sd->max_newidle_lb_cost)
@@ -9497,8 +9537,7 @@ static bool nohz_idle_balance(struct rq
 
                rq = cpu_rq(balance_cpu);
 
-               update_blocked_averages(rq->cpu);
-               has_blocked_load |= rq->has_blocked_load;
+               has_blocked_load |= update_nohz_stats(rq, true);
 
                /*
                 * If time for next balance is due,

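The resulting newidle flow then becomes roughly (condensed, cost
accounting left out):

	if (this_rq->avg_idle < sysctl_sched_migration_cost)
		goto short_idle;		/* expected idle too short */

	if (!this_rq->rd->overload) {
		if (nohz.has_blocked && time_after_eq(jiffies, nohz.next_blocked))
			nohz_blocked = true;	/* age blocked load ourselves */
		else
			goto short_idle;
	}

	for_each_domain(this_cpu, sd) {
		if (nohz_blocked)
			nohz_age(sd);
		else
			pulled_task = load_balance(...);
	}

That way the aging happens under the regular newidle cost accounting
(sd->max_newidle_lb_cost) and only ever touches CPUs inside our own
domains.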