On Fri, 2015-07-03 at 08:40 +0200, Mike Galbraith wrote:

> Hm.  Seems what this load should like best is if we detect 1:N, skip all
> of the routine gyrations, ie move the N (workers) infrequently, expend
> search cycles frequently only on the 1 (dispatch).
> 
> Ponder..

Since it was too hot to do outside chores (any excuse will do;)...

If we're (read /me) on track, the below should help.  Per my tracing,
it may want a wee bit of toning down actually, though when I trace
virgin source I expect to see the same, namely Xorg and friends having
"wide-load" tattooed across their hindquarters earlier than they should.
It doesn't seem to hurt anything, but then demolishing a single-LLC box
is a tad more difficult than demolishing a NUMA box.
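
For the archives, the 1:N pattern in question looks roughly like the
sketch below (illustrative pthread code, names and sizes hypothetical,
not the actual Facebook load): one dispatcher hands work to N workers
round-robin, so the dispatcher flips wakees roughly N times faster than
any single worker does, which is exactly what the heuristic keys on.

/*
 * Hypothetical 1:N load: one dispatcher wakes N workers in turn, each
 * worker only ever wakes the dispatcher back, so the dispatcher's
 * wakee_flips grow roughly N times faster than any single worker's.
 */
#include <pthread.h>
#include <semaphore.h>

#define NR_WORKERS	8

static sem_t work[NR_WORKERS];	/* dispatcher -> worker wakeups */
static sem_t done;		/* worker -> dispatcher wakeups */

static void *worker(void *arg)
{
	sem_t *my_work = arg;

	for (;;) {
		sem_wait(my_work);	/* sleep until dispatched to */
		/* ... process one item ... */
		sem_post(&done);	/* always wake the same partner */
	}
	return NULL;
}

int main(void)
{
	pthread_t tid;
	int i;

	sem_init(&done, 0, 0);
	for (i = 0; i < NR_WORKERS; i++) {
		sem_init(&work[i], 0, 0);
		pthread_create(&tid, NULL, worker, &work[i]);
	}
	for (i = 0; ; i = (i + 1) % NR_WORKERS) {
		sem_post(&work[i]);	/* rotate through N distinct wakees */
		sem_wait(&done);	/* one completion per dispatch */
	}
}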


sched: beef up wake_wide()

Josef Bacik reported that Facebook sees better performance with their
1:N load (1 dispatch/node, N workers/node) when carrying an old patch
to try very hard to wake to an idle CPU.  While looking at wake_wide(),
I noticed that it doesn't pay attention to the wakeup of the 1:N waker,
returning 1 only when waking one of its N minions.

Correct that, and give the user the option to do an expensive balance IFF
select_idle_sibling() doesn't find an idle CPU, and IFF the wakee is the
1:N dispatcher of work, thus worth some extra effort.
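
To make the heuristic concrete: with llc_size = 8, a wakeup is treated
as 'wide' only if min(waker_flips, wakee_flips) >= 8 and the max is at
least 8 * the min.  A standalone rendition (wake_wide_demo() is a
hypothetical copy, same logic as the wake_wide() in the diff below):

/* Hypothetical standalone copy of the new wake_wide() decision. */
static int wake_wide_demo(unsigned long waker_flips,
			  unsigned long wakee_flips, int llc_size)
{
	int ret = 1;

	if (waker_flips < wakee_flips) {
		unsigned long tmp = waker_flips;

		/* We are waking the 1:N dispatcher itself. */
		waker_flips = wakee_flips;
		wakee_flips = tmp;
		ret = 2;	/* assuming WAKE_WIDE_BALANCE is enabled */
	}
	if (wakee_flips < llc_size || waker_flips < wakee_flips * llc_size)
		return 0;	/* not a 1:N load, stay affine */
	return ret;
}

E.g. with llc_size = 8, a dispatcher at 800 flips waking a worker at 10
flips returns 1 (wake the worker wide); the worker waking the dispatcher
returns 2 (wake wide, and balance if select_idle_sibling() comes up
empty).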

Not-Signed-off-by: Mike Galbraith <umgwanakikb...@gmail.com>
---
 kernel/sched/fair.c     |   89 +++++++++++++++++++++++++-----------------------
 kernel/sched/features.h |    6 +++
 2 files changed, 54 insertions(+), 41 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -666,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *c
 }
 
 #ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int cpu, void *clear);
 static unsigned long task_h_load(struct task_struct *p);
 
 static inline void __update_task_entity_contrib(struct sched_entity *se);
@@ -1375,7 +1375,7 @@ static void task_numa_compare(struct tas
         * Call select_idle_sibling to maybe find a better one.
         */
        if (!cur)
-               env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+               env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu, NULL);
 
 assign:
        task_numa_assign(env, cur, imp);
@@ -4730,26 +4730,30 @@ static long effective_load(struct task_g
 
 #endif
 
+/*
+ * Detect 1:N waker/wakee relationship via a switching-frequency heuristic.
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees.  In order
+ * to determine whether we should let the load spread vs consolidating to
+ * shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of llc_size higher frequency in the other.  With
+ * both conditions met, we can be relatively sure that we are seeing a 1:N
+ * relationship, and that load size exceeds socket size.
+ */
 static int wake_wide(struct task_struct *p)
 {
-       int factor = this_cpu_read(sd_llc_size);
-
-       /*
-        * Yeah, it's the switching-frequency, could means many wakee or
-        * rapidly switch, use factor here will just help to automatically
-        * adjust the loose-degree, so bigger node will lead to more pull.
-        */
-       if (p->wakee_flips > factor) {
-               /*
-                * wakee is somewhat hot, it needs certain amount of cpu
-                * resource, so if waker is far more hot, prefer to leave
-                * it alone.
-                */
-               if (current->wakee_flips > (factor * p->wakee_flips))
-                       return 1;
+       unsigned long waker_flips = current->wakee_flips;
+       unsigned long wakee_flips = p->wakee_flips;
+       int factor = this_cpu_read(sd_llc_size), ret = 1;
+
+       if (waker_flips < wakee_flips) {
+               swap(waker_flips, wakee_flips);
+               /* Tell the caller that we're waking a 1:N waker */
+               ret += sched_feat(WAKE_WIDE_BALANCE);
        }
-
-       return 0;
+       if (wakee_flips < factor || waker_flips < wakee_flips * factor)
+               return 0;
+       return ret;
 }
 
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
@@ -4761,13 +4765,6 @@ static int wake_affine(struct sched_doma
        unsigned long weight;
        int balanced;
 
-       /*
-        * If we wake multiple tasks be careful to not bounce
-        * ourselves around too much.
-        */
-       if (wake_wide(p))
-               return 0;
-
        idx       = sd->wake_idx;
        this_cpu  = smp_processor_id();
        prev_cpu  = task_cpu(p);
@@ -4935,20 +4932,22 @@ find_idlest_cpu(struct sched_group *grou
 /*
  * Try and locate an idle CPU in the sched_domain.
  */
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int target, void *clear)
 {
        struct sched_domain *sd;
        struct sched_group *sg;
        int i = task_cpu(p);
 
        if (idle_cpu(target))
-               return target;
+               goto done;
 
        /*
         * If the prevous cpu is cache affine and idle, don't be stupid.
         */
-       if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
-               return i;
+       if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) {
+               target = i;
+               goto done;
+       }
 
        /*
         * Otherwise, iterate the domains and find an elegible idle cpu.
@@ -4973,7 +4972,11 @@ static int select_idle_sibling(struct ta
                        sg = sg->next;
                } while (sg != sd->groups);
        }
+       return target;
 done:
+       if (clear)
+               *(void **)clear = 0;
+
        return target;
 }
 /*
@@ -5021,14 +5024,19 @@ select_task_rq_fair(struct task_struct *
 {
        struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
        int cpu = smp_processor_id();
-       int new_cpu = cpu;
-       int want_affine = 0;
+       int new_cpu = prev_cpu;
+       int want_affine = 0, want_balance = 0;
        int sync = wake_flags & WF_SYNC;
 
-       if (sd_flag & SD_BALANCE_WAKE)
-               want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
-
        rcu_read_lock();
+       if (sd_flag & SD_BALANCE_WAKE) {
+               want_affine = wake_wide(p);
+               want_balance = want_affine > 1;
+               want_affine = !want_affine && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+               if (!want_affine && !want_balance)
+                       goto select;
+       }
+
        for_each_domain(cpu, tmp) {
                if (!(tmp->flags & SD_LOAD_BALANCE))
                        continue;
@@ -5043,23 +5051,23 @@ select_task_rq_fair(struct task_struct *
                        break;
                }
 
-               if (tmp->flags & sd_flag)
+               if (tmp->flags & sd_flag || want_balance)
                        sd = tmp;
        }
 
        if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
-               prev_cpu = cpu;
+               new_cpu = cpu;
 
        if (sd_flag & SD_BALANCE_WAKE) {
-               new_cpu = select_idle_sibling(p, prev_cpu);
-               goto unlock;
+select:
+               new_cpu = select_idle_sibling(p, new_cpu, &sd);
        }
 
        while (sd) {
                struct sched_group *group;
                int weight;
 
-               if (!(sd->flags & sd_flag)) {
+               if (!(sd->flags & sd_flag) && !want_balance) {
                        sd = sd->child;
                        continue;
                }
@@ -5089,7 +5097,6 @@ select_task_rq_fair(struct task_struct *
                }
                /* while loop will break here if sd == NULL */
        }
-unlock:
        rcu_read_unlock();
 
        return new_cpu;
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -96,3 +96,9 @@ SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
  */
 SCHED_FEAT(NUMA_RESIST_LOWER, false)
 #endif
+
+/*
+ * Perform expensive full wake balance for 1:N wakers when the
+ * selected cpu is not completely idle.
+ */
+SCHED_FEAT(WAKE_WIDE_BALANCE, false)
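
On a CONFIG_SCHED_DEBUG kernel the new knob is togglable at runtime via
the usual sched_features interface, e.g.

	echo WAKE_WIDE_BALANCE > /sys/kernel/debug/sched_features

enables the expensive balance path, and echoing NO_WAKE_WIDE_BALANCE
turns it back off (matching the false default above).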

