On Mon, Apr 11, 2016 at 06:54:21AM +0200, Mike Galbraith wrote:
> On Sun, 2016-04-10 at 15:55 -0400, Chris Mason wrote:
> > On Sun, Apr 10, 2016 at 12:04:21PM +0200, Mike Galbraith wrote:
> > > On Sat, 2016-04-09 at 15:05 -0400, Chris Mason wrote:
> > > 
> > > > This does preserve the existing logic to prefer idle cores over idle
> > > > CPU threads, and includes some tests to try and avoid the idle scan
> > > > when we're actually better off sharing a non-idle CPU with someone else.
> > > 
> > > My box says the "oh nevermind" checks aren't selective enough, tbench
> > > dropped 4% at clients=cores, and 2% at clients=threads.
> > 
> > Ok, I was able to reproduce this by stuffing tbench_srv and tbench onto
> > just socket 0.  Version 2 below fixes things for me, but I'm hoping
> > someone can suggest a way to get task_hot() buddy checks without the rq
> > lock.
> > 
> > I haven't run this on production loads yet, but our 4.0 patch for this
> > uses task_hot(), so I'd expect it to be on par.  If this doesn't fix it
> > for you, I'll dig up a similar machine on Monday.
> 
> My box stopped caring.  I personally would be reluctant to apply it
> without a "you asked for it" button or a large pile of benchmark
> results.  Lock banging or not, full scan existing makes me nervous.


We can use a bitmap at the socket level to keep track of which cpus are
idle.  I'm sure there are better places for the array and better ways to
allocate it; this is just a rough cut to make sure the idle tracking works.

-chris

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a..1c3b5e4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1055,6 +1055,8 @@ struct sched_domain {
        unsigned int balance_interval;  /* initialise to 1. units in ms. */
        unsigned int nr_balance_failed; /* initialise to 0 */
 
+       cpumask_var_t idle_cpus_mask;
+
        /* idle_balance() stats */
        u64 max_newidle_lb_cost;
        unsigned long next_decay_max_lb_cost;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41f6b22..237d645 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3204,6 +3204,7 @@ again:
 static void __sched notrace __schedule(bool preempt)
 {
        struct task_struct *prev, *next;
+       struct sched_domain *package_sd;
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;
@@ -3270,11 +3270,19 @@ static void __sched notrace __schedule(bool preempt)
                update_rq_clock(rq);
 
        next = pick_next_task(rq, prev);
+
        clear_tsk_need_resched(prev);
        clear_preempt_need_resched();
        rq->clock_skip_update = 0;
 
        if (likely(prev != next)) {
+               package_sd = rcu_dereference(per_cpu(sd_llc, cpu));
+               if (package_sd) {
+                       if (prev->policy == SCHED_IDLE && next->policy != SCHED_IDLE)
+                               cpumask_clear_cpu(cpu, package_sd->idle_cpus_mask);
+                       else if (next->policy == SCHED_IDLE)
+                               cpumask_set_cpu(cpu, package_sd->idle_cpus_mask);
+               }
                rq->nr_switches++;
                rq->curr = next;
                ++*switch_count;
@@ -6599,7 +6607,6 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
                sd->imbalance_pct = 117;
                sd->cache_nice_tries = 1;
                sd->busy_idx = 2;
-
 #ifdef CONFIG_NUMA
        } else if (sd->flags & SD_NUMA) {
                sd->cache_nice_tries = 2;
@@ -7041,6 +7048,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                return child;
 
        cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+       zalloc_cpumask_var(&sd->idle_cpus_mask, GFP_NOWAIT);
+       cpumask_and(sd->idle_cpus_mask, cpu_map, tl->mask(cpu));
        if (child) {
                sd->level = child->level + 1;
                sched_domain_level_max = max(sched_domain_level_max, sd->level);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0c76505..cae6bd7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5026,7 +5026,7 @@ next:
         * the package.
         */
        if (package_sd && should_scan_idle(p, target)) {
-               for_each_cpu_and(i, sched_domain_span(package_sd),
+               for_each_cpu_and(i, package_sd->idle_cpus_mask,
                                 tsk_cpus_allowed(p)) {
                        if (idle_cpu(i)) {
                                target = i;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 544a713..7e34b42 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -202,6 +202,9 @@ DEFINE_PER_CPU(bool, cpu_dead_idle);
  */
 static void cpu_idle_loop(void)
 {
+       int cpu;
+       struct sched_domain *package_sd;
+
        while (1) {
                /*
                 * If the arch has a polling bit, we maintain an invariant:
@@ -212,10 +215,19 @@ static void cpu_idle_loop(void)
                 * guaranteed to cause the cpu to reschedule.
                 */
 
+
                __current_set_polling();
                quiet_vmstat();
                tick_nohz_idle_enter();
 
+               preempt_disable();
+               cpu = smp_processor_id();
+               package_sd = rcu_dereference(per_cpu(sd_llc, cpu));
+               if (package_sd) {
+                       cpumask_set_cpu(cpu, package_sd->idle_cpus_mask);
+               }
+               preempt_enable();
+
                while (!need_resched()) {
                        check_pgt_cache();
                        rmb();
-- 
2.8.0.rc2

Reply via email to