At p2k16 I've shown to a handful of developers that when running a browser on my x220 with HT enabled, a typical desktop usage, the per-CPU runqueues were never balanced.
My goal was to be able to compile a kernel while watching a video in my browser without having my audio stutter.
There are too many things that I still don't understand so I'm not asking for ok, but I'd appreciate if people could test this diff and report back.
for this cpu */ - TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS]; - volatile uint32_t spc_whichqs; - -#ifdef notyet - struct proc *spc_reaper; /* dead proc reaper */ -#endif LIST_HEAD(,proc) spc_deadproc; volatile int spc_barrier; /* for sched_barrier() */ }; -#ifdef _KERNEL /* spc_flags */ #define SPCF_SEENRR 0x0001 /* process has seen roundrobin() */ @@ -141,14 +135,13 @@ void roundrobin(struct cpu_info *); void scheduler_start(void); void userret(struct proc *p); +void sched_init(void); void sched_init_cpu(struct cpu_info *); void sched_idle(void *); void sched_exit(struct proc *); void mi_switch(void); void cpu_switchto(struct proc *, struct proc *); struct proc *sched_chooseproc(void); -struct cpu_info *sched_choosecpu(struct proc *); -struct cpu_info *sched_choosecpu_fork(struct proc *parent, int); void cpu_idle_enter(void); void cpu_idle_cycle(void); void cpu_idle_leave(void); @@ -163,11 +156,11 @@ void sched_start_secondary_cpus(void); void sched_stop_secondary_cpus(void); #endif -#define cpu_is_idle(ci) ((ci)->ci_schedstate.spc_whichqs == 0) - -void sched_init_runqueues(void); void setrunqueue(struct proc *); void remrunqueue(struct proc *); + +extern volatile uint32_t sched_whichqs; +#define sched_qs_empty(ci) (sched_whichqs == 0) /* Inherit the parent's scheduler history */ #define scheduler_fork_hook(parent, child) do { \ Index: kern/sched_bsd.c =================================================================== RCS file: /cvs/src/sys/kern/sched_bsd.c,v retrieving revision 1.43 diff -u -p -r1.43 sched_bsd.c --- kern/sched_bsd.c 9 Mar 2016 13:38:50 -0000 1.43 +++ kern/sched_bsd.c 6 Jul 2016 17:31:11 -0000 @@ -105,7 +105,7 @@ roundrobin(struct cpu_info *ci) } } - if (spc->spc_nrun) + if (!sched_qs_empty(ci)) need_resched(ci); } @@ -300,6 +300,7 @@ yield(void) SCHED_LOCK(s); p->p_priority = p->p_usrpri; p->p_stat = SRUN; + KASSERT(p->p_cpu != NULL); setrunqueue(p); p->p_ru.ru_nvcsw++; mi_switch(); @@ -327,7 +328,7 @@ preempt(struct proc *newp) SCHED_LOCK(s); 
p->p_priority = p->p_usrpri; p->p_stat = SRUN; - p->p_cpu = sched_choosecpu(p); + KASSERT(p->p_cpu != NULL); setrunqueue(p); p->p_ru.ru_nivcsw++; mi_switch(); @@ -418,6 +419,7 @@ mi_switch(void) } clear_resched(curcpu()); + spc->spc_curpriority = p->p_usrpri; SCHED_ASSERT_LOCKED(); @@ -454,25 +456,15 @@ mi_switch(void) #endif } -static __inline void +/* + * If the last CPU of thread ``p'' is currently running a lower + * priority thread, force a reschedule. + */ +static inline void resched_proc(struct proc *p, u_char pri) { - struct cpu_info *ci; + struct cpu_info *ci = p->p_cpu; - /* - * XXXSMP - * This does not handle the case where its last - * CPU is running a higher-priority process, but every - * other CPU is running a lower-priority process. There - * are ways to handle this situation, but they're not - * currently very pretty, and we also need to weigh the - * cost of moving a process from one CPU to another. - * - * XXXSMP - * There is also the issue of locking the other CPU's - * sched state, which we currently do not do. - */ - ci = (p->p_cpu != NULL) ? 
p->p_cpu : curcpu(); if (pri < ci->ci_schedstate.spc_curpriority) need_resched(ci); } @@ -507,7 +499,7 @@ setrunnable(struct proc *p) break; } p->p_stat = SRUN; - p->p_cpu = sched_choosecpu(p); + KASSERT(p->p_cpu != NULL); setrunqueue(p); if (p->p_slptime > 1) updatepri(p); Index: kern/kern_synch.c =================================================================== RCS file: /cvs/src/sys/kern/kern_synch.c,v retrieving revision 1.132 diff -u -p -r1.132 kern_synch.c --- kern/kern_synch.c 4 Jul 2016 16:12:52 -0000 1.132 +++ kern/kern_synch.c 6 Jul 2016 17:31:11 -0000 @@ -266,6 +266,7 @@ sleep_finish(struct sleep_state *sls, in mi_switch(); } else if (!do_sleep) { unsleep(p); + p->p_cpu->ci_schedstate.spc_curpriority = p->p_usrpri; } #ifdef DIAGNOSTIC @@ -273,7 +274,6 @@ sleep_finish(struct sleep_state *sls, in panic("sleep_finish !SONPROC"); #endif - p->p_cpu->ci_schedstate.spc_curpriority = p->p_usrpri; SCHED_UNLOCK(sls->sls_s); /* Index: kern/kern_sched.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sched.c,v retrieving revision 1.43 diff -u -p -r1.43 kern_sched.c --- kern/kern_sched.c 3 Jun 2016 15:21:23 -0000 1.43 +++ kern/kern_sched.c 6 Jul 2016 17:31:11 -0000 @@ -26,36 +26,37 @@ #include <sys/mutex.h> #include <sys/task.h> -#include <uvm/uvm_extern.h> +TAILQ_HEAD(, proc) sched_qs[SCHED_NQS]; +volatile uint32_t sched_whichqs; -void sched_kthreads_create(void *); +#ifdef MULTIPROCESSOR +struct taskq *sbartq; +#endif -int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p); -struct proc *sched_steal_proc(struct cpu_info *); +struct proc *sched_select(struct cpu_info *); +void sched_kthreads_create(void *); -/* - * To help choosing which cpu should run which process we keep track - * of cpus which are currently idle and which cpus have processes - * queued. 
- */ -struct cpuset sched_idle_cpus; -struct cpuset sched_queued_cpus; -struct cpuset sched_all_cpus; +void +sched_init(void) +{ + struct cpu_info *ci = curcpu(); + int i; -/* - * Some general scheduler counters. - */ -uint64_t sched_nmigrations; /* Cpu migration counter */ -uint64_t sched_nomigrations; /* Cpu no migration counter */ -uint64_t sched_noidle; /* Times we didn't pick the idle task */ -uint64_t sched_stolen; /* Times we stole proc from other cpus */ -uint64_t sched_choose; /* Times we chose a cpu */ -uint64_t sched_wasidle; /* Times we came out of idle */ + for (i = 0; i < SCHED_NQS; i++) + TAILQ_INIT(&sched_qs[i]); + sched_whichqs = 0; #ifdef MULTIPROCESSOR -struct taskq *sbartq; + sbartq = taskq_create("sbar", 1, IPL_NONE, + TASKQ_MPSAFE | TASKQ_CANTSLEEP); + if (sbartq == NULL) + panic("unable to create sbar taskq"); #endif + ci->ci_randseed = (arc4random() & 0x7fffffff) + 1; + sched_init_cpu(ci); +} + /* * A few notes about cpu_switchto that is implemented in MD code. * @@ -74,30 +75,18 @@ struct taskq *sbartq; */ /* - * sched_init_cpu is called from main() for the boot cpu, then it's the - * responsibility of the MD code to call it for all other cpus. + * sched_init_cpu is called from sched_init() for the boot cpu, then + * it's the responsibility of the MD code to call it for all other cpus. */ void sched_init_cpu(struct cpu_info *ci) { struct schedstate_percpu *spc = &ci->ci_schedstate; - int i; - - for (i = 0; i < SCHED_NQS; i++) - TAILQ_INIT(&spc->spc_qs[i]); spc->spc_idleproc = NULL; - - kthread_create_deferred(sched_kthreads_create, ci); - LIST_INIT(&spc->spc_deadproc); - /* - * Slight hack here until the cpuset code handles cpu_info - * structures. - */ - cpuset_init_cpu(ci); - cpuset_add(&sched_all_cpus, ci); + kthread_create_deferred(sched_kthreads_create, ci); } void @@ -115,10 +104,46 @@ sched_kthreads_create(void *v) /* Name it as specified. 
*/ snprintf(spc->spc_idleproc->p_comm, sizeof(spc->spc_idleproc->p_comm), "idle%d", num); + /* Always triggers a reschedule when an idle thread is running. */ + spc->spc_idleproc->p_usrpri = MAXPRI; num++; } +/* + * Returns 1 if a CPU can idle, 0 otherwise. + */ +static inline int +can_idle(struct cpu_info *ci) +{ +#ifdef MULTIPROCESSOR + struct schedstate_percpu *spc = &ci->ci_schedstate; +#endif /* MULTIPROCESSOR */ + + /* + * As soon as a wakeup() or roundrobin() called need_resched() + * for this CPU, it has to go through mi_switch() to clear the + * resched flag. + * + * Yes, it is racy as the thread that triggered the reschedule + * might already be executing on another CPU. In this case, + * if there's nothing else on the runqueue, this CPU will come + * back in its idle loop. + */ + if (want_resched(ci)) + return (0); + + if (sched_qs_empty(ci)) + return (1); + +#ifdef MULTIPROCESSOR + if ((spc->spc_schedflags & SPCF_SHOULDHALT) && (spc->spc_npeg == 0)) + return (1); +#endif /* MULTIPROCESSOR */ + + return (0); +} + void sched_idle(void *v) { @@ -136,19 +161,17 @@ sched_idle(void *v) * just go away for a while. */ SCHED_LOCK(s); - cpuset_add(&sched_idle_cpus, ci); p->p_stat = SSLEEP; p->p_cpu = ci; atomic_setbits_int(&p->p_flag, P_CPUPEG); mi_switch(); - cpuset_del(&sched_idle_cpus, ci); SCHED_UNLOCK(s); KASSERT(ci == curcpu()); KASSERT(curproc == spc->spc_idleproc); while (1) { - while (!cpu_is_idle(curcpu())) { + while (!can_idle(ci)) { struct proc *dead; SCHED_LOCK(s); @@ -164,24 +187,20 @@ sched_idle(void *v) splassert(IPL_NONE); - cpuset_add(&sched_idle_cpus, ci); cpu_idle_enter(); - while (spc->spc_whichqs == 0) { + while (!want_resched(ci)) { #ifdef MULTIPROCESSOR if (spc->spc_schedflags & SPCF_SHOULDHALT && (spc->spc_schedflags & SPCF_HALTED) == 0) { - cpuset_del(&sched_idle_cpus, ci); - SCHED_LOCK(s); + KASSERT(spc->spc_npeg == 0); atomic_setbits_int(&spc->spc_schedflags, - spc->spc_whichqs ? 
0 : SPCF_HALTED); - SCHED_UNLOCK(s); + SPCF_HALTED); wakeup(spc); } -#endif +#endif /* MULTIPROCESSOR */ cpu_idle_cycle(); } cpu_idle_leave(); - cpuset_del(&sched_idle_cpus, ci); } } @@ -216,100 +235,94 @@ sched_exit(struct proc *p) SCHED_LOCK(s); idle = spc->spc_idleproc; idle->p_stat = SRUN; + idle->p_cpu = curcpu(); cpu_switchto(NULL, idle); panic("cpu_switchto returned"); } -/* - * Run queue management. - */ -void -sched_init_runqueues(void) -{ -#ifdef MULTIPROCESSOR - sbartq = taskq_create("sbar", 1, IPL_NONE, - TASKQ_MPSAFE | TASKQ_CANTSLEEP); - if (sbartq == NULL) - panic("unable to create sbar taskq"); -#endif -} - void setrunqueue(struct proc *p) { - struct schedstate_percpu *spc; int queue = p->p_priority >> 2; SCHED_ASSERT_LOCKED(); - spc = &p->p_cpu->ci_schedstate; - spc->spc_nrun++; - TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq); - spc->spc_whichqs |= (1 << queue); - cpuset_add(&sched_queued_cpus, p->p_cpu); + TAILQ_INSERT_TAIL(&sched_qs[queue], p, p_runq); + sched_whichqs |= (1 << queue); - if (cpuset_isset(&sched_idle_cpus, p->p_cpu)) - cpu_unidle(p->p_cpu); + if (p->p_flag & P_CPUPEG) + p->p_cpu->ci_schedstate.spc_npeg++; } void remrunqueue(struct proc *p) { - struct schedstate_percpu *spc; int queue = p->p_priority >> 2; SCHED_ASSERT_LOCKED(); - spc = &p->p_cpu->ci_schedstate; - spc->spc_nrun--; - TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq); - if (TAILQ_EMPTY(&spc->spc_qs[queue])) { - spc->spc_whichqs &= ~(1 << queue); - if (spc->spc_whichqs == 0) - cpuset_del(&sched_queued_cpus, p->p_cpu); - } + TAILQ_REMOVE(&sched_qs[queue], p, p_runq); + if (TAILQ_EMPTY(&sched_qs[queue])) + sched_whichqs &= ~(1 << queue); + + if (p->p_flag & P_CPUPEG) + p->p_cpu->ci_schedstate.spc_npeg--; } +/* + * Select the first thread that can run on cpu ``ci'' from the runqueue. + * + * This is O(1) when there's no pegged thread in the runqueue. 
+ */ struct proc * -sched_chooseproc(void) +sched_select(struct cpu_info *ci) { - struct schedstate_percpu *spc = &curcpu()->ci_schedstate; +#ifdef MULTIPROCESSOR + struct schedstate_percpu *spc = &ci->ci_schedstate; +#endif /* MULTIPROCESSOR */ struct proc *p; int queue; - SCHED_ASSERT_LOCKED(); + if (sched_qs_empty(ci)) + return (NULL); + for (queue = 0; queue < SCHED_NQS; queue++) { + TAILQ_FOREACH(p, &sched_qs[queue], p_runq) { #ifdef MULTIPROCESSOR - if (spc->spc_schedflags & SPCF_SHOULDHALT) { - if (spc->spc_whichqs) { - for (queue = 0; queue < SCHED_NQS; queue++) { - while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) { - remrunqueue(p); - p->p_cpu = sched_choosecpu(p); - setrunqueue(p); - if (p->p_cpu == curcpu()) { - KASSERT(p->p_flag & P_CPUPEG); - goto again; - } - } - } + /* Never run a thread pegged to another CPU. */ + if ((p->p_flag & P_CPUPEG) && p->p_cpu != ci) + continue; + + /* If it should halt, only run pegged threads. */ + if ((spc->spc_schedflags & SPCF_SHOULDHALT) && + (p->p_flag & P_CPUPEG) == 0) + continue; +#endif /* MULTIPROCESSOR */ + + return (p); } - p = spc->spc_idleproc; - KASSERT(p); - KASSERT(p->p_wchan == NULL); - p->p_stat = SRUN; - return (p); } -#endif + + return (NULL); +} + +struct proc * +sched_chooseproc(void) +{ + struct cpu_info *ci = curcpu(); + struct proc *p = NULL; + + SCHED_ASSERT_LOCKED(); again: - if (spc->spc_whichqs) { - queue = ffs(spc->spc_whichqs) - 1; - p = TAILQ_FIRST(&spc->spc_qs[queue]); + p = sched_select(ci); + + if (p != NULL) { remrunqueue(p); - sched_noidle++; KASSERT(p->p_stat == SRUN); - } else if ((p = sched_steal_proc(curcpu())) == NULL) { + } else { + struct schedstate_percpu *spc = &ci->ci_schedstate; + p = spc->spc_idleproc; if (p == NULL) { int s; @@ -328,263 +341,11 @@ again: } KASSERT(p); p->p_stat = SRUN; - } - - KASSERT(p->p_wchan == NULL); - return (p); -} - -struct cpu_info * -sched_choosecpu_fork(struct proc *parent, int flags) -{ -#ifdef MULTIPROCESSOR - struct cpu_info *choice = NULL; - 
fixpt_t load, best_load = ~0; - int run, best_run = INT_MAX; - struct cpu_info *ci; - struct cpuset set; - -#if 0 - /* - * XXX - * Don't do this until we have a painless way to move the cpu in exec. - * Preferably when nuking the old pmap and getting a new one on a - * new cpu. - */ - /* - * PPWAIT forks are simple. We know that the parent will not - * run until we exec and choose another cpu, so we just steal its - * cpu. - */ - if (flags & FORK_PPWAIT) - return (parent->p_cpu); -#endif - - /* - * Look at all cpus that are currently idle and have nothing queued. - * If there are none, pick the one with least queued procs first, - * then the one with lowest load average. - */ - cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus); - cpuset_intersection(&set, &set, &sched_all_cpus); - if (cpuset_first(&set) == NULL) - cpuset_copy(&set, &sched_all_cpus); - - while ((ci = cpuset_first(&set)) != NULL) { - cpuset_del(&set, ci); - - load = ci->ci_schedstate.spc_ldavg; - run = ci->ci_schedstate.spc_nrun; - - if (choice == NULL || run < best_run || - (run == best_run &&load < best_load)) { - choice = ci; - best_load = load; - best_run = run; - } - } - - return (choice); -#else - return (curcpu()); -#endif -} - -struct cpu_info * -sched_choosecpu(struct proc *p) -{ -#ifdef MULTIPROCESSOR - struct cpu_info *choice = NULL; - int last_cost = INT_MAX; - struct cpu_info *ci; - struct cpuset set; - - /* - * If pegged to a cpu, don't allow it to move. - */ - if (p->p_flag & P_CPUPEG) - return (p->p_cpu); - - sched_choose++; - - /* - * Look at all cpus that are currently idle and have nothing queued. - * If there are none, pick the cheapest of those. - * (idle + queued could mean that the cpu is handling an interrupt - * at this moment and haven't had time to leave idle yet). 
- */ - cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus); - cpuset_intersection(&set, &set, &sched_all_cpus); - - /* - * First, just check if our current cpu is in that set, if it is, - * this is simple. - * Also, our cpu might not be idle, but if it's the current cpu - * and it has nothing else queued and we're curproc, take it. - */ - if (cpuset_isset(&set, p->p_cpu) || - (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 && - (p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0 && - curproc == p)) { - sched_wasidle++; - return (p->p_cpu); } - if (cpuset_first(&set) == NULL) - cpuset_copy(&set, &sched_all_cpus); - - while ((ci = cpuset_first(&set)) != NULL) { - int cost = sched_proc_to_cpu_cost(ci, p); - - if (choice == NULL || cost < last_cost) { - choice = ci; - last_cost = cost; - } - cpuset_del(&set, ci); - } - - if (p->p_cpu != choice) - sched_nmigrations++; - else - sched_nomigrations++; - - return (choice); -#else - return (curcpu()); -#endif -} - -/* - * Attempt to steal a proc from some cpu. 
- */ -struct proc * -sched_steal_proc(struct cpu_info *self) -{ - struct proc *best = NULL; -#ifdef MULTIPROCESSOR - struct schedstate_percpu *spc; - int bestcost = INT_MAX; - struct cpu_info *ci; - struct cpuset set; - - KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0); - - cpuset_copy(&set, &sched_queued_cpus); - - while ((ci = cpuset_first(&set)) != NULL) { - struct proc *p; - int queue; - int cost; - - cpuset_del(&set, ci); - - spc = &ci->ci_schedstate; - - queue = ffs(spc->spc_whichqs) - 1; - TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) { - if (p->p_flag & P_CPUPEG) - continue; - - cost = sched_proc_to_cpu_cost(self, p); - - if (best == NULL || cost < bestcost) { - best = p; - bestcost = cost; - } - } - } - if (best == NULL) - return (NULL); - - spc = &best->p_cpu->ci_schedstate; - remrunqueue(best); - best->p_cpu = self; - - sched_stolen++; -#endif - return (best); -} - -#ifdef MULTIPROCESSOR -/* - * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know). - */ -static int -log2(unsigned int i) -{ - int ret = 0; - - while (i >>= 1) - ret++; - - return (ret); -} - -/* - * Calculate the cost of moving the proc to this cpu. - * - * What we want is some guesstimate of how much "performance" it will - * cost us to move the proc here. Not just for caches and TLBs and NUMA - * memory, but also for the proc itself. A highly loaded cpu might not - * be the best candidate for this proc since it won't get run. - * - * Just total guesstimates for now. - */ - -int sched_cost_load = 1; -int sched_cost_priority = 1; -int sched_cost_runnable = 3; -int sched_cost_resident = 1; -#endif - -int -sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p) -{ - int cost = 0; -#ifdef MULTIPROCESSOR - struct schedstate_percpu *spc; - int l2resident = 0; - - spc = &ci->ci_schedstate; - - /* - * First, account for the priority of the proc we want to move. 
- * More willing to move, the lower the priority of the destination - * and the higher the priority of the proc. - */ - if (!cpuset_isset(&sched_idle_cpus, ci)) { - cost += (p->p_priority - spc->spc_curpriority) * - sched_cost_priority; - cost += sched_cost_runnable; - } - if (cpuset_isset(&sched_queued_cpus, ci)) - cost += spc->spc_nrun * sched_cost_runnable; - - /* - * Try to avoid the primary cpu as it handles hardware interrupts. - * - * XXX Needs to be revisited when we distribute interrupts - * over cpus. - */ - if (CPU_IS_PRIMARY(ci)) - cost += sched_cost_runnable; - - /* - * Higher load on the destination means we don't want to go there. - */ - cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT); - - /* - * If the proc is on this cpu already, lower the cost by how much - * it has been running and an estimate of its footprint. - */ - if (p->p_cpu == ci && p->p_slptime == 0) { - l2resident = - log2(pmap_resident_count(p->p_vmspace->vm_map.pmap)); - cost -= l2resident * sched_cost_resident; - } -#endif - return (cost); + KASSERT(p->p_wchan == NULL); + p->p_cpu = ci; + return (p); } /* @@ -620,7 +381,6 @@ sched_start_secondary_cpus(void) if (CPU_IS_PRIMARY(ci)) continue; - cpuset_add(&sched_all_cpus, ci); atomic_clearbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT | SPCF_HALTED); } @@ -640,7 +400,6 @@ sched_stop_secondary_cpus(void) if (CPU_IS_PRIMARY(ci)) continue; - cpuset_del(&sched_all_cpus, ci); atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT); } CPU_INFO_FOREACH(cii, ci) { @@ -697,14 +456,14 @@ sched_barrier(struct cpu_info *ci) } } -#else +#else /* MULTIPROCESSOR */ void sched_barrier(struct cpu_info *ci) { } -#endif +#endif /* MULTIPROCESSOR */ /* * Functions to manipulate cpu sets. 
Index: kern/kern_fork.c =================================================================== RCS file: /cvs/src/sys/kern/kern_fork.c,v retrieving revision 1.187 diff -u -p -r1.187 kern_fork.c --- kern/kern_fork.c 25 Apr 2016 20:18:31 -0000 1.187 +++ kern/kern_fork.c 6 Jul 2016 17:31:11 -0000 @@ -486,7 +486,7 @@ fork1(struct proc *curp, int flags, void if ((flags & FORK_IDLE) == 0) { SCHED_LOCK(s); p->p_stat = SRUN; - p->p_cpu = sched_choosecpu_fork(curp, flags); + p->p_cpu = curcpu(); setrunqueue(p); SCHED_UNLOCK(s); } else Index: kern/kern_clock.c =================================================================== RCS file: /cvs/src/sys/kern/kern_clock.c,v retrieving revision 1.90 diff -u -p -r1.90 kern_clock.c --- kern/kern_clock.c 24 Mar 2016 05:40:56 -0000 1.90 +++ kern/kern_clock.c 6 Jul 2016 17:31:11 -0000 @@ -400,7 +400,8 @@ statclock(struct clockframe *frame) spc->spc_pscnt = psdiv; if (p != NULL) { - p->p_cpticks++; + if (p != spc->spc_idleproc) + p->p_cpticks++; /* * If no schedclock is provided, call it here at ~~12-25 Hz; * ~~16 Hz is best Index: kern/init_main.c =================================================================== RCS file: /cvs/src/sys/kern/init_main.c,v retrieving revision 1.253 diff -u -p -r1.253 init_main.c --- kern/init_main.c 17 May 2016 23:28:03 -0000 1.253 +++ kern/init_main.c 6 Jul 2016 17:31:11 -0000 @@ -328,17 +328,16 @@ main(void *framep) */ (void)chgproccnt(0, 1); - /* Initialize run queues */ - sched_init_runqueues(); sleep_queue_init(); - sched_init_cpu(curcpu()); - p->p_cpu->ci_randseed = (arc4random() & 0x7fffffff) + 1; /* Initialize task queues */ taskq_init(); /* Initialize the interface/address trees */ ifinit(); + + /* Initialize the scheduler */ + sched_init(); /* Lock the kernel on behalf of proc0. 
*/ KERNEL_LOCK(); Index: dev/acpi/acpicpu.c =================================================================== RCS file: /cvs/src/sys/dev/acpi/acpicpu.c,v retrieving revision 1.74 diff -u -p -r1.74 acpicpu.c --- dev/acpi/acpicpu.c 17 Mar 2016 13:18:47 -0000 1.74 +++ dev/acpi/acpicpu.c 6 Jul 2016 17:31:11 -0000 @@ -1188,7 +1188,7 @@ acpicpu_idle(void) #endif /* something already queued? */ - if (!cpu_is_idle(ci)) + if (want_resched(ci)) return; /* @@ -1204,7 +1204,7 @@ acpicpu_idle(void) hints = (unsigned)best->address; microuptime(&start); atomic_setbits_int(&ci->ci_mwait, MWAIT_IDLING); - if (cpu_is_idle(ci)) { + if (!want_resched(ci)) { /* intel errata AAI65: cflush before monitor */ if (ci->ci_cflushsz != 0) { membar_sync(); Index: arch/sparc64/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/sparc64/include/cpu.h,v retrieving revision 1.88 diff -u -p -r1.88 cpu.h --- arch/sparc64/include/cpu.h 28 Aug 2015 23:28:39 -0000 1.88 +++ arch/sparc64/include/cpu.h 6 Jul 2016 17:31:11 -0000 @@ -240,8 +240,9 @@ extern void (*cpu_start_clock)(void); * Preempt the current process if in interrupt from user mode, * or after the current trap/syscall if in system mode. */ -extern void need_resched(struct cpu_info *); -#define clear_resched(ci) (ci)->ci_want_resched = 0 +void need_resched(struct cpu_info *); +#define clear_resched(ci) (ci)->ci_want_resched = 0 +#define want_resched(ci) ((ci)->ci_want_resched) /* * This is used during profiling to integrate system time. 
Index: arch/sparc/sparc/trap.c =================================================================== RCS file: /cvs/src/sys/arch/sparc/sparc/trap.c,v retrieving revision 1.73 diff -u -p -r1.73 trap.c --- arch/sparc/sparc/trap.c 27 Feb 2016 13:08:07 -0000 1.73 +++ arch/sparc/sparc/trap.c 6 Jul 2016 17:31:11 -0000 @@ -199,7 +199,7 @@ void syscall(register_t, struct trapfram int ignore_bogus_traps = 0; -int want_ast = 0; +int cpu_want_ast = 0; /* * If someone stole the FPU while we were away, do not enable it @@ -300,9 +300,9 @@ trap(type, psr, pc, tf) break; case T_AST: - want_ast = 0; + cpu_want_ast = 0; uvmexp.softs++; - mi_ast(p, want_resched); + mi_ast(p, want_resched(curcpu())); break; case T_ILLINST: Index: arch/sparc/sparc/locore.s =================================================================== RCS file: /cvs/src/sys/arch/sparc/sparc/locore.s,v retrieving revision 1.101 diff -u -p -r1.101 locore.s --- arch/sparc/sparc/locore.s 23 May 2016 20:11:49 -0000 1.101 +++ arch/sparc/sparc/locore.s 6 Jul 2016 17:31:11 -0000 @@ -3093,8 +3093,8 @@ rft_kernel: * If returning to a valid window, just set psr and return. */ rft_user: -! sethi %hi(_C_LABEL(want_ast)), %l7 ! (done below) - ld [%l7 + %lo(_C_LABEL(want_ast))], %l7 +! sethi %hi(_C_LABEL(cpu_want_ast)), %l7 ! (done below) + ld [%l7 + %lo(_C_LABEL(cpu_want_ast))], %l7 tst %l7 ! want AST trap? bne,a softtrap ! yes, re-enter trap with type T_AST mov T_AST, %o0 @@ -3221,7 +3221,7 @@ rft_user_or_recover_pcb_windows: ld [%l6 + PCB_NSAVED], %l7 tst %l7 bz,a rft_user - sethi %hi(_C_LABEL(want_ast)), %l7 ! first instr of rft_user + sethi %hi(_C_LABEL(cpu_want_ast)), %l7 ! first instr of rft_user bg,a softtrap ! if (pcb_nsaved > 0) mov T_WINOF, %o0 ! trap(T_WINOF); @@ -4317,7 +4317,7 @@ ENTRY(write_user_windows) nop - .comm _C_LABEL(want_resched),4 + .comm _C_LABEL(cpu_want_resched),4 /* * Masterpaddr is the p->p_addr of the last process on the processor. 
* XXX masterpaddr is almost the same as cpcb Index: arch/sparc/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/sparc/include/cpu.h,v retrieving revision 1.35 diff -u -p -r1.35 cpu.h --- arch/sparc/include/cpu.h 2 Dec 2012 07:03:31 -0000 1.35 +++ arch/sparc/include/cpu.h 6 Jul 2016 17:31:11 -0000 @@ -93,14 +93,16 @@ extern int eintstack[]; #define CLKF_PC(framep) ((framep)->pc) #define CLKF_INTR(framep) ((framep)->fp < (u_int)eintstack) +extern int cpu_want_resched; /* need_resched() was called */ +extern int cpu_want_ast; + /* * Preempt the current process if in interrupt from user mode, * or after the current trap/syscall if in system mode. */ -extern int want_resched; /* resched() was called */ -#define need_resched(ci) (want_resched = 1, want_ast = 1) -#define clear_resched(ci) want_resched = 0 -extern int want_ast; +#define need_resched(ci) (cpu_want_resched = 1, cpu_want_ast = 1) +#define clear_resched(ci) cpu_want_resched = 0 +#define want_resched(ci) (cpu_want_resched) /* * This is used during profiling to integrate system time. @@ -113,13 +115,13 @@ extern int want_ast; * buffer pages are invalid. On the sparc, request an ast to send us * through trap(), marking the proc as needing a profiling tick. */ -#define need_proftick(p) do { want_ast = 1; } while (0) +#define need_proftick(p) do { cpu_want_ast = 1; } while (0) /* * Notify the current process (p) that it has a signal pending, * process as soon as possible. 
*/ -#define signotify(p) (want_ast = 1) +#define signotify(p) (cpu_want_ast = 1) extern int foundfpu; /* true => we have an FPU */ Index: arch/sh/sh/trap.c =================================================================== RCS file: /cvs/src/sys/arch/sh/sh/trap.c,v retrieving revision 1.35 diff -u -p -r1.35 trap.c --- arch/sh/sh/trap.c 27 Feb 2016 13:08:07 -0000 1.35 +++ arch/sh/sh/trap.c 6 Jul 2016 17:31:11 -0000 @@ -483,7 +483,7 @@ ast(struct proc *p, struct trapframe *tf p->p_md.md_astpending = 0; refreshcreds(p); uvmexp.softs++; - mi_ast(p, want_resched); + mi_ast(p, want_resched(curcpu())); userret(p); } } Index: arch/sh/sh/locore_c.c =================================================================== RCS file: /cvs/src/sys/arch/sh/sh/locore_c.c,v retrieving revision 1.12 diff -u -p -r1.12 locore_c.c --- arch/sh/sh/locore_c.c 18 Nov 2014 20:51:01 -0000 1.12 +++ arch/sh/sh/locore_c.c 6 Jul 2016 17:31:11 -0000 @@ -121,7 +121,7 @@ void (*__sh_switch_resume)(struct proc *); void cpu_switch_prepare(struct proc *, struct proc *); -int want_resched; +int cpu_want_resched; /* * Prepare context switch from oproc to nproc. Index: arch/sh/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/sh/include/cpu.h,v retrieving revision 1.27 diff -u -p -r1.27 cpu.h --- arch/sh/include/cpu.h 11 Jul 2014 10:53:07 -0000 1.27 +++ arch/sh/include/cpu.h 6 Jul 2016 17:31:11 -0000 @@ -105,17 +105,19 @@ struct clockframe { #define PROC_PC(p) ((p)->p_md.md_regs->tf_spc) #define PROC_STACK(p) ((p)->p_md.md_regs->tf_r15) +extern int cpu_want_resched; /* need_resched() was called */ /* * Preempt the current process if in interrupt from user mode, * or after the current trap/syscall if in system mode. 
*/ #define need_resched(ci) \ do { \ - want_resched = 1; \ + cpu_want_resched = 1; \ if (curproc != NULL) \ - aston(curproc); \ + aston(curproc); \ } while (/*CONSTCOND*/0) -#define clear_resched(ci) want_resched = 0 +#define clear_resched(ci) cpu_want_resched = 0 +#define want_resched(ci) (cpu_want_resched) /* * Give a profiling tick to the current process when the user profiling @@ -131,8 +133,6 @@ do { \ #define signotify(p) aston(p) #define aston(p) ((p)->p_md.md_astpending = 1) - -extern int want_resched; /* need_resched() was called */ /* * We need a machine-independent name for this. Index: arch/powerpc/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/powerpc/include/cpu.h,v retrieving revision 1.63 diff -u -p -r1.63 cpu.h --- arch/powerpc/include/cpu.h 7 May 2016 22:46:54 -0000 1.63 +++ arch/powerpc/include/cpu.h 6 Jul 2016 17:31:11 -0000 @@ -181,7 +181,8 @@ do { \ if (ci->ci_curproc != NULL) \ aston(ci->ci_curproc); \ } while (0) -#define clear_resched(ci) (ci)->ci_want_resched = 0 +#define clear_resched(ci) (ci)->ci_want_resched = 0 +#define want_resched(ci) ((ci)->ci_want_resched) #define need_proftick(p) aston(p) Index: arch/mips64/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/mips64/include/cpu.h,v retrieving revision 1.110 diff -u -p -r1.110 cpu.h --- arch/mips64/include/cpu.h 6 Mar 2016 19:42:27 -0000 1.110 +++ arch/mips64/include/cpu.h 6 Jul 2016 17:31:11 -0000 @@ -305,6 +305,7 @@ void cp0_calibrate(struct cpu_info *); aston((ci)->ci_curproc); \ } while(0) #define clear_resched(ci) (ci)->ci_want_resched = 0 +#define want_resched(ci) ((ci)->ci_want_resched) /* * Give a profiling tick to the current process when the user profiling Index: arch/m88k/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/m88k/include/cpu.h,v retrieving revision 1.64 diff -u -p -r1.64 cpu.h --- 
arch/m88k/include/cpu.h 2 Jul 2015 01:33:59 -0000 1.64 +++ arch/m88k/include/cpu.h 6 Jul 2016 17:31:11 -0000 @@ -274,7 +274,9 @@ struct clockframe { #define PROC_PC(p) PC_REGS((struct reg *)((p)->p_md.md_tf)) #define PROC_STACK(p) ((p)->p_md.md_tf->tf_sp) +void need_resched(struct cpu_info *); #define clear_resched(ci) (ci)->ci_want_resched = 0 +#define want_resched(ci) ((ci)->ci_want_resched) /* * Give a profiling tick to the current process when the user profiling @@ -283,7 +285,6 @@ struct clockframe { */ #define need_proftick(p) aston(p) -void need_resched(struct cpu_info *); void signotify(struct proc *); void softipi(void); Index: arch/i386/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/i386/include/cpu.h,v retrieving revision 1.147 diff -u -p -r1.147 cpu.h --- arch/i386/include/cpu.h 15 Mar 2016 03:17:51 -0000 1.147 +++ arch/i386/include/cpu.h 6 Jul 2016 17:31:11 -0000 @@ -244,14 +244,14 @@ void cpu_unidle(struct cpu_info *); #define curpcb curcpu()->ci_curpcb -#define want_resched (curcpu()->ci_want_resched) - /* * Preempt the current process if in interrupt from user mode, * or after the current trap/syscall if in system mode. 
*/ -extern void need_resched(struct cpu_info *); -#define clear_resched(ci) (ci)->ci_want_resched = 0 +void need_resched(struct cpu_info *); +#define clear_resched(ci) (ci)->ci_want_resched = 0 +#define want_resched(ci) ((ci)->ci_want_resched) + #define CLKF_USERMODE(frame) USERMODE((frame)->if_cs, (frame)->if_eflags) #define CLKF_PC(frame) ((frame)->if_eip) Index: arch/i386/i386/trap.c =================================================================== RCS file: /cvs/src/sys/arch/i386/i386/trap.c,v retrieving revision 1.125 diff -u -p -r1.125 trap.c --- arch/i386/i386/trap.c 28 Feb 2016 15:46:18 -0000 1.125 +++ arch/i386/i386/trap.c 6 Jul 2016 17:31:11 -0000 @@ -528,7 +528,7 @@ ast(struct trapframe *frame) p->p_md.md_regs = frame; refreshcreds(p); uvmexp.softs++; - mi_ast(p, want_resched); + mi_ast(p, want_resched(curcpu())); userret(p); } Index: arch/i386/i386/cpu.c =================================================================== RCS file: /cvs/src/sys/arch/i386/i386/cpu.c,v retrieving revision 1.78 diff -u -p -r1.78 cpu.c --- arch/i386/i386/cpu.c 28 Jun 2016 05:37:50 -0000 1.78 +++ arch/i386/i386/cpu.c 6 Jul 2016 17:31:11 -0000 @@ -761,7 +761,7 @@ cpu_idle_mwait_cycle(void) panic("idle with interrupts blocked!"); /* something already queued? */ - if (!cpu_is_idle(ci)) + if (want_resched(ci)) return; /* @@ -775,7 +775,7 @@ cpu_idle_mwait_cycle(void) * the check in sched_idle() and here. 
*/ atomic_setbits_int(&ci->ci_mwait, MWAIT_IDLING | MWAIT_ONLY); - if (cpu_is_idle(ci)) { + if (!want_resched(ci)) { monitor(&ci->ci_mwait, 0, 0); if ((ci->ci_mwait & MWAIT_IDLING) == MWAIT_IDLING) mwait(0, 0); Index: arch/hppa/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/hppa/include/cpu.h,v retrieving revision 1.89 diff -u -p -r1.89 cpu.h --- arch/hppa/include/cpu.h 10 May 2016 14:52:03 -0000 1.89 +++ arch/hppa/include/cpu.h 6 Jul 2016 17:31:11 -0000 @@ -244,8 +244,13 @@ void cpu_unidle(struct cpu_info *); #define cpu_unidle(ci) #endif -extern void need_resched(struct cpu_info *); -#define clear_resched(ci) (ci)->ci_want_resched = 0 +/* + * Preempt the current process if in interrupt from user mode, + * or after the current trap/syscall if in system mode. + */ +void need_resched(struct cpu_info *); +#define clear_resched(ci) (ci)->ci_want_resched = 0 +#define want_resched(ci) ((ci)->ci_want_resched) #endif Index: arch/arm/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/arm/include/cpu.h,v retrieving revision 1.41 diff -u -p -r1.41 cpu.h --- arch/arm/include/cpu.h 4 Apr 2016 09:13:44 -0000 1.41 +++ arch/arm/include/cpu.h 6 Jul 2016 17:31:11 -0000 @@ -267,9 +267,10 @@ extern int astpending; * Preempt the current process if in interrupt from user mode, * or after the current trap/syscall if in system mode. 
*/ -extern int want_resched; /* resched() was called */ -#define need_resched(ci) (want_resched = 1, setsoftast()) -#define clear_resched(ci) want_resched = 0 +extern int cpu_want_resched; /* need_resched() was called */ +#define need_resched(ci) (cpu_want_resched = 1, setsoftast()) +#define clear_resched(ci) cpu_want_resched = 0 +#define want_resched(ci) (cpu_want_resched) /* * Give a profiling tick to the current process when the user profiling Index: arch/arm/arm/ast.c =================================================================== RCS file: /cvs/src/sys/arch/arm/arm/ast.c,v retrieving revision 1.14 diff -u -p -r1.14 ast.c --- arch/arm/arm/ast.c 18 Nov 2014 20:51:01 -0000 1.14 +++ arch/arm/arm/ast.c 6 Jul 2016 17:31:11 -0000 @@ -65,7 +65,7 @@ */ void ast(struct trapframe *); -int want_resched; +int cpu_want_resched; extern int astpending; /* @@ -91,7 +91,7 @@ ast(struct trapframe *tf) #endif uvmexp.softs++; - mi_ast(p, want_resched); + mi_ast(p, want_resched(curcpu())); userret(p); } Index: arch/amd64/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v retrieving revision 1.101 diff -u -p -r1.101 cpu.h --- arch/amd64/include/cpu.h 9 May 2016 22:45:07 -0000 1.101 +++ arch/amd64/include/cpu.h 6 Jul 2016 17:31:11 -0000 @@ -216,12 +216,13 @@ extern struct cpu_info *cpu_info_list; #define CPU_INFO_UNIT(ci) ((ci)->ci_dev ? (ci)->ci_dev->dv_unit : 0) -/* +/* * Preempt the current process if in interrupt from user mode, * or after the current trap/syscall if in system mode. 
*/ -extern void need_resched(struct cpu_info *); -#define clear_resched(ci) (ci)->ci_want_resched = 0 +void need_resched(struct cpu_info *); +#define clear_resched(ci) (ci)->ci_want_resched = 0 +#define want_resched(ci) ((ci)->ci_want_resched) #if defined(MULTIPROCESSOR) Index: arch/amd64/amd64/cpu.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v retrieving revision 1.101 diff -u -p -r1.101 cpu.c --- arch/amd64/amd64/cpu.c 28 Jun 2016 05:37:50 -0000 1.101 +++ arch/amd64/amd64/cpu.c 6 Jul 2016 17:31:11 -0000 @@ -253,7 +253,7 @@ cpu_idle_mwait_cycle(void) panic("idle with interrupts blocked!"); /* something already queued? */ - if (!cpu_is_idle(ci)) + if (want_resched(ci)) return; /* @@ -267,7 +267,7 @@ cpu_idle_mwait_cycle(void) * the check in sched_idle() and here. */ atomic_setbits_int(&ci->ci_mwait, MWAIT_IDLING | MWAIT_ONLY); - if (cpu_is_idle(ci)) { + if (!want_resched(ci)) { monitor(&ci->ci_mwait, 0, 0); if ((ci->ci_mwait & MWAIT_IDLING) == MWAIT_IDLING) mwait(0, 0); Index: arch/alpha/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/alpha/include/cpu.h,v retrieving revision 1.57 diff -u -p -r1.57 cpu.h --- arch/alpha/include/cpu.h 30 Mar 2016 15:39:46 -0000 1.57 +++ arch/alpha/include/cpu.h 6 Jul 2016 17:31:11 -0000 @@ -301,7 +301,8 @@ do { \ if ((ci)->ci_curproc != NULL) \ aston((ci)->ci_curproc); \ } while (/*CONSTCOND*/0) -#define clear_resched(ci) (ci)->ci_want_resched = 0 +#define clear_resched(ci) (ci)->ci_want_resched = 0 +#define want_resched(ci) ((ci)->ci_want_resched) /* * Give a profiling tick to the current process when the user profiling