On Wed, Jan 18, 2012 at 04:41:35PM +0100, Gregor Best wrote:
> after a long time of silence here's a second iteration of the patch.
> I've addressed a few concerns voiced here:
>
> * Process lookup and friends now have O(log n) runtime. I achieved that
> by abusing RB-trees as priority queues, since all the relevant
> operations run in O(log n).
I'm wondering whether a heap wouldn't be more suitable. But since we
don't currently have one in the kernel, I guess the RB tree is the best
solution.
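For reference, the priority-queue use of tree.h boils down to something
like this (a minimal sketch, using the names from your diff):

    /* Peek at the earliest deadline and pop it; both are O(log n). */
    struct proc *p = RB_MIN(runqueue, &sched_runqueue);
    if (p != NULL)
        RB_REMOVE(runqueue, &sched_runqueue, p);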
> * The algorithm for calculating a new deadline for a given process has
> been simplified and is now documented a bit better. It also derives
> the deadline offset from the value of hz (via rrticks_init) as
> suggested by Miod (?).
> * CPU rlimits are now honored again. The relevant code did not change, the
> new patch just doesn't remove rlimit enforcement anymore.
> * Timeslices are 20ms long instead of 10ms. This solves the issue of 0ms
> long timeslices on machines with hz < 100.
I'm not really happy having timeslices get larger. They're quite long
already. Can you think of a way to derive a sane value from hz?
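Something along these lines, maybe (just a sketch: go back to the 10ms
slices your previous iteration used, but clamp so hz < 100 no longer
rounds down to a zero-tick slice):

    /* Aim for ~10ms slices; one tick is the shortest slice possible. */
    rrticks_init = hz / 100;
    if (rrticks_init < 1)
        rrticks_init = 1;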
> With recent improvements in the mainline scheduler and especially
> rthreads, the performance of the patched scheduler and mainline is now
> roughly similar, at least as far as throughput goes. I have the feeling
> that the system behaves "snappier" with my patch, but that might be some
> sort of placebo effect. I haven't yet come up with a reliable method to
> benchmark interactivity except for actually using the machine and doing
> stuff on it. It's interesting to note, however, that the patched scheduler
> achieves a performance similar to the default one without all the fancy
> methods for calculating how expensive it is to move a process from one
> CPU to another or related methods for preserving cache locality.
You'll want to test this on a multi-socket machine as well (preferably a
NUMA machine). You're currently migrating processes through your on-die
L3 cache, which is considerably faster than memory.
> I use the patched scheduler exclusively on my Core2Duo machine with an
> MP build.
>
> The balance of removed versus added lines in this patch shifted towards
> "more added lines", but the net result is still 173 lines less than the
> default.
>
> Once again, comments, rants, insults, everything is welcome :)
Comments on the code inline. This is just me glancing over it; I'll try
to get more in-depth over the weekend.
> Index: sys/proc.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/proc.h,v
> retrieving revision 1.149
> diff -u -r1.149 proc.h
> --- sys/proc.h 7 Jan 2012 05:38:12 -0000 1.149
> +++ sys/proc.h 17 Jan 2012 16:10:09 -0000
> @@ -43,6 +43,7 @@
> #include <machine/proc.h> /* Machine-dependent proc substruct. */
> #include <sys/selinfo.h> /* For struct selinfo */
> #include <sys/queue.h>
> +#include <sys/tree.h>
> #include <sys/timeout.h> /* For struct timeout */
> #include <sys/event.h> /* For struct klist */
> #include <sys/mutex.h> /* For struct mutex */
> @@ -210,7 +211,9 @@
> #define PS_SINGLEUNWIND _P_SINGLEUNWIND
>
> struct proc {
> - TAILQ_ENTRY(proc) p_runq;
> + RB_ENTRY(proc) p_runq;
> + RB_ENTRY(proc) p_expq;
> + TAILQ_ENTRY(proc) p_slpq;
> LIST_ENTRY(proc) p_list; /* List of all processes. */
>
> struct process *p_p; /* The process of this thread. */
> @@ -251,6 +254,9 @@
> #endif
>
> /* scheduling */
> + struct timeval p_deadline; /* virtual deadline used for scheduling */
> + struct timeval p_deadline_set; /* time at which the deadline was set */
> + int p_rrticks; /* number of ticks this process is allowed to stay on a processor */
> u_int p_estcpu; /* Time averaged value of p_cpticks. */
> int p_cpticks; /* Ticks of cpu time. */
> fixpt_t p_pctcpu; /* %cpu for this process during p_swtime */
> Index: sys/sched.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/sched.h,v
> retrieving revision 1.30
> diff -u -r1.30 sched.h
> --- sys/sched.h 16 Nov 2011 20:50:19 -0000 1.30
> +++ sys/sched.h 17 Jan 2012 16:10:09 -0000
> @@ -87,8 +87,6 @@
> #define CP_IDLE 4
> #define CPUSTATES 5
>
> -#define SCHED_NQS 32 /* 32 run queues. */
> -
> /*
> * Per-CPU scheduler state.
> * XXX - expose to userland for now.
> @@ -99,7 +97,6 @@
> u_int spc_schedticks; /* ticks for schedclock() */
> u_int64_t spc_cp_time[CPUSTATES]; /* CPU state statistics */
> u_char spc_curpriority; /* usrpri of curproc */
> - int spc_rrticks; /* ticks until roundrobin() */
> int spc_pscnt; /* prof/stat counter */
> int spc_psdiv; /* prof/stat divisor */
> struct proc *spc_idleproc; /* idle proc for this cpu */
> @@ -107,9 +104,6 @@
> u_int spc_nrun; /* procs on the run queues */
> fixpt_t spc_ldavg; /* shortest load avg. for this cpu */
>
> - TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS];
> - volatile uint32_t spc_whichqs;
> -
> #ifdef notyet
> struct proc *spc_reaper; /* dead proc reaper */
> #endif
> @@ -119,18 +113,16 @@
> #ifdef _KERNEL
>
> /* spc_flags */
> -#define SPCF_SEENRR 0x0001 /* process has seen roundrobin() */
> -#define SPCF_SHOULDYIELD 0x0002 /* process should yield the CPU */
> -#define SPCF_SWITCHCLEAR (SPCF_SEENRR|SPCF_SHOULDYIELD)
> -#define SPCF_SHOULDHALT 0x0004 /* CPU should be vacated */
> -#define SPCF_HALTED 0x0008 /* CPU has been halted */
> +#define SPCF_SHOULDYIELD 0x0001 /* process should yield the CPU */
> +#define SPCF_SHOULDHALT 0x0002 /* CPU should be vacated */
> +#define SPCF_HALTED 0x0004 /* CPU has been halted */
>
> -#define SCHED_PPQ (128 / SCHED_NQS) /* priorities per queue */
> #define NICE_WEIGHT 2 /* priorities per nice level */
> -#define ESTCPULIM(e) min((e), NICE_WEIGHT * PRIO_MAX - SCHED_PPQ)
> +#define ESTCPULIM(e) min((e), NICE_WEIGHT * PRIO_MAX)
>
> extern int schedhz; /* ideally: 16 */
> extern int rrticks_init; /* ticks per roundrobin() */
> +extern struct cpuset sched_idle_cpus;
>
> struct proc;
> void schedclock(struct proc *);
> @@ -147,18 +139,20 @@
> void cpu_switchto(struct proc *, struct proc *);
> struct proc *sched_chooseproc(void);
> struct cpu_info *sched_choosecpu(struct proc *);
> -struct cpu_info *sched_choosecpu_fork(struct proc *parent, int);
> +struct cpu_info *sched_choosecpu_fork(struct proc *parent);
> void cpu_idle_enter(void);
> void cpu_idle_cycle(void);
> void cpu_idle_leave(void);
> void sched_peg_curproc(struct cpu_info *ci);
>
> +void generate_deadline(struct proc *, char);
> +
> #ifdef MULTIPROCESSOR
> void sched_start_secondary_cpus(void);
> void sched_stop_secondary_cpus(void);
> #endif
>
> -#define curcpu_is_idle() (curcpu()->ci_schedstate.spc_whichqs == 0)
> +#define curcpu_is_idle() (cpuset_isset(&sched_idle_cpus, curcpu()))
>
> void sched_init_runqueues(void);
> void setrunqueue(struct proc *);
> Index: kern/kern_clock.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_clock.c,v
> retrieving revision 1.72
> diff -u -r1.72 kern_clock.c
> --- kern/kern_clock.c 7 Mar 2011 07:07:13 -0000 1.72
> +++ kern/kern_clock.c 17 Jan 2012 16:10:10 -0000
> @@ -241,7 +241,11 @@
> if (stathz == 0)
> statclock(frame);
>
> - if (--ci->ci_schedstate.spc_rrticks <= 0)
> + /*
> + * If the currently running process went over its round robin tick quota,
> + * preempt it.
> + */
> + if (p && (--(p->p_rrticks) <= 0))
> roundrobin(ci);
>
> /*
> Index: kern/kern_fork.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_fork.c,v
> retrieving revision 1.133
> diff -u -r1.133 kern_fork.c
> --- kern/kern_fork.c 14 Dec 2011 07:32:16 -0000 1.133
> +++ kern/kern_fork.c 17 Jan 2012 16:10:10 -0000
> @@ -225,6 +225,9 @@
> atomic_setbits_int(&pr->ps_flags, PS_CONTROLT);
>
> p->p_p = pr;
> +
> + /* Pretend we preempted this new process */
> + generate_deadline(p, 0);
> }
>
> /* print the 'table full' message once per 10 seconds */
> @@ -485,7 +488,7 @@
> getmicrotime(&p->p_stats->p_start);
> p->p_acflag = AFORK;
> p->p_stat = SRUN;
> - p->p_cpu = sched_choosecpu_fork(curp, flags);
> + p->p_cpu = sched_choosecpu_fork(curp);
> setrunqueue(p);
> SCHED_UNLOCK(s);
>
> Index: kern/kern_proc.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_proc.c,v
> retrieving revision 1.47
> diff -u -r1.47 kern_proc.c
> --- kern/kern_proc.c 18 Sep 2011 23:20:54 -0000 1.47
> +++ kern/kern_proc.c 17 Jan 2012 16:10:10 -0000
> @@ -398,8 +398,6 @@
> p->p_comm, p->p_pid, pst, p->p_flag, P_BITS);
> (*pr)(" pri=%u, usrpri=%u, nice=%d\n",
> p->p_priority, p->p_usrpri, p->p_p->ps_nice);
> - (*pr)(" forw=%p, list=%p,%p\n",
> - TAILQ_NEXT(p, p_runq), p->p_list.le_next, p->p_list.le_prev);
> (*pr)(" process=%p user=%p, vmspace=%p\n",
> p->p_p, p->p_addr, p->p_vmspace);
> (*pr)(" estcpu=%u, cpticks=%d, pctcpu=%u.%u, swtime=%u\n",
> Index: kern/kern_sched.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_sched.c,v
> retrieving revision 1.24
> diff -u -r1.24 kern_sched.c
> --- kern/kern_sched.c 12 Oct 2011 18:30:09 -0000 1.24
> +++ kern/kern_sched.c 17 Jan 2012 16:10:10 -0000
> @@ -19,6 +19,7 @@
>
> #include <sys/sched.h>
> #include <sys/proc.h>
> +#include <sys/tree.h>
> #include <sys/kthread.h>
> #include <sys/systm.h>
> #include <sys/resourcevar.h>
> @@ -32,9 +33,11 @@
>
> void sched_kthreads_create(void *);
>
> -int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
> struct proc *sched_steal_proc(struct cpu_info *);
>
> +RB_HEAD(runqueue, proc) sched_runqueue;
> +RB_HEAD(expqueue, proc) sched_expqueue;
> +
> /*
> * To help choosing which cpu should run which process we keep track
> * of cpus which are currently idle and which cpus have processes
> @@ -61,6 +64,27 @@
> * interrupts during the context switch.
> */
>
> +static int
> +sched_cmp_proc(struct proc *a, struct proc *b) {
> + if (a == b)
> + return 0;
> + if (timercmp(&(a->p_deadline), &(b->p_deadline), <))
> + return -1;
> + return 1;
> +}
My comment on this code is included in the one on the function below.
> +
> +static int
> +sched_cmp_proc_exp(struct proc *a, struct proc *b) {
> + if (a == b)
> + return 0;
> + if (timercmp(&(a->p_deadline_set), &(b->p_deadline_set), <))
> + return -1;
> + return 1;
> +}
The above comparison will fail if a != b, but
a->p_deadline_set == b->p_deadline_set.
Your code hides this from view, because of the a==b test before it,
turning it into a rare occurrence. However when it hits during insert or
remove, you may find your tree getting corrupted and your invariants
getting violated.
What you want is a multi-set semantic, like this:
static int
sched_cmp_proc_exp(struct proc *a, struct proc *b)
{
    int cmp;

    if (timercmp(&(a->p_deadline_set), &(b->p_deadline_set), <))
        cmp = -1;
    else
        cmp = timercmp(&(a->p_deadline_set), &(b->p_deadline_set), >);
    /* Multiset behaviour: sort ties via identity. */
    if (cmp == 0)
        cmp = (a < b ? -1 : a > b);
    return cmp;
}
Note: the above breaks RB_FIND and RB_NFIND, since a search key other
than the node itself can never compare equal.
> +
> +RB_GENERATE_STATIC(runqueue, proc, p_runq, sched_cmp_proc);
> +RB_GENERATE_STATIC(expqueue, proc, p_expq, sched_cmp_proc_exp);
That's gonna add a lot of code.
Btw, even though the implementation works fine without it, I'd really
like it if you could put a matching RB_PROTOTYPE_STATIC before each
generator macro. Using GENERATE without PROTOTYPE is abuse of an
implementation detail.
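Roughly like this (a sketch, matching the names in your diff):

    RB_PROTOTYPE_STATIC(runqueue, proc, p_runq, sched_cmp_proc);
    RB_PROTOTYPE_STATIC(expqueue, proc, p_expq, sched_cmp_proc_exp);

    RB_GENERATE_STATIC(runqueue, proc, p_runq, sched_cmp_proc);
    RB_GENERATE_STATIC(expqueue, proc, p_expq, sched_cmp_proc_exp);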
> +
> /*
> * sched_init_cpu is called from main() for the boot cpu, then it's the
> * responsibility of the MD code to call it for all other cpus.
> @@ -69,10 +93,6 @@
> sched_init_cpu(struct cpu_info *ci)
> {
> struct schedstate_percpu *spc = &ci->ci_schedstate;
> - int i;
> -
> - for (i = 0; i < SCHED_NQS; i++)
> - TAILQ_INIT(&spc->spc_qs[i]);
>
> spc->spc_idleproc = NULL;
>
> @@ -106,6 +126,7 @@
> {
> struct schedstate_percpu *spc;
> struct proc *p = curproc;
> + struct proc *dead;
> struct cpu_info *ci = v;
> int s;
>
> @@ -120,7 +141,6 @@
> SCHED_LOCK(s);
> cpuset_add(&sched_idle_cpus, ci);
> p->p_stat = SSLEEP;
> - p->p_cpu = ci;
> atomic_setbits_int(&p->p_flag, P_CPUPEG);
> mi_switch();
> cpuset_del(&sched_idle_cpus, ci);
> @@ -130,38 +150,27 @@
> KASSERT(curproc == spc->spc_idleproc);
>
> while (1) {
> - while (!curcpu_is_idle()) {
> - struct proc *dead;
> -
> - SCHED_LOCK(s);
> - p->p_stat = SSLEEP;
> - mi_switch();
> - SCHED_UNLOCK(s);
> -
> - while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
> + while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
> LIST_REMOVE(dead, p_hash);
> exit2(dead);
> - }
> }
> + if ((spc->spc_schedflags & SPCF_SHOULDHALT) && !(spc->spc_schedflags & SPCF_HALTED))
> + atomic_setbits_int(&spc->spc_schedflags, SPCF_HALTED);
>
> splassert(IPL_NONE);
>
> cpuset_add(&sched_idle_cpus, ci);
> +
> cpu_idle_enter();
> - while (spc->spc_whichqs == 0) {
> - if (spc->spc_schedflags & SPCF_SHOULDHALT &&
> - (spc->spc_schedflags & SPCF_HALTED) == 0) {
> - cpuset_del(&sched_idle_cpus, ci);
> - SCHED_LOCK(s);
> - atomic_setbits_int(&spc->spc_schedflags,
> - spc->spc_whichqs ? 0 : SPCF_HALTED);
> - SCHED_UNLOCK(s);
> - wakeup(spc);
> - }
> - cpu_idle_cycle();
> - }
> + cpu_idle_cycle();
> cpu_idle_leave();
> +
> cpuset_del(&sched_idle_cpus, ci);
> +
> + SCHED_LOCK(s);
> + p->p_stat = SSLEEP;
> + mi_switch();
> + SCHED_UNLOCK(s);
> }
> }
>
> @@ -206,63 +215,35 @@
> void
> sched_init_runqueues(void)
> {
> + RB_INIT(&sched_runqueue);
> + RB_INIT(&sched_expqueue);
> }
>
> void
> setrunqueue(struct proc *p)
> {
> - struct schedstate_percpu *spc;
> - int queue = p->p_priority >> 2;
> -
> SCHED_ASSERT_LOCKED();
> - spc = &p->p_cpu->ci_schedstate;
> - spc->spc_nrun++;
> -
> - TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
> - spc->spc_whichqs |= (1 << queue);
> - cpuset_add(&sched_queued_cpus, p->p_cpu);
> -
> - if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
> - cpu_unidle(p->p_cpu);
> + RB_INSERT(runqueue, &sched_runqueue, p);
> }
>
> void
> remrunqueue(struct proc *p)
> {
> - struct schedstate_percpu *spc;
> - int queue = p->p_priority >> 2;
> -
> SCHED_ASSERT_LOCKED();
> - spc = &p->p_cpu->ci_schedstate;
> - spc->spc_nrun--;
> -
> - TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
> - if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
> - spc->spc_whichqs &= ~(1 << queue);
> - if (spc->spc_whichqs == 0)
> - cpuset_del(&sched_queued_cpus, p->p_cpu);
> - }
> + if (!(RB_REMOVE(runqueue, &sched_runqueue, p)))
Is it ok to call this function for a process not on the run queue?
Your code implies it is. If you don't intend that to happen, put a
KASSERT in.
> + RB_REMOVE(expqueue, &sched_expqueue, p);
Please check the return value here too. I take it the invariant is that
if a process is not on the runqueue, it must be on the expqueue?
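Something like this, perhaps (a sketch; it leans on the same assumption
your code already makes, namely that RB_REMOVE returns NULL for a proc
that isn't in the tree):

    struct proc *removed;

    removed = RB_REMOVE(runqueue, &sched_runqueue, p);
    if (removed == NULL)
        removed = RB_REMOVE(expqueue, &sched_expqueue, p);
    /* Side effect kept outside KASSERT: it vanishes without DIAGNOSTIC. */
    KASSERT(removed == p);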
> }
>
> struct proc *
> sched_chooseproc(void)
> {
> struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
> - struct proc *p;
> - int queue;
> + struct proc *p = NULL;
> + struct timeval now;
>
> SCHED_ASSERT_LOCKED();
>
> if (spc->spc_schedflags & SPCF_SHOULDHALT) {
> - if (spc->spc_whichqs) {
> - for (queue = 0; queue < SCHED_NQS; queue++) {
> - TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
> - remrunqueue(p);
> - p->p_cpu = sched_choosecpu(p);
> - setrunqueue(p);
> - }
> - }
> - }
> p = spc->spc_idleproc;
> KASSERT(p);
> KASSERT(p->p_wchan == NULL);
> @@ -270,16 +251,30 @@
> return (p);
> }
>
> -again:
> - if (spc->spc_whichqs) {
> - queue = ffs(spc->spc_whichqs) - 1;
> - p = TAILQ_FIRST(&spc->spc_qs[queue]);
> - remrunqueue(p);
> - KASSERT(p->p_stat == SRUN);
> - } else if ((p = sched_steal_proc(curcpu())) == NULL) {
> - p = spc->spc_idleproc;
> - if (p == NULL) {
> - int s;
> + if ((!RB_EMPTY(&sched_runqueue)) || (!RB_EMPTY(&sched_expqueue))) {
> + /*
> + * Move expired processes to the expired runqueue where they are
> + * sorted by the time they got their deadline set instead of the
> + * deadline itself.
> + */
> + microuptime(&now);
> + while(!RB_EMPTY(&sched_runqueue)) {
> + p = RB_MIN(runqueue, &sched_runqueue);
> + if (!p) break;
> + if (!timercmp(&(p->p_deadline), &now, <)) break;
> + RB_INSERT(expqueue, &sched_expqueue, p);
> + RB_REMOVE(runqueue, &sched_runqueue, p);
> + }
> +
> + if (!RB_EMPTY(&sched_expqueue)) {
> + p = RB_MIN(expqueue, &sched_expqueue);
> + } else {
> + p = RB_MIN(runqueue, &sched_runqueue);
> + }
The braces are not required here.
> + if (p)
> + remrunqueue(p);
> + } else {
> + while ((p = spc->spc_idleproc) == NULL) {
> + int s;
> /*
> * We get here if someone decides to switch during
> * boot before forking kthreads, bleh.
> @@ -291,13 +286,15 @@
> spl0();
> delay(10);
> SCHED_LOCK(s);
> - goto again;
> - }
> + }
> KASSERT(p);
> + }
> +
> + if (p) {
> p->p_stat = SRUN;
> + p->p_cpu = curcpu();
> }
>
> - KASSERT(p->p_wchan == NULL);
> return (p);
> }
>
> @@ -310,30 +307,13 @@
> uint64_t sched_nomigrations;
>
> struct cpu_info *
> -sched_choosecpu_fork(struct proc *parent, int flags)
> +sched_choosecpu_fork(struct proc *parent)
> {
> struct cpu_info *choice = NULL;
> fixpt_t load, best_load = ~0;
> - int run, best_run = INT_MAX;
> struct cpu_info *ci;
> struct cpuset set;
>
> -#if 0
> - /*
> - * XXX
> - * Don't do this until we have a painless way to move the cpu in exec.
> - * Preferably when nuking the old pmap and getting a new one on a
> - * new cpu.
> - */
> - /*
> - * PPWAIT forks are simple. We know that the parent will not
> - * run until we exec and choose another cpu, so we just steal its
> - * cpu.
> - */
> - if (flags & FORK_PPWAIT)
> - return (parent->p_cpu);
> -#endif
> -
> /*
> * Look at all cpus that are currently idle and have nothing queued.
> * If there are none, pick the one with least queued procs first,
> @@ -347,13 +327,10 @@
> cpuset_del(&set, ci);
>
> load = ci->ci_schedstate.spc_ldavg;
> - run = ci->ci_schedstate.spc_nrun;
>
> - if (choice == NULL || run < best_run ||
> - (run == best_run &&load < best_load)) {
> + if (choice == NULL || load < best_load) {
> choice = ci;
> best_load = load;
> - best_run = run;
> }
> }
>
> @@ -364,9 +341,6 @@
> sched_choosecpu(struct proc *p)
> {
> struct cpu_info *choice = NULL;
> - int last_cost = INT_MAX;
> - struct cpu_info *ci;
> - struct cpuset set;
>
> /*
> * If pegged to a cpu, don't allow it to move.
> @@ -374,169 +348,19 @@
> if (p->p_flag & P_CPUPEG)
> return (p->p_cpu);
>
> - sched_choose++;
> -
> - /*
> - * Look at all cpus that are currently idle and have nothing queued.
> - * If there are none, pick the cheapest of those.
> - * (idle + queued could mean that the cpu is handling an interrupt
> - * at this moment and haven't had time to leave idle yet).
> - */
> - cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
> -
> /*
> - * First, just check if our current cpu is in that set, if it is,
> - * this is simple.
> - * Also, our cpu might not be idle, but if it's the current cpu
> - * and it has nothing else queued and we're curproc, take it.
> + * Else, just pretend we forked a new process.
> */
> - if (cpuset_isset(&set, p->p_cpu) ||
> - (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
> - curproc == p)) {
> - sched_wasidle++;
> - return (p->p_cpu);
> - }
> -
> - if (cpuset_first(&set) == NULL)
> - cpuset_copy(&set, &sched_all_cpus);
> -
> - while ((ci = cpuset_first(&set)) != NULL) {
> - int cost = sched_proc_to_cpu_cost(ci, p);
> + sched_choose++;
>
> - if (choice == NULL || cost < last_cost) {
> - choice = ci;
> - last_cost = cost;
> - }
> - cpuset_del(&set, ci);
> - }
> + choice = sched_choosecpu_fork(p);
>
> if (p->p_cpu != choice)
> sched_nmigrations++;
> else
> sched_nomigrations++;
>
> - return (choice);
> -}
> -
> -/*
> - * Attempt to steal a proc from some cpu.
> - */
> -struct proc *
> -sched_steal_proc(struct cpu_info *self)
> -{
> - struct schedstate_percpu *spc;
> - struct proc *best = NULL;
> - int bestcost = INT_MAX;
> - struct cpu_info *ci;
> - struct cpuset set;
> -
> - cpuset_copy(&set, &sched_queued_cpus);
> -
> - while ((ci = cpuset_first(&set)) != NULL) {
> - struct proc *p;
> - int queue;
> - int cost;
> -
> - cpuset_del(&set, ci);
> -
> - spc = &ci->ci_schedstate;
> -
> - queue = ffs(spc->spc_whichqs) - 1;
> - TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
> - if (p->p_flag & P_CPUPEG)
> - continue;
> -
> - cost = sched_proc_to_cpu_cost(self, p);
> -
> - if (best == NULL || cost < bestcost) {
> - best = p;
> - bestcost = cost;
> - }
> - }
> - }
> - if (best == NULL)
> - return (NULL);
> -
> - spc = &best->p_cpu->ci_schedstate;
> - remrunqueue(best);
> - best->p_cpu = self;
> -
> - sched_stolen++;
> -
> - return (best);
> -}
> -
> -/*
> - * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know).
> - */
> -static int
> -log2(unsigned int i)
> -{
> - int ret = 0;
> -
> - while (i >>= 1)
> - ret++;
> -
> - return (ret);
> -}
> -
> -/*
> - * Calculate the cost of moving the proc to this cpu.
> - *
> - * What we want is some guesstimate of how much "performance" it will
> - * cost us to move the proc here. Not just for caches and TLBs and NUMA
> - * memory, but also for the proc itself. A highly loaded cpu might not
> - * be the best candidate for this proc since it won't get run.
> - *
> - * Just total guesstimates for now.
> - */
> -
> -int sched_cost_load = 1;
> -int sched_cost_priority = 1;
> -int sched_cost_runnable = 3;
> -int sched_cost_resident = 1;
> -
> -int
> -sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
> -{
> - struct schedstate_percpu *spc;
> - int l2resident = 0;
> - int cost;
> -
> - spc = &ci->ci_schedstate;
> -
> - cost = 0;
> -
> - /*
> - * First, account for the priority of the proc we want to move.
> - * More willing to move, the lower the priority of the destination
> - * and the higher the priority of the proc.
> - */
> - if (!cpuset_isset(&sched_idle_cpus, ci)) {
> - cost += (p->p_priority - spc->spc_curpriority) *
> - sched_cost_priority;
> - cost += sched_cost_runnable;
> - }
> - if (cpuset_isset(&sched_queued_cpus, ci)) {
> - cost += spc->spc_nrun * sched_cost_runnable;
> - }
> -
> - /*
> - * Higher load on the destination means we don't want to go there.
> - */
> - cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);
> -
> - /*
> - * If the proc is on this cpu already, lower the cost by how much
> - * it has been running and an estimate of its footprint.
> - */
> - if (p->p_cpu == ci && p->p_slptime == 0) {
> - l2resident =
> - log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
> - cost -= l2resident * sched_cost_resident;
> - }
> -
> - return (cost);
> + return choice;
> }
>
> /*
> @@ -560,7 +384,6 @@
> }
>
> #ifdef MULTIPROCESSOR
> -
> void
> sched_start_secondary_cpus(void)
> {
> @@ -583,6 +406,9 @@
> {
> CPU_INFO_ITERATOR cii;
> struct cpu_info *ci;
> + int s;
> +
> + SCHED_LOCK(s);
>
> /*
> * Make sure we stop the secondary CPUs.
> @@ -601,14 +427,17 @@
>
> if (CPU_IS_PRIMARY(ci))
> continue;
> - while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
> +
> + SCHED_UNLOCK(s);
> + while (!(spc->spc_schedflags & SPCF_HALTED)) {
> sleep_setup(&sls, spc, PZERO, "schedstate");
> - sleep_finish(&sls,
> - (spc->spc_schedflags & SPCF_HALTED) == 0);
> + sleep_setup_timeout(&sls, 10);
> + sleep_finish(&sls, 1);
> }
> + SCHED_LOCK(s);
> }
> + SCHED_UNLOCK(s);
> }
> -
> #endif
>
> /*
> Index: kern/kern_synch.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_synch.c,v
> retrieving revision 1.99
> diff -u -r1.99 kern_synch.c
> --- kern/kern_synch.c 17 Jan 2012 02:34:18 -0000 1.99
> +++ kern/kern_synch.c 17 Jan 2012 16:10:10 -0000
> @@ -205,7 +205,7 @@
> p->p_wmesg = wmesg;
> p->p_slptime = 0;
> p->p_priority = prio & PRIMASK;
> - TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_runq);
> + TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
> }
>
> void
> @@ -342,7 +342,7 @@
> unsleep(struct proc *p)
> {
> if (p->p_wchan) {
> - TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_runq);
> + TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_slpq);
> p->p_wchan = NULL;
> }
> }
> @@ -361,7 +361,7 @@
> SCHED_LOCK(s);
> qp = &slpque[LOOKUP(ident)];
> for (p = TAILQ_FIRST(qp); p != NULL && n != 0; p = pnext) {
> - pnext = TAILQ_NEXT(p, p_runq);
> + pnext = TAILQ_NEXT(p, p_slpq);
> #ifdef DIAGNOSTIC
> if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
> panic("wakeup: p_stat is %d", (int)p->p_stat);
> @@ -369,7 +369,7 @@
> if (p->p_wchan == ident) {
> --n;
> p->p_wchan = 0;
> - TAILQ_REMOVE(qp, p, p_runq);
> + TAILQ_REMOVE(qp, p, p_slpq);
> if (p->p_stat == SSLEEP) {
> /* OPTIMIZED EXPANSION OF setrunnable(p); */
> if (p->p_slptime > 1)
> Index: kern/sched_bsd.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/sched_bsd.c,v
> retrieving revision 1.27
> diff -u -r1.27 sched_bsd.c
> --- kern/sched_bsd.c 7 Jul 2011 18:00:33 -0000 1.27
> +++ kern/sched_bsd.c 17 Jan 2012 16:10:10 -0000
> @@ -77,37 +77,23 @@
>
> timeout_set(&schedcpu_to, schedcpu, &schedcpu_to);
>
> - rrticks_init = hz / 10;
> + rrticks_init = hz / 50;
> schedcpu(&schedcpu_to);
> }
>
> /*
> - * Force switch among equal priority processes every 100ms.
> + * Force switch among equal priority processes every 20ms.
> */
> void
> roundrobin(struct cpu_info *ci)
> {
> struct schedstate_percpu *spc = &ci->ci_schedstate;
>
> - spc->spc_rrticks = rrticks_init;
> -
> if (ci->ci_curproc != NULL) {
> - if (spc->spc_schedflags & SPCF_SEENRR) {
> - /*
> - * The process has already been through a roundrobin
> - * without switching and may be hogging the CPU.
> - * Indicate that the process should yield.
> - */
> - atomic_setbits_int(&spc->spc_schedflags,
> - SPCF_SHOULDYIELD);
> - } else {
> - atomic_setbits_int(&spc->spc_schedflags,
> - SPCF_SEENRR);
> - }
> + atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDYIELD);
> }
>
> - if (spc->spc_nrun)
> - need_resched(ci);
> + need_resched(ci);
> }
>
> /*
> @@ -204,7 +190,6 @@
> struct timeout *to = (struct timeout *)arg;
> fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
> struct proc *p;
> - int s;
> unsigned int newcpu;
> int phz;
>
> @@ -233,7 +218,6 @@
> */
> if (p->p_slptime > 1)
> continue;
> - SCHED_LOCK(s);
> /*
> * p_pctcpu is only for ps.
> */
> @@ -250,17 +234,6 @@
> newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
> p->p_estcpu = newcpu;
> resetpriority(p);
> - if (p->p_priority >= PUSER) {
> - if (p->p_stat == SRUN &&
> - (p->p_priority / SCHED_PPQ) !=
> - (p->p_usrpri / SCHED_PPQ)) {
> - remrunqueue(p);
> - p->p_priority = p->p_usrpri;
> - setrunqueue(p);
> - } else
> - p->p_priority = p->p_usrpri;
> - }
> - SCHED_UNLOCK(s);
> }
> uvm_meter();
> wakeup(&lbolt);
> @@ -278,8 +251,6 @@
> unsigned int newcpu = p->p_estcpu;
> fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
>
> - SCHED_ASSERT_LOCKED();
> -
> if (p->p_slptime > 5 * loadfac)
> p->p_estcpu = 0;
> else {
> @@ -304,6 +275,7 @@
> SCHED_LOCK(s);
> p->p_priority = p->p_usrpri;
> p->p_stat = SRUN;
> + generate_deadline(p, 1);
> setrunqueue(p);
> p->p_stats->p_ru.ru_nvcsw++;
> mi_switch();
> @@ -332,6 +304,7 @@
> p->p_priority = p->p_usrpri;
> p->p_stat = SRUN;
> p->p_cpu = sched_choosecpu(p);
> + generate_deadline(p, 0);
> setrunqueue(p);
> p->p_stats->p_ru.ru_nivcsw++;
> mi_switch();
> @@ -372,14 +345,7 @@
> * process was running, and add that to its total so far.
> */
> microuptime(&tv);
> - if (timercmp(&tv, &spc->spc_runtime, <)) {
> -#if 0
> - printf("uptime is not monotonic! "
> - "tv=%lu.%06lu, runtime=%lu.%06lu\n",
> - tv.tv_sec, tv.tv_usec, spc->spc_runtime.tv_sec,
> - spc->spc_runtime.tv_usec);
> -#endif
> - } else {
> + if (timercmp(&tv, &spc->spc_runtime, >=)) {
> timersub(&tv, &spc->spc_runtime, &tv);
> timeradd(&p->p_rtime, &tv, &p->p_rtime);
> }
> @@ -403,7 +369,7 @@
> * Process is about to yield the CPU; clear the appropriate
> * scheduling flags.
> */
> - atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);
> + atomic_clearbits_int(&spc->spc_schedflags, SPCF_SHOULDYIELD);
>
> nextproc = sched_chooseproc();
>
> @@ -435,9 +401,8 @@
> * be running on a new CPU now, so don't use the cache'd
> * schedstate_percpu pointer.
> */
> - KASSERT(p->p_cpu == curcpu());
> -
> - microuptime(&p->p_cpu->ci_schedstate.spc_runtime);
> + if (p->p_cpu == curcpu())
> + microuptime(&p->p_cpu->ci_schedstate.spc_runtime);
>
> #ifdef MULTIPROCESSOR
> /*
> @@ -515,30 +480,56 @@
> }
> p->p_stat = SRUN;
> p->p_cpu = sched_choosecpu(p);
> - setrunqueue(p);
> if (p->p_slptime > 1)
> updatepri(p);
> + setrunqueue(p);
> p->p_slptime = 0;
> resched_proc(p, p->p_priority);
> }
>
> /*
> * Compute the priority of a process when running in user mode.
> - * Arrange to reschedule if the resulting priority is better
> - * than that of the current process.
> */
> void
> resetpriority(struct proc *p)
> {
> - unsigned int newpriority;
> -
> - SCHED_ASSERT_LOCKED();
> + p->p_usrpri = min(((NZERO + (p->p_p->ps_nice))) << 1, MAXPRI);
Nice levels deserve a look all on their own. The current semantics are
very 4BSD; they were great when processes moved to disk on a regular
basis, but they're utterly confusing nowadays.
However, you probably don't want to change too much in a single diff.
> +}
>
> - newpriority = PUSER + p->p_estcpu +
> - NICE_WEIGHT * (p->p_p->ps_nice - NZERO);
> - newpriority = min(newpriority, MAXPRI);
> - p->p_usrpri = newpriority;
> - resched_proc(p, p->p_usrpri);
> +/*
> + * Generate a new virtual deadline for a process. The deadline is a soft
> + * one and has no purpose besides being used for choosing the next process
> + * to run. Also resets the number of round robin ticks available to the
> + * process if the previous timeslice expired and the process had to
> + * be preempted.
> + */
> +void
> +generate_deadline(struct proc *p, char voluntary) {
> + /*
> + * For nice values between 0 and 39 inclusively, the offset lies
> + * between 32 and 1280 milliseconds for a machine with hz=100. That
> + * means that processes with nice value=0 (i.e. -20 in userland) will
> + * be executed 32 milliseconds in the future at the latest. Processes
> + * with very little priority will be executed 1.28 seconds in the
> + * future at the very latest. The shift is done to ensure that the
> + * lowest possible offset is larger than the timeslice, in order to
> + * make sure that the scheduler does not degenerate to round robin
> + * behaviour when more than just a few processes with high priority
> + * are started.
> + *
> + * If the process voluntarily yielded its CPU, we reward it by halving
> + * its deadline offset.
> + */
> + unsigned int offset_msec = ((p->p_p->ps_nice + 1) * rrticks_init) << (voluntary ? 2 : 3);
> + struct timeval offset = {
> + .tv_sec = offset_msec / 1000,
> + .tv_usec = offset_msec % 1000
tv_usec is measured in microseconds, while offset_msec is in
milliseconds, so the remainder needs converting. I'd change that to:
.tv_usec = (offset_msec % 1000) * 1000
> + };
> + struct timeval now;
> + microuptime(&now);
> + bcopy(&now, &(p->p_deadline_set), sizeof(struct timeval));
> +
> + timeradd(&now, &offset, &(p->p_deadline));
> + if (!voluntary)
> + p->p_rrticks = rrticks_init;
> }
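A quick sanity check on those numbers, unless I'm misreading: with
hz=100 this diff sets rrticks_init = hz / 50 = 2, so an involuntary
switch gives offset_msec = ((nice + 1) * 2) << 3, i.e. 16 ms at nice 0
and 640 ms at nice 39 (half that when voluntary). The 32-1280 ms range
in the comment matches rrticks_init = 4 (hz / 25), so either the comment
or the rrticks_init derivation looks stale.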
>
> /*
> @@ -559,12 +550,8 @@
> void
> schedclock(struct proc *p)
> {
> - int s;
> -
> - SCHED_LOCK(s);
> p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
> resetpriority(p);
> if (p->p_priority >= PUSER)
> p->p_priority = p->p_usrpri;
> - SCHED_UNLOCK(s);
> }
I really like the deadlines. They convey intent very well.
--
Ariane