Hi, I haven't chased down the exact scenario, but using a min_vruntime that is about to change definitely seems to be what's causing my latency woes. Does the patch below cure your fairness woes as well?
(first bit is just some debug format changes for your convenience if you try it) diff -uprNX /root/dontdiff git/linux-2.6.sched-devel/kernel/sched_debug.c linux-2.6.23-rc7.d/kernel/sched_debug.c --- git/linux-2.6.sched-devel/kernel/sched_debug.c 2007-09-22 13:37:32.000000000 +0200 +++ linux-2.6.23-rc7.d/kernel/sched_debug.c 2007-09-21 12:12:27.000000000 +0200 @@ -67,7 +67,7 @@ print_task(struct seq_file *m, struct rq (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", SPLIT_NS(p->se.vruntime), SPLIT_NS(p->se.sum_exec_runtime), SPLIT_NS(p->se.sum_sleep_runtime)); @@ -83,10 +83,10 @@ static void print_rq(struct seq_file *m, SEQ_printf(m, "\nrunnable tasks:\n" - " task PID tree-key switches prio" - " exec-runtime sum-exec sum-sleep\n" + " task PID tree-key switches prio" + " exec-runtime sum-exec sum-sleep\n" "------------------------------------------------------" - "------------------------------------------------"); + "----------------------------------------------------\n"); read_lock_irq(&tasklist_lock); @@ -134,7 +134,7 @@ void print_cfs_rq(struct seq_file *m, in spread0 = min_vruntime - rq0_min_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", SPLIT_NS(spread0)); - SEQ_printf(m, " .%-30s: %ld\n", "spread0", + SEQ_printf(m, " .%-30s: %ld\n", "nr_sync_min_vruntime", cfs_rq->nr_sync_min_vruntime); } diff -uprNX /root/dontdiff git/linux-2.6.sched-devel/kernel/sched_fair.c linux-2.6.23-rc7.d/kernel/sched_fair.c --- git/linux-2.6.sched-devel/kernel/sched_fair.c 2007-09-22 13:37:32.000000000 +0200 +++ linux-2.6.23-rc7.d/kernel/sched_fair.c 2007-09-23 12:53:11.000000000 +0200 @@ -290,14 +290,19 @@ __update_curr(struct cfs_rq *cfs_rq, str static void sync_vruntime(struct cfs_rq *cfs_rq) { struct rq *rq; - int cpu; + int cpu, wrote = 0; for_each_online_cpu(cpu) { rq = &per_cpu(runqueues, cpu); + if (!spin_trylock(&rq->lock)) + 
continue; cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, rq->cfs.min_vruntime); + spin_unlock(&rq->lock); + wrote++; } - schedstat_inc(cfs_rq, nr_sync_min_vruntime); + if (wrote) + schedstat_inc(cfs_rq, nr_sync_min_vruntime); } static void update_curr(struct cfs_rq *cfs_rq) @@ -306,8 +311,10 @@ static void update_curr(struct cfs_rq *c u64 now = rq_of(cfs_rq)->clock; unsigned long delta_exec; - if (unlikely(!curr)) + if (unlikely(!cfs_rq->nr_running)) return sync_vruntime(cfs_rq); + if (unlikely(!curr)) + return; /* * Get the amount of time the current task was running - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/