On Sun, 4 Nov 2007, Josh Carroll wrote:
>> Josh, thanks for your help so far. This has been very useful.
> You're welcome, glad to help! Thanks for the effort and the patch.
>> Any testing you can run this through is appreciated. Anyone else lurking
>> in this thread who would like to is also welcome to report back findings.
> Here are a few benchmarks comparing ULE and the patched ULE. I
> experimented with changing the slice_min value from 2 to 4, in case
> that might be useful info for you. Hopefully that helps a bit, but if
> not it's just a few minutes of CPU time wasted :)
Josh, I included one too many changes in the diff and it made the results
ambiguous. I've scaled it back slightly by removing the changes to
sched_pickcpu() and included the patch in this email again. Can you run
through your tests once more? I'd like to commit this part soon, as it
helps in other cases. I'm most interested in the buildworld numbers for
now.
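
To make it easier to see what the remaining change does: instead of a
fixed sched_slice, each tdq now sums the interactivity scores of its
queued threads into tdq_interload, and tdq_slice() shrinks the slice
from sched_slice_max toward sched_slice_min as that load grows. Here is
a rough userland sketch of just that calculation -- illustration only,
not the kernel code; slice_for_interload() is a made-up name, and
stathz is assumed to be 128 here, giving a max/min of 8 and 2 ticks:

#include <stdio.h>

static int slice_max = 128 / 15;        /* realstathz / 15, ~66ms */
static int slice_min = 128 / 50;        /* realstathz / 50, ~20ms */

static int
slice_for_interload(int interload)
{
        int load, slice;

        if (interload == 0)
                return (slice_max);
        /* An interload of 100 is roughly one fully interactive cpu. */
        load = (interload + 99) / 100;
        slice = slice_max / load;
        if (slice < slice_min)
                slice = slice_min;
        return (slice);
}

int
main(void)
{
        int i;

        /* Print the slice for a few interactive load levels. */
        for (i = 0; i <= 500; i += 100)
                printf("interload %3d -> slice %d ticks\n",
                    i, slice_for_interload(i));
        return (0);
}

So with no interactive load threads get the full maximum slice, and the
slice shrinks toward the minimum as interactive work stacks up on a cpu.
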
Thanks,
Jeff
> Sysbench results:
> # threads  slice=7  slice=13  slice_min=4  slice_min=2
>         4  2265.67   2250.36      2261.71      2297.08
>         8  2300.25   2310.02      2306.79      2313.61
>        12  2269.54   2304.04      2296.54      2279.73
>        16  2249.26   2252.04      2260.53      2245.76
> It looks like with the default minimum (2), sysbench performance is
> better with 4 and 8 threads (on a 4 core system), but worse with 12
> and 16 threads.
> Here are the results for ffmpeg (-threads 8):
>   slice=7  slice=13  slice_min=4  slice_min=2
>   1:37.00   1:39.09      1:38.12      1:38.06
> The patch definitely improves things there, though not quite as much
> as using a slice value of 7. So the patch slightly improves ffmpeg and
> also slightly increases sysbench/MySQL performance (with 8 threads).
> I also ran through buildworld for both slice_min of 2 and 4, and here
> are the results, again with ULE as a baseline:
>   slice=7  slice=13  slice_min=4  slice_min=2
>  13:40.56  13:44.28     13:46.64     13:45.80
> So buildworld performance is about the same as with the default ULE
> and default slice value.
> Thanks,
> Josh
Index: sched_ule.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/sched_ule.c,v
retrieving revision 1.216
diff -u -r1.216 sched_ule.c
--- sched_ule.c 23 Oct 2007 00:52:24 -0000 1.216
+++ sched_ule.c 3 Nov 2007 21:36:27 -0000
@@ -88,7 +88,8 @@
short ts_flags; /* TSF_* flags. */
u_char ts_rqindex; /* Run queue index. */
u_char ts_cpu; /* CPU that we have affinity for. */
- int ts_slice; /* Ticks of slice remaining. */
+ int ts_slice; /* Ticks of slice used. */
+ int ts_score; /* Interactivity score. */
u_int ts_slptime; /* Number of ticks we vol. slept */
u_int ts_runtime; /* Number of ticks we were running */
/* The following variables are only used for pctcpu calculation */
@@ -102,6 +103,7 @@
/* flags kept in ts_flags */
#define TSF_BOUND 0x0001 /* Thread can not migrate. */
#define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */
+#define TSF_INTERLOAD 0x0004 /* Interactive load on runq. */
static struct td_sched td_sched0;
@@ -167,13 +169,15 @@
* the shift factor. Without the shift the error rate
* due to rounding would be unacceptably high.
* realstathz: stathz is sometimes 0 and run off of hz.
- * sched_slice: Runtime of each thread before rescheduling.
+ * sched_slice_max: Maximum runtime of each thread before rescheduling.
+ * sched_slice_min: Minimum runtime of each thread before rescheduling.
* preempt_thresh: Priority threshold for preemption and remote IPIs.
*/
static int sched_interact = SCHED_INTERACT_THRESH;
static int realstathz;
static int tickincr;
-static int sched_slice;
+static int sched_slice_max = 1;
+static int sched_slice_min = 1;
#ifdef PREEMPTION
#ifdef FULL_PREEMPTION
static int preempt_thresh = PRI_MAX_IDLE;
@@ -194,6 +198,7 @@
struct runq tdq_realtime; /* real-time run queue. */
struct runq tdq_timeshare; /* timeshare run queue. */
struct runq tdq_idle; /* Queue of IDLE threads. */
+ unsigned int tdq_interload; /* Interactive load. */
int tdq_load; /* Aggregate load. */
u_char tdq_idx; /* Current insert index. */
u_char tdq_ridx; /* Current removal index. */
@@ -239,7 +244,6 @@
static int balance_interval = 128; /* Default set in sched_initticks(). */
static int pick_pri = 1;
static int affinity;
-static int tryself = 1;
static int steal_htt = 1;
static int steal_idle = 1;
static int steal_thresh = 2;
@@ -288,10 +292,12 @@
static void tdq_setup(struct tdq *);
static void tdq_load_add(struct tdq *, struct td_sched *);
static void tdq_load_rem(struct tdq *, struct td_sched *);
+static int tdq_slice(struct tdq *);
static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
-void tdq_print(int cpu);
-static void runq_print(struct runq *rq);
+void tdq_print(int);
+void sched_print(struct thread *);
+static void runq_print(struct runq *);
static void tdq_add(struct tdq *, struct thread *, int);
#ifdef SMP
static void tdq_move(struct tdq *, struct tdq *);
@@ -345,6 +351,26 @@
}
}
+void
+sched_print(struct thread *td)
+{
+ struct td_sched *ts;
+
+ if (td == NULL)
+ td = curthread;
+ ts = td->td_sched;
+ printf("flags: 0x%X\n", ts->ts_flags);
+ printf("rqindex: %d\n", ts->ts_rqindex);
+ printf("cpu: %d\n", ts->ts_cpu);
+ printf("slice: %d\n", ts->ts_slice);
+ printf("score: %d\n", ts->ts_score);
+ printf("slptime: %d\n", ts->ts_slptime);
+ printf("runtime: %d\n", ts->ts_runtime);
+ printf("ltick: %d\n", ts->ts_ltick);
+ printf("ftick: %d\n", ts->ts_ftick);
+ printf("ticks: %d\n", ts->ts_ticks);
+}
+
/*
* Print the status of a per-cpu thread queue. Should be a ddb show cmd.
*/
@@ -357,7 +383,9 @@
printf("tdq %d:\n", TDQ_ID(tdq));
printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq));
+ printf("\tinterload: %d\n", tdq->tdq_interload);
printf("\tload: %d\n", tdq->tdq_load);
+ printf("\tslice: %d\n", tdq_slice(tdq));
printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
printf("\trealtime runq:\n");
@@ -383,8 +411,12 @@
static __inline void
tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
{
+ u_char pri;
+
+ pri = ts->ts_thread->td_priority;
TDQ_LOCK_ASSERT(tdq, MA_OWNED);
THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
+ TD_SET_RUNQ(ts->ts_thread);
#ifdef SMP
if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
tdq->tdq_transferable++;
@@ -392,15 +424,15 @@
ts->ts_flags |= TSF_XFERABLE;
}
#endif
- if (ts->ts_runq == &tdq->tdq_timeshare) {
- u_char pri;
-
- pri = ts->ts_thread->td_priority;
+ if (pri <= PRI_MAX_REALTIME) {
+ ts->ts_runq = &tdq->tdq_realtime;
+ } else if (pri <= PRI_MAX_TIMESHARE) {
+ ts->ts_runq = &tdq->tdq_timeshare;
KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
("Invalid priority %d on timeshare runq", pri));
/*
* This queue contains only priorities between MIN and MAX
- * realtime. Use the whole queue to represent these values.
+ * timeshare. Use the whole queue to represent these values.
*/
if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
@@ -416,8 +448,10 @@
} else
pri = tdq->tdq_ridx;
runq_add_pri(ts->ts_runq, ts, pri, flags);
+ return;
} else
- runq_add(ts->ts_runq, ts, flags);
+ ts->ts_runq = &tdq->tdq_idle;
+ runq_add(ts->ts_runq, ts, flags);
}
/*
@@ -443,13 +477,6 @@
runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx);
else
runq_remove_idx(ts->ts_runq, ts, NULL);
- /*
- * For timeshare threads we update the priority here so
- * the priority reflects the time we've been sleeping.
- */
- ts->ts_ltick = ticks;
- sched_pctcpu_update(ts);
- sched_priority(ts->ts_thread);
} else
runq_remove(ts->ts_runq, ts);
}
@@ -466,6 +493,8 @@
TDQ_LOCK_ASSERT(tdq, MA_OWNED);
THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
class = PRI_BASE(ts->ts_thread->td_pri_class);
+ tdq->tdq_interload += ts->ts_score;
+ ts->ts_flags |= TSF_INTERLOAD;
tdq->tdq_load++;
CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load);
if (class != PRI_ITHD &&
@@ -498,9 +527,37 @@
#endif
KASSERT(tdq->tdq_load != 0,
("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
+ ts->ts_flags &= ~TSF_INTERLOAD;
+ ts->ts_runq = NULL;
+ tdq->tdq_interload -= ts->ts_score;
tdq->tdq_load--;
CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
- ts->ts_runq = NULL;
+}
+
+/*
+ * Compute the maximum slice when the interload changes. This gives a soft
+ * upper bound on latency as the load increases.
+ */
+static int
+tdq_slice(struct tdq *tdq)
+{
+ int slice;
+ int load;
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ if (tdq->tdq_interload == 0)
+ return (sched_slice_max);
+ /*
+ * An interload of 100 is roughly equivalent to 100% cpu utilization
+ * requested. Calculate how many times overloaded we are and then
+ * divide the latency target by this number. None of this is precise
+ * but it does yield decreasing slice values within the [min, max]
+ * range as load increases.
+ */
+ load = (tdq->tdq_interload + 99) / 100;
+ slice = sched_slice_max / load;
+ slice = max(slice, sched_slice_min);
+ return (slice);
}
#ifdef SMP
@@ -1070,14 +1127,6 @@
cpu = self = PCPU_GET(cpuid);
if (smp_started == 0)
return (self);
- /*
- * Don't migrate a running thread from sched_switch().
- */
- if (flags & SRQ_OURSELF) {
- CTR1(KTR_ULE, "YIELDING %d",
- curthread->td_priority);
- return (self);
- }
pri = ts->ts_thread->td_priority;
cpu = ts->ts_cpu;
/*
@@ -1175,6 +1225,7 @@
runq_init(&tdq->tdq_timeshare);
runq_init(&tdq->tdq_idle);
tdq->tdq_load = 0;
+ tdq->tdq_interload = 0;
}
#ifdef SMP
@@ -1324,12 +1375,12 @@
* in case which sched_clock() called before sched_initticks().
*/
realstathz = hz;
- sched_slice = (realstathz/10); /* ~100ms */
tickincr = 1 << SCHED_TICK_SHIFT;
/* Add thread0's load since it's running. */
TDQ_LOCK(tdq);
thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
+ td_sched0.ts_score = 0;
tdq_load_add(tdq, &td_sched0);
TDQ_UNLOCK(tdq);
}
@@ -1344,7 +1395,8 @@
int incr;
realstathz = stathz ? stathz : hz;
- sched_slice = (realstathz/10); /* ~100ms */
+ sched_slice_max = realstathz / 15; /* ~66ms */
+ sched_slice_min = realstathz / 50; /* ~20ms */
/*
* tickincr is shifted out by 10 to avoid rounding errors due to
@@ -1374,7 +1426,6 @@
#endif
}
-
/*
* This is the core of the interactivity algorithm. Determines a score based
* on past behavior. It is the ratio of sleep time to run time scaled to
@@ -1389,15 +1440,6 @@
int div;
ts = td->td_sched;
- /*
- * The score is only needed if this is likely to be an interactive
- * task. Don't go through the expense of computing it if there's
- * no chance.
- */
- if (sched_interact <= SCHED_INTERACT_HALF &&
- ts->ts_runtime >= ts->ts_slptime)
- return (SCHED_INTERACT_HALF);
-
if (ts->ts_runtime > ts->ts_slptime) {
div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
return (SCHED_INTERACT_HALF +
@@ -1443,7 +1485,7 @@
* score. Negative nice values make it easier for a thread to be
* considered interactive.
*/
- score = imax(0, sched_interact_score(td) - td->td_proc->p_nice);
+ score = imax(0, td->td_sched->ts_score - td->td_proc->p_nice);
if (score < sched_interact) {
pri = PRI_MIN_REALTIME;
pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
@@ -1477,12 +1519,15 @@
sched_interact_update(struct thread *td)
{
struct td_sched *ts;
+ struct tdq *tdq;
u_int sum;
+ int score;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td->td_sched;
sum = ts->ts_runtime + ts->ts_slptime;
if (sum < SCHED_SLP_RUN_MAX)
- return;
+ goto score;
/*
* This only happens from two places:
* 1) We have added an unusual amount of run time from fork_exit.
@@ -1490,13 +1535,13 @@
*/
if (sum > SCHED_SLP_RUN_MAX * 2) {
if (ts->ts_runtime > ts->ts_slptime) {
- ts->ts_runtime = SCHED_SLP_RUN_MAX;
+ ts->ts_runtime = SCHED_SLP_RUN_MAX / 2;
ts->ts_slptime = 1;
} else {
- ts->ts_slptime = SCHED_SLP_RUN_MAX;
+ ts->ts_slptime = SCHED_SLP_RUN_MAX / 2;
ts->ts_runtime = 1;
}
- return;
+ goto score;
}
/*
* If we have exceeded by more than 1/5th then the algorithm below
@@ -1506,10 +1551,19 @@
if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
ts->ts_runtime /= 2;
ts->ts_slptime /= 2;
- return;
+ goto score;
}
ts->ts_runtime = (ts->ts_runtime / 5) * 4;
ts->ts_slptime = (ts->ts_slptime / 5) * 4;
+score:
+ score = sched_interact_score(td);
+ if (ts->ts_flags & TSF_INTERLOAD) {
+ tdq = TDQ_CPU(ts->ts_cpu);
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ tdq->tdq_interload -= ts->ts_score;
+ tdq->tdq_interload += score;
+ }
+ ts->ts_score = score;
}
/*
@@ -1559,7 +1613,7 @@
{
/* Convert sched_slice to hz */
- return (hz/(realstathz/sched_slice));
+ return (hz/(realstathz/sched_slice_max));
}
/*
@@ -1598,16 +1652,19 @@
sched_thread_priority(struct thread *td, u_char prio)
{
struct td_sched *ts;
+ struct tdq *tdq;
CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, prio, curthread,
curthread->td_proc->p_comm);
ts = td->td_sched;
+ tdq = TDQ_CPU(ts->ts_cpu);
THREAD_LOCK_ASSERT(td, MA_OWNED);
- if (td->td_priority == prio)
+ if (td->td_priority <= prio) {
+ td->td_priority = prio;
return;
-
- if (TD_ON_RUNQ(td) && prio < td->td_priority) {
+ }
+ if (TD_ON_RUNQ(td)) {
/*
* If the priority has been elevated due to priority
* propagation, we may have to move ourselves to a new
@@ -1617,16 +1674,14 @@
sched_rem(td);
td->td_priority = prio;
sched_add(td, SRQ_BORROWING);
- } else {
#ifdef SMP
- struct tdq *tdq;
-
- tdq = TDQ_CPU(ts->ts_cpu);
+ } else if (TD_IS_RUNNING(td)) {
if (prio < tdq->tdq_lowpri)
tdq->tdq_lowpri = prio;
+ td->td_priority = prio;
#endif
+ } else
td->td_priority = prio;
- }
}
/*
@@ -1772,6 +1827,8 @@
tdn = TDQ_CPU(td->td_sched->ts_cpu);
#ifdef SMP
+ /* The load is being removed from the current cpu. */
+ tdq_load_rem(tdq, td->td_sched);
/*
* Do the lock dance required to avoid LOR. We grab an extra
* spinlock nesting to prevent preemption while we're
@@ -1863,12 +1920,11 @@
TD_SET_CAN_RUN(td);
} else if (TD_IS_RUNNING(td)) {
MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
- tdq_load_rem(tdq, ts);
srqflag = (flags & SW_PREEMPT) ?
SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
SRQ_OURSELF|SRQ_YIELDING;
if (ts->ts_cpu == cpuid)
- tdq_add(tdq, td, srqflag);
+ tdq_runq_add(tdq, ts, srqflag);
else
mtx = sched_switch_migrate(tdq, td, srqflag);
} else {
@@ -1970,22 +2026,18 @@
THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td->td_sched;
/*
- * If we slept for more than a tick update our interactivity and
- * priority.
+ * Update interactivity and priority after a sleep.
*/
slptick = td->td_slptick;
td->td_slptick = 0;
- if (slptick && slptick != ticks) {
- u_int hzticks;
-
- hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
- ts->ts_slptime += hzticks;
+ if (slptick && slptick != ticks) {
+ ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT;
sched_interact_update(td);
sched_pctcpu_update(ts);
sched_priority(td);
+ /* Reset the slice value after we sleep. */
+ ts->ts_slice = 0;
}
- /* Reset the slice value after we sleep. */
- ts->ts_slice = sched_slice;
sched_add(td, SRQ_BORING);
}
@@ -2040,7 +2092,6 @@
*/
ts2->ts_slptime = ts->ts_slptime;
ts2->ts_runtime = ts->ts_runtime;
- ts2->ts_slice = 1; /* Attempt to quickly learn interactivity. */
}
/*
@@ -2188,25 +2239,26 @@
}
ts = td->td_sched;
/*
- * We only do slicing code for TIMESHARE threads.
- */
- if (td->td_pri_class != PRI_TIMESHARE)
- return;
- /*
* We used a tick; charge it to the thread so that we can compute our
* interactivity.
*/
td->td_sched->ts_runtime += tickincr;
sched_interact_update(td);
/*
+ * We only do slicing code for TIMESHARE threads.
+ */
+ if (td->td_pri_class != PRI_TIMESHARE)
+ return;
+ sched_priority(td);
+ /*
* We used up one time slice.
*/
- if (--ts->ts_slice > 0)
+ if (++ts->ts_slice < tdq_slice(tdq))
return;
/*
- * We're out of time, recompute priorities and requeue.
+ * We're out of time, force a requeue later.
*/
- sched_priority(td);
+ ts->ts_slice = 0;
td->td_flags |= TDF_NEEDRESCHED;
}
@@ -2328,11 +2380,10 @@
tdq_add(struct tdq *tdq, struct thread *td, int flags)
{
struct td_sched *ts;
- int class;
#ifdef SMP
+ int class;
int cpumask;
#endif
-
TDQ_LOCK_ASSERT(tdq, MA_OWNED);
KASSERT((td->td_inhibitors == 0),
("sched_add: trying to run inhibited thread"));
@@ -2342,20 +2393,10 @@
("sched_add: thread swapped out"));
ts = td->td_sched;
- class = PRI_BASE(td->td_pri_class);
- TD_SET_RUNQ(td);
- if (ts->ts_slice == 0)
- ts->ts_slice = sched_slice;
- /*
- * Pick the run queue based on priority.
- */
- if (td->td_priority <= PRI_MAX_REALTIME)
- ts->ts_runq = &tdq->tdq_realtime;
- else if (td->td_priority <= PRI_MAX_TIMESHARE)
- ts->ts_runq = &tdq->tdq_timeshare;
- else
- ts->ts_runq = &tdq->tdq_idle;
+ tdq_runq_add(tdq, ts, flags);
+ tdq_load_add(tdq, ts);
#ifdef SMP
+ class = PRI_BASE(td->td_pri_class);
cpumask = 1 << ts->ts_cpu;
/*
* If we had been idle, clear our bit in the group and potentially
@@ -2378,8 +2419,6 @@
if (td->td_priority < tdq->tdq_lowpri)
tdq->tdq_lowpri = td->td_priority;
#endif
- tdq_runq_add(tdq, ts, flags);
- tdq_load_add(tdq, ts);
}
/*
@@ -2660,8 +2699,10 @@
"Scheduler");
SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
"Scheduler name");
-SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
- "Slice size for timeshare threads");
+SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice_max, 0,
+ "Maximum slice size for timeshare threads");
+SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &sched_slice_min, 0,
+ "Minimum slice size for timeshare threads");
SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
"Interactivity score threshold");
SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
@@ -2671,7 +2712,6 @@
"Pick the target cpu based on priority rather than load.");
SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
"Number of hz ticks to keep thread affinity for");
-SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
"Enables the long-term load balancer");
SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
_______________________________________________
freebsd-performance@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-performance
To unsubscribe, send any mail to "[EMAIL PROTECTED]"