[PATCH] sched: implement staircase deadline scheduler load weight fix

2007-04-22 Thread Con Kolivas
The task load_weight needs to be set every time the quota is set, but it was
not being set in activate_task, which assumed it would not have changed. Due to
changes in where the default rr_interval is set on SMP, this assumption
failed. It would also break again if rr_interval were changed on the fly.

set_load_weight used an unnecessarily complex relationship when the load weight
can simply be set to the task_timeslice in milliseconds. The old scaling also
did not resolve finely enough to pick up nice 19 tasks, and could give them a
load weight of 0 with a small enough rr_interval.
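
To make the scaling concrete, here is a small illustrative user-space sketch
(not kernel code and not part of the patch). It uses the nice-to-quota mapping
documented in rr_quota() below (nice -20 = 10 * rr_interval, nice 0 =
rr_interval, nice 1-19 = rr_interval / 2) and the task_timeslice() scaling from
the 'ymf accounting fixes' patch in this series; SCHED_LOAD_SCALE is assumed to
be 128 as in mainline kernels of that era, and rr_interval uses the default
of 8.

/*
 * Illustrative sketch only: compares the old LOAD_WEIGHT() scaling with the
 * new "load_weight = task_timeslice in ms" scheme for three documented nice
 * levels. All constants other than those quoted above are assumptions.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE	128UL	/* assumed mainline value of the era */
#define PRIO_RANGE		40

static unsigned long timeslice_ms(unsigned long quota_ms, int user_prio)
{
	/* Mirrors task_timeslice(): the quota plus (PRIO_RANGE - 1 - user_prio)
	 * further quotas per major rotation. */
	return quota_ms + (PRIO_RANGE - 1 - user_prio) * quota_ms;
}

int main(void)
{
	unsigned long rr_interval = 8;
	unsigned long def_timeslice = rr_interval * 20;	/* the removed DEF_TIMESLICE */
	struct { const char *nice; unsigned long quota_ms; int user_prio; } t[] = {
		{ "-20", 10 * rr_interval, 0 },
		{ "0",   rr_interval,      20 },
		{ "19",  rr_interval / 2,  39 },
	};

	for (int i = 0; i < 3; i++) {
		unsigned long slice = timeslice_ms(t[i].quota_ms, t[i].user_prio);

		printf("nice %-3s old weight %4lu  new weight %4lu\n", t[i].nice,
		       slice * SCHED_LOAD_SCALE / def_timeslice, slice);
	}
	return 0;
}

With these assumptions the old scheme collapses nice 19 down to a weight of 3
while the new scheme keeps the full millisecond resolution of the timeslice.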

Thanks to Willy Tarreau [EMAIL PROTECTED] for spotting more SMP balancing
problems.

Signed-off-by: Con Kolivas [EMAIL PROTECTED]

---
 kernel/sched.c |   36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

Index: linux-2.6.21-rc7-sd/kernel/sched.c
===================================================================
--- linux-2.6.21-rc7-sd.orig/kernel/sched.c	2007-04-22 21:37:25.000000000 +1000
+++ linux-2.6.21-rc7-sd/kernel/sched.c	2007-04-22 23:04:34.000000000 +1000
@@ -102,8 +102,6 @@ unsigned long long __attribute__((weak))
  */
 int rr_interval __read_mostly = 8;
 
-#define DEF_TIMESLICE  (rr_interval * 20)
-
 /*
  * This contains a bitmap for each dynamic priority level with empty slots
  * for the valid priorities each different nice level can have. It allows
@@ -886,16 +884,11 @@ static int task_timeslice(struct task_st
 }
 
 /*
- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
- * If static_prio_timeslice() is ever changed to break this assumption then
- * this code will need modification. Scaled as multiples of milliseconds.
- */
-#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
-#define LOAD_WEIGHT(lp) \
-   (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
-#define TASK_LOAD_WEIGHT(p)LOAD_WEIGHT(task_timeslice(p))
-#define RTPRIO_TO_LOAD_WEIGHT(rp)  \
-   (LOAD_WEIGHT((rr_interval + 20 + (rp))))
+ * The load weight is basically the task_timeslice in ms. Realtime tasks are
+ * special cased to be proportionately larger than nice -20 by their
+ * rt_priority. The weight for rt tasks can only be arbitrary at best.
+ */
+#define RTPRIO_TO_LOAD_WEIGHT(rp)  (rr_interval * 20 * (40 + rp))
 
 static void set_load_weight(struct task_struct *p)
 {
@@ -912,7 +905,7 @@ static void set_load_weight(struct task_
 #endif
p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
} else
-   p->load_weight = TASK_LOAD_WEIGHT(p);
+   p->load_weight = task_timeslice(p);
 }
 
 static inline void
@@ -995,7 +988,7 @@ static int effective_prio(struct task_st
  * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2.
  * Value returned is in microseconds.
  */
-static unsigned int rr_quota(struct task_struct *p)
+static inline unsigned int rr_quota(struct task_struct *p)
 {
int nice = TASK_NICE(p), rr = rr_interval;
 
@@ -1009,6 +1002,13 @@ static unsigned int rr_quota(struct task
return MS_TO_US(rr);
 }
 
+/* Every time we set the quota we need to set the load weight */
+static void set_quota(struct task_struct *p)
+{
+   p->quota = rr_quota(p);
+   set_load_weight(p);
+}
+
 /*
  * activate_task - move a task to the runqueue and do priority recalculation
  */
@@ -1036,7 +1036,7 @@ static void activate_task(struct task_st
 (now - p->timestamp) >> 20);
}
 
-   p->quota = rr_quota(p);
+   set_quota(p);
p->prio = effective_prio(p);
p->timestamp = now;
__activate_task(p, rq);
@@ -3885,8 +3885,7 @@ void set_user_nice(struct task_struct *p
p->static_prio = NICE_TO_PRIO(nice);
old_prio = p->prio;
p->prio = effective_prio(p);
-   p->quota = rr_quota(p);
-   set_load_weight(p);
+   set_quota(p);
delta = p->prio - old_prio;
 
if (queued) {
@@ -4020,8 +4019,7 @@ static void __setscheduler(struct task_s
p->normal_prio = normal_prio(p);
/* we are holding p->pi_lock already */
p->prio = rt_mutex_getprio(p);
-   p->quota = rr_quota(p);
-   set_load_weight(p);
+   set_quota(p);
 }
 
 /**
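
For reference, a rough consistency check of the new relationship (illustrative
arithmetic only, not part of the patch). It assumes the default rr_interval of
8, the nice -20 quota of 10 * rr_interval from the rr_quota() comment above,
and the nice -20 timeslice of PRIO_RANGE (assumed 40) times its quota:

/*
 * Rough arithmetic sketch, not kernel code: checks that
 * RTPRIO_TO_LOAD_WEIGHT() stays above the heaviest non-RT weight,
 * as the new comment claims.
 */
#include <stdio.h>

int main(void)
{
	int rr_interval = 8;
	int nice_m20_weight = 40 * 10 * rr_interval;		/* 3200 (timeslice in ms) */
	int rt_prio1_weight = rr_interval * 20 * (40 + 1);	/* 6560 */

	printf("nice -20 weight: %d, rt_priority 1 weight: %d\n",
	       nice_m20_weight, rt_prio1_weight);
	return 0;
}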

-- 
-ck


Re: [ck] Re: [ANNOUNCE] Staircase Deadline cpu scheduler version 0.45

2007-04-22 Thread Con Kolivas
On Monday 23 April 2007 00:27, Michael Gerdau wrote:
> > Anyway the more important part is... Can you test this patch please? Dump
> > all the other patches I sent you post 045. Michael, if you could test too
> > please?
>
> Have it up running for 40 minutes now and my perljobs show a constant
> cpu utilization of 100/50/50 in top most of the time. When the 100% job
> goes down to e.g. 70% these 30% are immediately reclaimed by the other
> two, i.e. the total sum of all three stays within 2 percentage points of 200%.
>
> From here it seems as if your latest patch did what it was supposed to :-)

Excellent, thanks for testing. v0.46 with something close to this patch coming 
shortly.

> Best,
> Michael
>
> PS: While these numbercrunching jobs were running I started another
> kde session and had my children play supertux for 20 minutes. While
> the system occasionally was not as responsive as it is when there
> is little load, supertux remained very playable.

-- 
-ck


[ANNOUNCE] Staircase Deadline cpu scheduler version 0.46

2007-04-22 Thread Con Kolivas
Yet another significant bugfix for SMP balancing was just posted for the 
staircase deadline cpu scheduler which improves behaviour dramatically on any 
SMP machine.

Thanks to Willy Tarreau for noticing more bugs.

As requested, this version of the patch adds a version identifier to the
Makefile: -sd046 is appended to the kernel version.

http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.46.patch
http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7-sd-0.46.patch

Renicing X to -10, while not essential, may be desirable on the desktop. 
Unlike the CFS scheduler which renices X without your intervention to 
nice -19, the SD patches do not alter nice level on their own.

See the patch just posted called 'sched: implement staircase deadline 
scheduler load weight fix' for details of the fixes.

Thanks to all who are testing and giving feedback.

Well I'm exhausted...

-- 
-ck


Re: [ck] Re: [ANNOUNCE] Staircase Deadline cpu scheduler version 0.45

2007-04-22 Thread Con Kolivas
On Monday 23 April 2007 00:22, Willy Tarreau wrote:
> On Sun, Apr 22, 2007 at 10:18:32PM +1000, Con Kolivas wrote:
> > On Sunday 22 April 2007 21:42, Con Kolivas wrote:
> >
> > Willy I'm still investigating the idle time and fluctuating load as a
> > separate issue. Is it possible the multiple ocbench processes are
> > naturally synchronising and desynchronising and choosing to sleep and/or
> > run at the same time? I can remove the idle time entirely by running
> > ocbench at nice 19 which means they are all forced to run at basically
> > the same time by the scheduler.
> >
> > Anyway the more important part is... Can you test this patch please? Dump
> > all the other patches I sent you post 045. Michael, if you could test too
> > please?

> OK, it's better now. All tasks equally run.

Excellent thank you very much (again!)

> X is still somewhat jerky, even
> at nice -19. I'm sure it happens when it's waiting in the other array. We
> should definitely manage to get rid of this if we want to ensure low
> latency.

Yeah, that would be correct. It's clearly possible to keep the whole design 
philosophy and priority system of SD intact and do away with the arrays, by 
making it one continuous stream instead of two arrays, but that requires some 
architectural changes. I've been concentrating on nailing all the remaining 
issues (and they kept cropping up as you've seen *blush*). However... I 
haven't quite figured out how to do that architectural change just yet either, 
so let's just iron all the bugs out of this now.

> Just FYI, the idle is often close to zero and the load is often close to
> 30, even if still fluctuating :
>
> Hoping this helps !

I can say without a shadow of a doubt it has helped :) I'll respin the patch 
slightly differently and post it and release as v0.46.

> Willy

-- 
-ck


Re: [ck] Re: [ANNOUNCE] Staircase Deadline cpu scheduler version 0.45

2007-04-22 Thread Con Kolivas
On Sunday 22 April 2007 23:07, Willy Tarreau wrote:
> On Sun, Apr 22, 2007 at 10:18:32PM +1000, Con Kolivas wrote:
> > On Sunday 22 April 2007 21:42, Con Kolivas wrote:
> >
> > Willy I'm still investigating the idle time and fluctuating load as a
> > separate issue.
>
> OK.
>
> > Is it possible the multiple ocbench processes are naturally
> > synchronising and desynchronising and choosing to sleep and/or run at the
> > same time?
>
> I don't think so. They're independent processes, and I insist on reducing
> their X work in order to ensure they don't get perturbed by external
> factors. Their work consists of looping 250 ms and waiting 750 ms, then
> displaying a new progress line.

Well if they always wait 750ms and they always do 250ms of work, they will 
never actually get their 250ms in a continuous stream, and may be waiting on 
a runqueue partway through that work. What I mean is that scheduling could 
cause that synchronising and desynchronising unwittingly, by fluctuating the 
absolute time over which they get their 250ms. The sleep always takes 750ms, 
but the wall-clock time over which they get their 250ms fluctuates with 
scheduling aliasing. If instead the code said "500ms has passed while I only 
did 250ms of work, so I should sleep for 250ms less", this aliasing would go 
away. Of course this is impossible, since on a fully loaded machine each 
process should then never sleep. I'm not arguing this is correct behaviour for 
the scheduler to cause, mind you, nor am I saying it's wrong behaviour. I'm 
just trying to understand better how it happens and what (if anything) should 
be done about it. Overall their progress and cpu distribution appear 
identical, as you said. The difference is that the CFS design intrinsically 
manages this exact scenario by design with its sleep/run timing mechanism.
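
To make the pattern concrete, here is a minimal user-space sketch of the kind
of loop being discussed. This is only an assumption based on the 250 ms busy /
750 ms sleep description above, not the actual ocbench source:

/*
 * Minimal sketch (not the real ocbench code): burn roughly 250 ms of CPU
 * time, then sleep 750 ms. Because the 250 ms of CPU time can be spread
 * over a longer wall-clock interval when the runqueue is busy, while the
 * 750 ms sleep is fixed, the processes can drift in and out of phase.
 */
#include <time.h>

static void burn_cpu_ms(long ms)
{
	struct timespec start, now;

	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
	do {
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &now);
	} while ((now.tv_sec - start.tv_sec) * 1000L +
		 (now.tv_nsec - start.tv_nsec) / 1000000L < ms);
}

int main(void)
{
	struct timespec sleep_time = { .tv_sec = 0, .tv_nsec = 750L * 1000 * 1000 };

	for (;;) {
		burn_cpu_ms(250);		/* CPU time, not wall-clock time */
		nanosleep(&sleep_time, NULL);	/* fixed 750 ms sleep */
	}
	return 0;
}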

> > I can remove the idle time entirely by running ocbench at nice 19
> > which means they are all forced to run at basically the same time by the
> > scheduler.
>
> It may indicate some special handling of nice ?

By running them at nice 19 the scheduler effectively just schedules them 
sequentially, and there is no aliasing.

-- 
-ck


Re: [ANNOUNCE] Staircase Deadline cpu scheduler version 0.46

2007-04-22 Thread Con Kolivas
On Monday 23 April 2007 03:58, Thomas Backlund wrote:
> Mon 2007-04-23 at 01:03 +1000, Con Kolivas wrote:
> > Yet another significant bugfix for SMP balancing was just posted for the
> > staircase deadline cpu scheduler which improves behaviour dramatically on
> > any SMP machine.
> >
> > Thanks to Willy Tarreau for noticing more bugs.
> >
> > As requested, this version of the patch adds a version identifier to the
> > Makefile: -sd046 is appended to the kernel version.
> >
> > http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.46.patch
> > http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7-sd-0.46.patch
> >
> > Renicing X to -10, while not essential, may be desirable on the desktop.
> > Unlike the CFS scheduler which renices X without your intervention to
> > nice -19, the SD patches do not alter nice level on their own.
> >
> > See the patch just posted called 'sched: implement staircase deadline
> > scheduler load weight fix' for details of the fixes.
> >
> > Thanks to all who are testing and giving feedback.
> >
> > Well I'm exhausted...
>
> This one broke the 2.6.20.7 build...
>
> kernel/sched.c: In function ‘dependent_sleeper’:
> kernel/sched.c:3319: error: ‘DEF_TIMESLICE’ undeclared (first use in
> this function)
> kernel/sched.c:3319: error: (Each undeclared identifier is reported only
> once
> kernel/sched.c:3319: error: for each function it appears in.)

Apologies, it was a blind merge.

Use this instead:
http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7-sd-0.46-1.patch

-- 
-ck


[ANNOUNCE] Staircase Deadline cpu scheduler version 0.45

2007-04-21 Thread Con Kolivas
A significant bugfix for SMP balancing was just posted for the staircase 
deadline cpu scheduler which improves behaviour dramatically on any SMP 
machine.

Thanks to Willy Tarreau for noticing the likely fault point.

Also, as requested, this version of the patch adds a version identifier to the
Makefile: -sd045 is appended to the kernel version.

http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7-sd-0.45.patch
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.45.patch

Incrementals from 0.44:
http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7/sd-0.44-0.45.patch
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7/sd-0.44-0.45.patch

Renicing X to -10, while not essential, may be desirable on the desktop. 
Unlike the CFS scheduler which renices X without your intervention to 
nice -19, the SD patches do not alter nice level on their own.

See the patch just posted called 'sched: implement staircase deadline 
scheduler ymf accounting fixes' for details of the fixes.

-- 
-ck


[PATCH] sched: ymf typo

2007-04-21 Thread Con Kolivas
Typo in comment, 1us not 1ms.

Signed-off-by: Con Kolivas <[EMAIL PROTECTED]>

---
 kernel/sched.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-2.6.21-rc7-sd/kernel/sched.c
===================================================================
--- linux-2.6.21-rc7-sd.orig/kernel/sched.c	2007-04-22 14:22:14.000000000 +1000
+++ linux-2.6.21-rc7-sd/kernel/sched.c	2007-04-22 14:22:34.000000000 +1000
@@ -3045,7 +3045,7 @@ update_cpu_clock(struct task_struct *p, 
/*
 * Called from context_switch there should be less than one
 * jiffy worth, and not negative/overflow. There should be
-* some time banked here so use a nominal 1ms.
+* some time banked here so use a nominal 1us.
 */
if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1)
time_diff = 1000;

-- 
-ck


[PATCH] sched: implement staircase deadline scheduler ymf accounting fixes

2007-04-21 Thread Con Kolivas
This causes significant improvements on SMP hardware. I don't think the kernel
should be -nicing X by itself; that should be a sysadmin choice, so I won't
be including that change in the SD patches. The following change will be in
the next release of SD (v0.45).

Andrew, please apply on top of yaf-fix.

---
SMP balancing broke on converting time_slice to usecs.

update_cpu_clock is unnecessarily complex and doesn't allow sub usec values.

Thanks to Willy Tarreau <[EMAIL PROTECTED]> for picking up SMP idle anomalies.

Signed-off-by: Con Kolivas <[EMAIL PROTECTED]>

---
 kernel/sched.c |   42 +++++++++++++++++-------------------------
 1 file changed, 17 insertions(+), 25 deletions(-)

Index: linux-2.6.21-rc7-sd/kernel/sched.c
===================================================================
--- linux-2.6.21-rc7-sd.orig/kernel/sched.c	2007-04-21 22:50:31.000000000 +1000
+++ linux-2.6.21-rc7-sd/kernel/sched.c	2007-04-22 13:29:29.000000000 +1000
@@ -88,12 +88,10 @@ unsigned long long __attribute__((weak))
 #define SCHED_PRIO(p)  ((p)+MAX_RT_PRIO)
 
 /* Some helpers for converting to/from various scales.*/
-#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
 #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+#define MS_TO_NS(TIME)	((TIME) * 1000000)
 #define MS_TO_US(TIME) ((TIME) * 1000)
-/* Can return 0 */
-#define MS_TO_JIFFIES(TIME)((TIME) * HZ / 1000)
-#define JIFFIES_TO_MS(TIME)((TIME) * 1000 / HZ)
+#define US_TO_MS(TIME) ((TIME) / 1000)
 
 #define TASK_PREEMPTS_CURR(p, curr)((p)->prio < (curr)->prio)
 
@@ -876,29 +874,28 @@ static void requeue_task(struct task_str
 
 /*
  * task_timeslice - the total duration a task can run during one major
- * rotation. Returns value in jiffies.
+ * rotation. Returns value in milliseconds as the smallest value can be 1.
  */
-static inline int task_timeslice(struct task_struct *p)
+static int task_timeslice(struct task_struct *p)
 {
-   int slice;
+   int slice = p->quota;   /* quota is in us */
 
-   slice = NS_TO_JIFFIES(p->quota);
if (!rt_task(p))
slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice;
-   return slice;
+   return US_TO_MS(slice);
 }
 
 /*
  * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
  * If static_prio_timeslice() is ever changed to break this assumption then
- * this code will need modification
+ * this code will need modification. Scaled as multiples of milliseconds.
  */
 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
 #define LOAD_WEIGHT(lp) \
(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
 #define TASK_LOAD_WEIGHT(p)LOAD_WEIGHT(task_timeslice(p))
 #define RTPRIO_TO_LOAD_WEIGHT(rp)  \
-   (LOAD_WEIGHT((MS_TO_JIFFIES(rr_interval) + 20 + (rp))))
+   (LOAD_WEIGHT((rr_interval + 20 + (rp))))
 
 static void set_load_weight(struct task_struct *p)
 {
@@ -3035,32 +3032,27 @@ static void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now,
 int tick)
 {
-   cputime64_t time_diff = now - p->last_ran;
-   const unsigned int min_diff = 1000;
-   int us_time_diff;
+   long time_diff = now - p->last_ran;
 
if (tick) {
/*
 * Called from scheduler_tick() there should be less than two
 * jiffies worth, and not negative/overflow.
 */
-   if (time_diff > JIFFIES_TO_NS(2) || time_diff < min_diff)
+   if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0)
time_diff = JIFFIES_TO_NS(1);
} else {
/*
 * Called from context_switch there should be less than one
-* jiffy worth, and not negative/overflowed. In the case when
-* sched_clock fails to return high resolution values this
-* also ensures at least 1 min_diff gets banked.
+* jiffy worth, and not negative/overflow. There should be
+* some time banked here so use a nominal 1ms.
 */
-   if (time_diff > JIFFIES_TO_NS(1) || time_diff < min_diff)
-   time_diff = min_diff;
+   if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1)
+   time_diff = 1000;
}
/* time_slice accounting is done in usecs to avoid overflow on 32bit */
-   us_time_diff = time_diff;
-   us_time_diff /= 1000;
if (p != rq->idle && p->policy != SCHED_FIFO)
-   p->time_slice -= us_time_diff;
+   p->time_slice -= time_diff / 1000;
p->sched_time += time_diff;
p->last_ran = rq->most_recent_timestamp = now;
 }
@@ -4636,8 +4628,8 @@ long sys_sched_rr_get_interval(pid_t pid
if (retval)
goto out_unlock;
 
-   jiffies_to_timespec(p->policy == SCHED_FIFO

Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-21 Thread Con Kolivas
On Saturday 21 April 2007 22:12, Willy Tarreau wrote:
> 2) SD-0.44
>
>Feels good, but becomes jerky at moderately high loads. I've started
>64 ocbench with a 250 ms busy loop and 750 ms sleep time. The system
>always responds correctly but under X, mouse jumps quite a bit and
>typing in xterm or even text console feels slightly jerky. The CPU is
>not completely used, and the load varies a lot (see below). However,
>the load is shared equally between all 64 ocbench, and they do not
>deviate even after 4000 iterations. X uses less than 1% CPU during
>those tests.

Found it. I broke SMP balancing again so there is serious scope for 
improvement on SMP hardware. That explains the huge load variations. Expect 
yet another fix soon, which should improve behaviour further :)

-- 
-ck


Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-21 Thread Con Kolivas
On Sunday 22 April 2007 04:17, Gene Heskett wrote:
> More first impressions of sd-0.44 vs CFS-v4

Thanks Gene.
>
> CFS-v4 is quite smooth in terms of the users experience but after prolonged
> observations approaching 24 hours, it appears to choke the cpu hog off a
> bit even when the system has nothing else to do.  My amanda runs went from
> 1 to 1.5 hours depending on how much time it took gzip to handle the amount
> of data tar handed it, up to about 165m & change, or nearly 3 hours pretty
> consistently over 5 runs.
>
> sd-0.44 so far seems to be handling the same load (there's a backup running
> right now) fairly well also, and possibly there's a bit more snap to the
> system now.  A switch to screen 1 from this screen 8, and the loading of
> that screen image, which is the Cassini shot of saturn from the backside,
> the one showing that teeny dot to the left of Saturn that is actually us,
> took 10 seconds with the stock 2.6.21-rc7, 3 seconds with the best of
> Ingo's patches, and now with Con's latest, is 1 second flat. Another screen
> however is 4 seconds, so maybe that first screen had been looked at since I
> rebooted. However, amanda is still getting estimates so gzip hasn't put a
> tiewrap around the kernel's neck just yet.
>
> Some minutes later, gzip is smunching /usr/src, and the machine doesn't
> even know it's running, as sd-0.44 isn't giving gzip more than 75%,
> and probably averaging less than 50%. And it scared me a bit as it started
> out at not over 5% for the first minute or so.  Running in the 70's now
> according to gkrellm, with an occasional blip to 95%.  And the machine
> generally feels good.
>
> I had previously given CFS-v4 a 95 score but that was before I saw the
> general slowdown, and I believe my first impression of this one is also a
> 95.  This on a scale of the best one of the earlier CFS patches being 100,
> and stock 2.6.21-rc7 gets a 0.0.  This scheduler seems to be giving gzip
> ever more cpu as time progresses, and the cpu is warming up quite nicely,
> from about 132F idling to 149.9F now.  And my keyboard is still alive and
> well.

I'm not sure how much weight to put on what you see as the measured cpu usage. 
I have a feeling it's being wrongly reported in SD currently. Concentrate 
more on the actual progress and behaviour of things as you've already done.

> Generally speaking, Con, I believe this one is also a keeper.  And we'll
> see how long a backup run takes.

Great thanks for feedback.

-- 
-ck


Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-21 Thread Con Kolivas
On Sunday 22 April 2007 08:54, Denis Vlasenko wrote:
> On Saturday 21 April 2007 18:00, Ingo Molnar wrote:
> > correct. Note that Willy reniced X back to 0 so it had no relevance on
> > his test. Also note that i pointed this change out in the -v4 CFS
> >
> > announcement:
> > || Changes since -v3:
> > ||
> > ||  - usability fix: automatic renicing of kernel threads such as
> > ||keventd, OOM tasks and tasks doing privileged hardware access
> > ||(such as Xorg).
> >
> > i've attached it below in a standalone form, feel free to put it into
> > SD! :)
>
> But X problems have nothing to do with "privileged hardware access".
> X problems are related to priority inversions between server and client
> processes, and "one server process - many client processes" case.

It's not for privileged hardware access reasons that this code is there. It is 
obfuscation/advertising, to make it look like there is a valid reason in the 
kernel for X getting negative nice levels, so that interactive testing of CFS 
looks better by default.

-- 
-ck


Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-21 Thread Con Kolivas
On Sunday 22 April 2007 02:00, Ingo Molnar wrote:
> * Con Kolivas <[EMAIL PROTECTED]> wrote:
> > >   Feels even better, mouse movements are very smooth even under high
> > >   load. I noticed that X gets reniced to -19 with this scheduler.
> > >   I've not looked at the code yet but this looked suspicious to me.
> > >   I've reniced it to 0 and it did not change any behaviour. Still
> > >   very good.
> >
> > Looks like this code does it:
> >
> > +int sysctl_sched_privileged_nice_level __read_mostly = -19;
>
> correct. 

Oh I definitely was not advocating against renicing X. I just suspect that 
virtually all the users who gave glowing reports of CFS compared to SD had no 
idea it had reniced X to -19 behind their back, and that they were comparing 
it to SD running X at nice 0. I think that had they been comparing CFS with X 
at nice -19 to SD with X at nice -10 in this soft and squishy interactivity 
comparison land, their thoughts might have been different. I missed it in the 
announcement and had to go looking in the code, since Willy just kinda tripped 
over it unwittingly as well.

> Note that Willy reniced X back to 0 so it had no relevance on 
> his test.

Oh yes I did notice that, but since the array swap is the remaining longest 
deadline in SD, and the thing that would cause noticeable jerks, renicing X on 
SD by default would make the experience very different: reniced tasks do much 
better over array swaps than non-niced tasks. I really should go and make the 
whole thing one circular list and blow away the array swap (if I can figure 
out how to do it).

> Also note that i pointed this change out in the -v4 CFS 
>
> announcement:
> || Changes since -v3:
> ||
> ||  - usability fix: automatic renicing of kernel threads such as
> ||keventd, OOM tasks and tasks doing privileged hardware access
> ||(such as Xorg).

Reading the changelog in the gloss-over fashion that I unfortunately did, even 
I missed it. 

> i've attached it below in a standalone form, feel free to put it into
> SD! :)

Hmm, well, I have tried my very best to make all the changes without changing 
"policy" as much as possible, since that trips over so many emotive issues 
that no one can agree on, and I don't have a strong opinion on this; I thought 
it would be better for it to be a config option for X in userspace instead. 
Either way it needs to be turned on/off by the admin, and doing it by default 
in the kernel is... not universally accepted as good. What else accesses 
ioports and can thereby get privileged nice levels? Does this make it 
relatively exploitable just by poking an ioport?

>   Ingo
>
> ---
>  arch/i386/kernel/ioport.c   |   13 ++---
>  arch/x86_64/kernel/ioport.c |8 ++--
>  drivers/block/loop.c|5 -
>  include/linux/sched.h   |7 +++
>  kernel/sched.c  |   40

Thanks for the patch. I'll consider it. Since end users are testing this in 
fuzzy interactivity land I may simply be forced to do this just for 
comparisons to be meaningful between CFS and SD otherwise they're not really 
comparing them on a level playing field. I had almost given up SD for dead 
meat with all the momentum CFS had gained... until recently.

-- 
-ck


Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-21 Thread Con Kolivas
On Saturday 21 April 2007 22:12, Willy Tarreau wrote:
> I promised to perform some tests on your code. I'm short in time right now,
> but I observed behaviours that should be commented on.

>   Feels even better, mouse movements are very smooth even under high load.
>   I noticed that X gets reniced to -19 with this scheduler. I've not looked
>   at the code yet but this looked suspicious to me.

Looks like this code does it:

+int sysctl_sched_privileged_nice_level __read_mostly = -19;

allows anything that sets sched_privileged_task one way or another to get 
nice -19, and this is enabled by default.

--- linux-cfs-2.6.20.7.q.orig/arch/i386/kernel/ioport.c
+++ linux-cfs-2.6.20.7.q/arch/i386/kernel/ioport.c

+   if (turn_on) {
+   if (!capable(CAP_SYS_RAWIO))
+   return -EPERM;
+   /*
+* Task will be accessing hardware IO ports,
+* mark it as special with the scheduler too:
+*/
+   sched_privileged_task(current);
+   }

presumably that selects out X as a privileged task... and sets it to nice -19 
by default.

-- 
-ck


Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-21 Thread Con Kolivas
On Saturday 21 April 2007 22:12, Willy Tarreau wrote:
> Hi Ingo, Hi Con,
>
> I promised to perform some tests on your code. I'm short in time right now,
> but I observed behaviours that should be commented on.
>
> 1) machine : dual athlon 1533 MHz, 1G RAM, kernel 2.6.21-rc7 + either
> scheduler Test:  ./ocbench -R 25 -S 75 -x 8 -y 8
>ocbench: http://linux.1wt.eu/sched/
>
> 2) SD-0.44
>
>Feels good, but becomes jerky at moderately high loads. I've started
>64 ocbench with a 250 ms busy loop and 750 ms sleep time. The system
>always responds correctly but under X, mouse jumps quite a bit and
>typing in xterm or even text console feels slightly jerky. The CPU is
>not completely used, and the load varies a lot (see below). However,
>the load is shared equally between all 64 ocbench, and they do not
>deviate even after 4000 iterations. X uses less than 1% CPU during
>those tests.
>
>Here's the vmstat output :
[snip]

> 3) CFS-v4
>
>   Feels even better, mouse movements are very smooth even under high load.
>   I noticed that X gets reniced to -19 with this scheduler. I've not looked
>   at the code yet but this looked suspicious to me. I've reniced it to 0
> and it did not change any behaviour. Still very good. The 64 ocbench share
> equal CPU time and show exact same progress after 2000 iterations. The CPU
> load is more smoothly spread according to vmstat, and there's no idle (see
> below). BUT I now think it was wrong to let new processes start with no
> timeslice at all, because it can take tens of seconds to start a new
> process when only 64 ocbench are there. Simply starting "killall ocbench"
> takes about 10 seconds. On a smaller machine (VIA C3-533), it took me more
> than one minute to do "su -", even from console, so that's not X. BTW, X
> uses less than 1% CPU during those tests.
>
> [EMAIL PROTECTED]:~$ vmstat 1
[snip]

> 4) first impressions
>
> I think that CFS is based on a more promising concept but is less mature
> and is dangerous right now with certain workloads. SD shows some strange
> behaviours like not using all CPU available and a little jerkyness, but is
> more robust and may be the less risky solution for a first step towards
> a better scheduler in mainline, but it may also probably be the last O(1)
> scheduler, which may be replaced sometime later when CFS (or any other one)
> shows at the same time the smoothness of CFS and the robustness of SD.

I assumed from your description that you were running X at nice 0 during all 
this testing and left the tunables for both SD and CFS at their defaults; with 
the defaults, the effective equivalent of "timeslice" in CFS tends to be 
smaller than in SD.

> I'm sorry not to spend more time on them right now, I hope that other
> people will do.

Thanks for that interesting testing you've done. The fluctuating cpu load and 
the apparently high idle time means there is almost certainly a bug still in 
the cpu accounting I do in update_cpu_clock. It looks suspicious to me 
already on just my first glance. Fortunately the throughput does not appear 
to be adversely affected on other benchmarks so I suspect it's lying about 
the idle time and it's not really there. Which means it's likely also 
accounting the cpu time wrongly. Which also means there's something I can fix 
and improve SD further. Great stuff, thanks! 

-- 
-ck


[ANNOUNCE] Staircase Deadline cpu scheduler version 0.44

2007-04-21 Thread Con Kolivas
A significant bugfix for forking tasks was just posted, so here is an updated 
version of the staircase deadline cpu scheduler. This may cause noticeable
behavioural improvements under certain workloads (such as compiling software 
with make).

Thanks to Al Boldi for making me check the fork code!

http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7-sd-0.44.patch
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.44.patch

Incrementals in
http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7/
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7/

Renicing X to -10, while not essential, is preferable.

See the patch just posted called 'sched: implement staircase scheduler yaf 
fix' for full changelog.

-- 
-ck


[PATCH] sched: implement staircase scheduler yaf fix

2007-04-21 Thread Con Kolivas
While it annoys even me to keep posting fixes for SD, it is nice that fixing
these bugs improves the behaviour further. This change causes noticeable
improvements with loads that fork (ie make and friends).

Thanks Al!

Andrew please apply.
---
Management of time_slice sharing across fork was broken by changing
time_slice to a signed int.

first_time_slice was not being cleared anywhere near often enough. 

SCHED_BATCH tasks in the current implementation should advance prio_level
and best_static_prio.

Thanks Al Boldi <[EMAIL PROTECTED]> for making me check the fork code.

Signed-off-by: Con Kolivas <[EMAIL PROTECTED]>

---
 kernel/sched.c |   50 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 19 deletions(-)

Index: linux-2.6.21-rc7-sd/kernel/sched.c
===================================================================
--- linux-2.6.21-rc7-sd.orig/kernel/sched.c	2007-04-21 16:43:23.000000000 +1000
+++ linux-2.6.21-rc7-sd/kernel/sched.c	2007-04-21 17:31:12.000000000 +1000
@@ -661,6 +661,12 @@ static void dequeue_task(struct task_str
__clear_bit(p->prio, p->array->prio_bitmap);
 }
 
+static void reset_first_time_slice(struct task_struct *p)
+{
+   if (unlikely(p->first_time_slice))
+   p->first_time_slice = 0;
+}
+
 /*
  * The task is being queued on a fresh array so it has its entitlement
  * bitmap cleared.
@@ -672,6 +678,7 @@ static void task_new_array(struct task_s
p->rotation = rq->prio_rotation;
p->time_slice = p->quota;
p->array = array;
+   reset_first_time_slice(p);
 }
 
 /* Find the first slot from the relevant prio_matrix entry */
@@ -740,6 +747,7 @@ static void queue_expired(struct task_st
p->prio = p->normal_prio = first_prio_slot(p);
if (p->static_prio < rq->expired->best_static_prio)
rq->expired->best_static_prio = p->static_prio;
+   reset_first_time_slice(p);
 }
 
 #ifdef CONFIG_SMP
@@ -1661,13 +1669,20 @@ void fastcall sched_fork(struct task_str
 * resulting in more scheduling fairness.
 */
local_irq_disable();
-   current->time_slice >>= 1;
-   p->time_slice = current->time_slice;
-   /*
-* The remainder of the first timeslice might be recovered by
-* the parent if the child exits early enough.
-*/
-   p->first_time_slice = 1;
+   if (current->time_slice > 0) {
+   current->time_slice /= 2;
+   if (current->time_slice)
+   p->time_slice = current->time_slice;
+   else
+   p->time_slice = 1;
+   /*
+* The remainder of the first timeslice might be recovered by
+* the parent if the child exits early enough.
+*/
+   p->first_time_slice = 1;
+   } else
+   p->time_slice = 0;
+
p->timestamp = sched_clock();
local_irq_enable();
 out:
@@ -1748,7 +1763,7 @@ void fastcall sched_exit(struct task_str
 
parent = p->parent;
rq = task_rq_lock(parent, &flags);
-   if (p->first_time_slice && task_cpu(p) == task_cpu(parent)) {
+   if (p->first_time_slice > 0 && task_cpu(p) == task_cpu(parent)) {
parent->time_slice += p->time_slice;
if (unlikely(parent->time_slice > parent->quota))
parent->time_slice = parent->quota;
@@ -3148,8 +3163,7 @@ static void task_expired_entitlement(str
struct prio_array *old_array;
int overrun, old_prio;
 
-   if (unlikely(p->first_time_slice))
-   p->first_time_slice = 0;
+   reset_first_time_slice(p);
if (rt_task(p)) {
p->time_slice = p->quota;
list_move_tail(&p->run_list, p->array->queue + p->prio);
@@ -3251,9 +3265,10 @@ static void reset_prio_levels(struct rq 
  */
 static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx)
 {
+   struct prio_array *array = rq->active;
struct task_struct *next;
struct list_head *queue;
-   struct prio_array *array = rq->active;
+   int nstatic;
 
 retry:
if (idx >= MAX_PRIO) {
@@ -3281,14 +3296,11 @@ retry:
goto retry;
}
next->rotation = rq->prio_rotation;
-   if (likely(next->policy != SCHED_BATCH)) {
-   int nstatic = next->static_prio;
-
-   if (nstatic < array->best_static_prio)
-   array->best_static_prio = nstatic;
-   if (idx > rq->prio_level[USER_PRIO(nstatic)])
-   rq->prio_level[USER_PRIO(nstatic)] = idx;
-   }
+   nstatic = next->static_prio;
+   if (nstatic < array->best_static_prio)
+   array->best_static_prio = nstatic;
+   if (idx > rq->prio_level[USER_PRIO(nstatic)])
+   rq->prio_level[USER_PRIO(nstatic)] = idx;
 return next;
 }

-- 
-ck

[PATCH] sched: implement staircase scheduler yaf fix

2007-04-21 Thread Con Kolivas
While it annoys even me to keep posting fixes for SD, it is nice that fixing
these bugs improves the behaviour further. This change causes noticeable
improvements with loads that fork (ie make and friends).

Thanks Al!

Andrew please apply.
---
Management of time_slice sharing across fork was broken by changing
time_slice to a signed int.

first_time_slice was not being cleared anywhere near often enough. 

SCHED_BATCH tasks in the current implementation should advance prio_level
and best_static_prio.

Thanks Al Boldi [EMAIL PROTECTED] for making me check the fork code.

Signed-off-by: Con Kolivas [EMAIL PROTECTED]

---
 kernel/sched.c |   50 +++---
 1 file changed, 31 insertions(+), 19 deletions(-)

Index: linux-2.6.21-rc7-sd/kernel/sched.c
===
--- linux-2.6.21-rc7-sd.orig/kernel/sched.c 2007-04-21 16:43:23.0 
+1000
+++ linux-2.6.21-rc7-sd/kernel/sched.c  2007-04-21 17:31:12.0 +1000
@@ -661,6 +661,12 @@ static void dequeue_task(struct task_str
__clear_bit(p-prio, p-array-prio_bitmap);
 }
 
+static void reset_first_time_slice(struct task_struct *p)
+{
+   if (unlikely(p-first_time_slice))
+   p-first_time_slice = 0;
+}
+
 /*
  * The task is being queued on a fresh array so it has its entitlement
  * bitmap cleared.
@@ -672,6 +678,7 @@ static void task_new_array(struct task_s
p-rotation = rq-prio_rotation;
p-time_slice = p-quota;
p-array = array;
+   reset_first_time_slice(p);
 }
 
 /* Find the first slot from the relevant prio_matrix entry */
@@ -740,6 +747,7 @@ static void queue_expired(struct task_st
p-prio = p-normal_prio = first_prio_slot(p);
if (p-static_prio  rq-expired-best_static_prio)
rq-expired-best_static_prio = p-static_prio;
+   reset_first_time_slice(p);
 }
 
 #ifdef CONFIG_SMP
@@ -1661,13 +1669,20 @@ void fastcall sched_fork(struct task_str
 * resulting in more scheduling fairness.
 */
local_irq_disable();
-   current-time_slice = 1;
-   p-time_slice = current-time_slice;
-   /*
-* The remainder of the first timeslice might be recovered by
-* the parent if the child exits early enough.
-*/
-   p-first_time_slice = 1;
+   if (current-time_slice  0) {
+   current-time_slice /= 2;
+   if (current-time_slice)
+   p-time_slice = current-time_slice;
+   else
+   p-time_slice = 1;
+   /*
+* The remainder of the first timeslice might be recovered by
+* the parent if the child exits early enough.
+*/
+   p-first_time_slice = 1;
+   } else
+   p-time_slice = 0;
+
p-timestamp = sched_clock();
local_irq_enable();
 out:
@@ -1748,7 +1763,7 @@ void fastcall sched_exit(struct task_str
 
parent = p-parent;
rq = task_rq_lock(parent, flags);
-   if (p-first_time_slice  task_cpu(p) == task_cpu(parent)) {
+   if (p-first_time_slice  0  task_cpu(p) == task_cpu(parent)) {
parent-time_slice += p-time_slice;
if (unlikely(parent-time_slice  parent-quota))
parent-time_slice = parent-quota;
@@ -3148,8 +3163,7 @@ static void task_expired_entitlement(str
struct prio_array *old_array;
int overrun, old_prio;
 
-   if (unlikely(p-first_time_slice))
-   p-first_time_slice = 0;
+   reset_first_time_slice(p);
if (rt_task(p)) {
p-time_slice = p-quota;
list_move_tail(p-run_list, p-array-queue + p-prio);
@@ -3251,9 +3265,10 @@ static void reset_prio_levels(struct rq 
  */
 static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx)
 {
+   struct prio_array *array = rq-active;
struct task_struct *next;
struct list_head *queue;
-   struct prio_array *array = rq-active;
+   int nstatic;
 
 retry:
if (idx = MAX_PRIO) {
@@ -3281,14 +3296,11 @@ retry:
goto retry;
}
next-rotation = rq-prio_rotation;
-   if (likely(next-policy != SCHED_BATCH)) {
-   int nstatic = next-static_prio;
-
-   if (nstatic  array-best_static_prio)
-   array-best_static_prio = nstatic;
-   if (idx  rq-prio_level[USER_PRIO(nstatic)])
-   rq-prio_level[USER_PRIO(nstatic)] = idx;
-   }
+   nstatic = next-static_prio;
+   if (nstatic  array-best_static_prio)
+   array-best_static_prio = nstatic;
+   if (idx  rq-prio_level[USER_PRIO(nstatic)])
+   rq-prio_level[USER_PRIO(nstatic)] = idx;
return next;
 }
 


-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More

[ANNOUNCE] Staircase Deadline cpu scheduler version 0.44

2007-04-21 Thread Con Kolivas
A significant bugfix for forking tasks was just posted, so here is an updated 
version of the staircase deadline cpu scheduler. This may cause noticeable
behavioural improvements under certain workloads (such as compiling software 
with make).

Thanks to Al Boldi for making me check the fork code!

http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7-sd-0.44.patch
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.44.patch

Incrementals in
http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7/
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7/

Renicing X to -10, while not essential, is preferable.

See the patch just posted called 'sched: implement staircase scheduler yaf 
fix' for full changelog.

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-21 Thread Con Kolivas
On Saturday 21 April 2007 22:12, Willy Tarreau wrote:
 Hi Ingo, Hi Con,

 I promised to perform some tests on your code. I'm short in time right now,
 but I observed behaviours that should be commented on.

 1) machine : dual athlon 1533 MHz, 1G RAM, kernel 2.6.21-rc7 + either
 scheduler Test:  ./ocbench -R 25 -S 75 -x 8 -y 8
ocbench: http://linux.1wt.eu/sched/

 2) SD-0.44

Feels good, but becomes jerky at moderately high loads. I've started
64 ocbench with a 250 ms busy loop and 750 ms sleep time. The system
always responds correctly but under X, mouse jumps quite a bit and
typing in xterm or even text console feels slightly jerky. The CPU is
not completely used, and the load varies a lot (see below). However,
the load is shared equally between all 64 ocbench, and they do not
deviate even after 4000 iterations. X uses less than 1% CPU during
those tests.

Here's the vmstat output :
[snip]

 3) CFS-v4

   Feels even better, mouse movements are very smooth even under high load.
   I noticed that X gets reniced to -19 with this scheduler. I've not looked
   at the code yet but this looked suspicious to me. I've reniced it to 0
 and it did not change any behaviour. Still very good. The 64 ocbench share
 equal CPU time and show exact same progress after 2000 iterations. The CPU
 load is more smoothly spread according to vmstat, and there's no idle (see
 below). BUT I now think it was wrong to let new processes start with no
 timeslice at all, because it can take tens of seconds to start a new
 process when only 64 ocbench are there. Simply starting killall ocbench
 takes about 10 seconds. On a smaller machine (VIA C3-533), it took me more
 than one minute to do su -, even from console, so that's not X. BTW, X
 uses less than 1% CPU during those tests.

 [EMAIL PROTECTED]:~$ vmstat 1
[snip]

 4) first impressions

 I think that CFS is based on a more promising concept but is less mature
 and is dangerous right now with certain workloads. SD shows some strange
 behaviours like not using all CPU available and a little jerkyness, but is
 more robust and may be the less risky solution for a first step towards
 a better scheduler in mainline, but it may also probably be the last O(1)
 scheduler, which may be replaced sometime later when CFS (or any other one)
 shows at the same time the smoothness of CFS and the robustness of SD.

I assumed from your description that you were running X nice 0 during all this 
testing and left the tunables from both SD and CFS at their defaults; this 
tends to have the effective equivalent of timeslice in CFS smaller than SD.

 I'm sorry not to spend more time on them right now, I hope that other
 people will do.

Thanks for that interesting testing you've done. The fluctuating cpu load and 
the apparently high idle time means there is almost certainly a bug still in 
the cpu accounting I do in update_cpu_clock. It looks suspicious to me 
already on just my first glance. Fortunately the throughput does not appear 
to be adversely affected on other benchmarks so I suspect it's lying about 
the idle time and it's not really there. Which means it's likely also 
accounting the cpu time wrongly. Which also means there's something I can fix 
and improve SD further. Great stuff, thanks! 

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-21 Thread Con Kolivas
On Saturday 21 April 2007 22:12, Willy Tarreau wrote:
 I promised to perform some tests on your code. I'm short in time right now,
 but I observed behaviours that should be commented on.

   Feels even better, mouse movements are very smooth even under high load.
   I noticed that X gets reniced to -19 with this scheduler. I've not looked
   at the code yet but this looked suspicious to me.

Looks like this code does it:

+int sysctl_sched_privileged_nice_level __read_mostly = -19;

allows anything that sets sched_privileged_task one way or another gets 
nice -19, and this is enabled by default.

--- linux-cfs-2.6.20.7.q.orig/arch/i386/kernel/ioport.c
+++ linux-cfs-2.6.20.7.q/arch/i386/kernel/ioport.c

+   if (turn_on) {
+   if (!capable(CAP_SYS_RAWIO))
+   return -EPERM;
+   /*
+* Task will be accessing hardware IO ports,
+* mark it as special with the scheduler too:
+*/
+   sched_privileged_task(current);
+   }

presumably that selects out X as a privileged task... and sets it to nice -19 
by default.

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-21 Thread Con Kolivas
On Sunday 22 April 2007 02:00, Ingo Molnar wrote:
 * Con Kolivas [EMAIL PROTECTED] wrote:
 Feels even better, mouse movements are very smooth even under high
 load. I noticed that X gets reniced to -19 with this scheduler.
 I've not looked at the code yet but this looked suspicious to me.
 I've reniced it to 0 and it did not change any behaviour. Still
 very good.
 
  Looks like this code does it:
 
  +int sysctl_sched_privileged_nice_level __read_mostly = -19;

 correct. 

Oh I definitely was not advocating against renicing X, I just suspect that 
virtually all the users who gave glowing reports to CFS comparing it to SD 
had no idea it had reniced X to -19 behind their back and that they were 
comparing it to SD running X at nice 0. I think had they been comparing CFS 
with X nice -19 to SD running nice -10 in this interactivity soft and squishy 
comparison land their thoughts might have been different. I missed it in the 
announcement and had to go looking in the code since Willy just kinda tripped 
over it unwittingly as well.

 Note that Willy reniced X back to 0 so it had no relevance on 
 his test.

Oh yes I did notice that, but since the array swap is the remaining longest 
deadline in SD which would cause noticeable jerks, renicing X on SD by 
default would make the experience very different since reniced tasks do much 
better over array swaps compared to non niced tasks. I really should go and 
make the whole thing one circular list and blow away the array swap (if I can 
figure out how to do it). 

 Also note that i pointed this change out in the -v4 CFS 

 announcement:
 || Changes since -v3:
 ||
 ||  - usability fix: automatic renicing of kernel threads such as
 ||keventd, OOM tasks and tasks doing privileged hardware access
 ||(such as Xorg).

Reading the changelog in the gloss-over fashion that I unfortunately did, even 
I missed it. 

 i've attached it below in a standalone form, feel free to put it into
 SD! :)

Hmm well I have tried my very best to do all the changes without 
changing policy as much as possible since that trips over so many emotive 
issues that noone can agree on, and I don't have a strong opinion on this as 
I thought it would be better for it to be a config option for X in userspace 
instead. Either way it needs to be turned on/off by admin and doing it by 
default in the kernel is... not universally accepted as good. What else 
accesses ioports that can get privileged nice levels? Does this make it 
relatively exploitable just by poking an ioport?

   Ingo

 ---
  arch/i386/kernel/ioport.c   |   13 ++---
  arch/x86_64/kernel/ioport.c |8 ++--
  drivers/block/loop.c|5 -
  include/linux/sched.h   |7 +++
  kernel/sched.c  |   40

Thanks for the patch. I'll consider it. Since end users are testing this in 
fuzzy interactivity land I may simply be forced to do this just for 
comparisons to be meaningful between CFS and SD otherwise they're not really 
comparing them on a level playing field. I had almost given up SD for dead 
meat with all the momentum CFS had gained... until recently.

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-21 Thread Con Kolivas
On Sunday 22 April 2007 08:54, Denis Vlasenko wrote:
 On Saturday 21 April 2007 18:00, Ingo Molnar wrote:
  correct. Note that Willy reniced X back to 0 so it had no relevance on
  his test. Also note that i pointed this change out in the -v4 CFS
 
  announcement:
  || Changes since -v3:
  ||
  ||  - usability fix: automatic renicing of kernel threads such as
  ||keventd, OOM tasks and tasks doing privileged hardware access
  ||(such as Xorg).
 
  i've attached it below in a standalone form, feel free to put it into
  SD! :)

 But X problems have nothing to do with privileged hardware access.
 X problems are related to priority inversions between server and client
 processes, and one server process - many client processes case.

It's not a privileged hardware access reason that this code is there. This is 
obfuscation/advertising to make it look like there is a valid reason for X 
getting negative nice levels somehow in the kernel to make interactive 
testing of CFS better by default.

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-21 Thread Con Kolivas
On Sunday 22 April 2007 04:17, Gene Heskett wrote:
 More first impressions of sd-0.44 vs CFS-v4

Thanks Gene.

 CFS-v4 is quite smooth in terms of the users experience but after prolonged
 observations approaching 24 hours, it appears to choke the cpu hog off a
 bit even when the system has nothing else to do.  My amanda runs went from
 1 to 1.5 hours depending on how much time it took gzip to handle the amount
 of data tar handed it, up to about 165m  change, or nearly 3 hours pretty
 consistently over 5 runs.

 sd-0.44 so far seems to be handling the same load (theres a backup running
 right now) fairly well also, and possibly theres a bit more snap to the
 system now.  A switch to screen 1 from this screen 8, and the loading of
 that screen image, which is the Cassini shot of saturn from the backside,
 the one showing that teeny dot to the left of Saturn that is actually us,
 took 10 seconds with the stock 2.6.21-rc7, 3 seconds with the best of
 Ingo's patches, and now with Con's latest, is 1 second flat. Another screen
 however is 4 seconds, so maybe that first scren had been looked at since I
 rebooted. However, amanda is still getting estimates so gzip hasn't put a
 tiewrap around the kernels neck just yet.

 Some minutes later, gzip is smunching /usr/src, and the machine doesn't
 even know its running as sd-0.44 isn't giving gzip more than 75% to gzip,
 and probably averaging less than 50%. And it scared me a bit as it started
 out at not over 5% for the first minute or so.  Running in the 70's now
 according to gkrellm, with an occasional blip to 95%.  And the machine
 generally feels good.

 I had previously given CFS-v4 a 95 score but that was before I saw the
 general slowdown, and I believe my first impression of this one is also a
 95.  This on a scale of the best one of the earlier CFS patches being 100,
 and stock 2.6.21-rc7 gets a 0.0.  This scheduler seems to be giving gzip
 ever more cpu as time progresses, and the cpu is warming up quite nicely,
 from about 132F idling to 149.9F now.  And my keyboard is still alive and
 well.

I'm not sure how much weight to put on what you see as the measured cpu usage. 
I have a feeling it's being wrongly reported in SD currently. Concentrate 
more on the actual progress and behaviour of things as you've already done.

 Generally speaking, Con, I believe this one is also a keeper.  And we'll
 see how long a backup run takes.

Great thanks for feedback.

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [REPORT] cfs-v4 vs sd-0.44

2007-04-21 Thread Con Kolivas
On Saturday 21 April 2007 22:12, Willy Tarreau wrote:
 2) SD-0.44

Feels good, but becomes jerky at moderately high loads. I've started
64 ocbench with a 250 ms busy loop and 750 ms sleep time. The system
always responds correctly but under X, mouse jumps quite a bit and
typing in xterm or even text console feels slightly jerky. The CPU is
not completely used, and the load varies a lot (see below). However,
the load is shared equally between all 64 ocbench, and they do not
deviate even after 4000 iterations. X uses less than 1% CPU during
those tests.

Found it. I broke SMP balancing again so there is serious scope for 
improvement on SMP hardware. That explains the huge load variations. Expect 
yet another fix soon, which should improve behaviour further :)

-- 
-ck


[PATCH] sched: implement staircase deadline scheduler ymf accounting fixes

2007-04-21 Thread Con Kolivas
This causes significant improvements on SMP hardware. I don't think the kernel
should be -nicing X by itself; that should be a sysadmin choice so I won't
be including that change in the SD patches. The following change will be in
the next release of SD (v0.45).

Andrew, please apply on top of yaf-fix.

---
SMP balancing broke on converting time_slice to usecs.

update_cpu_clock is unnecessarily complex and doesn't allow sub usec values.

Thanks to Willy Tarreau [EMAIL PROTECTED] for picking up SMP idle anomalies.

Signed-off-by: Con Kolivas [EMAIL PROTECTED]

---
 kernel/sched.c |   42 +-
 1 file changed, 17 insertions(+), 25 deletions(-)

Index: linux-2.6.21-rc7-sd/kernel/sched.c
===
--- linux-2.6.21-rc7-sd.orig/kernel/sched.c 2007-04-21 22:50:31.0 +1000
+++ linux-2.6.21-rc7-sd/kernel/sched.c  2007-04-22 13:29:29.0 +1000
@@ -88,12 +88,10 @@ unsigned long long __attribute__((weak))
 #define SCHED_PRIO(p)  ((p)+MAX_RT_PRIO)
 
 /* Some helpers for converting to/from various scales.*/
-#define NS_TO_JIFFIES(TIME)  ((TIME) / (1000000000 / HZ))
 #define JIFFIES_TO_NS(TIME)  ((TIME) * (1000000000 / HZ))
+#define MS_TO_NS(TIME) ((TIME) * 1000000)
 #define MS_TO_US(TIME) ((TIME) * 1000)
-/* Can return 0 */
-#define MS_TO_JIFFIES(TIME)  ((TIME) * HZ / 1000)
-#define JIFFIES_TO_MS(TIME)  ((TIME) * 1000 / HZ)
+#define US_TO_MS(TIME) ((TIME) / 1000)
 
 #define TASK_PREEMPTS_CURR(p, curr)  ((p)->prio < (curr)->prio)
 
@@ -876,29 +874,28 @@ static void requeue_task(struct task_str
 
 /*
  * task_timeslice - the total duration a task can run during one major
- * rotation. Returns value in jiffies.
+ * rotation. Returns value in milliseconds as the smallest value can be 1.
  */
-static inline int task_timeslice(struct task_struct *p)
+static int task_timeslice(struct task_struct *p)
 {
-   int slice;
+   int slice = p->quota;   /* quota is in us */
 
-   slice = NS_TO_JIFFIES(p->quota);
if (!rt_task(p))
slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice;
-   return slice;
+   return US_TO_MS(slice);
 }
 
 /*
  * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
  * If static_prio_timeslice() is ever changed to break this assumption then
- * this code will need modification
+ * this code will need modification. Scaled as multiples of milliseconds.
  */
 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
 #define LOAD_WEIGHT(lp) \
    (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
 #define TASK_LOAD_WEIGHT(p)  LOAD_WEIGHT(task_timeslice(p))
 #define RTPRIO_TO_LOAD_WEIGHT(rp)  \
-   (LOAD_WEIGHT((MS_TO_JIFFIES(rr_interval) + 20 + (rp))))
+   (LOAD_WEIGHT((rr_interval + 20 + (rp))))
 
 static void set_load_weight(struct task_struct *p)
 {
@@ -3035,32 +3032,27 @@ static void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now,
 int tick)
 {
-   cputime64_t time_diff = now - p->last_ran;
-   const unsigned int min_diff = 1000;
-   int us_time_diff;
+   long time_diff = now - p->last_ran;
 
if (tick) {
/*
 * Called from scheduler_tick() there should be less than two
 * jiffies worth, and not negative/overflow.
 */
-   if (time_diff > JIFFIES_TO_NS(2) || time_diff < min_diff)
+   if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0)
time_diff = JIFFIES_TO_NS(1);
} else {
/*
 * Called from context_switch there should be less than one
-* jiffy worth, and not negative/overflowed. In the case when
-* sched_clock fails to return high resolution values this
-* also ensures at least 1 min_diff gets banked.
+* jiffy worth, and not negative/overflow. There should be
+* some time banked here so use a nominal 1ms.
 */
-   if (time_diff > JIFFIES_TO_NS(1) || time_diff < min_diff)
-   time_diff = min_diff;
+   if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1)
+   time_diff = 1000;
}
/* time_slice accounting is done in usecs to avoid overflow on 32bit */
-   us_time_diff = time_diff;
-   us_time_diff /= 1000;
 if (p != rq->idle && p->policy != SCHED_FIFO)
-   p->time_slice -= us_time_diff;
+   p->time_slice -= time_diff / 1000;
 p->sched_time += time_diff;
 p->last_ran = rq->most_recent_timestamp = now;
 }
@@ -4636,8 +4628,8 @@ long sys_sched_rr_get_interval(pid_t pid
if (retval)
goto out_unlock;
 
-   jiffies_to_timespec(p->policy == SCHED_FIFO ?
-   0 : task_timeslice(p), t);
+   t = ns_to_timespec(p->policy == SCHED_FIFO ? 0

[PATCH] sched: ymf typo

2007-04-21 Thread Con Kolivas
Typo in comment, 1us not 1ms.

Signed-off-by: Con Kolivas [EMAIL PROTECTED]

---
 kernel/sched.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-2.6.21-rc7-sd/kernel/sched.c
===
--- linux-2.6.21-rc7-sd.orig/kernel/sched.c 2007-04-22 14:22:14.0 +1000
+++ linux-2.6.21-rc7-sd/kernel/sched.c  2007-04-22 14:22:34.0 +1000
@@ -3045,7 +3045,7 @@ update_cpu_clock(struct task_struct *p, 
/*
 * Called from context_switch there should be less than one
 * jiffy worth, and not negative/overflow. There should be
-* some time banked here so use a nominal 1ms.
+* some time banked here so use a nominal 1us.
 */
 if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1)
time_diff = 1000;

-- 
-ck


[ANNOUNCE] Staircase Deadline cpu scheduler version 0.45

2007-04-21 Thread Con Kolivas
A significant bugfix for SMP balancing was just posted for the staircase 
deadline cpu scheduler which improves behaviour dramatically on any SMP 
machine.

Thanks to Willy Tarreau for noticing the likely fault point.

Also requested was a version in the Makefile so this version of the patch 
adds -sd045 to the kernel version.

http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7-sd-0.45.patch
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.45.patch

Incrementals from 0.44:
http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7/sd-0.44-0.45.patch
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7/sd-0.44-0.45.patch

Renicing X to -10, while not essential, may be desirable on the desktop. 
Unlike the CFS scheduler, which renices X to -19 without your intervention, the 
SD patches do not alter nice levels on their own.

See the patch just posted called 'sched: implement staircase deadline 
scheduler ymf accounting fixes' for details of the fixes.

-- 
-ck


rr_interval experiments

2007-04-19 Thread Con Kolivas
On Friday 20 April 2007 01:01, Con Kolivas wrote:
> This then allows the maximum rr_interval to be as large as 5000
> milliseconds.

Just for fun, on a core2duo with make allnoconfig and make -j8, here are the 
build time differences (on a 1000HZ config machine):

16ms:
53.68user 4.81system 0:34.27elapsed 170%CPU (0avgtext+0avgdata 0maxresident)k

1ms:
56.73user 4.83system 0:36.03elapsed 170%CPU (0avgtext+0avgdata 0maxresident)k

5000ms:
52.88user 4.77system 0:32.37elapsed 178%CPU (0avgtext+0avgdata 0maxresident)k

For the record, 16ms is what SD v0.43 would choose as the default value on 
this hardware. A load with a much lower natural context switching rate than a 
kernel compile, as you said Nick, would show even greater discrepancy in 
these results.

Fun eh? Note these are not for any comparison with anything else; just to show 
the effect rr_interval changes have on throughput.
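
Switching rr_interval between runs is just a sysctl write. A minimal sketch of
doing it programmatically, assuming the tunable lands at
/proc/sys/kernel/rr_interval (implied by the Documentation/sysctl/kernel.txt
patch elsewhere in this thread, not something stated in the patches themselves):

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	const char *path = "/proc/sys/kernel/rr_interval";	/* assumed path */
	FILE *f;
	int ms;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <milliseconds>\n", argv[0]);
		return 1;
	}
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "%d\n", atoi(argv[1]));	/* e.g. 16, 1 or 5000 */
	fclose(f);

	f = fopen(path, "r");
	if (f && fscanf(f, "%d", &ms) == 1)
		printf("rr_interval is now %d ms\n", ms);
	if (f)
		fclose(f);
	return 0;
}

Writing the value from a root shell would do the same thing, under the same
path assumption.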

-- 
-ck


Re: Renice X for cpu schedulers

2007-04-19 Thread Con Kolivas
On Friday 20 April 2007 02:15, Mark Lord wrote:
> Con Kolivas wrote:
> > On Thursday 19 April 2007 23:17, Mark Lord wrote:
> >> Con Kolivas wrote:
> >>> So yes go ahead and think up great ideas for other ways of metering out cpu
> >>> bandwidth for different purposes, but for X, given the absurd
> >>> simplicity of renicing, why keep fighting it? Again I reiterate that
> >>> most users of SD have not found the need to renice X anyway except if
> >>> they stick to old habits of make -j4 on uniprocessor and the like, and
> >>> I expect that those on CFS and Nicksched would also have similar
> >>> experiences.
> >>
> >> Just plain "make" (no -j2 or -j) is enough to kill interactivity
> >> on my 2GHz P-M single-core non-HT machine with SD.
> >>
> >> But with the very first posted version of CFS by Ingo,
> >> I can do "make -j2" no problem and still have a nicely interactive
> >> desktop.
> >
> > Cool. Then there's clearly a bug with SD that manifests on your machine
> > as it should not have that effect at all (and doesn't on other people's
> > machines). I suggest trying the latest version which fixes some bugs.
>
> SD just doesn't do nearly as good as the stock scheduler, or CFS, here.
>
> I'm quite likely one of the few single-CPU/non-HT testers of this stuff.
> If it should ever get more widely used I think we'd hear a lot more
> complaints.

You are not really one of the few. A lot of my own work is done on a single 
core pentium M 1.7Ghz laptop. I am not endowed with truckloads of hardware 
like all the paid developers are. I recall extreme frustration myself when a 
developer a few years ago (around 2002) said he couldn't reproduce poor 
behaviour on his 4GB ram 4 x Xeon machine. Even today if I add up every 
machine I have in my house and work at my disposal it doesn't amount to that 
many cpus and that much ram.

-- 
-ck


Re: Renice X for cpu schedulers

2007-04-19 Thread Con Kolivas
On Friday 20 April 2007 05:26, Ray Lee wrote:
> On 4/19/07, Con Kolivas <[EMAIL PROTECTED]> wrote:
> > The one fly in the ointment for
> > linux remains X. I am still, to this moment, completely and utterly
> > stunned at why everyone is trying to find increasingly complex unique
> > ways to manage X when all it needs is more cpu[1].
>
> [...and hence should be reniced]
>
> The problem is that X is not unique. There's postgresql, memcached,
> mysql, db2, a little embedded app I wrote... all of these perform work
> on behalf of another process. It's just most *noticeable* with X, as
> pretty much everyone is running that.
>
> If we had some way for the scheduler to decide to donate part of a
> client process's time slice to the server it just spoke to (with an
> exponential dampening factor -- take 50% from the client, give 25% to
> the server, toss the rest on the floor), that -- from my naive point
> of view -- would be a step toward fixing the underlying issue. Or I
> might be spouting crap, who knows.
>
> The problem is real, though, and not limited to X.
>
> While I have the floor, thank you, Con, for all your work.

You're welcome and thanks for taking the floor to speak. I would say you have 
actually agreed with me though. X is not unique, it's just an obvious one, so 
let's not design the cpu scheduler around the problem with X. The same goes for 
every other application. Leaving the choice to hand out differential cpu 
usage where it seems to be needed should be up to the users. The donation idea 
has been done before in some fashion or other in things like "back-boost", 
which Linus himself tried in the 2.5.X days. It worked lovely till it did the 
wrong thing and wreaked havoc. As has been shown repeatedly, the workarounds and 
the tweaks and the bonuses and the deciding who to give an advantage to, when 
done by the cpu scheduler, are also its undoing, as it can't always get 
it right. The consequences of getting it wrong, on the other hand, are 
disastrous. The cpu scheduler core is a cpu bandwidth and latency 
proportionator and should be nothing more or less.
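
For concreteness, the 50%/25% donation Ray sketches above would amount to
something like the following user-space toy (purely hypothetical arithmetic to
illustrate the idea, not code from any posted scheduler):

#include <stdio.h>

/* Toy model of the dampened donation idea quoted above: slices in usecs. */
static void donate(int *client_slice, int *server_slice)
{
	int gift = *client_slice / 2;		/* take 50% from the client */

	*client_slice -= gift;
	*server_slice += gift / 2;		/* give 25% to the server */
	/* the remaining 25% is dropped on the floor: the dampening */
}

int main(void)
{
	int client = 8000, server = 8000;	/* two 8 ms quotas */

	donate(&client, &server);
	printf("client %d us, server %d us\n", client, server);
	return 0;
}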

-- 
-ck


Re: Renice X for cpu schedulers

2007-04-19 Thread Con Kolivas
On Friday 20 April 2007 04:16, Gene Heskett wrote:
> On Thursday 19 April 2007, Con Kolivas wrote:
>
> [and I snipped a good overview]
>
> >So yes go ahead and think up great ideas for other ways of metering out
> > cpu bandwidth for different purposes, but for X, given the absurd
> > simplicity of renicing, why keep fighting it? Again I reiterate that most
> > users of SD have not found the need to renice X anyway except if they
> > stick to old habits of make -j4 on uniprocessor and the like, and I
> > expect that those on CFS and Nicksched would also have similar
> > experiences.
>
> FWIW folks, I have never touched X's niceness, it's running at the default
> -1 for all of my so-called 'tests', and I have another set to be rebooted
> to right now.  And yes, my kernel makeit script uses -j4 by default, and
> has used -j8 just for effects, which weren't all that different from what I
> expected in 'abusing' a UP system that way.  The system DID remain usable,
> not snappy, but usable.

Gene, you're agreeing with me. You've shown that you're very happy with a fair 
distribution of cpu and leaving X at nice 0.
>
> Having tried re-nicing X a while back, and having the rest of the system
> suffer in quite obvious ways for even 1 + or - from its default felt pretty
> bad from this user's perspective.
>
> It is my considered opinion (yeah I know, I'm just a leaf in the hurricane
> of this list) that if X has to be re-niced from the 1 point advantage it's
> had for ages, then something is basically wrong with the overall scheduling,
> cpu or i/o, or both in combination.  FWIW I'm using cfq for i/o.

It's those who want X to have an unfair advantage that want it to do 
something "special". Your agreement that it works fine at nice 0 shows you 
don't want it to have an unfair advantage. Others who want it to have an 
unfair advantage _can_ renice it if they desire. But if the cpu scheduler 
gives X an unfair advantage within the kernel by default then you have _no_ 
choice. If you leave the choice up to userspace (renice or not) then both 
parties get their way. If you put it into the kernel only one party wins and 
there is no way for the Genes (and Cons) of this world to get it back.

Your opinion is as valuable as everyone else's, Gene. It is hard to get people 
to speak on as frightening a playground as the linux kernel mailing list, so 
please do. 

-- 
-ck


[ANNOUNCE] Staircase Deadline cpu scheduler version 0.43

2007-04-19 Thread Con Kolivas
In order to keep raising the standard for comparison for the alternative new 
scheduler developments, here is an updated version of the staircase deadline 
cpu scheduler.

http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7-sd-0.43.patch
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.43.patch

Incrementals in
http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7/
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7/

Renicing X to -10, while not essential, is preferable.

See the 3 patches just posted for full changelog of 0.42-0.43.

-- 
-ck


Re: Renice X for cpu schedulers

2007-04-19 Thread Con Kolivas
On Thursday 19 April 2007 23:17, Mark Lord wrote:
> Con Kolivas wrote:
> > So yes go ahead and think up great ideas for other ways of metering out cpu
> > bandwidth for different purposes, but for X, given the absurd simplicity
> > of renicing, why keep fighting it? Again I reiterate that most users of
> > SD have not found the need to renice X anyway except if they stick to old
> > habits of make -j4 on uniprocessor and the like, and I expect that those
> > on CFS and Nicksched would also have similar experiences.
>
> Just plain "make" (no -j2 or -j) is enough to kill interactivity
> on my 2GHz P-M single-core non-HT machine with SD.
>
> But with the very first posted version of CFS by Ingo,
> I can do "make -j2" no problem and still have a nicely interactive desktop.

Cool. Then there's clearly a bug with SD that manifests on your machine as it 
should not have that effect at all (and doesn't on other people's machines). 
I suggest trying the latest version which fixes some bugs.

Thanks.

-- 
-ck


[PATCH] [3/3] sched: increase ksoftirqd priority

2007-04-19 Thread Con Kolivas
More aggressive nice discrimination by the Staircase-Deadline cpu scheduler
means ksoftirqd is getting significantly less cpu than previously. Adjust
nice value accordingly for similar cpu distribution.

Signed-off-by: Con Kolivas <[EMAIL PROTECTED]>

---
 kernel/softirq.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-2.6.21-rc7-sd/kernel/softirq.c
===
--- linux-2.6.21-rc7-sd.orig/kernel/softirq.c   2007-04-20 00:30:08.0 +1000
+++ linux-2.6.21-rc7-sd/kernel/softirq.c   2007-04-20 00:30:31.0 +1000
@@ -488,7 +488,7 @@ void __init softirq_init(void)
 
 static int ksoftirqd(void * __bind_cpu)
 {
-   set_user_nice(current, 19);
+   set_user_nice(current, 15);
current->flags |= PF_NOFREEZE;
 
set_current_state(TASK_INTERRUPTIBLE);

-- 
-ck


[PATCH] [1/3] sched: implement staircase deadline scheduler timeslice fixes

2007-04-19 Thread Con Kolivas
This is the first in a series of 3 patches to bring the staircase deadline cpu
scheduler up to version 0.43. They apply on top of
"[PATCH] sched: implement staircase deadline scheduler further improvements-1"
Assuming we're still queueing these up in -mm for comparison, that patch is
still outstanding.

---
There is no need for time_slice and quota to be stored in nanoseconds, and
they can overflow on 32bit when rr_intervals are large. Convert them to
microseconds.

This then allows the maximum rr_interval to be as large as 5000 milliseconds.
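
A rough feel for the numbers, assuming time_slice and quota sit in signed
32-bit fields: rr_quota() gives a nice -20 task 10 * rr_interval, and
task_timeslice() (seen in the accounting-fixes patch elsewhere in this series)
scales that by up to PRIO_RANGE (40), i.e. roughly 400 * rr_interval in total, so

    400 * 5000 ms = 2,000,000 ms = 2.0e9 us, which still fits under 2^31 - 1 (~2.147e9) us,
    while the same slice in nanoseconds would be 2.0e12 ns, far beyond 32 bits.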

Alter the choice of initial rr_interval to scale more with cpus in an
understandable fashion along with explanation.

Don't check that rr_interval is at least one tick every time rr_quota is
called. Simply allow it to be less if the user desires and allow aliasing
to keep accounting sane overall.

Thanks to Nick Piggin for suggesting larger timeslices.
Thanks to Peter Zijlstra for help.

Signed-off-by: Con Kolivas <[EMAIL PROTECTED]>

---
 kernel/sched.c  |   45 +++--
 kernel/sysctl.c |   11 ++-
 2 files changed, 29 insertions(+), 27 deletions(-)

Index: linux-2.6.21-rc7-sd/kernel/sched.c
===
--- linux-2.6.21-rc7-sd.orig/kernel/sched.c 2007-04-19 22:50:01.0 +1000
+++ linux-2.6.21-rc7-sd/kernel/sched.c  2007-04-19 23:59:24.0 +1000
@@ -53,6 +53,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
+#include <linux/log2.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -89,7 +90,7 @@ unsigned long long __attribute__((weak))
 /* Some helpers for converting to/from various scales.*/
 #define NS_TO_JIFFIES(TIME)  ((TIME) / (1000000000 / HZ))
 #define JIFFIES_TO_NS(TIME)  ((TIME) * (1000000000 / HZ))
-#define MS_TO_NS(TIME) ((TIME) * 1000000)
+#define MS_TO_US(TIME) ((TIME) * 1000)
 /* Can return 0 */
 #define MS_TO_JIFFIES(TIME)  ((TIME) * HZ / 1000)
 #define JIFFIES_TO_MS(TIME)  ((TIME) * 1000 / HZ)
@@ -101,9 +102,8 @@ unsigned long long __attribute__((weak))
  * Value is in ms and set to a minimum of 8ms. Scales with number of cpus.
  * Tunable via /proc interface.
  */
-int rr_interval __read_mostly;
+int rr_interval __read_mostly = 8;
 
-#define RR_INTERVAL 8
 #define DEF_TIMESLICE  (rr_interval * 20)
 
 /*
@@ -988,23 +988,20 @@ static int effective_prio(struct task_st
  * tick still. Below nice 0 they get progressively larger.
  * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval
  * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2.
- * Value returned is in nanoseconds.
+ * Value returned is in microseconds.
  */
 static unsigned int rr_quota(struct task_struct *p)
 {
int nice = TASK_NICE(p), rr = rr_interval;
 
-   /* Ensure that rr_interval is at least 1 tick */
-   if (unlikely(!MS_TO_JIFFIES(rr)))
-   rr = rr_interval = JIFFIES_TO_MS(1) ? : 1;
if (!rt_task(p)) {
if (nice < -6) {
rr *= nice * nice;
rr /= 40;
-   } else if (nice > 0 && (rr * HZ / 1000 / 2) > 0)
-   rr /= 2;
+   } else if (nice > 0)
+   rr = rr / 2 ? : 1;
}
-   return MS_TO_NS(rr);
+   return MS_TO_US(rr);
 }
 
 /*
@@ -3015,16 +3012,17 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 /*
  * This is called on clock ticks and on context switches.
  * Bank in p->sched_time the ns elapsed since the last tick or switch.
- * CPU scheduler quota accounting is also performed here.
+ * CPU scheduler quota accounting is also performed here in microseconds.
  * The value returned from sched_clock() occasionally gives bogus values so
  * some sanity checking is required.
  */
-static inline void
+static void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now,
 int tick)
 {
cputime64_t time_diff = now - p->last_ran;
-   unsigned int min_diff = 1000;
+   const unsigned int min_diff = 1000;
+   int us_time_diff;
 
if (tick) {
/*
@@ -3043,8 +3041,11 @@ update_cpu_clock(struct task_struct *p, 
if (time_diff > JIFFIES_TO_NS(1) || time_diff < min_diff)
time_diff = min_diff;
}
+   /* time_slice accounting is done in usecs to avoid overflow on 32bit */
+   us_time_diff = time_diff;
+   us_time_diff /= 1000;
if (p != rq->idle && p->policy != SCHED_FIFO)
-   p->time_slice -= time_diff;
+   p->time_slice -= us_time_diff;
p->sched_time += time_diff;
p->last_ran = rq->most_recent_timestamp = now;
 }
@@ -3145,8 +3146,7 @@ void account_steal_time(struct task_stru
 static void task_expired_entitlement(struct rq *rq, struct task_struct *p)
 {
struct prio_array *old_array;
-   int overrun;
-   int old_prio;
+   int overrun, old_prio;
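
To see what the new units work out to, here is a small user-space rework of the
rr_quota() arithmetic from the hunk above, combined with the task_timeslice()
scaling shown in the accounting-fixes patch elsewhere in this series. It is my
own illustration under the assumptions rr_interval = 8 ms, PRIO_RANGE = 40 and
TASK_USER_PRIO = nice + 20 for SCHED_NORMAL tasks; it is not code from the patch.

#include <stdio.h>

#define RR_INTERVAL_MS	8
#define PRIO_RANGE	40

/* Same maths as rr_quota(), but in plain milliseconds. */
static int quota_ms(int nice)
{
	int rr = RR_INTERVAL_MS;

	if (nice < -6) {
		rr *= nice * nice;
		rr /= 40;
	} else if (nice > 0)
		rr = rr / 2 ? rr / 2 : 1;
	return rr;
}

/* Same maths as task_timeslice() for a non-rt task. */
static int timeslice_ms(int nice)
{
	int user_prio = nice + 20;
	int slice = quota_ms(nice);

	return slice + (PRIO_RANGE - 1 - user_prio) * slice;
}

int main(void)
{
	int nices[] = { -20, -10, -5, 0, 10, 19 };
	unsigned int i;

	for (i = 0; i < sizeof(nices) / sizeof(nices[0]); i++)
		printf("nice %3d: quota %3d ms, timeslice %4d ms\n",
		       nices[i], quota_ms(nices[i]), timeslice_ms(nices[i]));
	return 0;
}

With the default 8 ms interval that works out to 160 ms of total entitlement per
major rotation at nice 0, 3200 ms at nice -20 and 4 ms at nice 19.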
 
 

[PATCH] [2/3] sched: implement staircase deadline scheduler docupdate

2007-04-19 Thread Con Kolivas
Update documentation to reflect higher maximum rr_interval.

Signed-off-by: Con Kolivas <[EMAIL PROTECTED]>

---
 Documentation/sysctl/kernel.txt |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-2.6.21-rc7-sd/Documentation/sysctl/kernel.txt
===
--- linux-2.6.21-rc7-sd.orig/Documentation/sysctl/kernel.txt   2007-04-20 00:29:12.0 +1000
+++ linux-2.6.21-rc7-sd/Documentation/sysctl/kernel.txt 2007-04-20 00:29:32.0 +1000
@@ -298,7 +298,7 @@ overall. This value is in milliseconds a
 depends on the number of cpus available at scheduler initialisation
 with a minimum of 8.
 
-Valid values are from 1-100.
+Valid values are from 1-5000.
 
 ==
 

-- 
-ck


Re: Announce - Staircase Deadline cpu scheduler v0.42

2007-04-19 Thread Con Kolivas
On Thursday 19 April 2007 22:55, Willy Tarreau wrote:
> On Thu, Apr 19, 2007 at 12:12:14PM +1000, Con Kolivas wrote:
> > On Thursday 19 April 2007 10:41, Con Kolivas wrote:
> > > On Thursday 19 April 2007 09:59, Con Kolivas wrote:
> > > > Since there is so much work currently ongoing with alternative cpu
> > > > schedulers, as a standard for comparison with the alternative virtual
> > > > deadline fair designs I've addressed a few issues in the Staircase
> > > > Deadline cpu scheduler which improve behaviour likely in a noticeable
> > > > fashion and released version 0.41.
> > > >
> > > > http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7-sd-0.41.pat
> > > >ch
> > > > http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.41.p
> > > >atch
> > > >
> > > > and an incremental for those on 0.40:
> > > > http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7/sched-imp
> > > >leme nt -staircase-deadline-scheduler-further-improvements.patch
> > > >
> > > > Remember to renice X to -10 for nicest desktop behaviour :)
> > > >
> > > > Have fun.
> > >
> > > Oops forgot to cc a few people
> > >
> > > Nick you said I should still have something to offer so here it is.
> > > Peter you said you never saw this design (it's a dual array affair
> > > sorry). Gene and Willy you were some of the early testers that noticed
> > > the advantages of the earlier designs,
> > > Matt you did lots of great earlier testing.
> > > WLI you inspired a lot of design ideas.
> > > Mike you were the stick.
> > > And a few others I've forgotten to mention and include.
> >
> > Version 0.42
> >
> > http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.42.patch
>
> Will give it a shoot ASAP, probably this week-end. I'm too short in time
> this week.

Great, thanks. By then there will almost certainly be a 0.43 so no rush.

-- 
-ck


Renice X for cpu schedulers

2007-04-19 Thread Con Kolivas
Ok, there are 3 known schedulers currently being "promoted" as solid 
replacements for the mainline scheduler which address most of the issues with 
mainline (and about 10 other ones not currently being promoted). The main way 
they do this is through attempting to maintain solid fairness. There is 
enough evidence mounting now from the numerous test cases fixed by much 
fairer designs that this is the way forward for a general purpose cpu 
scheduler which is what linux needs. 

Interactivity of just about everything that needs low latency (ie audio and 
video players) is easily managed by maintaining low latency between wakeups 
and scheduling of all these low cpu users. The one fly in the ointment for 
linux remains X. I am still, to this moment, completely and utterly stunned 
at why everyone is trying to find increasingly complex unique ways to manage 
X when all it needs is more cpu[1]. Now most of these are actually very good 
ideas about _extra_ features that would be desirable in the long run for 
linux, but given the ludicrous simplicity of renicing X I cannot fathom why 
people keep promoting these alternatives. At the time of 2.6.0 coming out we 
were desperately trying to get half decent interactivity within a reasonable 
time frame to release 2.6.0 without rewiring the whole scheduler. So I 
tweaked the crap out of the tunables that were already there[2].

So let's hear from the 3 people who generated the schedulers under the 
spotlight. These are recent snippets and by no means the only time these 
comments have been said. Without sounding too bold, we do know a thing or two 
about scheduling.

CFS:
On Thursday 19 April 2007 16:38, Ingo Molnar wrote:
> h. How about the following then: default to nice -10 for all
> (SCHED_NORMAL) kernel threads and all root-owned tasks. Root _is_
> special: root already has disk space reserved to it, root has special
> memory allocation allowances, etc. I dont see a reason why we couldnt by
> default make all root tasks have nice -10. This would be instantly loved
> by sysadmins i suspect ;-)
>
> (distros that go the extra mile of making Xorg run under non-root could
> also go another extra one foot to renice that X server to -10.)

Nicksched:
On Wednesday 18 April 2007 15:00, Nick Piggin wrote:
> What's wrong with allowing X to get more than it's fair share of CPU
> time by "fiddling with nice levels"? That's what they're there for.

and

Staircase-Deadline:
On Thursday 19 April 2007 09:59, Con Kolivas wrote:
> Remember to renice X to -10 for nicest desktop behaviour :)


[1] The one caveat I can think of is that when you share X sessions across 
multiple users -with a fair cpu scheduler-, having them all nice 0 also makes 
the distribution of cpu across the multiple users very even and smooth, 
without the expense of burning away the other person's cpu time they'd like 
for compute intensive non gui things. If you make a scheduler that always 
favours X this becomes impossible. I've had enough users offlist ask me to 
help them set up multiuser environments just like this with my schedulers 
because they were unable to do it with mainline, even with SCHED_BATCH, short 
of nicing everything +19. This makes the argument for not favouring X within 
the scheduler with tweaks even stronger.

[2] Nick was promoting renicing X with his Nicksched alternative at the time 
of 2.6.0 and while we were not violently opposed to renicing X, Nicksched was 
still very new on the scene and didn't have the luxury of extended testing 
and reiteration in time for 2.6 and he put the project on hold for some time 
after that. The correctness of his views on renicing has certainly become 
more obvious over time.


So yes go ahead and think up great ideas for other ways of metering out cpu 
bandwidth for different purposes, but for X, given the absurd simplicity of 
renicing, why keep fighting it? Again I reiterate that most users of SD have 
not found the need to renice X anyway except if they stick to old habits of 
make -j4 on uniprocessor and the like, and I expect that those on CFS and 
Nicksched would also have similar experiences.
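
For the record, the renicing being argued over really is a one-liner from
userspace: a single setpriority() call (or the equivalent renice run as root).
A minimal sketch, with the X server's pid supplied by hand:

#include <sys/resource.h>
#include <stdio.h>
#include <stdlib.h>

/* Renice an already-running process (e.g. the X server) to -10.
 * Minimal illustration only; needs root for negative nice values. */
int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	if (setpriority(PRIO_PROCESS, (id_t)atoi(argv[1]), -10)) {
		perror("setpriority");
		return 1;
	}
	return 0;
}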

-- 
-ck


Re: Announce - Staircase Deadline cpu scheduler v0.42

2007-04-19 Thread Con Kolivas
On Thursday 19 April 2007 20:22, Nick Piggin wrote:
> On Thu, Apr 19, 2007 at 07:40:04PM +1000, Con Kolivas wrote:
> > On Thursday 19 April 2007 13:22, Nick Piggin wrote:
> > > On Thu, Apr 19, 2007 at 12:12:14PM +1000, Con Kolivas wrote:
> > > > Version 0.42
> > > >
> > > > http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.42.p
> > > >atch
> > >
> > > OK, I run some tests later today...
> >
> > Thank you very much.
>
> lmbench numbers are roughly comparable to mainline (lmbench seems to be
> a bit erratic, but there isn't the obvious drop that cfs has).

Great.

Thanks again for doing these.

> Didn't worry about hackbench ;)
>
> kbuild:
> 2.6.21-rc7
> 508.87user 32.47system 2:17.82elapsed 392%CPU
> 509.05user 32.25system 2:17.84elapsed 392%CPU
> 508.75user 32.26system 2:17.83elapsed 392%CPU
> 508.63user 32.17system 2:17.88elapsed 392%CPU
> 509.01user 32.26system 2:17.90elapsed 392%CPU
> 509.08user 32.20system 2:17.95elapsed 392%CPU
>
> 2.6.21-rc7-sd42
> 512.78user 31.99system 2:18.41elapsed 393%CPU
> 512.55user 31.90system 2:18.57elapsed 392%CPU
> 513.05user 31.78system 2:18.48elapsed 393%CPU
> 512.46user 32.06system 2:18.63elapsed 392%CPU
> 512.78user 31.81system 2:18.49elapsed 393%CPU
> 512.41user 32.08system 2:18.70elapsed 392%CPU
>
> sd42 is doing about 745 context switches per second here, and performance is
> slightly below mainline. But it isn't doing badly.

Not bad. It's always impossible to know where the sweet spot will lie with 
these things. Basically the higher the better for this one thing of 
course.

> 507.87user 32.53system 2:17.50elapsed 392%CPU
> 508.47user 32.40system 2:17.56elapsed 393%CPU
> 508.59user 32.27system 2:17.53elapsed 393%CPU
>
> A few runs with rr_interval at 100 show that ctxsw numbers drop to 587, and
> performance is up to slightly above mainline.

Well they're nice numbers indeed. I don't even need to leave the maximum at 
100ms but I seriously doubt we'd see much improvement beyond there... I'm 
sure some renderfarm might enjoy values of 1 second though (like my -ck 
patchet already offers in compute mode for old fashioned staircase cpu 
sched).

> With the results I've got so far with all schedulers (actually I didn't try
> nicksched with a small timeslice, but I'm sure it would give the expected
> result)... I'd say 5ms might be too small a timeslice. Even 15ms will hurt
> some people I think.

On 4x (that's what your hardware was IIRC) SD would be setting it to 15ms. 
Mainline's timeslice granularity would set 4x to about 40ms. Pulling a 
number out of my rear end generator I'd say 20ms for SD should make it close 
enough without sacrificing latency too much more.

> Although we could arguably tolerate this kind of regression, my box only
> has 1MB caches, and kbuild is naturally context switching at over 500 per
> second anyway. On something with bigger caches and less context switchy /
> more cache sensitive workloads, the regression could be quite a bit worse.
>
> (not directed at anyone in particular, but food for thought)

Heck if I'm going to keep offering SD as an alternative for comparison I may 
as well use the food you've offered me. I guess I better work on a v0.43 with 
some more of these very easy to do tweaks then. Thanks!

-- 
-ck


Re: Announce - Staircase Deadline cpu scheduler v0.42

2007-04-19 Thread Con Kolivas
On Thursday 19 April 2007 13:22, Nick Piggin wrote:
> On Thu, Apr 19, 2007 at 12:12:14PM +1000, Con Kolivas wrote:
> > Version 0.42
> >
> > http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.42.patch
>
> OK, I run some tests later today...


Thank you very much.

-- 
-ck


Re: Announce - Staircase Deadline cpu scheduler v0.42

2007-04-19 Thread Con Kolivas
On Thursday 19 April 2007 13:22, Nick Piggin wrote:
 On Thu, Apr 19, 2007 at 12:12:14PM +1000, Con Kolivas wrote:
  Version 0.42
 
  http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.42.patch

 OK, I run some tests later today...


Thank you very much.

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Announce - Staircase Deadline cpu scheduler v0.42

2007-04-19 Thread Con Kolivas
On Thursday 19 April 2007 20:22, Nick Piggin wrote:
 On Thu, Apr 19, 2007 at 07:40:04PM +1000, Con Kolivas wrote:
  On Thursday 19 April 2007 13:22, Nick Piggin wrote:
   On Thu, Apr 19, 2007 at 12:12:14PM +1000, Con Kolivas wrote:
Version 0.42
   
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.42.p
   atch
  
   OK, I run some tests later today...
 
  Thank you very much.

 lmbench numbers are roughly comparable to mainline (lmbench seems to be
 a bit erratic, but there isn't the obvious drop that cfs has).

Great.

Thanks again for doing these.

 Didn't worry about hackbench ;)

 kbuild:
 2.6.21-rc7
 508.87user 32.47system 2:17.82elapsed 392%CPU
 509.05user 32.25system 2:17.84elapsed 392%CPU
 508.75user 32.26system 2:17.83elapsed 392%CPU
 508.63user 32.17system 2:17.88elapsed 392%CPU
 509.01user 32.26system 2:17.90elapsed 392%CPU
 509.08user 32.20system 2:17.95elapsed 392%CPU

 2.6.21-rc7-sd42
 512.78user 31.99system 2:18.41elapsed 393%CPU
 512.55user 31.90system 2:18.57elapsed 392%CPU
 513.05user 31.78system 2:18.48elapsed 393%CPU
 512.46user 32.06system 2:18.63elapsed 392%CPU
 512.78user 31.81system 2:18.49elapsed 393%CPU
 512.41user 32.08system 2:18.70elapsed 392%CPU

 sd42 is doing about 745 context switches per second here, and perfomance is
 slightly below mainline. But it isn't doing badly.

Not bad. It's always impossible to know where the sweet spot will lie with 
these things. Basically the higher the better for this one thing of 
course.

 507.87user 32.53system 2:17.50elapsed 392%CPU
 508.47user 32.40system 2:17.56elapsed 393%CPU
 508.59user 32.27system 2:17.53elapsed 393%CPU

 A few runs with rr_interval at 100 show that ctxsw numbers drop to 587, and
 performance is up to slightly above mainline.

Well they're nice numbers indeed. I don't even need to leave the maximum at 
100ms but I seriously doubt we'd see much improvement beyond there... I'm 
sure some renderfarm might enjoy values of 1 second though (like my -ck 
patchet already offers in compute mode for old fashioned staircase cpu 
sched).

 With the results I've got so far with all scedulers (actually I didn't try
 nicksched with a small timeslice, but I'm sure it would give the expected
 result)... I'd say 5ms might be too small a timeslice. Even 15ms will hurt
 some people I think.

On 4x (that's what your hardware was IIRC) SD would be setting it to 15ms. 
Mainline would be timeslice granularity setting 4x to about 40ms. Pulling a 
number out of my rear end generator I'd say 20ms for SD should make it close 
enough without sacrificing latency too much more.

 Although we could arguably tolerate this kind of regression, my box only
 has 1MB caches, and kbuild is naturally context switching at over 500 per
 second anyway. On something with bigger caches and less context switchy /
 more cache sensitive workloads, the regression could be quite a bit worse.

 (not directed at anyone in particular, but food for thought)

Heck if I'm going to keep offering SD as an alternative for comparison I may 
as well use the food you've offered me. I guess I better work on a v0.43 with 
some more of these very easy to do tweaks then. Thanks!

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Renice X for cpu schedulers

2007-04-19 Thread Con Kolivas
Ok, there are 3 known schedulers currently being promoted as solid 
replacements for the mainline scheduler which address most of the issues with 
mainline (and about 10 other ones not currently being promoted). The main way 
they do this is through attempting to maintain solid fairness. There is 
enough evidence mounting now from the numerous test cases fixed by much 
fairer designs that this is the way forward for a general purpose cpu 
scheduler which is what linux needs. 

Interactivity of just about everything that needs low latency (ie audio and 
video players) are easily managed by maintaining low latency between wakeups 
and scheduling of all these low cpu users. The one fly in the ointment for 
linux remains X. I am still, to this moment, completely and utterly stunned 
at why everyone is trying to find increasingly complex unique ways to manage 
X when all it needs is more cpu[1]. Now most of these are actually very good 
ideas about _extra_ features that would be desirable in the long run for 
linux, but given the ludicrous simplicity of renicing X I cannot fathom why 
people keep promoting these alternatives. At the time of 2.6.0 coming out we 
were desparately trying to get half decent interactivity within a reasonable 
time frame to release 2.6.0 without rewiring the whole scheduler. So I 
tweaked the crap out of the tunables that were already there[2].

So let's hear from the 3 people who generated the schedulers under the 
spotlight. These are recent snippets and by no means the only time these 
comments have been said. Without sounding too bold, we do know a thing or two 
about scheduling.

CFS:
On Thursday 19 April 2007 16:38, Ingo Molnar wrote:
 h. How about the following then: default to nice -10 for all
 (SCHED_NORMAL) kernel threads and all root-owned tasks. Root _is_
 special: root already has disk space reserved to it, root has special
 memory allocation allowances, etc. I dont see a reason why we couldnt by
 default make all root tasks have nice -10. This would be instantly loved
 by sysadmins i suspect ;-)

 (distros that go the extra mile of making Xorg run under non-root could
 also go another extra one foot to renice that X server to -10.)

Nicksched:
On Wednesday 18 April 2007 15:00, Nick Piggin wrote:
 What's wrong with allowing X to get more than it's fair share of CPU
 time by fiddling with nice levels? That's what they're there for.

and

Staircase-Deadline:
On Thursday 19 April 2007 09:59, Con Kolivas wrote:
 Remember to renice X to -10 for nicest desktop behaviour :)


[1]The one caveat I can think of is that when you share X sessions across 
multiple users -with a fair cpu scheduler-, having them all nice 0 also makes 
the distribution of cpu across the multiple users very even and smooth, 
without the expense of burning away the other person's cpu time they'd like 
for compute intensive non gui things. If you make a scheduler that always 
favours X this becomes impossible. I've had enough users offlist ask me to 
help them set up multiuser environments just like this with my schedulers 
because they were unable to do it with mainline, even with SCHED_BATCH, short 
of nicing everything +19. This makes the argument for not favouring X within 
the scheduler with tweaks even stronger.

[2] Nick was promoting renicing X with his Nicksched alternative at the time 
of 2.6.0 and while we were not violently opposed to renicing X, Nicksched was 
still very new on the scene and didn't have the luxury of extended testing 
and reiteration in time for 2.6 and he put the project on hold for some time 
after that. The correctness of his views on renicing certainly have become 
more obvious over time.


So yes go ahead and think up great ideas for other ways of metering out cpu 
bandwidth for different purposes, but for X, given the absurd simplicity of 
renicing, why keep fighting it? Again I reiterate that most users of SD have 
not found the need to renice X anyway except if they stick to old habits of 
make -j4 on uniprocessor and the like, and I expect that those on CFS and 
Nicksched would also have similar experiences.

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Announce - Staircase Deadline cpu scheduler v0.42

2007-04-19 Thread Con Kolivas
On Thursday 19 April 2007 22:55, Willy Tarreau wrote:
 On Thu, Apr 19, 2007 at 12:12:14PM +1000, Con Kolivas wrote:
  On Thursday 19 April 2007 10:41, Con Kolivas wrote:
   On Thursday 19 April 2007 09:59, Con Kolivas wrote:
Since there is so much work currently ongoing with alternative cpu
schedulers, as a standard for comparison with the alternative virtual
deadline fair designs I've addressed a few issues in the Staircase
Deadline cpu scheduler which improve behaviour likely in a noticeable
fashion and released version 0.41.
   
http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7-sd-0.41.pat
   ch
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.41.p
   atch
   
and an incremental for those on 0.40:
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7/sched-imp
   leme nt -staircase-deadline-scheduler-further-improvements.patch
   
Remember to renice X to -10 for nicest desktop behaviour :)
   
Have fun.
  
   Oops forgot to cc a few people
  
   Nick you said I should still have something to offer so here it is.
   Peter you said you never saw this design (it's a dual array affair
   sorry). Gene and Willy you were some of the early testers that noticed
   the advantages of the earlier designs,
   Matt you did lots of great earlier testing.
   WLI you inspired a lot of design ideas.
   Mike you were the stick.
   And a few others I've forgotten to mention and include.
 
  Version 0.42
 
  http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.42.patch

 Will give it a shoot ASAP, probably this week-end. I'm too short in time
 this week.

Great, thanks. By then there will almost certainly be a 0.43 so no rush.

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] [1/3] sched: implement staircase deadline scheduler timeslice fixes

2007-04-19 Thread Con Kolivas
This is the first in a series of 3 patches to bring the staircase deadline cpu
scheduler up to version 0.43. They apply on top of
[PATCH] sched: implement staircase deadline scheduler further improvements-1
Assuming we're still queueing these up in -mm for comparison, that patch is
still outstanding.

---
There is no need for time_slice and quota to be stored in nanoseconds and
can overflow on 32bit when rr_intervals are large. Convert them to
microseconds.

This then allows the maximum rr_interval to be as large as 5000 milliseconds.

Alter the choice of initial rr_interval to scale more with cpus in an
understandable fashion along with explanation.

Don't check that rr_interval is at least one tick every time rr_quota is
called. Simply allow it to be less if the user desires and allow aliasing
to keep accounting sane overall.

Thanks to Nick Piggin for suggesting larger timeslices.
Thanks to Peter Zijlstra for help.

Signed-off-by: Con Kolivas [EMAIL PROTECTED]

---
 kernel/sched.c  |   45 +++--
 kernel/sysctl.c |   11 ++-
 2 files changed, 29 insertions(+), 27 deletions(-)

Index: linux-2.6.21-rc7-sd/kernel/sched.c
===
--- linux-2.6.21-rc7-sd.orig/kernel/sched.c 2007-04-19 22:50:01.0 
+1000
+++ linux-2.6.21-rc7-sd/kernel/sched.c  2007-04-19 23:59:24.0 +1000
@@ -53,6 +53,7 @@
 #include linux/tsacct_kern.h
 #include linux/kprobes.h
 #include linux/delayacct.h
+#include linux/log2.h
 #include asm/tlb.h
 
 #include asm/unistd.h
@@ -89,7 +90,7 @@ unsigned long long __attribute__((weak))
 /* Some helpers for converting to/from various scales.*/
 #define NS_TO_JIFFIES(TIME)((TIME) / (10 / HZ))
 #define JIFFIES_TO_NS(TIME)((TIME) * (10 / HZ))
-#define MS_TO_NS(TIME) ((TIME) * 100)
+#define MS_TO_US(TIME) ((TIME) * 1000)
 /* Can return 0 */
 #define MS_TO_JIFFIES(TIME)((TIME) * HZ / 1000)
 #define JIFFIES_TO_MS(TIME)((TIME) * 1000 / HZ)
@@ -101,9 +102,8 @@ unsigned long long __attribute__((weak))
  * Value is in ms and set to a minimum of 8ms. Scales with number of cpus.
  * Tunable via /proc interface.
  */
-int rr_interval __read_mostly;
+int rr_interval __read_mostly = 8;
 
-#define RR_INTERVAL8
 #define DEF_TIMESLICE  (rr_interval * 20)
 
 /*
@@ -988,23 +988,20 @@ static int effective_prio(struct task_st
  * tick still. Below nice 0 they get progressively larger.
  * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval
  * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2.
- * Value returned is in nanoseconds.
+ * Value returned is in microseconds.
  */
 static unsigned int rr_quota(struct task_struct *p)
 {
int nice = TASK_NICE(p), rr = rr_interval;
 
-   /* Ensure that rr_interval is at least 1 tick */
-   if (unlikely(!MS_TO_JIFFIES(rr)))
-   rr = rr_interval = JIFFIES_TO_MS(1) ? : 1;
if (!rt_task(p)) {
if (nice  -6) {
rr *= nice * nice;
rr /= 40;
-   } else if (nice  0  (rr * HZ / 1000 / 2)  0)
-   rr /= 2;
+   } else if (nice  0)
+   rr = rr / 2 ? : 1;
}
-   return MS_TO_NS(rr);
+   return MS_TO_US(rr);
 }
 
 /*
@@ -3015,16 +3012,17 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 /*
  * This is called on clock ticks and on context switches.
  * Bank in p-sched_time the ns elapsed since the last tick or switch.
- * CPU scheduler quota accounting is also performed here.
+ * CPU scheduler quota accounting is also performed here in microseconds.
  * The value returned from sched_clock() occasionally gives bogus values so
  * some sanity checking is required.
  */
-static inline void
+static void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now,
 int tick)
 {
cputime64_t time_diff = now - p-last_ran;
-   unsigned int min_diff = 1000;
+   const unsigned int min_diff = 1000;
+   int us_time_diff;
 
if (tick) {
/*
@@ -3043,8 +3041,11 @@ update_cpu_clock(struct task_struct *p, 
if (time_diff  JIFFIES_TO_NS(1) || time_diff  min_diff)
time_diff = min_diff;
}
+   /* time_slice accounting is done in usecs to avoid overflow on 32bit */
+   us_time_diff = time_diff;
+   us_time_diff /= 1000;
if (p != rq-idle  p-policy != SCHED_FIFO)
-   p-time_slice -= time_diff;
+   p-time_slice -= us_time_diff;
p-sched_time += time_diff;
p-last_ran = rq-most_recent_timestamp = now;
 }
@@ -3145,8 +3146,7 @@ void account_steal_time(struct task_stru
 static void task_expired_entitlement(struct rq *rq, struct task_struct *p)
 {
struct prio_array *old_array;
-   int overrun;
-   int old_prio;
+   int overrun, old_prio;
 
if (unlikely(p

[PATCH] [2/3] sched: implement staircase deadline scheduler docupdate

2007-04-19 Thread Con Kolivas
Update documentation to reflect higher maximum rr_interval.

Signed-off-by: Con Kolivas [EMAIL PROTECTED]

---
 Documentation/sysctl/kernel.txt |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-2.6.21-rc7-sd/Documentation/sysctl/kernel.txt
===
--- linux-2.6.21-rc7-sd.orig/Documentation/sysctl/kernel.txt2007-04-20 
00:29:12.0 +1000
+++ linux-2.6.21-rc7-sd/Documentation/sysctl/kernel.txt 2007-04-20 
00:29:32.0 +1000
@@ -298,7 +298,7 @@ overall. This value is in milliseconds a
 depends on the number of cpus available at scheduler initialisation
 with a minimum of 8.
 
-Valid values are from 1-100.
+Valid values are from 1-5000.
 
 ==
 

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] [3/3] sched: increase ksoftirqd priority

2007-04-19 Thread Con Kolivas
More aggressive nice discrimination by the Staircase-Deadline cpu scheduler
means ksoftirqd is getting significantly less cpu than previously. Adjust
nice value accordingly for similar cpu distribution.

Signed-off-by: Con Kolivas [EMAIL PROTECTED]

---
 kernel/softirq.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-2.6.21-rc7-sd/kernel/softirq.c
===
--- linux-2.6.21-rc7-sd.orig/kernel/softirq.c   2007-04-20 00:30:08.0 
+1000
+++ linux-2.6.21-rc7-sd/kernel/softirq.c2007-04-20 00:30:31.0 
+1000
@@ -488,7 +488,7 @@ void __init softirq_init(void)
 
 static int ksoftirqd(void * __bind_cpu)
 {
-   set_user_nice(current, 19);
+   set_user_nice(current, 15);
current-flags |= PF_NOFREEZE;
 
set_current_state(TASK_INTERRUPTIBLE);

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Renice X for cpu schedulers

2007-04-19 Thread Con Kolivas
On Thursday 19 April 2007 23:17, Mark Lord wrote:
 Con Kolivas wrote:
 s go ahead and think up great ideas for other ways of metering out cpu

  bandwidth for different purposes, but for X, given the absurd simplicity
  of renicing, why keep fighting it? Again I reiterate that most users of
  SD have not found the need to renice X anyway except if they stick to old
  habits of make -j4 on uniprocessor and the like, and I expect that those
  on CFS and Nicksched would also have similar experiences.

 Just plain make (no -j2 or -j) is enough to kill interactivity
 on my 2GHz P-M single-core non-HT machine with SD.

 But with the very first posted version of CFS by Ingo,
 I can do make -j2 no problem and still have a nicely interactive destop.

Cool. Then there's clearly a bug with SD that manifests on your machine as it 
should not have that effect at all (and doesn't on other people's machines). 
I suggest trying the latest version which fixes some bugs.

Thanks.

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[ANNOUNCE] Staircase Deadline cpu scheduler version 0.43

2007-04-19 Thread Con Kolivas
In order to keep raising the standard for comparison for the alternative new 
scheduler developments, here is an updated version of the staircase deadline 
cpu scheduler.

http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7-sd-0.43.patch
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.43.patch

Incrementals in
http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7/
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7/

Renicing X to -10, while not essential, is preferable.

See the 3 patches just posted for full changelog of 0.42-0.43.

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Renice X for cpu schedulers

2007-04-19 Thread Con Kolivas
On Friday 20 April 2007 04:16, Gene Heskett wrote:
 On Thursday 19 April 2007, Con Kolivas wrote:

 [and I snipped a good overview]

 So yes go ahead and think up great ideas for other ways of metering out
  cpu bandwidth for different purposes, but for X, given the absurd
  simplicity of renicing, why keep fighting it? Again I reiterate that most
  users of SD have not found the need to renice X anyway except if they
  stick to old habits of make -j4 on uniprocessor and the like, and I
  expect that those on CFS and Nicksched would also have similar
  experiences.

 FWIW folks, I have never touched x's niceness, its running at the default
 -1 for all of my so-called 'tests', and I have another set to be rebooted
 to right now.  And yes, my kernel makeit script uses -j4 by default, and
 has used -j8 just for effects, which weren't all that different from what I
 expected in 'abusing' a UP system that way.  The system DID remain usable,
 not snappy, but usable.

Gene, you're agreeing with me. You've shown that you're very happy with a fair 
distribution of cpu and leaving X at nice 0.

 Having tried re-nicing X a while back, and having the rest of the system
 suffer in quite obvious ways for even 1 + or - from its default felt pretty
 bad from this users perspective.

 It is my considered opinion (yeah I know, I'm just a leaf in the hurricane
 of this list) that if X has to be re-niced from the 1 point advantage its
 had for ages, then something is basicly wrong with the overall scheduling,
 cpu or i/o, or both in combination.  FWIW I'm using cfq for i/o.

It's those who want X to have an unfair advantage that want it to do 
something special. Your agreement that it works fine at nice 0 shows you 
don't want it to have an unfair advantage. Others who want it to have an 
unfair advantage _can_ renice it if they desire. But if the cpu scheduler 
gives X an unfair advantage within the kernel by default then you have _no_ 
choice. If you leave the choice up to userspace (renice or not) then both 
parties get their way. If you put it into the kernel only one party wins and 
there is no way for the Genes (and Cons) of this world to get it back.

Your opinion is as valuable as eveyone else's Gene. It is hard to get people 
to speak on as frightening a playground as the linux kernel mailing list so 
please do. 

-- 
-ck
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Renice X for cpu schedulers

2007-04-19 Thread Con Kolivas
On Friday 20 April 2007 05:26, Ray Lee wrote:
> On 4/19/07, Con Kolivas <[EMAIL PROTECTED]> wrote:
> > The one fly in the ointment for
> > linux remains X. I am still, to this moment, completely and utterly
> > stunned at why everyone is trying to find increasingly complex unique
> > ways to manage X when all it needs is more cpu[1].
>
> [...and hence should be reniced]
>
> The problem is that X is not unique. There's postgresql, memcached,
> mysql, db2, a little embedded app I wrote... all of these perform work
> on behalf of another process. It's just most *noticeable* with X, as
> pretty much everyone is running that.
>
> If we had some way for the scheduler to decide to donate part of a
> client process's time slice to the server it just spoke to (with an
> exponential dampening factor -- take 50% from the client, give 25% to
> the server, toss the rest on the floor), that -- from my naive point
> of view -- would be a step toward fixing the underlying issue. Or I
> might be spouting crap, who knows.
>
> The problem is real, though, and not limited to X.
>
> While I have the floor, thank you, Con, for all your work.

You're welcome and thanks for taking the floor to speak. I would say you have 
actually agreed with me though. X is not unique, it's just an obvious example, 
so let's not design the cpu scheduler around the problem with X. The same goes 
for every other application. The choice to hand out differential cpu usage to 
applications that seem to need it should be left up to the users. The donation 
idea has been done before in some fashion or other, in things like back-boost, 
which Linus himself tried in the 2.5.X days. It worked lovely until it did the 
wrong thing and wreaked havoc. As has been shown repeatedly, the workarounds 
and the tweaks and the bonuses and the decisions about who to give an advantage 
to, when made by the cpu scheduler, are also its undoing, as it can't always 
get it right. The consequences of getting it wrong, on the other hand, are 
disastrous. The cpu scheduler core is a cpu bandwidth and latency 
proportionator and should be nothing more or less.
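
As a purely illustrative aside, the dampening arithmetic Ray describes could be
sketched in userspace C along these lines; the struct, function and numbers are
invented for the example and are not code from SD, CFS, back-boost or any
posted patch:

#include <stdio.h>

/*
 * Toy illustration of the dampened donation idea: each client->server
 * request takes 50% of the client's remaining slice and credits only
 * 25% to the server, so repeated donations decay geometrically instead
 * of accumulating without bound.  Hypothetical names and values.
 */
struct toy_task {
	unsigned int slice_us;		/* remaining timeslice in microseconds */
};

static void donate(struct toy_task *client, struct toy_task *server)
{
	unsigned int taken = client->slice_us / 2;	/* take 50% from the client */

	client->slice_us -= taken;
	server->slice_us += taken / 2;			/* give 25%, toss the rest */
}

int main(void)
{
	struct toy_task client = { 8000 };	/* pretend both start with 8ms */
	struct toy_task server = { 8000 };
	int i;

	for (i = 1; i <= 4; i++) {
		donate(&client, &server);
		printf("request %d: client %u us, server %u us\n",
		       i, client.slice_us, server.slice_us);
	}
	return 0;
}

Run for a few iterations the toy shows the geometric decay Ray is after: each
successive request donates half as much as the one before it.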

-- 
-ck


Re: Renice X for cpu schedulers

2007-04-19 Thread Con Kolivas
On Friday 20 April 2007 02:15, Mark Lord wrote:
> Con Kolivas wrote:
> > On Thursday 19 April 2007 23:17, Mark Lord wrote:
> > Con Kolivas wrote:
> > s go ahead and think up great ideas for other ways of metering out cpu
>
> > bandwidth for different purposes, but for X, given the absurd
> > simplicity of renicing, why keep fighting it? Again I reiterate that
> > most users of SD have not found the need to renice X anyway except if
> > they stick to old habits of make -j4 on uniprocessor and the like, and
> > I expect that those on CFS and Nicksched would also have similar
> > experiences.
>
> > Just plain make (no -j2 or -j) is enough to kill interactivity
> > on my 2GHz P-M single-core non-HT machine with SD.
>
> > But with the very first posted version of CFS by Ingo,
> > I can do make -j2 no problem and still have a nicely interactive
> > desktop.
>
> > Cool. Then there's clearly a bug with SD that manifests on your machine
> > as it should not have that effect at all (and doesn't on other people's
> > machines). I suggest trying the latest version which fixes some bugs.
>
> SD just doesn't do nearly as well as the stock scheduler, or CFS, here.
>
> I'm quite likely one of the few single-CPU/non-HT testers of this stuff.
> If it should ever get more widely used I think we'd hear a lot more
> complaints.

You are not really one of the few. A lot of my own work is done on a single 
core pentium M 1.7Ghz laptop. I am not endowed with truckloads of hardware 
like all the paid developers are. I recall extreme frustration myself when a 
developer a few years ago (around 2002) said he couldn't reproduce poor 
behaviour on his 4GB ram 4 x Xeon machine. Even today, if I add up every 
machine I have at my disposal at home and at work, it doesn't amount to that 
many cpus or that much ram.

-- 
-ck


rr_interval experiments

2007-04-19 Thread Con Kolivas
On Friday 20 April 2007 01:01, Con Kolivas wrote:
> This then allows the maximum rr_interval to be as large as 5000
> milliseconds.

Just for fun, here are the build time differences on a core2duo machine (with 
a 1000HZ config) for a make allnoconfig kernel built with make -j8:

16ms:
53.68user 4.81system 0:34.27elapsed 170%CPU (0avgtext+0avgdata 0maxresident)k

1ms:
56.73user 4.83system 0:36.03elapsed 170%CPU (0avgtext+0avgdata 0maxresident)k

5000ms:
52.88user 4.77system 0:32.37elapsed 178%CPU (0avgtext+0avgdata 0maxresident)k

For the record, 16ms is what SD v0.43 would choose as the default value on 
this hardware. A load with a much lower natural context switching rate than a 
kernel compile, as you said Nick, would show even greater discrepancy in 
these results.

Fun eh? Note these are not for any comparison with anything else; just to show 
the effect rr_interval changes have on throughput.

-- 
-ck


Announce - Staircase Deadline cpu scheduler v0.42

2007-04-18 Thread Con Kolivas
On Thursday 19 April 2007 10:41, Con Kolivas wrote:
> On Thursday 19 April 2007 09:59, Con Kolivas wrote:
> > Since there is so much work currently ongoing with alternative cpu
> > schedulers, as a standard for comparison with the alternative virtual
> > deadline fair designs I've addressed a few issues in the Staircase
> > Deadline cpu scheduler which improve behaviour likely in a noticeable
> > fashion and released version 0.41.
> >
> > http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7-sd-0.41.patch
> > http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.41.patch
> >
> > and an incremental for those on 0.40:
> > http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7/sched-impleme
> >nt -staircase-deadline-scheduler-further-improvements.patch
> >
> > Remember to renice X to -10 for nicest desktop behaviour :)
> >
> > Have fun.
>
> Oops forgot to cc a few people
>
> Nick you said I should still have something to offer so here it is.
> Peter you said you never saw this design (it's a dual array affair sorry).
> Gene and Willy you were some of the early testers that noticed the
> advantages of the earlier designs,
> Matt you did lots of great earlier testing.
> WLI you inspired a lot of design ideas.
> Mike you were the stick.
> And a few others I've forgotten to mention and include.

Version 0.42

http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.42.patch

-- 
-ck


Re: [PATCH] sched: implement staircase deadline scheduler further improvements-1

2007-04-18 Thread Con Kolivas
On Thursday 19 April 2007 09:48, Con Kolivas wrote:
> While the Staircase Deadline scheduler has not been completely killed off
> and is still in -mm I would like to fix some outstanding issues that I've
> found since it still serves for comparison with all the upcoming
> schedulers.
>
> While still in -mm can we queue this on top please?
>
> A set of staircase-deadline v 0.41 patches will make their way into the
> usual place for those willing to test it.
>
> http://ck.kolivas.org/patches/staircase-deadline/

Oops! Minor thinko! Here is a respin. Please apply this one instead.

I better make a 0.42 heh.

---
The prio_level was being inappropriately decreased if a higher priority
task was still using its previous timeslice. Fix that.

Task expiration of higher priority tasks was not being taken into account
with allocating priority slots. Check the expired best_static_prio level
to facilitate that.

Explicitly check all better static priority prio_levels when deciding on
allocating slots for niced tasks.

These changes improve behaviour in many ways.

Signed-off-by: Con Kolivas <[EMAIL PROTECTED]>

---
 kernel/sched.c |   64 ++---
 1 file changed, 43 insertions(+), 21 deletions(-)

Index: linux-2.6.21-rc7-sd/kernel/sched.c
===
--- linux-2.6.21-rc7-sd.orig/kernel/sched.c 2007-04-19 08:51:54.0 
+1000
+++ linux-2.6.21-rc7-sd/kernel/sched.c  2007-04-19 12:03:29.0 +1000
@@ -145,6 +145,12 @@ struct prio_array {
 */
DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1);
 
+   /*
+* The best static priority (of the dynamic priority tasks) queued
+* this array.
+*/
+   int best_static_prio;
+
 #ifdef CONFIG_SMP
/* For convenience looks back at rq */
struct rq *rq;
@@ -191,9 +197,9 @@ struct rq {
 
/*
 * The current dynamic priority level this runqueue is at per static
-* priority level, and the best static priority queued this rotation.
+* priority level.
 */
-   int prio_level[PRIO_RANGE], best_static_prio;
+   int prio_level[PRIO_RANGE];
 
/* How many times we have rotated the priority queue */
unsigned long prio_rotation;
@@ -669,7 +675,7 @@ static void task_new_array(struct task_s
 }
 
 /* Find the first slot from the relevant prio_matrix entry */
-static inline int first_prio_slot(struct task_struct *p)
+static int first_prio_slot(struct task_struct *p)
 {
if (unlikely(p->policy == SCHED_BATCH))
return p->static_prio;
@@ -682,11 +688,18 @@ static inline int first_prio_slot(struct
  * level. SCHED_BATCH tasks do not use the priority matrix. They only take
  * priority slots from their static_prio and above.
  */
-static inline int next_entitled_slot(struct task_struct *p, struct rq *rq)
+static int next_entitled_slot(struct task_struct *p, struct rq *rq)
 {
+   int search_prio = MAX_RT_PRIO, uprio = USER_PRIO(p->static_prio);
+   struct prio_array *array = rq->active;
DECLARE_BITMAP(tmp, PRIO_RANGE);
-   int search_prio, uprio = USER_PRIO(p->static_prio);
 
+   /*
+* Go straight to expiration if there are higher priority tasks
+* already expired.
+*/
+   if (p->static_prio > rq->expired->best_static_prio)
+   return MAX_PRIO;
if (!rq->prio_level[uprio])
rq->prio_level[uprio] = MAX_RT_PRIO;
/*
@@ -694,15 +707,22 @@ static inline int next_entitled_slot(str
 * static_prio are acceptable, and only if it's not better than
 * a queued better static_prio's prio_level.
 */
-   if (p->static_prio < rq->best_static_prio) {
-   search_prio = MAX_RT_PRIO;
+   if (p->static_prio < array->best_static_prio) {
if (likely(p->policy != SCHED_BATCH))
-   rq->best_static_prio = p->static_prio;
-   } else if (p->static_prio == rq->best_static_prio)
+   array->best_static_prio = p->static_prio;
+   } else if (p->static_prio == array->best_static_prio) {
search_prio = rq->prio_level[uprio];
-   else {
-   search_prio = max(rq->prio_level[uprio],
-   rq->prio_level[USER_PRIO(rq->best_static_prio)]);
+   } else {
+   int i;
+
+   search_prio = rq->prio_level[uprio];
+   /* A bound O(n) function, worst case n is 40 */
+   for (i = array->best_static_prio; i <= p->static_prio ; i++) {
+   if (!rq->prio_level[USER_PRIO(i)])
+   rq->prio_level[USER_PRIO(i)] = MAX_RT_PRIO;
+   search_prio = max(search_prio,
+ rq->prio_level[USER_PRIO(i)]);
+   }
}
if (unlikely(p->policy == SCHED_BATCH)) {
search_prio = max(search_prio, p->static_prio);
@@ -718,6 +738,8 @@ static void

Re: [ck] Announce - Staircase Deadline cpu scheduler v0.41

2007-04-18 Thread Con Kolivas
On Thursday 19 April 2007 09:59, Con Kolivas wrote:
> Since there is so much work currently ongoing with alternative cpu
> schedulers, as a standard for comparison with the alternative virtual
> deadline fair designs I've addressed a few issues in the Staircase Deadline
> cpu scheduler which improve behaviour likely in a noticeable fashion and
> released version 0.41.
>
> http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7-sd-0.41.patch
> http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.41.patch
>
> and an incremental for those on 0.40:
> http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7/sched-implement
>-staircase-deadline-scheduler-further-improvements.patch
>
> Remember to renice X to -10 for nicest desktop behaviour :)
>
> Have fun.

Oops forgot to cc a few people

Nick you said I should still have something to offer so here it is.
Peter you said you never saw this design (it's a dual array affair sorry).
Gene and Willy you were some of the early testers that noticed the advantages 
of the earlier designs,
Matt you did lots of great earlier testing.
WLI you inspired a lot of design ideas.
Mike you were the stick.
And a few others I've forgotten to mention and include.

-- 
-ck


Announce - Staircase Deadline cpu scheduler v0.41

2007-04-18 Thread Con Kolivas
Since there is so much work currently ongoing with alternative cpu schedulers, 
as a standard for comparison with the alternative virtual deadline fair 
designs I've addressed a few issues in the Staircase Deadline cpu scheduler 
which improve behaviour likely in a noticeable fashion and released version 
0.41.

http://ck.kolivas.org/patches/staircase-deadline/2.6.20.7-sd-0.41.patch
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7-sd-0.41.patch

and an incremental for those on 0.40:
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc7/sched-implement-staircase-deadline-scheduler-further-improvements.patch

Remember to renice X to -10 for nicest desktop behaviour :)

Have fun.

-- 
-ck


[PATCH] sched: implement staircase deadline scheduler further improvements

2007-04-18 Thread Con Kolivas
While the Staircase Deadline scheduler has not been completely killed off and 
is still in -mm I would like to fix some outstanding issues that I've found
since it still serves for comparison with all the upcoming schedulers.

While still in -mm can we queue this on top please?

A set of staircase-deadline v 0.41 patches will make their way into the usual
place for those willing to test it.

http://ck.kolivas.org/patches/staircase-deadline/
---
The prio_level was being inappropriately decreased if a higher priority
task was still using its previous timeslice. Fix that.

Task expiration of higher priority tasks was not being taken into account
with allocating priority slots. Check the expired best_static_prio level
to facilitate that.

Explicitly check all better static priority prio_levels when deciding on
allocating slots for niced tasks.

These changes improve behaviour in many ways.

Signed-off-by: Con Kolivas <[EMAIL PROTECTED]>

---
 kernel/sched.c |   61 ++---
 1 file changed, 41 insertions(+), 20 deletions(-)

Index: linux-2.6.21-rc7-sd/kernel/sched.c
===
--- linux-2.6.21-rc7-sd.orig/kernel/sched.c 2007-04-19 08:51:54.0 
+1000
+++ linux-2.6.21-rc7-sd/kernel/sched.c  2007-04-19 09:30:39.0 +1000
@@ -145,6 +145,12 @@ struct prio_array {
 */
DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1);
 
+   /*
+* The best static priority (of the dynamic priority tasks) queued
+* this array.
+*/
+   int best_static_prio;
+
 #ifdef CONFIG_SMP
/* For convenience looks back at rq */
struct rq *rq;
@@ -191,9 +197,9 @@ struct rq {
 
/*
 * The current dynamic priority level this runqueue is at per static
-* priority level, and the best static priority queued this rotation.
+* priority level.
 */
-   int prio_level[PRIO_RANGE], best_static_prio;
+   int prio_level[PRIO_RANGE];
 
/* How many times we have rotated the priority queue */
unsigned long prio_rotation;
@@ -669,7 +675,7 @@ static void task_new_array(struct task_s
 }
 
 /* Find the first slot from the relevant prio_matrix entry */
-static inline int first_prio_slot(struct task_struct *p)
+static int first_prio_slot(struct task_struct *p)
 {
if (unlikely(p->policy == SCHED_BATCH))
return p->static_prio;
@@ -682,11 +688,18 @@ static inline int first_prio_slot(struct
  * level. SCHED_BATCH tasks do not use the priority matrix. They only take
  * priority slots from their static_prio and above.
  */
-static inline int next_entitled_slot(struct task_struct *p, struct rq *rq)
+static int next_entitled_slot(struct task_struct *p, struct rq *rq)
 {
+   int search_prio = MAX_RT_PRIO, uprio = USER_PRIO(p->static_prio);
+   struct prio_array *array = rq->active;
DECLARE_BITMAP(tmp, PRIO_RANGE);
-   int search_prio, uprio = USER_PRIO(p->static_prio);
 
+   /*
+* Go straight to expiration if there are higher priority tasks
+* already expired.
+*/
+   if (p->static_prio > rq->expired->best_static_prio)
+   return MAX_PRIO;
if (!rq->prio_level[uprio])
rq->prio_level[uprio] = MAX_RT_PRIO;
/*
@@ -694,15 +707,21 @@ static inline int next_entitled_slot(str
 * static_prio are acceptable, and only if it's not better than
 * a queued better static_prio's prio_level.
 */
-   if (p->static_prio < rq->best_static_prio) {
-   search_prio = MAX_RT_PRIO;
+   if (p->static_prio < array->best_static_prio) {
if (likely(p->policy != SCHED_BATCH))
-   rq->best_static_prio = p->static_prio;
-   } else if (p->static_prio == rq->best_static_prio)
+   array->best_static_prio = p->static_prio;
+   } else if (p->static_prio == array->best_static_prio) {
search_prio = rq->prio_level[uprio];
-   else {
+   } else {
+   int i;
+
+   /* A bound O(n) function, worst case n is 40 */
+   for (i = array->best_static_prio; i <= p->static_prio ; i++) {
+   if (!rq->prio_level[USER_PRIO(i)])
+   rq->prio_level[USER_PRIO(i)] = MAX_RT_PRIO;
search_prio = max(rq->prio_level[uprio],
-   rq->prio_level[USER_PRIO(rq->best_static_prio)]);
+   rq->prio_level[USER_PRIO(i)]);
+   }
}
if (unlikely(p->policy == SCHED_BATCH)) {
search_prio = max(search_prio, p->static_prio);
@@ -718,6 +737,8 @@ static void queue_expired(struct task_st
 {
task_new_array(p, rq, rq->expired);
p->prio = p->normal_prio = first_prio_slot(p);
+   if (p->static_prio < rq->expired->best_static_prio)
+   rq->expired->best_static_prio = p->static_prio;
}
 
 #ifdef CONFIG_SMP
@@ -726,7 +747,7

Re: [Announce] [patch] Modular Scheduler Core and Completely Fair Scheduler [CFS]

2007-04-18 Thread Con Kolivas
On Wednesday 18 April 2007 22:33, Con Kolivas wrote:
> On Wednesday 18 April 2007 22:14, Nick Piggin wrote:
> > On Wed, Apr 18, 2007 at 07:33:56PM +1000, Con Kolivas wrote:
> > > On Wednesday 18 April 2007 18:55, Nick Piggin wrote:
> > > > Again, for comparison 2.6.21-rc7 mainline:
> > > >
> > > > 508.87user 32.47system 2:17.82elapsed 392%CPU
> > > > 509.05user 32.25system 2:17.84elapsed 392%CPU
> > > > 508.75user 32.26system 2:17.83elapsed 392%CPU
> > > > 508.63user 32.17system 2:17.88elapsed 392%CPU
> > > > 509.01user 32.26system 2:17.90elapsed 392%CPU
> > > > 509.08user 32.20system 2:17.95elapsed 392%CPU
> > > >
> > > > So looking at elapsed time, a granularity of 100ms is just behind the
> > > > mainline score. However it is using slightly less user time and
> > > > slightly more idle time, which indicates that balancing might have
> > > > got a bit less aggressive.
> > > >
> > > > But anyway, it conclusively shows the efficiency impact of such tiny
> > > > timeslices.
> > >
> > > See test.kernel.org for how (the now defunct) SD was performing on
> > > kernbench. It had low latency _and_ equivalent throughput to mainline.
> > > Set the standard appropriately on both counts please.
> >
> > I can give it a run. Got an updated patch against -rc7?
>
> I said I wasn't pursuing it but since you're offering, the rc6 patch should
> apply ok.
>
> http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc6-sd-0.40.patch

Oh and if you go to the effort of trying you may as well try the timeslice 
tweak to see what effect it has on SD as well.

/proc/sys/kernel/rr_interval

100 is the highest.
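
For anyone repeating these experiments, a minimal C sketch for driving the
tunable might look like the following; it assumes an SD-patched kernel that
exposes /proc/sys/kernel/rr_interval and root privileges for writing, and the
valid range differs between SD versions, so check your patch before picking a
value:

#include <stdio.h>

/* Read the current base timeslice in milliseconds and optionally set a
 * new one.  Error handling is kept minimal for the sketch. */
int main(int argc, char **argv)
{
	const char *path = "/proc/sys/kernel/rr_interval";
	char buf[32];
	FILE *f = fopen(path, "r");

	if (!f || !fgets(buf, sizeof(buf), f)) {
		perror(path);
		return 1;
	}
	fclose(f);
	printf("current rr_interval (ms): %s", buf);

	if (argc > 1) {				/* e.g. ./rr_tune 100 */
		f = fopen(path, "w");
		if (!f || fprintf(f, "%s\n", argv[1]) < 0 || fclose(f) == EOF) {
			perror(path);
			return 1;
		}
		printf("rr_interval set to %s ms\n", argv[1]);
	}
	return 0;
}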

-- 
-ck


Re: [Announce] [patch] Modular Scheduler Core and Completely Fair Scheduler [CFS]

2007-04-18 Thread Con Kolivas
On Wednesday 18 April 2007 22:13, Nick Piggin wrote:
> On Wed, Apr 18, 2007 at 11:53:34AM +0200, Ingo Molnar wrote:
> > * Nick Piggin <[EMAIL PROTECTED]> wrote:
> > > So looking at elapsed time, a granularity of 100ms is just behind the
> > > mainline score. However it is using slightly less user time and
> > > slightly more idle time, which indicates that balancing might have got
> > > a bit less aggressive.
> > >
> > > But anyway, it conclusively shows the efficiency impact of such tiny
> > > timeslices.
> >
> > yeah, the 4% drop in a CPU-cache-sensitive workload like kernbench is
> > not unexpected when going to really frequent preemption. Clearly, the
> > default preemption granularity needs to be tuned up.
> >
> > I think you said you measured ~3msec average preemption rate per CPU?
>
> This was just looking at ctxsw numbers from running 2 cpu hogs on the
> same runqueue.
>
> > That would suggest the average cache-trashing cost was 120 usecs per
> > every 3 msec window. Taking that as a ballpark figure, to get the
> > difference back into the noise range we'd have to either use ~5 msec:
> >
> > echo 500 > /proc/sys/kernel/sched_granularity
> >
> > or 15 msec:
> >
> > echo 1500 > /proc/sys/kernel/sched_granularity
> >
> > (depending on whether it's 5x 3msec or 5x 1msec - i'm still not sure i
> > correctly understood your 3msec value. I'd have to know your kernbench
> > workload's approximate 'steady state' context-switch rate to do a more
> > accurate calculation.)
>
> The kernel compile (make -j8 on 4 thread system) is doing 1800 total
> context switches per second (450/s per runqueue) for cfs, and 670
> for mainline. Going up to 20ms granularity for cfs brings the context
> switch numbers similar, but user time is still a % or so higher. I'd
> be more worried about compute heavy threads which naturally don't do
> much context switching.

While kernel compiles are nice and easy to do I've seen enough criticism of 
them in the past to wonder about their usefulness as a standard benchmark on 
their own.

>
> Some other numbers on the same system
> Hackbench:         2.6.21-rc7  cfs-v2 1ms[*]  nicksched
> 10 groups:  Time:  1.332       0.743          0.607
> 20 groups:  Time:  1.197       1.100          1.241
> 30 groups:  Time:  1.754       2.376          1.834
> 40 groups:  Time:  3.451       2.227          2.503
> 50 groups:  Time:  3.726       3.399          3.220
> 60 groups:  Time:  3.548       4.567          3.668
> 70 groups:  Time:  4.206       4.905          4.314
> 80 groups:  Time:  4.551       6.324          4.879
> 90 groups:  Time:  7.904       6.962          5.335
> 100 groups: Time:  7.293       7.799          5.857
> 110 groups: Time:  10.595      8.728          6.517
> 120 groups: Time:  7.543       9.304          7.082
> 130 groups: Time:  8.269       10.639         8.007
> 140 groups: Time:  11.867      8.250          8.302
> 150 groups: Time:  14.852      8.656          8.662
> 160 groups: Time:  9.648       9.313          9.541

Hackbench even more so. In a prolonged discussion with Rusty Russell on this 
issue he suggested hackbench was more of a pass/fail benchmark to ensure there 
was no starvation scenario that never ended, and that very little value should 
be placed on the actual results returned from it.

Wli's concerns regarding some sort of standard framework for a battery of 
accepted meaningful benchmarks come to mind as important, rather than 
benchmarks that highlight one scheduler over the other. So while interesting 
for their own endpoints, I certainly wouldn't put either benchmark up as some 
sort of yardstick for a "winner". Note I'm not saying that we shouldn't be 
looking at them per se, but since the whole drive for a new scheduler is 
trying to be more objective we need to start expanding the range of 
benchmarks. Even though I don't feel the need to have SD in the "race" I 
guess it provides more data to compare what is possible and where as well.

> Mainline seems pretty inconsistent here.
>
> lmbench 0K ctxsw latency bound to CPU0:
> tasks   2.6.21-rc7  cfs-v2 1ms  nicksched
> 2        2.59        3.42        2.50
> 4        3.26        3.54        3.09
> 8        3.01        3.64        3.22
> 16       3.00        3.66        3.50
> 32       2.99        3.70        3.49
> 64       3.09        4.17        3.50
> 128      4.80        5.58        4.74
> 256      5.79        6.37        5.76
>
> cfs is noticeably disadvantaged.
>
> [*] 500ms didn't make much difference in either test.

-- 
-ck


Re: [Announce] [patch] Modular Scheduler Core and Completely Fair Scheduler [CFS]

2007-04-18 Thread Con Kolivas
On Wednesday 18 April 2007 22:14, Nick Piggin wrote:
> On Wed, Apr 18, 2007 at 07:33:56PM +1000, Con Kolivas wrote:
> > On Wednesday 18 April 2007 18:55, Nick Piggin wrote:
> > > Again, for comparison 2.6.21-rc7 mainline:
> > >
> > > 508.87user 32.47system 2:17.82elapsed 392%CPU
> > > 509.05user 32.25system 2:17.84elapsed 392%CPU
> > > 508.75user 32.26system 2:17.83elapsed 392%CPU
> > > 508.63user 32.17system 2:17.88elapsed 392%CPU
> > > 509.01user 32.26system 2:17.90elapsed 392%CPU
> > > 509.08user 32.20system 2:17.95elapsed 392%CPU
> > >
> > > So looking at elapsed time, a granularity of 100ms is just behind the
> > > mainline score. However it is using slightly less user time and
> > > slightly more idle time, which indicates that balancing might have got
> > > a bit less aggressive.
> > >
> > > But anyway, it conclusively shows the efficiency impact of such tiny
> > > timeslices.
> >
> > See test.kernel.org for how (the now defunct) SD was performing on
> > kernbench. It had low latency _and_ equivalent throughput to mainline.
> > Set the standard appropriately on both counts please.
>
> I can give it a run. Got an updated patch against -rc7?

I said I wasn't pursuing it but since you're offering, the rc6 patch should 
apply ok.

http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc6-sd-0.40.patch

-- 
-ck


Re: [Announce] [patch] Modular Scheduler Core and Completely Fair Scheduler [CFS]

2007-04-18 Thread Con Kolivas
On Wednesday 18 April 2007 18:55, Nick Piggin wrote:
> On Tue, Apr 17, 2007 at 11:59:00AM +0200, Ingo Molnar wrote:
> > * Nick Piggin <[EMAIL PROTECTED]> wrote:
> > > 2.6.21-rc7-cfs-v2
> > > 534.80user 30.92system 2:23.64elapsed 393%CPU
> > > 534.75user 31.01system 2:23.70elapsed 393%CPU
> > > 534.66user 31.07system 2:23.76elapsed 393%CPU
> > > 534.56user 30.91system 2:23.76elapsed 393%CPU
> > > 534.66user 31.07system 2:23.67elapsed 393%CPU
> > > 535.43user 30.62system 2:23.72elapsed 393%CPU
> >
> > Thanks for testing this! Could you please try this also with:
> >
> >echo 1 > /proc/sys/kernel/sched_granularity
>
> 507.68user 31.87system 2:18.05elapsed 390%CPU
> 507.99user 31.93system 2:18.09elapsed 390%CPU
> 507.46user 31.78system 2:18.03elapsed 390%CPU
> 507.68user 31.93system 2:18.11elapsed 390%CPU
> 507.63user 31.98system 2:18.01elapsed 390%CPU
> 507.83user 31.94system 2:18.28elapsed 390%CPU
>
> > could you maybe even try a more extreme setting of:
> >
> >echo 5 > /proc/sys/kernel/sched_granularity
>
> 504.87user 32.13system 2:18.03elapsed 389%CPU
> 505.94user 32.29system 2:17.87elapsed 390%CPU
> 506.10user 31.90system 2:17.96elapsed 389%CPU
> 505.02user 32.02system 2:17.96elapsed 389%CPU
> 506.69user 31.96system 2:17.82elapsed 390%CPU
> 505.70user 31.84system 2:17.90elapsed 389%CPU
>
>
> Again, for comparison 2.6.21-rc7 mainline:
>
> 508.87user 32.47system 2:17.82elapsed 392%CPU
> 509.05user 32.25system 2:17.84elapsed 392%CPU
> 508.75user 32.26system 2:17.83elapsed 392%CPU
> 508.63user 32.17system 2:17.88elapsed 392%CPU
> 509.01user 32.26system 2:17.90elapsed 392%CPU
> 509.08user 32.20system 2:17.95elapsed 392%CPU
>
> So looking at elapsed time, a granularity of 100ms is just behind the
> mainline score. However it is using slightly less user time and
> slightly more idle time, which indicates that balancing might have got
> a bit less aggressive.
>
> But anyway, it conclusively shows the efficiency impact of such tiny
> timeslices.

See test.kernel.org for how (the now defunct) SD was performing on kernbench. 
It had low latency _and_ equivalent throughput to mainline. Set the standard 
appropriately on both counts please.

-- 
-ck


Staircase cpu scheduler v17.1

2007-04-16 Thread Con Kolivas
Greetings all

Here is the current release of the Staircase cpu scheduler (the original 
generation I design that spurred development elsewhere for RSDL), for 
2.6.21-rc7

http://ck.kolivas.org/patches/pre-releases/2.6.21-rc7/2.6.21-rc7-ck1/patches/sched-staircase-17.1.patch

To remind people where this cpu scheduler fits into the picture:

-It is purpose built with interactivity first and foremost.
-It aims to be mostly fair most of the time
-It has strong semantics describing the cpu relationship between different 
nice levels (nice 19 is 1/20th the cpu of nice 0).
-It is resistant to most forms of starvation
-Latency of tasks that are not heavily cpu bound is exceptionally low 
irrespective of nice level - if they stay within their cpu bounds; what this 
means is you can have an audio application running at nice 19 and, if it uses 
very little cpu, it will still be unlikely to skip audio in the presence of a 
kernel compile at nice -20.
-Therefore you can renice X or whatever to your heart's content, but then... 
you don't need to renice X with this design.
-The design is a single priority array, very low overhead, small codebase (the 
diffstat summary, obviously muddied by the removal of comments:
4 files changed, 418 insertions(+), 714 deletions(-))

Disadvantages:
-There are heuristics
-There are some rare cpu usage patterns that can lead to excessive unfairness 
and relative starvation.

Bonuses:
With the addition of further patches in that same directory above it has:
- An interactive tunable flag which further increases the fairness and makes 
nice values more absolutely determine latency (instead of cpu usage vs 
entitlement determining latency as the default above)
/proc/sys/kernel/interactive 
- A compute tunable which makes timeslices much longer and has delayed 
preemption for maximum cpu cache utilisation for compute intensive workloads
/proc/sys/kernel/compute 
- A soft realtime unprivileged policy for normal users with a tunable maximum 
cpu usage set to 80% by default
/proc/sys/kernel/iso_cpu
- A background scheduling class that uses zero cpu usage resources if any 
other task wants cpu.
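
As a rough sketch only (assuming a kernel carrying these extra patches and
root privileges; the helper name and the example value are invented for
illustration), the tunables listed above could be driven from C like so:

#include <stdio.h>

/* Hypothetical helper: write a value to one of the -ck tunables under
 * /proc/sys/kernel/, e.g. "interactive", "compute" or "iso_cpu". */
static int set_ck_tunable(const char *name, const char *value)
{
	char path[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	if (fprintf(f, "%s\n", value) < 0) {
		fclose(f);
		return -1;
	}
	return fclose(f) == EOF ? -1 : 0;
}

int main(void)
{
	/* e.g. lower the soft realtime cpu cap from the default 80% to 70% */
	if (set_ck_tunable("iso_cpu", "70"))
		perror("iso_cpu");
	return 0;
}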

This is unashamedly a relatively unfair, slightly starveable cpu scheduler with 
exceptional quality _Desktop_ performance, as it was always intended to be. 

It is NOT intended for mainline use as mainline needs a general purpose cpu 
scheduler (remember!). I have no intention of pushing it as such given its 
disadvantages, and don't really care about those disadvantages as I have no 
intention of trying to "plug up" the theoretical exploits and disadvantages 
either, since desktops aren't really affected. BUT this scheduler is great fun 
to use. Unfortunately the version of this scheduler in plugsched is not up to 
date with this code. Perhaps if demand for plugsched somehow turns the world 
on its head then this code may have a place elsewhere too.

Enjoy! If you don't like it? Doesn't matter; you have a choice so just use 
something else. This is code that will only be in -ck.

-- 
-ck


Re: [Announce] [patch] Modular Scheduler Core and Completely Fair Scheduler [CFS]

2007-04-15 Thread Con Kolivas
On Monday 16 April 2007 01:05, Ingo Molnar wrote:
> * Con Kolivas <[EMAIL PROTECTED]> wrote:
> > 2. Since then I've been thinking/working on a cpu scheduler design
> > that takes away all the guesswork out of scheduling and gives very
> > predictable, as fair as possible, cpu distribution and latency while
> > preserving as solid interactivity as possible within those confines.
>
> yeah. I think you were right on target with this call.

Yay thank goodness :) It's time to fix the damn cpu scheduler once and for 
all. Everyone uses this; it's no minor driver or $bigsmp or $bigram or 
$small_embedded_RT_hardware feature.

> I've applied the 
> sched.c change attached at the bottom of this mail to the CFS patch, if
> you dont mind. (or feel free to suggest some other text instead.)

>   *  2003-09-03   Interactivity tuning by Con Kolivas.
>   *  2004-04-02   Scheduler domains code by Nick Piggin
> + *  2007-04-15   Con Kolivas was dead right: fairness matters! :)

LOL that's awful. I'd prefer something meaningful like "Work begun on 
replacing all interactivity tuning with a fair virtual-deadline design by Con 
Kolivas".

While you're at it, it's worth getting rid of a few slightly pointless name 
changes too. Don't rename SCHED_NORMAL yet again, and don't call all your 
things sched_fair blah_fair __blah_fair and so on. It means that anything 
else is by proxy going to be considered unfair. Leave SCHED_NORMAL as is, 
replace the use of the word _fair with _cfs. I don't really care how many 
copyright notices you put into our already noisy bootup but it's redundant 
since there is no choice; we all get the same cpu scheduler.

> > 1. I tried in vain some time ago to push a working extensable
> > pluggable cpu scheduler framework (based on wli's work) for the linux
> > kernel. It was perma-vetoed by Linus and Ingo (and Nick also said he
> > didn't like it) as being absolutely the wrong approach and that we
> > should never do that. [...]
>
> i partially replied to that point to Will already, and i'd like to make
> it clear again: yes, i rejected plugsched 2-3 years ago (which already
> drifted away from wli's original codebase) and i would still reject it
> today.

No that was just me being flabbergasted by what appeared to be you posting 
your own plugsched. Note nowhere in the 40 iterations of rsdl->sd did I 
ask/suggest for plugsched. I said in my first announcement my aim was to 
create a scheduling policy robust enough for all situations rather than 
fantastic a lot of the time and awful sometimes. There are plenty of people 
ready to throw out arguments for plugsched now and I don't have the energy to 
continue that fight (I never did really).

But my question still stands about this comment:

>   case, all of SD's logic could be added via a kernel/sched_sd.c module
>   as well, if Con is interested in such an approach. ]

What exactly would be the purpose of such a module that governs nothing in 
particular? Since there'll be no pluggable scheduler, by your own admission it 
has no control over SCHED_NORMAL, and it would require another scheduling 
policy to govern, one which there is no express way to use at the moment; 
people tend to just use the default without great effort. 

> First and foremost, please dont take such rejections too personally - i
> had my own share of rejections (and in fact, as i mentioned it in a
> previous mail, i had a fair number of complete project throwaways:
> 4g:4g, in-kernel Tux, irqrate and many others). I know that they can
> hurt and can demoralize, but if i dont like something it's my job to
> tell that.

Hmm? No that's not what this is about. Remember dynticks, which was not 
originally my code but which I tried to bring up to mainline standard, 
fighting with it for months? You came along with yet another rewrite from scratch 
and the flaws in the design I was working with were obvious so I instantly 
bowed down to that and never touched my code again. I didn't ask for credit 
back then, but obviously brought the requirement for a no idle tick 
implementation to the table.

> My view about plugsched: first please take a look at the latest
> plugsched code:
>
>   http://downloads.sourceforge.net/cpuse/plugsched-6.5-for-2.6.20.patch
>
>   26 files changed, 8951 insertions(+), 1495 deletions(-)
>
> As an experiment i've removed all the add-on schedulers (both the core
> and the include files, only kept the vanilla one) from the plugsched
> patch (and the makefile and kconfig complications, etc), to see the
> 'infrastructure cost', and it still gave:
>
>   12 files changed, 1933 insertions(+), 1479 deletions(-)

I do not see extra code per-se as being a bad thing. I've heard said a few 
times before "ever notice how when the correct solution is done it is a lot 
more code than the quick hack that ultimately fails?". Insert long winded 
discussion of "perfect is the enemy of good" here, _but_ I'm not arguing 
perfect versus good, I'm talking about solid code versus quick fix. Again, 
none of this comment is directed

Re: [Announce] [patch] Modular Scheduler Core and Completely Fair Scheduler [CFS]

2007-04-15 Thread Con Kolivas
On Monday 16 April 2007 12:28, Nick Piggin wrote:
> So, on to something productive, we have 3 candidates for a new scheduler so
> far. How do we decide which way to go? (and yes, I still think switchable
> schedulers is wrong and a copout) This is one area where it is virtually
> impossible to discount any decent design on correctness/performance/etc.
> and even testing in -mm isn't really enough.

We're in agreement! YAY!

Actually this is simpler than that. I'm taking SD out of the picture. It has 
served its purpose of proving that we need to seriously address all the 
scheduling issues and did more than a half decent job at it. Unfortunately I 
also cannot sit around supporting it forever by myself. My own life is more 
important, so consider SD not even running the race any more.

I'm off to continue maintaining permanent-out-of-tree leisurely code at my own 
pace. What's more is, I think I'll just stick to staircase Gen I version blah 
and shelve SD and try to have fond memories of SD as an intellectual 
prompting exercise only.

-- 
-ck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Announce] [patch] Modular Scheduler Core and Completely Fair Scheduler [CFS]

2007-04-15 Thread Con Kolivas
On Monday 16 April 2007 05:00, Jonathan Lundell wrote:
> On Apr 15, 2007, at 10:59 AM, Linus Torvalds wrote:
> > It's a really good thing, and it means that if somebody shows that
> > your
> > code is flawed in some way (by, for example, making a patch that
> > people
> > claim gets better behaviour or numbers), any *good* programmer that
> > actually cares about his code will obviously suddenly be very
> > motivated to
> > out-do the out-doer!
>
> "No one who cannot rejoice in the discovery of his own mistakes
> deserves to be called a scholar."

Lovely comment. I realise this is not truly directed at me, but given the 
context in which it has been said people will assume it is directed my way, so 
while we're all spinning lkml-quality rhetoric, let me have a right of reply.

One thing I have never tried to do was to ignore bug reports. I'm forever 
joking that I keep pulling code out of my arse to improve what I've done. 
RSDL/SD was no exception; heck it had 40 iterations. The reason I could not 
reply to bug report A with "Oh that is problem B so I'll fix it with code C" 
was, as I've said many many times over, health related. I did indeed try to 
fix many of them without spending hours replying to sometimes unpleasant 
emails. If health wasn't an issue there might have been 1000 iterations of 
SD.

There was only ever _one_ thing that I was absolutely steadfast on as a 
concept that I refused to fix that people might claim was "a mistake I did 
not rejoice in to be a scholar". That was that the _correct_ behaviour for a 
scheduler is to be fair such that proportional slowdown with load is (using 
that awful pun) a feature, not a bug. Now there are people who will still 
disagree violently with me on that. SD attempted to be a fairness first 
virtual-deadline design. If I failed on that front, then so be it (and at 
least one person certainly has said in lovely warm fuzzy friendly 
communication that I'm a global failure on all fronts with SD). But let me 
point out now that Ingo's shiny new scheduler is a fairness-first 
virtual-deadline design which will have proportional slowdown with load. So 
it will have a very similar feature. I dare anyone to claim that proportional 
slowdown with load is a bug, because I will no longer feel like I'm standing 
alone with a BFG9000 trying to defend my standpoint. Others can take up the 
post at last.

-- 
-ck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Announce] [patch] Modular Scheduler Core and Completely Fair Scheduler [CFS]

2007-04-15 Thread Con Kolivas
On Monday 16 April 2007 01:16, Gene Heskett wrote:
> On Sunday 15 April 2007, Pekka Enberg wrote:
> >On 4/15/07, hui Bill Huey <[EMAIL PROTECTED]> wrote:
> >> The perception here is that there is that there is this expectation that
> >> sections of the Linux kernel are intentionally "churn squated" to
> >> prevent any other ideas from creeping in other than of the owner of that
> >> subsytem
> >
> >Strangely enough, my perception is that Ingo is simply trying to
> >address the issues Mike's testing discovered in RDSL and SD. It's not
> >surprising Ingo made it a separate patch set as Con has repeatedly
> >stated that the "problems" are in fact by design and won't be fixed.
>
> I won't get into the middle of this just yet, not having decided which dog
> I should bet on yet.  I've been running 2.6.21-rc6 + Con's 0.40 patch for
> about 24 hours, its been generally usable, but gzip still causes lots of 5
> to 10+ second lags when its running.  I'm coming to the conclusion that
> gzip simply doesn't play well with others...

Actually Gene I think you're being bitten here by something I/O bound since 
the cpu usage never tops out. If that's the case and gzip is dumping 
truckloads of writes then you're suffering something that irks me even more 
than the scheduler in linux, and that's how much writes hurt just about 
everything else. Try your testcase with bzip2 instead (since that won't be 
i/o bound), or drop your dirty ratio to as low as possible, which helps a 
little bit (5% is the minimum):

echo 5 > /proc/sys/vm/dirty_ratio

and finally try the braindead noop i/o scheduler as well:

echo noop > /sys/block/sda/queue/scheduler

(replace sda with your drive obviously).
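
You can check what you're currently running with before and after:

cat /proc/sys/vm/dirty_ratio
cat /sys/block/sda/queue/scheduler

(the scheduler file lists the available elevators with the active one shown in 
square brackets).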

I'd wager a big one that's what causes your gzip pain. If it wasn't for the 
fact that I've decided to all but give up ever trying to provide code for 
mainline again, trying my best to make writes hurt less on linux would be my 
next big thing [tm]. 

Oh and for the others watching, (points to vm hackers) I found a bug when 
playing with the dirty ratio code. If you modify it to allow it to drop below 
5% but still stay above the minimum in the vm code, stalls happen somewhere in 
the vm where nothing much happens for sometimes 20 or 30 seconds in the worst 
case scenario. I had to drop a patch in 2.6.19 that allowed the dirty ratio to be 
set ultra low because these stalls were gross.

> Amazing to me, the cpu its using stays generally below 80%, and often below
> 60%, even while the kmail composer has a full sentence in its buffer that
> it still hasn't shown me when I switch to the htop screen to check, and
> back to the kmail screen to see if its updated yet.  The screen switch
> doesn't seem to lag so I don't think renicing x would be helpfull.  Those
> are the obvious lags, and I'll build & reboot to the CFS patch at some
> point this morning (whats left of it that is :).  And report in due time of
> course

-- 
-ck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Announce] [patch] Modular Scheduler Core and Completely Fair Scheduler [CFS]

2007-04-14 Thread Con Kolivas
On Saturday 14 April 2007 06:21, Ingo Molnar wrote:
> [announce] [patch] Modular Scheduler Core and Completely Fair Scheduler
> [CFS]
>
> i'm pleased to announce the first release of the "Modular Scheduler Core
> and Completely Fair Scheduler [CFS]" patchset:
>
>http://redhat.com/~mingo/cfs-scheduler/sched-modular+cfs.patch
>
> This project is a complete rewrite of the Linux task scheduler. My goal
> is to address various feature requests and to fix deficiencies in the
> vanilla scheduler that were suggested/found in the past few years, both
> for desktop scheduling and for server scheduling workloads.

The casual observer will be completely confused by what on earth has happened 
here so let me try to demystify things for them.

1. I tried in vain some time ago to push a working extensible pluggable cpu 
scheduler framework (based on wli's work) for the linux kernel. It was 
perma-vetoed by Linus and Ingo (and Nick also said he didn't like it) as 
being absolutely the wrong approach and that we should never do that. Oddly 
enough the linux-kernel mailing list was -dead- at the time and the 
discussion did not make it to the mailing list. Every time I've tried to 
forward it to the mailing list the spam filter decided to drop it so most 
people have not even seen this original veto-forever discussion.

2. Since then I've been thinking/working on a cpu scheduler design that takes 
all the guesswork out of scheduling and gives very predictable, as fair 
as possible, cpu distribution and latency while preserving as solid 
interactivity as possible within those confines. For weeks now, Ingo has said 
that the interactivity regressions were showstoppers and we should address 
them, never mind the fact that the so-called regressions were purely "it 
slows down linearly with load" which to me is perfectly desirable behaviour. 
While this was not perma-vetoed, I predicted pretty accurately your intent 
was to veto it based on this.

People kept claiming scheduling problems were few and far between but what was 
really happening is users were terrified of lkml and instead used 1. windows 
and 2. 2.4 kernels. The problems were there.

So where are we now? Here is where your latest patch comes in.

As a solution to the many scheduling problems we finally all agree exist, you 
propose a patch that adds 1. a limited pluggable framework and 2. a fairness 
based cpu scheduler policy... o_O

So I should be happy at last now that the things I was promoting you are also 
promoting, right? Well I'll fill in the rest of the gaps and let other people 
decide how I should feel.

> as usual, any sort of feedback, bugreports, fixes and suggestions are
> more than welcome,

In the last 4 weeks I've spent time lying in bed drugged to the eyeballs and 
having trips in and out of hospitals for my condition. I appreciate greatly 
the sympathy and patience from people in this regard. However at one stage I 
virtually begged for support with my attempts and help with the code. Dmitry 
Adamushko is the only person who actually helped me with the code in the 
interim, while others poked sticks at it. Sure the sticks helped at times but 
the sticks always seemed to have their ends kerosene doused and flaming for 
reasons I still don't get. No other help was forthcoming.

Now that you're agreeing my direction was correct you've done the usual Linux 
kernel thing - ignore all my previous code and write your own version. Oh 
well, that I've come to expect; at least you get a copyright notice in the 
bootup and somewhere in the comments give me credit for proving it's 
possible. Let's give some other credit here too. William Lee Irwin provided 
the major architecture behind plugsched at my request and I simply finished 
the work and got it working. He is also responsible for many IRC discussions 
I've had about cpu scheduling fairness, designs, programming history and code 
help. Even though he did not contribute code directly to SD, his comments 
have been invaluable.

So let's look at the code.

kernel/sched.c
kernel/sched_fair.c
kernel/sched_rt.c

It turns out this is not a pluggable cpu scheduler framework at all, and I 
guess you didn't really promote it as such. It's a "modular scheduler core". 
Which means you moved code from sched.c into sched_fair.c and sched_rt.c. 
This abstracts out each _scheduling policy's_ functions into struct 
sched_class and allows each scheduling policy's functions to be in a separate 
file etc.

Ok so what it means is that instead of whole cpu schedulers being able to be 
plugged into this framework we can plug in only cpu scheduling policies 
hrm... So let's look on

-#define SCHED_NORMAL   0

Ok, once upon a time we renamed SCHED_OTHER, which every other unix calls the 
standard policy that 99.9% of applications use, into a more meaningful name, 
SCHED_NORMAL. That's fine since all it did was change the description 
internally for those reading the code. Let's see what you've done now:

+#define SCHED_FAIR


Re: [BUG] scheduler: first timeslice of the exiting thread

2007-04-13 Thread Con Kolivas
On Monday 09 April 2007 16:09, Andrew Morton wrote:
> On Sat, 07 Apr 2007 16:31:39 +0900 Satoru Takeuchi 
<[EMAIL PROTECTED]> wrote:
> > When I was examining the following program ...
> >
> >   1. There are a large amount of small jobs takes several msecs,
> >  and the number of job increases constantly.
> >   2. The process creates a thread or a process per job (I examined both
> >  the thread model and the process model).
> >   3. Each child process/thread does the assigned job and exit
> > immediately.
> >
> > ... I found that the thread model's latency is longer than proess
> > model's one against my expectation. It's because of the current
> > sched_fork()/sched_exit() implementation as follows:
> >
> >   a) On sched_fork, the creator share its timeslice with new process.
> >   b) On sched_exit, if the exiting process didn't exhaust its first
> >  timeslice yet, it gives its timeslice to the parent.
> >
> > It has no problem on the process model since the creator is the parent.
> > However, on the thread model, the creator is not the parent, it is same
> > as the creator's parent. Hence, on this kind of program, the creator
> > can't retrieve shared timeslice and exausts its timeslice at a rate of
> > knots. In addition, somehow, the parent (typically shell?) gets extra
> > timeslice.
> >
> > I believe it's a bug and the exiting process should give its timeslice
> > to the creator. Now I have some patch plan to fix this problem as follow:
> >
> >  a) Add the field for the creator to task_struct. It needs extra memory.
> >  b) Doesn't add extra field and have thread's parent the creater, which
> > is same as process creation. However it has many side effects, for
> > example, we also need to change sys_getppid() implementation.
> >
> > What do you think? Any comments are welcome.
>
> This comes at an awkward time, because we might well merge the
> staircase/deadline work into 2.6.22, and I think it rewrites the part of
> the scheduler which is causing the problems you're observing.
>
> Has anyone verified that SD fixes this problem and the one at
> http://lkml.org/lkml/2007/4/7/21 ?

No, SD does nothing different in this regard. There is no notion of who made 
the thread and who the remaining timeslice should go to. As you say, some 
decision on sched_exit should probably be used to decide where to send it. In 
the absence of yet more fields in task_struct, the easiest thing to do would 
be to check if the thread id is equal to the pid and if not, send it to 
the pid (or any parent of that if it no longer exists). Whether it's worth 
adding yet another field to task_struct to track this or not I do not have 
enough information to make an informed choice in this regard. Either way I'm 
low on patch-creation cycles so please feel free to provide your solution.
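
A completely untested sketch of that idea, using the 2.6.21 task_struct fields 
and not pretending to be a real patch, would look something like:

void sched_exit(struct task_struct *p)
{
	struct task_struct *recipient;
	unsigned long flags;
	struct rq *rq;

	/*
	 * A thread's id differs from the thread group id (the pid userspace
	 * sees); in that case hand any unused first timeslice back to the
	 * thread group leader, falling back to the parent as now if the
	 * leader has already exited.
	 */
	if (!thread_group_leader(p) && !p->group_leader->exit_state)
		recipient = p->group_leader;
	else
		recipient = p->parent;

	rq = task_rq_lock(recipient, &flags);
	if (p->first_time_slice && task_cpu(p) == task_cpu(recipient))
		recipient->time_slice += p->time_slice;
	task_rq_unlock(rq, &flags);
}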

-- 
-ck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Staircase Deadline cpu scheduler v 0.40

2007-04-13 Thread Con Kolivas
Hi all

I'm alive and mostly well (yay!). I still should _not_ be spending extended 
periods on the computer for my neck but this last change was I believe 
essential and not too much effort.

Here's an update which should improve behaviour further with niced tasks.

http://ck.kolivas.org/patches/staircase-deadline/2.6.20.5-sd-0.40.patch
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc6-sd-0.40.patch
http://ck.kolivas.org/patches/staircase-deadline/2.6.21-rc6-mm1-sd-0.40.patch

The last patch (which is for 2.6.21-rc6-mm1) can be used as an incremental 
from any 0.39 patched kernel.
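
(If you're unsure how to use it as an incremental: apply it on top of your 
already 0.39-patched tree with the usual -p1, e.g.

cd linux-2.6.21-rc6-mm1
patch -p1 < 2.6.21-rc6-mm1-sd-0.40.patch

where the directory name is obviously whatever your tree is called.)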

Pretty much all of the architectural stuff for SD and bugfixes are done now. 
Barring bugs/fixes, I plan to start work on implementing SCHED_ISO and 
SCHED_IDLEPRIO for it for -ck over the next few weeks, but only once I'm 
_completely_ well, meaning extended periods offline again (sorry to those who 
are emailing me I'm not ignoring you intentionally!).

-- 
-ck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] sched: implement staircase deadline scheduler rework priomatrix

2007-04-13 Thread Con Kolivas
Rework the priority matrix used by the staircase-deadline cpu scheduler.

Ensuring every nice level uses priority slot 139 means that niced tasks will
not cause expiration prematurely for less niced tasks that have been sleeping.

This also increases the frequency with which less niced tasks preempt niced
tasks, and greatly simplifies the code that generates the priority matrix.

Update the documentation accordingly and explain why the priority matrix
exists in the first place.
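
For anyone who wants to see what the simplified loop produces without booting 
a kernel, the same arithmetic can be reproduced as a standalone toy userspace 
program (assuming PRIO_RANGE is the 40 entry nice range as in the patch, with 
ordinary ints standing in for the kernel bitmap helpers; 0 marks a slot where 
the task may run, as in the tables below):

#include <stdio.h>

#define PRIO_RANGE 40

int main(void)
{
	/* slot[nice level 0..39][priority slot 0..39] */
	int slot[PRIO_RANGE][PRIO_RANGE];
	int i, j, k;

	for (i = 0; i < PRIO_RANGE; i++) {
		for (k = 0; k < PRIO_RANGE; k++)
			slot[i][k] = 1;				/* bitmap_fill() */
		j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i);
		for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j)
			slot[i][PRIO_RANGE - 1 - k / PRIO_RANGE] = 0;	/* __clear_bit() */
	}

	for (i = 0; i < PRIO_RANGE; i++) {
		printf("nice %3d ", i - 20);
		for (k = 0; k < PRIO_RANGE; k++)
			putchar('0' + slot[i][k]);
		putchar('\n');
	}
	return 0;
}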

Signed-off-by: Con Kolivas <[EMAIL PROTECTED]>

---
 Documentation/sched-design.txt |   13 +++--
 kernel/sched.c |   37 -
 2 files changed, 23 insertions(+), 27 deletions(-)

Index: linux-2.6.21-rc6-mm1/kernel/sched.c
===
--- linux-2.6.21-rc6-mm1.orig/kernel/sched.c2007-04-14 00:25:36.0 
+1000
+++ linux-2.6.21-rc6-mm1/kernel/sched.c 2007-04-14 00:25:46.0 +1000
@@ -113,15 +113,19 @@ int rr_interval __read_mostly;
  * for the valid priorities each different nice level can have. It allows
  * us to stagger the slots where differing priorities run in a way that
  * keeps latency differences between different nice levels at a minimum.
+ * The purpose of a pre-generated matrix is for rapid lookup of next slot in
+ * O(1) time without having to recalculate every time priority gets demoted.
+ * All nice levels use priority slot 39 as this allows less niced tasks to
+ * get all priority slots better than that before expiration is forced.
  * ie, where 0 means a slot for that priority, priority running from left to
- * right:
+ * right is from prio 0 to prio 39:
  * nice -20 
- * nice -10 1001000100100010001001000100010010001000
- * nice   0 0101010101010101010101010101010101010101
- * nice   5 1101011010110101101011010110101101011011
- * nice  10 0110111011011101110110111011101101110111
- * nice  15 0101101101011011
- * nice  19 1110
+ * nice -10 100010001000100010001000100010001001
+ * nice   0 1010101010101010101010101010101010101010
+ * nice   5 1011010110110101101101011011010110110110
+ * nice  10 1110111011101110111011101110111011101110
+ * nice  15 11101110111011101110
+ * nice  19 1110
  */
 static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)]
 __read_mostly;
@@ -6983,20 +6987,11 @@ void __init sched_init(void)
 
/* Generate the priority matrix */
for (i = 0; i < PRIO_RANGE; i++) {
-   if (i < 20) {
-   bitmap_zero(prio_matrix[i] , PRIO_RANGE);
-   j = PRIO_RANGE * PRIO_RANGE / (i + 1);
-   for (k = j; k < PRIO_RANGE * PRIO_RANGE; k += j)
-   __set_bit(k / PRIO_RANGE, prio_matrix[i]);
-   } else if (i == 20) {
-   bitmap_fill(prio_matrix[i], PRIO_RANGE);
-   for (k = 1; k < PRIO_RANGE; k += 2)
-   __clear_bit(k, prio_matrix[i]);
-   } else {
-   bitmap_fill(prio_matrix[i], PRIO_RANGE);
-   j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i + 1);
-   for (k = j; k < PRIO_RANGE * PRIO_RANGE; k += j)
-   __clear_bit(k / PRIO_RANGE, prio_matrix[i]);
+   bitmap_fill(prio_matrix[i], PRIO_RANGE);
+   j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i);
+   for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j) {
+   __clear_bit(PRIO_RANGE - 1 - (k / PRIO_RANGE),
+   prio_matrix[i]);
}
}
 
Index: linux-2.6.21-rc6-mm1/Documentation/sched-design.txt
===
--- linux-2.6.21-rc6-mm1.orig/Documentation/sched-design.txt2007-04-13 
17:01:43.0 +1000
+++ linux-2.6.21-rc6-mm1/Documentation/sched-design.txt 2007-04-14 
00:25:46.0 +1000
@@ -268,13 +268,14 @@ there are 40 priority slots where a task
 and the allocation of slots is dependant on nice level. In the
 following table, a zero represents a slot where the task may run.
 
+PRIORITY:0..20.39
 nice -20 
-nice -10 1001000100100010001001000100010010001000
-nice   0 0101010101010101010101010101010101010101
-nice   5 1101011010110101101011010110101101011011
-nice  10 0110111011011101110110111011101101110111
-nice  15 0101101101011011
-nice  19 1110
+nice -10 100010001000100010001000100010001001
+nice   0 1010101010101010101010101010101010101010
+nice   5 1011010110110101101101011011010110110110
+nice  10 1110111011101110111011101110111011101110
+nice  15


Re: Ten percent test

2007-04-07 Thread Con Kolivas
On Friday 06 April 2007 20:03, Ingo Molnar wrote:
> * Con Kolivas <[EMAIL PROTECTED]> wrote:
> > > I was more focused on the general case, but all I should have to do
> > > to de-claw all of these sleep exploits is account rr time (only a
> > > couple of lines, done and building now).  It's only a couple of
> > > lines.
> >
> > The more you try to "de-claw" these sleep exploits the less effective
> > you make your precious interactive estimator. Feel free to keep adding
> > endless tweaks to undo the other tweaks in order to try and achieve
> > what SD has by design.
>
> firstly, testing on various workloads Mike's tweaks work pretty well,
> while SD still doesnt handle the high-load case all that well. Note that
> it was you who raised this whole issue to begin with: everything was
> pretty quiet in scheduling interactivity land.

I'm terribly sorry but you have completely missed my intentions then. I was 
_not_ trying to improve mainline's interactivity at all. My desire was to fix 
the unfairness that mainline has, across the board without compromising 
fairness. You said yourself that an approach that fixed a lot and had a small 
number of regressions would be worth it. In a surprisingly ironic turnaround 
two bizarre things happened. People found SD fixed a lot of their 
interactivity corner cases which were showstoppers. That didn't surprise me 
because any unfair design will by its nature get it wrong sometimes. The even 
_more_ surprising thing is that you're now using interactivity as the 
argument against SD. I did not set out to create better interactivity, I set 
out to create widespread fairness without too much compromise to 
interactivity. As I said from the _very first email_, there would be cases of 
interactivity in mainline that performed better.

> (There was one person who 
> reported wide-scale interactivity regressions against mainline but he
> didnt answer my followup posts to trace/debug the scenario.)

That was one user. As I mentioned in an earlier thread, the problem with email 
threads on drawn out issues on lkml is that all that people remember is the 
last one creating noise, and that has only been the noise from Mike for 2 
weeks now. Has everyone forgotten the many many users who reported the 
advantages first up which generated the interest in the first place? Why have 
they stopped reporting? Well the answer is obvious; all the signs suggest 
that SD is slated for mainline. It is on the path, Linus has suggested it and 
now akpm is asking if it's ready for 2.6.22. So they figure there is no point 
testing and replying any further. SD is ready for prime time, finalised and 
does everything I intended it to. This is where I have to reveal to them the 
horrible truth. This is no guarantee it will go in. In fact, this one point 
that you (Ingo) go on and on about is not only a quibble, but you will call 
it an absolute showstopper. As maintainer of the cpu scheduler, in its 
current form you will flatly refuse it goes to mainline citing the 5% of 
cases where interactivity has regressed. So people will tell me to fix it, 
right?... Read on for this to unfold.

> SD has a built-in "interactivity estimator" as well, but hardcoded into
> its design. SD has its own set of ugly-looking tweaks as well - for
> example the prio_matrix.

I'm sorry but to me this is a misrepresentation, as I suggested in an earlier 
thread where I disagree about what an interactivity estimator is. The idea of 
fence posts in a clock that are passed as a way of metering out 
earliest-deadline-first in a design is well established. The matrix is simply 
an array designed for O(1) lookups of the fence posts. That is not the same 
as "oh how much have we slept in the last $magic_number period and how much 
extra time should we get for that".
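
To make that distinction concrete, here is a toy userspace sketch of a 
fence-post lookup; the names echo the patches quoted later in this thread, but 
the single-word bitmap, the matrix contents and the linear scan (standing in 
for a bitmap search) are assumptions for illustration only, not SD's actual 
code.

// Toy illustration of a precomputed, bounded fence-post lookup.
#include <stdio.h>

#define PRIO_RANGE 40	/* one slot per dynamic priority across the nice range */

/* Per-nice-level map of slots that level is *not* entitled to use. In SD
 * this information is precomputed once; the values below are made up. */
static unsigned long long prio_matrix[PRIO_RANGE];

/* Return the first entitled slot at or after prio_level that the task has
 * not already used this rotation. Bounded by PRIO_RANGE, and no per-sleep
 * accounting is ever consulted. */
static int next_entitled_slot(int uprio, unsigned long long used, int prio_level)
{
	unsigned long long blocked = prio_matrix[uprio] | used;
	int slot;

	for (slot = prio_level; slot < PRIO_RANGE; slot++)
		if (!(blocked & (1ULL << slot)))
			return slot;
	return PRIO_RANGE;	/* nothing left: the task expires */
}

int main(void)
{
	int uprio = 20, slot;	/* roughly nice 0 in a 40-level user range */

	/* made-up entitlement: deny every second slot to this nice level */
	for (slot = 1; slot < PRIO_RANGE; slot += 2)
		prio_matrix[uprio] |= 1ULL << slot;

	printf("next entitled slot: %d\n", next_entitled_slot(uprio, 0ULL, 0));
	return 0;
}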

> So it all comes down on 'what interactivity 
> heuristics is enough', and which one is more tweakable. So far i've yet
> to see SD address the hackbench and make -j interactivity
> problems/regression for example, while Mike has been busy addressing the
> 'exploits' reported against mainline.

And BANG there is the bullet you will use against SD from here to eternity. SD 
obeys fairness at all costs. Your interactivity regression is that SD causes 
progressive slowdown with load, which by definition is fairness. You 
repeatedly ask me to address it and there is one unfailing truth: the only way 
to address it is to add unfairness to the design. So why don't I? Because the 
simple fact is that any unfairness no matter how carefully administered or 
metered will always have cases where it's wrong. Look at the title of this 
email for example - it's yet another exploit for the mainline sleep/run 
mechanism. This does _not_ mean I'm implying people are logging into servers 
and running ./tenp to hang the machine. What it demonstrates is a way of 
reproducing the scenario which is biting people with real world loads. It's 
entirely believable that a simple p2p app could be behaving like tenp, only 
generating a small


Re: Ten percent test

2007-04-06 Thread Con Kolivas
On Friday 06 April 2007 19:07, Mike Galbraith wrote:
> On Fri, 2007-04-06 at 11:03 +1000, Con Kolivas wrote:
> > On Thursday 05 April 2007 21:54, Ingo Molnar wrote:
> > >  - fiftyp.c:  noticeable, but alot better than previously!
> >
> > fiftyp.c seems to have been stumbled across by accident as having an
> > effect when Xenofon was trying to recreate Mike's 50% x 3 test case. I
> > suggest a ten percent version like the following would be more useful as
> > a test for the harmful effect discovered in fiftyp.c. (/me throws in
> > obligatory code style change).
> >
> > Starts 15 processes that sleep ten times longer than they run. Change
> > forks to 15 times the number of cpus you have and it should work on any
> > size hardware.
>
> I was more focused on the general case, but all I should have to do to
> de-claw all of these sleep exploits is account rr time (only a couple of
> lines, done and building now).  It's only a couple of lines.

The more you try to "de-claw" these sleep exploits the less effective you make 
your precious interactive estimator. Feel free to keep adding endless tweaks 
to undo the other tweaks in order to try and achieve what SD has by design. 
You'll end up with an increasingly complex state machine design of 
interactivity tweaks and interactivity throttlers all fighting each other to 
the point where the interactivity estimator doesn't do anything. What's the 
point in that? Eventually you'll have an estimator throttled to the point it 
does nothing and you end up with something far less interactive than SD which 
is as interactive as fairness allows, unlike mainline.

-- 
-ck


Ten percent test

2007-04-05 Thread Con Kolivas
On Thursday 05 April 2007 21:54, Ingo Molnar wrote:
>  - fiftyp.c:  noticeable, but alot better than previously!

fiftyp.c seems to have been stumbled across by accident as having an effect 
when Xenofon was trying to recreate Mike's 50% x 3 test case. I suggest a ten 
percent version like the following would be more useful as a test for the 
harmful effect discovered in fiftyp.c. (/me throws in obligatory code style 
change).

Starts 15 processes that sleep ten times longer than they run. Change forks to 
15 times the number of cpus you have and it should work on any size hardware.

-- 
-ck
// gcc -O2 -o tenp tenp.c -lrt
// code from interbench.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
/*
 * Start $forks processes that run for 10% cpu time each. Set this to
 * 15 * number of cpus for best effect.
 */
int forks = 15;

unsigned long run_us = 100000, sleep_us;
unsigned long loops_per_ms;

void terminal_error(const char *name)
{
	fprintf(stderr, "\n");
	perror(name);
	exit (1);
}

unsigned long long get_nsecs(struct timespec *myts)
{
	if (clock_gettime(CLOCK_REALTIME, myts))
		terminal_error("clock_gettime");
	return (myts->tv_sec * 1000000000 + myts->tv_nsec);
}

void burn_loops(unsigned long loops)
{
	unsigned long i;

	/*
	 * We need some magic here to prevent the compiler from optimising
	 * this loop away. Otherwise trying to emulate a fixed cpu load
	 * with this loop will not work.
	 */
	for (i = 0 ; i < loops ; i++)
	 asm volatile("" : : : "memory");
}

/* Use this many usecs of cpu time */
void burn_usecs(unsigned long usecs)
{
	unsigned long ms_loops;

	ms_loops = loops_per_ms / 1000 * usecs;
	burn_loops(ms_loops);
}

void microsleep(unsigned long long usecs)
{
	struct timespec req, rem;

	rem.tv_sec = rem.tv_nsec = 0;

	req.tv_sec = usecs / 1000000;
	req.tv_nsec = (usecs - (req.tv_sec * 1000000)) * 1000;
continue_sleep:
	if ((nanosleep(&req, &rem)) == -1) {
		if (errno == EINTR) {
			if (rem.tv_sec || rem.tv_nsec) {
				req.tv_sec = rem.tv_sec;
				req.tv_nsec = rem.tv_nsec;
				goto continue_sleep;
			}
			goto out;
		}
		terminal_error("nanosleep");
	}
out:
	return;
}

/*
 * In an unoptimised loop we try to benchmark how many meaningless loops
 * per second we can perform on this hardware to fairly accurately
 * reproduce certain percentage cpu usage
 */
void calibrate_loop(void)
{
	unsigned long long start_time, loops_per_msec, run_time = 0,
		min_run_us = run_us;
	unsigned long loops;
	struct timespec myts;
	int i;

	printf("Calibrating loop\n");
	loops_per_msec = 1000000;
redo:
	/* Calibrate to within 1% accuracy */
	while (run_time > 1010000 || run_time < 990000) {
		loops = loops_per_msec;
		start_time = get_nsecs(&myts);
		burn_loops(loops);
		run_time = get_nsecs(&myts) - start_time;
		loops_per_msec = (1000000 * loops_per_msec / run_time ? :
			loops_per_msec);
	}

	/* Rechecking after a pause increases reproducibility */
	microsleep(1);
	loops = loops_per_msec;
	start_time = get_nsecs(&myts);
	burn_loops(loops);
	run_time = get_nsecs(&myts) - start_time;

	/* Tolerate 5% difference on checking */
	if (run_time > 1050000 || run_time < 950000)
		goto redo;
	loops_per_ms=loops_per_msec;
	printf("Calibrating sleep interval\n");
	microsleep(1);
	/* Find the smallest time interval close to 1ms that we can sleep */
	for (i = 0; i < 100; i++) {
		start_time=get_nsecs(&myts);
		microsleep(1000);
		run_time=get_nsecs(&myts)-start_time;
		run_time /= 1000;
		if (run_time < run_us && run_us > 1000)
			run_us = run_time;
	}
	/* Then set run_us to that duration and sleep_us to 9 x that */
	sleep_us = run_us * 9;
	printf("Calibrating run interval\n");
	microsleep(1);
	/* Do a few runs to see what really gets us run_us runtime */
	for (i = 0; i < 100; i++) {
		start_time=get_nsecs(&myts);
		burn_usecs(run_us);
		run_time=get_nsecs(&myts)-start_time;
		run_time /= 1000;
		if (run_time < min_run_us && run_time > run_us)
			min_run_us = run_time;
	}
	if (min_run_us < run_us)
		run_us = run_us * run_us / min_run_us;
	printf("Each fork will run for %lu usecs and sleep for %lu usecs\n",
		run_us, sleep_us);
}

int main(void){
	int i;

	calibrate_loop();
	printf("starting %d forks\n", forks);
	for(i = 1; i < forks; i++){
		if(!fork())
			break;
	}
	while(1){
		burn_usecs(run_us);
		microsleep(sleep_us);
	}
	return 0;
}
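
As a sanity check on the numbers above, assuming the calibration really does 
land on a 10% duty cycle: each fork runs for run_us and then sleeps for 
9 * run_us, so the 15 default forks demand roughly 15 * 0.10 = 1.5 CPUs' worth 
of work, which is where the "load of 1.5" figure quoted in the following 
message comes from.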


Re: [test] sched: SD-latest versus Mike's latest

2007-04-05 Thread Con Kolivas
On Thursday 05 April 2007 21:54, Ingo Molnar wrote:
> * Mike Galbraith <[EMAIL PROTECTED]> wrote:
> > On Tue, 2007-04-03 at 08:01 +0200, Ingo Molnar wrote:
> > > looks interesting - could you send the patch?
> >
> > Ok, this is looking/feeling pretty good in testing.  Comments on
> > fugliness etc much appreciated.
> >
> > Below the numbers is a snapshot of my experimental tree.  It's a
> > mixture of my old throttling/anti-starvation tree and the task

Throttling to try to get to SD fairness? The mainline state machine becomes 
more complex than ever and fluctuates from interactive to fair by an as-yet 
unchosen magic number timeframe which ebbs and flows.

> > promotion patch, with the addition of a scheduling class for
> > interactive tasks to dish out some of that targeted unfairness I
> > mentioned.

Nice -10 on mainline ruins the latency of nice 0 tasks unlike SD. New 
scheduling class just for X? Sounds like a very complicated 
userspace-changing way to just do the equivalent of "nice -n -10" obfuscated. 

> here's some test results, comparing SD-latest to Mike's-latest:
>
> re-testing the weak points of the vanilla scheduler + Mike's:
>
>  - thud.c:this workload has almost unnoticeable effect
>  - fiftyp.c:  noticeable, but alot better than previously!

Load of 1.5 makes mainline a doorstop without throttling.

> re-testing the weak points of SD:
>
>  - hackbench: still unusable under such type of high load - no improvement.

Load of 160. Is proportional slowdown bad?
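
(A rough illustration of what "proportional" means here, assuming one CPU and 
the load of 160 meaning ~160 runnable tasks: strictly fair sharing gives each 
task about 1/160, or roughly 0.6%, of the CPU, so work needing 50 ms of CPU 
time takes on the order of 8 seconds of wall clock. The slowdown scales with 
the load rather than with any scheduler heuristic.)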

>  - make -j:   still less interactive than Mike's - no improvement.

Depends on how big your job number is relative to your cpu count. The better the throttling gets 
with mainline the better SD gets in this comparison. At equal fairness 
mainline does not have the low latency interactivity SD has.

Nice -10 X with SD is a far better solution than an ever increasing complexity 
state machine and a userspace-changing scheduling policy just for X. Half 
decent graphics cards get good interactivity with SD even without renicing.

>   Ingo

-- 
-ck


Re: 2.6.21-rc5-mm4

2007-04-04 Thread Con Kolivas
On Thursday 05 April 2007 08:10, Andrew Morton wrote:
> Thanks - that'll be the CPU scheduler changes.
>
> Con has produced a patch or two which might address this but afaik we don't
> yet have a definitive fix?
>
> I believe that reverting
> sched-implement-staircase-deadline-cpu-scheduler-staircase-improvements.patch
> will prevent it.

I posted a definitive fix which Michal tested for me offlist. Subject was:
 [PATCH] sched: implement staircase deadline cpu scheduler improvements fix

Sorry about the relative noise prior to that. Akpm, please pick it up.

Here again just in case.

---
Use of memset was bogus. Fix it.

Fix exiting recalc_task_prio without p->array being updated.

Microoptimisation courtesy of Dmitry Adamushko <[EMAIL PROTECTED]>

Signed-off-by: Con Kolivas <[EMAIL PROTECTED]>

---
 kernel/sched.c |   17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

Index: linux-2.6.21-rc5-mm4/kernel/sched.c
===================================================================
--- linux-2.6.21-rc5-mm4.orig/kernel/sched.c	2007-04-04 12:14:29.000000000 +1000
+++ linux-2.6.21-rc5-mm4/kernel/sched.c	2007-04-04 12:49:39.000000000 +1000
@@ -683,11 +683,13 @@ static void dequeue_task(struct task_str
  * The task is being queued on a fresh array so it has its entitlement
  * bitmap cleared.
  */
-static inline void task_new_array(struct task_struct *p, struct rq *rq)
+static void task_new_array(struct task_struct *p, struct rq *rq,
+  struct prio_array *array)
 {
bitmap_zero(p->bitmap, PRIO_RANGE);
p->rotation = rq->prio_rotation;
p->time_slice = p->quota;
+   p->array = array;
 }
 
 /* Find the first slot from the relevant prio_matrix entry */
@@ -709,6 +711,8 @@ static inline int next_entitled_slot(str
DECLARE_BITMAP(tmp, PRIO_RANGE);
int search_prio, uprio = USER_PRIO(p->static_prio);
 
+   if (!rq->prio_level[uprio])
+   rq->prio_level[uprio] = MAX_RT_PRIO;
/*
 * Only priorities equal to the prio_level and above for their
 * static_prio are acceptable, and only if it's not better than
@@ -736,11 +740,8 @@ static inline int next_entitled_slot(str
 
 static void queue_expired(struct task_struct *p, struct rq *rq)
 {
-   p->array = rq->expired;
-   task_new_array(p, rq);
+   task_new_array(p, rq, rq->expired);
p->prio = p->normal_prio = first_prio_slot(p);
-   p->time_slice = p->quota;
-   p->rotation = rq->prio_rotation;
 }
 
 #ifdef CONFIG_SMP
@@ -800,9 +801,9 @@ static void recalc_task_prio(struct task
queue_expired(p, rq);
return;
} else
-   task_new_array(p, rq);
+   task_new_array(p, rq, array);
} else
-   task_new_array(p, rq);
+   task_new_array(p, rq, array);
 
queue_prio = next_entitled_slot(p, rq);
if (queue_prio >= MAX_PRIO) {
@@ -3445,7 +3446,7 @@ EXPORT_SYMBOL(sub_preempt_count);
 
 static inline void reset_prio_levels(struct rq *rq)
 {
-   memset(rq->prio_level, MAX_RT_PRIO, ARRAY_SIZE(rq->prio_level));
+   memset(rq->prio_level, 0, sizeof(int) * PRIO_RANGE);
 }
 
 /*

-- 
-ck
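
For readers wondering what made the old memset bogus: memset() writes 
individual bytes and its length argument counts bytes, while ARRAY_SIZE() 
counts elements, so the replaced call stamped the byte value MAX_RT_PRIO into 
only the first PRIO_RANGE bytes of the array and produced ints that were never 
equal to MAX_RT_PRIO. A standalone sketch of the effect (the constants mirror 
the patch context but are assumptions here, not values copied from sched.c):

// Standalone illustration of the memset misuse fixed by the patch above.
#include <stdio.h>
#include <string.h>

#define PRIO_RANGE	40	/* assumed size of the rq->prio_level array */
#define MAX_RT_PRIO	100	/* assumed value, as in kernels of this era */
#define ARRAY_SIZE(x)	(sizeof(x) / sizeof((x)[0]))

int main(void)
{
	int prio_level[PRIO_RANGE] = { 0 };

	/* The old call: the value is written per byte and the length counts
	 * elements, so only 40 of the array's 160 bytes (4-byte ints assumed)
	 * are touched, and each touched int becomes 0x64646464, not 100. */
	memset(prio_level, MAX_RT_PRIO, ARRAY_SIZE(prio_level));
	printf("prio_level[0] = %#x, prio_level[10] = %d\n",
		(unsigned)prio_level[0], prio_level[10]);

	/* The replacement zeroes the whole array; 0 now means "unset" and
	 * next_entitled_slot() lazily fills in MAX_RT_PRIO on first use. */
	memset(prio_level, 0, sizeof(int) * PRIO_RANGE);
	return 0;
}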


[PATCH] sched: implement staircase deadline cpu scheduler improvements fix

2007-04-04 Thread Con Kolivas
On Wednesday 04 April 2007 09:31, Michal Piotrowski wrote:
> Con Kolivas wrote:
> > On Wednesday 04 April 2007 08:20, Michal Piotrowski wrote:
> >> Michal Piotrowski wrote:
> >>> http://www.stardust.webpages.pl/files/tbf/bitis-gabonica/2.6.21-rc5-mm4/mm-oops
> >>> http://www.stardust.webpages.pl/files/tbf/bitis-gabonica/2.6.21-rc5-mm4/mm-config

> >> Con, I think that your
> >> sched-implement-staircase-deadline-cpu-scheduler-staircase-improvements.patch
> >> is causing this oops.
> >
> > Thanks for heads up!

Confirmed offline with Michal that the following patch fixes it. Thanks!

This should also make nice work better in the way the previous patch intended
it to.

---
Use of memset was bogus. Fix it.

Fix exiting recalc_task_prio without p->array being updated.

Microoptimisation courtesy of Dmitry Adamushko <[EMAIL PROTECTED]>

Signed-off-by: Con Kolivas <[EMAIL PROTECTED]>

---
 kernel/sched.c |   17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

Index: linux-2.6.21-rc5-mm4/kernel/sched.c
===================================================================
--- linux-2.6.21-rc5-mm4.orig/kernel/sched.c	2007-04-04 12:14:29.000000000 +1000
+++ linux-2.6.21-rc5-mm4/kernel/sched.c	2007-04-04 12:49:39.000000000 +1000
@@ -683,11 +683,13 @@ static void dequeue_task(struct task_str
  * The task is being queued on a fresh array so it has its entitlement
  * bitmap cleared.
  */
-static inline void task_new_array(struct task_struct *p, struct rq *rq)
+static void task_new_array(struct task_struct *p, struct rq *rq,
+  struct prio_array *array)
 {
bitmap_zero(p->bitmap, PRIO_RANGE);
p->rotation = rq->prio_rotation;
p->time_slice = p->quota;
+   p->array = array;
 }
 
 /* Find the first slot from the relevant prio_matrix entry */
@@ -709,6 +711,8 @@ static inline int next_entitled_slot(str
DECLARE_BITMAP(tmp, PRIO_RANGE);
int search_prio, uprio = USER_PRIO(p->static_prio);
 
+   if (!rq->prio_level[uprio])
+   rq->prio_level[uprio] = MAX_RT_PRIO;
/*
 * Only priorities equal to the prio_level and above for their
 * static_prio are acceptable, and only if it's not better than
@@ -736,11 +740,8 @@ static inline int next_entitled_slot(str
 
 static void queue_expired(struct task_struct *p, struct rq *rq)
 {
-   p->array = rq->expired;
-   task_new_array(p, rq);
+   task_new_array(p, rq, rq->expired);
p->prio = p->normal_prio = first_prio_slot(p);
-   p->time_slice = p->quota;
-   p->rotation = rq->prio_rotation;
 }
 
 #ifdef CONFIG_SMP
@@ -800,9 +801,9 @@ static void recalc_task_prio(struct task
queue_expired(p, rq);
return;
} else
-   task_new_array(p, rq);
+   task_new_array(p, rq, array);
} else
-   task_new_array(p, rq);
+   task_new_array(p, rq, array);
 
queue_prio = next_entitled_slot(p, rq);
if (queue_prio >= MAX_PRIO) {
@@ -3445,7 +3446,7 @@ EXPORT_SYMBOL(sub_preempt_count);
 
 static inline void reset_prio_levels(struct rq *rq)
 {
-   memset(rq->prio_level, MAX_RT_PRIO, ARRAY_SIZE(rq->prio_level));
+   memset(rq->prio_level, 0, sizeof(int) * PRIO_RANGE);
 }
 
 /*

-- 
-ck

