* Ingo Molnar <[EMAIL PROTECTED]> wrote:

> also, could you possibly try these workloads you described with Mike's 
> latest patch too? Thanks,

i.e. the one below.

        Ingo

--------------------->
Subject: [patch] sched: improve fairness, v3
From: Mike Galbraith <[EMAIL PROTECTED]>

track interactive backlog as a way to detect when the load isn't really
an interactive load, that's very simple and has potential.

Signed-off-by: Ingo Molnar <[EMAIL PROTECTED]>

---
 kernel/sched.c |  115 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 108 insertions(+), 7 deletions(-)

Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -109,6 +109,7 @@ unsigned long long __attribute__((weak))
 #define MAX_SLEEP_AVG          (DEF_TIMESLICE * MAX_BONUS)
 #define STARVATION_LIMIT       (MAX_SLEEP_AVG)
 #define NS_MAX_SLEEP_AVG       (JIFFIES_TO_NS(MAX_SLEEP_AVG))
+#define INTERACTIVE_LIMIT      (DEF_TIMESLICE * 4)
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -167,6 +168,9 @@ unsigned long long __attribute__((weak))
        (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
                (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
 
+#define INTERACTIVE_BACKLOG_EXCEEDED(array) \
+       ((array)->interactive_ticks > INTERACTIVE_LIMIT)
+
 #define TASK_PREEMPTS_CURR(p, rq) \
        ((p)->prio < (rq)->curr->prio)
 
@@ -201,6 +205,7 @@ static inline unsigned int task_timeslic
 
 struct prio_array {
        unsigned int nr_active;
+       int interactive_ticks;
        DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
        struct list_head queue[MAX_PRIO];
 };
@@ -234,6 +239,7 @@ struct rq {
         */
        unsigned long nr_uninterruptible;
 
+       unsigned long switch_timestamp;
        unsigned long expired_timestamp;
        /* Cached timestamp set by update_cpu_clock() */
        unsigned long long most_recent_timestamp;
@@ -691,6 +697,8 @@ static void dequeue_task(struct task_str
        list_del(&p->run_list);
        if (list_empty(array->queue + p->prio))
                __clear_bit(p->prio, array->bitmap);
+       if (TASK_INTERACTIVE(p))
+               array->interactive_ticks -= p->time_slice;
 }
 
 static void enqueue_task(struct task_struct *p, struct prio_array *array)
@@ -700,6 +708,8 @@ static void enqueue_task(struct task_str
        __set_bit(p->prio, array->bitmap);
        array->nr_active++;
        p->array = array;
+       if (TASK_INTERACTIVE(p))
+               array->interactive_ticks += p->time_slice;
 }
 
 /*
@@ -882,7 +892,11 @@ static int recalc_task_prio(struct task_
        /* Caller must always ensure 'now >= p->timestamp' */
        unsigned long sleep_time = now - p->timestamp;
 
-       if (batch_task(p))
+       /*
+        * Migration timestamp adjustment may induce negative time.
+        * Ignore unquantifiable values as well as SCHED_BATCH tasks.
+        */ 
+       if (now < p->timestamp || batch_task(p))
                sleep_time = 0;
 
        if (likely(sleep_time > 0)) {
@@ -3051,9 +3065,9 @@ static inline int expired_starving(struc
 {
        if (rq->curr->static_prio > rq->best_expired_prio)
                return 1;
-       if (!STARVATION_LIMIT || !rq->expired_timestamp)
+       if (!STARVATION_LIMIT)
                return 0;
-       if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
+       if (jiffies - rq->switch_timestamp > STARVATION_LIMIT * rq->nr_running)
                return 1;
        return 0;
 }
@@ -3131,8 +3145,74 @@ void account_steal_time(struct task_stru
                cpustat->steal = cputime64_add(cpustat->steal, tmp);
 }
 
+/*
+ * Promote and requeue the next lower priority task.  If no task
+ * is available in the active array, switch to the expired array.
+ * @rq: runqueue to search.
+ * @prio: priority at which to begin search.
+ */
+static inline void promote_next_lower(struct rq *rq, int prio)
+{
+       struct prio_array *array = rq->active;
+       struct task_struct *p = NULL;
+       unsigned long long now = rq->most_recent_timestamp;
+       unsigned long *bitmap;
+       unsigned long starving = JIFFIES_TO_NS(rq->nr_running * DEF_TIMESLICE);
+       int idx = prio + 1, found_noninteractive = 0;
+
+repeat:
+       bitmap = array->bitmap;
+       idx = find_next_bit(bitmap, MAX_PRIO, idx);
+       if (idx < MAX_PRIO) {
+               struct list_head *queue = array->queue + idx;
+
+               p = list_entry(queue->next, struct task_struct, run_list);
+               if (!TASK_INTERACTIVE(p))
+                       found_noninteractive = 1;
+
+               /* Skip non-starved queues. */
+               if (now < p->last_ran + starving) {
+                       idx++;
+                       p = NULL;
+                       goto repeat;
+               }
+       } else if (!found_noninteractive && array == rq->active) {
+               /* Nobody home, check the expired array. */
+               array = rq->expired;
+               idx = 0;
+               p = NULL;
+               goto repeat;
+       }
+
+       /* Found one, requeue it. */
+       if (p) {
+               dequeue_task(p, p->array);
+               if (array == rq->active)
+                       p->prio--;
+               /*
+                * If we pulled a task from the expired array, correct
+                * expired array info.  We can't afford a full search
+                * for best_expired_prio, but do the best we can.
+                */
+               else {
+                       idx = sched_find_first_bit(array->bitmap);
+                       if (idx < MAX_PRIO) {
+                               if (rq->best_expired_prio > idx)
+                                       rq->best_expired_prio = idx;
+                       } else {
+                               /* We emptied the array */
+                               rq->best_expired_prio = MAX_PRIO;
+                               rq->switch_timestamp = jiffies;
+                       }
+               }
+               enqueue_task(p, rq->active);
+       }
+}
+
 static void task_running_tick(struct rq *rq, struct task_struct *p)
 {
+       int task_was_interactive;
+
        if (p->array != rq->active) {
                /* Task has expired but was not scheduled yet */
                set_tsk_need_resched(p);
@@ -3161,21 +3241,41 @@ static void task_running_tick(struct rq 
                }
                goto out_unlock;
        }
+
+       /*
+        * Tick off interactive task ticks from the active array.
+        */
+       task_was_interactive = TASK_INTERACTIVE(p);
+       if (task_was_interactive && --rq->active->interactive_ticks < 0)
+               rq->active->interactive_ticks = 0;
+
        if (!--p->time_slice) {
+               int expire_limit = STARVATION_LIMIT;
+
                dequeue_task(p, rq->active);
                set_tsk_need_resched(p);
                p->prio = effective_prio(p);
                p->time_slice = task_timeslice(p);
                p->first_time_slice = 0;
 
-               if (!rq->expired_timestamp)
-                       rq->expired_timestamp = jiffies;
                if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
                        enqueue_task(p, rq->expired);
                        if (p->static_prio < rq->best_expired_prio)
                                rq->best_expired_prio = p->static_prio;
+                       if (!task_was_interactive)
+                               rq->expired_timestamp = jiffies;
                } else
                        enqueue_task(p, rq->active);
+
+               /*
+                * If we haven't expired a non-interactive task within
+                * STARVATION_LIMIT ticks, or the current interactive
+                * load exceeds INTERACTIVE_BACKLOG, start promoting
+                * lower priority tasks.
+                */
+               if (time_after(jiffies, rq->expired_timestamp + expire_limit) ||
+                               INTERACTIVE_BACKLOG_EXCEEDED(rq->active))
+                       promote_next_lower(rq, p->prio);
        } else {
                /*
                 * Prevent a too long timeslice allowing a task to monopolize
@@ -3356,7 +3456,7 @@ need_resched_nonpreemptible:
                idle_balance(cpu, rq);
                if (!rq->nr_running) {
                        next = rq->idle;
-                       rq->expired_timestamp = 0;
+                       rq->switch_timestamp = jiffies;
                        goto switch_tasks;
                }
        }
@@ -3370,7 +3470,8 @@ need_resched_nonpreemptible:
                rq->active = rq->expired;
                rq->expired = array;
                array = rq->active;
-               rq->expired_timestamp = 0;
+               array->interactive_ticks = 0;
+               rq->switch_timestamp = jiffies;
                rq->best_expired_prio = MAX_PRIO;
        }
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to