[PATCH 1/2] core scheduler changes

Srivatsa Vaddagiri Thu, 25 Jan 2007 22:06:35 -0800

This patch does several things:

        - Introduces the notion of control window (current set at 1
          sec - ideally the window size should be adjusted based on
          number of users to avoid rapid context switches). Bandwidth of each 
          user is controlled within this window.  rq->last_update tracks where 
          are in the current window.


        - Modifies scheduler_tick() to account cpu bandwidth consumption
          by a task group. Basically bandwidth consumed by a task is
          charged to itself (p->time_slice) -and- to its group as well.

        - A task is forced off the CPU once its group has expired the
          bandwidth in the current control window. Such a task is also
          marked as "starving".

        - schedule() avoids picking tasks whose group has expired its
          bandwidth in current control window. Any task (with non-zero
          p->timeslice) which is not picked to run in schedule() because of 
          this reason is marked "starving".
        
        - If a group has bandwidth left and it has starving tasks, then 
          schedule() prefers picking such tasks over non-starving tasks.
          This will avoid starvation of lower-priority tasks in a group.


Signed-off-by : Srivatsa Vaddagiri <[EMAIL PROTECTED]>

---


diff -puN include/linux/sched.h~cpu-controller-based-on-rtlimit_rt_cpu-patch 
include/linux/sched.h
--- 
linux-2.6.20-rc5/include/linux/sched.h~cpu-controller-based-on-rtlimit_rt_cpu-patch
 2007-01-19 15:17:27.000000000 +0530
+++ linux-2.6.20-rc5-vatsa/include/linux/sched.h        2007-01-26 
09:04:07.000000000 +0530
@@ -531,6 +531,10 @@ struct signal_struct {
 #define is_rt_policy(p)                ((p) != SCHED_NORMAL && (p) != 
SCHED_BATCH)
 #define has_rt_policy(p)       unlikely(is_rt_policy((p)->policy))
 
+#ifdef CONFIG_FAIRSCHED
+struct cpu_usage;
+#endif
+
 /*
  * Some day this will be a full-fledged user tracking system..
  */
@@ -555,6 +559,10 @@ struct user_struct {
        /* Hash table maintenance information */
        struct list_head uidhash_list;
        uid_t uid;
+#ifdef CONFIG_FAIRSCHED
+       int cpu_limit;
+       struct cpu_usage *cpu_usage;
+#endif
 };
 
 extern struct user_struct *find_user(uid_t);
@@ -1137,6 +1145,9 @@ static inline void put_task_struct(struc
                                        /* Not implemented yet, only for 486*/
 #define PF_STARTING    0x00000002      /* being created */
 #define PF_EXITING     0x00000004      /* getting shut down */
+#ifdef CONFIG_FAIRSCHED
+#define PF_STARVING    0x00000010      /* Task starving for CPU */
+#endif
 #define PF_FORKNOEXEC  0x00000040      /* forked but didn't exec */
 #define PF_SUPERPRIV   0x00000100      /* used super-user privileges */
 #define PF_DUMPCORE    0x00000200      /* dumped core */
diff -puN kernel/sched.c~cpu-controller-based-on-rtlimit_rt_cpu-patch 
kernel/sched.c
--- 
linux-2.6.20-rc5/kernel/sched.c~cpu-controller-based-on-rtlimit_rt_cpu-patch    
    2007-01-19 15:17:27.000000000 +0530
+++ linux-2.6.20-rc5-vatsa/kernel/sched.c       2007-01-26 09:04:07.000000000 
+0530
@@ -266,6 +266,9 @@ struct rq {
        unsigned long ttwu_local;
 #endif
        struct lock_class_key rq_lock_key;
+#ifdef CONFIG_FAIRSCHED
+       unsigned long last_update;
+#endif
 };
 
 static DEFINE_PER_CPU(struct rq, runqueues);
@@ -710,6 +713,126 @@ enqueue_task_head(struct task_struct *p,
        p->array = array;
 }
 
+#ifdef CONFIG_FAIRSCHED
+
+struct cpu_usage {
+       long tokens;
+       unsigned long last_update;
+       int starve_count;
+};
+
+#define task_starving(p)       (p->flags & PF_STARVING)
+
+/* Mark a task starving - either we shortcircuited its timeslice or we didnt
+ * pick it to run (because user ran out of bandwidth limit in current epoch).
+ */
+static inline void set_tsk_starving(struct task_struct *p)
+{
+       struct user_struct *user = p->user;
+       struct cpu_usage *cu;
+
+       if (task_starving(p) || !user->cpu_limit)
+               return;
+
+       cu = per_cpu_ptr(user->cpu_usage, task_cpu(p));
+       cu->starve_count++;
+       p->flags |= PF_STARVING;
+}
+
+/* Clear a task's starving flag */
+static inline void clear_tsk_starving(struct task_struct *p)
+{
+       struct user_struct *user = p->user;
+       struct cpu_usage *cu;
+
+       if (!task_starving(p) || !user->cpu_limit)
+               return;
+
+       cu = per_cpu_ptr(user->cpu_usage, task_cpu(p));
+       cu->starve_count--;
+       p->flags &= ~PF_STARVING;
+}
+
+/* Does the task's group have starving tasks? */
+static inline int is_user_starving(struct task_struct *p)
+{
+       struct user_struct *user = p->user;
+       struct cpu_usage *cu;
+
+       if (!user->cpu_limit)
+               return 0;
+
+       cu = per_cpu_ptr(user->cpu_usage, task_cpu(p));
+       if (cu->starve_count)
+               return 1;
+
+       return 0;
+}
+
+/* Are we past the 1-sec control window? If so, all groups get to renew their
+ * expired tokens.
+ */
+static inline void adjust_control_window(void)
+{
+       struct rq *rq = this_rq();
+       unsigned long delta;
+
+       delta = jiffies - rq->last_update;
+       if (delta >= HZ)
+               rq->last_update += (delta/HZ) * HZ;
+}
+
+/* Account group's cpu usage */
+static inline void inc_cpu_usage(struct task_struct *p)
+{
+       struct user_struct *user = p->user;
+       struct cpu_usage *cu;
+
+       adjust_control_window();
+
+       if (!user->cpu_limit)
+               return;
+
+       cu = per_cpu_ptr(user->cpu_usage, task_cpu(p));
+       cu->tokens--;
+}
+
+static inline int task_over_cpu_limit(struct task_struct *p)
+{
+       struct rq *rq = task_rq(p);
+       struct user_struct *user = p->user;
+       struct cpu_usage *cu;
+
+       adjust_control_window();
+
+       if (!user->cpu_limit)
+               return 0;
+
+       cu = per_cpu_ptr(user->cpu_usage, task_cpu(p));
+       if (cu->last_update != rq->last_update) {
+               /* Replenish tokens */
+               cu->tokens += user->cpu_limit * HZ / 100;
+               cu->last_update = rq->last_update;
+       }
+
+       if (cu->tokens <= 0)
+               return 1;
+
+       return 0;
+}
+
+#else
+
+#define task_starving(p)       0
+
+static void inc_cpu_usage(struct task_struct *p) { }
+static int task_over_cpu_limit(struct task_struct *p) { return 0; }
+static void set_tsk_starving(struct task_struct *p) { }
+static void clear_tsk_starving(struct task_struct *p) { }
+static int is_user_starving(struct task_struct *p) { return 0;}
+
+#endif         /* CONFIG_FAIRSCHED */
+
 /*
  * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
@@ -1607,6 +1730,9 @@ void fastcall sched_fork(struct task_str
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
 #endif
+#ifdef CONFIG_FAIRSCHED
+       p->flags &= ~PF_STARVING;
+#endif
        /*
         * Share the timeslice between parent and child, thus the
         * total amount of pending timeslices in the system doesn't change,
@@ -2074,6 +2200,7 @@ static void pull_task(struct rq *src_rq,
 {
        dequeue_task(p, src_array);
        dec_nr_running(p, src_rq);
+       clear_tsk_starving(p);
        set_task_cpu(p, this_cpu);
        inc_nr_running(p, this_rq);
        enqueue_task(p, this_array);
@@ -3137,6 +3264,9 @@ static void task_running_tick(struct rq 
                return;
        }
        spin_lock(&rq->lock);
+
+       inc_cpu_usage(p);
+
        /*
         * The task was running during this tick - update the
         * time slice counter. Note: we do not update a thread's
@@ -3163,17 +3293,18 @@ static void task_running_tick(struct rq 
                dequeue_task(p, rq->active);
                set_tsk_need_resched(p);
                p->prio = effective_prio(p);
-               p->time_slice = task_timeslice(p);
                p->first_time_slice = 0;
 
                if (!rq->expired_timestamp)
                        rq->expired_timestamp = jiffies;
-               if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
+               if (!TASK_INTERACTIVE(p) || expired_starving(rq)
+                                               || task_over_cpu_limit(p)) {
                        enqueue_task(p, rq->expired);
                        if (p->static_prio < rq->best_expired_prio)
                                rq->best_expired_prio = p->static_prio;
                } else
                        enqueue_task(p, rq->active);
+               goto out_unlock;
        } else {
                /*
                 * Prevent a too long timeslice allowing a task to monopolize
@@ -3200,6 +3331,14 @@ static void task_running_tick(struct rq 
                        set_tsk_need_resched(p);
                }
        }
+
+       if (task_over_cpu_limit(p)) {
+               dequeue_task(p, rq->active);
+               set_tsk_need_resched(p);
+               enqueue_task(p, rq->expired);
+               set_tsk_starving(p);
+       }
+
 out_unlock:
        spin_unlock(&rq->lock);
 }
@@ -3416,7 +3555,7 @@ asmlinkage void __sched schedule(void)
        struct list_head *queue;
        unsigned long long now;
        unsigned long run_time;
-       int cpu, idx, new_prio;
+       int cpu, idx, new_prio, array_switch;
        long *switch_count;
        struct rq *rq;
 
@@ -3478,6 +3617,7 @@ need_resched_nonpreemptible:
                else {
                        if (prev->state == TASK_UNINTERRUPTIBLE)
                                rq->nr_uninterruptible++;
+                       clear_tsk_starving(prev);
                        deactivate_task(prev, rq);
                }
        }
@@ -3493,11 +3633,15 @@ need_resched_nonpreemptible:
                }
        }
 
+       array_switch = 0;
+
+pick_next_task:
        array = rq->active;
        if (unlikely(!array->nr_active)) {
                /*
                 * Switch the active and expired arrays.
                 */
+               array_switch++;
                schedstat_inc(rq, sched_switch);
                rq->active = rq->expired;
                rq->expired = array;
@@ -3510,6 +3654,25 @@ need_resched_nonpreemptible:
        queue = array->queue + idx;
        next = list_entry(queue->next, struct task_struct, run_list);
 
+       /* If we have done an array switch twice, it means we cant find any
+        * task which isn't above its limit and hence we just run the
+        * first task on the active array.
+        */
+       if (array_switch < 2 && (task_over_cpu_limit(next) ||
+                       (!task_starving(next) && is_user_starving(next)))) {
+               dequeue_task(next, rq->active);
+               enqueue_task(next, rq->expired);
+               if (next->time_slice)
+                       set_tsk_starving(next);
+               goto pick_next_task;
+       }
+
+       if (task_over_cpu_limit(next))
+               rq->last_update = jiffies;
+       if (!next->time_slice)
+               next->time_slice = task_timeslice(next);
+       clear_tsk_starving(next);
+
        if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
                unsigned long long delta = now - next->timestamp;
                if (unlikely((long long)(now - next->timestamp) < 0))
@@ -5061,6 +5224,7 @@ static int __migrate_task(struct task_st
        if (!cpu_isset(dest_cpu, p->cpus_allowed))
                goto out;
 
+       clear_tsk_starving(p);
        set_task_cpu(p, dest_cpu);
        if (p->array) {
                /*
_

-- 
Regards,
vatsa
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/2] core scheduler changes

Reply via email to