Steven asked for per-group periods in order to get closer to RMA (rate-monotonic)
or EDF (earliest-deadline-first) scheduling.

Use the fancy new hrtimers to provide a per-group period.
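
The period is expressed in microseconds: globally via the
/proc/sys/kernel/sched_rt_period_us sysctl (renamed from
sched_rt_period_ms, and now in us instead of ms), and per group via an
rt_period_us file in the cpu cgroup subsystem. A rough usage sketch
follows; the /dev/cgroup mount point and the "cpu." file prefix are
assumptions for illustration, not part of this patch:

  # set the global/default period to 0.5s
  echo 500000 > /proc/sys/kernel/sched_rt_period_us

  # give group 'foo' its own 0.25s period
  mkdir /dev/cgroup/foo
  echo 250000 > /dev/cgroup/foo/cpu.rt_period_us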

Signed-off-by: Peter Zijlstra <[EMAIL PROTECTED]>
---
 include/linux/sched.h    |    2 
 kernel/sched.c           |  229 ++++++++++++++++++++++++++++++++++++++++++-----
 kernel/sched_rt.c        |   61 ++++++------
 kernel/sysctl.c          |    2 
 kernel/time/tick-sched.c |    5 -
 5 files changed, 237 insertions(+), 62 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -177,6 +177,7 @@ struct task_group {
        struct rt_rq **rt_rq;
 
        unsigned int rt_ratio;
+       ktime_t rt_period;
 
        /*
         * shares assigned to a task group governs how much of cpu bandwidth
@@ -372,6 +373,7 @@ struct rt_rq {
 #endif
        int rt_throttled;
        u64 rt_time;
+       struct hrtimer rt_period_timer;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        struct rq *rq;
@@ -441,8 +443,6 @@ struct rq {
 
        struct cfs_rq cfs;
        struct rt_rq rt;
-       u64 rt_period_expire;
-       int rt_throttled;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        /* list of leaf cfs_rq on this cpu: */
@@ -595,23 +595,6 @@ static void update_rq_clock(struct rq *r
 #define task_rq(p)             cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
 
-unsigned long rt_needs_cpu(int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-       u64 delta;
-
-       if (!rq->rt_throttled)
-               return 0;
-
-       if (rq->clock > rq->rt_period_expire)
-               return 1;
-
-       delta = rq->rt_period_expire - rq->clock;
-       do_div(delta, NSEC_PER_SEC / HZ);
-
-       return (unsigned long)delta;
-}
-
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -652,10 +635,10 @@ const_debug unsigned int sysctl_sched_fe
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * period over which we measure -rt task cpu usage in ms.
+ * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
-const_debug unsigned int sysctl_sched_rt_period = 1000;
+const_debug unsigned int sysctl_sched_rt_period = 1000000;
 
 #define SCHED_RT_FRAC_SHIFT    16
 #define SCHED_RT_FRAC          (1UL << SCHED_RT_FRAC_SHIFT)
@@ -1245,6 +1228,12 @@ static unsigned long cpu_avg_load_per_ta
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #endif /* CONFIG_SMP */
 
+static inline ktime_t ns_to_ktime(u64 ns)
+{
+       static const ktime_t ktime_zero = { .tv64 = 0 };
+       return ktime_add_ns(ktime_zero, ns);
+}
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -3741,7 +3730,6 @@ void scheduler_tick(void)
        rq->tick_timestamp = rq->clock;
        update_cpu_load(rq);
        curr->sched_class->task_tick(rq, curr, 0);
-       update_sched_rt_period(rq);
        spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -5287,6 +5275,158 @@ static inline void sched_init_granularit
        sysctl_sched_batch_wakeup_granularity *= factor;
 }
 
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+       struct rt_rq *rt_rq =
+               container_of(timer, struct rt_rq, rt_period_timer);
+       struct rq *rq = rq_of_rt_rq(rt_rq);
+       ktime_t now = ktime_get();
+
+       WARN_ON(smp_processor_id() != cpu_of(rq));
+       WARN_ON(!in_irq());
+
+       spin_lock(&rq->lock);
+       update_sched_rt_period(rt_rq);
+       spin_unlock(&rq->lock);
+
+       hrtimer_forward(timer, now, sched_rt_period(rt_rq));
+       return HRTIMER_RESTART;
+}
+
+static void sched_rt_period_start(struct rt_rq *rt_rq)
+{
+       ktime_t period = sched_rt_period(rt_rq);
+
+       WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq)));
+
+       for (;;) {
+               ktime_t now = ktime_get();
+               hrtimer_forward(&rt_rq->rt_period_timer, now, period);
+               hrtimer_start(&rt_rq->rt_period_timer,
+                               rt_rq->rt_period_timer.expires,
+                               HRTIMER_MODE_ABS);
+               if (hrtimer_active(&rt_rq->rt_period_timer))
+                       break;
+       }
+}
+
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+static void sched_rt_period_stop(struct rt_rq *rt_rq)
+{
+       hrtimer_cancel(&rt_rq->rt_period_timer);
+}
+#endif
+
+static void sched_rt_period_start_cpu(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       struct rt_rq *rt_rq;
+
+       for_each_leaf_rt_rq(rt_rq, rq)
+               sched_rt_period_start(rt_rq);
+}
+
+#ifdef CONFIG_SMP
+static void sched_rt_period_stop_cpu(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       struct rt_rq *rt_rq;
+
+       for_each_leaf_rt_rq(rt_rq, rq)
+               sched_rt_period_stop(rt_rq);
+}
+
+static int sched_rt_period_hotplug(struct notifier_block *nfb,
+               unsigned long action, void *hcpu)
+{
+       int cpu = (unsigned long)hcpu;
+
+       switch (action) {
+       case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
+       case CPU_DOWN_FAILED:
+       case CPU_DOWN_FAILED_FROZEN:
+               sched_rt_period_start_cpu(cpu);
+               return NOTIFY_OK;
+
+       case CPU_DOWN_PREPARE:
+       case CPU_DOWN_PREPARE_FROZEN:
+       case CPU_UP_CANCELED:
+       case CPU_UP_CANCELED_FROZEN:
+               sched_rt_period_stop_cpu(cpu);
+               return NOTIFY_OK;
+
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+               return NOTIFY_OK;
+
+       default:
+               return NOTIFY_DONE;
+       }
+
+       return NOTIFY_OK;
+}
+
+static void __init __sched_rt_period_init(void *arg)
+{
+       int cpu = smp_processor_id();
+       sched_rt_period_start_cpu(cpu);
+}
+
+static void __init sched_rt_period_init(void)
+{
+       on_each_cpu(__sched_rt_period_init, NULL, 0, 1);
+       hotcpu_notifier(sched_rt_period_hotplug, 0);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void __sched_rt_period_init_tg(void *arg)
+{
+       struct task_group *tg = arg;
+       int cpu = smp_processor_id();
+
+       sched_rt_period_start(tg->rt_rq[cpu]);
+}
+
+static void sched_rt_period_init_tg(struct task_group *tg)
+{
+       on_each_cpu(__sched_rt_period_init_tg, tg, 0, 1);
+}
+
+static void __sched_rt_period_destroy_tg(void *arg)
+{
+       struct task_group *tg = arg;
+       int cpu = smp_processor_id();
+
+       sched_rt_period_stop(tg->rt_rq[cpu]);
+}
+
+static void sched_rt_period_destroy_tg(struct task_group *tg)
+{
+       on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1);
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#else /* CONFIG_SMP */
+static void __init sched_rt_period_init(void)
+{
+       sched_rt_period_start_cpu(0);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void sched_rt_period_init_tg(struct task_group *tg)
+{
+       sched_rt_period_start(tg->rt_rq[0]);
+}
+
+static void sched_rt_period_destroy_tg(struct task_group *tg)
+{
+       sched_rt_period_stop(tg->rt_rq[0]);
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_SMP */
+
 #ifdef CONFIG_SMP
 /*
  * This is how migration works:
@@ -7068,6 +7208,7 @@ void __init sched_init_smp(void)
        if (set_cpus_allowed(current, non_isolated_cpus) < 0)
                BUG();
        sched_init_granularity();
+       sched_rt_period_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        if (nr_cpu_ids == 1)
@@ -7088,6 +7229,7 @@ void __init sched_init_smp(void)
 void __init sched_init_smp(void)
 {
        sched_init_granularity();
+       sched_rt_period_init();
 }
 #endif /* CONFIG_SMP */
 
@@ -7131,6 +7273,11 @@ static void init_rt_rq(struct rt_rq *rt_
        rt_rq->rt_time = 0;
        rt_rq->rt_throttled = 0;
 
+       hrtimer_init(&rt_rq->rt_period_timer,
+                       CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       rt_rq->rt_period_timer.function = sched_rt_period_timer;
+       rt_rq->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
        rt_rq->rq = rq;
 #endif
@@ -7201,6 +7348,8 @@ void __init sched_init(void)
                                &per_cpu(init_sched_entity, i), i, 1);
 
                init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+               init_task_group.rt_period =
+                       ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
                INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
                init_tg_rt_entry(rq, &init_task_group,
                                &per_cpu(init_rt_rq, i),
@@ -7208,8 +7357,6 @@ void __init sched_init(void)
 
                list_add(&init_task_group.list, &task_groups);
 #endif
-               rq->rt_period_expire = 0;
-               rq->rt_throttled = 0;
 
                for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
                        rq->cpu_load[j] = 0;
@@ -7598,6 +7745,7 @@ struct task_group *sched_create_group(vo
 
        tg->shares = NICE_0_LOAD;
        tg->rt_ratio = 0; /* XXX */
+       tg->rt_period = ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
 
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
@@ -7637,6 +7785,8 @@ struct task_group *sched_create_group(vo
        list_add_rcu(&tg->list, &task_groups);
        unlock_task_group_list();
 
+       sched_rt_period_init_tg(tg);
+
        return tg;
 
 err:
@@ -7658,6 +7808,8 @@ void sched_destroy_group(struct task_gro
        struct rt_rq *rt_rq = NULL;
        int i;
 
+       sched_rt_period_destroy_tg(tg);
+
        lock_task_group_list();
        for_each_possible_cpu(i) {
                cfs_rq = tg->cfs_rq[i];
@@ -7815,6 +7967,19 @@ unsigned long sched_group_rt_ratio(struc
        return tg->rt_ratio;
 }
 
+int sched_group_set_rt_period(struct task_group *tg, unsigned long rt_period)
+{
+       tg->rt_period = ns_to_ktime((u64)rt_period * NSEC_PER_USEC);
+       return 0;
+}
+
+unsigned long sched_group_rt_period(struct task_group *tg)
+{
+       u64 ns = ktime_to_ns(tg->rt_period);
+       do_div(ns, NSEC_PER_USEC);
+       return ns;
+}
+
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7903,6 +8068,17 @@ static u64 cpu_rt_ratio_read_uint(struct
        return (u64) tg->rt_ratio;
 }
 
+static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+               u64 rt_period_val)
+{
+       return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_val);
+}
+
+static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+{
+       return (u64) sched_group_rt_period(cgroup_tg(cgrp));
+}
+
 static struct cftype cpu_files[] = {
        {
                .name = "shares",
@@ -7914,6 +8090,11 @@ static struct cftype cpu_files[] = {
                .read_uint = cpu_rt_ratio_read_uint,
                .write_uint = cpu_rt_ratio_write_uint,
        },
+       {
+               .name = "rt_period_us",
+               .read_uint = cpu_rt_period_read_uint,
+               .write_uint = cpu_rt_period_write_uint,
+       },
 };
 
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -230,8 +230,6 @@ static inline int select_nohz_load_balan
 }
 #endif
 
-extern unsigned long rt_needs_cpu(int cpu);
-
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -65,6 +65,17 @@ static inline unsigned int sched_rt_rati
        return rt_rq->tg->rt_ratio;
 }
 
+static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
+{
+       BUG_ON(!rt_rq->tg);
+       return rt_rq->tg->rt_period;
+}
+
+static inline u64 sched_rt_period_ns(struct rt_rq *rt_rq)
+{
+       return ktime_to_ns(sched_rt_period(rt_rq));
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
        list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 
@@ -117,6 +128,16 @@ static inline unsigned int sched_rt_rati
        return sysctl_sched_rt_ratio;
 }
 
+static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
+{
+       return ns_to_ktime((u64)sysctl_sched_rt_period * NSEC_PER_USEC);
+}
+
+static inline u64 sched_rt_period_ns(struct rt_rq *rt_rq)
+{
+       return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
        for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 
@@ -174,15 +195,11 @@ static int sched_rt_ratio_exceeded(struc
        if (rt_rq->rt_throttled)
                return 1;
 
-       period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+       period = sched_rt_period_ns(rt_rq);
        ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
        if (rt_rq->rt_time > ratio) {
-               struct rq *rq = rq_of_rt_rq(rt_rq);
-
-               rq->rt_throttled = 1;
                rt_rq->rt_throttled = 1;
-
                sched_rt_ratio_dequeue(rt_rq);
                return 1;
        }
@@ -190,27 +207,16 @@ static int sched_rt_ratio_exceeded(struc
        return 0;
 }
 
-static void update_sched_rt_period(struct rq *rq)
+static void update_sched_rt_period(struct rt_rq *rt_rq)
 {
-       struct rt_rq *rt_rq;
-       u64 period;
-
-       while (rq->clock > rq->rt_period_expire) {
-               period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
-               rq->rt_period_expire += period;
-
-               for_each_leaf_rt_rq(rt_rq, rq) {
-                       unsigned long rt_ratio = sched_rt_ratio(rt_rq);
-                       u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-                       rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
-                       if (rt_rq->rt_throttled) {
-                               rt_rq->rt_throttled = 0;
-                               sched_rt_ratio_enqueue(rt_rq);
-                       }
-               }
-
-               rq->rt_throttled = 0;
+       u64 period = sched_rt_period_ns(rt_rq);
+       unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+       u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+       rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+       if (rt_rq->rt_throttled) {
+               rt_rq->rt_throttled = 0;
+               sched_rt_ratio_enqueue(rt_rq);
        }
 }
 
@@ -238,11 +244,6 @@ static void update_curr_rt(struct rq *rq
        cpuacct_charge(curr, delta_exec);
 
        rt_rq->rt_time += delta_exec;
-       /*
-        * might make it a tad more accurate:
-        *
-        * update_sched_rt_period(rq);
-        */
        if (sched_rt_ratio_exceeded(rt_rq))
                resched_task(curr);
 }
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -311,7 +311,7 @@ static struct ctl_table kern_table[] = {
        },
        {
                .ctl_name       = CTL_UNNUMBERED,
-               .procname       = "sched_rt_period_ms",
+               .procname       = "sched_rt_period_us",
                .data           = &sysctl_sched_rt_period,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -153,7 +153,6 @@ void tick_nohz_update_jiffies(void)
 void tick_nohz_stop_sched_tick(void)
 {
        unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
-       unsigned long rt_jiffies;
        struct tick_sched *ts;
        ktime_t last_update, expires, now, delta;
        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -217,10 +216,6 @@ void tick_nohz_stop_sched_tick(void)
        next_jiffies = get_next_timer_interrupt(last_jiffies);
        delta_jiffies = next_jiffies - last_jiffies;
 
-       rt_jiffies = rt_needs_cpu(cpu);
-       if (rt_jiffies && rt_jiffies < delta_jiffies)
-               delta_jiffies = rt_jiffies;
-
        if (rcu_needs_cpu(cpu))
                delta_jiffies = 1;
        /*

--
