I did something like this on top; please have a look at the XXX comment and
integrate. It factors the vruntime-normalization rules out into a single
vruntime_normalized() helper, drops the now-unused 'queued' argument from
detach_task_cfs_rq() and attach_task_cfs_rq(), and adds !CONFIG_SMP stubs for
attach/detach_entity_load_avg() so the CONFIG_SMP #ifdefs around their call
sites can go away.

---

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2664,8 +2664,8 @@ static inline u64 cfs_rq_clock_task(stru
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
-       int decayed;
        struct sched_avg *sa = &cfs_rq->avg;
+       int decayed;
 
        if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
@@ -2695,15 +2695,16 @@ static inline int update_cfs_rq_load_avg
 static inline void update_load_avg(struct sched_entity *se, int update_tg)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       int cpu = cpu_of(rq_of(cfs_rq));
        u64 now = cfs_rq_clock_task(cfs_rq);
+       int cpu = cpu_of(rq_of(cfs_rq));
 
        /*
         * Track task load average for carrying it to new CPU after migrated, and
         * track group sched_entity load average for task_h_load calc in migration
         */
        __update_load_avg(now, cpu, &se->avg,
-               se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
+                         se->on_rq * scale_load_down(se->load.weight),
+                         cfs_rq->curr == se, NULL);
 
        if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
                update_tg_load_avg(cfs_rq, 0);
@@ -2718,9 +2719,10 @@ static void attach_entity_load_avg(struc
         * update here. we have to update it with prev cfs_rq just before changing
         * se's cfs_rq, and get here soon.
         */
-       if (se->avg.last_update_time)
+       if (se->avg.last_update_time) {
                __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-                               &se->avg, 0, 0, NULL);
+                                 &se->avg, 0, 0, NULL);
+       }
 
        se->avg.last_update_time = cfs_rq->avg.last_update_time;
        cfs_rq->avg.load_avg += se->avg.load_avg;
@@ -2732,17 +2734,13 @@ static void attach_entity_load_avg(struc
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-                       &se->avg, se->on_rq * scale_load_down(se->load.weight),
-                       cfs_rq->curr == se, NULL);
+                         &se->avg, se->on_rq * scale_load_down(se->load.weight),
+                         cfs_rq->curr == se, NULL);
 
-       cfs_rq->avg.load_avg =
-               max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-       cfs_rq->avg.load_sum =
-               max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-       cfs_rq->avg.util_avg =
-               max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-       cfs_rq->avg.util_sum =
-               max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+       cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
+       cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
+       cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
+       cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
 }
 
 /* Add the load generated by se into cfs_rq's load average */
@@ -2751,14 +2749,14 @@ enqueue_entity_load_avg(struct cfs_rq *c
 {
        struct sched_avg *sa = &se->avg;
        u64 now = cfs_rq_clock_task(cfs_rq);
-       int migrated = 0, decayed;
+       int migrated, decayed;
 
-       if (sa->last_update_time == 0)
-               migrated = 1;
-       else
+       migrated = !sa->last_update_time;
+       if (!migrated) {
                __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
                        se->on_rq * scale_load_down(se->load.weight),
                        cfs_rq->curr == se, NULL);
+       }
 
        decayed = update_cfs_rq_load_avg(now, cfs_rq);
 
@@ -2781,7 +2779,7 @@ dequeue_entity_load_avg(struct cfs_rq *c
        cfs_rq->runnable_load_avg =
                max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
        cfs_rq->runnable_load_sum =
-               max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
+               max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
 }
 
 /*
@@ -2849,6 +2847,11 @@ static inline void
 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void remove_entity_load_avg(struct sched_entity *se) {}
 
+static inline void
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+
 static inline int idle_balance(struct rq *rq)
 {
        return 0;
@@ -7915,29 +7918,50 @@ prio_changed_fair(struct rq *rq, struct
                check_preempt_curr(rq, p, 0);
 }
 
-static void detach_task_cfs_rq(struct task_struct *p, int queued)
+static inline bool vruntime_normalized(struct task_struct *p)
 {
+       int queued = task_on_rq_queued(p);
        struct sched_entity *se = &p->se;
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
        /*
-        * Ensure the task's vruntime is normalized, so that after attaching
-        * back it to a cfs_rq the enqueue_entity() will do the right thing.
-        *
         * If it's queued, then the dequeue_entity(.flags=0) will already
-        * have normalized the vruntime, if it's !queued, then only when
-        * the task is sleeping it still has a non-normalized vruntime.
-        *
+        * have normalized the vruntime.
+        */
+       if (queued)
+               return true;
+
+       /*
         * When !queued, vruntime of the task has usually NOT been normalized.
         * But there are some cases where it has already been normalized:
-        *
-        * - Moving a forked child which is waiting for being woken up by
+        */
+
+       /*
+        * - a forked child that is waiting to be woken up by
         *   wake_up_new_task().
-        * - Moving a task which has been woken up by try_to_wake_up() and
+        */
+       if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+               return true;
+
+       /*
+        * - a task that has been woken up by try_to_wake_up() and is
         *   waiting for actually being woken up by sched_ttwu_pending().
+        *
+        * XXX this appears wrong! Check the history: we appear to always
+        * set queued and RUNNING under the same lock instance; might be
+        * from before TASK_WAKING?
         */
-       if (!queued && p->state != TASK_RUNNING &&
-           se->sum_exec_runtime && p->state != TASK_WAKING) {
+       if (p->state == TASK_RUNNING)
+               return true;
+
+       return false;
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+       struct sched_entity *se = &p->se;
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+       if (!vruntime_normalized(p)) {
                /*
                 * Fix up our vruntime so that the current sleep doesn't
                 * cause 'unlimited' sleep bonus.
@@ -7946,10 +7970,8 @@ static void detach_task_cfs_rq(struct ta
                se->vruntime -= cfs_rq->min_vruntime;
        }
 
-#ifdef CONFIG_SMP
        /* Catch up with the cfs_rq and remove our load when we leave */
        detach_entity_load_avg(cfs_rq, se);
-#endif
 }
 
-static void attach_task_cfs_rq(struct task_struct *p, int queued)
+static void attach_task_cfs_rq(struct task_struct *p)
@@ -7965,49 +7987,23 @@ static void attach_task_cfs_rq(struct ta
        se->depth = se->parent ? se->parent->depth + 1 : 0;
 #endif
 
-#ifdef CONFIG_SMP
        /* synchronize task with its cfs_rq */
        attach_entity_load_avg(cfs_rq, se);
-#endif
 
-       /*
-        * Ensure the task has a non-normalized vruntime when attaching it
-        * to a cfs_rq, so that enqueue_entity() at wake-up time will do
-        * the right thing.
-        *
-        * If it's queued, then the enqueue_entity(.flags=0) makes the task
-        * has non-normalized vruntime, if it's !queued, then it still has
-        * a normalized vruntime.
-        *
-        * When !queued, a task usually should have a non-normalized vruntime
-        * after being attached to a cfs_rq. But there are some cases where
-        * we should keep it normalized:
-        *
-        * - Moving a forked child which is waiting for being woken up by
-        *   wake_up_new_task().
-        * - Moving a task which has been woken up by try_to_wake_up() and
-        *   waiting for actually being woken up by sched_ttwu_pending().
-        */
-       if (!queued && p->state != TASK_RUNNING &&
-           se->sum_exec_runtime && p->state != TASK_WAKING)
+       if (!vruntime_normalized(p))
                se->vruntime += cfs_rq->min_vruntime;
 }
 
 static void switched_from_fair(struct rq *rq, struct task_struct *p)
 {
-       detach_task_cfs_rq(p, task_on_rq_queued(p));
+       detach_task_cfs_rq(p);
 }
 
-/*
- * We switched to the sched_fair class.
- */
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
-       int queued = task_on_rq_queued(p);
-
-       attach_task_cfs_rq(p, queued);
+       attach_task_cfs_rq(p);
 
-       if (queued) {
+       if (task_on_rq_queued(p)) {
                /*
                 * We were most likely switched from sched_rt, so
                 * kick off the schedule if running, otherwise just see
@@ -8056,8 +8052,7 @@ static void task_move_group_fair(struct
 {
-       detach_task_cfs_rq(p, queued);
+       detach_task_cfs_rq(p);
        set_task_rq(p, task_cpu(p));
-
-       /* tell se's cfs_rq has been changed */
+       /* tell se's cfs_rq has been changed -- migrated */
        p->se.avg.last_update_time = 0;
-       attach_task_cfs_rq(p, queued);
+       attach_task_cfs_rq(p);
 }
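
Not part of the patch, just to make the vruntime bits concrete: detach turns
the task's vruntime into a lag relative to the old cfs_rq's min_vruntime,
attach rebases it onto the new cfs_rq, so a stale absolute vruntime can't
become an 'unlimited' sleep bonus. A toy userspace sketch of that invariant
follows -- made-up types and numbers, only the -=/+= min_vruntime steps
mirror the detach/attach paths above:

#include <stdio.h>

typedef unsigned long long u64;

struct toy_cfs_rq { u64 min_vruntime; };
struct toy_se { u64 vruntime; };

/* mirrors se->vruntime -= cfs_rq->min_vruntime in detach_task_cfs_rq() */
static void toy_detach(struct toy_cfs_rq *cfs_rq, struct toy_se *se)
{
        se->vruntime -= cfs_rq->min_vruntime;   /* absolute -> lag */
}

/* mirrors se->vruntime += cfs_rq->min_vruntime in attach_task_cfs_rq() */
static void toy_attach(struct toy_cfs_rq *cfs_rq, struct toy_se *se)
{
        se->vruntime += cfs_rq->min_vruntime;   /* lag -> absolute */
}

int main(void)
{
        struct toy_cfs_rq src = { .min_vruntime = 1000000 };
        struct toy_cfs_rq dst = { .min_vruntime = 5000 };
        struct toy_se se = { .vruntime = 1000300 };     /* lag of 300 over src */

        toy_detach(&src, &se);          /* se.vruntime == 300 */
        toy_attach(&dst, &se);          /* se.vruntime == 5300 */

        /*
         * The 300 units of lag survive the move; without renormalizing,
         * the task would land on dst with an absolute vruntime of 1000300
         * and be treated as having run far more than anyone there (or,
         * moving to a queue with a larger min_vruntime, collect a huge
         * sleep bonus).
         */
        printf("vruntime on dst: %llu\n", se.vruntime);
        return 0;
}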