Commit-ID:  b7fa30c9cc48c4f55663420472505d3b4f6e1705
Gitweb:     http://git.kernel.org/tip/b7fa30c9cc48c4f55663420472505d3b4f6e1705
Author:     Peter Zijlstra <pet...@infradead.org>
AuthorDate: Thu, 9 Jun 2016 15:07:50 +0200
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Tue, 14 Jun 2016 10:58:34 +0200

sched/fair: Fix post_init_entity_util_avg() serialization

Chris Wilson reported a divide by 0 at:

 post_init_entity_util_avg():

 >    725       if (cfs_rq->avg.util_avg != 0) {
 >    726               sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
 > -> 727               sa->util_avg /= (cfs_rq->avg.load_avg + 1);
 >    728
 >    729               if (sa->util_avg > cap)
 >    730                       sa->util_avg = cap;
 >    731       } else {

Which, given the lack of serialization and the code generated from
update_cfs_rq_load_avg(), is entirely possible:

        if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
                sa->load_avg = max_t(long, sa->load_avg - r, 0);
                sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
                removed_load = 1;
        }

turns into:

  ffffffff81087064:       49 8b 85 98 00 00 00    mov    0x98(%r13),%rax
  ffffffff8108706b:       48 85 c0                test   %rax,%rax
  ffffffff8108706e:       74 40                   je     ffffffff810870b0
  ffffffff81087070:       4c 89 f8                mov    %r15,%rax
  ffffffff81087073:       49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
  ffffffff8108707a:       49 29 45 70             sub    %rax,0x70(%r13)
  ffffffff8108707e:       4c 89 f9                mov    %r15,%rcx
  ffffffff81087081:       bb 01 00 00 00          mov    $0x1,%ebx
  ffffffff81087086:       49 83 7d 70 00          cmpq   $0x0,0x70(%r13)
  ffffffff8108708b:       49 0f 49 4d 70          cmovns 0x70(%r13),%rcx

You'll note this leaves the unclamped 'sa->load_avg - r' sitting in
memory at ffffffff8108707a, with the clamp to 0 only applied later in a
register (the cmovns at ffffffff8108708b). An unserialized reader that
hits this window while r > sa->load_avg sees the negative intermediate
value as a huge unsigned one; observing exactly (unsigned long)-1 turns
the 'cfs_rq->avg.load_avg + 1' divisor into 0.
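
For illustration only, a minimal userspace sketch of the same window
(pthreads, compile with -pthread; not kernel code): 'load_avg' stands
in for cfs_rq->avg.load_avg, 'r' for the removed load, and the reader
mimics the 'load_avg + 1' divisor. The window is only a few
instructions wide, so this rarely fires in practice, but the shape of
the race is the same:

  #include <pthread.h>
  #include <stdio.h>

  /* Stand-in for cfs_rq->avg.load_avg. */
  static volatile unsigned long load_avg = 10;

  static void *remover(void *arg)
  {
          unsigned long r = 11;   /* more than load_avg */
          long tmp;

          (void)arg;
          /*
           * Mimics the compiled code above: the unclamped
           * 'load_avg - r' (here (unsigned long)-1) is visible in
           * memory before the clamp writes 0 back.
           */
          load_avg -= r;                  /* the race window opens */
          tmp = (long)load_avg;
          load_avg = tmp < 0 ? 0 : tmp;   /* the cmovns clamp */
          return NULL;
  }

  static void *reader(void *arg)
  {
          /* Unserialized read, as in post_init_entity_util_avg(). */
          unsigned long divisor = load_avg + 1;

          (void)arg;
          if (divisor == 0)
                  printf("observed -1: would divide by 0\n");
          else
                  printf("util_avg = %lu\n", 1024 / divisor);
          return NULL;
  }

  int main(void)
  {
          pthread_t a, b;

          pthread_create(&a, NULL, remover, NULL);
          pthread_create(&b, NULL, reader, NULL);
          pthread_join(a, NULL);
          pthread_join(b, NULL);
          return 0;
  }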

By calling post_init_entity_util_avg() under rq->lock we're sure to be
fully serialized against PELT updates and cannot observe intermediate
state like this.
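
In the same userspace terms, the fix amounts to taking one common lock
around both the sub+clamp and the read; a pthread mutex stands in for
rq->lock here (again an illustrative sketch, not the kernel code):

  #include <pthread.h>

  static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
  static unsigned long load_avg = 10;

  /* Writer: sub + clamp now complete under the lock, so the
   * intermediate (unsigned long)-1 is never visible outside it. */
  static void remove_load(unsigned long r)
  {
          long tmp;

          pthread_mutex_lock(&rq_lock);
          tmp = (long)(load_avg - r);
          load_avg = tmp < 0 ? 0 : tmp;
          pthread_mutex_unlock(&rq_lock);
  }

  /* Reader: the post_init_entity_util_avg() equivalent takes the
   * same lock, so 'load_avg + 1' can never be observed as 0. */
  static unsigned long init_util(unsigned long weight)
  {
          unsigned long util;

          pthread_mutex_lock(&rq_lock);
          util = weight / (load_avg + 1);
          pthread_mutex_unlock(&rq_lock);
          return util;
  }

  int main(void)
  {
          remove_load(11);
          return (int)init_util(1024);
  }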

Reported-by: Chris Wilson <ch...@chris-wilson.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Cc: Andrey Ryabinin <aryabi...@virtuozzo.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Yuyang Du <yuyang...@intel.com>
Cc: bseg...@google.com
Cc: morten.rasmus...@arm.com
Cc: p...@google.com
Cc: steve.muc...@linaro.org
Fixes: 2b8c41daba32 ("sched/fair: Initiate a new task's util avg to a bounded value")
Link: http://lkml.kernel.org/r/20160609130750.gq30...@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 kernel/sched/core.c | 3 +--
 kernel/sched/fair.c | 8 +++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 017d539..13d0896 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2535,10 +2535,9 @@ void wake_up_new_task(struct task_struct *p)
         */
        set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
-       /* Post initialize new task's util average when its cfs_rq is set */
+       rq = __task_rq_lock(p, &rf);
        post_init_entity_util_avg(&p->se);
 
-       rq = __task_rq_lock(p, &rf);
        activate_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
        trace_sched_wakeup_new(p);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 218f8e8..4e33ad1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8496,8 +8496,9 @@ void free_fair_sched_group(struct task_group *tg)
 
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
-       struct cfs_rq *cfs_rq;
        struct sched_entity *se;
+       struct cfs_rq *cfs_rq;
+       struct rq *rq;
        int i;
 
        tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8512,6 +8513,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
        init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
        for_each_possible_cpu(i) {
+               rq = cpu_rq(i);
+
                cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
                                      GFP_KERNEL, cpu_to_node(i));
                if (!cfs_rq)
@@ -8525,7 +8528,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                init_cfs_rq(cfs_rq);
                init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
                init_entity_runnable_average(se);
+
+               raw_spin_lock_irq(&rq->lock);
                post_init_entity_util_avg(se);
+               raw_spin_unlock_irq(&rq->lock);
        }
 
        return 1;
