> +static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>  {
> +     int decayed;
>  
> +     if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> +             long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> +             cfs_rq->avg.load_avg = subtract_until_zero(cfs_rq->avg.load_avg, r);
> +             r *= LOAD_AVG_MAX;
> +             cfs_rq->avg.load_sum = subtract_until_zero(cfs_rq->avg.load_sum, r);
>       }
>  
> +     decayed = __update_load_avg(now, &cfs_rq->avg, cfs_rq->load.weight);
>  
> +#ifndef CONFIG_64BIT
> +     if (cfs_rq->avg.last_update_time != cfs_rq->load_last_update_time_copy) {
> +             smp_wmb();
> +             cfs_rq->load_last_update_time_copy = cfs_rq->avg.last_update_time;
> +     }
> +#endif
>  
> +     return decayed;
> +}

So on every cfs_rq update we first process the 'pending' removals, then
decay, and then store the current timestamp.
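
Presumably subtract_until_zero() is just a clamped subtract, so a
removal racing ahead of what has accrued locally can't underflow the
averages. Its body isn't in this hunk, so this is only a guess:

	/*
	 * Hypothetical sketch of subtract_until_zero(); types simplified,
	 * the real helper would have to cover both the unsigned long
	 * load_avg and the u64 load_sum.
	 */
	static inline u64 subtract_until_zero(u64 x, u64 r)
	{
		return x > r ? x - r : 0;
	}

And the r *= LOAD_AVG_MAX converts the removed load_avg into load_sum
units, LOAD_AVG_MAX (47742) being the maximum of the geometric series.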

> +static inline void enqueue_entity_load_avg(struct sched_entity *se)
>  {
> +     struct sched_avg *sa = &se->avg;
> +     struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +     u64 now = cfs_rq_clock_task(cfs_rq);
> +     int migrated = 0, decayed;
>  
> +     if (sa->last_update_time == 0) {
> +             sa->last_update_time = now;
>  
> +             if (entity_is_task(se))
> +                     migrated = 1;
>       }
> +     else
> +             __update_load_avg(now, sa, se->on_rq * se->load.weight);
>  
> +     decayed = update_cfs_rq_load_avg(now, cfs_rq);
>  
> +     if (migrated) {
> +             cfs_rq->avg.load_avg += sa->load_avg;
> +             cfs_rq->avg.load_sum += sa->load_sum;
>       }
>  
> +     if (decayed || migrated)
> +             update_tg_load_avg(cfs_rq);
>  }

On enqueue we add ourselves to the cfs_rq, and assume the entity is
'current' w.r.t. updates, since we updated it when we pulled it from
the old rq.
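
For reference, the decay __update_load_avg() applies is the usual
geometric y^n with y^32 = 1/2 over ~1ms periods, so a contribution
halves every 32ms. Schematically (floating point for illustration
only; the kernel uses precomputed fixed-point inverses):

	#include <stdint.h>

	/* Decay val over 'periods' ~1ms periods; since y^32 = 1/2 we can
	 * shift out whole half-lives and only multiply for the rest. */
	static uint64_t decay_load_sketch(uint64_t val, unsigned int periods)
	{
		const double y = 0.97857206;	/* 2^(-1/32) */
		unsigned int halves = periods / 32;

		if (halves >= 64)
			return 0;		/* fully decayed */
		val >>= halves;			/* whole half-lives */
		for (periods %= 32; periods; periods--)
			val = (uint64_t)(val * y);	/* remainder */
		return val;
	}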

> @@ -4551,18 +4382,34 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
>  {
>       struct sched_entity *se = &p->se;
>       struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +     u64 last_update_time;
>  
>       /*
> +      * Task on old CPU catches up with its old cfs_rq, and subtracts
> +      * itself from the cfs_rq (task must be off the queue now).
>        */
> +#ifndef CONFIG_64BIT
> +     u64 last_update_time_copy;
> +
> +     do {
> +             last_update_time_copy = cfs_rq->load_last_update_time_copy;
> +             smp_rmb();
> +             last_update_time = cfs_rq->avg.last_update_time;
> +     } while (last_update_time != last_update_time_copy);
> +#else
> +     last_update_time = cfs_rq->avg.last_update_time;
> +#endif
> +     __update_load_avg(last_update_time, &se->avg, 0);
> +     atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
> +
> +     /*
> +      * We are supposed to update the task to "current" time, so it is up
> +      * to date and ready to go to the new CPU/cfs_rq. But it is hard to
> +      * get the current time here, so simply throw away the out-of-date
> +      * time. This will result in the wakee task being less decayed, but
> +      * giving the wakee more load doesn't sound bad.
> +      */
> +     se->avg.last_update_time = 0;
>  
>       /* We have migrated, no longer consider this task hot */
>       se->exec_start = 0;


And here we try to make good on that assumption. The thing I worry
about is what happens if the machine is entirely idle...

What guarantees a semi up-to-date cfs_rq->avg.last_update_time then?
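
FWIW, the 32-bit dance is the usual torn-read workaround for a 64-bit
timestamp that can't be loaded atomically; a minimal kernel-style
sketch of the pairing (ts_publish/ts_read are my names, not the
kernel's):

	struct ts {
		u64 last_update_time;
		u64 copy;			/* load_last_update_time_copy */
	};

	static void ts_publish(struct ts *s, u64 t)
	{
		s->last_update_time = t;	/* 1) store the real value */
		smp_wmb();			/* 2) order the two stores */
		s->copy = t;			/* 3) publish the copy */
	}

	static u64 ts_read(struct ts *s)
	{
		u64 t, c;

		do {
			c = s->copy;
			smp_rmb();		/* pairs with smp_wmb() */
			t = s->last_update_time;
		} while (t != c);		/* torn read; retry */

		return t;
	}

If the reader observes the new copy, the smp_wmb() guarantees the new
last_update_time is visible as well; a mismatch means the read raced a
writer. None of which helps if nobody has written anything in a long
while -- hence the question above.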
