On Wed, Jul 01, 2015 at 07:36:35AM +0200, Mike Galbraith wrote:
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5897,7 +5897,7 @@ static int detach_tasks(struct lb_env *e
>  {
>       struct list_head *tasks = &env->src_rq->cfs_tasks;
>       struct task_struct *p;
> -     unsigned long load;
> +     unsigned long load, d_load = 0, s_load = env->src_rq->load.weight;
>       int detached = 0;
>  
>       lockdep_assert_held(&env->src_rq->lock);
> @@ -5936,6 +5936,11 @@ static int detach_tasks(struct lb_env *e
>  
>               detached++;
>               env->imbalance -= load;
> +             if (!load) {
> +                     load = min_t(unsigned long, env->imbalance, p->se.load.weight);
> +                     trace_printk("%s:%d is non-contributor - count as %ld\n", p->comm, p->pid, load);
> +             }
> +             d_load += load;
>  
>  #ifdef CONFIG_PREEMPT
>               /*
> @@ -5954,6 +5959,18 @@ static int detach_tasks(struct lb_env *e
>               if (env->imbalance <= 0)
>                       break;
>  
> +             /*
> +              * We don't want to bleed busiest_rq dry either.  Weighted load
> +              * and/or imbalance may be dinky, load contribution can even be
> +              * zero, perhaps causing us to over balance had we not assigned
> +              * it above.
> +              */
> +             if (env->src_rq->load.weight <= env->dst_rq->load.weight + d_load) {
> +                     trace_printk("OINK - imbal: %ld  load: %ld  run: %d  det: %d  sload_was: %ld sload_is: %ld  dload: %ld\n",
> +                             env->imbalance, load, env->src_rq->nr_running, detached, s_load, env->src_rq->load.weight, env->dst_rq->load.weight+d_load);
> +                     break;
> +             }
> +
>               continue;
>  next:
>               list_move_tail(&p->se.group_node, tasks);
> 
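
For reference, trace_printk()s aside, the two additions boil down to this
(my own consolidation of the hunks above, not a literal quote):

        env->imbalance -= load;
        /* a zero-contribution task would otherwise be "free" to move;
         * charge it as min(imbalance, weight) instead */
        if (!load)
                load = min_t(unsigned long, env->imbalance, p->se.load.weight);
        d_load += load;
        ...
        /* don't bleed busiest_rq dry: stop once src_rq would end up no
         * heavier than dst_rq plus everything detached so far */
        if (env->src_rq->load.weight <= env->dst_rq->load.weight + d_load)
                break;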

I've tried to analyse how your patch would affect the situation in one
of the crash dumps which I have of the problem.

In this dump, cpu0 is in the middle of dequeuing all tasks from cpu1.
rcu_sched has already been detached and there are two tasks left, one of
which is being processed by dequeue_entity_load_avg(), called from
dequeue_task(), at the time the watchdog hits.  The lb_env is as follows;
imbalance is, as you can see, 60.

 crash> struct lb_env 8054fd50
 struct lb_env {
   sd = 0x8fc13e00, 
   src_rq = 0x81297200, 
   src_cpu = 1, 
   dst_cpu = 0, 
   dst_rq = 0x8128e200, 
   dst_grpmask = 0x0, 
   new_dst_cpu = 0, 
   idle = CPU_NEWLY_IDLE, 
   imbalance = 60, 
   cpus = 0x8128d238, 
   flags = 0, 
   loop = 2, 
   loop_break = 32, 
   loop_max = 3, 
   fbq_type = all, 
   tasks = {
     next = 0x8fc4c6ec, 
     prev = 0x8fc4c6ec
   }
 }

Weights of the runqueues:

 crash> struct rq.load.weight runqueues:0,1
 [0]: 8128e200
   load.weight = 0,
 [1]: 81297200
   load.weight = 1935,

The only running tasks on the system are these three:

 crash> foreach RU ps
    PID    PPID  CPU   TASK    ST  %MEM     VSZ    RSS  COMM
 >     0      0   0  8056d8b0  RU   0.0       0      0  [swapper/0]
 >     0      0   1  8fc5cd18  RU   0.0       0      0  [swapper/1]
 >     0      0   2  8fc5c6b0  RU   0.0       0      0  [swapper/2]
 >     0      0   3  8fc5c048  RU   0.0       0      0  [swapper/3]
       7      2   0  8fc4c690  RU   0.0       0      0  [rcu_sched]
      30      2   1  8fd26108  RU   0.0       0      0  [kswapd0]
     413      1   1  8edda408  RU   0.6    1900    416  rngd

And the load.weight and load_avg_contribs for them and their parent SEs:

 crash> foreach 7 30 413 load
 PID: 7      TASK: 8fc4c690  CPU: 0   COMMAND: "rcu_sched"
  task_h_load():   325 [ = (load_avg_contrib {    5} * cfs_rq->h_load {   65}) / (cfs_rq->runnable_load_avg {    0} + 1) ]
  SE: 8fc4c6d8 load_avg_contrib:     5 load.weight:  1024 PARENT: 00000000 GROUPNAME: (null)
 
 PID: 30     TASK: 8fd26108  CPU: 1   COMMAND: "kswapd0"
  task_h_load():    10 [ = (load_avg_contrib {   10} * cfs_rq->h_load {  133}) / (cfs_rq->runnable_load_avg {  128} + 1) ]
  SE: 8fd26150 load_avg_contrib:    10 load.weight:  1024 PARENT: 00000000 GROUPNAME: (null)
 
 PID: 413    TASK: 8edda408  CPU: 1   COMMAND: "rngd"
  task_h_load():     0 [ = (load_avg_contrib {    0} * cfs_rq->h_load {    0}) / (cfs_rq->runnable_load_avg {    0} + 1) ]
  SE: 8edda450 load_avg_contrib:     0 load.weight:  1024 PARENT: 8fffbd00 GROUPNAME: (null)
  SE: 8fffbd00 load_avg_contrib:     0 load.weight:     2 PARENT: 8f531f80 GROUPNAME: rngd@hwrng.service
  SE: 8f531f80 load_avg_contrib:     0 load.weight:  1024 PARENT: 8f456e00 GROUPNAME: system-rngd.slice
  SE: 8f456e00 load_avg_contrib:   118 load.weight:   911 PARENT: 00000000 GROUPNAME: system.slice
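
As a sanity check, those task_h_load() values follow directly from the
formula the extension prints; here is a trivial userspace rehash with this
dump's numbers plugged in (not kernel code):

 #include <stdio.h>

 /* task_h_load() = contrib * h_load / (runnable_load_avg + 1), as printed */
 static unsigned long task_h_load(unsigned long contrib, unsigned long h_load,
                                  unsigned long runnable)
 {
         return contrib * h_load / (runnable + 1);
 }

 int main(void)
 {
         printf("rcu_sched: %lu\n", task_h_load(5, 65, 0));     /* 325 */
         printf("kswapd0:   %lu\n", task_h_load(10, 133, 128)); /* 1330/129 = 10 */
         printf("rngd:      %lu\n", task_h_load(0, 0, 0));      /* 0 */
         return 0;
 }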

Given the above, we can see that with your patch:

 - dst_rq->load.weight is 0 and will not change in this loop.

 - src_rq->load.weight was 1935 + 1024 before the loop.  It will go down
   to 1935 (already has), 1024, and then 0.
 
 - d_load will be 325*, 335, and then 395.

(* - probably not exactly, since rcu_sched has already had set_task_rq()
called [cfs_rq switched] on it, but I guess it's actually going to be much
lower based on the other dumps I see where rcu_sched hasn't been switched
yet.)
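
Plugging those sequences into the new break test makes this concrete (a
quick standalone check; dst_rq->load.weight stays 0 throughout):

 #include <stdio.h>

 int main(void)
 {
         /* src_rq->load.weight after each detach, and d_load at that point */
         unsigned long src_weight[] = { 1935, 1024, 0 };
         unsigned long d_load[]     = { 325, 335, 395 };
         unsigned long dst_weight   = 0;
         int i;

         for (i = 0; i < 3; i++)
                 printf("after detach %d: %lu <= %lu ? %s\n", i + 1,
                        src_weight[i], dst_weight + d_load[i],
                        src_weight[i] <= dst_weight + d_load[i] ?
                        "yes -> break" : "no -> keep going");
         return 0;
 }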

So, we will not hit the "if (env->src_rq->load.weight <=
env->dst_rq->load.weight + d_load)" condition to break out of the loop until
we actually move all tasks.  The patch will therefore not have any effect in
this case.  Or am I missing something?

We'll set up a test anyway with the patch; the problem usually takes a
couple of days to reproduce.

/Rabin