Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-17 Thread Parth Shah



On 10/16/19 5:26 PM, Vincent Guittot wrote:
> On Wed, 16 Oct 2019 at 09:21, Parth Shah  wrote:
>>
>>
>>
>> On 9/19/19 1:03 PM, Vincent Guittot wrote:
>>
>> [...]
>>
>>> Signed-off-by: Vincent Guittot 
>>> ---
>>>  kernel/sched/fair.c | 585 
>>> ++--
>>>  1 file changed, 380 insertions(+), 205 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 017aad0..d33379c 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -7078,11 +7078,26 @@ static unsigned long __read_mostly 
>>> max_load_balance_interval = HZ/10;
>>>
>>>  enum fbq_type { regular, remote, all };
>>>
>>> +/*
>>> + * group_type describes the group of CPUs at the moment of the load 
>>> balance.
>>> + * The enum is ordered by pulling priority, with the group with lowest 
>>> priority
>>> + * first so the groupe_type can be simply compared when selecting the 
>>> busiest
>>> + * group. see update_sd_pick_busiest().
>>> + */
>>>  enum group_type {
>>> - group_other = 0,
>>> + group_has_spare = 0,
>>> + group_fully_busy,
>>>   group_misfit_task,
>>> + group_asym_packing,
>>>   group_imbalanced,
>>> - group_overloaded,
>>> + group_overloaded
>>> +};
>>> +
>>> +enum migration_type {
>>> + migrate_load = 0,
>>> + migrate_util,
>>> + migrate_task,
>>> + migrate_misfit
>>>  };
>>>
>>>  #define LBF_ALL_PINNED   0x01
>>> @@ -7115,7 +7130,7 @@ struct lb_env {
>>>   unsigned intloop_max;
>>>
>>>   enum fbq_type   fbq_type;
>>> - enum group_type src_grp_type;
>>> + enum migration_type balance_type;
>>>   struct list_headtasks;
>>>  };
>>>
>>> @@ -7347,7 +7362,7 @@ static int detach_tasks(struct lb_env *env)
>>>  {
>>>   struct list_head *tasks = &env->src_rq->cfs_tasks;
>>>   struct task_struct *p;
>>> - unsigned long load;
>>> + unsigned long util, load;
>>>   int detached = 0;
>>>
>>>   lockdep_assert_held(&env->src_rq->lock);
>>> @@ -7380,19 +7395,53 @@ static int detach_tasks(struct lb_env *env)
>>>   if (!can_migrate_task(p, env))
>>>   goto next;
>>>
>>> - load = task_h_load(p);
>>> + switch (env->balance_type) {
>>> + case migrate_load:
>>> + load = task_h_load(p);
>>>
>>> - if (sched_feat(LB_MIN) && load < 16 && 
>>> !env->sd->nr_balance_failed)
>>> - goto next;
>>> + if (sched_feat(LB_MIN) &&
>>> + load < 16 && !env->sd->nr_balance_failed)
>>> + goto next;
>>>
>>> - if ((load / 2) > env->imbalance)
>>> - goto next;
>>> + if ((load / 2) > env->imbalance)
>>> + goto next;
>>> +
>>> + env->imbalance -= load;
>>> + break;
>>> +
>>> + case migrate_util:
>>> + util = task_util_est(p);
>>> +
>>> + if (util > env->imbalance)
>>
>> Can you please explain what would happen for
>> `if (util/2 > env->imbalance)` ?
>> just like when migrating load, even util shouldn't be migrated if
>> env->imbalance is just near the utilization of the task being moved, isn't 
>> it?
> 
> I have chosen util and not util/2 to be conservative because
> migrate_util is used to fill spare capacity.
> With `if (util/2 > env->imbalance)`, we can more easily overload the
> local group or pick too much utilization from the overloaded group.
> 

Fair enough. I missed the point that unlike migrate_load, with
migrate_util, env->imbalance is just spare capacity of the local group.
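
To make that concrete, here is a small standalone sketch (made-up utilization
numbers, nothing from the patch itself) of the two thresholds: with util/2 a
task that only half fits is still pulled, so more utilization than the local
group's spare capacity can end up being moved.

#include <stdio.h>

int main(void)
{
        /* hypothetical task utilizations on the busiest runqueue */
        long task_util[] = { 300, 250, 200, 150 };
        /* env->imbalance for migrate_util: spare capacity of the local group */
        long spare_strict = 400, spare_half = 400;
        long pulled_strict = 0, pulled_half = 0;
        int i;

        for (i = 0; i < 4; i++) {
                long util = task_util[i];

                /* v3 behaviour: skip the task unless it fully fits */
                if (util <= spare_strict) {
                        spare_strict -= util;
                        pulled_strict += util;
                }

                /* util/2 variant: a task that only half fits is still pulled */
                if (spare_half > 0 && util / 2 <= spare_half) {
                        spare_half -= util;
                        pulled_half += util;
                }
        }

        /* prints 300 vs 500 pulled for 400 of spare capacity */
        printf("strict: %ld, util/2: %ld\n", pulled_strict, pulled_half);
        return 0;
}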

Thanks,
Parth

>>
>>> + goto next;
>>> +
>>> + env->imbalance -= util;
>>> + break;
>>> +[ ... ]
>>
>> Thanks,
>> Parth
>>



Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-16 Thread Vincent Guittot
On Wed, 16 Oct 2019 at 09:21, Parth Shah  wrote:
>
>
>
> On 9/19/19 1:03 PM, Vincent Guittot wrote:
>
> [...]
>
> > Signed-off-by: Vincent Guittot 
> > ---
> >  kernel/sched/fair.c | 585 
> > ++--
> >  1 file changed, 380 insertions(+), 205 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 017aad0..d33379c 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -7078,11 +7078,26 @@ static unsigned long __read_mostly 
> > max_load_balance_interval = HZ/10;
> >
> >  enum fbq_type { regular, remote, all };
> >
> > +/*
> > + * group_type describes the group of CPUs at the moment of the load 
> > balance.
> > + * The enum is ordered by pulling priority, with the group with lowest 
> > priority
> > + * first so the groupe_type can be simply compared when selecting the 
> > busiest
> > + * group. see update_sd_pick_busiest().
> > + */
> >  enum group_type {
> > - group_other = 0,
> > + group_has_spare = 0,
> > + group_fully_busy,
> >   group_misfit_task,
> > + group_asym_packing,
> >   group_imbalanced,
> > - group_overloaded,
> > + group_overloaded
> > +};
> > +
> > +enum migration_type {
> > + migrate_load = 0,
> > + migrate_util,
> > + migrate_task,
> > + migrate_misfit
> >  };
> >
> >  #define LBF_ALL_PINNED   0x01
> > @@ -7115,7 +7130,7 @@ struct lb_env {
> >   unsigned intloop_max;
> >
> >   enum fbq_type   fbq_type;
> > - enum group_type src_grp_type;
> > + enum migration_type balance_type;
> >   struct list_headtasks;
> >  };
> >
> > @@ -7347,7 +7362,7 @@ static int detach_tasks(struct lb_env *env)
> >  {
> >   struct list_head *tasks = &env->src_rq->cfs_tasks;
> >   struct task_struct *p;
> > - unsigned long load;
> > + unsigned long util, load;
> >   int detached = 0;
> >
> >   lockdep_assert_held(&env->src_rq->lock);
> > @@ -7380,19 +7395,53 @@ static int detach_tasks(struct lb_env *env)
> >   if (!can_migrate_task(p, env))
> >   goto next;
> >
> > - load = task_h_load(p);
> > + switch (env->balance_type) {
> > + case migrate_load:
> > + load = task_h_load(p);
> >
> > - if (sched_feat(LB_MIN) && load < 16 && 
> > !env->sd->nr_balance_failed)
> > - goto next;
> > + if (sched_feat(LB_MIN) &&
> > + load < 16 && !env->sd->nr_balance_failed)
> > + goto next;
> >
> > - if ((load / 2) > env->imbalance)
> > - goto next;
> > + if ((load / 2) > env->imbalance)
> > + goto next;
> > +
> > + env->imbalance -= load;
> > + break;
> > +
> > + case migrate_util:
> > + util = task_util_est(p);
> > +
> > + if (util > env->imbalance)
>
> Can you please explain what would happen for
> `if (util/2 > env->imbalance)` ?
> just like when migrating load, even util shouldn't be migrated if
> env->imbalance is just near the utilization of the task being moved, isn't it?

I have chosen util and not util/2 to be conservative because
migrate_util is used to fill spare capacity.
With `if (util/2 > env->imbalance)`, we can more easily overload the
local group or pick too much utilization from the overloaded group.

>
> > + goto next;
> > +
> > + env->imbalance -= util;
> > + break;
> > +[ ... ]
>
> Thanks,
> Parth
>


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-16 Thread Parth Shah



On 9/19/19 1:03 PM, Vincent Guittot wrote:

[...]

> Signed-off-by: Vincent Guittot 
> ---
>  kernel/sched/fair.c | 585 
> ++--
>  1 file changed, 380 insertions(+), 205 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 017aad0..d33379c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7078,11 +7078,26 @@ static unsigned long __read_mostly 
> max_load_balance_interval = HZ/10;
> 
>  enum fbq_type { regular, remote, all };
> 
> +/*
> + * group_type describes the group of CPUs at the moment of the load balance.
> + * The enum is ordered by pulling priority, with the group with lowest 
> priority
> + * first so the groupe_type can be simply compared when selecting the busiest
> + * group. see update_sd_pick_busiest().
> + */
>  enum group_type {
> - group_other = 0,
> + group_has_spare = 0,
> + group_fully_busy,
>   group_misfit_task,
> + group_asym_packing,
>   group_imbalanced,
> - group_overloaded,
> + group_overloaded
> +};
> +
> +enum migration_type {
> + migrate_load = 0,
> + migrate_util,
> + migrate_task,
> + migrate_misfit
>  };
> 
>  #define LBF_ALL_PINNED   0x01
> @@ -7115,7 +7130,7 @@ struct lb_env {
>   unsigned intloop_max;
> 
>   enum fbq_type   fbq_type;
> - enum group_type src_grp_type;
> + enum migration_type balance_type;
>   struct list_headtasks;
>  };
> 
> @@ -7347,7 +7362,7 @@ static int detach_tasks(struct lb_env *env)
>  {
>   struct list_head *tasks = &env->src_rq->cfs_tasks;
>   struct task_struct *p;
> - unsigned long load;
> + unsigned long util, load;
>   int detached = 0;
> 
>   lockdep_assert_held(&env->src_rq->lock);
> @@ -7380,19 +7395,53 @@ static int detach_tasks(struct lb_env *env)
>   if (!can_migrate_task(p, env))
>   goto next;
> 
> - load = task_h_load(p);
> + switch (env->balance_type) {
> + case migrate_load:
> + load = task_h_load(p);
> 
> - if (sched_feat(LB_MIN) && load < 16 && 
> !env->sd->nr_balance_failed)
> - goto next;
> + if (sched_feat(LB_MIN) &&
> + load < 16 && !env->sd->nr_balance_failed)
> + goto next;
> 
> - if ((load / 2) > env->imbalance)
> - goto next;
> + if ((load / 2) > env->imbalance)
> + goto next;
> +
> + env->imbalance -= load;
> + break;
> +
> + case migrate_util:
> + util = task_util_est(p);
> +
> + if (util > env->imbalance)

Can you please explain what would happen for
`if (util/2 > env->imbalance)` ?
Just like when migrating load, even util shouldn't be migrated if
env->imbalance is just near the utilization of the task being moved, isn't it?

> + goto next;
> +
> + env->imbalance -= util;
> + break;
> +[ ... ]

Thanks,
Parth



Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-08 Thread Vincent Guittot
On Tue, 8 Oct 2019 at 19:55, Peter Zijlstra  wrote:
>
> On Thu, Sep 19, 2019 at 09:33:35AM +0200, Vincent Guittot wrote:
> > + if (busiest->group_type == group_asym_packing) {
> > + /*
> > +  * In case of asym capacity, we will try to migrate all load 
> > to
> > +  * the preferred CPU.
> > +  */
> > + env->balance_type = migrate_load;
> >   env->imbalance = busiest->group_load;
> >   return;
> >   }
>
> I was a bit surprised with this; I sorta expected a migrate_task,1 here.

I have just kept the current mechanism

>
> The asym_packing thing has always been a nr_running issue to me. If
> there is something to run, we should run on as many siblings/cores as
> possible, but preferably the 'highest' ranked siblings/cores.

I can probably set the number of running tasks to migrate instead of the load
>
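
Purely as an illustration of that idea, a hypothetical variant could look like
the sketch below; it is not part of v3 and only mirrors the shape of the other
calculate_imbalance() branches quoted in this thread (using the migration_type
naming from the updated snippets elsewhere in the thread).

        if (busiest->group_type == group_asym_packing) {
                /*
                 * Hypothetical alternative: move a number of tasks toward
                 * the preferred CPU instead of all of the group's load.
                 */
                env->migration_type = migrate_task;
                env->imbalance = busiest->sum_h_nr_running;
                return;
        }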


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-08 Thread Vincent Guittot
On Tue, 8 Oct 2019 at 19:39, Peter Zijlstra  wrote:
>
> On Tue, Oct 08, 2019 at 05:30:02PM +0200, Vincent Guittot wrote:
>
> > This is how I plan to get rid of the problem:
> > + if (busiest->group_weight == 1 || sds->prefer_sibling) {
> > + unsigned int nr_diff = busiest->sum_h_nr_running;
> > + /*
> > +  * When prefer sibling, evenly spread running tasks on
> > +  * groups.
> > +  */
> > + env->migration_type = migrate_task;
> > + lsub_positive(&nr_diff, local->sum_h_nr_running);
> > + env->imbalance = nr_diff >> 1;
> > + return;
> > + }
>
> I'm thinking the max_t(long, 0, ...); variant reads a lot simpler and
> really _should_ work given that -fno-strict-overflow / -fwrapv mandates
> 2s complement.

Another point that I have overlooked is that sum_h_nr_running is
unsigned int whereas imbalance is long

In fact, (long) (unsigned long A - unsigned long B) >> 1 works correctly
but
(long) (unsigned int A - unsigned int B) >> 1 doesn't
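
A quick userspace check (assuming a 64-bit long and gcc's usual two's-complement
conversion plus arithmetic right shift) shows the difference; the unsigned int
case gives exactly the 2147483646/2147483647-style values reported elsewhere in
this thread:

#include <stdio.h>

int main(void)
{
        unsigned int  busiest_ui = 2, local_ui = 4;   /* sum_h_nr_running is unsigned int */
        unsigned long busiest_ul = 2, local_ul = 4;

        /* unsigned int: wraps to 0xfffffffe, which is still positive once widened */
        long imb_uint  = (long)(busiest_ui - local_ui) >> 1;
        /* unsigned long: wraps to ~0UL - 1, the cast turns it into -2 before the shift */
        long imb_ulong = (long)(busiest_ul - local_ul) >> 1;

        printf("unsigned int : %ld\n", imb_uint);    /* 2147483647 */
        printf("unsigned long: %ld\n", imb_ulong);   /* -1 */
        return 0;
}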


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-08 Thread Peter Zijlstra
On Thu, Sep 19, 2019 at 09:33:35AM +0200, Vincent Guittot wrote:
> + if (busiest->group_type == group_asym_packing) {
> + /*
> +  * In case of asym capacity, we will try to migrate all load to
> +  * the preferred CPU.
> +  */
> + env->balance_type = migrate_load;
>   env->imbalance = busiest->group_load;
>   return;
>   }

I was a bit surprised with this; I sorta expected a migrate_task,1 here.

The asym_packing thing has always been a nr_running issue to me. If
there is something to run, we should run on as many siblings/cores as
possible, but preferably the 'highest' ranked siblings/cores.



Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-08 Thread Valentin Schneider
On 08/10/2019 17:39, Valentin Schneider wrote:
>>
>> But -fno-strict-overflow mandates 2s complement for all such signed
>> issues.
>>
> 
> So then there really shouldn't be any ambiguity. I have no idea if
> -fno-strict-overflow then also lifts the undefinedness of the right shifts,
> gotta get my spade and dig some more.
> 

Bleh, no luck really.

Thinking about it some more, you can't overflow by right shifting
(logical/arithmetic), so I dunno if signed right shift counts as "such
signed issues".


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-08 Thread Peter Zijlstra
On Tue, Oct 08, 2019 at 05:30:02PM +0200, Vincent Guittot wrote:

> This is how I plan to get rid of the problem:
> + if (busiest->group_weight == 1 || sds->prefer_sibling) {
> + unsigned int nr_diff = busiest->sum_h_nr_running;
> + /*
> +  * When prefer sibling, evenly spread running tasks on
> +  * groups.
> +  */
> + env->migration_type = migrate_task;
> + lsub_positive(&nr_diff, local->sum_h_nr_running);
> + env->imbalance = nr_diff >> 1;
> + return;
> + }

I'm thinking the max_t(long, 0, ...); variant reads a lot simpler and
really _should_ work given that -fno-strict-overflow / -fwrapv mandates
2s complement.


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-08 Thread Valentin Schneider
On 08/10/2019 17:33, Peter Zijlstra wrote:
> On Tue, Oct 08, 2019 at 03:34:04PM +0100, Valentin Schneider wrote:
>> On 08/10/2019 15:16, Peter Zijlstra wrote:
>>> On Wed, Oct 02, 2019 at 11:47:59AM +0100, Valentin Schneider wrote:
>>>
 Yeah, right shift on signed negative values are implementation defined.
>>>
>>> Seriously? Even under -fno-strict-overflow? There is a perfectly
>>> sensible operation for signed shift right, this stuff should not be
>>> undefined.
>>>
>>
>> Mmm good point. I didn't see anything relevant in the description of that
>> flag. All my copy of the C99 standard (draft) says at 6.5.7.5 is:
>>
>> """
>> The result of E1 >> E2 [...] If E1 has a signed type and a negative value,
>> the resulting value is implementation-defined.
>> """
>>
>> Arithmetic shift would make sense, but I think this stems from twos'
>> complement not being imposed: 6.2.6.2.2 says sign can be done with
>> sign + magnitude, twos complement or ones' complement...
> 
> But -fno-strict-overflow mandates 2s complement for all such signed
> issues.
> 

So then there really shouldn't be any ambiguity. I have no idea if
-fno-strict-overflow then also lifts the undefinedness of the right shifts,
gotta get my spade and dig some more.


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-08 Thread Peter Zijlstra
On Tue, Oct 08, 2019 at 03:34:04PM +0100, Valentin Schneider wrote:
> On 08/10/2019 15:16, Peter Zijlstra wrote:
> > On Wed, Oct 02, 2019 at 11:47:59AM +0100, Valentin Schneider wrote:
> > 
> >> Yeah, right shift on signed negative values are implementation defined.
> > 
> > Seriously? Even under -fno-strict-overflow? There is a perfectly
> > sensible operation for signed shift right, this stuff should not be
> > undefined.
> > 
> 
> Mmm good point. I didn't see anything relevant in the description of that
> flag. All my copy of the C99 standard (draft) says at 6.5.7.5 is:
> 
> """
> The result of E1 >> E2 [...] If E1 has a signed type and a negative value,
> the resulting value is implementation-defined.
> """
> 
> Arithmetic shift would make sense, but I think this stems from twos'
> complement not being imposed: 6.2.6.2.2 says sign can be done with
> sign + magnitude, twos complement or ones' complement...

But -fno-strict-overflow mandates 2s complement for all such signed
issues.


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-08 Thread Valentin Schneider
On 08/10/2019 16:30, Vincent Guittot wrote:
[...]
> 
> This is how I plan to get rid of the problem:
> + if (busiest->group_weight == 1 || sds->prefer_sibling) {
> + unsigned int nr_diff = busiest->sum_h_nr_running;
> + /*
> +  * When prefer sibling, evenly spread running tasks on
> +  * groups.
> +  */
> + env->migration_type = migrate_task;
> + lsub_positive(&nr_diff, local->sum_h_nr_running);
> + env->imbalance = nr_diff >> 1;
> + return;
> + }
> 

I think this wants a 

/* Local could have more tasks than busiest */

atop the lsub, otherwise yeah that ought to work.


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-08 Thread Vincent Guittot
Le Tuesday 08 Oct 2019 à 15:34:04 (+0100), Valentin Schneider a écrit :
> On 08/10/2019 15:16, Peter Zijlstra wrote:
> > On Wed, Oct 02, 2019 at 11:47:59AM +0100, Valentin Schneider wrote:
> > 
> >> Yeah, right shift on signed negative values are implementation defined.
> > 
> > Seriously? Even under -fno-strict-overflow? There is a perfectly
> > sensible operation for signed shift right, this stuff should not be
> > undefined.
> > 
> 
> Mmm good point. I didn't see anything relevant in the description of that
> flag. All my copy of the C99 standard (draft) says at 6.5.7.5 is:
> 
> """
> The result of E1 >> E2 [...] If E1 has a signed type and a negative value,
> the resulting value is implementation-defined.
> """
> 
> Arithmetic shift would make sense, but I think this stems from twos'
> complement not being imposed: 6.2.6.2.2 says sign can be done with
> sign + magnitude, twos complement or ones' complement...
> 
> I suppose when you really just want a division you should ask for division
> semantics - i.e. use '/'. I'd expect compilers to be smart enough to turn
> that into a shift if a power of 2 is involved, and to do something else
> if negative values can be involved.

This is how I plan to get rid of the problem:
+   if (busiest->group_weight == 1 || sds->prefer_sibling) {
+   unsigned int nr_diff = busiest->sum_h_nr_running;
+   /*
+* When prefer sibling, evenly spread running tasks on
+* groups.
+*/
+   env->migration_type = migrate_task;
+   lsub_positive(&nr_diff, local->sum_h_nr_running);
+   env->imbalance = nr_diff >> 1;
+   return;
+   }
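
For reference, lsub_positive() is an existing helper in kernel/sched/fair.c that
subtracts while clamping at zero, so nr_diff cannot wrap. A rough userspace
approximation of the idea (using gcc's typeof, like the kernel macro):

#include <stdio.h>

/* approximation of the kernel's lsub_positive(): subtract, but clamp at 0 */
#define lsub_positive(_ptr, _val) do {                  \
        typeof(_ptr) ptr = (_ptr);                      \
        typeof(*ptr) val = (_val);                      \
        *ptr -= (*ptr > val) ? val : *ptr;              \
} while (0)

int main(void)
{
        unsigned int busiest = 2, local = 4;
        unsigned int nr_diff = busiest;

        /* a plain "nr_diff -= local" would wrap to 0xfffffffe here */
        lsub_positive(&nr_diff, local);
        printf("nr_diff = %u, imbalance = %u\n", nr_diff, nr_diff >> 1);
        return 0;
}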



Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-08 Thread Valentin Schneider
On 08/10/2019 15:16, Peter Zijlstra wrote:
> On Wed, Oct 02, 2019 at 11:47:59AM +0100, Valentin Schneider wrote:
> 
>> Yeah, right shift on signed negative values are implementation defined.
> 
> Seriously? Even under -fno-strict-overflow? There is a perfectly
> sensible operation for signed shift right, this stuff should not be
> undefined.
> 

Mmm good point. I didn't see anything relevant in the description of that
flag. All my copy of the C99 standard (draft) says at 6.5.7.5 is:

"""
The result of E1 >> E2 [...] If E1 has a signed type and a negative value,
the resulting value is implementation-defined.
"""

Arithmetic shift would make sense, but I think this stems from two's
complement not being imposed: 6.2.6.2.2 says sign can be done with
sign + magnitude, two's complement or ones' complement...

I suppose when you really just want a division you should ask for division
semantics - i.e. use '/'. I'd expect compilers to be smart enough to turn
that into a shift if a power of 2 is involved, and to do something else
if negative values can be involved.
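
A tiny example of why the two are not interchangeable once negative values are
involved (the shift result is what gcc happens to do, not something the
standard guarantees):

#include <stdio.h>

int main(void)
{
        int x = -3;

        /* arithmetic right shift rounds toward negative infinity ... */
        printf("-3 >> 1 = %d\n", x >> 1);   /* -2 with gcc (implementation-defined) */
        /* ... while C99 integer division truncates toward zero */
        printf("-3 / 2  = %d\n", x / 2);    /* -1 */
        return 0;
}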


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-08 Thread Peter Zijlstra
On Wed, Oct 02, 2019 at 11:47:59AM +0100, Valentin Schneider wrote:

> Yeah, right shift on signed negative values are implementation defined.

Seriously? Even under -fno-strict-overflow? There is a perfectly
sensible operation for signed shift right, this stuff should not be
undefined.


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-08 Thread Peter Zijlstra
On Wed, Oct 02, 2019 at 11:21:20AM +0200, Dietmar Eggemann wrote:

> I thought we should always order local variable declarations from
> longest to shortest line but can't find this rule in coding-style.rst
> either.

You're right though, that is generally encouraged. From last year's
(2018) KS there was the notion of a subsystem handbook and the tip
tree's submissions thereto can be found there:

  https://lore.kernel.org/lkml/20181107171149.165693...@linutronix.de/

But for some raisin that never actually went anywhere...


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-02 Thread Valentin Schneider
On 02/10/2019 09:30, Vincent Guittot wrote:
>> Isn't that one somewhat risky?
>>
>> Say both groups are classified group_has_spare and we do prefer_sibling.
>> We'd select busiest as the one with the maximum number of busy CPUs, but it
>> could be so that busiest.sum_h_nr_running < local.sum_h_nr_running (because
>> pinned tasks or wakeup failed to properly spread stuff).
>>
>> The thing should be unsigned so at least we save ourselves from right
>> shifting a negative value, but we still end up with a gygornous imbalance
>> (which we then store into env.imbalance which *is* signed... Urgh).
> 
> so it's not clear what happens with a right shift on a negative signed
> value and this seems to be compiler dependent so even
> max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1) might be wrong
> 

Yeah, right shifts on signed negative values are implementation defined. This
is what I was worried about initially, but I think the expression resulting
from the subtraction is unsigned (both terms are unsigned) so this would
just wrap when busiest < local - but that is still a problem.


((local->idle_cpus - busiest->idle_cpus) >> 1) should be fine because we do
have this check in find_busiest_group() before heading off to
calculate_imbalance():

  if (busiest->group_type != group_overloaded &&
  (env->idle == CPU_NOT_IDLE ||
   local->idle_cpus <= (busiest->idle_cpus + 1)))
  /* ... */
  goto out_balanced;

which ensures the subtraction will be at least 2. We're missing something
equivalent for the sum_h_nr_running case.

> I'm going to update it
> 
> 
>>
>> [...]


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-02 Thread Dietmar Eggemann
On 02/10/2019 10:23, Vincent Guittot wrote:
> On Tue, 1 Oct 2019 at 18:53, Dietmar Eggemann  
> wrote:
>>
>> On 01/10/2019 10:14, Vincent Guittot wrote:
>>> On Mon, 30 Sep 2019 at 18:24, Dietmar Eggemann  
>>> wrote:

 Hi Vincent,

 On 19/09/2019 09:33, Vincent Guittot wrote:
>>
> [...]
> 
>>
> + if (busiest->group_weight == 1 || sds->prefer_sibling) {
> + /*
> +  * When prefer sibling, evenly spread running tasks 
> on
> +  * groups.
> +  */
> + env->balance_type = migrate_task;
> + env->imbalance = (busiest->sum_h_nr_running - 
> local->sum_h_nr_running) >> 1;
> + return;
> + }
> +
> + /*
> +  * If there is no overload, we just want to even the number 
> of
> +  * idle cpus.
> +  */
> + env->balance_type = migrate_task;
> + env->imbalance = max_t(long, 0, (local->idle_cpus - 
> busiest->idle_cpus) >> 1);

 Why do we need a max_t(long, 0, ...) here and not for the 'if
 (busiest->group_weight == 1 || sds->prefer_sibling)' case?
>>>
>>> For env->imbalance = (busiest->sum_h_nr_running - local->sum_h_nr_running) 
>>> >> 1;
>>>
>>> either we have sds->prefer_sibling && busiest->sum_nr_running >
>>> local->sum_nr_running + 1
>>
>> I see, this corresponds to
>>
>> /* Try to move all excess tasks to child's sibling domain */
>>if (sds.prefer_sibling && local->group_type == group_has_spare &&
>>busiest->sum_h_nr_running > local->sum_h_nr_running + 1)
>>goto force_balance;
>>
>> in find_busiest_group, I assume.
> 
> yes. But it seems that I missed a case:
> 
> prefer_sibling is set
> busiest->sum_h_nr_running <= local->sum_h_nr_running + 1 so we skip
> goto force_balance above
> But env->idle != CPU_NOT_IDLE  and local->idle_cpus >
> (busiest->idle_cpus + 1) so we also skip goto out_balanced and finally
> call calculate_imbalance()
> 
> in calculate_imbalance with prefer_sibling set, imbalance =
> (busiest->sum_h_nr_running - local->sum_h_nr_running) >> 1;
> 
> so we probably want something similar to max_t(long, 0,
> (busiest->sum_h_nr_running - local->sum_h_nr_running) >> 1)

Makes sense.

Caught a couple of

[  369.310464] 0-3->4-7 2->5 env->imbalance = 2147483646
[  369.310796] 0-3->4-7 2->4 env->imbalance = 2147483647

in this if condition on h620 running hackbench.


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-02 Thread Dietmar Eggemann
On 02/10/2019 08:44, Vincent Guittot wrote:
> On Tue, 1 Oct 2019 at 18:53, Dietmar Eggemann  
> wrote:
>>
>> On 01/10/2019 10:14, Vincent Guittot wrote:
>>> On Mon, 30 Sep 2019 at 18:24, Dietmar Eggemann  
>>> wrote:

 Hi Vincent,

 On 19/09/2019 09:33, Vincent Guittot wrote:
>>
>> [...]
>>
> @@ -7347,7 +7362,7 @@ static int detach_tasks(struct lb_env *env)
>   {
> > struct list_head *tasks = &env->src_rq->cfs_tasks;
> struct task_struct *p;
> - unsigned long load;
> + unsigned long util, load;

 Minor: Order by length or reduce scope to while loop ?
>>>
>>> I don't get your point here
>>
>> Nothing dramatic here! Just
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index d0c3aa1dc290..a08f342ead89 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -7333,8 +7333,8 @@ static const unsigned int sched_nr_migrate_break = 32;
>>  static int detach_tasks(struct lb_env *env)
>>  {
>> struct list_head *tasks = &env->src_rq->cfs_tasks;
>> -   struct task_struct *p;
>> unsigned long load, util;
>> +   struct task_struct *p;
> 
> hmm... I still don't get this.
> We usually gather pointers instead of interleaving them with other variables

I thought we should always order local variable declarations from
longest to shortest line but can't find this rule in coding-style.rst
either.

[...]


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-02 Thread Vincent Guittot
On Tue, 1 Oct 2019 at 19:47, Valentin Schneider
 wrote:
>
> On 19/09/2019 08:33, Vincent Guittot wrote:
>
> [...]
>
> > @@ -8283,69 +8363,133 @@ static inline void update_sd_lb_stats(struct 
> > lb_env *env, struct sd_lb_stats *sd
> >   */
> >  static inline void calculate_imbalance(struct lb_env *env, struct 
> > sd_lb_stats *sds)
> >  {
> > - unsigned long max_pull, load_above_capacity = ~0UL;
> >   struct sg_lb_stats *local, *busiest;
> >
> >   local = &sds->local_stat;
> >   busiest = &sds->busiest_stat;
> >
> > - if (busiest->group_asym_packing) {
> > + if (busiest->group_type == group_misfit_task) {
> > + /* Set imbalance to allow misfit task to be balanced. */
> > + env->balance_type = migrate_misfit;
> > + env->imbalance = busiest->group_misfit_task_load;
> > + return;
> > + }
> > +
> > + if (busiest->group_type == group_asym_packing) {
> > + /*
> > +  * In case of asym capacity, we will try to migrate all load 
> > to
> > +  * the preferred CPU.
> > +  */
> > + env->balance_type = migrate_load;
> >   env->imbalance = busiest->group_load;
> >   return;
> >   }
> >
> > + if (busiest->group_type == group_imbalanced) {
> > + /*
> > +  * In the group_imb case we cannot rely on group-wide averages
> > +  * to ensure CPU-load equilibrium, try to move any task to fix
> > +  * the imbalance. The next load balance will take care of
> > +  * balancing back the system.
> > +  */
> > + env->balance_type = migrate_task;
> > + env->imbalance = 1;
> > + return;
> > + }
> > +
> >   /*
> > -  * Avg load of busiest sg can be less and avg load of local sg can
> > -  * be greater than avg load across all sgs of sd because avg load
> > -  * factors in sg capacity and sgs with smaller group_type are
> > -  * skipped when updating the busiest sg:
> > +  * Try to use spare capacity of local group without overloading it or
> > +  * emptying busiest
> >*/
> > - if (busiest->group_type != group_misfit_task &&
> > - (busiest->avg_load <= sds->avg_load ||
> > -  local->avg_load >= sds->avg_load)) {
> > - env->imbalance = 0;
> > + if (local->group_type == group_has_spare) {
> > + if (busiest->group_type > group_fully_busy) {
> > + /*
> > +  * If busiest is overloaded, try to fill spare
> > +  * capacity. This might end up creating spare capacity
> > +  * in busiest or busiest still being overloaded but
> > +  * there is no simple way to directly compute the
> > +  * amount of load to migrate in order to balance the
> > +  * system.
> > +  */
> > + env->balance_type = migrate_util;
> > + env->imbalance = max(local->group_capacity, 
> > local->group_util) -
> > + local->group_util;
> > + return;
> > + }
> > +
> > + if (busiest->group_weight == 1 || sds->prefer_sibling) {
> > + /*
> > +  * When prefer sibling, evenly spread running tasks on
> > +  * groups.
> > +  */
> > + env->balance_type = migrate_task;
> > + env->imbalance = (busiest->sum_h_nr_running - 
> > local->sum_h_nr_running) >> 1;
>
> Isn't that one somewhat risky?
>
> Say both groups are classified group_has_spare and we do prefer_sibling.
> We'd select busiest as the one with the maximum number of busy CPUs, but it
> could be so that busiest.sum_h_nr_running < local.sum_h_nr_running (because
> pinned tasks or wakeup failed to properly spread stuff).
>
> The thing should be unsigned so at least we save ourselves from right
> shifting a negative value, but we still end up with a gygornous imbalance
> (which we then store into env.imbalance which *is* signed... Urgh).

so it's not clear what happens with a right shift on a negative signed
value and this seems to be compiler dependent so even
max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1) might be wrong

I'm going to update it


>
> [...]


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-02 Thread Vincent Guittot
On Tue, 1 Oct 2019 at 18:53, Dietmar Eggemann  wrote:
>
> On 01/10/2019 10:14, Vincent Guittot wrote:
> > On Mon, 30 Sep 2019 at 18:24, Dietmar Eggemann  
> > wrote:
> >>
> >> Hi Vincent,
> >>
> >> On 19/09/2019 09:33, Vincent Guittot wrote:
>
[...]

>
> >>> + if (busiest->group_weight == 1 || sds->prefer_sibling) {
> >>> + /*
> >>> +  * When prefer sibling, evenly spread running tasks 
> >>> on
> >>> +  * groups.
> >>> +  */
> >>> + env->balance_type = migrate_task;
> >>> + env->imbalance = (busiest->sum_h_nr_running - 
> >>> local->sum_h_nr_running) >> 1;
> >>> + return;
> >>> + }
> >>> +
> >>> + /*
> >>> +  * If there is no overload, we just want to even the number 
> >>> of
> >>> +  * idle cpus.
> >>> +  */
> >>> + env->balance_type = migrate_task;
> >>> + env->imbalance = max_t(long, 0, (local->idle_cpus - 
> >>> busiest->idle_cpus) >> 1);
> >>
> >> Why do we need a max_t(long, 0, ...) here and not for the 'if
> >> (busiest->group_weight == 1 || sds->prefer_sibling)' case?
> >
> > For env->imbalance = (busiest->sum_h_nr_running - local->sum_h_nr_running) 
> > >> 1;
> >
> > either we have sds->prefer_sibling && busiest->sum_nr_running >
> > local->sum_nr_running + 1
>
> I see, this corresponds to
>
> /* Try to move all excess tasks to child's sibling domain */
>if (sds.prefer_sibling && local->group_type == group_has_spare &&
>busiest->sum_h_nr_running > local->sum_h_nr_running + 1)
>goto force_balance;
>
> in find_busiest_group, I assume.

yes. But it seems that I missed a case:

prefer_sibling is set
busiest->sum_h_nr_running <= local->sum_h_nr_running + 1 so we skip
goto force_balance above
But env->idle != CPU_NOT_IDLE  and local->idle_cpus >
(busiest->idle_cpus + 1) so we also skip goto out_balanced and finally
call calculate_imbalance()

in calculate_imbalance with prefer_sibling set, imbalance =
(busiest->sum_h_nr_running - local->sum_h_nr_running) >> 1;

so we probably want something similar to max_t(long, 0,
(busiest->sum_h_nr_running - local->sum_h_nr_running) >> 1)

>
> Haven't been able to recreate this yet on my arm64 platform since there
> is no prefer_sibling and in case local and busiest have
> group_type=group_has_spare they bailout in
>
>  if (busiest->group_type != group_overloaded &&
>   (env->idle == CPU_NOT_IDLE ||
>local->idle_cpus <= (busiest->idle_cpus + 1)))
>  goto out_balanced;
>
>
> [...]
>
> >>> - if (busiest->group_type == group_overloaded &&
> >>> - local->group_type   == group_overloaded) {
> >>> - load_above_capacity = busiest->sum_h_nr_running * 
> >>> SCHED_CAPACITY_SCALE;
> >>> - if (load_above_capacity > busiest->group_capacity) {
> >>> - load_above_capacity -= busiest->group_capacity;
> >>> - load_above_capacity *= scale_load_down(NICE_0_LOAD);
> >>> - load_above_capacity /= busiest->group_capacity;
> >>> - } else
> >>> - load_above_capacity = ~0UL;
> >>> + if (local->group_type < group_overloaded) {
> >>> + /*
> >>> +  * Local will become overloaded so the avg_load metrics are
> >>> +  * finally needed.
> >>> +  */
> >>
> >> How does this relate to the decision_matrix[local, busiest] (dm[])? E.g.
> >> dm[overload, overload] == avg_load or dm[fully_busy, overload] == force.
> >> It would be nice to be able to match all allowed fields of dm to code 
> >> sections.
> >
> > decision_matrix describes how it decides between balanced or unbalanced.
> > In case of dm[overload, overload], we use the avg_load to decide if it
> > is balanced or not
>
> OK, that's why you calculate sgs->avg_load in update_sg_lb_stats() only
> for 'sgs->group_type == group_overloaded'.
>
> > In case of dm[fully_busy, overload], the groups are unbalanced because
> > fully_busy < overload and we force the balance. Then
> > calculate_imbalance() uses the avg_load to decide how much will be
> > moved
>
> And in this case 'local->group_type < group_overloaded' in
> calculate_imbalance(), 'local->avg_load' and 'sds->avg_load' have to be
> calculated before using them in env->imbalance = min(...).
>
> OK, got it now.
>
> > dm[overload, overload]=force means that we force the balance and we
> > will compute later the imbalance. avg_load may be used to calculate
> > the imbalance
> > dm[overload, overload]=avg_load means that we compare the avg_load to
> > decide whether we need to balance load between groups
> > dm[overload, overload]=nr_idle means that we compare the number of
> > idle cpus to decide whether we need to balance.  In fact this is no
> > more true with patch 7 because we also take into 

Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-02 Thread Vincent Guittot
On Tue, 1 Oct 2019 at 18:53, Dietmar Eggemann  wrote:
>
> On 01/10/2019 10:14, Vincent Guittot wrote:
> > On Mon, 30 Sep 2019 at 18:24, Dietmar Eggemann  
> > wrote:
> >>
> >> Hi Vincent,
> >>
> >> On 19/09/2019 09:33, Vincent Guittot wrote:
>
> [...]
>
> >>> @@ -7347,7 +7362,7 @@ static int detach_tasks(struct lb_env *env)
> >>>   {
> >>> struct list_head *tasks = &env->src_rq->cfs_tasks;
> >>> struct task_struct *p;
> >>> - unsigned long load;
> >>> + unsigned long util, load;
> >>
> >> Minor: Order by length or reduce scope to while loop ?
> >
> > I don't get your point here
>
> Nothing dramatic here! Just
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d0c3aa1dc290..a08f342ead89 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7333,8 +7333,8 @@ static const unsigned int sched_nr_migrate_break = 32;
>  static int detach_tasks(struct lb_env *env)
>  {
> struct list_head *tasks = &env->src_rq->cfs_tasks;
> -   struct task_struct *p;
> unsigned long load, util;
> +   struct task_struct *p;

hmm... I still don't get this.
We usually gather pointers instead of interleaving them with other variables

> int detached = 0;
>
> lockdep_assert_held(&env->src_rq->lock);
>
> or
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d0c3aa1dc290..4d1864d43ed7 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7334,7 +7334,6 @@ static int detach_tasks(struct lb_env *env)
>  {
> struct list_head *tasks = &env->src_rq->cfs_tasks;
> struct task_struct *p;
> -   unsigned long load, util;
> int detached = 0;
>
> lockdep_assert_held(&env->src_rq->lock);
> @@ -7343,6 +7342,8 @@ static int detach_tasks(struct lb_env *env)
> return 0;
>
> while (!list_empty(tasks)) {
> +   unsigned long load, util;
> +
> /*
>
> [...]
>
> >>> @@ -8042,14 +8104,24 @@ static inline void update_sg_lb_stats(struct 
> >>> lb_env *env,
> >>> }
> >>> }
> >>>
> >>> - /* Adjust by relative CPU capacity of the group */
> >>> + /* Check if dst cpu is idle and preferred to this group */
> >>
> >> s/preferred to/preferred by ? or the preferred CPU of this group ?
> >
> > dst cpu doesn't belong to this group. We compare asym_prefer_cpu of
> > this group vs dst_cpu which belongs to another group
>
> Ah, in the sense of 'preferred over'. Got it now!
>
> [...]
>
> >>> + if (busiest->group_type == group_imbalanced) {
> >>> + /*
> >>> +  * In the group_imb case we cannot rely on group-wide 
> >>> averages
> >>> +  * to ensure CPU-load equilibrium, try to move any task to 
> >>> fix
> >>> +  * the imbalance. The next load balance will take care of
> >>> +  * balancing back the system.
> >>
> >> balancing back ?
> >
> > In case of imbalance, we don't try to balance the system but only try
> > to get rid of the pinned tasks problem. The system will still be
> > unbalanced after the migration and the next load balance will take
> > care of balancing the system
>
> OK.
>
> [...]
>
> >>> /*
> >>> -  * Avg load of busiest sg can be less and avg load of local sg can
> >>> -  * be greater than avg load across all sgs of sd because avg load
> >>> -  * factors in sg capacity and sgs with smaller group_type are
> >>> -  * skipped when updating the busiest sg:
> >>> +  * Try to use spare capacity of local group without overloading it 
> >>> or
> >>> +  * emptying busiest
> >>>  */
> >>> - if (busiest->group_type != group_misfit_task &&
> >>> - (busiest->avg_load <= sds->avg_load ||
> >>> -  local->avg_load >= sds->avg_load)) {
> >>> - env->imbalance = 0;
> >>> + if (local->group_type == group_has_spare) {
> >>> + if (busiest->group_type > group_fully_busy) {
> >>
> >> So this could be 'busiest->group_type == group_overloaded' here to match
> >> the comment below? Since you handle group_misfit_task,
> >> group_asym_packing, group_imbalanced above and return.
> >
> > This is just to be more robust in case some new states are added later
>
> OK, although I doubt that additional states can be added easily w/o
> carefully auditing the entire lb code ;-)
>
> [...]
>
> >>> + if (busiest->group_weight == 1 || sds->prefer_sibling) {
> >>> + /*
> >>> +  * When prefer sibling, evenly spread running tasks 
> >>> on
> >>> +  * groups.
> >>> +  */
> >>> + env->balance_type = migrate_task;
> >>> + env->imbalance = (busiest->sum_h_nr_running - 
> >>> local->sum_h_nr_running) >> 1;
> >>> + return;
> >>> + }
> >>> +
> >>> + /*
> >>> +  * If there is no overload, we just want to even the number 
> >>> of
> >>> +  * idle 

Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-01 Thread Valentin Schneider
On 19/09/2019 08:33, Vincent Guittot wrote:

[...]

> @@ -8283,69 +8363,133 @@ static inline void update_sd_lb_stats(struct lb_env 
> *env, struct sd_lb_stats *sd
>   */
>  static inline void calculate_imbalance(struct lb_env *env, struct 
> sd_lb_stats *sds)
>  {
> - unsigned long max_pull, load_above_capacity = ~0UL;
>   struct sg_lb_stats *local, *busiest;
>  
>   local = &sds->local_stat;
>   busiest = &sds->busiest_stat;
>  
> - if (busiest->group_asym_packing) {
> + if (busiest->group_type == group_misfit_task) {
> + /* Set imbalance to allow misfit task to be balanced. */
> + env->balance_type = migrate_misfit;
> + env->imbalance = busiest->group_misfit_task_load;
> + return;
> + }
> +
> + if (busiest->group_type == group_asym_packing) {
> + /*
> +  * In case of asym capacity, we will try to migrate all load to
> +  * the preferred CPU.
> +  */
> + env->balance_type = migrate_load;
>   env->imbalance = busiest->group_load;
>   return;
>   }
>  
> + if (busiest->group_type == group_imbalanced) {
> + /*
> +  * In the group_imb case we cannot rely on group-wide averages
> +  * to ensure CPU-load equilibrium, try to move any task to fix
> +  * the imbalance. The next load balance will take care of
> +  * balancing back the system.
> +  */
> + env->balance_type = migrate_task;
> + env->imbalance = 1;
> + return;
> + }
> +
>   /*
> -  * Avg load of busiest sg can be less and avg load of local sg can
> -  * be greater than avg load across all sgs of sd because avg load
> -  * factors in sg capacity and sgs with smaller group_type are
> -  * skipped when updating the busiest sg:
> +  * Try to use spare capacity of local group without overloading it or
> +  * emptying busiest
>*/
> - if (busiest->group_type != group_misfit_task &&
> - (busiest->avg_load <= sds->avg_load ||
> -  local->avg_load >= sds->avg_load)) {
> - env->imbalance = 0;
> + if (local->group_type == group_has_spare) {
> + if (busiest->group_type > group_fully_busy) {
> + /*
> +  * If busiest is overloaded, try to fill spare
> +  * capacity. This might end up creating spare capacity
> +  * in busiest or busiest still being overloaded but
> +  * there is no simple way to directly compute the
> +  * amount of load to migrate in order to balance the
> +  * system.
> +  */
> + env->balance_type = migrate_util;
> + env->imbalance = max(local->group_capacity, 
> local->group_util) -
> + local->group_util;
> + return;
> + }
> +
> + if (busiest->group_weight == 1 || sds->prefer_sibling) {
> + /*
> +  * When prefer sibling, evenly spread running tasks on
> +  * groups.
> +  */
> + env->balance_type = migrate_task;
> + env->imbalance = (busiest->sum_h_nr_running - 
> local->sum_h_nr_running) >> 1;

Isn't that one somewhat risky?

Say both groups are classified group_has_spare and we do prefer_sibling.
We'd select busiest as the one with the maximum number of busy CPUs, but it
could be so that busiest.sum_h_nr_running < local.sum_h_nr_running (because
pinned tasks or wakeup failed to properly spread stuff).

The thing should be unsigned so at least we save ourselves from right
shifting a negative value, but we still end up with a gygornous imbalance
(which we then store into env.imbalance which *is* signed... Urgh).

[...]


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-01 Thread Dietmar Eggemann



On 01/10/2019 11:14, Vincent Guittot wrote:
> group_asym_packing
> 
> On Tue, 1 Oct 2019 at 10:15, Dietmar Eggemann  
> wrote:
>>
>> On 19/09/2019 09:33, Vincent Guittot wrote:
>>
>>
>> [...]
>>
>>> @@ -8042,14 +8104,24 @@ static inline void update_sg_lb_stats(struct lb_env 
>>> *env,
>>>   }
>>>   }
>>>
>>> - /* Adjust by relative CPU capacity of the group */
>>> + /* Check if dst cpu is idle and preferred to this group */
>>> + if (env->sd->flags & SD_ASYM_PACKING &&
>>> + env->idle != CPU_NOT_IDLE &&
>>> + sgs->sum_h_nr_running &&
>>> + sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
>>> + sgs->group_asym_packing = 1;
>>> + }
>>> +
>>
>> Since asym_packing check is done per-sg rather per-CPU (like misfit_task), 
>> can you
>> not check for asym_packing in group_classify() directly (like for 
>> overloaded) and
>> so get rid of struct sg_lb_stats::group_asym_packing?
> 
> asym_packing uses a lot of fields of env to set group_asym_packing but
> env is not statistics and I'd like to remove env from
> group_classify() argument so it will use only statistics.
> For now, env is an arg of group_classify only for the imbalance_pct field
> and I have replaced env with imbalance_pct in a patch that is under
> preparation.
> With this change, I can use group_classify outside load_balance()

OK, I see. This relates to the 'update find_idlest_group() to be more
aligned with load_balance()' point mentioned in the cover letter I
assume. To make sure we can use load instead of runnable_load there as well.

[...]


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-01 Thread Dietmar Eggemann
On 01/10/2019 10:14, Vincent Guittot wrote:
> On Mon, 30 Sep 2019 at 18:24, Dietmar Eggemann  
> wrote:
>>
>> Hi Vincent,
>>
>> On 19/09/2019 09:33, Vincent Guittot wrote:

[...]

>>> @@ -7347,7 +7362,7 @@ static int detach_tasks(struct lb_env *env)
>>>   {
>>> struct list_head *tasks = &env->src_rq->cfs_tasks;
>>> struct task_struct *p;
>>> - unsigned long load;
>>> + unsigned long util, load;
>>
>> Minor: Order by length or reduce scope to while loop ?
> 
> I don't get your point here

Nothing dramatic here! Just

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d0c3aa1dc290..a08f342ead89 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7333,8 +7333,8 @@ static const unsigned int sched_nr_migrate_break = 32;
 static int detach_tasks(struct lb_env *env)
 {
struct list_head *tasks = &env->src_rq->cfs_tasks;
-   struct task_struct *p;
unsigned long load, util;
+   struct task_struct *p;
int detached = 0;

lockdep_assert_held(&env->src_rq->lock);

or

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d0c3aa1dc290..4d1864d43ed7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7334,7 +7334,6 @@ static int detach_tasks(struct lb_env *env)
 {
struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p;
-   unsigned long load, util;
int detached = 0;

lockdep_assert_held(&env->src_rq->lock);
@@ -7343,6 +7342,8 @@ static int detach_tasks(struct lb_env *env)
return 0;

while (!list_empty(tasks)) {
+   unsigned long load, util;
+
/*

[...]

>>> @@ -8042,14 +8104,24 @@ static inline void update_sg_lb_stats(struct lb_env 
>>> *env,
>>> }
>>> }
>>>
>>> - /* Adjust by relative CPU capacity of the group */
>>> + /* Check if dst cpu is idle and preferred to this group */
>>
>> s/preferred to/preferred by ? or the preferred CPU of this group ?
> 
> dst cpu doesn't belong to this group. We compare asym_prefer_cpu of
> this group vs dst_cpu which belongs to another group

Ah, in the sense of 'preferred over'. Got it now!

[...]

>>> + if (busiest->group_type == group_imbalanced) {
>>> + /*
>>> +  * In the group_imb case we cannot rely on group-wide averages
>>> +  * to ensure CPU-load equilibrium, try to move any task to fix
>>> +  * the imbalance. The next load balance will take care of
>>> +  * balancing back the system.
>>
>> balancing back ?
> 
> In case of imbalance, we don't try to balance the system but only try
> to get rid of the pinned tasks problem. The system will still be
> unbalanced after the migration and the next load balance will take
> care of balancing the system

OK.

[...]

>>> /*
>>> -  * Avg load of busiest sg can be less and avg load of local sg can
>>> -  * be greater than avg load across all sgs of sd because avg load
>>> -  * factors in sg capacity and sgs with smaller group_type are
>>> -  * skipped when updating the busiest sg:
>>> +  * Try to use spare capacity of local group without overloading it or
>>> +  * emptying busiest
>>>  */
>>> - if (busiest->group_type != group_misfit_task &&
>>> - (busiest->avg_load <= sds->avg_load ||
>>> -  local->avg_load >= sds->avg_load)) {
>>> - env->imbalance = 0;
>>> + if (local->group_type == group_has_spare) {
>>> + if (busiest->group_type > group_fully_busy) {
>>
>> So this could be 'busiest->group_type == group_overloaded' here to match
>> the comment below? Since you handle group_misfit_task,
>> group_asym_packing, group_imbalanced above and return.
> 
> This is just to be more robust in case some new states are added later

OK, although I doubt that additional states can be added easily w/o
carefully auditing the entire lb code ;-)

[...]

>>> + if (busiest->group_weight == 1 || sds->prefer_sibling) {
>>> + /*
>>> +  * When prefer sibling, evenly spread running tasks on
>>> +  * groups.
>>> +  */
>>> + env->balance_type = migrate_task;
>>> + env->imbalance = (busiest->sum_h_nr_running - 
>>> local->sum_h_nr_running) >> 1;
>>> + return;
>>> + }
>>> +
>>> + /*
>>> +  * If there is no overload, we just want to even the number of
>>> +  * idle cpus.
>>> +  */
>>> + env->balance_type = migrate_task;
>>> + env->imbalance = max_t(long, 0, (local->idle_cpus - 
>>> busiest->idle_cpus) >> 1);
>>
>> Why do we need a max_t(long, 0, ...) here and not for the 'if
>> (busiest->group_weight == 1 || sds->prefer_sibling)' case?
> 
> For env->imbalance = (busiest->sum_h_nr_running - local->sum_h_nr_running) >> 
> 1;
> 
> either we have sds->prefer_sibling && 

Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-01 Thread Vincent Guittot
group_asym_packing

On Tue, 1 Oct 2019 at 10:15, Dietmar Eggemann  wrote:
>
> On 19/09/2019 09:33, Vincent Guittot wrote:
>
>
> [...]
>
> > @@ -8042,14 +8104,24 @@ static inline void update_sg_lb_stats(struct lb_env 
> > *env,
> >   }
> >   }
> >
> > - /* Adjust by relative CPU capacity of the group */
> > + /* Check if dst cpu is idle and preferred to this group */
> > + if (env->sd->flags & SD_ASYM_PACKING &&
> > + env->idle != CPU_NOT_IDLE &&
> > + sgs->sum_h_nr_running &&
> > + sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
> > + sgs->group_asym_packing = 1;
> > + }
> > +
>
> Since asym_packing check is done per-sg rather per-CPU (like misfit_task), 
> can you
> not check for asym_packing in group_classify() directly (like for overloaded) 
> and
> so get rid of struct sg_lb_stats::group_asym_packing?

asym_packing uses a lot of fields of env to set group_asym_packing but
env is not statistics and I'd like to remove env from
group_classify() argument so it will use only statistics.
For now, env is an arg of group_classify only for the imbalance_pct field
and I have replaced env with imbalance_pct in a patch that is under
preparation.
With this change, I can use group_classify outside load_balance()

>
> Something like (only compile tested).
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d0c3aa1dc290..b2848d6f8a2a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7692,7 +7692,6 @@ struct sg_lb_stats {
> unsigned int idle_cpus;
> unsigned int group_weight;
> enum group_type group_type;
> -   unsigned int group_asym_packing; /* Tasks should be moved to 
> preferred CPU */
> unsigned long group_misfit_task_load; /* A CPU has a task too big for 
> its capacity */
>  #ifdef CONFIG_NUMA_BALANCING
> unsigned int nr_numa_running;
> @@ -7952,6 +7951,20 @@ group_is_overloaded(struct lb_env *env, struct 
> sg_lb_stats *sgs)
> return false;
>  }
>
> +static inline bool
> +group_has_asym_packing(struct lb_env *env, struct sched_group *sg,
> +  struct sg_lb_stats *sgs)
> +{
> +   if (env->sd->flags & SD_ASYM_PACKING &&
> +   env->idle != CPU_NOT_IDLE &&
> +   sgs->sum_h_nr_running &&
> +   sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
> +   return true;
> +   }
> +
> +   return false;
> +}
> +
>  /*
>   * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
>   * per-CPU capacity than sched_group ref.
> @@ -7983,7 +7996,7 @@ group_type group_classify(struct lb_env *env,
> if (sg_imbalanced(group))
> return group_imbalanced;
>
> -   if (sgs->group_asym_packing)
> +   if (group_has_asym_packing(env, group, sgs))
> return group_asym_packing;
>
> if (sgs->group_misfit_task_load)
> @@ -8076,14 +8089,6 @@ static inline void update_sg_lb_stats(struct lb_env 
> *env,
> }
> }
>
> -   /* Check if dst cpu is idle and preferred to this group */
> -   if (env->sd->flags & SD_ASYM_PACKING &&
> -   env->idle != CPU_NOT_IDLE &&
> -   sgs->sum_h_nr_running &&
> -   sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
> -   sgs->group_asym_packing = 1;
> -   }
> -
> sgs->group_capacity = group->sgc->capacity;
>
> [...]


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-01 Thread Dietmar Eggemann
On 19/09/2019 09:33, Vincent Guittot wrote:


[...]

> @@ -8042,14 +8104,24 @@ static inline void update_sg_lb_stats(struct lb_env 
> *env,
>   }
>   }
>  
> - /* Adjust by relative CPU capacity of the group */
> + /* Check if dst cpu is idle and preferred to this group */
> + if (env->sd->flags & SD_ASYM_PACKING &&
> + env->idle != CPU_NOT_IDLE &&
> + sgs->sum_h_nr_running &&
> + sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
> + sgs->group_asym_packing = 1;
> + }
> +

Since asym_packing check is done per-sg rather per-CPU (like misfit_task), can 
you
not check for asym_packing in group_classify() directly (like for overloaded) 
and
so get rid of struct sg_lb_stats::group_asym_packing?

Something like (only compile tested).

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d0c3aa1dc290..b2848d6f8a2a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7692,7 +7692,6 @@ struct sg_lb_stats {
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
-   unsigned int group_asym_packing; /* Tasks should be moved to preferred 
CPU */
unsigned long group_misfit_task_load; /* A CPU has a task too big for 
its capacity */
 #ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -7952,6 +7951,20 @@ group_is_overloaded(struct lb_env *env, struct 
sg_lb_stats *sgs)
return false;
 }
 
+static inline bool
+group_has_asym_packing(struct lb_env *env, struct sched_group *sg,
+  struct sg_lb_stats *sgs)
+{
+   if (env->sd->flags & SD_ASYM_PACKING &&
+   env->idle != CPU_NOT_IDLE &&
+   sgs->sum_h_nr_running &&
+   sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
+   return true;
+   }
+
+   return false;
+}
+
 /*
  * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
  * per-CPU capacity than sched_group ref.
@@ -7983,7 +7996,7 @@ group_type group_classify(struct lb_env *env,
if (sg_imbalanced(group))
return group_imbalanced;
 
-   if (sgs->group_asym_packing)
+   if (group_has_asym_packing(env, group, sgs))
return group_asym_packing;
 
if (sgs->group_misfit_task_load)
@@ -8076,14 +8089,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
}
}
 
-   /* Check if dst cpu is idle and preferred to this group */
-   if (env->sd->flags & SD_ASYM_PACKING &&
-   env->idle != CPU_NOT_IDLE &&
-   sgs->sum_h_nr_running &&
-   sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
-   sgs->group_asym_packing = 1;
-   }
-
sgs->group_capacity = group->sgc->capacity;

[...]


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-10-01 Thread Vincent Guittot
On Mon, 30 Sep 2019 at 18:24, Dietmar Eggemann  wrote:
>
> Hi Vincent,
>
> On 19/09/2019 09:33, Vincent Guittot wrote:
>
> these are just some comments & questions based on a code study. Haven't
> run any tests with it yet.
>
> [...]
>
> > The type of sched_group has been extended to better reflect the type of
> > imbalance. We now have :
> > group_has_spare
> > group_fully_busy
> > group_misfit_task
> > group_asym_capacity
>
> s/group_asym_capacity/group_asym_packing

Yes, I forgot to change the commit message and the comments

>
> > group_imbalanced
> > group_overloaded
> >
> > Based on the type fo sched_group, load_balance now sets what it wants to
>
> s/fo/for

s/fo/of/

>
> > move in order to fix the imbalance. It can be some load as before but also
> > some utilization, a number of task or a type of task:
> > migrate_task
> > migrate_util
> > migrate_load
> > migrate_misfit
> >
> > This new load_balance algorithm fixes several pending wrong tasks
> > placement:
> > - the 1 task per CPU case with asymmetrics system
>
> s/asymmetrics/asymmetric

yes

>
> This stands for ASYM_CPUCAPACITY and ASYM_PACKING, right?

yes

>
> [...]
>
> >   #define LBF_ALL_PINNED   0x01
> > @@ -7115,7 +7130,7 @@ struct lb_env {
> > unsigned intloop_max;
> >
> > enum fbq_type   fbq_type;
> > - enum group_type src_grp_type;
> > + enum migration_type balance_type;
>
> Minor thing:
>
> Why not
>  enum migration_type migration_type;
> or
>  enum balance_type balance_type;
>
> We do the same for other enums like fbq_type or group_type.

yes, I can align

>
> > struct list_headtasks;
> >   };
> >
>
> The detach_tasks() comment still mentions only runnable load.

ok

>
> > @@ -7347,7 +7362,7 @@ static int detach_tasks(struct lb_env *env)
> >   {
> > struct list_head *tasks = &env->src_rq->cfs_tasks;
> > struct task_struct *p;
> > - unsigned long load;
> > + unsigned long util, load;
>
> Minor: Order by length or reduce scope to while loop ?

I don't get your point here

>
> > int detached = 0;
> >
> > lockdep_assert_held(&env->src_rq->lock);
> > @@ -7380,19 +7395,53 @@ static int detach_tasks(struct lb_env *env)
> > if (!can_migrate_task(p, env))
> > goto next;
> >
> > - load = task_h_load(p);
> > + switch (env->balance_type) {
> > + case migrate_load:
> > + load = task_h_load(p);
> >
> > - if (sched_feat(LB_MIN) && load < 16 && 
> > !env->sd->nr_balance_failed)
> > - goto next;
> > + if (sched_feat(LB_MIN) &&
> > + load < 16 && !env->sd->nr_balance_failed)
> > + goto next;
> >
> > - if ((load / 2) > env->imbalance)
> > - goto next;
> > + if ((load / 2) > env->imbalance)
> > + goto next;
> > +
> > + env->imbalance -= load;
> > + break;
> > +
> > + case migrate_util:
> > + util = task_util_est(p);
> > +
> > + if (util > env->imbalance)
> > + goto next;
> > +
> > + env->imbalance -= util;
> > + break;
> > +
> > + case migrate_task:
> > + /* Migrate task */
>
> Minor: IMHO, this comment is not necessary.

yes

>
> > + env->imbalance--;
> > + break;
> > +
> > + case migrate_misfit:
> > + load = task_h_load(p);
> > +
> > + /*
> > +  * utilization of misfit task might decrease a bit
>
> This patch still uses load. IMHO this comments becomes true only with 08/10.

even with 8/10 it's not correct and it has been removed.
I'm going to remove it.

>
> [...]
>
> > @@ -7707,13 +7755,11 @@ static inline void init_sd_lb_stats(struct 
> > sd_lb_stats *sds)
> > *sds = (struct sd_lb_stats){
> > .busiest = NULL,
> > .local = NULL,
> > - .total_running = 0UL,
> > .total_load = 0UL,
> > .total_capacity = 0UL,
> > .busiest_stat = {
> > - .avg_load = 0UL,
>
> There is a sentence in the comment above explaining why avg_load has to
> be cleared. And IMHO local group isn't cleared anymore but set/initialized.

Yes, I have to update it

>
> > - .sum_h_nr_running = 0,
> > - .group_type = group_other,
> > + .idle_cpus = UINT_MAX,
> > + .group_type = group_has_spare,
> > },
> > };
> >   }
>
> [...]
>
> > @@ -8042,14 +8104,24 @@ static inline void update_sg_lb_stats(struct 

Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-09-30 Thread Dietmar Eggemann
Hi Vincent,

On 19/09/2019 09:33, Vincent Guittot wrote:

these are just some comments & questions based on a code study. Haven't
run any tests with it yet.

[...]

> The type of sched_group has been extended to better reflect the type of
> imbalance. We now have :
> group_has_spare
> group_fully_busy
> group_misfit_task
> group_asym_capacity

s/group_asym_capacity/group_asym_packing

> group_imbalanced
> group_overloaded
>
> Based on the type fo sched_group, load_balance now sets what it wants to

s/fo/for

> move in order to fix the imbalance. It can be some load as before but also
> some utilization, a number of task or a type of task:
> migrate_task
> migrate_util
> migrate_load
> migrate_misfit
>
> This new load_balance algorithm fixes several pending wrong tasks
> placement:
> - the 1 task per CPU case with asymmetrics system

s/asymmetrics/asymmetric

This stands for ASYM_CPUCAPACITY and ASYM_PACKING, right?

[...]

>   #define LBF_ALL_PINNED   0x01
> @@ -7115,7 +7130,7 @@ struct lb_env {
> unsigned intloop_max;
>  
> enum fbq_type   fbq_type;
> - enum group_type src_grp_type;
> + enum migration_type balance_type;

Minor thing:

Why not
 enum migration_type migration_type;
or
 enum balance_type balance_type;

We do the same for other enums like fbq_type or group_type.
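
I.e. something like this (just to spell out the first variant), keeping the
member name identical to the enum tag as the struct already does for fbq_type:

	enum fbq_type		fbq_type;
	enum migration_type	migration_type;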

> struct list_headtasks;
>   };
>  

The detach_tasks() comment still mentions only runnable load.

> @@ -7347,7 +7362,7 @@ static int detach_tasks(struct lb_env *env)
>   {
> struct list_head *tasks = &env->src_rq->cfs_tasks;
> struct task_struct *p;
> - unsigned long load;
> + unsigned long util, load;

Minor: Order by length or reduce scope to while loop ?
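
Something like this, e.g. (untested, just to illustrate the "reduce scope"
option):

	while (!list_empty(tasks)) {
		unsigned long util, load;	/* scope limited to the loop body */

		/* ... */
	}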

> int detached = 0;
>  
> lockdep_assert_held(&env->src_rq->lock);
> @@ -7380,19 +7395,53 @@ static int detach_tasks(struct lb_env *env)
> if (!can_migrate_task(p, env))
> goto next;
>  
> - load = task_h_load(p);
> + switch (env->balance_type) {
> + case migrate_load:
> + load = task_h_load(p);
>  
> - if (sched_feat(LB_MIN) && load < 16 && 
> !env->sd->nr_balance_failed)
> - goto next;
> + if (sched_feat(LB_MIN) &&
> + load < 16 && !env->sd->nr_balance_failed)
> + goto next;
>  
> - if ((load / 2) > env->imbalance)
> - goto next;
> + if ((load / 2) > env->imbalance)
> + goto next;
> +
> + env->imbalance -= load;
> + break;
> +
> + case migrate_util:
> + util = task_util_est(p);
> +
> + if (util > env->imbalance)
> + goto next;
> +
> + env->imbalance -= util;
> + break;
> +
> + case migrate_task:
> + /* Migrate task */

Minor: IMHO, this comment is not necessary.

> + env->imbalance--;
> + break;
> +
> + case migrate_misfit:
> + load = task_h_load(p);
> +
> + /*
> +  * utilization of misfit task might decrease a bit

This patch still uses load. IMHO this comment becomes true only with 08/10.

[...]

> @@ -7707,13 +7755,11 @@ static inline void init_sd_lb_stats(struct 
> sd_lb_stats *sds)
> *sds = (struct sd_lb_stats){
> .busiest = NULL,
> .local = NULL,
> - .total_running = 0UL,
> .total_load = 0UL,
> .total_capacity = 0UL,
> .busiest_stat = {
> - .avg_load = 0UL,

There is a sentence in the comment above explaining why avg_load has to
be cleared. And IMHO local group isn't cleared anymore but set/initialized.
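
FWIW, the way I read the new initial values (an assumption on my side, please
correct me if I'm wrong): they act as a "least busy possible" placeholder, so
update_sd_pick_busiest() will replace them with the first real group it sees:

	.busiest_stat = {
		.idle_cpus = UINT_MAX,		/* any real group has fewer idle CPUs */
		.group_type = group_has_spare,	/* lowest pulling priority */
	},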

> - .sum_h_nr_running = 0,
> - .group_type = group_other,
> + .idle_cpus = UINT_MAX,
> + .group_type = group_has_spare,
> },
> };
>   }

[...]

> @@ -8042,14 +8104,24 @@ static inline void update_sg_lb_stats(struct lb_env 
> *env,
> }
> }
>  
> - /* Adjust by relative CPU capacity of the group */
> + /* Check if dst cpu is idle and preferred to this group */

s/preferred to/preferred by ? or the preferred CPU of this group ?

[...]

> @@ -8283,69 +8363,133 @@ static inline void update_sd_lb_stats(struct lb_env 
> *env, struct sd_lb_stats *sd
>*/
>   static inline void calculate_imbalance(struct lb_env *env, struct
sd_lb_stats *sds)
>   {
> - unsigned long max_pull, load_above_capacity = ~0UL;
> struct sg_lb_stats *local, *busiest;
>  
>

Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-09-30 Thread Vincent Guittot
On Mon, 30 Sep 2019 at 03:13, Rik van Riel  wrote:
>
> On Thu, 2019-09-19 at 09:33 +0200, Vincent Guittot wrote:
> >
> > Also the load balance decisions have been consolidated in the 3
> > functions
> > below after removing the few bypasses and hacks of the current code:
> > - update_sd_pick_busiest() select the busiest sched_group.
> > - find_busiest_group() checks if there is an imbalance between local
> > and
> >   busiest group.
> > - calculate_imbalance() decides what have to be moved.
>
> I really like the direction this series is going.

Thanks

>
> However, I suppose I should run these patches for
> a few days with some of our test workloads before
> I send out an ack for this patch :)

Yes, more tests on different platforms are welcome

>
> --
> All Rights Reversed.


Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-09-29 Thread Rik van Riel
On Thu, 2019-09-19 at 09:33 +0200, Vincent Guittot wrote:
> 
> Also the load balance decisions have been consolidated in the 3
> functions
> below after removing the few bypasses and hacks of the current code:
> - update_sd_pick_busiest() select the busiest sched_group.
> - find_busiest_group() checks if there is an imbalance between local
> and
>   busiest group.
> - calculate_imbalance() decides what have to be moved.

I really like the direction this series is going.

However, I suppose I should run these patches for
a few days with some of our test workloads before
I send out an ack for this patch :)

-- 
All Rights Reversed.


[PATCH v3 04/10] sched/fair: rework load_balance

2019-09-19 Thread Vincent Guittot
The load_balance algorithm contains some heuristics which have become
meaningless since the rework of the scheduler's metrics, such as the
introduction of PELT.

Furthermore, load is an ill-suited metric for solving certain task
placement imbalance scenarios. For instance, in the presence of idle CPUs,
we should simply try to get at least one task per CPU, whereas the current
load-based algorithm can actually leave idle CPUs alone simply because the
load is somewhat balanced. The current algorithm ends up creating virtual
and meaningless values like avg_load_per_task, or tweaking the state of a
group to make it look overloaded when it is not, in order to try to migrate
tasks.

load_balance should better qualify the imbalance of the group and clearly
define what has to be moved to fix this imbalance.

The type of sched_group has been extended to better reflect the type of
imbalance. We now have :
group_has_spare
group_fully_busy
group_misfit_task
group_asym_capacity
group_imbalanced
group_overloaded

Based on the type fo sched_group, load_balance now sets what it wants to
move in order to fix the imbalance. It can be some load as before but also
some utilization, a number of tasks or a type of task:
migrate_task
migrate_util
migrate_load
migrate_misfit

This new load_balance algorithm fixes several pending cases of wrong task
placement:
- the 1 task per CPU case with asymmetrics system
- the case of cfs tasks preempted by other classes
- the case of tasks not evenly spread on groups with spare capacity

Also the load balance decisions have been consolidated in the 3 functions
below after removing the few bypasses and hacks of the current code:
- update_sd_pick_busiest() selects the busiest sched_group.
- find_busiest_group() checks if there is an imbalance between local and
  busiest group.
- calculate_imbalance() decides what has to be moved.

Finally, the now unused field total_running of struct sd_lb_stats has been
removed.
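
To make the intent concrete, here is a toy user-space model of the decision
flow (illustration only and heavily simplified: group_asym_packing and
group_imbalanced are not modelled, and pick_migration_type() /
account_detached() are names invented for this sketch; the real logic lives
in the three functions above plus detach_tasks()):

#include <stdio.h>

enum group_type {			/* ordered by pulling priority */
	group_has_spare = 0,
	group_fully_busy,
	group_misfit_task,
	group_asym_packing,
	group_imbalanced,
	group_overloaded
};

enum migration_type {
	migrate_load = 0,
	migrate_util,
	migrate_task,
	migrate_misfit
};

struct task {
	unsigned long h_load;		/* contribution to the runnable load */
	unsigned long util_est;		/* estimated utilization */
};

/* What should be moved, given the busiest group's state? (simplified) */
static enum migration_type pick_migration_type(enum group_type busiest,
					       int local_has_spare)
{
	if (busiest == group_misfit_task)
		return migrate_misfit;	/* move the one misfit task */
	if (local_has_spare)
		return busiest > group_fully_busy ?
			migrate_util :	/* fill local spare capacity */
			migrate_task;	/* even out the number of tasks */
	return migrate_load;		/* both groups busy: balance the load */
}

/* Per-task accounting done by the detach loop for each migration type. */
static void account_detached(enum migration_type type,
			     unsigned long *imbalance, const struct task *p)
{
	switch (type) {
	case migrate_load:	*imbalance -= p->h_load;	break;
	case migrate_util:	*imbalance -= p->util_est;	break;
	case migrate_task:	(*imbalance)--;			break;
	case migrate_misfit:	*imbalance = 0;			break;
	}
}

int main(void)
{
	struct task p = { .h_load = 512, .util_est = 300 };
	/* local group has spare room, busiest is overloaded -> migrate_util */
	unsigned long imbalance = 700;	/* here: spare capacity of the local group */
	enum migration_type type = pick_migration_type(group_overloaded, 1);

	account_detached(type, &imbalance, &p);
	printf("migration_type=%d, remaining imbalance=%lu\n", type, imbalance);
	return 0;
}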

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c | 585 ++--
 1 file changed, 380 insertions(+), 205 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 017aad0..d33379c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7078,11 +7078,26 @@ static unsigned long __read_mostly 
max_load_balance_interval = HZ/10;
 
 enum fbq_type { regular, remote, all };
 
+/*
+ * group_type describes the group of CPUs at the moment of the load balance.
+ * The enum is ordered by pulling priority, with the group with lowest priority
+ * first so the group_type can be simply compared when selecting the busiest
+ * group. see update_sd_pick_busiest().
+ */
 enum group_type {
-   group_other = 0,
+   group_has_spare = 0,
+   group_fully_busy,
group_misfit_task,
+   group_asym_packing,
group_imbalanced,
-   group_overloaded,
+   group_overloaded
+};
+
+enum migration_type {
+   migrate_load = 0,
+   migrate_util,
+   migrate_task,
+   migrate_misfit
 };
 
 #define LBF_ALL_PINNED 0x01
@@ -7115,7 +7130,7 @@ struct lb_env {
unsigned intloop_max;
 
enum fbq_type   fbq_type;
-   enum group_type src_grp_type;
+   enum migration_type balance_type;
struct list_headtasks;
 };
 
@@ -7347,7 +7362,7 @@ static int detach_tasks(struct lb_env *env)
 {
	struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p;
-   unsigned long load;
+   unsigned long util, load;
int detached = 0;
 
	lockdep_assert_held(&env->src_rq->lock);
@@ -7380,19 +7395,53 @@ static int detach_tasks(struct lb_env *env)
if (!can_migrate_task(p, env))
goto next;
 
-   load = task_h_load(p);
+   switch (env->balance_type) {
+   case migrate_load:
+   load = task_h_load(p);
 
-   if (sched_feat(LB_MIN) && load < 16 && 
!env->sd->nr_balance_failed)
-   goto next;
+   if (sched_feat(LB_MIN) &&
+   load < 16 && !env->sd->nr_balance_failed)
+   goto next;
 
-   if ((load / 2) > env->imbalance)
-   goto next;
+   if ((load / 2) > env->imbalance)
+   goto next;
+
+   env->imbalance -= load;
+   break;
+
+   case migrate_util:
+   util = task_util_est(p);
+
+   if (util > env->imbalance)
+   goto next;
+
+   env->imbalance -= util;
+   break;
+
+   case migrate_task:
+   /* Migrate task */
+   env->imbalance--;
+   break;
+
+