> 
> The type of sched_group has been extended to better reflect the type of
> imbalance. We now have :
>       group_has_spare
>       group_fully_busy
>       group_misfit_task
>       group_asym_capacity
>       group_imbalanced
>       group_overloaded

How is group_fully_busy different from group_overloaded?

> 
> Based on the type of sched_group, load_balance now sets what it wants to
> move in order to fix the imbalance. It can be some load as before but
> also some utilization, a number of tasks or a type of task:
>       migrate_task
>       migrate_util
>       migrate_load
>       migrate_misfit

Can we combine migrate_util and migrate_misfit into one type?

> @@ -7361,19 +7357,46 @@ static int detach_tasks(struct lb_env *env)
>               if (!can_migrate_task(p, env))
>                       goto next;
>  
> -             load = task_h_load(p);
> +             if (env->src_grp_type == migrate_load) {
> +                     unsigned long load = task_h_load(p);
>  
> -             if (sched_feat(LB_MIN) && load < 16 && 
> !env->sd->nr_balance_failed)
> -                     goto next;
> +                     if (sched_feat(LB_MIN) &&
> +                         load < 16 && !env->sd->nr_balance_failed)
> +                             goto next;
> +
> +                     if ((load / 2) > env->imbalance)
> +                             goto next;

I know this existed before too, but if the load is exactly or around 2x of
env->imbalance, the resulting imbalance after the load balance operation
would still be around env->imbalance, and we may lose some cache affinity
too.

Can we do something like:
                if (2 * load > 3 * env->imbalance)
                        goto next;
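
To make that concrete with made-up numbers: say env->imbalance is 1000 and
task_h_load(p) is 1900. The current cut-off (load / 2 > env->imbalance, i.e.
950 > 1000) lets the task move, and after the migration the groups are
roughly 900 imbalanced in the other direction, so we paid for a migration
without really fixing anything. With 2 * load > 3 * env->imbalance
(3800 > 3000) the task is skipped, i.e. only tasks up to 1.5x the imbalance
are moved.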

> @@ -7690,14 +7711,14 @@ static inline void init_sd_lb_stats(struct 
> sd_lb_stats *sds)
>       *sds = (struct sd_lb_stats){
>               .busiest = NULL,
>               .local = NULL,
> -             .total_running = 0UL,
>               .total_load = 0UL,
>               .total_capacity = 0UL,
>               .busiest_stat = {
>                       .avg_load = 0UL,
>                       .sum_nr_running = 0,
>                       .sum_h_nr_running = 0,
> -                     .group_type = group_other,
> +                     .idle_cpus = UINT_MAX,

Why are we initializing idle_cpus to UINT_MAX? Shouldn't this be set to 0?
I only see an increment and a compare. If the idea is that, among the
candidate groups, the one with the fewest idle CPUs wins the busiest
comparison (so the first real group always beats the initial value), a
comment saying so would help.

> +                     .group_type = group_has_spare,
>               },
>       };
>  }
>  
>  static inline enum
> -group_type group_classify(struct sched_group *group,
> +group_type group_classify(struct lb_env *env,
> +                       struct sched_group *group,
>                         struct sg_lb_stats *sgs)
>  {
> -     if (sgs->group_no_capacity)
> +     if (group_is_overloaded(env, sgs))
>               return group_overloaded;
>  
>       if (sg_imbalanced(group))
> @@ -7953,7 +7975,13 @@ group_type group_classify(struct sched_group *group,
>       if (sgs->group_misfit_task_load)
>               return group_misfit_task;
>  
> -     return group_other;
> +     if (sgs->group_asym_capacity)
> +             return group_asym_capacity;
> +
> +     if (group_has_capacity(env, sgs))
> +             return group_has_spare;
> +
> +     return group_fully_busy;

If it's not overloaded but also doesn't have capacity, does that mean its
capacity is completely filled? Can't we consider that the same as
overloaded?
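
For my own understanding, the two existing helpers are roughly the following
(paraphrased from memory, not from this patch, so please correct me if this
is off):

	group_has_capacity():	sum_nr_running < group_weight, or
				group_capacity * 100 > group_util * imbalance_pct
	group_is_overloaded():	sum_nr_running > group_weight, and
				group_capacity * 100 < group_util * imbalance_pct

so group_fully_busy would cover the band in between, e.g. exactly one running
task per CPU with the utilization already inside the imbalance_pct margin of
the group capacity.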

>  }
>  
>  
> -     if (sgs->sum_h_nr_running)
> -             sgs->load_per_task = sgs->group_load / sgs->sum_h_nr_running;
> +     sgs->group_capacity = group->sgc->capacity;
>  
>       sgs->group_weight = group->group_weight;
>  
> -     sgs->group_no_capacity = group_is_overloaded(env, sgs);
> -     sgs->group_type = group_classify(group, sgs);
> +     sgs->group_type = group_classify(env, group, sgs);
> +
> +     /* Computing avg_load makes sense only when group is overloaded */
> +     if (sgs->group_type != group_overloaded)
> +             sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
> +                             sgs->group_capacity;

Mismatch between comment and code? The comment says avg_load makes sense
only when the group is overloaded, but the condition computes it when the
group is *not* overloaded.
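
If the comment reflects the intent, I would have expected the check to be
inverted, something like (just my guess at what was meant):

		/* Computing avg_load makes sense only when group is overloaded */
		if (sgs->group_type == group_overloaded)
			sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
					sgs->group_capacity;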

> @@ -8079,11 +8120,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>       if (sgs->group_type < busiest->group_type)
>               return false;
>  
> -     if (sgs->avg_load <= busiest->avg_load)
> +     /* Select the overloaded group with highest avg_load */
> +     if (sgs->group_type == group_overloaded &&
> +         sgs->avg_load <= busiest->avg_load)
> +             return false;
> +
> +     /* Prefer to move from lowest priority CPU's work */
> +     if (sgs->group_type == group_asym_capacity && sds->busiest &&
> +         sched_asym_prefer(sg->asym_prefer_cpu, 
> sds->busiest->asym_prefer_cpu))
>               return false;

I thought this should have been:

        /* Prefer to move from lowest priority CPU's work */
        if (sgs->group_type == group_asym_capacity && sds->busiest &&
            sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
                return true;

> @@ -8357,72 +8318,115 @@ static inline void calculate_imbalance(struct lb_env 
> *env, struct sd_lb_stats *s
>       if (busiest->group_type == group_imbalanced) {
>               /*
>                * In the group_imb case we cannot rely on group-wide averages
> -              * to ensure CPU-load equilibrium, look at wider averages. XXX
> +              * to ensure CPU-load equilibrium, try to move any task to fix
> +              * the imbalance. The next load balance will take care of
> +              * balancing back the system.
>                */
> -             busiest->load_per_task =
> -                     min(busiest->load_per_task, sds->avg_load);
> +             env->src_grp_type = migrate_task;
> +             env->imbalance = 1;
> +             return;
>       }
>  
> -     /*
> -      * Avg load of busiest sg can be less and avg load of local sg can
> -      * be greater than avg load across all sgs of sd because avg load
> -      * factors in sg capacity and sgs with smaller group_type are
> -      * skipped when updating the busiest sg:
> -      */
> -     if (busiest->group_type != group_misfit_task &&
> -         (busiest->avg_load <= sds->avg_load ||
> -          local->avg_load >= sds->avg_load)) {
> -             env->imbalance = 0;
> -             return fix_small_imbalance(env, sds);
> +     if (busiest->group_type == group_misfit_task) {
> +             /* Set imbalance to allow misfit task to be balanced. */
> +             env->src_grp_type = migrate_misfit;
> +             env->imbalance = busiest->group_misfit_task_load;
> +             return;
>       }
>  
>       /*
> -      * If there aren't any idle CPUs, avoid creating some.
> +      * Try to use spare capacity of local group without overloading it or
> +      * emptying busiest
>        */
> -     if (busiest->group_type == group_overloaded &&
> -         local->group_type   == group_overloaded) {
> -             load_above_capacity = busiest->sum_h_nr_running * 
> SCHED_CAPACITY_SCALE;
> -             if (load_above_capacity > busiest->group_capacity) {
> -                     load_above_capacity -= busiest->group_capacity;
> -                     load_above_capacity *= scale_load_down(NICE_0_LOAD);
> -                     load_above_capacity /= busiest->group_capacity;
> -             } else
> -                     load_above_capacity = ~0UL;
> +     if (local->group_type == group_has_spare) {
> +             long imbalance;
> +
> +             /*
> +              * If there is no overload, we just want to even the number of
> +              * idle cpus.
> +              */
> +             env->src_grp_type = migrate_task;
> +             imbalance = max_t(long, 0, (local->idle_cpus - 
> busiest->idle_cpus) >> 1);

Shouldn't this be the following?

                imbalance = max_t(long, 0, (busiest->idle_cpus - local->idle_cpus) >> 1);
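
As a concrete (made-up) case for this branch: local->idle_cpus = 4 and
busiest->idle_cpus = 0. The patch's expression gives (4 - 0) >> 1 = 2 tasks
to pull towards local, while the swapped one gives (0 - 4) >> 1, which
max_t() clamps to 0. Since this path runs when the local group is the one
with spare capacity, the right form depends on whether local or busiest is
expected to have the more idle CPUs here.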
> +
> +             if (sds->prefer_sibling)
> +                     /*
> +                      * When prefer sibling, evenly spread running tasks on
> +                      * groups.
> +                      */
> +                     imbalance = (busiest->sum_nr_running - 
> local->sum_nr_running) >> 1;
> +
> +             if (busiest->group_type > group_fully_busy) {
> +                     /*
> +                      * If busiest is overloaded, try to fill spare
> +                      * capacity. This might end up creating spare capacity
> +                      * in busiest or busiest still being overloaded but
> +                      * there is no simple way to directly compute the
> +                      * amount of load to migrate in order to balance the
> +                      * system.
> +                      */
> +                     env->src_grp_type = migrate_util;
> +                     imbalance = max(local->group_capacity, 
> local->group_util) -
> +                                 local->group_util;
> +             }
> +
> +             env->imbalance = imbalance;
> +             return;
>       }
>  
>       /*
> -      * We're trying to get all the CPUs to the average_load, so we don't
> -      * want to push ourselves above the average load, nor do we wish to
> -      * reduce the max loaded CPU below the average load. At the same time,
> -      * we also don't want to reduce the group load below the group
> -      * capacity. Thus we look for the minimum possible imbalance.
> +      * Local is fully busy but has to take more load to relieve the
> +      * busiest group
>        */
> -     max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
> +     if (local->group_type < group_overloaded) {


What action are we taking if we find local->group_type to be
group_imbalanced or group_misfit? We will fall through to here, but I don't
know if it is right to look at avg_load in that case.

> +             /*
> +              * Local will become overloaded so the avg_load metrics are
> +              * finally needed
> +              */
>  
> -     /* How much load to actually move to equalise the imbalance */
> -     env->imbalance = min(
> -             max_pull * busiest->group_capacity,
> -             (sds->avg_load - local->avg_load) * local->group_capacity
> -     ) / SCHED_CAPACITY_SCALE;
> +             local->avg_load = (local->group_load*SCHED_CAPACITY_SCALE)
> +                                             / local->group_capacity;
>  
> -     /* Boost imbalance to allow misfit task to be balanced. */
> -     if (busiest->group_type == group_misfit_task) {
> -             env->imbalance = max_t(long, env->imbalance,
> -                                    busiest->group_misfit_task_load);
> +             sds->avg_load = (SCHED_CAPACITY_SCALE * sds->total_load)
> +                                             / sds->total_capacity;
>       }
>  
>       /*
> -      * if *imbalance is less than the average load per runnable task
> -      * there is no guarantee that any tasks will be moved so we'll have
> -      * a think about bumping its value to force at least one task to be
> -      * moved
> +      * Both groups are or will become overloaded and we're trying to get
> +      * all the CPUs to the average_load, so we don't want to push
> +      * ourselves above the average load, nor do we wish to reduce the
> +      * max loaded CPU below the average load. At the same time, we also
> +      * don't want to reduce the group load below the group capacity.
> +      * Thus we look for the minimum possible imbalance.
>        */
> -     if (env->imbalance < busiest->load_per_task)
> -             return fix_small_imbalance(env, sds);
> +     env->src_grp_type = migrate_load;
> +     env->imbalance = min(
> +             (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
> +             (sds->avg_load - local->avg_load) * local->group_capacity
> +     ) / SCHED_CAPACITY_SCALE;
>  }

We calculated avg_load only for the !group_overloaded case in
update_sg_lb_stats, but we seem to be using it for the group_overloaded case
too: if busiest is group_overloaded, the busiest->avg_load used in the min()
above was apparently never computed.


-- 
Thanks and Regards
Srikar Dronamraju
