I've Cc:-ed a handful of gents who worked on CFS bandwidth details to widen the 
discussion. 
Patch quoted below.

Looks like a real bug that needs to be fixed - and at first sight the quota of
1000 looks very low - could we improve the arithmetic perhaps?

A low quota of 1000 is used because there are many VMs or containers
provisioned on the system that is triggering the bug, right?

Thanks,

        Ingo

* Phil Auld <pa...@redhat.com> wrote:

> From: "Phil Auld" <pa...@redhat.com>
> 
> sched/fair: Avoid throttle_list starvation with low cfs quota
> 
> With a very low cpu.cfs_quota_us setting, such as the minimum of 1000, 
> distribute_cfs_runtime may not empty the throttled_list before it runs 
> out of runtime to distribute. In that case, due to the change in commit
> c06f04c70489 to put throttled entries at the head of the list, later entries
> on the list will starve.  Essentially, the same X processes will get pulled 
> off the list, given CPU time and then, when expired, get put back on the 
> head of the list where distribute_cfs_runtime will give runtime to the same 
> set of processes leaving the rest.
> 
> Fix the issue by setting a bit in struct cfs_bandwidth when 
> distribute_cfs_runtime is running, so that the code in throttle_cfs_rq can 
> decide to put the throttled entry on the tail or the head of the list.  The 
> bit is set/cleared by the callers of distribute_cfs_runtime while they hold 
> cfs_bandwidth->lock. 
> 
> Signed-off-by: Phil Auld <pa...@redhat.com>
> Fixes: c06f04c70489 ("sched: Fix potential near-infinite 
> distribute_cfs_runtime() loop")
> Cc: Peter Zijlstra <pet...@infradead.org>
> Cc: Ingo Molnar <mi...@kernel.org>
> Cc: sta...@vger.kernel.org
> ---
> 
> This is easy to reproduce with a handful of cpu consumers. I use crash on 
> the live system. In some cases you can simply look at the throttled list and 
> see the later entries are not changing:
> 
> crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s 
> cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
>   1     ffff90b56cb2d200  -976050
>   2     ffff90b56cb2cc00  -484925
>   3     ffff90b56cb2bc00  -658814
>   4     ffff90b56cb2ba00  -275365
>   5     ffff90b166a45600  -135138
>   6     ffff90b56cb2da00  -282505
>   7     ffff90b56cb2e000  -148065
>   8     ffff90b56cb2fa00  -872591
>   9     ffff90b56cb2c000  -84687
>  10     ffff90b56cb2f000  -87237
>  11     ffff90b166a40a00  -164582
> crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s 
> cfs_rq.runtime_remaining | paste - - | awk '{print $1"  "$4}' | pr -t -n3
>   1     ffff90b56cb2d200  -994147
>   2     ffff90b56cb2cc00  -306051
>   3     ffff90b56cb2bc00  -961321
>   4     ffff90b56cb2ba00  -24490
>   5     ffff90b166a45600  -135138
>   6     ffff90b56cb2da00  -282505
>   7     ffff90b56cb2e000  -148065
>   8     ffff90b56cb2fa00  -872591
>   9     ffff90b56cb2c000  -84687
>  10     ffff90b56cb2f000  -87237
>  11     ffff90b166a40a00  -164582
> 
> Sometimes it is easier to see by finding a process getting starved and
> looking at the sched_info:
> 
> crash> task ffff8eb765994500 sched_info
> PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
>   sched_info = {
>     pcount = 8, 
>     run_delay = 697094208, 
>     last_arrival = 240260125039, 
>     last_queued = 240260327513
>   }, 
> crash> task ffff8eb765994500 sched_info
> PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
>   sched_info = {
>     pcount = 8, 
>     run_delay = 697094208, 
>     last_arrival = 240260125039, 
>     last_queued = 240260327513
>   }, 
> 
> 
>  fair.c  |   22 +++++++++++++++++++---
>  sched.h |    2 ++
>  2 files changed, 21 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7fc4a371bdd2..f88e00705b55 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4476,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
>  
>       /*
>        * Add to the _head_ of the list, so that an already-started
> -      * distribute_cfs_runtime will not see us
> +      * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
> +      * not running add to the tail so that later runqueues don't get starved.
>        */
> -     list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> +     if (cfs_b->distribute_running)
> +             list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> +     else
> +             list_add_tail_rcu(&cfs_rq->throttled_list, 
> &cfs_b->throttled_cfs_rq);
>  
>       /*
>        * If we're the first throttled task, make sure the bandwidth
> @@ -4622,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct 
> cfs_bandwidth *cfs_b, int overrun)
>        * in us over-using our runtime if it is all used during this loop, but
>        * only by limited amounts in that extreme case.
>        */
> -     while (throttled && cfs_b->runtime > 0) {
> +     while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
>               runtime = cfs_b->runtime;
> +             cfs_b->distribute_running = 1;
>               raw_spin_unlock(&cfs_b->lock);
>               /* we can't nest cfs_b->lock while distributing bandwidth */
>               runtime = distribute_cfs_runtime(cfs_b, runtime,
>                                                runtime_expires);
>               raw_spin_lock(&cfs_b->lock);
>  
> +             cfs_b->distribute_running = 0;
>               throttled = !list_empty(&cfs_b->throttled_cfs_rq);
>  
>               cfs_b->runtime -= min(runtime, cfs_b->runtime);
> @@ -4740,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct 
> cfs_bandwidth *cfs_b)
>  
>       /* confirm we're still not at a refresh boundary */
>       raw_spin_lock(&cfs_b->lock);
> +     if (cfs_b->distribute_running) {
> +             raw_spin_unlock(&cfs_b->lock);
> +             return;
> +     }
> +
>       if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
>               raw_spin_unlock(&cfs_b->lock);
>               return;
> @@ -4749,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct 
> cfs_bandwidth *cfs_b)
>               runtime = cfs_b->runtime;
>  
>       expires = cfs_b->runtime_expires;
> +     if (runtime)
> +             cfs_b->distribute_running = 1;
> +
>       raw_spin_unlock(&cfs_b->lock);
>  
>       if (!runtime)
> @@ -4759,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct 
> cfs_bandwidth *cfs_b)
>       raw_spin_lock(&cfs_b->lock);
>       if (expires == cfs_b->runtime_expires)
>               cfs_b->runtime -= min(runtime, cfs_b->runtime);
> +     cfs_b->distribute_running = 0;
>       raw_spin_unlock(&cfs_b->lock);
>  }
>  
> @@ -4867,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
>       cfs_b->period_timer.function = sched_cfs_period_timer;
>       hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>       cfs_b->slack_timer.function = sched_cfs_slack_timer;
> +     cfs_b->distribute_running = 0;
>  }
>  
>  static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 455fa330de04..9683f458aec7 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -346,6 +346,8 @@ struct cfs_bandwidth {
>       int                     nr_periods;
>       int                     nr_throttled;
>       u64                     throttled_time;
> +
> +     bool                    distribute_running;
>  #endif
>  };
>  
> 
> 
> -- 

Reply via email to