Peter Zijlstra <[email protected]> writes:

> Roman reported a 3 cpu lockup scenario involving __start_cfs_bandwidth().
>
> The more I look at that code the more I'm convinced it's crack; that
> entire __start_cfs_bandwidth() thing is brain melting, we don't need to
> cancel a timer before starting it, *hrtimer_start*() will happily remove
> the timer for you if it's still enqueued.
>
> Removing that removes a big part of the problem: no more ugly cancel
> loop to get stuck in.
>
> So now, if I understand things right, the entire reason you have this
> cfs_b->lock guarded ->timer_active nonsense is to make sure we don't
> accidentally loose the timer.
>
> It appears to me that it should be possible to guarantee the same by
> unconditionally (re)starting the timer when !queued. Because regardless
> of what hrtimer::function returns, if we beat it to (re)enqueueing the
> timer, it doesn't matter.
>
> Now, because hrtimers don't come with any serialization guarantees we
> must ensure both handler and (re)start loop serialize their access to
> the hrtimer to avoid both trying to forward the timer at the same
> time.
>
> Update the rt bandwidth timer to match.
>
> This effectively reverts: 09dc4ab03936 ("sched/fair: Fix
> tg_set_cfs_bandwidth() deadlock on rq->lock").
>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Paul Turner <[email protected]>
> Cc: Ben Segall <[email protected]>
> Reported-by: Roman Gushchin <[email protected]>

Reviewed-by: Ben Segall <[email protected]>

So much nicer. Only a docs/comment nit: s/loose/lose/ (once in the
changelog, once in the new comment in start_bandwidth_timer()).
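
For anyone else trying to follow the new scheme, this is roughly how I
read it, as a userspace toy (a pthread mutex standing in for
cfs_b->lock, a plain integer expiry standing in for the hrtimer; all
names below are mine, not the kernel's):

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* ~ cfs_b->lock */
static uint64_t expires;		/* ~ hrtimer expiry   */
static uint64_t period = 100;		/* ~ cfs_b->period    */
static bool queued;			/* ~ hrtimer_active() */

/* ~ hrtimer_forward_now(): push the expiry past 'now', return the
 * number of whole periods added (the overrun). */
static int forward_now(uint64_t now)
{
	int overrun = 0;

	while (expires <= now) {
		expires += period;
		overrun++;
	}
	return overrun;
}

/* ~ sched_cfs_period_timer(): forward under the lock, account the
 * overruns, re-arm itself (HRTIMER_RESTART). */
static void period_handler(uint64_t now)
{
	pthread_mutex_lock(&lock);
	for (;;) {
		int overrun = forward_now(now);

		if (!overrun)
			break;
		printf("refill runtime for %d period(s)\n", overrun);
	}
	queued = true;
	pthread_mutex_unlock(&lock);
}

/* ~ start_cfs_bandwidth()/start_bandwidth_timer(): never forward an
 * already-active timer, just make sure it is armed. */
static void start_bandwidth(uint64_t now)
{
	pthread_mutex_lock(&lock);
	if (!queued)
		forward_now(now);
	queued = true;			/* ~ hrtimer_start_expires() */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	start_bandwidth(0);	/* arms the toy timer, expires = 100   */
	period_handler(250);	/* two periods elapsed: refill for 2   */
	start_bandwidth(260);	/* already queued: expiry left alone   */
	printf("expires = %llu\n", (unsigned long long)expires);
	return 0;
}

The point being: whichever of the two takes the lock first does the
forwarding, and the other either sees the timer already queued (start
path) or sees no overrun left (handler), so nothing is lost and nobody
has to cancel anybody.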


> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
>  kernel/sched/core.c  |   15 ++++++------
>  kernel/sched/fair.c  |   59 ++++++++++++---------------------------------------
>  kernel/sched/rt.c    |   14 +++++-------
>  kernel/sched/sched.h |    4 +--
>  4 files changed, 31 insertions(+), 61 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -92,10 +92,13 @@
>  
>  void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
>  {
> -     if (hrtimer_active(period_timer))
> -             return;
> +     /*
> +      * Do not forward the expiration time of active timers;
> +      * we do not want to loose an overrun.
> +      */
> +     if (!hrtimer_active(period_timer))
> +             hrtimer_forward_now(period_timer, period);
>  
> -     hrtimer_forward_now(period_timer, period);
>       hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);
>  }
>  
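One more thing that might be worth a sentence in the changelog: the
reason the !hrtimer_active() check matters is that, as I read it, if
the callback is about to run (or already running) and we forward the
expiry from here first, the callback's own hrtimer_forward_now() then
finds nothing to do and the elapsed period(s) never get accounted.
Toy arithmetic (again my own model, not the kernel code):

#include <stdint.h>
#include <stdio.h>

/* Toy hrtimer_forward_now(): bump *expires past 'now' in whole periods
 * and return how many periods were added (the overrun). */
static int forward_now(uint64_t *expires, uint64_t now, uint64_t period)
{
	int overrun = 0;

	while (*expires <= now) {
		*expires += period;
		overrun++;
	}
	return overrun;
}

int main(void)
{
	uint64_t period = 100, now = 250, expires;

	/* Handler forwards first: it sees the elapsed periods. */
	expires = 100;
	printf("handler sees %d overrun(s)\n",
	       forward_now(&expires, now, period));	/* 2 */

	/*
	 * Start path forwards the already-expired timer first: the
	 * handler's subsequent forward finds nothing left to account.
	 */
	expires = 100;
	forward_now(&expires, now, period);		/* start path */
	printf("handler sees %d overrun(s)\n",
	       forward_now(&expires, now, period));	/* 0 */
	return 0;
}
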
> @@ -8098,10 +8101,8 @@ static int tg_set_cfs_bandwidth(struct t
>  
>       __refill_cfs_bandwidth_runtime(cfs_b);
>       /* restart the period timer (if active) to handle new period expiry */
> -     if (runtime_enabled && cfs_b->timer_active) {
> -             /* force a reprogram */
> -             __start_cfs_bandwidth(cfs_b, true);
> -     }
> +     if (runtime_enabled)
> +             start_cfs_bandwidth(cfs_b);
>       raw_spin_unlock_irq(&cfs_b->lock);
>  
>       for_each_online_cpu(i) {
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3476,16 +3476,7 @@ static int assign_cfs_rq_runtime(struct
>       if (cfs_b->quota == RUNTIME_INF)
>               amount = min_amount;
>       else {
> -             /*
> -              * If the bandwidth pool has become inactive, then at least one
> -              * period must have elapsed since the last consumption.
> -              * Refresh the global state and ensure bandwidth timer becomes
> -              * active.
> -              */
> -             if (!cfs_b->timer_active) {
> -                     __refill_cfs_bandwidth_runtime(cfs_b);
> -                     __start_cfs_bandwidth(cfs_b, false);
> -             }
> +             start_cfs_bandwidth(cfs_b);
>  
>               if (cfs_b->runtime > 0) {
>                       amount = min(cfs_b->runtime, min_amount);
> @@ -3634,6 +3625,7 @@ static void throttle_cfs_rq(struct cfs_r
>       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
>       struct sched_entity *se;
>       long task_delta, dequeue = 1;
> +     bool empty;
>  
>       se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
>  
> @@ -3663,13 +3655,21 @@ static void throttle_cfs_rq(struct cfs_r
>       cfs_rq->throttled = 1;
>       cfs_rq->throttled_clock = rq_clock(rq);
>       raw_spin_lock(&cfs_b->lock);
> +     empty = list_empty(&cfs_rq->throttled_list);
> +
>       /*
>        * Add to the _head_ of the list, so that an already-started
>        * distribute_cfs_runtime will not see us
>        */
>       list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> -     if (!cfs_b->timer_active)
> -             __start_cfs_bandwidth(cfs_b, false);
> +
> +     /*
> +      * If we're the first throttled task, make sure the bandwidth
> +      * timer is running.
> +      */
> +     if (empty)
> +             start_cfs_bandwidth(cfs_b);
> +
>       raw_spin_unlock(&cfs_b->lock);
>  }
>  
> @@ -3784,13 +3784,6 @@ static int do_sched_cfs_period_timer(str
>       if (cfs_b->idle && !throttled)
>               goto out_deactivate;
>  
> -     /*
> -      * if we have relooped after returning idle once, we need to update our
> -      * status as actually running, so that other cpus doing
> -      * __start_cfs_bandwidth will stop trying to cancel us.
> -      */
> -     cfs_b->timer_active = 1;
> -
>       __refill_cfs_bandwidth_runtime(cfs_b);
>  
>       if (!throttled) {
> @@ -3835,7 +3828,6 @@ static int do_sched_cfs_period_timer(str
>       return 0;
>  
>  out_deactivate:
> -     cfs_b->timer_active = 0;
>       return 1;
>  }
>  
> @@ -3999,6 +3991,7 @@ static enum hrtimer_restart sched_cfs_sl
>  {
>       struct cfs_bandwidth *cfs_b =
>               container_of(timer, struct cfs_bandwidth, slack_timer);
> +
>       do_sched_cfs_slack_timer(cfs_b);
>  
>       return HRTIMER_NORESTART;
> @@ -4008,15 +4001,12 @@ static enum hrtimer_restart sched_cfs_pe
>  {
>       struct cfs_bandwidth *cfs_b =
>               container_of(timer, struct cfs_bandwidth, period_timer);
> -     ktime_t now;
>       int overrun;
>       int idle = 0;
>  
>       raw_spin_lock(&cfs_b->lock);
>       for (;;) {
> -             now = hrtimer_cb_get_time(timer);
> -             overrun = hrtimer_forward(timer, now, cfs_b->period);
> -
> +             overrun = hrtimer_forward_now(timer, cfs_b->period);
>               if (!overrun)
>                       break;
>  
> @@ -4047,27 +4037,8 @@ static void init_cfs_rq_runtime(struct c
>       INIT_LIST_HEAD(&cfs_rq->throttled_list);
>  }
>  
> -/* requires cfs_b->lock, may release to reprogram timer */
> -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
> +void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
>  {
> -     /*
> -      * The timer may be active because we're trying to set a new bandwidth
> -      * period or because we're racing with the tear-down path
> -      * (timer_active==0 becomes visible before the hrtimer call-back
> -      * terminates).  In either case we ensure that it's re-programmed
> -      */
> -     while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
> -            hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
> -             /* bounce the lock to allow do_sched_cfs_period_timer to run */
> -             raw_spin_unlock(&cfs_b->lock);
> -             cpu_relax();
> -             raw_spin_lock(&cfs_b->lock);
> -             /* if someone else restarted the timer then we're done */
> -             if (!force && cfs_b->timer_active)
> -                     return;
> -     }
> -
> -     cfs_b->timer_active = 1;
>       start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
>  }
>  
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -18,19 +18,20 @@ static enum hrtimer_restart sched_rt_per
>  {
>       struct rt_bandwidth *rt_b =
>               container_of(timer, struct rt_bandwidth, rt_period_timer);
> -     ktime_t now;
> -     int overrun;
>       int idle = 0;
> +     int overrun;
>  
> +     raw_spin_lock(&rt_b->rt_runtime_lock);
>       for (;;) {
> -             now = hrtimer_cb_get_time(timer);
> -             overrun = hrtimer_forward(timer, now, rt_b->rt_period);
> -
> +             overrun = hrtimer_forward_now(timer, rt_b->rt_period);
>               if (!overrun)
>                       break;
>  
> +             raw_spin_unlock(&rt_b->rt_runtime_lock);
>               idle = do_sched_rt_period_timer(rt_b, overrun);
> +             raw_spin_lock(&rt_b->rt_runtime_lock);
>       }
> +     raw_spin_unlock(&rt_b->rt_runtime_lock);
>  
>       return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
>  }
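
On the rt side, taking rt_runtime_lock around the forward matches the
cfs_b->lock scheme above; and presumably the lock has to be dropped
around do_sched_rt_period_timer() because that path ends up taking
rq->lock and the runtime locks itself. Shape of the loop, same toy
style as above (names mine):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* ~ rt_b->rt_runtime_lock */
static uint64_t expires = 100, period = 100;

/* Toy hrtimer_forward_now() */
static int forward_now(uint64_t now)
{
	int overrun = 0;

	while (expires <= now) {
		expires += period;
		overrun++;
	}
	return overrun;
}

/* ~ do_sched_rt_period_timer(): takes other locks internally, so the
 * caller must not hold the runtime lock across it. */
static int do_period_work(int overrun)
{
	printf("distribute runtime for %d period(s)\n", overrun);
	return 0;			/* 0: not idle, keep the timer running */
}

/* Shape of the reworked sched_rt_period_timer() */
static int rt_period_handler(uint64_t now)
{
	int idle = 0;
	int overrun;

	pthread_mutex_lock(&lock);
	for (;;) {
		overrun = forward_now(now);
		if (!overrun)
			break;

		pthread_mutex_unlock(&lock);	/* drop around the work */
		idle = do_period_work(overrun);
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);

	return idle;
}

int main(void)
{
	rt_period_handler(250);		/* accounts the 2 elapsed periods */
	return 0;
}
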
> @@ -52,9 +53,6 @@ static void start_rt_bandwidth(struct rt
>       if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
>               return;
>  
> -     if (hrtimer_active(&rt_b->rt_period_timer))
> -             return;
> -
>       raw_spin_lock(&rt_b->rt_runtime_lock);
>       start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
>       raw_spin_unlock(&rt_b->rt_runtime_lock);
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -215,7 +215,7 @@ struct cfs_bandwidth {
>       s64 hierarchical_quota;
>       u64 runtime_expires;
>  
> -     int idle, timer_active;
> +     int idle;
>       struct hrtimer period_timer, slack_timer;
>       struct list_head throttled_cfs_rq;
>  
> @@ -306,7 +306,7 @@ extern void init_cfs_bandwidth(struct cf
>  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
>  
>  extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
> -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
> +extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
>  extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
>  
>  extern void free_rt_sched_group(struct task_group *tg);