On 2016/04/02 at 05:51, Peter Zijlstra wrote:
> On Fri, Apr 01, 2016 at 09:34:24PM +0800, Xunlei Pang wrote:
>
>>>> I checked the code, currently only deadline accesses the
>>>> pi_waiters/pi_waiters_leftmost
>>>> without pi_lock held via rt_mutex_get_top_task(), other cases all have
>>>> pi_lock held.
>> Any better ideas is welcome.
> Something like the below _might_ work; but its late and I haven't looked
> at the PI code in a while. This basically caches a pointer to the top
> waiter task in the running task_struct, under pi_lock and rq->lock, and
> therefore we can use it with only rq->lock held.
>
> Since the task is blocked, and cannot unblock without taking itself from
> the block chain -- which would cause rt_mutex_setprio() to set another
> top waiter task, the lifetime rules should be good.

In rt_mutex_slowunlock(), we release pi_lock and wait_lock first, then
wake up the top waiter, then call rt_mutex_adjust_prio(), so there is a small
window — with no lock held and irqs enabled — between the top-waiter wakeup
and rt_mutex_adjust_prio(), which can cause problems.

For example, before rt_mutex_adjust_prio() runs to update the cached pointer,
current may be preempted and the woken top waiter may exit. When current runs
again, it can enter enqueue_task_dl() before reaching rt_mutex_adjust_prio()
(where the cached pointer is updated), and thus access a stale cached pointer.

Regards,
Xunlei

> Having this top waiter pointer around might also be useful to further
> implement bandwidth inheritance or such, but I've not thought about that
> too much.
>
> Lots of code deleted because we move the entire task->prio mapping muck
> into the scheduler, because we now pass a pi_task, not a prio.
>
> ---
>  include/linux/sched.h    |  1 +
>  include/linux/sched/rt.h | 20 +-------------------
>  kernel/locking/rtmutex.c | 45 +++++----------------------------------------
>  kernel/sched/core.c      | 34 +++++++++++++++++++++++-----------
>  kernel/sched/deadline.c  |  2 +-
>  5 files changed, 31 insertions(+), 71 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 52c4847b05e2..30169f38cb24 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1615,6 +1615,7 @@ struct task_struct {
>  
>       /* Protection of the PI data structures: */
>       raw_spinlock_t pi_lock;
> +     struct task_struct *pi_task;
>  
>       struct wake_q_node wake_q;
>  
> diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
> index a30b172df6e1..0a8f071b16e3 100644
> --- a/include/linux/sched/rt.h
> +++ b/include/linux/sched/rt.h
> @@ -16,31 +16,13 @@ static inline int rt_task(struct task_struct *p)
>  }
>  
>  #ifdef CONFIG_RT_MUTEXES
> -extern int rt_mutex_getprio(struct task_struct *p);
> -extern void rt_mutex_setprio(struct task_struct *p, int prio);
> -extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
> -extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
> +extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
>  extern void rt_mutex_adjust_pi(struct task_struct *p);
>  static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
>  {
>       return tsk->pi_blocked_on != NULL;
>  }
>  #else
> -static inline int rt_mutex_getprio(struct task_struct *p)
> -{
> -     return p->normal_prio;
> -}
> -
> -static inline int rt_mutex_get_effective_prio(struct task_struct *task,
> -                                           int newprio)
> -{
> -     return newprio;
> -}
> -
> -static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
> -{
> -     return NULL;
> -}
>  # define rt_mutex_adjust_pi(p)               do { } while (0)
>  static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
>  {
> diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
> index 3e746607abe5..13b6b5922d3c 100644
> --- a/kernel/locking/rtmutex.c
> +++ b/kernel/locking/rtmutex.c
> @@ -257,53 +257,18 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
>  }
>  
>  /*
> - * Calculate task priority from the waiter tree priority
> - *
> - * Return task->normal_prio when the waiter tree is empty or when
> - * the waiter is not allowed to do priority boosting
> - */
> -int rt_mutex_getprio(struct task_struct *task)
> -{
> -     if (likely(!task_has_pi_waiters(task)))
> -             return task->normal_prio;
> -
> -     return min(task_top_pi_waiter(task)->prio,
> -                task->normal_prio);
> -}
> -
> -struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
> -{
> -     if (likely(!task_has_pi_waiters(task)))
> -             return NULL;
> -
> -     return task_top_pi_waiter(task)->task;
> -}
> -
> -/*
> - * Called by sched_setscheduler() to get the priority which will be
> - * effective after the change.
> - */
> -int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
> -{
> -     if (!task_has_pi_waiters(task))
> -             return newprio;
> -
> -     if (task_top_pi_waiter(task)->task->prio <= newprio)
> -             return task_top_pi_waiter(task)->task->prio;
> -     return newprio;
> -}
> -
> -/*
>   * Adjust the priority of a task, after its pi_waiters got modified.
>   *
>   * This can be both boosting and unboosting. task->pi_lock must be held.
>   */
>  static void __rt_mutex_adjust_prio(struct task_struct *task)
>  {
> -     int prio = rt_mutex_getprio(task);
> +     struct task_struct *pi_task = task;
> +
> +     if (unlikely(task_has_pi_waiters(task)))
> +             pi_task = task_top_pi_waiter(task)->task;
>  
> -     if (task->prio != prio || dl_prio(prio))
> -             rt_mutex_setprio(task, prio);
> +     rt_mutex_setprio(task, pi_task);
>  }
>  
>  /*
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 8b489fcac37b..7d7e3a0eaeb0 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3392,7 +3392,7 @@ EXPORT_SYMBOL(default_wake_function);
>  /*
>   * rt_mutex_setprio - set the current priority of a task
>   * @p: task
> - * @prio: prio value (kernel-internal form)
> + * @pi_task: top waiter, donating state
>   *
>   * This function changes the 'effective' priority of a task. It does
>   * not touch ->normal_prio like __setscheduler().
> @@ -3400,13 +3400,20 @@ EXPORT_SYMBOL(default_wake_function);
>   * Used by the rt_mutex code to implement priority inheritance
>   * logic. Call site only calls if the priority of the task changed.
>   */
> -void rt_mutex_setprio(struct task_struct *p, int prio)
> +void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
>  {
> -     int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
> -     struct rq *rq;
> +     int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | 
> DEQUEUE_MOVE;
>       const struct sched_class *prev_class;
> +     struct rq *rq;
>  
> -     BUG_ON(prio > MAX_PRIO);
> +     /*
> +      * For FIFO/RR we simply donate prio; for DL things are
> +      * more interesting.
> +      */
> +     /* XXX used to be waiter->prio, not waiter->task->prio */
> +     prio = min(pi_task->prio, p->normal_prio);
> +     if (p->prio == prio && !dl_prio(prio))
> +             return;
>  
>       rq = __task_rq_lock(p);
>  
> @@ -3442,6 +3449,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
>       if (running)
>               put_prev_task(rq, p);
>  
> +     if (pi_task == p)
> +             pi_task = NULL;
> +     p->pi_task = pi_task;
> +
>       /*
>        * Boosting condition are:
>        * 1. -rt task is running and holds mutex A
> @@ -3452,7 +3463,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
>        *          running task
>        */
>       if (dl_prio(prio)) {
> -             struct task_struct *pi_task = rt_mutex_get_top_task(p);
>               if (!dl_prio(p->normal_prio) ||
>                   (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
>                       p->dl.dl_boosted = 1;
> @@ -3727,10 +3737,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
>        * Keep a potential priority boosting if called from
>        * sched_setscheduler().
>        */
> -     if (keep_boost)
> -             p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
> -     else
> -             p->prio = normal_prio(p);
> +     p->prio = normal_prio(p);
> +     if (keep_boost && p->pi_task)
> +             p->prio = min(p->prio, p->pi_task->prio);
>  
>       if (dl_prio(p->prio))
>               p->sched_class = &dl_sched_class;
> @@ -4017,7 +4026,10 @@ static int __sched_setscheduler(struct task_struct *p,
>                * the runqueue. This will be done when the task deboost
>                * itself.
>                */
> -             new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
> +             new_effective_prio = newprio;
> +             if (p->pi_task)
> +                     new_effective_prio = min(new_effective_prio, p->pi_task->prio);
> +
>               if (new_effective_prio == oldprio)
>                       queue_flags &= ~DEQUEUE_MOVE;
>       }
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index affd97ec9f65..6c5aa4612eb6 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -932,7 +932,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
>  
>  static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
>  {
> -     struct task_struct *pi_task = rt_mutex_get_top_task(p);
> +     struct task_struct *pi_task = p->pi_task;
>       struct sched_dl_entity *pi_se = &p->dl;
>  
>       /*

Reply via email to