On Fri, 2007-10-12 at 12:05 -0400, Steven Rostedt wrote:

> Index: linux-2.6.23-rt1/kernel/sched.c
> ===================================================================
> --- linux-2.6.23-rt1.orig/kernel/sched.c
> +++ linux-2.6.23-rt1/kernel/sched.c
> @@ -304,6 +304,7 @@ struct rq {
>  #ifdef CONFIG_PREEMPT_RT
>       unsigned long rt_nr_running;
>       unsigned long rt_nr_uninterruptible;
> +     int curr_prio;

still not convinced we want this PREEMPT_RT only.

>  #endif
>  
>       unsigned long switch_timestamp;
> @@ -1484,6 +1485,126 @@ next_in_queue:
>  
>  static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
>  
> +/* Only try this algorithm three times */
> +#define RT_PUSH_MAX_TRIES 3
> +
> +/* Will lock the rq it finds */
> +static int find_lowest_cpu(cpumask_t *cpu_mask, struct task_struct *task,
> +                              struct rq *this_rq)
> +{

Eeew, asymmetric locking. At the very least name it:
find_lock_lowest_rq() or something like that.

Might as well return struct rq* while we're at it [1].
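
i.e. something along these lines (untested sketch of the suggested
shape only):

        /* Find the lowest prio rq and return it locked, or NULL */
        static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
                                              struct task_struct *task,
                                              struct rq *this_rq);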

> +     struct rq *lowest_rq = NULL;
> +     int dst_cpu = -1;
> +     int cpu;
> +     int tries;
> +
> +     for (tries = 0; tries < RT_PUSH_MAX_TRIES; tries++) {
> +             /*
> +              * Scan each rq for the lowest prio.
> +              */
> +             for_each_cpu_mask(cpu, *cpu_mask) {
> +                     struct rq *rq = &per_cpu(runqueues, cpu);
> +
> +                     if (cpu == smp_processor_id())
> +                             continue;
> +
> +                     /* We look for lowest RT prio or non-rt CPU */
> +                     if (rq->curr_prio >= MAX_RT_PRIO) {
> +                             lowest_rq = rq;
> +                             dst_cpu = cpu;
> +                             break;
> +                     }
> +
> +                     /* no locking for now */
> +                     if (rq->curr_prio > task->prio &&
> +                         (!lowest_rq || rq->curr_prio < lowest_rq->curr_prio)) {
> +                             dst_cpu = cpu;
> +                             lowest_rq = rq;
> +                     }
> +             }
> +
> +             if (!lowest_rq) {
> +                     dst_cpu = -1;
> +                     break;
> +             }
> +
> +             /* if the prio of this runqueue changed, try again */
> +             if (double_lock_balance(this_rq, lowest_rq)) {
> +                     /*
> +                      * We had to unlock the run queue. In
> +                      * the mean time, task could have
> +                      * migrated already or had its affinity changed.
> +                      */
> +                     if (unlikely(task_rq(task) != this_rq ||
> +                                  !cpu_isset(dst_cpu, task->cpus_allowed))) {
> +                             spin_unlock(&lowest_rq->lock);
> +                             dst_cpu = -1;
> +                             lowest_rq = NULL;
> +                             break;
> +                     }
> +
> +             }
> +
> +             /* If this rq is still suitable use it. */
> +             if (lowest_rq->curr_prio > task->prio)
> +                     break;
> +
> +             /* try again */
> +             spin_unlock(&lowest_rq->lock);
> +             lowest_rq = NULL;
> +             dst_cpu = -1;
> +     }
> +
> +     return dst_cpu;
> +}
> +
> +/*
> + * If the current CPU has more than one RT task, see if the non
> + * running task can migrate over to a CPU that is running a task
> + * of lesser priority.
> + */
> +static int push_rt_task(struct rq *this_rq)
> +{
> +     struct task_struct *next_task;
> +     struct rq *lowest_rq;
> +     int dst_cpu;
> +     int ret = 0;
> +     cpumask_t cpu_mask;
> +
> +     assert_spin_locked(&this_rq->lock);
> +
> +     next_task = rt_next_highest_task(this_rq);
> +     if (!next_task)
> +             return 0;
> +
> +     cpus_and(cpu_mask, cpu_online_map, next_task->cpus_allowed);
> +
> +     /* We might release this_rq lock */
> +     get_task_struct(next_task);
> +
> +     /* find_lowest_rq locks cpu_rq(dst_cpu) if found */
> +     dst_cpu = find_lowest_cpu(&cpu_mask, next_task, this_rq); 
> +     if (dst_cpu < 0)
> +             goto out;
> +
> +     lowest_rq = cpu_rq(dst_cpu);
> +

[1] Because that is what we use anyway.
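
With [1] the call site above would collapse to something like (sketch):

        lowest_rq = find_lock_lowest_rq(&cpu_mask, next_task, this_rq);
        if (!lowest_rq)
                goto out;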

> +     assert_spin_locked(&lowest_rq->lock);
> +
> +     deactivate_task(this_rq, next_task, 0);
> +     set_task_cpu(next_task, dst_cpu);
> +     activate_task(lowest_rq, next_task, 0);
> +
> +     resched_task(lowest_rq->curr);
> +
> +     spin_unlock(&lowest_rq->lock);
> +
> +     ret = 1;
> +out:
> +     put_task_struct(next_task);
> +
> +     return ret;
> +}
> +
>  /*
>   * Pull RT tasks from other CPUs in the RT-overload
>   * case. Interrupts are disabled, local rq is locked.
> @@ -2202,19 +2323,28 @@ static inline void finish_task_switch(st
>        * be dropped twice.
>        *              Manfred Spraul <[EMAIL PROTECTED]>
>        */
> +     prev_state = prev->state;
> +     _finish_arch_switch(prev);
> +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
> +     rq->curr_prio = current->prio;
> +#endif
> +     finish_lock_switch(rq, prev);
>  #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
>       /*
>        * If we pushed an RT task off the runqueue,
> -      * then kick other CPUs, they might run it:
> -      */
> -     if (unlikely(rt_task(current) && rq->rt_nr_running > 1)) {
> -             schedstat_inc(rq, rto_schedule);
> -             smp_send_reschedule_allbutself_cpumask(current->cpus_allowed);
> +      * then kick other CPUs, they might run it.
> +      * Note we may release the rq lock, and since
> +      * the lock was owned by prev, we need to release it
> +      * first via finish_lock_switch and then reacquire it.
> +      */
> +     if (unlikely(rt_task(current))) {
> +             spin_lock(&rq->lock);
> +             /* push_rt_task will return true if it moved an RT */
> +             while (push_rt_task(rq))
> +                     ;
> +             spin_unlock(&rq->lock);
>       }
>  #endif
> -     prev_state = prev->state;
> -     _finish_arch_switch(prev);
> -     finish_lock_switch(rq, prev);

finish_lock_switch() seems to do spin_unlock_irq() in the tree I'm
looking at, which means you just 'forgot' to re-enable IRQs.

>       fire_sched_in_preempt_notifiers(current);
>       trace_stop_sched_switched(current);
>       /*
> Index: linux-2.6.23-rt1/kernel/sched_rt.c
> ===================================================================
> --- linux-2.6.23-rt1.orig/kernel/sched_rt.c
> +++ linux-2.6.23-rt1/kernel/sched_rt.c
> @@ -96,6 +96,50 @@ static struct task_struct *pick_next_tas
>       return next;
>  }
>  
> +#ifdef CONFIG_PREEMPT_RT
> +static struct task_struct *rt_next_highest_task(struct rq *rq)
> +{
> +     struct rt_prio_array *array = &rq->rt.active;
> +     struct task_struct *next;
> +     struct list_head *queue;
> +     int idx;
> +
> +     if (likely (rq->rt_nr_running < 2))
> +             return NULL;
> +
> +     idx = sched_find_first_bit(array->bitmap);
> +     if (idx >= MAX_RT_PRIO) {
> +             WARN_ON(1); /* rt_nr_running is bad */
> +             return NULL;
> +     }
> +
> +     queue = array->queue + idx;
> +     next = list_entry(queue->next, struct task_struct, run_list);
> +     if (unlikely(next != current))
> +             return next;
> +
> +     if (queue->next->next != queue) {
> +             /* same prio task */
> +             next = list_entry(queue->next->next, struct task_struct, run_list);
> +             goto out;
> +     }
> +
> +     /* slower, but more flexible */
> +     idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
> +     if (idx >= MAX_RT_PRIO) {
> +             WARN_ON(1); /* rt_nr_running was 2 and above! */
> +             return NULL;
> +     }
> +
> +     queue = array->queue + idx;
> +     next = list_entry(queue->next, struct task_struct, run_list);
> +
> + out:
> +     return next;
> +
> +}
> +#endif /* CONFIG_PREEMPT_RT */
> +
>  static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
>  {
>       update_curr_rt(rq);
> 
> 
