[GIT PULL] scheduler fixes

Ingo Molnar Fri, 20 Feb 2015 05:43:44 -0800

Linus,

Please pull the latest sched-urgent-for-linus git tree from:


   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: 2636ed5f8d15ff9395731593537b4b3fdf2af24d sched/rt: Avoid obvious 
configuration fail

This tree contains misc fixes: preempt_schedule_common() 
and io_schedule() recursion fixes, sched/dl fixes, a 
completion_done() revert, two sched/rt fixes and a comment 
update patch.

 Thanks,

        Ingo

------------------>
Frederic Weisbecker (1):
      sched: Fix preempt_schedule_common() triggering tracing recursion

Kirill Tkhai (2):
      sched/dl: Prevent enqueue of a sleeping task in dl_task_timer()
      sched/dl: Do update_rq_clock() in yield_task_dl()

NeilBrown (1):
      sched: Prevent recursion in io_schedule()

Oleg Nesterov (1):
      sched/completion: Serialize completion_done() with complete()

Peter Zijlstra (4):
      sched: Clarify ordering between task_rq_lock() and move_queued_task()
      sched: Make dl_task_time() use task_rq_lock()
      sched/autogroup: Fix failure to set cpu.rt_runtime_us
      sched/rt: Avoid obvious configuration fail


 include/linux/sched.h     |  10 ++--
 kernel/sched/auto_group.c |   6 +--
 kernel/sched/completion.c |  19 +++++++-
 kernel/sched/core.c       | 113 ++++++++++++----------------------------------
 kernel/sched/deadline.c   |  33 ++++++++++----
 kernel/sched/sched.h      |  76 +++++++++++++++++++++++++++++++
 6 files changed, 155 insertions(+), 102 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8db31ef98d2f..cb5cdc777c8a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -363,9 +363,6 @@ extern void show_regs(struct pt_regs *);
  */
 extern void show_stack(struct task_struct *task, unsigned long *sp);
 
-void io_schedule(void);
-long io_schedule_timeout(long timeout);
-
 extern void cpu_init (void);
 extern void trap_init(void);
 extern void update_process_times(int user);
@@ -422,6 +419,13 @@ extern signed long schedule_timeout_uninterruptible(signed 
long timeout);
 asmlinkage void schedule(void);
 extern void schedule_preempt_disabled(void);
 
+extern long io_schedule_timeout(long timeout);
+
+static inline void io_schedule(void)
+{
+       io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
+}
+
 struct nsproxy;
 struct user_namespace;
 
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 8a2e230fb86a..eae160dd669d 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void)
         * so we don't have to move tasks around upon policy change,
         * or flail around trying to allocate bandwidth on the fly.
         * A bandwidth exception in __sched_setscheduler() allows
-        * the policy change to proceed.  Thereafter, task_group()
-        * returns &root_task_group, so zero bandwidth is required.
+        * the policy change to proceed.
         */
        free_rt_sched_group(tg);
        tg->rt_se = root_task_group.rt_se;
@@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct 
task_group *tg)
        if (tg != &root_task_group)
                return false;
 
-       if (p->sched_class != &fair_sched_class)
-               return false;
-
        /*
         * We can only assume the task group can't go away on us if
         * autogroup_move_group() can see us on ->thread_group list.
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 7052d3fd4e7b..8d0f35debf35 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -274,7 +274,7 @@ bool try_wait_for_completion(struct completion *x)
         * first without taking the lock so we can
         * return early in the blocking case.
         */
-       if (!ACCESS_ONCE(x->done))
+       if (!READ_ONCE(x->done))
                return 0;
 
        spin_lock_irqsave(&x->wait.lock, flags);
@@ -297,6 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion);
  */
 bool completion_done(struct completion *x)
 {
-       return !!ACCESS_ONCE(x->done);
+       if (!READ_ONCE(x->done))
+               return false;
+
+       /*
+        * If ->done, we need to wait for complete() to release ->wait.lock
+        * otherwise we can end up freeing the completion before complete()
+        * is done referencing it.
+        *
+        * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
+        * the loads of ->done and ->wait.lock such that we cannot observe
+        * the lock before complete() acquires it while observing the ->done
+        * after it's acquired the lock.
+        */
+       smp_rmb();
+       spin_unlock_wait(&x->wait.lock);
+       return true;
 }
 EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1f37fe7f77a4..a4869bd426ca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -307,66 +307,6 @@ __read_mostly int scheduler_running;
 int sysctl_sched_rt_runtime = 950000;
 
 /*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
-       __acquires(rq->lock)
-{
-       struct rq *rq;
-
-       lockdep_assert_held(&p->pi_lock);
-
-       for (;;) {
-               rq = task_rq(p);
-               raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
-                       return rq;
-               raw_spin_unlock(&rq->lock);
-
-               while (unlikely(task_on_rq_migrating(p)))
-                       cpu_relax();
-       }
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
-       __acquires(p->pi_lock)
-       __acquires(rq->lock)
-{
-       struct rq *rq;
-
-       for (;;) {
-               raw_spin_lock_irqsave(&p->pi_lock, *flags);
-               rq = task_rq(p);
-               raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
-                       return rq;
-               raw_spin_unlock(&rq->lock);
-               raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
-               while (unlikely(task_on_rq_migrating(p)))
-                       cpu_relax();
-       }
-}
-
-static void __task_rq_unlock(struct rq *rq)
-       __releases(rq->lock)
-{
-       raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
-       __releases(rq->lock)
-       __releases(p->pi_lock)
-{
-       raw_spin_unlock(&rq->lock);
-       raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-}
-
-/*
  * this_rq_lock - lock this runqueue and disable interrupts.
  */
 static struct rq *this_rq_lock(void)
@@ -2899,7 +2839,7 @@ void __sched schedule_preempt_disabled(void)
        preempt_disable();
 }
 
-static void preempt_schedule_common(void)
+static void __sched notrace preempt_schedule_common(void)
 {
        do {
                __preempt_count_add(PREEMPT_ACTIVE);
@@ -4418,36 +4358,29 @@ EXPORT_SYMBOL_GPL(yield_to);
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
  */
-void __sched io_schedule(void)
-{
-       struct rq *rq = raw_rq();
-
-       delayacct_blkio_start();
-       atomic_inc(&rq->nr_iowait);
-       blk_flush_plug(current);
-       current->in_iowait = 1;
-       schedule();
-       current->in_iowait = 0;
-       atomic_dec(&rq->nr_iowait);
-       delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
 long __sched io_schedule_timeout(long timeout)
 {
-       struct rq *rq = raw_rq();
+       int old_iowait = current->in_iowait;
+       struct rq *rq;
        long ret;
 
+       current->in_iowait = 1;
+       if (old_iowait)
+               blk_schedule_flush_plug(current);
+       else
+               blk_flush_plug(current);
+
        delayacct_blkio_start();
+       rq = raw_rq();
        atomic_inc(&rq->nr_iowait);
-       blk_flush_plug(current);
-       current->in_iowait = 1;
        ret = schedule_timeout(timeout);
-       current->in_iowait = 0;
+       current->in_iowait = old_iowait;
        atomic_dec(&rq->nr_iowait);
        delayacct_blkio_end();
+
        return ret;
 }
+EXPORT_SYMBOL(io_schedule_timeout);
 
 /**
  * sys_sched_get_priority_max - return maximum RT priority.
@@ -7644,6 +7577,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
 {
        struct task_struct *g, *p;
 
+       /*
+        * Autogroups do not have RT tasks; see autogroup_create().
+        */
+       if (task_group_is_autogroup(tg))
+               return 0;
+
        for_each_process_thread(g, p) {
                if (rt_task(p) && task_group(p) == tg)
                        return 1;
@@ -7736,6 +7675,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 {
        int i, err = 0;
 
+       /*
+        * Disallowing the root group RT runtime is BAD, it would disallow the
+        * kernel creating (and or operating) RT threads.
+        */
+       if (tg == &root_task_group && rt_runtime == 0)
+               return -EINVAL;
+
+       /* No period doesn't make any sense. */
+       if (rt_period == 0)
+               return -EINVAL;
+
        mutex_lock(&rt_constraints_mutex);
        read_lock(&tasklist_lock);
        err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -7792,9 +7742,6 @@ static int sched_group_set_rt_period(struct task_group 
*tg, long rt_period_us)
        rt_period = (u64)rt_period_us * NSEC_PER_USEC;
        rt_runtime = tg->rt_bandwidth.rt_runtime;
 
-       if (rt_period == 0)
-               return -EINVAL;
-
        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a027799ae130..3fa8fa6d9403 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -511,16 +511,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer 
*timer)
                                                     struct sched_dl_entity,
                                                     dl_timer);
        struct task_struct *p = dl_task_of(dl_se);
+       unsigned long flags;
        struct rq *rq;
-again:
-       rq = task_rq(p);
-       raw_spin_lock(&rq->lock);
 
-       if (rq != task_rq(p)) {
-               /* Task was moved, retrying. */
-               raw_spin_unlock(&rq->lock);
-               goto again;
-       }
+       rq = task_rq_lock(current, &flags);
 
        /*
         * We need to take care of several possible races here:
@@ -541,6 +535,26 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer 
*timer)
 
        sched_clock_tick();
        update_rq_clock(rq);
+
+       /*
+        * If the throttle happened during sched-out; like:
+        *
+        *   schedule()
+        *     deactivate_task()
+        *       dequeue_task_dl()
+        *         update_curr_dl()
+        *           start_dl_timer()
+        *         __dequeue_task_dl()
+        *     prev->on_rq = 0;
+        *
+        * We can be both throttled and !queued. Replenish the counter
+        * but do not enqueue -- wait for our wakeup to do that.
+        */
+       if (!task_on_rq_queued(p)) {
+               replenish_dl_entity(dl_se, dl_se);
+               goto unlock;
+       }
+
        enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
        if (dl_task(rq->curr))
                check_preempt_curr_dl(rq, p, 0);
@@ -555,7 +569,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer 
*timer)
                push_dl_task(rq);
 #endif
 unlock:
-       raw_spin_unlock(&rq->lock);
+       task_rq_unlock(rq, current, &flags);
 
        return HRTIMER_NORESTART;
 }
@@ -898,6 +912,7 @@ static void yield_task_dl(struct rq *rq)
                rq->curr->dl.dl_yielded = 1;
                p->dl.runtime = 0;
        }
+       update_rq_clock(rq);
        update_curr_dl(rq);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0870db23d79c..dc0f435a2779 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1380,6 +1380,82 @@ static inline void sched_avg_update(struct rq *rq) { }
 
 extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t 
period);
 
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+       __acquires(rq->lock)
+{
+       struct rq *rq;
+
+       lockdep_assert_held(&p->pi_lock);
+
+       for (;;) {
+               rq = task_rq(p);
+               raw_spin_lock(&rq->lock);
+               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+                       return rq;
+               raw_spin_unlock(&rq->lock);
+
+               while (unlikely(task_on_rq_migrating(p)))
+                       cpu_relax();
+       }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long 
*flags)
+       __acquires(p->pi_lock)
+       __acquires(rq->lock)
+{
+       struct rq *rq;
+
+       for (;;) {
+               raw_spin_lock_irqsave(&p->pi_lock, *flags);
+               rq = task_rq(p);
+               raw_spin_lock(&rq->lock);
+               /*
+                *      move_queued_task()              task_rq_lock()
+                *
+                *      ACQUIRE (rq->lock)
+                *      [S] ->on_rq = MIGRATING         [L] rq = task_rq()
+                *      WMB (__set_task_cpu())          ACQUIRE (rq->lock);
+                *      [S] ->cpu = new_cpu             [L] task_rq()
+                *                                      [L] ->on_rq
+                *      RELEASE (rq->lock)
+                *
+                * If we observe the old cpu in task_rq_lock, the acquire of
+                * the old rq->lock will fully serialize against the stores.
+                *
+                * If we observe the new cpu in task_rq_lock, the acquire will
+                * pair with the WMB to ensure we must then also see migrating.
+                */
+               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+                       return rq;
+               raw_spin_unlock(&rq->lock);
+               raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+               while (unlikely(task_on_rq_migrating(p)))
+                       cpu_relax();
+       }
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+       __releases(rq->lock)
+{
+       raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+       __releases(rq->lock)
+       __releases(p->pi_lock)
+{
+       raw_spin_unlock(&rq->lock);
+       raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[GIT PULL] scheduler fixes

Reply via email to