Linus,

Please pull the latest sched-core-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-core-for-linus

   # HEAD: 139b6fd26d85a65c4e0d2795b87b94f9505e5943 sched/Documentation: Remove unneeded word

The main scheduler changes in this cycle were:

  - various sched/deadline fixes and enhancements

  - rescheduling latency fixes/cleanups

  - rework the rq->clock code to be more consistent and more robust

  - minor micro-optimizations

  - ->avg.decay_count fixes

  - add a stack overflow check to might_sleep()

  - idle-poll handler fix, possibly resulting in power savings

  - misc smaller updates and fixes

 Thanks,

        Ingo

------------------>
Eric Sandeen (1):
      sched/debug: Check for stack overflow in ___might_sleep()

Frederic Weisbecker (2):
      sched: Fix missing preemption opportunity
      sched: Pull resched loop to __schedule() callers

Johan Hedberg (1):
      sched/wait: Introduce wait_on_bit_timeout()

Kirill Tkhai (1):
      sched/fair: Fix sched_entity::avg::decay_count initialization

Peter Zijlstra (4):
      sched/core: Validate rq_clock*() serialization
      sched/core: Rework rq->clock update skips
      sched/debug: Print rq->clock_task
      sched/deadline: Fix stale yield state

Preeti U Murthy (1):
      sched/idle: Add missing checks to the exit condition of cpu_idle_poll()

Sharon Dvir (1):
      sched/Documentation: Remove unneeded word

Tetsuo Handa (1):
      sched/debug: Fix potential call to __ffs(0) in sched_show_task()

Tim Chen (1):
      sched/rt: Reduce rq lock contention by eliminating locking of non-feasible target

Wanpeng Li (3):
      sched/deadline: Fix hrtick for a non-leftmost task
      sched/deadline: Avoid pointless __setscheduler()
      sched: Fix hrtick_start() on UP

Xunlei Pang (3):
      sched/fair: Fix the dealing with decay_count in __synchronize_entity_decay()
      sched/deadline: Modify cpudl::free_cpus to reflect rd->online
      sched/deadline: Remove cpu_active_mask from cpudl_find()

Yao Dongdong (1):
      sched/core: Remove check of p->sched_class


 include/linux/wait.h       |  26 +++++++++++
 kernel/locking/mutex.c     |   2 +-
 kernel/sched/core.c        | 107 +++++++++++++++++++++++++++++----------------
 kernel/sched/cpudeadline.c |  27 ++++++++++--
 kernel/sched/cpudeadline.h |   2 +
 kernel/sched/deadline.c    |  51 +++++++++++----------
 kernel/sched/debug.c       |   1 +
 kernel/sched/fair.c        |   7 ++-
 kernel/sched/idle.c        |   3 +-
 kernel/sched/rt.c          |  26 +++++++++--
 kernel/sched/sched.h       |  22 +++++++++-
 11 files changed, 197 insertions(+), 77 deletions(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 37423e0e1379..537d58eea8a0 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -990,6 +990,32 @@ wait_on_bit_io(void *word, int bit, unsigned mode)
 }
 
 /**
+ * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ * @timeout: timeout, in jiffies
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared. This is similar to wait_on_bit(), except also takes a
+ * timeout parameter.
+ *
+ * Returned value will be zero if the bit was cleared before the
+ * @timeout elapsed, or non-zero if the @timeout elapsed or process
+ * received a signal and the mode permitted wakeup on that signal.
+ */
+static inline int
+wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout)
+{
+       might_sleep();
+       if (!test_bit(bit, word))
+               return 0;
+       return out_of_line_wait_on_bit_timeout(word, bit,
+                                              bit_wait_timeout,
+                                              mode, timeout);
+}
+
+/**
  * wait_on_bit_action - wait for a bit to be cleared
  * @word: the word being waited on, a kernel virtual address
  * @bit: the bit of the word being waited on
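
For reference, a minimal caller of the wait_on_bit_timeout() helper added
above could look like the sketch below; the my_wait_until_idle() wrapper,
the MY_FLAG_BUSY bit and the two-second timeout are invented for this
illustration and are not part of the patch:

	#include <linux/wait.h>
	#include <linux/jiffies.h>
	#include <linux/sched.h>

	/* Illustration only: the flag word and bit are hypothetical. */
	#define MY_FLAG_BUSY	0

	static int my_wait_until_idle(unsigned long *flags)
	{
		int err;

		/*
		 * Sleep until bit MY_FLAG_BUSY in *flags is cleared, a
		 * signal arrives, or roughly two seconds elapse.
		 */
		err = wait_on_bit_timeout(flags, MY_FLAG_BUSY,
					  TASK_INTERRUPTIBLE,
					  msecs_to_jiffies(2000));

		return err ? -EBUSY : 0;	/* non-zero: timeout or signal */
	}
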
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 454195194d4a..5b49f652a3cf 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -81,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
  * The mutex must later on be released by the same task that
  * acquired it. Recursive locking is not allowed. The task
  * may not exit without first unlocking the mutex. Also, kernel
- * memory where the mutex resides mutex must not be freed with
+ * memory where the mutex resides must not be freed with
  * the mutex still locked. The mutex must first be initialized
  * (or statically defined) before it can be locked. memset()-ing
  * the mutex to 0 is not allowed.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9e838095beb8..b269e8a2a516 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq)
 {
        s64 delta;
 
-       if (rq->skip_clock_update > 0)
+       lockdep_assert_held(&rq->lock);
+
+       if (rq->clock_skip_update & RQCF_ACT_SKIP)
                return;
 
        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -490,6 +492,11 @@ static __init void init_hrtick(void)
  */
 void hrtick_start(struct rq *rq, u64 delay)
 {
+       /*
+        * Don't schedule slices shorter than 10000ns, that just
+        * doesn't make sense. Rely on vruntime for fairness.
+        */
+       delay = max_t(u64, delay, 10000LL);
        __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
                        HRTIMER_MODE_REL_PINNED, 0);
 }
@@ -1046,7 +1053,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
         * this case, we can save a useless back to back clock update.
         */
        if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
-               rq->skip_clock_update = 1;
+               rq_clock_skip_update(rq, true);
 }
 
 #ifdef CONFIG_SMP
@@ -1836,6 +1843,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
        p->se.prev_sum_exec_runtime     = 0;
        p->se.nr_migrations             = 0;
        p->se.vruntime                  = 0;
+#ifdef CONFIG_SMP
+       p->se.avg.decay_count           = 0;
+#endif
        INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
@@ -2755,6 +2765,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
  *          - explicit schedule() call
  *          - return from syscall or exception to user-space
  *          - return from interrupt-handler to user-space
+ *
+ * WARNING: all callers must re-check need_resched() afterward and reschedule
+ * accordingly in case an event triggered the need for rescheduling (such as
+ * an interrupt waking up a task) while preemption was disabled in __schedule().
  */
 static void __sched __schedule(void)
 {
@@ -2763,7 +2777,6 @@ static void __sched __schedule(void)
        struct rq *rq;
        int cpu;
 
-need_resched:
        preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
@@ -2783,6 +2796,8 @@ static void __sched __schedule(void)
        smp_mb__before_spinlock();
        raw_spin_lock_irq(&rq->lock);
 
+       rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+
        switch_count = &prev->nivcsw;
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2807,13 +2822,13 @@ static void __sched __schedule(void)
                switch_count = &prev->nvcsw;
        }
 
-       if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
+       if (task_on_rq_queued(prev))
                update_rq_clock(rq);
 
        next = pick_next_task(rq, prev);
        clear_tsk_need_resched(prev);
        clear_preempt_need_resched();
-       rq->skip_clock_update = 0;
+       rq->clock_skip_update = 0;
 
        if (likely(prev != next)) {
                rq->nr_switches++;
@@ -2828,8 +2843,6 @@ static void __sched __schedule(void)
        post_schedule(rq);
 
        sched_preempt_enable_no_resched();
-       if (need_resched())
-               goto need_resched;
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -2849,7 +2862,9 @@ asmlinkage __visible void __sched schedule(void)
        struct task_struct *tsk = current;
 
        sched_submit_work(tsk);
-       __schedule();
+       do {
+               __schedule();
+       } while (need_resched());
 }
 EXPORT_SYMBOL(schedule);
 
@@ -2884,6 +2899,21 @@ void __sched schedule_preempt_disabled(void)
        preempt_disable();
 }
 
+static void preempt_schedule_common(void)
+{
+       do {
+               __preempt_count_add(PREEMPT_ACTIVE);
+               __schedule();
+               __preempt_count_sub(PREEMPT_ACTIVE);
+
+               /*
+                * Check again in case we missed a preemption opportunity
+                * between schedule and now.
+                */
+               barrier();
+       } while (need_resched());
+}
+
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
@@ -2899,17 +2929,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
        if (likely(!preemptible()))
                return;
 
-       do {
-               __preempt_count_add(PREEMPT_ACTIVE);
-               __schedule();
-               __preempt_count_sub(PREEMPT_ACTIVE);
-
-               /*
-                * Check again in case we missed a preemption opportunity
-                * between schedule and now.
-                */
-               barrier();
-       } while (need_resched());
+       preempt_schedule_common();
 }
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
@@ -3405,6 +3425,20 @@ static bool check_same_owner(struct task_struct *p)
        return match;
 }
 
+static bool dl_param_changed(struct task_struct *p,
+               const struct sched_attr *attr)
+{
+       struct sched_dl_entity *dl_se = &p->dl;
+
+       if (dl_se->dl_runtime != attr->sched_runtime ||
+               dl_se->dl_deadline != attr->sched_deadline ||
+               dl_se->dl_period != attr->sched_period ||
+               dl_se->flags != attr->sched_flags)
+               return true;
+
+       return false;
+}
+
 static int __sched_setscheduler(struct task_struct *p,
                                const struct sched_attr *attr,
                                bool user)
@@ -3533,7 +3567,7 @@ static int __sched_setscheduler(struct task_struct *p,
                        goto change;
                if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
                        goto change;
-               if (dl_policy(policy))
+               if (dl_policy(policy) && dl_param_changed(p, attr))
                        goto change;
 
                p->sched_reset_on_fork = reset_on_fork;
@@ -4225,17 +4259,10 @@ SYSCALL_DEFINE0(sched_yield)
        return 0;
 }
 
-static void __cond_resched(void)
-{
-       __preempt_count_add(PREEMPT_ACTIVE);
-       __schedule();
-       __preempt_count_sub(PREEMPT_ACTIVE);
-}
-
 int __sched _cond_resched(void)
 {
        if (should_resched()) {
-               __cond_resched();
+               preempt_schedule_common();
                return 1;
        }
        return 0;
@@ -4260,7 +4287,7 @@ int __cond_resched_lock(spinlock_t *lock)
        if (spin_needbreak(lock) || resched) {
                spin_unlock(lock);
                if (resched)
-                       __cond_resched();
+                       preempt_schedule_common();
                else
                        cpu_relax();
                ret = 1;
@@ -4276,7 +4303,7 @@ int __sched __cond_resched_softirq(void)
 
        if (should_resched()) {
                local_bh_enable();
-               __cond_resched();
+               preempt_schedule_common();
                local_bh_disable();
                return 1;
        }
@@ -4531,9 +4558,10 @@ void sched_show_task(struct task_struct *p)
 {
        unsigned long free = 0;
        int ppid;
-       unsigned state;
+       unsigned long state = p->state;
 
-       state = p->state ? __ffs(p->state) + 1 : 0;
+       if (state)
+               state = __ffs(state) + 1;
        printk(KERN_INFO "%-15.15s %c", p->comm,
                state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
 #if BITS_PER_LONG == 32
@@ -4766,7 +4794,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
-       if (p->sched_class && p->sched_class->set_cpus_allowed)
+       if (p->sched_class->set_cpus_allowed)
                p->sched_class->set_cpus_allowed(p, new_mask);
 
        cpumask_copy(&p->cpus_allowed, new_mask);
@@ -7276,6 +7304,11 @@ void __init sched_init(void)
        enter_lazy_tlb(&init_mm, current);
 
        /*
+        * During early bootup we pretend to be a normal task:
+        */
+       current->sched_class = &fair_sched_class;
+
+       /*
         * Make us the idle thread. Technically, schedule() should not be
         * called from this thread, however somewhere below it might be,
         * but because we are the idle thread, we just pick up running again
@@ -7285,11 +7318,6 @@ void __init sched_init(void)
 
        calc_load_update = jiffies + LOAD_FREQ;
 
-       /*
-        * During early bootup we pretend to be a normal task:
-        */
-       current->sched_class = &fair_sched_class;
-
 #ifdef CONFIG_SMP
        zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
        /* May be allocated at isolcpus cmdline parse time */
@@ -7351,6 +7379,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
                        in_atomic(), irqs_disabled(),
                        current->pid, current->comm);
 
+       if (task_stack_end_corrupted(current))
+               printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
        debug_show_held_locks(current);
        if (irqs_disabled())
                print_irqtrace_events(current);
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 539ca3ce071b..c6acb07466bb 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,7 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
        int best_cpu = -1;
        const struct sched_dl_entity *dl_se = &p->dl;
 
-       if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
+       if (later_mask &&
+           cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
                best_cpu = cpumask_any(later_mask);
                goto out;
        } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
@@ -186,6 +187,26 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
 }
 
 /*
+ * cpudl_set_freecpu - Set the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ */
+void cpudl_set_freecpu(struct cpudl *cp, int cpu)
+{
+       cpumask_set_cpu(cpu, cp->free_cpus);
+}
+
+/*
+ * cpudl_clear_freecpu - Clear the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ */
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
+{
+       cpumask_clear_cpu(cpu, cp->free_cpus);
+}
+
+/*
  * cpudl_init - initialize the cpudl structure
  * @cp: the cpudl max-heap context
  */
@@ -203,7 +224,7 @@ int cpudl_init(struct cpudl *cp)
        if (!cp->elements)
                return -ENOMEM;
 
-       if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+       if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
                kfree(cp->elements);
                return -ENOMEM;
        }
@@ -211,8 +232,6 @@ int cpudl_init(struct cpudl *cp)
        for_each_possible_cpu(i)
                cp->elements[i].idx = IDX_INVALID;
 
-       cpumask_setall(cp->free_cpus);
-
        return 0;
 }
 
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 020039bd1326..1a0a6ef2fbe1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
               struct cpumask *later_mask);
 void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
 int cpudl_init(struct cpudl *cp);
+void cpudl_set_freecpu(struct cpudl *cp, int cpu);
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
 void cpudl_cleanup(struct cpudl *cp);
 #endif /* CONFIG_SMP */
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 726470d47f87..a027799ae130 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -350,6 +350,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
                dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
                dl_se->runtime = pi_se->dl_runtime;
        }
+
+       if (dl_se->dl_yielded)
+               dl_se->dl_yielded = 0;
+       if (dl_se->dl_throttled)
+               dl_se->dl_throttled = 0;
 }
 
 /*
@@ -536,23 +541,19 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 
        sched_clock_tick();
        update_rq_clock(rq);
-       dl_se->dl_throttled = 0;
-       dl_se->dl_yielded = 0;
-       if (task_on_rq_queued(p)) {
-               enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
-               if (dl_task(rq->curr))
-                       check_preempt_curr_dl(rq, p, 0);
-               else
-                       resched_curr(rq);
+       enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+       if (dl_task(rq->curr))
+               check_preempt_curr_dl(rq, p, 0);
+       else
+               resched_curr(rq);
 #ifdef CONFIG_SMP
-               /*
-                * Queueing this task back might have overloaded rq,
-                * check if we need to kick someone away.
-                */
-               if (has_pushable_dl_tasks(rq))
-                       push_dl_task(rq);
+       /*
+        * Queueing this task back might have overloaded rq,
+        * check if we need to kick someone away.
+        */
+       if (has_pushable_dl_tasks(rq))
+               push_dl_task(rq);
 #endif
-       }
 unlock:
        raw_spin_unlock(&rq->lock);
 
@@ -613,10 +614,9 @@ static void update_curr_dl(struct rq *rq)
 
        dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
        if (dl_runtime_exceeded(rq, dl_se)) {
+               dl_se->dl_throttled = 1;
                __dequeue_task_dl(rq, curr, 0);
-               if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
-                       dl_se->dl_throttled = 1;
-               else
+               if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
                        enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
 
                if (!is_leftmost(curr, &rq->dl))
@@ -853,7 +853,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
         * its rq, the bandwidth timer callback (which clearly has not
         * run yet) will take care of this.
         */
-       if (p->dl.dl_throttled)
+       if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
                return;
 
        enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -1073,7 +1073,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
 {
        update_curr_dl(rq);
 
-       if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
+       /*
+        * Even when we have runtime, update_curr_dl() might have resulted in us
+        * not being the leftmost task anymore. In that case NEED_RESCHED will
+        * be set and schedule() will start a new hrtick for the next task.
+        */
+       if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
+           is_leftmost(p, &rq->dl))
                start_hrtick_dl(rq, p);
 }
 
@@ -1166,9 +1172,6 @@ static int find_later_rq(struct task_struct *task)
         * We have to consider system topology and task affinity
         * first, then we can look for a suitable cpu.
         */
-       cpumask_copy(later_mask, task_rq(task)->rd->span);
-       cpumask_and(later_mask, later_mask, cpu_active_mask);
-       cpumask_and(later_mask, later_mask, &task->cpus_allowed);
        best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
                        task, later_mask);
        if (best_cpu == -1)
@@ -1563,6 +1566,7 @@ static void rq_online_dl(struct rq *rq)
        if (rq->dl.overloaded)
                dl_set_overload(rq);
 
+       cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
        if (rq->dl.dl_nr_running > 0)
                cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
 }
@@ -1574,6 +1578,7 @@ static void rq_offline_dl(struct rq *rq)
                dl_clear_overload(rq);
 
        cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+       cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
 }
 
 void init_sched_dl_class(void)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 92cc52001e74..8baaf858d25c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -305,6 +305,7 @@ do {                                                                    \
        PN(next_balance);
        SEQ_printf(m, "  .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
        PN(clock);
+       PN(clock_task);
        P(cpu_load[0]);
        P(cpu_load[1]);
        P(cpu_load[2]);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe331fc391f5..7ce18f3c097a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -676,7 +676,6 @@ void init_task_runnable_average(struct task_struct *p)
 {
        u32 slice;
 
-       p->se.avg.decay_count = 0;
        slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
        p->se.avg.runnable_avg_sum = slice;
        p->se.avg.runnable_avg_period = slice;
@@ -2574,11 +2573,11 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
        u64 decays = atomic64_read(&cfs_rq->decay_counter);
 
        decays -= se->avg.decay_count;
+       se->avg.decay_count = 0;
        if (!decays)
                return 0;
 
        se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
-       se->avg.decay_count = 0;
 
        return decays;
 }
@@ -5157,7 +5156,7 @@ static void yield_task_fair(struct rq *rq)
                 * so we don't do microscopic update in schedule()
                 * and double the fastpath cost.
                 */
-                rq->skip_clock_update = 1;
+               rq_clock_skip_update(rq, true);
        }
 
        set_skip_buddy(se);
@@ -5949,8 +5948,8 @@ static unsigned long scale_rt_capacity(int cpu)
         */
        age_stamp = ACCESS_ONCE(rq->age_stamp);
        avg = ACCESS_ONCE(rq->rt_avg);
+       delta = __rq_clock_broken(rq) - age_stamp;
 
-       delta = rq_clock(rq) - age_stamp;
        if (unlikely(delta < 0))
                delta = 0;
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..aaf1c1d5cf5d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -47,7 +47,8 @@ static inline int cpu_idle_poll(void)
        rcu_idle_enter();
        trace_cpu_idle_rcuidle(0, smp_processor_id());
        local_irq_enable();
-       while (!tif_need_resched())
+       while (!tif_need_resched() &&
+               (cpu_idle_force_poll || tick_check_broadcast_expired()))
                cpu_relax();
        trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
        rcu_idle_exit();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ee15f5a0d1c1..f4d4b077eba0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -831,11 +831,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
                                enqueue = 1;
 
                                /*
-                                * Force a clock update if the CPU was idle,
-                                * lest wakeup -> unthrottle time accumulate.
+                                * When we're idle and a woken (rt) task is
+                                * throttled check_preempt_curr() will set
+                                * skip_update and the time between the wakeup
+                                * and this unthrottle will get accounted as
+                                * 'runtime'.
                                 */
                                if (rt_rq->rt_nr_running && rq->curr == rq->idle)
-                                       rq->skip_clock_update = -1;
+                                       rq_clock_skip_update(rq, false);
                        }
                        if (rt_rq->rt_time || rt_rq->rt_nr_running)
                                idle = 0;
@@ -1337,7 +1340,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
             curr->prio <= p->prio)) {
                int target = find_lowest_rq(p);
 
-               if (target != -1)
+               /*
+                * Don't bother moving it if the destination CPU is
+                * not running a lower priority task.
+                */
+               if (target != -1 &&
+                   p->prio < cpu_rq(target)->rt.highest_prio.curr)
                        cpu = target;
        }
        rcu_read_unlock();
@@ -1614,6 +1622,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 
                lowest_rq = cpu_rq(cpu);
 
+               if (lowest_rq->rt.highest_prio.curr <= task->prio) {
+                       /*
+                        * Target rq has tasks of equal or higher priority,
+                        * retrying does not release any lock and is unlikely
+                        * to yield a different result.
+                        */
+                       lowest_rq = NULL;
+                       break;
+               }
+
                /* if the prio of this runqueue changed, try again */
                if (double_lock_balance(rq, lowest_rq)) {
                        /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9a2a45c970e7..0870db23d79c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -558,8 +558,6 @@ struct rq {
 #ifdef CONFIG_NO_HZ_FULL
        unsigned long last_sched_tick;
 #endif
-       int skip_clock_update;
-
        /* capture load from *all* tasks on this cpu: */
        struct load_weight load;
        unsigned long nr_load_updates;
@@ -588,6 +586,7 @@ struct rq {
        unsigned long next_balance;
        struct mm_struct *prev_mm;
 
+       unsigned int clock_skip_update;
        u64 clock;
        u64 clock_task;
 
@@ -687,16 +686,35 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
 #define raw_rq()               raw_cpu_ptr(&runqueues)
 
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+       return ACCESS_ONCE(rq->clock);
+}
+
 static inline u64 rq_clock(struct rq *rq)
 {
+       lockdep_assert_held(&rq->lock);
        return rq->clock;
 }
 
 static inline u64 rq_clock_task(struct rq *rq)
 {
+       lockdep_assert_held(&rq->lock);
        return rq->clock_task;
 }
 
+#define RQCF_REQ_SKIP  0x01
+#define RQCF_ACT_SKIP  0x02
+
+static inline void rq_clock_skip_update(struct rq *rq, bool skip)
+{
+       lockdep_assert_held(&rq->lock);
+       if (skip)
+               rq->clock_skip_update |= RQCF_REQ_SKIP;
+       else
+               rq->clock_skip_update &= ~RQCF_REQ_SKIP;
+}
+
 #ifdef CONFIG_NUMA
 enum numa_topology_type {
        NUMA_DIRECT,
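
To make the clock_skip_update flags above easier to follow, here is a small
stand-alone user-space model of the two-bit scheme; the rq, its lock and the
real clock reads are elided, and schedule_model() and the simplified function
signatures exist only for this illustration:

	#include <stdio.h>

	#define RQCF_REQ_SKIP	0x01
	#define RQCF_ACT_SKIP	0x02

	static unsigned int clock_skip_update;
	static unsigned int clock_updates;

	/* model of rq_clock_skip_update(): record or withdraw a skip request */
	static void skip_update(int skip)
	{
		if (skip)
			clock_skip_update |= RQCF_REQ_SKIP;
		else
			clock_skip_update &= ~RQCF_REQ_SKIP;
	}

	/* model of update_rq_clock(): only an *active* skip suppresses it */
	static void update_clock(void)
	{
		if (clock_skip_update & RQCF_ACT_SKIP)
			return;
		clock_updates++;
	}

	/* model of __schedule(): promote REQ to ACT, then forget both */
	static void schedule_model(void)
	{
		clock_skip_update <<= 1;	/* promote REQ to ACT */
		update_clock();
		clock_skip_update = 0;
	}

	int main(void)
	{
		update_clock();		/* 1: no request pending */
		skip_update(1);		/* e.g. from check_preempt_curr() */
		update_clock();		/* 2: a request alone does not skip */
		schedule_model();	/* skipped: REQ was promoted to ACT */
		schedule_model();	/* 3: the skip was one-shot */
		printf("clock updated %u times\n", clock_updates);
		return 0;
	}

As far as the model goes, a request made via rq_clock_skip_update() only masks
the single update_rq_clock() call issued by the next __schedule(), whereas the
old skip_clock_update integer suppressed every update until __schedule()
cleared it.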
--