On Thu, Apr 03, 2014 at 10:02:18AM +0200, Mike Galbraith wrote:
> Prevent large wakeup latencies from being accounted to the wrong task.
> 
> Cc: <sta...@vger.kernel.org>
> Signed-off-by: Mike Galbraith <umgwanakikb...@gmail.com>
> ---
>  kernel/sched/core.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -118,7 +118,12 @@ void update_rq_clock(struct rq *rq)
>  {
>  	s64 delta;
>  
> -	if (rq->skip_clock_update > 0)
> +	/*
> +	 * Set during wakeup to indicate we are on the way to schedule().
> +	 * Decrement to ensure that a very large latency is not accounted
> +	 * to the wrong task.
> +	 */
> +	if (rq->skip_clock_update-- > 0)
>  		return;
>  
>  	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
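
To make the effect of that one-liner easier to see in isolation, here is a
minimal user-space model of the before/after behaviour; struct toy_rq, now_ns
and the function names are invented for the sketch (they are not the kernel's
types), but the comparison mirrors the hunk above: with the plain '> 0' test a
stale flag suppresses every later clock update until something else clears it,
while the post-decrement lets it suppress at most one.

/* Toy model of the skip_clock_update change; not kernel code. */
#include <stdio.h>

struct toy_rq {
	long long clock;		/* models rq->clock */
	int skip_clock_update;		/* models rq->skip_clock_update */
};

static long long now_ns;		/* stand-in for sched_clock_cpu() */

/* Old behaviour: a stale flag blocks clock updates indefinitely. */
static void update_clock_old(struct toy_rq *rq)
{
	if (rq->skip_clock_update > 0)
		return;
	rq->clock = now_ns;
}

/* Patched behaviour: the flag is consumed by the first update it skips. */
static void update_clock_new(struct toy_rq *rq)
{
	if (rq->skip_clock_update-- > 0)
		return;
	rq->clock = now_ns;
}

int main(void)
{
	struct toy_rq old_rq = { .clock = 0, .skip_clock_update = 1 };
	struct toy_rq new_rq = { .clock = 0, .skip_clock_update = 1 };

	/* The flag was set at wakeup, but nothing cleared it afterwards. */
	now_ns = 1000000;
	update_clock_old(&old_rq);	/* skipped */
	update_clock_new(&new_rq);	/* skipped, flag consumed */

	now_ns = 500000000;		/* a large latency later */
	update_clock_old(&old_rq);	/* still skipped: clock stays stale */
	update_clock_new(&new_rq);	/* updates again */

	printf("old: %lld  new: %lld\n", old_rq.clock, new_rq.clock);
	return 0;
}
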
OK; so as previously mentioned (Oct '13); I've entirely had it with
skip_clock_update bugs, so I got angry and did the below.

It's not something I can merge, not least because it uses trace_printk(),
but it should be usable to 1) demonstrate the above actually helps and
2) make damn sure we got it right this time :-)

I've not really stared at the output much yet; but when you select the
function_graph tracer, we get lovely things like:

 8)               |  wake_up_process() {
 8)               |    try_to_wake_up() {
 8)   0.076 us    |      _raw_spin_lock_irqsave();
 8)   0.092 us    |      task_waking_fair();
 8)   0.106 us    |      select_task_rq_fair();
 8)   0.161 us    |      _raw_spin_lock();
 8)               |      ttwu_do_activate.constprop.103() {
 8)               |        activate_task() {
 8)               |          enqueue_task() {
 8)               |            update_rq_clock() {
 8)               |              /* clock update: 420411 */
 8)   0.084 us    |              sched_avg_update();
 8)   1.277 us    |            }
 8)               |            enqueue_task_fair() {
 8)               |              enqueue_entity() {
 8)   0.083 us    |                update_curr();
 8)   0.071 us    |                __compute_runnable_contrib();
 8)   0.074 us    |                __update_entity_load_avg_contrib();
 8)   0.121 us    |                update_cfs_rq_blocked_load();
 8)   0.236 us    |                account_entity_enqueue();
 8)   0.076 us    |                update_cfs_shares();
 8)   0.075 us    |                place_entity();
 8)   0.123 us    |                __enqueue_entity();
 8)   5.260 us    |              }
 8)   0.069 us    |              __compute_runnable_contrib();
 8)   0.073 us    |              hrtick_update();
 8)   7.146 us    |            }
 8)   9.583 us    |          }
 8) + 10.169 us   |        }
 8)               |        wq_worker_waking_up() {
 8)   0.071 us    |          kthread_data();
 8)   0.682 us    |        }
 8)               |        ttwu_do_wakeup() {
 8)               |          check_preempt_curr() {
 8)   0.077 us    |            resched_task();
 8)               |            /* skip_clock_update on cpu: 8 */
 8)   1.188 us    |          }
 8)   1.914 us    |        }
 8) + 14.533 us   |      }
 8)   0.071 us    |      _raw_spin_unlock();
 8)   0.082 us    |      _raw_spin_unlock_irqrestore();
 8) + 18.874 us   |    }
 8) + 19.509 us   |  }

...

 8)               |  wake_up_process() {
 8)               |    try_to_wake_up() {
 8)   0.101 us    |      _raw_spin_lock_irqsave();
 8)   0.089 us    |      task_waking_fair();
 8)   0.071 us    |      select_task_rq_fair();
 8)   0.070 us    |      _raw_spin_lock();
 8)               |      ttwu_do_activate.constprop.103() {
 8)               |        activate_task() {
 8)               |          enqueue_task() {
 8)               |            update_rq_clock() {
 8)               |              /* Invalid clock skip on cpu: 8 */
 8)               |              /* clock update: 420413 */
 8)   0.942 us    |            }
 8)               |            enqueue_task_fair() {
 8)               |              enqueue_entity() {
 8)   0.081 us    |                update_curr();
 8)   0.074 us    |                __compute_runnable_contrib();
 8)   0.069 us    |                __update_entity_load_avg_contrib();
 8)   0.091 us    |                update_cfs_rq_blocked_load();
 8)   0.108 us    |                account_entity_enqueue();
 8)   0.081 us    |                update_cfs_shares();
 8)   0.069 us    |                place_entity();
 8)   0.107 us    |                __enqueue_entity();
 8)   5.120 us    |              }
 8)   0.068 us    |              hrtick_update();
 8)   6.410 us    |            }
 8)   8.484 us    |          }
 8)   9.045 us    |        }
 8)               |        wq_worker_waking_up() {
 8)   0.074 us    |          kthread_data();
 8)   0.669 us    |        }
 8)               |        ttwu_do_wakeup() {
 8)               |          check_preempt_curr() {
 8)   0.091 us    |            resched_task();
 8)               |            /* skip_clock_update on cpu: 8 */
 8)   1.080 us    |          }
 8)   1.709 us    |        }
 8) + 13.007 us   |      }
 8)   0.071 us    |      _raw_spin_unlock();
 8)   0.090 us    |      _raw_spin_unlock_irqrestore();
 8) + 17.105 us   |    }
 8) + 17.702 us   |  }

...
8) | schedule_preempt_disabled() { 8) | schedule() { 8) | __schedule() { 8) 0.105 us | rcu_note_context_switch(); 8) 0.078 us | _raw_spin_lock(); 8) | update_rq_clock() { 8) | /* Invalid clock skip on cpu: 8 */ 8) | /* clock update: 420415 */ 8) 0.073 us | sched_avg_update(); 8) 1.630 us | } 8) 0.080 us | pick_next_task_stop(); 8) 0.112 us | pick_next_task_dl(); 8) 0.088 us | pick_next_task_rt(); 8) | pick_next_task_fair() { 8) | put_prev_task_idle() { 8) 0.118 us | idle_exit_fair(); 8) 0.709 us | } 8) | pick_next_entity() { 8) 0.071 us | clear_buddies(); 8) 0.721 us | } 8) | set_next_entity() { 8) 0.139 us | __dequeue_entity(); 8) 0.732 us | } 8) 3.804 us | } ------------------------------------------ 8) <idle>-0 => <...>-220 ------------------------------------------ 8) | finish_task_switch() { 8) 0.076 us | _raw_spin_unlock(); 8) 0.716 us | } 8) ! 1876.643 us | } 8) ! 1877.297 us | } /* schedule */ Also; did I say how much I hate that function_graph doesn't default to latency-format ? --- kernel/sched/core.c | 130 ++++++++++++++++++++++++++++++----------------- kernel/sched/deadline.c | 5 +- kernel/sched/debug.c | 7 +-- kernel/sched/fair.c | 50 ++++++++++-------- kernel/sched/idle_task.c | 4 +- kernel/sched/proc.c | 4 +- kernel/sched/rt.c | 4 +- kernel/sched/sched.h | 105 +++++++++++++++++++++++--------------- lib/Kconfig.debug | 7 +++ 9 files changed, 195 insertions(+), 121 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9cae286824bb..0e5c3dc6ed29 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -118,11 +118,31 @@ void update_rq_clock(struct rq *rq) { s64 delta; +#ifdef CONFIG_SCHED_DEBUG_CLOCK + if (rq->skip_clock_update > 0 && rq->clock_stamp != rq->clock_seq) { + rq->skip_clock_update = 0; + trace_printk("Invalid clock skip on cpu: %d\n", rq->cpu); + goto do_update; + } +#endif + if (rq->skip_clock_update > 0) return; - delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; - rq->clock += delta; +#ifdef CONFIG_SCHED_DEBUG_CLOCK + if (!(rq->clock_stamp & 1)) + trace_printk("clock update outside of rq->lock\n"); + + if (rq->clock_stamp == rq->clock_seq) + trace_printk("superfluous clock update\n"); + +do_update: + trace_printk("clock update: %u\n", rq->clock_seq); + rq->clock_stamp = rq->clock_seq; +#endif + + delta = sched_clock_cpu(cpu_of(rq)) - rq->__clock; + rq->__clock += delta; update_rq_clock_task(rq, delta); } @@ -308,10 +328,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) for (;;) { rq = task_rq(p); - raw_spin_lock(&rq->lock); + rq_lock(rq); if (likely(rq == task_rq(p))) return rq; - raw_spin_unlock(&rq->lock); + rq_unlock(rq); } } @@ -327,10 +347,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) for (;;) { raw_spin_lock_irqsave(&p->pi_lock, *flags); rq = task_rq(p); - raw_spin_lock(&rq->lock); + rq_lock(rq); if (likely(rq == task_rq(p))) return rq; - raw_spin_unlock(&rq->lock); + rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } } @@ -338,7 +358,7 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { - raw_spin_unlock(&rq->lock); + rq_unlock(rq); } static inline void @@ -346,7 +366,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) __releases(rq->lock) __releases(p->pi_lock) { - raw_spin_unlock(&rq->lock); + rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } @@ -360,7 +380,7 @@ static struct rq *this_rq_lock(void) local_irq_disable(); rq = 
this_rq(); - raw_spin_lock(&rq->lock); + rq_lock(rq); return rq; } @@ -386,10 +406,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - raw_spin_lock(&rq->lock); + rq_lock(rq); update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); - raw_spin_unlock(&rq->lock); + rq_unlock(rq); return HRTIMER_NORESTART; } @@ -411,10 +431,10 @@ static void __hrtick_start(void *arg) { struct rq *rq = arg; - raw_spin_lock(&rq->lock); + rq_lock(rq); __hrtick_restart(rq); rq->hrtick_csd_pending = 0; - raw_spin_unlock(&rq->lock); + rq_unlock(rq); } /* @@ -515,7 +535,7 @@ void resched_task(struct task_struct *p) { int cpu; - lockdep_assert_held(&task_rq(p)->lock); + lockdep_assert_held(&task_rq(p)->__lock); if (test_tsk_need_resched(p)) return; @@ -539,10 +559,10 @@ void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - if (!raw_spin_trylock_irqsave(&rq->lock, flags)) + if (!raw_spin_trylock_irqsave(&rq->__lock, flags)) return; resched_task(cpu_curr(cpu)); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->__lock, flags); } #ifdef CONFIG_SMP @@ -837,7 +857,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) } #endif - rq->clock_task += delta; + rq->__clock_task += delta; #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) @@ -967,8 +987,10 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) + if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) { + trace_printk("skip_clock_update on cpu: %d\n", rq->cpu); rq->skip_clock_update = 1; + } } #ifdef CONFIG_SMP @@ -1479,7 +1501,7 @@ static void sched_ttwu_pending(void) struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p; - raw_spin_lock(&rq->lock); + rq_lock(rq); while (llist) { p = llist_entry(llist, struct task_struct, wake_entry); @@ -1487,7 +1509,7 @@ static void sched_ttwu_pending(void) ttwu_do_activate(rq, p, 0); } - raw_spin_unlock(&rq->lock); + rq_unlock(rq); } void scheduler_ipi(void) @@ -1555,9 +1577,9 @@ static void ttwu_queue(struct task_struct *p, int cpu) } #endif - raw_spin_lock(&rq->lock); + rq_lock(rq); ttwu_do_activate(rq, p, 0); - raw_spin_unlock(&rq->lock); + rq_unlock(rq); } /** @@ -1648,12 +1670,12 @@ static void try_to_wake_up_local(struct task_struct *p) WARN_ON_ONCE(p == current)) return; - lockdep_assert_held(&rq->lock); + lockdep_assert_held(&rq->__lock); if (!raw_spin_trylock(&p->pi_lock)) { - raw_spin_unlock(&rq->lock); + raw_spin_unlock(&rq->__lock); raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); + raw_spin_lock(&rq->__lock); } if (!(p->state & TASK_NORMAL)) @@ -2170,10 +2192,12 @@ static inline void post_schedule(struct rq *rq) if (rq->post_schedule) { unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + local_irq_save(flags); + rq_lock(rq); if (rq->curr->sched_class->post_schedule) rq->curr->sched_class->post_schedule(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock(rq); + local_irq_restore(flags); rq->post_schedule = 0; } @@ -2423,11 +2447,11 @@ void scheduler_tick(void) sched_clock_tick(); - raw_spin_lock(&rq->lock); + rq_lock(rq); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); - 
raw_spin_unlock(&rq->lock); + rq_unlock(rq); perf_event_task_tick(); @@ -2670,7 +2694,8 @@ static void __sched __schedule(void) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock_irq(&rq->lock); + local_irq_disable(); + rq_lock(rq); switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@ -2718,8 +2743,10 @@ static void __sched __schedule(void) */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else - raw_spin_unlock_irq(&rq->lock); + } else { + rq_unlock(rq); + local_irq_enable(); + } post_schedule(rq); @@ -4077,9 +4104,8 @@ SYSCALL_DEFINE0(sched_yield) * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&rq->lock); + preempt_disable(); + rq_unlock(rq); sched_preempt_enable_no_resched(); schedule(); @@ -4476,7 +4502,8 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + local_irq_save(flags); + rq_lock(rq); __sched_fork(0, idle); idle->state = TASK_RUNNING; @@ -4502,7 +4529,8 @@ void init_idle(struct task_struct *idle, int cpu) #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock(rq); + local_irq_restore(flags); /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); @@ -4801,11 +4829,11 @@ static void migrate_tasks(unsigned int dead_cpu) /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_cpu, next); - raw_spin_unlock(&rq->lock); + rq_unlock(rq); __migrate_task(next, dead_cpu, dest_cpu); - raw_spin_lock(&rq->lock); + rq_lock(rq); } rq->stop = stop; @@ -5040,27 +5068,31 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_ONLINE: /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); + local_irq_save(flags); + rq_lock(rq); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock(rq); + local_irq_restore(flags); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DYING: sched_ttwu_pending(); /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); + local_irq_save(flags); + rq_lock(rq); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } migrate_tasks(cpu); BUG_ON(rq->nr_running != 1); /* the migration thread */ - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock(rq); + local_irq_restore(flags); break; case CPU_DEAD: @@ -5356,7 +5388,8 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) struct root_domain *old_rd = NULL; unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + local_irq_save(flags); + rq_lock(rq); if (rq->rd) { old_rd = rq->rd; @@ -5382,7 +5415,8 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock(rq); + local_irq_restore(flags); if (old_rd) call_rcu_sched(&old_rd->rcu, free_rootdomain); @@ -6860,7 +6894,7 @@ void __init sched_init(void) struct rq *rq; rq = cpu_rq(i); - raw_spin_lock_init(&rq->lock); + raw_spin_lock_init(&rq->__lock); rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; @@ -7771,13 +7805,13 @@ static int 
tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; - raw_spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->__lock); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); - raw_spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->__lock); } if (runtime_was_enabled && !runtime_enabled) cfs_bandwidth_usage_dec(); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 27ef40925525..4682acb7976a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -514,7 +514,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) dl_timer); struct task_struct *p = dl_task_of(dl_se); struct rq *rq = task_rq(p); - raw_spin_lock(&rq->lock); + + rq_lock(rq); /* * We need to take care of a possible races here. In fact, the @@ -544,7 +545,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) #endif } unlock: - raw_spin_unlock(&rq->lock); + rq_unlock(rq); return HRTIMER_NORESTART; } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f3344c31632a..9a12ea15469f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -188,7 +188,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->__lock, flags); if (cfs_rq->rb_leftmost) MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); @@ -196,7 +196,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) max_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->__lock, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", @@ -302,7 +302,8 @@ do { \ P(nr_uninterruptible); PN(next_balance); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); - PN(clock); + PN(__clock); + PN(__clock_task); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e9bd0b1fa9e..60db6c533873 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3369,7 +3369,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, throttled_list) { struct rq *rq = rq_of(cfs_rq); - raw_spin_lock(&rq->lock); + raw_spin_lock(&rq->__lock); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -3386,7 +3386,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - raw_spin_unlock(&rq->lock); + raw_spin_unlock(&rq->__lock); if (!remaining) break; @@ -4846,7 +4846,8 @@ static void yield_task_fair(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. 
*/ - rq->skip_clock_update = 1; + trace_printk("skip_clock_update on cpu: %d\n", rq->cpu); + rq->skip_clock_update = 1; } set_skip_buddy(se); @@ -5375,7 +5376,8 @@ static void update_blocked_averages(int cpu) struct cfs_rq *cfs_rq; unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + local_irq_save(flags); + rq_lock(rq); update_rq_clock(rq); /* * Iterates the task_group tree in a bottom up fashion, see @@ -5390,7 +5392,8 @@ static void update_blocked_averages(int cpu) __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock(rq); + local_irq_restore(flags); } /* @@ -6565,7 +6568,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, sd->nr_balance_failed++; if (need_active_balance(&env)) { - raw_spin_lock_irqsave(&busiest->lock, flags); + raw_spin_lock_irqsave(&busiest->__lock, flags); /* don't kick the active_load_balance_cpu_stop, * if the curr task on busiest cpu can't be @@ -6573,7 +6576,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, */ if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(busiest->curr))) { - raw_spin_unlock_irqrestore(&busiest->lock, + raw_spin_unlock_irqrestore(&busiest->__lock, flags); env.flags |= LBF_ALL_PINNED; goto out_one_pinned; @@ -6589,7 +6592,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, busiest->push_cpu = this_cpu; active_balance = 1; } - raw_spin_unlock_irqrestore(&busiest->lock, flags); + raw_spin_unlock_irqrestore(&busiest->__lock, flags); if (active_balance) { stop_one_cpu_nowait(cpu_of(busiest), @@ -6664,7 +6667,7 @@ static int idle_balance(struct rq *this_rq) /* * Drop the rq->lock, but keep IRQ/preempt disabled. */ - raw_spin_unlock(&this_rq->lock); + raw_spin_unlock(&this_rq->__lock); update_blocked_averages(this_cpu); rcu_read_lock(); @@ -6702,7 +6705,7 @@ static int idle_balance(struct rq *this_rq) } rcu_read_unlock(); - raw_spin_lock(&this_rq->lock); + raw_spin_lock(&this_rq->__lock); /* * While browsing the domains, we released the rq lock. 
@@ -6753,7 +6756,7 @@ static int active_load_balance_cpu_stop(void *data) struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; - raw_spin_lock_irq(&busiest_rq->lock); + raw_spin_lock_irq(&busiest_rq->__lock); /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || @@ -6803,7 +6806,7 @@ static int active_load_balance_cpu_stop(void *data) double_unlock_balance(busiest_rq, target_rq); out_unlock: busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); + raw_spin_unlock_irq(&busiest_rq->__lock); return 0; } @@ -7091,10 +7094,12 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) rq = cpu_rq(balance_cpu); - raw_spin_lock_irq(&rq->lock); + local_irq_disable(); + rq_lock(rq); update_rq_clock(rq); update_idle_cpu_load(rq); - raw_spin_unlock_irq(&rq->lock); + rq_unlock(rq); + local_irq_enable(); rebalance_domains(rq, CPU_IDLE); @@ -7258,7 +7263,8 @@ static void task_fork_fair(struct task_struct *p) struct rq *rq = this_rq(); unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + local_irq_save(flags); + rq_lock(rq); update_rq_clock(rq); @@ -7292,7 +7298,8 @@ static void task_fork_fair(struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock(rq); + local_irq_restore(flags); } /* @@ -7533,9 +7540,9 @@ void unregister_fair_sched_group(struct task_group *tg, int cpu) if (!tg->cfs_rq[cpu]->on_list) return; - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->__lock, flags); list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->__lock, flags); } void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, @@ -7595,13 +7602,16 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) se = tg->se[i]; /* Propagate contribution to hierarchy */ - raw_spin_lock_irqsave(&rq->lock, flags); + local_irq_save(flags); + rq_lock(rq); /* Possible calls to update_curr() need rq clock */ update_rq_clock(rq); for_each_sched_entity(se) update_cfs_shares(group_cfs_rq(se)); - raw_spin_unlock_irqrestore(&rq->lock, flags); + + rq_unlock(rq); + local_irq_restore(flags); } done: diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 879f2b75266a..91cee2d0bf18 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -39,10 +39,10 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev) static void dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) { - raw_spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->__lock); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); dump_stack(); - raw_spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->__lock); } static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index 16f5a30f9c88..d08b4f715189 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -561,7 +561,7 @@ void update_cpu_load_nohz(void) if (curr_jiffies == this_rq->last_load_update_tick) return; - raw_spin_lock(&this_rq->lock); + raw_spin_lock(&this_rq->__lock); pending_updates = curr_jiffies - this_rq->last_load_update_tick; if (pending_updates) { this_rq->last_load_update_tick = curr_jiffies; @@ -571,7 +571,7 @@ void update_cpu_load_nohz(void) */ __update_cpu_load(this_rq, 0, pending_updates); } - raw_spin_unlock(&this_rq->lock); + raw_spin_unlock(&this_rq->__lock); } #endif /* 
CONFIG_NO_HZ */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d8cdf1618551..2845c9d172f0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -774,7 +774,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); - raw_spin_lock(&rq->lock); + rq_lock(rq); if (rt_rq->rt_time) { u64 runtime; @@ -807,7 +807,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (enqueue) sched_rt_rq_enqueue(rt_rq); - raw_spin_unlock(&rq->lock); + rq_unlock(rq); } if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c9007f28d3a2..7a20e203c057 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -517,7 +517,7 @@ extern struct root_domain def_root_domain; */ struct rq { /* runqueue lock: */ - raw_spinlock_t lock; + raw_spinlock_t __lock; /* * nr_running and cpu_load should be in the same cacheline because @@ -538,7 +538,6 @@ struct rq { #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif - int skip_clock_update; /* capture load from *all* tasks on this cpu: */ struct load_weight load; @@ -568,8 +567,11 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - u64 clock; - u64 clock_task; + unsigned int clock_seq; + unsigned int clock_stamp; + int skip_clock_update; + u64 __clock; + u64 __clock_task; atomic_t nr_iowait; @@ -645,6 +647,24 @@ struct rq { #endif }; +static inline void rq_lock(struct rq *rq) +{ + raw_spin_lock(&rq->__lock); +#ifdef CONFIG_SCHED_DEBUG_CLOCK + rq->clock_seq++; + barrier(); +#endif +} + +static inline void rq_unlock(struct rq *rq) +{ +#ifdef CONFIG_SCHED_DEBUG_CLOCK + barrier(); + rq->clock_seq++; +#endif + raw_spin_unlock(&rq->__lock); +} + static inline int cpu_of(struct rq *rq) { #ifdef CONFIG_SMP @@ -664,12 +684,26 @@ DECLARE_PER_CPU(struct rq, runqueues); static inline u64 rq_clock(struct rq *rq) { - return rq->clock; +#ifdef CONFIG_SCHED_DEBUG_CLOCK + if (rq->clock_stamp != rq->clock_seq) { + trace_printk("reading invalid rq->clock: %u != %u\n", + rq->clock_stamp, rq->clock_seq); + } +#endif + + return rq->__clock; } static inline u64 rq_clock_task(struct rq *rq) { - return rq->clock_task; +#ifdef CONFIG_SCHED_DEBUG_CLOCK + if (rq->clock_stamp != rq->clock_seq) { + trace_printk("reading invalid rq->clock_task: %u != %u\n", + rq->clock_stamp, rq->clock_seq); + } +#endif + + return rq->__clock_task; } #ifdef CONFIG_NUMA_BALANCING @@ -990,16 +1024,17 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; + rq->__lock.owner = current; #endif /* * If we are tracking spinlock dependencies then we have to * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + spin_acquire(&rq->__lock.dep_map, 0, 0, _THIS_IP_); - raw_spin_unlock_irq(&rq->lock); + rq_unlock(rq); + local_irq_enable(); } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -1013,7 +1048,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) */ next->on_cpu = 1; #endif - raw_spin_unlock(&rq->lock); + rq_unlock(rq); } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) @@ -1313,12 +1348,12 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); * reduces 
latency compared to the unfair variant below. However, it * also adds more overhead and therefore may reduce throughput. */ -static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) +static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) __releases(this_rq->lock) __acquires(busiest->lock) __acquires(this_rq->lock) { - raw_spin_unlock(&this_rq->lock); + raw_spin_unlock(&this_rq->__lock); double_rq_lock(this_rq, busiest); return 1; @@ -1332,22 +1367,22 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) * grant the double lock to lower cpus over higher ids under contention, * regardless of entry order into the function. */ -static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) +static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) __releases(this_rq->lock) __acquires(busiest->lock) __acquires(this_rq->lock) { int ret = 0; - if (unlikely(!raw_spin_trylock(&busiest->lock))) { + if (unlikely(!raw_spin_trylock(&busiest->__lock))) { if (busiest < this_rq) { - raw_spin_unlock(&this_rq->lock); - raw_spin_lock(&busiest->lock); - raw_spin_lock_nested(&this_rq->lock, + raw_spin_unlock(&this_rq->__lock); + raw_spin_lock(&busiest->__lock); + raw_spin_lock_nested(&this_rq->__lock, SINGLE_DEPTH_NESTING); ret = 1; } else - raw_spin_lock_nested(&busiest->lock, + raw_spin_lock_nested(&busiest->__lock, SINGLE_DEPTH_NESTING); } return ret; @@ -1355,25 +1390,11 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) #endif /* CONFIG_PREEMPT */ -/* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. - */ -static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) -{ - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - raw_spin_unlock(&this_rq->lock); - BUG_ON(1); - } - - return _double_lock_balance(this_rq, busiest); -} - static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - raw_spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); + raw_spin_unlock(&busiest->__lock); + lock_set_subclass(&this_rq->__lock.dep_map, 0, _RET_IP_); } static inline void double_lock(spinlock_t *l1, spinlock_t *l2) @@ -1406,15 +1427,15 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) { BUG_ON(!irqs_disabled()); if (rq1 == rq2) { - raw_spin_lock(&rq1->lock); + raw_spin_lock(&rq1->__lock); __acquire(rq2->lock); /* Fake it out ;) */ } else { if (rq1 < rq2) { - raw_spin_lock(&rq1->lock); - raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&rq1->__lock); + raw_spin_lock_nested(&rq2->__lock, SINGLE_DEPTH_NESTING); } else { - raw_spin_lock(&rq2->lock); - raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&rq2->__lock); + raw_spin_lock_nested(&rq1->__lock, SINGLE_DEPTH_NESTING); } } } @@ -1429,9 +1450,9 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - raw_spin_unlock(&rq1->lock); + raw_spin_unlock(&rq1->__lock); if (rq1 != rq2) - raw_spin_unlock(&rq2->lock); + raw_spin_unlock(&rq2->__lock); else __release(rq2->lock); } @@ -1450,7 +1471,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) { BUG_ON(!irqs_disabled()); BUG_ON(rq1 != rq2); - raw_spin_lock(&rq1->lock); + raw_spin_lock(&rq1->__lock); __acquire(rq2->lock); /* Fake it out ;) */ } diff --git a/lib/Kconfig.debug 
b/lib/Kconfig.debug
index dd7f8858188a..d8222df9f839 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -779,6 +779,13 @@
 	  that can help debug the scheduler. The runtime overhead of this
 	  option is minimal.
 
+config SCHED_DEBUG_CLOCK
+	bool "Debug rq clock"
+	depends on SCHED_DEBUG
+	default n
+	help
+	  If you say Y here the ftrace output contains debug muck for rq->clock
+
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
 	depends on DEBUG_KERNEL && PROC_FS
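
To make the seq/stamp trick easier to follow outside of the patch, here is a
small self-contained user-space model of the validation that rq_lock(),
rq_unlock(), update_rq_clock() and rq_clock() grow above. The names (toy_rq,
toy_rq_lock(), ...) and the use of fprintf() instead of trace_printk() are
inventions of this sketch, and the checks are condensed, but the invariant is
the one the patch enforces: the sequence count is odd while the lock is held,
the clock is stamped with the sequence it was last updated under, a reader
seeing stamp != seq is looking at a clock that was not refreshed in the
current critical section, and a skip request is only honoured when the clock
is already fresh.

/* User-space model of the clock_seq/clock_stamp validation; not kernel code. */
#include <stdio.h>

struct toy_rq {
	unsigned int clock_seq;		/* bumped on lock and on unlock */
	unsigned int clock_stamp;	/* seq at the last clock update */
	int skip_clock_update;
	unsigned long long clock;
};

static unsigned long long fake_ns;	/* stand-in for sched_clock_cpu() */

static void toy_rq_lock(struct toy_rq *rq)   { rq->clock_seq++; }	/* odd: held */
static void toy_rq_unlock(struct toy_rq *rq) { rq->clock_seq++; }	/* even: free */

static void toy_update_rq_clock(struct toy_rq *rq)
{
	/*
	 * A skip is only honoured if the clock was already updated inside
	 * the current critical section; a stale skip is reported and ignored.
	 */
	if (rq->skip_clock_update > 0 && rq->clock_stamp != rq->clock_seq) {
		rq->skip_clock_update = 0;
		fprintf(stderr, "invalid clock skip\n");
	} else if (rq->skip_clock_update > 0) {
		return;
	}

	if (!(rq->clock_seq & 1))
		fprintf(stderr, "clock update outside of the lock\n");

	rq->clock_stamp = rq->clock_seq;
	rq->clock = fake_ns;
}

static unsigned long long toy_rq_clock(struct toy_rq *rq)
{
	if (rq->clock_stamp != rq->clock_seq)
		fprintf(stderr, "reading a stale clock: %u != %u\n",
			rq->clock_stamp, rq->clock_seq);
	return rq->clock;
}

int main(void)
{
	struct toy_rq rq = { 0 };

	/* Wakeup path: clock updated, then a skip is requested. */
	toy_rq_lock(&rq);
	fake_ns = 100;
	toy_update_rq_clock(&rq);
	rq.skip_clock_update = 1;	/* models check_preempt_curr() */
	toy_rq_unlock(&rq);

	/*
	 * schedule() takes the lock again much later: the old skip is
	 * detected as stale, so the clock is refreshed anyway.
	 */
	toy_rq_lock(&rq);
	fake_ns = 500;
	toy_update_rq_clock(&rq);	/* reports "invalid clock skip" */
	printf("clock = %llu\n", toy_rq_clock(&rq));
	toy_rq_unlock(&rq);
	return 0;
}

Built with a plain cc and run, the model prints the stale-skip complaint from
the second critical section and then "clock = 500".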