Re: [RFC] vmstat: Avoid waking up idle-cpu to service shepherd work

Peter Zijlstra Sat, 28 Mar 2015 06:45:58 -0700

On Sat, Mar 28, 2015 at 05:27:23PM +0530, viresh kumar wrote:
> So probably we need to make 'base' aligned to 8 bytes ?


Yeah, something like the below (at the very end) should ensure the thing
is cacheline aligned, that should give us a fair few bits.

> So, what you are suggesting is something like this (untested):

> @@ -1202,6 +1208,7 @@ static inline void __run_timers(struct tvec_base *base)
>                         timer_stats_account_timer(timer);
> 
>                         base->running_timer = timer;
> +                       tbase_set_running(timer->base);
>                         detach_expired_timer(timer, base);
> 
>                         if (irqsafe) {
> @@ -1216,6 +1223,7 @@ static inline void __run_timers(struct tvec_base *base)
>                 }
>         }
>         base->running_timer = NULL;
> +       tbase_clear_running(timer->base);
>         spin_unlock_irq(&base->lock);
>  }

That's broken. You need to clear running on all the timers you set it
on. Furthermore, you need to revalidate timer->base == base after
call_timer_fn().

Something like so:

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d3f5c504939..489ce182f8ec 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1213,6 +1213,21 @@ static inline void __run_timers(struct tvec_base *base)
                                call_timer_fn(timer, fn, data);
                                spin_lock_irq(&base->lock);
                        }
+
+                       if (unlikely(timer->base != base)) {
+                               unsigned long flags;
+                               struct tvec_base *tbase;
+
+                               spin_unlock(&base->lock);
+
+                               tbase = lock_timer_base(timer, &flags);
+                               tbase_clear_running(timer->base);
+                               spin_unlock(&tbase->lock);
+
+                               spin_lock(&base->lock);
+                       } else {
+                               tbase_clear_running(timer->base);
+                       }
                }
        }
        base->running_timer = NULL;

Also, once you have tbase_running, we can take base->running_timer out
altogether.

> Now there are few issues I see here (Sorry if they are all imaginary):
> - In case a timer re-arms itself from its handler and is migrated from CPU A
>   to B, what happens if the re-armed timer fires before the first handler
>   finishes ? i.e. timer->fn() hasn't finished running on CPU A and it has
>   fired again on CPU B. Wouldn't this expose us to a lot of other problems?
>   It wouldn't be serialized to itself anymore ?

What I said above.

> - Because the timer has migrated to another CPU, the locking in __run_timers()
>   needs to be fixed. And that will make it complicated ..

Hardly.

>   - __run_timers() doesn't lock bases of other CPUs, and it has to do it now..

Yep, but rarely.

>   - We probably need to take locks of both local CPU and the one to which
>     timer migrated.

Nope, or rather, not at the same time. That's what the NULL magic buys
us.

> - It's possible now that there can be more than one running timer for a base,
>   which wasn't true earlier. Not sure if it will break something.

Only if you messed it up real bad :-)

---
 kernel/time/timer.c | 36 ++++++++----------------------------
 1 file changed, 8 insertions(+), 28 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d3f5c504939..c8c45bf50b2e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -93,6 +93,7 @@ struct tvec_base {
 struct tvec_base boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
 static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
+static DEFINE_PER_CPU(struct tvec_base, __tvec_bases);
 
 /* Functions below help us manage 'deferrable' flag */
 static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
@@ -1534,46 +1535,25 @@ EXPORT_SYMBOL(schedule_timeout_uninterruptible);
 
 static int init_timers_cpu(int cpu)
 {
-       int j;
-       struct tvec_base *base;
+       struct tvec_base *base = per_cpu(tvec_bases, cpu);
        static char tvec_base_done[NR_CPUS];
+       int j;
 
        if (!tvec_base_done[cpu]) {
                static char boot_done;
 
-               if (boot_done) {
-                       /*
-                        * The APs use this path later in boot
-                        */
-                       base = kzalloc_node(sizeof(*base), GFP_KERNEL,
-                                           cpu_to_node(cpu));
-                       if (!base)
-                               return -ENOMEM;
-
-                       /* Make sure tvec_base has TIMER_FLAG_MASK bits free */
-                       if (WARN_ON(base != tbase_get_base(base))) {
-                               kfree(base);
-                               return -ENOMEM;
-                       }
-                       per_cpu(tvec_bases, cpu) = base;
+               if (!boot_done) {
+                       boot_done = 1; /* skip the boot cpu */
                } else {
-                       /*
-                        * This is for the boot CPU - we use compile-time
-                        * static initialisation because per-cpu memory isn't
-                        * ready yet and because the memory allocators are not
-                        * initialised either.
-                        */
-                       boot_done = 1;
-                       base = &boot_tvec_bases;
+                       base = per_cpu_ptr(&__tvec_bases);
+                       per_cpu(tvec_bases, cpu) = base;
                }
+
                spin_lock_init(&base->lock);
                tvec_base_done[cpu] = 1;
                base->cpu = cpu;
-       } else {
-               base = per_cpu(tvec_bases, cpu);
        }
 
-
        for (j = 0; j < TVN_SIZE; j++) {
                INIT_LIST_HEAD(base->tv5.vec + j);
                INIT_LIST_HEAD(base->tv4.vec + j);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC] vmstat: Avoid waking up idle-cpu to service shepherd work

Reply via email to