On Sat, Mar 28, 2015 at 05:27:23PM +0530, viresh kumar wrote:
> So probably we need to make 'base' aligned to 8 bytes ?
Yeah, something like the patch below (at the very end of this mail) should
ensure the base is cacheline aligned; that should give us a fair few bits.
> So, what you are suggesting is something like this (untested):
> @@ -1202,6 +1208,7 @@ static inline void __run_timers(struct tvec_base *base)
> timer_stats_account_timer(timer);
>
> base->running_timer = timer;
> + tbase_set_running(timer->base);
> detach_expired_timer(timer, base);
>
> if (irqsafe) {
> @@ -1216,6 +1223,7 @@ static inline void __run_timers(struct tvec_base *base)
> }
> }
> base->running_timer = NULL;
> + tbase_clear_running(timer->base);
> spin_unlock_irq(&base->lock);
> }
That's broken. You need to clear running on all the timers you set it
on. Furthermore, you need to revalidate timer->base == base after
call_timer_fn().
Something like so:
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d3f5c504939..489ce182f8ec 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1213,6 +1213,21 @@ static inline void __run_timers(struct tvec_base *base)
call_timer_fn(timer, fn, data);
spin_lock_irq(&base->lock);
}
+
+ if (unlikely(timer->base != base)) {
+ unsigned long flags;
+ struct tvec_base *tbase;
+
+ spin_unlock(&base->lock);
+
+ tbase = lock_timer_base(timer, &flags);
+ tbase_clear_running(timer->base);
+ spin_unlock(&tbase->lock);
+
+ spin_lock(&base->lock);
+ } else {
+ tbase_clear_running(timer->base);
+ }
}
}
base->running_timer = NULL;
Also, once you have tbase_running, we can take base->running_timer out
altogether.
> Now there are a few issues I see here (sorry if they are all imaginary):
> - In case a timer re-arms itself from its handler and is migrated from CPU A
> to B, what happens if the re-armed timer fires before the first handler
> finishes? I.e. timer->fn() hasn't finished running on CPU A and it has fired
> again on CPU B. Wouldn't this expose us to a lot of other problems? It
> wouldn't be serialized against itself anymore?
What I said above.
> - Because the timer has migrated to another CPU, the locking in __run_timers()
> needs to be fixed. And that will make it complicated ..
Hardly.
> - __run_timers() doesn't lock the bases of other CPUs, and it would have to do so now.
Yep, but rarely.
> - We probably need to take the locks of both the local CPU and the one to
> which the timer migrated.
Nope, or rather, not at the same time. That's what the NULL magic buys
us.
> - It's now possible for there to be more than one running timer for a base,
> which wasn't true earlier. Not sure if it will break something.
Only if you messed it up real bad :-)
---
kernel/time/timer.c | 36 ++++++++----------------------------
1 file changed, 8 insertions(+), 28 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d3f5c504939..c8c45bf50b2e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -93,6 +93,7 @@ struct tvec_base {
struct tvec_base boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
+static DEFINE_PER_CPU(struct tvec_base, __tvec_bases);
/* Functions below help us manage 'deferrable' flag */
static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
@@ -1534,46 +1535,25 @@ EXPORT_SYMBOL(schedule_timeout_uninterruptible);
static int init_timers_cpu(int cpu)
{
- int j;
- struct tvec_base *base;
+ struct tvec_base *base = per_cpu(tvec_bases, cpu);
static char tvec_base_done[NR_CPUS];
+ int j;
if (!tvec_base_done[cpu]) {
static char boot_done;
- if (boot_done) {
- /*
- * The APs use this path later in boot
- */
- base = kzalloc_node(sizeof(*base), GFP_KERNEL,
- cpu_to_node(cpu));
- if (!base)
- return -ENOMEM;
-
- /* Make sure tvec_base has TIMER_FLAG_MASK bits free */
- if (WARN_ON(base != tbase_get_base(base))) {
- kfree(base);
- return -ENOMEM;
- }
- per_cpu(tvec_bases, cpu) = base;
+ if (!boot_done) {
+ boot_done = 1; /* skip the boot cpu */
} else {
- /*
- * This is for the boot CPU - we use compile-time
- * static initialisation because per-cpu memory isn't
- * ready yet and because the memory allocators are not
- * initialised either.
- */
- boot_done = 1;
- base = &boot_tvec_bases;
+ base = per_cpu_ptr(&__tvec_bases);
+ per_cpu(tvec_bases, cpu) = base;
}
+
spin_lock_init(&base->lock);
tvec_base_done[cpu] = 1;
base->cpu = cpu;
- } else {
- base = per_cpu(tvec_bases, cpu);
}
-
for (j = 0; j < TVN_SIZE; j++) {
INIT_LIST_HEAD(base->tv5.vec + j);
INIT_LIST_HEAD(base->tv4.vec + j);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/