When a potentially periodic timer is deleted through del_timer_sync(), every CPU is scanned to determine whether the timer is currently running on it. In a NUMA configuration, this scan causes NUMA interlink traffic, which limits the scalability of timers.
The following patch makes the timer remember where the timer was last started. It is then possible to only wait for the completion of the timer on that specific cpu. Signed-off-by: Shai Fultheim <[EMAIL PROTECTED]> Signed-off-by: Christoph Lameter <[EMAIL PROTECTED]> Index: linux-2.6.11/include/linux/timer.h =================================================================== --- linux-2.6.11.orig/include/linux/timer.h 2005-03-07 21:42:43.539328640 -0800 +++ linux-2.6.11/include/linux/timer.h 2005-03-07 21:42:50.993195480 -0800 @@ -19,10 +19,19 @@ struct timer_list { unsigned long data; struct tvec_t_base_s *base; +#ifdef CONFIG_SMP + struct tvec_t_base_s *last_running; +#endif }; #define TIMER_MAGIC 0x4b87ad6e +#ifdef CONFIG_SMP +#define TIMER_INIT_LASTRUNNING .last_running = NULL, +#else +#define TIMER_INIT_LASTRUNNING +#endif + #define TIMER_INITIALIZER(_function, _expires, _data) { \ .function = (_function), \ .expires = (_expires), \ @@ -30,6 +39,7 @@ struct timer_list { .base = NULL, \ .magic = TIMER_MAGIC, \ .lock = SPIN_LOCK_UNLOCKED, \ + TIMER_INIT_LASTRUNNING \ } /*** @@ -41,6 +51,9 @@ struct timer_list { */ static inline void init_timer(struct timer_list * timer) { +#ifdef CONFIG_SMP + timer->last_running = NULL; +#endif timer->base = NULL; timer->magic = TIMER_MAGIC; spin_lock_init(&timer->lock); Index: linux-2.6.11/kernel/timer.c =================================================================== --- linux-2.6.11.orig/kernel/timer.c 2005-03-07 21:42:43.539328640 -0800 +++ linux-2.6.11/kernel/timer.c 2005-03-07 22:01:27.733425160 -0800 @@ -84,6 +84,14 @@ static inline void set_running_timer(tve #endif } +static inline void set_last_running(struct timer_list *timer, + tvec_base_t *base) +{ +#ifdef CONFIG_SMP + timer->last_running = base; +#endif +} + /* Fake initialization */ static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED }; @@ -335,33 +343,41 @@ EXPORT_SYMBOL(del_timer); * * The function returns whether it has deactivated a pending 
timer or not. * - * del_timer_sync() is slow and complicated because it copes with timer - * handlers which re-arm the timer (periodic timers). If the timer handler - * is known to not do this (a single shot timer) then use - * del_singleshot_timer_sync() instead. + * del_timer_sync() copes with time handlers which re-arm the timer (periodic + * timers). If the timer handler is known to not do this (a single shot + * timer) then use del_singleshot_timer_sync() instead. */ int del_timer_sync(struct timer_list *timer) { tvec_base_t *base; - int i, ret = 0; + int ret = 0; check_timer(timer); del_again: ret += del_timer(timer); - for_each_online_cpu(i) { - base = &per_cpu(tvec_bases, i); - if (base->running_timer == timer) { - while (base->running_timer == timer) { - cpu_relax(); - preempt_check_resched(); - } - break; + /* Get where the timer ran last */ + base = timer->last_running; + if (base) { + /* + * If the timer is still executing then wait until the + * run is complete. + */ + while (base->running_timer == timer) { + cpu_relax(); + preempt_check_resched(); } } smp_rmb(); - if (timer_pending(timer)) + /* + * If the timer is no longer pending and its last run + * was where we checked then the timer + * is truly off. If the timer has been started on some other + * cpu in the meantime (due to a race condition) then + * we need to repeat what we have done. + */ + if (timer_pending(timer) || timer->last_running != base) goto del_again; return ret; @@ -464,6 +480,7 @@ repeat: set_running_timer(base, timer); smp_wmb(); timer->base = NULL; + set_last_running(timer, base); spin_unlock_irq(&base->lock); { u32 preempt_count = preempt_count(); - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/