On Tue, May 20, 2014 at 11:02 AM, Peter Zijlstra <pet...@infradead.org> wrote: > Subject: perf: Fix mux_interval hrtimer wreckage > From: Peter Zijlstra <pet...@infradead.org> > Date: Tue May 20 10:09:32 CEST 2014 > > Thomas stumbled over the hrtimer_forward_now() in > perf_event_mux_interval_ms_store() and noticed its broken-ness. > > You cannot just change the expiry time of an active timer, it will > destroy the red-black tree order and cause havoc. > > Change it to (re)start the timer instead, (re)starting a timer will > dequeue and enqueue a timer and therefore preserve rb-tree order. > > Since we cannot enqueue remotely, wrap the thing in > cpu_function_call(), this however mandates that we restrict ourselves > to online cpus. Also serialize the entire setting so we don't get > multiple concurrent threads trying to update to different values. > > Also fix a problem in perf_mux_hrtimer_restart(), checking against > hrtimer_active() can actually lose us the timer when timer->state == > HRTIMER_STATE_CALLBACK and the callback has already decided NORESTART. > > Furthermore it doesn't make any sense to test > hrtimer_callback_running() when we already tested hrtimer_active(), > but with the above change, we explicitly must call it when > callback_running. > > Lastly, rename a few functions: > > s/perf_cpu_hrtimer_/perf_mux_hrtimer_/ -- because I could not find > the mux timer function > > s/\<hr\>/timer/ -- because that's the normal way of calling things. > > Fixes: 62b856397927 ("perf: Add sysfs entry to adjust multiplexing interval > per PMU") > Cc: Stephane Eranian <eran...@google.com> > Reported-by: Thomas Gleixner <t...@linutronix.de> > Signed-off-by: Peter Zijlstra <pet...@infradead.org> > Link: http://lkml.kernel.org/n/tip-ife5kqgnt7mviatc9fakz...@git.kernel.org
So, I tested this patch on tip.git and it panics my kernels as soon as I multiplex events. For instance running: $ perf stat -e cycles,cycles,cycles,cycles,cycles,cycles dd if=/dev/urandom of=/dev/null count=10000000 It dies as shown below, so something is wrong in the setup of that hrtimer now. [ 101.092337] RIP: 0010:[<ffffffffac06afad>] [<ffffffffac06afad>] __run_hrtimer.isra.31+0x9d/0x100 [ 101.101231] RSP: 0000:ffff88021fa43ed8 EFLAGS: 00010002 [ 101.106537] RAX: 00000000cf46cf46 RBX: ffff88021fa55718 RCX: 0000000000000018 [ 101.113664] RDX: 000000000000cf46 RSI: 000000177bc14ea6 RDI: ffff88021fa4d0a0 [ 101.120807] RBP: ffff88021fa43ef8 R08: 000000177bfe4df9 R09: 00000000000004c4 [ 101.127949] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88021fa4d0a0 [ 101.135083] R13: ffff88021fa4d0e0 R14: 0000000000000001 R15: 7fffffffffffffff [ 101.142201] FS: 0000000001e7e860(0063) GS:ffff88021fa40000(0000) knlGS:0000000000000000 [ 101.150289] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 101.156035] CR2: 00007f1139eb9000 CR3: 0000000215d84000 CR4: 00000000000407e0 [ 101.163154] Stack: [ 101.165171] ffff88021fa4d0e0 000000177bc12e1e ffff88021fa4d0a0 0000000000000001 [ 101.172652] ffff88021fa43f68 ffffffffac06b615 000000177bc12e1e ffff88021fa4d1d8 [ 101.180113] ffff88021fa4d198 ffff88021fa4d158 0000000300000002 000000177bc12e1e [ 101.187577] Call Trace: [ 101.190020] <IRQ> [ 101.191942] [<ffffffffac06b615>] hrtimer_interrupt+0xf5/0x230 [ 101.197986] [<ffffffffac02fd76>] local_apic_timer_interrupt+0x36/0x60 [ 101.204507] [<ffffffffac0303ee>] smp_apic_timer_interrupt+0x3e/0x60 [ 101.210864] [<ffffffffac5e068a>] apic_timer_interrupt+0x6a/0x70 > --- > kernel/events/core.c | 67 > ++++++++++++++++++++++++++++----------------------- > 1 file changed, 38 insertions(+), 29 deletions(-) > > --- a/kernel/events/core.c > +++ b/kernel/events/core.c > @@ -45,9 +45,11 @@ > > #include <asm/irq_regs.h> > > +typedef int (*remote_function_f)(void *); > + > struct 
remote_function_call { > struct task_struct *p; > - int (*func)(void *info); > + remote_function_f func; > void *info; > int ret; > }; > @@ -80,7 +82,7 @@ static void remote_function(void *data) > * -EAGAIN - when the process moved away > */ > static int > -task_function_call(struct task_struct *p, int (*func) (void *info), void > *info) > +task_function_call(struct task_struct *p, remote_function_f func, void *info) > { > struct remote_function_call data = { > .p = p, > @@ -104,7 +106,7 @@ task_function_call(struct task_struct *p > * > * returns: @func return value or -ENXIO when the cpu is offline > */ > -static int cpu_function_call(int cpu, int (*func) (void *info), void *info) > +static int cpu_function_call(int cpu, remote_function_f func, void *info) > { > struct remote_function_call data = { > .p = NULL, > @@ -759,7 +761,7 @@ perf_cgroup_mark_enabled(struct perf_eve > /* > * function must be called with interrupts disbled > */ > -static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) > +static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) > { > struct perf_cpu_context *cpuctx; > enum hrtimer_restart ret = HRTIMER_NORESTART; > @@ -783,7 +785,7 @@ static enum hrtimer_restart perf_cpu_hrt > } > > /* CPU is going down */ > -void perf_cpu_hrtimer_cancel(int cpu) > +void perf_mux_hrtimer_cancel(int cpu) > { > struct perf_cpu_context *cpuctx; > struct pmu *pmu; > @@ -810,11 +812,11 @@ void perf_cpu_hrtimer_cancel(int cpu) > local_irq_restore(flags); > } > > -static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) > +static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) > { > - struct hrtimer *hr = &cpuctx->hrtimer; > + struct hrtimer *timer = &cpuctx->hrtimer; > struct pmu *pmu = cpuctx->ctx.pmu; > - int timer; > + u64 interval; > > /* no multiplexing needed for SW PMU */ > if (pmu->task_ctx_nr == perf_sw_context) > @@ -824,31 +826,32 @@ static void __perf_cpu_hrtimer_init(stru > * 
check default is sane, if not set then force to > * default interval (1/tick) > */ > - timer = pmu->hrtimer_interval_ms; > - if (timer < 1) > - timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; > + interval = pmu->hrtimer_interval_ms; > + if (interval < 1) > + interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; > > - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); > + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); > > - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); > - hr->function = perf_cpu_hrtimer_handler; > + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); > + timer->function = perf_mux_hrtimer_handler; > } > > -static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) > +static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) > { > - struct hrtimer *hr = &cpuctx->hrtimer; > + struct hrtimer *timer = &cpuctx->hrtimer; > struct pmu *pmu = cpuctx->ctx.pmu; > > /* not for SW PMU */ > if (pmu->task_ctx_nr == perf_sw_context) > - return; > + return 0; > > - if (hrtimer_active(hr)) > - return; > + if (hrtimer_is_queued(timer)) > + return 0; > > - if (!hrtimer_callback_running(hr)) > - __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, > - 0, HRTIMER_MODE_REL_PINNED, 0); > + __hrtimer_start_range_ns(timer, cpuctx->hrtimer_interval, > + 0, HRTIMER_MODE_REL_PINNED, 0); > + > + return 0; > } > > void perf_pmu_disable(struct pmu *pmu) > @@ -1746,7 +1749,7 @@ group_sched_in(struct perf_event *group_ > > if (event_sched_in(group_event, cpuctx, ctx)) { > pmu->cancel_txn(pmu); > - perf_cpu_hrtimer_restart(cpuctx); > + perf_mux_hrtimer_restart(cpuctx); > return -EAGAIN; > } > > @@ -1793,7 +1796,7 @@ group_sched_in(struct perf_event *group_ > > pmu->cancel_txn(pmu); > > - perf_cpu_hrtimer_restart(cpuctx); > + perf_mux_hrtimer_restart(cpuctx); > > return -EAGAIN; > } > @@ -2061,7 +2064,7 @@ static int __perf_event_enable(void *inf > */ > if (leader != event) { > 
group_sched_out(leader, cpuctx, ctx); > - perf_cpu_hrtimer_restart(cpuctx); > + perf_mux_hrtimer_restart(cpuctx); > } > if (leader->attr.pinned) { > update_group_times(leader); > @@ -6396,6 +6399,8 @@ perf_event_mux_interval_ms_show(struct d > return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); > } > > +static DEFINE_MUTEX(mux_interval_mutex); > + > static ssize_t > perf_event_mux_interval_ms_store(struct device *dev, > struct device_attribute *attr, > @@ -6415,17 +6420,21 @@ perf_event_mux_interval_ms_store(struct > if (timer == pmu->hrtimer_interval_ms) > return count; > > + mutex_lock(&mux_interval_mutex); > pmu->hrtimer_interval_ms = timer; > > /* update all cpuctx for this PMU */ > - for_each_possible_cpu(cpu) { > + get_online_cpus(); > + for_each_online_cpu(cpu) { > struct perf_cpu_context *cpuctx; > cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); > cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); > > - if (hrtimer_active(&cpuctx->hrtimer)) > - hrtimer_forward_now(&cpuctx->hrtimer, > cpuctx->hrtimer_interval); > + cpu_function_call(cpu, > + (remote_function_f)perf_mux_hrtimer_restart, cpuctx); > } > + put_online_cpus(); > + mutex_unlock(&mux_interval_mutex); > > return count; > } > @@ -6531,7 +6540,7 @@ int perf_pmu_register(struct pmu *pmu, c > cpuctx->ctx.type = cpu_context; > cpuctx->ctx.pmu = pmu; > > - __perf_cpu_hrtimer_init(cpuctx, cpu); > + __perf_mux_hrtimer_init(cpuctx, cpu); > > INIT_LIST_HEAD(&cpuctx->rotation_list); > cpuctx->unique_pmu = pmu; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/