From: Kan Liang <kan.li...@intel.com> The perf_event_aux function goes through all pmus and all events in every context to find the side-band events to output, which is unnecessary and expensive.
For example, the brk test case in lkp triggers many mmap operations while perf with cycles:pp is also running on the system. As a result, perf_event_aux is invoked many times, and each invocation searches all pmus and all events. If we enable the uncore support (even when uncore events are not actually used), dozens of uncore pmus are added to the pmus list, which can significantly decrease brk_test's ops_per_sec. Based on our test, the ops_per_sec without the uncore patch is 2647573, while the ops_per_sec with the uncore patch is only 1768444, which is a 33.2% reduction. To get at the per-cpu side-band events, this patch puts the side-band events into four categories, which are tracked by four per-cpu lists. Only the lists of the categories selected by the caller's mask are searched for interested events. To get at the per-task side-band events, each task context of the current task is searched, because we don't want to update more global state on context switch. Reported-by: Huang, Ying <ying.hu...@linux.intel.com> Suggested-by: Peter Zijlstra (Intel) <pet...@infradead.org> Signed-off-by: Kan Liang <kan.li...@intel.com> --- The V1 patch is "perf/core: find auxiliary events in running pmus list" https://lkml.org/lkml/2016/2/24/961. This V2 patch changes almost everything compared with V1. The V2 patch is mainly based on Peter's suggestion. However, I didn't rename perf_event_aux to perf_event_sb, because there appear to be many other aux things in the code, e.g. the AUX area in the ring buffer, and I'm not sure whether we need to change all aux to sb. We may do the rename later in a separate patch. 
include/linux/perf_event.h | 26 +++++++++ kernel/events/core.c | 135 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 144 insertions(+), 17 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 15588d4..953113e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -437,6 +437,31 @@ struct swevent_hlist { struct perf_cgroup; struct ring_buffer; +struct pmu_event_list { + raw_spinlock_t lock; + struct list_head list; +}; + +/* + * {mmap,mmap_data,mmap2} -> mmap + * {comm,comm_exec} -> comm + * task + * context_switch + */ +enum event_sb_channel { + sb_mmap = 0, + sb_comm, + sb_task, + sb_switch, + + sb_nr, +}; + +#define IS_SB_MMAP(attr) \ + (attr.mmap || attr.mmap_data || attr.mmap2) +#define IS_SB_COMM(attr) \ + (attr.comm || attr.comm_exec) + /** * struct perf_event - performance event kernel representation: */ @@ -589,6 +614,7 @@ struct perf_event { int cgrp_defer_enabled; #endif + struct list_head sb_list[sb_nr]; #endif /* CONFIG_PERF_EVENTS */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index de24fbc..bff49d0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -333,6 +333,7 @@ static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); +static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events[sb_nr]); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -3560,6 +3561,37 @@ static void free_event_rcu(struct rcu_head *head) static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb); +static void detach_sb_event(struct perf_event *event, enum event_sb_channel sb) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events[sb], event->cpu); + + raw_spin_lock(&pel->lock); + list_del_rcu(&event->sb_list[sb]); + raw_spin_unlock(&pel->lock); +} + +static void unaccount_pmu_sb_event(struct perf_event *event) +{ + if (event->parent) 
+ return; + + if (event->attach_state & PERF_ATTACH_TASK) + return; + + if (IS_SB_MMAP(event->attr)) + detach_sb_event(event, sb_mmap); + + if (IS_SB_COMM(event->attr)) + detach_sb_event(event, sb_comm); + + if (event->attr.task) + detach_sb_event(event, sb_task); + + if (event->attr.context_switch) + detach_sb_event(event, sb_switch); + +} + static void unaccount_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -3623,6 +3655,8 @@ static void unaccount_event(struct perf_event *event) } unaccount_event_cpu(event, event->cpu); + + unaccount_pmu_sb_event(event); } static void perf_sched_delayed(struct work_struct *work) @@ -5720,13 +5754,41 @@ perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, rcu_read_unlock(); } +static void perf_event_sb_iterate(enum event_sb_channel sb, + perf_event_aux_output_cb output, + void *data) +{ + struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events[sb]); + struct perf_event *event; + + list_for_each_entry_rcu(event, &pel->list, sb_list[sb]) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + output(event, data); + } +} + +static void perf_event_sb_mask(unsigned int sb_mask, + perf_event_aux_output_cb output, + void *data) +{ + int sb; + + for (sb = 0; sb < sb_nr; sb++) { + if (!(sb_mask & (1 << sb))) + continue; + perf_event_sb_iterate(sb, output, data); + } +} + static void perf_event_aux(perf_event_aux_output_cb output, void *data, - struct perf_event_context *task_ctx) + struct perf_event_context *task_ctx, + unsigned int sb_mask) { - struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; int ctxn; /* @@ -5741,21 +5803,17 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, } rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_aux_ctx(&cpuctx->ctx, output, data); - ctxn = 
pmu->task_ctx_nr; - if (ctxn < 0) - goto next; + preempt_disable(); + perf_event_sb_mask(sb_mask, output, data); + + for_each_task_context_nr(ctxn) { ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) perf_event_aux_ctx(ctx, output, data); -next: - put_cpu_ptr(pmu->pmu_cpu_context); } + preempt_enable(); rcu_read_unlock(); + } /* @@ -5852,7 +5910,8 @@ static void perf_event_task(struct task_struct *task, perf_event_aux(perf_event_task_output, &task_event, - task_ctx); + task_ctx, + (1 << sb_task) | (1 << sb_mmap) | (1 << sb_comm)); } void perf_event_fork(struct task_struct *task) @@ -5931,7 +5990,8 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) perf_event_aux(perf_event_comm_output, comm_event, - NULL); + NULL, + 1 << sb_comm); } void perf_event_comm(struct task_struct *task, bool exec) @@ -6162,7 +6222,8 @@ got_name: perf_event_aux(perf_event_mmap_output, mmap_event, - NULL); + NULL, + 1 << sb_mmap); kfree(buf); } @@ -6350,7 +6411,8 @@ static void perf_event_switch(struct task_struct *task, perf_event_aux(perf_event_switch_output, &switch_event, - NULL); + NULL, + 1 << sb_switch); } /* @@ -7841,6 +7903,37 @@ unlock: return pmu; } +static void attach_sb_event(struct perf_event *event, enum event_sb_channel sb) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events[sb], event->cpu); + + raw_spin_lock(&pel->lock); + list_add_rcu(&event->sb_list[sb], &pel->list); + raw_spin_unlock(&pel->lock); +} + +static void account_pmu_sb_event(struct perf_event *event) +{ + if (event->parent) + return; + + if (event->attach_state & PERF_ATTACH_TASK) + return; + + if (IS_SB_MMAP(event->attr)) + attach_sb_event(event, sb_mmap); + + if (IS_SB_COMM(event->attr)) + attach_sb_event(event, sb_comm); + + if (event->attr.task) + attach_sb_event(event, sb_task); + + if (event->attr.context_switch) + attach_sb_event(event, sb_switch); + +} + static void account_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -7921,6 +8014,8 
@@ static void account_event(struct perf_event *event) enabled: account_event_cpu(event, event->cpu); + + account_pmu_sb_event(event); } /* @@ -7938,6 +8033,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; + int i; if ((unsigned)cpu >= nr_cpu_ids) { if (!task || cpu != -1) @@ -7965,6 +8061,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->active_entry); INIT_HLIST_NODE(&event->hlist_entry); + for (i = 0; i < sb_nr; i++) + INIT_LIST_HEAD(&event->sb_list[i]); init_waitqueue_head(&event->waitq); init_irq_work(&event->pending, perf_pending_event); @@ -9360,11 +9458,14 @@ static void __init perf_event_init_all_cpus(void) { struct swevent_htable *swhash; int cpu; + int i; for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); + for (i = 0; i < sb_nr; i++) + INIT_LIST_HEAD(&per_cpu(pmu_sb_events[i].list, cpu)); } } -- 2.5.0