Hi,

Are there any new comments so far? Could you please suggest further steps forward?
Thanks,
Alexey

On 10.07.2017 16:03, Alexey Budankov wrote:
> perf/core: complete replace of lists by rb trees for pinned and
> flexible groups at perf_event_context
>
> By default, the userspace perf tool opens per-cpu task-bound events
> when sampling, so for N logical events requested by the user, the tool
> will open N * NR_CPUS events.
>
> In the kernel, we mux events with a hrtimer, periodically rotating the
> flexible group list and trying to schedule each group in turn. We skip
> groups whose cpu filter doesn't match. So when we get unlucky, we can
> walk N * (NR_CPUS - 1) groups pointlessly for each hrtimer invocation.
>
> This has been observed to result in significant overhead when running
> the STREAM benchmark on 272 core Xeon Phi systems.
>
> One way to avoid this is to place our events into an rb tree sorted by
> CPU filter, so that our hrtimer can skip to the current CPU's
> list and ignore everything else.
>
> This patch implements complete replacement of lists by rb trees for
> pinned and flexible groups.
>
> The patch set was tested on Xeon Phi using perf_fuzzer and tests
> from here: https://github.com/deater/perf_event_tests
>
> The full patch set (v1-4) is attached for convenience.
>
> Branch revision:
> * perf/core 007b811b4041989ec2dc91b9614aa2c41332723e
>   Merge tag 'perf-core-for-mingo-4.13-20170719' of
>   git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core
>
> Signed-off-by: Alexey Budankov <alexey.budan...@linux.intel.com>
> ---
>  include/linux/perf_event.h | 20 +---------
>  kernel/events/core.c       | 94 ++++++++++++++++------------------------
>  2 files changed, 34 insertions(+), 80 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 7b2cddf..8e1967f 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -603,13 +603,6 @@ struct perf_event {
>  	 */
>  	struct list_head		group_list;
>  	/*
> -	 * Entry into the group_list list above;
> -	 * the entry may be attached to the self group_list list above
> -	 * in case the event is directly attached to the tree;
> -	 */
> -	struct list_head		group_list_entry;
> -
> -	/*
>  	 * We need storage to track the entries in perf_pmu_migrate_context; we
>  	 * cannot use the event_entry because of RCU and we want to keep the
>  	 * group in tact which avoids us using the other two entries.
> @@ -749,15 +742,6 @@ struct perf_event {
>  #endif /* CONFIG_PERF_EVENTS */
>  };
>
> -/*
> - * event groups keep group leader events arranged as an rb tree with
> - * event->cpu key and as a list for the whole tree iterations;
> - */
> -struct perf_event_groups {
> -	struct list_head	list;
> -	struct rb_root		tree;
> -};
> -
>  /**
>   * struct perf_event_context - event context structure
>   *
> @@ -778,8 +762,8 @@ struct perf_event_context {
>  	struct mutex			mutex;
>
>  	struct list_head		active_ctx_list;
> -	struct perf_event_groups	pinned_groups;
> -	struct perf_event_groups	flexible_groups;
> +	struct rb_root			pinned_groups;
> +	struct rb_root			flexible_groups;
>  	struct list_head		event_list;
>  	int				nr_events;
>  	int				nr_active;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index bddcb87..5142434 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1466,7 +1466,7 @@ static enum event_type_t get_event_type(struct perf_event *event)
>   * Extract pinned or flexible groups from the context
>   * based on event attrs bits;
>   */
> -static struct perf_event_groups *
> +static struct rb_root *
>  get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
>  {
>  	if (event->attr.pinned)
> @@ -1476,11 +1476,11 @@ get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
>  }
>
>  static void
> -perf_event_groups_insert(struct perf_event_groups *groups,
> +perf_event_groups_insert(struct rb_root *groups,
>  			 struct perf_event *event);
>
>  static void
> -perf_event_groups_delete(struct perf_event_groups *groups,
> +perf_event_groups_delete(struct rb_root *groups,
>  			 struct perf_event *event);
>
>  /*
> @@ -1490,7 +1490,7 @@ perf_event_groups_delete(struct perf_event_groups *groups,
>  static void
>  add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
>  {
> -	struct perf_event_groups *groups;
> +	struct rb_root *groups;
>
>  	groups = get_event_groups(event, ctx);
>  	perf_event_groups_insert(groups, event);
> @@ -1502,48 +1502,28 @@ add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
>  static void
>  del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
>  {
> -	struct perf_event_groups *groups;
> +	struct rb_root *groups;
>
>  	groups = get_event_groups(event, ctx);
>  	perf_event_groups_delete(groups, event);
>  }
>
>  /*
> - * Helper function to test if event groups are empty;
> - */
> -static int
> -perf_event_groups_empty(struct perf_event_groups *groups)
> -{
> -	return list_empty(&groups->list);
> -}
> -
> -/*
> - * Helper function to Initialize event groups object;
> - */
> -static void
> -perf_event_groups_init(struct perf_event_groups *groups)
> -{
> -	INIT_LIST_HEAD(&groups->list);
> -	groups->tree = RB_ROOT;
> -}
> -
> -/*
>   * Insert a group into a tree using event->cpu as a key. If event->cpu node
>   * is already attached to the tree then the event is added to the attached
>   * group's group_list list.
>   */
>  static void
> -perf_event_groups_insert(struct perf_event_groups *groups,
> -			 struct perf_event *event)
> +perf_event_groups_insert(struct rb_root *groups, struct perf_event *event)
>  {
>  	struct rb_node **node;
>  	struct rb_node *parent;
>  	struct perf_event *node_event;
>
>  	WARN_ON_ONCE(!groups || !event);
> -	WARN_ON_ONCE(!list_empty(&event->group_list_entry));
> +	WARN_ON_ONCE(!list_empty(&event->group_entry));
>
> -	node = &groups->tree.rb_node;
> +	node = &groups->rb_node;
>  	parent = *node;
>
>  	while (*node) {
> @@ -1556,16 +1536,16 @@ perf_event_groups_insert(struct perf_event_groups *groups,
>  		} else if (event->cpu > node_event->cpu) {
>  			node = &parent->rb_right;
>  		} else {
> -			list_add_tail(&event->group_list_entry,
> +			list_add_tail(&event->group_entry,
>  					&node_event->group_list);
>  			return;
>  		}
>  	}
>
> -	list_add_tail(&event->group_list_entry, &event->group_list);
> +	list_add_tail(&event->group_entry, &event->group_list);
>
>  	rb_link_node(&event->group_node, parent, node);
> -	rb_insert_color(&event->group_node, &groups->tree);
> +	rb_insert_color(&event->group_node, groups);
>  }
>
>  /*
> @@ -1573,30 +1553,28 @@ perf_event_groups_insert(struct perf_event_groups *groups,
>   * it also detaches all groups on the group's group_list list.
>   */
>  static void
> -perf_event_groups_delete(struct perf_event_groups *groups,
> -			 struct perf_event *event)
> +perf_event_groups_delete(struct rb_root *groups, struct perf_event *event)
>  {
>  	struct perf_event *next;
>
>  	WARN_ON_ONCE(!event);
> -	WARN_ON_ONCE(list_empty(&event->group_list_entry));
> +	WARN_ON_ONCE(list_empty(&event->group_entry));
>
> -	list_del_init(&event->group_list_entry);
> +	list_del_init(&event->group_entry);
>
>  	if (!RB_EMPTY_NODE(&event->group_node)) {
>  		WARN_ON_ONCE(!groups);
> -		if (!RB_EMPTY_ROOT(&groups->tree)) {
> +		if (!RB_EMPTY_ROOT(groups)) {
>  			if (list_empty(&event->group_list)) {
>  				rb_erase(&event->group_node, &groups->tree);
>  			} else {
>  				next = list_first_entry(&event->group_list,
> -						struct perf_event, group_list_entry);
> +						struct perf_event, group_entry);
>  				list_replace_init(&event->group_list,
>  						&next->group_list);
>  				rb_replace_node(&event->group_node,
> -						&next->group_node, &groups->tree);
> +						&next->group_node, groups);
>  			}
> -
>  		}
>  		RB_CLEAR_NODE(&event->group_node);
>  	}
> @@ -1606,14 +1584,14 @@ perf_event_groups_delete(struct perf_event_groups *groups,
>   * Find group list by a cpu key and rotate it.
>   */
>  static void
> -perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
> +perf_event_groups_rotate(struct rb_root *groups, int cpu)
>  {
>  	struct rb_node *node;
>  	struct perf_event *node_event;
>
>  	WARN_ON_ONCE(!groups);
>
> -	node = groups->tree.rb_node;
> +	node = groups->rb_node;
>
>  	while (node) {
>  		node_event = container_of(node,
> @@ -1638,7 +1616,7 @@ perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
>  typedef int(*perf_event_groups_iterate_f)(struct perf_event *, void *);
>
>  static void
> -perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
> +perf_event_groups_iterate_cpu(struct rb_root *groups, int cpu,
>  			perf_event_groups_iterate_f callback, void *data)
>  {
>  	struct rb_node *node;
> @@ -1646,7 +1624,7 @@ perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
>
>  	WARN_ON_ONCE(!groups);
>
> -	node = groups->tree.rb_node;
> +	node = groups->rb_node;
>
>  	while (node) {
>  		node_event = container_of(node,
> @@ -1658,7 +1636,7 @@ perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
>  			node = node->rb_right;
>  		} else {
>  			list_for_each_entry(event, &node_event->group_list,
> -					group_list_entry)
> +					group_entry)
>  				callback(event, data);
>  			break;
>  		}
> @@ -1670,26 +1648,20 @@ perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
>   * Iteration stops if the callback returns non zero.
>   */
>  static int
> -perf_event_groups_iterate(struct perf_event_groups *groups,
> +perf_event_groups_iterate(struct rb_root *groups,
>  			  perf_event_groups_iterate_f callback, void *data)
>  {
>  	int ret = 0;
> -	struct perf_event *event;
> +	struct rb_node *node;
>
> -	WARN_ON_ONCE(!groups);
> +	struct perf_event *node_event, *event;
>
> -	list_for_each_entry(event, &groups->list, group_list_entry) {
> -		ret = callback(event, data);
> -		if (ret)
> -			break;
> -	}
> -
> -	/* will replace itration above in patch v5 4/4
> +	WARN_ON_ONCE(!groups);
>
>  	for (node = rb_first(groups); node; node = rb_next(node)) {
>  		node_event = container_of(node, struct perf_event, group_node);
>  		list_for_each_entry(event, &node_event->group_list,
> -				group_list_entry) {
> +				group_entry) {
>  			WARN_ON_ONCE(!(event->cpu == node_event->cpu));
>  			ret = callback(event, data);
>  			if (ret) {
> @@ -1698,8 +1670,6 @@ perf_event_groups_iterate(struct perf_event_groups *groups,
>  		}
>  	}
>
> -	*/
> -
>  	return ret;
>  }
>
> @@ -2965,7 +2935,9 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>  		.cpuctx = cpuctx,
>  		.ctx = ctx
>  	};
> +
>  	int cpu = smp_processor_id();
> +
>  	lockdep_assert_held(&ctx->lock);
>
>  	if (likely(!ctx->nr_events)) {
> @@ -3399,7 +3371,6 @@ ctx_sched_in(struct perf_event_context *ctx,
>  		.ctx = ctx
>  	};
>  	int cpu = smp_processor_id();
> -
>  	lockdep_assert_held(&ctx->lock);
>
>  	if (likely(!ctx->nr_events))
> @@ -3490,7 +3461,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
>  	 * However, if task's ctx is not carrying any pinned
>  	 * events, no need to flip the cpuctx's events around.
>  	 */
> -	if (!perf_event_groups_empty(&ctx->pinned_groups))
> +	if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
>  		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
>  	perf_event_sched_in(cpuctx, ctx, task, mux);
>  	perf_pmu_enable(ctx->pmu);
> @@ -4057,8 +4028,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
>  	raw_spin_lock_init(&ctx->lock);
>  	mutex_init(&ctx->mutex);
>  	INIT_LIST_HEAD(&ctx->active_ctx_list);
> -	perf_event_groups_init(&ctx->pinned_groups);
> -	perf_event_groups_init(&ctx->flexible_groups);
> +	ctx->pinned_groups = RB_ROOT;
> +	ctx->flexible_groups = RB_ROOT;
>  	INIT_LIST_HEAD(&ctx->event_list);
>  	atomic_set(&ctx->refcount, 1);
>  }
> @@ -9695,7 +9666,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>  	INIT_LIST_HEAD(&event->sibling_list);
>  	RB_CLEAR_NODE(&event->group_node);
>  	INIT_LIST_HEAD(&event->group_list);
> -	INIT_LIST_HEAD(&event->group_list_entry);
>  	INIT_LIST_HEAD(&event->rb_entry);
>  	INIT_LIST_HEAD(&event->active_entry);
>  	INIT_LIST_HEAD(&event->addr_filters.list);
>
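
Not part of the patch, but for anyone following the changelog above, here is a stand-alone user-space sketch of the idea: group leaders keyed by event->cpu in a search tree, with all leaders that share a CPU chained off the first node, so per-CPU iteration never touches groups filtered to other CPUs. To put numbers on the changelog's cost model, on a 272 logical-CPU Xeon Phi with N = 4 logical events the tool opens 4 * 272 = 1088 events, and each hrtimer rotation over the flat list can walk up to 4 * 271 = 1084 groups that cannot run on the current CPU; with the tree it descends to one node and walks only that CPU's chain. The sketch deliberately uses a plain unbalanced binary tree instead of the kernel rb tree, and every identifier in it (fake_event, groups_insert, groups_iterate_cpu) is invented for illustration, not taken from the patch.

/*
 * Illustrative only: a plain (unbalanced) BST stands in for the kernel
 * rb tree, and all names here are made up for the example.
 */
#include <stdio.h>
#include <stdlib.h>

struct fake_event {
	int cpu;				/* CPU filter; the tree key */
	int id;					/* only to tell events apart */
	struct fake_event *next_same_cpu;	/* chain of same-CPU leaders */
	struct fake_event *left, *right;	/* tree links */
};

/* Insert a group leader; leaders with an equal key share one tree node. */
static struct fake_event *groups_insert(struct fake_event *root,
					struct fake_event *event)
{
	if (!root)
		return event;
	if (event->cpu < root->cpu) {
		root->left = groups_insert(root->left, event);
	} else if (event->cpu > root->cpu) {
		root->right = groups_insert(root->right, event);
	} else {
		/* Same CPU: chain behind the existing node, no new tree node. */
		event->next_same_cpu = root->next_same_cpu;
		root->next_same_cpu = event;
	}
	return root;
}

/* Visit only the group leaders whose CPU filter matches @cpu. */
static void groups_iterate_cpu(struct fake_event *root, int cpu,
			       void (*cb)(struct fake_event *))
{
	while (root) {
		if (cpu < root->cpu) {
			root = root->left;
		} else if (cpu > root->cpu) {
			root = root->right;
		} else {
			struct fake_event *ev;

			for (ev = root; ev; ev = ev->next_same_cpu)
				cb(ev);
			return;
		}
	}
}

static void print_event(struct fake_event *ev)
{
	printf("schedule group id=%d (cpu=%d)\n", ev->id, ev->cpu);
}

int main(void)
{
	struct fake_event *root = NULL;
	int i;

	/* Model "N logical events x NR_CPUS": 3 events opened on 4 CPUs. */
	for (i = 0; i < 12; i++) {
		struct fake_event *ev = calloc(1, sizeof(*ev));

		ev->cpu = i % 4;
		ev->id = i;
		root = groups_insert(root, ev);
	}

	/* The hrtimer path would only walk the current CPU's chain. */
	groups_iterate_cpu(root, 2, print_event);
	return 0;
}

The patch above achieves the same effect with rb_link_node()/rb_insert_color() and the per-node group_list, which additionally keeps the lookup balanced.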