Hi,

Are there any new comments so far? Could you please suggest further steps 
forward?

Thanks,
Alexey

On 10.07.2017 16:03, Alexey Budankov wrote:
> perf/core: complete replace of lists by rb trees for pinned and 
>            flexible groups at perf_event_context
> 
> By default, the userspace perf tool opens per-cpu task-bound events
> when sampling, so for N logical events requested by the user, the tool
> will open N * NR_CPUS events.
> 
> In the kernel, we mux events with a hrtimer, periodically rotating the
> flexible group list and trying to schedule each group in turn. We skip
> groups whose cpu filter doesn't match. So when we get unlucky, we can
> walk N * (NR_CPUS - 1) groups pointlessly for each hrtimer invocation.
> 
> This has been observed to result in significant overhead when running
> the STREAM benchmark on 272 core Xeon Phi systems.
> 
> One way to avoid this is to place our events into an rb tree sorted by
> CPU filter, so that our hrtimer can skip to the current CPU's
> list and ignore everything else.
> 
> This patch implements complete replacement of lists by rb trees for 
> pinned and flexible groups.
> 
> The patch set was tested on Xeon Phi using perf_fuzzer and tests 
> from here: https://github.com/deater/perf_event_tests
> 
> The full patch set (v1-4) is attached for convenience. 
> 
> Branch revision:
> * perf/core 007b811b4041989ec2dc91b9614aa2c41332723e 
>   Merge tag 'perf-core-for-mingo-4.13-20170719' of 
>   git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core
> 
> Signed-off-by: Alexey Budankov <alexey.budan...@linux.intel.com>
> ---
>  include/linux/perf_event.h | 20 +---------
>  kernel/events/core.c       | 94 
> ++++++++++++++++------------------------------
>  2 files changed, 34 insertions(+), 80 deletions(-)
> 
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 7b2cddf..8e1967f 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -603,13 +603,6 @@ struct perf_event {
>        */
>       struct list_head                group_list;
>       /*
> -      * Entry into the group_list list above;
> -      * the entry may be attached to the self group_list list above
> -      * in case the event is directly attached to the tree;
> -      */
> -     struct list_head                group_list_entry;
> -
> -     /*
>        * We need storage to track the entries in perf_pmu_migrate_context; we
>        * cannot use the event_entry because of RCU and we want to keep the
>        * group in tact which avoids us using the other two entries.
> @@ -749,15 +742,6 @@ struct perf_event {
>  #endif /* CONFIG_PERF_EVENTS */
>  };
>  
> -/*
> - * event groups keep group leader events arranged as an rb tree with
> - * event->cpu key and as a list for the whole tree iterations;
> - */
> -struct perf_event_groups {
> -     struct list_head list;
> -     struct rb_root   tree;
> -};
> -
>  /**
>   * struct perf_event_context - event context structure
>   *
> @@ -778,8 +762,8 @@ struct perf_event_context {
>       struct mutex                    mutex;
>  
>       struct list_head                active_ctx_list;
> -     struct perf_event_groups        pinned_groups;
> -     struct perf_event_groups        flexible_groups;
> +     struct rb_root                  pinned_groups;
> +     struct rb_root                  flexible_groups;
>       struct list_head                event_list;
>       int                             nr_events;
>       int                             nr_active;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index bddcb87..5142434 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1466,7 +1466,7 @@ static enum event_type_t get_event_type(struct 
> perf_event *event)
>   * Extract pinned or flexible groups from the context
>   * based on event attrs bits;
>   */
> -static struct perf_event_groups *
> +static struct rb_root *
>  get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
>  {
>       if (event->attr.pinned)
> @@ -1476,11 +1476,11 @@ get_event_groups(struct perf_event *event, struct 
> perf_event_context *ctx)
>  }
>  
>  static void
> -perf_event_groups_insert(struct perf_event_groups *groups,
> +perf_event_groups_insert(struct rb_root *groups,
>               struct perf_event *event);
>  
>  static void
> -perf_event_groups_delete(struct perf_event_groups *groups,
> +perf_event_groups_delete(struct rb_root *groups,
>               struct perf_event *event);
>  
>  /*
> @@ -1490,7 +1490,7 @@ perf_event_groups_delete(struct perf_event_groups 
> *groups,
>  static void
>  add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
>  {
> -     struct perf_event_groups *groups;
> +     struct rb_root *groups;
>  
>       groups = get_event_groups(event, ctx);
>       perf_event_groups_insert(groups, event);
> @@ -1502,48 +1502,28 @@ add_event_to_groups(struct perf_event *event, struct 
> perf_event_context *ctx)
>  static void
>  del_event_from_groups(struct perf_event *event, struct perf_event_context 
> *ctx)
>  {
> -     struct perf_event_groups *groups;
> +     struct rb_root *groups;
>  
>       groups = get_event_groups(event, ctx);
>       perf_event_groups_delete(groups, event);
>  }
>  
>  /*
> - * Helper function to test if event groups are empty;
> - */
> -static int
> -perf_event_groups_empty(struct perf_event_groups *groups)
> -{
> -     return list_empty(&groups->list);
> -}
> -
> -/*
> - * Helper function to Initialize event groups object;
> - */
> -static void
> -perf_event_groups_init(struct perf_event_groups *groups)
> -{
> -     INIT_LIST_HEAD(&groups->list);
> -     groups->tree = RB_ROOT;
> -}
> -
> -/*
>   * Insert a group into a tree using event->cpu as a key. If event->cpu node
>   * is already attached to the tree then the event is added to the attached
>   * group's group_list list.
>   */
>  static void
> -perf_event_groups_insert(struct perf_event_groups *groups,
> -             struct perf_event *event)
> +perf_event_groups_insert(struct rb_root *groups, struct perf_event *event)
>  {
>       struct rb_node **node;
>       struct rb_node *parent;
>       struct perf_event *node_event;
>  
>       WARN_ON_ONCE(!groups || !event);
> -     WARN_ON_ONCE(!list_empty(&event->group_list_entry));
> +     WARN_ON_ONCE(!list_empty(&event->group_entry));
>  
> -     node = &groups->tree.rb_node;
> +     node = &groups->rb_node;
>       parent = *node;
>  
>       while (*node) {
> @@ -1556,16 +1536,16 @@ perf_event_groups_insert(struct perf_event_groups 
> *groups,
>               } else if (event->cpu > node_event->cpu) {
>                       node = &parent->rb_right;
>               } else {
> -                     list_add_tail(&event->group_list_entry,
> +                     list_add_tail(&event->group_entry,
>                                       &node_event->group_list);
>                       return;
>               }
>       }
>  
> -     list_add_tail(&event->group_list_entry, &event->group_list);
> +     list_add_tail(&event->group_entry, &event->group_list);
>  
>       rb_link_node(&event->group_node, parent, node);
> -     rb_insert_color(&event->group_node, &groups->tree);
> +     rb_insert_color(&event->group_node, groups);
>  }
>  
>  /*
> @@ -1573,30 +1553,28 @@ perf_event_groups_insert(struct perf_event_groups 
> *groups,
>   * it also detaches all groups on the group's group_list list.
>   */
>  static void
> -perf_event_groups_delete(struct perf_event_groups *groups,
> -             struct perf_event *event)
> +perf_event_groups_delete(struct rb_root *groups, struct perf_event *event)
>  {
>       struct perf_event *next;
>  
>       WARN_ON_ONCE(!event);
> -     WARN_ON_ONCE(list_empty(&event->group_list_entry));
> +     WARN_ON_ONCE(list_empty(&event->group_entry));
>  
> -     list_del_init(&event->group_list_entry);
> +     list_del_init(&event->group_entry);
>  
>       if (!RB_EMPTY_NODE(&event->group_node)) {
>               WARN_ON_ONCE(!groups);
> -             if (!RB_EMPTY_ROOT(&groups->tree)) {
> +             if (!RB_EMPTY_ROOT(groups)) {
>                       if (list_empty(&event->group_list)) {
>                               rb_erase(&event->group_node, &groups->tree);
>                       } else {
>                               next = list_first_entry(&event->group_list,
> -                                             struct perf_event, 
> group_list_entry);
> +                                             struct perf_event, group_entry);
>                               list_replace_init(&event->group_list,
>                                               &next->group_list);
>                               rb_replace_node(&event->group_node,
> -                                             &next->group_node, 
> &groups->tree);
> +                                             &next->group_node, groups);
>                       }
> -
>               }
>               RB_CLEAR_NODE(&event->group_node);
>       }
> @@ -1606,14 +1584,14 @@ perf_event_groups_delete(struct perf_event_groups 
> *groups,
>   * Find group list by a cpu key and rotate it.
>   */
>  static void
> -perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
> +perf_event_groups_rotate(struct rb_root *groups, int cpu)
>  {
>       struct rb_node *node;
>       struct perf_event *node_event;
>  
>       WARN_ON_ONCE(!groups);
>  
> -     node = groups->tree.rb_node;
> +     node = groups->rb_node;
>  
>       while (node) {
>               node_event = container_of(node,
> @@ -1638,7 +1616,7 @@ perf_event_groups_rotate(struct perf_event_groups 
> *groups, int cpu)
>  typedef int(*perf_event_groups_iterate_f)(struct perf_event *, void *);
>  
>  static void
> -perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
> +perf_event_groups_iterate_cpu(struct rb_root *groups, int cpu,
>               perf_event_groups_iterate_f callback, void *data)
>  {
>       struct rb_node *node;
> @@ -1646,7 +1624,7 @@ perf_event_groups_iterate_cpu(struct perf_event_groups 
> *groups, int cpu,
>  
>       WARN_ON_ONCE(!groups);
>  
> -     node = groups->tree.rb_node;
> +     node = groups->rb_node;
>  
>       while (node) {
>               node_event = container_of(node,
> @@ -1658,7 +1636,7 @@ perf_event_groups_iterate_cpu(struct perf_event_groups 
> *groups, int cpu,
>                       node = node->rb_right;
>               } else {
>                       list_for_each_entry(event, &node_event->group_list,
> -                                     group_list_entry)
> +                                     group_entry)
>                               callback(event, data);
>                       break;
>               }
> @@ -1670,26 +1648,20 @@ perf_event_groups_iterate_cpu(struct 
> perf_event_groups *groups, int cpu,
>   * Iteration stops if the callback returns non zero.
>   */
>  static int
> -perf_event_groups_iterate(struct perf_event_groups *groups,
> +perf_event_groups_iterate(struct rb_root *groups,
>               perf_event_groups_iterate_f callback, void *data)
>  {
>       int ret = 0;
> -     struct perf_event *event;
> +     struct rb_node *node;
>  
> -     WARN_ON_ONCE(!groups);
> +     struct perf_event *node_event, *event;
>  
> -     list_for_each_entry(event, &groups->list, group_list_entry) {
> -             ret = callback(event, data);
> -             if (ret)
> -                     break;
> -     }
> -
> -     /* will replace itration above in patch v5 4/4
> +     WARN_ON_ONCE(!groups);
>  
>       for (node = rb_first(groups); node; node = rb_next(node)) {
>               node_event = container_of(node, struct perf_event, group_node);
>               list_for_each_entry(event, &node_event->group_list,
> -                             group_list_entry) {
> +                             group_entry) {
>                       WARN_ON_ONCE(!(event->cpu == node_event->cpu));
>                       ret = callback(event, data);
>                       if (ret) {
> @@ -1698,8 +1670,6 @@ perf_event_groups_iterate(struct perf_event_groups 
> *groups,
>               }
>       }
>  
> -     */
> -
>       return ret;
>  }
>  
> @@ -2965,7 +2935,9 @@ static void ctx_sched_out(struct perf_event_context 
> *ctx,
>                       .cpuctx = cpuctx,
>                       .ctx = ctx
>       };
> +
>       int cpu = smp_processor_id();
> +
>       lockdep_assert_held(&ctx->lock);
>  
>       if (likely(!ctx->nr_events)) {
> @@ -3399,7 +3371,6 @@ ctx_sched_in(struct perf_event_context *ctx,
>                       .ctx = ctx
>       };
>       int cpu = smp_processor_id();
> -
>       lockdep_assert_held(&ctx->lock);
>  
>       if (likely(!ctx->nr_events))
> @@ -3490,7 +3461,7 @@ static void perf_event_context_sched_in(struct 
> perf_event_context *ctx,
>        * However, if task's ctx is not carrying any pinned
>        * events, no need to flip the cpuctx's events around.
>        */
> -     if (!perf_event_groups_empty(&ctx->pinned_groups))
> +     if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
>               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
>       perf_event_sched_in(cpuctx, ctx, task, mux);
>       perf_pmu_enable(ctx->pmu);
> @@ -4057,8 +4028,8 @@ static void __perf_event_init_context(struct 
> perf_event_context *ctx)
>       raw_spin_lock_init(&ctx->lock);
>       mutex_init(&ctx->mutex);
>       INIT_LIST_HEAD(&ctx->active_ctx_list);
> -     perf_event_groups_init(&ctx->pinned_groups);
> -     perf_event_groups_init(&ctx->flexible_groups);
> +     ctx->pinned_groups = RB_ROOT;
> +     ctx->flexible_groups = RB_ROOT;
>       INIT_LIST_HEAD(&ctx->event_list);
>       atomic_set(&ctx->refcount, 1);
>  }
> @@ -9695,7 +9666,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>       INIT_LIST_HEAD(&event->sibling_list);
>       RB_CLEAR_NODE(&event->group_node);
>       INIT_LIST_HEAD(&event->group_list);
> -     INIT_LIST_HEAD(&event->group_list_entry);
>       INIT_LIST_HEAD(&event->rb_entry);
>       INIT_LIST_HEAD(&event->active_entry);
>       INIT_LIST_HEAD(&event->addr_filters.list);
> 

Reply via email to