This patch series continues v2 and addresses the review comments received on it.

Specifically, this patch replaces the pinned_groups and flexible_groups
lists of perf_event_context with red-black trees indexed by CPU, avoiding
duplication of data structures and making it possible to iterate the
event groups of a specific CPU only.
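
The per-CPU iteration is what the tree layout is for: a tree node is keyed
by event->cpu and carries a group_list of all groups allocated for that
CPU, so a caller can visit just the cpu == -1 (any CPU) groups plus the
groups bound to one particular CPU instead of walking every group in the
context. In this patch only rotate_ctx() uses that mode; as a rough
illustration of where this can be taken, a flexible sched-in path could
look like the sketch below. The wrapper function itself is hypothetical
and not part of this patch; perf_cpu_tree_iterate_cpu(),
group_sched_in_flexible_callback() and struct group_sched_params are the
ones introduced further down.

/*
 * Illustration only: schedule in the flexible groups relevant to the
 * CPU we are running on, instead of iterating the whole tree.
 */
static void ctx_flexible_sched_in_cpu(struct perf_event_context *ctx,
                                      struct perf_cpu_context *cpuctx)
{
        struct group_sched_params params = {
                        .cpuctx = cpuctx,
                        .ctx = ctx,
                        .can_add_hw = 1
        };

        /* Groups not bound to a particular CPU (event->cpu == -1) first, */
        perf_cpu_tree_iterate_cpu(&ctx->flexible_tree, -1,
                        group_sched_in_flexible_callback, &params);
        /* then the groups bound to the CPU being scheduled. */
        perf_cpu_tree_iterate_cpu(&ctx->flexible_tree, smp_processor_id(),
                        group_sched_in_flexible_callback, &params);
}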

Signed-off-by: Alexey Budankov <[email protected]>
---
 include/linux/perf_event.h |  34 +++-
 kernel/events/core.c        | 386 +++++++++++++++++++++++++++++++++------------
 2 files changed, 315 insertions(+), 105 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24a6358..2c1dcf1 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -574,6 +574,27 @@ struct perf_event {
        struct list_head                sibling_list;

        /*
+        * Node in the pinned or flexible tree of the event context;
+        * the node stays empty when the event is not attached to the
+        * tree directly but is linked into the group_list of the event
+        * that is attached to the tree;
+        */
+       struct rb_node                  group_node;
+       /*
+        * List of groups allocated for the same cpu;
+        * the list stays empty when the event is not attached to the
+        * tree directly but is linked into the group_list of the event
+        * that is attached to the tree;
+        */
+       struct list_head                group_list;
+       /*
+        * Entry in a group_list list;
+        * the entry links the event into its own group_list above
+        * when the event is attached to the pinned or flexible
+        * tree directly;
+        */
+       struct list_head                group_list_entry;
+       /*
         * We need storage to track the entries in perf_pmu_migrate_context; we
         * cannot use the event_entry because of RCU and we want to keep the
         * group in tact which avoids us using the other two entries.
@@ -741,8 +762,17 @@ struct perf_event_context {
        struct mutex                    mutex;

        struct list_head                active_ctx_list;
-       struct list_head                pinned_groups;
-       struct list_head                flexible_groups;
+       /*
+        * RB tree of pinned groups, indexed by event->cpu; holds the
+        * group_node of every attached pinned group;
+        */
+       struct rb_root                  pinned_tree;
+       /*
+        * RB tree of flexible groups, indexed by event->cpu; holds the
+        * group_node of every attached flexible group;
+        */
+       struct rb_root                  flexible_tree;
+
        struct list_head                event_list;
        int                             nr_events;
        int                             nr_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bc63f8d..a3531f9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1458,13 +1458,142 @@ static enum event_type_t get_event_type(struct perf_event *event)
        return event_type;
 }

-static struct list_head *
-ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+static struct rb_root *
+ctx_group_cpu_tree(struct perf_event *event, struct perf_event_context *ctx)
 {
        if (event->attr.pinned)
-               return &ctx->pinned_groups;
+               return &ctx->pinned_tree;
        else
-               return &ctx->flexible_groups;
+               return &ctx->flexible_tree;
+}
+
+/*
+ * Insert a group into a tree using event->cpu as the key. If a node for
+ * event->cpu already exists in the tree, the event is added to the
+ * group_list of the group attached to that node.
+ */
+static void
+perf_cpu_tree_insert(struct rb_root *tree, struct perf_event *event)
+{
+       struct rb_node **node;
+       struct rb_node *parent;
+
+       WARN_ON_ONCE(!tree || !event);
+
+       node = &tree->rb_node;
+       parent = *node;
+
+       while (*node) {
+               struct perf_event *node_event = container_of(*node,
+                               struct perf_event, group_node);
+
+               parent = *node;
+
+               if (event->cpu < node_event->cpu) {
+                       node = &parent->rb_left;
+               } else if (event->cpu > node_event->cpu) {
+                       node = &parent->rb_right;
+               } else {
+                       list_add_tail(&event->group_list_entry,
+                                       &node_event->group_list);
+                       return;
+               }
+       }
+
+       list_add_tail(&event->group_list_entry, &event->group_list);
+
+       rb_link_node(&event->group_node, parent, node);
+       rb_insert_color(&event->group_node, tree);
+}
+
+/*
+ * Delete a group from a tree. If the group is attached to the tree directly,
+ * all groups on its group_list are detached from the tree as well.
+ */
+static void
+perf_cpu_tree_delete(struct rb_root *tree, struct perf_event *event)
+{
+       WARN_ON_ONCE(!tree || !event);
+
+       if (RB_EMPTY_NODE(&event->group_node)) {
+               list_del_init(&event->group_list_entry);
+       } else {
+               struct perf_event *list_event, *list_next;
+
+               rb_erase(&event->group_node, tree);
+               list_for_each_entry_safe(list_event, list_next,
+                               &event->group_list, group_list_entry)
+                       list_del_init(&list_event->group_list_entry);
+       }
+}
+
+/*
+ * Find the group_list for the given cpu key and call the provided callback
+ * for every group on that list. Iteration stops if the callback returns non-zero.
+ */
+
+typedef int (*perf_cpu_tree_callback_t)(struct perf_event *, void *);
+
+static int
+perf_cpu_tree_iterate_cpu(struct rb_root *tree, int cpu,
+               perf_cpu_tree_callback_t callback, void *data)
+{
+       int ret = 0;
+       struct rb_node *node;
+       struct perf_event *event;
+
+       WARN_ON_ONCE(!tree);
+
+       node = tree->rb_node;
+
+       while (node) {
+               struct perf_event *node_event = container_of(node,
+                               struct perf_event, group_node);
+
+               if (cpu < node_event->cpu) {
+                       node = node->rb_left;
+               } else if (cpu > node_event->cpu) {
+                       node = node->rb_right;
+               } else {
+                       list_for_each_entry(event, &node_event->group_list,
+                                       group_list_entry) {
+                               ret = callback(event, data);
+                               if (ret)
+                                       return ret;
+                       }
+                       break;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Iterate a tree and call the provided callback for every group in the
+ * tree. Iteration stops if the callback returns non-zero.
+ */
+static int
+perf_cpu_tree_iterate(struct rb_root *tree,
+               perf_cpu_tree_callback_t callback, void *data)
+{
+       int ret = 0;
+       struct rb_node *node;
+       struct perf_event *event;
+
+       WARN_ON_ONCE(!tree);
+
+       for (node = rb_first(tree); node; node = rb_next(node)) {
+               struct perf_event *node_event = container_of(node,
+                               struct perf_event, group_node);
+
+               list_for_each_entry(event, &node_event->group_list,
+                               group_list_entry) {
+                       ret = callback(event, data);
+                       if (ret)
+                               return ret;
+               }
+       }
+
+       return 0;
 }

 /*
@@ -1485,12 +1614,12 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
         * perf_group_detach can, at all times, locate all siblings.
         */
        if (event->group_leader == event) {
-               struct list_head *list;
+               struct rb_root *tree;

                event->group_caps = event->event_caps;

-               list = ctx_group_list(event, ctx);
-               list_add_tail(&event->group_entry, list);
+               tree = ctx_group_cpu_tree(event, ctx);
+               perf_cpu_tree_insert(tree, event);
        }

        list_update_cgroup_event(event, ctx, true);
@@ -1680,8 +1809,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)

        list_del_rcu(&event->event_entry);

-       if (event->group_leader == event)
-               list_del_init(&event->group_entry);
+       if (event->group_leader == event) {
+               struct rb_root *tree;
+
+               tree = ctx_group_cpu_tree(event, ctx);
+               perf_cpu_tree_delete(tree, event);
+       }

        update_group_times(event);

@@ -2699,12 +2832,80 @@ int perf_event_refresh(struct perf_event *event, int refresh)
 }
 EXPORT_SYMBOL_GPL(perf_event_refresh);

+static void
+sched_in_group(struct perf_event *event,
+               struct perf_event_context *ctx,
+               struct perf_cpu_context *cpuctx,
+               int *can_add_hw)
+{
+       /*
+        * Ignore events in OFF or ERROR state and listen to the 'cpu'
+        * scheduling filter constraint of events:
+        */
+       if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event))
+               return;
+
+       /* may need to reset tstamp_enabled */
+       if (is_cgroup_event(event))
+               perf_cgroup_mark_enabled(event, ctx);
+
+       if (group_can_go_on(event, cpuctx, *can_add_hw)) {
+               if (group_sched_in(event, cpuctx, ctx))
+                       *can_add_hw = 0;
+       }
+}
+
+struct group_sched_params {
+       struct perf_cpu_context *cpuctx;
+       struct perf_event_context *ctx;
+       int can_add_hw;
+};
+
+static int
+group_sched_in_pinned_callback(struct perf_event *event, void *data)
+{
+       struct group_sched_params *params = data;
+       int can_add_hw = 1;
+
+       sched_in_group(event, params->ctx, params->cpuctx, &can_add_hw);
+
+       if (!can_add_hw) {
+               update_group_times(event);
+               event->state = PERF_EVENT_STATE_ERROR;
+       }
+
+       return 0;
+}
+
+static int
+group_sched_in_flexible_callback(struct perf_event *event, void *data)
+{
+       struct group_sched_params *params = data;
+
+       sched_in_group(event, params->ctx, params->cpuctx,
+                       &params->can_add_hw);
+
+       return 0;
+}
+
+static int group_sched_out_callback(struct perf_event *event, void *data)
+{
+       struct group_sched_params *params = data;
+
+       group_sched_out(event, params->cpuctx, params->ctx);
+
+       return 0;
+}
+
 static void ctx_sched_out(struct perf_event_context *ctx,
                          struct perf_cpu_context *cpuctx,
                          enum event_type_t event_type)
 {
        int is_active = ctx->is_active;
-       struct perf_event *event;
+       struct group_sched_params params = {
+                       .cpuctx = cpuctx,
+                       .ctx = ctx
+       };

        lockdep_assert_held(&ctx->lock);

@@ -2750,15 +2951,15 @@ static void ctx_sched_out(struct perf_event_context *ctx,
                return;

        perf_pmu_disable(ctx->pmu);
-       if (is_active & EVENT_PINNED) {
-               list_for_each_entry(event, &ctx->pinned_groups, group_entry)
-                       group_sched_out(event, cpuctx, ctx);
-       }

-       if (is_active & EVENT_FLEXIBLE) {
-               list_for_each_entry(event, &ctx->flexible_groups, group_entry)
-                       group_sched_out(event, cpuctx, ctx);
-       }
+       if (is_active & EVENT_PINNED)
+               perf_cpu_tree_iterate(&ctx->pinned_tree,
+                               group_sched_out_callback, &params);
+
+       if (is_active & EVENT_FLEXIBLE)
+               perf_cpu_tree_iterate(&ctx->flexible_tree,
+                               group_sched_out_callback, &params);
+
        perf_pmu_enable(ctx->pmu);
 }

@@ -3052,72 +3253,18 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 }

 static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
-                   struct perf_cpu_context *cpuctx)
-{
-       struct perf_event *event;
-
-       list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-               if (event->state <= PERF_EVENT_STATE_OFF)
-                       continue;
-               if (!event_filter_match(event))
-                       continue;
-
-               /* may need to reset tstamp_enabled */
-               if (is_cgroup_event(event))
-                       perf_cgroup_mark_enabled(event, ctx);
-
-               if (group_can_go_on(event, cpuctx, 1))
-                       group_sched_in(event, cpuctx, ctx);
-
-               /*
-                * If this pinned group hasn't been scheduled,
-                * put it in error state.
-                */
-               if (event->state == PERF_EVENT_STATE_INACTIVE) {
-                       update_group_times(event);
-                       event->state = PERF_EVENT_STATE_ERROR;
-               }
-       }
-}
-
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
-                     struct perf_cpu_context *cpuctx)
-{
-       struct perf_event *event;
-       int can_add_hw = 1;
-
-       list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
-               /* Ignore events in OFF or ERROR state */
-               if (event->state <= PERF_EVENT_STATE_OFF)
-                       continue;
-               /*
-                * Listen to the 'cpu' scheduling filter constraint
-                * of events:
-                */
-               if (!event_filter_match(event))
-                       continue;
-
-               /* may need to reset tstamp_enabled */
-               if (is_cgroup_event(event))
-                       perf_cgroup_mark_enabled(event, ctx);
-
-               if (group_can_go_on(event, cpuctx, can_add_hw)) {
-                       if (group_sched_in(event, cpuctx, ctx))
-                               can_add_hw = 0;
-               }
-       }
-}
-
-static void
 ctx_sched_in(struct perf_event_context *ctx,
             struct perf_cpu_context *cpuctx,
             enum event_type_t event_type,
             struct task_struct *task)
 {
        int is_active = ctx->is_active;
-       u64 now;
+       struct group_sched_params params = {
+                       .cpuctx = cpuctx,
+                       .ctx = ctx,
+                       .can_add_hw = 1
+       };

        lockdep_assert_held(&ctx->lock);

@@ -3136,8 +3283,7 @@ ctx_sched_in(struct perf_event_context *ctx,

        if (is_active & EVENT_TIME) {
                /* start ctx time */
-               now = perf_clock();
-               ctx->timestamp = now;
+               ctx->timestamp = perf_clock();
                perf_cgroup_set_timestamp(task, ctx);
        }

@@ -3146,11 +3292,13 @@ ctx_sched_in(struct perf_event_context *ctx,
         * in order to give them the best chance of going on.
         */
        if (is_active & EVENT_PINNED)
-               ctx_pinned_sched_in(ctx, cpuctx);
+               perf_cpu_tree_iterate(&ctx->pinned_tree,
+                               group_sched_in_pinned_callback, &params);

        /* Then walk through the lower prio flexible groups */
        if (is_active & EVENT_FLEXIBLE)
-               ctx_flexible_sched_in(ctx, cpuctx);
+               perf_cpu_tree_iterate(&ctx->flexible_tree,
+                               group_sched_in_flexible_callback, &params);
 }

 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -3181,7 +3329,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
         * However, if task's ctx is not carrying any pinned
         * events, no need to flip the cpuctx's events around.
         */
-       if (!list_empty(&ctx->pinned_groups))
+       if (!RB_EMPTY_ROOT(&ctx->pinned_tree))
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
        perf_event_sched_in(cpuctx, ctx, task);
        perf_pmu_enable(ctx->pmu);
@@ -3410,14 +3558,25 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 /*
  * Round-robin a context's events:
  */
+static int rotate_ctx_callback(struct perf_event *event, void *data)
+{
+       list_rotate_left(&event->group_list);
+
+       return 1;
+}
+
 static void rotate_ctx(struct perf_event_context *ctx)
 {
        /*
         * Rotate the first entry last of non-pinned groups. Rotation might be
         * disabled by the inheritance code.
         */
-       if (!ctx->rotate_disable)
-               list_rotate_left(&ctx->flexible_groups);
+       if (!ctx->rotate_disable) {
+               perf_cpu_tree_iterate_cpu(&ctx->flexible_tree,
+                               -1, rotate_ctx_callback, NULL);
+               perf_cpu_tree_iterate_cpu(&ctx->flexible_tree,
+                               smp_processor_id(), rotate_ctx_callback, NULL);
+       }
 }

 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
@@ -3743,8 +3902,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
        INIT_LIST_HEAD(&ctx->active_ctx_list);
-       INIT_LIST_HEAD(&ctx->pinned_groups);
-       INIT_LIST_HEAD(&ctx->flexible_groups);
+       ctx->pinned_tree = RB_ROOT;
+       ctx->flexible_tree = RB_ROOT;
        INIT_LIST_HEAD(&ctx->event_list);
        atomic_set(&ctx->refcount, 1);
 }
@@ -9377,6 +9536,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        INIT_LIST_HEAD(&event->child_list);

        INIT_LIST_HEAD(&event->group_entry);
+       RB_CLEAR_NODE(&event->group_node);
+       INIT_LIST_HEAD(&event->group_list);
+       INIT_LIST_HEAD(&event->group_list_entry);
        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
        INIT_LIST_HEAD(&event->rb_entry);
@@ -10816,6 +10978,23 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
        return ret;
 }

+struct inherit_task_group_params {
+       struct task_struct *parent;
+       struct perf_event_context *parent_ctx;
+       struct task_struct *child;
+       int ctxn;
+       int inherited_all;
+};
+
+static int
+inherit_task_group_callback(struct perf_event *event, void *data)
+{
+       struct inherit_task_group_params *params = data;
+
+       return inherit_task_group(event, params->parent, params->parent_ctx,
+                        params->child, params->ctxn, &params->inherited_all);
+}
+
 /*
  * Initialize the perf_event context in task_struct
  */
@@ -10823,11 +11002,15 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 {
        struct perf_event_context *child_ctx, *parent_ctx;
        struct perf_event_context *cloned_ctx;
-       struct perf_event *event;
        struct task_struct *parent = current;
-       int inherited_all = 1;
        unsigned long flags;
        int ret = 0;
+       struct inherit_task_group_params params = {
+                       .parent = parent,
+                       .child = child,
+                       .ctxn = ctxn,
+                       .inherited_all = 1
+       };

        if (likely(!parent->perf_event_ctxp[ctxn]))
                return 0;
@@ -10839,7 +11022,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
        parent_ctx = perf_pin_task_context(parent, ctxn);
        if (!parent_ctx)
                return 0;

+       params.parent_ctx = parent_ctx;
+
        /*
         * No need to check if parent_ctx != NULL here; since we saw
         * it non-NULL earlier, the only reason for it to become NULL
@@ -10857,12 +11041,10 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
         * We dont have to disable NMIs - we are only looking at
         * the list, not manipulating it:
         */
-       list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
-               ret = inherit_task_group(event, parent, parent_ctx,
-                                        child, ctxn, &inherited_all);
-               if (ret)
-                       goto out_unlock;
-       }
+       ret = perf_cpu_tree_iterate(&parent_ctx->pinned_tree,
+                       inherit_task_group_callback, &params);
+       if (ret)
+               goto out_unlock;

        /*
         * We can't hold ctx->lock when iterating the ->flexible_group list due
@@ -10873,19 +11055,17 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
        parent_ctx->rotate_disable = 1;
        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

-       list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
-               ret = inherit_task_group(event, parent, parent_ctx,
-                                        child, ctxn, &inherited_all);
-               if (ret)
-                       goto out_unlock;
-       }
+       ret = perf_cpu_tree_iterate(&parent_ctx->flexible_tree,
+                               inherit_task_group_callback, &params);
+       if (ret)
+               goto out_unlock;

        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
        parent_ctx->rotate_disable = 0;

        child_ctx = child->perf_event_ctxp[ctxn];

-       if (child_ctx && inherited_all) {
+       if (child_ctx && params.inherited_all) {
                /*
                 * Mark the child context as a clone of the parent
                 * context, or of whatever the parent is a clone of.
