On 05.09.2017 19:03, Peter Zijlstra wrote:
> On Tue, Sep 05, 2017 at 03:06:26PM +0300, Alexey Budankov wrote:
>> [ 6614.226305] WARNING: CPU: 45 PID: 43385 at kernel/events/core.c:239 event_function+0xb3/0xe0
> 
> I think I avoided that problem by not radically rewriting
> perf_event_read() but fixing it instead:
> 
>   https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=perf/core&id=8ad650955ede95e4a6fd6afbda2a0b37d4af9c29
> 
> Full tree at:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/core
> 
> 
> Very minimally tested so far, I'll continue tomorrow.
> 

Here is patch set v9, on top of the peterz/queue perf/core repository referenced above:
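
For reference, below is a minimal userspace sketch of the ordering the patch
introduces. It is only a model under stated assumptions: struct group,
cmp_group() and rotate() here are hypothetical, and a plain array plus qsort()
stands in for the kernel rb-tree and rb_insert_color()/rb_erase(). The
comparison mirrors perf_event_groups_less() and the rotation mirrors
perf_event_groups_rotate(): entries sort by cpu first and then by a
monotonically growing group_index, so re-inserting the first entry of a cpu
moves it behind its peers on that cpu.

#include <stdio.h>
#include <stdlib.h>

struct group {
	int cpu;                  /* event->cpu; -1 denotes software events */
	unsigned long long index; /* monotonically growing insertion index  */
};

static unsigned long long next_index;

/* Order by cpu, then by index (same idea as perf_event_groups_less()). */
static int cmp_group(const void *a, const void *b)
{
	const struct group *l = a, *r = b;

	if (l->cpu != r->cpu)
		return l->cpu < r->cpu ? -1 : 1;
	if (l->index != r->index)
		return l->index < r->index ? -1 : 1;
	return 0;
}

/*
 * Rotation: the leftmost group of @cpu gets a fresh (largest) index,
 * which is what deleting and re-inserting it in the rb-tree achieves.
 * Assumes g[] is already sorted by cmp_group().
 */
static void rotate(struct group *g, int n, int cpu)
{
	int i;

	for (i = 0; i < n; i++) {
		if (g[i].cpu == cpu) {
			g[i].index = ++next_index;
			break;
		}
	}
	qsort(g, n, sizeof(*g), cmp_group);
}

int main(void)
{
	int cpus[] = { 0, 0, 1, -1 };
	struct group g[4];
	int i, n = 4;

	for (i = 0; i < n; i++) {
		g[i].cpu = cpus[i];
		g[i].index = ++next_index;
	}
	qsort(g, n, sizeof(*g), cmp_group);

	rotate(g, n, 0);	/* like rotating the flexible groups of cpu 0 */

	for (i = 0; i < n; i++)
		printf("cpu=%2d index=%llu\n", g[i].cpu, g[i].index);

	return 0;
}

Running it shows the software (-1) entry staying in front and the rotated
cpu 0 entry moving behind the other cpu 0 entry, which is the multiplexing
order the per-cpu iteration macros in the patch rely on.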

---
 include/linux/perf_event.h |  16 ++-
 kernel/events/core.c       | 307 +++++++++++++++++++++++++++++++++++++--------
 2 files changed, 267 insertions(+), 56 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2a6ae48..92cda40 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -557,7 +557,11 @@ struct perf_event {
         */
        struct list_head                group_entry;
        struct list_head                sibling_list;
-
+       /*
+        * Node in the pinned or flexible groups tree of the event's context;
+        */
+       struct rb_node                  group_node;
+       u64                             group_index;
        /*
         * We need storage to track the entries in perf_pmu_migrate_context; we
         * cannot use the event_entry because of RCU and we want to keep the
@@ -689,6 +693,12 @@ struct perf_event {
 #endif /* CONFIG_PERF_EVENTS */
 };
 
+
+struct perf_event_groups {
+       struct rb_root  tree;
+       u64             index;
+};
+
 /**
  * struct perf_event_context - event context structure
  *
@@ -709,8 +719,8 @@ struct perf_event_context {
        struct mutex                    mutex;
 
        struct list_head                active_ctx_list;
-       struct list_head                pinned_groups;
-       struct list_head                flexible_groups;
+       struct perf_event_groups        pinned_groups;
+       struct perf_event_groups        flexible_groups;
        struct list_head                event_list;
        int                             nr_events;
        int                             nr_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 56e9214..8158f1d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1454,8 +1477,21 @@ static enum event_type_t get_event_type(struct perf_event *event)
        return event_type;
 }
 
-static struct list_head *
-ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+/*
+ * Helper function to initialize an event's group node and index;
+ */
+void init_event_group(struct perf_event *event)
+{
+       RB_CLEAR_NODE(&event->group_node);
+       event->group_index = 0;
+}
+
+/*
+ * Extract the pinned or flexible groups from the context
+ * based on the event's attr.pinned bit;
+ */
+static struct perf_event_groups *
+get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
 {
        if (event->attr.pinned)
                return &ctx->pinned_groups;
@@ -1464,6 +1477,169 @@ static enum event_type_t get_event_type(struct perf_event *event)
 }
 
 /*
+ * Helper function to initialize a perf event groups object;
+ */
+void perf_event_groups_init(struct perf_event_groups *groups)
+{
+       groups->tree = RB_ROOT;
+       groups->index = 0;
+}
+
+/*
+ * Compare function for event groups;
+ * Implements a composite key that sorts first by CPU and then by
+ * group_index, a virtual index that provides ordering when rotating
+ * groups on the same CPU;
+ */
+int perf_event_groups_less(struct perf_event *left, struct perf_event *right)
+{
+       if (left->cpu < right->cpu) {
+               return 1;
+       } else if (left->cpu > right->cpu) {
+               return 0;
+       } else {
+               if (left->group_index < right->group_index) {
+                       return 1;
+               } else if (left->group_index > right->group_index) {
+                       return 0;
+               } else {
+                       return 0;
+               }
+       }
+}
+
+/*
+ * Insert a group into the tree using event->cpu as part of the key. Groups
+ * with the same cpu remain in the tree and are ordered among themselves by
+ * group_index, which grows monotonically on every insertion.
+ */
+static void
+perf_event_groups_insert(struct perf_event_groups *groups,
+               struct perf_event *event)
+{
+       struct perf_event *node_event;
+       struct rb_node *parent;
+       struct rb_node **node;
+
+       event->group_index = ++groups->index;
+
+       node = &groups->tree.rb_node;
+       parent = *node;
+
+       while (*node) {
+               parent = *node;
+               node_event = container_of(*node,
+                               struct perf_event, group_node);
+
+               if (perf_event_groups_less(event, node_event))
+                       node = &parent->rb_left;
+               else
+                       node = &parent->rb_right;
+       }
+
+       rb_link_node(&event->group_node, parent, node);
+       rb_insert_color(&event->group_node, &groups->tree);
+}
+
+/*
+ * Helper function to insert an event into the pinned or
+ * flexible groups;
+ */
+static void
+add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+       struct perf_event_groups *groups;
+
+       groups = get_event_groups(event, ctx);
+       perf_event_groups_insert(groups, event);
+}
+
+/*
+ * Delete a group from the tree and reinitialize its group node and index
+ * so the event can safely be re-inserted later.
+ */
+static void
+perf_event_groups_delete(struct perf_event_groups *groups,
+               struct perf_event *event)
+{
+       if (!RB_EMPTY_NODE(&event->group_node) &&
+           !RB_EMPTY_ROOT(&groups->tree))
+               rb_erase(&event->group_node, &groups->tree);
+
+       init_event_group(event);
+}
+
+/*
+ * Helper function to delete an event from its groups;
+ */
+static void
+del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+       struct perf_event_groups *groups;
+
+       groups = get_event_groups(event, ctx);
+       perf_event_groups_delete(groups, event);
+}
+
+/*
+ * Get the group with the least group_index for a given cpu key from the groups tree;
+ */
+static struct perf_event *
+perf_event_groups_first(struct perf_event_groups *groups, int cpu)
+{
+       struct perf_event *node_event = NULL, *match = NULL;
+       struct rb_node *node = groups->tree.rb_node;
+
+       while (node) {
+               node_event = container_of(node,
+                               struct perf_event, group_node);
+
+               if (cpu < node_event->cpu) {
+                       node = node->rb_left;
+               } else if (cpu > node_event->cpu) {
+                       node = node->rb_right;
+               } else {
+                       match = node_event;
+                       node = node->rb_left;
+               }
+       }
+
+       return match;
+}
+
+/*
+ * Rotate the first group for a given cpu key to the end by re-inserting it.
+ */
+static void
+perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
+{
+       struct perf_event *event =
+                       perf_event_groups_first(groups, cpu);
+
+       if (event) {
+               perf_event_groups_delete(groups, event);
+               perf_event_groups_insert(groups, event);
+       }
+}
+
+/*
+ * Iterate event groups through the whole tree.
+ */
+#define perf_event_groups_for_each(event, groups, node)                \
+       for (event = rb_entry_safe(rb_first(&((groups)->tree)), \
+                               typeof(*event), node); event;   \
+               event = rb_entry_safe(rb_next(&event->node),    \
+                               typeof(*event), node))
+/*
+ * Iterate event groups with cpu == key.
+ */
+#define perf_event_groups_for_each_cpu(event, key, groups, node) \
+       for (event = perf_event_groups_first(groups, key);       \
+               event && event->cpu == key;                      \
+               event = rb_entry_safe(rb_next(&event->node),     \
+                               typeof(*event), node))
+
+/*
  * Add a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
  */
@@ -1483,12 +1659,8 @@ static enum event_type_t get_event_type(struct perf_event *event)
         * perf_group_detach can, at all times, locate all siblings.
         */
        if (event->group_leader == event) {
-               struct list_head *list;
-
                event->group_caps = event->event_caps;
-
-               list = ctx_group_list(event, ctx);
-               list_add_tail(&event->group_entry, list);
+               add_event_to_groups(event, ctx);
        }
 
        list_update_cgroup_event(event, ctx, true);
@@ -1682,7 +1854,7 @@ static void perf_group_attach(struct perf_event *event)
        list_del_rcu(&event->event_entry);
 
        if (event->group_leader == event)
-               list_del_init(&event->group_entry);
+               del_event_from_groups(event, ctx);
 
        /*
         * If event was in error state, then keep it
@@ -1700,7 +1872,6 @@ static void perf_group_attach(struct perf_event *event)
 static void perf_group_detach(struct perf_event *event)
 {
        struct perf_event *sibling, *tmp;
-       struct list_head *list = NULL;
 
        lockdep_assert_held(&event->ctx->lock);
 
@@ -1721,22 +1892,23 @@ static void perf_group_detach(struct perf_event *event)
                goto out;
        }
 
-       if (!list_empty(&event->group_entry))
-               list = &event->group_entry;
-
        /*
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
         * to whatever list we are on.
         */
        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
-               if (list)
-                       list_move_tail(&sibling->group_entry, list);
+
                sibling->group_leader = sibling;
 
                /* Inherit group flags from the previous leader */
                sibling->group_caps = event->group_caps;
 
+               if (!RB_EMPTY_NODE(&event->group_node)) {
+                       list_del_init(&sibling->group_entry);
+                       add_event_to_groups(sibling, event->ctx);
+               }
+
                WARN_ON_ONCE(sibling->ctx != event->ctx);
        }
 
@@ -2180,6 +2352,22 @@ static int group_can_go_on(struct perf_event *event,
        return can_add_hw;
 }
 
+static int
+flexible_group_sched_in(struct perf_event *event,
+                       struct perf_event_context *ctx,
+                       struct perf_cpu_context *cpuctx,
+                       int *can_add_hw)
+{
+       if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event))
+               return 0;
+
+       if (group_can_go_on(event, cpuctx, *can_add_hw))
+               if (group_sched_in(event, cpuctx, ctx))
+                       *can_add_hw = 0;
+
+       return 1;
+}
+
 static void add_event_to_ctx(struct perf_event *event,
                               struct perf_event_context *ctx)
 {
@@ -2646,6 +2834,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
                          struct perf_cpu_context *cpuctx,
                          enum event_type_t event_type)
 {
+       int sw = -1, cpu = smp_processor_id();
        int is_active = ctx->is_active;
        struct perf_event *event;
 
@@ -2694,12 +2883,20 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 
        perf_pmu_disable(ctx->pmu);
        if (is_active & EVENT_PINNED) {
-               list_for_each_entry(event, &ctx->pinned_groups, group_entry)
+               perf_event_groups_for_each_cpu(event, cpu,
+                               &ctx->pinned_groups, group_node)
+                       group_sched_out(event, cpuctx, ctx);
+               perf_event_groups_for_each_cpu(event, sw,
+                               &ctx->pinned_groups, group_node)
                        group_sched_out(event, cpuctx, ctx);
        }
 
        if (is_active & EVENT_FLEXIBLE) {
-               list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+               perf_event_groups_for_each_cpu(event, cpu,
+                               &ctx->flexible_groups, group_node)
+                       group_sched_out(event, cpuctx, ctx);
+               perf_event_groups_for_each_cpu(event, sw,
+                               &ctx->flexible_groups, group_node)
                        group_sched_out(event, cpuctx, ctx);
        }
        perf_pmu_enable(ctx->pmu);
@@ -2990,23 +3187,28 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 ctx_pinned_sched_in(struct perf_event_context *ctx,
                    struct perf_cpu_context *cpuctx)
 {
+       int sw = -1, cpu = smp_processor_id();
        struct perf_event *event;
+       int can_add_hw;
+
+       perf_event_groups_for_each_cpu(event, sw,
+                       &ctx->pinned_groups, group_node) {
+               can_add_hw = 1;
+               if (flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw)) {
+                       if (event->state == PERF_EVENT_STATE_INACTIVE)
+                               perf_event_set_state(event,
+                                               PERF_EVENT_STATE_ERROR);
+               }
+       }
 
-       list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-               if (event->state <= PERF_EVENT_STATE_OFF)
-                       continue;
-               if (!event_filter_match(event))
-                       continue;
-
-               if (group_can_go_on(event, cpuctx, 1))
-                       group_sched_in(event, cpuctx, ctx);
-
-               /*
-                * If this pinned group hasn't been scheduled,
-                * put it in error state.
-                */
-               if (event->state == PERF_EVENT_STATE_INACTIVE)
-                       perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+       perf_event_groups_for_each_cpu(event, cpu,
+                       &ctx->pinned_groups, group_node) {
+               can_add_hw = 1;
+               if (flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw)) {
+                       if (event->state == PERF_EVENT_STATE_INACTIVE)
+                               perf_event_set_state(event,
+                                               PERF_EVENT_STATE_ERROR);
+               }
        }
 }
 
@@ -3014,25 +3216,19 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 ctx_flexible_sched_in(struct perf_event_context *ctx,
                      struct perf_cpu_context *cpuctx)
 {
+       int sw = -1, cpu = smp_processor_id();
        struct perf_event *event;
        int can_add_hw = 1;
 
-       list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
-               /* Ignore events in OFF or ERROR state */
-               if (event->state <= PERF_EVENT_STATE_OFF)
-                       continue;
-               /*
-                * Listen to the 'cpu' scheduling filter constraint
-                * of events:
-                */
-               if (!event_filter_match(event))
-                       continue;
+       perf_event_groups_for_each_cpu(event, sw,
+                       &ctx->flexible_groups, group_node)
+               flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw);
+
+       can_add_hw = 1;
+       perf_event_groups_for_each_cpu(event, cpu,
+                       &ctx->flexible_groups, group_node)
+               flexible_group_sched_in(event, ctx, cpuctx, &can_add_hw);
 
-               if (group_can_go_on(event, cpuctx, can_add_hw)) {
-                       if (group_sched_in(event, cpuctx, ctx))
-                               can_add_hw = 0;
-               }
-       }
 }
 
 static void
@@ -3113,7 +3309,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
         * However, if task's ctx is not carrying any pinned
         * events, no need to flip the cpuctx's events around.
         */
-       if (!list_empty(&ctx->pinned_groups))
+       if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
        perf_event_sched_in(cpuctx, ctx, task);
        perf_pmu_enable(ctx->pmu);
@@ -3350,8 +3546,12 @@ static void rotate_ctx(struct perf_event_context *ctx)
         * Rotate the first entry last of non-pinned groups. Rotation might be
         * disabled by the inheritance code.
         */
-       if (!ctx->rotate_disable)
-               list_rotate_left(&ctx->flexible_groups);
+       if (!ctx->rotate_disable) {
+               int sw = -1, cpu = smp_processor_id();
+
+               perf_event_groups_rotate(&ctx->flexible_groups, sw);
+               perf_event_groups_rotate(&ctx->flexible_groups, cpu);
+       }
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
@@ -3698,8 +3898,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
        INIT_LIST_HEAD(&ctx->active_ctx_list);
-       INIT_LIST_HEAD(&ctx->pinned_groups);
-       INIT_LIST_HEAD(&ctx->flexible_groups);
+       perf_event_groups_init(&ctx->pinned_groups);
+       perf_event_groups_init(&ctx->flexible_groups);
        INIT_LIST_HEAD(&ctx->event_list);
        atomic_set(&ctx->refcount, 1);
 }
@@ -9370,6 +9570,7 @@ static void account_event(struct perf_event *event)
        INIT_LIST_HEAD(&event->group_entry);
        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
+       init_event_group(event);
        INIT_LIST_HEAD(&event->rb_entry);
        INIT_LIST_HEAD(&event->active_entry);
        INIT_LIST_HEAD(&event->addr_filters.list);
@@ -10880,7 +11081,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
         * We dont have to disable NMIs - we are only looking at
         * the list, not manipulating it:
         */
-       list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
+       perf_event_groups_for_each(event, &parent_ctx->pinned_groups, group_node) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, ctxn, &inherited_all);
                if (ret)
@@ -10896,7 +11097,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
        parent_ctx->rotate_disable = 1;
        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
 
-       list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
+       perf_event_groups_for_each(event, &parent_ctx->flexible_groups, group_node) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, ctxn, &inherited_all);
                if (ret)
