perf/core: address a 4x slowdown during per-process profiling of
           STREAM benchmark on Intel Xeon Phi

Motivation:

The issue manifests as a 4x slowdown when profiling a single-threaded
STREAM benchmark on Intel Xeon Phi running RHEL7.2 (Intel MPSS
distro). Perf profiling is done in per-process mode and involves about
30 core events. When the benchmark is OpenMP based and runs under
profiling in 272 threads, the overhead becomes even more dramatic:
512.144s without this patch against 36.192s with it.

Solution:

Cpu-indexed trees for the perf_event_context::pinned_groups and
perf_event_context::flexible_groups lists are introduced. Every tree
node keeps a list of the groups allocated for the same cpu, and a tree
references only groups that are on the corresponding group list. The
tree thus provides the capability to iterate over the groups allocated
for one specific cpu only, which is exactly what the multiplexing
timer interrupt handler requires. The handler runs per-cpu and
enables/disables groups using group_sched_in()/group_sched_out(),
which call event_filter_match() to filter out groups allocated for
cpus other than the one executing the handler. Additionally, for every
filtered-out group, group_sched_out() updates the tstamp values to the
current interrupt time. That updating work is now done only once, by
update_context_time() called from ctx_sched_out() before the cpu
groups are iterated. For this trick to work, the tstamps of
filtered-out events must point to the perf_event_context::tstamp_data
object instead of the per-event perf_event::tstamp_data objects they
are initialized to at event allocation. The tstamp references are
switched by group_sched_in()/group_sched_out() every time a group is
checked for its suitability for the currently running cpu.

When a thread enters some cpu on a context switch, a full pass over
the pinned and flexible groups is performed by
perf_event_sched_in(,mux=0), with the new mux parameter set to 0, and
the tstamps of filtered-out groups are switched to the
perf_event_context::tstamp_data object. Then a series of multiplexing
interrupts happens, and the handler rotates the flexible groups
calling ctx_sched_out(,mux=1)/perf_event_sched_in(,mux=1), iterating
over the per-cpu tree lists only and avoiding full passes over the
complete group lists. This is where the speedup comes from. Eventually,
when the thread leaves the cpu, ctx_sched_out(,mux=0) is called,
restoring the tstamp pointers to the events' perf_event::tstamp_data
objects.
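
To make the mechanics concrete, below is a minimal userspace model of
the cpu-indexed tree. It is illustrative only: an unbalanced binary
search tree and a singly linked list stand in for the kernel's rb-tree
and list_head, and all names here are hypothetical:

        #include <stdio.h>
        #include <stdlib.h>

        /* One event group; groups for the same cpu are chained. */
        struct group {
                int cpu;                /* -1 means "any cpu" */
                struct group *next;     /* next group for the same cpu */
        };

        /* One tree node per distinct cpu key. */
        struct cpu_node {
                int cpu;
                struct group *groups;   /* groups allocated for this cpu */
                struct cpu_node *left, *right;
        };

        static struct cpu_node *tree_insert(struct cpu_node *root,
                                            struct group *g)
        {
                if (!root) {
                        struct cpu_node *n = calloc(1, sizeof(*n));

                        n->cpu = g->cpu;
                        n->groups = g;
                        return n;
                }
                if (g->cpu < root->cpu) {
                        root->left = tree_insert(root->left, g);
                } else if (g->cpu > root->cpu) {
                        root->right = tree_insert(root->right, g);
                } else {
                        g->next = root->groups; /* same key: chain it */
                        root->groups = g;
                }
                return root;
        }

        static struct group *tree_find(struct cpu_node *root, int cpu)
        {
                while (root) {
                        if (cpu < root->cpu)
                                root = root->left;
                        else if (cpu > root->cpu)
                                root = root->right;
                        else
                                return root->groups;
                }
                return NULL;
        }

        int main(void)
        {
                struct group g0 = { .cpu = 0 };
                struct group g1 = { .cpu = 271 };
                struct group g2 = { .cpu = -1 };
                struct cpu_node *root = NULL;
                struct group *g;
                int this_cpu = 271;

                root = tree_insert(root, &g0);
                root = tree_insert(root, &g1);
                root = tree_insert(root, &g2);

                /* The handler visits only the "any cpu" list and the
                 * current cpu's list, never the full group list. */
                for (g = tree_find(root, -1); g; g = g->next)
                        printf("sched: group for any cpu\n");
                for (g = tree_find(root, this_cpu); g; g = g->next)
                        printf("sched: group for cpu %d\n", g->cpu);
                return 0;
        }

As in the patch, an rb-tree lookup costs O(log n) in the number of
distinct cpu keys, and the multiplexing path then touches only the
groups that can actually run on the handler's cpu.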

Locking:

The new perf_event_context::pinned_tree and
perf_event_context::flexible_tree are protected by
perf_event_context::lock and perf_event_context::mutex, the same
locking scheme as for the corresponding
perf_event_context::pinned_groups and
perf_event_context::flexible_groups lists. The indirect
perf_event::tstamp reference and the perf_event::tstamp_data object
are protected by perf_event::lock, as the original
perf_event::tstamp_* fields were.
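
The group lists already follow the kernel's "modify with both locks
held, read with either" convention, and the trees inherit it. A
minimal userspace model of that convention, with pthread mutexes
standing in for ctx->mutex and ctx->lock (hypothetical names,
illustrative only):

        #include <pthread.h>
        #include <stdio.h>

        static pthread_mutex_t ctx_mutex = PTHREAD_MUTEX_INITIALIZER;
        static pthread_mutex_t ctx_lock  = PTHREAD_MUTEX_INITIALIZER;
        static int tree_size;   /* stands in for a pinned/flexible tree */

        /* Writers take both locks, so a reader holding either is safe. */
        static void tree_modify(void)
        {
                pthread_mutex_lock(&ctx_mutex);
                pthread_mutex_lock(&ctx_lock);
                tree_size++;
                pthread_mutex_unlock(&ctx_lock);
                pthread_mutex_unlock(&ctx_mutex);
        }

        /* Readers may take either lock; here the ctx->lock analogue. */
        static int tree_read(void)
        {
                int v;

                pthread_mutex_lock(&ctx_lock);
                v = tree_size;
                pthread_mutex_unlock(&ctx_lock);
                return v;
        }

        int main(void)
        {
                tree_modify();
                printf("groups in tree: %d\n", tree_read());
                return 0;
        }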

Added API:

Objects:
1. struct perf_event_tstamp:
   - enabled
   - running
   - stopped
2. struct perf_event:
   - group_node
   - group_list
   - group_list_entry
   - tstamp
   - tstamp_data (both illustrated in the sketch after this list)
3. struct perf_event_context:
   - pinned_tree
   - flexible_tree
   - tstamp_data
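
A minimal sketch of the tstamp pointer switch (userspace model with
hypothetical names; the point is that every filtered-out event aliases
the context's tstamp_data object, so the single update in
update_context_time() covers them all):

        #include <assert.h>
        #include <stdint.h>

        struct tstamp {
                uint64_t enabled, running, stopped;
        };

        struct ctx {
                struct tstamp tstamp_data; /* shared by filtered-out events */
        };

        struct event {
                int cpu;
                struct tstamp *tstamp;     /* points into event or ctx */
                struct tstamp tstamp_data; /* private per-event timings */
        };

        /* Event does not match the current cpu: alias the shared object,
         * so the context-wide time update covers this event too. */
        static void filter_out(struct ctx *c, struct event *e)
        {
                if (e->tstamp != &c->tstamp_data)
                        e->tstamp = &c->tstamp_data;
        }

        /* Event matches again: back to private accounting. */
        static void filter_in(struct event *e)
        {
                e->tstamp = &e->tstamp_data;
        }

        int main(void)
        {
                static struct ctx c;
                static struct event e = { .cpu = 3 };

                e.tstamp = &e.tstamp_data;        /* as set at allocation */
                filter_out(&c, &e);
                c.tstamp_data.stopped = 100;      /* one shared update ... */
                assert(e.tstamp->stopped == 100); /* ... seen via the event */
                filter_in(&e);
                return 0;
        }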

Functions:
1. insert a group into a tree using event->cpu as a key:
        int perf_cpu_tree_insert(struct rb_root *tree,
                struct perf_event *event);
2. delete a group from a tree; if the group is directly attached
   to the tree, this also detaches all groups on the group's
   group_list list:
        int perf_cpu_tree_delete(struct rb_root *tree,
                struct perf_event *event);
3. find the group_list for a cpu key:
        struct list_head *perf_cpu_tree_find(struct rb_root *tree,
                int cpu);
4. enable a flexible group on a cpu:
        void ctx_sched_in_flexible_group(
                struct perf_event_context *ctx,
                struct perf_cpu_context *cpuctx,
                struct perf_event *group, int *can_add_hw);
5. enable a pinned group on a cpu; calls ctx_sched_in_flexible_group()
   and updates group->state to ERROR in case of failure:
        void ctx_sched_in_pinned_group(struct perf_event_context *ctx,
                struct perf_cpu_context *cpuctx,
                struct perf_event *group);
6. enable the pinned groups on a per-cpu tree list:
        void ctx_pinned_sched_in_groups(struct perf_event_context *ctx,
                struct perf_cpu_context *cpuctx,
                struct list_head *groups);
7. enable the flexible groups on a per-cpu tree list:
        void ctx_flexible_sched_in_groups(
                struct perf_event_context *ctx,
                struct perf_cpu_context *cpuctx,
                struct list_head *groups);
8. disable the groups on a per-cpu tree list:
        void ctx_sched_out_groups(struct perf_event_context *ctx,
                struct perf_cpu_context *cpuctx,
                struct list_head *groups);
9. get the group tree matching the event->attr.pinned attribute value:
        struct rb_root *ctx_cpu_tree(struct perf_event *event,
                struct perf_event_context *ctx);
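
For rotation, the handler applies the equivalent of
list_rotate_left() to just the two per-cpu lists it found in the tree
(the cpu == -1 list and the current cpu's list); see rotate_ctx() in
the diff below. A standalone sketch of that left rotation, again on a
singly linked userspace list with hypothetical names:

        #include <stdio.h>

        struct group {
                int id;
                struct group *next;
        };

        /* Move the first group last, like list_rotate_left(). */
        static struct group *rotate_left(struct group *head)
        {
                struct group *first = head, *tail;

                if (!head || !head->next)
                        return head;

                head = first->next;
                for (tail = head; tail->next; tail = tail->next)
                        ;
                tail->next = first;
                first->next = NULL;
                return head;
        }

        int main(void)
        {
                struct group c = { 3, NULL };
                struct group b = { 2, &c };
                struct group a = { 1, &b };
                struct group *g, *head = rotate_left(&a);

                for (g = head; g; g = g->next)
                        printf("%d ", g->id);   /* prints: 2 3 1 */
                printf("\n");
                return 0;
        }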

Modified API:

Objects:
1. struct perf_event
2. struct perf_event_context

Functions:
1. perf_event_alloc()
2. add_event_to_ctx()
3. perf_event_enable_on_exec()
4. __perf_install_in_context()
5. __perf_event_init_context()
6. __perf_event_mark_enabled()
7. __perf_event_enable()
8. __perf_event_task_sched_out()
9. ctx_group_list()
10. list_add_event()
11. list_del_event()
12. perf_event_context_sched_in()
13. cpu_ctx_sched_in()
14. cpu_ctx_sched_out()
15. ctx_sched_in()
16. ctx_sched_out()
17. ctx_resched()
18. ctx_pinned_sched_in()
19. ctx_flexible_sched_in()
20. group_sched_in()
21. event_sched_in()
22. event_sched_out()
23. rotate_ctx()
24. perf_rotate_context()
25. update_context_time()
26. update_event_times()
27. calc_timer_values()
28. perf_cgroup_switch()
29. perf_cgroup_mark_enabled()

Signed-off-by: Alexey Budankov <alexey.budan...@linux.intel.com>
---
 include/linux/perf_event.h |  63 +++++--
 kernel/events/core.c       | 442 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 384 insertions(+), 121 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24a6358..6bd380f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -550,6 +550,22 @@ struct pmu_event_list {
        struct list_head        list;
 };

+struct perf_event_tstamp {
+       /*
+        * These are timestamps used for computing total_time_enabled
+        * and total_time_running when the event is in INACTIVE or
+        * ACTIVE state, measured in nanoseconds from an arbitrary point
+        * in time.
+        * enabled: the notional time when the event was enabled
+        * running: the notional time when the event was scheduled on
+        * stopped: in INACTIVE state, the notional time when the
+        *      event was scheduled off.
+        */
+       u64 enabled;
+       u64 running;
+       u64 stopped;
+};
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -571,6 +587,27 @@ struct perf_event {
         * either sufficies for read.
         */
        struct list_head                group_entry;
+       /*
+        * Node in the pinned or flexible tree of the event context;
+        * the node may be empty if the event is not attached to the
+        * tree directly but through the group_list of an event that
+        * is directly attached;
+        */
+       struct rb_node                  group_node;
+       /*
+        * List of the groups allocated for the same cpu;
+        * the list may be empty if the event is not attached to the
+        * tree directly but through the group_list of an event that
+        * is directly attached;
+        */
+       struct list_head                group_list;
+       /*
+        * Entry in a group_list list;
+        * the entry is linked into this event's own group_list above
+        * when the event is directly attached to the pinned or
+        * flexible tree;
+        */
+       struct list_head                group_list_entry;
        struct list_head                sibling_list;

        /*
@@ -611,18 +648,11 @@ struct perf_event {
        u64                             total_time_running;

        /*
-        * These are timestamps used for computing total_time_enabled
-        * and total_time_running when the event is in INACTIVE or
-        * ACTIVE state, measured in nanoseconds from an arbitrary point
-        * in time.
-        * tstamp_enabled: the notional time when the event was enabled
-        * tstamp_running: the notional time when the event was scheduled on
-        * tstamp_stopped: in INACTIVE state, the notional time when the
-        *      event was scheduled off.
+        * tstamp points to the tstamp_data object below or to the
+        * tstamp_data object of the event context;
         */
-       u64                             tstamp_enabled;
-       u64                             tstamp_running;
-       u64                             tstamp_stopped;
+       struct perf_event_tstamp        *tstamp;
+       struct perf_event_tstamp        tstamp_data;

        /*
         * timestamp shadows the actual context timing but it can
@@ -742,7 +772,17 @@ struct perf_event_context {

        struct list_head                active_ctx_list;
        struct list_head                pinned_groups;
+       /*
+        * Cpu tree for pinned groups; keeps event's group_node nodes
+        * of attached pinned groups;
+        */
+       struct rb_root                  pinned_tree;
        struct list_head                flexible_groups;
+       /*
+        * Cpu tree for flexible groups; keeps event's group_node nodes
+        * of attached flexible groups;
+        */
+       struct rb_root                  flexible_tree;
        struct list_head                event_list;
        int                             nr_events;
        int                             nr_active;
@@ -758,6 +798,7 @@ struct perf_event_context {
         */
        u64                             time;
        u64                             timestamp;
+       struct perf_event_tstamp        tstamp_data;

        /*
         * These fields let us detect when two contexts have both
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f8c27d3..9168406 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -555,11 +555,11 @@ void perf_sample_event_took(u64 sample_len_ns)
 static atomic64_t perf_event_id;

 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                             enum event_type_t event_type);
+                             enum event_type_t event_type, int mux);

 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
                             enum event_type_t event_type,
-                            struct task_struct *task);
+                            struct task_struct *task, int mux);

 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
@@ -701,6 +701,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
        struct perf_cpu_context *cpuctx;
        struct list_head *list;
        unsigned long flags;
+       int mux = 0;

        /*
         * Disable interrupts and preemption to avoid this CPU's
@@ -716,7 +717,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
                perf_pmu_disable(cpuctx->ctx.pmu);

                if (mode & PERF_CGROUP_SWOUT) {
-                       cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+                       cpu_ctx_sched_out(cpuctx, EVENT_ALL, mux);
                        /*
                         * must not be done before ctxswout due
                         * to event_filter_match() in event_sched_out()
@@ -735,7 +736,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
                         */
                        cpuctx->cgrp = perf_cgroup_from_task(task,
                                                             &cpuctx->ctx);
-                       cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+                       cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, mux);
                }
                perf_pmu_enable(cpuctx->ctx.pmu);
                perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -865,10 +866,10 @@ perf_cgroup_mark_enabled(struct perf_event *event,

        event->cgrp_defer_enabled = 0;

-       event->tstamp_enabled = tstamp - event->total_time_enabled;
+       event->tstamp->enabled = tstamp - event->total_time_enabled;
        list_for_each_entry(sub, &event->sibling_list, group_entry) {
                if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
-                       sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+                       sub->tstamp->enabled = tstamp - sub->total_time_enabled;
                        sub->cgrp_defer_enabled = 0;
                }
        }
@@ -1383,6 +1384,9 @@ static void update_context_time(struct perf_event_context *ctx)

        ctx->time += now - ctx->timestamp;
        ctx->timestamp = now;
+
+       ctx->tstamp_data.running += ctx->time - ctx->tstamp_data.stopped;
+       ctx->tstamp_data.stopped = ctx->time;
 }

 static u64 perf_event_time(struct perf_event *event)
@@ -1424,16 +1428,16 @@ static void update_event_times(struct perf_event *event)
        else if (ctx->is_active)
                run_end = ctx->time;
        else
-               run_end = event->tstamp_stopped;
+               run_end = event->tstamp->stopped;

-       event->total_time_enabled = run_end - event->tstamp_enabled;
+       event->total_time_enabled = run_end - event->tstamp->enabled;

        if (event->state == PERF_EVENT_STATE_INACTIVE)
-               run_end = event->tstamp_stopped;
+               run_end = event->tstamp->stopped;
        else
                run_end = perf_event_time(event);

-       event->total_time_running = run_end - event->tstamp_running;
+       event->total_time_running = run_end - event->tstamp->running;

 }

@@ -1472,6 +1476,186 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
                return &ctx->flexible_groups;
 }

+static int
+perf_cpu_tree_insert(struct rb_root *tree, struct perf_event *event)
+{
+       struct rb_node **node;
+       struct rb_node *parent;
+
+       if (!tree || !event)
+               return 0;
+
+       node = &tree->rb_node;
+       parent = *node;
+
+       while (*node) {
+               struct perf_event *node_event = container_of(*node,
+                               struct perf_event, group_node);
+
+               parent = *node;
+
+               if (event->cpu < node_event->cpu) {
+                       node = &((*node)->rb_left);
+               } else if (event->cpu > node_event->cpu) {
+                       node = &((*node)->rb_right);
+               } else {
+                       list_add_tail(&event->group_list_entry,
+                                       &node_event->group_list);
+                       return 2;
+               }
+       }
+
+       list_add_tail(&event->group_list_entry, &event->group_list);
+
+       rb_link_node(&event->group_node, parent, node);
+       rb_insert_color(&event->group_node, tree);
+
+       return 1;
+}
+
+static int
+perf_cpu_tree_delete(struct rb_root *tree, struct perf_event *event)
+{
+       if (!tree || !event)
+               return 0;
+
+       if (RB_EMPTY_NODE(&event->group_node)) {
+               list_del_init(&event->group_list_entry);
+       } else {
+               struct perf_event *list_event, *list_next;
+
+               rb_erase(&event->group_node, tree);
+               list_for_each_entry_safe(list_event, list_next,
+                               &event->group_list, group_list_entry)
+                       list_del_init(&list_event->group_list_entry);
+       }
+
+       return 1;
+}
+
+static struct list_head *
+perf_cpu_tree_find(struct rb_root *tree, int cpu)
+{
+       struct rb_node *node;
+
+       if (!tree)
+               return NULL;
+
+       node = tree->rb_node;
+
+       while (node) {
+               struct perf_event *node_event = container_of(node,
+                               struct perf_event, group_node);
+
+               if (cpu < node_event->cpu)
+                       node = node->rb_left;
+               else if (cpu > node_event->cpu)
+                       node = node->rb_right;
+               else
+                       return &node_event->group_list;
+       }
+
+       return NULL;
+}
+
+static struct rb_root *
+ctx_cpu_tree(struct perf_event *event, struct perf_event_context *ctx)
+{
+       if (event->attr.pinned)
+               return &ctx->pinned_tree;
+       else
+               return &ctx->flexible_tree;
+}
+
+static inline int
+event_filter_match(struct perf_event *event);
+
+static int
+group_can_go_on(struct perf_event *event,
+               struct perf_cpu_context *cpuctx,
+               int can_add_hw);
+
+static int
+group_sched_in(struct perf_event *group_event,
+              struct perf_cpu_context *cpuctx,
+              struct perf_event_context *ctx);
+
+static void
+ctx_sched_in_flexible_group(struct perf_event_context *ctx,
+                   struct perf_cpu_context *cpuctx,
+                   struct perf_event *event, int *can_add_hw)
+{
+       if (event->state <= PERF_EVENT_STATE_OFF)
+               return;
+       if (!event_filter_match(event)) {
+               if (event->tstamp != &ctx->tstamp_data)
+                       event->tstamp = &ctx->tstamp_data;
+               return;
+       }
+
+       /* may need to reset tstamp_enabled */
+       if (is_cgroup_event(event))
+               perf_cgroup_mark_enabled(event, ctx);
+
+       if (group_can_go_on(event, cpuctx, *can_add_hw)) {
+               if (group_sched_in(event, cpuctx, ctx))
+                       *can_add_hw = 0;
+       }
+}
+
+static void
+ctx_sched_in_pinned_group(struct perf_event_context *ctx,
+                   struct perf_cpu_context *cpuctx,
+                   struct perf_event *event)
+{
+       int can_add_hw = 1;
+
+       ctx_sched_in_flexible_group(ctx, cpuctx, event, &can_add_hw);
+       if (!can_add_hw) {
+               update_group_times(event);
+               event->state = PERF_EVENT_STATE_ERROR;
+       }
+}
+
+static void
+ctx_pinned_sched_in_groups(struct perf_event_context *ctx,
+                   struct perf_cpu_context *cpuctx,
+                   struct list_head *groups)
+{
+       struct perf_event *group;
+
+       list_for_each_entry(group, groups, group_list_entry)
+               ctx_sched_in_pinned_group(ctx, cpuctx, group);
+}
+
+static void
+ctx_flexible_sched_in_groups(struct perf_event_context *ctx,
+                     struct perf_cpu_context *cpuctx,
+                     struct list_head *groups)
+{
+       struct perf_event *group;
+       int can_add_hw = 1;
+
+       list_for_each_entry(group, groups, group_list_entry)
+               ctx_sched_in_flexible_group(ctx, cpuctx, group, &can_add_hw);
+}
+
+static void
+group_sched_out(struct perf_event *group_event,
+               struct perf_cpu_context *cpuctx,
+               struct perf_event_context *ctx);
+
+static void
+ctx_sched_out_groups(struct perf_event_context *ctx,
+               struct perf_cpu_context *cpuctx,
+               struct list_head *groups)
+{
+       struct perf_event *group;
+
+       list_for_each_entry(group, groups, group_list_entry)
+               group_sched_out(group, cpuctx, ctx);
+}
+
 /*
  * Add a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -1491,11 +1675,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
         */
        if (event->group_leader == event) {
                struct list_head *list;
+               struct rb_root *tree;

                event->group_caps = event->event_caps;

                list = ctx_group_list(event, ctx);
                list_add_tail(&event->group_entry, list);
+
+               tree = ctx_cpu_tree(event, ctx);
+               perf_cpu_tree_insert(tree, event);
        }

        list_update_cgroup_event(event, ctx, true);
@@ -1685,8 +1873,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)

        list_del_rcu(&event->event_entry);

-       if (event->group_leader == event)
+       if (event->group_leader == event) {
+               struct rb_root *tree;
+
+               tree = ctx_cpu_tree(event, ctx);
+               perf_cpu_tree_delete(tree, event);
+
                list_del_init(&event->group_entry);
+       }

        update_group_times(event);

@@ -1811,9 +2005,9 @@ event_sched_out(struct perf_event *event,
         */
        if (event->state == PERF_EVENT_STATE_INACTIVE &&
            !event_filter_match(event)) {
-               delta = tstamp - event->tstamp_stopped;
-               event->tstamp_running += delta;
-               event->tstamp_stopped = tstamp;
+               delta = tstamp - event->tstamp->stopped;
+               event->tstamp->running += delta;
+               event->tstamp->stopped = tstamp;
        }

        if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -1821,7 +2015,7 @@ event_sched_out(struct perf_event *event,

        perf_pmu_disable(event->pmu);

-       event->tstamp_stopped = tstamp;
+       event->tstamp->stopped = tstamp;
        event->pmu->del(event, 0);
        event->oncpu = -1;
        event->state = PERF_EVENT_STATE_INACTIVE;
@@ -2096,7 +2290,7 @@ event_sched_in(struct perf_event *event,
                goto out;
        }

-       event->tstamp_running += tstamp - event->tstamp_stopped;
+       event->tstamp->running += tstamp - event->tstamp->stopped;

        if (!is_software_event(event))
                cpuctx->active_oncpu++;
@@ -2168,8 +2362,8 @@ group_sched_in(struct perf_event *group_event,
                        simulate = true;

                if (simulate) {
-                       event->tstamp_running += now - event->tstamp_stopped;
-                       event->tstamp_stopped = now;
+                       event->tstamp->running += now - event->tstamp->stopped;
+                       event->tstamp->stopped = now;
                } else {
                        event_sched_out(event, cpuctx, ctx);
                }
@@ -2221,43 +2415,45 @@ static void add_event_to_ctx(struct perf_event *event,

        list_add_event(event, ctx);
        perf_group_attach(event);
-       event->tstamp_enabled = tstamp;
-       event->tstamp_running = tstamp;
-       event->tstamp_stopped = tstamp;
+       event->tstamp->enabled = tstamp;
+       event->tstamp->running = tstamp;
+       event->tstamp->stopped = tstamp;
 }

 static void ctx_sched_out(struct perf_event_context *ctx,
                          struct perf_cpu_context *cpuctx,
-                         enum event_type_t event_type);
+                         enum event_type_t event_type, int mux);
 static void
 ctx_sched_in(struct perf_event_context *ctx,
             struct perf_cpu_context *cpuctx,
             enum event_type_t event_type,
-            struct task_struct *task);
+            struct task_struct *task, int mux);

 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
                               struct perf_event_context *ctx,
                               enum event_type_t event_type)
 {
+       int mux = 0;
+
        if (!cpuctx->task_ctx)
                return;

        if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
                return;

-       ctx_sched_out(ctx, cpuctx, event_type);
+       ctx_sched_out(ctx, cpuctx, event_type, mux);
 }

 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
-                               struct task_struct *task)
+                               struct task_struct *task, int mux)
 {
-       cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+       cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task, mux);
        if (ctx)
-               ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
-       cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+               ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, mux);
+       cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, mux);
        if (ctx)
-               ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+               ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, mux);
 }

 /*
@@ -2281,6 +2477,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 {
        enum event_type_t ctx_event_type = event_type & EVENT_ALL;
        bool cpu_event = !!(event_type & EVENT_CPU);
+       int mux = 0;

        /*
         * If pinned groups are involved, flexible groups also need to be
@@ -2301,11 +2498,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
         *  - otherwise, do nothing more.
         */
        if (cpu_event)
-               cpu_ctx_sched_out(cpuctx, ctx_event_type);
+               cpu_ctx_sched_out(cpuctx, ctx_event_type, mux);
        else if (ctx_event_type & EVENT_PINNED)
-               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);

-       perf_event_sched_in(cpuctx, task_ctx, current);
+       perf_event_sched_in(cpuctx, task_ctx, current, mux);
        perf_pmu_enable(cpuctx->ctx.pmu);
 }

@@ -2323,6 +2520,7 @@ static int  __perf_install_in_context(void *info)
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        bool reprogram = true;
        int ret = 0;
+       int mux = 0;

        raw_spin_lock(&cpuctx->ctx.lock);
        if (ctx->task) {
@@ -2349,7 +2547,7 @@ static int  __perf_install_in_context(void *info)
        }

        if (reprogram) {
-               ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
                add_event_to_ctx(event, ctx);
                ctx_resched(cpuctx, task_ctx, get_event_type(event));
        } else {
@@ -2468,10 +2666,10 @@ static void __perf_event_mark_enabled(struct perf_event *event)
        u64 tstamp = perf_event_time(event);

        event->state = PERF_EVENT_STATE_INACTIVE;
-       event->tstamp_enabled = tstamp - event->total_time_enabled;
+       event->tstamp->enabled = tstamp - event->total_time_enabled;
        list_for_each_entry(sub, &event->sibling_list, group_entry) {
                if (sub->state >= PERF_EVENT_STATE_INACTIVE)
-                       sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+                       sub->tstamp->enabled = tstamp - sub->total_time_enabled;
        }
 }

@@ -2485,13 +2683,14 @@ static void __perf_event_enable(struct perf_event *event,
 {
        struct perf_event *leader = event->group_leader;
        struct perf_event_context *task_ctx;
+       int mux = 0;

        if (event->state >= PERF_EVENT_STATE_INACTIVE ||
            event->state <= PERF_EVENT_STATE_ERROR)
                return;

        if (ctx->is_active)
-               ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+               ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);

        __perf_event_mark_enabled(event);

@@ -2501,7 +2700,7 @@ static void __perf_event_enable(struct perf_event *event,
        if (!event_filter_match(event)) {
                if (is_cgroup_event(event))
                        perf_cgroup_defer_enabled(event);
-               ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+               ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
                return;
        }

@@ -2510,7 +2709,7 @@ static void __perf_event_enable(struct perf_event *event,
         * then don't put it on unless the group is on.
         */
        if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-               ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+               ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
                return;
        }

@@ -2706,10 +2905,12 @@ EXPORT_SYMBOL_GPL(perf_event_refresh);

 static void ctx_sched_out(struct perf_event_context *ctx,
                          struct perf_cpu_context *cpuctx,
-                         enum event_type_t event_type)
+                         enum event_type_t event_type, int mux)
 {
        int is_active = ctx->is_active;
        struct perf_event *event;
+       struct list_head *groups;
+       int cpu = smp_processor_id();

        lockdep_assert_held(&ctx->lock);

@@ -2756,13 +2957,34 @@ static void ctx_sched_out(struct perf_event_context *ctx,

        perf_pmu_disable(ctx->pmu);
        if (is_active & EVENT_PINNED) {
-               list_for_each_entry(event, &ctx->pinned_groups, group_entry)
-                       group_sched_out(event, cpuctx, ctx);
+               if (mux) {
+                       groups = perf_cpu_tree_find(&ctx->pinned_tree, -1);
+                       if (groups)
+                               ctx_sched_out_groups(ctx, cpuctx, groups);
+                       groups = perf_cpu_tree_find(&ctx->pinned_tree, cpu);
+                       if (groups)
+                               ctx_sched_out_groups(ctx, cpuctx, groups);
+               } else {
+                       list_for_each_entry(event, &ctx->pinned_groups,
+                                       group_entry)
+                               group_sched_out(event, cpuctx, ctx);
+               }
        }

        if (is_active & EVENT_FLEXIBLE) {
-               list_for_each_entry(event, &ctx->flexible_groups, group_entry)
-                       group_sched_out(event, cpuctx, ctx);
+               if (mux) {
+                       groups = perf_cpu_tree_find(&ctx->flexible_tree, -1);
+                       if (groups)
+                               ctx_sched_out_groups(ctx, cpuctx, groups);
+                       groups = perf_cpu_tree_find(&ctx->flexible_tree, cpu);
+                       if (groups)
+                               ctx_sched_out_groups(ctx, cpuctx, groups);
+
+               } else {
+                       list_for_each_entry(event, &ctx->flexible_groups,
+                                       group_entry)
+                               group_sched_out(event, cpuctx, ctx);
+               }
        }
        perf_pmu_enable(ctx->pmu);
 }
@@ -3051,9 +3273,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
  * Called with IRQs disabled
  */
 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                             enum event_type_t event_type)
+                             enum event_type_t event_type, int mux)
 {
-       ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
+       ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, mux);
 }

 static void
@@ -3061,29 +3283,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
                    struct perf_cpu_context *cpuctx)
 {
        struct perf_event *event;
-
-       list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-               if (event->state <= PERF_EVENT_STATE_OFF)
-                       continue;
-               if (!event_filter_match(event))
-                       continue;
-
-               /* may need to reset tstamp_enabled */
-               if (is_cgroup_event(event))
-                       perf_cgroup_mark_enabled(event, ctx);
-
-               if (group_can_go_on(event, cpuctx, 1))
-                       group_sched_in(event, cpuctx, ctx);
-
-               /*
-                * If this pinned group hasn't been scheduled,
-                * put it in error state.
-                */
-               if (event->state == PERF_EVENT_STATE_INACTIVE) {
-                       update_group_times(event);
-                       event->state = PERF_EVENT_STATE_ERROR;
-               }
-       }
+       list_for_each_entry(event, &ctx->pinned_groups, group_entry)
+               ctx_sched_in_pinned_group(ctx, cpuctx, event);
 }

 static void
@@ -3092,37 +3293,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 {
        struct perf_event *event;
        int can_add_hw = 1;
-
-       list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
-               /* Ignore events in OFF or ERROR state */
-               if (event->state <= PERF_EVENT_STATE_OFF)
-                       continue;
-               /*
-                * Listen to the 'cpu' scheduling filter constraint
-                * of events:
-                */
-               if (!event_filter_match(event))
-                       continue;
-
-               /* may need to reset tstamp_enabled */
-               if (is_cgroup_event(event))
-                       perf_cgroup_mark_enabled(event, ctx);
-
-               if (group_can_go_on(event, cpuctx, can_add_hw)) {
-                       if (group_sched_in(event, cpuctx, ctx))
-                               can_add_hw = 0;
-               }
-       }
+       list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+               ctx_sched_in_flexible_group(ctx, cpuctx, event, &can_add_hw);
 }

 static void
 ctx_sched_in(struct perf_event_context *ctx,
             struct perf_cpu_context *cpuctx,
             enum event_type_t event_type,
-            struct task_struct *task)
+            struct task_struct *task, int mux)
 {
        int is_active = ctx->is_active;
-       u64 now;
+       struct list_head *groups;
+       int cpu = smp_processor_id();

        lockdep_assert_held(&ctx->lock);

@@ -3141,8 +3324,7 @@ ctx_sched_in(struct perf_event_context *ctx,

        if (is_active & EVENT_TIME) {
                /* start ctx time */
-               now = perf_clock();
-               ctx->timestamp = now;
+               ctx->timestamp = perf_clock();
                perf_cgroup_set_timestamp(task, ctx);
        }

@@ -3150,27 +3332,50 @@ ctx_sched_in(struct perf_event_context *ctx,
         * First go through the list and put on any pinned groups
         * in order to give them the best chance of going on.
         */
-       if (is_active & EVENT_PINNED)
-               ctx_pinned_sched_in(ctx, cpuctx);
+       if (is_active & EVENT_PINNED) {
+               if (mux) {
+                       groups = perf_cpu_tree_find(&ctx->pinned_tree, -1);
+                       if (groups)
+                               ctx_pinned_sched_in_groups(ctx, cpuctx, groups);
+                       groups = perf_cpu_tree_find(&ctx->pinned_tree, cpu);
+                       if (groups)
+                               ctx_pinned_sched_in_groups(ctx, cpuctx, groups);
+               } else {
+                       ctx_pinned_sched_in(ctx, cpuctx);
+               }
+       }

        /* Then walk through the lower prio flexible groups */
-       if (is_active & EVENT_FLEXIBLE)
-               ctx_flexible_sched_in(ctx, cpuctx);
+       if (is_active & EVENT_FLEXIBLE) {
+               if (mux) {
+                       groups = perf_cpu_tree_find(&ctx->flexible_tree, -1);
+                       if (groups)
+                               ctx_flexible_sched_in_groups(ctx, cpuctx,
+                                               groups);
+                       groups = perf_cpu_tree_find(&ctx->flexible_tree, cpu);
+                       if (groups)
+                               ctx_flexible_sched_in_groups(ctx, cpuctx,
+                                               groups);
+               } else {
+                       ctx_flexible_sched_in(ctx, cpuctx);
+               }
+       }
 }

 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
                             enum event_type_t event_type,
-                            struct task_struct *task)
+                            struct task_struct *task, int mux)
 {
        struct perf_event_context *ctx = &cpuctx->ctx;

-       ctx_sched_in(ctx, cpuctx, event_type, task);
+       ctx_sched_in(ctx, cpuctx, event_type, task, mux);
 }

 static void perf_event_context_sched_in(struct perf_event_context *ctx,
                                        struct task_struct *task)
 {
        struct perf_cpu_context *cpuctx;
+       int mux = 0;

        cpuctx = __get_cpu_context(ctx);
        if (cpuctx->task_ctx == ctx)
@@ -3187,8 +3392,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
         * events, no need to flip the cpuctx's events around.
         */
        if (!list_empty(&ctx->pinned_groups))
-               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-       perf_event_sched_in(cpuctx, ctx, task);
+               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
+       perf_event_sched_in(cpuctx, ctx, task, mux);
        perf_pmu_enable(ctx->pmu);
        perf_ctx_unlock(cpuctx, ctx);
 }
@@ -3421,14 +3626,24 @@ static void rotate_ctx(struct perf_event_context *ctx)
         * Rotate the first entry last of non-pinned groups. Rotation might be
         * disabled by the inheritance code.
         */
-       if (!ctx->rotate_disable)
-               list_rotate_left(&ctx->flexible_groups);
+       if (!ctx->rotate_disable) {
+               struct list_head *groups;
+
+               groups = perf_cpu_tree_find(&ctx->flexible_tree, -1);
+               if (groups)
+                       list_rotate_left(groups);
+               groups = perf_cpu_tree_find(&ctx->flexible_tree,
+                               smp_processor_id());
+               if (groups)
+                       list_rotate_left(groups);
+       }
 }

 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
        struct perf_event_context *ctx = NULL;
        int rotate = 0;
+       int mux = 1;

        if (cpuctx->ctx.nr_events) {
                if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
@@ -3447,15 +3662,15 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(cpuctx->ctx.pmu);

-       cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+       cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
        if (ctx)
-               ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+               ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE, mux);

        rotate_ctx(&cpuctx->ctx);
        if (ctx)
                rotate_ctx(ctx);

-       perf_event_sched_in(cpuctx, ctx, current);
+       perf_event_sched_in(cpuctx, ctx, current, mux);

        perf_pmu_enable(cpuctx->ctx.pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -3507,6 +3722,7 @@ static void perf_event_enable_on_exec(int ctxn)
        struct perf_event *event;
        unsigned long flags;
        int enabled = 0;
+       int mux = 0;

        local_irq_save(flags);
        ctx = current->perf_event_ctxp[ctxn];
@@ -3515,7 +3731,7 @@ static void perf_event_enable_on_exec(int ctxn)

        cpuctx = __get_cpu_context(ctx);
        perf_ctx_lock(cpuctx, ctx);
-       ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+       ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
        list_for_each_entry(event, &ctx->event_list, event_entry) {
                enabled |= event_enable_on_exec(event, ctx);
                event_type |= get_event_type(event);
@@ -3528,7 +3744,7 @@ static void perf_event_enable_on_exec(int ctxn)
                clone_ctx = unclone_ctx(ctx);
                ctx_resched(cpuctx, ctx, event_type);
        } else {
-               ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+               ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
        }
        perf_ctx_unlock(cpuctx, ctx);

@@ -3749,7 +3965,9 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
        mutex_init(&ctx->mutex);
        INIT_LIST_HEAD(&ctx->active_ctx_list);
        INIT_LIST_HEAD(&ctx->pinned_groups);
+       ctx->pinned_tree = RB_ROOT;
        INIT_LIST_HEAD(&ctx->flexible_groups);
+       ctx->flexible_tree = RB_ROOT;
        INIT_LIST_HEAD(&ctx->event_list);
        atomic_set(&ctx->refcount, 1);
 }
@@ -4848,8 +5066,8 @@ static void calc_timer_values(struct perf_event *event,

        *now = perf_clock();
        ctx_time = event->shadow_ctx_time + *now;
-       *enabled = ctx_time - event->tstamp_enabled;
-       *running = ctx_time - event->tstamp_running;
+       *enabled = ctx_time - event->tstamp->enabled;
+       *running = ctx_time - event->tstamp->running;
 }

 static void perf_event_init_userpage(struct perf_event *event)
@@ -9357,6 +9575,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        INIT_LIST_HEAD(&event->child_list);

        INIT_LIST_HEAD(&event->group_entry);
+       RB_CLEAR_NODE(&event->group_node);
+       INIT_LIST_HEAD(&event->group_list);
+       INIT_LIST_HEAD(&event->group_list_entry);
        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
        INIT_LIST_HEAD(&event->rb_entry);
@@ -9372,6 +9593,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        raw_spin_lock_init(&event->addr_filters.lock);

        atomic_long_set(&event->refcount, 1);
+       event->tstamp                = &event->tstamp_data;
        event->cpu           = cpu;
        event->attr          = *attr;
        event->group_leader  = group_leader;
