Dear Peter,

Could you please share your feedback/comments on this work?

Thanks,
Song

On Fri, Jun 8, 2018 at 3:47 PM, Song Liu <songliubrav...@fb.com> wrote:
> This patch enables sharing of a PMU counter between compatible perf
> events. To keep perf event scheduling fast, we use dedicated data
> structures.
>
> An array of "struct perf_event_dup" is added to the perf_event_context,
> to remember all the duplicated events under this ctx. Each event
> under this ctx has a "dup_id" pointing to its perf_event_dup. Compatible
> events under the same ctx share the same perf_event_dup. The following
> figure shows a simplified version of the data structure.
>
>       ctx ->  perf_event_dup -> master
>                      ^
>                      |
>          perf_event /|
>                      |
>          perf_event /
>
> Connections between perf_event and perf_event_dup are built when events
> are added to or removed from the ctx, so none of this work is on the
> critical paths of scheduling or perf_rotate_context().
>
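> For illustration, this is roughly how an event with a valid dup_id reaches
> its shared master counter. It is a simplified sketch of the read path; the
> full versions are event_pmu_[add|del|read]() further below, and all names
> are the ones introduced by this patch:
>
>         struct perf_event_dup *dup;
>
>         if (event->dup_id == -1) {              /* not sharing a counter */
>                 event->pmu->read(event);
>                 return;
>         }
>         dup = &ctx->dup_events[event->dup_id];  /* O(1) array lookup */
>         dup->master->pmu->read(dup->master);    /* master does the real read */
>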
> On the critical paths (add, del, read), sharing PMU counters doesn't
> increase the complexity. Helper functions event_pmu_[add|del|read]() are
> introduced to cover these cases. All these functions have O(1) time
> complexity.
>
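> As a condensed view of the bookkeeping (simplified from event_pmu_add()
> and event_sync_dup_count() below; the first-add, error, and child_count
> handling are omitted):
>
>         struct perf_event_dup *dup = &ctx->dup_events[event->dup_id];
>         u64 new_count;
>
>         /* add: the master is already counting, so just join it */
>         dup->active_event_count++;
>         dup->master->pmu->read(dup->master);
>         event->dup_base_count = local64_read(&dup->master->count);
>
>         /* read: fold the master's delta since our base into our count */
>         new_count = local64_read(&dup->master->count);
>         local64_add(new_count - event->dup_base_count, &event->count);
>         event->dup_base_count = new_count;
>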
> We allocate a separate perf_event for perf_event_dup->master. This needs
> extra attention, because perf_event_alloc() may sleep. To allocate the
> master event properly, a new pointer, tmp_master, is added to perf_event;
> it carries the separately allocated perf_event into list_[add|del]_event().
> The master event has a valid ->ctx and holds a reference on the ctx.
>
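> Condensed from the sys_perf_event_open() and perf_install_in_context()
> hunks below, the flow on the perf_event_open() path is:
>
>         /* sys_perf_event_open(): may sleep, so allocate the master here */
>         if (perf_event_can_share(event))
>                 event->tmp_master = perf_event_alloc(&event->attr, cpu,
>                                                      task, NULL, NULL,
>                                                      NULL, NULL, -1);
>
>         /* perf_install_in_context(): bind it to the ctx, hold a reference */
>         if (event->tmp_master) {
>                 event->tmp_master->ctx = ctx;
>                 get_ctx(ctx);
>         }
>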
> Details about the handling of the master event are added to
> include/linux/perf_event.h, before struct perf_event_dup.
>
> Cc: Tejun Heo <t...@kernel.org>
> Cc: Peter Zijlstra <pet...@infradead.org>
> Cc: Jiri Olsa <jo...@kernel.org>
> Cc: Alexey Budankov <alexey.budan...@linux.intel.com>
> Signed-off-by: Song Liu <songliubrav...@fb.com>
> ---
>  include/linux/perf_event.h |  61 ++++++++++
>  kernel/events/core.c       | 284 +++++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 335 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index bea0b0c..75ed2ad 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -695,6 +695,12 @@ struct perf_event {
>  #endif
>
>         struct list_head                sb_list;
> +
> +       /* for PMU sharing */
> +       struct perf_event               *tmp_master;
> +       u64                             dup_id;
> +       u64                             dup_base_count;
> +       u64                             dup_base_child_count;
>  #endif /* CONFIG_PERF_EVENTS */
>  };
>
> @@ -704,6 +710,58 @@ struct perf_event_groups {
>         u64             index;
>  };
>
> +/*
> + * Sharing PMU across compatible events
> + *
> + * If two perf_events in the same perf_event_context are counting the same
> + * hardware events (instructions, cycles, etc.), they could share the
> + * hardware PMU counter.
> + *
> + * When a perf_event is added to the ctx (list_add_event), it is compared
> + * against other events in the ctx. If they can share the PMU counter,
> + * a perf_event_dup is allocated to represent the sharing.
> + *
> + * Each perf_event_dup has a virtual master event, which is the event
> + * passed to pmu->add() and pmu->del(). We cannot call perf_event_alloc() in
> + * list_add_event(), so it is allocated and carried by event->tmp_master
> + * into list_add_event().
> + *
> + * Virtual master in different cases/paths:
> + *
> + * < I > perf_event_open() -> close() path:
> + *
> + * 1. Allocated by perf_event_alloc() in sys_perf_event_open();
> + * 2. event->tmp_master->ctx assigned in perf_install_in_context();
> + * 3.a. if used by ctx->dup_events, freed in perf_event_release_kernel();
> + * 3.b. if not used by ctx->dup_events, freed in perf_event_open().
> + *
> + * < II > inherit_event() path:
> + *
> + * 1. Allocated by perf_event_alloc() in inherit_event();
> + * 2. tmp_master->ctx assigned in inherit_event();
> + * 3.a. if used by ctx->dup_events, freed in perf_event_release_kernel();
> + * 3.b. if not used by ctx->dup_events, freed in inherit_event().
> + *
> + * < III > perf_pmu_migrate_context() path:
> + * all dup_events removed during migration (no sharing after the move).
> + *
> + * < IV > perf_event_create_kernel_counter() path:
> + * not supported yet.
> + */
> +struct perf_event_dup {
> +       /*
> +        * master event passed to pmu->add() and pmu->del().
> +        * This event is allocated with perf_event_alloc(). When
> +        * attached to a ctx, this event should hold ctx->refcount.
> +        */
> +       struct perf_event       *master;
> +       /* number of events in the ctx that share the master */
> +       int                     total_event_count;
> +       /* number of active events of the master */
> +       int                     active_event_count;
> +};
> +
> +#define MAX_PERF_EVENT_DUP_PER_CTX 4
>  /**
>   * struct perf_event_context - event context structure
>   *
> @@ -759,6 +817,9 @@ struct perf_event_context {
>  #endif
>         void                            *task_ctx_data; /* pmu specific data */
>         struct rcu_head                 rcu_head;
> +
> +       /* for PMU sharing. array is needed for O(1) access */
> +       struct perf_event_dup           dup_events[MAX_PERF_EVENT_DUP_PER_CTX];
>  };
>
>  /*
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 08f5e1b..e9cd752 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1655,6 +1655,146 @@ perf_event_groups_next(struct perf_event *event)
>                 event = rb_entry_safe(rb_next(&event->group_node),      \
>                                 typeof(*event), group_node))
>
> +static void _free_event(struct perf_event *event);
> +
> +
> +/* free event->tmp_master */
> +static inline void perf_event_free_tmp_master(struct perf_event *event)
> +{
> +       if (event->tmp_master) {
> +               _free_event(event->tmp_master);
> +               event->tmp_master = NULL;
> +       }
> +}
> +
> +static inline u64 dup_read_count(struct perf_event_dup *dup)
> +{
> +       return local64_read(&dup->master->count);
> +}
> +
> +static inline u64 dup_read_child_count(struct perf_event_dup *dup)
> +{
> +       return atomic64_read(&dup->master->child_count);
> +}
> +
> +/* Returns whether a perf_event can share a PMU counter with other events */
> +static inline bool perf_event_can_share(struct perf_event *event)
> +{
> +       /* only do sharing for hardware events */
> +       if (is_software_event(event))
> +               return false;
> +
> +       /*
> +        * limit sharing to counting events.
> +        * perf-stat sets PERF_SAMPLE_IDENTIFIER for counting events, so
> +        * let that in.
> +        */
> +       if (event->attr.sample_type & ~PERF_SAMPLE_IDENTIFIER)
> +               return false;
> +
> +       return true;
> +}
> +
> +/*
> + * Returns whether the two events can share a PMU counter.
> + *
> + * Note: This function does NOT check perf_event_can_share() for
> + * the two events; they should be checked before calling this function.
> + */
> +static inline bool perf_event_compatible(struct perf_event *event_a,
> +                                        struct perf_event *event_b)
> +{
> +       return event_a->attr.type == event_b->attr.type &&
> +               event_a->attr.config == event_b->attr.config &&
> +               event_a->attr.config1 == event_b->attr.config1 &&
> +               event_a->attr.config2 == event_b->attr.config2;
> +}
> +
> +/*
> + * After adding an event to the ctx, try to find compatible event(s).
> + *
> + * This function should only be called at the end of list_add_event().
> + * Master event cannot be allocated or freed within this function. To add
> + * a new master event, the event should already have a master event
> + * allocated (event->tmp_master).
> + */
> +static inline void perf_event_setup_dup(struct perf_event *event,
> +                                       struct perf_event_context *ctx)
> +
> +{
> +       struct perf_event *tmp;
> +       int empty_slot = -1;
> +       int match;
> +       int i;
> +
> +       if (!perf_event_can_share(event))
> +               goto not_dup;
> +
> +       /* look for sharing with existing dup events */
> +       for (i = 0; i < MAX_PERF_EVENT_DUP_PER_CTX; i++) {
> +               if (ctx->dup_events[i].master == NULL) {
> +                       if (empty_slot == -1)
> +                               empty_slot = i;
> +                       continue;
> +               }
> +
> +               if (perf_event_compatible(event, ctx->dup_events[i].master)) {
> +                       event->dup_id = i;
> +                       ctx->dup_events[i].total_event_count++;
> +                       return;
> +               }
> +       }
> +
> +       if (!event->tmp_master ||  /* perf_event_alloc() failed */
> +           empty_slot == -1)      /* no available dup_event */
> +               goto not_dup;
> +
> +       match = 0;
> +       /* look for dup with other events */
> +       list_for_each_entry(tmp, &ctx->event_list, event_entry) {
> +               if (tmp == event || tmp->dup_id != -1 ||
> +                   !perf_event_can_share(tmp) ||
> +                   !perf_event_compatible(event, tmp))
> +                       continue;
> +
> +               tmp->dup_id = empty_slot;
> +               match++;
> +       }
> +
> +       /* found at least one dup */
> +       if (match) {
> +               ctx->dup_events[empty_slot].master = event->tmp_master;
> +               ctx->dup_events[empty_slot].total_event_count = match + 1;
> +               event->dup_id = empty_slot;
> +               event->tmp_master = NULL;
> +               return;
> +       }
> +
> +not_dup:
> +       event->dup_id = -1;
> +}
> +
> +/*
> + * Remove the event from ctx->dup_events.
> + * This function should only be called from list_del_event(). Similar to
> + * perf_event_setup_dup(), we cannot call _free_event(master). If a master
> + * event should be freed, it is carried out of this function by the event
> + * (event->tmp_master).
> + */
> +static void perf_event_remove_dup(struct perf_event *event,
> +                                 struct perf_event_context *ctx)
> +
> +{
> +       if (event->dup_id == -1)
> +               return;
> +
> +       if (--ctx->dup_events[event->dup_id].total_event_count == 0) {
> +               event->tmp_master = ctx->dup_events[event->dup_id].master;
> +               ctx->dup_events[event->dup_id].master = NULL;
> +       }
> +       event->dup_id = -1;
> +}
> +
>  /*
>   * Add a event from the lists for its context.
>   * Must be called with ctx->mutex and ctx->lock held.
> @@ -1687,6 +1827,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
>                 ctx->nr_stat++;
>
>         ctx->generation++;
> +       perf_event_setup_dup(event, ctx);
>  }
>
>  /*
> @@ -1853,6 +1994,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
>         WARN_ON_ONCE(event->ctx != ctx);
>         lockdep_assert_held(&ctx->lock);
>
> +       perf_event_remove_dup(event, ctx);
>         /*
>          * We can have double detach due to exit/hot-unplug + close.
>          */
> @@ -1982,6 +2124,92 @@ event_filter_match(struct perf_event *event)
>                perf_cgroup_match(event) && pmu_filter_match(event);
>  }
>
> +/* PMU sharing aware version of event->pmu->add() */
> +static int event_pmu_add(struct perf_event *event,
> +                        struct perf_event_context *ctx)
> +{
> +       struct perf_event_dup *dup;
> +       int ret;
> +
> +       /* no sharing, just do event->pmu->add() */
> +       if (event->dup_id == -1)
> +               return event->pmu->add(event, PERF_EF_START);
> +
> +       dup = &ctx->dup_events[event->dup_id];
> +
> +       if (dup->active_event_count) {
> +               /* already enabled */
> +               dup->active_event_count++;
> +               dup->master->pmu->read(dup->master);
> +               event->dup_base_count = dup_read_count(dup);
> +               event->dup_base_child_count = dup_read_child_count(dup);
> +               return 0;
> +       }
> +
> +       /* try add master */
> +       ret = event->pmu->add(dup->master, PERF_EF_START);
> +
> +       if (!ret) {
> +               dup->active_event_count = 1;
> +               event->pmu->read(dup->master);
> +               event->dup_base_count = dup_read_count(dup);
> +               event->dup_base_child_count = dup_read_child_count(dup);
> +       }
> +       return ret;
> +}
> +
> +/*
> + * sync count data from dup->master to event; called from event_pmu_read()
> + * and event_pmu_del()
> + */
> +static void event_sync_dup_count(struct perf_event *event,
> +                                struct perf_event_dup *dup)
> +{
> +       u64 new_count;
> +       u64 new_child_count;
> +
> +       event->pmu->read(dup->master);
> +       new_count = dup_read_count(dup);
> +       new_child_count = dup_read_child_count(dup);
> +
> +       local64_add(new_count - event->dup_base_count, &event->count);
> +       atomic64_add(new_child_count - event->dup_base_child_count,
> +                    &event->child_count);
> +
> +       event->dup_base_count = new_count;
> +       event->dup_base_child_count = new_child_count;
> +}
> +
> +/* PMU sharing aware version of event->pmu->del() */
> +static void event_pmu_del(struct perf_event *event,
> +                         struct perf_event_context *ctx)
> +{
> +       struct perf_event_dup *dup;
> +
> +       if (event->dup_id == -1) {
> +               event->pmu->del(event, 0);
> +               return;
> +       }
> +
> +       dup = &ctx->dup_events[event->dup_id];
> +       event_sync_dup_count(event, dup);
> +       if (--dup->active_event_count == 0)
> +               event->pmu->del(dup->master, 0);
> +}
> +
> +/* PMU sharing aware version of event->pmu->read() */
> +static void event_pmu_read(struct perf_event *event)
> +{
> +       struct perf_event_dup *dup;
> +
> +       if (event->dup_id == -1) {
> +               event->pmu->read(event);
> +               return;
> +       }
> +       dup = &event->ctx->dup_events[event->dup_id];
> +       event_sync_dup_count(event, dup);
> +}
> +
>  static void
>  event_sched_out(struct perf_event *event,
>                   struct perf_cpu_context *cpuctx,
> @@ -2004,7 +2232,7 @@ event_sched_out(struct perf_event *event,
>
>         perf_pmu_disable(event->pmu);
>
> -       event->pmu->del(event, 0);
> +       event_pmu_del(event, ctx);
>         event->oncpu = -1;
>
>         if (event->pending_disable) {
> @@ -2276,7 +2504,7 @@ event_sched_in(struct perf_event *event,
>
>         perf_log_itrace_start(event);
>
> -       if (event->pmu->add(event, PERF_EF_START)) {
> +       if (event_pmu_add(event, ctx)) {
>                 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
>                 event->oncpu = -1;
>                 ret = -EAGAIN;
> @@ -2561,6 +2789,10 @@ perf_install_in_context(struct perf_event_context *ctx,
>          * Ensures that if we can observe event->ctx, both the event and ctx
>          * will be 'complete'. See perf_iterate_sb_cpu().
>          */
> +       if (event->tmp_master) {
> +               event->tmp_master->ctx = ctx;
> +               get_ctx(ctx);
> +       }
>         smp_store_release(&event->ctx, ctx);
>
>         if (!task) {
> @@ -3011,7 +3243,7 @@ static void __perf_event_sync_stat(struct perf_event *event,
>          * don't need to use it.
>          */
>         if (event->state == PERF_EVENT_STATE_ACTIVE)
> -               event->pmu->read(event);
> +               event_pmu_read(event);
>
>         perf_event_update_time(event);
>
> @@ -3867,14 +4099,14 @@ static void __perf_event_read(void *info)
>                 goto unlock;
>
>         if (!data->group) {
> -               pmu->read(event);
> +               event_pmu_read(event);
>                 data->ret = 0;
>                 goto unlock;
>         }
>
>         pmu->start_txn(pmu, PERF_PMU_TXN_READ);
>
> -       pmu->read(event);
> +       event_pmu_read(event);
>
>         for_each_sibling_event(sub, event) {
>                 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
> @@ -3882,7 +4114,7 @@ static void __perf_event_read(void *info)
>                          * Use sibling's PMU rather than @event's since
>                          * sibling could be on different (eg: software) PMU.
>                          */
> -                       sub->pmu->read(sub);
> +                       event_pmu_read(sub);
>                 }
>         }
>
> @@ -3946,7 +4178,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
>          * oncpu == -1).
>          */
>         if (event->oncpu == smp_processor_id())
> -               event->pmu->read(event);
> +               event_pmu_read(event);
>
>         *value = local64_read(&event->count);
>         if (enabled || running) {
> @@ -4469,6 +4701,7 @@ static void free_event(struct perf_event *event)
>                 return;
>         }
>
> +       perf_event_free_tmp_master(event);
>         _free_event(event);
>  }
>
> @@ -4528,6 +4761,7 @@ static void put_event(struct perf_event *event)
>         if (!atomic_long_dec_and_test(&event->refcount))
>                 return;
>
> +       perf_event_free_tmp_master(event);
>         _free_event(event);
>  }
>
> @@ -6087,7 +6321,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
>
>         if ((leader != event) &&
>             (leader->state == PERF_EVENT_STATE_ACTIVE))
> -               leader->pmu->read(leader);
> +               event_pmu_read(leader);
>
>         values[n++] = perf_event_count(leader);
>         if (read_format & PERF_FORMAT_ID)
> @@ -6100,7 +6334,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
>
>                 if ((sub != event) &&
>                     (sub->state == PERF_EVENT_STATE_ACTIVE))
> -                       sub->pmu->read(sub);
> +                       event_pmu_read(sub);
>
>                 values[n++] = perf_event_count(sub);
>                 if (read_format & PERF_FORMAT_ID)
> @@ -9103,7 +9337,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
>         if (event->state != PERF_EVENT_STATE_ACTIVE)
>                 return HRTIMER_NORESTART;
>
> -       event->pmu->read(event);
> +       event_pmu_read(event);
>
>         perf_sample_data_init(&data, 0, event->hw.last_period);
>         regs = get_irq_regs();
> @@ -10498,6 +10732,11 @@ SYSCALL_DEFINE5(perf_event_open,
>                 goto err_cred;
>         }
>
> +       if (perf_event_can_share(event))
> +               event->tmp_master = perf_event_alloc(&event->attr, cpu,
> +                                                    task, NULL, NULL,
> +                                                    NULL, NULL, -1);
> +
>         if (is_sampling_event(event)) {
>                 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
>                         err = -EOPNOTSUPP;
> @@ -10703,9 +10942,17 @@ SYSCALL_DEFINE5(perf_event_open,
>                 perf_remove_from_context(group_leader, 0);
>                 put_ctx(gctx);
>
> +               /*
> +                * move_group only happens for sw events, from a sw ctx to a
> +                * hw ctx. The sw events should not have a valid dup_id, so it
> +                * is not necessary to handle dup_events.
> +                */
> +               WARN_ON_ONCE(group_leader->dup_id != -1);
> +
>                 for_each_sibling_event(sibling, group_leader) {
>                         perf_remove_from_context(sibling, 0);
>                         put_ctx(gctx);
> +                       WARN_ON_ONCE(sibling->dup_id != -1);
>                 }
>
>                 /*
> @@ -10763,6 +11010,9 @@ SYSCALL_DEFINE5(perf_event_open,
>                 put_task_struct(task);
>         }
>
> +       /* if event->tmp_master is not used by ctx->dup_events, free it */
> +       perf_event_free_tmp_master(event);
> +
>         mutex_lock(&current->perf_event_mutex);
>         list_add_tail(&event->owner_entry, &current->perf_event_list);
>         mutex_unlock(&current->perf_event_mutex);
> @@ -10907,6 +11157,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
>                 perf_remove_from_context(event, 0);
>                 unaccount_event_cpu(event, src_cpu);
>                 put_ctx(src_ctx);
> +               perf_event_free_tmp_master(event);
>                 list_add(&event->migrate_entry, &events);
>         }
>
> @@ -11257,6 +11508,11 @@ inherit_event(struct perf_event *parent_event,
>         if (IS_ERR(child_event))
>                 return child_event;
>
> +       if (perf_event_can_share(child_event))
> +               child_event->tmp_master = perf_event_alloc(&parent_event->attr,
> +                                                          parent_event->cpu,
> +                                                          child, NULL, NULL,
> +                                                          NULL, NULL, -1);
>
>         if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
>             !child_ctx->task_ctx_data) {
> @@ -11311,6 +11567,10 @@ inherit_event(struct perf_event *parent_event,
>         child_event->overflow_handler = parent_event->overflow_handler;
>         child_event->overflow_handler_context
>                 = parent_event->overflow_handler_context;
> +       if (child_event->tmp_master) {
> +               child_event->tmp_master->ctx = child_ctx;
> +               get_ctx(child_ctx);
> +       }
>
>         /*
>          * Precalculate sample_data sizes
> @@ -11325,6 +11585,7 @@ inherit_event(struct perf_event *parent_event,
>         add_event_to_ctx(child_event, child_ctx);
>         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
>
> +       perf_event_free_tmp_master(child_event);
>         /*
>          * Link this into the parent event's child list
>          */
> @@ -11596,6 +11857,7 @@ static void perf_event_exit_cpu_context(int cpu)
>  {
>         struct perf_cpu_context *cpuctx;
>         struct perf_event_context *ctx;
> +       struct perf_event *event;
>         struct pmu *pmu;
>
>         mutex_lock(&pmus_lock);
> @@ -11607,6 +11869,8 @@ static void perf_event_exit_cpu_context(int cpu)
>                 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
>                 cpuctx->online = 0;
>                 mutex_unlock(&ctx->mutex);
> +               list_for_each_entry(event, &ctx->event_list, event_entry)
> +                       perf_event_free_tmp_master(event);
>         }
>         cpumask_clear_cpu(cpu, perf_online_mask);
>         mutex_unlock(&pmus_lock);
> --
> 2.9.5
>
