On Mon, Mar 18, 2019 at 02:41:25PM -0700, kan.li...@linux.intel.com wrote:
> From: Kan Liang <kan.li...@linux.intel.com>
> 
> Adaptive PEBS is a new way to report PEBS sampling information. Instead
> of a fixed size record for all PEBS events it allows to configure the
> PEBS record to only include the information needed. Events can then opt
> in to use such an extended record, or stay with a basic record which
> only contains the IP.
> 
> The major new feature is to support LBRs in PEBS record.
> This allows (much faster) large PEBS, while still supporting callstacks
> through callstack LBR. 

Does it also allow normal LBR usage? Or does it have to be callstacks?

>  arch/x86/events/intel/core.c      |   2 +
>  arch/x86/events/intel/ds.c        | 293 ++++++++++++++++++++++++++++--
>  arch/x86/events/intel/lbr.c       |  22 +++
>  arch/x86/events/perf_event.h      |  14 ++
>  arch/x86/include/asm/msr-index.h  |   1 +
>  arch/x86/include/asm/perf_event.h |  42 +++++
>  6 files changed, 359 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index 17096d3cd616..a964b9832b0c 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -3446,6 +3446,8 @@ static int intel_pmu_cpu_prepare(int cpu)
>  {
>       struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
>  
> +     cpuc->pebs_record_size = x86_pmu.pebs_record_size;
> +
>       if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
>               cpuc->shared_regs = allocate_shared_regs(cpu);
>               if (!cpuc->shared_regs)

Does not apply... It didn't apply when you sent it either. At the very least you
could've refreshed the series before sending :/

> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index 4a2206876baa..974284c5ed6c 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -906,17 +906,82 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
>  
>       if (cpuc->n_pebs == cpuc->n_large_pebs) {
>               threshold = ds->pebs_absolute_maximum -
> -                     reserved * x86_pmu.pebs_record_size;
> +                     reserved * cpuc->pebs_record_size;
>       } else {
> -             threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
> +             threshold = ds->pebs_buffer_base + cpuc->pebs_record_size;
>       }
>  
>       ds->pebs_interrupt_threshold = threshold;
>  }
>  
> +static void adaptive_pebs_record_size_update(void)
> +{
> +     struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> +     u64 d = cpuc->pebs_data_cfg;
> +     int sz = sizeof(struct pebs_basic);
> +
> +     if (d & PEBS_DATACFG_MEMINFO)
> +             sz += sizeof(struct pebs_meminfo);
> +     if (d & PEBS_DATACFG_GPRS)
> +             sz += sizeof(struct pebs_gprs);
> +     if (d & PEBS_DATACFG_XMMS)
> +             sz += sizeof(struct pebs_xmm);
> +     if (d & PEBS_DATACFG_LBRS)
> +             sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry);
> +
> +     cpuc->pebs_record_size = sz;
> +}

You call that @d here but pebs_data_cfg elsewhere; why the inconsistency?

> +static u64 pebs_update_adaptive_cfg(struct perf_event *event)
> +{
> +     u64 sample_type = event->attr.sample_type;
> +     u64 pebs_data_cfg = 0;
> +
> +

too much whitespace

> +     if ((sample_type & ~(PERF_SAMPLE_IP|PERF_SAMPLE_TIME)) ||
> +             event->attr.precise_ip < 2) {
> +
> +             if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC |
> +                                PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT |
> +                                PERF_SAMPLE_TRANSACTION))
> +                     pebs_data_cfg |= PEBS_DATACFG_MEMINFO;
> +
> +             /*
> +              * Cases we need the registers:
> +              * + user requested registers
> +              * + precise_ip < 2 for the non event IP
> +              * + For RTM TSX weight we need GPRs too for the abort
> +              * code. But we don't want to force GPRs for all other
> +              * weights.  So add it only for the RTM abort event.
> +              */
> +             if (((sample_type & PERF_SAMPLE_REGS_INTR) &&
> +                     (event->attr.sample_regs_intr & 0xffffffff)) ||
> +                 (event->attr.precise_ip < 2) ||
> +                 ((sample_type & PERF_SAMPLE_WEIGHT) &&
> +                  ((event->attr.config & 0xffff) == x86_pmu.force_gpr_event)))
> +                     pebs_data_cfg |= PEBS_DATACFG_GPRS;

I know it has a comment, but it would be nice for the code to be
readable too. This is horrible.
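Something like the below already reads a lot better to me -- completely
untested sketch, just splitting the condition into named pieces (gprs and
tsx_weight are made-up local names):

	bool gprs, tsx_weight;

	/*
	 * User requested GPRs, or we need them to rewind to the
	 * precise IP when precise_ip < 2.
	 */
	gprs = ((sample_type & PERF_SAMPLE_REGS_INTR) &&
		(event->attr.sample_regs_intr & 0xffffffff)) ||
	       event->attr.precise_ip < 2;

	/* Only the RTM abort event needs GPRs for the weight. */
	tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) &&
		     ((event->attr.config & 0xffff) == x86_pmu.force_gpr_event);

	if (gprs || tsx_weight)
		pebs_data_cfg |= PEBS_DATACFG_GPRS;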

> +
> +             if ((sample_type & PERF_SAMPLE_REGS_INTR) &&
> +                     (event->attr.sample_regs_intr >> 32))
> +                     pebs_data_cfg |= PEBS_DATACFG_XMMS;

indent fail
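I.e. align the continuation with the open paren:

	if ((sample_type & PERF_SAMPLE_REGS_INTR) &&
	    (event->attr.sample_regs_intr >> 32))
		pebs_data_cfg |= PEBS_DATACFG_XMMS;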

> +
> +             if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
> +                     /*
> +                      * For now always log all LBRs. Could configure this
> +                      * later.
> +                      */
> +                     pebs_data_cfg |= PEBS_DATACFG_LBRS |
> +                             ((x86_pmu.lbr_nr-1) << PEBS_DATACFG_LBR_SHIFT);
> +             }
> +     }
> +     return pebs_data_cfg;
> +}
> +
>  static void
> -pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu)
> +pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
> +               struct perf_event *event, bool add)
>  {
> +     struct pmu *pmu = event->ctx->pmu;
>       /*
>        * Make sure we get updated with the first PEBS
>        * event. It will trigger also during removal, but
> @@ -933,6 +998,19 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu)
>               update = true;
>       }
>  
> +     if (x86_pmu.intel_cap.pebs_baseline && add) {
> +             u64 pebs_data_cfg;
> +
> +             pebs_data_cfg = pebs_update_adaptive_cfg(event);
> +
> +             /* Update pebs_record_size if new event requires more data. */
> +             if (pebs_data_cfg & ~cpuc->pebs_data_cfg) {
> +                     cpuc->pebs_data_cfg |= pebs_data_cfg;
> +                     adaptive_pebs_record_size_update();
> +                     update = true;
> +             }
> +     }
> +
>       if (update)
>               pebs_update_threshold(cpuc);
>  }

Hurmph.. this only ever grows the PEBS record; pebs_data_cfg bits are only
ORed in, so the record never shrinks until the last PEBS event is removed.
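If we wanted it to shrink again on del, something like the below could
recompute it from the remaining events -- completely untested sketch, and it
assumes attr.precise_ip is a good enough "is a PEBS event" test here:

	static void pebs_recompute_data_cfg(struct cpu_hw_events *cpuc)
	{
		u64 pebs_data_cfg = 0;
		int i;

		for (i = 0; i < cpuc->n_events; i++) {
			struct perf_event *event = cpuc->event_list[i];

			/* only PEBS events contribute to the record layout */
			if (event->attr.precise_ip)
				pebs_data_cfg |= pebs_update_adaptive_cfg(event);
		}

		cpuc->pebs_data_cfg = pebs_data_cfg;
		adaptive_pebs_record_size_update();
	}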


> @@ -947,7 +1025,7 @@ void intel_pmu_pebs_add(struct perf_event *event)
>       if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
>               cpuc->n_large_pebs++;
>  
> -     pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
> +     pebs_update_state(needed_cb, cpuc, event, true);
>  }
>  
>  void intel_pmu_pebs_enable(struct perf_event *event)
> @@ -965,6 +1043,14 @@ void intel_pmu_pebs_enable(struct perf_event *event)
>       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
>               cpuc->pebs_enabled |= 1ULL << 63;
>  
> +     if (x86_pmu.intel_cap.pebs_baseline) {
> +             hwc->config |= ICL_EVENTSEL_ADAPTIVE;
> +             if (cpuc->pebs_data_cfg != cpuc->active_pebs_data_cfg) {
> +                     wrmsrl(MSR_PEBS_DATA_CFG, cpuc->pebs_data_cfg);
> +                     cpuc->active_pebs_data_cfg = cpuc->pebs_data_cfg;
> +             }
> +     }
> +
>       /*
>        * Use auto-reload if possible to save a MSR write in the PMI.
>        * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
> @@ -991,7 +1077,12 @@ void intel_pmu_pebs_del(struct perf_event *event)
>       if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
>               cpuc->n_large_pebs--;
>  
> -     pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
> +     /* Clear both pebs_data_cfg and pebs_record_size for first PEBS. */

Weird comment..

> +     if (x86_pmu.intel_cap.pebs_baseline && !cpuc->n_pebs) {
> +             cpuc->pebs_data_cfg = 0;
> +             cpuc->pebs_record_size = sizeof(struct pebs_basic);
> +     }
> +     pebs_update_state(needed_cb, cpuc, event, false);

Why do we have to reset record_size? That'll be updated in
pebs_update_state() on the next add.

>  }
>  
>  void intel_pmu_pebs_disable(struct perf_event *event)
> @@ -1004,6 +1095,8 @@ void intel_pmu_pebs_disable(struct perf_event *event)
>  
>       cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
>  
> +     /* Delay reprograming DATA_CFG to next enable */
> +

No need for that comment, I think.

>       if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
>               cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
>       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
> @@ -1013,6 +1106,7 @@ void intel_pmu_pebs_disable(struct perf_event *event)
>               wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
>  
>       hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
> +     hwc->config &= ~ICL_EVENTSEL_ADAPTIVE;

Just curious; the way I read the SDM, we could leave this set. Is that
correct?

>  }
>  
>  void intel_pmu_pebs_enable_all(void)

> @@ -1323,19 +1558,20 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
>       if (base == NULL)
>               return NULL;
>  
> -     for (at = base; at < top; at += x86_pmu.pebs_record_size) {
> +     for (at = base; at < top; at = next_pebs_record(at)) {

That _should_ work with cpuc->pebs_record_size, right?
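I.e. simply:

	for (at = base; at < top; at += cpuc->pebs_record_size) {

(cpuc looks to be in scope there already, given the cpuc->pebs_enabled use
below.)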

>               struct pebs_record_nhm *p = at;
> +             unsigned long status = get_pebs_status(p);
>  
> -             if (test_bit(bit, (unsigned long *)&p->status)) {
> +             if (test_bit(bit, (unsigned long *)&status)) {
>                       /* PEBS v3 has accurate status bits */
>                       if (x86_pmu.intel_cap.pebs_format >= 3)
>                               return at;
>  
> -                     if (p->status == (1 << bit))
> +                     if (status == (1 << bit))
>                               return at;
>  
>                       /* clear non-PEBS bit and re-check */
> -                     pebs_status = p->status & cpuc->pebs_enabled;
> +                     pebs_status = status & cpuc->pebs_enabled;
>                       pebs_status &= PEBS_COUNTER_MASK;
>                       if (pebs_status == (1 << bit))
>                               return at;

> @@ -1434,14 +1670,14 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
>               return;
>  
>       while (count > 1) {
> -             setup_pebs_sample_data(event, iregs, at, &data, &regs);
> +             x86_pmu.setup_pebs_sample_data(event, iregs, at, &data, &regs);
>               perf_event_output(event, &data, &regs);
> -             at += x86_pmu.pebs_record_size;
> +             at = next_pebs_record(at);
>               at = get_next_pebs_record_by_bit(at, top, bit);
>               count--;
>       }
>  
> -     setup_pebs_sample_data(event, iregs, at, &data, &regs);
> +     x86_pmu.setup_pebs_sample_data(event, iregs, at, &data, &regs);
>  
>       /*
>        * All but the last records are processed.
> @@ -1534,11 +1770,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
>               return;
>       }
>  
> -     for (at = base; at < top; at += x86_pmu.pebs_record_size) {
> +     for (at = base; at < top; at = next_pebs_record(at)) {
>               struct pebs_record_nhm *p = at;
>               u64 pebs_status;
>  
> -             pebs_status = p->status & cpuc->pebs_enabled;
> +             pebs_status = get_pebs_status(p) & cpuc->pebs_enabled;
>               pebs_status &= mask;
>  
>               /* PEBS v3 has more accurate status bits */

How much work would intel_pmu_drain_pebs_icl() be?

I'm thinking that might not be terrible.
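Something along these lines, I suppose -- completely untested skeleton, it
leans on the cpuc->pebs_record_size and get_pebs_status() bits from this
patch and hand-waves the fixed-counter side of adaptive PEBS:

	static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs)
	{
		struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
		struct debug_store *ds = cpuc->ds;
		struct perf_event *event;
		void *base, *at, *top;
		short counts[MAX_PEBS_EVENTS] = {};
		u64 mask = (1ULL << x86_pmu.max_pebs_events) - 1;
		int bit;

		if (!x86_pmu.pebs_active)
			return;

		base = (void *)(unsigned long)ds->pebs_buffer_base;
		top  = (void *)(unsigned long)ds->pebs_index;

		ds->pebs_index = ds->pebs_buffer_base;

		if (unlikely(base >= top))
			return;

		for (at = base; at < top; at += cpuc->pebs_record_size) {
			u64 pebs_status;

			/* PEBS v4 status bits are accurate, no fixup dance */
			pebs_status = get_pebs_status(at) & cpuc->pebs_enabled;
			pebs_status &= mask;

			for_each_set_bit(bit, (unsigned long *)&pebs_status,
					 x86_pmu.max_pebs_events)
				counts[bit]++;
		}

		for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
			if (!counts[bit])
				continue;

			event = cpuc->events[bit];
			__intel_pmu_pebs_event(event, iregs, base, top, bit,
					       counts[bit]);
		}
	}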
