On Wed, Nov 23, 2016 at 04:44:40AM -0500, kan.li...@intel.com wrote:
> From: Kan Liang <kan.li...@intel.com>
>
> NMI handler is one of the most important part which brings overhead.
>
> There are lots of NMI during sampling. It's very expensive to log each
> NMI. So the accumulated time and NMI# will be output when event is going
> to be disabled or task is scheduling out.
> The newly introduced flag PERF_EF_LOG indicate to output the overhead
> log.
>
> Signed-off-by: Kan Liang <kan.li...@intel.com>
> ---
>  arch/x86/events/core.c          | 19 ++++++++++++++-
>  arch/x86/events/perf_event.h    |  2 ++
>  include/linux/perf_event.h      |  1 +
>  include/uapi/linux/perf_event.h |  2 ++
>  kernel/events/core.c            | 54 ++++++++++++++++++++++-------------------
>  5 files changed, 52 insertions(+), 26 deletions(-)
>
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index d31735f..6c3b0ef 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -1397,6 +1397,11 @@ static void x86_pmu_del(struct perf_event *event, int flags)
>
>  	perf_event_update_userpage(event);
>
> +	if ((flags & PERF_EF_LOG) && cpuc->nmi_overhead.nr) {
> +		cpuc->nmi_overhead.cpu = smp_processor_id();
> +		perf_log_overhead(event, PERF_NMI_OVERHEAD, &cpuc->nmi_overhead);
> +	}
> +
>  do_del:
>  	if (x86_pmu.del) {
>  		/*
> @@ -1475,11 +1480,21 @@ void perf_events_lapic_init(void)
>  	apic_write(APIC_LVTPC, APIC_DM_NMI);
>  }
>
> +static void
> +perf_caculate_nmi_overhead(u64 time)
s/caculate/calculate/ - this tripped me up when grepping.

> @@ -1492,8 +1507,10 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
>  	start_clock = sched_clock();
>  	ret = x86_pmu.handle_irq(regs);
>  	finish_clock = sched_clock();
> +	clock = finish_clock - start_clock;
>
> -	perf_sample_event_took(finish_clock - start_clock);
> +	perf_caculate_nmi_overhead(clock);
> +	perf_sample_event_took(clock);

Ah, so it's the *sampling* overhead, not the NMI overhead. This doesn't
take into account the cost of entering/exiting the handler, which could
be larger than the sampling overhead (e.g. if the PMU is connected
through chained interrupt controllers).

> enum perf_record_overhead_type {
> +	PERF_NMI_OVERHEAD = 0,

As above, it may be worth calling this PERF_SAMPLE_OVERHEAD; this
doesn't count the entire cost of the NMI, and other architectures may
want to implement this, yet don't have NMI.

[...]

>  static void
>  event_sched_out(struct perf_event *event,
>  		struct perf_cpu_context *cpuctx,
> -		struct perf_event_context *ctx)
> +		struct perf_event_context *ctx,
> +		bool log_overhead)

Boolean parameters are always confusing. Why not pass the flags
directly? That way we can pass *which* overhead to log, and make the
callsites easier to understand.

>  	event->tstamp_stopped = tstamp;
> -	event->pmu->del(event, 0);
> +	event->pmu->del(event, log_overhead ? PERF_EF_LOG : 0);

... which we could pass on here.

> @@ -1835,20 +1835,21 @@ event_sched_out(struct perf_event *event,
>  static void
>  group_sched_out(struct perf_event *group_event,
>  		struct perf_cpu_context *cpuctx,
> -		struct perf_event_context *ctx)
> +		struct perf_event_context *ctx,
> +		bool log_overhead)

Likewise.
> @@ -1872,7 +1873,7 @@ __perf_remove_from_context(struct perf_event *event,
>  {
>  	unsigned long flags = (unsigned long)info;
>
> -	event_sched_out(event, cpuctx, ctx);
> +	event_sched_out(event, cpuctx, ctx, false);
>  	if (flags & DETACH_GROUP)
>  		perf_group_detach(event);
>  	list_del_event(event, ctx);
> @@ -1918,9 +1919,9 @@ static void __perf_event_disable(struct perf_event *event,
>  		update_cgrp_time_from_event(event);
>  		update_group_times(event);
>  		if (event == event->group_leader)
> -			group_sched_out(event, cpuctx, ctx);
> +			group_sched_out(event, cpuctx, ctx, true);
>  		else
> -			event_sched_out(event, cpuctx, ctx);
> +			event_sched_out(event, cpuctx, ctx, true);

Why does this differ from __perf_remove_from_context()? What's the
policy for when we do or do not measure overhead?

Thanks,
Mark.