[PATCH] powerpc/perf: Add kernel support for new MSR[HV PR] bits in trace-imc.
IMC trace-mode record has MSR[HV PR] bits added in the third DW. These bits can be used to set the cpumode for the instruction pointer captured in each sample. Add support in kernel to use these bits to set the cpumode for each sample. Signed-off-by: Anju T Sudhakar --- arch/powerpc/include/asm/imc-pmu.h | 5 + arch/powerpc/perf/imc-pmu.c| 29 - 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index 4da4fcba0684..4f897993b710 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -99,6 +99,11 @@ struct trace_imc_data { */ #define IMC_TRACE_RECORD_TB1_MASK 0x3ffULL +/* + * Bit 0:1 in third DW of IMC trace record + * specifies the MSR[HV PR] values. + */ +#define IMC_TRACE_RECORD_VAL_HVPR(x) ((x) >> 62) /* * Device tree parser code detects IMC pmu support and diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index cb50a9e1fd2d..310922fed9eb 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1178,11 +1178,30 @@ static int trace_imc_prepare_sample(struct trace_imc_data *mem, header->size = sizeof(*header) + event->header_size; header->misc = 0; - if (is_kernel_addr(data->ip)) - header->misc |= PERF_RECORD_MISC_KERNEL; - else - header->misc |= PERF_RECORD_MISC_USER; - + if (cpu_has_feature(CPU_FTRS_POWER9)) { + if (is_kernel_addr(data->ip)) + header->misc |= PERF_RECORD_MISC_KERNEL; + else + header->misc |= PERF_RECORD_MISC_USER; + } else { + switch (IMC_TRACE_RECORD_VAL_HVPR(mem->val)) { + case 0:/* when MSR HV and PR not set in the trace-record */ + header->misc |= PERF_RECORD_MISC_GUEST_KERNEL; + break; + case 1: /* MSR HV is 0 and PR is 1 */ + header->misc |= PERF_RECORD_MISC_GUEST_USER; + break; + case 2: /* MSR Hv is 1 and PR is 0 */ + header->misc |= PERF_RECORD_MISC_HYPERVISOR; + break; + case 3: /* MSR HV is 1 and PR is 1 */ + header->misc |= PERF_RECORD_MISC_USER; + break; + default: + pr_info("IMC: Unable to set the flag based on MSR bits\n"); + break; + } + } perf_event_header__init_id(header, data, event); return 0; -- 2.25.4
[PATCH 2/2] powerpc/perf: Add support for outputting extended regs in perf intr_regs
The capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to indicate the PMU which support extended registers. The generic code define the mask of extended registers as 0 for non supported architectures. Add support for extended registers in POWER9 architecture. For POWER9, the extended registers are mmcr0, mmc1 and mmcr2. REG_RESERVED mask is redefined to accommodate the extended registers. With patch: # perf record -I? available registers: r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14 r15 r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r26 r27 r28 r29 r30 r31 nip msr orig_r3 ctr link xer ccr softe trap dar dsisr sier mmcra mmcr0 mmcr1 mmcr2 # perf record -I ls # perf script -D PERF_RECORD_SAMPLE(IP, 0x1): 9019/9019: 0 period: 1 addr: 0 ... intr regs: mask 0x ABI 64-bit r00xc011b12c r10xc03f9a98b930 r20xc1a32100 r30xc03f8fe9a800 r40xc03fd181 r50x3e32557150 r60xc03f9a98b908 r70xffc1cdae06ac r80x818 [.] r31 0xc03ffd047230 nip 0xc011b2c0 msr 0x90009033 orig_r3 0xc011b21c ctr 0xc0119380 link 0xc011b12c xer 0x0 ccr 0x2800 softe 0x1 trap 0xf00 dar 0x0 dsisr 0x800 sier 0x0 mmcra 0x800 mmcr0 0x82008090 mmcr1 0x1e00 mmcr2 0x0 ... thread: perf:9019 Signed-off-by: Anju T Sudhakar --- arch/powerpc/include/asm/perf_event_server.h | 5 +++ arch/powerpc/include/uapi/asm/perf_regs.h | 13 +++- arch/powerpc/perf/core-book3s.c | 1 + arch/powerpc/perf/perf_regs.c | 29 ++-- arch/powerpc/perf/power9-pmu.c| 1 + .../arch/powerpc/include/uapi/asm/perf_regs.h | 13 +++- tools/perf/arch/powerpc/include/perf_regs.h | 6 +++- tools/perf/arch/powerpc/util/perf_regs.c | 33 +++ 8 files changed, 95 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h index 3e9703f44c7c..1d15953bd99e 100644 --- a/arch/powerpc/include/asm/perf_event_server.h +++ b/arch/powerpc/include/asm/perf_event_server.h @@ -55,6 +55,11 @@ struct power_pmu { int *blacklist_ev; /* BHRB entries in the PMU */ int bhrb_nr; + /* +* set this flag with `PERF_PMU_CAP_EXTENDED_REGS` if +* the pmu supports extended perf regs capability +*/ + int capabilities; }; /* diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h index f599064dd8dc..604b831378fe 100644 --- a/arch/powerpc/include/uapi/asm/perf_regs.h +++ b/arch/powerpc/include/uapi/asm/perf_regs.h @@ -48,6 +48,17 @@ enum perf_event_powerpc_regs { PERF_REG_POWERPC_DSISR, PERF_REG_POWERPC_SIER, PERF_REG_POWERPC_MMCRA, - PERF_REG_POWERPC_MAX, + /* Extended registers */ + PERF_REG_POWERPC_MMCR0, + PERF_REG_POWERPC_MMCR1, + PERF_REG_POWERPC_MMCR2, + PERF_REG_EXTENDED_MAX, + /* Max regs without the extended regs */ + PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1, }; + +#define PERF_REG_PMU_MASK ((1ULL << PERF_REG_POWERPC_MAX) - 1) +#define PERF_REG_EXTENDED_MASK (((1ULL << (PERF_REG_EXTENDED_MAX))\ + - 1) - PERF_REG_PMU_MASK) + #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */ diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 3dcfecf858f3..f56b77800a7b 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2276,6 +2276,7 @@ int register_power_pmu(struct power_pmu *pmu) power_pmu.attr_groups = ppmu->attr_groups; + power_pmu.capabilities |= (ppmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS); #ifdef MSR_HV /* * Use FCHV to ignore kernel events if MSR.HV is set. diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c index a213a0aa5d25..57aa02568caf 100644 --- a/arch/powerpc/perf/perf_regs.c +++ b/arch/powerpc/perf/perf_regs.c @@ -15,7 +15,8 @@ #define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r) -#define REG_RESERVED (~((1ULL << PERF_REG_POWERPC_MAX) - 1)) +#define REG_RESERVED (~(PERF_REG_EXTENDED_MASK) & \ + (~((1ULL << PERF_REG_POWERPC_MAX) - 1))) static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = { PT_REGS_OFFSET(PERF_REG_POWERPC_R0, gpr[0]), @@ -69,10 +70,22 @@ static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = { PT_REGS_OFFSET(PERF_REG_POWERPC_MMCRA, dsisr), }; +/* Function to return the extended register values */ +static u64 get_ext_regs_value(int idx) +{ + switch (idx) { + case PERF_REG_POWERPC_MMCR0: +
[PATCH 0/2] powerpc/perf: Add support for perf extended regs in powerpc
Patch set to add support for perf extended register capability in powerpc. The capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to indicate the PMU which support extended registers. The generic code define the mask of extended registers as 0 for non supported architectures. patch 2/2 defines this PERF_PMU_CAP_EXTENDED_REGS mask to output the values of mmcr0,mmcr1,mmcr2 for POWER9. x86/perf_regs.h is included by util/intel-pt.c, which will get compiled when buiding perf on powerpc. Since x86/perf_regs.h has `PERF_EXTENDED_REG_MASK` defined, defining `PERF_EXTENDED_REG_MASK` for powerpc to add support for perf extended regs will result in perf build error on powerpc. Currently powerpc architecture is not having support for auxtrace. So as a workaround for this issue, patch 1/2 set NO_AUXTRACE for powerpc. (Any other solutions are welcome.) Patch 2/2 also add extended regs to sample_reg_mask in the tool side to use with `-I?` option. Anju T Sudhakar (2): tools/perf: set no_auxtrace for powerpc powerpc/perf: Add support for outputting extended regs in perf intr_regs arch/powerpc/include/asm/perf_event_server.h | 5 +++ arch/powerpc/include/uapi/asm/perf_regs.h | 13 +++- arch/powerpc/perf/core-book3s.c | 1 + arch/powerpc/perf/perf_regs.c | 29 ++-- arch/powerpc/perf/power9-pmu.c| 1 + .../arch/powerpc/include/uapi/asm/perf_regs.h | 13 +++- tools/perf/arch/powerpc/Makefile | 1 + tools/perf/arch/powerpc/include/perf_regs.h | 6 +++- tools/perf/arch/powerpc/util/perf_regs.c | 33 +++ 9 files changed, 96 insertions(+), 6 deletions(-) -- 2.20.1
[PATCH 1/2] tools/perf: set no_auxtrace for powerpc
x86/perf_regs.h is included by util/intel-pt.c, which will get compiled when buiding perf on powerpc. Since x86/perf_regs.h has `PERF_EXTENDED_REG_MASK` defined, defining `PERF_EXTENDED_REG_MASK` for powerpc to add support for perf extended regs will result in perf build error on powerpc. Currently powerpc architecture is not having support for auxtrace. So as a workaround for this issue, set NO_AUXTRACE for powerpc. Signed-off-by: Anju T Sudhakar --- tools/perf/arch/powerpc/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile index e58d00d62f02..9ebb5f513605 100644 --- a/tools/perf/arch/powerpc/Makefile +++ b/tools/perf/arch/powerpc/Makefile @@ -3,6 +3,7 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 endif +NO_AUXTRACE := 1 HAVE_KVM_STAT_SUPPORT := 1 PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1 PERF_HAVE_JITDUMP := 1 -- 2.20.1
[PATCH v4 2/2] powerpc/powernv: Re-enable imc trace-mode in kernel
commit <249fad734a25> ""powerpc/perf: Disable trace_imc pmu" disables IMC(In-Memory Collection) trace-mode in kernel, since frequent mode switching between accumulation mode and trace mode via the spr LDBAR in the hardware can trigger a checkstop(system crash). Patch to re-enable imc-trace mode in kernel. The previous patch(1/2) in this series will address the mode switching issue by implementing a global lock, and will restrict the usage of accumulation and trace-mode at a time. Signed-off-by: Anju T Sudhakar --- arch/powerpc/platforms/powernv/opal-imc.c | 9 + 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 968b9a4d1cd9..7824cc364bc4 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -268,14 +268,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev) domain = IMC_DOMAIN_THREAD; break; case IMC_TYPE_TRACE: - /* -* FIXME. Using trace_imc events to monitor application -* or KVM thread performance can cause a checkstop -* (system crash). -* Disable it for now. -*/ - pr_info_once("IMC: disabling trace_imc PMU\n"); - domain = -1; + domain = IMC_DOMAIN_TRACE; break; default: pr_warn("IMC Unknown Device type \n"); -- 2.20.1
[PATCH v4 1/2] powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events.
IMC(In-memory Collection Counters) does performance monitoring in two different modes, i.e accumulation mode(core-imc and thread-imc events), and trace mode(trace-imc events). A cpu thread can either be in accumulation-mode or trace-mode at a time and this is done via the LDBAR register in POWER architecture. The current design does not address the races between thread-imc and trace-imc events. Patch implements a global id and lock to avoid the races between core, trace and thread imc events. With this global id-lock implementation, the system can either run core, thread or trace imc events at a time. i.e. to run any core-imc events, thread/trace imc events should not be enabled/monitored. Signed-off-by: Anju T Sudhakar --- Changes from v3->v4: - Added mutex lock for thread, core and trace imc cpu offline path. Changes from v2->v3: - Addressed the off-line comments from Michael Ellerman - Optimized the *_event_init code path for trace, core and thread imc - Handled the global refc in cpuhotplug scenario - Re-order the patch series - Removed the selftest patches and will send as a follow up patch Changes from v1 -> v2: - Added self test patches to the series. --- arch/powerpc/perf/imc-pmu.c | 173 +++- 1 file changed, 149 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index cb50a9e1fd2d..eb82dda884e5 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem); static struct imc_pmu_ref *trace_imc_refc; static int trace_imc_mem_size; +/* + * Global data structure used to avoid races between thread, + * core and trace-imc + */ +static struct imc_pmu_ref imc_global_refc = { + .lock = __MUTEX_INITIALIZER(imc_global_refc.lock), + .id = 0, + .refc = 0, +}; + static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -698,6 +708,16 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu) return -EINVAL; ref->refc = 0; + /* +* Reduce the global reference count, if this is the +* last cpu in this core and core-imc event running +* in this cpu. +*/ + mutex_lock(_global_refc.lock); + if (imc_global_refc.id == IMC_DOMAIN_CORE) + imc_global_refc.refc--; + + mutex_unlock(_global_refc.lock); } return 0; } @@ -710,6 +730,23 @@ static int core_imc_pmu_cpumask_init(void) ppc_core_imc_cpu_offline); } +static void reset_global_refc(struct perf_event *event) +{ + mutex_lock(_global_refc.lock); + imc_global_refc.refc--; + + /* +* If no other thread is running any +* event for this domain(thread/core/trace), +* set the global id to zero. +*/ + if (imc_global_refc.refc <= 0) { + imc_global_refc.refc = 0; + imc_global_refc.id = 0; + } + mutex_unlock(_global_refc.lock); +} + static void core_imc_counters_release(struct perf_event *event) { int rc, core_id; @@ -759,6 +796,8 @@ static void core_imc_counters_release(struct perf_event *event) ref->refc = 0; } mutex_unlock(>lock); + + reset_global_refc(event); } static int core_imc_event_init(struct perf_event *event) @@ -819,6 +858,29 @@ static int core_imc_event_init(struct perf_event *event) ++ref->refc; mutex_unlock(>lock); + /* +* Since the system can run either in accumulation or trace-mode +* of IMC at a time, core-imc events are allowed only if no other +* trace/thread imc events are enabled/monitored. +* +* Take the global lock, and check the refc.id +* to know whether any other trace/thread imc +* events are running. +*/ + mutex_lock(_global_refc.lock); + if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) { + /* +* No other trace/thread imc events are running in +* the system, so set the refc.id to core-imc. +*/ + imc_global_refc.id = IMC_DOMAIN_CORE; + imc_global_refc.refc++; + } else { + mutex_unlock(_global_refc.lock); + return -EBUSY; + } + mutex_unlock(_global_refc.lock); + event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK); event->destroy = core_imc_counters_release; return 0; @@ -877,7 +939,23 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu) static int ppc_thread_imc_cpu
[PATCH v3 2/2] powerpc/powernv: Re-enable imc trace-mode in kernel
commit <249fad734a25> ""powerpc/perf: Disable trace_imc pmu" disables IMC(In-Memory Collection) trace-mode in kernel, since frequent mode switching between accumulation mode and trace mode via the spr LDBAR in the hardware can trigger a checkstop(system crash). Patch to re-enable imc-trace mode in kernel. The previous patch(1/2) in this series will address the mode switching issue by implementing a global lock, and will restrict the usage of accumulation and trace-mode at a time. Signed-off-by: Anju T Sudhakar --- arch/powerpc/platforms/powernv/opal-imc.c | 9 + 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 000b350d4060..3b4518f4b643 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -278,14 +278,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev) domain = IMC_DOMAIN_THREAD; break; case IMC_TYPE_TRACE: - /* -* FIXME. Using trace_imc events to monitor application -* or KVM thread performance can cause a checkstop -* (system crash). -* Disable it for now. -*/ - pr_info_once("IMC: disabling trace_imc PMU\n"); - domain = -1; + domain = IMC_DOMAIN_TRACE; break; default: pr_warn("IMC Unknown Device type \n"); -- 2.20.1
[PATCH v3 1/2] powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events.
IMC(In-memory Collection Counters) does performance monitoring in two different modes, i.e accumulation mode(core-imc and thread-imc events), and trace mode(trace-imc events). A cpu thread can either be in accumulation-mode or trace-mode at a time and this is done via the LDBAR register in POWER architecture. The current design does not address the races between thread-imc and trace-imc events. Patch implements a global id and lock to avoid the races between core, trace and thread imc events. With this global id-lock implementation, the system can either run core, thread or trace imc events at a time. i.e. to run any core-imc events, thread/trace imc events should not be enabled/monitored. Signed-off-by: Anju T Sudhakar --- Changes from v2->v3: - Addressed the off-line comments from Michael Ellerman - Optimized the *_event_init code path for trace, core and thread imc - Handled the global refc in cpuhotplug scenario - Re-order the patch series - Removed the selftest patches and will send as a follow up patch Changes from v1 -> v2: - Added self test patches to the series. --- arch/powerpc/perf/imc-pmu.c | 165 ++-- 1 file changed, 141 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index cb50a9e1fd2d..a366e2ec0351 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem); static struct imc_pmu_ref *trace_imc_refc; static int trace_imc_mem_size; +/* + * Global data structure used to avoid races between thread, + * core and trace-imc + */ +static struct imc_pmu_ref imc_global_refc = { + .lock = __MUTEX_INITIALIZER(imc_global_refc.lock), + .id = 0, + .refc = 0, +}; + static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -698,6 +708,13 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu) return -EINVAL; ref->refc = 0; + /* +* Reduce the global reference count, if this is the +* last cpu in this core and core-imc event running +* in this cpu. +*/ + if (imc_global_refc.id == IMC_DOMAIN_CORE) + imc_global_refc.refc--; } return 0; } @@ -710,6 +727,23 @@ static int core_imc_pmu_cpumask_init(void) ppc_core_imc_cpu_offline); } +static void reset_global_refc(struct perf_event *event) +{ + mutex_lock(_global_refc.lock); + imc_global_refc.refc--; + + /* +* If no other thread is running any +* event for this domain(thread/core/trace), +* set the global id to zero. +*/ + if (imc_global_refc.refc <= 0) { + imc_global_refc.refc = 0; + imc_global_refc.id = 0; + } + mutex_unlock(_global_refc.lock); +} + static void core_imc_counters_release(struct perf_event *event) { int rc, core_id; @@ -759,6 +793,8 @@ static void core_imc_counters_release(struct perf_event *event) ref->refc = 0; } mutex_unlock(>lock); + + reset_global_refc(event); } static int core_imc_event_init(struct perf_event *event) @@ -819,6 +855,29 @@ static int core_imc_event_init(struct perf_event *event) ++ref->refc; mutex_unlock(>lock); + /* +* Since the system can run either in accumulation or trace-mode +* of IMC at a time, core-imc events are allowed only if no other +* trace/thread imc events are enabled/monitored. +* +* Take the global lock, and check the refc.id +* to know whether any other trace/thread imc +* events are running. +*/ + mutex_lock(_global_refc.lock); + if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) { + /* +* No other trace/thread imc events are running in +* the system, so set the refc.id to core-imc. +*/ + imc_global_refc.id = IMC_DOMAIN_CORE; + imc_global_refc.refc++; + } else { + mutex_unlock(_global_refc.lock); + return -EBUSY; + } + mutex_unlock(_global_refc.lock); + event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK); event->destroy = core_imc_counters_release; return 0; @@ -877,7 +936,20 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu) static int ppc_thread_imc_cpu_offline(unsigned int cpu) { - mtspr(SPRN_LDBAR, 0); + /* +* Set the bit 0 of LDBAR to zero. +* +* If bit 0 of LDBAR is unset, it will stop posting +* the coune
Re: [PATCH v5 07/10] powerpc/perf: open access for CAP_PERFMON privileged process
On 1/20/20 5:00 PM, Alexey Budankov wrote: Open access to monitoring for CAP_PERFMON privileged processes. For backward compatibility reasons access to the monitoring remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure monitoring is discouraged with respect to CAP_PERFMON capability. Providing the access under CAP_PERFMON capability singly, without the rest of CAP_SYS_ADMIN credentials, excludes chances to misuse the credentials and makes the operations more secure. Signed-off-by: Alexey Budankov --- Acked-by: Anju T Sudhakar
[PATCH v2 3/5] powerpc/perf: Add an interface sub-folder to imc pmu
From: Madhavan Srinivasan Patch adds an interface attribute folder to imc pmu. This is intended to include pmu intreface capabilities which will be useful to userspace likes selftest testcases. Patch adds a "glob_lck" file to notify to userspace of global lock mechanism added to imc devices like core, thread and trace. "glob_lck" will be used by selftest file to execute interface test for the global lock mechanism. Signed-off-by: Madhavan Srinivasan --- arch/powerpc/include/asm/imc-pmu.h | 11 ++- arch/powerpc/perf/imc-pmu.c| 19 +++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index 4da4fcba0684..1b2c33c30e7c 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -87,8 +87,9 @@ struct trace_imc_data { /* Event attribute array index */ #define IMC_FORMAT_ATTR0 #define IMC_EVENT_ATTR 1 -#define IMC_CPUMASK_ATTR 2 -#define IMC_NULL_ATTR 3 +#define IMC_INTERFACE_ATTR 2 +#define IMC_CPUMASK_ATTR 3 +#define IMC_NULL_ATTR 4 /* PMU Format attribute macros */ #define IMC_EVENT_OFFSET_MASK 0xULL @@ -114,10 +115,10 @@ struct imc_pmu { /* * Attribute groups for the PMU. Slot 0 used for * format attribute, slot 1 used for cpusmask attribute, -* slot 2 used for event attribute. Slot 3 keep as -* NULL. +* slot 2 used for event attribute. Slot 3 used for interface +* attribute and Slot 4 is NULL. */ - const struct attribute_group *attr_groups[4]; + const struct attribute_group *attr_groups[5]; u32 counter_mem_size; int domain; /* diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 2e220f199530..3f49664f29f1 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -54,6 +54,24 @@ static struct imc_pmu_ref imc_global_refc = { .refc = 0, }; +static ssize_t glob_lck_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", 1); +} + +static DEVICE_ATTR_RO(glob_lck); + +static struct attribute *imc_interface_attrs[] = { + _attr_glob_lck.attr, + NULL, +}; + +static struct attribute_group imc_interface_group = { + .name = "interface", + .attrs = imc_interface_attrs, +}; + static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -1462,6 +1480,7 @@ static int update_pmu_ops(struct imc_pmu *pmu) pmu->pmu.attr_groups = pmu->attr_groups; pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; pmu->attr_groups[IMC_FORMAT_ATTR] = _format_group; + pmu->attr_groups[IMC_INTERFACE_ATTR] = _interface_group; switch (pmu->domain) { case IMC_DOMAIN_NEST: -- 2.20.1
[PATCH v2 2/5] powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events.
IMC(In-memory Collection Counters) does performance monitoring in two different modes, i.e accumulation mode(core-imc and thread-imc events), and trace mode(trace-imc events). A cpu thread can either be in accumulation-mode or trace-mode at a time and this is done via the LDBAR register in POWER architecture. The current design does not address the races between thread-imc and trace-imc events. Patch implements a global id and lock to avoid the races between core, trace and thread imc events. With this global id-lock implementation, the system can either run core, thread or trace imc events at a time. i.e. to run any core-imc events, thread/trace imc events should not be enabled/monitored. Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 177 +++- 1 file changed, 153 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index cb50a9e1fd2d..2e220f199530 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem); static struct imc_pmu_ref *trace_imc_refc; static int trace_imc_mem_size; +/* + * Global data structure used to avoid races between thread, + * core and trace-imc + */ +static struct imc_pmu_ref imc_global_refc = { + .lock = __MUTEX_INITIALIZER(imc_global_refc.lock), + .id = 0, + .refc = 0, +}; + static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -759,6 +769,20 @@ static void core_imc_counters_release(struct perf_event *event) ref->refc = 0; } mutex_unlock(>lock); + + mutex_lock(_global_refc.lock); + if (imc_global_refc.id == IMC_DOMAIN_CORE) { + imc_global_refc.refc--; + /* +* If no other thread is running any core-imc +* event, set the global id to zero. +*/ + if (imc_global_refc.refc <= 0) { + imc_global_refc.refc = 0; + imc_global_refc.id = 0; + } + } + mutex_unlock(_global_refc.lock); } static int core_imc_event_init(struct perf_event *event) @@ -779,6 +803,22 @@ static int core_imc_event_init(struct perf_event *event) if (event->cpu < 0) return -EINVAL; + /* +* Take the global lock, and make sure +* no other thread is running any trace OR thread imc event +*/ + mutex_lock(_global_refc.lock); + if (imc_global_refc.id == 0) { + imc_global_refc.id = IMC_DOMAIN_CORE; + imc_global_refc.refc++; + } else if (imc_global_refc.id == IMC_DOMAIN_CORE) { + imc_global_refc.refc++; + } else { + mutex_unlock(_global_refc.lock); + return -EBUSY; + } + mutex_unlock(_global_refc.lock); + event->hw.idx = -1; pmu = imc_event_to_pmu(event); @@ -877,7 +917,16 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu) static int ppc_thread_imc_cpu_offline(unsigned int cpu) { - mtspr(SPRN_LDBAR, 0); + /* +* Toggle the bit 0 of LDBAR. +* +* If bit 0 of LDBAR is unset, it will stop posting +* the counetr data to memory. +* For thread-imc, bit 0 of LDBAR will be set to 1 in the +* event_add function. So toggle this bit here, to stop the updates +* to memory in the cpu_offline path. +*/ + mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) ^ (1UL << 63))); return 0; } @@ -889,6 +938,24 @@ static int thread_imc_cpu_init(void) ppc_thread_imc_cpu_offline); } +static void thread_imc_counters_release(struct perf_event *event) +{ + + mutex_lock(_global_refc.lock); + if (imc_global_refc.id == IMC_DOMAIN_THREAD) { + imc_global_refc.refc--; + /* +* If no other thread is running any thread-imc +* event, set the global id to zero. +*/ + if (imc_global_refc.refc <= 0) { + imc_global_refc.refc = 0; + imc_global_refc.id = 0; + } + } + mutex_unlock(_global_refc.lock); +} + static int thread_imc_event_init(struct perf_event *event) { u32 config = event->attr.config; @@ -905,6 +972,27 @@ static int thread_imc_event_init(struct perf_event *event) if (event->hw.sample_period) return -EINVAL; + mutex_lock(_global_refc.lock); + /* +* Check if any other thread is running +* core-engine, if not set the global id to +* thread-imc. +*/ + if (imc_global_refc.id == 0) { + imc_global_refc.id = IMC_DOMAIN_THREAD; + imc_global_refc.refc++; + } else if (imc
[PATCH v2 5/5] selftest/powerpc/pmu: Testcase for imc global lock mechanism
From: Madhavan Srinivasan Signed-off-by: Madhavan Srinivasan --- .../pmu/mem_counters/imc_global_lock_test.c | 49 ++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c b/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c index ea687ffc1990..f643dba8ecc0 100644 --- a/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c +++ b/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c @@ -5,9 +5,56 @@ #include "mem_counters.h" +static bool check_imc_interface_glob_lck(void) +{ + if (!access("/sys/devices/thread_imc/interface/glob_lck", F_OK)) + return true; + + return false; +} + static int testcase(void) { - return 0; + struct event events[2]; + + if (!check_imc_interface_glob_lck()) { + printf("Test not supported\n"); + return MAGIC_SKIP_RETURN_VALUE; + } + + if (!is_mem_counters_device_enabled(CORE) || !is_mem_counters_device_enabled(THREAD)) { + printf("%s: IMC device not found. So exiting the test\n", __FUNCTION__); + return -1; + } + + if (setup_mem_counters_event(THREAD, [0], 0xe0, "thread_imc/cycles")) { + printf("%s setup_mem_counters_event for thread_imc failed\n", __FUNCTION__); + return -1; + } + + if (setup_mem_counters_event(CORE, [1], 0xe0, "core_imc/cycles")) { + printf("%s setup_mem_counters_event for core_imc failed\n", __FUNCTION__); + return -1; + } + + if (event_open([0])) { + perror("thread_imc: perf_event_open"); + return -1; + } + + /* +* If we have the Global lock patchset applied to kernel +* event_open for events[1] should fail with resource busy +*/ + if (event_open_with_cpu([1], 0)) { + /* +* Check for the errno to certify the test result +*/ + if (errno == 16) // Resource busy (EBUSY) + return 0; + } + + return -1; } static int imc_global_lock_test(void) -- 2.20.1
[PATCH v2 4/5] selftest/powerpc/pmc: Support to include interface test for Memory Counter PMUs
From: Madhavan Srinivasan Patch to add support to include interface tests for memory counter PMUs as part of selftest. These PMUs are primarily used to understand socket/chip/core resourage usage. In PowerNV envirnoment, the perf interface registered to access these counters are called "In Memory Collection" (IMC) and in PowerVM, the perf interface registered to access these counters are called "hv_24x7". New folder "mem_counters" added under selftest/powerpc/pmu. This will include interface tests for both "imc" and "hv_24x7" pmus. Patch adds base/common functioned needed. To make blame easier, a place-holder test function added to this patch. Subsequent patch will fill in the actual test content. Signed-off-by: Madhavan Srinivasan --- tools/testing/selftests/powerpc/pmu/Makefile | 7 +- .../powerpc/pmu/mem_counters/Makefile | 21 .../pmu/mem_counters/imc_global_lock_test.c | 21 .../powerpc/pmu/mem_counters/mem_counters.c | 99 +++ .../powerpc/pmu/mem_counters/mem_counters.h | 36 +++ 5 files changed, 182 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/Makefile create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.h diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index 19046db995fe..e352eceac0a9 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -8,7 +8,7 @@ EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c top_srcdir = ../../../../.. include ../../lib.mk -all: $(TEST_GEN_PROGS) ebb +all: $(TEST_GEN_PROGS) ebb mem_counters $(TEST_GEN_PROGS): $(EXTRA_SOURCES) @@ -43,4 +43,7 @@ clean: ebb: TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all -.PHONY: all run_tests clean ebb +mem_counters: + TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all + +.PHONY: all run_tests clean ebb mem_counters diff --git a/tools/testing/selftests/powerpc/pmu/mem_counters/Makefile b/tools/testing/selftests/powerpc/pmu/mem_counters/Makefile new file mode 100644 index ..f39ebe30ab70 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/mem_counters/Makefile @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0 +include ../../../../../../scripts/Kbuild.include + +noarg: + $(MAKE) -C ../../ + +CFLAGS += -m64 + +# Toolchains may build PIE by default which breaks the assembly +no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ +$(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) + +LDFLAGS += $(no-pie-option) + +TEST_GEN_PROGS := imc_global_lock_test + +top_srcdir = ../../../../../.. +include ../../../lib.mk + +$(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c ./mem_counters.c \ + imc_global_lock_test.c diff --git a/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c b/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c new file mode 100644 index ..ea687ffc1990 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2020, Madhavan Srinivasan, IBM Corp. + */ + +#include "mem_counters.h" + +static int testcase(void) +{ + return 0; +} + +static int imc_global_lock_test(void) +{ + return eat_cpu(testcase); +} + +int main(void) +{ + return test_harness(imc_global_lock_test, "imc_global_lock_test"); +} diff --git a/tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c b/tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c new file mode 100644 index ..b0ee1319f018 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2020, Madhavan Srinivasan, IBM Corp. + */ + +#include "mem_counters.h" + +/* + * mem_counters.c will contain common/basic functions + * to support testcases for both In Memory Collection (IMC) + * and hv_24x7 counters. + */ + + +/* + * Since device type enum starts with 1, + * have the first entry in the array as a placeholder. + */ +const char mem_counters_dev_path[][30] = { + "", + "/sys/devices/thread_imc", + "/sys/devices/trace_imc", + "/sys/devices/core_imc", + "/sys/devices/hv_24x7", + "", +}; + +const char mem_counters_dev_type_path[][35] = { + "", + "/sys/devices/thread_imc/type", + "/sys/devices/trace_imc/type", +
[PATCH v2 1/5] powerpc/powernv: Re-enable imc trace-mode in kernel
commit <249fad734a25> ""powerpc/perf: Disable trace_imc pmu" disables IMC(In-Memory Collection) trace-mode in kernel, since frequent mode switching between accumulation mode and trace mode via the spr LDBAR in the hardware can trigger a checkstop(system crash). Patch to re-enable imc-trace mode in kernel. The following patch in this series will address the mode switching issue by implementing a global lock, and will restrict the usage of accumulation and trace-mode at a time. Signed-off-by: Anju T Sudhakar --- arch/powerpc/platforms/powernv/opal-imc.c | 9 + 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 000b350d4060..3b4518f4b643 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -278,14 +278,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev) domain = IMC_DOMAIN_THREAD; break; case IMC_TYPE_TRACE: - /* -* FIXME. Using trace_imc events to monitor application -* or KVM thread performance can cause a checkstop -* (system crash). -* Disable it for now. -*/ - pr_info_once("IMC: disabling trace_imc PMU\n"); - domain = -1; + domain = IMC_DOMAIN_TRACE; break; default: pr_warn("IMC Unknown Device type \n"); -- 2.20.1
[PATCH v2 0/5] Re-enable IMC trace-mode
commit <249fad734a25> ""powerpc/perf: Disable trace_imc pmu" disables IMC(In-Memory Collection) trace-mode in kernel, since frequent mode switching between accumulation mode and trace mode via the spr LDBAR in the hardware can trigger a checkstop(system crash). This patch series re-enables IMC trace mode and fixes the mode switching issue by global lock mechanism. Patch 3/5,4/5 and 5/5 provides a selftest to verify the global-lock mechanism. Changes from v1 -> v2: - - Added self test patches to the series. Anju T Sudhakar (2): powerpc/powernv: Re-enable imc trace-mode in kernel powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events. Madhavan Srinivasan (3): powerpc/perf: Add an interface sub-folder to imc pmu selftest/powerpc/pmc: Support to include interface test for Memory Counter PMUs selftest/powerpc/pmu: Testcase for imc global lock mechanism arch/powerpc/include/asm/imc-pmu.h| 11 +- arch/powerpc/perf/imc-pmu.c | 196 +++--- arch/powerpc/platforms/powernv/opal-imc.c | 9 +- tools/testing/selftests/powerpc/pmu/Makefile | 7 +- .../powerpc/pmu/mem_counters/Makefile | 21 ++ .../pmu/mem_counters/imc_global_lock_test.c | 68 ++ .../powerpc/pmu/mem_counters/mem_counters.c | 99 + .../powerpc/pmu/mem_counters/mem_counters.h | 36 8 files changed, 408 insertions(+), 39 deletions(-) create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/Makefile create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/imc_global_lock_test.c create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.c create mode 100644 tools/testing/selftests/powerpc/pmu/mem_counters/mem_counters.h -- 2.18.1
[PATCH 2/2] powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events.
IMC(In-memory Collection Counters) does performance monitoring in two different modes, i.e accumulation mode(core-imc and thread-imc events), and trace mode(trace-imc events). A cpu thread can either be in accumulation-mode or trace-mode at a time and this is done via the LDBAR register in POWER architecture. The current design does not address the races between thread-imc and trace-imc events. Patch implements a global id and lock to avoid the races between core, trace and thread imc events. With this global id-lock implementation, the system can either run core, thread or trace imc events at a time. i.e. to run any core-imc events, thread/trace imc events should not be enabled/monitored. Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 177 +++- 1 file changed, 153 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index cb50a9e1fd2d..2e220f199530 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem); static struct imc_pmu_ref *trace_imc_refc; static int trace_imc_mem_size; +/* + * Global data structure used to avoid races between thread, + * core and trace-imc + */ +static struct imc_pmu_ref imc_global_refc = { + .lock = __MUTEX_INITIALIZER(imc_global_refc.lock), + .id = 0, + .refc = 0, +}; + static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -759,6 +769,20 @@ static void core_imc_counters_release(struct perf_event *event) ref->refc = 0; } mutex_unlock(>lock); + + mutex_lock(_global_refc.lock); + if (imc_global_refc.id == IMC_DOMAIN_CORE) { + imc_global_refc.refc--; + /* +* If no other thread is running any core-imc +* event, set the global id to zero. +*/ + if (imc_global_refc.refc <= 0) { + imc_global_refc.refc = 0; + imc_global_refc.id = 0; + } + } + mutex_unlock(_global_refc.lock); } static int core_imc_event_init(struct perf_event *event) @@ -779,6 +803,22 @@ static int core_imc_event_init(struct perf_event *event) if (event->cpu < 0) return -EINVAL; + /* +* Take the global lock, and make sure +* no other thread is running any trace OR thread imc event +*/ + mutex_lock(_global_refc.lock); + if (imc_global_refc.id == 0) { + imc_global_refc.id = IMC_DOMAIN_CORE; + imc_global_refc.refc++; + } else if (imc_global_refc.id == IMC_DOMAIN_CORE) { + imc_global_refc.refc++; + } else { + mutex_unlock(_global_refc.lock); + return -EBUSY; + } + mutex_unlock(_global_refc.lock); + event->hw.idx = -1; pmu = imc_event_to_pmu(event); @@ -877,7 +917,16 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu) static int ppc_thread_imc_cpu_offline(unsigned int cpu) { - mtspr(SPRN_LDBAR, 0); + /* +* Toggle the bit 0 of LDBAR. +* +* If bit 0 of LDBAR is unset, it will stop posting +* the counetr data to memory. +* For thread-imc, bit 0 of LDBAR will be set to 1 in the +* event_add function. So toggle this bit here, to stop the updates +* to memory in the cpu_offline path. +*/ + mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) ^ (1UL << 63))); return 0; } @@ -889,6 +938,24 @@ static int thread_imc_cpu_init(void) ppc_thread_imc_cpu_offline); } +static void thread_imc_counters_release(struct perf_event *event) +{ + + mutex_lock(_global_refc.lock); + if (imc_global_refc.id == IMC_DOMAIN_THREAD) { + imc_global_refc.refc--; + /* +* If no other thread is running any thread-imc +* event, set the global id to zero. +*/ + if (imc_global_refc.refc <= 0) { + imc_global_refc.refc = 0; + imc_global_refc.id = 0; + } + } + mutex_unlock(_global_refc.lock); +} + static int thread_imc_event_init(struct perf_event *event) { u32 config = event->attr.config; @@ -905,6 +972,27 @@ static int thread_imc_event_init(struct perf_event *event) if (event->hw.sample_period) return -EINVAL; + mutex_lock(_global_refc.lock); + /* +* Check if any other thread is running +* core-engine, if not set the global id to +* thread-imc. +*/ + if (imc_global_refc.id == 0) { + imc_global_refc.id = IMC_DOMAIN_THREAD; + imc_global_refc.refc++; + } else if (imc
[PATCH 1/2] powerpc/powernv: Re-enable imc trace-mode in kernel
commit <249fad734a25> ""powerpc/perf: Disable trace_imc pmu" disables IMC(In-Memory Collection) trace-mode in kernel, since frequent mode switching between accumulation mode and trace mode via the spr LDBAR in the hardware can trigger a checkstop(system crash). Patch to re-enable imc-trace mode in kernel. The following patch in this series will address the mode switching issue by implementing a global lock, and will restrict the usage of accumulation and trace-mode at a time. Signed-off-by: Anju T Sudhakar --- arch/powerpc/platforms/powernv/opal-imc.c | 9 + 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 000b350d4060..3b4518f4b643 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -278,14 +278,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev) domain = IMC_DOMAIN_THREAD; break; case IMC_TYPE_TRACE: - /* -* FIXME. Using trace_imc events to monitor application -* or KVM thread performance can cause a checkstop -* (system crash). -* Disable it for now. -*/ - pr_info_once("IMC: disabling trace_imc PMU\n"); - domain = -1; + domain = IMC_DOMAIN_TRACE; break; default: pr_warn("IMC Unknown Device type \n"); -- 2.20.1
[PATCH v3] platforms/powernv: Avoid re-registration of imc debugfs directory
export_imc_mode_and_cmd() function which creates the debugfs interface for imc-mode and imc-command, is invoked when each nest pmu units is registered. When the first nest pmu unit is registered, export_imc_mode_and_cmd() creates 'imc' directory under `/debug/powerpc/`. In the subsequent invocations debugfs_create_dir() function returns, since the directory already exists. The recent commit (debugfs: make error message a bit more verbose), throws a warning if we try to invoke `debugfs_create_dir()` with an already existing directory name. Address this warning by making the debugfs directory registration in the opal_imc_counters_probe() function, i.e invoke export_imc_mode_and_cmd() function from the probe function. Signed-off-by: Anju T Sudhakar --- Changes from v2 -> v3: * Invoke export_imc_mode_and_cmd(), which does the imc debugfs directory registration and deletion, from the probe fucntion. * Change the return type of imc_pmu_create() to get the control block address for nest units in the probe function * Remove unnecessary comments --- arch/powerpc/platforms/powernv/opal-imc.c | 39 +-- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index e04b206..3b4518f 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -59,10 +59,6 @@ static void export_imc_mode_and_cmd(struct device_node *node, imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); - /* -* Return here, either because 'imc' directory already exists, -* Or failed to create a new one. -*/ if (!imc_debugfs_parent) return; @@ -135,7 +131,6 @@ static int imc_get_mem_addr_nest(struct device_node *node, } pmu_ptr->imc_counter_mmaped = true; - export_imc_mode_and_cmd(node, pmu_ptr); kfree(base_addr_arr); kfree(chipid_arr); return 0; @@ -151,7 +146,7 @@ static int imc_get_mem_addr_nest(struct device_node *node, * and domain as the inputs. * Allocates memory for the struct imc_pmu, sets up its domain, size and offsets */ -static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) +static struct imc_pmu *imc_pmu_create(struct device_node *parent, int pmu_index, int domain) { int ret = 0; struct imc_pmu *pmu_ptr; @@ -159,27 +154,23 @@ static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) /* Return for unknown domain */ if (domain < 0) - return -EINVAL; + return NULL; /* memory for pmu */ pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL); if (!pmu_ptr) - return -ENOMEM; + return NULL; /* Set the domain */ pmu_ptr->domain = domain; ret = of_property_read_u32(parent, "size", _ptr->counter_mem_size); - if (ret) { - ret = -EINVAL; + if (ret) goto free_pmu; - } if (!of_property_read_u32(parent, "offset", )) { - if (imc_get_mem_addr_nest(parent, pmu_ptr, offset)) { - ret = -EINVAL; + if (imc_get_mem_addr_nest(parent, pmu_ptr, offset)) goto free_pmu; - } } /* Function to register IMC pmu */ @@ -190,14 +181,14 @@ static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) if (pmu_ptr->domain == IMC_DOMAIN_NEST) kfree(pmu_ptr->mem_info); kfree(pmu_ptr); - return ret; + return NULL; } - return 0; + return pmu_ptr; free_pmu: kfree(pmu_ptr); - return ret; + return NULL; } static void disable_nest_pmu_counters(void) @@ -254,6 +245,7 @@ int get_max_nest_dev(void) static int opal_imc_counters_probe(struct platform_device *pdev) { struct device_node *imc_dev = pdev->dev.of_node; + struct imc_pmu *pmu; int pmu_count = 0, domain; bool core_imc_reg = false, thread_imc_reg = false; u32 type; @@ -269,6 +261,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev) } for_each_compatible_node(imc_dev, NULL, IMC_DTB_UNIT_COMPAT) { + pmu = NULL; if (of_property_read_u32(imc_dev, "type", )) { pr_warn("IMC Device without type property\n"); continue; @@ -293,9 +286,13 @@ static int opal_imc_counters_probe(struct platform_device *pdev) break; } - if (!imc_pmu_create(imc_dev, pmu_count, domain)) { - if (domain == IMC_DO
[tip: perf/urgent] perf kvm: Move kvm-stat header file from conditional inclusion to common include section
The following commit has been merged into the perf/urgent branch of tip: Commit-ID: 8067b3da970baa12e6045400fdf009673b8dd3c2 Gitweb: https://git.kernel.org/tip/8067b3da970baa12e6045400fdf009673b8dd3c2 Author:Anju T Sudhakar AuthorDate:Thu, 18 Jul 2019 23:47:47 +05:30 Committer: Arnaldo Carvalho de Melo CommitterDate: Fri, 20 Sep 2019 10:28:26 -03:00 perf kvm: Move kvm-stat header file from conditional inclusion to common include section Move kvm-stat header file to the common include section, and make the definitions in the header file under the conditional inclusion `#ifdef HAVE_KVM_STAT_SUPPORT`. This helps to define other 'perf kvm' related function prototypes in kvm-stat header file, which may not need kvm-stat support. Signed-off-by: Anju T Sudhakar Reviewed-By: Ravi Bangoria Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Namhyung Kim Cc: Peter Zijlstra Cc: linuxppc-dev@lists.ozlabs.org Link: http://lore.kernel.org/lkml/20190718181749.30612-1-a...@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kvm.c | 2 +- tools/perf/util/kvm-stat.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index ac6d6e0..2b822be 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -21,6 +21,7 @@ #include "util/top.h" #include "util/data.h" #include "util/ordered-events.h" +#include "util/kvm-stat.h" #include "ui/ui.h" #include @@ -59,7 +60,6 @@ static const char *get_filename_for_perf_kvm(void) } #ifdef HAVE_KVM_STAT_SUPPORT -#include "util/kvm-stat.h" void exit_event_get_key(struct evsel *evsel, struct perf_sample *sample, diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h index 4691363..8fd6ec2 100644 --- a/tools/perf/util/kvm-stat.h +++ b/tools/perf/util/kvm-stat.h @@ -2,6 +2,8 @@ #ifndef __PERF_KVM_STAT_H #define __PERF_KVM_STAT_H +#ifdef HAVE_KVM_STAT_SUPPORT + #include "tool.h" #include "stat.h" #include "record.h" @@ -144,5 +146,6 @@ extern const int decode_str_len; extern const char *kvm_exit_reason; extern const char *kvm_entry_trace; extern const char *kvm_exit_trace; +#endif /* HAVE_KVM_STAT_SUPPORT */ #endif /* __PERF_KVM_STAT_H */
[tip: perf/urgent] perf kvm: Add arch neutral function to choose event for perf kvm record
The following commit has been merged into the perf/urgent branch of tip: Commit-ID: 124eb5f82bf9395419b20205c4dcc1b8fcda7f29 Gitweb: https://git.kernel.org/tip/124eb5f82bf9395419b20205c4dcc1b8fcda7f29 Author:Anju T Sudhakar AuthorDate:Thu, 18 Jul 2019 23:47:48 +05:30 Committer: Arnaldo Carvalho de Melo CommitterDate: Fri, 20 Sep 2019 10:28:26 -03:00 perf kvm: Add arch neutral function to choose event for perf kvm record 'perf kvm record' uses 'cycles'(if the user did not specify any event) as the default event to profile the guest. This will not provide any proper samples from the guest incase of powerpc architecture, since in powerpc the PMUs are controlled by the guest rather than the host. Patch adds a function to pick an arch specific event for 'perf kvm record', instead of selecting 'cycles' as a default event for all architectures. For powerpc this function checks for any user specified event, and if there isn't any it returns invalid instead of proceeding with 'cycles' event. Signed-off-by: Anju T Sudhakar Reviewed-by: Ravi Bangoria Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Namhyung Kim Cc: Peter Zijlstra Cc: linuxppc-dev@lists.ozlabs.org Link: http://lore.kernel.org/lkml/20190718181749.30612-2-a...@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/util/kvm-stat.c | 37 - tools/perf/builtin-kvm.c| 12 +++- tools/perf/util/kvm-stat.h | 1 +- 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index f0dbf7b..ec5b771 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -8,6 +8,7 @@ #include "book3s_hv_exits.h" #include "book3s_hcalls.h" +#include #define NR_TPS 4 @@ -172,3 +173,39 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused) return ret; } + +/* + * Incase of powerpc architecture, pmu registers are programmable + * by guest kernel. So monitoring guest via host may not provide + * valid samples. It is better to fail the "perf kvm record" + * with default "cycles" event to monitor guest in powerpc. + * + * Function to parse the arguments and return appropriate values. + */ +int kvm_add_default_arch_event(int *argc, const char **argv) +{ + const char **tmp; + bool event = false; + int i, j = *argc; + + const struct option event_options[] = { + OPT_BOOLEAN('e', "event", , NULL), + OPT_END() + }; + + tmp = calloc(j + 1, sizeof(char *)); + if (!tmp) + return -EINVAL; + + for (i = 0; i < j; i++) + tmp[i] = argv[i]; + + parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN); + if (!event) { + free(tmp); + return -EINVAL; + } + + free(tmp); + return 0; +} diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 2b822be..6e3e366 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1514,11 +1514,21 @@ perf_stat: } #endif /* HAVE_KVM_STAT_SUPPORT */ +int __weak kvm_add_default_arch_event(int *argc __maybe_unused, + const char **argv __maybe_unused) +{ + return 0; +} + static int __cmd_record(const char *file_name, int argc, const char **argv) { - int rec_argc, i = 0, j; + int rec_argc, i = 0, j, ret; const char **rec_argv; + ret = kvm_add_default_arch_event(, argv); + if (ret) + return -EINVAL; + rec_argc = argc + 2; rec_argv = calloc(rec_argc + 1, sizeof(char *)); rec_argv[i++] = strdup("record"); diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h index 8fd6ec2..6f0fa05 100644 --- a/tools/perf/util/kvm-stat.h +++ b/tools/perf/util/kvm-stat.h @@ -148,4 +148,5 @@ extern const char *kvm_entry_trace; extern const char *kvm_exit_trace; #endif /* HAVE_KVM_STAT_SUPPORT */ +extern int kvm_add_default_arch_event(int *argc, const char **argv); #endif /* __PERF_KVM_STAT_H */
[tip: perf/urgent] perf kvm stat: Set 'trace_cycles' as default event for 'perf kvm record' in powerpc
The following commit has been merged into the perf/urgent branch of tip: Commit-ID: 2bff2b828502b5e5d5ea5a52643d3542053df03f Gitweb: https://git.kernel.org/tip/2bff2b828502b5e5d5ea5a52643d3542053df03f Author:Anju T Sudhakar AuthorDate:Thu, 18 Jul 2019 23:47:49 +05:30 Committer: Arnaldo Carvalho de Melo CommitterDate: Fri, 20 Sep 2019 10:28:26 -03:00 perf kvm stat: Set 'trace_cycles' as default event for 'perf kvm record' in powerpc Use 'trace_imc/trace_cycles' as the default event for 'perf kvm record' in powerpc. Signed-off-by: Anju T Sudhakar Reviewed-by: Ravi Bangoria Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Namhyung Kim Cc: Peter Zijlstra Cc: linuxppc-dev@lists.ozlabs.org Link: http://lore.kernel.org/lkml/20190718181749.30612-3-a...@linux.vnet.ibm.com [ Add missing pmu.h header, needed because this patch uses pmu_have_event() ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/util/kvm-stat.c | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index ec5b771..9cc1c4a 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -5,6 +5,7 @@ #include "util/debug.h" #include "util/evsel.h" #include "util/evlist.h" +#include "util/pmu.h" #include "book3s_hv_exits.h" #include "book3s_hcalls.h" @@ -177,8 +178,9 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused) /* * Incase of powerpc architecture, pmu registers are programmable * by guest kernel. So monitoring guest via host may not provide - * valid samples. It is better to fail the "perf kvm record" - * with default "cycles" event to monitor guest in powerpc. + * valid samples with default 'cycles' event. It is better to use + * 'trace_imc/trace_cycles' event for guest profiling, since it + * can track the guest instruction pointer in the trace-record. * * Function to parse the arguments and return appropriate values. */ @@ -202,8 +204,14 @@ int kvm_add_default_arch_event(int *argc, const char **argv) parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN); if (!event) { - free(tmp); - return -EINVAL; + if (pmu_have_event("trace_imc", "trace_cycles")) { + argv[j++] = strdup("-e"); + argv[j++] = strdup("trace_imc/trace_cycles/"); + *argc += 2; + } else { + free(tmp); + return -EINVAL; + } } free(tmp);
[PATCH v2] platform/powernv: Avoid re-registration of imc debugfs directory
export_imc_mode_and_cmd() function which creates the debugfs interface for imc-mode and imc-command, is invoked when each nest pmu units is registered. When the first nest pmu unit is registered, export_imc_mode_and_cmd() creates 'imc' directory under `/debug/powerpc/`. In the subsequent invocations debugfs_create_dir() function returns, since the directory already exists. The recent commit (debugfs: make error message a bit more verbose), throws a warning if we try to invoke `debugfs_create_dir()` with an already existing directory name. Address this warning by searching for an existing 'imc' directory, and do not invoke debugfs_create_dir(), if the debugfs interface for imc already exists. This patch is based on: https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-August/195898.html Signed-off-by: Anju T Sudhakar Tested-by: Nageswara R Sastry --- Changes from v1 -> v2 * Minor changes in the commit message. --- arch/powerpc/platforms/powernv/opal-imc.c | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index e04b20625cb9..fc2f0e60a44d 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -55,14 +55,19 @@ static void export_imc_mode_and_cmd(struct device_node *node, static u64 loc, *imc_mode_addr, *imc_cmd_addr; char mode[16], cmd[16]; u32 cb_offset; + struct dentry *dir = NULL; struct imc_mem_info *ptr = pmu_ptr->mem_info; + + /* Return, if 'imc' interface already exists */ + dir = debugfs_lookup("imc", powerpc_debugfs_root); + if (dir) { + dput(dir); + return; + } imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); - /* -* Return here, either because 'imc' directory already exists, -* Or failed to create a new one. -*/ + /* Return here, if failed to create the directory */ if (!imc_debugfs_parent) return; -- 2.20.1
Re: [PATCH] platform/powernv: Avoid re-registration of imc debugfs directory
Hi, On 8/21/19 10:16 AM, Oliver O'Halloran wrote: On Wed, Aug 21, 2019 at 2:10 PM Anju T Sudhakar wrote: export_imc_mode_and_cmd() function which creates the debugfs interface for imc-mode and imc-command, is invoked when each nest pmu units is registered. When the first nest pmu unit is registered, export_imc_mode_and_cmd() creates 'imc' directory under `/debug/powerpc`. In the subsequent invocations debugfs_create_dir() function returns, since the directory already exists. The recent commit (debugfs: make error message a bit more verbose), throws a warning if we try to invoke `debugfs_create_dir()` with an already existing directory name. Address this warning by lookup for an existing 'imc' directory, and do not invoke debugfs_create_dir(), if the debugfs interface for imc already exists. This patch is based on: https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-July/192979.html Signed-off-by: Anju T Sudhakar Tested-by: Nageswara R Sastry --- arch/powerpc/platforms/powernv/opal-imc.c | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index e04b20625cb9..fc2f0e60a44d 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -55,14 +55,19 @@ static void export_imc_mode_and_cmd(struct device_node *node, static u64 loc, *imc_mode_addr, *imc_cmd_addr; char mode[16], cmd[16]; u32 cb_offset; + struct dentry *dir = NULL; struct imc_mem_info *ptr = pmu_ptr->mem_info; + + /* Return, if 'imc' interface already exists */ + dir = debugfs_lookup("imc", powerpc_debugfs_root); + if (dir) { + dput(dir); + return; + } imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); Is there a reason why we create the debugfs directory here and not in opal_imc_counters_probe()? There's logic to remove the debugfs directory in _probe() already so it seems like a more natural place to it. Good point. But we can only create the parent directory, i.e 'imc' directory in `_probe()` function and the entries can be created only here. The reason is, this debugfs entries are only for IMC nest units. So, to get the imc mode and command values from the nest memory region we need the relevant offsets from the control block structure. Since imc_get_mem_addr_nest() function reads this address for each chip, we invoke the function to create the debugfs entries after this values are populated(i.e export_imc_mode_and_cmd() in invoked by imc_get_mem_addr_nest()). Also, if we create the parent directory in `_probe()` function, we need to track whether the entries(i.e imc_cmd and imc_mode) are created or not. Regards, Anju
[PATCH] platform/powernv: Avoid re-registration of imc debugfs directory
export_imc_mode_and_cmd() function which creates the debugfs interface for imc-mode and imc-command, is invoked when each nest pmu units is registered. When the first nest pmu unit is registered, export_imc_mode_and_cmd() creates 'imc' directory under `/debug/powerpc`. In the subsequent invocations debugfs_create_dir() function returns, since the directory already exists. The recent commit (debugfs: make error message a bit more verbose), throws a warning if we try to invoke `debugfs_create_dir()` with an already existing directory name. Address this warning by lookup for an existing 'imc' directory, and do not invoke debugfs_create_dir(), if the debugfs interface for imc already exists. This patch is based on: https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-July/192979.html Signed-off-by: Anju T Sudhakar Tested-by: Nageswara R Sastry --- arch/powerpc/platforms/powernv/opal-imc.c | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index e04b20625cb9..fc2f0e60a44d 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -55,14 +55,19 @@ static void export_imc_mode_and_cmd(struct device_node *node, static u64 loc, *imc_mode_addr, *imc_cmd_addr; char mode[16], cmd[16]; u32 cb_offset; + struct dentry *dir = NULL; struct imc_mem_info *ptr = pmu_ptr->mem_info; + + /* Return, if 'imc' interface already exists */ + dir = debugfs_lookup("imc", powerpc_debugfs_root); + if (dir) { + dput(dir); + return; + } imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); - /* -* Return here, either because 'imc' directory already exists, -* Or failed to create a new one. -*/ + /* Return here, if failed to create the directory */ if (!imc_debugfs_parent) return; -- 2.20.1
Re: [RFC PATCH 3/4] powerpc/perf: fix imc allocation failure
On 7/22/19 11:16 PM, Nicholas Piggin wrote: alloc_pages_node return value should be tested before applying page_address. Cc: Anju T Sudhakar Cc: Madhavan Srinivasan Signed-off-by: Nicholas Piggin --- Tested-by: Anju T Sudhakar
Re: [PATCH v2] powerpc/imc: Dont create debugfs files for cpu-less nodes
Hi Qian, On 7/16/19 12:11 AM, Qian Cai wrote: On Thu, 2019-07-11 at 14:53 +1000, Michael Ellerman wrote: Hi Maddy, Madhavan Srinivasan writes: diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 186109bdd41b..e04b20625cb9 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -69,20 +69,20 @@ static void export_imc_mode_and_cmd(struct device_node *node, if (of_property_read_u32(node, "cb_offset", _offset)) cb_offset = IMC_CNTL_BLK_OFFSET; - for_each_node(nid) { - loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset; + while (ptr->vbase != NULL) { This means you'll bail out as soon as you find a node with no vbase, but it's possible we could have a CPU-less node intermingled with other nodes. So I think you want to keep the for loop, but continue if you see a NULL vbase? Not sure if this will also takes care of some of those messages during the boot on today's linux-next even without this patch. [ 18.077780][T1] debugfs: Directory 'imc' with parent 'powerpc' already present! This is introduced by a recent commit: c33d442328f55 (debugfs: make error message a bit more verbose). So basically, the debugfs imc_* file is created per node, and is created by the first nest unit which is being registered. For the subsequent nest units, debugfs_create_dir() will just return since the imc_* file already exist. The commit "c33d442328f55 (debugfs: make error message a bit more verbose)", prints a message if the debugfs file already exists in debugfs_create_dir(). That is why we are encountering these messages now. This patch (i.e, powerpc/imc: Dont create debugfs files for cpu-less nodes) will address the initial issue, i.e "numa crash while reading imc_* debugfs files for cpu less nodes", and will not address these debugfs messages. But yeah this is a good catch. We can have some checks to avoid these debugfs messages. Hi Michael, Do we need to have a separate patch to address these debugfs messages, or can we address the same in the next version of this patch itself? Thanks, Anju
[PATCH v2 3/3] tools/perf: Set 'trace_cycles' as defaultevent for perf kvm record in powerpc
Use 'trace_imc/trace_cycles' as the default event for 'perf kvm record' in powerpc. Signed-off-by: Anju T Sudhakar --- tools/perf/arch/powerpc/util/kvm-stat.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index c55e7405940e..0a06626fb18a 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -177,8 +177,9 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused) /* * Incase of powerpc architecture, pmu registers are programmable * by guest kernel. So monitoring guest via host may not provide - * valid samples. It is better to fail the "perf kvm record" - * with default "cycles" event to monitor guest in powerpc. + * valid samples with default 'cycles' event. It is better to use + * 'trace_imc/trace_cycles' event for guest profiling, since it + * can track the guest instruction pointer in the trace-record. * * Function to parse the arguments and return appropriate values. */ @@ -202,8 +203,14 @@ int kvm_add_default_arch_event(int *argc, const char **argv) parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN); if (!event) { - free(tmp); - return -EINVAL; + if (pmu_have_event("trace_imc", "trace_cycles")) { + argv[j++] = strdup("-e"); + argv[j++] = strdup("trace_imc/trace_cycles/"); + *argc += 2; + } else { + free(tmp); + return -EINVAL; + } } free(tmp); -- 2.20.1
[PATCH v2 2/3] tools/perf: Add arch neutral function to choose event for perf kvm record
'perf kvm record' uses 'cycles'(if the user did not specify any event) as the default event to profile the guest. This will not provide any proper samples from the guest incase of powerpc architecture, since in powerpc the PMUs are controlled by the guest rather than the host. Patch adds a function to pick an arch specific event for 'perf kvm record', instead of selecting 'cycles' as a default event for all architectures. For powerpc this function checks for any user specified event, and if there isn't any it returns invalid instead of proceeding with 'cycles' event. Signed-off-by: Anju T Sudhakar --- Changes from v1->v2 * Cross-build issue for aarch64, reported by Ravi is fixed. --- tools/perf/arch/powerpc/util/kvm-stat.c | 37 + tools/perf/builtin-kvm.c| 12 +++- tools/perf/util/kvm-stat.h | 1 + 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index f9db341c47b6..c55e7405940e 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -8,6 +8,7 @@ #include "book3s_hv_exits.h" #include "book3s_hcalls.h" +#include #define NR_TPS 4 @@ -172,3 +173,39 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused) return ret; } + +/* + * Incase of powerpc architecture, pmu registers are programmable + * by guest kernel. So monitoring guest via host may not provide + * valid samples. It is better to fail the "perf kvm record" + * with default "cycles" event to monitor guest in powerpc. + * + * Function to parse the arguments and return appropriate values. + */ +int kvm_add_default_arch_event(int *argc, const char **argv) +{ + const char **tmp; + bool event = false; + int i, j = *argc; + + const struct option event_options[] = { + OPT_BOOLEAN('e', "event", , NULL), + OPT_END() + }; + + tmp = calloc(j + 1, sizeof(char *)); + if (!tmp) + return -EINVAL; + + for (i = 0; i < j; i++) + tmp[i] = argv[i]; + + parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN); + if (!event) { + free(tmp); + return -EINVAL; + } + + free(tmp); + return 0; +} diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 5d2b34d290a3..d03750da051b 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1510,11 +1510,21 @@ static int kvm_cmd_stat(const char *file_name, int argc, const char **argv) } #endif /* HAVE_KVM_STAT_SUPPORT */ +int __weak kvm_add_default_arch_event(int *argc __maybe_unused, + const char **argv __maybe_unused) +{ + return 0; +} + static int __cmd_record(const char *file_name, int argc, const char **argv) { - int rec_argc, i = 0, j; + int rec_argc, i = 0, j, ret; const char **rec_argv; + ret = kvm_add_default_arch_event(, argv); + if (ret) + return -EINVAL; + rec_argc = argc + 2; rec_argv = calloc(rec_argc + 1, sizeof(char *)); rec_argv[i++] = strdup("record"); diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h index b3b2670e1a2b..81a5bf4fbc71 100644 --- a/tools/perf/util/kvm-stat.h +++ b/tools/perf/util/kvm-stat.h @@ -148,4 +148,5 @@ extern const char *kvm_entry_trace; extern const char *kvm_exit_trace; #endif /* HAVE_KVM_STAT_SUPPORT */ +extern int kvm_add_default_arch_event(int *argc, const char **argv); #endif /* __PERF_KVM_STAT_H */ -- 2.20.1
[PATCH v2 1/3] tools/perf: Move kvm-stat header file from conditional inclusion to common include section
Move kvm-stat header file to the common include section, and make the definitions in the header file under the conditional inclusion `#ifdef HAVE_KVM_STAT_SUPPORT`. This helps to define other perf kvm related function prototypes in kvm-stat header file, which may not need kvm-stat support. Signed-off-by: Anju T Sudhakar --- tools/perf/builtin-kvm.c | 2 +- tools/perf/util/kvm-stat.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index b33c83489120..5d2b34d290a3 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -19,6 +19,7 @@ #include "util/top.h" #include "util/data.h" #include "util/ordered-events.h" +#include "util/kvm-stat.h" #include #ifdef HAVE_TIMERFD_SUPPORT @@ -55,7 +56,6 @@ static const char *get_filename_for_perf_kvm(void) } #ifdef HAVE_KVM_STAT_SUPPORT -#include "util/kvm-stat.h" void exit_event_get_key(struct perf_evsel *evsel, struct perf_sample *sample, diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h index 1403dec189b4..b3b2670e1a2b 100644 --- a/tools/perf/util/kvm-stat.h +++ b/tools/perf/util/kvm-stat.h @@ -2,6 +2,8 @@ #ifndef __PERF_KVM_STAT_H #define __PERF_KVM_STAT_H +#ifdef HAVE_KVM_STAT_SUPPORT + #include "../perf.h" #include "tool.h" #include "stat.h" @@ -144,5 +146,6 @@ extern const int decode_str_len; extern const char *kvm_exit_reason; extern const char *kvm_entry_trace; extern const char *kvm_exit_trace; +#endif /* HAVE_KVM_STAT_SUPPORT */ #endif /* __PERF_KVM_STAT_H */ -- 2.20.1
Re: power9 NUMA crash while reading debugfs imc_cmd
On 6/28/19 9:04 AM, Qian Cai wrote: On Jun 27, 2019, at 11:12 PM, Michael Ellerman wrote: Qian Cai writes: Read of debugfs imc_cmd file for a memory-less node will trigger a crash below on this power9 machine which has the following NUMA layout. What type of machine is it? description: PowerNV product: 8335-GTH (ibm,witherspoon) vendor: IBM width: 64 bits capabilities: smp powernv opal Hi Qian Cai, Could you please try with this patch: https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-June/192803.html and see if the issue is resolved? Thanks, Anju
[PATCH] powerpc/imc: Dont create debugfs files for cpu-less nodes
From: Madhavan Srinivasan Commit <684d984038aa> ('powerpc/powernv: Add debugfs interface for imc-mode and imc') added debugfs interface for the nest imc pmu devices to support changing of different ucode modes. Primarily adding this capability for debug. But when doing so, the code did not consider the case of cpu-less nodes. So when reading the _cmd_ or _mode_ file of a cpu-less node will create this crash. [ 1139.415461][ T5301] Faulting instruction address: 0xc00d0d58 [ 1139.415492][ T5301] Oops: Kernel access of bad area, sig: 11 [#1] [ 1139.415509][ T5301] LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=256 DEBUG_PAGEALLOC NUMA PowerNV [ 1139.415542][ T5301] Modules linked in: i2c_opal i2c_core ip_tables x_tables xfs sd_mod bnx2x mdio ahci libahci tg3 libphy libata firmware_class dm_mirror dm_region_hash dm_log dm_mod [ 1139.415595][ T5301] CPU: 67 PID: 5301 Comm: cat Not tainted 5.2.0-rc6-next- 20190627+ #19 [ 1139.415634][ T5301] NIP: c00d0d58 LR: c049aa18 CTR:c00d0d50 [ 1139.415675][ T5301] REGS: c00020194548f9e0 TRAP: 0300 Not tainted (5.2.0-rc6-next-20190627+) [ 1139.415705][ T5301] MSR: 90009033 CR:28022822 XER: [ 1139.415777][ T5301] CFAR: c049aa14 DAR: 0003fc08 DSISR:4000 IRQMASK: 0 [ 1139.415777][ T5301] GPR00: c049aa18 c00020194548fc70 c16f8b03fc08 [ 1139.415777][ T5301] GPR04: c00020194548fcd0 14884e7300011eaa [ 1139.415777][ T5301] GPR08: 7eea5a52 c00d0d50 [ 1139.415777][ T5301] GPR12: c00d0d50 c000201fff7f8c00 [ 1139.415777][ T5301] GPR16: 000d 7fffeb0c3368 [ 1139.415777][ T5301] GPR20: 0002 [ 1139.415777][ T5301] GPR24: 000200010ec9 [ 1139.415777][ T5301] GPR28: c00020194548fdf0 c00020049a584ef8 c00020049a584ea8 [ 1139.416116][ T5301] NIP [c00d0d58] imc_mem_get+0x8/0x20 [ 1139.416143][ T5301] LR [c049aa18] simple_attr_read+0x118/0x170 [ 1139.416158][ T5301] Call Trace: [ 1139.416182][ T5301] [c00020194548fc70] [c049a970]simple_attr_read+0x70/0x170 (unreliable) [ 1139.416255][ T5301] [c00020194548fd10] [c054385c]debugfs_attr_read+0x6c/0xb0 [ 1139.416305][ T5301] [c00020194548fd60] [c0454c1c]__vfs_read+0x3c/0x70 [ 1139.416363][ T5301] [c00020194548fd80] [c0454d0c] vfs_read+0xbc/0x1a0 [ 1139.416392][ T5301] [c00020194548fdd0] [c045519c]ksys_read+0x7c/0x140 [ 1139.416434][ T5301] [c00020194548fe20] [c000b108]system_call+0x5c/0x70 [ 1139.416473][ T5301] Instruction dump: [ 1139.416511][ T5301] 4e800020 6000 7c0802a6 6000 7c801d28 3860 4e800020 6000 [ 1139.416572][ T5301] 6000 6000 7c0802a6 6000 <7d201c28> 3860 f924 4e800020 [ 1139.416636][ T5301] ---[ end trace c44d1fb4ace04784 ]--- [ 1139.520686][ T5301] [ 1140.520820][ T5301] Kernel panic - not syncing: Fatal exception Patch adds a check to avoid creation of these files to cpu-less nodes. Fixes: 684d984038aa ('powerpc/powernv: Add debugfs interface for imc-mode and imc') Reported-by: Qian Cai Signed-off-by: Madhavan Srinivasan Signed-off-by: Anju T Sudhakar --- arch/powerpc/platforms/powernv/opal-imc.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 186109bdd41b..12c8964a2f9c 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -56,6 +57,7 @@ static void export_imc_mode_and_cmd(struct device_node *node, int chip = 0, nid; char mode[16], cmd[16]; u32 cb_offset; + const struct cpumask *l_cpumask; imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); @@ -70,6 +72,14 @@ static void export_imc_mode_and_cmd(struct device_node *node, cb_offset = IMC_CNTL_BLK_OFFSET; for_each_node(nid) { + /* +* Since these are related to nest pmu, +* create only if the node has any cpu in it. +*/ + l_cpumask = cpumask_of_node(nid); + if (cpumask_empty(l_cpumask)) + continue; + loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset; imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET); sprintf(mode, "imc_mode_%d", nid); -- 2.20.1
Re: [PATCH v2] powerpc/perf: Use cpumask_last() to determine the designated cpu for nest/core units.
Hi Leonardo, On 6/11/19 12:17 AM, Leonardo Bras wrote: On Mon, 2019-06-10 at 12:02 +0530, Anju T Sudhakar wrote: Nest and core imc(In-memory Collection counters) assigns a particular cpu as the designated target for counter data collection. During system boot, the first online cpu in a chip gets assigned as the designated cpu for that chip(for nest-imc) and the first online cpu in a core gets assigned as the designated cpu for that core(for core-imc). If the designated cpu goes offline, the next online cpu from the same chip(for nest-imc)/core(for core-imc) is assigned as the next target, and the event context is migrated to the target cpu. Currently, cpumask_any_but() function is used to find the target cpu. Though this function is expected to return a `random` cpu, this always returns the next online cpu. If all cpus in a chip/core is offlined in a sequential manner, starting from the first cpu, the event migration has to happen for all the cpus which goes offline. Since the migration process involves a grace period, the total time taken to offline all the cpus will be significantly high. Seems like a very interesting work. Out of curiosity, have you used 'chcpu -d' to create your benchmark? Here I did not use chcpu to disable the cpu. I used a script which will offline cpus 88-175 by echoing `0` to /sys/devices/system/cpu/cpu*/online. Regards, Anju
[PATCH RESEND 2/2] tools/perf: Set 'trace_cycles' as defaultevent for perf kvm record in powerpc
Use 'trace_imc/trace_cycles' as the default event for 'perf kvm record' in powerpc. Signed-off-by: Anju T Sudhakar --- tools/perf/arch/powerpc/util/kvm-stat.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index 66f8fe500945..b552884263df 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -177,8 +177,9 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused) /* * Incase of powerpc architecture, pmu registers are programmable * by guest kernel. So monitoring guest via host may not provide - * valid samples. It is better to fail the "perf kvm record" - * with default "cycles" event to monitor guest in powerpc. + * valid samples with default 'cycles' event. It is better to use + * 'trace_imc/trace_cycles' event for guest profiling, since it + * can track the guest instruction pointer in the trace-record. * * Function to parse the arguments and return appropriate values. */ @@ -202,8 +203,14 @@ int kvm_add_default_arch_event(int *argc, const char **argv) parse_options(j, tmp, event_options, NULL, 0); if (!event) { - free(tmp); - return -EINVAL; + if (pmu_have_event("trace_imc", "trace_cycles")) { + argv[j++] = strdup("-e"); + argv[j++] = strdup("trace_imc/trace_cycles/"); + *argc += 2; + } else { + free(tmp); + return -EINVAL; + } } free(tmp); -- 2.17.2
[PATCH RESEND 1/2] tools/perf: Add arch neutral function to choose event for perf kvm record
'perf kvm record' uses 'cycles'(if the user did not specify any event) as the default event to profile the guest. This will not provide any proper samples from the guest incase of powerpc architecture, since in powerpc the PMUs are controlled by the guest rather than the host. Patch adds a function to pick an arch specific event for 'perf kvm record', instead of selecting 'cycles' as a default event for all architectures. For powerpc this function checks for any user specified event, and if there isn't any it returns invalid instead of proceeding with 'cycles' event. Signed-off-by: Anju T Sudhakar --- tools/perf/arch/powerpc/util/kvm-stat.c | 37 + tools/perf/builtin-kvm.c| 12 +++- tools/perf/util/kvm-stat.h | 2 +- 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index f9db341c47b6..66f8fe500945 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -8,6 +8,7 @@ #include "book3s_hv_exits.h" #include "book3s_hcalls.h" +#include #define NR_TPS 4 @@ -172,3 +173,39 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused) return ret; } + +/* + * Incase of powerpc architecture, pmu registers are programmable + * by guest kernel. So monitoring guest via host may not provide + * valid samples. It is better to fail the "perf kvm record" + * with default "cycles" event to monitor guest in powerpc. + * + * Function to parse the arguments and return appropriate values. + */ +int kvm_add_default_arch_event(int *argc, const char **argv) +{ + const char **tmp; + bool event = false; + int i, j = *argc; + + const struct option event_options[] = { + OPT_BOOLEAN('e', "event", , NULL), + OPT_END() + }; + + tmp = calloc(j + 1, sizeof(char *)); + if (!tmp) + return -EINVAL; + + for (i = 0; i < j; i++) + tmp[i] = argv[i]; + + parse_options(j, tmp, event_options, NULL, 0); + if (!event) { + free(tmp); + return -EINVAL; + } + + free(tmp); + return 0; +} diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index dbb6f737a3e2..fe33b3ec55c9 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1510,11 +1510,21 @@ static int kvm_cmd_stat(const char *file_name, int argc, const char **argv) } #endif /* HAVE_KVM_STAT_SUPPORT */ +int __weak kvm_add_default_arch_event(int *argc __maybe_unused, + const char **argv __maybe_unused) +{ + return 0; +} + static int __cmd_record(const char *file_name, int argc, const char **argv) { - int rec_argc, i = 0, j; + int rec_argc, i = 0, j, ret; const char **rec_argv; + ret = kvm_add_default_arch_event(, argv); + if (ret) + return -EINVAL; + rec_argc = argc + 2; rec_argv = calloc(rec_argc + 1, sizeof(char *)); rec_argv[i++] = strdup("record"); diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h index 1403dec189b4..da38b56c46cb 100644 --- a/tools/perf/util/kvm-stat.h +++ b/tools/perf/util/kvm-stat.h @@ -144,5 +144,5 @@ extern const int decode_str_len; extern const char *kvm_exit_reason; extern const char *kvm_entry_trace; extern const char *kvm_exit_trace; - +extern int kvm_add_default_arch_event(int *argc, const char **argv); #endif /* __PERF_KVM_STAT_H */ -- 2.17.2
[PATCH v2] powerpc/perf: Use cpumask_last() to determine the designated cpu for nest/core units.
Nest and core imc(In-memory Collection counters) assigns a particular cpu as the designated target for counter data collection. During system boot, the first online cpu in a chip gets assigned as the designated cpu for that chip(for nest-imc) and the first online cpu in a core gets assigned as the designated cpu for that core(for core-imc). If the designated cpu goes offline, the next online cpu from the same chip(for nest-imc)/core(for core-imc) is assigned as the next target, and the event context is migrated to the target cpu. Currently, cpumask_any_but() function is used to find the target cpu. Though this function is expected to return a `random` cpu, this always returns the next online cpu. If all cpus in a chip/core is offlined in a sequential manner, starting from the first cpu, the event migration has to happen for all the cpus which goes offline. Since the migration process involves a grace period, the total time taken to offline all the cpus will be significantly high. Example: In a system which has 2 sockets, with NUMA node0 CPU(s): 0-87 NUMA node8 CPU(s): 88-175 Time taken to offline cpu 88-175: real2m56.099s user0m0.191s sys 0m0.000s Use cpumask_last() to choose the target cpu, when the designated cpu goes online, so the migration will happen only when the last_cpu in the mask goes offline. This way the time taken to offline all cpus in a chip/core can be reduced. With the patch, Time taken to offline cpu 88-175: real0m12.207s user0m0.171s sys 0m0.000s Offlining all cpus in reverse order is also taken care because, cpumask_any_but() is used to find the designated cpu if the last cpu in the mask goes offline. Since cpumask_any_but() always return the first cpu in the mask, that becomes the designated cpu and migration will happen only when the first_cpu in the mask goes offline. Example: With the patch, Time taken to offline cpu from 175-88: real0m9.330s user0m0.110s sys 0m0.000s Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan --- Changes from v1: Modified the commit log with more info. --- arch/powerpc/perf/imc-pmu.c | 14 -- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 31fa753..fbfd6e7 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -366,7 +366,14 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu) */ nid = cpu_to_node(cpu); l_cpumask = cpumask_of_node(nid); - target = cpumask_any_but(l_cpumask, cpu); + target = cpumask_last(l_cpumask); + + /* +* If this(target) is the last cpu in the cpumask for this chip, +* check for any possible online cpu in the chip. +*/ + if (unlikely(target == cpu)) + target = cpumask_any_but(l_cpumask, cpu); /* * Update the cpumask with the target cpu and @@ -671,7 +678,10 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu) return 0; /* Find any online cpu in that core except the current "cpu" */ - ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu); + ncpu = cpumask_last(cpu_sibling_mask(cpu)); + + if (unlikely(ncpu == cpu)) + ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu); if (ncpu >= 0 && ncpu < nr_cpu_ids) { cpumask_set_cpu(ncpu, _imc_cpumask); -- 1.8.3.1
[PATCH 2/2] tools/perf: Set 'trace_cycles' as defaultevent for perf kvm record in powerpc
Use 'trace_imc/trace_cycles' as the default event for 'perf kvm record' in powerpc. Signed-off-by: Anju T Sudhakar --- tools/perf/arch/powerpc/util/kvm-stat.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index 66f8fe500945..b552884263df 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -177,8 +177,9 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused) /* * Incase of powerpc architecture, pmu registers are programmable * by guest kernel. So monitoring guest via host may not provide - * valid samples. It is better to fail the "perf kvm record" - * with default "cycles" event to monitor guest in powerpc. + * valid samples with default 'cycles' event. It is better to use + * 'trace_imc/trace_cycles' event for guest profiling, since it + * can track the guest instruction pointer in the trace-record. * * Function to parse the arguments and return appropriate values. */ @@ -202,8 +203,14 @@ int kvm_add_default_arch_event(int *argc, const char **argv) parse_options(j, tmp, event_options, NULL, 0); if (!event) { - free(tmp); - return -EINVAL; + if (pmu_have_event("trace_imc", "trace_cycles")) { + argv[j++] = strdup("-e"); + argv[j++] = strdup("trace_imc/trace_cycles/"); + *argc += 2; + } else { + free(tmp); + return -EINVAL; + } } free(tmp); -- 2.17.2
[PATCH 1/2] tools/perf: Add arch neutral function to choose event for perf kvm record
'perf kvm record' uses 'cycles'(if the user did not specify any event) as the default event to profile the guest. This will not provide any proper samples from the guest incase of powerpc architecture, since in powerpc the PMUs are controlled by the guest rather than the host. Patch adds a function to pick an arch specific event for 'perf kvm record', instead of selecting 'cycles' as a default event for all architectures. For powerpc this function checks for any user specified event, and if there isn't any it returns invalid instead of proceeding with 'cycles' event. Signed-off-by: Anju T Sudhakar --- tools/perf/arch/powerpc/util/kvm-stat.c | 37 + tools/perf/builtin-kvm.c| 12 +++- tools/perf/util/kvm-stat.h | 2 +- 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index f9db341c47b6..66f8fe500945 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -8,6 +8,7 @@ #include "book3s_hv_exits.h" #include "book3s_hcalls.h" +#include #define NR_TPS 4 @@ -172,3 +173,39 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused) return ret; } + +/* + * Incase of powerpc architecture, pmu registers are programmable + * by guest kernel. So monitoring guest via host may not provide + * valid samples. It is better to fail the "perf kvm record" + * with default "cycles" event to monitor guest in powerpc. + * + * Function to parse the arguments and return appropriate values. + */ +int kvm_add_default_arch_event(int *argc, const char **argv) +{ + const char **tmp; + bool event = false; + int i, j = *argc; + + const struct option event_options[] = { + OPT_BOOLEAN('e', "event", , NULL), + OPT_END() + }; + + tmp = calloc(j + 1, sizeof(char *)); + if (!tmp) + return -EINVAL; + + for (i = 0; i < j; i++) + tmp[i] = argv[i]; + + parse_options(j, tmp, event_options, NULL, 0); + if (!event) { + free(tmp); + return -EINVAL; + } + + free(tmp); + return 0; +} diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index dbb6f737a3e2..fe33b3ec55c9 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1510,11 +1510,21 @@ static int kvm_cmd_stat(const char *file_name, int argc, const char **argv) } #endif /* HAVE_KVM_STAT_SUPPORT */ +int __weak kvm_add_default_arch_event(int *argc __maybe_unused, + const char **argv __maybe_unused) +{ + return 0; +} + static int __cmd_record(const char *file_name, int argc, const char **argv) { - int rec_argc, i = 0, j; + int rec_argc, i = 0, j, ret; const char **rec_argv; + ret = kvm_add_default_arch_event(, argv); + if (ret) + return -EINVAL; + rec_argc = argc + 2; rec_argv = calloc(rec_argc + 1, sizeof(char *)); rec_argv[i++] = strdup("record"); diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h index 1403dec189b4..da38b56c46cb 100644 --- a/tools/perf/util/kvm-stat.h +++ b/tools/perf/util/kvm-stat.h @@ -144,5 +144,5 @@ extern const int decode_str_len; extern const char *kvm_exit_reason; extern const char *kvm_entry_trace; extern const char *kvm_exit_trace; - +extern int kvm_add_default_arch_event(int *argc, const char **argv); #endif /* __PERF_KVM_STAT_H */ -- 2.17.2
Re: [PATCH] powerpc/powernv: Return for invalid IMC domain
Hi, On 5/21/19 5:18 PM, Michael Ellerman wrote: Anju T Sudhakar writes: Currently init_imc_pmu() can be failed either because an IMC unit with invalid domain(i.e an IMC node not supported by the kernel) is attempted a pmu-registration or something went wrong while registering a valid IMC unit. In both the cases kernel provides a 'Registration failed' error message. Example: Log message, when trace-imc node is not supported by the kernel, and the skiboot supports trace-imc node. So for kernel, trace-imc node is now an unknown domain. [1.731870] nest_phb5_imc performance monitor hardware support registered [1.731944] nest_powerbus0_imc performance monitor hardware support registered [1.734458] thread_imc performance monitor hardware support registered [1.734460] IMC Unknown Device type [1.734462] IMC PMU (null) Register failed [1.734558] nest_xlink0_imc performance monitor hardware support registered [1.734614] nest_xlink1_imc performance monitor hardware support registered [1.734670] nest_xlink2_imc performance monitor hardware support registered [1.747043] Initialise system trusted keyrings [1.747054] Key type blacklist registered To avoid ambiguity on the error message, return for invalid domain before attempting a pmu registration. What do we print once the patch is applied? Once the patch is applied, we return for invalid domains. so we will only have `/IMC Unknown Device type/` message printed for *unknown domains*. And `/IMC PMU (null) Register failed/` message will appear only if the registration fails for a *known domain*. Thanks, Anju
Re: [PATCH] powerpc/perf: Use cpumask_last() to determine the
Hi, Somehow the subject of this patch didn't appear completely here. The Subject of this patch is as follows, `Subject [PATCH] powerpc/perf: Use cpumask_last() to determine the designated cpu for nest/core units.` Thanks, Anju On 5/20/19 2:35 PM, Anju T Sudhakar wrote: Nest and core imc(In-memory Collection counters) assigns a particular cpu as the designated target for counter data collection. During system boot, the first online cpu in a chip gets assigned as the designated cpu for that chip(for nest-imc) and the first online cpu in a core gets assigned as the designated cpu for that core(for core-imc). If the designated cpu goes offline, the next online cpu from the same chip(for nest-imc)/core(for core-imc) is assigned as the next target, and the event context is migrated to the target cpu. Currently, cpumask_any_but() function is used to find the target cpu. Though this function is expected to return a `random` cpu, this always returns the next online cpu. If all cpus in a chip/core is offlined in a sequential manner, starting from the first cpu, the event migration has to happen for all the cpus which goes offline. Since the migration process involves a grace period, the total time taken to offline all the cpus will be significantly high.
[PATCH] powerpc/perf: Use cpumask_last() to determine the
Nest and core imc(In-memory Collection counters) assigns a particular cpu as the designated target for counter data collection. During system boot, the first online cpu in a chip gets assigned as the designated cpu for that chip(for nest-imc) and the first online cpu in a core gets assigned as the designated cpu for that core(for core-imc). If the designated cpu goes offline, the next online cpu from the same chip(for nest-imc)/core(for core-imc) is assigned as the next target, and the event context is migrated to the target cpu. Currently, cpumask_any_but() function is used to find the target cpu. Though this function is expected to return a `random` cpu, this always returns the next online cpu. If all cpus in a chip/core is offlined in a sequential manner, starting from the first cpu, the event migration has to happen for all the cpus which goes offline. Since the migration process involves a grace period, the total time taken to offline all the cpus will be significantly high. Example: In a system which has 2 sockets, with NUMA node0 CPU(s): 0-87 NUMA node8 CPU(s): 88-175 Time taken to offline cpu 88-175: real2m56.099s user0m0.191s sys 0m0.000s Use cpumask_last() to choose the target cpu, when the designated cpu goes online, so the migration will happen only when the last_cpu in the mask goes offline. This way the time taken to offline all cpus in a chip/core can be reduced. With the patch, Time taken to offline cpu 88-175: real0m12.207s user0m0.171s sys 0m0.000s cpumask_last() is a better way to find the target cpu, since in most of the cases cpuhotplug is performed in an increasing order(even in ppc64_cpu). cpumask_any_but() can still be used to check the possibility of other online cpus from the same chip/core if the last cpu in the mask goes offline. Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 14 -- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 31fa753..fbfd6e7 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -366,7 +366,14 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu) */ nid = cpu_to_node(cpu); l_cpumask = cpumask_of_node(nid); - target = cpumask_any_but(l_cpumask, cpu); + target = cpumask_last(l_cpumask); + + /* +* If this(target) is the last cpu in the cpumask for this chip, +* check for any possible online cpu in the chip. +*/ + if (unlikely(target == cpu)) + target = cpumask_any_but(l_cpumask, cpu); /* * Update the cpumask with the target cpu and @@ -671,7 +678,10 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu) return 0; /* Find any online cpu in that core except the current "cpu" */ - ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu); + ncpu = cpumask_last(cpu_sibling_mask(cpu)); + + if (unlikely(ncpu == cpu)) + ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu); if (ncpu >= 0 && ncpu < nr_cpu_ids) { cpumask_set_cpu(ncpu, _imc_cpumask); -- 1.8.3.1
[PATCH] powerpc/powernv: Return for invalid IMC domain
Currently init_imc_pmu() can be failed either because an IMC unit with invalid domain(i.e an IMC node not supported by the kernel) is attempted a pmu-registration or something went wrong while registering a valid IMC unit. In both the cases kernel provides a 'Registration failed' error message. Example: Log message, when trace-imc node is not supported by the kernel, and the skiboot supports trace-imc node. So for kernel, trace-imc node is now an unknown domain. [1.731870] nest_phb5_imc performance monitor hardware support registered [1.731944] nest_powerbus0_imc performance monitor hardware support registered [1.734458] thread_imc performance monitor hardware support registered [1.734460] IMC Unknown Device type [1.734462] IMC PMU (null) Register failed [1.734558] nest_xlink0_imc performance monitor hardware support registered [1.734614] nest_xlink1_imc performance monitor hardware support registered [1.734670] nest_xlink2_imc performance monitor hardware support registered [1.747043] Initialise system trusted keyrings [1.747054] Key type blacklist registered To avoid ambiguity on the error message, return for invalid domain before attempting a pmu registration. Fixes: 8f95faaac56c1 (`powerpc/powernv: Detect and create IMC device`) Reported-by: Pavaman Subramaniyam Signed-off-by: Anju T Sudhakar --- arch/powerpc/platforms/powernv/opal-imc.c | 4 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 58a0794..4e8b0e1 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -161,6 +161,10 @@ static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) struct imc_pmu *pmu_ptr; u32 offset; + /* Return for unknown domain */ + if (domain < 0) + return -EINVAL; + /* memory for pmu */ pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL); if (!pmu_ptr) -- 1.8.3.1
[PATCH] powerpc/imc: Add documentation for IMC and trace-mode
Documentation for IMC(In-Memory Collection Counters) infrastructure and trace-mode of IMC. Signed-off-by: Anju T Sudhakar --- Documentation/powerpc/imc.txt | 195 ++ 1 file changed, 195 insertions(+) create mode 100644 Documentation/powerpc/imc.txt diff --git a/Documentation/powerpc/imc.txt b/Documentation/powerpc/imc.txt new file mode 100644 index ..9c32e059f3be --- /dev/null +++ b/Documentation/powerpc/imc.txt @@ -0,0 +1,195 @@ + === + IMC (In-Memory Collection Counters) + === + Date created: 10 May 2019 + +Table of Contents: +-- + - Basic overview + - IMC example Usage + - IMC Trace Mode + - LDBAR Register Layout + - TRACE_IMC_SCOM bit representation + - Trace IMC example usage + - Benefits of using IMC trace-mode + + +Basic overview +== + +IMC (In-Memory collection counters) is a hardware monitoring facility +that collects large number of hardware performance events at Nest level +(these are on-chip but off-core), Core level and Thread level. + +The Nest PMU counters are handled by a Nest IMC microcode which runs +in the OCC (On-Chip Controller) complex. The microcode collects the +counter data and moves the nest IMC counter data to memory. + +The Core and Thread IMC PMU counters are handled in the core. Core +level PMU counters give us the IMC counters' data per core and thread +level PMU counters give us the IMC counters' data per CPU thread. + +OPAL obtains the IMC PMU and supported events information from the +IMC Catalog and passes on to the kernel via the device tree. The event's +information contains : + - Event name + - Event Offset + - Event description +and, maybe : + - Event scale + - Event unit + +Some PMUs may have a common scale and unit values for all their +supported events. For those cases, the scale and unit properties for +those events must be inherited from the PMU. + +The event offset in the memory is where the counter data gets +accumulated. + +IMC catalog is available at: + https://github.com/open-power/ima-catalog + +The kernel discovers the IMC counters information in the device tree +at the "imc-counters" device node which has a compatible field +"ibm,opal-in-memory-counters". From the device tree, the kernel parses +the PMUs and their event's information and register the PMU and it +attributes in the kernel. + +IMC example usage += + +# perf list + + [...] + nest_mcs01/PM_MCS01_64B_RD_DISP_PORT01/[Kernel PMU event] + nest_mcs01/PM_MCS01_64B_RD_DISP_PORT23/[Kernel PMU event] + + [...] + core_imc/CPM_0THRD_NON_IDLE_PCYC/ [Kernel PMU event] + core_imc/CPM_1THRD_NON_IDLE_INST/ [Kernel PMU event] + + [...] + thread_imc/CPM_0THRD_NON_IDLE_PCYC/[Kernel PMU event] + thread_imc/CPM_1THRD_NON_IDLE_INST/[Kernel PMU event] + +To see per chip data for nest_mcs0/PM_MCS_DOWN_128B_DATA_XFER_MC0/ : + # ./perf stat -e "nest_mcs01/PM_MCS01_64B_WR_DISP_PORT01/" -a --per-socket + +To see non-idle instructions for core 0 : + # ./perf stat -e "core_imc/CPM_NON_IDLE_INST/" -C 0 -I 1000 + +To see non-idle instructions for a "make" : + # ./perf stat -e "thread_imc/CPM_NON_IDLE_PCYC/" make + + +IMC Trace-mode +=== + +POWER9 support two modes for IMC which are the Accumulation mode and +Trace mode. In Accumulation mode, event counts are accumulated in system +Memory. Hypervisor then reads the posted counts periodically or when +requested. In IMC Trace mode, the 64 bit trace scom value is initialized +with the event information. The CPMC*SEL and CPMC_LOAD in the trace scom, +specifies the event to be monitored and the sampling duration. On each +overflow in the CPMC*SEL, hardware snapshots the program counter along +with event counts and writes into memory pointed by LDBAR. + +LDBAR is a 64 bit special purpose per thread register, it has bits to +indicate whether hardware is configured for accumulation or trace mode. + +* LDBAR Register Layout: + 0 : Enable/Disable + 1 : 0 -> Accumulation Mode + 1 -> Trace Mode + 2:3 : Reserved + 4-6 : PB scope + 7 : Reserved + 8:50 : Counter Address + 51:63 : Reserved + +* TRACE_IMC_SCOM bit representation: + + 0:1 : SAMPSEL + 2:33: CPMC_LOAD + 34:40 : CPMC1SEL + 41:47 : CPMC2SEL + 48:50 : BUFFERSIZE + 51:63 : RESERVED + +CPMC_LOAD contains the sampling duration. SAMPSEL and CPMC*SEL determines +the event to count. BUFFRSIZE indicates the memory range. On each overflow, +hardware snapshots program counter along with event counts and update the +memory and reloads the CMPC_LOAD value for the nex
Re: [PATCH v4 0/5] powerpc/perf: IMC trace-mode support
On 4/16/19 3:14 PM, Anju T Sudhakar wrote: Hi, Kindly ignore this series, since patch 5/5 in this series doesn't incorporate the event-format change that I've done in v4 of this series. Apologies for the inconvenience. I will post the updated v5 soon. s/v5/v4 Thanks, Anju On 4/15/19 3:41 PM, Anju T Sudhakar wrote: IMC (In-Memory collection counters) is a hardware monitoring facility that collects large number of hardware performance events. POWER9 support two modes for IMC which are the Accumulation mode and Trace mode. In Accumulation mode, event counts are accumulated in system Memory. Hypervisor then reads the posted counts periodically or when requested. In IMC Trace mode, the 64 bit trace scom value is initialized with the event information. The CPMC*SEL and CPMC_LOAD in the trace scom, specifies the event to be monitored and the sampling duration. On each overflow in the CPMC*SEL, hardware snapshots the program counter along with event counts and writes into memory pointed by LDBAR. LDBAR has bits to indicate whether hardware is configured for accumulation or trace mode. Currently the event monitored for trace-mode is fixed as cycle. Trace-IMC Implementation: -- To enable trace-imc, we need to * Add trace node in the DTS file for power9, so that the new trace node can be discovered by the kernel. Information included in the DTS file are as follows, (a snippet from the ima-catalog) TRACE_IMC: trace-events { #address-cells = <0x1>; #size-cells = <0x1>; event at 1020 { event-name = "cycles" ; reg = <0x1020 0x8>; desc = "Reference cycles" ; }; }; trace@0 { compatible = "ibm,imc-counters"; events-prefix = "trace_"; reg = <0x0 0x8>; events = < _IMC >; type = <0x2>; size = <0x4>; }; OP-BUILD changes needed to include the "trace node" is already pulled in to the ima-catalog repo. ps://github.com/open-power/op-build/commit/d3e75dc26d1283d7d5eb444bff1ec9e40d5dfc07 * Enchance the opal_imc_counters_* calls to support this new trace mode in imc. Add support to initialize the trace-mode scom. TRACE_IMC_SCOM bit representation: 0:1 : SAMPSEL 2:33 : CPMC_LOAD 34:40 : CPMC1SEL 41:47 : CPMC2SEL 48:50 : BUFFERSIZE 51:63 : RESERVED CPMC_LOAD contains the sampling duration. SAMPSEL and CPMC*SEL determines the event to count. BUFFRSIZE indicates the memory range. On each overflow, hardware snapshots program counter along with event counts and update the memory and reloads the CMPC_LOAD value for the next sampling duration. IMC hardware does not support exceptions, so it quietly wraps around if memory buffer reaches the end. OPAL support for IMC trace mode is already upstream. * Set LDBAR spr to enable imc-trace mode. LDBAR Layout: 0 : Enable/Disable 1 : 0 -> Accumulation Mode 1 -> Trace Mode 2:3 : Reserved 4-6 : PB scope 7 : Reserved 8:50 : Counter Address 51:63 : Reserved -- PMI interrupt handling is avoided, since IMC trace mode snapshots the program counter and update to the memory. And this also provide a way for the operating system to do instruction sampling in real time without PMI(Performance Monitoring Interrupts) processing overhead. Performance data using 'perf top' with and without trace-imc event: PMI interrupts count when `perf top` command is executed without trace-imc event. # cat /proc/interrupts (a snippet from the output) 9944 1072 804 804 1644 804 1306 804 804 804 804 804 804 804 804 804 1961 1602 804 804 1258 [-] 803 803 803 803 803 803 803 803 803 803 803 804 804 804 804 804 804 804 804 804 803 803 803 803 803 803 1306 803 803 Performance monitoring interrupts `perf top` with trace-imc (executed right after 'perf top' without trace-imc event): # perf top -e trace_imc/trace_cycles/ 12.50% [kernel] [k] arch_cpu_idle 11.81% [kernel] [k] __next_timer_interrupt 11.22% [kernel] [k] rcu_idle_enter 10.25% [kernel] [k] find_next_bit 7.91% [kernel] [k] do_idle 7.69% [kernel] [k] rcu_dynticks_eqs_exit 5.20% [kernel] [k] tick_nohz_idle_stop_tick [---] # cat /proc/interrupts (a snippet from the output) 9944 1072 804 804 1644 804 1306 804 804 804 804 804 804 804 804 804 1961 1602 804 804 1258 [-] 8
[PATCH v4 5/5] powerpc/perf: Trace imc PMU functions
Add PMU functions to support trace-imc. Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan --- arch/powerpc/perf/imc-pmu.c | 205 +++- 1 file changed, 204 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 3fe0222885bc..cc9724561bf2 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -53,7 +53,7 @@ static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) return container_of(event->pmu, struct imc_pmu, pmu); } -PMU_FORMAT_ATTR(event, "config:0-40"); +PMU_FORMAT_ATTR(event, "config:0-61"); PMU_FORMAT_ATTR(offset, "config:0-31"); PMU_FORMAT_ATTR(rvalue, "config:32"); PMU_FORMAT_ATTR(mode, "config:33-40"); @@ -70,6 +70,25 @@ static struct attribute_group imc_format_group = { .attrs = imc_format_attrs, }; +/* Format attribute for imc trace-mode */ +PMU_FORMAT_ATTR(cpmc_reserved, "config:0-19"); +PMU_FORMAT_ATTR(cpmc_event, "config:20-27"); +PMU_FORMAT_ATTR(cpmc_samplesel, "config:28-29"); +PMU_FORMAT_ATTR(cpmc_load, "config:30-61"); +static struct attribute *trace_imc_format_attrs[] = { + _attr_event.attr, + _attr_cpmc_reserved.attr, + _attr_cpmc_event.attr, + _attr_cpmc_samplesel.attr, + _attr_cpmc_load.attr, + NULL, +}; + +static struct attribute_group trace_imc_format_group = { +.name = "format", +.attrs = trace_imc_format_attrs, +}; + /* Get the cpumask printed to a buffer "buf" */ static ssize_t imc_pmu_cpumask_get_attr(struct device *dev, struct device_attribute *attr, @@ -1108,6 +1127,182 @@ static int trace_imc_cpu_init(void) ppc_trace_imc_cpu_offline); } +static u64 get_trace_imc_event_base_addr(void) +{ + return (u64)per_cpu(trace_imc_mem, smp_processor_id()); +} + +/* + * Function to parse trace-imc data obtained + * and to prepare the perf sample. + */ +static int trace_imc_prepare_sample(struct trace_imc_data *mem, + struct perf_sample_data *data, + u64 *prev_tb, + struct perf_event_header *header, + struct perf_event *event) +{ + /* Sanity checks for a valid record */ + if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb) + *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1)); + else + return -EINVAL; + + if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) != +be64_to_cpu(READ_ONCE(mem->tb2))) + return -EINVAL; + + /* Prepare perf sample */ + data->ip = be64_to_cpu(READ_ONCE(mem->ip)); + data->period = event->hw.last_period; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header) + event->header_size; + header->misc = 0; + + if (is_kernel_addr(data->ip)) + header->misc |= PERF_RECORD_MISC_KERNEL; + else + header->misc |= PERF_RECORD_MISC_USER; + + perf_event_header__init_id(header, data, event); + + return 0; +} + +static void dump_trace_imc_data(struct perf_event *event) +{ + struct trace_imc_data *mem; + int i, ret; + u64 prev_tb = 0; + + mem = (struct trace_imc_data *)get_trace_imc_event_base_addr(); + for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data)); + i++, mem++) { + struct perf_sample_data data; + struct perf_event_header header; + + ret = trace_imc_prepare_sample(mem, , _tb, , event); + if (ret) /* Exit, if not a valid record */ + break; + else { + /* If this is a valid record, create the sample */ + struct perf_output_handle handle; + + if (perf_output_begin(, event, header.size)) + return; + + perf_output_sample(, , , event); + perf_output_end(); + } + } +} + +static int trace_imc_event_add(struct perf_event *event, int flags) +{ + int core_id = smp_processor_id() / threads_per_core; + struct imc_pmu_ref *ref = NULL; + u64 local_mem, ldbar_value; + + /* Set trace-imc bit in ldbar and load ldbar with per-thread memory address */ + local_mem = get_trace_imc_event_base_addr(); + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE; + + if (core_imc_refc) + ref = _imc_refc[core_id]; + if (!ref) { + /* If core-imc is not enabled, use trace-imc reference count */ + if (trace_imc_refc) +
[PATCH v4 4/5] powerpc/perf: Trace imc events detection and cpuhotplug
Patch detects trace-imc events, does memory initilizations for each online cpu, and registers cpuhotplug call-backs. Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan --- arch/powerpc/perf/imc-pmu.c | 104 ++ arch/powerpc/platforms/powernv/opal-imc.c | 3 + include/linux/cpuhotplug.h| 1 + 3 files changed, 108 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 7fe258e17dfe..3fe0222885bc 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -43,6 +43,11 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem); static struct imc_pmu *thread_imc_pmu; static int thread_imc_mem_size; +/* Trace IMC data structures */ +static DEFINE_PER_CPU(u64 *, trace_imc_mem); +static struct imc_pmu_ref *trace_imc_refc; +static int trace_imc_mem_size; + static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -1050,6 +1055,59 @@ static void thread_imc_event_del(struct perf_event *event, int flags) imc_event_update(event); } +/* + * Allocate a page of memory for each cpu, and load LDBAR with 0. + */ +static int trace_imc_mem_alloc(int cpu_id, int size) +{ + u64 *local_mem = per_cpu(trace_imc_mem, cpu_id); + int phys_id = cpu_to_node(cpu_id), rc = 0; + int core_id = (cpu_id / threads_per_core); + + if (!local_mem) { + local_mem = page_address(alloc_pages_node(phys_id, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | + __GFP_NOWARN, get_order(size))); + if (!local_mem) + return -ENOMEM; + per_cpu(trace_imc_mem, cpu_id) = local_mem; + + /* Initialise the counters for trace mode */ + rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void *)local_mem), + get_hard_smp_processor_id(cpu_id)); + if (rc) { + pr_info("IMC:opal init failed for trace imc\n"); + return rc; + } + } + + /* Init the mutex, if not already */ + trace_imc_refc[core_id].id = core_id; + mutex_init(_imc_refc[core_id].lock); + + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int ppc_trace_imc_cpu_online(unsigned int cpu) +{ + return trace_imc_mem_alloc(cpu, trace_imc_mem_size); +} + +static int ppc_trace_imc_cpu_offline(unsigned int cpu) +{ + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int trace_imc_cpu_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE, + "perf/powerpc/imc_trace:online", + ppc_trace_imc_cpu_online, + ppc_trace_imc_cpu_offline); +} + /* update_pmu_ops : Populate the appropriate operations for "pmu" */ static int update_pmu_ops(struct imc_pmu *pmu) { @@ -1172,6 +1230,18 @@ static void cleanup_all_thread_imc_memory(void) } } +static void cleanup_all_trace_imc_memory(void) +{ + int i, order = get_order(trace_imc_mem_size); + + for_each_online_cpu(i) { + if (per_cpu(trace_imc_mem, i)) + free_pages((u64)per_cpu(trace_imc_mem, i), order); + + } + kfree(trace_imc_refc); +} + /* Function to free the attr_groups which are dynamically allocated */ static void imc_common_mem_free(struct imc_pmu *pmu_ptr) { @@ -1213,6 +1283,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE); cleanup_all_thread_imc_memory(); } + + if (pmu_ptr->domain == IMC_DOMAIN_TRACE) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE); + cleanup_all_trace_imc_memory(); + } } /* @@ -1295,6 +1370,27 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, thread_imc_pmu = pmu_ptr; break; + case IMC_DOMAIN_TRACE: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); + trace_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref), + GFP_KERNEL); + if (!trace_imc_refc) + return -ENOMEM; + + trace_imc_mem_size = pmu_ptr->counter_mem_size; + for_each_online_cpu(cpu) { + res = trace_imc_mem_alloc(cpu, trace_imc_mem_size); + if (re
[PATCH v4 2/5] powerpc/perf: Rearrange setting of ldbar for thread-imc
LDBAR holds the memory address allocated for each cpu. For thread-imc the mode bit (i.e bit 1) of LDBAR is set to accumulation. Currently, ldbar is loaded with per cpu memory address and mode set to accumulation at boot time. To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So to accommodate trace-mode of IMC, reposition setting of ldbar for thread-imc to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del(). Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan --- arch/powerpc/perf/imc-pmu.c | 28 +--- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index b1c37cc3fa98..51f1d3eaaa6d 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -788,8 +788,11 @@ static int core_imc_event_init(struct perf_event *event) } /* - * Allocates a page of memory for each of the online cpus, and write the - * physical base address of that page to the LDBAR for that cpu. + * Allocates a page of memory for each of the online cpus, and load + * LDBAR with 0. + * The physical base address of the page allocated for a cpu will be + * written to the LDBAR for that cpu, when the thread-imc event + * is added. * * LDBAR Register Layout: * @@ -807,7 +810,7 @@ static int core_imc_event_init(struct perf_event *event) */ static int thread_imc_mem_alloc(int cpu_id, int size) { - u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id); + u64 *local_mem = per_cpu(thread_imc_mem, cpu_id); int nid = cpu_to_node(cpu_id); if (!local_mem) { @@ -824,9 +827,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size) per_cpu(thread_imc_mem, cpu_id) = local_mem; } - ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; - - mtspr(SPRN_LDBAR, ldbar_value); + mtspr(SPRN_LDBAR, 0); return 0; } @@ -977,6 +978,7 @@ static int thread_imc_event_add(struct perf_event *event, int flags) { int core_id; struct imc_pmu_ref *ref; + u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, smp_processor_id()); if (flags & PERF_EF_START) imc_event_start(event, flags); @@ -985,6 +987,9 @@ static int thread_imc_event_add(struct perf_event *event, int flags) return -EINVAL; core_id = smp_processor_id() / threads_per_core; + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; + mtspr(SPRN_LDBAR, ldbar_value); + /* * imc pmus are enabled only when it is used. * See if this is triggered for the first time. @@ -1016,11 +1021,7 @@ static void thread_imc_event_del(struct perf_event *event, int flags) int core_id; struct imc_pmu_ref *ref; - /* -* Take a snapshot and calculate the delta and update -* the event counter values. -*/ - imc_event_update(event); + mtspr(SPRN_LDBAR, 0); core_id = smp_processor_id() / threads_per_core; ref = _imc_refc[core_id]; @@ -1039,6 +1040,11 @@ static void thread_imc_event_del(struct perf_event *event, int flags) ref->refc = 0; } mutex_unlock(>lock); + /* +* Take a snapshot and calculate the delta and update +* the event counter values. +*/ + imc_event_update(event); } /* update_pmu_ops : Populate the appropriate operations for "pmu" */ -- 2.17.2
[PATCH v4 3/5] powerpc/perf: Add privileged access check for thread_imc
From: Madhavan Srinivasan Add code to restrict user access to thread_imc pmu since some event report privilege level information. Fixes: f74c89bd80fb3 ('powerpc/perf: Add thread IMC PMU support') Signed-off-by: Madhavan Srinivasan Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 51f1d3eaaa6d..7fe258e17dfe 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -859,6 +859,9 @@ static int thread_imc_event_init(struct perf_event *event) if (event->attr.type != event->pmu->type) return -ENOENT; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + /* Sampling not supported */ if (event->hw.sample_period) return -EINVAL; -- 2.17.2
[PATCH v4 1/5] powerpc/include: Add data structures and macros for IMC trace mode
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode and data structure to hold the trace-imc record data. Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since there is a new switch case added in the opal-calls for IMC. Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan --- arch/powerpc/include/asm/imc-pmu.h | 39 + arch/powerpc/include/asm/opal-api.h | 1 + 2 files changed, 40 insertions(+) diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index 69f516ecb2fd..7c2ef0e42661 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -33,6 +33,7 @@ */ #define THREAD_IMC_LDBAR_MASK 0x0003e000ULL #define THREAD_IMC_ENABLE 0x8000ULL +#define TRACE_IMC_ENABLE 0x4000ULL /* * For debugfs interface for imc-mode and imc-command @@ -59,6 +60,34 @@ struct imc_events { char *scale; }; +/* + * Trace IMC hardware updates a 64bytes record on + * Core Performance Monitoring Counter (CPMC) + * overflow. Here is the layout for the trace imc record + * + * DW 0 : Timebase + * DW 1 : Program Counter + * DW 2 : PIDR information + * DW 3 : CPMC1 + * DW 4 : CPMC2 + * DW 5 : CPMC3 + * Dw 6 : CPMC4 + * DW 7 : Timebase + * . + * + * The following is the data structure to hold trace imc data. + */ +struct trace_imc_data { + u64 tb1; + u64 ip; + u64 val; + u64 cpmc1; + u64 cpmc2; + u64 cpmc3; + u64 cpmc4; + u64 tb2; +}; + /* Event attribute array index */ #define IMC_FORMAT_ATTR0 #define IMC_EVENT_ATTR 1 @@ -68,6 +97,13 @@ struct imc_events { /* PMU Format attribute macros */ #define IMC_EVENT_OFFSET_MASK 0xULL +/* + * Macro to mask bits 0:21 of first double word(which is the timebase) to + * compare with 8th double word (timebase) of trace imc record data. + */ +#define IMC_TRACE_RECORD_TB1_MASK 0x3ffULL + + /* * Device tree parser code detects IMC pmu support and * registers new IMC pmus. This structure will hold the @@ -113,6 +149,7 @@ struct imc_pmu_ref { enum { IMC_TYPE_THREAD = 0x1, + IMC_TYPE_TRACE = 0x2, IMC_TYPE_CORE = 0x4, IMC_TYPE_CHIP = 0x10, }; @@ -123,6 +160,8 @@ enum { #define IMC_DOMAIN_NEST1 #define IMC_DOMAIN_CORE2 #define IMC_DOMAIN_THREAD 3 +/* For trace-imc the domain is still thread but it operates in trace-mode */ +#define IMC_DOMAIN_TRACE 4 extern int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id); diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b239ea..a4130b21b159 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -1118,6 +1118,7 @@ enum { enum { OPAL_IMC_COUNTERS_NEST = 1, OPAL_IMC_COUNTERS_CORE = 2, + OPAL_IMC_COUNTERS_TRACE = 3, }; -- 2.17.2
[PATCH v4 0/5] powerpc/perf: IMC trace-mode support
0 : Enable/Disable 1 : 0 -> Accumulation Mode 1 -> Trace Mode 2:3 : Reserved 4-6 : PB scope 7 : Reserved 8:50 : Counter Address 51:63 : Reserved -- PMI interrupt handling is avoided, since IMC trace mode snapshots the program counter and update to the memory. And this also provide a way for the operating system to do instruction sampling in real time without PMI(Performance Monitoring Interrupts) processing overhead. Performance data using 'perf top' with and without trace-imc event: PMI interrupts count when `perf top` command is executed without trace-imc event. # cat /proc/interrupts (a snippet from the output) 9944 1072804804 1644804 1306 804804804804804804804 804804 1961 1602804804 1258 [-] 803803803803803803803 803803803803804804804 804804804804804804803 803803803803803 1306803 803 Performance monitoring interrupts `perf top` with trace-imc (executed right after 'perf top' without trace-imc event): # perf top -e trace_imc/trace_cycles/ 12.50% [kernel] [k] arch_cpu_idle 11.81% [kernel] [k] __next_timer_interrupt 11.22% [kernel] [k] rcu_idle_enter 10.25% [kernel] [k] find_next_bit 7.91% [kernel] [k] do_idle 7.69% [kernel] [k] rcu_dynticks_eqs_exit 5.20% [kernel] [k] tick_nohz_idle_stop_tick [---] # cat /proc/interrupts (a snippet from the output) 9944 1072804804 1644804 1306 804804804804804804804 804804 1961 1602804804 1258 [-] 803803803803803803803 803803803804804804804 804804804804804804803 803803803803803 1306803 803 Performance monitoring interrupts The PMI interrupts count remains the same. Changelog: -- >From v3 -> v4: * trace_imc_refc is introduced. So that even if, core-imc is disabled, trace-imc can be used. * trace_imc_pmu_sched_task is removed and opal start/stop is invoked in trace_imc_event_add/del function. Suggestions/comments are welcome. Anju T Sudhakar (4): powerpc/include: Add data structures and macros for IMC trace mode powerpc/perf: Rearrange setting of ldbar for thread-imc powerpc/perf: Trace imc events detection and cpuhotplug powerpc/perf: Trace imc PMU functions Madhavan Srinivasan (1): powerpc/perf: Add privileged access check for thread_imc
Re: [PATCH v4 0/5] powerpc/perf: IMC trace-mode support
Hi, Kindly ignore this series, since patch 5/5 in this series doesn't incorporate the event-format change that I've done in v4 of this series. Apologies for the inconvenience. I will post the updated v5 soon. Thanks, Anju On 4/15/19 3:41 PM, Anju T Sudhakar wrote: IMC (In-Memory collection counters) is a hardware monitoring facility that collects large number of hardware performance events. POWER9 support two modes for IMC which are the Accumulation mode and Trace mode. In Accumulation mode, event counts are accumulated in system Memory. Hypervisor then reads the posted counts periodically or when requested. In IMC Trace mode, the 64 bit trace scom value is initialized with the event information. The CPMC*SEL and CPMC_LOAD in the trace scom, specifies the event to be monitored and the sampling duration. On each overflow in the CPMC*SEL, hardware snapshots the program counter along with event counts and writes into memory pointed by LDBAR. LDBAR has bits to indicate whether hardware is configured for accumulation or trace mode. Currently the event monitored for trace-mode is fixed as cycle. Trace-IMC Implementation: -- To enable trace-imc, we need to * Add trace node in the DTS file for power9, so that the new trace node can be discovered by the kernel. Information included in the DTS file are as follows, (a snippet from the ima-catalog) TRACE_IMC: trace-events { #address-cells = <0x1>; #size-cells = <0x1>; event at 1020 { event-name = "cycles" ; reg = <0x1020 0x8>; desc = "Reference cycles" ; }; }; trace@0 { compatible = "ibm,imc-counters"; events-prefix = "trace_"; reg = <0x0 0x8>; events = < _IMC >; type = <0x2>; size = <0x4>; }; OP-BUILD changes needed to include the "trace node" is already pulled in to the ima-catalog repo. ps://github.com/open-power/op-build/commit/d3e75dc26d1283d7d5eb444bff1ec9e40d5dfc07 * Enchance the opal_imc_counters_* calls to support this new trace mode in imc. Add support to initialize the trace-mode scom. TRACE_IMC_SCOM bit representation: 0:1 : SAMPSEL 2:33: CPMC_LOAD 34:40 : CPMC1SEL 41:47 : CPMC2SEL 48:50 : BUFFERSIZE 51:63 : RESERVED CPMC_LOAD contains the sampling duration. SAMPSEL and CPMC*SEL determines the event to count. BUFFRSIZE indicates the memory range. On each overflow, hardware snapshots program counter along with event counts and update the memory and reloads the CMPC_LOAD value for the next sampling duration. IMC hardware does not support exceptions, so it quietly wraps around if memory buffer reaches the end. OPAL support for IMC trace mode is already upstream. * Set LDBAR spr to enable imc-trace mode. LDBAR Layout: 0 : Enable/Disable 1 : 0 -> Accumulation Mode 1 -> Trace Mode 2:3 : Reserved 4-6 : PB scope 7 : Reserved 8:50 : Counter Address 51:63 : Reserved -- PMI interrupt handling is avoided, since IMC trace mode snapshots the program counter and update to the memory. And this also provide a way for the operating system to do instruction sampling in real time without PMI(Performance Monitoring Interrupts) processing overhead. Performance data using 'perf top' with and without trace-imc event: PMI interrupts count when `perf top` command is executed without trace-imc event. # cat /proc/interrupts (a snippet from the output) 9944 1072804804 1644804 1306 804804804804804804804 804804 1961 1602804804 1258 [-] 803803803803803803803 803803803803804804804 804
[PATCH v4 5/5] powerpc/perf: Trace imc PMU functions
Add PMU functions to support trace-imc. Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan --- arch/powerpc/perf/imc-pmu.c | 183 1 file changed, 183 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 3fe0222885bc..3f433cc96b18 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1108,6 +1108,182 @@ static int trace_imc_cpu_init(void) ppc_trace_imc_cpu_offline); } +static u64 get_trace_imc_event_base_addr(void) +{ + return (u64)per_cpu(trace_imc_mem, smp_processor_id()); +} + +/* + * Function to parse trace-imc data obtained + * and to prepare the perf sample. + */ +static int trace_imc_prepare_sample(struct trace_imc_data *mem, + struct perf_sample_data *data, + u64 *prev_tb, + struct perf_event_header *header, + struct perf_event *event) +{ + /* Sanity checks for a valid record */ + if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb) + *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1)); + else + return -EINVAL; + + if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) != +be64_to_cpu(READ_ONCE(mem->tb2))) + return -EINVAL; + + /* Prepare perf sample */ + data->ip = be64_to_cpu(READ_ONCE(mem->ip)); + data->period = event->hw.last_period; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header) + event->header_size; + header->misc = 0; + + if (is_kernel_addr(data->ip)) + header->misc |= PERF_RECORD_MISC_KERNEL; + else + header->misc |= PERF_RECORD_MISC_USER; + + perf_event_header__init_id(header, data, event); + + return 0; +} + +static void dump_trace_imc_data(struct perf_event *event) +{ + struct trace_imc_data *mem; + int i, ret; + u64 prev_tb = 0; + + mem = (struct trace_imc_data *)get_trace_imc_event_base_addr(); + for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data)); + i++, mem++) { + struct perf_sample_data data; + struct perf_event_header header; + + ret = trace_imc_prepare_sample(mem, , _tb, , event); + if (ret) /* Exit, if not a valid record */ + break; + else { + /* If this is a valid record, create the sample */ + struct perf_output_handle handle; + + if (perf_output_begin(, event, header.size)) + return; + + perf_output_sample(, , , event); + perf_output_end(); + } + } +} + +static int trace_imc_event_add(struct perf_event *event, int flags) +{ + int core_id = smp_processor_id() / threads_per_core; + struct imc_pmu_ref *ref = NULL; + u64 local_mem, ldbar_value; + + /* Set trace-imc bit in ldbar and load ldbar with per-thread memory address */ + local_mem = get_trace_imc_event_base_addr(); + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE; + + if (core_imc_refc) + ref = _imc_refc[core_id]; + if (!ref) { + /* If core-imc is not enabled, use trace-imc reference count */ + if (trace_imc_refc) + ref = _imc_refc[core_id]; + if (!ref) + return -EINVAL; + } + mtspr(SPRN_LDBAR, ldbar_value); + mutex_lock(>lock); + if (ref->refc == 0) { + if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE, + get_hard_smp_processor_id(smp_processor_id( { + mutex_unlock(>lock); + pr_err("trace-imc: Unable to start the counters for core %d\n", core_id); + mtspr(SPRN_LDBAR, 0); + return -EINVAL; + } + } + ++ref->refc; + mutex_unlock(>lock); + + return 0; +} + +static void trace_imc_event_read(struct perf_event *event) +{ + return; +} + +static void trace_imc_event_stop(struct perf_event *event, int flags) +{ + u64 local_mem = get_trace_imc_event_base_addr(); + dump_trace_imc_data(event); + memset((void *)local_mem, 0, sizeof(u64)); +} + +static void trace_imc_event_start(struct perf_event *event, int flags) +{ + return; +} + +static void trace_imc_event_del(struct perf_event *event, int flags) +{ + int core_id = smp_processor_id() / threads_per_core; + struct imc_pmu_ref *ref = NULL; + + if (core_imc_refc) + ref = _imc_r
[PATCH v4 4/5] powerpc/perf: Trace imc events detection and cpuhotplug
Patch detects trace-imc events, does memory initilizations for each online cpu, and registers cpuhotplug call-backs. Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan --- arch/powerpc/perf/imc-pmu.c | 104 ++ arch/powerpc/platforms/powernv/opal-imc.c | 3 + include/linux/cpuhotplug.h| 1 + 3 files changed, 108 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 7fe258e17dfe..3fe0222885bc 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -43,6 +43,11 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem); static struct imc_pmu *thread_imc_pmu; static int thread_imc_mem_size; +/* Trace IMC data structures */ +static DEFINE_PER_CPU(u64 *, trace_imc_mem); +static struct imc_pmu_ref *trace_imc_refc; +static int trace_imc_mem_size; + static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -1050,6 +1055,59 @@ static void thread_imc_event_del(struct perf_event *event, int flags) imc_event_update(event); } +/* + * Allocate a page of memory for each cpu, and load LDBAR with 0. + */ +static int trace_imc_mem_alloc(int cpu_id, int size) +{ + u64 *local_mem = per_cpu(trace_imc_mem, cpu_id); + int phys_id = cpu_to_node(cpu_id), rc = 0; + int core_id = (cpu_id / threads_per_core); + + if (!local_mem) { + local_mem = page_address(alloc_pages_node(phys_id, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | + __GFP_NOWARN, get_order(size))); + if (!local_mem) + return -ENOMEM; + per_cpu(trace_imc_mem, cpu_id) = local_mem; + + /* Initialise the counters for trace mode */ + rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void *)local_mem), + get_hard_smp_processor_id(cpu_id)); + if (rc) { + pr_info("IMC:opal init failed for trace imc\n"); + return rc; + } + } + + /* Init the mutex, if not already */ + trace_imc_refc[core_id].id = core_id; + mutex_init(_imc_refc[core_id].lock); + + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int ppc_trace_imc_cpu_online(unsigned int cpu) +{ + return trace_imc_mem_alloc(cpu, trace_imc_mem_size); +} + +static int ppc_trace_imc_cpu_offline(unsigned int cpu) +{ + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int trace_imc_cpu_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE, + "perf/powerpc/imc_trace:online", + ppc_trace_imc_cpu_online, + ppc_trace_imc_cpu_offline); +} + /* update_pmu_ops : Populate the appropriate operations for "pmu" */ static int update_pmu_ops(struct imc_pmu *pmu) { @@ -1172,6 +1230,18 @@ static void cleanup_all_thread_imc_memory(void) } } +static void cleanup_all_trace_imc_memory(void) +{ + int i, order = get_order(trace_imc_mem_size); + + for_each_online_cpu(i) { + if (per_cpu(trace_imc_mem, i)) + free_pages((u64)per_cpu(trace_imc_mem, i), order); + + } + kfree(trace_imc_refc); +} + /* Function to free the attr_groups which are dynamically allocated */ static void imc_common_mem_free(struct imc_pmu *pmu_ptr) { @@ -1213,6 +1283,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE); cleanup_all_thread_imc_memory(); } + + if (pmu_ptr->domain == IMC_DOMAIN_TRACE) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE); + cleanup_all_trace_imc_memory(); + } } /* @@ -1295,6 +1370,27 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, thread_imc_pmu = pmu_ptr; break; + case IMC_DOMAIN_TRACE: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); + trace_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref), + GFP_KERNEL); + if (!trace_imc_refc) + return -ENOMEM; + + trace_imc_mem_size = pmu_ptr->counter_mem_size; + for_each_online_cpu(cpu) { + res = trace_imc_mem_alloc(cpu, trace_imc_mem_size); + if (re
[PATCH v4 3/5] powerpc/perf: Add privileged access check for thread_imc
From: Madhavan Srinivasan Add code to restrict user access to thread_imc pmu since some event report privilege level information. Fixes: f74c89bd80fb3 ('powerpc/perf: Add thread IMC PMU support') Signed-off-by: Madhavan Srinivasan Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 51f1d3eaaa6d..7fe258e17dfe 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -859,6 +859,9 @@ static int thread_imc_event_init(struct perf_event *event) if (event->attr.type != event->pmu->type) return -ENOENT; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + /* Sampling not supported */ if (event->hw.sample_period) return -EINVAL; -- 2.17.2
[PATCH v4 2/5] powerpc/perf: Rearrange setting of ldbar for thread-imc
LDBAR holds the memory address allocated for each cpu. For thread-imc the mode bit (i.e bit 1) of LDBAR is set to accumulation. Currently, ldbar is loaded with per cpu memory address and mode set to accumulation at boot time. To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So to accommodate trace-mode of IMC, reposition setting of ldbar for thread-imc to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del(). Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan --- arch/powerpc/perf/imc-pmu.c | 28 +--- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index b1c37cc3fa98..51f1d3eaaa6d 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -788,8 +788,11 @@ static int core_imc_event_init(struct perf_event *event) } /* - * Allocates a page of memory for each of the online cpus, and write the - * physical base address of that page to the LDBAR for that cpu. + * Allocates a page of memory for each of the online cpus, and load + * LDBAR with 0. + * The physical base address of the page allocated for a cpu will be + * written to the LDBAR for that cpu, when the thread-imc event + * is added. * * LDBAR Register Layout: * @@ -807,7 +810,7 @@ static int core_imc_event_init(struct perf_event *event) */ static int thread_imc_mem_alloc(int cpu_id, int size) { - u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id); + u64 *local_mem = per_cpu(thread_imc_mem, cpu_id); int nid = cpu_to_node(cpu_id); if (!local_mem) { @@ -824,9 +827,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size) per_cpu(thread_imc_mem, cpu_id) = local_mem; } - ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; - - mtspr(SPRN_LDBAR, ldbar_value); + mtspr(SPRN_LDBAR, 0); return 0; } @@ -977,6 +978,7 @@ static int thread_imc_event_add(struct perf_event *event, int flags) { int core_id; struct imc_pmu_ref *ref; + u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, smp_processor_id()); if (flags & PERF_EF_START) imc_event_start(event, flags); @@ -985,6 +987,9 @@ static int thread_imc_event_add(struct perf_event *event, int flags) return -EINVAL; core_id = smp_processor_id() / threads_per_core; + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; + mtspr(SPRN_LDBAR, ldbar_value); + /* * imc pmus are enabled only when it is used. * See if this is triggered for the first time. @@ -1016,11 +1021,7 @@ static void thread_imc_event_del(struct perf_event *event, int flags) int core_id; struct imc_pmu_ref *ref; - /* -* Take a snapshot and calculate the delta and update -* the event counter values. -*/ - imc_event_update(event); + mtspr(SPRN_LDBAR, 0); core_id = smp_processor_id() / threads_per_core; ref = _imc_refc[core_id]; @@ -1039,6 +1040,11 @@ static void thread_imc_event_del(struct perf_event *event, int flags) ref->refc = 0; } mutex_unlock(>lock); + /* +* Take a snapshot and calculate the delta and update +* the event counter values. +*/ + imc_event_update(event); } /* update_pmu_ops : Populate the appropriate operations for "pmu" */ -- 2.17.2
[PATCH v4 1/5] powerpc/include: Add data structures and macros for IMC trace mode
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode and data structure to hold the trace-imc record data. Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since there is a new switch case added in the opal-calls for IMC. Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan --- arch/powerpc/include/asm/imc-pmu.h | 39 + arch/powerpc/include/asm/opal-api.h | 1 + 2 files changed, 40 insertions(+) diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index 69f516ecb2fd..7c2ef0e42661 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -33,6 +33,7 @@ */ #define THREAD_IMC_LDBAR_MASK 0x0003e000ULL #define THREAD_IMC_ENABLE 0x8000ULL +#define TRACE_IMC_ENABLE 0x4000ULL /* * For debugfs interface for imc-mode and imc-command @@ -59,6 +60,34 @@ struct imc_events { char *scale; }; +/* + * Trace IMC hardware updates a 64bytes record on + * Core Performance Monitoring Counter (CPMC) + * overflow. Here is the layout for the trace imc record + * + * DW 0 : Timebase + * DW 1 : Program Counter + * DW 2 : PIDR information + * DW 3 : CPMC1 + * DW 4 : CPMC2 + * DW 5 : CPMC3 + * Dw 6 : CPMC4 + * DW 7 : Timebase + * . + * + * The following is the data structure to hold trace imc data. + */ +struct trace_imc_data { + u64 tb1; + u64 ip; + u64 val; + u64 cpmc1; + u64 cpmc2; + u64 cpmc3; + u64 cpmc4; + u64 tb2; +}; + /* Event attribute array index */ #define IMC_FORMAT_ATTR0 #define IMC_EVENT_ATTR 1 @@ -68,6 +97,13 @@ struct imc_events { /* PMU Format attribute macros */ #define IMC_EVENT_OFFSET_MASK 0xULL +/* + * Macro to mask bits 0:21 of first double word(which is the timebase) to + * compare with 8th double word (timebase) of trace imc record data. + */ +#define IMC_TRACE_RECORD_TB1_MASK 0x3ffULL + + /* * Device tree parser code detects IMC pmu support and * registers new IMC pmus. This structure will hold the @@ -113,6 +149,7 @@ struct imc_pmu_ref { enum { IMC_TYPE_THREAD = 0x1, + IMC_TYPE_TRACE = 0x2, IMC_TYPE_CORE = 0x4, IMC_TYPE_CHIP = 0x10, }; @@ -123,6 +160,8 @@ enum { #define IMC_DOMAIN_NEST1 #define IMC_DOMAIN_CORE2 #define IMC_DOMAIN_THREAD 3 +/* For trace-imc the domain is still thread but it operates in trace-mode */ +#define IMC_DOMAIN_TRACE 4 extern int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id); diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b239ea..a4130b21b159 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -1118,6 +1118,7 @@ enum { enum { OPAL_IMC_COUNTERS_NEST = 1, OPAL_IMC_COUNTERS_CORE = 2, + OPAL_IMC_COUNTERS_TRACE = 3, }; -- 2.17.2
[PATCH v4 0/5] powerpc/perf: IMC trace-mode support
0 : Enable/Disable 1 : 0 -> Accumulation Mode 1 -> Trace Mode 2:3 : Reserved 4-6 : PB scope 7 : Reserved 8:50 : Counter Address 51:63 : Reserved -- PMI interrupt handling is avoided, since IMC trace mode snapshots the program counter and update to the memory. And this also provide a way for the operating system to do instruction sampling in real time without PMI(Performance Monitoring Interrupts) processing overhead. Performance data using 'perf top' with and without trace-imc event: PMI interrupts count when `perf top` command is executed without trace-imc event. # cat /proc/interrupts (a snippet from the output) 9944 1072804804 1644804 1306 804804804804804804804 804804 1961 1602804804 1258 [-] 803803803803803803803 803803803803804804804 804804804804804804803 803803803803803 1306803 803 Performance monitoring interrupts `perf top` with trace-imc (executed right after 'perf top' without trace-imc event): # perf top -e trace_imc/trace_cycles/ 12.50% [kernel] [k] arch_cpu_idle 11.81% [kernel] [k] __next_timer_interrupt 11.22% [kernel] [k] rcu_idle_enter 10.25% [kernel] [k] find_next_bit 7.91% [kernel] [k] do_idle 7.69% [kernel] [k] rcu_dynticks_eqs_exit 5.20% [kernel] [k] tick_nohz_idle_stop_tick [---] # cat /proc/interrupts (a snippet from the output) 9944 1072804804 1644804 1306 804804804804804804804 804804 1961 1602804804 1258 [-] 803803803803803803803 803803803804804804804 804804804804804804803 803803803803803 1306803 803 Performance monitoring interrupts The PMI interrupts count remains the same. Changelog: -- >From v3 -> v4: * trace_imc_refc is introduced. So that even if, core-imc is disabled, trace-imc can be used. * trace_imc_pmu_sched_task is removed and opal start/stop is invoked in trace_imc_event_add/del function. Suggestions/comments are welcome. Anju T Sudhakar (4): powerpc/include: Add data structures and macros for IMC trace mode powerpc/perf: Rearrange setting of ldbar for thread-imc powerpc/perf: Trace imc events detection and cpuhotplug powerpc/perf: Trace imc PMU functions Madhavan Srinivasan (1): powerpc/perf: Add privileged access check for thread_imc arch/powerpc/include/asm/imc-pmu.h| 39 +++ arch/powerpc/include/asm/opal-api.h | 1 + arch/powerpc/perf/imc-pmu.c | 318 +- arch/powerpc/platforms/powernv/opal-imc.c | 3 + include/linux/cpuhotplug.h| 1 + 5 files changed, 351 insertions(+), 11 deletions(-) -- 2.17.2
[PATCH v3 5/5] powerpc/perf: Trace imc PMU functions
Add PMU functions to support trace-imc and define the format for trace-imc events. Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan --- arch/powerpc/perf/imc-pmu.c | 197 +++- 1 file changed, 196 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 1f09265c8fb0..0f1a30f11f6a 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -52,7 +52,7 @@ static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) return container_of(event->pmu, struct imc_pmu, pmu); } -PMU_FORMAT_ATTR(event, "config:0-40"); +PMU_FORMAT_ATTR(event, "config:0-61"); PMU_FORMAT_ATTR(offset, "config:0-31"); PMU_FORMAT_ATTR(rvalue, "config:32"); PMU_FORMAT_ATTR(mode, "config:33-40"); @@ -69,6 +69,25 @@ static struct attribute_group imc_format_group = { .attrs = imc_format_attrs, }; +/* Format attribute for imc trace-mode */ +PMU_FORMAT_ATTR(cpmc_reserved, "config:0-19"); +PMU_FORMAT_ATTR(cpmc_event, "config:20-27"); +PMU_FORMAT_ATTR(cpmc_samplesel, "config:28-29"); +PMU_FORMAT_ATTR(cpmc_load, "config:30-61"); +static struct attribute *trace_imc_format_attrs[] = { + _attr_event.attr, + _attr_cpmc_reserved.attr, + _attr_cpmc_event.attr, + _attr_cpmc_samplesel.attr, + _attr_cpmc_load.attr, + NULL, +}; + +static struct attribute_group trace_imc_format_group = { + .name = "format", + .attrs = trace_imc_format_attrs, +}; + /* Get the cpumask printed to a buffer "buf" */ static ssize_t imc_pmu_cpumask_get_attr(struct device *dev, struct device_attribute *attr, @@ -1120,6 +1139,173 @@ static int trace_imc_cpu_init(void) ppc_trace_imc_cpu_offline); } +static u64 get_trace_imc_event_base_addr(void) +{ + return (u64)per_cpu(trace_imc_mem, smp_processor_id()); +} + +/* + * Function to parse trace-imc data obtained + * and to prepare the perf sample. + */ +static int trace_imc_prepare_sample(struct trace_imc_data *mem, + struct perf_sample_data *data, + u64 *prev_tb, + struct perf_event_header *header, + struct perf_event *event) +{ + /* Sanity checks for a valid record */ + if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb) + *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1)); + else + return -EINVAL; + + if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) != +be64_to_cpu(READ_ONCE(mem->tb2))) + return -EINVAL; + + /* Prepare perf sample */ + data->ip = be64_to_cpu(READ_ONCE(mem->ip)); + data->period = event->hw.last_period; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header) + event->header_size; + header->misc = 0; + + if (is_kernel_addr(data->ip)) + header->misc |= PERF_RECORD_MISC_KERNEL; + else + header->misc |= PERF_RECORD_MISC_USER; + + perf_event_header__init_id(header, data, event); + + return 0; +} + +static void dump_trace_imc_data(struct perf_event *event) +{ + struct trace_imc_data *mem; + int i, ret; + u64 prev_tb = 0; + + mem = (struct trace_imc_data *)get_trace_imc_event_base_addr(); + for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data)); + i++, mem++) { + struct perf_sample_data data; + struct perf_event_header header; + + ret = trace_imc_prepare_sample(mem, , _tb, , event); + if (ret) /* Exit, if not a valid record */ + break; + else { + /* If this is a valid record, create the sample */ + struct perf_output_handle handle; + + if (perf_output_begin(, event, header.size)) + return; + + perf_output_sample(, , , event); + perf_output_end(); + } + } +} + +static int trace_imc_event_add(struct perf_event *event, int flags) +{ + /* Enable the sched_task to start the engine */ + perf_sched_cb_inc(event->ctx->pmu); + return 0; +} + +static void trace_imc_event_read(struct perf_event *event) +{ + dump_trace_imc_data(event); +} + +static void trace_imc_event_stop(struct perf_event *event, int flags) +{ + trace_imc_event_read(event); +} + +static void trace_imc_event_start(struct perf_event *event, int flags) +{ + return; +} + +static void trace_imc_event_del(struct perf_event *event, int flags) +{ +
[PATCH v3 4/5] powerpc/perf: Trace imc events detection and cpuhotplug
Patch detects trace-imc events, does memory initilizations for each online cpu, and registers cpuhotplug call-backs. Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan --- arch/powerpc/perf/imc-pmu.c | 91 +++ arch/powerpc/platforms/powernv/opal-imc.c | 3 + include/linux/cpuhotplug.h| 1 + 3 files changed, 95 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 5ca80545a849..1f09265c8fb0 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -43,6 +43,10 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem); static struct imc_pmu *thread_imc_pmu; static int thread_imc_mem_size; +/* Trace IMC data structures */ +static DEFINE_PER_CPU(u64 *, trace_imc_mem); +static int trace_imc_mem_size; + static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -1068,6 +1072,54 @@ static void thread_imc_event_del(struct perf_event *event, int flags) imc_event_update(event); } +/* + * Allocate a page of memory for each cpu, and load LDBAR with 0. + */ +static int trace_imc_mem_alloc(int cpu_id, int size) +{ + u64 *local_mem = per_cpu(trace_imc_mem, cpu_id); + int phys_id = cpu_to_node(cpu_id), rc = 0; + + if (!local_mem) { + local_mem = page_address(alloc_pages_node(phys_id, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | + __GFP_NOWARN, get_order(size))); + if (!local_mem) + return -ENOMEM; + per_cpu(trace_imc_mem, cpu_id) = local_mem; + + /* Initialise the counters for trace mode */ + rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void *)local_mem), + get_hard_smp_processor_id(cpu_id)); + if (rc) { + pr_info("IMC:opal init failed for trace imc\n"); + return rc; + } + } + + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int ppc_trace_imc_cpu_online(unsigned int cpu) +{ + return trace_imc_mem_alloc(cpu, trace_imc_mem_size); +} + +static int ppc_trace_imc_cpu_offline(unsigned int cpu) +{ + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int trace_imc_cpu_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE, + "perf/powerpc/imc_trace:online", + ppc_trace_imc_cpu_online, + ppc_trace_imc_cpu_offline); +} + /* update_pmu_ops : Populate the appropriate operations for "pmu" */ static int update_pmu_ops(struct imc_pmu *pmu) { @@ -1189,6 +1241,17 @@ static void cleanup_all_thread_imc_memory(void) } } +static void cleanup_all_trace_imc_memory(void) +{ + int i, order = get_order(trace_imc_mem_size); + + for_each_online_cpu(i) { + if (per_cpu(trace_imc_mem, i)) + free_pages((u64)per_cpu(trace_imc_mem, i), order); + + } +} + /* Function to free the attr_groups which are dynamically allocated */ static void imc_common_mem_free(struct imc_pmu *pmu_ptr) { @@ -1230,6 +1293,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE); cleanup_all_thread_imc_memory(); } + + if (pmu_ptr->domain == IMC_DOMAIN_TRACE) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE); + cleanup_all_trace_imc_memory(); + } } /* @@ -1312,6 +1380,21 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, thread_imc_pmu = pmu_ptr; break; + case IMC_DOMAIN_TRACE: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + trace_imc_mem_size = pmu_ptr->counter_mem_size; + for_each_online_cpu(cpu) { + res = trace_imc_mem_alloc(cpu, trace_imc_mem_size); + if (res) { + cleanup_all_trace_imc_memory(); + goto err; + } + } + break; default: return -EINVAL; } @@ -1384,6 +1467,14 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id goto err_free_mem; } + break; + case IMC_DOMAIN_TRACE: + ret = trace_imc_cpu_init(); + if (ret) { + cleanup_
[PATCH v3 3/5] powerpc/perf: Add privileged access check for thread_imc
From: Madhavan Srinivasan Add code to restrict user access to thread_imc pmu since some event report privilege level information. Fixes: f74c89bd80fb3 ('powerpc/perf: Add thread IMC PMU support') Signed-off-by: Madhavan Srinivasan Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 3bef46f8417d..5ca80545a849 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -877,6 +877,9 @@ static int thread_imc_event_init(struct perf_event *event) if (event->attr.type != event->pmu->type) return -ENOENT; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + /* Sampling not supported */ if (event->hw.sample_period) return -EINVAL; -- 2.17.1
[PATCH v3 2/5] powerpc/perf: Rearrange setting of ldbar for thread-imc
LDBAR holds the memory address allocated for each cpu. For thread-imc the mode bit (i.e bit 1) of LDBAR is set to accumulation. Currently, ldbar is loaded with per cpu memory address and mode set to accumulation at boot time. To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So to accommodate trace-mode of IMC, reposition setting of ldbar for thread-imc to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del(). Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan --- arch/powerpc/perf/imc-pmu.c | 28 +--- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index f292a3f284f1..3bef46f8417d 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -806,8 +806,11 @@ static int core_imc_event_init(struct perf_event *event) } /* - * Allocates a page of memory for each of the online cpus, and write the - * physical base address of that page to the LDBAR for that cpu. + * Allocates a page of memory for each of the online cpus, and load + * LDBAR with 0. + * The physical base address of the page allocated for a cpu will be + * written to the LDBAR for that cpu, when the thread-imc event + * is added. * * LDBAR Register Layout: * @@ -825,7 +828,7 @@ static int core_imc_event_init(struct perf_event *event) */ static int thread_imc_mem_alloc(int cpu_id, int size) { - u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id); + u64 *local_mem = per_cpu(thread_imc_mem, cpu_id); int nid = cpu_to_node(cpu_id); if (!local_mem) { @@ -842,9 +845,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size) per_cpu(thread_imc_mem, cpu_id) = local_mem; } - ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; - - mtspr(SPRN_LDBAR, ldbar_value); + mtspr(SPRN_LDBAR, 0); return 0; } @@ -995,6 +996,7 @@ static int thread_imc_event_add(struct perf_event *event, int flags) { int core_id; struct imc_pmu_ref *ref; + u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, smp_processor_id()); if (flags & PERF_EF_START) imc_event_start(event, flags); @@ -1003,6 +1005,9 @@ static int thread_imc_event_add(struct perf_event *event, int flags) return -EINVAL; core_id = smp_processor_id() / threads_per_core; + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; + mtspr(SPRN_LDBAR, ldbar_value); + /* * imc pmus are enabled only when it is used. * See if this is triggered for the first time. @@ -1034,11 +1039,7 @@ static void thread_imc_event_del(struct perf_event *event, int flags) int core_id; struct imc_pmu_ref *ref; - /* -* Take a snapshot and calculate the delta and update -* the event counter values. -*/ - imc_event_update(event); + mtspr(SPRN_LDBAR, 0); core_id = smp_processor_id() / threads_per_core; ref = _imc_refc[core_id]; @@ -1057,6 +1058,11 @@ static void thread_imc_event_del(struct perf_event *event, int flags) ref->refc = 0; } mutex_unlock(>lock); + /* +* Take a snapshot and calculate the delta and update +* the event counter values. +*/ + imc_event_update(event); } /* update_pmu_ops : Populate the appropriate operations for "pmu" */ -- 2.17.1
[PATCH v3 0/5] powerpc/perf: IMC trace-mode support
bs.org/pipermail/skiboot/2018-December/012883.html * Set LDBAR spr to enable imc-trace mode. LDBAR Layout: 0 : Enable/Disable 1 : 0 -> Accumulation Mode 1 -> Trace Mode 2:3 : Reserved 4-6 : PB scope 7 : Reserved 8:50 : Counter Address 51:63 : Reserved Key benefit of imc trace-mode is, each sample record contains the address pointer along with other information. So that, we can profile the IP without interrupting the application. Performance data using 'perf top' with and without trace-imc event: When the application is monitored with trace-imc event, we dont take any PMI interrupts. PMI interrupts count when `perf top` command is executed without trac-imc event. # perf top 12.53% [kernel] [k] arch_cpu_idle 11.32% [kernel] [k] rcu_idle_enter 10.76% [kernel] [k] __next_timer_interrupt 9.49% [kernel] [k] find_next_bit 8.06% [kernel] [k] rcu_dynticks_eqs_exit 7.82% [kernel] [k] do_idle 5.71% [kernel] [k] tick_nohz_idle_stop_tic [---] # cat /proc/interrupts (a snippet from the output) 9944 1072804804 1644804 1306 804804804804804804804 804804 1961 1602804804 1258 [-] 803803803803803803803 803803803803804804804 804804804804804804803 803803803803803 1306803 803 Performance monitoring interrupts `perf top` with trace-imc (right after 'perf top' without trace-imc event): # perf top -e trace_imc/trace_cycles/ 12.50% [kernel] [k] arch_cpu_idle 11.81% [kernel] [k] __next_timer_interrupt 11.22% [kernel] [k] rcu_idle_enter 10.25% [kernel] [k] find_next_bit 7.91% [kernel] [k] do_idle 7.69% [kernel] [k] rcu_dynticks_eqs_exit 5.20% [kernel] [k] tick_nohz_idle_stop_tick [---] # cat /proc/interrupts (a snippet from the output) 9944 1072804804 1644804 1306 804804804804804804804 804804 1961 1602804804 1258 [-] 803803803803803803803 803803803804804804804 804804804804804804803 803803803803803 1306803 803 Performance monitoring interrupts The PMI interrupts count remains the same. Changelog: >From v2 -> v3 -- * Redefined the event format for trace-imc. Suggestions/comments are welcome. Anju T Sudhakar (4): powerpc/include: Add data structures and macros for IMC trace mode powerpc/perf: Rearrange setting of ldbar for thread-imc powerpc/perf: Trace imc events detection and cpuhotplug powerpc/perf: Trace imc PMU functions Madhavan Srinivasan (1): powerpc/perf: Add privileged access check for thread_imc arch/powerpc/include/asm/im
[PATCH v3 1/5] powerpc/include: Add data structures and macros for IMC trace mode
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode and data structure to hold the trace-imc record data. Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since there is a new switch case added in the opal-calls for IMC. Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan --- arch/powerpc/include/asm/imc-pmu.h | 39 + arch/powerpc/include/asm/opal-api.h | 1 + 2 files changed, 40 insertions(+) diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index 69f516ecb2fd..7c2ef0e42661 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -33,6 +33,7 @@ */ #define THREAD_IMC_LDBAR_MASK 0x0003e000ULL #define THREAD_IMC_ENABLE 0x8000ULL +#define TRACE_IMC_ENABLE 0x4000ULL /* * For debugfs interface for imc-mode and imc-command @@ -59,6 +60,34 @@ struct imc_events { char *scale; }; +/* + * Trace IMC hardware updates a 64bytes record on + * Core Performance Monitoring Counter (CPMC) + * overflow. Here is the layout for the trace imc record + * + * DW 0 : Timebase + * DW 1 : Program Counter + * DW 2 : PIDR information + * DW 3 : CPMC1 + * DW 4 : CPMC2 + * DW 5 : CPMC3 + * Dw 6 : CPMC4 + * DW 7 : Timebase + * . + * + * The following is the data structure to hold trace imc data. + */ +struct trace_imc_data { + u64 tb1; + u64 ip; + u64 val; + u64 cpmc1; + u64 cpmc2; + u64 cpmc3; + u64 cpmc4; + u64 tb2; +}; + /* Event attribute array index */ #define IMC_FORMAT_ATTR0 #define IMC_EVENT_ATTR 1 @@ -68,6 +97,13 @@ struct imc_events { /* PMU Format attribute macros */ #define IMC_EVENT_OFFSET_MASK 0xULL +/* + * Macro to mask bits 0:21 of first double word(which is the timebase) to + * compare with 8th double word (timebase) of trace imc record data. + */ +#define IMC_TRACE_RECORD_TB1_MASK 0x3ffULL + + /* * Device tree parser code detects IMC pmu support and * registers new IMC pmus. This structure will hold the @@ -113,6 +149,7 @@ struct imc_pmu_ref { enum { IMC_TYPE_THREAD = 0x1, + IMC_TYPE_TRACE = 0x2, IMC_TYPE_CORE = 0x4, IMC_TYPE_CHIP = 0x10, }; @@ -123,6 +160,8 @@ enum { #define IMC_DOMAIN_NEST1 #define IMC_DOMAIN_CORE2 #define IMC_DOMAIN_THREAD 3 +/* For trace-imc the domain is still thread but it operates in trace-mode */ +#define IMC_DOMAIN_TRACE 4 extern int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id); diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b239ea..a4130b21b159 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -1118,6 +1118,7 @@ enum { enum { OPAL_IMC_COUNTERS_NEST = 1, OPAL_IMC_COUNTERS_CORE = 2, + OPAL_IMC_COUNTERS_TRACE = 3, }; -- 2.17.1
[PATCH v2] powerpc/perf: Fix loop exit condition in nest_imc_event_init
The data structure (i.e struct imc_mem_info) to hold the memory address information for nest imc units is allocated based on the number of nodes in the system. nest_imc_event_init() traverse this struct array to calculate the memory base address for the event-cpu. If we fail to find a match for the event cpu's chip-id in imc_mem_info struct array, then the do-while loop will iterate until we crash. Fix this by changing the loop exit condition based on the number of non zero vbase elements in the array, since the allocation is done for nr_chips + 1. Reported-by: Dan Carpenter Fixes: 885dcd709ba91 ( powerpc/perf: Add nest IMC PMU support) Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 2 +- arch/powerpc/platforms/powernv/opal-imc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 4f34c75..d1009fe 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -508,7 +508,7 @@ static int nest_imc_event_init(struct perf_event *event) break; } pcni++; - } while (pcni); + } while (pcni->vbase != 0); if (!flag) return -ENODEV; diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 58a0794..3d27f02 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -127,7 +127,7 @@ static int imc_get_mem_addr_nest(struct device_node *node, nr_chips)) goto error; - pmu_ptr->mem_info = kcalloc(nr_chips, sizeof(*pmu_ptr->mem_info), + pmu_ptr->mem_info = kcalloc(nr_chips + 1, sizeof(*pmu_ptr->mem_info), GFP_KERNEL); if (!pmu_ptr->mem_info) goto error; -- 1.8.3.1
[PATCH v2 2/5] powerpc/perf: Rearrange setting of ldbar for thread-imc
LDBAR holds the memory address allocated for each cpu. For thread-imc the mode bit (i.e bit 1) of LDBAR is set to accumulation. Currently, ldbar is loaded with per cpu memory address and mode set to accumulation at boot time. To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So to accommodate trace-mode of IMC, reposition setting of ldbar for thread-imc to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del(). Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 28 +--- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index f292a3f284f1..3bef46f8417d 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -806,8 +806,11 @@ static int core_imc_event_init(struct perf_event *event) } /* - * Allocates a page of memory for each of the online cpus, and write the - * physical base address of that page to the LDBAR for that cpu. + * Allocates a page of memory for each of the online cpus, and load + * LDBAR with 0. + * The physical base address of the page allocated for a cpu will be + * written to the LDBAR for that cpu, when the thread-imc event + * is added. * * LDBAR Register Layout: * @@ -825,7 +828,7 @@ static int core_imc_event_init(struct perf_event *event) */ static int thread_imc_mem_alloc(int cpu_id, int size) { - u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id); + u64 *local_mem = per_cpu(thread_imc_mem, cpu_id); int nid = cpu_to_node(cpu_id); if (!local_mem) { @@ -842,9 +845,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size) per_cpu(thread_imc_mem, cpu_id) = local_mem; } - ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; - - mtspr(SPRN_LDBAR, ldbar_value); + mtspr(SPRN_LDBAR, 0); return 0; } @@ -995,6 +996,7 @@ static int thread_imc_event_add(struct perf_event *event, int flags) { int core_id; struct imc_pmu_ref *ref; + u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, smp_processor_id()); if (flags & PERF_EF_START) imc_event_start(event, flags); @@ -1003,6 +1005,9 @@ static int thread_imc_event_add(struct perf_event *event, int flags) return -EINVAL; core_id = smp_processor_id() / threads_per_core; + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; + mtspr(SPRN_LDBAR, ldbar_value); + /* * imc pmus are enabled only when it is used. * See if this is triggered for the first time. @@ -1034,11 +1039,7 @@ static void thread_imc_event_del(struct perf_event *event, int flags) int core_id; struct imc_pmu_ref *ref; - /* -* Take a snapshot and calculate the delta and update -* the event counter values. -*/ - imc_event_update(event); + mtspr(SPRN_LDBAR, 0); core_id = smp_processor_id() / threads_per_core; ref = _imc_refc[core_id]; @@ -1057,6 +1058,11 @@ static void thread_imc_event_del(struct perf_event *event, int flags) ref->refc = 0; } mutex_unlock(>lock); + /* +* Take a snapshot and calculate the delta and update +* the event counter values. +*/ + imc_event_update(event); } /* update_pmu_ops : Populate the appropriate operations for "pmu" */ -- 2.17.1
[PATCH v2 5/5] powerpc/perf: Trace imc PMU functions
Add PMU functions to support trace-imc. Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 175 1 file changed, 175 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 1f09265c8fb0..32ff0e449fca 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1120,6 +1120,173 @@ static int trace_imc_cpu_init(void) ppc_trace_imc_cpu_offline); } +static u64 get_trace_imc_event_base_addr(void) +{ + return (u64)per_cpu(trace_imc_mem, smp_processor_id()); +} + +/* + * Function to parse trace-imc data obtained + * and to prepare the perf sample. + */ +static int trace_imc_prepare_sample(struct trace_imc_data *mem, + struct perf_sample_data *data, + u64 *prev_tb, + struct perf_event_header *header, + struct perf_event *event) +{ + /* Sanity checks for a valid record */ + if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb) + *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1)); + else + return -EINVAL; + + if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) != +be64_to_cpu(READ_ONCE(mem->tb2))) + return -EINVAL; + + /* Prepare perf sample */ + data->ip = be64_to_cpu(READ_ONCE(mem->ip)); + data->period = event->hw.last_period; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header) + event->header_size; + header->misc = 0; + + if (is_kernel_addr(data->ip)) + header->misc |= PERF_RECORD_MISC_KERNEL; + else + header->misc |= PERF_RECORD_MISC_USER; + + perf_event_header__init_id(header, data, event); + + return 0; +} + +static void dump_trace_imc_data(struct perf_event *event) +{ + struct trace_imc_data *mem; + int i, ret; + u64 prev_tb = 0; + + mem = (struct trace_imc_data *)get_trace_imc_event_base_addr(); + for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data)); + i++, mem++) { + struct perf_sample_data data; + struct perf_event_header header; + + ret = trace_imc_prepare_sample(mem, , _tb, , event); + if (ret) /* Exit, if not a valid record */ + break; + else { + /* If this is a valid record, create the sample */ + struct perf_output_handle handle; + + if (perf_output_begin(, event, header.size)) + return; + + perf_output_sample(, , , event); + perf_output_end(); + } + } +} + +static int trace_imc_event_add(struct perf_event *event, int flags) +{ + /* Enable the sched_task to start the engine */ + perf_sched_cb_inc(event->ctx->pmu); + return 0; +} + +static void trace_imc_event_read(struct perf_event *event) +{ + dump_trace_imc_data(event); +} + +static void trace_imc_event_stop(struct perf_event *event, int flags) +{ + trace_imc_event_read(event); +} + +static void trace_imc_event_start(struct perf_event *event, int flags) +{ + return; +} + +static void trace_imc_event_del(struct perf_event *event, int flags) +{ + perf_sched_cb_dec(event->ctx->pmu); +} + +void trace_imc_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + int core_id = smp_processor_id() / threads_per_core; + struct imc_pmu_ref *ref; + u64 local_mem, ldbar_value; + + /* Set trace-imc bit in ldbar and load ldbar with per-thread memory address */ + local_mem = get_trace_imc_event_base_addr(); + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE; + + ref = _imc_refc[core_id]; + if (!ref) + return; + + if (sched_in) { + mtspr(SPRN_LDBAR, ldbar_value); + mutex_lock(>lock); + if (ref->refc == 0) { + if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE, + get_hard_smp_processor_id(smp_processor_id( { + mutex_unlock(>lock); + pr_err("trace-imc: Unable to start the counters for core %d\n", core_id); + mtspr(SPRN_LDBAR, 0); + return; + } + } + ++ref->refc; + mutex_unlock(>lock); + } else { + mtspr(SPRN_LDBAR, 0); + mutex_lock(>lock); + ref->refc--; +
[PATCH v2 3/5] powerpc/perf: Add privileged access check for thread_imc
From: Madhavan Srinivasan Add code to restrict user access to thread_imc pmu since some event report privilege level information. Fixes: f74c89bd80fb3 ('powerpc/perf: Add thread IMC PMU support') Signed-off-by: Madhavan Srinivasan Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 3bef46f8417d..5ca80545a849 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -877,6 +877,9 @@ static int thread_imc_event_init(struct perf_event *event) if (event->attr.type != event->pmu->type) return -ENOENT; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + /* Sampling not supported */ if (event->hw.sample_period) return -EINVAL; -- 2.17.1
[PATCH v2 4/5] powerpc/perf: Trace imc events detection and cpuhotplug
Patch detects trace-imc events, does memory initilizations for each online cpu, and registers cpuhotplug call-backs. Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 91 +++ arch/powerpc/platforms/powernv/opal-imc.c | 3 + include/linux/cpuhotplug.h| 1 + 3 files changed, 95 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 5ca80545a849..1f09265c8fb0 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -43,6 +43,10 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem); static struct imc_pmu *thread_imc_pmu; static int thread_imc_mem_size; +/* Trace IMC data structures */ +static DEFINE_PER_CPU(u64 *, trace_imc_mem); +static int trace_imc_mem_size; + static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -1068,6 +1072,54 @@ static void thread_imc_event_del(struct perf_event *event, int flags) imc_event_update(event); } +/* + * Allocate a page of memory for each cpu, and load LDBAR with 0. + */ +static int trace_imc_mem_alloc(int cpu_id, int size) +{ + u64 *local_mem = per_cpu(trace_imc_mem, cpu_id); + int phys_id = cpu_to_node(cpu_id), rc = 0; + + if (!local_mem) { + local_mem = page_address(alloc_pages_node(phys_id, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | + __GFP_NOWARN, get_order(size))); + if (!local_mem) + return -ENOMEM; + per_cpu(trace_imc_mem, cpu_id) = local_mem; + + /* Initialise the counters for trace mode */ + rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void *)local_mem), + get_hard_smp_processor_id(cpu_id)); + if (rc) { + pr_info("IMC:opal init failed for trace imc\n"); + return rc; + } + } + + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int ppc_trace_imc_cpu_online(unsigned int cpu) +{ + return trace_imc_mem_alloc(cpu, trace_imc_mem_size); +} + +static int ppc_trace_imc_cpu_offline(unsigned int cpu) +{ + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int trace_imc_cpu_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE, + "perf/powerpc/imc_trace:online", + ppc_trace_imc_cpu_online, + ppc_trace_imc_cpu_offline); +} + /* update_pmu_ops : Populate the appropriate operations for "pmu" */ static int update_pmu_ops(struct imc_pmu *pmu) { @@ -1189,6 +1241,17 @@ static void cleanup_all_thread_imc_memory(void) } } +static void cleanup_all_trace_imc_memory(void) +{ + int i, order = get_order(trace_imc_mem_size); + + for_each_online_cpu(i) { + if (per_cpu(trace_imc_mem, i)) + free_pages((u64)per_cpu(trace_imc_mem, i), order); + + } +} + /* Function to free the attr_groups which are dynamically allocated */ static void imc_common_mem_free(struct imc_pmu *pmu_ptr) { @@ -1230,6 +1293,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE); cleanup_all_thread_imc_memory(); } + + if (pmu_ptr->domain == IMC_DOMAIN_TRACE) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE); + cleanup_all_trace_imc_memory(); + } } /* @@ -1312,6 +1380,21 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, thread_imc_pmu = pmu_ptr; break; + case IMC_DOMAIN_TRACE: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + trace_imc_mem_size = pmu_ptr->counter_mem_size; + for_each_online_cpu(cpu) { + res = trace_imc_mem_alloc(cpu, trace_imc_mem_size); + if (res) { + cleanup_all_trace_imc_memory(); + goto err; + } + } + break; default: return -EINVAL; } @@ -1384,6 +1467,14 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id goto err_free_mem; } + break; + case IMC_DOMAIN_TRACE: + ret = trace_imc_cpu_init(); + if (ret) { + cleanup_all_trace_imc_memory(); + goto err_fre
[PATCH v2 1/5] powerpc/include: Add data structures and macros for IMC trace mode
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode and data structure to hold the trace-imc record data. Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since there is a new switch case added in the opal-calls for IMC. Signed-off-by: Anju T Sudhakar --- arch/powerpc/include/asm/imc-pmu.h | 39 + arch/powerpc/include/asm/opal-api.h | 1 + 2 files changed, 40 insertions(+) diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index 69f516ecb2fd..7c2ef0e42661 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -33,6 +33,7 @@ */ #define THREAD_IMC_LDBAR_MASK 0x0003e000ULL #define THREAD_IMC_ENABLE 0x8000ULL +#define TRACE_IMC_ENABLE 0x4000ULL /* * For debugfs interface for imc-mode and imc-command @@ -59,6 +60,34 @@ struct imc_events { char *scale; }; +/* + * Trace IMC hardware updates a 64bytes record on + * Core Performance Monitoring Counter (CPMC) + * overflow. Here is the layout for the trace imc record + * + * DW 0 : Timebase + * DW 1 : Program Counter + * DW 2 : PIDR information + * DW 3 : CPMC1 + * DW 4 : CPMC2 + * DW 5 : CPMC3 + * Dw 6 : CPMC4 + * DW 7 : Timebase + * . + * + * The following is the data structure to hold trace imc data. + */ +struct trace_imc_data { + u64 tb1; + u64 ip; + u64 val; + u64 cpmc1; + u64 cpmc2; + u64 cpmc3; + u64 cpmc4; + u64 tb2; +}; + /* Event attribute array index */ #define IMC_FORMAT_ATTR0 #define IMC_EVENT_ATTR 1 @@ -68,6 +97,13 @@ struct imc_events { /* PMU Format attribute macros */ #define IMC_EVENT_OFFSET_MASK 0xULL +/* + * Macro to mask bits 0:21 of first double word(which is the timebase) to + * compare with 8th double word (timebase) of trace imc record data. + */ +#define IMC_TRACE_RECORD_TB1_MASK 0x3ffULL + + /* * Device tree parser code detects IMC pmu support and * registers new IMC pmus. This structure will hold the @@ -113,6 +149,7 @@ struct imc_pmu_ref { enum { IMC_TYPE_THREAD = 0x1, + IMC_TYPE_TRACE = 0x2, IMC_TYPE_CORE = 0x4, IMC_TYPE_CHIP = 0x10, }; @@ -123,6 +160,8 @@ enum { #define IMC_DOMAIN_NEST1 #define IMC_DOMAIN_CORE2 #define IMC_DOMAIN_THREAD 3 +/* For trace-imc the domain is still thread but it operates in trace-mode */ +#define IMC_DOMAIN_TRACE 4 extern int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id); diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b239ea..a4130b21b159 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -1118,6 +1118,7 @@ enum { enum { OPAL_IMC_COUNTERS_NEST = 1, OPAL_IMC_COUNTERS_CORE = 2, + OPAL_IMC_COUNTERS_TRACE = 3, }; -- 2.17.1
[PATCH v2 0/5] powerpc/perf: IMC trace-mode support
bs.org/pipermail/skiboot/2018-December/012883.html * Set LDBAR spr to enable imc-trace mode. LDBAR Layout: 0 : Enable/Disable 1 : 0 -> Accumulation Mode 1 -> Trace Mode 2:3 : Reserved 4-6 : PB scope 7 : Reserved 8:50 : Counter Address 51:63 : Reserved Key benefit of imc trace-mode is, each sample record contains the address pointer along with other information. So that, we can profile the IP without interrupting the application. Performance data using 'perf top' with and without trace-imc event: When the application is monitored with trace-imc event, we dont take any PMI interrupts. PMI interrupts count when `perf top` command is executed without trac-imc event. # perf top 12.53% [kernel] [k] arch_cpu_idle 11.32% [kernel] [k] rcu_idle_enter 10.76% [kernel] [k] __next_timer_interrupt 9.49% [kernel] [k] find_next_bit 8.06% [kernel] [k] rcu_dynticks_eqs_exit 7.82% [kernel] [k] do_idle 5.71% [kernel] [k] tick_nohz_idle_stop_tic [---] # cat /proc/interrupts (a snippet from the output) 9944 1072804804 1644804 1306 804804804804804804804 804804 1961 1602804804 1258 [-] 803803803803803803803 803803803803804804804 804804804804804804803 803803803803803 1306803 803 Performance monitoring interrupts `perf top` with trace-imc (right after 'perf top' without trace-imc event): # perf top -e trace_imc/trace_cycles/ 12.50% [kernel] [k] arch_cpu_idle 11.81% [kernel] [k] __next_timer_interrupt 11.22% [kernel] [k] rcu_idle_enter 10.25% [kernel] [k] find_next_bit 7.91% [kernel] [k] do_idle 7.69% [kernel] [k] rcu_dynticks_eqs_exit 5.20% [kernel] [k] tick_nohz_idle_stop_tick [---] # cat /proc/interrupts (a snippet from the output) 9944 1072804804 1644804 1306 804804804804804804804 804804 1961 1602804804 1258 [-] 803803803803803803803 803803803804804804804 804804804804804804803 803803803803803 1306803 803 Performance monitoring interrupts The PMI interrupts count remains the same. Changelog: >From v1 -> v2 -- * Added privileged access check for thread-imc and trace-imc Suggestions/comments are welcome. Anju T Sudhakar (4): powerpc/include: Add data structures and macros for IMC trace mode powerpc/perf: Rearrange setting of ldbar for thread-imc powerpc/perf: Trace imc events detection and cpuhotplug powerpc/perf: Trace imc PMU functions Madhavan Srinivasan (1): powerpc/perf: Add privileged access check for thread_imc arch/powerpc
[PATCH 0/4] powerpc/perf: IMC trace-mode support
e 1 : 0 -> Accumulation Mode 1 -> Trace Mode 2:3 : Reserved 4-6 : PB scope 7 : Reserved 8:50 : Counter Address 51:63 : Reserved Key benefit of imc trace-mode is, each sample record contains the address pointer along with other information. So that, we can profile the IP without interrupting the application. Performance data using 'perf top' with and without trace-imc event: When the application is monitored with trace-imc event, we dont take any PMI interrupts. PMI interrupts count when `perf top` command is executed without trac-imc event. # perf top 12.53% [kernel] [k] arch_cpu_idle 11.32% [kernel] [k] rcu_idle_enter 10.76% [kernel] [k] __next_timer_interrupt 9.49% [kernel] [k] find_next_bit 8.06% [kernel] [k] rcu_dynticks_eqs_exit 7.82% [kernel] [k] do_idle 5.71% [kernel] [k] tick_nohz_idle_stop_tic [---] # cat /proc/interrupts (a snippet from the output) 9944 1072804804 1644804 1306 804804804804804804804 804804 1961 1602804804 1258 [-] 803803803803803803803 803803803803804804804 804804804804804804803 803803803803803 1306803 803 Performance monitoring interrupts `perf top` with trace-imc (right after 'perf top' without trace-imc event): # perf top -e trace_imc/trace_cycles/ 12.50% [kernel] [k] arch_cpu_idle 11.81% [kernel] [k] __next_timer_interrupt 11.22% [kernel] [k] rcu_idle_enter 10.25% [kernel] [k] find_next_bit 7.91% [kernel] [k] do_idle 7.69% [kernel] [k] rcu_dynticks_eqs_exit 5.20% [kernel] [k] tick_nohz_idle_stop_tick [---] # cat /proc/interrupts (a snippet from the output) 9944 1072804804 1644804 1306 804804804804804804804 804804 1961 1602804804 1258 [-] 803803803803803803803 803803803804804804804 804804804804804804803 803803803803803 1306803 803 Performance monitoring interrupts The PMI interrupts count remains the same. Anju T Sudhakar (4): powerpc/include: Add data structures and macros for IMC trace mode powerpc/perf: Rearrange setting of ldbar for thread-imc powerpc/perf: Trace imc events detectionand cpuhotplug powerpc/perf: Trace imc PMU functions arch/powerpc/include/asm/imc-pmu.h| 39 +++ arch/powerpc/include/asm/opal-api.h | 1 + arch/powerpc/perf/imc-pmu.c | 291 +- arch/powerpc/platforms/powernv/opal-imc.c | 3 + include/linux/cpuhotplug.h| 1 + 5 files changed, 324 insertions(+), 11 deletions(-) -- 2.17.1
[PATCH 4/4] powerpc/perf: Trace imc PMU functions
Add PMU functions to support trace-imc. Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 172 1 file changed, 172 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index d9ffe7f03f1e..18af7c3e2345 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1117,6 +1117,170 @@ static int trace_imc_cpu_init(void) ppc_trace_imc_cpu_offline); } +static u64 get_trace_imc_event_base_addr(void) +{ + return (u64)per_cpu(trace_imc_mem, smp_processor_id()); +} + +/* + * Function to parse trace-imc data obtained + * and to prepare the perf sample. + */ +static int trace_imc_prepare_sample(struct trace_imc_data *mem, + struct perf_sample_data *data, + u64 *prev_tb, + struct perf_event_header *header, + struct perf_event *event) +{ + /* Sanity checks for a valid record */ + if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb) + *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1)); + else + return -EINVAL; + + if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) != +be64_to_cpu(READ_ONCE(mem->tb2))) + return -EINVAL; + + /* Prepare perf sample */ + data->ip = be64_to_cpu(READ_ONCE(mem->ip)); + data->period = event->hw.last_period; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header) + event->header_size; + header->misc = 0; + + if (is_kernel_addr(data->ip)) + header->misc |= PERF_RECORD_MISC_KERNEL; + else + header->misc |= PERF_RECORD_MISC_USER; + + perf_event_header__init_id(header, data, event); + + return 0; +} + +static void dump_trace_imc_data(struct perf_event *event) +{ + struct trace_imc_data *mem; + int i, ret; + u64 prev_tb = 0; + + mem = (struct trace_imc_data *)get_trace_imc_event_base_addr(); + for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data)); + i++, mem++) { + struct perf_sample_data data; + struct perf_event_header header; + + ret = trace_imc_prepare_sample(mem, , _tb, , event); + if (ret) /* Exit, if not a valid record */ + break; + else { + /* If this is a valid record, create the sample */ + struct perf_output_handle handle; + + if (perf_output_begin(, event, header.size)) + return; + + perf_output_sample(, , , event); + perf_output_end(); + } + } +} + +static int trace_imc_event_add(struct perf_event *event, int flags) +{ + /* Enable the sched_task to start the engine */ + perf_sched_cb_inc(event->ctx->pmu); + return 0; +} + +static void trace_imc_event_read(struct perf_event *event) +{ + dump_trace_imc_data(event); +} + +static void trace_imc_event_stop(struct perf_event *event, int flags) +{ + trace_imc_event_read(event); +} + +static void trace_imc_event_start(struct perf_event *event, int flags) +{ + return; +} + +static void trace_imc_event_del(struct perf_event *event, int flags) +{ + perf_sched_cb_dec(event->ctx->pmu); +} + +void trace_imc_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + int core_id = smp_processor_id() / threads_per_core; + struct imc_pmu_ref *ref; + u64 local_mem, ldbar_value; + + /* Set trace-imc bit in ldbar and load ldbar with per-thread memory address */ + local_mem = get_trace_imc_event_base_addr(); + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE; + + ref = _imc_refc[core_id]; + if (!ref) + return; + + if (sched_in) { + mtspr(SPRN_LDBAR, ldbar_value); + mutex_lock(>lock); + if (ref->refc == 0) { + if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE, + get_hard_smp_processor_id(smp_processor_id( { + mutex_unlock(>lock); + pr_err("trace-imc: Unable to start the counters for core %d\n", core_id); + mtspr(SPRN_LDBAR, 0); + return; + } + } + ++ref->refc; + mutex_unlock(>lock); + } else { + mtspr(SPRN_LDBAR, 0); + mutex_lock(>lock); + ref->refc--; +
[PATCH 3/4] powerpc/perf: Trace imc events detection and cpuhotplug
Patch detects trace-imc events, does memory initilizations for each online cpu, and registers cpuhotplug call-backs. Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 91 +++ arch/powerpc/platforms/powernv/opal-imc.c | 3 + include/linux/cpuhotplug.h| 1 + 3 files changed, 95 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 3bef46f8417d..d9ffe7f03f1e 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -43,6 +43,10 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem); static struct imc_pmu *thread_imc_pmu; static int thread_imc_mem_size; +/* Trace IMC data structures */ +static DEFINE_PER_CPU(u64 *, trace_imc_mem); +static int trace_imc_mem_size; + static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -1065,6 +1069,54 @@ static void thread_imc_event_del(struct perf_event *event, int flags) imc_event_update(event); } +/* + * Allocate a page of memory for each cpu, and load LDBAR with 0. + */ +static int trace_imc_mem_alloc(int cpu_id, int size) +{ + u64 *local_mem = per_cpu(trace_imc_mem, cpu_id); + int phys_id = cpu_to_node(cpu_id), rc = 0; + + if (!local_mem) { + local_mem = page_address(alloc_pages_node(phys_id, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | + __GFP_NOWARN, get_order(size))); + if (!local_mem) + return -ENOMEM; + per_cpu(trace_imc_mem, cpu_id) = local_mem; + + /* Initialise the counters for trace mode */ + rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void *)local_mem), + get_hard_smp_processor_id(cpu_id)); + if (rc) { + pr_info("IMC:opal init failed for trace imc\n"); + return rc; + } + } + + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int ppc_trace_imc_cpu_online(unsigned int cpu) +{ + return trace_imc_mem_alloc(cpu, trace_imc_mem_size); +} + +static int ppc_trace_imc_cpu_offline(unsigned int cpu) +{ + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int trace_imc_cpu_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE, + "perf/powerpc/imc_trace:online", + ppc_trace_imc_cpu_online, + ppc_trace_imc_cpu_offline); +} + /* update_pmu_ops : Populate the appropriate operations for "pmu" */ static int update_pmu_ops(struct imc_pmu *pmu) { @@ -1186,6 +1238,17 @@ static void cleanup_all_thread_imc_memory(void) } } +static void cleanup_all_trace_imc_memory(void) +{ + int i, order = get_order(trace_imc_mem_size); + + for_each_online_cpu(i) { + if (per_cpu(trace_imc_mem, i)) + free_pages((u64)per_cpu(trace_imc_mem, i), order); + + } +} + /* Function to free the attr_groups which are dynamically allocated */ static void imc_common_mem_free(struct imc_pmu *pmu_ptr) { @@ -1227,6 +1290,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE); cleanup_all_thread_imc_memory(); } + + if (pmu_ptr->domain == IMC_DOMAIN_TRACE) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE); + cleanup_all_trace_imc_memory(); + } } /* @@ -1309,6 +1377,21 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, thread_imc_pmu = pmu_ptr; break; + case IMC_DOMAIN_TRACE: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + trace_imc_mem_size = pmu_ptr->counter_mem_size; + for_each_online_cpu(cpu) { + res = trace_imc_mem_alloc(cpu, trace_imc_mem_size); + if (res) { + cleanup_all_trace_imc_memory(); + goto err; + } + } + break; default: return -EINVAL; } @@ -1381,6 +1464,14 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id goto err_free_mem; } + break; + case IMC_DOMAIN_TRACE: + ret = trace_imc_cpu_init(); + if (ret) { + cleanup_all_trace_imc_memory(); + goto err_fre
[PATCH 1/4] powerpc/include: Add data structures and macros for IMC trace mode
Add the macros needed for IMC (In-Memory Collection Counters) trace-mode and data structure to hold the trace-imc record data. Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since there is a new switch case added in the opal-calls for IMC. Signed-off-by: Anju T Sudhakar --- arch/powerpc/include/asm/imc-pmu.h | 39 + arch/powerpc/include/asm/opal-api.h | 1 + 2 files changed, 40 insertions(+) diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index 69f516ecb2fd..7c2ef0e42661 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -33,6 +33,7 @@ */ #define THREAD_IMC_LDBAR_MASK 0x0003e000ULL #define THREAD_IMC_ENABLE 0x8000ULL +#define TRACE_IMC_ENABLE 0x4000ULL /* * For debugfs interface for imc-mode and imc-command @@ -59,6 +60,34 @@ struct imc_events { char *scale; }; +/* + * Trace IMC hardware updates a 64bytes record on + * Core Performance Monitoring Counter (CPMC) + * overflow. Here is the layout for the trace imc record + * + * DW 0 : Timebase + * DW 1 : Program Counter + * DW 2 : PIDR information + * DW 3 : CPMC1 + * DW 4 : CPMC2 + * DW 5 : CPMC3 + * Dw 6 : CPMC4 + * DW 7 : Timebase + * . + * + * The following is the data structure to hold trace imc data. + */ +struct trace_imc_data { + u64 tb1; + u64 ip; + u64 val; + u64 cpmc1; + u64 cpmc2; + u64 cpmc3; + u64 cpmc4; + u64 tb2; +}; + /* Event attribute array index */ #define IMC_FORMAT_ATTR0 #define IMC_EVENT_ATTR 1 @@ -68,6 +97,13 @@ struct imc_events { /* PMU Format attribute macros */ #define IMC_EVENT_OFFSET_MASK 0xULL +/* + * Macro to mask bits 0:21 of first double word(which is the timebase) to + * compare with 8th double word (timebase) of trace imc record data. + */ +#define IMC_TRACE_RECORD_TB1_MASK 0x3ffULL + + /* * Device tree parser code detects IMC pmu support and * registers new IMC pmus. This structure will hold the @@ -113,6 +149,7 @@ struct imc_pmu_ref { enum { IMC_TYPE_THREAD = 0x1, + IMC_TYPE_TRACE = 0x2, IMC_TYPE_CORE = 0x4, IMC_TYPE_CHIP = 0x10, }; @@ -123,6 +160,8 @@ enum { #define IMC_DOMAIN_NEST1 #define IMC_DOMAIN_CORE2 #define IMC_DOMAIN_THREAD 3 +/* For trace-imc the domain is still thread but it operates in trace-mode */ +#define IMC_DOMAIN_TRACE 4 extern int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id); diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b239ea..a4130b21b159 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -1118,6 +1118,7 @@ enum { enum { OPAL_IMC_COUNTERS_NEST = 1, OPAL_IMC_COUNTERS_CORE = 2, + OPAL_IMC_COUNTERS_TRACE = 3, }; -- 2.17.1
[PATCH 2/4] powerpc/perf: Rearrange setting of ldbar for thread-imc
LDBAR holds the memory address allocated for each cpu. For thread-imc the mode bit (i.e bit 1) of LDBAR is set to accumulation. Currently, ldbar is loaded with per cpu memory address and mode set to accumulation at boot time. To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So to accommodate trace-mode of IMC, reposition setting of ldbar for thread-imc to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del(). Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 28 +--- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index f292a3f284f1..3bef46f8417d 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -806,8 +806,11 @@ static int core_imc_event_init(struct perf_event *event) } /* - * Allocates a page of memory for each of the online cpus, and write the - * physical base address of that page to the LDBAR for that cpu. + * Allocates a page of memory for each of the online cpus, and load + * LDBAR with 0. + * The physical base address of the page allocated for a cpu will be + * written to the LDBAR for that cpu, when the thread-imc event + * is added. * * LDBAR Register Layout: * @@ -825,7 +828,7 @@ static int core_imc_event_init(struct perf_event *event) */ static int thread_imc_mem_alloc(int cpu_id, int size) { - u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id); + u64 *local_mem = per_cpu(thread_imc_mem, cpu_id); int nid = cpu_to_node(cpu_id); if (!local_mem) { @@ -842,9 +845,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size) per_cpu(thread_imc_mem, cpu_id) = local_mem; } - ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; - - mtspr(SPRN_LDBAR, ldbar_value); + mtspr(SPRN_LDBAR, 0); return 0; } @@ -995,6 +996,7 @@ static int thread_imc_event_add(struct perf_event *event, int flags) { int core_id; struct imc_pmu_ref *ref; + u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, smp_processor_id()); if (flags & PERF_EF_START) imc_event_start(event, flags); @@ -1003,6 +1005,9 @@ static int thread_imc_event_add(struct perf_event *event, int flags) return -EINVAL; core_id = smp_processor_id() / threads_per_core; + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; + mtspr(SPRN_LDBAR, ldbar_value); + /* * imc pmus are enabled only when it is used. * See if this is triggered for the first time. @@ -1034,11 +1039,7 @@ static void thread_imc_event_del(struct perf_event *event, int flags) int core_id; struct imc_pmu_ref *ref; - /* -* Take a snapshot and calculate the delta and update -* the event counter values. -*/ - imc_event_update(event); + mtspr(SPRN_LDBAR, 0); core_id = smp_processor_id() / threads_per_core; ref = _imc_refc[core_id]; @@ -1057,6 +1058,11 @@ static void thread_imc_event_del(struct perf_event *event, int flags) ref->refc = 0; } mutex_unlock(>lock); + /* +* Take a snapshot and calculate the delta and update +* the event counter values. +*/ + imc_event_update(event); } /* update_pmu_ops : Populate the appropriate operations for "pmu" */ -- 2.17.1
[PATCH] powerpc/perf: Return accordingly on invalid chip-id in
Nest hardware counter memory resides in a per-chip reserve-memory. During nest_imc_event_init(), chip-id of the event-cpu is considered to calculate the base memory addresss for that cpu. Return, proper error condition if the chip_id calculated is invalid. Reported-by: Dan Carpenter Fixes: 885dcd709ba91 ("powerpc/perf: Add nest IMC PMU support") Reviewed-by: Madhavan Srinivasan Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 6954636b16d1..78514170cf71 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -496,6 +496,11 @@ static int nest_imc_event_init(struct perf_event *event) * Get the base memory addresss for this cpu. */ chip_id = cpu_to_chip_id(event->cpu); + + /* Return, if chip_id is not valid */ + if (chip_id < 0) + return -ENODEV; + pcni = pmu->mem_info; do { if (pcni->id == chip_id) { -- 2.17.1
[PATCH] powerpc/perf: Fix loop exit condition in nest_imc_event_init
The data structure (i.e struct imc_mem_info) to hold the memory address information for nest imc units is allocated based on the number of nodes in the system. nest_imc_event_init() traverse this struct array to calculate the memory base address for the event-cpu. If we fail to find a match for the event cpu's chip-id in imc_mem_info struct array, then the do-while loop will iterate until we crash. Fix this by changing the loop exit condition based on the number of nodes in the system. Reported-by: Dan Carpenter Fixes: 885dcd709ba91 ( powerpc/perf: Add nest IMC PMU support) Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 78514170cf71..e9dc771f3e3d 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -459,7 +459,7 @@ static void nest_imc_counters_release(struct perf_event *event) static int nest_imc_event_init(struct perf_event *event) { - int chip_id, rc, node_id; + int chip_id, rc, node_id, nr_chips = num_possible_nodes(); u32 l_config, config = event->attr.config; struct imc_mem_info *pcni; struct imc_pmu *pmu; @@ -508,7 +508,7 @@ static int nest_imc_event_init(struct perf_event *event) break; } pcni++; - } while (pcni); + } while (--nr_chips); if (!flag) return -ENODEV; -- 2.17.1
Re: [bug report] powerpc/perf: Add nest IMC PMU support
Hi, On 10/18/18 3:03 PM, Dan Carpenter wrote: Hello Anju T Sudhakar, The patch 885dcd709ba9: "powerpc/perf: Add nest IMC PMU support" from Jul 19, 2017, leads to the following static checker warning: arch/powerpc/perf/imc-pmu.c:506 nest_imc_event_init() warn: 'pcni' can't be NULL. Unfortunately this warning didn't appear when I checked with smatch. Could you please provide the steps to reproduce this? This is the commit id with which I build smatch: commit 79fe36620a7a3a45d1a51d62238da250fb8db920 But anyway I am looking into the code part. Thanks for mentioning this. I will update soon. Thanks, Anju
[PATCH v2 4/4] powerpc/perf: Unregister thread-imc if core-imc not supported
Since thread-imc internally use the core-imc hardware infrastructure and is depended on it, having thread-imc in the kernel in the absence of core-imc is trivial. Patch disables thread-imc, if core-imc is not registered. Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> Reviewed-by: Madhavan Srinivasan <ma...@linux.vnet.ibm.com> --- arch/powerpc/include/asm/imc-pmu.h| 1 + arch/powerpc/perf/imc-pmu.c | 12 arch/powerpc/platforms/powernv/opal-imc.c | 9 + 3 files changed, 22 insertions(+) diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index d76cb11..69f516e 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -128,4 +128,5 @@ extern int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id); extern void thread_imc_disable(void); extern int get_max_nest_dev(void); +extern void unregister_thread_imc(void); #endif /* __ASM_POWERPC_IMC_PMU_H */ diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index f563d30..d1977b6 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -40,6 +40,7 @@ static struct imc_pmu *core_imc_pmu; /* Thread IMC data structures and variables */ static DEFINE_PER_CPU(u64 *, thread_imc_mem); +static struct imc_pmu *thread_imc_pmu; static int thread_imc_mem_size; struct imc_pmu *imc_event_to_pmu(struct perf_event *event) @@ -1228,6 +1229,16 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) } } +/* + * Function to unregister thread-imc if core-imc + * is not registered. + */ +void unregister_thread_imc(void) +{ + imc_common_cpuhp_mem_free(thread_imc_pmu); + imc_common_mem_free(thread_imc_pmu); + perf_pmu_unregister(_imc_pmu->pmu); +} /* * imc_mem_init : Function to support memory allocation for core imc. @@ -1296,6 +1307,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, } } + thread_imc_pmu = pmu_ptr; break; default: return -EINVAL; diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 490bb72..58a0794 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -255,6 +255,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev) { struct device_node *imc_dev = pdev->dev.of_node; int pmu_count = 0, domain; + bool core_imc_reg = false, thread_imc_reg = false; u32 type; /* @@ -292,6 +293,10 @@ static int opal_imc_counters_probe(struct platform_device *pdev) if (!imc_pmu_create(imc_dev, pmu_count, domain)) { if (domain == IMC_DOMAIN_NEST) pmu_count++; + if (domain == IMC_DOMAIN_CORE) + core_imc_reg = true; + if (domain == IMC_DOMAIN_THREAD) + thread_imc_reg = true; } } @@ -299,6 +304,10 @@ static int opal_imc_counters_probe(struct platform_device *pdev) if (pmu_count == 0) debugfs_remove_recursive(imc_debugfs_parent); + /* If core imc is not registered, unregister thread-imc */ + if (!core_imc_reg && thread_imc_reg) + unregister_thread_imc(); + return 0; } -- 2.7.4
[PATCH v2 3/4] powerpc/perf: Return appropriate value for unknown domain
Return proper error code for unknown domain during IMC initialization. Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> Reviewed-by: Madhavan Srinivasan <ma...@linux.vnet.ibm.com> --- arch/powerpc/perf/imc-pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 61000d1..f563d30 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1371,7 +1371,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id break; default: - return -1; /* Unknown domain */ + return -EINVAL;/* Unknown domain */ } ret = update_events_in_group(parent, pmu_ptr); -- 2.7.4
[PATCH v2 2/4] powerpc/perf: Replace the direct return with goto statement
Replace the direct return statement in imc_mem_init() with goto, to adhere to the kernel coding style. Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> Reviewed-by: Madhavan Srinivasan <ma...@linux.vnet.ibm.com> --- arch/powerpc/perf/imc-pmu.c | 18 ++ 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index c1665ff..61000d1 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1236,7 +1236,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, int pmu_index) { const char *s; - int nr_cores, cpu, res; + int nr_cores, cpu, res = -ENOMEM; if (of_property_read_string(parent, "name", )) return -ENODEV; @@ -1246,7 +1246,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, /* Update the pmu name */ pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s_imc", "nest_", s); if (!pmu_ptr->pmu.name) - return -ENOMEM; + goto err; /* Needed for hotplug/migration */ if (!per_nest_pmu_arr) { @@ -1254,7 +1254,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, sizeof(struct imc_pmu *), GFP_KERNEL); if (!per_nest_pmu_arr) - return -ENOMEM; + goto err; } per_nest_pmu_arr[pmu_index] = pmu_ptr; break; @@ -1262,21 +1262,21 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, /* Update the pmu name */ pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); if (!pmu_ptr->pmu.name) - return -ENOMEM; + goto err; nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info), GFP_KERNEL); if (!pmu_ptr->mem_info) - return -ENOMEM; + goto err; core_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref), GFP_KERNEL); if (!core_imc_refc) { kfree(pmu_ptr->mem_info); - return -ENOMEM; + goto err; } core_imc_pmu = pmu_ptr; @@ -1285,14 +1285,14 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, /* Update the pmu name */ pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); if (!pmu_ptr->pmu.name) - return -ENOMEM; + goto err; thread_imc_mem_size = pmu_ptr->counter_mem_size; for_each_online_cpu(cpu) { res = thread_imc_mem_alloc(cpu, pmu_ptr->counter_mem_size); if (res) { cleanup_all_thread_imc_memory(); - return res; + goto err; } } @@ -1302,6 +1302,8 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, } return 0; +err: + return res; } /* -- 2.7.4
[PATCH v2 1/4] powerpc/perf: Rearrange memory freeing in imc init
When any of the IMC (In-Memory Collection counter) devices fail to initialize, imc_common_mem_free() frees set of memory. In doing so, pmu_ptr pointer is also freed. But pmu_ptr pointer is used in subsequent function (imc_common_cpuhp_mem_free()) which is wrong. Patch here reorders the code to avoid such access. Also free the memory which is dynamically allocated during imc initialization, wherever required. Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> Reviewed-by: Madhavan Srinivasan <ma...@linux.vnet.ibm.com> --- arch/powerpc/perf/imc-pmu.c | 32 --- arch/powerpc/platforms/powernv/opal-imc.c | 13 ++--- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 75fb23c..c1665ff 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1153,7 +1153,7 @@ static void cleanup_all_core_imc_memory(void) /* mem_info will never be NULL */ for (i = 0; i < nr_cores; i++) { if (ptr[i].vbase) - free_pages((u64)ptr->vbase, get_order(size)); + free_pages((u64)ptr[i].vbase, get_order(size)); } kfree(ptr); @@ -1191,7 +1191,6 @@ static void imc_common_mem_free(struct imc_pmu *pmu_ptr) if (pmu_ptr->attr_groups[IMC_EVENT_ATTR]) kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs); kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]); - kfree(pmu_ptr); } /* @@ -1208,6 +1207,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE); kfree(nest_imc_refc); kfree(per_nest_pmu_arr); + per_nest_pmu_arr = NULL; } if (nest_pmus > 0) @@ -1319,10 +1319,8 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id int ret; ret = imc_mem_init(pmu_ptr, parent, pmu_idx); - if (ret) { - imc_common_mem_free(pmu_ptr); - return ret; - } + if (ret) + goto err_free_mem; switch (pmu_ptr->domain) { case IMC_DOMAIN_NEST: @@ -1337,7 +1335,9 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id ret = init_nest_pmu_ref(); if (ret) { mutex_unlock(_init_lock); - goto err_free; + kfree(per_nest_pmu_arr); + per_nest_pmu_arr = NULL; + goto err_free_mem; } /* Register for cpu hotplug notification. */ ret = nest_pmu_cpumask_init(); @@ -1345,7 +1345,8 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id mutex_unlock(_init_lock); kfree(nest_imc_refc); kfree(per_nest_pmu_arr); - goto err_free; + per_nest_pmu_arr = NULL; + goto err_free_mem; } } nest_pmus++; @@ -1355,7 +1356,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id ret = core_imc_pmu_cpumask_init(); if (ret) { cleanup_all_core_imc_memory(); - return ret; + goto err_free_mem; } break; @@ -1363,7 +1364,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id ret = thread_imc_cpu_init(); if (ret) { cleanup_all_thread_imc_memory(); - return ret; + goto err_free_mem; } break; @@ -1373,23 +1374,24 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id ret = update_events_in_group(parent, pmu_ptr); if (ret) - goto err_free; + goto err_free_cpuhp_mem; ret = update_pmu_ops(pmu_ptr); if (ret) - goto err_free; + goto err_free_cpuhp_mem; ret = perf_pmu_register(_ptr->pmu, pmu_ptr->pmu.name, -1); if (ret) - goto err_free; + goto err_free_cpuhp_mem; pr_info("%s performance monitor hardware support registered\n", pmu_ptr->pmu.name); return 0; -err_free: - imc_common_mem_free(pmu_ptr); +err_free_cpuhp_mem:
[PATCH v2 0/4] powerpc/perf: IMC Cleanups
This patch series includes some cleanups and Unregistration of thread-imc pmu, if the kernel does not have core-imc registered. The entire patch set has been verified using the static checker smatch. Command used: $ make ARCH=powerpc CHECK="/smatch -p=kernel" C=1 vmlinux | tee warns.txt Tests Done: * Fail core-imc at init: nest-imc - working cpuhotplug - works as expected thread-imc - not registered * Fail thread-imc at init: nest-imc - works core-imc - works cpuhotplug - works * Fail nest-imc at init core-imc - works thread-imc -works cpuhotplug - works * Fail only one nest unit (say for mcs23) Other nest-units - works core-imc - works thread-imc - works cpuhotplug - works. * Kexec works The first three patches in this series addresses the comments by Dan Carpenter. Patch series is based on: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git (merge branch) Anju T Sudhakar (4): powerpc/perf: Rearrange memory freeing in imc init powerpc/perf: Replace the direct return with goto statement powerpc/perf: Return appropriate value for unknown domain powerpc/perf: Unregister thread-imc if core-imc not supported arch/powerpc/include/asm/imc-pmu.h| 1 + arch/powerpc/perf/imc-pmu.c | 64 +++ arch/powerpc/platforms/powernv/opal-imc.c | 22 +-- 3 files changed, 60 insertions(+), 27 deletions(-) -- 2.7.4
[PATCH] powerpc/perf: Remove sched_task function defined for thread-imc
Call trace observed while running perf-fuzzer: [ 329.228068] CPU: 43 PID: 9088 Comm: perf_fuzzer Not tainted 4.13.0-32-generic #35~lp1746225 [ 329.228070] task: c03f776ac900 task.stack: c03f77728000 [ 329.228071] NIP: c0299b70 LR: c02a4534 CTR: c029bb80 [ 329.228073] REGS: c03f7772b760 TRAP: 0700 Not tainted (4.13.0-32-generic) [ 329.228073] MSR: 9282b033 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> [ 329.228079] CR: 24008822 XER: [ 329.228080] CFAR: c0299a70 SOFTE: 0 GPR00: c02a4534 c03f7772b9e0 c1606200 c03fef858908 GPR04: c03f776ac900 0001 003fee73 GPR08: c11220d8 0002 GPR12: c029bb80 c7a3d900 GPR16: GPR20: c03f776ad090 c0c71354 GPR24: c03fef716780 003fee73 c03fe69d4200 c03f776ad330 GPR28: c11220d8 0001 c14c6108 c03fef858900 [ 329.228098] NIP [c0299b70] perf_pmu_sched_task+0x170/0x180 [ 329.228100] LR [c02a4534] __perf_event_task_sched_in+0xc4/0x230 [ 329.228101] Call Trace: [ 329.228102] [c03f7772b9e0] [c02a0678] perf_iterate_sb+0x158/0x2a0 (unreliable) [ 329.228105] [c03f7772ba30] [c02a4534] __perf_event_task_sched_in+0xc4/0x230 [ 329.228107] [c03f7772bab0] [c01396dc] finish_task_switch+0x21c/0x310 [ 329.228109] [c03f7772bb60] [c0c71354] __schedule+0x304/0xb80 [ 329.228111] [c03f7772bc40] [c0c71c10] schedule+0x40/0xc0 [ 329.228113] [c03f7772bc60] [c01033f4] do_wait+0x254/0x2e0 [ 329.228115] [c03f7772bcd0] [c0104ac0] kernel_wait4+0xa0/0x1a0 [ 329.228117] [c03f7772bd70] [c0104c24] SyS_wait4+0x64/0xc0 [ 329.228121] [c03f7772be30] [c000b184] system_call+0x58/0x6c [ 329.228121] Instruction dump: [ 329.228123] 3beafea0 7faa4800 409eff18 e8010060 eb610028 ebc10040 7c0803a6 38210050 [ 329.228127] eb81ffe0 eba1ffe8 ebe1fff8 4e800020 <0fe0> 4bbc 6000 6042 [ 329.228131] ---[ end trace 8c46856d314c1811 ]--- [ 375.755943] hrtimer: interrupt took 31601 ns The context switch call-backs for thread-imc are defined in sched_task function. So when thread-imc events are grouped with software pmu events, perf_pmu_sched_task hits the WARN_ON_ONCE condition, since software PMUs are assumed not to have a sched_task defined. Patch to move the thread_imc enable/disable opal call back from sched_task to event_[add/del] function Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> --- arch/powerpc/perf/imc-pmu.c | 108 +--- 1 file changed, 51 insertions(+), 57 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index d7532e7..71d9ba7 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -866,59 +866,6 @@ static int thread_imc_cpu_init(void) ppc_thread_imc_cpu_offline); } -void thread_imc_pmu_sched_task(struct perf_event_context *ctx, - bool sched_in) -{ - int core_id; - struct imc_pmu_ref *ref; - - if (!is_core_imc_mem_inited(smp_processor_id())) - return; - - core_id = smp_processor_id() / threads_per_core; - /* -* imc pmus are enabled only when it is used. -* See if this is triggered for the first time. -* If yes, take the mutex lock and enable the counters. -* If not, just increment the count in ref count struct. -*/ - ref = _imc_refc[core_id]; - if (!ref) - return; - - if (sched_in) { - mutex_lock(>lock); - if (ref->refc == 0) { - if (opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE, -get_hard_smp_processor_id(smp_processor_id( { - mutex_unlock(>lock); - pr_err("thread-imc: Unable to start the counter\ - for core %d\n", core_id); - return; - } - } - ++ref->refc; - mutex_unlock(>lock); - } else { - mutex_lock(>lock); - ref->refc--; - if (ref->refc == 0) { - if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, - get_hard_smp_processor_id(smp_processor_id( { - mutex_unlock(>lock); - pr_err("thread-imc: Unable to stop the counters\ - for c
Re: [PATCH v2] powerpc/perf: Fix memory allocation for core-imc based on num_possible_cpus()
On Wednesday 16 May 2018 12:18 PM, ppaidipe wrote: On 2018-05-16 12:05, Anju T Sudhakar wrote: Currently memory is allocated for core-imc based on cpu_present_mask, which has bit 'cpu' set iff cpu is populated. We use (cpu number / threads per core) as the array index to access the memory. Under some circumstances firmware marks a CPU as GUARDed CPU and boot the system, until cleared of errors, these CPU's are unavailable for all subsequent boots. GUARDed CPUs are possible but not present from linux view, so it blows a hole when we assume the max length of our allocation is driven by our max present cpus, where as one of the cpus might be online and be beyond the max present cpus, due to the hole. So (cpu number / threads per core) value bounds the array index and leads to memory overflow. Call trace observed during a guard test: Faulting instruction address: 0xc0149f1c cpu 0x69: Vector: 380 (Data Access Out of Range) at [c03fea303420] pc:c0149f1c: prefetch_freepointer+0x14/0x30 lr:c014e0f8: __kmalloc+0x1a8/0x1ac sp:c03fea3036a0 msr:90009033 dar:c9c54b2c91dbf6b7 current = 0xc03fea2c paca = 0xcfddd880 softe: 3 irq_happened: 0x01 pid = 1, comm = swapper/104 Linux version 4.16.7-openpower1 (smc@smc-desktop) (gcc version 6.4.0 (Buildroot 2018.02.1-6-ga8d1126)) #2 SMP Fri May 4 16:44:54 PDT 2018 enter ? for help call trace: __kmalloc+0x1a8/0x1ac (unreliable) init_imc_pmu+0x7f4/0xbf0 opal_imc_counters_probe+0x3fc/0x43c platform_drv_probe+0x48/0x80 driver_probe_device+0x22c/0x308 __driver_attach+0xa0/0xd8 bus_for_each_dev+0x88/0xb4 driver_attach+0x2c/0x40 bus_add_driver+0x1e8/0x228 driver_register+0xd0/0x114 __platform_driver_register+0x50/0x64 opal_imc_driver_init+0x24/0x38 do_one_initcall+0x150/0x15c kernel_init_freeable+0x250/0x254 kernel_init+0x1c/0x150 ret_from_kernel_thread+0x5c/0xc8 Allocating memory for core-imc based on cpu_possible_mask, which has bit 'cpu' set iff cpu is populatable, will fix this issue. Reported-by: Pridhiviraj Paidipeddi <ppaid...@linux.vnet.ibm.com> Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> Reviewed-by: Balbir Singh <bsinghar...@gmail.com> I have verified this fix with both normal and kexec boot multiple times when system is having GARDed cores. Not seen any crash/memory corruption issues with this. Tested-by: Pridhiviraj Paidipeddi <ppaid...@linux.vnet.ibm.com> Thanks. -Anju --- arch/powerpc/perf/imc-pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index d7532e7..75fb23c 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1146,7 +1146,7 @@ static int init_nest_pmu_ref(void) static void cleanup_all_core_imc_memory(void) { - int i, nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core); + int i, nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); struct imc_mem_info *ptr = core_imc_pmu->mem_info; int size = core_imc_pmu->counter_mem_size; @@ -1264,7 +1264,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, if (!pmu_ptr->pmu.name) return -ENOMEM; - nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core); + nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info), GFP_KERNEL);
[PATCH v2] powerpc/perf: Fix memory allocation for core-imc based on num_possible_cpus()
Currently memory is allocated for core-imc based on cpu_present_mask, which has bit 'cpu' set iff cpu is populated. We use (cpu number / threads per core) as the array index to access the memory. Under some circumstances firmware marks a CPU as GUARDed CPU and boot the system, until cleared of errors, these CPU's are unavailable for all subsequent boots. GUARDed CPUs are possible but not present from linux view, so it blows a hole when we assume the max length of our allocation is driven by our max present cpus, where as one of the cpus might be online and be beyond the max present cpus, due to the hole. So (cpu number / threads per core) value bounds the array index and leads to memory overflow. Call trace observed during a guard test: Faulting instruction address: 0xc0149f1c cpu 0x69: Vector: 380 (Data Access Out of Range) at [c03fea303420] pc:c0149f1c: prefetch_freepointer+0x14/0x30 lr:c014e0f8: __kmalloc+0x1a8/0x1ac sp:c03fea3036a0 msr:90009033 dar:c9c54b2c91dbf6b7 current = 0xc03fea2c paca= 0xcfddd880 softe: 3irq_happened: 0x01 pid = 1, comm = swapper/104 Linux version 4.16.7-openpower1 (smc@smc-desktop) (gcc version 6.4.0 (Buildroot 2018.02.1-6-ga8d1126)) #2 SMP Fri May 4 16:44:54 PDT 2018 enter ? for help call trace: __kmalloc+0x1a8/0x1ac (unreliable) init_imc_pmu+0x7f4/0xbf0 opal_imc_counters_probe+0x3fc/0x43c platform_drv_probe+0x48/0x80 driver_probe_device+0x22c/0x308 __driver_attach+0xa0/0xd8 bus_for_each_dev+0x88/0xb4 driver_attach+0x2c/0x40 bus_add_driver+0x1e8/0x228 driver_register+0xd0/0x114 __platform_driver_register+0x50/0x64 opal_imc_driver_init+0x24/0x38 do_one_initcall+0x150/0x15c kernel_init_freeable+0x250/0x254 kernel_init+0x1c/0x150 ret_from_kernel_thread+0x5c/0xc8 Allocating memory for core-imc based on cpu_possible_mask, which has bit 'cpu' set iff cpu is populatable, will fix this issue. Reported-by: Pridhiviraj Paidipeddi <ppaid...@linux.vnet.ibm.com> Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> Reviewed-by: Balbir Singh <bsinghar...@gmail.com> --- arch/powerpc/perf/imc-pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index d7532e7..75fb23c 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1146,7 +1146,7 @@ static int init_nest_pmu_ref(void) static void cleanup_all_core_imc_memory(void) { - int i, nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core); + int i, nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); struct imc_mem_info *ptr = core_imc_pmu->mem_info; int size = core_imc_pmu->counter_mem_size; @@ -1264,7 +1264,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, if (!pmu_ptr->pmu.name) return -ENOMEM; - nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core); + nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info), GFP_KERNEL); -- 2.7.4
Re: [PATCH] powerpc/perf: Fix memory allocation for core-imc based on num_possible_cpus()
On Friday 11 May 2018 07:13 PM, Anju T Sudhakar wrote: Currently memory is allocated for core-imc based on cpu_present_mask, which has bit 'cpu' set iff cpu is populated. We use (cpu number / threads per core) as as array index to access the memory. So in a system with guarded cores, since allocation happens based on cpu_present_mask, (cpu number / threads per core) bounds the index and leads to memory overflow. The issue is exposed in a guard test. The guard test will make some CPU's as un-available to the system during boot time as well as at runtime. So when the cpu is unavailable to the system during boot time, the memory allocation happens depending on the number of available cpus. And when we access the memory using (cpu number / threads per core) as the index the system crashes due to memory overflow. Allocating memory for core-imc based on cpu_possible_mask, which has bit 'cpu' set iff cpu is populatable, will fix this issue. Reported-by: Pridhiviraj Paidipeddi <ppaid...@linux.vnet.ibm.com> Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> Cc: <sta...@vger.kernel.org> # v4.14 + --- arch/powerpc/perf/imc-pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index d7532e7..75fb23c 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1146,7 +1146,7 @@ static int init_nest_pmu_ref(void) static void cleanup_all_core_imc_memory(void) { - int i, nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core); + int i, nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); struct imc_mem_info *ptr = core_imc_pmu->mem_info; int size = core_imc_pmu->counter_mem_size; @@ -1264,7 +1264,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, if (!pmu_ptr->pmu.name) return -ENOMEM; - nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core); + nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info), GFP_KERNEL);
Re: [PATCH] powerpc/perf: Fix memory allocation for core-imc based on num_possible_cpus()
Hi, On Saturday 12 May 2018 06:05 AM, Balbir Singh wrote: On Fri, May 11, 2018 at 11:43 PM, Anju T Sudhakar <a...@linux.vnet.ibm.com> wrote: Currently memory is allocated for core-imc based on cpu_present_mask, which has bit 'cpu' set iff cpu is populated. We use (cpu number / threads per core) as as array index to access the memory. So in a system with guarded cores, since allocation happens based on cpu_present_mask, (cpu number / threads per core) bounds the index and leads to memory overflow. The issue is exposed in a guard test. The guard test will make some CPU's as un-available to the system during boot time as well as at runtime. So when the cpu is unavailable to the system during boot time, the memory allocation happens depending on the number of available cpus. And when we access the memory using (cpu number / threads per core) as the index the system crashes due to memory overflow. Allocating memory for core-imc based on cpu_possible_mask, which has bit 'cpu' set iff cpu is populatable, will fix this issue. Reported-by: Pridhiviraj Paidipeddi <ppaid...@linux.vnet.ibm.com> Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> --- arch/powerpc/perf/imc-pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) The changelog does not clearly call out the confusion between present and possible. Guarded CPUs are possible but not present, so it blows a hole when we assume the max length of our allocation is driven by our max present cpus, where as one of the cpus might be online and be beyond the max present cpus, due to the hole.. Reviewed-by: Balbir Singh <bsinghar...@gmail.com> Balbir Singh. Thanks for the review. OK. I will update the commit message here. Regards, Anju
[PATCH] powerpc/perf: Fix memory allocation for core-imc based on num_possible_cpus()
Currently memory is allocated for core-imc based on cpu_present_mask, which has bit 'cpu' set iff cpu is populated. We use (cpu number / threads per core) as as array index to access the memory. So in a system with guarded cores, since allocation happens based on cpu_present_mask, (cpu number / threads per core) bounds the index and leads to memory overflow. The issue is exposed in a guard test. The guard test will make some CPU's as un-available to the system during boot time as well as at runtime. So when the cpu is unavailable to the system during boot time, the memory allocation happens depending on the number of available cpus. And when we access the memory using (cpu number / threads per core) as the index the system crashes due to memory overflow. Allocating memory for core-imc based on cpu_possible_mask, which has bit 'cpu' set iff cpu is populatable, will fix this issue. Reported-by: Pridhiviraj Paidipeddi <ppaid...@linux.vnet.ibm.com> Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> --- arch/powerpc/perf/imc-pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index d7532e7..75fb23c 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1146,7 +1146,7 @@ static int init_nest_pmu_ref(void) static void cleanup_all_core_imc_memory(void) { - int i, nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core); + int i, nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); struct imc_mem_info *ptr = core_imc_pmu->mem_info; int size = core_imc_pmu->counter_mem_size; @@ -1264,7 +1264,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, if (!pmu_ptr->pmu.name) return -ENOMEM; - nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core); + nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info), GFP_KERNEL); -- 2.7.4
[PATCH 3/4] powerpc/perf: Return appropriate value for unknown domain
Return proper error code for unknown domain during IMC initialization. Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> --- arch/powerpc/perf/imc-pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 1b285cd..4b4ca83 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1371,7 +1371,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id break; default: - return -1; /* Unknown domain */ + return -EINVAL;/* Unknown domain */ } ret = update_events_in_group(parent, pmu_ptr); -- 2.7.4
[PATCH 4/4] powerpc/perf: Unregister thread-imc if core-imc not supported
Enable thread-imc in the kernel, only if core-imc is registered. Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> --- arch/powerpc/include/asm/imc-pmu.h| 1 + arch/powerpc/perf/imc-pmu.c | 12 arch/powerpc/platforms/powernv/opal-imc.c | 9 + 3 files changed, 22 insertions(+) diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index d76cb11..69f516e 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -128,4 +128,5 @@ extern int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id); extern void thread_imc_disable(void); extern int get_max_nest_dev(void); +extern void unregister_thread_imc(void); #endif /* __ASM_POWERPC_IMC_PMU_H */ diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 4b4ca83..fa88785 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -40,6 +40,7 @@ static struct imc_pmu *core_imc_pmu; /* Thread IMC data structures and variables */ static DEFINE_PER_CPU(u64 *, thread_imc_mem); +static struct imc_pmu *thread_imc_pmu; static int thread_imc_mem_size; struct imc_pmu *imc_event_to_pmu(struct perf_event *event) @@ -1228,6 +1229,16 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) } } +/* + * Function to unregister thread-imc if core-imc + * is not registered. + */ +void unregister_thread_imc(void) +{ + imc_common_cpuhp_mem_free(thread_imc_pmu); + imc_common_mem_free(thread_imc_pmu); + perf_pmu_unregister(_imc_pmu->pmu); +} /* * imc_mem_init : Function to support memory allocation for core imc. @@ -1296,6 +1307,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, } } + thread_imc_pmu = pmu_ptr; break; default: return -EINVAL; diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 490bb72..58a0794 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -255,6 +255,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev) { struct device_node *imc_dev = pdev->dev.of_node; int pmu_count = 0, domain; + bool core_imc_reg = false, thread_imc_reg = false; u32 type; /* @@ -292,6 +293,10 @@ static int opal_imc_counters_probe(struct platform_device *pdev) if (!imc_pmu_create(imc_dev, pmu_count, domain)) { if (domain == IMC_DOMAIN_NEST) pmu_count++; + if (domain == IMC_DOMAIN_CORE) + core_imc_reg = true; + if (domain == IMC_DOMAIN_THREAD) + thread_imc_reg = true; } } @@ -299,6 +304,10 @@ static int opal_imc_counters_probe(struct platform_device *pdev) if (pmu_count == 0) debugfs_remove_recursive(imc_debugfs_parent); + /* If core imc is not registered, unregister thread-imc */ + if (!core_imc_reg && thread_imc_reg) + unregister_thread_imc(); + return 0; } -- 2.7.4
[PATCH 2/4] powerpc/perf: Replace the direct return with goto statement
Replace the direct return statement in imc_mem_init() with goto, to adhere to the kernel coding style. Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> --- arch/powerpc/perf/imc-pmu.c | 18 ++ 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 258b0f4..1b285cd 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1236,7 +1236,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, int pmu_index) { const char *s; - int nr_cores, cpu, res; + int nr_cores, cpu, res = -ENOMEM; if (of_property_read_string(parent, "name", )) return -ENODEV; @@ -1246,7 +1246,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, /* Update the pmu name */ pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s_imc", "nest_", s); if (!pmu_ptr->pmu.name) - return -ENOMEM; + goto err; /* Needed for hotplug/migration */ if (!per_nest_pmu_arr) { @@ -1254,7 +1254,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, sizeof(struct imc_pmu *), GFP_KERNEL); if (!per_nest_pmu_arr) - return -ENOMEM; + goto err; } per_nest_pmu_arr[pmu_index] = pmu_ptr; break; @@ -1262,21 +1262,21 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, /* Update the pmu name */ pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); if (!pmu_ptr->pmu.name) - return -ENOMEM; + goto err; nr_cores = DIV_ROUND_UP(num_present_cpus(), threads_per_core); pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info), GFP_KERNEL); if (!pmu_ptr->mem_info) - return -ENOMEM; + goto err; core_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref), GFP_KERNEL); if (!core_imc_refc) { kfree(pmu_ptr->mem_info); - return -ENOMEM; + goto err; } core_imc_pmu = pmu_ptr; @@ -1285,14 +1285,14 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, /* Update the pmu name */ pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); if (!pmu_ptr->pmu.name) - return -ENOMEM; + goto err; thread_imc_mem_size = pmu_ptr->counter_mem_size; for_each_online_cpu(cpu) { res = thread_imc_mem_alloc(cpu, pmu_ptr->counter_mem_size); if (res) { cleanup_all_thread_imc_memory(); - return res; + goto err; } } @@ -1302,6 +1302,8 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, } return 0; +err: + return res; } /* -- 2.7.4
[PATCH 1/4] powerpc/perf: Rearrange memory freeing in imc init
When any of the IMC (In-Memory Collection counter) devices fail to initialize, imc_common_mem_free() frees set of memory. In doing so, pmu_ptr pointer is also freed. But pmu_ptr pointer is used in subsequent function (imc_common_cpuhp_mem_free()) which is wrong. Patch here reorders the code to avoid such access. Also free the memory which is dynamically allocated during imc initialization, wherever required. Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> --- test matrix and static checker run details are updated in the cover letter patch is based on https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git (branch: merge) arch/powerpc/perf/imc-pmu.c | 32 --- arch/powerpc/platforms/powernv/opal-imc.c | 13 ++--- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index d7532e7..258b0f4 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1153,7 +1153,7 @@ static void cleanup_all_core_imc_memory(void) /* mem_info will never be NULL */ for (i = 0; i < nr_cores; i++) { if (ptr[i].vbase) - free_pages((u64)ptr->vbase, get_order(size)); + free_pages((u64)ptr[i].vbase, get_order(size)); } kfree(ptr); @@ -1191,7 +1191,6 @@ static void imc_common_mem_free(struct imc_pmu *pmu_ptr) if (pmu_ptr->attr_groups[IMC_EVENT_ATTR]) kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs); kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]); - kfree(pmu_ptr); } /* @@ -1208,6 +1207,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE); kfree(nest_imc_refc); kfree(per_nest_pmu_arr); + per_nest_pmu_arr = NULL; } if (nest_pmus > 0) @@ -1319,10 +1319,8 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id int ret; ret = imc_mem_init(pmu_ptr, parent, pmu_idx); - if (ret) { - imc_common_mem_free(pmu_ptr); - return ret; - } + if (ret) + goto err_free_mem; switch (pmu_ptr->domain) { case IMC_DOMAIN_NEST: @@ -1337,7 +1335,9 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id ret = init_nest_pmu_ref(); if (ret) { mutex_unlock(_init_lock); - goto err_free; + kfree(per_nest_pmu_arr); + per_nest_pmu_arr = NULL; + goto err_free_mem; } /* Register for cpu hotplug notification. */ ret = nest_pmu_cpumask_init(); @@ -1345,7 +1345,8 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id mutex_unlock(_init_lock); kfree(nest_imc_refc); kfree(per_nest_pmu_arr); - goto err_free; + per_nest_pmu_arr = NULL; + goto err_free_mem; } } nest_pmus++; @@ -1355,7 +1356,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id ret = core_imc_pmu_cpumask_init(); if (ret) { cleanup_all_core_imc_memory(); - return ret; + goto err_free_mem; } break; @@ -1363,7 +1364,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id ret = thread_imc_cpu_init(); if (ret) { cleanup_all_thread_imc_memory(); - return ret; + goto err_free_mem; } break; @@ -1373,23 +1374,24 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id ret = update_events_in_group(parent, pmu_ptr); if (ret) - goto err_free; + goto err_free_cpuhp_mem; ret = update_pmu_ops(pmu_ptr); if (ret) - goto err_free; + goto err_free_cpuhp_mem; ret = perf_pmu_register(_ptr->pmu, pmu_ptr->pmu.name, -1); if (ret) - goto err_free; + goto err_free_cpuhp_mem; pr_info("%s performance monitor hardware support registered\n", pmu_p
[PATCH 0/4] powerpc/perf: IMC Cleanups
This patch series includes some cleanups and Unregistration of thread-imc pmu, if the kernel does not have core-imc registered. The entire patch set has been verified using the static checker smatch. Command used: $ make ARCH=powerpc CHECK="/smatch -p=kernel" C=1 vmlinux | tee warns.txt Tests Done: * Fail core-imc at init: nest-imc - working cpuhotplug - works as expected thread-imc - not registered * Fail thread-imc at init: nest-imc - works core-imc - works cpuhotplug - works * Fail nest-imc at init core-imc - works thread-imc -works cpuhotplug - works * Fail only one nest unit (say for mcs23) Other nest-units - works core-imc - works thread-imc - works cpuhotplug - works. * Kexec works The first three patches in this series addresses the comments by Dan Carpenter. Anju T Sudhakar (4): powerpc/perf: Rearrange memory freeing in imc init powerpc/perf: Replace the direct return with goto statement powerpc/perf: Return appropriate value for unknown domain powerpc/perf: Unregister thread-imc if core-imc not supported arch/powerpc/include/asm/imc-pmu.h| 1 + arch/powerpc/perf/imc-pmu.c | 64 +++ arch/powerpc/platforms/powernv/opal-imc.c | 22 +-- 3 files changed, 60 insertions(+), 27 deletions(-) -- 2.7.4
Re: [PATCH v3] powerpc/kernel/sysfs: Export ldbar spr to sysfs
Hi, On Tuesday 06 March 2018 04:35 PM, Michael Ellerman wrote: Anju T Sudhakar <a...@linux.vnet.ibm.com> writes: diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 4437c70..caefb64 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -757,6 +759,9 @@ static int register_cpu_online(unsigned int cpu) device_create_file(s, _attrs[i]); #ifdef CONFIG_PPC64 + if (cpu_has_feature(CPU_FTR_ARCH_300)) + device_create_file(s, _attr_ldbar); Is this register readable in supervisor state? This is a nice catch, thanks. :) The guest kernel can not access the register, it is only readable in the hypervisor state. I will resend the patch with a condition check so that this spr will not get registered for guest kernel. Regards, Anju cheers
Re: [bug report] powerpc/perf: Add nest IMC PMU support
Hi Dan Carpenter, On Wednesday 31 January 2018 08:55 PM, Dan Carpenter wrote: Hello Anju T Sudhakar, The patch 885dcd709ba9: "powerpc/perf: Add nest IMC PMU support" from Jul 19, 2017, leads to the following static checker warning: arch/powerpc/perf/imc-pmu.c:1393 init_imc_pmu() warn: 'pmu_ptr' was already freed. arch/powerpc/perf/imc-pmu.c 1317 int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_idx) 1318 { 1319 int ret; 1320 1321 ret = imc_mem_init(pmu_ptr, parent, pmu_idx); 1322 if (ret) { 1323 imc_common_mem_free(pmu_ptr); 1324 return ret; 1325 } Change this to: if (ret) goto err_free_mpu_ptr; Or something instead of a direct return. That's more normal kernel style. 1326 1327 switch (pmu_ptr->domain) { 1328 case IMC_DOMAIN_NEST: 1329 /* 1330 * Nest imc pmu need only one cpu per chip, we initialize the 1331 * cpumask for the first nest imc pmu and use the same for the 1332 * rest. To handle the cpuhotplug callback unregister, we track 1333 * the number of nest pmus in "nest_pmus". 1334 */ 1335 mutex_lock(_init_lock); 1336 if (nest_pmus == 0) { 1337 ret = init_nest_pmu_ref(); 1338 if (ret) { 1339 mutex_unlock(_init_lock); 1340 goto err_free; 1341 } 1342 /* Register for cpu hotplug notification. */ 1343 ret = nest_pmu_cpumask_init(); 1344 if (ret) { 1345 mutex_unlock(_init_lock); 1346 kfree(nest_imc_refc); 1347 kfree(per_nest_pmu_arr); 1348 goto err_free; 1349 } 1350 } 1351 nest_pmus++; 1352 mutex_unlock(_init_lock); 1353 break; 1354 case IMC_DOMAIN_CORE: 1355 ret = core_imc_pmu_cpumask_init(); 1356 if (ret) { 1357 cleanup_all_core_imc_memory(); 1358 return ret; These direct returns don't look correct... 1359 } 1360 1361 break; 1362 case IMC_DOMAIN_THREAD: 1363 ret = thread_imc_cpu_init(); 1364 if (ret) { 1365 cleanup_all_thread_imc_memory(); 1366 return ret; 1367 } 1368 1369 break; 1370 default: 1371 return -1; /* Unknown domain */ This one certainly looks like a memory leak. Plus -1 is -EPERM which is probably not the correct error code. 1372 } 1373 1374 ret = update_events_in_group(parent, pmu_ptr); 1375 if (ret) 1376 goto err_free; 1377 1378 ret = update_pmu_ops(pmu_ptr); 1379 if (ret) 1380 goto err_free; 1381 1382 ret = perf_pmu_register(_ptr->pmu, pmu_ptr->pmu.name, -1); 1383 if (ret) 1384 goto err_free; 1385 1386 pr_info("%s performance monitor hardware support registered\n", 1387 pmu_ptr->pmu.name); 1388 1389 return 0; 1390 1391 err_free: 1392 imc_common_mem_free(pmu_ptr); 1393 imc_common_cpuhp_mem_free(pmu_ptr); ^^^ This is a use after free, it should be in the reverse order. err_free_cpuhp: imc_common_cpuhp_mem_free(pmu_ptr); err_free_pmu_ptr: imc_common_mem_free(pmu_ptr); 1394 return ret; 1395 } regards, dan carpenter Apologies for the delayed response, I just got back from vacation. Thank you for pointing out this, I will rework the code and send. Regards, Anju
[PATCH v2] platform/powernv: Add debugfs interface for imc-mode and imc-command
In memory Collection (IMC) counter pmu driver controls the ucode's execution state. At the system boot, IMC perf driver pause the ucode. Ucode state is changed to "running" only when any of the nest units are monitored or profiled using perf tool. Nest units support only limited set of hardware counters and ucode is always programmed in the "production mode" ("accumulation") mode. This mode is configured to provide key performance metric data for most of the nest units. But ucode also supports other modes which would be used for "debug" to drill down specific nest units. That is, ucode when switched to "powerbus" debug mode (for example), will dynamically reconfigure the nest counters to target only "powerbus" related events in the hardware counters. This allows the IMC nest unit to focus on powerbus related transactions in the system in more detail. At this point, production mode events may or may not be counted. IMC nest counters has both in-band (ucode access) and out of band access to it. Since not all nest counter configurations are supported by ucode, out of band tools are used to characterize other nest counter configurations. Patch provides an interface via "debugfs" to enable the switching of ucode modes in the system. To switch ucode mode, one has to first pause the microcode (imc_cmd), and then write the target mode value to the "imc_mode" file. Proposed Approach === In the proposed approach, the function (export_imc_mode_and_cmd) which creates the debugfs interface for imc mode and command is implemented in opal-imc.c. Thus we can use imc_get_mem_addr() to get the homer base address for each chip. The interface to expose imc mode and command is required only if we have nest pmu units registered. Employing the existing data structures to track whether we have any nest units registered will require to extend data from perf side to opal-imc.c. Instead an integer is introduced to hold that information by counting successful nest unit registration. Debugfs interface is removed based on the integer count. Example for the interface: root@:/sys/kernel/debug/imc# ls imc_cmd_0 imc_cmd_8 imc_mode_0 imc_mode_8 Changes from v1 -> v2 - The latest commit by Rajarshi Das adds the control block offset in the ima-catalog file. So if there is cb_offset specified in the ima-catalog, that is used to export the imc mode/command, otherwise IMC_CNTL_BLK_OFFSET is used. Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> --- arch/powerpc/include/asm/imc-pmu.h| 7 +++ arch/powerpc/platforms/powernv/opal-imc.c | 77 +++ 2 files changed, 84 insertions(+) diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index fad0e6f..e760401 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -35,6 +35,13 @@ #define THREAD_IMC_ENABLE 0x8000ULL /* + * For debugfs interface for imc-mode and imc-command + */ +#define IMC_CNTL_BLK_OFFSET0x3FC00 +#define IMC_CNTL_BLK_CMD_OFFSET8 +#define IMC_CNTL_BLK_MODE_OFFSET 32 + +/* * Structure to hold memory address information for imc units. */ struct imc_mem_info { diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 465ea10..dd4c9b8 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -21,6 +21,78 @@ #include #include #include +#include + +static struct dentry *imc_debugfs_parent; + +/* Helpers to export imc command and mode via debugfs */ +static int imc_mem_get(void *data, u64 *val) +{
[PATCH 2/3] powerpc/perf: IMC code cleanup with some code refactoring
Factor out memory freeing part for attribute elements from imc_common_cpuhp_mem_free(). Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> --- arch/powerpc/perf/imc-pmu.c | 31 --- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index ef7f9dd..71f425f 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1157,6 +1157,15 @@ static void cleanup_all_thread_imc_memory(void) } } +/* Function to free the attr_groups which are dynamically allocated */ +static void imc_common_mem_free(struct imc_pmu *pmu_ptr) +{ + if (pmu_ptr->attr_groups[IMC_EVENT_ATTR]) + kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs); + kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]); + kfree(pmu_ptr); +} + /* * Common function to unregister cpu hotplug callback and * free the memory. @@ -1189,13 +1198,6 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE); cleanup_all_thread_imc_memory(); } - - /* Only free the attr_groups which are dynamically allocated */ - if (pmu_ptr->attr_groups[IMC_EVENT_ATTR]) - kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs); - kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]); - kfree(pmu_ptr); - return; } @@ -1244,8 +1246,10 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, core_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref), GFP_KERNEL); - if (!core_imc_refc) + if (!core_imc_refc) { + kfree(pmu_ptr->mem_info); return -ENOMEM; + } core_imc_pmu = pmu_ptr; break; @@ -1258,8 +1262,10 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, thread_imc_mem_size = pmu_ptr->counter_mem_size; for_each_online_cpu(cpu) { res = thread_imc_mem_alloc(cpu, pmu_ptr->counter_mem_size); - if (res) + if (res) { + cleanup_all_thread_imc_memory(); return res; + } } break; @@ -1285,8 +1291,10 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id int ret; ret = imc_mem_init(pmu_ptr, parent, pmu_idx); - if (ret) - goto err_free; + if (ret) { + imc_common_mem_free(pmu_ptr); + return ret; + } switch (pmu_ptr->domain) { case IMC_DOMAIN_NEST: @@ -1353,6 +1361,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id return 0; err_free: + imc_common_mem_free(pmu_ptr); imc_common_cpuhp_mem_free(pmu_ptr); return ret; } -- 2.7.4
[PATCH 3/3] powerpc/perf: Passs struct imc_events as a parameter to imc_parse_event()
Remove the allocation of struct imc_events from imc_parse_event(). Instead pass imc_events as a parameter to imc_parse_event(), which is a pointer to a slot in the array allocated in update_events_in_group(). Reported by: Dan Carpenter ("powerpc/perf: Fix a sizeof() typo so we allocate less memory") Suggested-by: Michael Ellerman <m...@ellerman.id.au> Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> --- arch/powerpc/include/asm/imc-pmu.h | 2 +- arch/powerpc/perf/imc-pmu.c| 66 +++--- 2 files changed, 41 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index fad0e6f..080731d 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -71,7 +71,7 @@ struct imc_events { struct imc_pmu { struct pmu pmu; struct imc_mem_info *mem_info; - struct imc_events **events; + struct imc_events *events; /* * Attribute groups for the PMU. Slot 0 used for * format attribute, slot 1 used for cpusmask attribute, diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 71f425f..5cb1f31 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -116,17 +116,13 @@ static struct attribute *device_str_attr_create(const char *name, const char *st return >attr.attr; } -struct imc_events *imc_parse_event(struct device_node *np, const char *scale, - const char *unit, const char *prefix, u32 base) +static int imc_parse_event(struct device_node *np, const char *scale, + const char *unit, const char *prefix, + u32 base, struct imc_events *event) { - struct imc_events *event; const char *s; u32 reg; - event = kzalloc(sizeof(struct imc_events), GFP_KERNEL); - if (!event) - return NULL; - if (of_property_read_u32(np, "reg", )) goto error; /* Add the base_reg value to the "reg" */ @@ -157,14 +153,32 @@ struct imc_events *imc_parse_event(struct device_node *np, const char *scale, goto error; } - return event; + return 0; error: kfree(event->unit); kfree(event->scale); kfree(event->name); - kfree(event); + return -EINVAL; +} + +/* + * imc_free_events: Function to cleanup the events list, having + * "nr_entries". + */ +static void imc_free_events(struct imc_events *events, int nr_entries) +{ + int i; + + /* Nothing to clean, return */ + if (!events) + return; + for (i = 0; i < nr_entries; i++) { + kfree(events[i].unit); + kfree(events[i].scale); + kfree(events[i].name); + } - return NULL; + kfree(events); } /* @@ -176,9 +190,8 @@ static int update_events_in_group(struct device_node *node, struct imc_pmu *pmu) struct attribute_group *attr_group; struct attribute **attrs, *dev_str; struct device_node *np, *pmu_events; - struct imc_events *ev; u32 handle, base_reg; - int i=0, j=0, ct; + int i = 0, j = 0, ct, ret; const char *prefix, *g_scale, *g_unit; const char *ev_val_str, *ev_scale_str, *ev_unit_str; @@ -216,15 +229,17 @@ static int update_events_in_group(struct device_node *node, struct imc_pmu *pmu) ct = 0; /* Parse the events and update the struct */ for_each_child_of_node(pmu_events, np) { - ev = imc_parse_event(np, g_scale, g_unit, prefix, base_reg); - if (ev) - pmu->events[ct++] = ev; + ret = imc_parse_event(np, g_scale, g_unit, prefix, base_reg, >events[ct]); + if (!ret) + ct++; } /* Allocate memory for attribute group */ attr_group = kzalloc(sizeof(*attr_group), GFP_KERNEL); - if (!attr_group) + if (!attr_group) { + imc_free_events(pmu->events, ct); return -ENOMEM; + } /* * Allocate memory for attributes. @@ -237,31 +252,31 @@ static int update_events_in_group(struct device_node *node, struct imc_pmu *pmu) attrs = kcalloc(((ct * 3) + 1), sizeof(struct attribute *), GFP_KERNEL); if (!attrs) { kfree(attr_group); - kfree(pmu->events); + imc_free_events(pmu->events, ct); return -ENOMEM; } attr_group->name = "events"; attr_group->attrs = attrs; do { - ev_val_str = kasprintf(GFP_KERNEL, "event=0x%x", pmu->events[i]->value); - dev_str = device_str_attr_create(pmu->
[PATCH 1/3] powerpc/perf: Remove thread_imc_pmu global variable from
Remove the global variable 'thread_imc_pmu', since it is not used in the code. Signed-off-by: Anju T Sudhakar <a...@linux.vnet.ibm.com> --- arch/powerpc/perf/imc-pmu.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 4eb9e2b..ef7f9dd 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -40,7 +40,6 @@ static struct imc_pmu *core_imc_pmu; /* Thread IMC data structures and variables */ static DEFINE_PER_CPU(u64 *, thread_imc_mem); -static struct imc_pmu *thread_imc_pmu; static int thread_imc_mem_size; struct imc_pmu *imc_event_to_pmu(struct perf_event *event) @@ -1263,7 +1262,6 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, return res; } - thread_imc_pmu = pmu_ptr; break; default: return -EINVAL; -- 2.7.4
[PATCH 0/3] IMC code clean up and refactoring
The first patch removes the unused variable in the code for IMC(In-memory collection counters). The second patch does some code refactoring. The third patch in the series make struct imc_events as a parameter to the function imc_parse_event(). Anju T Sudhakar (3): powerpc/perf: Remove thread_imc_pmu global variable from imc code powerpc/perf: IMC code cleanup with some code refactoring powerpc/perf: Passs struct imc_events as a parameter to imc_parse_event() arch/powerpc/include/asm/imc-pmu.h | 2 +- arch/powerpc/perf/imc-pmu.c| 99 +++--- 2 files changed, 61 insertions(+), 40 deletions(-) -- 2.7.4