[tip: perf/core] perf/x86: Hybrid PMU support for counters
The following commit has been merged into the perf/core branch of tip: Commit-ID: d4b294bf84db7a84e295ddf19cb8e7f71b7bd045 Gitweb: https://git.kernel.org/tip/d4b294bf84db7a84e295ddf19cb8e7f71b7bd045 Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:46 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:25 +02:00 perf/x86: Hybrid PMU support for counters The number of GP and fixed counters are different among hybrid PMUs. Each hybrid PMU should use its own counter-related information. When handling a certain hybrid PMU, apply the number of counters from the corresponding hybrid PMU. When reserving the counters in the initialization of a new event, reserve all possible counters. The number of counters recorded in the global x86_pmu is for the architecture counters which are available for all hybrid PMUs. KVM doesn't support the hybrid PMU yet. Return the number of the architecture counters for now. For the functions only available for the old platforms, e.g., intel_pmu_drain_pebs_nhm(), nothing is changed. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-7-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 55 --- arch/x86/events/intel/core.c | 8 +++-- arch/x86/events/intel/ds.c | 14 + arch/x86/events/perf_event.h | 4 +++- 4 files changed, 56 insertions(+), 25 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7d3c19e..1aeb31c 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -185,16 +185,29 @@ static DEFINE_MUTEX(pmc_reserve_mutex); #ifdef CONFIG_X86_LOCAL_APIC +static inline int get_possible_num_counters(void) +{ + int i, num_counters = x86_pmu.num_counters; + + if (!is_hybrid()) + return num_counters; + + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) + num_counters = max_t(int, num_counters, x86_pmu.hybrid_pmu[i].num_counters); + + return num_counters; +} + static bool reserve_pmc_hardware(void) { - int i; + int i, num_counters = get_possible_num_counters(); - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < num_counters; i++) { if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < num_counters; i++) { if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } @@ -205,7 +218,7 @@ eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu_config_addr(i)); - i = x86_pmu.num_counters; + i = num_counters; perfctr_fail: for (i--; i >= 0; i--) @@ -216,9 +229,9 @@ perfctr_fail: static void release_pmc_hardware(void) { - int i; + int i, num_counters = get_possible_num_counters(); - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < num_counters; i++) { release_perfctr_nmi(x86_pmu_event_addr(i)); release_evntsel_nmi(x86_pmu_config_addr(i)); } @@ -946,6 +959,7 @@ EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { + int num_counters = 
hybrid(cpuc->pmu, num_counters); struct event_constraint *c; struct perf_event *e; int n0, i, wmin, wmax, unsched = 0; @@ -1021,7 +1035,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* slow path */ if (i != n) { - int gpmax = x86_pmu.num_counters; + int gpmax = num_counters; /* * Do not allow scheduling of more than half the available @@ -1042,7 +1056,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) * the extra Merge events needed by large increment events. */ if (x86_pmu.flags & PMU_FL_PAIR) { - gpmax = x86_pmu.num_counters - cpuc->n_pair; + gpmax = num_counters - cpuc->n_pair; WARN_ON(gpmax <= 0); } @@ -1129,10 +1143,12 @@ static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, */ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { + int num_counters = hybrid(cpuc->pmu, num_counters); + int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct perf_event *event; int n, max_count; - max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed; + max_count = num_counters + num_counters_fixed; /* current number of events already accepted */ n = cpuc->n_events; @@ -1499,18 +1515,18
[tip: perf/core] perf/x86: Track pmu in per-CPU cpu_hw_events
The following commit has been merged into the perf/core branch of tip: Commit-ID: 61e76d53c39bb768ad264d379837cfc56b9e35b4 Gitweb: https://git.kernel.org/tip/61e76d53c39bb768ad264d379837cfc56b9e35b4 Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:43 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:24 +02:00 perf/x86: Track pmu in per-CPU cpu_hw_events Some platforms, e.g. Alder Lake, have hybrid architecture. In the same package, there may be more than one type of CPU. The PMU capabilities are different among different types of CPU. Perf will register a dedicated PMU for each type of CPU. Add a 'pmu' variable in the struct cpu_hw_events to track the dedicated PMU of the current CPU. The current x86_get_pmu() uses the global 'pmu', which will be broken on a hybrid platform. Modify it to apply the 'pmu' of the specific CPU. Initialize the per-CPU 'pmu' variable with the global 'pmu'. There is nothing changed for the non-hybrid platforms. The is_x86_event() will be updated in the later patch ("perf/x86: Register hybrid PMUs") for hybrid platforms. For the non-hybrid platforms, nothing is changed here. 
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1618237865-33448-4-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 17 + arch/x86/events/intel/core.c | 2 +- arch/x86/events/intel/ds.c | 4 ++-- arch/x86/events/intel/lbr.c | 9 + arch/x86/events/perf_event.h | 4 +++- 5 files changed, 24 insertions(+), 12 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index dd9f3c2..a49a8bd 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -45,9 +45,11 @@ #include "perf_event.h" struct x86_pmu x86_pmu __read_mostly; +static struct pmu pmu; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, + .pmu = &pmu, }; DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); @@ -724,16 +726,23 @@ void x86_pmu_enable_all(int added) } } -static struct pmu pmu; - static inline int is_x86_event(struct perf_event *event) { return event->pmu == &pmu; } -struct pmu *x86_get_pmu(void) +struct pmu *x86_get_pmu(unsigned int cpu) { - return &pmu; + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + /* +* All CPUs of the hybrid type have been offline. +* The x86_get_pmu() should not be invoked. 
+*/ + if (WARN_ON_ONCE(!cpuc->pmu)) + return &pmu; + + return cpuc->pmu; } /* * Event scheduler state: diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7bbb5bb..f116c63 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4876,7 +4876,7 @@ static void update_tfa_sched(void *ignored) * and if so force schedule out for all event types all contexts */ if (test_bit(3, cpuc->active_mask)) - perf_pmu_resched(x86_get_pmu()); + perf_pmu_resched(x86_get_pmu(smp_processor_id())); } static ssize_t show_sysctl_tfa(struct device *cdev, diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 7ebae18..1bfea8c 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2192,7 +2192,7 @@ void __init intel_ds_init(void) PERF_SAMPLE_TIME; x86_pmu.flags |= PMU_FL_PEBS_ALL; pebs_qual = "-baseline"; - x86_get_pmu()->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; + x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; } else { /* Only basic record supported */ x86_pmu.large_pebs_flags &= @@ -2207,7 +2207,7 @@ void __init intel_ds_init(void) if (x86_pmu.intel_cap.pebs_output_pt_available) { pr_cont("PEBS-via-PT, "); - x86_get_pmu()->capabilities |= PERF_PMU_CAP_AUX_OUTPUT; + x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_AUX_OUTPUT; } break; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 21890da..bb4486c 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -705,7 +705,7 @@ void intel_pmu_lbr_add(struct perf_event *event) void release_lbr_buffers(void) { - struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache; + struct kmem_cache *kmem_cache; struct cpu_hw_events *cpuc; int cpu; @@ -714,6 +714,7 @@ void release_lbr_buffers(void) for_each_possible_cpu(cpu) { cpuc = per_cpu_pt
[tip: perf/core] perf/x86/intel: Hybrid PMU support for perf capabilities
The following commit has been merged into the perf/core branch of tip: Commit-ID: d0946a882e6220229a29f9031641e54379be5a1e Gitweb: https://git.kernel.org/tip/d0946a882e6220229a29f9031641e54379be5a1e Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:44 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:24 +02:00 perf/x86/intel: Hybrid PMU support for perf capabilities Some platforms, e.g. Alder Lake, have hybrid architecture. Although most PMU capabilities are the same, there are still some unique PMU capabilities for different hybrid PMUs. Perf should register a dedicated pmu for each hybrid PMU. Add a new struct x86_hybrid_pmu, which saves the dedicated pmu and capabilities for each hybrid PMU. The architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicates the architecture features which are available on all hybrid PMUs. The architecture features are stored in the global x86_pmu.intel_cap. For Alder Lake, the model-specific features are perf metrics and PEBS-via-PT. The corresponding bits of the global x86_pmu.intel_cap should be 0 for these two features. Perf should not use the global intel_cap to check the features on a hybrid system. Add a dedicated intel_cap in the x86_hybrid_pmu to store the model-specific capabilities. Use the dedicated intel_cap to replace the global intel_cap for these two features. The dedicated intel_cap will be set in the following "Add Alder Lake Hybrid support" patch. Add is_hybrid() to distinguish a hybrid system. ADL may have an alternative configuration. With that configuration, the X86_FEATURE_HYBRID_CPU is not set. Perf cannot rely on the feature bit. Add a new static_key_false, perf_is_hybrid, to indicate a hybrid system. It will be assigned in the following "Add Alder Lake Hybrid support" patch as well. 
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1618237865-33448-5-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 7 +-- arch/x86/events/intel/core.c | 22 + arch/x86/events/intel/ds.c | 2 +- arch/x86/events/perf_event.h | 33 +++- arch/x86/include/asm/msr-index.h | 3 +++- 5 files changed, 60 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a49a8bd..7fc2001 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -54,6 +54,7 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); +DEFINE_STATIC_KEY_FALSE(perf_is_hybrid); /* * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined @@ -1105,8 +1106,9 @@ static void del_nr_metric_event(struct cpu_hw_events *cpuc, static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, int max_count, int n) { + union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); - if (x86_pmu.intel_cap.perf_metrics && add_nr_metric_event(cpuc, event)) + if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event)) return -EINVAL; if (n >= max_count + cpuc->n_metric) @@ -1581,6 +1583,7 @@ void x86_pmu_stop(struct perf_event *event, int flags) static void x86_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); int i; /* @@ -1620,7 +1623,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; - if (x86_pmu.intel_cap.perf_metrics) + if (intel_cap.perf_metrics) del_nr_metric_event(cpuc, event); perf_event_update_userpage(event); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index f116c63..dc9e2fb 100644 --- 
a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3646,6 +3646,12 @@ static inline bool is_mem_loads_aux_event(struct perf_event *event) return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82); } +static inline bool intel_pmu_has_cap(struct perf_event *event, int idx) +{ + union perf_capabilities *intel_cap = &hybrid(event->pmu, intel_cap); + + return test_bit(idx, (unsigned long *)&intel_cap->capabilities); +} static int intel_pmu_hw_config(struct perf_event *event) { @@ -3712,7 +3718,7 @@ static int intel_pmu_hw_config(struct perf_event *event) * with a slots event as group leader. When the slots event * is used in a metrics group, it too cannot support sampling. */ - if (x86_pmu.intel_cap.perf_metrics && is_topdown_event(event)) { + if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX
[tip: perf/core] perf/x86: Hybrid PMU support for intel_ctrl
The following commit has been merged into the perf/core branch of tip: Commit-ID: fc4b8fca2d8fc8aecd58508e81d55afe4ed76344 Gitweb: https://git.kernel.org/tip/fc4b8fca2d8fc8aecd58508e81d55afe4ed76344 Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:45 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:24 +02:00 perf/x86: Hybrid PMU support for intel_ctrl The intel_ctrl is the counter mask of a PMU. The PMU counter information may be different among hybrid PMUs, each hybrid PMU should use its own intel_ctrl to check and access the counters. When handling a certain hybrid PMU, apply the intel_ctrl from the corresponding hybrid PMU. When checking the HW existence, apply the PMU and number of counters from the corresponding hybrid PMU as well. Perf will check the HW existence for each Hybrid PMU before registration. Expose the check_hw_exists() for a later patch. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-6-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 14 +++--- arch/x86/events/intel/core.c | 14 +- arch/x86/events/perf_event.h | 10 -- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7fc2001..7d3c19e 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -231,7 +231,7 @@ static void release_pmc_hardware(void) {} #endif -static bool check_hw_exists(void) +bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed) { u64 val, val_fail = -1, val_new= ~0; int i, reg, reg_fail = -1, ret = 0; @@ -242,7 +242,7 @@ static bool check_hw_exists(void) * Check to see if the BIOS enabled any of the counters, if so * complain and bail. 
*/ - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < num_counters; i++) { reg = x86_pmu_config_addr(i); ret = rdmsrl_safe(reg, &val); if (ret) @@ -256,13 +256,13 @@ static bool check_hw_exists(void) } } - if (x86_pmu.num_counters_fixed) { + if (num_counters_fixed) { reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; - for (i = 0; i < x86_pmu.num_counters_fixed; i++) { - if (fixed_counter_disabled(i)) + for (i = 0; i < num_counters_fixed; i++) { + if (fixed_counter_disabled(i, pmu)) continue; if (val & (0x03 << i*4)) { bios_fail = 1; @@ -1547,7 +1547,7 @@ void perf_event_print_debug(void) cpu, idx, prev_left); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { - if (fixed_counter_disabled(idx)) + if (fixed_counter_disabled(idx, cpuc->pmu)) continue; rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); @@ -1992,7 +1992,7 @@ static int __init init_hw_perf_events(void) pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ - if (!check_hw_exists()) + if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed)) return 0; pr_cont("%s PMU driver.\n", x86_pmu.name); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index dc9e2fb..2d56055 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2153,10 +2153,11 @@ static void intel_pmu_disable_all(void) static void __intel_pmu_enable_all(int added, bool pmi) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); intel_pmu_lbr_enable_all(pmi); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, - x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); + intel_ctrl & ~cpuc->intel_ctrl_guest_mask); if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { struct perf_event *event = @@ -2709,6 +2710,7 @@ int intel_pmu_save_and_restart(struct perf_event *event) static void intel_pmu_reset(void) { struct debug_store *ds = 
__this_cpu_read(cpu_hw_events.ds); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); unsigned long flags; int idx; @@ -2724,7 +2726,7 @@ static void intel_pmu_reset(void) wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { - if (fixed_counter_disabled(idx)) + if (fixed_counter_disabled(idx, cpuc->pmu)) continue; wrmsrl_saf
[tip: perf/core] perf/x86: Hybrid PMU support for hardware cache event
The following commit has been merged into the perf/core branch of tip: Commit-ID: 0d18f2dfead8dd63bf1186c9ef38528d6a615a55 Gitweb: https://git.kernel.org/tip/0d18f2dfead8dd63bf1186c9ef38528d6a615a55 Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:48 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:25 +02:00 perf/x86: Hybrid PMU support for hardware cache event The hardware cache events are different among hybrid PMUs. Each hybrid PMU should have its own hw cache event table. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1618237865-33448-9-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 5 ++--- arch/x86/events/perf_event.h | 9 + 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 1aeb31c..e8cb892 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -376,8 +376,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) return -EINVAL; cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX); - val = hw_cache_event_ids[cache_type][cache_op][cache_result]; - + val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result]; if (val == 0) return -ENOENT; @@ -385,7 +384,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) return -EINVAL; hwc->config |= val; - attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; + attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result]; return x86_pmu_extra_regs(val, event); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2688e45..b65cf46 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -639,6 +639,15 @@ struct x86_hybrid_pmu { int num_counters; int num_counters_fixed; struct event_constraint unconstrained; + + u64 
hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + u64 hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; }; static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu)
[tip: perf/core] perf/x86: Hybrid PMU support for unconstrained
The following commit has been merged into the perf/core branch of tip: Commit-ID: eaacf07d1116f6bf3b93b265515fccf2301097f2 Gitweb: https://git.kernel.org/tip/eaacf07d1116f6bf3b93b265515fccf2301097f2 Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:47 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:25 +02:00 perf/x86: Hybrid PMU support for unconstrained The unconstrained value depends on the number of GP and fixed counters. Each hybrid PMU should use its own unconstrained. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1618237865-33448-8-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 2 +- arch/x86/events/perf_event.h | 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 3ea0126..4cfc382 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3147,7 +3147,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, } } - return &unconstrained; + return &hybrid_var(cpuc->pmu, unconstrained); } static struct event_constraint * diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 0539ad4..2688e45 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -638,6 +638,7 @@ struct x86_hybrid_pmu { int max_pebs_events; int num_counters; int num_counters_fixed; + struct event_constraint unconstrained; }; static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu) @@ -658,6 +659,16 @@ extern struct static_key_false perf_is_hybrid; __Fp; \ })) +#define hybrid_var(_pmu, _var) \ +(*({ \ + typeof(&_var) __Fp = &_var; \ + \ + if (is_hybrid() && (_pmu)) \ + __Fp = &hybrid_pmu(_pmu)->_var; \ + \ + __Fp; \ +})) + /* * struct x86_pmu - generic x86 pmu */
[tip: perf/core] perf/x86: Hybrid PMU support for event constraints
The following commit has been merged into the perf/core branch of tip: Commit-ID: 24ee38ffe61a68fc35065fcab1908883a34c866b Gitweb: https://git.kernel.org/tip/24ee38ffe61a68fc35065fcab1908883a34c866b Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:49 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:25 +02:00 perf/x86: Hybrid PMU support for event constraints The events are different among hybrid PMUs. Each hybrid PMU should use its own event constraints. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-10-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 3 ++- arch/x86/events/intel/core.c | 5 +++-- arch/x86/events/intel/ds.c | 5 +++-- arch/x86/events/perf_event.h | 2 ++ 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index e8cb892..f92d234 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1518,6 +1518,7 @@ void perf_event_print_debug(void) struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int num_counters = hybrid(cpuc->pmu, num_counters); int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); + struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); unsigned long flags; int idx; @@ -1537,7 +1538,7 @@ void perf_event_print_debug(void) pr_info("CPU#%d: status: %016llx\n", cpu, status); pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); - if (x86_pmu.pebs_constraints) { + if (pebs_constraints) { rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 4cfc382..447a80f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3136,10 +3136,11 @@ struct event_constraint * x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, 
struct perf_event *event) { + struct event_constraint *event_constraints = hybrid(cpuc->pmu, event_constraints); struct event_constraint *c; - if (x86_pmu.event_constraints) { - for_each_event_constraint(c, x86_pmu.event_constraints) { + if (event_constraints) { + for_each_event_constraint(c, event_constraints) { if (constraint_match(c, event->hw.config)) { event->hw.flags |= c->flags; return c; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 312bf3b..f1402bc 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -959,13 +959,14 @@ struct event_constraint intel_spr_pebs_event_constraints[] = { struct event_constraint *intel_pebs_constraints(struct perf_event *event) { + struct event_constraint *pebs_constraints = hybrid(event->pmu, pebs_constraints); struct event_constraint *c; if (!event->attr.precise_ip) return NULL; - if (x86_pmu.pebs_constraints) { - for_each_event_constraint(c, x86_pmu.pebs_constraints) { + if (pebs_constraints) { + for_each_event_constraint(c, pebs_constraints) { if (constraint_match(c, event->hw.config)) { event->hw.flags |= c->flags; return c; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index b65cf46..34b7fc9 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -648,6 +648,8 @@ struct x86_hybrid_pmu { [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; + struct event_constraint *event_constraints; + struct event_constraint *pebs_constraints; }; static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu)
[tip: perf/core] perf/x86: Hybrid PMU support for extra_regs
The following commit has been merged into the perf/core branch of tip: Commit-ID: 183af7366b4e813ee4e0b995ff731e3ac28251f0 Gitweb: https://git.kernel.org/tip/183af7366b4e813ee4e0b995ff731e3ac28251f0 Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:50 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:26 +02:00 perf/x86: Hybrid PMU support for extra_regs Different hybrid PMU may have different extra registers, e.g. Core PMU may have offcore registers, frontend register and ldlat register. Atom core may only have offcore registers and ldlat register. Each hybrid PMU should use its own extra_regs. An Intel Hybrid system should always have extra registers. Unconditionally allocate shared_regs for Intel Hybrid system. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-11-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 5 +++-- arch/x86/events/intel/core.c | 15 +-- arch/x86/events/perf_event.h | 1 + 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index f92d234..57d3fe1 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -154,15 +154,16 @@ again: */ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) { + struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs); struct hw_perf_event_extra *reg; struct extra_reg *er; reg = &event->hw.extra_reg; - if (!x86_pmu.extra_regs) + if (!extra_regs) return 0; - for (er = x86_pmu.extra_regs; er->msr; er++) { + for (er = extra_regs; er->msr; er++) { if (er->event != (config & er->config_mask)) continue; if (event->attr.config1 & ~er->valid_mask) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 447a80f..f727aa5 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2966,8 +2966,10 @@ intel_vlbr_constraints(struct perf_event *event) return NULL; } -static int 
intel_alt_er(int idx, u64 config) +static int intel_alt_er(struct cpu_hw_events *cpuc, + int idx, u64 config) { + struct extra_reg *extra_regs = hybrid(cpuc->pmu, extra_regs); int alt_idx = idx; if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) @@ -2979,7 +2981,7 @@ static int intel_alt_er(int idx, u64 config) if (idx == EXTRA_REG_RSP_1) alt_idx = EXTRA_REG_RSP_0; - if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask) + if (config & ~extra_regs[alt_idx].valid_mask) return idx; return alt_idx; @@ -2987,15 +2989,16 @@ static int intel_alt_er(int idx, u64 config) static void intel_fixup_er(struct perf_event *event, int idx) { + struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs); event->hw.extra_reg.idx = idx; if (idx == EXTRA_REG_RSP_0) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; + event->hw.config |= extra_regs[EXTRA_REG_RSP_0].event; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; } else if (idx == EXTRA_REG_RSP_1) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; + event->hw.config |= extra_regs[EXTRA_REG_RSP_1].event; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; } } @@ -3071,7 +3074,7 @@ again: */ c = NULL; } else { - idx = intel_alt_er(idx, reg->config); + idx = intel_alt_er(cpuc, idx, reg->config); if (idx != reg->idx) { raw_spin_unlock_irqrestore(&era->lock, flags); goto again; @@ -4155,7 +4158,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) { cpuc->pebs_record_size = x86_pmu.pebs_record_size; - if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { + if (is_hybrid() || x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { cpuc->shared_regs = allocate_shared_regs(cpu); if (!cpuc->shared_regs) goto err; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 34b7fc9..d8c448b 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -650,6 +650,7 @@ struct x86_hybrid_pmu { 
[PERF_COUNT_HW_CACHE_RESULT_MAX]; struct event_constraint *event_constraints; struct event_constraint *pebs_constraints; + struct extra_reg*extra_regs; }; static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct p
[tip: perf/core] perf/x86/intel: Factor out intel_pmu_check_num_counters
The following commit has been merged into the perf/core branch of tip: Commit-ID: b8c4d1a87610ba20da1abddb7aacbde0b2817c1a Gitweb: https://git.kernel.org/tip/b8c4d1a87610ba20da1abddb7aacbde0b2817c1a Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:51 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:26 +02:00 perf/x86/intel: Factor out intel_pmu_check_num_counters Each Hybrid PMU has to check its own number of counters and mask fixed counters before registration. The intel_pmu_check_num_counters will be reused later to check the number of the counters for each hybrid PMU. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-12-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 38 ++- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index f727aa5..d7e2021 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5064,6 +5064,26 @@ static const struct attribute_group *attr_update[] = { static struct attribute *empty_attrs; +static void intel_pmu_check_num_counters(int *num_counters, +int *num_counters_fixed, +u64 *intel_ctrl, u64 fixed_mask) +{ + if (*num_counters > INTEL_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", +*num_counters, INTEL_PMC_MAX_GENERIC); + *num_counters = INTEL_PMC_MAX_GENERIC; + } + *intel_ctrl = (1ULL << *num_counters) - 1; + + if (*num_counters_fixed > INTEL_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", +*num_counters_fixed, INTEL_PMC_MAX_FIXED); + *num_counters_fixed = INTEL_PMC_MAX_FIXED; + } + + *intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED; +} + __init int intel_pmu_init(void) { struct attribute **extra_skl_attr = &empty_attrs; @@ -5703,20 +5723,10 @@ __init int intel_pmu_init(void) x86_pmu.attr_update = attr_update; - if (x86_pmu.num_counters 
> INTEL_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", -x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); - x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; - } - x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1; - - if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", -x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; - } - - x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED; + intel_pmu_check_num_counters(&x86_pmu.num_counters, +&x86_pmu.num_counters_fixed, +&x86_pmu.intel_ctrl, +(u64)fixed_mask); /* AnyThread may be deprecated on arch perfmon v5 or later */ if (x86_pmu.intel_cap.anythread_deprecated)
[tip: perf/core] perf/x86/intel: Factor out intel_pmu_check_event_constraints
The following commit has been merged into the perf/core branch of tip: Commit-ID: bc14fe1beeec1d80ee39f03019c10e130c8d376b Gitweb: https://git.kernel.org/tip/bc14fe1beeec1d80ee39f03019c10e130c8d376b Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:52 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:26 +02:00 perf/x86/intel: Factor out intel_pmu_check_event_constraints Each Hybrid PMU has to check and update its own event constraints before registration. The intel_pmu_check_event_constraints will be reused later to check the event constraints of each hybrid PMU. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-13-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 82 --- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index d7e2021..5c5f330 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5084,6 +5084,49 @@ static void intel_pmu_check_num_counters(int *num_counters, *intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED; } +static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints, + int num_counters, + int num_counters_fixed, + u64 intel_ctrl) +{ + struct event_constraint *c; + + if (!event_constraints) + return; + + /* +* event on fixed counter2 (REF_CYCLES) only works on this +* counter, so do not extend mask to generic counters +*/ + for_each_event_constraint(c, event_constraints) { + /* +* Don't extend the topdown slots and metrics +* events to the generic counters. +*/ + if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) { + /* +* Disable topdown slots and metrics events, +* if slots event is not in CPUID. 
+*/ + if (!(INTEL_PMC_MSK_FIXED_SLOTS & intel_ctrl)) + c->idxmsk64 = 0; + c->weight = hweight64(c->idxmsk64); + continue; + } + + if (c->cmask == FIXED_EVENT_FLAGS) { + /* Disabled fixed counters which are not in CPUID */ + c->idxmsk64 &= intel_ctrl; + + if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) + c->idxmsk64 |= (1ULL << num_counters) - 1; + } + c->idxmsk64 &= + ~(~0ULL << (INTEL_PMC_IDX_FIXED + num_counters_fixed)); + c->weight = hweight64(c->idxmsk64); + } +} + __init int intel_pmu_init(void) { struct attribute **extra_skl_attr = &empty_attrs; @@ -5094,7 +5137,6 @@ __init int intel_pmu_init(void) union cpuid10_edx edx; union cpuid10_eax eax; union cpuid10_ebx ebx; - struct event_constraint *c; unsigned int fixed_mask; struct extra_reg *er; bool pmem = false; @@ -5732,40 +5774,10 @@ __init int intel_pmu_init(void) if (x86_pmu.intel_cap.anythread_deprecated) x86_pmu.format_attrs = intel_arch_formats_attr; - if (x86_pmu.event_constraints) { - /* -* event on fixed counter2 (REF_CYCLES) only works on this -* counter, so do not extend mask to generic counters -*/ - for_each_event_constraint(c, x86_pmu.event_constraints) { - /* -* Don't extend the topdown slots and metrics -* events to the generic counters. -*/ - if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) { - /* -* Disable topdown slots and metrics events, -* if slots event is not in CPUID. -*/ - if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl)) - c->idxmsk64 = 0; - c->weight = hweight64(c->idxmsk64); - continue; - } - - if (c->cmask == FIXED_EVENT_FLAGS) { - /* Disabled fixed counters which are not in CPUID */ - c->idxmsk64 &= x86_pmu.intel_ctrl; - - if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; -
[tip: perf/core] perf/x86/intel: Factor out intel_pmu_check_extra_regs
The following commit has been merged into the perf/core branch of tip: Commit-ID: 34d5b61f29eea656be4283213273c33d5987e4d2 Gitweb: https://git.kernel.org/tip/34d5b61f29eea656be4283213273c33d5987e4d2 Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:53 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:26 +02:00 perf/x86/intel: Factor out intel_pmu_check_extra_regs Each Hybrid PMU has to check and update its own extra registers before registration. The intel_pmu_check_extra_regs will be reused later to check the extra registers of each hybrid PMU. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-14-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 35 +-- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5c5f330..55ccfbb 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5127,6 +5127,26 @@ static void intel_pmu_check_event_constraints(struct event_constraint *event_con } } +static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs) +{ + struct extra_reg *er; + + /* +* Access extra MSR may cause #GP under certain circumstances. +* E.g. KVM doesn't support offcore event +* Check all extra_regs here. 
+*/ + if (!extra_regs) + return; + + for (er = extra_regs; er->msr; er++) { + er->extra_msr_access = check_msr(er->msr, 0x11UL); + /* Disable LBR select mapping */ + if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) + x86_pmu.lbr_sel_map = NULL; + } +} + __init int intel_pmu_init(void) { struct attribute **extra_skl_attr = &empty_attrs; @@ -5138,7 +5158,6 @@ __init int intel_pmu_init(void) union cpuid10_eax eax; union cpuid10_ebx ebx; unsigned int fixed_mask; - struct extra_reg *er; bool pmem = false; int version, i; char *name; @@ -5795,19 +5814,7 @@ __init int intel_pmu_init(void) if (x86_pmu.lbr_nr) pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); - /* -* Access extra MSR may cause #GP under certain circumstances. -* E.g. KVM doesn't support offcore event -* Check all extra_regs here. -*/ - if (x86_pmu.extra_regs) { - for (er = x86_pmu.extra_regs; er->msr; er++) { - er->extra_msr_access = check_msr(er->msr, 0x11UL); - /* Disable LBR select mapping */ - if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) - x86_pmu.lbr_sel_map = NULL; - } - } + intel_pmu_check_extra_regs(x86_pmu.extra_regs); /* Support full width counters using alternative MSR range */ if (x86_pmu.intel_cap.full_width_write) {
[tip: perf/core] perf/x86: Factor out x86_pmu_show_pmu_cap
The following commit has been merged into the perf/core branch of tip: Commit-ID: e11c1a7eb302ac8f6f47c18fa662546405a5fd83 Gitweb: https://git.kernel.org/tip/e11c1a7eb302ac8f6f47c18fa662546405a5fd83 Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:55 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:27 +02:00 perf/x86: Factor out x86_pmu_show_pmu_cap The PMU capabilities are different among hybrid PMUs. Perf should dump the PMU capabilities information for each hybrid PMU. Factor out x86_pmu_show_pmu_cap() which shows the PMU capabilities information. The function will be reused later when registering a dedicated hybrid PMU. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-16-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 25 - arch/x86/events/perf_event.h | 3 +++ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index ed8dcfb..2e7ae52 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1976,6 +1976,20 @@ static void _x86_pmu_read(struct perf_event *event) x86_perf_event_update(event); } +void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, + u64 intel_ctrl) +{ + pr_info("... version:%d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic registers: %d\n", num_counters); + pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %lu\n", + hweight641ULL << num_counters_fixed) - 1) + << INTEL_PMC_IDX_FIXED) & intel_ctrl)); + pr_info("... event mask: %016Lx\n", intel_ctrl); +} + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -2036,15 +2050,8 @@ static int __init init_hw_perf_events(void) pmu.attr_update = x86_pmu.attr_update; - pr_info("... 
version:%d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", x86_pmu.num_counters); - pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %lu\n", - hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1) - << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl)); - pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); + x86_pmu_show_pmu_cap(x86_pmu.num_counters, x86_pmu.num_counters_fixed, +x86_pmu.intel_ctrl); if (!x86_pmu.read) x86_pmu.read = _x86_pmu_read; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index d8c448b..a3534e3 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1092,6 +1092,9 @@ void x86_pmu_enable_event(struct perf_event *event); int x86_pmu_handle_irq(struct pt_regs *regs); +void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, + u64 intel_ctrl); + extern struct event_constraint emptyconstraint; extern struct event_constraint unconstrained;
[tip: perf/core] perf/x86: Remove temporary pmu assignment in event_init
The following commit has been merged into the perf/core branch of tip: Commit-ID: b98567298bad891774054113690b30bd90d5738d Gitweb: https://git.kernel.org/tip/b98567298bad891774054113690b30bd90d5738d Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:54 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:27 +02:00 perf/x86: Remove temporary pmu assignment in event_init The temporary pmu assignment in event_init is unnecessary. The assignment was introduced by commit 8113070d6639 ("perf_events: Add fast-path to the rescheduling code"). At that time, event->pmu is not assigned yet when initializing an event. The assignment is required. However, from commit 7e5b2a01d2ca ("perf: provide PMU when initing events"), the event->pmu is provided before event_init is invoked. The temporary pmu assignment in event_init should be removed. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-15-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 11 --- 1 file changed, 11 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 57d3fe1..ed8dcfb 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2291,7 +2291,6 @@ out: static int x86_pmu_event_init(struct perf_event *event) { - struct pmu *tmp; int err; switch (event->attr.type) { @@ -2306,20 +2305,10 @@ static int x86_pmu_event_init(struct perf_event *event) err = __x86_pmu_event_init(event); if (!err) { - /* -* we temporarily connect event to its pmu -* such that validate_group() can classify -* it as an x86 event using is_x86_event() -*/ - tmp = event->pmu; - event->pmu = &pmu; - if (event->group_leader != event) err = validate_group(event); else err = validate_event(event); - - event->pmu = tmp; } if (err) { if (event->destroy)
[tip: perf/core] perf/x86: Register hybrid PMUs
The following commit has been merged into the perf/core branch of tip: Commit-ID: d9977c43bff895ed49a9d25e1f382b0a98bb271f Gitweb: https://git.kernel.org/tip/d9977c43bff895ed49a9d25e1f382b0a98bb271f Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:56 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:27 +02:00 perf/x86: Register hybrid PMUs Different hybrid PMUs have different PMU capabilities and events. Perf should register a dedicated PMU for each of them. To check the X86 event, perf has to go through all possible hybrid pmus. All the hybrid PMUs are registered at boot time. Before the registration, add intel_pmu_check_hybrid_pmus() to check and update the counters information, the event constraints, the extra registers and the unique capabilities for each hybrid PMU. Postpone the display of the PMU information and HW check to CPU_STARTING, because the boot CPU is the only online CPU in the init_hw_perf_events(). Perf doesn't know the availability of the other PMUs. Perf should display the PMU information only if the counters of the PMU are available. One type of CPUs may be all offline. For this case, users can still observe the PMU in /sys/devices, but its CPU mask is 0. All hybrid PMUs have capability PERF_PMU_CAP_HETEROGENEOUS_CPUS. The PMU name for hybrid PMUs will be "cpu_XXX", which will be assigned later in a separate patch. The PMU type id for the core PMU is still PERF_TYPE_RAW. For the other hybrid PMUs, the PMU type id is not hard-coded. The event->cpu must be compatible with the supported CPUs of the PMU. Add a check in the x86_pmu_event_init(). The events in a group must be from the same type of hybrid PMU. The fake cpuc used in the validation must be from the supported CPU of the event->pmu. Perf may not retrieve a valid core type from get_this_hybrid_cpu_type(). For example, ADL may have an alternative configuration. With that configuration, Perf cannot retrieve the core type from the CPUID leaf 0x1a. 
Add a platform specific get_hybrid_cpu_type(). If the generic way fails, invoke the platform specific get_hybrid_cpu_type(). Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1618237865-33448-17-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 137 +- arch/x86/events/intel/core.c | 93 ++- arch/x86/events/perf_event.h | 14 +++- 3 files changed, 223 insertions(+), 21 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 2e7ae52..bd465a8 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -478,7 +478,7 @@ int x86_setup_perfctr(struct perf_event *event) local64_set(&hwc->period_left, hwc->sample_period); } - if (attr->type == PERF_TYPE_RAW) + if (attr->type == event->pmu->type) return x86_pmu_extra_regs(event->attr.config, event); if (attr->type == PERF_TYPE_HW_CACHE) @@ -613,7 +613,7 @@ int x86_pmu_hw_config(struct perf_event *event) if (!event->attr.exclude_kernel) event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; - if (event->attr.type == PERF_TYPE_RAW) + if (event->attr.type == event->pmu->type) event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; if (event->attr.sample_period && x86_pmu.limit_period) { @@ -742,7 +742,17 @@ void x86_pmu_enable_all(int added) static inline int is_x86_event(struct perf_event *event) { - return event->pmu == &pmu; + int i; + + if (!is_hybrid()) + return event->pmu == &pmu; + + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { + if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu) + return true; + } + + return false; } struct pmu *x86_get_pmu(unsigned int cpu) @@ -1990,6 +2000,23 @@ void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, pr_info("... event mask: %016Lx\n", intel_ctrl); } +/* + * The generic code is not hybrid friendly. The hybrid_pmu->pmu + * of the first registered PMU is unconditionally assigned to + * each possible cpuctx->ctx.pmu. 
+ * Update the correct hybrid PMU to the cpuctx->ctx.pmu. + */ +void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu) +{ + struct perf_cpu_context *cpuctx; + + if (!pmu->pmu_cpu_context) + return; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + cpuctx->ctx.pmu = pmu; +} + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -2050,8 +2077,11 @@ static int __init init_hw_perf_events(void) pmu.attr_update = x86_pmu.attr_update; - x86_pmu_show_pmu_cap(x86_pmu.num_counters, x86_pmu.num_counters_fixed, -x86_pmu.intel_ctrl); + if (!is_hybrid()) { +
[tip: perf/core] perf/x86: Add structures for the attributes of Hybrid PMUs
The following commit has been merged into the perf/core branch of tip: Commit-ID: a9c81ccdf52dd73a20178c40bca34cf52991fdea Gitweb: https://git.kernel.org/tip/a9c81ccdf52dd73a20178c40bca34cf52991fdea Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:57 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:28 +02:00 perf/x86: Add structures for the attributes of Hybrid PMUs Hybrid PMUs have different events and formats. In theory, Hybrid PMU specific attributes should be maintained in the dedicated struct x86_hybrid_pmu, but it wastes space because the events and formats are similar among Hybrid PMUs. To reduce duplication, all hybrid PMUs will share a group of attributes in the following patch. To distinguish an attribute from different Hybrid PMUs, a PMU aware attribute structure is introduced. A PMU type is required for the attribute structure. The type is for internal usage only. It is not visible in the sysfs API. Hybrid PMUs may support the same event name, but with different event encoding, e.g., the mem-loads event on an Atom PMU has different event encoding from a Core PMU. It causes an issue if two attributes are created for them. Current sysfs_update_group finds an attribute by searching the attr name (aka event name). If two attributes have the same event name, the first attribute will be replaced. To address the issue, only one attribute is created for the event. The event_str is extended and stores event encodings from all Hybrid PMUs. Each event encoding is divided by ";". The order of the event encodings must follow the order of the hybrid PMU index. The event_str is for internal usage as well. When a user wants to show the attribute of a Hybrid PMU, only the corresponding part of the string is displayed. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-18-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 43 +++- arch/x86/events/perf_event.h | 19 +++- include/linux/perf_event.h | 12 ++- 3 files changed, 74 insertions(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index bd465a8..37ab109 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1860,6 +1860,49 @@ ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, pmu_attr->event_str_noht); } +ssize_t events_hybrid_sysfs_show(struct device *dev, +struct device_attribute *attr, +char *page) +{ + struct perf_pmu_events_hybrid_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_hybrid_attr, attr); + struct x86_hybrid_pmu *pmu; + const char *str, *next_str; + int i; + + if (hweight64(pmu_attr->pmu_type) == 1) + return sprintf(page, "%s", pmu_attr->event_str); + + /* +* Hybrid PMUs may support the same event name, but with different +* event encoding, e.g., the mem-loads event on an Atom PMU has +* different event encoding from a Core PMU. +* +* The event_str includes all event encodings. Each event encoding +* is divided by ";". The order of the event encodings must follow +* the order of the hybrid PMU index. 
+*/ + pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + + str = pmu_attr->event_str; + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { + if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type)) + continue; + if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) { + next_str = strchr(str, ';'); + if (next_str) + return snprintf(page, next_str - str + 1, "%s", str); + else + return sprintf(page, "%s", str); + } + str = strchr(str, ';'); + str++; + } + + return 0; +} +EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show); + EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS); EVENT_ATTR(cache-references, CACHE_REFERENCES); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 4282ce4..e2be927 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -979,6 +979,22 @@ static struct perf_pmu_events_ht_attr event_attr_##v = { \ .event_str_ht = ht, \ } +#define EVENT_ATTR_STR_HYBRID(_name, v, str, _pmu) \ +static struct perf_pmu_events_hybrid_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, events_hybrid_sysfs_show, NULL),\ + .id
[tip: perf/core] perf/x86/intel: Add Alder Lake Hybrid support
The following commit has been merged into the perf/core branch of tip: Commit-ID: f83d2f91d2590318e083d05bd7b1beda2489050e Gitweb: https://git.kernel.org/tip/f83d2f91d2590318e083d05bd7b1beda2489050e Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:31:00 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:28 +02:00 perf/x86/intel: Add Alder Lake Hybrid support Alder Lake Hybrid system has two different types of cores, Golden Cove core and Gracemont core. The Golden Cove core is registered to "cpu_core" PMU. The Gracemont core is registered to "cpu_atom" PMU. The differences between the two PMUs include: - Number of GP and fixed counters - Events - The "cpu_core" PMU supports Topdown metrics. The "cpu_atom" PMU supports PEBS-via-PT. The "cpu_core" PMU is similar to the Sapphire Rapids PMU, but without PMEM. The "cpu_atom" PMU is similar to Tremont, but with different events, event_constraints, extra_regs and number of counters. The mem-loads AUX event workaround only applies to the Golden Cove core. Users may disable all CPUs of the same CPU type on the command line or in the BIOS. For this case, perf still registers a PMU for the CPU type but the CPU mask is 0. Current caps/pmu_name is usually the microarch codename. Assign the "alderlake_hybrid" to the caps/pmu_name of both PMUs to indicate the hybrid Alder Lake microarchitecture. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-21-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 255 +- arch/x86/events/intel/ds.c | 7 +- arch/x86/events/perf_event.h | 7 +- 3 files changed, 268 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ba24638..5272f34 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2076,6 +2076,14 @@ static struct extra_reg intel_tnt_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct extra_reg intel_grt_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3full, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x3full, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x5d0), + EVENT_EXTRA_END +}; + #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ #define KNL_MCDRAM_LOCAL BIT_ULL(21) @@ -2430,6 +2438,16 @@ static int icl_set_topdown_event_period(struct perf_event *event) return 0; } +static int adl_set_topdown_event_period(struct perf_event *event) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->cpu_type != hybrid_big) + return 0; + + return icl_set_topdown_event_period(event); +} + static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) { u32 val; @@ -2570,6 +2588,17 @@ static u64 icl_update_topdown_event(struct perf_event *event) x86_pmu.num_topdown_events - 1); } +static u64 adl_update_topdown_event(struct perf_event *event) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->cpu_type != hybrid_big) + return 0; + + return icl_update_topdown_event(event); +} + + static void intel_pmu_read_topdown_event(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ 
-3655,6 +3684,17 @@ static inline bool is_mem_loads_aux_event(struct perf_event *event) return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82); } +static inline bool require_mem_loads_aux_event(struct perf_event *event) +{ + if (!(x86_pmu.flags & PMU_FL_MEM_LOADS_AUX)) + return false; + + if (is_hybrid()) + return hybrid_pmu(event->pmu)->cpu_type == hybrid_big; + + return true; +} + static inline bool intel_pmu_has_cap(struct perf_event *event, int idx) { union perf_capabilities *intel_cap = &hybrid(event->pmu, intel_cap); @@ -3779,7 +3819,7 @@ static int intel_pmu_hw_config(struct perf_event *event) * event. The rule is to simplify the implementation of the check. * That's because perf cannot have a complete group at the moment. */ - if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX && + if (require_mem_loads_aux_event(event) && (event->attr.sample_type & PERF_SAMPLE_DATA_SRC) && is_mem_loads_event(event)) { struct perf_event *leader = event->group_leader; @@ -4056,6 +4096,39 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return c; } +static struct event_constraint * +adl_get_event_c
[tip: perf/core] perf/x86/intel: Add attr_update for Hybrid PMUs
The following commit has been merged into the perf/core branch of tip: Commit-ID: 58ae30c29a370c09eb49e0007d881a9aed13c5a3 Gitweb: https://git.kernel.org/tip/58ae30c29a370c09eb49e0007d881a9aed13c5a3 Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:58 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:28 +02:00 perf/x86/intel: Add attr_update for Hybrid PMUs The attribute_group for Hybrid PMUs should be different from the previous cpu PMU. For example, cpumask is required for a Hybrid PMU. The PMU type should be included in the event and format attribute. Add hybrid_attr_update for the Hybrid PMU. Check the PMU type in is_visible() function. Only display the event or format for the matched Hybrid PMU. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-19-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 120 -- 1 file changed, 114 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 4881209..ba24638 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5118,6 +5118,106 @@ static const struct attribute_group *attr_update[] = { NULL, }; +static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr) +{ + struct device *dev = kobj_to_dev(kobj); + struct x86_hybrid_pmu *pmu = + container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + struct perf_pmu_events_hybrid_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_hybrid_attr, attr.attr); + + return pmu->cpu_type & pmu_attr->pmu_type; +} + +static umode_t hybrid_events_is_visible(struct kobject *kobj, + struct attribute *attr, int i) +{ + return is_attr_for_this_pmu(kobj, attr) ? attr->mode : 0; +} + +static inline int hybrid_find_supported_cpu(struct x86_hybrid_pmu *pmu) +{ + int cpu = cpumask_first(&pmu->supported_cpus); + + return (cpu >= nr_cpu_ids) ? 
-1 : cpu; +} + +static umode_t hybrid_tsx_is_visible(struct kobject *kobj, +struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + struct x86_hybrid_pmu *pmu = +container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + int cpu = hybrid_find_supported_cpu(pmu); + + return (cpu >= 0) && is_attr_for_this_pmu(kobj, attr) && cpu_has(&cpu_data(cpu), X86_FEATURE_RTM) ? attr->mode : 0; +} + +static umode_t hybrid_format_is_visible(struct kobject *kobj, + struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + struct x86_hybrid_pmu *pmu = + container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + struct perf_pmu_format_hybrid_attr *pmu_attr = + container_of(attr, struct perf_pmu_format_hybrid_attr, attr.attr); + int cpu = hybrid_find_supported_cpu(pmu); + + return (cpu >= 0) && (pmu->cpu_type & pmu_attr->pmu_type) ? attr->mode : 0; +} + +static struct attribute_group hybrid_group_events_td = { + .name = "events", + .is_visible = hybrid_events_is_visible, +}; + +static struct attribute_group hybrid_group_events_mem = { + .name = "events", + .is_visible = hybrid_events_is_visible, +}; + +static struct attribute_group hybrid_group_events_tsx = { + .name = "events", + .is_visible = hybrid_tsx_is_visible, +}; + +static struct attribute_group hybrid_group_format_extra = { + .name = "format", + .is_visible = hybrid_format_is_visible, +}; + +static ssize_t intel_hybrid_get_attr_cpus(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct x86_hybrid_pmu *pmu = + container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + + return cpumap_print_to_pagebuf(true, buf, &pmu->supported_cpus); +} + +static DEVICE_ATTR(cpus, S_IRUGO, intel_hybrid_get_attr_cpus, NULL); +static struct attribute *intel_hybrid_cpus_attrs[] = { + &dev_attr_cpus.attr, + NULL, +}; + +static struct attribute_group hybrid_group_cpus = { + .attrs = intel_hybrid_cpus_attrs, +}; + +static const struct attribute_group 
*hybrid_attr_update[] = { + &hybrid_group_events_td, + &hybrid_group_events_mem, + &hybrid_group_events_tsx, + &group_caps_gen, + &group_caps_lbr, + &hybrid_group_format_extra, + &group_default, + &hybrid_group_cpus, + NULL, +}; + static struct attribute *empty_attrs; static void intel_pmu_check_num_counters(int *num_counters, @@ -5861,14 +5961,22 @@ __init int intel
[tip: perf/core] perf/x86: Support filter_match callback
The following commit has been merged into the perf/core branch of tip: Commit-ID: 3e9a8b219e4cc897dba20e19185d0471f129f6f3 Gitweb: https://git.kernel.org/tip/3e9a8b219e4cc897dba20e19185d0471f129f6f3 Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:30:59 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:28 +02:00 perf/x86: Support filter_match callback Implement filter_match callback for X86, which check whether an event is schedulable on the current CPU. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-20-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 10 ++ arch/x86/events/perf_event.h | 1 + 2 files changed, 11 insertions(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 37ab109..4f6595e 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2641,6 +2641,14 @@ static int x86_pmu_aux_output_match(struct perf_event *event) return 0; } +static int x86_pmu_filter_match(struct perf_event *event) +{ + if (x86_pmu.filter_match) + return x86_pmu.filter_match(event); + + return 1; +} + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable= x86_pmu_disable, @@ -2668,6 +2676,8 @@ static struct pmu pmu = { .check_period = x86_pmu_check_period, .aux_output_match = x86_pmu_aux_output_match, + + .filter_match = x86_pmu_filter_match, }; void arch_perf_update_userpage(struct perf_event *event, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e2be927..606fb6e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -879,6 +879,7 @@ struct x86_pmu { int (*aux_output_match) (struct perf_event *event); + int (*filter_match)(struct perf_event *event); /* * Hybrid support *
[tip: perf/core] perf: Extend PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
The following commit has been merged into the perf/core branch of tip: Commit-ID: 55bcf6ef314ae8ba81bcd74aa760247b635ed47b Gitweb: https://git.kernel.org/tip/55bcf6ef314ae8ba81bcd74aa760247b635ed47b Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:31:01 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:29 +02:00 perf: Extend PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE Current Hardware events and Hardware cache events have special perf types, PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE. The two types don't pass the PMU type in the user interface. For a hybrid system, the perf subsystem doesn't know which PMU the events belong to. The first capable PMU will always be assigned to the events. The events never get a chance to run on the other capable PMUs. Extend the two types to become PMU aware types. The PMU type ID is stored at attr.config[63:32]. Add a new PMU capability, PERF_PMU_CAP_EXTENDED_HW_TYPE, to indicate a PMU which supports the extended PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE. The PMU type is only required when searching a specific PMU. The PMU specific codes will only be interested in the 'real' config value, which is stored in the low 32 bit of the event->attr.config. Update the event->attr.config in the generic code, so the PMU specific codes don't need to calculate it separately. If a user specifies a PMU type, but the PMU doesn't support the extended type, error out. If an event cannot be initialized in a PMU specified by a user, error out immediately. Perf should not try to open it on other PMUs. The new PMU capability is only set for the X86 hybrid PMUs for now. Other architectures, e.g., ARM, may need it as well. The support on ARM may be implemented later separately. 
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1618237865-33448-22-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 1 + include/linux/perf_event.h | 19 ++- include/uapi/linux/perf_event.h | 15 +++ kernel/events/core.c| 19 --- 4 files changed, 42 insertions(+), 12 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 4f6595e..3fe66b7 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2173,6 +2173,7 @@ static int __init init_hw_perf_events(void) hybrid_pmu->pmu.type = -1; hybrid_pmu->pmu.attr_update = x86_pmu.attr_update; hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS; + hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE; err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name, (hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 61b3851..a763928 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -260,15 +260,16 @@ struct perf_event; /** * pmu::capabilities flags */ -#define PERF_PMU_CAP_NO_INTERRUPT 0x01 -#define PERF_PMU_CAP_NO_NMI0x02 -#define PERF_PMU_CAP_AUX_NO_SG 0x04 -#define PERF_PMU_CAP_EXTENDED_REGS 0x08 -#define PERF_PMU_CAP_EXCLUSIVE 0x10 -#define PERF_PMU_CAP_ITRACE0x20 -#define PERF_PMU_CAP_HETEROGENEOUS_CPUS0x40 -#define PERF_PMU_CAP_NO_EXCLUDE0x80 -#define PERF_PMU_CAP_AUX_OUTPUT0x100 +#define PERF_PMU_CAP_NO_INTERRUPT 0x0001 +#define PERF_PMU_CAP_NO_NMI0x0002 +#define PERF_PMU_CAP_AUX_NO_SG 0x0004 +#define PERF_PMU_CAP_EXTENDED_REGS 0x0008 +#define PERF_PMU_CAP_EXCLUSIVE 0x0010 +#define PERF_PMU_CAP_ITRACE0x0020 +#define PERF_PMU_CAP_HETEROGENEOUS_CPUS0x0040 +#define PERF_PMU_CAP_NO_EXCLUDE0x0080 +#define PERF_PMU_CAP_AUX_OUTPUT0x0100 +#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0200 struct perf_output_handle; diff --git a/include/uapi/linux/perf_event.h 
b/include/uapi/linux/perf_event.h index 0b58970..e54e639 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -38,6 +38,21 @@ enum perf_type_id { }; /* + * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA + * AA: hardware event ID + * EEEEEEEE: PMU type ID + * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB + * BB: hardware cache ID + * CC: hardware cache op ID + *
[tip: perf/core] perf/x86/intel/uncore: Add Alder Lake support
The following commit has been merged into the perf/core branch of tip: Commit-ID: 772ed05f3c5ce722b9de6c4c2dd87538a33fb8d3 Gitweb: https://git.kernel.org/tip/772ed05f3c5ce722b9de6c4c2dd87538a33fb8d3 Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:31:02 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:29 +02:00 perf/x86/intel/uncore: Add Alder Lake support The uncore subsystem for Alder Lake is similar to the previous Tiger Lake. The difference includes: - New MSR addresses for global control, fixed counters, CBOX and ARB. Add a new adl_uncore_msr_ops for uncore operations. - Add a new threshold field for CBOX. - New PCIIDs for IMC devices. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-23-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore.c | 7 +- arch/x86/events/intel/uncore.h | 1 +- arch/x86/events/intel/uncore_snb.c | 131 - 3 files changed, 139 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index a2b68bb..df7b07d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1752,6 +1752,11 @@ static const struct intel_uncore_init_fun rkl_uncore_init __initconst = { .pci_init = skl_uncore_pci_init, }; +static const struct intel_uncore_init_fun adl_uncore_init __initconst = { + .cpu_init = adl_uncore_cpu_init, + .mmio_init = tgl_uncore_mmio_init, +}; + static const struct intel_uncore_init_fun icx_uncore_init __initconst = { .cpu_init = icx_uncore_cpu_init, .pci_init = icx_uncore_pci_init, @@ -1806,6 +1811,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 
&adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), {}, }; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 96569dc..2917910 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -582,6 +582,7 @@ void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); void skl_uncore_cpu_init(void); void icl_uncore_cpu_init(void); +void adl_uncore_cpu_init(void); void tgl_uncore_cpu_init(void); void tgl_uncore_mmio_init(void); void tgl_l_uncore_mmio_init(void); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 5127128..0f63706 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -62,6 +62,8 @@ #define PCI_DEVICE_ID_INTEL_TGL_H_IMC 0x9a36 #define PCI_DEVICE_ID_INTEL_RKL_1_IMC 0x4c43 #define PCI_DEVICE_ID_INTEL_RKL_2_IMC 0x4c53 +#define PCI_DEVICE_ID_INTEL_ADL_1_IMC 0x4660 +#define PCI_DEVICE_ID_INTEL_ADL_2_IMC 0x4641 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK0x00ff @@ -131,12 +133,33 @@ #define ICL_UNC_ARB_PER_CTR0x3b1 #define ICL_UNC_ARB_PERFEVTSEL 0x3b3 +/* ADL uncore global control */ +#define ADL_UNC_PERF_GLOBAL_CTL0x2ff0 +#define ADL_UNC_FIXED_CTR_CTRL 0x2fde +#define ADL_UNC_FIXED_CTR 0x2fdf + +/* ADL Cbo register */ +#define ADL_UNC_CBO_0_PER_CTR0 0x2002 +#define ADL_UNC_CBO_0_PERFEVTSEL0 0x2000 +#define ADL_UNC_CTL_THRESHOLD 0x3f00 +#define ADL_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ +SNB_UNC_CTL_UMASK_MASK | \ +SNB_UNC_CTL_EDGE_DET | \ +SNB_UNC_CTL_INVERT | \ +ADL_UNC_CTL_THRESHOLD) + +/* ADL ARB register */ +#define ADL_UNC_ARB_PER_CTR0 0x2FD2 +#define ADL_UNC_ARB_PERFEVTSEL00x2FD0 +#define ADL_UNC_ARB_MSR_OFFSET 0x8 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, 
"config:24-28"); DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(threshold, threshold, "config:24-29"); /* Sandy Bridge uncore support */ static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) @@ -4
[tip: perf/core] perf/x86/cstate: Add Alder Lake CPU support
The following commit has been merged into the perf/core branch of tip: Commit-ID: d0ca946bcf84e1f9847571923bb1e6bd1264f424 Gitweb: https://git.kernel.org/tip/d0ca946bcf84e1f9847571923bb1e6bd1264f424 Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:31:04 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:29 +02:00 perf/x86/cstate: Add Alder Lake CPU support Compared with the Rocket Lake, the CORE C1 Residency Counter is added for Alder Lake, but the CORE C3 Residency Counter is removed. Other counters are the same. Create a new adl_cstates for Alder Lake. Update the comments accordingly. The External Design Specification (EDS) is not published yet. It comes from an authoritative internal source. The patch has been tested on real hardware. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-25-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/cstate.c | 39 - 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 407eee5..4333990 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -40,7 +40,7 @@ * Model specific counters: * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 - * Available model: SLM,AMT,GLM,CNL,TNT + * Available model: SLM,AMT,GLM,CNL,TNT,ADL * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter *perf code: 0x01 @@ -51,46 +51,49 @@ *perf code: 0x02 *Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, - * TNT,RKL + * TNT,RKL,ADL *Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter *perf code: 0x03 *Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, - * ICL,TGL,RKL + * ICL,TGL,RKL,ADL *Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. 
*perf code: 0x00 *Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, - * KBL,CML,ICL,TGL,TNT,RKL + * KBL,CML,ICL,TGL,TNT,RKL,ADL *Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. *perf code: 0x01 *Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, - * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL + * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL, + * ADL *Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. *perf code: 0x02 *Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, - * TNT,RKL + * TNT,RKL,ADL *Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. *perf code: 0x03 *Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL, - * KBL,CML,ICL,TGL,RKL + * KBL,CML,ICL,TGL,RKL,ADL *Scope: Package (physical package) * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. *perf code: 0x04 - *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL + *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL, + * ADL *Scope: Package (physical package) * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. *perf code: 0x05 - *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL + *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL, + * ADL *Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Packag
[tip: perf/core] perf/x86/msr: Add Alder Lake CPU support
The following commit has been merged into the perf/core branch of tip: Commit-ID: 19d3a81fd92dc9b73950564955164ecfd0dfbea1 Gitweb: https://git.kernel.org/tip/19d3a81fd92dc9b73950564955164ecfd0dfbea1 Author:Kan Liang AuthorDate:Mon, 12 Apr 2021 07:31:03 -07:00 Committer: Peter Zijlstra CommitterDate: Mon, 19 Apr 2021 20:03:29 +02:00 perf/x86/msr: Add Alder Lake CPU support PPERF and SMI_COUNT MSRs are also supported on Alder Lake. The External Design Specification (EDS) is not published yet. It comes from an authoritative internal source. The patch has been tested on real hardware. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-24-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/msr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 680404c..c853b28 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -100,6 +100,8 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_TIGERLAKE_L: case INTEL_FAM6_TIGERLAKE: case INTEL_FAM6_ROCKETLAKE: + case INTEL_FAM6_ALDERLAKE: + case INTEL_FAM6_ALDERLAKE_L: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break;
[tip: perf/core] perf/x86: Move cpuc->running into P4 specific code
The following commit has been merged into the perf/core branch of tip: Commit-ID: 46ade4740bbf9bf4e804ddb2c85845cccd219f3c Gitweb: https://git.kernel.org/tip/46ade4740bbf9bf4e804ddb2c85845cccd219f3c Author:Kan Liang AuthorDate:Wed, 14 Apr 2021 07:36:29 -07:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 16:32:42 +02:00 perf/x86: Move cpuc->running into P4 specific code The 'running' variable is only used in the P4 PMU. Current perf sets the variable in the critical function x86_pmu_start(), which wastes cycles for everybody not running on P4. Move cpuc->running into the P4 specific p4_pmu_enable_event(). Add a static per-CPU 'p4_running' variable to replace the 'running' variable in the struct cpu_hw_events. Saves space for the generic structure. The p4_pmu_enable_all() also invokes the p4_pmu_enable_event(), but it should not set cpuc->running. Factor out __p4_pmu_enable_event() for p4_pmu_enable_all(). Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1618410990-21383-1-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 1 - arch/x86/events/intel/p4.c | 16 +--- arch/x86/events/perf_event.h | 1 - 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 18df171..dd9f3c2 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1480,7 +1480,6 @@ static void x86_pmu_start(struct perf_event *event, int flags) cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); - __set_bit(idx, cpuc->running); static_call(x86_pmu_enable)(event); perf_event_update_userpage(event); } diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index a4cc660..9c10cbb 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -947,7 +947,7 @@ static void p4_pmu_enable_pebs(u64 config) (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); } -static void 
p4_pmu_enable_event(struct perf_event *event) +static void __p4_pmu_enable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; int thread = p4_ht_config_thread(hwc->config); @@ -983,6 +983,16 @@ static void p4_pmu_enable_event(struct perf_event *event) (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); } +static DEFINE_PER_CPU(unsigned long [BITS_TO_LONGS(X86_PMC_IDX_MAX)], p4_running); + +static void p4_pmu_enable_event(struct perf_event *event) +{ + int idx = event->hw.idx; + + __set_bit(idx, per_cpu(p4_running, smp_processor_id())); + __p4_pmu_enable_event(event); +} + static void p4_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -992,7 +1002,7 @@ static void p4_pmu_enable_all(int added) struct perf_event *event = cpuc->events[idx]; if (!test_bit(idx, cpuc->active_mask)) continue; - p4_pmu_enable_event(event); + __p4_pmu_enable_event(event); } } @@ -1012,7 +1022,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) if (!test_bit(idx, cpuc->active_mask)) { /* catch in-flight IRQs */ - if (__test_and_clear_bit(idx, cpuc->running)) + if (__test_and_clear_bit(idx, per_cpu(p4_running, smp_processor_id( handled++; continue; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 53b2b5f..54a340e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -228,7 +228,6 @@ struct cpu_hw_events { */ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; int n_events; /* the # of events in the below arrays */
[tip: perf/core] perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task
The following commit has been merged into the perf/core branch of tip: Commit-ID: 01fd9661e168de7cfc4f947e7220fca0e6791999 Gitweb: https://git.kernel.org/tip/01fd9661e168de7cfc4f947e7220fca0e6791999 Author:Kan Liang AuthorDate:Wed, 14 Apr 2021 07:36:30 -07:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 16:32:43 +02:00 perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task The counter value of a perf task may leak to another RDPMC task. For example, a perf stat task as below is running on CPU 0. perf stat -e 'branches,cycles' -- taskset -c 0 ./workload In the meantime, an RDPMC task, which is also running on CPU 0, may read the GP counters periodically. (The RDPMC task creates a fixed event, but reads four GP counters.) $ taskset -c 0 ./rdpmc_read_all_counters index 0x0 value 0x8001e5970f99 index 0x1 value 0x8005d750edb6 index 0x2 value 0x0 index 0x3 value 0x0 index 0x0 value 0x8002358e48a5 index 0x1 value 0x8006bd1e3bc9 index 0x2 value 0x0 index 0x3 value 0x0 It is a potential security issue. Once the attacker knows what the other thread is counting, the PerfMon counter can be used as a side-channel to attack cryptosystems. The counter value of the perf stat task leaks to the RDPMC task because perf never clears the counter when it's stopped. Two methods were considered to address the issue. - Unconditionally reset the counter in x86_pmu_del(). It can bring extra overhead even when there is no RDPMC task running. - Only reset the un-assigned dirty counters when the RDPMC task is scheduled in. The method is implemented here. The dirty counter is a counter, on which the assigned event has been deleted, but the counter is not reset. To track the dirty counters, add a 'dirty' variable in the struct cpu_hw_events. The current code doesn't reset the counter when the assigned event is deleted. Set the corresponding bit in the 'dirty' variable in x86_pmu_del(), if the RDPMC feature is available on the system. 
The security issue can only be found with an RDPMC task. The event for an RDPMC task requires the mmap buffer. This can be used to detect an RDPMC task. Once the event is detected in the event_mapped(), enable sched_task(), which is invoked in each context switch. Add a check in the sched_task() to clear the dirty counters, when the RDPMC task is scheduled in. Only the current un-assigned dirty counters are reset, because the RDPMC assigned dirty counters will be updated soon. The RDPMC instruction is also supported on the older platforms. Add sched_task() for the core_pmu. The core_pmu doesn't support large PEBS and LBR callstack, so the intel_pmu_pebs/lbr_sched_task() will be ignored. RDPMC is not an Intel-only feature. Add the dirty counters clear code in the X86 generic code. After applying the patch, $ taskset -c 0 ./rdpmc_read_all_counters index 0x0 value 0x0 index 0x1 value 0x0 index 0x2 value 0x0 index 0x3 value 0x0 index 0x0 value 0x0 index 0x1 value 0x0 index 0x2 value 0x0 index 0x3 value 0x0 Performance The performance of a context switch is only impacted when there are two or more perf users and one of the users is an RDPMC user. In other cases, there is no performance impact. The worst-case occurs when there are two users: the RDPMC user only applies one counter; while the other user applies all available counters. When the RDPMC task is scheduled in, all the counters, other than the RDPMC assigned one, have to be reset. Here is the test result for the worst-case. The test is implemented on an Ice Lake platform, which has 8 GP counters and 3 fixed counters (not including the SLOTS counter). The lat_ctx is used to measure the context switching time. lat_ctx -s 128K -N 1000 processes 2 I instrument the lat_ctx to open all 8 GP counters and 3 fixed counters for one task. The other task opens a fixed counter and enables RDPMC. 
Without the patch: The context switch time is 4.97 us With the patch: The context switch time is 5.16 us There is ~4% performance drop for the context switching time in the worst-case. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1618410990-21383-2-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 47 +++- arch/x86/events/perf_event.h | 1 +- 2 files changed, 48 insertions(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index dd9f3c2..e34eb72 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1585,6 +1585,8 @@ static void x86_pmu_del(struct perf_event *event, int flags) if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto do_del; + __set_bit(event->hw.idx, cpuc->dirty); + /* * Not a TXN, therefore cleanup properly. */ @@ -2304,12 +2306,46 @@ s
[tip: perf/core] perf/x86/intel/uncore: Generic support for the MSR type of uncore blocks
The following commit has been merged into the perf/core branch of tip: Commit-ID: d6c754130435ab786711bed75d04a2388a6b4da8 Gitweb: https://git.kernel.org/tip/d6c754130435ab786711bed75d04a2388a6b4da8 Author:Kan Liang AuthorDate:Wed, 17 Mar 2021 10:59:34 -07:00 Committer: Peter Zijlstra CommitterDate: Fri, 02 Apr 2021 10:04:54 +02:00 perf/x86/intel/uncore: Generic support for the MSR type of uncore blocks The discovery table provides the generic uncore block information for the MSR type of uncore blocks, e.g., the counter width, the number of counters, the location of control/counter registers, which is good enough to provide basic uncore support. It can be used as a fallback solution when the kernel doesn't support a platform. The name of the uncore box cannot be retrieved from the discovery table. uncore_type_&typeID_&boxID will be used as its name. Save the type ID and the box ID information in the struct intel_uncore_type. Factor out uncore_get_pmu_name() to handle different naming methods. Implement generic support for the MSR type of uncore block. Some advanced features, such as filters and constraints, cannot be retrieved from discovery tables. Features that rely on that information are not supported here. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1616003977-90612-3-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore.c | 45 ++-- arch/x86/events/intel/uncore.h | 3 +- arch/x86/events/intel/uncore_discovery.c | 126 ++- arch/x86/events/intel/uncore_discovery.h | 18 +++- 4 files changed, 182 insertions(+), 10 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index d111370..dabc01f 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -10,7 +10,7 @@ static bool uncore_no_discover; module_param(uncore_no_discover, bool, 0); MODULE_PARM_DESC(uncore_no_discover, "Don't enable the Intel uncore PerfMon discovery mechanism " "(default: enable the discovery mechanism)."); -static struct intel_uncore_type *empty_uncore[] = { NULL, }; +struct intel_uncore_type *empty_uncore[] = { NULL, }; struct intel_uncore_type **uncore_msr_uncores = empty_uncore; struct intel_uncore_type **uncore_pci_uncores = empty_uncore; struct intel_uncore_type **uncore_mmio_uncores = empty_uncore; @@ -834,6 +834,34 @@ static const struct attribute_group uncore_pmu_attr_group = { .attrs = uncore_pmu_attrs, }; +static void uncore_get_pmu_name(struct intel_uncore_pmu *pmu) +{ + struct intel_uncore_type *type = pmu->type; + + /* +* No uncore block name in discovery table. +* Use uncore_type_&typeid_&boxid as name. 
+*/ + if (!type->name) { + if (type->num_boxes == 1) + sprintf(pmu->name, "uncore_type_%u", type->type_id); + else { + sprintf(pmu->name, "uncore_type_%u_%d", + type->type_id, type->box_ids[pmu->pmu_idx]); + } + return; + } + + if (type->num_boxes == 1) { + if (strlen(type->name) > 0) + sprintf(pmu->name, "uncore_%s", type->name); + else + sprintf(pmu->name, "uncore"); + } else + sprintf(pmu->name, "uncore_%s_%d", type->name, pmu->pmu_idx); + +} + static int uncore_pmu_register(struct intel_uncore_pmu *pmu) { int ret; @@ -860,15 +888,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) pmu->pmu.attr_update = pmu->type->attr_update; } - if (pmu->type->num_boxes == 1) { - if (strlen(pmu->type->name) > 0) - sprintf(pmu->name, "uncore_%s", pmu->type->name); - else - sprintf(pmu->name, "uncore"); - } else { - sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, - pmu->pmu_idx); - } + uncore_get_pmu_name(pmu); ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); if (!ret) @@ -909,6 +929,10 @@ static void uncore_type_exit(struct intel_uncore_type *type) kfree(type->pmus); type->pmus = NULL; } + if (type->box_ids) { + kfree(type->box_ids); + type->box_ids = NULL; + } kfree(type->events_group); type->events_group = NULL; } @@ -1643,6 +1667,7 @@ static const struct intel_uncore_init_fun snr_uncore_init __initconst = { }; static const struct intel_uncore_init_fun generic_uncore_init __initconst = { + .cpu_init = intel_uncore_generic_uncore_cpu_init, }; static const struct x86_cpu_id intel_uncore_match[] __initconst = { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events
[tip: perf/core] perf/x86/intel/uncore: Parse uncore discovery tables
The following commit has been merged into the perf/core branch of tip: Commit-ID: edae1f06c2cda41edffc93de6aedc8ba8dc883c3 Gitweb: https://git.kernel.org/tip/edae1f06c2cda41edffc93de6aedc8ba8dc883c3 Author:Kan Liang AuthorDate:Wed, 17 Mar 2021 10:59:33 -07:00 Committer: Peter Zijlstra CommitterDate: Fri, 02 Apr 2021 10:04:54 +02:00 perf/x86/intel/uncore: Parse uncore discovery tables A self-describing mechanism for the uncore PerfMon hardware has been introduced with the latest Intel platforms. By reading through an MMIO page worth of information, perf can 'discover' all the standard uncore PerfMon registers in a machine. The discovery mechanism relies on BIOS's support. With a proper BIOS, a PCI device with the unique capability ID 0x23 can be found on each die. Perf can retrieve the information of all available uncore PerfMons from the device via MMIO. The information is composed of one global discovery table and several unit discovery tables. - The global discovery table includes global uncore information of the die, e.g., the address of the global control register, the offset of the global status register, the number of uncore units, the offset of unit discovery tables, etc. - The unit discovery table includes generic uncore unit information, e.g., the access type, the counter width, the address of counters, the address of the counter control, the unit ID, the unit type, etc. The unit is also called "box" in the code. Perf can provide basic uncore support based on this information with the following patches. To locate the PCI device with the discovery tables, check the generic PCI ID first. If it doesn't match, go through the entire PCI device tree and locate the device with the unique capability ID. The uncore information is similar among dies. To save parsing time and space, only completely parse and store the discovery tables on the first die and the first box of each die. The parsed information is stored in an RB tree structure, intel_uncore_discovery_type. 
The size of the stored discovery tables varies among platforms. It's around 4KB for a Sapphire Rapids server. If a BIOS doesn't support the 'discovery' mechanism, the uncore driver will exit with -ENODEV. There is nothing changed. Add a module parameter to disable the discovery feature. If a BIOS gets the discovery tables wrong, users can have an option to disable the feature. For the current patchset, the uncore driver will exit with -ENODEV. In the future, it may fall back to the hardcode uncore driver on a known platform. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1616003977-90612-2-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/Makefile | 2 +- arch/x86/events/intel/uncore.c | 31 +- arch/x86/events/intel/uncore_discovery.c | 318 ++- arch/x86/events/intel/uncore_discovery.h | 105 +++- 4 files changed, 448 insertions(+), 8 deletions(-) create mode 100644 arch/x86/events/intel/uncore_discovery.c create mode 100644 arch/x86/events/intel/uncore_discovery.h diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index e67a588..10bde6c 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -3,6 +3,6 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL)+= ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL)+= lbr.o p4.o p6.o pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o -intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o +intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o uncore_discovery.o obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o intel-cstate-objs := cstate.o diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 33c8180..d111370 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -4,7 +4,12 @@ #include #include #include "uncore.h" +#include "uncore_discovery.h" +static bool uncore_no_discover; 
+module_param(uncore_no_discover, bool, 0); +MODULE_PARM_DESC(uncore_no_discover, "Don't enable the Intel uncore PerfMon discovery mechanism " +"(default: enable the discovery mechanism)."); static struct intel_uncore_type *empty_uncore[] = { NULL, }; struct intel_uncore_type **uncore_msr_uncores = empty_uncore; struct intel_uncore_type **uncore_pci_uncores = empty_uncore; @@ -1637,6 +1642,9 @@ static const struct intel_uncore_init_fun snr_uncore_init __initconst = { .mmio_init = snr_uncore_mmio_init, }; +static const struct intel_uncore_init_fun generic_uncore_init __initconst = { +}; + static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_uncore_init),
[tip: perf/core] perf/x86/intel/uncore: Generic support for the PCI type of uncore blocks
The following commit has been merged into the perf/core branch of tip: Commit-ID: 42839ef4a20a4bda415974ff0e7d85ff540fffa4 Gitweb: https://git.kernel.org/tip/42839ef4a20a4bda415974ff0e7d85ff540fffa4 Author:Kan Liang AuthorDate:Wed, 17 Mar 2021 10:59:36 -07:00 Committer: Peter Zijlstra CommitterDate: Fri, 02 Apr 2021 10:04:55 +02:00 perf/x86/intel/uncore: Generic support for the PCI type of uncore blocks The discovery table provides the generic uncore block information for the PCI type of uncore blocks, which is good enough to provide basic uncore support. The PCI BUS and DEVFN information can be retrieved from the box control field. Introduce the uncore_pci_pmus_register() to register all the PCICFG type of uncore blocks. The old PCI probe/remove way is dropped. The PCI BUS and DEVFN information are different among dies. Add box_ctls to store the box control field of each die. Add a new BUS notifier for the PCI type of uncore block to support the hotplug. If the device is hot-removed, the corresponding registered PMU has to be unregistered. Perf cannot locate the PMU by searching a const pci_device_id table, because the discovery tables don't provide such information. Introduce uncore_pci_find_dev_pmu_from_types() to search the whole uncore_pci_uncores for the PMU. Implement generic support for the PCI type of uncore block. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1616003977-90612-5-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore.c | 91 +-- arch/x86/events/intel/uncore.h | 6 +- arch/x86/events/intel/uncore_discovery.c | 80 - arch/x86/events/intel/uncore_discovery.h | 7 ++- 4 files changed, 177 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 391fa7c..3109082 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1032,10 +1032,37 @@ static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die) return 0; } +static struct intel_uncore_pmu * +uncore_pci_find_dev_pmu_from_types(struct pci_dev *pdev) +{ + struct intel_uncore_type **types = uncore_pci_uncores; + struct intel_uncore_type *type; + u64 box_ctl; + int i, die; + + for (; *types; types++) { + type = *types; + for (die = 0; die < __uncore_max_dies; die++) { + for (i = 0; i < type->num_boxes; i++) { + if (!type->box_ctls[die]) + continue; + box_ctl = type->box_ctls[die] + type->pci_offsets[i]; + if (pdev->devfn == UNCORE_DISCOVERY_PCI_DEVFN(box_ctl) && + pdev->bus->number == UNCORE_DISCOVERY_PCI_BUS(box_ctl) && + pci_domain_nr(pdev->bus) == UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl)) + return &type->pmus[i]; + } + } + } + + return NULL; +} + /* * Find the PMU of a PCI device. * @pdev: The PCI device. * @ids: The ID table of the available PCI devices with a PMU. + * If NULL, search the whole uncore_pci_uncores. 
*/ static struct intel_uncore_pmu * uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) @@ -1045,6 +1072,9 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) kernel_ulong_t data; unsigned int devfn; + if (!ids) + return uncore_pci_find_dev_pmu_from_types(pdev); + while (ids && ids->vendor) { if ((ids->vendor == pdev->vendor) && (ids->device == pdev->device)) { @@ -1283,6 +1313,48 @@ static void uncore_pci_sub_driver_init(void) uncore_pci_sub_driver = NULL; } +static int uncore_pci_bus_notify(struct notifier_block *nb, +unsigned long action, void *data) +{ + return uncore_bus_notify(nb, action, data, NULL); +} + +static struct notifier_block uncore_pci_notifier = { + .notifier_call = uncore_pci_bus_notify, +}; + + +static void uncore_pci_pmus_register(void) +{ + struct intel_uncore_type **types = uncore_pci_uncores; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct pci_dev *pdev; + u64 box_ctl; + int i, die; + + for (; *types; types++) { + type = *types; + for (die = 0; die < __uncore_max_dies; die++) { + for (i = 0; i < type->num_boxes; i++) { + if (!type->box_ctls[die]) + continue; + box_ctl = type->box_ctls[die] + type->pci_offsets[i]; +
[tip: perf/core] perf/x86/intel/uncore: Rename uncore_notifier to uncore_pci_sub_notifier
The following commit has been merged into the perf/core branch of tip: Commit-ID: 6477dc3934775f82a571fac469fd8c348e611095 Gitweb: https://git.kernel.org/tip/6477dc3934775f82a571fac469fd8c348e611095 Author:Kan Liang AuthorDate:Wed, 17 Mar 2021 10:59:35 -07:00 Committer: Peter Zijlstra CommitterDate: Fri, 02 Apr 2021 10:04:54 +02:00 perf/x86/intel/uncore: Rename uncore_notifier to uncore_pci_sub_notifier Perf will use a similar method to the PCI sub driver to register the PMUs for the PCI type of uncore blocks. The method requires a BUS notifier to support hotplug. The current BUS notifier cannot be reused, because it searches a const id_table for the corresponding registered PMU. The PCI type of uncore blocks in the discovery tables doesn't provide an id_table. Factor out uncore_bus_notify() and add the pointer of an id_table as a parameter. The uncore_bus_notify() will be reused in the following patch. The current BUS notifier is only used by the PCI sub driver. Its name is too generic. Rename it to uncore_pci_sub_notifier, which is specific for the PCI sub driver. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1616003977-90612-4-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore.c | 20 ++-- 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index dabc01f..391fa7c 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1203,7 +1203,8 @@ static void uncore_pci_remove(struct pci_dev *pdev) } static int uncore_bus_notify(struct notifier_block *nb, -unsigned long action, void *data) +unsigned long action, void *data, +const struct pci_device_id *ids) { struct device *dev = data; struct pci_dev *pdev = to_pci_dev(dev); @@ -1214,7 +1215,7 @@ static int uncore_bus_notify(struct notifier_block *nb, if (action != BUS_NOTIFY_DEL_DEVICE) return NOTIFY_DONE; - pmu = uncore_pci_find_dev_pmu(pdev, uncore_pci_sub_driver->id_table); + pmu = uncore_pci_find_dev_pmu(pdev, ids); if (!pmu) return NOTIFY_DONE; @@ -1226,8 +1227,15 @@ static int uncore_bus_notify(struct notifier_block *nb, return NOTIFY_OK; } -static struct notifier_block uncore_notifier = { - .notifier_call = uncore_bus_notify, +static int uncore_pci_sub_bus_notify(struct notifier_block *nb, +unsigned long action, void *data) +{ + return uncore_bus_notify(nb, action, data, +uncore_pci_sub_driver->id_table); +} + +static struct notifier_block uncore_pci_sub_notifier = { + .notifier_call = uncore_pci_sub_bus_notify, }; static void uncore_pci_sub_driver_init(void) @@ -1268,7 +1276,7 @@ static void uncore_pci_sub_driver_init(void) ids++; } - if (notify && bus_register_notifier(&pci_bus_type, &uncore_notifier)) + if (notify && bus_register_notifier(&pci_bus_type, &uncore_pci_sub_notifier)) notify = false; if (!notify) @@ -1319,7 +1327,7 @@ static void uncore_pci_exit(void) if (pcidrv_registered) { pcidrv_registered = false; if (uncore_pci_sub_driver) - bus_unregister_notifier(&pci_bus_type, &uncore_notifier); + 
bus_unregister_notifier(&pci_bus_type, &uncore_pci_sub_notifier); pci_unregister_driver(uncore_pci_driver); uncore_types_exit(uncore_pci_uncores); kfree(uncore_extra_pci_dev);
[tip: perf/core] perf/x86/intel/uncore: Generic support for the MMIO type of uncore blocks
The following commit has been merged into the perf/core branch of tip: Commit-ID: c4c55e362a521d763356b9e02bc9a4348c71a471 Gitweb: https://git.kernel.org/tip/c4c55e362a521d763356b9e02bc9a4348c71a471 Author:Kan Liang AuthorDate:Wed, 17 Mar 2021 10:59:37 -07:00 Committer: Peter Zijlstra CommitterDate: Fri, 02 Apr 2021 10:04:55 +02:00 perf/x86/intel/uncore: Generic support for the MMIO type of uncore blocks The discovery table provides the generic uncore block information for the MMIO type of uncore blocks, which is good enough to provide basic uncore support. The box control field is composed of the BAR address and box control offset. When initializing the uncore blocks, perf should ioremap the address from the box control field. Implement the generic support for the MMIO type of uncore block. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1616003977-90612-6-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore.c | 1 +- arch/x86/events/intel/uncore.h | 1 +- arch/x86/events/intel/uncore_discovery.c | 98 +++- arch/x86/events/intel/uncore_discovery.h | 1 +- 4 files changed, 101 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 3109082..35b3470 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1755,6 +1755,7 @@ static const struct intel_uncore_init_fun snr_uncore_init __initconst = { static const struct intel_uncore_init_fun generic_uncore_init __initconst = { .cpu_init = intel_uncore_generic_uncore_cpu_init, .pci_init = intel_uncore_generic_uncore_pci_init, + .mmio_init = intel_uncore_generic_uncore_mmio_init, }; static const struct x86_cpu_id intel_uncore_match[] __initconst = { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 76fc898..549cfb2 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -70,6 +70,7 @@ struct intel_uncore_type { union { unsigned 
*msr_offsets; unsigned *pci_offsets; + unsigned *mmio_offsets; }; unsigned *box_ids; struct event_constraint unconstrainted; diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 784d7b4..aba9bff 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -442,6 +442,90 @@ static struct intel_uncore_ops generic_uncore_pci_ops = { .read_counter = intel_generic_uncore_pci_read_counter, }; +#define UNCORE_GENERIC_MMIO_SIZE 0x4000 + +static unsigned int generic_uncore_mmio_box_ctl(struct intel_uncore_box *box) +{ + struct intel_uncore_type *type = box->pmu->type; + + if (!type->box_ctls || !type->box_ctls[box->dieid] || !type->mmio_offsets) + return 0; + + return type->box_ctls[box->dieid] + type->mmio_offsets[box->pmu->pmu_idx]; +} + +static void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box) +{ + unsigned int box_ctl = generic_uncore_mmio_box_ctl(box); + struct intel_uncore_type *type = box->pmu->type; + resource_size_t addr; + + if (!box_ctl) { + pr_warn("Uncore type %d box %d: Invalid box control address.\n", + type->type_id, type->box_ids[box->pmu->pmu_idx]); + return; + } + + addr = box_ctl; + box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE); + if (!box->io_addr) { + pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n", + type->type_id, type->box_ids[box->pmu->pmu_idx], + (unsigned long long)addr); + return; + } + + writel(GENERIC_PMON_BOX_CTL_INT, box->io_addr); +} + +static void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(GENERIC_PMON_BOX_CTL_FRZ, box->io_addr); +} + +static void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(0, box->io_addr); +} + +static void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box, +struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if 
(!box->io_addr) + return; + + writel(hwc->config, box->io_addr + hwc->config_base); +} + +static void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!box->io_addr) + return; + + writel(0, box->io_addr + hwc->config_base); +} + +static struct intel_uncore_op
[tip: perf/urgent] perf/x86/intel: Fix a crash caused by zero PEBS status
The following commit has been merged into the perf/urgent branch of tip: Commit-ID: d88d05a9e0b6d9356e97129d4ff9942d765f46ea Gitweb: https://git.kernel.org/tip/d88d05a9e0b6d9356e97129d4ff9942d765f46ea Author:Kan Liang AuthorDate:Fri, 12 Mar 2021 05:21:37 -08:00 Committer: Peter Zijlstra CommitterDate: Tue, 16 Mar 2021 21:44:39 +01:00 perf/x86/intel: Fix a crash caused by zero PEBS status A repeatable crash can be triggered by the perf_fuzzer on some Haswell systems. https://lore.kernel.org/lkml/7170d3b-c17f-1ded-52aa-cc6d9ae99...@maine.edu/ For some old CPUs (HSW and earlier), the PEBS status in a PEBS record may be mistakenly set to 0. To minimize the impact of the defect, commit 01330d7288e0 ("perf/x86: Allow zero PEBS status with only single active event") was introduced to try to avoid dropping the PEBS record for some cases. It adds a check in the intel_pmu_drain_pebs_nhm(), and updates the local pebs_status accordingly. However, it doesn't correct the PEBS status in the PEBS record, which may trigger the crash, especially for the large PEBS. It's possible that all the PEBS records in a large PEBS have the PEBS status 0. If so, the first get_next_pebs_record_by_bit() in the __intel_pmu_pebs_event() returns NULL, so at = NULL. Since it's a large PEBS, the 'count' parameter must be > 1, and the second get_next_pebs_record_by_bit() will crash. Besides the local pebs_status, correct the PEBS status in the PEBS record as well. 
Fixes: 01330d7288e0 ("perf/x86: Allow zero PEBS status with only single active event") Reported-by: Vince Weaver Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/161298-140216-1-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 7ebae18..d32b302 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2010,7 +2010,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d */ if (!pebs_status && cpuc->pebs_enabled && !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1))) - pebs_status = cpuc->pebs_enabled; + pebs_status = p->status = cpuc->pebs_enabled; bit = find_first_bit((unsigned long *)&pebs_status, x86_pmu.max_pebs_events);
[tip: perf/urgent] perf/x86/intel: Fix unchecked MSR access error caused by VLBR_EVENT
The following commit has been merged into the perf/urgent branch of tip: Commit-ID: 2dc0572f2cef87425147658698dce2600b799bd3 Gitweb: https://git.kernel.org/tip/2dc0572f2cef87425147658698dce2600b799bd3 Author:Kan Liang AuthorDate:Fri, 12 Mar 2021 05:21:38 -08:00 Committer: Peter Zijlstra CommitterDate: Tue, 16 Mar 2021 21:44:39 +01:00 perf/x86/intel: Fix unchecked MSR access error caused by VLBR_EVENT On a Haswell machine, the perf_fuzzer managed to trigger this message: [117248.075892] unchecked MSR access error: WRMSR to 0x3f1 (tried to write 0x0400000000000000) at rIP: 0xffffffff8106e4f4 (native_write_msr+0x4/0x20) [117248.089957] Call Trace: [117248.092685] intel_pmu_pebs_enable_all+0x31/0x40 [117248.097737] intel_pmu_enable_all+0xa/0x10 [117248.102210] __perf_event_task_sched_in+0x2df/0x2f0 [117248.107511] finish_task_switch.isra.0+0x15f/0x280 [117248.112765] schedule_tail+0xc/0x40 [117248.116562] ret_from_fork+0x8/0x30 A fake event called VLBR_EVENT may use bit 58 of PEBS_ENABLE if precise_ip is set. Bit 58 is reserved by the HW. Accessing the bit causes the unchecked MSR access error. The fake event doesn't support PEBS. The case should be rejected. 
Fixes: 097e4311cda9 ("perf/x86: Add constraint to create guest LBR event without hw counter") Reported-by: Vince Weaver Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/161298-140216-2-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7bbb5bb..37ce384 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3659,6 +3659,9 @@ static int intel_pmu_hw_config(struct perf_event *event) return ret; if (event->attr.precise_ip) { + if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) + return -EINVAL; + if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type &
[tip: perf/urgent] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR
The following commit has been merged into the perf/urgent branch of tip: Commit-ID: afbef30149587ad46f4780b1e0cc5e219745ce90 Gitweb: https://git.kernel.org/tip/afbef30149587ad46f4780b1e0cc5e219745ce90 Author:Kan Liang AuthorDate:Mon, 30 Nov 2020 11:38:41 -08:00 Committer: Ingo Molnar CommitterDate: Sat, 06 Mar 2021 12:52:44 +01:00 perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR To supply a PID/TID for large PEBS, it requires flushing the PEBS buffer in a context switch. For normal LBRs, a context switch can flip the address space and LBR entries are not tagged with an identifier, we need to wipe the LBR, even for per-cpu events. For LBR callstack, save/restore the stack is required during a context switch. Set PERF_ATTACH_SCHED_CB for the event with large PEBS & LBR. Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20201130193842.10569-2-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5bac48d..7bbb5bb 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3662,8 +3662,10 @@ static int intel_pmu_hw_config(struct perf_event *event) if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & - ~intel_pmu_large_pebs_flags(event))) + ~intel_pmu_large_pebs_flags(event))) { event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS; + event->attach_state |= PERF_ATTACH_SCHED_CB; + } } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); @@ -3676,6 +3678,7 @@ static int intel_pmu_hw_config(struct perf_event *event) ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; + event->attach_state |= PERF_ATTACH_SCHED_CB; /* * BTS is set up 
earlier in this path, so don't account twice
[tip: perf/urgent] perf/core: Flush PMU internal buffers for per-CPU events
The following commit has been merged into the perf/urgent branch of tip: Commit-ID: a5398bffc01fe044848c5024e5e867e407f239b8 Gitweb: https://git.kernel.org/tip/a5398bffc01fe044848c5024e5e867e407f239b8 Author:Kan Liang AuthorDate:Mon, 30 Nov 2020 11:38:40 -08:00 Committer: Ingo Molnar CommitterDate: Sat, 06 Mar 2021 12:52:39 +01:00 perf/core: Flush PMU internal buffers for per-CPU events Sometimes the PMU internal buffers have to be flushed for per-CPU events during a context switch, e.g., large PEBS. Otherwise, the perf tool may report samples in locations that do not belong to the process where the samples are processed, because PEBS does not tag samples with PID/TID. The current code only flushes the buffers for a per-task event. It doesn't check a per-CPU event. Add a new event state flag, PERF_ATTACH_SCHED_CB, to indicate that the PMU internal buffers have to be flushed for this event during a context switch. Add sched_cb_entry and perf_sched_cb_usages back to track the PMU/cpuctx which is required to be flushed. Only need to invoke the sched_task() for per-CPU events in this patch. The per-task events have been handled in perf_event_context_sched_in/out already. 
Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches") Reported-by: Gabriel Marin Originally-by: Namhyung Kim Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20201130193842.10569-1-kan.li...@linux.intel.com --- include/linux/perf_event.h | 2 ++- kernel/events/core.c | 42 + 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fab42cf..3f7f89e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -606,6 +606,7 @@ struct swevent_hlist { #define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 +#define PERF_ATTACH_SCHED_CB 0x20 struct perf_cgroup; struct perf_buffer; @@ -872,6 +873,7 @@ struct perf_cpu_context { struct list_headcgrp_cpuctx_entry; #endif + struct list_headsched_cb_entry; int sched_cb_usage; int online; diff --git a/kernel/events/core.c b/kernel/events/core.c index 0aeca5f..03db40f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -3461,11 +3462,16 @@ unlock: } } +static DEFINE_PER_CPU(struct list_head, sched_cb_list); + void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - --cpuctx->sched_cb_usage; + this_cpu_dec(perf_sched_cb_usages); + + if (!--cpuctx->sched_cb_usage) + list_del(&cpuctx->sched_cb_entry); } @@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - cpuctx->sched_cb_usage++; + if (!cpuctx->sched_cb_usage++) + list_add(&cpuctx->sched_cb_entry, 
this_cpu_ptr(&sched_cb_list)); + + this_cpu_inc(perf_sched_cb_usages); } /* @@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } +static void perf_pmu_sched_task(struct task_struct *prev, + struct task_struct *next, + bool sched_in) +{ + struct perf_cpu_context *cpuctx; + + if (prev == next) + return; + + list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { + /* will be handled in perf_event_context_sched_in/out */ + if (cpuctx->task_ctx) + continue; + + __perf_pmu_sched_task(cpuctx, sched_in); + } +} + static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); @@ -3524,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(task, next, false); + if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); @@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); + + if (__this_cpu_read(perf_sched_cb_u
[tip: perf/urgent] perf/core: Flush PMU internal buffers for per-CPU events
The following commit has been merged into the perf/urgent branch of tip: Commit-ID: e748d3716e0e581401630d36d3ef0fc8fa8f830d Gitweb: https://git.kernel.org/tip/e748d3716e0e581401630d36d3ef0fc8fa8f830d Author:Kan Liang AuthorDate:Mon, 30 Nov 2020 11:38:40 -08:00 Committer: Peter Zijlstra CommitterDate: Mon, 01 Mar 2021 11:02:18 +01:00 perf/core: Flush PMU internal buffers for per-CPU events Sometimes the PMU internal buffers have to be flushed for per-CPU events during a context switch, e.g., large PEBS. Otherwise, the perf tool may report samples in locations that do not belong to the process where the samples are processed, because PEBS does not tag samples with PID/TID. The current code only flushes the buffers for a per-task event. It doesn't check a per-CPU event. Add a new event state flag, PERF_ATTACH_SCHED_CB, to indicate that the PMU internal buffers have to be flushed for this event during a context switch. Add sched_cb_entry and perf_sched_cb_usages back to track the PMU/cpuctx which is required to be flushed. Only need to invoke the sched_task() for per-CPU events in this patch. The per-task events have been handled in perf_event_context_sched_in/out already. 
Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches") Reported-by: Gabriel Marin Originally-by: Namhyung Kim Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201130193842.10569-1-kan.li...@linux.intel.com --- include/linux/perf_event.h | 2 ++- kernel/events/core.c | 42 + 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fab42cf..3f7f89e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -606,6 +606,7 @@ struct swevent_hlist { #define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 +#define PERF_ATTACH_SCHED_CB 0x20 struct perf_cgroup; struct perf_buffer; @@ -872,6 +873,7 @@ struct perf_cpu_context { struct list_headcgrp_cpuctx_entry; #endif + struct list_headsched_cb_entry; int sched_cb_usage; int online; diff --git a/kernel/events/core.c b/kernel/events/core.c index 0aeca5f..03db40f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -3461,11 +3462,16 @@ unlock: } } +static DEFINE_PER_CPU(struct list_head, sched_cb_list); + void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - --cpuctx->sched_cb_usage; + this_cpu_dec(perf_sched_cb_usages); + + if (!--cpuctx->sched_cb_usage) + list_del(&cpuctx->sched_cb_entry); } @@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - cpuctx->sched_cb_usage++; + if (!cpuctx->sched_cb_usage++) + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + + 
this_cpu_inc(perf_sched_cb_usages); } /* @@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } +static void perf_pmu_sched_task(struct task_struct *prev, + struct task_struct *next, + bool sched_in) +{ + struct perf_cpu_context *cpuctx; + + if (prev == next) + return; + + list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { + /* will be handled in perf_event_context_sched_in/out */ + if (cpuctx->task_ctx) + continue; + + __perf_pmu_sched_task(cpuctx, sched_in); + } +} + static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); @@ -3524,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(task, next, false); + if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); @@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); + + if (__this_cpu_read(perf_sched_cb_usages)) + p
[tip: perf/urgent] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR
The following commit has been merged into the perf/urgent branch of tip: Commit-ID: a8abc881981762631a22568d5e4b2c0ce4aeb15c Gitweb: https://git.kernel.org/tip/a8abc881981762631a22568d5e4b2c0ce4aeb15c Author:Kan Liang AuthorDate:Mon, 30 Nov 2020 11:38:41 -08:00 Committer: Peter Zijlstra CommitterDate: Mon, 01 Mar 2021 11:02:19 +01:00 perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR To supply a PID/TID for large PEBS, it requires flushing the PEBS buffer in a context switch. For normal LBRs, a context switch can flip the address space and LBR entries are not tagged with an identifier, we need to wipe the LBR, even for per-cpu events. For LBR callstack, save/restore the stack is required during a context switch. Set PERF_ATTACH_SCHED_CB for the event with large PEBS & LBR. Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201130193842.10569-2-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5bac48d..7bbb5bb 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3662,8 +3662,10 @@ static int intel_pmu_hw_config(struct perf_event *event) if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & - ~intel_pmu_large_pebs_flags(event))) + ~intel_pmu_large_pebs_flags(event))) { event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS; + event->attach_state |= PERF_ATTACH_SCHED_CB; + } } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); @@ -3676,6 +3678,7 @@ static int intel_pmu_hw_config(struct perf_event *event) ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; + event->attach_state |= PERF_ATTACH_SCHED_CB; /* * BTS is set up earlier in this path, so 
don't account twice
[tip: perf/core] perf/x86/intel: Add perf core PMU support for Sapphire Rapids
The following commit has been merged into the perf/core branch of tip: Commit-ID: 61b985e3e775a3a75fda04ce7ef1b1aefc4758bc Gitweb: https://git.kernel.org/tip/61b985e3e775a3a75fda04ce7ef1b1aefc4758bc Author:Kan Liang AuthorDate:Thu, 28 Jan 2021 14:40:10 -08:00 Committer: Peter Zijlstra CommitterDate: Mon, 01 Feb 2021 15:31:37 +01:00 perf/x86/intel: Add perf core PMU support for Sapphire Rapids Add perf core PMU support for the Intel Sapphire Rapids server, which is the successor of the Intel Ice Lake server. The enabling code is based on Ice Lake, but there are several new features introduced. The event encoding is changed and simplified, e.g., the event codes which are below 0x90 are restricted to counters 0-3. The event codes which are above 0x90 are likely to have no restrictions. The event constraints, extra_regs(), and hardware cache events table are changed accordingly. A new Precise Distribution (PDist) facility is introduced, which further minimizes the skid when a precise event is programmed on the GP counter 0. Enable the Precise Distribution (PDist) facility with :ppp event. For this facility to work, the period must be initialized with a value larger than 127. Add spr_limit_period() to apply the limit for :ppp event. Two new data source fields, data block & address block, are added in the PEBS Memory Info Record for the load latency event. To enable the feature, - An auxiliary event has to be enabled together with the load latency event on Sapphire Rapids. A new flag PMU_FL_MEM_LOADS_AUX is introduced to indicate the case. A new event, mem-loads-aux, is exposed to sysfs for the user tool. Add a check in hw_config(). If the auxiliary event is not detected, return a unique error -ENODATA. - The union perf_mem_data_src is extended to support the new fields. - Ice Lake and earlier models do not support block information, but the fields may be set by HW on some machines. 
Add pebs_no_block to explicitly indicate the previous platforms which don't support the new block fields. Accessing the new block fields is ignored on those platforms. A new store Latency facility is introduced, which leverages the PEBS facility where it can provide additional information about sampled stores. The additional information includes the data address, memory auxiliary info (e.g. Data Source, STLB miss) and the latency of the store access. To enable the facility, the new event (0x02cd) has to be programmed on the GP counter 0. A new flag PERF_X86_EVENT_PEBS_STLAT is introduced to indicate the event. The store_latency_data() is introduced to parse the memory auxiliary info. The layout of access latency field of PEBS Memory Info Record has been changed. Two latencies, instruction latency (bit 15:0) and cache access latency (bit 47:32), are recorded. - The cache access latency is similar to previous memory access latency. For loads, the latency starts by the actual cache access until the data is returned by the memory subsystem. For stores, the latency starts when the demand write accesses the L1 data cache and lasts until the cacheline write is completed in the memory subsystem. The cache access latency is stored in low 32bits of the sample type PERF_SAMPLE_WEIGHT_STRUCT. - The instruction latency starts by the dispatch of the load operation for execution and lasts until completion of the instruction it belongs to. Add a new flag PMU_FL_INSTR_LATENCY to indicate the instruction latency support. The instruction latency is stored in the bit 47:32 of the sample type PERF_SAMPLE_WEIGHT_STRUCT. Extend the PERF_METRICS MSR to feature TMA method level 2 metrics. The lower half of the register is the TMA level 1 metrics (legacy). The upper half is also divided into four 8-bit fields for the new level 2 metrics. Expose all eight Topdown metrics events to user space. 
The full description for the SPR features can be found at Intel Architecture Instruction Set Extensions and Future Features Programming Reference, 319433-041. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-5-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 307 - arch/x86/events/intel/ds.c| 118 ++- arch/x86/events/perf_event.h | 12 +- arch/x86/include/asm/perf_event.h | 8 +- include/uapi/linux/perf_event.h | 12 +- 5 files changed, 443 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 37830ac..58cd64e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -275,6 +275,55 @@ static struct extra_reg intel_icl_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct extra_reg intel_spr_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3full, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, M
[tip: perf/core] perf/x86/intel: Support CPUID 10.ECX to disable fixed counters
The following commit has been merged into the perf/core branch of tip: Commit-ID: 32451614da2a9cf4296f90d3606ac77814fb519d Gitweb: https://git.kernel.org/tip/32451614da2a9cf4296f90d3606ac77814fb519d Author:Kan Liang AuthorDate:Thu, 28 Jan 2021 14:40:11 -08:00 Committer: Peter Zijlstra CommitterDate: Mon, 01 Feb 2021 15:31:37 +01:00 perf/x86/intel: Support CPUID 10.ECX to disable fixed counters With Architectural Performance Monitoring Version 5, CPUID 10.ECX cpu leaf indicates the fixed counter enumeration. This extends the previous count to a bitmap which allows disabling even lower fixed counters. It could be used by a Hypervisor. The existing intel_ctrl variable is used to remember the bitmask of the counters. All code that reads all counters is fixed to check this extra bitmask. Suggested-by: Peter Zijlstra (Intel) Originally-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-6-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 8 +++- arch/x86/events/intel/core.c | 34 -- arch/x86/events/perf_event.h | 5 + 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index cf0a52c..6ddeed3 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -255,6 +255,8 @@ static bool check_hw_exists(void) if (ret) goto msr_fail; for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (fixed_counter_disabled(i)) + continue; if (val & (0x03 << i*4)) { bios_fail = 1; val_fail = val; @@ -1531,6 +1533,8 @@ void perf_event_print_debug(void) cpu, idx, prev_left); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx)) + continue; rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -2012,7 +2016,9 @@ static int __init init_hw_perf_events(void) pr_info("... generic registers: %d\n", x86_pmu.num_counters); pr_info("... 
value mask: %016Lx\n", x86_pmu.cntval_mask); pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); + pr_info("... fixed-purpose events: %lu\n", + hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1) + << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl)); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); if (!x86_pmu.read) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 58cd64e..67a7246 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2723,8 +2723,11 @@ static void intel_pmu_reset(void) wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx)) + continue; wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } if (ds) ds->bts_index = ds->bts_buffer_base; @@ -5042,7 +5045,7 @@ __init int intel_pmu_init(void) union cpuid10_eax eax; union cpuid10_ebx ebx; struct event_constraint *c; - unsigned int unused; + unsigned int fixed_mask; struct extra_reg *er; bool pmem = false; int version, i; @@ -5064,7 +5067,7 @@ __init int intel_pmu_init(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired hw_event or not. 
*/ - cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full); if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) return -ENODEV; @@ -5088,12 +5091,15 @@ __init int intel_pmu_init(void) * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events, when not running in a hypervisor: */ - if (version > 1) { + if (version > 1 && version < 5) { int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR); x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, assume); - } + + fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1; + } else if (version >= 5) + x86_pmu.num_counters_fixed = fls
[tip: perf/core] perf/core: Add PERF_SAMPLE_WEIGHT_STRUCT
The following commit has been merged into the perf/core branch of tip: Commit-ID: 2a6c6b7d7ad346f0679d0963cb19b3f0ea7ef32c Gitweb: https://git.kernel.org/tip/2a6c6b7d7ad346f0679d0963cb19b3f0ea7ef32c Author:Kan Liang AuthorDate:Thu, 28 Jan 2021 14:40:07 -08:00 Committer: Peter Zijlstra CommitterDate: Mon, 01 Feb 2021 15:31:36 +01:00 perf/core: Add PERF_SAMPLE_WEIGHT_STRUCT The current PERF_SAMPLE_WEIGHT sample type is very useful to express the cost of an action represented by the sample. This allows the profiler to scale the samples to be more informative to the programmer. It could also help to locate a hotspot, e.g., when profiling by memory latencies, the expensive loads appear higher up in the histograms. But the current PERF_SAMPLE_WEIGHT sample type is solely determined by one factor. This could be a problem, if users want two or more factors to contribute to the weight. For example, Golden Cove core PMU can provide both the instruction latency and the cache latency information as factors for the memory profiling. For current X86 platforms, although meminfo::latency is defined as a u64, only the lower 32 bits include the valid data in practice (no memory access could last longer than 4G cycles). The higher 32 bits can be used to store new factors. Add a new sample type, PERF_SAMPLE_WEIGHT_STRUCT, to indicate the new sample weight structure. It shares the same space as the PERF_SAMPLE_WEIGHT sample type. Users can apply either the PERF_SAMPLE_WEIGHT sample type or the PERF_SAMPLE_WEIGHT_STRUCT sample type to retrieve the sample weight, but they cannot apply both sample types simultaneously. Currently, only X86 and PowerPC use the PERF_SAMPLE_WEIGHT sample type. - For PowerPC, there is nothing changed for the PERF_SAMPLE_WEIGHT sample type. There is no effect for the new PERF_SAMPLE_WEIGHT_STRUCT sample type. PowerPC can restructure the weight field similarly later. 
- For X86, the same value will be dumped for the PERF_SAMPLE_WEIGHT sample type or the PERF_SAMPLE_WEIGHT_STRUCT sample type for now. The following patches will apply the new factors for the PERF_SAMPLE_WEIGHT_STRUCT sample type. The field in the union perf_sample_weight should be shared among different architectures. A generic name is required, but it's hard to abstract a name that applies to all architectures. For example, on X86, the fields are to store all kinds of latency. While on PowerPC, it stores MMCRA[TECX/TECM], which should not be latency. So a general name prefix 'var$NUM' is used here. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-2-git-send-email-kan.li...@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 2 +- arch/x86/events/intel/ds.c | 17 ++--- include/linux/perf_event.h | 4 +-- include/uapi/linux/perf_event.h | 42 ++-- kernel/events/core.c| 11 +--- 5 files changed, 59 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 28206b1..869d999 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2195,7 +2195,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (event->attr.sample_type & PERF_SAMPLE_WEIGHT && ppmu->get_mem_weight) - ppmu->get_mem_weight(&data.weight); + ppmu->get_mem_weight(&data.weight.full); if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 67dbc91..2f54b1f 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -960,7 +960,8 @@ static void adaptive_pebs_record_size_update(void) } #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ - PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \ + PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_WEIGHT_TYPE |\ PERF_SAMPLE_TRANSACTION |\ 
PERF_SAMPLE_DATA_PAGE_SIZE) @@ -987,7 +988,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event) gprs = (sample_type & PERF_SAMPLE_REGS_INTR) && (attr->sample_regs_intr & PEBS_GP_REGS); - tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) && + tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) && ((attr->config & INTEL_ARCH_EVENT_MASK) == x86_pmu.rtm_abort_event); @@ -1369,8 +1370,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * Use latency for weight (only avail with PEB
[tip: perf/core] perf/x86/intel: Factor out intel_update_topdown_event()
The following commit has been merged into the perf/core branch of tip: Commit-ID: 628d923a3c464db98c1c98bb1e0cd50804caf681 Gitweb: https://git.kernel.org/tip/628d923a3c464db98c1c98bb1e0cd50804caf681 Author:Kan Liang AuthorDate:Thu, 28 Jan 2021 14:40:08 -08:00 Committer: Peter Zijlstra CommitterDate: Mon, 01 Feb 2021 15:31:36 +01:00 perf/x86/intel: Factor out intel_update_topdown_event() Similar to Ice Lake, Intel Sapphire Rapids server also supports the topdown performance metrics feature. The difference is that Intel Sapphire Rapids server extends the PERF_METRICS MSR to feature TMA method level two metrics, which will introduce 8 metrics events. Current icl_update_topdown_event() only check 4 level one metrics events. Factor out intel_update_topdown_event() to facilitate the code sharing between Ice Lake and Sapphire Rapids. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-3-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 20 +--- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fe94008..d07408d 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2325,8 +2325,8 @@ static void __icl_update_topdown_event(struct perf_event *event, } } -static void update_saved_topdown_regs(struct perf_event *event, - u64 slots, u64 metrics) +static void update_saved_topdown_regs(struct perf_event *event, u64 slots, + u64 metrics, int metric_end) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2335,7 +2335,7 @@ static void update_saved_topdown_regs(struct perf_event *event, event->hw.saved_slots = slots; event->hw.saved_metric = metrics; - for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) continue; other = cpuc->events[idx]; @@ -2350,7 
+2350,8 @@ static void update_saved_topdown_regs(struct perf_event *event, * The PERF_METRICS and Fixed counter 3 are read separately. The values may be * modify by a NMI. PMU has to be disabled before calling this function. */ -static u64 icl_update_topdown_event(struct perf_event *event) + +static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2366,7 +2367,7 @@ static u64 icl_update_topdown_event(struct perf_event *event) /* read PERF_METRICS */ rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); - for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) continue; other = cpuc->events[idx]; @@ -2392,7 +2393,7 @@ static u64 icl_update_topdown_event(struct perf_event *event) * Don't need to reset the PERF_METRICS and Fixed counter 3. * Because the values will be restored in next schedule in. */ - update_saved_topdown_regs(event, slots, metrics); + update_saved_topdown_regs(event, slots, metrics, metric_end); reset = false; } @@ -2401,12 +2402,17 @@ static u64 icl_update_topdown_event(struct perf_event *event) wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); wrmsrl(MSR_PERF_METRICS, 0); if (event) - update_saved_topdown_regs(event, 0, 0); + update_saved_topdown_regs(event, 0, 0, metric_end); } return slots; } +static u64 icl_update_topdown_event(struct perf_event *event) +{ + return intel_update_topdown_event(event, INTEL_PMC_IDX_TD_BE_BOUND); +} + static void intel_pmu_read_topdown_event(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
[tip: perf/core] perf/x86/intel: Filter unsupported Topdown metrics event
The following commit has been merged into the perf/core branch of tip: Commit-ID: 1ab5f235c176e93adc4f75000aae6c50fea9db00 Gitweb: https://git.kernel.org/tip/1ab5f235c176e93adc4f75000aae6c50fea9db00 Author:Kan Liang AuthorDate:Thu, 28 Jan 2021 14:40:09 -08:00 Committer: Peter Zijlstra CommitterDate: Mon, 01 Feb 2021 15:31:36 +01:00 perf/x86/intel: Filter unsupported Topdown metrics event Intel Sapphire Rapids server will introduce 8 metrics events. Intel Ice Lake only supports 4 metrics events. A perf tool user may mistakenly use the unsupported events via RAW format on Ice Lake. The user can still get a value from the unsupported Topdown metrics event once the following Sapphire Rapids enabling patch is applied. To enable the 8 metrics events on Intel Sapphire Rapids, the INTEL_TD_METRIC_MAX has to be updated, which impacts the is_metric_event(). The is_metric_event() is a generic function. On Ice Lake, the newly added SPR metrics events will be mistakenly accepted as metric events on creation. At runtime, the unsupported Topdown metrics events will be updated. Add a variable num_topdown_events in x86_pmu to indicate the available number of the Topdown metrics event on the platform. Apply the number into is_metric_event(). Only the supported Topdown metrics events should be created as metrics events. Apply the num_topdown_events in icl_update_topdown_event() as well. The function can be reused by the following patch. 
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-4-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 15 +-- arch/x86/events/perf_event.h | 1 + arch/x86/include/asm/perf_event.h | 10 -- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index d07408d..37830ac 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2410,7 +2410,8 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) static u64 icl_update_topdown_event(struct perf_event *event) { - return intel_update_topdown_event(event, INTEL_PMC_IDX_TD_BE_BOUND); + return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE + +x86_pmu.num_topdown_events - 1); } static void intel_pmu_read_topdown_event(struct perf_event *event) @@ -3468,6 +3469,15 @@ static int core_pmu_hw_config(struct perf_event *event) return intel_pmu_bts_config(event); } +#define INTEL_TD_METRIC_AVAILABLE_MAX (INTEL_TD_METRIC_RETIRING + \ +((x86_pmu.num_topdown_events - 1) << 8)) + +static bool is_available_metric_event(struct perf_event *event) +{ + return is_metric_event(event) && + event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX; +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -3541,7 +3551,7 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.config & X86_ALL_EVENT_FLAGS) return -EINVAL; - if (is_metric_event(event)) { + if (is_available_metric_event(event)) { struct perf_event *leader = event->group_leader; /* The metric events don't support sampling. 
*/ @@ -5324,6 +5334,7 @@ __init int intel_pmu_init(void) x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); + x86_pmu.num_topdown_events = 4; x86_pmu.update_topdown_event = icl_update_topdown_event; x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; pr_cont("Icelake events, "); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 978a16e..15343cc 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -775,6 +775,7 @@ struct x86_pmu { /* * Intel perf metrics */ + int num_topdown_events; u64 (*update_topdown_event)(struct perf_event *event); int (*set_topdown_event_period)(struct perf_event *event); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index e2a4c78..7c2c302 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -280,8 +280,14 @@ struct x86_pmu_capability { #define INTEL_TD_METRIC_BAD_SPEC 0x8100 /* Bad speculation metric */ #define INTEL_TD_METRIC_FE_BOUND 0x8200 /* FE bound metric */ #define INTEL_TD_METRIC_BE_BOUND 0x8300 /* BE bound metric */ -#define INTEL_TD_METRIC_MAX
[tip: perf/core] perf/x86/intel/lbr: Fix the return type of get_lbr_cycles()
The following commit has been merged into the perf/core branch of tip: Commit-ID: f8129cd958b395575e5543ce25a8434874b04d3a Gitweb: https://git.kernel.org/tip/f8129cd958b395575e5543ce25a8434874b04d3a Author:Kan Liang AuthorDate:Wed, 25 Nov 2020 13:37:20 -08:00 Committer: Peter Zijlstra CommitterDate: Wed, 09 Dec 2020 17:08:58 +01:00 perf/x86/intel/lbr: Fix the return type of get_lbr_cycles() The cycle count of a timed LBR is always 1 in perf record -D. The cycle count is stored in the first 16 bits of the IA32_LBR_x_INFO register, but get_lbr_cycles() returns a Boolean type. Use u16 to replace the Boolean type. Fixes: 47125db27e47 ("perf/x86/intel/lbr: Support Architectural LBR") Reported-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20201125213720.15692-2-kan.li...@linux.intel.com --- arch/x86/events/intel/lbr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 8961653..e2b0efc 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -919,7 +919,7 @@ static __always_inline bool get_lbr_predicted(u64 info) return !(info & LBR_INFO_MISPRED); } -static __always_inline bool get_lbr_cycles(u64 info) +static __always_inline u16 get_lbr_cycles(u64 info) { if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID))
[tip: perf/core] perf/x86/intel: Fix rtm_abort_event encoding on Ice Lake
The following commit has been merged into the perf/core branch of tip: Commit-ID: 46b72e1bf4fc571da0c29c6fb3e5b2a2107a4c26 Gitweb: https://git.kernel.org/tip/46b72e1bf4fc571da0c29c6fb3e5b2a2107a4c26 Author:Kan Liang AuthorDate:Wed, 25 Nov 2020 13:37:19 -08:00 Committer: Peter Zijlstra CommitterDate: Wed, 09 Dec 2020 17:08:57 +01:00 perf/x86/intel: Fix rtm_abort_event encoding on Ice Lake According to the event list from icelake_core_v1.09.json, the encoding of the RTM_RETIRED.ABORTED event on Ice Lake should be, "EventCode": "0xc9", "UMask": "0x04", "EventName": "RTM_RETIRED.ABORTED", Correct the wrong encoding. Fixes: 6017608936c1 ("perf/x86/intel: Add Icelake support") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20201125213720.15692-1-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 546cc89..6c0d18f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5466,7 +5466,7 @@ __init int intel_pmu_init(void) mem_attr = icl_events_attrs; td_attr = icl_td_events_attrs; tsx_attr = icl_tsx_events_attrs; - x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02); + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); x86_pmu.update_topdown_event = icl_update_topdown_event;
[tip: perf/core] perf/x86/intel: Add Tremont Topdown support
The following commit has been merged into the perf/core branch of tip: Commit-ID: c2208046bba6842dc232a600dc5cafc2fca41078 Gitweb: https://git.kernel.org/tip/c2208046bba6842dc232a600dc5cafc2fca41078 Author:Kan Liang AuthorDate:Tue, 08 Dec 2020 12:05:52 -08:00 Committer: Peter Zijlstra CommitterDate: Wed, 09 Dec 2020 17:08:59 +01:00 perf/x86/intel: Add Tremont Topdown support Tremont has four L1 Topdown events, TOPDOWN_FE_BOUND.ALL, TOPDOWN_BAD_SPECULATION.ALL, TOPDOWN_BE_BOUND.ALL and TOPDOWN_RETIRING.ALL. They are available on GP counters. Export them to sysfs and facilitate the perf stat tool. $perf stat --topdown -- sleep 1 Performance counter stats for 'sleep 1': retiring bad speculation frontend bound backend bound 24.9%16.8%31.7% 26.6% 1.001224610 seconds time elapsed 0.00115 seconds user 0.0 seconds sys Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1607457952-3519-1-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 6c0d18f..d4569bf 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1901,6 +1901,19 @@ static __initconst const u64 tnt_hw_cache_extra_regs }, }; +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_tnt, "event=0x71,umask=0x0"); +EVENT_ATTR_STR(topdown-retiring, td_retiring_tnt, "event=0xc2,umask=0x0"); +EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec_tnt, "event=0x73,umask=0x6"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound_tnt, "event=0x74,umask=0x0"); + +static struct attribute *tnt_events_attrs[] = { + EVENT_PTR(td_fe_bound_tnt), + EVENT_PTR(td_retiring_tnt), + EVENT_PTR(td_bad_spec_tnt), + EVENT_PTR(td_be_bound_tnt), + NULL, +}; + static struct extra_reg intel_tnt_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff0ff9fffull, RSP_0), @@ 
-5174,6 +5187,7 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.get_event_constraints = tnt_get_event_constraints; + td_attr = tnt_events_attrs; extra_attr = slm_format_attr; pr_cont("Tremont events, "); name = "Tremont";
[tip: perf/core] perf/x86/intel: Add Rocket Lake CPU support
The following commit has been merged into the perf/core branch of tip: Commit-ID: b14d0db5b8c86507c9810c1c8162c7d4a3c656bd Gitweb: https://git.kernel.org/tip/b14d0db5b8c86507c9810c1c8162c7d4a3c656bd Author:Kan Liang AuthorDate:Mon, 19 Oct 2020 08:35:25 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 29 Oct 2020 11:00:39 +01:00 perf/x86/intel: Add Rocket Lake CPU support From the perspective of Intel PMU, Rocket Lake is the same as Ice Lake and Tiger Lake. Share the perf code with them. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201019153528.13850-1-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7186098..4d70c7d 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5436,6 +5436,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ICELAKE: case INTEL_FAM6_TIGERLAKE_L: case INTEL_FAM6_TIGERLAKE: + case INTEL_FAM6_ROCKETLAKE: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
[tip: perf/core] perf/x86/intel/uncore: Add Rocket Lake support
The following commit has been merged into the perf/core branch of tip: Commit-ID: 43bc103a8044b9f7963aa1684efbdc9bd60939de Gitweb: https://git.kernel.org/tip/43bc103a8044b9f7963aa1684efbdc9bd60939de Author:Kan Liang AuthorDate:Mon, 19 Oct 2020 08:35:28 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 29 Oct 2020 11:00:40 +01:00 perf/x86/intel/uncore: Add Rocket Lake support For Rocket Lake, the MSR uncore, e.g., CBOX, ARB and CLOCKBOX, are the same as Tiger Lake. Share the perf code with it. For Rocket Lake and Tiger Lake, the 8th CBOX is not mapped into a different MSR space anymore. Add rkl_uncore_msr_init_box() to replace skl_uncore_msr_init_box(). The IMC uncore is similar to Ice Lake. Add new PCIIDs of IMC for Rocket Lake. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201019153528.13850-4-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore.c | 6 ++ arch/x86/events/intel/uncore_snb.c | 20 +++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 86d012b..1db6a71 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1636,6 +1636,11 @@ static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = { .mmio_init = tgl_l_uncore_mmio_init, }; +static const struct intel_uncore_init_fun rkl_uncore_init __initconst = { + .cpu_init = tgl_uncore_cpu_init, + .pci_init = skl_uncore_pci_init, +}; + static const struct intel_uncore_init_fun icx_uncore_init __initconst = { .cpu_init = icx_uncore_cpu_init, .pci_init = icx_uncore_pci_init, @@ -1683,6 +1688,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init), 
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), {}, }; diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index de3d962..6bbf54b 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -60,7 +60,8 @@ #define PCI_DEVICE_ID_INTEL_TGL_U3_IMC 0x9a12 #define PCI_DEVICE_ID_INTEL_TGL_U4_IMC 0x9a14 #define PCI_DEVICE_ID_INTEL_TGL_H_IMC 0x9a36 - +#define PCI_DEVICE_ID_INTEL_RKL_1_IMC 0x4c43 +#define PCI_DEVICE_ID_INTEL_RKL_2_IMC 0x4c53 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK0x00ff @@ -405,6 +406,12 @@ static struct intel_uncore_type *tgl_msr_uncores[] = { NULL, }; +static void rkl_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) + wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); +} + void tgl_uncore_cpu_init(void) { uncore_msr_uncores = tgl_msr_uncores; @@ -412,6 +419,7 @@ void tgl_uncore_cpu_init(void) icl_uncore_cbox.ops = &skl_uncore_msr_ops; icl_uncore_clockbox.ops = &skl_uncore_msr_ops; snb_uncore_arb.ops = &skl_uncore_msr_ops; + skl_uncore_msr_ops.init_box = rkl_uncore_msr_init_box; } enum { @@ -880,6 +888,14 @@ static const struct pci_device_id icl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -973,6 +989,8 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver), IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver),/* 10th Gen Core Mobile */ + IMC_DEV(RKL_1_IMC, &icl_uncore_pci_driver), + 
IMC_DEV(RKL_2_IMC, &icl_uncore_pci_driver), { /* end marker */ } };
[tip: perf/core] perf/x86/intel: Add event constraint for CYCLE_ACTIVITY.STALLS_MEM_ANY
The following commit has been merged into the perf/core branch of tip: Commit-ID: 306e3e91edf1c6739a55312edd110d298ff498dd Gitweb: https://git.kernel.org/tip/306e3e91edf1c6739a55312edd110d298ff498dd Author:Kan Liang AuthorDate:Mon, 19 Oct 2020 09:45:29 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 29 Oct 2020 11:00:41 +01:00 perf/x86/intel: Add event constraint for CYCLE_ACTIVITY.STALLS_MEM_ANY The event CYCLE_ACTIVITY.STALLS_MEM_ANY (0x14a3) should be available on all 8 GP counters on ICL, but it's only scheduled on the first four counters due to the current ICL constraint table. Add a line for the CYCLE_ACTIVITY.STALLS_MEM_ANY event in the ICL constraint table. Correct the comments for the CYCLE_ACTIVITY.CYCLES_MEM_ANY event. Fixes: 6017608936c1 ("perf/x86/intel: Add Icelake support") Reported-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20201019164529.32154-1-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 4d70c7d..0e590c5 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -257,7 +257,8 @@ static struct event_constraint intel_icl_event_constraints[] = { INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf), INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff), /* CYCLE_ACTIVITY.STALLS_TOTAL */ - INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */ + INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.CYCLES_MEM_ANY */ + INTEL_UEVENT_CONSTRAINT(0x14a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */ INTEL_EVENT_CONSTRAINT(0xa3, 0xf), /* CYCLE_ACTIVITY.* */ INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf),
[tip: perf/core] perf/x86/intel: Support PERF_SAMPLE_DATA_PAGE_SIZE
The following commit has been merged into the perf/core branch of tip: Commit-ID: 76a5433f95f32d8a17c9f836be2084ed947c466b Gitweb: https://git.kernel.org/tip/76a5433f95f32d8a17c9f836be2084ed947c466b Author:Kan Liang AuthorDate:Thu, 01 Oct 2020 06:57:47 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 29 Oct 2020 11:00:38 +01:00 perf/x86/intel: Support PERF_SAMPLE_DATA_PAGE_SIZE The new sample type, PERF_SAMPLE_DATA_PAGE_SIZE, requires the virtual address. Update the data->addr if the sample type is set. The large PEBS is disabled with the sample type, because perf doesn't support munmap tracking yet. The PEBS buffer for large PEBS cannot be flushed for each munmap. Wrong page size may be calculated. The large PEBS can be enabled later separately when munmap tracking is supported. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201001135749.2804-3-kan.li...@linux.intel.com --- arch/x86/events/intel/ds.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 404315d..444e5f0 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -959,7 +959,8 @@ static void adaptive_pebs_record_size_update(void) #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \ - PERF_SAMPLE_TRANSACTION) + PERF_SAMPLE_TRANSACTION |\ + PERF_SAMPLE_DATA_PAGE_SIZE) static u64 pebs_update_adaptive_cfg(struct perf_event *event) { @@ -1335,6 +1336,10 @@ static u64 get_data_src(struct perf_event *event, u64 aux) return val; } +#define PERF_SAMPLE_ADDR_TYPE (PERF_SAMPLE_ADDR | \ +PERF_SAMPLE_PHYS_ADDR |\ +PERF_SAMPLE_DATA_PAGE_SIZE) + static void setup_pebs_fixed_sample_data(struct perf_event *event, struct pt_regs *iregs, void *__pebs, struct perf_sample_data *data, @@ -1449,7 +1454,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, } - if ((sample_type & 
(PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) && + if ((sample_type & PERF_SAMPLE_ADDR_TYPE) && x86_pmu.intel_cap.pebs_format >= 1) data->addr = pebs->dla; @@ -1577,7 +1582,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (sample_type & PERF_SAMPLE_DATA_SRC) data->data_src.val = get_data_src(event, meminfo->aux); - if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) + if (sample_type & PERF_SAMPLE_ADDR_TYPE) data->addr = meminfo->address; if (sample_type & PERF_SAMPLE_TRANSACTION)
[tip: perf/core] perf/x86/cstate: Add Rocket Lake CPU support
The following commit has been merged into the perf/core branch of tip: Commit-ID: cbea56395cba13173fffb9251cb23f146b51c792 Gitweb: https://git.kernel.org/tip/cbea56395cba13173fffb9251cb23f146b51c792 Author:Kan Liang AuthorDate:Mon, 19 Oct 2020 08:35:26 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 29 Oct 2020 11:00:40 +01:00 perf/x86/cstate: Add Rocket Lake CPU support From the perspective of Intel cstate residency counters, Rocket Lake is the same as Ice Lake and Tiger Lake. Share the code with them. Update the comments for Rocket Lake. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201019153528.13850-2-kan.li...@linux.intel.com --- arch/x86/events/intel/cstate.c | 19 ++- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 442e1ed..a161a0b 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -51,46 +51,46 @@ *perf code: 0x02 *Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, - * TNT + * TNT,RKL *Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter *perf code: 0x03 *Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, - * ICL,TGL + * ICL,TGL,RKL *Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. *perf code: 0x00 *Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, - * KBL,CML,ICL,TGL,TNT + * KBL,CML,ICL,TGL,TNT,RKL *Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. *perf code: 0x01 *Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, - * GLM,CNL,KBL,CML,ICL,TGL,TNT + * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL *Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. *perf code: 0x02 *Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, - * TNT + * TNT,RKL *Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. 
*perf code: 0x03 *Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL, - * KBL,CML,ICL,TGL + * KBL,CML,ICL,TGL,RKL *Scope: Package (physical package) * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. *perf code: 0x04 - *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL + *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL *Scope: Package (physical package) * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. *perf code: 0x05 - *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL + *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL *Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. *perf code: 0x06 *Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, - * TNT + * TNT,RKL *Scope: Package (physical package) * */ @@ -649,6 +649,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
[tip: perf/core] perf/x86/msr: Add Rocket Lake CPU support
The following commit has been merged into the perf/core branch of tip: Commit-ID: 907a196fbc70a48338ee8512da32f70fd33c97eb Gitweb: https://git.kernel.org/tip/907a196fbc70a48338ee8512da32f70fd33c97eb Author:Kan Liang AuthorDate:Mon, 19 Oct 2020 08:35:27 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 29 Oct 2020 11:00:40 +01:00 perf/x86/msr: Add Rocket Lake CPU support Like Ice Lake and Tiger Lake, PPERF and SMI_COUNT MSRs are also supported by Rocket Lake. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201019153528.13850-3-kan.li...@linux.intel.com --- arch/x86/events/msr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 4be8f9c..680404c 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -99,6 +99,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ICELAKE_D: case INTEL_FAM6_TIGERLAKE_L: case INTEL_FAM6_TIGERLAKE: + case INTEL_FAM6_ROCKETLAKE: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break;
[tip: perf/core] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE
The following commit has been merged into the perf/core branch of tip: Commit-ID: 8d97e71811aaafe4abf611dc24822fd6e73df1a1 Gitweb: https://git.kernel.org/tip/8d97e71811aaafe4abf611dc24822fd6e73df1a1 Author:Kan Liang AuthorDate:Thu, 01 Oct 2020 06:57:46 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 29 Oct 2020 11:00:38 +01:00 perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE Current perf can report both virtual addresses and physical addresses, but not the MMU page size. Without the MMU page size information of the utilized page, users cannot decide whether to promote/demote large pages to optimize memory usage. Add a new sample type for the data MMU page size. Current perf already has a facility to collect data virtual addresses. A page walker is required to walk the page tables and calculate the MMU page size from a given virtual address. On some platforms, e.g., X86, the page walker is invoked in an NMI handler. So the page walker must be NMI-safe and low overhead. Besides, the page walker should work for both user and kernel virtual addresses. The existing generic page walker, e.g., walk_page_range_novma(), is a little bit complex and doesn't guarantee NMI safety. The follow_page() is only for user virtual addresses. Add a new function perf_get_page_size() to walk the page tables and calculate the MMU page size. In the function: - Interrupts have to be disabled to prevent any teardown of the page tables. - For user space threads, the current->mm is used for the page walker. For kernel threads and the like, the current->mm is NULL. The init_mm is used for the page walker. The active_mm is not used here, because it can be NULL. Quote from Peter Zijlstra, "context_switch() can set prev->active_mm to NULL when it transfers it to @next. It does this before @current is updated. So an NMI that comes in between this active_mm swizzling and updating @current will see !active_mm." - The MMU page size is calculated from the page table level. 
The method should work for all architectures, but it has only been verified on X86. Should there be some architectures, which support perf, where the method doesn't work, it can be fixed later separately. Reporting the wrong page size would not be fatal for the architecture. Some under discussion features may impact the method in the future. Quote from Dave Hansen, "There are lots of weird things folks are trying to do with the page tables, like Address Space Isolation. For instance, if you get a perf NMI when running userspace, current->mm->pgd is *different* than the PGD that was in use when userspace was running. It's close enough today, but it might not stay that way." If the case happens later, lots of consecutive page walk errors will happen. The worst case is that lots of page-size '0' are returned, which would not be fatal. In the perf tool, a check is implemented to detect this case. Once it happens, a kernel patch could be implemented accordingly then. Suggested-by: Peter Zijlstra Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201001135749.2804-2-kan.li...@linux.intel.com --- include/linux/perf_event.h | 1 +- include/uapi/linux/perf_event.h | 4 +- kernel/events/core.c| 103 +++- 3 files changed, 107 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0c19d27..7e3785d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1034,6 +1034,7 @@ struct perf_sample_data { u64 phys_addr; u64 cgroup; + u64 data_page_size; } cacheline_aligned; /* default value for data source */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 077e7ee..cc6ea34 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -143,8 +143,9 @@ enum perf_event_sample_format { PERF_SAMPLE_PHYS_ADDR = 1U << 19, PERF_SAMPLE_AUX = 1U << 20, PERF_SAMPLE_CGROUP = 1U << 21, + PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, - 
PERF_SAMPLE_MAX = 1U << 22, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 23, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; @@ -896,6 +897,7 @@ enum perf_event_type { * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR * { u64 size; *char data[size]; } && PERF_SAMPLE_AUX +* { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/co
[tip: perf/core] powerpc/perf: Support PERF_SAMPLE_DATA_PAGE_SIZE
The following commit has been merged into the perf/core branch of tip: Commit-ID: 4cb6a42e4c4bc1902644eced67563e7405d4588e Gitweb: https://git.kernel.org/tip/4cb6a42e4c4bc1902644eced67563e7405d4588e Author:Kan Liang AuthorDate:Thu, 01 Oct 2020 06:57:48 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 29 Oct 2020 11:00:39 +01:00 powerpc/perf: Support PERF_SAMPLE_DATA_PAGE_SIZE The new sample type, PERF_SAMPLE_DATA_PAGE_SIZE, requires the virtual address. Update the data->addr if the sample type is set. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201001135749.2804-4-kan.li...@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 78fe349..ce22bd2 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2065,6 +2065,9 @@ static struct pmu power_pmu = { .sched_task = power_pmu_sched_task, }; +#define PERF_SAMPLE_ADDR_TYPE (PERF_SAMPLE_ADDR | \ + PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_DATA_PAGE_SIZE) /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled @@ -2120,8 +2123,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, perf_sample_data_init(&data, ~0ULL, event->hw.last_period); - if (event->attr.sample_type & - (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) + if (event->attr.sample_type & PERF_SAMPLE_ADDR_TYPE) perf_get_data_addr(event, regs, &data.addr); if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) {
[tip: perf/core] perf/x86/intel: Check perf metrics feature for each CPU
The following commit has been merged into the perf/core branch of tip: Commit-ID: 80a5ce116fc084e8a25d5a936617699e2931b611 Gitweb: https://git.kernel.org/tip/80a5ce116fc084e8a25d5a936617699e2931b611 Author:Kan Liang AuthorDate:Thu, 01 Oct 2020 14:17:11 -07:00 Committer: Peter Zijlstra CommitterDate: Sat, 03 Oct 2020 16:30:56 +02:00 perf/x86/intel: Check perf metrics feature for each CPU It might be possible that different CPUs have different CPU metrics on a platform. In this case, writing the GLOBAL_CTRL_EN_PERF_METRICS bit to the GLOBAL_CTRL register of a CPU, which doesn't support the TopDown perf metrics feature, causes MSR access error. Current TopDown perf metrics feature is enumerated using the boot CPU's PERF_CAPABILITIES MSR. The MSR only indicates the boot CPU supports this feature. Check the PERF_CAPABILITIES MSR for each CPU. If any CPU doesn't support the perf metrics feature, disable the feature globally. Fixes: 59a854e2f3b9 ("perf/x86/intel: Support TopDown metrics on Ice Lake") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201001211711.25708-1-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index bdf28d2..7186098 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4083,6 +4083,17 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.counter_freezing) enable_counter_freeze(); + /* Disable perf metrics if any added CPU doesn't support it. */ + if (x86_pmu.intel_cap.perf_metrics) { + union perf_capabilities perf_cap; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities); + if (!perf_cap.perf_metrics) { + x86_pmu.intel_cap.perf_metrics = 0; + x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); + } + } + if (!cpuc->shared_regs) return;
[tip: perf/core] perf/x86/intel/uncore: Update Ice Lake uncore units
The following commit has been merged into the perf/core branch of tip: Commit-ID: 8f5d41f3a0f495435c88ebba8fc150c931c10fef Gitweb: https://git.kernel.org/tip/8f5d41f3a0f495435c88ebba8fc150c931c10fef Author:Kan Liang AuthorDate:Fri, 25 Sep 2020 06:49:04 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 29 Sep 2020 09:57:01 +02:00 perf/x86/intel/uncore: Update Ice Lake uncore units There are some updates for the Icelake model specific uncore performance monitors. (The update can be found at 10th generation intel core processors families specification update Revision 004, ICL068) 1) Counter 0 of ARB uncore unit is not available for software use 2) The global 'enable bit' (bit 29) and 'freeze bit' (bit 31) of MSR_UNC_PERF_GLOBAL_CTRL cannot be used to control counter behavior. The local enable in the event select MSR needs to be used instead. Accessing the modified bit/registers will be ignored by HW. Users may observe inaccurate results with the current code. The changes of the MSR_UNC_PERF_GLOBAL_CTRL imply that groups cannot be read atomically anymore. Although the error of the result for a group becomes a bit bigger, it is still far lower than not using a group. The group support is still kept. Only remove the *_box() related implementation. Since the counter 0 of ARB uncore unit is not available, update the MSR address for the ARB uncore unit. There is no change for IMC uncore unit, which only includes free-running counters. 
Fixes: 6e394376ee89 ("perf/x86/intel/uncore: Add Intel Icelake uncore support") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200925134905.8839-2-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore_snb.c | 29 + 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index d2d43b6..2bdfcf8 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -126,6 +126,10 @@ #define ICL_UNC_CBO_0_PER_CTR0 0x702 #define ICL_UNC_CBO_MSR_OFFSET 0x8 +/* ICL ARB register */ +#define ICL_UNC_ARB_PER_CTR0x3b1 +#define ICL_UNC_ARB_PERFEVTSEL 0x3b3 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); @@ -313,6 +317,12 @@ void skl_uncore_cpu_init(void) snb_uncore_arb.ops = &skl_uncore_msr_ops; } +static struct intel_uncore_ops icl_uncore_msr_ops = { + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + static struct intel_uncore_type icl_uncore_cbox = { .name = "cbox", .num_counters = 4, @@ -321,7 +331,7 @@ static struct intel_uncore_type icl_uncore_cbox = { .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, .event_mask = SNB_UNC_RAW_EVENT_MASK, .msr_offset = ICL_UNC_CBO_MSR_OFFSET, - .ops= &skl_uncore_msr_ops, + .ops= &icl_uncore_msr_ops, .format_group = &snb_uncore_format_group, }; @@ -350,13 +360,25 @@ static struct intel_uncore_type icl_uncore_clockbox = { .single_fixed = 1, .event_mask = SNB_UNC_CTL_EV_SEL_MASK, .format_group = &icl_uncore_clock_format_group, - .ops= &skl_uncore_msr_ops, + .ops= &icl_uncore_msr_ops, .event_descs= icl_uncore_events, }; +static struct intel_uncore_type icl_uncore_arb = { + .name = "arb", + .num_counters = 1, + .num_boxes = 1, + .perf_ctr_bits = 44, + .perf_ctr = ICL_UNC_ARB_PER_CTR, + .event_ctl 
= ICL_UNC_ARB_PERFEVTSEL, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .ops= &icl_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + static struct intel_uncore_type *icl_msr_uncores[] = { &icl_uncore_cbox, - &snb_uncore_arb, + &icl_uncore_arb, &icl_uncore_clockbox, NULL, }; @@ -374,7 +396,6 @@ void icl_uncore_cpu_init(void) { uncore_msr_uncores = icl_msr_uncores; icl_uncore_cbox.num_boxes = icl_get_cbox_num(); - snb_uncore_arb.ops = &skl_uncore_msr_ops; } static struct intel_uncore_type *tgl_msr_uncores[] = {
[tip: perf/core] perf/x86/intel/uncore: Reduce the number of CBOX counters
The following commit has been merged into the perf/core branch of tip: Commit-ID: ee139385432e919f4d1f59b80edbc073cdae1391 Gitweb: https://git.kernel.org/tip/ee139385432e919f4d1f59b80edbc073cdae1391 Author:Kan Liang AuthorDate:Fri, 25 Sep 2020 06:49:05 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 29 Sep 2020 09:57:01 +02:00 perf/x86/intel/uncore: Reduce the number of CBOX counters An oops is triggered by the fuzzy test. [ 327.853081] unchecked MSR access error: RDMSR from 0x70c at rIP: 0xc082c820 (uncore_msr_read_counter+0x10/0x50 [intel_uncore]) [ 327.853083] Call Trace: [ 327.853085] [ 327.853089] uncore_pmu_event_start+0x85/0x170 [intel_uncore] [ 327.853093] uncore_pmu_event_add+0x1a4/0x410 [intel_uncore] [ 327.853097] ? event_sched_in.isra.118+0xca/0x240 There are 2 GP counters for each CBOX, but the current code claims 4 counters. Accessing the invalid registers triggers the oops. Fixes: 6e394376ee89 ("perf/x86/intel/uncore: Add Intel Icelake uncore support") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200925134905.8839-3-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore_snb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 2bdfcf8..de3d962 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -325,7 +325,7 @@ static struct intel_uncore_ops icl_uncore_msr_ops = { static struct intel_uncore_type icl_uncore_cbox = { .name = "cbox", - .num_counters = 4, + .num_counters = 2, .perf_ctr_bits = 44, .perf_ctr = ICL_UNC_CBO_0_PER_CTR0, .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
[tip: perf/core] perf/x86/intel/uncore: Split the Ice Lake and Tiger Lake MSR uncore support
The following commit has been merged into the perf/core branch of tip: Commit-ID: 8abbcfefb5f7afabab4578bedd7cd400800cb039 Gitweb: https://git.kernel.org/tip/8abbcfefb5f7afabab4578bedd7cd400800cb039 Author:Kan Liang AuthorDate:Fri, 25 Sep 2020 06:49:03 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 29 Sep 2020 09:57:00 +02:00 perf/x86/intel/uncore: Split the Ice Lake and Tiger Lake MSR uncore support Previously, the MSR uncore for the Ice Lake and Tiger Lake are identical. The code path is shared. However, with recent update, the global MSR_UNC_PERF_GLOBAL_CTRL register and ARB uncore unit are changed for the Ice Lake. Split the Ice Lake and Tiger Lake MSR uncore support. The changes only impact the MSR ops() and the ARB uncore unit. Other codes can still be shared between the Ice Lake and the Tiger Lake. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200925134905.8839-1-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore.c | 4 ++-- arch/x86/events/intel/uncore.h | 1 + arch/x86/events/intel/uncore_snb.c | 16 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index ce0a5ba..86d012b 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1627,12 +1627,12 @@ static const struct intel_uncore_init_fun icl_uncore_init __initconst = { }; static const struct intel_uncore_init_fun tgl_uncore_init __initconst = { - .cpu_init = icl_uncore_cpu_init, + .cpu_init = tgl_uncore_cpu_init, .mmio_init = tgl_uncore_mmio_init, }; static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = { - .cpu_init = icl_uncore_cpu_init, + .cpu_init = tgl_uncore_cpu_init, .mmio_init = tgl_l_uncore_mmio_init, }; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index df544bc..83d2a7d 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -568,6 +568,7 @@ void 
snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); void skl_uncore_cpu_init(void); void icl_uncore_cpu_init(void); +void tgl_uncore_cpu_init(void); void tgl_uncore_mmio_init(void); void tgl_l_uncore_mmio_init(void); int snb_pci2phy_map_init(int devid); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index cb94ba8..d2d43b6 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -377,6 +377,22 @@ void icl_uncore_cpu_init(void) snb_uncore_arb.ops = &skl_uncore_msr_ops; } +static struct intel_uncore_type *tgl_msr_uncores[] = { + &icl_uncore_cbox, + &snb_uncore_arb, + &icl_uncore_clockbox, + NULL, +}; + +void tgl_uncore_cpu_init(void) +{ + uncore_msr_uncores = tgl_msr_uncores; + icl_uncore_cbox.num_boxes = icl_get_cbox_num(); + icl_uncore_cbox.ops = &skl_uncore_msr_ops; + icl_uncore_clockbox.ops = &skl_uncore_msr_ops; + snb_uncore_arb.ops = &skl_uncore_msr_ops; +} + enum { SNB_PCI_UNCORE_IMC, };
[tip: perf/core] perf/x86/intel: Fix Ice Lake event constraint table
The following commit has been merged into the perf/core branch of tip: Commit-ID: 010cb00265f150bf82b23c02ad1fb87ce5c781e1 Gitweb: https://git.kernel.org/tip/010cb00265f150bf82b23c02ad1fb87ce5c781e1 Author:Kan Liang AuthorDate:Mon, 28 Sep 2020 06:47:26 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 29 Sep 2020 09:57:02 +02:00 perf/x86/intel: Fix Ice Lake event constraint table An error occurs when sampling non-PEBS INST_RETIRED.PREC_DIST(0x01c0) event. perf record -e cpu/event=0xc0,umask=0x01/ -- sleep 1 Error: The sys_perf_event_open() syscall returned with 22 (Invalid argument) for event (cpu/event=0xc0,umask=0x01/). /bin/dmesg | grep -i perf may provide additional information. The idxmsk64 of the event is set to 0. The event can never be successfully scheduled. The event should be limited to the fixed counter 0. Fixes: 6017608936c1 ("perf/x86/intel: Add Icelake support") Reported-by: Yi, Ammy Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20200928134726.13090-1-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 75dea67..bdf28d2 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -243,7 +243,7 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = { static struct event_constraint intel_icl_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0x1c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
[tip: perf/core] perf/x86/msr: Add Jasper Lake support
The following commit has been merged into the perf/core branch of tip: Commit-ID: c3bb8a9fa31b99f5b7d2e45cd0a10db91349f4c9 Gitweb: https://git.kernel.org/tip/c3bb8a9fa31b99f5b7d2e45cd0a10db91349f4c9 Author:Kan Liang AuthorDate:Mon, 28 Sep 2020 05:30:42 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 29 Sep 2020 09:57:02 +02:00 perf/x86/msr: Add Jasper Lake support The Jasper Lake processor is also a Tremont microarchitecture. From the perspective of perf MSR, there is nothing changed compared with Elkhart Lake. Share the code path with Elkhart Lake. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1601296242-32763-2-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/msr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index a949f6f..4be8f9c 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -78,6 +78,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_TREMONT_D: case INTEL_FAM6_ATOM_TREMONT: + case INTEL_FAM6_ATOM_TREMONT_L: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM:
[tip: perf/core] perf/x86/intel: Add Jasper Lake support
The following commit has been merged into the perf/core branch of tip: Commit-ID: dbfd638889a0396f5fe14ff3cc2263ec1e1cac62 Gitweb: https://git.kernel.org/tip/dbfd638889a0396f5fe14ff3cc2263ec1e1cac62 Author:Kan Liang AuthorDate:Mon, 28 Sep 2020 05:30:41 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 29 Sep 2020 09:57:01 +02:00 perf/x86/intel: Add Jasper Lake support The Jasper Lake processor is also a Tremont microarchitecture. From the perspective of Intel PMU, there is nothing changed compared with Elkhart Lake. Share the perf code with Elkhart Lake. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1601296242-32763-1-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c72e490..75dea67 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5135,6 +5135,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ATOM_TREMONT_D: case INTEL_FAM6_ATOM_TREMONT: + case INTEL_FAM6_ATOM_TREMONT_L: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids));
[tip: perf/core] perf/x86/intel/uncore: Fix the scale of the IMC free-running events
The following commit has been merged into the perf/core branch of tip: Commit-ID: 8191016a026b8dfbb14dea64efc8e723ee99fe65 Gitweb: https://git.kernel.org/tip/8191016a026b8dfbb14dea64efc8e723ee99fe65 Author:Kan Liang AuthorDate:Mon, 28 Sep 2020 06:32:40 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 29 Sep 2020 09:57:02 +02:00 perf/x86/intel/uncore: Fix the scale of the IMC free-running events The "MiB" result of the IMC free-running bandwidth events, uncore_imc_free_running/read/ and uncore_imc_free_running/write/ are 16 times too small. The "MiB" value equals the raw IMC free-running bandwidth counter value times a "scale" which is inaccurate. The IMC free-running bandwidth events should be incremented per 64B cache line, not DWs (4 bytes). The "scale" should be 6.103515625e-5. Fix the "scale" for both Snow Ridge and Ice Lake. Fixes: 2b3b76b5ec67 ("perf/x86/intel/uncore: Add Ice Lake server uncore support") Fixes: ee49532b38dd ("perf/x86/intel/uncore: Add IMC uncore support for Snow Ridge") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200928133240.12977-1-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 3f1e75f..7bdb182 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4807,10 +4807,10 @@ static struct uncore_event_desc snr_uncore_imc_freerunning_events[] = { INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(write.scale,"3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(write.scale,"6.103515625e-5"), 
INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), { /* end: all zeroes */ }, }; @@ -5268,17 +5268,17 @@ static struct uncore_event_desc icx_uncore_imc_freerunning_events[] = { INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(write.scale,"3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(write.scale,"6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(ddrt_read, "event=0xff,umask=0x30"), - INTEL_UNCORE_EVENT_DESC(ddrt_read.scale,"3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(ddrt_read.scale,"6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(ddrt_read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(ddrt_write, "event=0xff,umask=0x31"), - INTEL_UNCORE_EVENT_DESC(ddrt_write.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(ddrt_write.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(ddrt_write.unit,"MiB"), { /* end: all zeroes */ }, };
[tip: perf/core] perf/x86/intel/uncore: Factor out uncore_pci_find_dev_pmu()
The following commit has been merged into the perf/core branch of tip: Commit-ID: 8ed2ccaa3fa990be61619a61b9bc3914eefdc18f Gitweb: https://git.kernel.org/tip/8ed2ccaa3fa990be61619a61b9bc3914eefdc18f Author:Kan Liang AuthorDate:Mon, 14 Sep 2020 07:34:16 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 24 Sep 2020 15:55:50 +02:00 perf/x86/intel/uncore: Factor out uncore_pci_find_dev_pmu() When an uncore PCI sub driver gets a remove notification, the corresponding PMU has to be retrieved and unregistered. The codes, which find the corresponding PMU by comparing the pci_device_id table, can be shared. Factor out uncore_pci_find_dev_pmu(), which will be used later. There is no functional change. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1600094060-82746-3-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore.c | 48 ++--- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index e14b03f..f6ff1b9 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1008,6 +1008,37 @@ static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, return 0; } + +/* + * Find the PMU of a PCI device. + * @pdev: The PCI device. + * @ids: The ID table of the available PCI devices with a PMU. 
+ */ +static struct intel_uncore_pmu * +uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) +{ + struct intel_uncore_pmu *pmu = NULL; + struct intel_uncore_type *type; + kernel_ulong_t data; + unsigned int devfn; + + while (ids && ids->vendor) { + if ((ids->vendor == pdev->vendor) && + (ids->device == pdev->device)) { + data = ids->driver_data; + devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(data), + UNCORE_PCI_DEV_FUNC(data)); + if (devfn == pdev->devfn) { + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(data)]; + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(data)]; + break; + } + } + ids++; + } + return pmu; +} + /* * add a pci uncore device */ @@ -1039,21 +1070,8 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id */ if (id->driver_data & ~0x) { struct pci_driver *pci_drv = pdev->driver; - const struct pci_device_id *ids = pci_drv->id_table; - unsigned int devfn; - - while (ids && ids->vendor) { - if ((ids->vendor == pdev->vendor) && - (ids->device == pdev->device)) { - devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data), - UNCORE_PCI_DEV_FUNC(ids->driver_data)); - if (devfn == pdev->devfn) { - pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; - break; - } - } - ids++; - } + + pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table); if (pmu == NULL) return -ENODEV; } else {
[tip: perf/core] perf/x86/intel/uncore: Factor out uncore_pci_get_dev_die_info()
The following commit has been merged into the perf/core branch of tip: Commit-ID: fe6507338d635f283e9618b5eaa35f503a8c375b Gitweb: https://git.kernel.org/tip/fe6507338d635f283e9618b5eaa35f503a8c375b Author:Kan Liang AuthorDate:Mon, 14 Sep 2020 07:34:15 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 24 Sep 2020 15:55:50 +02:00 perf/x86/intel/uncore: Factor out uncore_pci_get_dev_die_info() The socket and die information is required to register/unregister a PMU in the uncore PCI sub driver. The codes, which get the socket and die information from a BUS number, can be shared. Factor out uncore_pci_get_dev_die_info(), which will be used later. There is no functional change. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1600094060-82746-2-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore.c | 31 +++ 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index d5c6d3b..e14b03f 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -989,6 +989,26 @@ uncore_types_init(struct intel_uncore_type **types, bool setid) } /* + * Get the die information of a PCI device. + * @pdev: The PCI device. + * @phys_id: The physical socket id which the device maps to. + * @die: The die id which the device maps to. + */ +static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, + int *phys_id, int *die) +{ + *phys_id = uncore_pcibus_to_physid(pdev->bus); + if (*phys_id < 0) + return -ENODEV; + + *die = (topology_max_die_per_package() > 1) ? 
*phys_id : + topology_phys_to_logical_pkg(*phys_id); + if (*die < 0) + return -EINVAL; + + return 0; +} +/* * add a pci uncore device */ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) @@ -998,14 +1018,9 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id struct intel_uncore_box *box; int phys_id, die, ret; - phys_id = uncore_pcibus_to_physid(pdev->bus); - if (phys_id < 0) - return -ENODEV; - - die = (topology_max_die_per_package() > 1) ? phys_id : - topology_phys_to_logical_pkg(phys_id); - if (die < 0) - return -EINVAL; + ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die); + if (ret) + return ret; if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { int idx = UNCORE_PCI_DEV_IDX(id->driver_data);
[tip: perf/core] perf/x86/intel/uncore: Factor out uncore_pci_pmu_unregister()
The following commit has been merged into the perf/core branch of tip: Commit-ID: cdcce92a3a03bccbb0b4a0342fc7e279fc507bc3 Gitweb: https://git.kernel.org/tip/cdcce92a3a03bccbb0b4a0342fc7e279fc507bc3 Author:Kan Liang AuthorDate:Mon, 14 Sep 2020 07:34:18 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 24 Sep 2020 15:55:51 +02:00 perf/x86/intel/uncore: Factor out uncore_pci_pmu_unregister() The PMU unregistration in the uncore PCI sub driver is similar to the normal PMU unregistration for a PCI device. The code to unregister a PCI PMU can be shared. Factor out uncore_pci_pmu_unregister(), which will be used later. Use uncore_pci_get_dev_die_info() to replace the code which retrieves the socket and die information. The pci_set_drvdata() is not included in uncore_pci_pmu_unregister() as well, because the uncore PCI sub driver will not touch the private driver data pointer of the device. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1600094060-82746-5-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore.c | 35 +++-- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 6c6f8b3..747d237 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1137,18 +1137,38 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id return ret; } +/* + * Unregister the PMU of a PCI device + * @pmu: The corresponding PMU is unregistered. + * @phys_id: The physical socket id which the device maps to. + * @die: The die id which the device maps to. 
+ */ +static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, + int phys_id, int die) +{ + struct intel_uncore_box *box = pmu->boxes[die]; + + if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) + return; + + pmu->boxes[die] = NULL; + if (atomic_dec_return(&pmu->activeboxes) == 0) + uncore_pmu_unregister(pmu); + uncore_box_exit(box); + kfree(box); +} + static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box; struct intel_uncore_pmu *pmu; int i, phys_id, die; - phys_id = uncore_pcibus_to_physid(pdev->bus); + if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die)) + return; box = pci_get_drvdata(pdev); if (!box) { - die = (topology_max_die_per_package() > 1) ? phys_id : - topology_phys_to_logical_pkg(phys_id); for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { if (uncore_extra_pci_dev[die].dev[i] == pdev) { uncore_extra_pci_dev[die].dev[i] = NULL; @@ -1160,15 +1180,10 @@ static void uncore_pci_remove(struct pci_dev *pdev) } pmu = box->pmu; - if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) - return; pci_set_drvdata(pdev, NULL); - pmu->boxes[box->dieid] = NULL; - if (atomic_dec_return(&pmu->activeboxes) == 0) - uncore_pmu_unregister(pmu); - uncore_box_exit(box); - kfree(box); + + uncore_pci_pmu_unregister(pmu, phys_id, die); } static int __init uncore_pci_init(void)
[tip: perf/core] perf/x86/intel/uncore: Generic support for the PCI sub driver
The following commit has been merged into the perf/core branch of tip: Commit-ID: 95a7fc77443328ac8b68378df8e137a044ece5e8 Gitweb: https://git.kernel.org/tip/95a7fc77443328ac8b68378df8e137a044ece5e8 Author:Kan Liang AuthorDate:Mon, 14 Sep 2020 07:34:19 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 24 Sep 2020 15:55:51 +02:00 perf/x86/intel/uncore: Generic support for the PCI sub driver Some uncore counters may be located in the configuration space of a PCI device, which already has a bound driver. Currently, the uncore driver cannot register a PCI uncore PMU for these counters, because, to register a PCI uncore PMU, the uncore driver must be bound to the device. However, one device can only have one bound driver. Add an uncore PCI sub driver to support this kind of device. The sub driver doesn't own the device. In initialization, the sub driver searches the device via pci_get_device(), and registers the corresponding PMU for the device. In the meantime, the sub driver registers a PCI bus notifier, which is used to notify the sub driver once the device is removed. The sub driver can unregister the PMU accordingly. The sub driver only searches the devices defined in its id table. The id table varies on different platforms, which will be implemented in the following platform-specific patch. 
Suggested-by: Bjorn Helgaas Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1600094060-82746-6-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore.c | 81 +- arch/x86/events/intel/uncore.h | 1 +- 2 files changed, 82 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 747d237..ce0a5ba 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -12,6 +12,8 @@ struct intel_uncore_type **uncore_mmio_uncores = empty_uncore; static bool pcidrv_registered; struct pci_driver *uncore_pci_driver; +/* The PCI driver for the device which the uncore doesn't own. */ +struct pci_driver *uncore_pci_sub_driver; /* pci bus to socket mapping */ DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); @@ -1186,6 +1188,80 @@ static void uncore_pci_remove(struct pci_dev *pdev) uncore_pci_pmu_unregister(pmu, phys_id, die); } +static int uncore_bus_notify(struct notifier_block *nb, +unsigned long action, void *data) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + struct intel_uncore_pmu *pmu; + int phys_id, die; + + /* Unregister the PMU when the device is going to be deleted. 
*/ + if (action != BUS_NOTIFY_DEL_DEVICE) + return NOTIFY_DONE; + + pmu = uncore_pci_find_dev_pmu(pdev, uncore_pci_sub_driver->id_table); + if (!pmu) + return NOTIFY_DONE; + + if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die)) + return NOTIFY_DONE; + + uncore_pci_pmu_unregister(pmu, phys_id, die); + + return NOTIFY_OK; +} + +static struct notifier_block uncore_notifier = { + .notifier_call = uncore_bus_notify, +}; + +static void uncore_pci_sub_driver_init(void) +{ + const struct pci_device_id *ids = uncore_pci_sub_driver->id_table; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct pci_dev *pci_sub_dev; + bool notify = false; + unsigned int devfn; + int phys_id, die; + + while (ids && ids->vendor) { + pci_sub_dev = NULL; + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(ids->driver_data)]; + /* +* Search the available device, and register the +* corresponding PMU. +*/ + while ((pci_sub_dev = pci_get_device(PCI_VENDOR_ID_INTEL, +ids->device, pci_sub_dev))) { + devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data), + UNCORE_PCI_DEV_FUNC(ids->driver_data)); + if (devfn != pci_sub_dev->devfn) + continue; + + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; + if (!pmu) + continue; + + if (uncore_pci_get_dev_die_info(pci_sub_dev, + &phys_id, &die)) + continue; + + if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu, +phys_id, die)) + notify = true; + } + ids++; + } + + if (notify && bus_register_notifier(&pci_bus_type, &uncore_notifier)) + notif
[tip: perf/core] perf/x86/intel/uncore: Support PCIe3 unit on Snow Ridge
The following commit has been merged into the perf/core branch of tip: Commit-ID: a3b1e8451d3fd54fe0df661c2c4f983932b3c0bc Gitweb: https://git.kernel.org/tip/a3b1e8451d3fd54fe0df661c2c4f983932b3c0bc Author:Kan Liang AuthorDate:Mon, 14 Sep 2020 07:34:20 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 24 Sep 2020 15:55:52 +02:00 perf/x86/intel/uncore: Support PCIe3 unit on Snow Ridge The Snow Ridge integrated PCIe3 uncore unit can be used to collect performance data, e.g. utilization, between PCIe devices, plugged into the PCIe port, and the components (in M2IOSF) responsible for translating and managing requests to/from the device. The performance data is very useful for analyzing the performance of PCIe devices. The device with the PCIe3 uncore PMON units is owned by the portdrv_pci driver. Create a PCI sub driver for the PCIe3 uncore PMON units. Here are some differences between the PCIe3 uncore unit and other uncore PCI units. - There may be several Root Ports on a system. But the uncore counters only exist in the Root Port A. A user can configure the channel mask to collect the data from other Root Ports. - The event format of the PCIe3 uncore unit is the same as the IIO unit of SKX. - The Control Register of the PCIe3 uncore unit is 64 bits. - The offset of each counter is 8, which is the same as the M2M unit of SNR. - New MSR addresses for unit control, counter and counter config. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1600094060-82746-7-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 53 +++- 1 file changed, 53 insertions(+) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 62e88ad..495056f 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -393,6 +393,11 @@ #define SNR_M2M_PCI_PMON_BOX_CTL 0x438 #define SNR_M2M_PCI_PMON_UMASK_EXT 0xff +/* SNR PCIE3 */ +#define SNR_PCIE3_PCI_PMON_CTL00x508 +#define SNR_PCIE3_PCI_PMON_CTR00x4e8 +#define SNR_PCIE3_PCI_PMON_BOX_CTL 0x4e0 + /* SNR IMC */ #define SNR_IMC_MMIO_PMON_FIXED_CTL0x54 #define SNR_IMC_MMIO_PMON_FIXED_CTR0x38 @@ -4551,12 +4556,46 @@ static struct intel_uncore_type snr_uncore_m2m = { .format_group = &snr_m2m_uncore_format_group, }; +static void snr_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, (u32)(hwc->config | SNBEP_PMON_CTL_EN)); + pci_write_config_dword(pdev, hwc->config_base + 4, (u32)(hwc->config >> 32)); +} + +static struct intel_uncore_ops snr_pcie3_uncore_pci_ops = { + .init_box = snr_m2m_uncore_pci_init_box, + .disable_box= snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snr_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct intel_uncore_type snr_uncore_pcie3 = { + .name = "pcie3", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNR_PCIE3_PCI_PMON_CTR0, + .event_ctl = SNR_PCIE3_PCI_PMON_CTL0, + .event_mask = SKX_IIO_PMON_RAW_EVENT_MASK, + .event_mask_ext = SKX_IIO_PMON_RAW_EVENT_MASK_EXT, + .box_ctl= SNR_PCIE3_PCI_PMON_BOX_CTL, + .ops= &snr_pcie3_uncore_pci_ops, + 
.format_group = &skx_uncore_iio_format_group, +}; + enum { SNR_PCI_UNCORE_M2M, + SNR_PCI_UNCORE_PCIE3, }; static struct intel_uncore_type *snr_pci_uncores[] = { [SNR_PCI_UNCORE_M2M]= &snr_uncore_m2m, + [SNR_PCI_UNCORE_PCIE3] = &snr_uncore_pcie3, NULL, }; @@ -4573,6 +4612,19 @@ static struct pci_driver snr_uncore_pci_driver = { .id_table = snr_uncore_pci_ids, }; +static const struct pci_device_id snr_uncore_pci_sub_ids[] = { + { /* PCIe3 RP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x334a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(4, 0, SNR_PCI_UNCORE_PCIE3, 0), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snr_uncore_pci_sub_driver = { + .name = "snr_uncore_sub", + .id_table = snr_uncore_pci_sub_ids, +}; + int snr_uncore_pci_init(void) { /* SNR UBOX DID */ @@ -4584,6 +4636,7 @@ int snr_uncore_pci_init(void) uncore_pci_uncores = snr_pci_uncores; uncore_pci_driver = &snr_uncore_pci_driver; + uncore_pci_sub_driver = &snr_uncore_pci_sub_driver; return 0; }
[tip: perf/core] perf/x86/intel/uncore: Factor out uncore_pci_pmu_register()
The following commit has been merged into the perf/core branch of tip: Commit-ID: 16fa64315c1bd2a61fb20d6aa9a542dd5bf52971 Gitweb: https://git.kernel.org/tip/16fa64315c1bd2a61fb20d6aa9a542dd5bf52971 Author:Kan Liang AuthorDate:Mon, 14 Sep 2020 07:34:17 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 24 Sep 2020 15:55:51 +02:00 perf/x86/intel/uncore: Factor out uncore_pci_pmu_register() The PMU registration in the uncore PCI sub driver is similar as the normal PMU registration for a PCI device. The codes to register a PCI PMU can be shared. Factor out uncore_pci_pmu_register(), which will be used later. The pci_set_drvdata() is not included in uncore_pci_pmu_register(). The uncore PCI sub driver doesn't own the PCI device. It will not touch the private driver data pointer for the device. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1600094060-82746-4-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/uncore.c | 82 - 1 file changed, 51 insertions(+), 31 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index f6ff1b9..6c6f8b3 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1040,13 +1040,61 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) } /* + * Register the PMU for a PCI device + * @pdev: The PCI device. + * @type: The corresponding PMU type of the device. + * @pmu: The corresponding PMU of the device. + * @phys_id: The physical socket id which the device maps to. + * @die: The die id which the device maps to. 
+ */ +static int uncore_pci_pmu_register(struct pci_dev *pdev, + struct intel_uncore_type *type, + struct intel_uncore_pmu *pmu, + int phys_id, int die) +{ + struct intel_uncore_box *box; + int ret; + + if (WARN_ON_ONCE(pmu->boxes[die] != NULL)) + return -EINVAL; + + box = uncore_alloc_box(type, NUMA_NO_NODE); + if (!box) + return -ENOMEM; + + if (pmu->func_id < 0) + pmu->func_id = pdev->devfn; + else + WARN_ON_ONCE(pmu->func_id != pdev->devfn); + + atomic_inc(&box->refcnt); + box->pci_phys_id = phys_id; + box->dieid = die; + box->pci_dev = pdev; + box->pmu = pmu; + uncore_box_init(box); + + pmu->boxes[die] = box; + if (atomic_inc_return(&pmu->activeboxes) > 1) + return 0; + + /* First active box registers the pmu */ + ret = uncore_pmu_register(pmu); + if (ret) { + pmu->boxes[die] = NULL; + uncore_box_exit(box); + kfree(box); + } + return ret; +} + +/* * add a pci uncore device */ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct intel_uncore_type *type; struct intel_uncore_pmu *pmu = NULL; - struct intel_uncore_box *box; int phys_id, die, ret; ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die); @@ -1082,38 +1130,10 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; } - if (WARN_ON_ONCE(pmu->boxes[die] != NULL)) - return -EINVAL; - - box = uncore_alloc_box(type, NUMA_NO_NODE); - if (!box) - return -ENOMEM; - - if (pmu->func_id < 0) - pmu->func_id = pdev->devfn; - else - WARN_ON_ONCE(pmu->func_id != pdev->devfn); - - atomic_inc(&box->refcnt); - box->pci_phys_id = phys_id; - box->dieid = die; - box->pci_dev = pdev; - box->pmu = pmu; - uncore_box_init(box); - pci_set_drvdata(pdev, box); + ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die); - pmu->boxes[die] = box; - if (atomic_inc_return(&pmu->activeboxes) > 1) - return 0; + pci_set_drvdata(pdev, pmu->boxes[die]); - /* First active box registers the pmu */ - ret = 
uncore_pmu_register(pmu); - if (ret) { - pci_set_drvdata(pdev, NULL); - pmu->boxes[die] = NULL; - uncore_box_exit(box); - kfree(box); - } return ret; }
[tip: perf/core] perf/core: Pull pmu::sched_task() into perf_event_context_sched_out()
The following commit has been merged into the perf/core branch of tip: Commit-ID: 44fae179ce73a26733d9e2d346da4e1a1cb94647 Gitweb: https://git.kernel.org/tip/44fae179ce73a26733d9e2d346da4e1a1cb94647 Author:Kan Liang AuthorDate:Fri, 21 Aug 2020 12:57:53 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 10 Sep 2020 11:19:34 +02:00 perf/core: Pull pmu::sched_task() into perf_event_context_sched_out() The pmu::sched_task() is a context switch callback. It passes the cpuctx->task_ctx as a parameter to the lower code. To find the cpuctx->task_ctx, the current code iterates a cpuctx list. The same context will be iterated in perf_event_context_sched_out() soon. Sharing the cpuctx->task_ctx avoids the unnecessary iteration of the cpuctx list. The pmu::sched_task() is also required for the optimization case for equivalent contexts. The task_ctx_sched_out() will eventually disable and reenable the PMU when scheduling out events. Adding perf_pmu_disable() and perf_pmu_enable() around task_ctx_sched_out() doesn't break anything. Drop the cpuctx->ctx.lock for the pmu::sched_task(). The lock is for per-CPU context, which is not necessary for the per-task context schedule. No one uses sched_cb_entry, perf_sched_cb_usages, sched_cb_list, and perf_pmu_sched_task() any more. 
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200821195754.20159-2-kan.li...@linux.intel.com --- include/linux/perf_event.h | 1 +- kernel/events/core.c | 47 + 2 files changed, 17 insertions(+), 31 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 46a3974..0c19d27 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -872,7 +872,6 @@ struct perf_cpu_context { struct list_headcgrp_cpuctx_entry; #endif - struct list_headsched_cb_entry; int sched_cb_usage; int online; diff --git a/kernel/events/core.c b/kernel/events/core.c index 3f5fec4..45edb85 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -382,7 +382,6 @@ static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); -static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -3384,10 +3383,12 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, struct perf_event_context *parent, *next_parent; struct perf_cpu_context *cpuctx; int do_switch = 1; + struct pmu *pmu; if (likely(!ctx)) return; + pmu = ctx->pmu; cpuctx = __get_cpu_context(ctx); if (!cpuctx->task_ctx) return; @@ -3417,11 +3418,15 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - struct pmu *pmu = ctx->pmu; WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); + perf_pmu_disable(pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(ctx, false); + /* * PMU specific parts of task perf context can require * additional synchronization. 
As an example of such @@ -3433,6 +3438,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, else swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + perf_pmu_enable(pmu); + /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of @@ -3455,21 +3462,22 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); + perf_pmu_disable(pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(ctx, false); task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); + + perf_pmu_enable(pmu); raw_spin_unlock(&ctx->lock); } } -static DEFINE_PER_CPU(struct list_head, sched_cb_list); - void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - this_cpu_dec(perf_sched_cb_usages); - - if (!--cpuctx->sched_cb_usage) - list_del(&cpuctx->sched_cb_entry); + --cpuctx->sched_cb_usage; }
[tip: perf/core] perf/core: Pull pmu::sched_task() into perf_event_context_sched_in()
The following commit has been merged into the perf/core branch of tip: Commit-ID: 556cccad389717d6eb4f5a24b45ff41cad3aaabf Gitweb: https://git.kernel.org/tip/556cccad389717d6eb4f5a24b45ff41cad3aaabf Author:Kan Liang AuthorDate:Fri, 21 Aug 2020 12:57:52 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 10 Sep 2020 11:19:34 +02:00 perf/core: Pull pmu::sched_task() into perf_event_context_sched_in() The pmu::sched_task() is a context switch callback. It passes the cpuctx->task_ctx as a parameter to the lower code. To find the cpuctx->task_ctx, the current code iterates a cpuctx list. The same context was just iterated in perf_event_context_sched_in(), which is invoked right before the pmu::sched_task(). Reuse the cpuctx->task_ctx from perf_event_context_sched_in() can avoid the unnecessary iteration of the cpuctx list. Both pmu::sched_task and perf_event_context_sched_in() have to disable PMU. Pull the pmu::sched_task into perf_event_context_sched_in() can also save the overhead from the PMU disable and reenable. The new and old tasks may have equivalent contexts. The current code optimize this case by swapping the context, which avoids the scheduling. For this case, pmu::sched_task() is still required, e.g., restore the LBR content. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200821195754.20159-1-kan.li...@linux.intel.com --- kernel/events/core.c | 51 ++- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 57efe3b..3f5fec4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3491,30 +3491,36 @@ void perf_sched_cb_inc(struct pmu *pmu) * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. 
*/ +static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in) +{ + struct pmu *pmu; + + pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ + + if (WARN_ON_ONCE(!pmu->sched_task)) + return; + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(pmu); + + pmu->sched_task(cpuctx->task_ctx, sched_in); + + perf_pmu_enable(pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); +} + static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, bool sched_in) { struct perf_cpu_context *cpuctx; - struct pmu *pmu; if (prev == next) return; - list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { - pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ - - if (WARN_ON_ONCE(!pmu->sched_task)) - continue; - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(pmu); - - pmu->sched_task(cpuctx->task_ctx, sched_in); + list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) + __perf_pmu_sched_task(cpuctx, sched_in); - perf_pmu_enable(pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } } static void perf_event_switch(struct task_struct *task, @@ -3773,10 +3779,14 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, struct task_struct *task) { struct perf_cpu_context *cpuctx; + struct pmu *pmu = ctx->pmu; cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) + if (cpuctx->task_ctx == ctx) { + if (cpuctx->sched_cb_usage) + __perf_pmu_sched_task(cpuctx, true); return; + } perf_ctx_lock(cpuctx, ctx); /* @@ -3786,7 +3796,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (!ctx->nr_events) goto unlock; - perf_pmu_disable(ctx->pmu); + perf_pmu_disable(pmu); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -3798,7 +3808,11 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) 
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); - perf_pmu_enable(ctx->pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(cpuctx->task_ctx, true); + + perf_pmu_enable(pmu); unlock: perf_ctx_unlock(cpuctx, ctx); @@ -3841,9 +3855,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(&nr_switch_events)) perf_ev
[tip: perf/core] perf/x86/intel/ds: Fix x86_pmu_stop warning for large PEBS
The following commit has been merged into the perf/core branch of tip: Commit-ID: 35d1ce6bec133679ff16325d335217f108b84871 Gitweb: https://git.kernel.org/tip/35d1ce6bec133679ff16325d335217f108b84871 Author:Kan Liang AuthorDate:Wed, 02 Sep 2020 14:06:49 -07:00 Committer: Peter Zijlstra CommitterDate: Thu, 10 Sep 2020 11:19:33 +02:00 perf/x86/intel/ds: Fix x86_pmu_stop warning for large PEBS A warning as below may be triggered when sampling with large PEBS. [ 410.411250] perf: interrupt took too long (72145 > 71975), lowering kernel.perf_event_max_sample_rate to 2000 [ 410.724923] [ cut here ] [ 410.729822] WARNING: CPU: 0 PID: 16397 at arch/x86/events/core.c:1422 x86_pmu_stop+0x95/0xa0 [ 410.933811] x86_pmu_del+0x50/0x150 [ 410.937304] event_sched_out.isra.0+0xbc/0x210 [ 410.941751] group_sched_out.part.0+0x53/0xd0 [ 410.946111] ctx_sched_out+0x193/0x270 [ 410.949862] __perf_event_task_sched_out+0x32c/0x890 [ 410.954827] ? set_next_entity+0x98/0x2d0 [ 410.958841] __schedule+0x592/0x9c0 [ 410.962332] schedule+0x5f/0xd0 [ 410.965477] exit_to_usermode_loop+0x73/0x120 [ 410.969837] prepare_exit_to_usermode+0xcd/0xf0 [ 410.974369] ret_from_intr+0x2a/0x3a [ 410.977946] RIP: 0033:0x40123c [ 411.079661] ---[ end trace bc83adaea7bb664a ]--- In the non-overflow context, e.g., context switch, with large PEBS, perf may stop an event twice. An example is below. //max_samples_per_tick is adjusted to 2 //NMI is triggered intel_pmu_handle_irq() handle_pmi_common() drain_pebs() __intel_pmu_pebs_event() perf_event_overflow() __perf_event_account_interrupt() hwc->interrupts = 1 return 0 //A context switch happens right after the NMI. //In the same tick, the perf_throttled_seq is not changed. 
perf_event_task_sched_out() perf_pmu_sched_task() intel_pmu_drain_pebs_buffer() __intel_pmu_pebs_event() perf_event_overflow() __perf_event_account_interrupt() ++hwc->interrupts >= max_samples_per_tick return 1 x86_pmu_stop(); # First stop perf_event_context_sched_out() task_ctx_sched_out() ctx_sched_out() event_sched_out() x86_pmu_del() x86_pmu_stop(); # Second stop and trigger the warning Perf should only invoke the perf_event_overflow() in the overflow context. Current drain_pebs() is called from: - handle_pmi_common() -- overflow context - intel_pmu_pebs_sched_task() -- non-overflow context - intel_pmu_pebs_disable() -- non-overflow context - intel_pmu_auto_reload_read() -- possible overflow context With PERF_SAMPLE_READ + PERF_FORMAT_GROUP, the function may be invoked in the NMI handler. But, before calling the function, the PEBS buffer has already been drained. The __intel_pmu_pebs_event() will not be called in the possible overflow context. To fix the issue, an indicator is required to distinguish between the overflow context aka handle_pmi_common() and other cases. The dummy regs pointer can be used as the indicator. In the non-overflow context, perf should treat the last record the same as other PEBS records, and doesn't invoke the generic overflow handler. 
Fixes: 21509084f999 ("perf/x86/intel: Handle multiple records in the PEBS buffer") Reported-by: Like Xu Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Like Xu Link: https://lkml.kernel.org/r/20200902210649.2743-1-kan.li...@linux.intel.com --- arch/x86/events/intel/ds.c | 32 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 86848c5..404315d 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -670,9 +670,7 @@ unlock: static inline void intel_pmu_drain_pebs_buffer(void) { - struct pt_regs regs; - - x86_pmu.drain_pebs(&regs); + x86_pmu.drain_pebs(NULL); } /* @@ -1737,6 +1735,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, struct x86_perf_regs perf_regs; struct pt_regs *regs = &perf_regs.regs; void *at = get_next_pebs_record_by_bit(base, top, bit); + struct pt_regs dummy_iregs; if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { /* @@ -1749,6 +1748,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event, } else if (!intel_pmu_save_and_restart(event)) return; + if (!iregs) + iregs = &dummy_iregs; + while (count > 1) { setup_sample(event, iregs, at, &data, regs); perf_event_output(event, &data, regs); @@ -1758,16 +1760,22 @@ static void __intel_pmu_pebs_event(struct perf_event *event, } setup_sample(e
[tip: perf/core] perf/x86/intel: Move BTS index to 47
The following commit has been merged into the perf/core branch of tip: Commit-ID: d39fcc32893dac2d02900d99c38276a00cc54d60 Gitweb: https://git.kernel.org/tip/d39fcc32893dac2d02900d99c38276a00cc54d60 Author:Kan Liang AuthorDate:Thu, 23 Jul 2020 10:11:07 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 18 Aug 2020 16:34:35 +02:00 perf/x86/intel: Move BTS index to 47 The bit 48 in the PERF_GLOBAL_STATUS is used to indicate the overflow status of the PERF_METRICS counters. Move the BTS index to the bit 47. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200723171117.9918-5-kan.li...@linux.intel.com --- arch/x86/include/asm/perf_event.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index fe8110a..58419e5 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -238,11 +238,11 @@ struct x86_pmu_capability { /* * We model BTS tracing as another fixed-mode PMC. * - * We choose a value in the middle of the fixed event range, since lower + * We choose the value 47 for the fixed index of BTS, since lower * values are used by actual fixed events and higher values are used * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. */ -#define INTEL_PMC_IDX_FIXED_BTS(INTEL_PMC_IDX_FIXED + 16) +#define INTEL_PMC_IDX_FIXED_BTS(INTEL_PMC_IDX_FIXED + 15) #define GLOBAL_STATUS_COND_CHG BIT_ULL(63) #define GLOBAL_STATUS_BUFFER_OVF_BIT 62
[tip: perf/core] perf/x86: Use event_base_rdpmc for the RDPMC userspace support
The following commit has been merged into the perf/core branch of tip: Commit-ID: 75608cb02ea5dd997990e2998eca3670cb71a18c Gitweb: https://git.kernel.org/tip/75608cb02ea5dd997990e2998eca3670cb71a18c Author:Kan Liang AuthorDate:Thu, 23 Jul 2020 10:11:04 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 18 Aug 2020 16:34:34 +02:00 perf/x86: Use event_base_rdpmc for the RDPMC userspace support The RDPMC index is always re-calculated for the RDPMC userspace support, which is unnecessary. The RDPMC index value is stored in the variable event_base_rdpmc for the kernel usage, which can be used for RDPMC userspace support as well. Suggested-by: Peter Zijlstra Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200723171117.9918-2-kan.li...@linux.intel.com --- arch/x86/events/core.c | 11 +++ 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 1cbf57d..8e108ea 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2208,17 +2208,12 @@ static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *m static int x86_pmu_event_idx(struct perf_event *event) { - int idx = event->hw.idx; + struct hw_perf_event *hwc = &event->hw; - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return 0; - if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { - idx -= INTEL_PMC_IDX_FIXED; - idx |= 1 << 30; - } - - return idx + 1; + return hwc->event_base_rdpmc + 1; } static ssize_t get_attr_rdpmc(struct device *cdev,
[tip: perf/core] perf/x86/intel: Name the global status bit in NMI handler
The following commit has been merged into the perf/core branch of tip: Commit-ID: 60a2a271cf05cf046c522e1d7f62116b4bcb32a2 Gitweb: https://git.kernel.org/tip/60a2a271cf05cf046c522e1d7f62116b4bcb32a2 Author:Kan Liang AuthorDate:Thu, 23 Jul 2020 10:11:05 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 18 Aug 2020 16:34:34 +02:00 perf/x86/intel: Name the global status bit in NMI handler Magic numbers are used in the current NMI handler for the global status bit. Use a meaningful name to replace the magic numbers to improve the readability of the code. Remove a Tab for all GLOBAL_STATUS_* and INTEL_PMC_IDX_FIXED_BTS macros to reduce the length of the line. Suggested-by: Peter Zijlstra Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200723171117.9918-3-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 4 ++-- arch/x86/include/asm/perf_event.h | 22 -- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5096347..ac1408f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2389,7 +2389,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) /* * PEBS overflow sets bit 62 in the global status register */ - if (__test_and_clear_bit(62, (unsigned long *)&status)) { + if (__test_and_clear_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, (unsigned long *)&status)) { u64 pebs_enabled = cpuc->pebs_enabled; handled++; @@ -2410,7 +2410,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) /* * Intel PT */ - if (__test_and_clear_bit(55, (unsigned long *)&status)) { + if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) { handled++; if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() && perf_guest_cbs->handle_intel_pt_intr)) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 0c1b137..fd3eba6 100644 --- 
a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -225,16 +225,18 @@ struct x86_pmu_capability { * values are used by actual fixed events and higher values are used * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. */ -#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16) - -#define GLOBAL_STATUS_COND_CHG BIT_ULL(63) -#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(62) -#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61) -#define GLOBAL_STATUS_ASIF BIT_ULL(60) -#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) -#define GLOBAL_STATUS_LBRS_FROZEN_BIT 58 -#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT) -#define GLOBAL_STATUS_TRACE_TOPAPMIBIT_ULL(55) +#define INTEL_PMC_IDX_FIXED_BTS(INTEL_PMC_IDX_FIXED + 16) + +#define GLOBAL_STATUS_COND_CHG BIT_ULL(63) +#define GLOBAL_STATUS_BUFFER_OVF_BIT 62 +#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(GLOBAL_STATUS_BUFFER_OVF_BIT) +#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61) +#define GLOBAL_STATUS_ASIF BIT_ULL(60) +#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) +#define GLOBAL_STATUS_LBRS_FROZEN_BIT 58 +#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT) +#define GLOBAL_STATUS_TRACE_TOPAPMI_BIT55 +#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT) /* * We model guest LBR event tracing as another fixed-mode PMC like BTS.
[tip: perf/core] perf/x86/intel: Generic support for hardware TopDown metrics
The following commit has been merged into the perf/core branch of tip: Commit-ID: 7b2c05a15d29d0570a0d21da1e4fd5cbc85cbf13 Gitweb: https://git.kernel.org/tip/7b2c05a15d29d0570a0d21da1e4fd5cbc85cbf13 Author:Kan Liang AuthorDate:Thu, 23 Jul 2020 10:11:11 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 18 Aug 2020 16:34:36 +02:00 perf/x86/intel: Generic support for hardware TopDown metrics Intro = The TopDown Microarchitecture Analysis (TMA) Method is a structured analysis methodology to identify critical performance bottlenecks in out-of-order processors. Perf already supports the method. The method works well, but there is one problem. To collect the TopDown events, several GP counters have to be used. If a user wants to collect other events at the same time, multiplexing will probably be triggered, which impacts the accuracy. To free up the scarce GP counters, the hardware TopDown metrics feature is introduced starting with Ice Lake. The hardware implements an additional "metrics" register and a new Fixed Counter 3 that measures pipeline "slots". The TopDown events can be calculated from them instead. Events == The level 1 TopDown has four metrics. There is no event-code assigned to the TopDown metrics. Four metric events are exported as separate perf events, which map to the internal "metrics" counter register. Those events do not exist in hardware, but can be allocated by the scheduler. For the event mapping, a special 0x00 event code is used, which is reserved for fake events. The metric events start from umask 0x10. When setting up the metric events, they point to the Fixed Counter 3. They have to be specially handled. - Add the update_topdown_event() callback to read the additional metrics MSR and generate the metrics. - Add the set_topdown_event_period() callback to initialize metrics MSR and the fixed counter 3. - Add a variable n_metric_event to track the number of the accepted metrics events.
The sharing between multiple users of the same metric without multiplexing is not allowed. - Only enable/disable the fixed counter 3 when there are no other active TopDown events, which avoids unnecessary writes to the fixed control register. - Disable the PMU when reading the metrics event. The metrics MSR and the fixed counter 3 are read separately. The values may be modified by an NMI. None of the four metric events supports sampling. Since they will be handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is introduced to indicate this case. The slots event can support both sampling and counting. For counting, the flag is also applied. For sampling, it will be handled like any other normal event. Groups == The slots event is required in a Topdown group. To avoid reading the METRICS register multiple times, the metrics and slots values can only be updated by the slots event in a group. All active slots and metrics events will be updated one time. Therefore, the slots event must be before any metric events in a Topdown group. NMI == The METRICS related register may overflow. The bit 48 of the STATUS register will be set. If so, PERF_METRICS and Fixed counter 3 are required to be reset. The patch also updates all active slots and metrics events in the NMI handler. The update_topdown_event() has to read two registers separately. The values may be modified by an NMI. PMU has to be disabled before calling the function. RDPMC == RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.li...@linux.intel.com --- arch/x86/events/core.c| 63 --- arch/x86/events/intel/core.c | 124 +++-- arch/x86/events/perf_event.h | 37 +- arch/x86/include/asm/msr-index.h | 1 +- arch/x86/include/asm/perf_event.h | 47 +++- 5 files changed, 257 insertions(+), 15 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 8e108ea..53fcf0a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -76,6 +76,9 @@ u64 x86_perf_event_update(struct perf_event *event) if (unlikely(!hwc->event_base)) return 0; + if (unlikely(is_topdown_count(event)) && x86_pmu.update_topdown_event) + return x86_pmu.update_topdown_event(event); + /* * Careful: an NMI might modify the previous event value. * @@ -1031,6 +1034,42 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) return unsched ? -EINVAL : 0; } +static int add_nr_metric_event(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (is_metric_event(event)) { + if (cpuc->n_metric == INTEL_TD_METRIC_NUM) + return -EINVAL; + cpuc->n_metric
[tip: perf/core] perf/core: Add a new PERF_EV_CAP_SIBLING event capability
The following commit has been merged into the perf/core branch of tip: Commit-ID: 9f0c4fa111dc909ca545c45ea20ec84da555ce16 Gitweb: https://git.kernel.org/tip/9f0c4fa111dc909ca545c45ea20ec84da555ce16 Author:Kan Liang AuthorDate:Thu, 23 Jul 2020 10:11:10 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 18 Aug 2020 16:34:36 +02:00 perf/core: Add a new PERF_EV_CAP_SIBLING event capability Current perf assumes that events in a group are independent. Close an event doesn't impact the value of the other events in the same group. If the closed event is a member, after the event closure, other events are still running like a group. If the closed event is a leader, other events are running as singleton events. Add PERF_EV_CAP_SIBLING to allow events to indicate they require being part of a group, and when the leader dies they cannot exist independently. Suggested-by: Peter Zijlstra Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200723171117.9918-8-kan.li...@linux.intel.com --- include/linux/perf_event.h | 4 - kernel/events/core.c | 38 - 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 04a49cc..6048650 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -576,9 +576,13 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *, * PERF_EV_CAP_SOFTWARE: Is a software event. * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read * from any CPU in the package where it is active. + * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and + * cannot be a group leader. If an event with this flag is detached from the + * group it is scheduled out and moved into an unrecoverable ERROR state. 
*/ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKGBIT(1) +#define PERF_EV_CAP_SIBLINGBIT(2) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) diff --git a/kernel/events/core.c b/kernel/events/core.c index 5bfe8e3..57efe3b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2133,8 +2133,24 @@ static inline struct list_head *get_event_list(struct perf_event *event) return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active; } +/* + * Events that have PERF_EV_CAP_SIBLING require being part of a group and + * cannot exist on their own, schedule them out and move them into the ERROR + * state. Also see _perf_event_enable(), it will not be able to recover + * this ERROR state. + */ +static inline void perf_remove_sibling_event(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + event_sched_out(event, cpuctx, ctx); + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); +} + static void perf_group_detach(struct perf_event *event) { + struct perf_event *leader = event->group_leader; struct perf_event *sibling, *tmp; struct perf_event_context *ctx = event->ctx; @@ -2153,7 +2169,7 @@ static void perf_group_detach(struct perf_event *event) /* * If this is a sibling, remove it from its group. 
*/ - if (event->group_leader != event) { + if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; goto out; @@ -2166,6 +2182,9 @@ static void perf_group_detach(struct perf_event *event) */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + if (sibling->event_caps & PERF_EV_CAP_SIBLING) + perf_remove_sibling_event(sibling); + sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); @@ -2183,10 +2202,10 @@ static void perf_group_detach(struct perf_event *event) } out: - perf_event__header_size(event->group_leader); - - for_each_sibling_event(tmp, event->group_leader) + for_each_sibling_event(tmp, leader) perf_event__header_size(tmp); + + perf_event__header_size(leader); } static bool is_orphaned_event(struct perf_event *event) @@ -2979,6 +2998,7 @@ static void _perf_event_enable(struct perf_event *event) raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state < PERF_EVENT_STATE_ERROR) { +out: raw_spin_unlock_irq(&ctx->lock); return; } @@ -2990,8 +3010,16 @@ static void _perf_event_enable(struct perf_event *event) * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived. */ - if (event->
[tip: perf/core] perf/x86/intel: Introduce the fourth fixed counter
The following commit has been merged into the perf/core branch of tip: Commit-ID: 6f7225099d5f3ec3019f380a0da2b456b7796cb0 Gitweb: https://git.kernel.org/tip/6f7225099d5f3ec3019f380a0da2b456b7796cb0 Author:Kan Liang AuthorDate:Thu, 23 Jul 2020 10:11:06 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 18 Aug 2020 16:34:35 +02:00 perf/x86/intel: Introduce the fourth fixed counter The fourth fixed counter, TOPDOWN.SLOTS, is introduced in Ice Lake to measure the level 1 TopDown events. Add MSR address and macros for the new fixed counter, which will be used in a later patch. Add comments to explain the event encoding rules for the fixed counters. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200723171117.9918-4-kan.li...@linux.intel.com --- arch/x86/include/asm/perf_event.h | 23 --- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index fd3eba6..fe8110a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -197,12 +197,24 @@ struct x86_pmu_capability { */ /* - * All 3 fixed-mode PMCs are configured via this single MSR: + * All the fixed-mode PMCs are configured via this single MSR: */ #define MSR_ARCH_PERFMON_FIXED_CTR_CTRL0x38d /* - * The counts are available in three separate MSRs: + * There is no event-code assigned to the fixed-mode PMCs. + * + * For a fixed-mode PMC, which has an equivalent event on a general-purpose + * PMC, the event-code of the equivalent event is used for the fixed-mode PMC, + * e.g., Instr_Retired.Any and CPU_CLK_Unhalted.Core. + * + * For a fixed-mode PMC, which doesn't have an equivalent event, a + * pseudo-encoding is used, e.g., CPU_CLK_Unhalted.Ref and TOPDOWN.SLOTS. + * The pseudo event-code for a fixed-mode PMC must be 0x00. + * The pseudo umask-code is 0xX. 
The X equals the index of the fixed + * counter + 1, e.g., the fixed counter 2 has the pseudo-encoding 0x0300. + * + * The counts are available in separate MSRs: */ /* Instr_Retired.Any: */ @@ -213,11 +225,16 @@ struct x86_pmu_capability { #define MSR_ARCH_PERFMON_FIXED_CTR10x30a #define INTEL_PMC_IDX_FIXED_CPU_CYCLES (INTEL_PMC_IDX_FIXED + 1) -/* CPU_CLK_Unhalted.Ref: */ +/* CPU_CLK_Unhalted.Ref: event=0x00,umask=0x3 (pseudo-encoding) */ #define MSR_ARCH_PERFMON_FIXED_CTR20x30b #define INTEL_PMC_IDX_FIXED_REF_CYCLES (INTEL_PMC_IDX_FIXED + 2) #define INTEL_PMC_MSK_FIXED_REF_CYCLES (1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES) +/* TOPDOWN.SLOTS: event=0x00,umask=0x4 (pseudo-encoding) */ +#define MSR_ARCH_PERFMON_FIXED_CTR30x30c +#define INTEL_PMC_IDX_FIXED_SLOTS (INTEL_PMC_IDX_FIXED + 3) +#define INTEL_PMC_MSK_FIXED_SLOTS (1ULL << INTEL_PMC_IDX_FIXED_SLOTS) + /* * We model BTS tracing as another fixed-mode PMC. *
[tip: perf/core] perf/x86/intel: Use switch in intel_pmu_disable/enable_event
The following commit has been merged into the perf/core branch of tip: Commit-ID: 58da7dbe6f036fefe504a4bb452afbd39bba73f7 Gitweb: https://git.kernel.org/tip/58da7dbe6f036fefe504a4bb452afbd39bba73f7 Author:Kan Liang AuthorDate:Thu, 23 Jul 2020 10:11:09 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 18 Aug 2020 16:34:36 +02:00 perf/x86/intel: Use switch in intel_pmu_disable/enable_event Currently, the if-else is used in the intel_pmu_disable/enable_event to check the type of an event. It works well, but with more and more types added later, e.g., perf metrics, compared to the switch statement, the if-else may impair the readability of the code. There is no harm to use the switch statement to replace the if-else here. Also, some optimizing compilers may compile a switch statement into a jump-table which is more efficient than if-else for a large number of cases. The performance gain may not be observed for now, because the number of cases is only 5, but the benefits may be observed with more and more types added in the future. Use switch to replace the if-else in the intel_pmu_disable/enable_event. If the idx is invalid, print a warning. For the case INTEL_PMC_IDX_FIXED_BTS in intel_pmu_disable_event, don't need to check the event->attr.precise_ip. Use return for the case. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200723171117.9918-7-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 36 +++ 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ac1408f..76eab81 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2180,17 +2180,28 @@ static void intel_pmu_disable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - if (idx < INTEL_PMC_IDX_FIXED) { + switch (idx) { + case 0 ... 
INTEL_PMC_IDX_FIXED - 1: intel_clear_masks(event, idx); x86_pmu_disable_event(event); - } else if (idx < INTEL_PMC_IDX_FIXED_BTS) { + break; + case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: intel_clear_masks(event, idx); intel_pmu_disable_fixed(event); - } else if (idx == INTEL_PMC_IDX_FIXED_BTS) { + break; + case INTEL_PMC_IDX_FIXED_BTS: intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); - } else if (idx == INTEL_PMC_IDX_FIXED_VLBR) + return; + case INTEL_PMC_IDX_FIXED_VLBR: intel_clear_masks(event, idx); + break; + default: + intel_clear_masks(event, idx); + pr_warn("Failed to disable the event with invalid index %d\n", + idx); + return; + } /* * Needs to be called after x86_pmu_disable_event, @@ -2262,18 +2273,27 @@ static void intel_pmu_enable_event(struct perf_event *event) if (unlikely(event->attr.precise_ip)) intel_pmu_pebs_enable(event); - if (idx < INTEL_PMC_IDX_FIXED) { + switch (idx) { + case 0 ... INTEL_PMC_IDX_FIXED - 1: intel_set_masks(event, idx); __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); - } else if (idx < INTEL_PMC_IDX_FIXED_BTS) { + break; + case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: intel_set_masks(event, idx); intel_pmu_enable_fixed(event); - } else if (idx == INTEL_PMC_IDX_FIXED_BTS) { + break; + case INTEL_PMC_IDX_FIXED_BTS: if (!__this_cpu_read(cpu_hw_events.enabled)) return; intel_pmu_enable_bts(hwc->config); - } else if (idx == INTEL_PMC_IDX_FIXED_VLBR) + break; + case INTEL_PMC_IDX_FIXED_VLBR: intel_set_masks(event, idx); + break; + default: + pr_warn("Failed to enable the event with invalid index %d\n", + idx); + } } static void intel_pmu_add_event(struct perf_event *event)
[tip: perf/core] perf/x86/intel: Fix the name of perf METRICS
The following commit has been merged into the perf/core branch of tip: Commit-ID: bbdbde2a415d9f479803266cae6fb0c1a9f6c80e Gitweb: https://git.kernel.org/tip/bbdbde2a415d9f479803266cae6fb0c1a9f6c80e Author:Kan Liang AuthorDate:Thu, 23 Jul 2020 10:11:08 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 18 Aug 2020 16:34:35 +02:00 perf/x86/intel: Fix the name of perf METRICS Bit 15 of the PERF_CAPABILITIES MSR indicates that the perf METRICS feature is supported. Perf METRICS is not a PEBS feature. Rename pebs_metrics_available to perf_metrics. The bit is not used in the current code. It will be used in a later patch. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200723171117.9918-6-kan.li...@linux.intel.com --- arch/x86/events/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7b68ab5..5d453da 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -537,7 +537,7 @@ union perf_capabilities { */ u64 full_width_write:1; u64 pebs_baseline:1; - u64 pebs_metrics_available:1; + u64 perf_metrics:1; u64 pebs_output_pt_available:1; }; u64 capabilities;
[tip: perf/core] perf/x86: Add a macro for RDPMC offset of fixed counters
The following commit has been merged into the perf/core branch of tip: Commit-ID: 0e2e45e2ded4988f5641115fd996c75dc32e4be3 Gitweb: https://git.kernel.org/tip/0e2e45e2ded4988f5641115fd996c75dc32e4be3 Author:Kan Liang AuthorDate:Thu, 23 Jul 2020 10:11:12 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 18 Aug 2020 16:34:36 +02:00 perf/x86: Add a macro for RDPMC offset of fixed counters The RDPMC base offset of fixed counters is hard-coded. Use a meaningful name to replace the magic number to improve the readability of the code. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200723171117.9918-10-kan.li...@linux.intel.com --- arch/x86/events/core.c| 3 ++- arch/x86/include/asm/perf_event.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 53fcf0a..ebf723f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1151,7 +1151,8 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (idx - INTEL_PMC_IDX_FIXED); - hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | 1<<30; + hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | + INTEL_PMC_FIXED_RDPMC_BASE; break; default: diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 000cab7..964ba31 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -196,6 +196,9 @@ struct x86_pmu_capability { * Fixed-purpose performance events: */ +/* RDPMC offset for Fixed PMCs */ +#define INTEL_PMC_FIXED_RDPMC_BASE (1 << 30) + /* * All the fixed-mode PMCs are configured via this single MSR: */
[tip: perf/core] perf/x86/intel: Support TopDown metrics on Ice Lake
The following commit has been merged into the perf/core branch of tip: Commit-ID: 59a854e2f3b90ad2cc7368ae392de40b981ad51d Gitweb: https://git.kernel.org/tip/59a854e2f3b90ad2cc7368ae392de40b981ad51d Author:Kan Liang AuthorDate:Thu, 23 Jul 2020 10:11:13 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 18 Aug 2020 16:34:37 +02:00 perf/x86/intel: Support TopDown metrics on Ice Lake Ice Lake supports the hardware TopDown metrics feature, which can free up the scarce GP counters. Update the event constraints for the metrics events. The metric counters do not exist, which are mapped to a dummy offset. The sharing between multiple users of the same metric without multiplexing is not allowed. Implement set_topdown_event_period for Ice Lake. The values in PERF_METRICS MSR are derived from the fixed counter 3. Both registers should start from zero. Implement update_topdown_event for Ice Lake. The metric is reported by multiplying the metric (fraction) with slots. To maintain accurate measurements, both registers are cleared for each update. The fixed counter 3 should always be cleared before the PERF_METRICS. Implement td_attr for the new metrics events and the new slots fixed counter. Make them visible to the perf user tools. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200723171117.9918-11-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 118 +- arch/x86/events/perf_event.h | 13 +++- arch/x86/include/asm/msr-index.h | 2 +- arch/x86/include/asm/perf_event.h | 2 +- 4 files changed, 135 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 4a43668..db83334 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -247,6 +247,10 @@ static struct event_constraint intel_icl_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3), INTEL_EVENT_CONSTRAINT_RANGE(0x03, 0x0a, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0x1f, 0x28, 0xf), INTEL_EVENT_CONSTRAINT(0x32, 0xf), /* SW_PREFETCH_ACCESS.* */ @@ -309,6 +313,12 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles, EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale, "4", "2"); +EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4"); +EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80"); +EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81"); +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83"); + static struct attribute *snb_events_attrs[] = { EVENT_PTR(td_slots_issued), EVENT_PTR(td_slots_retired), @@ -2232,6 +2242,99 @@ static void intel_pmu_del_event(struct perf_event *event) intel_pmu_pebs_del(event); } +static int icl_set_topdown_event_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = 
&event->hw; + s64 left = local64_read(&hwc->period_left); + + /* +* The values in PERF_METRICS MSR are derived from fixed counter 3. +* Software should start both registers, PERF_METRICS and fixed +* counter 3, from zero. +* Clear PERF_METRICS and Fixed counter 3 in initialization. +* After that, both MSRs will be cleared for each read. +* Don't need to clear them again. +*/ + if (left == x86_pmu.max_period) { + wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); + wrmsrl(MSR_PERF_METRICS, 0); + local64_set(&hwc->period_left, 0); + } + + perf_event_update_userpage(event); + + return 0; +} + +static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) +{ + u32 val; + + /* +* The metric is reported as an 8bit integer fraction +* suming up to 0xff. +* slots-in-metric = (Metric / 0xff) * slots +*/ + val = (metric >> ((idx - INTEL_PMC_IDX_METRIC_BASE) * 8)) & 0xff; + return mul_u64_u32_div(slots, val, 0xff); +} + +static void __icl_update_topdown_event(struct perf_event *event, + u64 slots, u64 metrics) +{ + int idx = event->hw.idx; + u64 delta; + + if (is_metric_idx(idx)) + delta = icl_get_metrics_event_value(metrics, slots, idx); + else + delta = slots; + + loc
[tip: perf/core] perf/x86/intel: Support per-thread RDPMC TopDown metrics
The following commit has been merged into the perf/core branch of tip: Commit-ID: 2cb5383b30d47c446ec7d884cd80f93ffcc31817 Gitweb: https://git.kernel.org/tip/2cb5383b30d47c446ec7d884cd80f93ffcc31817 Author:Kan Liang AuthorDate:Thu, 23 Jul 2020 10:11:14 -07:00 Committer: Peter Zijlstra CommitterDate: Tue, 18 Aug 2020 16:34:37 +02:00 perf/x86/intel: Support per-thread RDPMC TopDown metrics Starts from Ice Lake, the TopDown metrics are directly available as fixed counters and do not require generic counters. Also, the TopDown metrics can be collected per thread. Extend the RDPMC usage to support per-thread TopDown metrics. The RDPMC index of the PERF_METRICS will be output if RDPMC users ask for the RDPMC index of the metrics events. To support per thread RDPMC TopDown, the metrics and slots counters have to be saved/restored during the context switching. The last_period and period_left are not used in the counting mode. Use the fields for saved_metric and saved_slots. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200723171117.9918-12-kan.li...@linux.intel.com --- arch/x86/events/core.c | 5 +- arch/x86/events/intel/core.c | 90 ++- include/linux/perf_event.h | 29 +++ 3 files changed, 102 insertions(+), 22 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index ebf723f..0f3d015 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2257,7 +2257,10 @@ static int x86_pmu_event_idx(struct perf_event *event) if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return 0; - return hwc->event_base_rdpmc + 1; + if (is_metric_idx(hwc->idx)) + return INTEL_PMC_FIXED_RDPMC_METRICS + 1; + else + return hwc->event_base_rdpmc + 1; } static ssize_t get_attr_rdpmc(struct device *cdev, diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index db83334..c72e490 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2258,7 +2258,13 @@ static int 
icl_set_topdown_event_period(struct perf_event *event) if (left == x86_pmu.max_period) { wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); wrmsrl(MSR_PERF_METRICS, 0); - local64_set(&hwc->period_left, 0); + hwc->saved_slots = 0; + hwc->saved_metric = 0; + } + + if ((hwc->saved_slots) && is_slots_event(event)) { + wrmsrl(MSR_CORE_PERF_FIXED_CTR3, hwc->saved_slots); + wrmsrl(MSR_PERF_METRICS, hwc->saved_metric); } perf_event_update_userpage(event); @@ -2279,7 +2285,7 @@ static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) return mul_u64_u32_div(slots, val, 0xff); } -static void __icl_update_topdown_event(struct perf_event *event, +static u64 icl_get_topdown_value(struct perf_event *event, u64 slots, u64 metrics) { int idx = event->hw.idx; @@ -2290,7 +2296,50 @@ static void __icl_update_topdown_event(struct perf_event *event, else delta = slots; - local64_add(delta, &event->count); + return delta; +} + +static void __icl_update_topdown_event(struct perf_event *event, + u64 slots, u64 metrics, + u64 last_slots, u64 last_metrics) +{ + u64 delta, last = 0; + + delta = icl_get_topdown_value(event, slots, metrics); + if (last_slots) + last = icl_get_topdown_value(event, last_slots, last_metrics); + + /* +* The 8bit integer fraction of metric may be not accurate, +* especially when the changes is very small. +* For example, if only a few bad_spec happens, the fraction +* may be reduced from 1 to 0. If so, the bad_spec event value +* will be 0 which is definitely less than the last value. +* Avoid update event->count for this case. 
+*/ + if (delta > last) { + delta -= last; + local64_add(delta, &event->count); + } +} + +static void update_saved_topdown_regs(struct perf_event *event, + u64 slots, u64 metrics) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *other; + int idx; + + event->hw.saved_slots = slots; + event->hw.saved_metric = metrics; + + for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + if (!is_topdown_idx(idx)) + continue; + other = cpuc->events[idx]; + other->hw.saved_slots = slots; + other->hw.saved_metric = metrics; + } } /* @@ -2304,6 +2353,7 @@ static u64 icl_update_topdown_event(struct perf_event *event) st
[tip: x86/urgent] x86/fpu/xstate: Fix an xstate size check warning with architectural LBRs
The following commit has been merged into the x86/urgent branch of tip: Commit-ID: 76d10256a97a7cab72b123d54b766a3c17da658c Gitweb: https://git.kernel.org/tip/76d10256a97a7cab72b123d54b766a3c17da658c Author:Kan Liang AuthorDate:Mon, 20 Jul 2020 06:50:51 -07:00 Committer: Ingo Molnar CommitterDate: Fri, 07 Aug 2020 01:32:00 +02:00 x86/fpu/xstate: Fix an xstate size check warning with architectural LBRs An xstate size check warning is triggered on machines which support Architectural LBRs. XSAVE consistency problem, dumping leaves WARNING: CPU: 0 PID: 0 at arch/x86/kernel/fpu/xstate.c:649 fpu__init_system_xstate+0x4d4/0xd0e Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted intel-arch_lbr+ RIP: 0010:fpu__init_system_xstate+0x4d4/0xd0e The xstate size check routine, init_xstate_size(), compares the size retrieved from the hardware with the size of task->fpu, which is calculated by the software. The size from the hardware is the total size of the enabled xstates in XCR0 | IA32_XSS. Architectural LBR state is a dynamic supervisor feature, which sets the corresponding bit in the IA32_XSS at boot time. The size from the hardware includes the size of the Architectural LBR state. However, a dynamic supervisor feature doesn't allocate a buffer in the task->fpu. The size of task->fpu doesn't include the size of the Architectural LBR state. The mismatch will trigger the warning. Three options as below were considered to fix the issue: - Correct the size from the hardware by subtracting the size of the dynamic supervisor features. The purpose of the check is to compare the size CPU told with the size of the XSAVE buffer, which is calculated by the software. If the software mucks with the number from hardware, it removes the value of the check. This option is not a good option. - Prevent the hardware from counting the size of the dynamic supervisor feature by temporarily removing the corresponding bits in IA32_XSS. Two extra MSR writes are required to flip the IA32_XSS. 
The option is not pretty, but it is workable. The check is only called once at early boot time. The synchronization or context-switching doesn't need to be worried. This option is implemented here. - Remove the check entirely, because the check hasn't found any real problems. The option may be an alternative as option 2. This option is not implemented here. Add a new function, get_xsaves_size_no_dynamic(), which retrieves the total size without the dynamic supervisor features from the hardware. The size will be used to compare with the size of task->fpu. Fixes: f0dccc9da4c0 ("x86/fpu/xstate: Support dynamic supervisor feature for LBR") Reported-by: Chang S. Bae Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Reviewed-by: Dave Hansen Link: https://lore.kernel.org/r/1595253051-75374-1-git-send-email-kan.li...@linux.intel.com --- arch/x86/kernel/fpu/xstate.c | 33 - 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index be2a68a..6073e34 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -611,6 +611,10 @@ static void check_xstate_against_struct(int nr) * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating * it to be safe. + * + * Dynamic XSAVE features allocate their own buffers and are not + * covered by these checks. Only the size of the buffer for task->fpu + * is checked here. */ static void do_extra_xstate_size_checks(void) { @@ -673,6 +677,33 @@ static unsigned int __init get_xsaves_size(void) return ebx; } +/* + * Get the total size of the enabled xstates without the dynamic supervisor + * features. + */ +static unsigned int __init get_xsaves_size_no_dynamic(void) +{ + u64 mask = xfeatures_mask_dynamic(); + unsigned int size; + + if (!mask) + return get_xsaves_size(); + + /* Disable dynamic features. 
*/ + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + + /* +* Ask the hardware what size is required of the buffer. +* This is the size required for the task->fpu buffer. +*/ + size = get_xsaves_size(); + + /* Re-enable dynamic features so XSAVES will work on them again. */ + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); + + return size; +} + static unsigned int __init get_xsave_size(void) { unsigned int eax, ebx, ecx, edx; @@ -710,7 +741,7 @@ static int __init init_xstate_size(void) xsave_size = get_xsave_size(); if (boot_cpu_has(X86_FEATURE_XSAVES)) - possible_xstate_size = get_xsaves_size(); + possible_xstate_size = get_xsaves_size_no_dynamic(); else possible_xstate_size = xsave_size;
[tip: x86/urgent] x86/fpu/xstate: Fix an xstate size check warning with architectural LBRs
The following commit has been merged into the x86/urgent branch of tip: Commit-ID: ec8602b79088b0f3556d9c7a3a05313bc4e4a96f Gitweb: https://git.kernel.org/tip/ec8602b79088b0f3556d9c7a3a05313bc4e4a96f Author:Kan Liang AuthorDate:Mon, 20 Jul 2020 06:50:51 -07:00 Committer: Ingo Molnar CommitterDate: Thu, 06 Aug 2020 17:11:59 +02:00 x86/fpu/xstate: Fix an xstate size check warning with architectural LBRs An xstate size check warning is triggered on machines which support Architectural LBRs. XSAVE consistency problem, dumping leaves WARNING: CPU: 0 PID: 0 at arch/x86/kernel/fpu/xstate.c:649 fpu__init_system_xstate+0x4d4/0xd0e Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted intel-arch_lbr+ RIP: 0010:fpu__init_system_xstate+0x4d4/0xd0e The xstate size check routine, init_xstate_size(), compares the size retrieved from the hardware with the size of task->fpu, which is calculated by the software. The size from the hardware is the total size of the enabled xstates in XCR0 | IA32_XSS. Architectural LBR state is a dynamic supervisor feature, which sets the corresponding bit in the IA32_XSS at boot time. The size from the hardware includes the size of the Architectural LBR state. However, a dynamic supervisor feature doesn't allocate a buffer in the task->fpu. The size of task->fpu doesn't include the size of the Architectural LBR state. The mismatch will trigger the warning. The following three options were considered to fix the issue: - Correct the size from the hardware by subtracting the size of the dynamic supervisor features. The purpose of the check is to compare the size reported by the CPU with the size of the XSAVE buffer, which is calculated by the software. If the software mucks with the number from hardware, it removes the value of the check. This is not a good option. - Prevent the hardware from counting the size of the dynamic supervisor feature by temporarily removing the corresponding bits in IA32_XSS. Two extra MSR writes are required to flip the IA32_XSS.
The option is not pretty, but it is workable. The check is only called once at early boot time. There is no need to worry about synchronization or context switching. This option is implemented here. - Remove the check entirely, because the check hasn't found any real problems. This option may be an alternative to option 2. This option is not implemented here. Add a new function, get_xsaves_size_no_dynamic(), which retrieves the total size without the dynamic supervisor features from the hardware. The size will be used to compare with the size of task->fpu. Fixes: f0dccc9da4c0 ("x86/fpu/xstate: Support dynamic supervisor feature for LBR") Reported-by: Chang S. Bae Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Reviewed-by: Dave Hansen Link: https://lore.kernel.org/r/1595253051-75374-1-git-send-email-kan.li...@linux.intel.com --- arch/x86/kernel/fpu/xstate.c | 33 - 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index be2a68a..6073e34 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -611,6 +611,10 @@ static void check_xstate_against_struct(int nr) * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating * it to be safe. + * + * Dynamic XSAVE features allocate their own buffers and are not + * covered by these checks. Only the size of the buffer for task->fpu + * is checked here. */ static void do_extra_xstate_size_checks(void) { @@ -673,6 +677,33 @@ static unsigned int __init get_xsaves_size(void) return ebx; } +/* + * Get the total size of the enabled xstates without the dynamic supervisor + * features. + */ +static unsigned int __init get_xsaves_size_no_dynamic(void) +{ + u64 mask = xfeatures_mask_dynamic(); + unsigned int size; + + if (!mask) + return get_xsaves_size(); + + /* Disable dynamic features.
*/ + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + + /* +* Ask the hardware what size is required of the buffer. +* This is the size required for the task->fpu buffer. +*/ + size = get_xsaves_size(); + + /* Re-enable dynamic features so XSAVES will work on them again. */ + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); + + return size; +} + static unsigned int __init get_xsave_size(void) { unsigned int eax, ebx, ecx, edx; @@ -710,7 +741,7 @@ static int __init init_xstate_size(void) xsave_size = get_xsave_size(); if (boot_cpu_has(X86_FEATURE_XSAVES)) - possible_xstate_size = get_xsaves_size(); + possible_xstate_size = get_xsaves_size_no_dynamic(); else possible_xstate_size = xsave_size;
[tip: perf/core] x86/fpu: Use proper mask to replace full instruction mask
The following commit has been merged into the perf/core branch of tip: Commit-ID: a063bf249b9f8d8004f282031781322c1b527d13 Gitweb: https://git.kernel.org/tip/a063bf249b9f8d8004f282031781322c1b527d13 Author:Kan Liang AuthorDate:Fri, 03 Jul 2020 05:49:25 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:56 +02:00 x86/fpu: Use proper mask to replace full instruction mask When saving xstate to a kernel/user XSAVE area with the XSAVE family of instructions, the current code applies the 'full' instruction mask (-1), which tries to XSAVE all possible features. This method relies on hardware to trim 'all possible' down to what is enabled in the hardware. The code works well for now. However, there will be a problem, if some features are enabled in hardware, but are not suitable to be saved into all kernel XSAVE buffers, like task->fpu, due to performance consideration. One such example is the Last Branch Records (LBR) state. The LBR state only contains valuable information when LBR is explicitly enabled by the perf subsystem, and the size of an LBR state is large (808 bytes for now). To avoid both CPU overhead and space overhead at each context switch, the LBR state should not be saved into task->fpu like other state components. It should be saved/restored on demand when LBR is enabled in the perf subsystem. Current copy_xregs_to_* will trigger a buffer overflow for such cases. Three sites use the '-1' instruction mask which must be updated. Two are saving/restoring the xstate to/from a kernel-allocated XSAVE buffer and can use 'xfeatures_mask_all', which will save/restore all of the features present in a normal task FPU buffer. The last one saves the register state directly to a user buffer. It could also use 'xfeatures_mask_all'. Just as it was with the '-1' argument, any supervisor states in the mask will be filtered out by the hardware and not saved to the buffer. 
But, to be more explicit about what is expected to be saved, use xfeatures_mask_user() for the instruction mask. KVM includes the header file fpu/internal.h. To avoid 'undefined xfeatures_mask_all' compiling issue, move copy_fpregs_to_fpstate() to fpu/core.c and export it, because: - The xfeatures_mask_all is indirectly used via copy_fpregs_to_fpstate() by KVM. The function which is directly used by other modules should be exported. - The copy_fpregs_to_fpstate() is a function, while xfeatures_mask_all is a variable for the "internal" FPU state. It's safer to export a function than a variable, which may be implicitly changed by others. - The copy_fpregs_to_fpstate() is a big function with many checks. The removal of the inline keyword should not impact the performance. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-20-git-send-email-kan.li...@linux.intel.com --- arch/x86/include/asm/fpu/internal.h | 47 arch/x86/kernel/fpu/core.c | 39 +++- 2 files changed, 46 insertions(+), 40 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 42159f4..d3724dc 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -274,7 +274,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) */ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) { - u64 mask = -1; + u64 mask = xfeatures_mask_all; u32 lmask = mask; u32 hmask = mask >> 32; int err; @@ -320,7 +320,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) */ static inline void copy_xregs_to_kernel(struct xregs_state *xstate) { - u64 mask = -1; + u64 mask = xfeatures_mask_all; u32 lmask = mask; u32 hmask = mask >> 32; int err; @@ -356,6 +356,9 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) */ static inline int copy_xregs_to_user(struct xregs_state __user *buf) { + 
u64 mask = xfeatures_mask_user(); + u32 lmask = mask; + u32 hmask = mask >> 32; int err; /* @@ -367,7 +370,7 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf) return -EFAULT; stac(); - XSTATE_OP(XSAVE, buf, -1, -1, err); + XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); return err; @@ -408,43 +411,7 @@ static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) return err; } -/* - * These must be called with preempt disabled. Returns - * 'true' if the FPU state is still intact and we can - * keep registers active. - * - * The legacy FNSAVE instruction cleared all FPU state - * unconditionally, so registers are essentially destroyed. - * Modern FPU state can be kept in registers, if there are - * no pending FP exceptions. -
[tip: perf/core] perf/x86/intel/lbr: Mark the {rd,wr}lbr_{to,from} wrappers __always_inline
The following commit has been merged into the perf/core branch of tip: Commit-ID: 020d91e5f32da4f4b929b3a6e680135fd526107c Gitweb: https://git.kernel.org/tip/020d91e5f32da4f4b929b3a6e680135fd526107c Author:Kan Liang AuthorDate:Fri, 03 Jul 2020 05:49:17 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:53 +02:00 perf/x86/intel/lbr: Mark the {rd,wr}lbr_{to,from} wrappers __always_inline The {rd,wr}lbr_{to,from} wrappers are invoked in hot paths, e.g. context switch and NMI handler. They should be always inline to achieve better performance. However, the CONFIG_OPTIMIZE_INLINING allows the compiler to uninline functions marked 'inline'. Mark the {rd,wr}lbr_{to,from} wrappers as __always_inline to force inline the wrappers. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-12-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/lbr.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index b8baaf1..21f4f07 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -332,18 +332,18 @@ static u64 lbr_from_signext_quirk_rd(u64 val) return val; } -static inline void wrlbr_from(unsigned int idx, u64 val) +static __always_inline void wrlbr_from(unsigned int idx, u64 val) { val = lbr_from_signext_quirk_wr(val); wrmsrl(x86_pmu.lbr_from + idx, val); } -static inline void wrlbr_to(unsigned int idx, u64 val) +static __always_inline void wrlbr_to(unsigned int idx, u64 val) { wrmsrl(x86_pmu.lbr_to + idx, val); } -static inline u64 rdlbr_from(unsigned int idx) +static __always_inline u64 rdlbr_from(unsigned int idx) { u64 val; @@ -352,7 +352,7 @@ static inline u64 rdlbr_from(unsigned int idx) return lbr_from_signext_quirk_rd(val); } -static inline u64 rdlbr_to(unsigned int idx) +static __always_inline u64 rdlbr_to(unsigned int idx) { u64 val;
[tip: perf/core] x86/fpu/xstate: Support dynamic supervisor feature for LBR
The following commit has been merged into the perf/core branch of tip: Commit-ID: f0dccc9da4c0fda049e99326f85db8c242fd781f Gitweb: https://git.kernel.org/tip/f0dccc9da4c0fda049e99326f85db8c242fd781f Author:Kan Liang AuthorDate:Fri, 03 Jul 2020 05:49:26 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:56 +02:00 x86/fpu/xstate: Support dynamic supervisor feature for LBR Last Branch Records (LBR) registers are used to log taken branches and other control flows. In perf with call stack mode, LBR information is used to reconstruct a call stack. To get the complete call stack, perf has to save/restore all LBR registers during a context switch. Due to the large number of the LBR registers, e.g., the current platform has 96 LBR registers, this process causes a high CPU overhead. To reduce the CPU overhead during a context switch, an LBR state component that contains all the LBR related registers is introduced in hardware. All LBR registers can be saved/restored together using one XSAVES/XRSTORS instruction. However, the kernel should not save/restore the LBR state component at each context switch, like other state components, because of the following unique features of LBR: - The LBR state component only contains valuable information when LBR is enabled in the perf subsystem, but for most of the time, LBR is disabled. - The size of the LBR state component is huge. For the current platform, it's 808 bytes. If the kernel saves/restores the LBR state at each context switch, for most of the time, it is just a waste of space and cycles. To efficiently support the LBR state component, it is desired to have: - only context-switch the LBR when the LBR feature is enabled in perf. - only allocate an LBR-specific XSAVE buffer on demand. (Besides the LBR state, a legacy region and an XSAVE header have to be included in the buffer as well. There is a total of (808+576) byte overhead for the LBR-specific XSAVE buffer. 
The overhead only happens when the perf is actively using LBRs. There is still a space-saving, on average, when it replaces the constant 808 bytes of overhead for every task, all the time on the systems that support architectural LBR.) - be able to use XSAVES/XRSTORS for accessing LBR at run time. However, the IA32_XSS should not be adjusted at run time. (The XCR0 | IA32_XSS are used to determine the requested-feature bitmap (RFBM) of XSAVES.) A solution, called dynamic supervisor feature, is introduced to address this issue, which - does not allocate a buffer in each task->fpu; - does not save/restore a state component at each context switch; - sets the bit corresponding to the dynamic supervisor feature in IA32_XSS at boot time, and avoids setting it at run time. - dynamically allocates a specific buffer for a state component on demand, e.g. only allocates LBR-specific XSAVE buffer when LBR is enabled in perf. (Note: The buffer has to include the LBR state component, a legacy region and a XSAVE header space.) (Implemented in a later patch) - saves/restores a state component on demand, e.g. manually invokes the XSAVES/XRSTORS instruction to save/restore the LBR state to/from the buffer when perf is active and a call stack is required. (Implemented in a later patch) A new mask XFEATURE_MASK_DYNAMIC and a helper xfeatures_mask_dynamic() are introduced to indicate the dynamic supervisor feature. For the systems which support the Architecture LBR, LBR is the only dynamic supervisor feature for now. For the previous systems, there is no dynamic supervisor feature available. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-21-git-send-email-kan.li...@linux.intel.com --- arch/x86/include/asm/fpu/types.h | 7 +++- arch/x86/include/asm/fpu/xstate.h | 30 ++- arch/x86/kernel/fpu/xstate.c | 15 ++- 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index f098f6c..132e9cc 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -114,6 +114,12 @@ enum xfeature { XFEATURE_Hi16_ZMM, XFEATURE_PT_UNIMPLEMENTED_SO_FAR, XFEATURE_PKRU, + XFEATURE_RSRVD_COMP_10, + XFEATURE_RSRVD_COMP_11, + XFEATURE_RSRVD_COMP_12, + XFEATURE_RSRVD_COMP_13, + XFEATURE_RSRVD_COMP_14, + XFEATURE_LBR, XFEATURE_MAX, }; @@ -128,6 +134,7 @@ enum xfeature { #define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) #define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) +#define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) #define XFEATURE_MASK_FPSSE(XFEATURE_MASK_FP | XFEATURE_MASK_SSE) #define XFEATURE_MASK_AVX512
[tip: perf/core] perf/x86/intel/lbr: Create kmem_cache for the LBR context data
The following commit has been merged into the perf/core branch of tip: Commit-ID: 33cad284497cf40f55ad6029c06011de3538ebed Gitweb: https://git.kernel.org/tip/33cad284497cf40f55ad6029c06011de3538ebed Author:Kan Liang AuthorDate:Fri, 03 Jul 2020 05:49:23 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:55 +02:00 perf/x86/intel/lbr: Create kmem_cache for the LBR context data A new kmem_cache method is introduced to allocate the PMU specific data task_ctx_data, which requires the PMU specific code to create a kmem_cache. Currently, the task_ctx_data is only used by the Intel LBR call stack feature, which is introduced since Haswell. The kmem_cache should be only created for Haswell and later platforms. There is no alignment requirement for the existing platforms. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-18-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/lbr.c | 21 +++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index e4e249a..e784c1d 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1531,9 +1531,17 @@ void __init intel_pmu_lbr_init_snb(void) */ } +static inline struct kmem_cache * +create_lbr_kmem_cache(size_t size, size_t align) +{ + return kmem_cache_create("x86_lbr", size, align, 0, NULL); +} + /* haswell */ void intel_pmu_lbr_init_hsw(void) { + size_t size = sizeof(struct x86_perf_task_context); + x86_pmu.lbr_nr = 16; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; @@ -1542,6 +1550,8 @@ void intel_pmu_lbr_init_hsw(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + if (lbr_from_signext_quirk_needed()) static_branch_enable(&lbr_from_quirk_key); } @@ -1549,6 +1559,8 @@ void intel_pmu_lbr_init_hsw(void) /* skylake */ __init void 
intel_pmu_lbr_init_skl(void) { + size_t size = sizeof(struct x86_perf_task_context); + x86_pmu.lbr_nr = 32; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; @@ -1558,6 +1570,8 @@ __init void intel_pmu_lbr_init_skl(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + /* * SW branch filter usage: * - support syscall, sysret capture. @@ -1631,6 +1645,7 @@ void __init intel_pmu_arch_lbr_init(void) union cpuid28_ebx ebx; union cpuid28_ecx ecx; unsigned int unused_edx; + size_t size; u64 lbr_nr; /* Arch LBR Capabilities */ @@ -1655,8 +1670,10 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_br_type = ecx.split.lbr_br_type; x86_pmu.lbr_nr = lbr_nr; - x86_get_pmu()->task_ctx_size = sizeof(struct x86_perf_task_context_arch_lbr) + - lbr_nr * sizeof(struct lbr_entry); + size = sizeof(struct x86_perf_task_context_arch_lbr) + + lbr_nr * sizeof(struct lbr_entry); + x86_get_pmu()->task_ctx_size = size; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0;
[tip: perf/core] perf/x86/intel/lbr: Support XSAVES for arch LBR read
The following commit has been merged into the perf/core branch of tip: Commit-ID: c085fb8774671e83f6199a8e838fbc0e57094029 Gitweb: https://git.kernel.org/tip/c085fb8774671e83f6199a8e838fbc0e57094029 Author:Kan Liang AuthorDate:Fri, 03 Jul 2020 05:49:29 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:57 +02:00 perf/x86/intel/lbr: Support XSAVES for arch LBR read Reading LBR registers in a perf NMI handler for a non-PEBS event causes a high overhead because the number of LBR registers is huge. To reduce the overhead, the XSAVES instruction should be used to replace the LBR registers' reading method. The XSAVES buffer used for LBR read has to be per-CPU because the NMI handler invokes lbr_read(). The existing task_ctx_data buffer cannot be used because it is per-task and is only allocated for the LBR call stack mode. A new lbr_xsave pointer is introduced in the cpu_hw_events as an XSAVES buffer for LBR read. The XSAVES buffer should be allocated only when LBR is used by a non-PEBS event on the CPU because the total size of the lbr_xsave is not small (~1.4KB). The XSAVES buffer is allocated when a non-PEBS event is added, but it is lazily released in x86_release_hardware() when perf releases the entire PMU hardware resource, because perf may frequently schedule the event, e.g. under a high context-switch rate. The lazy release method reduces the overhead of frequently allocating/freeing the buffer. If the lbr_xsave fails to be allocated, fall back to the normal Arch LBR lbr_read().
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-24-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/core.c | 1 +- arch/x86/events/intel/lbr.c | 40 ++- arch/x86/events/perf_event.h | 7 ++- 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6b1228a..1cbf57d 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -358,6 +358,7 @@ void x86_release_hardware(void) if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { release_pmc_hardware(); release_ds_buffers(); + release_lbr_buffers(); mutex_unlock(&pmc_reserve_mutex); } } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index cb1a049..63f58bd 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -658,6 +658,7 @@ static inline bool branch_user_callstack(unsigned br_sel) void intel_pmu_lbr_add(struct perf_event *event) { + struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (!x86_pmu.lbr_nr) @@ -695,6 +696,29 @@ void intel_pmu_lbr_add(struct perf_event *event) perf_sched_cb_inc(event->ctx->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) intel_pmu_lbr_reset(); + + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + kmem_cache && !cpuc->lbr_xsave && + (cpuc->lbr_users != cpuc->lbr_pebs_users)) + cpuc->lbr_xsave = kmem_cache_alloc(kmem_cache, GFP_KERNEL); +} + +void release_lbr_buffers(void) +{ + struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache; + struct cpu_hw_events *cpuc; + int cpu; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + return; + + for_each_possible_cpu(cpu) { + cpuc = per_cpu_ptr(&cpu_hw_events, cpu); + if (kmem_cache && cpuc->lbr_xsave) { + kmem_cache_free(kmem_cache, cpuc->lbr_xsave); + cpuc->lbr_xsave = NULL; + } + } } void intel_pmu_lbr_del(struct perf_event *event) @@ -945,6 +969,19 
@@ static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc) intel_pmu_store_lbr(cpuc, NULL); } +static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc) +{ + struct x86_perf_task_context_arch_lbr_xsave *xsave = cpuc->lbr_xsave; + + if (!xsave) { + intel_pmu_store_lbr(cpuc, NULL); + return; + } + copy_dynamic_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR); + + intel_pmu_store_lbr(cpuc, xsave->lbr.entries); +} + void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1767,14 +1804,15 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_ctl_map = NULL; x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset; - x86_pmu.lbr_read = intel_pmu_arch_lbr_read; if (arch_lbr_xsave) { x86_pmu.lbr_save = intel_pmu_arch_lbr_xsaves; x86_pmu.lbr_restore = intel_pmu_arch_lbr_xrstors; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read_xsave; pr_cont("XSAVE
[tip: perf/core] perf/x86/intel/lbr: Support XSAVES/XRSTORS for LBR context switch
The following commit has been merged into the perf/core branch of tip: Commit-ID: ce711ea3cab9ad325d849792d442848e553095b8 Gitweb: https://git.kernel.org/tip/ce711ea3cab9ad325d849792d442848e553095b8 Author:Kan Liang AuthorDate:Fri, 03 Jul 2020 05:49:28 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:56 +02:00 perf/x86/intel/lbr: Support XSAVES/XRSTORS for LBR context switch In the LBR call stack mode, LBR information is used to reconstruct a call stack. To get the complete call stack, perf has to save/restore all LBR registers during a context switch. Due to the large number of LBR registers, this process causes a high CPU overhead. To reduce the CPU overhead during a context switch, use the XSAVES/XRSTORS instructions. Every XSAVE area must follow a canonical format: the legacy region, an XSAVE header and the extended region. Although the LBR information is only kept in the extended region, space for the legacy region and XSAVE header is still required. Add a new dedicated structure for LBR XSAVES support. Before enabling XSAVES support, the size of the LBR state has to be sanity checked, because: - the size of the software structure is calculated from the max number of the LBR depth, which is enumerated by the CPUID leaf for Arch LBR. The size of the LBR state is enumerated by the CPUID leaf for XSAVE support of Arch LBR. If the values from the two CPUID leaves are not consistent, it may trigger a buffer overflow. For example, a hypervisor may inadvertently set inconsistent values for the two emulated CPUID leaves. - unlike other state components, the size of an LBR state depends on the max number of LBRs, which may vary from generation to generation. Expose the function xfeature_size() for the sanity check. The LBR XSAVES support will be disabled if the size of the LBR state enumerated by CPUID doesn't match the size of the software structure. The XSAVE instruction requires 64-byte alignment for state buffers.
A new macro is added to reflect the alignment requirement. A 64-byte aligned kmem_cache is created for architecture LBR. Currently, the structure for each state component is maintained in fpu/types.h. The structure for the new LBR state component should be maintained in the same place. Move structure lbr_entry to fpu/types.h as well for broader sharing. Add dedicated lbr_save/lbr_restore functions for LBR XSAVES support, which invokes the corresponding xstate helpers to XSAVES/XRSTORS LBR information at the context switch when the call stack mode is enabled. Since the XSAVES/XRSTORS instructions will be eventually invoked, the dedicated functions is named with '_xsaves'/'_xrstors' postfix. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-23-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/lbr.c | 79 -- arch/x86/events/perf_event.h | 21 - arch/x86/include/asm/fpu/types.h | 20 - arch/x86/include/asm/fpu/xstate.h | 3 +- arch/x86/include/asm/perf_event.h | 4 +-- arch/x86/kernel/fpu/xstate.c | 2 +- 6 files changed, 119 insertions(+), 10 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 3ad5289..cb1a049 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -483,6 +483,17 @@ static void intel_pmu_arch_lbr_restore(void *ctx) } } +/* + * Restore the Architecture LBR state from the xsave area in the perf + * context data for the task via the XRSTORS instruction. 
+ */ +static void intel_pmu_arch_lbr_xrstors(void *ctx) +{ + struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; + + copy_kernel_to_dynamic_supervisor(&task_ctx->xsave, XFEATURE_MASK_LBR); +} + static __always_inline bool lbr_is_reset_in_cstate(void *ctx) { if (static_cpu_has(X86_FEATURE_ARCH_LBR)) @@ -557,6 +568,17 @@ static void intel_pmu_arch_lbr_save(void *ctx) entries[x86_pmu.lbr_nr - 1].from = 0; } +/* + * Save the Architecture LBR state to the xsave area in the perf + * context data for the task via the XSAVES instruction. + */ +static void intel_pmu_arch_lbr_xsaves(void *ctx) +{ + struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; + + copy_dynamic_supervisor_to_kernel(&task_ctx->xsave, XFEATURE_MASK_LBR); +} + static void __intel_pmu_lbr_save(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1639,12 +1661,40 @@ void intel_pmu_lbr_init_knl(void) x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } +/* + * LBR state size is variable based on the max number of registers. + * This calculates the expected state size, which should match + * what the hardware enumerates for the size of XFEATURE_LBR. + */ +static inline unsigned int get_lbr_state_size(void) +{ + return sizeof(struct arch_lbr_state) +
[tip: perf/core] perf/core: Use kmem_cache to allocate the PMU specific data
The following commit has been merged into the perf/core branch of tip: Commit-ID: 217c2a633ebb36f1cc6d249f4ef2e4a809d46818 Gitweb: https://git.kernel.org/tip/217c2a633ebb36f1cc6d249f4ef2e4a809d46818 Author:Kan Liang AuthorDate:Fri, 03 Jul 2020 05:49:22 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:55 +02:00 perf/core: Use kmem_cache to allocate the PMU specific data Currently, the PMU specific data task_ctx_data is allocated by the function kzalloc() in the perf generic code. As long as there is no specific alignment requirement for the task_ctx_data, this method works well. However, there will be a problem once a specific alignment requirement is introduced in future features, e.g., the Architecture LBR XSAVE feature requires 64-byte alignment. If the specific alignment requirement is not fulfilled, the XSAVE family of instructions will fail to save/restore the xstate to/from the task_ctx_data. The function kzalloc() itself only guarantees a natural alignment. A new method to allocate the task_ctx_data has to be introduced, which has to meet the following requirements: - must be a generic method that can be used by different architectures, because the allocation of the task_ctx_data is implemented in the perf generic code; - must guarantee the alignment (the alignment requirement does not change after boot); - must be able to allocate/free a buffer (smaller than a page size) dynamically; - should not cause extra CPU overhead or space overhead. The following options were considered: - One option is to allocate a larger buffer for task_ctx_data. E.g., ptr = kmalloc(size + alignment, GFP_KERNEL); ptr &= ~(alignment - 1); This option causes space overhead. - Another option is to allocate the task_ctx_data in the PMU specific code. To do so, several function pointers have to be added. As a result, both the generic structure and the PMU specific structure will become bigger.
Besides, extra function calls are added when allocating/freeing the buffer. This option will increase both the space overhead and CPU overhead. - The third option is to use a kmem_cache to allocate a buffer for the task_ctx_data. The kmem_cache can be created with a specific alignment requirement by the PMU at boot time. A new pointer for kmem_cache has to be added in the generic struct pmu, which would be used to dynamically allocate a buffer for the task_ctx_data at run time. Although the new pointer is added to the struct pmu, the existing variable task_ctx_size is not required anymore. The size of the generic structure is kept the same. The third option which meets all the aforementioned requirements is used to replace kzalloc() for the PMU specific data allocation. A later patch will remove the kzalloc() method and the related variables. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-17-git-send-email-kan.li...@linux.intel.com --- include/linux/perf_event.h | 5 + kernel/events/core.c | 8 +++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 46fe5cf..09915ae 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -425,6 +425,11 @@ struct pmu { size_t task_ctx_size; /* +* Kmem cache of PMU specific data +*/ + struct kmem_cache *task_ctx_cache; + + /* * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) * can be synchronized using this function. 
See Intel LBR callstack support * implementation and Perf core context switch handling callbacks for usage diff --git a/kernel/events/core.c b/kernel/events/core.c index 7509040..30d9b31 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1240,12 +1240,18 @@ static void get_ctx(struct perf_event_context *ctx) static void *alloc_task_ctx_data(struct pmu *pmu) { + if (pmu->task_ctx_cache) + return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); + return kzalloc(pmu->task_ctx_size, GFP_KERNEL); } static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { - kfree(task_ctx_data); + if (pmu->task_ctx_cache && task_ctx_data) + kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); + else + kfree(task_ctx_data); } static void free_ctx(struct rcu_head *head)
[tip: perf/core] x86/fpu/xstate: Add helpers for LBR dynamic supervisor feature
The following commit has been merged into the perf/core branch of tip: Commit-ID: 50f408d96d4d1a945d2c50c5fd8ed400883edf0e Gitweb: https://git.kernel.org/tip/50f408d96d4d1a945d2c50c5fd8ed400883edf0e Author:Kan Liang AuthorDate:Fri, 03 Jul 2020 05:49:27 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:56 +02:00 x86/fpu/xstate: Add helpers for LBR dynamic supervisor feature The perf subsystem will only need to save/restore the LBR state. However, the existing helpers save all supported supervisor states to a kernel buffer, which will be unnecessary. Two helpers are introduced to only save/restore requested dynamic supervisor states. The supervisor features in XFEATURE_MASK_SUPERVISOR_SUPPORTED and XFEATURE_MASK_SUPERVISOR_UNSUPPORTED mask cannot be saved/restored using these helpers. The helpers will be used in the following patch. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-22-git-send-email-kan.li...@linux.intel.com --- arch/x86/include/asm/fpu/xstate.h | 3 +- arch/x86/kernel/fpu/xstate.c | 72 ++- 2 files changed, 75 insertions(+) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 040c4d4..c029fce 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -106,6 +106,9 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); void copy_supervisor_to_kernel(struct xregs_state *xsave); +void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); +void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); + /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ int validate_user_xstate_header(const struct xstate_header *hdr); diff --git 
a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index dcf0624..b0c22b7 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1361,6 +1361,78 @@ void copy_supervisor_to_kernel(struct xregs_state *xstate) } } +/** + * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to + * an xsave area + * @xstate: A pointer to an xsave area + * @mask: Represent the dynamic supervisor features saved into the xsave area + * + * Only the dynamic supervisor states sets in the mask are saved into the xsave + * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic + * supervisor feature). Besides the dynamic supervisor states, the legacy + * region and XSAVE header are also saved into the xsave area. The supervisor + * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and + * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved. + * + * The xsave area must be 64-bytes aligned. + */ +void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) +{ + u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u32 lmask, hmask; + int err; + + if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (WARN_ON_FPU(!dynamic_mask)) + return; + + lmask = dynamic_mask; + hmask = dynamic_mask >> 32; + + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); + + /* Should never fault when copying to a kernel buffer */ + WARN_ON_FPU(err); +} + +/** + * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from + * an xsave area + * @xstate: A pointer to an xsave area + * @mask: Represent the dynamic supervisor features restored from the xsave area + * + * Only the dynamic supervisor states sets in the mask are restored from the + * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of + * dynamic supervisor feature). Besides the dynamic supervisor states, the + * legacy region and XSAVE header are also restored from the xsave area. 
The + * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and + * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored. + * + * The xsave area must be 64-bytes aligned. + */ +void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask) +{ + u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u32 lmask, hmask; + int err; + + if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (WARN_ON_FPU(!dynamic_mask)) + return; + + lmask = dynamic_mask; + hmask = dynamic_mask >> 32; + + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + + /* Should never fault when copying from a kernel buffer */ + WARN_ON_FPU(err); +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the
[tip: perf/core] perf/x86/intel/lbr: Factor out intel_pmu_store_lbr
The following commit has been merged into the perf/core branch of tip: Commit-ID: 631618a0dca31dc23dcce38cf345c6139bd8a1e9 Gitweb: https://git.kernel.org/tip/631618a0dca31dc23dcce38cf345c6139bd8a1e9 Author: Kan Liang AuthorDate: Fri, 03 Jul 2020 05:49:19 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:54 +02:00 perf/x86/intel/lbr: Factor out intel_pmu_store_lbr The way to store the LBR information from a PEBS LBR record can be reused in Architecture LBR, because - The LBR information is stored like a stack. Entry 0 is always the youngest branch. - The layout of the LBR INFO MSR is similar. The LBR information may be retrieved from either the LBR registers (non-PEBS event) or a buffer (PEBS event). Extend rdlbr_*() to support both methods. Explicitly check for an invalid entry (0s), which can avoid unnecessary MSR access if using a non-PEBS event. For a PEBS event, the check should slightly improve the performance as well. The invalid entries are cut, so intel_pmu_lbr_filter() doesn't need to check and filter them out. The function cannot be shared with the current model-specific LBR read, because the direction of the LBR growth is opposite.
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-14-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/lbr.c | 82 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index d3d129c..0d7a859 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -348,28 +348,37 @@ static __always_inline void wrlbr_info(unsigned int idx, u64 val) wrmsrl(x86_pmu.lbr_info + idx, val); } -static __always_inline u64 rdlbr_from(unsigned int idx) +static __always_inline u64 rdlbr_from(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->from; + rdmsrl(x86_pmu.lbr_from + idx, val); return lbr_from_signext_quirk_rd(val); } -static __always_inline u64 rdlbr_to(unsigned int idx) +static __always_inline u64 rdlbr_to(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->to; + rdmsrl(x86_pmu.lbr_to + idx, val); return val; } -static __always_inline u64 rdlbr_info(unsigned int idx) +static __always_inline u64 rdlbr_info(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->info; + rdmsrl(x86_pmu.lbr_info + idx, val); return val; @@ -387,16 +396,16 @@ wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) static inline bool rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) { - u64 from = rdlbr_from(idx); + u64 from = rdlbr_from(idx, NULL); /* Don't read invalid entry */ if (!from) return false; lbr->from = from; - lbr->to = rdlbr_to(idx); + lbr->to = rdlbr_to(idx, NULL); if (need_info) - lbr->info = rdlbr_info(idx); + lbr->info = rdlbr_info(idx, NULL); return true; } @@ -432,7 +441,7 @@ void intel_pmu_lbr_restore(void *ctx) static __always_inline bool lbr_is_reset_in_cstate(void *ctx) { - return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos); + return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL); } static 
void __intel_pmu_lbr_restore(void *ctx) @@ -709,8 +718,8 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) u16 cycles = 0; int lbr_flags = lbr_desc[lbr_format]; - from = rdlbr_from(lbr_idx); - to = rdlbr_to(lbr_idx); + from = rdlbr_from(lbr_idx, NULL); + to = rdlbr_to(lbr_idx, NULL); /* * Read LBR call stack entries @@ -722,7 +731,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; - info = rdlbr_info(lbr_idx); + info = rdlbr_info(lbr_idx, NULL); mis = !!(info & LBR_INFO_MISPRED); pred = !mis; in_tx = !!(info & LBR_INFO_IN_TX); @@ -777,6 +786,42 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_stack.hw_idx = tos; } +static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, + struct lbr_entry *entries) +{ + struct perf_branch_entry *e; + struct lbr_entry *lbr; + u64 from, to, info; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + lbr = entries ? &entries[i] : NULL; + e = &cpuc->lbr_entries[i]; + + from = rdlbr_from(i, lbr); +
[tip: perf/core] perf/x86/intel/lbr: Factor out a new struct for generic optimization
The following commit has been merged into the perf/core branch of tip: Commit-ID: 530bfff6480307d210734222a54d56af7f908957 Gitweb: https://git.kernel.org/tip/530bfff6480307d210734222a54d56af7f908957 Author: Kan Liang AuthorDate: Fri, 03 Jul 2020 05:49:11 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:52 +02:00 perf/x86/intel/lbr: Factor out a new struct for generic optimization To reduce the overhead of a context switch with LBR enabled, some generic optimizations were introduced, e.g. avoiding restoring the LBR registers if no one else touched them. The generic optimizations can also be used by Architecture LBR later. Currently, the fields for the generic optimizations are part of structure x86_perf_task_context, which will be deprecated by Architecture LBR. A new structure should be introduced for the common fields of the generic optimizations, which can be shared between Architecture LBR and model-specific LBR. Both 'valid_lbrs' and 'tos' are also used by the generic optimizations, but they are not moved into the new structure, because Architecture LBR is stack-like. The 'valid_lbrs', which records the index of the valid LBR, is not required anymore. The TOS MSR will be removed. LBR registers may be cleared in the deep Cstate. If so, the generic optimizations should not be applied. Perf has to unconditionally restore the LBR registers. A generic function is required to detect the reset due to the deep Cstate. lbr_is_reset_in_cstate() is introduced. Currently, for the model-specific LBR, the TOS MSR is used to detect the reset. There will be another method introduced for Architecture LBR later.
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-6-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/lbr.c | 38 +++ arch/x86/events/perf_event.h | 10 ++--- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index b2b8dc9..bba9939 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -355,33 +355,37 @@ void intel_pmu_lbr_restore(void *ctx) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } +static __always_inline bool +lbr_is_reset_in_cstate(struct x86_perf_task_context *task_ctx) +{ + return !rdlbr_from(task_ctx->tos); +} + static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - u64 tos; - if (task_ctx->lbr_callstack_users == 0 || - task_ctx->lbr_stack_state == LBR_NONE) { + if (task_ctx->opt.lbr_callstack_users == 0 || + task_ctx->opt.lbr_stack_state == LBR_NONE) { intel_pmu_lbr_reset(); return; } - tos = task_ctx->tos; /* * Does not restore the LBR registers, if * - No one else touched them, and -* - Did not enter C6 +* - Was not cleared in Cstate */ if ((task_ctx == cpuc->last_task_ctx) && - (task_ctx->log_id == cpuc->last_log_id) && - rdlbr_from(tos)) { - task_ctx->lbr_stack_state = LBR_NONE; + (task_ctx->opt.log_id == cpuc->last_log_id) && + !lbr_is_reset_in_cstate(task_ctx)) { + task_ctx->opt.lbr_stack_state = LBR_NONE; return; } x86_pmu.lbr_restore(task_ctx); - task_ctx->lbr_stack_state = LBR_NONE; + task_ctx->opt.lbr_stack_state = LBR_NONE; } void intel_pmu_lbr_save(void *ctx) @@ -415,17 +419,17 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (task_ctx->lbr_callstack_users == 0) { - task_ctx->lbr_stack_state = LBR_NONE; + if (task_ctx->opt.lbr_callstack_users == 0) { + task_ctx->opt.lbr_stack_state = LBR_NONE; return; 
} x86_pmu.lbr_save(task_ctx); - task_ctx->lbr_stack_state = LBR_VALID; + task_ctx->opt.lbr_stack_state = LBR_VALID; cpuc->last_task_ctx = task_ctx; - cpuc->last_log_id = ++task_ctx->log_id; + cpuc->last_log_id = ++task_ctx->opt.log_id; } void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, @@ -447,8 +451,8 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, if (!prev_ctx_data || !next_ctx_data) return; - swap(prev_ctx_data->lbr_callstack_users, -next_ctx_data->lbr_callstack_users); + swap(prev_ctx_data->opt.lbr_callstack_users, +next_ctx_data->opt.lbr_callstack_users); } void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) @@ -503,7 +507,7 @@ void intel_pmu_lbr_add(struct perf_event *event) if (branch_user_callstack(cpuc->br_se
[tip: perf/core] perf/x86/intel/lbr: Add the function pointers for LBR save and restore
The following commit has been merged into the perf/core branch of tip: Commit-ID: 799571bf38fc2b4b744fa448184b5915739b10fd Gitweb: https://git.kernel.org/tip/799571bf38fc2b4b744fa448184b5915739b10fd Author:Kan Liang AuthorDate:Fri, 03 Jul 2020 05:49:10 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:52 +02:00 perf/x86/intel/lbr: Add the function pointers for LBR save and restore The MSRs of Architectural LBR are different from previous model-specific LBR. Perf has to implement different functions to save and restore them. The function pointers for LBR save and restore are introduced. Perf should initialize the corresponding functions at boot time. The generic optimizations, e.g. avoiding restore LBR if no one else touched them, still apply for Architectural LBRs. The related codes are not moved to model-specific functions. Current model-specific LBR functions are set as default. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-5-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 4 ++- arch/x86/events/intel/lbr.c | 79 +-- arch/x86/events/perf_event.h | 6 +++- 3 files changed, 59 insertions(+), 30 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 6414b47..50cb3c6 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3981,6 +3981,8 @@ static __initconst const struct x86_pmu core_pmu = { .lbr_reset = intel_pmu_lbr_reset_64, .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore= intel_pmu_lbr_restore, }; static __initconst const struct x86_pmu intel_pmu = { @@ -4029,6 +4031,8 @@ static __initconst const struct x86_pmu intel_pmu = { .lbr_reset = intel_pmu_lbr_reset_64, .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore= intel_pmu_lbr_restore, }; static __init void intel_clovertown_quirk(void) diff --git a/arch/x86/events/intel/lbr.c 
b/arch/x86/events/intel/lbr.c index b8943f4..b2b8dc9 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -323,31 +323,13 @@ static inline u64 rdlbr_to(unsigned int idx) return val; } -static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +void intel_pmu_lbr_restore(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; int i; unsigned lbr_idx, mask; - u64 tos; - - if (task_ctx->lbr_callstack_users == 0 || - task_ctx->lbr_stack_state == LBR_NONE) { - intel_pmu_lbr_reset(); - return; - } - - tos = task_ctx->tos; - /* -* Does not restore the LBR registers, if -* - No one else touched them, and -* - Did not enter C6 -*/ - if ((task_ctx == cpuc->last_task_ctx) && - (task_ctx->log_id == cpuc->last_log_id) && - rdlbr_from(tos)) { - task_ctx->lbr_stack_state = LBR_NONE; - return; - } + u64 tos = task_ctx->tos; mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { @@ -368,24 +350,48 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) } wrmsrl(x86_pmu.lbr_tos, tos); - task_ctx->lbr_stack_state = LBR_NONE; if (cpuc->lbr_select) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } -static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - unsigned lbr_idx, mask; - u64 tos, from; - int i; + u64 tos; - if (task_ctx->lbr_callstack_users == 0) { + if (task_ctx->lbr_callstack_users == 0 || + task_ctx->lbr_stack_state == LBR_NONE) { + intel_pmu_lbr_reset(); + return; + } + + tos = task_ctx->tos; + /* +* Does not restore the LBR registers, if +* - No one else touched them, and +* - Did not enter C6 +*/ + if ((task_ctx == cpuc->last_task_ctx) && + (task_ctx->log_id == cpuc->last_log_id) && + rdlbr_from(tos)) { task_ctx->lbr_stack_state = LBR_NONE; return; } + 
x86_pmu.lbr_restore(task_ctx); + + task_ctx->lbr_stack_state = LBR_NONE; +} + +void intel_pmu_lbr_save(void *ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; + unsigned lbr_idx, mask; +
[tip: perf/core] perf/x86: Expose CPUID enumeration bits for arch LBR
The following commit has been merged into the perf/core branch of tip: Commit-ID: af6cf129706b2f79e12f97e62d977e7f653cdfd1 Gitweb: https://git.kernel.org/tip/af6cf129706b2f79e12f97e62d977e7f653cdfd1 Author:Kan Liang AuthorDate:Fri, 03 Jul 2020 05:49:14 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:53 +02:00 perf/x86: Expose CPUID enumeration bits for arch LBR The LBR capabilities of Architecture LBR are retrieved from the CPUID enumeration once at boot time. The capabilities have to be saved for future usage. Several new fields are added into structure x86_pmu to indicate the capabilities. The fields will be used in the following patches. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-9-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/perf_event.h | 13 ++- arch/x86/include/asm/perf_event.h | 40 ++- 2 files changed, 53 insertions(+) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7dbf148..cc81177 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -693,6 +693,19 @@ struct x86_pmu { boollbr_double_abort; /* duplicated lbr aborts */ boollbr_pt_coexist;/* (LBR|BTS) may coexist with PT */ + /* +* Intel Architectural LBR CPUID Enumeration +*/ + unsigned intlbr_depth_mask:8; + unsigned intlbr_deep_c_reset:1; + unsigned intlbr_lip:1; + unsigned intlbr_cpl:1; + unsigned intlbr_filter:1; + unsigned intlbr_call_stack:1; + unsigned intlbr_mispred:1; + unsigned intlbr_timed_lbr:1; + unsigned intlbr_br_type:1; + void(*lbr_reset)(void); void(*lbr_read)(struct cpu_hw_events *cpuc); void(*lbr_save)(void *ctx); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 2df7073..9ffce7d 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -142,6 +142,46 @@ union cpuid10_edx { unsigned int full; }; +/* + * Intel Architectural LBR CPUID detection/enumeration 
details: + */ +union cpuid28_eax { + struct { + /* Supported LBR depth values */ + unsigned intlbr_depth_mask:8; + unsigned intreserved:22; + /* Deep C-state Reset */ + unsigned intlbr_deep_c_reset:1; + /* IP values contain LIP */ + unsigned intlbr_lip:1; + } split; + unsigned intfull; +}; + +union cpuid28_ebx { + struct { + /* CPL Filtering Supported */ + unsigned intlbr_cpl:1; + /* Branch Filtering Supported */ + unsigned intlbr_filter:1; + /* Call-stack Mode Supported */ + unsigned intlbr_call_stack:1; + } split; + unsigned intfull; +}; + +union cpuid28_ecx { + struct { + /* Mispredict Bit Supported */ + unsigned intlbr_mispred:1; + /* Timed LBRs Supported */ + unsigned intlbr_timed_lbr:1; + /* Branch Type Field Supported */ + unsigned intlbr_br_type:1; + } split; + unsigned intfull; +}; + struct x86_pmu_capability { int version; int num_counters_gp;
[tip: perf/core] perf/core: Factor out functions to allocate/free the task_ctx_data
The following commit has been merged into the perf/core branch of tip: Commit-ID: ff9ff926889dd8026b4ba55266a010c27f68604f Gitweb: https://git.kernel.org/tip/ff9ff926889dd8026b4ba55266a010c27f68604f Author:Kan Liang AuthorDate:Fri, 03 Jul 2020 05:49:21 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:54 +02:00 perf/core: Factor out functions to allocate/free the task_ctx_data The method to allocate/free the task_ctx_data is going to be changed in the following patch. Currently, the task_ctx_data is allocated/freed in several different places. To avoid repeatedly modifying the same codes in several different places, alloc_task_ctx_data() and free_task_ctx_data() are factored out to allocate/free the task_ctx_data. The modification only needs to be applied once. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-16-git-send-email-kan.li...@linux.intel.com --- kernel/events/core.c | 21 +++-- 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 9b8f925..7509040 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1238,12 +1238,22 @@ static void get_ctx(struct perf_event_context *ctx) refcount_inc(&ctx->refcount); } +static void *alloc_task_ctx_data(struct pmu *pmu) +{ + return kzalloc(pmu->task_ctx_size, GFP_KERNEL); +} + +static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) +{ + kfree(task_ctx_data); +} + static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); - kfree(ctx->task_ctx_data); + free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); kfree(ctx); } @@ -4471,7 +4481,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, goto errout; if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); + task_ctx_data = alloc_task_ctx_data(pmu); if (!task_ctx_data) 
{ err = -ENOMEM; goto errout; @@ -4529,11 +4539,11 @@ retry: } } - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ctx; errout: - kfree(task_ctx_data); + free_task_ctx_data(pmu, task_ctx_data); return ERR_PTR(err); } @@ -12497,8 +12507,7 @@ inherit_event(struct perf_event *parent_event, !child_ctx->task_ctx_data) { struct pmu *pmu = child_event->pmu; - child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size, - GFP_KERNEL); + child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); if (!child_ctx->task_ctx_data) { free_event(child_event); return ERR_PTR(-ENOMEM);
[tip: perf/core] perf/x86/intel/lbr: Add a function pointer for LBR reset
The following commit has been merged into the perf/core branch of tip: Commit-ID: 9f354a726cb1d4eb00a0784a27eaa0a3283cff71 Gitweb: https://git.kernel.org/tip/9f354a726cb1d4eb00a0784a27eaa0a3283cff71 Author:Kan Liang AuthorDate:Fri, 03 Jul 2020 05:49:08 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:51 +02:00 perf/x86/intel/lbr: Add a function pointer for LBR reset The method to reset Architectural LBRs is different from previous model-specific LBR. Perf has to implement a different function. A function pointer is introduced for LBR reset. The enum of LBR_FORMAT_* is also moved to perf_event.h. Perf should initialize the corresponding functions at boot time, and avoid checking lbr_format at run time. The current 64-bit LBR reset function is set as default. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-3-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 7 +++ arch/x86/events/intel/lbr.c | 20 +++- arch/x86/events/perf_event.h | 17 + 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 582ddff..fe49e99 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3978,6 +3978,8 @@ static __initconst const struct x86_pmu core_pmu = { .cpu_dead = intel_pmu_cpu_dead, .check_period = intel_pmu_check_period, + + .lbr_reset = intel_pmu_lbr_reset_64, }; static __initconst const struct x86_pmu intel_pmu = { @@ -4023,6 +4025,8 @@ static __initconst const struct x86_pmu intel_pmu = { .check_period = intel_pmu_check_period, .aux_output_match = intel_pmu_aux_output_match, + + .lbr_reset = intel_pmu_lbr_reset_64, }; static __init void intel_clovertown_quirk(void) @@ -4649,6 +4653,9 @@ __init int intel_pmu_init(void) x86_pmu.intel_cap.capabilities = capabilities; } + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + x86_pmu.lbr_reset = intel_pmu_lbr_reset_32; + 
intel_ds_init(); x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index d03de75..7af27a7 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -8,17 +8,6 @@ #include "../perf_event.h" -enum { - LBR_FORMAT_32 = 0x00, - LBR_FORMAT_LIP = 0x01, - LBR_FORMAT_EIP = 0x02, - LBR_FORMAT_EIP_FLAGS= 0x03, - LBR_FORMAT_EIP_FLAGS2 = 0x04, - LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_TIME = 0x06, - LBR_FORMAT_MAX_KNOWN= LBR_FORMAT_TIME, -}; - static const enum { LBR_EIP_FLAGS = 1, LBR_TSX = 2, @@ -194,7 +183,7 @@ static void __intel_pmu_lbr_disable(void) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } -static void intel_pmu_lbr_reset_32(void) +void intel_pmu_lbr_reset_32(void) { int i; @@ -202,7 +191,7 @@ static void intel_pmu_lbr_reset_32(void) wrmsrl(x86_pmu.lbr_from + i, 0); } -static void intel_pmu_lbr_reset_64(void) +void intel_pmu_lbr_reset_64(void) { int i; @@ -221,10 +210,7 @@ void intel_pmu_lbr_reset(void) if (!x86_pmu.lbr_nr) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_reset_32(); - else - intel_pmu_lbr_reset_64(); + x86_pmu.lbr_reset(); cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 8147596..5c1ad43 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -180,6 +180,17 @@ struct x86_perf_task_context; #define MAX_LBR_ENTRIES32 enum { + LBR_FORMAT_32 = 0x00, + LBR_FORMAT_LIP = 0x01, + LBR_FORMAT_EIP = 0x02, + LBR_FORMAT_EIP_FLAGS= 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_INFO = 0x05, + LBR_FORMAT_TIME = 0x06, + LBR_FORMAT_MAX_KNOWN= LBR_FORMAT_TIME, +}; + +enum { X86_PERF_KFREE_SHARED = 0, X86_PERF_KFREE_EXCL = 1, X86_PERF_KFREE_MAX @@ -682,6 +693,8 @@ struct x86_pmu { boollbr_double_abort; /* duplicated lbr aborts */ boollbr_pt_coexist;/* (LBR|BTS) may coexist with PT */ + void(*lbr_reset)(void); + /* * Intel 
PT/LBR/BTS are exclusive */ @@ -1058,6 +1071,10 @@ u64 lbr_from_signext_quirk_wr(u64 val); void intel_pmu_lbr_reset(void); +void intel_pmu_lbr_reset_32(void); + +void intel_pmu_lbr_reset_64(v
[tip: perf/core] perf/x86/intel/lbr: Factor out rdlbr_all() and wrlbr_all()
The following commit has been merged into the perf/core branch of tip: Commit-ID: fda1f99f34a8f0975086bcfef34da865009995c1 Gitweb: https://git.kernel.org/tip/fda1f99f34a8f0975086bcfef34da865009995c1 Author: Kan Liang AuthorDate: Fri, 03 Jul 2020 05:49:18 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:54 +02:00 perf/x86/intel/lbr: Factor out rdlbr_all() and wrlbr_all() The previous model-specific LBR and Architecture LBR (legacy way) use a similar method to save/restore the LBR information, which directly accesses the LBR registers. The code which reads/writes a set of LBR registers can be shared between them. Factor out two functions which are used to read/write a set of LBR registers. Add lbr_info into structure x86_pmu, and use it to replace the hardcoded LBR INFO MSR, because the LBR INFO MSR address of the previous model-specific LBR is different from that of Architecture LBR. The MSR address should be assigned at boot time. For now, only Skylake and later platforms have the LBR INFO MSR.
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-13-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/lbr.c | 66 ++- arch/x86/events/perf_event.h | 2 +- 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 21f4f07..d3d129c 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -237,7 +237,7 @@ void intel_pmu_lbr_reset_64(void) wrmsrl(x86_pmu.lbr_from + i, 0); wrmsrl(x86_pmu.lbr_to + i, 0); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + i, 0); + wrmsrl(x86_pmu.lbr_info + i, 0); } } @@ -343,6 +343,11 @@ static __always_inline void wrlbr_to(unsigned int idx, u64 val) wrmsrl(x86_pmu.lbr_to + idx, val); } +static __always_inline void wrlbr_info(unsigned int idx, u64 val) +{ + wrmsrl(x86_pmu.lbr_info + idx, val); +} + static __always_inline u64 rdlbr_from(unsigned int idx) { u64 val; @@ -361,8 +366,44 @@ static __always_inline u64 rdlbr_to(unsigned int idx) return val; } +static __always_inline u64 rdlbr_info(unsigned int idx) +{ + u64 val; + + rdmsrl(x86_pmu.lbr_info + idx, val); + + return val; +} + +static inline void +wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + wrlbr_from(idx, lbr->from); + wrlbr_to(idx, lbr->to); + if (need_info) + wrlbr_info(idx, lbr->info); +} + +static inline bool +rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + u64 from = rdlbr_from(idx); + + /* Don't read invalid entry */ + if (!from) + return false; + + lbr->from = from; + lbr->to = rdlbr_to(idx); + if (need_info) + lbr->info = rdlbr_info(idx); + + return true; +} + void intel_pmu_lbr_restore(void *ctx) { + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx = ctx; int i; @@ -372,11 +413,7 @@ void intel_pmu_lbr_restore(void 
*ctx) mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; - wrlbr_from(lbr_idx, task_ctx->lbr[i].from); - wrlbr_to(lbr_idx, task_ctx->lbr[i].to); - - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); + wrlbr_all(&task_ctx->lbr[i], lbr_idx, need_info); } for (; i < x86_pmu.lbr_nr; i++) { @@ -384,7 +421,7 @@ void intel_pmu_lbr_restore(void *ctx) wrlbr_from(lbr_idx, 0); wrlbr_to(lbr_idx, 0); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0); + wrlbr_info(lbr_idx, 0); } wrmsrl(x86_pmu.lbr_tos, tos); @@ -427,23 +464,19 @@ static void __intel_pmu_lbr_restore(void *ctx) void intel_pmu_lbr_save(void *ctx) { + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx = ctx; unsigned lbr_idx, mask; - u64 tos, from; + u64 tos; int i; mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { lbr_idx = (tos - i) & mask; - from = rdlbr_from(lbr_idx); - if (!from) + if (!rdlbr_all(&task_ctx->lbr[i], lbr_idx, need_info))
[tip: perf/core] perf/x86/intel/lbr: Add a function pointer for LBR read
The following commit has been merged into the perf/core branch of tip: Commit-ID: c301b1d80ed5b806834fe0f739f028f65fb4fb16 Gitweb: https://git.kernel.org/tip/c301b1d80ed5b806834fe0f739f028f65fb4fb16 Author:Kan Liang AuthorDate:Fri, 03 Jul 2020 05:49:09 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:51 +02:00 perf/x86/intel/lbr: Add a function pointer for LBR read The method to read Architectural LBRs is different from previous model-specific LBR. Perf has to implement a different function. A function pointer for LBR read is introduced. Perf should initialize the corresponding function at boot time, and avoid checking lbr_format at run time. The current 64-bit LBR read function is set as default. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-4-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/core.c | 6 +- arch/x86/events/intel/lbr.c | 9 +++-- arch/x86/events/perf_event.h | 5 + 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fe49e99..6414b47 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3980,6 +3980,7 @@ static __initconst const struct x86_pmu core_pmu = { .check_period = intel_pmu_check_period, .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, }; static __initconst const struct x86_pmu intel_pmu = { @@ -4027,6 +4028,7 @@ static __initconst const struct x86_pmu intel_pmu = { .aux_output_match = intel_pmu_aux_output_match, .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, }; static __init void intel_clovertown_quirk(void) @@ -4653,8 +4655,10 @@ __init int intel_pmu_init(void) x86_pmu.intel_cap.capabilities = capabilities; } - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) { x86_pmu.lbr_reset = intel_pmu_lbr_reset_32; + x86_pmu.lbr_read = 
intel_pmu_lbr_read_32; + } intel_ds_init(); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 7af27a7..b8943f4 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -562,7 +562,7 @@ void intel_pmu_lbr_disable_all(void) __intel_pmu_lbr_disable(); } -static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; u64 tos = intel_pmu_lbr_tos(); @@ -599,7 +599,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) * is the same as the linear address, allowing us to merge the LIP and EIP * LBR formats. */ -static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; @@ -704,10 +704,7 @@ void intel_pmu_lbr_read(void) cpuc->lbr_users == cpuc->lbr_pebs_users) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_read_32(cpuc); - else - intel_pmu_lbr_read_64(cpuc); + x86_pmu.lbr_read(cpuc); intel_pmu_lbr_filter(cpuc); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 5c1ad43..312d27f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -694,6 +694,7 @@ struct x86_pmu { bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ void (*lbr_reset)(void); + void (*lbr_read)(struct cpu_hw_events *cpuc); /* * Intel PT/LBR/BTS are exclusive @@ -1085,6 +1086,10 @@ void intel_pmu_lbr_disable_all(void); void intel_pmu_lbr_read(void); +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc); + +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc); + void intel_pmu_lbr_init_core(void); void intel_pmu_lbr_init_nhm(void);
[tip: perf/core] x86/cpufeatures: Add Architectural LBRs feature bit
The following commit has been merged into the perf/core branch of tip: Commit-ID: bd657aa3dd8514e62486ce7f90b5e484c18d684d Gitweb: https://git.kernel.org/tip/bd657aa3dd8514e62486ce7f90b5e484c18d684d Author: Kan Liang AuthorDate: Fri, 03 Jul 2020 05:49:07 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:51 +02:00 x86/cpufeatures: Add Architectural LBRs feature bit CPUID.(EAX=07H, ECX=0):EDX[19] indicates whether an Intel CPU supports Architectural LBRs. The "X86_FEATURE_..., word 18" is already mirrored from CPUID "0x0007:0 (EDX)". Add X86_FEATURE_ARCH_LBR under the "word 18" section. The feature will appear as "arch_lbr" in /proc/cpuinfo. The Architectural Last Branch Records (LBR) feature enables recording of software path history by logging taken branches and other control flows. The feature will be supported in the perf_events subsystem. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-2-git-send-email-kan.li...@linux.intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 02dabc9..72ba4c5 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -366,6 +366,7 @@ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ +#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
[tip: perf/core] x86/msr-index: Add bunch of MSRs for Arch LBR
The following commit has been merged into the perf/core branch of tip: Commit-ID: d6a162a41bfd2ff9ea4cbb338d3df6a3f9b7e89f Gitweb: https://git.kernel.org/tip/d6a162a41bfd2ff9ea4cbb338d3df6a3f9b7e89f Author: Kan Liang AuthorDate: Fri, 03 Jul 2020 05:49:13 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:52 +02:00 x86/msr-index: Add bunch of MSRs for Arch LBR Add Arch LBR related MSRs and the new LBR INFO bits in MSR-index. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-8-git-send-email-kan.li...@linux.intel.com --- arch/x86/include/asm/msr-index.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e8370e6..bdc07fc 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -158,7 +158,23 @@ #define LBR_INFO_MISPRED BIT_ULL(63) #define LBR_INFO_IN_TX BIT_ULL(62) #define LBR_INFO_ABORT BIT_ULL(61) +#define LBR_INFO_CYC_CNT_VALID BIT_ULL(60) #define LBR_INFO_CYCLES 0xffff +#define LBR_INFO_BR_TYPE_OFFSET 56 +#define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET) + +#define MSR_ARCH_LBR_CTL 0x14ce +#define ARCH_LBR_CTL_LBREN BIT(0) +#define ARCH_LBR_CTL_CPL_OFFSET 1 +#define ARCH_LBR_CTL_CPL (0x3ull << ARCH_LBR_CTL_CPL_OFFSET) +#define ARCH_LBR_CTL_STACK_OFFSET 3 +#define ARCH_LBR_CTL_STACK (0x1ull << ARCH_LBR_CTL_STACK_OFFSET) +#define ARCH_LBR_CTL_FILTER_OFFSET 16 +#define ARCH_LBR_CTL_FILTER (0x7full << ARCH_LBR_CTL_FILTER_OFFSET) +#define MSR_ARCH_LBR_DEPTH 0x14cf +#define MSR_ARCH_LBR_FROM_0 0x1500 +#define MSR_ARCH_LBR_TO_0 0x1600 +#define MSR_ARCH_LBR_INFO_0 0x1200 #define MSR_IA32_PEBS_ENABLE 0x03f1 #define MSR_PEBS_DATA_CFG 0x03f2
[tip: perf/core] perf/x86/intel/lbr: Unify the stored format of LBR information
The following commit has been merged into the perf/core branch of tip: Commit-ID: 5624986dc61b81a77fb6136bc232593483d1c254 Gitweb: https://git.kernel.org/tip/5624986dc61b81a77fb6136bc232593483d1c254 Author:Kan Liang AuthorDate:Fri, 03 Jul 2020 05:49:16 -07:00 Committer: Peter Zijlstra CommitterDate: Wed, 08 Jul 2020 11:38:53 +02:00 perf/x86/intel/lbr: Unify the stored format of LBR information Current LBR information in the structure x86_perf_task_context is stored in a different format from the PEBS LBR record and Architecture LBR, which prevents the sharing of the common codes. Use the format of the PEBS LBR record as a unified format. Use a generic name lbr_entry to replace pebs_lbr_entry. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-11-git-send-email-kan.li...@linux.intel.com --- arch/x86/events/intel/ds.c| 6 +++--- arch/x86/events/intel/lbr.c | 20 ++-- arch/x86/events/perf_event.h | 6 ++ arch/x86/include/asm/perf_event.h | 6 +- 4 files changed, 16 insertions(+), 22 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index dc43cc1..86848c5 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -954,7 +954,7 @@ static void adaptive_pebs_record_size_update(void) if (pebs_data_cfg & PEBS_DATACFG_XMMS) sz += sizeof(struct pebs_xmm); if (pebs_data_cfg & PEBS_DATACFG_LBRS) - sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry); + sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry); cpuc->pebs_record_size = sz; } @@ -1595,10 +1595,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_LBRS) { - struct pebs_lbr *lbr = next_record; + struct lbr_entry *lbr = next_record; int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) & 0xff) + 1; - next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry); + next_record = next_record + num_lbr * sizeof(struct lbr_entry); if (has_branch_stack(event)) { 
intel_pmu_store_pebs_lbrs(lbr); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 7742562..b8baaf1 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -372,11 +372,11 @@ void intel_pmu_lbr_restore(void *ctx) mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; - wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); - wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); + wrlbr_from(lbr_idx, task_ctx->lbr[i].from); + wrlbr_to(lbr_idx, task_ctx->lbr[i].to); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); } for (; i < x86_pmu.lbr_nr; i++) { @@ -440,10 +440,10 @@ void intel_pmu_lbr_save(void *ctx) from = rdlbr_from(lbr_idx); if (!from) break; - task_ctx->lbr_from[i] = from; - task_ctx->lbr_to[i] = rdlbr_to(lbr_idx); + task_ctx->lbr[i].from = from; + task_ctx->lbr[i].to = rdlbr_to(lbr_idx); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); } task_ctx->valid_lbrs = i; task_ctx->tos = tos; @@ -1179,7 +1179,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) } } -void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int i; @@ -1193,11 +1193,11 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { - u64 info = lbr->lbr[i].info; + u64 info = lbr[i].info; struct perf_branch_entry *e = &cpuc->lbr_entries[i]; - e->from = lbr->lbr[i].from; - e->to = lbr->lbr[i].to; + e->from = lbr[i].from; + e->to = lbr[i].to; e->mispred = !!(info & LBR_INFO_MISPRED); e->predicted= !(info & LBR_INFO_MISPRED); e->in_tx= !!(info & LBR_INFO_IN_TX); diff --git 
a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ba