[tip: perf/core] perf/x86: Hybrid PMU support for counters

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: d4b294bf84db7a84e295ddf19cb8e7f71b7bd045
Gitweb:
https://git.kernel.org/tip/d4b294bf84db7a84e295ddf19cb8e7f71b7bd045
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:46 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:25 +02:00

perf/x86: Hybrid PMU support for counters

The numbers of GP and fixed counters differ among hybrid PMUs.
Each hybrid PMU should use its own counter-related information.

When handling a certain hybrid PMU, apply the number of counters from
the corresponding hybrid PMU.

When reserving the counters in the initialization of a new event,
reserve all possible counters.

The number of counters recorded in the global x86_pmu is the number of
architectural counters that are available on all hybrid PMUs. KVM
doesn't support the hybrid PMU yet, so return the number of
architectural counters to it for now.

For the functions that are only available on older platforms, e.g.,
intel_pmu_drain_pebs_nhm(), nothing is changed.
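
As an illustration of the reservation rule above, here is a minimal
user-space sketch (not kernel code) of the idea behind
get_possible_num_counters() in the diff below: take the largest
GP-counter count across the hybrid PMUs so that reservation covers
every counter any of them might use. The struct names and counter
values are invented for the example.

/*
 * Illustrative user-space sketch, not kernel code: reserve for the
 * largest number of GP counters any hybrid PMU might use.
 */
#include <stdio.h>

struct hybrid_pmu_info {
	const char *name;
	int num_counters;		/* GP counters of this hybrid PMU */
};

static int max_possible_counters(const struct hybrid_pmu_info *pmus, int n)
{
	int i, max = 0;

	for (i = 0; i < n; i++) {
		if (pmus[i].num_counters > max)
			max = pmus[i].num_counters;
	}
	return max;
}

int main(void)
{
	/* Example counts only; real values come from CPUID per core type. */
	const struct hybrid_pmu_info pmus[] = {
		{ "cpu_core", 8 },
		{ "cpu_atom", 6 },
	};

	printf("reserve %d counters\n", max_possible_counters(pmus, 2));
	return 0;
}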

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-7-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   | 55 ---
 arch/x86/events/intel/core.c |  8 +++--
 arch/x86/events/intel/ds.c   | 14 +
 arch/x86/events/perf_event.h |  4 +++-
 4 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 7d3c19e..1aeb31c 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -185,16 +185,29 @@ static DEFINE_MUTEX(pmc_reserve_mutex);
 
 #ifdef CONFIG_X86_LOCAL_APIC
 
+static inline int get_possible_num_counters(void)
+{
+   int i, num_counters = x86_pmu.num_counters;
+
+   if (!is_hybrid())
+   return num_counters;
+
+   for (i = 0; i < x86_pmu.num_hybrid_pmus; i++)
+   num_counters = max_t(int, num_counters, 
x86_pmu.hybrid_pmu[i].num_counters);
+
+   return num_counters;
+}
+
 static bool reserve_pmc_hardware(void)
 {
-   int i;
+   int i, num_counters = get_possible_num_counters();
 
-   for (i = 0; i < x86_pmu.num_counters; i++) {
+   for (i = 0; i < num_counters; i++) {
if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
goto perfctr_fail;
}
 
-   for (i = 0; i < x86_pmu.num_counters; i++) {
+   for (i = 0; i < num_counters; i++) {
if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
goto eventsel_fail;
}
@@ -205,7 +218,7 @@ eventsel_fail:
for (i--; i >= 0; i--)
release_evntsel_nmi(x86_pmu_config_addr(i));
 
-   i = x86_pmu.num_counters;
+   i = num_counters;
 
 perfctr_fail:
for (i--; i >= 0; i--)
@@ -216,9 +229,9 @@ perfctr_fail:
 
 static void release_pmc_hardware(void)
 {
-   int i;
+   int i, num_counters = get_possible_num_counters();
 
-   for (i = 0; i < x86_pmu.num_counters; i++) {
+   for (i = 0; i < num_counters; i++) {
release_perfctr_nmi(x86_pmu_event_addr(i));
release_evntsel_nmi(x86_pmu_config_addr(i));
}
@@ -946,6 +959,7 @@ EXPORT_SYMBOL_GPL(perf_assign_events);
 
 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 {
+   int num_counters = hybrid(cpuc->pmu, num_counters);
struct event_constraint *c;
struct perf_event *e;
int n0, i, wmin, wmax, unsched = 0;
@@ -1021,7 +1035,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int 
n, int *assign)
 
/* slow path */
if (i != n) {
-   int gpmax = x86_pmu.num_counters;
+   int gpmax = num_counters;
 
/*
 * Do not allow scheduling of more than half the available
@@ -1042,7 +1056,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int 
n, int *assign)
 * the extra Merge events needed by large increment events.
 */
if (x86_pmu.flags & PMU_FL_PAIR) {
-   gpmax = x86_pmu.num_counters - cpuc->n_pair;
+   gpmax = num_counters - cpuc->n_pair;
WARN_ON(gpmax <= 0);
}
 
@@ -1129,10 +1143,12 @@ static int collect_event(struct cpu_hw_events *cpuc, 
struct perf_event *event,
  */
 static int collect_events(struct cpu_hw_events *cpuc, struct perf_event 
*leader, bool dogrp)
 {
+   int num_counters = hybrid(cpuc->pmu, num_counters);
+   int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
struct perf_event *event;
int n, max_count;
 
-   max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
+   max_count = num_counters + num_counters_fixed;
 
/* current number of events already accepted */
n = cpuc->n_events;
@@ -1499,18 +1515,18

[tip: perf/core] perf/x86: Track pmu in per-CPU cpu_hw_events

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 61e76d53c39bb768ad264d379837cfc56b9e35b4
Gitweb:
https://git.kernel.org/tip/61e76d53c39bb768ad264d379837cfc56b9e35b4
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:43 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:24 +02:00

perf/x86: Track pmu in per-CPU cpu_hw_events

Some platforms, e.g. Alder Lake, have a hybrid architecture. In the same
package, there may be more than one type of CPU. The PMU capabilities
are different among different types of CPU. Perf will register a
dedicated PMU for each type of CPU.

Add a 'pmu' variable in the struct cpu_hw_events to track the dedicated
PMU of the current CPU.

The current x86_get_pmu() uses the global 'pmu', which will be broken
on a hybrid platform. Modify it to use the 'pmu' of the specific CPU.

Initialize the per-CPU 'pmu' variable with the global 'pmu'. Nothing
is changed for the non-hybrid platforms.

The is_x86_event() will be updated in the later patch ("perf/x86:
Register hybrid PMUs") for hybrid platforms. For the non-hybrid
platforms, nothing is changed here.
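
A minimal user-space sketch of the same idea, mirroring the shape of
x86_get_pmu() in the diff below: each CPU tracks a pointer to its
dedicated PMU, and an accessor falls back to the global PMU (with a
warning) when the pointer was never set. An array stands in for the
per-CPU data and all names are illustrative.

/*
 * Illustrative user-space sketch: per-CPU PMU tracking with a global
 * fallback.
 */
#include <stdio.h>

#define NR_CPUS 4

struct pmu { const char *name; };

static struct pmu global_pmu = { "cpu" };
static struct pmu *cpu_pmu[NR_CPUS];	/* dedicated PMU pointer per CPU */

static struct pmu *get_pmu(int cpu)
{
	/* Fall back to the global PMU if nothing was ever tracked. */
	if (!cpu_pmu[cpu]) {
		fprintf(stderr, "warning: no PMU tracked for CPU %d\n", cpu);
		return &global_pmu;
	}
	return cpu_pmu[cpu];
}

int main(void)
{
	struct pmu core_pmu = { "cpu_core" };

	cpu_pmu[0] = &core_pmu;			/* CPU 0 belongs to the core PMU */
	printf("CPU0 -> %s\n", get_pmu(0)->name);
	printf("CPU1 -> %s\n", get_pmu(1)->name);	/* falls back */
	return 0;
}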

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1618237865-33448-4-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   | 17 +
 arch/x86/events/intel/core.c |  2 +-
 arch/x86/events/intel/ds.c   |  4 ++--
 arch/x86/events/intel/lbr.c  |  9 +
 arch/x86/events/perf_event.h |  4 +++-
 5 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index dd9f3c2..a49a8bd 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -45,9 +45,11 @@
 #include "perf_event.h"
 
 struct x86_pmu x86_pmu __read_mostly;
+static struct pmu pmu;
 
 DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
+   .pmu = &pmu,
 };
 
 DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
@@ -724,16 +726,23 @@ void x86_pmu_enable_all(int added)
}
 }
 
-static struct pmu pmu;
-
 static inline int is_x86_event(struct perf_event *event)
 {
return event->pmu == &pmu;
 }
 
-struct pmu *x86_get_pmu(void)
+struct pmu *x86_get_pmu(unsigned int cpu)
 {
-   return &pmu;
+   struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+   /*
+* All CPUs of the hybrid type have been offline.
+* The x86_get_pmu() should not be invoked.
+*/
+   if (WARN_ON_ONCE(!cpuc->pmu))
+   return &pmu;
+
+   return cpuc->pmu;
 }
 /*
  * Event scheduler state:
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 7bbb5bb..f116c63 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4876,7 +4876,7 @@ static void update_tfa_sched(void *ignored)
 * and if so force schedule out for all event types all contexts
 */
if (test_bit(3, cpuc->active_mask))
-   perf_pmu_resched(x86_get_pmu());
+   perf_pmu_resched(x86_get_pmu(smp_processor_id()));
 }
 
 static ssize_t show_sysctl_tfa(struct device *cdev,
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 7ebae18..1bfea8c 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2192,7 +2192,7 @@ void __init intel_ds_init(void)
PERF_SAMPLE_TIME;
x86_pmu.flags |= PMU_FL_PEBS_ALL;
pebs_qual = "-baseline";
-   x86_get_pmu()->capabilities |= 
PERF_PMU_CAP_EXTENDED_REGS;
+   x86_get_pmu(smp_processor_id())->capabilities 
|= PERF_PMU_CAP_EXTENDED_REGS;
} else {
/* Only basic record supported */
x86_pmu.large_pebs_flags &=
@@ -2207,7 +2207,7 @@ void __init intel_ds_init(void)
 
if (x86_pmu.intel_cap.pebs_output_pt_available) {
pr_cont("PEBS-via-PT, ");
-   x86_get_pmu()->capabilities |= 
PERF_PMU_CAP_AUX_OUTPUT;
+   x86_get_pmu(smp_processor_id())->capabilities 
|= PERF_PMU_CAP_AUX_OUTPUT;
}
 
break;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 21890da..bb4486c 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -705,7 +705,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
 
 void release_lbr_buffers(void)
 {
-   struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache;
+   struct kmem_cache *kmem_cache;
struct cpu_hw_events *cpuc;
int cpu;
 
@@ -714,6 +714,7 @@ void release_lbr_buffers(void)
 
for_each_possible_cpu(cpu) {
cpuc = per_cpu_pt

[tip: perf/core] perf/x86/intel: Hybrid PMU support for perf capabilities

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: d0946a882e6220229a29f9031641e54379be5a1e
Gitweb:
https://git.kernel.org/tip/d0946a882e6220229a29f9031641e54379be5a1e
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:44 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:24 +02:00

perf/x86/intel: Hybrid PMU support for perf capabilities

Some platforms, e.g. Alder Lake, have a hybrid architecture. Although most
PMU capabilities are the same, there are still some unique PMU
capabilities for different hybrid PMUs. Perf should register a dedicated
pmu for each hybrid PMU.

Add a new struct x86_hybrid_pmu, which saves the dedicated pmu and
capabilities for each hybrid PMU.

The architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicates the
architecture features which are available on all hybrid PMUs. The
architecture features are stored in the global x86_pmu.intel_cap.

For Alder Lake, the model-specific features are perf metrics and
PEBS-via-PT. The corresponding bits of the global x86_pmu.intel_cap
should be 0 for these two features. Perf should not use the global
intel_cap to check the features on a hybrid system.
Add a dedicated intel_cap in the x86_hybrid_pmu to store the
model-specific capabilities. Use the dedicated intel_cap to replace
the global intel_cap for these two features. The dedicated intel_cap
will be set in the following "Add Alder Lake Hybrid support" patch.

Add is_hybrid() to distinguish a hybrid system. ADL may have an
alternative configuration. With that configuration, the
X86_FEATURE_HYBRID_CPU is not set. Perf cannot rely on the feature bit.
Add a new static_key_false, perf_is_hybrid, to indicate a hybrid system.
It will be assigned in the following "Add Alder Lake Hybrid support"
patch as well.
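
To make the split between architectural and model-specific
capabilities concrete, here is a small user-space sketch: each hybrid
PMU keeps its own capability word and callers test bits there rather
than in a single global word. The bit indices and struct names are
examples only, not the MSR_IA32_PERF_CAPABILITIES layout.

/*
 * Illustrative user-space sketch: keep model-specific capability bits
 * per hybrid PMU and test them there, not in one global word.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CAP_PERF_METRICS_BIT	15	/* example index */
#define CAP_PEBS_VIA_PT_BIT	16	/* example index */

struct hybrid_pmu_caps {
	const char *name;
	uint64_t intel_cap;		/* per-PMU capability bits */
};

static bool pmu_has_cap(const struct hybrid_pmu_caps *pmu, int bit)
{
	return (pmu->intel_cap >> bit) & 1;
}

int main(void)
{
	struct hybrid_pmu_caps core = { "cpu_core", 1ULL << CAP_PERF_METRICS_BIT };
	struct hybrid_pmu_caps atom = { "cpu_atom", 1ULL << CAP_PEBS_VIA_PT_BIT };

	printf("%s perf-metrics: %d\n", core.name,
	       pmu_has_cap(&core, CAP_PERF_METRICS_BIT));
	printf("%s perf-metrics: %d\n", atom.name,
	       pmu_has_cap(&atom, CAP_PERF_METRICS_BIT));
	return 0;
}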

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1618237865-33448-5-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   |  7 +--
 arch/x86/events/intel/core.c | 22 +
 arch/x86/events/intel/ds.c   |  2 +-
 arch/x86/events/perf_event.h | 33 +++-
 arch/x86/include/asm/msr-index.h |  3 +++-
 5 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index a49a8bd..7fc2001 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -54,6 +54,7 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
 
 DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
 DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
+DEFINE_STATIC_KEY_FALSE(perf_is_hybrid);
 
 /*
  * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
@@ -1105,8 +1106,9 @@ static void del_nr_metric_event(struct cpu_hw_events 
*cpuc,
 static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
 int max_count, int n)
 {
+   union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
 
-   if (x86_pmu.intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
+   if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
return -EINVAL;
 
if (n >= max_count + cpuc->n_metric)
@@ -1581,6 +1583,7 @@ void x86_pmu_stop(struct perf_event *event, int flags)
 static void x86_pmu_del(struct perf_event *event, int flags)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+   union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
int i;
 
/*
@@ -1620,7 +1623,7 @@ static void x86_pmu_del(struct perf_event *event, int 
flags)
}
cpuc->event_constraint[i-1] = NULL;
--cpuc->n_events;
-   if (x86_pmu.intel_cap.perf_metrics)
+   if (intel_cap.perf_metrics)
del_nr_metric_event(cpuc, event);
 
perf_event_update_userpage(event);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index f116c63..dc9e2fb 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3646,6 +3646,12 @@ static inline bool is_mem_loads_aux_event(struct 
perf_event *event)
return (event->attr.config & INTEL_ARCH_EVENT_MASK) == 
X86_CONFIG(.event=0x03, .umask=0x82);
 }
 
+static inline bool intel_pmu_has_cap(struct perf_event *event, int idx)
+{
+   union perf_capabilities *intel_cap = &hybrid(event->pmu, intel_cap);
+
+   return test_bit(idx, (unsigned long *)&intel_cap->capabilities);
+}
 
 static int intel_pmu_hw_config(struct perf_event *event)
 {
@@ -3712,7 +3718,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 * with a slots event as group leader. When the slots event
 * is used in a metrics group, it too cannot support sampling.
 */
-   if (x86_pmu.intel_cap.perf_metrics && is_topdown_event(event)) {
+   if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX

[tip: perf/core] perf/x86: Hybrid PMU support for intel_ctrl

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: fc4b8fca2d8fc8aecd58508e81d55afe4ed76344
Gitweb:
https://git.kernel.org/tip/fc4b8fca2d8fc8aecd58508e81d55afe4ed76344
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:45 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:24 +02:00

perf/x86: Hybrid PMU support for intel_ctrl

The intel_ctrl is the counter mask of a PMU. The PMU counter
information may differ among hybrid PMUs, so each hybrid PMU should
use its own intel_ctrl to check and access the counters.

When handling a certain hybrid PMU, apply the intel_ctrl from the
corresponding hybrid PMU.

When checking for HW existence, apply the PMU and number of counters
from the corresponding hybrid PMU as well. Perf will check HW
existence for each hybrid PMU before registration. Expose
check_hw_exists() for a later patch.
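
A short user-space sketch of how a per-PMU intel_ctrl mask is
composed, assuming the usual layout of one bit per GP counter plus the
fixed-counter bits starting at bit 32 (INTEL_PMC_IDX_FIXED); the
counter counts are illustrative.

/*
 * Illustrative user-space sketch: build a per-PMU enable mask in the
 * shape of intel_ctrl.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define FIXED_IDX_BASE	32		/* INTEL_PMC_IDX_FIXED */

static uint64_t make_intel_ctrl(int num_counters, uint64_t fixed_mask)
{
	uint64_t ctrl = (1ULL << num_counters) - 1;	/* GP counters 0..n-1 */

	ctrl |= fixed_mask << FIXED_IDX_BASE;		/* fixed counters */
	return ctrl;
}

int main(void)
{
	/* e.g. 8 GP counters and 4 fixed counters enabled */
	printf("intel_ctrl = %#" PRIx64 "\n", make_intel_ctrl(8, 0xf));
	return 0;
}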

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-6-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   | 14 +++---
 arch/x86/events/intel/core.c | 14 +-
 arch/x86/events/perf_event.h | 10 --
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 7fc2001..7d3c19e 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -231,7 +231,7 @@ static void release_pmc_hardware(void) {}
 
 #endif
 
-static bool check_hw_exists(void)
+bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed)
 {
u64 val, val_fail = -1, val_new= ~0;
int i, reg, reg_fail = -1, ret = 0;
@@ -242,7 +242,7 @@ static bool check_hw_exists(void)
 * Check to see if the BIOS enabled any of the counters, if so
 * complain and bail.
 */
-   for (i = 0; i < x86_pmu.num_counters; i++) {
+   for (i = 0; i < num_counters; i++) {
reg = x86_pmu_config_addr(i);
ret = rdmsrl_safe(reg, &val);
if (ret)
@@ -256,13 +256,13 @@ static bool check_hw_exists(void)
}
}
 
-   if (x86_pmu.num_counters_fixed) {
+   if (num_counters_fixed) {
reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
ret = rdmsrl_safe(reg, &val);
if (ret)
goto msr_fail;
-   for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
-   if (fixed_counter_disabled(i))
+   for (i = 0; i < num_counters_fixed; i++) {
+   if (fixed_counter_disabled(i, pmu))
continue;
if (val & (0x03 << i*4)) {
bios_fail = 1;
@@ -1547,7 +1547,7 @@ void perf_event_print_debug(void)
cpu, idx, prev_left);
}
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
-   if (fixed_counter_disabled(idx))
+   if (fixed_counter_disabled(idx, cpuc->pmu))
continue;
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
 
@@ -1992,7 +1992,7 @@ static int __init init_hw_perf_events(void)
pmu_check_apic();
 
/* sanity check that the hardware exists or is emulated */
-   if (!check_hw_exists())
+   if (!check_hw_exists(&pmu, x86_pmu.num_counters, 
x86_pmu.num_counters_fixed))
return 0;
 
pr_cont("%s PMU driver.\n", x86_pmu.name);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index dc9e2fb..2d56055 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2153,10 +2153,11 @@ static void intel_pmu_disable_all(void)
 static void __intel_pmu_enable_all(int added, bool pmi)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+   u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
 
intel_pmu_lbr_enable_all(pmi);
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
-   x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
+  intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
 
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
struct perf_event *event =
@@ -2709,6 +2710,7 @@ int intel_pmu_save_and_restart(struct perf_event *event)
 static void intel_pmu_reset(void)
 {
struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+   struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
unsigned long flags;
int idx;
 
@@ -2724,7 +2726,7 @@ static void intel_pmu_reset(void)
wrmsrl_safe(x86_pmu_event_addr(idx),  0ull);
}
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
-   if (fixed_counter_disabled(idx))
+   if (fixed_counter_disabled(idx, cpuc->pmu))
continue;
wrmsrl_saf

[tip: perf/core] perf/x86: Hybrid PMU support for hardware cache event

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 0d18f2dfead8dd63bf1186c9ef38528d6a615a55
Gitweb:
https://git.kernel.org/tip/0d18f2dfead8dd63bf1186c9ef38528d6a615a55
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:48 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:25 +02:00

perf/x86: Hybrid PMU support for hardware cache event

The hardware cache events are different among hybrid PMUs. Each hybrid
PMU should have its own hw cache event table.
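
A small user-space sketch of the per-PMU table layout, with invented
event encodings: the same logical (cache type, op, result) triple
resolves to different raw encodings depending on which hybrid PMU owns
the event.

/*
 * Illustrative user-space sketch: each hybrid PMU carries its own
 * hardware cache event table; the lookup goes through the event's
 * PMU instead of one global table.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

enum { CACHE_L1D, CACHE_MAX };
enum { OP_READ, OP_MAX };
enum { RES_ACCESS, RES_MISS, RES_MAX };

struct hybrid_pmu_tbl {
	const char *name;
	uint64_t hw_cache_event_ids[CACHE_MAX][OP_MAX][RES_MAX];
};

int main(void)
{
	struct hybrid_pmu_tbl core = { .name = "cpu_core" };
	struct hybrid_pmu_tbl atom = { .name = "cpu_atom" };

	/* Same logical event, different raw encodings per core type. */
	core.hw_cache_event_ids[CACHE_L1D][OP_READ][RES_MISS] = 0x0151;
	atom.hw_cache_event_ids[CACHE_L1D][OP_READ][RES_MISS] = 0x0451;

	printf("%s L1D read miss = %#" PRIx64 "\n", core.name,
	       core.hw_cache_event_ids[CACHE_L1D][OP_READ][RES_MISS]);
	printf("%s L1D read miss = %#" PRIx64 "\n", atom.name,
	       atom.hw_cache_event_ids[CACHE_L1D][OP_READ][RES_MISS]);
	return 0;
}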

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1618237865-33448-9-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   |  5 ++---
 arch/x86/events/perf_event.h |  9 +
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 1aeb31c..e8cb892 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -376,8 +376,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct 
perf_event *event)
return -EINVAL;
cache_result = array_index_nospec(cache_result, 
PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-   val = hw_cache_event_ids[cache_type][cache_op][cache_result];
-
+   val = hybrid_var(event->pmu, 
hw_cache_event_ids)[cache_type][cache_op][cache_result];
if (val == 0)
return -ENOENT;
 
@@ -385,7 +384,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct 
perf_event *event)
return -EINVAL;
 
hwc->config |= val;
-   attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
+   attr->config1 = hybrid_var(event->pmu, 
hw_cache_extra_regs)[cache_type][cache_op][cache_result];
return x86_pmu_extra_regs(val, event);
 }
 
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 2688e45..b65cf46 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -639,6 +639,15 @@ struct x86_hybrid_pmu {
int num_counters;
int num_counters_fixed;
struct event_constraint unconstrained;
+
+   u64 hw_cache_event_ids
+   [PERF_COUNT_HW_CACHE_MAX]
+   [PERF_COUNT_HW_CACHE_OP_MAX]
+   [PERF_COUNT_HW_CACHE_RESULT_MAX];
+   u64 hw_cache_extra_regs
+   [PERF_COUNT_HW_CACHE_MAX]
+   [PERF_COUNT_HW_CACHE_OP_MAX]
+   [PERF_COUNT_HW_CACHE_RESULT_MAX];
 };
 
 static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu)


[tip: perf/core] perf/x86: Hybrid PMU support for unconstrained

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: eaacf07d1116f6bf3b93b265515fccf2301097f2
Gitweb:
https://git.kernel.org/tip/eaacf07d1116f6bf3b93b265515fccf2301097f2
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:47 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:25 +02:00

perf/x86: Hybrid PMU support for unconstrained

The unconstrained value depends on the number of GP and fixed
counters. Each hybrid PMU should use its own unconstrained event
constraint.
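
For illustration, a simplified user-space version of the
hybrid_var()-style accessor added in the diff below. It uses the same
GCC statement-expression pattern, but takes a struct x86_hybrid_pmu
pointer directly and a plain bool instead of the static key; the types
and values are stand-ins.

/* Illustrative user-space sketch of a hybrid_var()-style accessor. */
#include <stdbool.h>
#include <stdio.h>

struct event_constraint { int weight; };

struct x86_hybrid_pmu { struct event_constraint unconstrained; };

static bool perf_is_hybrid;	/* stand-in for the static key */
static struct event_constraint unconstrained = { .weight = 4 };

/*
 * Pick the per-PMU copy of a variable on hybrid systems, the global
 * copy otherwise.
 */
#define hybrid_var(_pmu, _var)				\
(*({							\
	typeof(&_var) __ptr = &_var;			\
							\
	if (perf_is_hybrid && (_pmu))			\
		__ptr = &(_pmu)->_var;			\
	__ptr;						\
}))

int main(void)
{
	struct x86_hybrid_pmu big = { .unconstrained = { .weight = 8 } };
	struct x86_hybrid_pmu *pmu = &big;

	printf("non-hybrid: weight %d\n", hybrid_var(pmu, unconstrained).weight);
	perf_is_hybrid = true;
	printf("hybrid:     weight %d\n", hybrid_var(pmu, unconstrained).weight);
	return 0;
}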

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1618237865-33448-8-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c |  2 +-
 arch/x86/events/perf_event.h | 11 +++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 3ea0126..4cfc382 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3147,7 +3147,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int 
idx,
}
}
 
-   return &unconstrained;
+   return &hybrid_var(cpuc->pmu, unconstrained);
 }
 
 static struct event_constraint *
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 0539ad4..2688e45 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -638,6 +638,7 @@ struct x86_hybrid_pmu {
int max_pebs_events;
int num_counters;
int num_counters_fixed;
+   struct event_constraint unconstrained;
 };
 
 static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu)
@@ -658,6 +659,16 @@ extern struct static_key_false perf_is_hybrid;
__Fp;   \
 }))
 
+#define hybrid_var(_pmu, _var) \
+(*({   \
+   typeof(&_var) __Fp = &_var; \
+   \
+   if (is_hybrid() && (_pmu))  \
+   __Fp = &hybrid_pmu(_pmu)->_var; \
+   \
+   __Fp;   \
+}))
+
 /*
  * struct x86_pmu - generic x86 pmu
  */


[tip: perf/core] perf/x86: Hybrid PMU support for event constraints

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 24ee38ffe61a68fc35065fcab1908883a34c866b
Gitweb:
https://git.kernel.org/tip/24ee38ffe61a68fc35065fcab1908883a34c866b
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:49 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:25 +02:00

perf/x86: Hybrid PMU support for event constraints

The events are different among hybrid PMUs. Each hybrid PMU should use
its own event constraints.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-10-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   | 3 ++-
 arch/x86/events/intel/core.c | 5 +++--
 arch/x86/events/intel/ds.c   | 5 +++--
 arch/x86/events/perf_event.h | 2 ++
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index e8cb892..f92d234 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1518,6 +1518,7 @@ void perf_event_print_debug(void)
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
int num_counters = hybrid(cpuc->pmu, num_counters);
int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
+   struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, 
pebs_constraints);
unsigned long flags;
int idx;
 
@@ -1537,7 +1538,7 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: status: %016llx\n", cpu, status);
pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed:  %016llx\n", cpu, fixed);
-   if (x86_pmu.pebs_constraints) {
+   if (pebs_constraints) {
rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
pr_info("CPU#%d: pebs:   %016llx\n", cpu, pebs);
}
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 4cfc382..447a80f 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3136,10 +3136,11 @@ struct event_constraint *
 x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
  struct perf_event *event)
 {
+   struct event_constraint *event_constraints = hybrid(cpuc->pmu, 
event_constraints);
struct event_constraint *c;
 
-   if (x86_pmu.event_constraints) {
-   for_each_event_constraint(c, x86_pmu.event_constraints) {
+   if (event_constraints) {
+   for_each_event_constraint(c, event_constraints) {
if (constraint_match(c, event->hw.config)) {
event->hw.flags |= c->flags;
return c;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 312bf3b..f1402bc 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -959,13 +959,14 @@ struct event_constraint 
intel_spr_pebs_event_constraints[] = {
 
 struct event_constraint *intel_pebs_constraints(struct perf_event *event)
 {
+   struct event_constraint *pebs_constraints = hybrid(event->pmu, 
pebs_constraints);
struct event_constraint *c;
 
if (!event->attr.precise_ip)
return NULL;
 
-   if (x86_pmu.pebs_constraints) {
-   for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+   if (pebs_constraints) {
+   for_each_event_constraint(c, pebs_constraints) {
if (constraint_match(c, event->hw.config)) {
event->hw.flags |= c->flags;
return c;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index b65cf46..34b7fc9 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -648,6 +648,8 @@ struct x86_hybrid_pmu {
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX];
+   struct event_constraint *event_constraints;
+   struct event_constraint *pebs_constraints;
 };
 
 static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu)


[tip: perf/core] perf/x86: Hybrid PMU support for extra_regs

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 183af7366b4e813ee4e0b995ff731e3ac28251f0
Gitweb:
https://git.kernel.org/tip/183af7366b4e813ee4e0b995ff731e3ac28251f0
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:50 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:26 +02:00

perf/x86: Hybrid PMU support for extra_regs

Different hybrid PMUs may have different extra registers, e.g., a Core
PMU may have offcore, frontend and ldlat registers, while an Atom core
may only have offcore and ldlat registers. Each hybrid PMU should use
its own extra_regs.

An Intel Hybrid system should always have extra registers.
Unconditionally allocate shared_regs for an Intel Hybrid system.
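
A user-space sketch of how a per-PMU extra_regs table is consumed,
assuming the kernel convention of a sentinel entry (msr == 0, like
EVENT_EXTRA_END) terminating the table; the MSR numbers, event codes
and masks here are illustrative only.

/*
 * Illustrative user-space sketch: walk a per-PMU extra-register table
 * to find the extra MSR matching an event's config.
 */
#include <stdint.h>
#include <stdio.h>

struct extra_reg {
	unsigned int msr;	/* 0 terminates the table */
	uint64_t event;		/* event code this register applies to */
	uint64_t config_mask;
};

static const struct extra_reg *find_extra_reg(const struct extra_reg *regs,
					      uint64_t config)
{
	const struct extra_reg *er;

	if (!regs)
		return NULL;	/* this PMU has no extra registers */

	for (er = regs; er->msr; er++) {
		if (er->event == (config & er->config_mask))
			return er;
	}
	return NULL;
}

int main(void)
{
	static const struct extra_reg atom_extra_regs[] = {
		{ .msr = 0x1a6, .event = 0x01b7, .config_mask = 0xffff },
		{ .msr = 0x1a7, .event = 0x02b7, .config_mask = 0xffff },
		{ 0 },
	};
	const struct extra_reg *er = find_extra_reg(atom_extra_regs, 0x02b7);

	printf("matched MSR: %#x\n", er ? er->msr : 0);
	return 0;
}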

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-11-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   |  5 +++--
 arch/x86/events/intel/core.c | 15 +--
 arch/x86/events/perf_event.h |  1 +
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index f92d234..57d3fe1 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -154,15 +154,16 @@ again:
  */
 static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 {
+   struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
struct hw_perf_event_extra *reg;
struct extra_reg *er;
 
reg = &event->hw.extra_reg;
 
-   if (!x86_pmu.extra_regs)
+   if (!extra_regs)
return 0;
 
-   for (er = x86_pmu.extra_regs; er->msr; er++) {
+   for (er = extra_regs; er->msr; er++) {
if (er->event != (config & er->config_mask))
continue;
if (event->attr.config1 & ~er->valid_mask)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 447a80f..f727aa5 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2966,8 +2966,10 @@ intel_vlbr_constraints(struct perf_event *event)
return NULL;
 }
 
-static int intel_alt_er(int idx, u64 config)
+static int intel_alt_er(struct cpu_hw_events *cpuc,
+   int idx, u64 config)
 {
+   struct extra_reg *extra_regs = hybrid(cpuc->pmu, extra_regs);
int alt_idx = idx;
 
if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
@@ -2979,7 +2981,7 @@ static int intel_alt_er(int idx, u64 config)
if (idx == EXTRA_REG_RSP_1)
alt_idx = EXTRA_REG_RSP_0;
 
-   if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
+   if (config & ~extra_regs[alt_idx].valid_mask)
return idx;
 
return alt_idx;
@@ -2987,15 +2989,16 @@ static int intel_alt_er(int idx, u64 config)
 
 static void intel_fixup_er(struct perf_event *event, int idx)
 {
+   struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
event->hw.extra_reg.idx = idx;
 
if (idx == EXTRA_REG_RSP_0) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
-   event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
+   event->hw.config |= extra_regs[EXTRA_REG_RSP_0].event;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
} else if (idx == EXTRA_REG_RSP_1) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
-   event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
+   event->hw.config |= extra_regs[EXTRA_REG_RSP_1].event;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
}
 }
@@ -3071,7 +3074,7 @@ again:
 */
c = NULL;
} else {
-   idx = intel_alt_er(idx, reg->config);
+   idx = intel_alt_er(cpuc, idx, reg->config);
if (idx != reg->idx) {
raw_spin_unlock_irqrestore(&era->lock, flags);
goto again;
@@ -4155,7 +4158,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int 
cpu)
 {
cpuc->pebs_record_size = x86_pmu.pebs_record_size;
 
-   if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+   if (is_hybrid() || x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
cpuc->shared_regs = allocate_shared_regs(cpu);
if (!cpuc->shared_regs)
goto err;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 34b7fc9..d8c448b 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -650,6 +650,7 @@ struct x86_hybrid_pmu {
[PERF_COUNT_HW_CACHE_RESULT_MAX];
struct event_constraint *event_constraints;
struct event_constraint *pebs_constraints;
+   struct extra_reg*extra_regs;
 };
 
 static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct p

[tip: perf/core] perf/x86/intel: Factor out intel_pmu_check_num_counters

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: b8c4d1a87610ba20da1abddb7aacbde0b2817c1a
Gitweb:
https://git.kernel.org/tip/b8c4d1a87610ba20da1abddb7aacbde0b2817c1a
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:51 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:26 +02:00

perf/x86/intel: Factor out intel_pmu_check_num_counters

Each Hybrid PMU has to check its own number of counters and mask fixed
counters before registration.

intel_pmu_check_num_counters() will be reused later to check the
number of counters for each hybrid PMU.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-12-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 38 ++-
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index f727aa5..d7e2021 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -5064,6 +5064,26 @@ static const struct attribute_group *attr_update[] = {
 
 static struct attribute *empty_attrs;
 
+static void intel_pmu_check_num_counters(int *num_counters,
+int *num_counters_fixed,
+u64 *intel_ctrl, u64 fixed_mask)
+{
+   if (*num_counters > INTEL_PMC_MAX_GENERIC) {
+   WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+*num_counters, INTEL_PMC_MAX_GENERIC);
+   *num_counters = INTEL_PMC_MAX_GENERIC;
+   }
+   *intel_ctrl = (1ULL << *num_counters) - 1;
+
+   if (*num_counters_fixed > INTEL_PMC_MAX_FIXED) {
+   WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+*num_counters_fixed, INTEL_PMC_MAX_FIXED);
+   *num_counters_fixed = INTEL_PMC_MAX_FIXED;
+   }
+
+   *intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED;
+}
+
 __init int intel_pmu_init(void)
 {
struct attribute **extra_skl_attr = &empty_attrs;
@@ -5703,20 +5723,10 @@ __init int intel_pmu_init(void)
 
x86_pmu.attr_update = attr_update;
 
-   if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
-   WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
-x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
-   x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
-   }
-   x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1;
-
-   if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
-   WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
-x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
-   x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
-   }
-
-   x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED;
+   intel_pmu_check_num_counters(&x86_pmu.num_counters,
+&x86_pmu.num_counters_fixed,
+&x86_pmu.intel_ctrl,
+(u64)fixed_mask);
 
/* AnyThread may be deprecated on arch perfmon v5 or later */
if (x86_pmu.intel_cap.anythread_deprecated)


[tip: perf/core] perf/x86/intel: Factor out intel_pmu_check_event_constraints

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: bc14fe1beeec1d80ee39f03019c10e130c8d376b
Gitweb:
https://git.kernel.org/tip/bc14fe1beeec1d80ee39f03019c10e130c8d376b
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:52 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:26 +02:00

perf/x86/intel: Factor out intel_pmu_check_event_constraints

Each Hybrid PMU has to check and update its own event constraints before
registration.

intel_pmu_check_event_constraints() will be reused later to check
the event constraints of each hybrid PMU.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-13-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 82 ---
 1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index d7e2021..5c5f330 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -5084,6 +5084,49 @@ static void intel_pmu_check_num_counters(int 
*num_counters,
*intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED;
 }
 
+static void intel_pmu_check_event_constraints(struct event_constraint 
*event_constraints,
+ int num_counters,
+ int num_counters_fixed,
+ u64 intel_ctrl)
+{
+   struct event_constraint *c;
+
+   if (!event_constraints)
+   return;
+
+   /*
+* event on fixed counter2 (REF_CYCLES) only works on this
+* counter, so do not extend mask to generic counters
+*/
+   for_each_event_constraint(c, event_constraints) {
+   /*
+* Don't extend the topdown slots and metrics
+* events to the generic counters.
+*/
+   if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
+   /*
+* Disable topdown slots and metrics events,
+* if slots event is not in CPUID.
+*/
+   if (!(INTEL_PMC_MSK_FIXED_SLOTS & intel_ctrl))
+   c->idxmsk64 = 0;
+   c->weight = hweight64(c->idxmsk64);
+   continue;
+   }
+
+   if (c->cmask == FIXED_EVENT_FLAGS) {
+   /* Disabled fixed counters which are not in CPUID */
+   c->idxmsk64 &= intel_ctrl;
+
+   if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
+   c->idxmsk64 |= (1ULL << num_counters) - 1;
+   }
+   c->idxmsk64 &=
+   ~(~0ULL << (INTEL_PMC_IDX_FIXED + num_counters_fixed));
+   c->weight = hweight64(c->idxmsk64);
+   }
+}
+
 __init int intel_pmu_init(void)
 {
struct attribute **extra_skl_attr = &empty_attrs;
@@ -5094,7 +5137,6 @@ __init int intel_pmu_init(void)
union cpuid10_edx edx;
union cpuid10_eax eax;
union cpuid10_ebx ebx;
-   struct event_constraint *c;
unsigned int fixed_mask;
struct extra_reg *er;
bool pmem = false;
@@ -5732,40 +5774,10 @@ __init int intel_pmu_init(void)
if (x86_pmu.intel_cap.anythread_deprecated)
x86_pmu.format_attrs = intel_arch_formats_attr;
 
-   if (x86_pmu.event_constraints) {
-   /*
-* event on fixed counter2 (REF_CYCLES) only works on this
-* counter, so do not extend mask to generic counters
-*/
-   for_each_event_constraint(c, x86_pmu.event_constraints) {
-   /*
-* Don't extend the topdown slots and metrics
-* events to the generic counters.
-*/
-   if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
-   /*
-* Disable topdown slots and metrics events,
-* if slots event is not in CPUID.
-*/
-   if (!(INTEL_PMC_MSK_FIXED_SLOTS & 
x86_pmu.intel_ctrl))
-   c->idxmsk64 = 0;
-   c->weight = hweight64(c->idxmsk64);
-   continue;
-   }
-
-   if (c->cmask == FIXED_EVENT_FLAGS) {
-   /* Disabled fixed counters which are not in 
CPUID */
-   c->idxmsk64 &= x86_pmu.intel_ctrl;
-
-   if (c->idxmsk64 != 
INTEL_PMC_MSK_FIXED_REF_CYCLES)
-   c->idxmsk64 |= (1ULL << 
x86_pmu.num_counters) - 1;
- 

[tip: perf/core] perf/x86/intel: Factor out intel_pmu_check_extra_regs

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 34d5b61f29eea656be4283213273c33d5987e4d2
Gitweb:
https://git.kernel.org/tip/34d5b61f29eea656be4283213273c33d5987e4d2
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:53 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:26 +02:00

perf/x86/intel: Factor out intel_pmu_check_extra_regs

Each Hybrid PMU has to check and update its own extra registers before
registration.

intel_pmu_check_extra_regs() will be reused later to check the extra
registers of each hybrid PMU.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-14-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 35 +--
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 5c5f330..55ccfbb 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -5127,6 +5127,26 @@ static void intel_pmu_check_event_constraints(struct 
event_constraint *event_con
}
 }
 
+static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs)
+{
+   struct extra_reg *er;
+
+   /*
+* Access extra MSR may cause #GP under certain circumstances.
+* E.g. KVM doesn't support offcore event
+* Check all extra_regs here.
+*/
+   if (!extra_regs)
+   return;
+
+   for (er = extra_regs; er->msr; er++) {
+   er->extra_msr_access = check_msr(er->msr, 0x11UL);
+   /* Disable LBR select mapping */
+   if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+   x86_pmu.lbr_sel_map = NULL;
+   }
+}
+
 __init int intel_pmu_init(void)
 {
struct attribute **extra_skl_attr = &empty_attrs;
@@ -5138,7 +5158,6 @@ __init int intel_pmu_init(void)
union cpuid10_eax eax;
union cpuid10_ebx ebx;
unsigned int fixed_mask;
-   struct extra_reg *er;
bool pmem = false;
int version, i;
char *name;
@@ -5795,19 +5814,7 @@ __init int intel_pmu_init(void)
if (x86_pmu.lbr_nr)
pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
 
-   /*
-* Access extra MSR may cause #GP under certain circumstances.
-* E.g. KVM doesn't support offcore event
-* Check all extra_regs here.
-*/
-   if (x86_pmu.extra_regs) {
-   for (er = x86_pmu.extra_regs; er->msr; er++) {
-   er->extra_msr_access = check_msr(er->msr, 0x11UL);
-   /* Disable LBR select mapping */
-   if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
-   x86_pmu.lbr_sel_map = NULL;
-   }
-   }
+   intel_pmu_check_extra_regs(x86_pmu.extra_regs);
 
/* Support full width counters using alternative MSR range */
if (x86_pmu.intel_cap.full_width_write) {


[tip: perf/core] perf/x86: Factor out x86_pmu_show_pmu_cap

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: e11c1a7eb302ac8f6f47c18fa662546405a5fd83
Gitweb:
https://git.kernel.org/tip/e11c1a7eb302ac8f6f47c18fa662546405a5fd83
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:55 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:27 +02:00

perf/x86: Factor out x86_pmu_show_pmu_cap

The PMU capabilities are different among hybrid PMUs. Perf should dump
the PMU capabilities information for each hybrid PMU.

Factor out x86_pmu_show_pmu_cap() which shows the PMU capabilities
information. The function will be reused later when registering a
dedicated hybrid PMU.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-16-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   | 25 -
 arch/x86/events/perf_event.h |  3 +++
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index ed8dcfb..2e7ae52 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1976,6 +1976,20 @@ static void _x86_pmu_read(struct perf_event *event)
x86_perf_event_update(event);
 }
 
+void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
+ u64 intel_ctrl)
+{
+   pr_info("... version:%d\n", x86_pmu.version);
+   pr_info("... bit width:  %d\n", x86_pmu.cntval_bits);
+   pr_info("... generic registers:  %d\n", num_counters);
+   pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
+   pr_info("... max period: %016Lx\n", x86_pmu.max_period);
+   pr_info("... fixed-purpose events:   %lu\n",
+   hweight641ULL << num_counters_fixed) - 1)
+   << INTEL_PMC_IDX_FIXED) & intel_ctrl));
+   pr_info("... event mask: %016Lx\n", intel_ctrl);
+}
+
 static int __init init_hw_perf_events(void)
 {
struct x86_pmu_quirk *quirk;
@@ -2036,15 +2050,8 @@ static int __init init_hw_perf_events(void)
 
pmu.attr_update = x86_pmu.attr_update;
 
-   pr_info("... version:%d\n", x86_pmu.version);
-   pr_info("... bit width:  %d\n", x86_pmu.cntval_bits);
-   pr_info("... generic registers:  %d\n", x86_pmu.num_counters);
-   pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
-   pr_info("... max period: %016Lx\n", x86_pmu.max_period);
-   pr_info("... fixed-purpose events:   %lu\n",
-   hweight641ULL << x86_pmu.num_counters_fixed) - 1)
-   << INTEL_PMC_IDX_FIXED) & 
x86_pmu.intel_ctrl));
-   pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
+   x86_pmu_show_pmu_cap(x86_pmu.num_counters, x86_pmu.num_counters_fixed,
+x86_pmu.intel_ctrl);
 
if (!x86_pmu.read)
x86_pmu.read = _x86_pmu_read;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index d8c448b..a3534e3 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1092,6 +1092,9 @@ void x86_pmu_enable_event(struct perf_event *event);
 
 int x86_pmu_handle_irq(struct pt_regs *regs);
 
+void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
+ u64 intel_ctrl);
+
 extern struct event_constraint emptyconstraint;
 
 extern struct event_constraint unconstrained;


[tip: perf/core] perf/x86: Remove temporary pmu assignment in event_init

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: b98567298bad891774054113690b30bd90d5738d
Gitweb:
https://git.kernel.org/tip/b98567298bad891774054113690b30bd90d5738d
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:54 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:27 +02:00

perf/x86: Remove temporary pmu assignment in event_init

The temporary pmu assignment in event_init is unnecessary.

The assignment was introduced by commit 8113070d6639 ("perf_events:
Add fast-path to the rescheduling code"). At that time, event->pmu was
not yet assigned when initializing an event, so the assignment was
required. However, since commit 7e5b2a01d2ca ("perf: provide PMU when
initing events"), event->pmu is provided before event_init is invoked,
so the temporary pmu assignment in event_init can be removed.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-15-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c | 11 ---
 1 file changed, 11 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 57d3fe1..ed8dcfb 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2291,7 +2291,6 @@ out:
 
 static int x86_pmu_event_init(struct perf_event *event)
 {
-   struct pmu *tmp;
int err;
 
switch (event->attr.type) {
@@ -2306,20 +2305,10 @@ static int x86_pmu_event_init(struct perf_event *event)
 
err = __x86_pmu_event_init(event);
if (!err) {
-   /*
-* we temporarily connect event to its pmu
-* such that validate_group() can classify
-* it as an x86 event using is_x86_event()
-*/
-   tmp = event->pmu;
-   event->pmu = &pmu;
-
if (event->group_leader != event)
err = validate_group(event);
else
err = validate_event(event);
-
-   event->pmu = tmp;
}
if (err) {
if (event->destroy)


[tip: perf/core] perf/x86: Register hybrid PMUs

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: d9977c43bff895ed49a9d25e1f382b0a98bb271f
Gitweb:
https://git.kernel.org/tip/d9977c43bff895ed49a9d25e1f382b0a98bb271f
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:56 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:27 +02:00

perf/x86: Register hybrid PMUs

Different hybrid PMUs have different PMU capabilities and events. Perf
should register a dedicated PMU for each of them.

To check whether an event is an X86 event, perf has to go through all
possible hybrid PMUs.

All the hybrid PMUs are registered at boot time. Before the
registration, add intel_pmu_check_hybrid_pmus() to check and update
the counter information, the event constraints, the extra registers
and the unique capabilities for each hybrid PMU.

Postpone the display of the PMU information and the HW check to
CPU_STARTING, because the boot CPU is the only online CPU in
init_hw_perf_events(). Perf doesn't know the availability of the other
PMUs. Perf should display the PMU information only if the counters of
the PMU are available.

All CPUs of one type may be offline. In that case, users can still
observe the PMU in /sys/devices, but its CPU mask is 0.

All hybrid PMUs have the capability PERF_PMU_CAP_HETEROGENEOUS_CPUS.
The PMU name for hybrid PMUs will be "cpu_XXX", which will be assigned
later in a separate patch.

The PMU type id for the core PMU is still PERF_TYPE_RAW. For the other
hybrid PMUs, the PMU type id is not hard-coded.

The event->cpu must be compatible with the supported CPUs of the PMU.
Add a check in x86_pmu_event_init().

The events in a group must be from the same type of hybrid PMU.
The fake cpuc used in the validation must be from the supported CPU of
the event->pmu.

Perf may not retrieve a valid core type from get_this_hybrid_cpu_type().
For example, ADL may have an alternative configuration. With that
configuration, Perf cannot retrieve the core type from the CPUID leaf
0x1a. Add a platform-specific get_hybrid_cpu_type(). If the generic
way fails, invoke the platform-specific get_hybrid_cpu_type().
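
A minimal user-space sketch of the is_x86_event() change described
above, assuming a fixed-size array of registered hybrid PMUs: with
several PMUs registered there is no single global 'pmu' to compare
against, so an event is an x86 event if its pmu matches any of them.

/* Illustrative user-space sketch of the hybrid-aware is_x86_event(). */
#include <stdbool.h>
#include <stdio.h>

#define MAX_HYBRID_PMUS 2

struct pmu { const char *name; };
struct perf_event { struct pmu *pmu; };

static struct pmu hybrid_pmus[MAX_HYBRID_PMUS] = {
	{ "cpu_core" }, { "cpu_atom" },
};
static int num_hybrid_pmus = 2;

static bool is_x86_event(const struct perf_event *event)
{
	int i;

	for (i = 0; i < num_hybrid_pmus; i++) {
		if (event->pmu == &hybrid_pmus[i])
			return true;
	}
	return false;
}

int main(void)
{
	struct pmu other = { "uncore" };
	struct perf_event e1 = { .pmu = &hybrid_pmus[1] };
	struct perf_event e2 = { .pmu = &other };

	printf("e1 is x86 event: %d\n", is_x86_event(&e1));
	printf("e2 is x86 event: %d\n", is_x86_event(&e2));
	return 0;
}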

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1618237865-33448-17-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   | 137 +-
 arch/x86/events/intel/core.c |  93 ++-
 arch/x86/events/perf_event.h |  14 +++-
 3 files changed, 223 insertions(+), 21 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 2e7ae52..bd465a8 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -478,7 +478,7 @@ int x86_setup_perfctr(struct perf_event *event)
local64_set(&hwc->period_left, hwc->sample_period);
}
 
-   if (attr->type == PERF_TYPE_RAW)
+   if (attr->type == event->pmu->type)
return x86_pmu_extra_regs(event->attr.config, event);
 
if (attr->type == PERF_TYPE_HW_CACHE)
@@ -613,7 +613,7 @@ int x86_pmu_hw_config(struct perf_event *event)
if (!event->attr.exclude_kernel)
event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
 
-   if (event->attr.type == PERF_TYPE_RAW)
+   if (event->attr.type == event->pmu->type)
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
if (event->attr.sample_period && x86_pmu.limit_period) {
@@ -742,7 +742,17 @@ void x86_pmu_enable_all(int added)
 
 static inline int is_x86_event(struct perf_event *event)
 {
-   return event->pmu == &pmu;
+   int i;
+
+   if (!is_hybrid())
+   return event->pmu == &pmu;
+
+   for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+   if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu)
+   return true;
+   }
+
+   return false;
 }
 
 struct pmu *x86_get_pmu(unsigned int cpu)
@@ -1990,6 +2000,23 @@ void x86_pmu_show_pmu_cap(int num_counters, int 
num_counters_fixed,
pr_info("... event mask: %016Lx\n", intel_ctrl);
 }
 
+/*
+ * The generic code is not hybrid friendly. The hybrid_pmu->pmu
+ * of the first registered PMU is unconditionally assigned to
+ * each possible cpuctx->ctx.pmu.
+ * Update the correct hybrid PMU to the cpuctx->ctx.pmu.
+ */
+void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu)
+{
+   struct perf_cpu_context *cpuctx;
+
+   if (!pmu->pmu_cpu_context)
+   return;
+
+   cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+   cpuctx->ctx.pmu = pmu;
+}
+
 static int __init init_hw_perf_events(void)
 {
struct x86_pmu_quirk *quirk;
@@ -2050,8 +2077,11 @@ static int __init init_hw_perf_events(void)
 
pmu.attr_update = x86_pmu.attr_update;
 
-   x86_pmu_show_pmu_cap(x86_pmu.num_counters, x86_pmu.num_counters_fixed,
-x86_pmu.intel_ctrl);
+   if (!is_hybrid()) {
+  

[tip: perf/core] perf/x86: Add structures for the attributes of Hybrid PMUs

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: a9c81ccdf52dd73a20178c40bca34cf52991fdea
Gitweb:
https://git.kernel.org/tip/a9c81ccdf52dd73a20178c40bca34cf52991fdea
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:57 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:28 +02:00

perf/x86: Add structures for the attributes of Hybrid PMUs

Hybrid PMUs have different events and formats. In theory, Hybrid PMU
specific attributes should be maintained in the dedicated struct
x86_hybrid_pmu, but it wastes space because the events and formats are
similar among Hybrid PMUs.

To reduce duplication, all hybrid PMUs will share a group of attributes
in the following patch. To distinguish an attribute from different
Hybrid PMUs, a PMU-aware attribute structure is introduced. A PMU type
is required for the attribute structure. The type is for internal use
only; it is not visible in the sysfs API.

Hybrid PMUs may support the same event name, but with different event
encodings, e.g., the mem-loads event on an Atom PMU has a different
event encoding from a Core PMU. This causes an issue if two attributes
are created for them: the current sysfs_update_group() finds an
attribute by searching the attr name (aka event name). If two
attributes have the same event name, the first attribute will be
replaced.
To address the issue, only one attribute is created per event. The
event_str is extended and stores the event encodings from all Hybrid
PMUs. The event encodings are separated by ";". The order of the event
encodings must follow the order of the hybrid PMU index. The event_str
is for internal use as well. When a user reads the attribute of a
Hybrid PMU, only the corresponding part of the string is displayed.
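
A small user-space sketch of the ';'-separated event_str handling,
with invented encodings: the i-th field of the string is the encoding
for the hybrid PMU with index i, and only that field is printed when
the corresponding PMU's attribute is read.

/* Illustrative user-space sketch of the shared event_str parsing. */
#include <stdio.h>
#include <string.h>

/* Print the idx-th ';'-separated field of event_str. */
static void show_event_for_pmu(const char *event_str, int idx)
{
	const char *str = event_str, *next;
	int i;

	for (i = 0; i < idx; i++) {
		str = strchr(str, ';');
		if (!str)
			return;		/* fewer encodings than PMUs */
		str++;
	}
	next = strchr(str, ';');
	if (next)
		printf("%.*s\n", (int)(next - str), str);
	else
		printf("%s\n", str);
}

int main(void)
{
	/* Invented encodings, one per hybrid PMU index. */
	const char *mem_loads =
		"event=0xcd,umask=0x01,ldlat=3;event=0xd0,umask=0x05,ldlat=3";

	show_event_for_pmu(mem_loads, 0);	/* e.g. the first PMU's encoding */
	show_event_for_pmu(mem_loads, 1);	/* e.g. the second PMU's encoding */
	return 0;
}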

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-18-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   | 43 +++-
 arch/x86/events/perf_event.h | 19 +++-
 include/linux/perf_event.h   | 12 ++-
 3 files changed, 74 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index bd465a8..37ab109 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1860,6 +1860,49 @@ ssize_t events_ht_sysfs_show(struct device *dev, struct 
device_attribute *attr,
pmu_attr->event_str_noht);
 }
 
+ssize_t events_hybrid_sysfs_show(struct device *dev,
+struct device_attribute *attr,
+char *page)
+{
+   struct perf_pmu_events_hybrid_attr *pmu_attr =
+   container_of(attr, struct perf_pmu_events_hybrid_attr, attr);
+   struct x86_hybrid_pmu *pmu;
+   const char *str, *next_str;
+   int i;
+
+   if (hweight64(pmu_attr->pmu_type) == 1)
+   return sprintf(page, "%s", pmu_attr->event_str);
+
+   /*
+* Hybrid PMUs may support the same event name, but with different
+* event encoding, e.g., the mem-loads event on an Atom PMU has
+* different event encoding from a Core PMU.
+*
+* The event_str includes all event encodings. Each event encoding
+* is divided by ";". The order of the event encodings must follow
+* the order of the hybrid PMU index.
+*/
+   pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+   str = pmu_attr->event_str;
+   for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+   if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type))
+   continue;
+   if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) {
+   next_str = strchr(str, ';');
+   if (next_str)
+   return snprintf(page, next_str - str + 1, "%s", 
str);
+   else
+   return sprintf(page, "%s", str);
+   }
+   str = strchr(str, ';');
+   str++;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show);
+
 EVENT_ATTR(cpu-cycles, CPU_CYCLES  );
 EVENT_ATTR(instructions,   INSTRUCTIONS);
 EVENT_ATTR(cache-references,   CACHE_REFERENCES);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 4282ce4..e2be927 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -979,6 +979,22 @@ static struct perf_pmu_events_ht_attr event_attr_##v = {   
\
.event_str_ht   = ht,   \
 }
 
+#define EVENT_ATTR_STR_HYBRID(_name, v, str, _pmu) \
+static struct perf_pmu_events_hybrid_attr event_attr_##v = {   \
+   .attr   = __ATTR(_name, 0444, events_hybrid_sysfs_show, NULL),\
+   .id   

[tip: perf/core] perf/x86/intel: Add Alder Lake Hybrid support

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: f83d2f91d2590318e083d05bd7b1beda2489050e
Gitweb:
https://git.kernel.org/tip/f83d2f91d2590318e083d05bd7b1beda2489050e
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:31:00 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:28 +02:00

perf/x86/intel: Add Alder Lake Hybrid support

The Alder Lake Hybrid system has two different types of core, the
Golden Cove core and the Gracemont core. The Golden Cove core is
registered to the "cpu_core" PMU. The Gracemont core is registered to
the "cpu_atom" PMU.

The differences between the two PMUs include:
- Number of GP and fixed counters
- Events
- The "cpu_core" PMU supports Topdown metrics.
  The "cpu_atom" PMU supports PEBS-via-PT.

The "cpu_core" PMU is similar to the Sapphire Rapids PMU, but without
PMEM.
The "cpu_atom" PMU is similar to Tremont, but with different events,
event_constraints, extra_regs and number of counters.

The mem-loads AUX event workaround only applies to the Golden Cove core.

Users may disable all CPUs of the same CPU type on the command line
or in the BIOS. In that case, perf still registers a PMU for the CPU
type, but its CPU mask is 0.

The current caps/pmu_name is usually the microarchitecture codename.
Assign "alderlake_hybrid" to the caps/pmu_name of both PMUs to indicate
the hybrid Alder Lake microarchitecture.
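
A user-space sketch of the hybrid-aware Topdown dispatch described
above, in the spirit of adl_update_topdown_event() in the diff below:
the callback first checks the PMU's core type and returns early on the
Atom PMU, since only the "cpu_core" PMU supports Topdown metrics. The
types and the returned value are stand-ins.

/* Illustrative user-space sketch of per-core-type Topdown dispatch. */
#include <stdint.h>
#include <stdio.h>

enum hybrid_cpu_type { HYBRID_BIG, HYBRID_SMALL };

struct hybrid_pmu { const char *name; enum hybrid_cpu_type cpu_type; };
struct perf_event { struct hybrid_pmu *pmu; };

static uint64_t icl_style_update_topdown(const struct perf_event *event)
{
	(void)event;
	return 0x1234;		/* stand-in for the real metric read-out */
}

static uint64_t adl_style_update_topdown(const struct perf_event *event)
{
	if (event->pmu->cpu_type != HYBRID_BIG)
		return 0;	/* the Atom PMU has no Topdown metrics */

	return icl_style_update_topdown(event);
}

int main(void)
{
	struct hybrid_pmu core = { "cpu_core", HYBRID_BIG };
	struct hybrid_pmu atom = { "cpu_atom", HYBRID_SMALL };
	struct perf_event on_core = { &core }, on_atom = { &atom };

	printf("core topdown: %#llx\n",
	       (unsigned long long)adl_style_update_topdown(&on_core));
	printf("atom topdown: %#llx\n",
	       (unsigned long long)adl_style_update_topdown(&on_atom));
	return 0;
}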

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-21-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 255 +-
 arch/x86/events/intel/ds.c   |   7 +-
 arch/x86/events/perf_event.h |   7 +-
 3 files changed, 268 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index ba24638..5272f34 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2076,6 +2076,14 @@ static struct extra_reg intel_tnt_extra_regs[] 
__read_mostly = {
EVENT_EXTRA_END
 };
 
+static struct extra_reg intel_grt_extra_regs[] __read_mostly = {
+   /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+   INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3full, 
RSP_0),
+   INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x3full, 
RSP_1),
+   INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x5d0),
+   EVENT_EXTRA_END
+};
+
 #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
 #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
 #define KNL_MCDRAM_LOCAL   BIT_ULL(21)
@@ -2430,6 +2438,16 @@ static int icl_set_topdown_event_period(struct 
perf_event *event)
return 0;
 }
 
+static int adl_set_topdown_event_period(struct perf_event *event)
+{
+   struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+   if (pmu->cpu_type != hybrid_big)
+   return 0;
+
+   return icl_set_topdown_event_period(event);
+}
+
 static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx)
 {
u32 val;
@@ -2570,6 +2588,17 @@ static u64 icl_update_topdown_event(struct perf_event 
*event)
 x86_pmu.num_topdown_events - 
1);
 }
 
+static u64 adl_update_topdown_event(struct perf_event *event)
+{
+   struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+   if (pmu->cpu_type != hybrid_big)
+   return 0;
+
+   return icl_update_topdown_event(event);
+}
+
+
 static void intel_pmu_read_topdown_event(struct perf_event *event)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -3655,6 +3684,17 @@ static inline bool is_mem_loads_aux_event(struct 
perf_event *event)
return (event->attr.config & INTEL_ARCH_EVENT_MASK) == 
X86_CONFIG(.event=0x03, .umask=0x82);
 }
 
+static inline bool require_mem_loads_aux_event(struct perf_event *event)
+{
+   if (!(x86_pmu.flags & PMU_FL_MEM_LOADS_AUX))
+   return false;
+
+   if (is_hybrid())
+   return hybrid_pmu(event->pmu)->cpu_type == hybrid_big;
+
+   return true;
+}
+
 static inline bool intel_pmu_has_cap(struct perf_event *event, int idx)
 {
union perf_capabilities *intel_cap = &hybrid(event->pmu, intel_cap);
@@ -3779,7 +3819,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 * event. The rule is to simplify the implementation of the check.
 * That's because perf cannot have a complete group at the moment.
 */
-   if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX &&
+   if (require_mem_loads_aux_event(event) &&
(event->attr.sample_type & PERF_SAMPLE_DATA_SRC) &&
is_mem_loads_event(event)) {
struct perf_event *leader = event->group_leader;
@@ -4056,6 +4096,39 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, 
int idx,
return c;
 }
 
+static struct event_constraint *
+adl_get_event_c

[tip: perf/core] perf/x86/intel: Add attr_update for Hybrid PMUs

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 58ae30c29a370c09eb49e0007d881a9aed13c5a3
Gitweb:
https://git.kernel.org/tip/58ae30c29a370c09eb49e0007d881a9aed13c5a3
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:58 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:28 +02:00

perf/x86/intel: Add attr_update for Hybrid PMUs

The attribute_group for Hybrid PMUs should be different from that of the
previous cpu PMU. For example, a cpumask attribute is required for a
Hybrid PMU, and the PMU type should be included in the event and format
attributes.

Add hybrid_attr_update for the Hybrid PMUs.
Check the PMU type in the is_visible() functions. Only display the event
or format for the matching Hybrid PMU.
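
The is_visible() callbacks rely on hybrid attribute wrappers that carry a
pmu_type mask next to the standard device attribute; roughly (a sketch of
the structures introduced earlier in this series):

	struct perf_pmu_events_hybrid_attr {
		struct device_attribute	attr;
		u64			id;
		const char		*event_str;
		u64			pmu_type;
	};

	struct perf_pmu_format_hybrid_attr {
		struct device_attribute	attr;
		u64			pmu_type;
	};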

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-19-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 120 --
 1 file changed, 114 insertions(+), 6 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 4881209..ba24638 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -5118,6 +5118,106 @@ static const struct attribute_group *attr_update[] = {
NULL,
 };
 
+static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr)
+{
+   struct device *dev = kobj_to_dev(kobj);
+   struct x86_hybrid_pmu *pmu =
+   container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+   struct perf_pmu_events_hybrid_attr *pmu_attr =
+   container_of(attr, struct perf_pmu_events_hybrid_attr, 
attr.attr);
+
+   return pmu->cpu_type & pmu_attr->pmu_type;
+}
+
+static umode_t hybrid_events_is_visible(struct kobject *kobj,
+   struct attribute *attr, int i)
+{
+   return is_attr_for_this_pmu(kobj, attr) ? attr->mode : 0;
+}
+
+static inline int hybrid_find_supported_cpu(struct x86_hybrid_pmu *pmu)
+{
+   int cpu = cpumask_first(&pmu->supported_cpus);
+
+   return (cpu >= nr_cpu_ids) ? -1 : cpu;
+}
+
+static umode_t hybrid_tsx_is_visible(struct kobject *kobj,
+struct attribute *attr, int i)
+{
+   struct device *dev = kobj_to_dev(kobj);
+   struct x86_hybrid_pmu *pmu =
+container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+   int cpu = hybrid_find_supported_cpu(pmu);
+
+   return (cpu >= 0) && is_attr_for_this_pmu(kobj, attr) && 
cpu_has(&cpu_data(cpu), X86_FEATURE_RTM) ? attr->mode : 0;
+}
+
+static umode_t hybrid_format_is_visible(struct kobject *kobj,
+   struct attribute *attr, int i)
+{
+   struct device *dev = kobj_to_dev(kobj);
+   struct x86_hybrid_pmu *pmu =
+   container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+   struct perf_pmu_format_hybrid_attr *pmu_attr =
+   container_of(attr, struct perf_pmu_format_hybrid_attr, 
attr.attr);
+   int cpu = hybrid_find_supported_cpu(pmu);
+
+   return (cpu >= 0) && (pmu->cpu_type & pmu_attr->pmu_type) ? attr->mode 
: 0;
+}
+
+static struct attribute_group hybrid_group_events_td  = {
+   .name   = "events",
+   .is_visible = hybrid_events_is_visible,
+};
+
+static struct attribute_group hybrid_group_events_mem = {
+   .name   = "events",
+   .is_visible = hybrid_events_is_visible,
+};
+
+static struct attribute_group hybrid_group_events_tsx = {
+   .name   = "events",
+   .is_visible = hybrid_tsx_is_visible,
+};
+
+static struct attribute_group hybrid_group_format_extra = {
+   .name   = "format",
+   .is_visible = hybrid_format_is_visible,
+};
+
+static ssize_t intel_hybrid_get_attr_cpus(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+   struct x86_hybrid_pmu *pmu =
+   container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+   return cpumap_print_to_pagebuf(true, buf, &pmu->supported_cpus);
+}
+
+static DEVICE_ATTR(cpus, S_IRUGO, intel_hybrid_get_attr_cpus, NULL);
+static struct attribute *intel_hybrid_cpus_attrs[] = {
+   &dev_attr_cpus.attr,
+   NULL,
+};
+
+static struct attribute_group hybrid_group_cpus = {
+   .attrs  = intel_hybrid_cpus_attrs,
+};
+
+static const struct attribute_group *hybrid_attr_update[] = {
+   &hybrid_group_events_td,
+   &hybrid_group_events_mem,
+   &hybrid_group_events_tsx,
+   &group_caps_gen,
+   &group_caps_lbr,
+   &hybrid_group_format_extra,
+   &group_default,
+   &hybrid_group_cpus,
+   NULL,
+};
+
 static struct attribute *empty_attrs;
 
 static void intel_pmu_check_num_counters(int *num_counters,
@@ -5861,14 +5961,22 @@ __init int intel

[tip: perf/core] perf/x86: Support filter_match callback

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 3e9a8b219e4cc897dba20e19185d0471f129f6f3
Gitweb:
https://git.kernel.org/tip/3e9a8b219e4cc897dba20e19185d0471f129f6f3
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:30:59 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:28 +02:00

perf/x86: Support filter_match callback

Implement the filter_match callback for x86, which checks whether an
event is schedulable on the current CPU.
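
For a hybrid PMU, the callback can simply test whether the current CPU is
in the PMU's supported CPU mask. A sketch, assuming the x86_hybrid_pmu and
supported_cpus structures from earlier patches in this series:

	static int intel_pmu_filter_match(struct perf_event *event)
	{
		struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
		unsigned int cpu = smp_processor_id();

		return cpumask_test_cpu(cpu, &pmu->supported_cpus);
	}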

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-20-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   | 10 ++
 arch/x86/events/perf_event.h |  1 +
 2 files changed, 11 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 37ab109..4f6595e 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2641,6 +2641,14 @@ static int x86_pmu_aux_output_match(struct perf_event 
*event)
return 0;
 }
 
+static int x86_pmu_filter_match(struct perf_event *event)
+{
+   if (x86_pmu.filter_match)
+   return x86_pmu.filter_match(event);
+
+   return 1;
+}
+
 static struct pmu pmu = {
.pmu_enable = x86_pmu_enable,
.pmu_disable= x86_pmu_disable,
@@ -2668,6 +2676,8 @@ static struct pmu pmu = {
.check_period   = x86_pmu_check_period,
 
.aux_output_match   = x86_pmu_aux_output_match,
+
+   .filter_match   = x86_pmu_filter_match,
 };
 
 void arch_perf_update_userpage(struct perf_event *event,
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index e2be927..606fb6e 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -879,6 +879,7 @@ struct x86_pmu {
 
int (*aux_output_match) (struct perf_event *event);
 
+   int (*filter_match)(struct perf_event *event);
/*
 * Hybrid support
 *


[tip: perf/core] perf: Extend PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 55bcf6ef314ae8ba81bcd74aa760247b635ed47b
Gitweb:
https://git.kernel.org/tip/55bcf6ef314ae8ba81bcd74aa760247b635ed47b
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:31:01 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:29 +02:00

perf: Extend PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE

Current Hardware events and Hardware cache events have special perf
types, PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE. The two types don't
pass the PMU type in the user interface. For a hybrid system, the perf
subsystem doesn't know which PMU the events belong to. The first capable
PMU will always be assigned to the events. The events never get a chance
to run on the other capable PMUs.

Extend the two types to become PMU aware types. The PMU type ID is
stored at attr.config[63:32].
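
As a user-space illustration of the new encoding (the sysfs path and the
"cpu_core" PMU name are assumptions, not part of this commit):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		unsigned long long pmu_type = 0;
		struct perf_event_attr attr;
		FILE *f;
		int fd;

		/* Each PMU exports its type ID in sysfs. */
		f = fopen("/sys/bus/event_source/devices/cpu_core/type", "r");
		if (!f)
			return 1;
		if (fscanf(f, "%llu", &pmu_type) != 1) {
			fclose(f);
			return 1;
		}
		fclose(f);

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		/* PMU type ID in config[63:32], hardware event ID in the low bits. */
		attr.config = (pmu_type << 32) | PERF_COUNT_HW_CPU_CYCLES;

		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0)
			perror("perf_event_open");
		return 0;
	}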

Add a new PMU capability, PERF_PMU_CAP_EXTENDED_HW_TYPE, to indicate a
PMU which supports the extended PERF_TYPE_HARDWARE and
PERF_TYPE_HW_CACHE.

The PMU type is only required when searching for a specific PMU. The
PMU-specific code is only interested in the 'real' config value, which
is stored in the low 32 bits of event->attr.config. Update
event->attr.config in the generic code, so the PMU-specific code doesn't
need to calculate it separately.

If a user specifies a PMU type, but the PMU doesn't support the extended
type, error out.

If an event cannot be initialized in a PMU specified by a user, error
out immediately. Perf should not try to open it on other PMUs.

The new PMU capability is only set for the X86 hybrid PMUs for now.
Other architectures, e.g., ARM, may need it as well. The support on ARM
may be implemented later separately.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1618237865-33448-22-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c  |  1 +
 include/linux/perf_event.h  | 19 ++-
 include/uapi/linux/perf_event.h | 15 +++
 kernel/events/core.c| 19 ---
 4 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 4f6595e..3fe66b7 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2173,6 +2173,7 @@ static int __init init_hw_perf_events(void)
hybrid_pmu->pmu.type = -1;
hybrid_pmu->pmu.attr_update = x86_pmu.attr_update;
hybrid_pmu->pmu.capabilities |= 
PERF_PMU_CAP_HETEROGENEOUS_CPUS;
+   hybrid_pmu->pmu.capabilities |= 
PERF_PMU_CAP_EXTENDED_HW_TYPE;
 
err = perf_pmu_register(&hybrid_pmu->pmu, 
hybrid_pmu->name,
(hybrid_pmu->cpu_type == 
hybrid_big) ? PERF_TYPE_RAW : -1);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61b3851..a763928 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -260,15 +260,16 @@ struct perf_event;
 /**
  * pmu::capabilities flags
  */
-#define PERF_PMU_CAP_NO_INTERRUPT  0x01
-#define PERF_PMU_CAP_NO_NMI0x02
-#define PERF_PMU_CAP_AUX_NO_SG 0x04
-#define PERF_PMU_CAP_EXTENDED_REGS 0x08
-#define PERF_PMU_CAP_EXCLUSIVE 0x10
-#define PERF_PMU_CAP_ITRACE0x20
-#define PERF_PMU_CAP_HETEROGENEOUS_CPUS0x40
-#define PERF_PMU_CAP_NO_EXCLUDE0x80
-#define PERF_PMU_CAP_AUX_OUTPUT0x100
+#define PERF_PMU_CAP_NO_INTERRUPT  0x0001
+#define PERF_PMU_CAP_NO_NMI0x0002
+#define PERF_PMU_CAP_AUX_NO_SG 0x0004
+#define PERF_PMU_CAP_EXTENDED_REGS 0x0008
+#define PERF_PMU_CAP_EXCLUSIVE 0x0010
+#define PERF_PMU_CAP_ITRACE0x0020
+#define PERF_PMU_CAP_HETEROGENEOUS_CPUS0x0040
+#define PERF_PMU_CAP_NO_EXCLUDE0x0080
+#define PERF_PMU_CAP_AUX_OUTPUT0x0100
+#define PERF_PMU_CAP_EXTENDED_HW_TYPE  0x0200
 
 struct perf_output_handle;
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 0b58970..e54e639 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -38,6 +38,21 @@ enum perf_type_id {
 };
 
 /*
+ * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
+ * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA
+ * AA: hardware event ID
+ * EEEEEEEE: PMU type ID
+ * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB
+ * BB: hardware cache ID
+ * CC: hardware cache op ID
+ *  

[tip: perf/core] perf/x86/intel/uncore: Add Alder Lake support

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 772ed05f3c5ce722b9de6c4c2dd87538a33fb8d3
Gitweb:
https://git.kernel.org/tip/772ed05f3c5ce722b9de6c4c2dd87538a33fb8d3
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:31:02 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:29 +02:00

perf/x86/intel/uncore: Add Alder Lake support

The uncore subsystem for Alder Lake is similar to that of the previous
Tiger Lake.

The differences include:
- New MSR addresses for global control, fixed counters, CBOX and ARB.
  Add a new adl_uncore_msr_ops for uncore operations.
- Add a new threshold field for CBOX.
- New PCIIDs for IMC devices.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-23-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore.c |   7 +-
 arch/x86/events/intel/uncore.h |   1 +-
 arch/x86/events/intel/uncore_snb.c | 131 -
 3 files changed, 139 insertions(+)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index a2b68bb..df7b07d 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1752,6 +1752,11 @@ static const struct intel_uncore_init_fun 
rkl_uncore_init __initconst = {
.pci_init = skl_uncore_pci_init,
 };
 
+static const struct intel_uncore_init_fun adl_uncore_init __initconst = {
+   .cpu_init = adl_uncore_cpu_init,
+   .mmio_init = tgl_uncore_mmio_init,
+};
+
 static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
@@ -1806,6 +1811,8 @@ static const struct x86_cpu_id intel_uncore_match[] 
__initconst = {
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,   &tgl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE,  &rkl_uncore_init),
+   X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,   &adl_uncore_init),
+   X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,  &snr_uncore_init),
{},
 };
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 96569dc..2917910 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -582,6 +582,7 @@ void snb_uncore_cpu_init(void);
 void nhm_uncore_cpu_init(void);
 void skl_uncore_cpu_init(void);
 void icl_uncore_cpu_init(void);
+void adl_uncore_cpu_init(void);
 void tgl_uncore_cpu_init(void);
 void tgl_uncore_mmio_init(void);
 void tgl_l_uncore_mmio_init(void);
diff --git a/arch/x86/events/intel/uncore_snb.c 
b/arch/x86/events/intel/uncore_snb.c
index 5127128..0f63706 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -62,6 +62,8 @@
 #define PCI_DEVICE_ID_INTEL_TGL_H_IMC  0x9a36
 #define PCI_DEVICE_ID_INTEL_RKL_1_IMC  0x4c43
 #define PCI_DEVICE_ID_INTEL_RKL_2_IMC  0x4c53
+#define PCI_DEVICE_ID_INTEL_ADL_1_IMC  0x4660
+#define PCI_DEVICE_ID_INTEL_ADL_2_IMC  0x4641
 
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK0x00ff
@@ -131,12 +133,33 @@
 #define ICL_UNC_ARB_PER_CTR0x3b1
 #define ICL_UNC_ARB_PERFEVTSEL 0x3b3
 
+/* ADL uncore global control */
+#define ADL_UNC_PERF_GLOBAL_CTL0x2ff0
+#define ADL_UNC_FIXED_CTR_CTRL  0x2fde
+#define ADL_UNC_FIXED_CTR   0x2fdf
+
+/* ADL Cbo register */
+#define ADL_UNC_CBO_0_PER_CTR0 0x2002
+#define ADL_UNC_CBO_0_PERFEVTSEL0  0x2000
+#define ADL_UNC_CTL_THRESHOLD  0x3f000000
+#define ADL_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+SNB_UNC_CTL_UMASK_MASK | \
+SNB_UNC_CTL_EDGE_DET | \
+SNB_UNC_CTL_INVERT | \
+ADL_UNC_CTL_THRESHOLD)
+
+/* ADL ARB register */
+#define ADL_UNC_ARB_PER_CTR0   0x2FD2
+#define ADL_UNC_ARB_PERFEVTSEL00x2FD0
+#define ADL_UNC_ARB_MSR_OFFSET 0x8
+
 DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
 DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
 DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
 DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
 DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
 DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(threshold, threshold, "config:24-29");
 
 /* Sandy Bridge uncore support */
 static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct 
perf_event *event)
@@ -4

[tip: perf/core] perf/x86/cstate: Add Alder Lake CPU support

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: d0ca946bcf84e1f9847571923bb1e6bd1264f424
Gitweb:
https://git.kernel.org/tip/d0ca946bcf84e1f9847571923bb1e6bd1264f424
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:31:04 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:29 +02:00

perf/x86/cstate: Add Alder Lake CPU support

Compared with the Rocket Lake, the CORE C1 Residency Counter is added
for Alder Lake, but the CORE C3 Residency Counter is removed. Other
counters are the same.

Create a new adl_cstates for Alder Lake. Update the comments
accordingly.
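
A sketch of what the new model description looks like, assuming the
existing cstate_model and PERF_CSTATE_* definitions in cstate.c (the
exact layout may differ from the patch):

	static const struct cstate_model adl_cstates __initconst = {
		.core_events	= BIT(PERF_CSTATE_CORE_C1_RES) |
				  BIT(PERF_CSTATE_CORE_C6_RES) |
				  BIT(PERF_CSTATE_CORE_C7_RES),

		.pkg_events	= BIT(PERF_CSTATE_PKG_C2_RES) |
				  BIT(PERF_CSTATE_PKG_C3_RES) |
				  BIT(PERF_CSTATE_PKG_C6_RES) |
				  BIT(PERF_CSTATE_PKG_C7_RES) |
				  BIT(PERF_CSTATE_PKG_C8_RES) |
				  BIT(PERF_CSTATE_PKG_C9_RES) |
				  BIT(PERF_CSTATE_PKG_C10_RES),
	};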

The External Design Specification (EDS) is not published yet. It comes
from an authoritative internal source.

The patch has been tested on real hardware.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-25-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/cstate.c | 39 -
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 407eee5..4333990 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -40,7 +40,7 @@
  * Model specific counters:
  * MSR_CORE_C1_RES: CORE C1 Residency Counter
  *  perf code: 0x00
- *  Available model: SLM,AMT,GLM,CNL,TNT
+ *  Available model: SLM,AMT,GLM,CNL,TNT,ADL
  *  Scope: Core (each processor core has a MSR)
  * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
  *perf code: 0x01
@@ -51,46 +51,49 @@
  *perf code: 0x02
  *Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
  * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT,RKL
+ * TNT,RKL,ADL
  *Scope: Core
  * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
  *perf code: 0x03
  *Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
- * ICL,TGL,RKL
+ * ICL,TGL,RKL,ADL
  *Scope: Core
  * MSR_PKG_C2_RESIDENCY:  Package C2 Residency Counter.
  *perf code: 0x00
  *Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
- * KBL,CML,ICL,TGL,TNT,RKL
+ * KBL,CML,ICL,TGL,TNT,RKL,ADL
  *Scope: Package (physical package)
  * MSR_PKG_C3_RESIDENCY:  Package C3 Residency Counter.
  *perf code: 0x01
  *Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
- * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL
+ * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL,
+ * ADL
  *Scope: Package (physical package)
  * MSR_PKG_C6_RESIDENCY:  Package C6 Residency Counter.
  *perf code: 0x02
  *Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
  * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT,RKL
+ * TNT,RKL,ADL
  *Scope: Package (physical package)
  * MSR_PKG_C7_RESIDENCY:  Package C7 Residency Counter.
  *perf code: 0x03
  *Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL,
- * KBL,CML,ICL,TGL,RKL
+ * KBL,CML,ICL,TGL,RKL,ADL
  *Scope: Package (physical package)
  * MSR_PKG_C8_RESIDENCY:  Package C8 Residency Counter.
  *perf code: 0x04
- *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
+ *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
+ * ADL
  *Scope: Package (physical package)
  * MSR_PKG_C9_RESIDENCY:  Package C9 Residency Counter.
  *perf code: 0x05
- *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
+ *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
+ * ADL
  *Scope: Package (physical package)
  * MSR_PKG_C10_RESIDENCY: Packag

[tip: perf/core] perf/x86/msr: Add Alder Lake CPU support

2021-04-20 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 19d3a81fd92dc9b73950564955164ecfd0dfbea1
Gitweb:
https://git.kernel.org/tip/19d3a81fd92dc9b73950564955164ecfd0dfbea1
Author:Kan Liang 
AuthorDate:Mon, 12 Apr 2021 07:31:03 -07:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 19 Apr 2021 20:03:29 +02:00

perf/x86/msr: Add Alder Lake CPU support

PPERF and SMI_COUNT MSRs are also supported on Alder Lake.

The External Design Specification (EDS) is not published yet. It comes
from an authoritative internal source.

The patch has been tested on real hardware.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1618237865-33448-24-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/msr.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 680404c..c853b28 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -100,6 +100,8 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_TIGERLAKE_L:
case INTEL_FAM6_TIGERLAKE:
case INTEL_FAM6_ROCKETLAKE:
+   case INTEL_FAM6_ALDERLAKE:
+   case INTEL_FAM6_ALDERLAKE_L:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;


[tip: perf/core] perf/x86: Move cpuc->running into P4 specific code

2021-04-16 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 46ade4740bbf9bf4e804ddb2c85845cccd219f3c
Gitweb:
https://git.kernel.org/tip/46ade4740bbf9bf4e804ddb2c85845cccd219f3c
Author:Kan Liang 
AuthorDate:Wed, 14 Apr 2021 07:36:29 -07:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 16:32:42 +02:00

perf/x86: Move cpuc->running into P4 specific code

The 'running' variable is only used in the P4 PMU. Currently, perf sets
the variable in the critical function x86_pmu_start(), which wastes
cycles for everybody not running on a P4.

Move cpuc->running into the P4 specific p4_pmu_enable_event().

Add a static per-CPU 'p4_running' variable to replace the 'running'
variable in the struct cpu_hw_events. This saves space in the generic
structure.

The p4_pmu_enable_all() also invokes the p4_pmu_enable_event(), but it
should not set cpuc->running. Factor out __p4_pmu_enable_event() for
p4_pmu_enable_all().

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1618410990-21383-1-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   |  1 -
 arch/x86/events/intel/p4.c   | 16 +---
 arch/x86/events/perf_event.h |  1 -
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 18df171..dd9f3c2 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1480,7 +1480,6 @@ static void x86_pmu_start(struct perf_event *event, int 
flags)
 
cpuc->events[idx] = event;
__set_bit(idx, cpuc->active_mask);
-   __set_bit(idx, cpuc->running);
static_call(x86_pmu_enable)(event);
perf_event_update_userpage(event);
 }
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index a4cc660..9c10cbb 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -947,7 +947,7 @@ static void p4_pmu_enable_pebs(u64 config)
(void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT,  (u64)bind->metric_vert);
 }
 
-static void p4_pmu_enable_event(struct perf_event *event)
+static void __p4_pmu_enable_event(struct perf_event *event)
 {
struct hw_perf_event *hwc = &event->hw;
int thread = p4_ht_config_thread(hwc->config);
@@ -983,6 +983,16 @@ static void p4_pmu_enable_event(struct perf_event *event)
(cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
 }
 
+static DEFINE_PER_CPU(unsigned long [BITS_TO_LONGS(X86_PMC_IDX_MAX)], 
p4_running);
+
+static void p4_pmu_enable_event(struct perf_event *event)
+{
+   int idx = event->hw.idx;
+
+   __set_bit(idx, per_cpu(p4_running, smp_processor_id()));
+   __p4_pmu_enable_event(event);
+}
+
 static void p4_pmu_enable_all(int added)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -992,7 +1002,7 @@ static void p4_pmu_enable_all(int added)
struct perf_event *event = cpuc->events[idx];
if (!test_bit(idx, cpuc->active_mask))
continue;
-   p4_pmu_enable_event(event);
+   __p4_pmu_enable_event(event);
}
 }
 
@@ -1012,7 +1022,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 
if (!test_bit(idx, cpuc->active_mask)) {
/* catch in-flight IRQs */
-   if (__test_and_clear_bit(idx, cpuc->running))
+   if (__test_and_clear_bit(idx, per_cpu(p4_running, smp_processor_id())))
handled++;
continue;
}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 53b2b5f..54a340e 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -228,7 +228,6 @@ struct cpu_hw_events {
 */
struct perf_event   *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long   active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-   unsigned long   running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
int enabled;
 
int n_events; /* the # of events in the below 
arrays */


[tip: perf/core] perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task

2021-04-16 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 01fd9661e168de7cfc4f947e7220fca0e6791999
Gitweb:
https://git.kernel.org/tip/01fd9661e168de7cfc4f947e7220fca0e6791999
Author:Kan Liang 
AuthorDate:Wed, 14 Apr 2021 07:36:30 -07:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 16:32:43 +02:00

perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task

The counter value of a perf task may leak to another RDPMC task.
For example, a perf stat task as below is running on CPU 0.

perf stat -e 'branches,cycles' -- taskset -c 0 ./workload

In the meantime, an RDPMC task, which is also running on CPU 0, may read
the GP counters periodically. (The RDPMC task creates a fixed event,
but read four GP counters.)

$ taskset -c 0 ./rdpmc_read_all_counters
index 0x0 value 0x8001e5970f99
index 0x1 value 0x8005d750edb6
index 0x2 value 0x0
index 0x3 value 0x0

index 0x0 value 0x8002358e48a5
index 0x1 value 0x8006bd1e3bc9
index 0x2 value 0x0
index 0x3 value 0x0

This is a potential security issue. Once the attacker knows what the
other thread is counting, the PerfMon counter can be used as a
side-channel to attack cryptosystems.

The counter value of the perf stat task leaks to the RDPMC task because
perf never clears the counter when it's stopped.

Two methods were considered to address the issue.
- Unconditionally reset the counter in x86_pmu_del(). It can bring extra
  overhead even when there is no RDPMC task running.
- Only reset the un-assigned dirty counters when the RDPMC task is
  scheduled in. The method is implemented here.

A dirty counter is a counter whose assigned event has been deleted but
which has not been reset. To track the dirty counters, add a 'dirty'
variable in the struct cpu_hw_events.

The current code doesn't reset the counter when the assigned event is
deleted. Set the corresponding bit in the 'dirty' variable in
x86_pmu_del(), if the RDPMC feature is available on the system.

The security issue can only be found with an RDPMC task. The event for
an RDPMC task requires the mmap buffer. This can be used to detect an
RDPMC task. Once the event is detected in the event_mapped(), enable
sched_task(), which is invoked on each context switch. Add a check in
sched_task() to clear the dirty counters when the RDPMC task is
scheduled in. Only the currently un-assigned dirty counters are reset,
because the RDPMC-assigned dirty counters will be updated soon.

The RDPMC instruction is also supported on older platforms. Add
sched_task() for the core_pmu. Since the core_pmu doesn't support large
PEBS and the LBR callstack, intel_pmu_pebs/lbr_sched_task() will be
ignored.

RDPMC is not an Intel-only feature. Add the dirty counter clearing code
to the X86 generic code.
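
A minimal sketch of that clearing step, assuming the new 'dirty' bitmap
in struct cpu_hw_events (the exact helper layout may differ from the
patch):

	static void x86_pmu_clear_dirty_counters(void)
	{
		struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
		int i;

		/* The assigned counters will be reprogrammed; skip them. */
		for (i = 0; i < cpuc->n_events; i++)
			__clear_bit(cpuc->assign[i], cpuc->dirty);

		if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
			return;

		for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
			/* Fixed counters sit behind a different MSR base. */
			if (i >= INTEL_PMC_IDX_FIXED)
				wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 +
				       (i - INTEL_PMC_IDX_FIXED), 0);
			else
				wrmsrl(x86_pmu_event_addr(i), 0);
		}

		bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
	}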

After applying the patch,

$ taskset -c 0 ./rdpmc_read_all_counters
index 0x0 value 0x0
index 0x1 value 0x0
index 0x2 value 0x0
index 0x3 value 0x0

index 0x0 value 0x0
index 0x1 value 0x0
index 0x2 value 0x0
index 0x3 value 0x0

Performance

The performance of a context switch is only impacted when there are two
or more perf users and at least one of them is an RDPMC user. In other
cases, there is no performance impact.

The worst case occurs when there are two users: the RDPMC user only
uses one counter, while the other user uses all available counters.
When the RDPMC task is scheduled in, all the counters other than the
RDPMC-assigned one have to be reset.

Here is the test result for the worst-case.

The test is implemented on an Ice Lake platform, which has 8 GP
counters and 3 fixed counters (not including the SLOTS counter).

The lat_ctx is used to measure the context switching time.

lat_ctx -s 128K -N 1000 processes 2

I instrumented lat_ctx to open all 8 GP counters and 3 fixed
counters for one task. The other task opens a fixed counter and enables
RDPMC.

Without the patch:
The context switch time is 4.97 us

With the patch:
The context switch time is 5.16 us

There is a ~4% performance drop for the context switching time in the
worst case.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1618410990-21383-2-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   | 47 +++-
 arch/x86/events/perf_event.h |  1 +-
 2 files changed, 48 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index dd9f3c2..e34eb72 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1585,6 +1585,8 @@ static void x86_pmu_del(struct perf_event *event, int 
flags)
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
goto do_del;
 
+   __set_bit(event->hw.idx, cpuc->dirty);
+
/*
 * Not a TXN, therefore cleanup properly.
 */
@@ -2304,12 +2306,46 @@ s

[tip: perf/core] perf/x86/intel/uncore: Generic support for the MSR type of uncore blocks

2021-04-02 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: d6c754130435ab786711bed75d04a2388a6b4da8
Gitweb:
https://git.kernel.org/tip/d6c754130435ab786711bed75d04a2388a6b4da8
Author:Kan Liang 
AuthorDate:Wed, 17 Mar 2021 10:59:34 -07:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 02 Apr 2021 10:04:54 +02:00

perf/x86/intel/uncore: Generic support for the MSR type of uncore blocks

The discovery table provides the generic uncore block information for
the MSR type of uncore blocks, e.g., the counter width, the number of
counters, the location of control/counter registers, which is good
enough to provide basic uncore support. It can be used as a fallback
solution when the kernel doesn't support a platform.

The name of the uncore box cannot be retrieved from the discovery table.
uncore_type_&typeID_&boxID will be used as its name. Save the type ID
and the box ID information in the struct intel_uncore_type.
Factor out uncore_get_pmu_name() to handle different naming methods.

Implement generic support for the MSR type of uncore block.
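
A rough sketch of the generic MSR box operations (mirroring the MMIO
variants that appear later in this series; the GENERIC_PMON_BOX_CTL_*
values come from the discovery header):

	static void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box)
	{
		unsigned int msr = uncore_msr_box_ctl(box);

		if (msr)
			wrmsrl(msr, GENERIC_PMON_BOX_CTL_INT);
	}

	static void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box)
	{
		unsigned int msr = uncore_msr_box_ctl(box);

		if (msr)
			wrmsrl(msr, GENERIC_PMON_BOX_CTL_FRZ);
	}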

Some advanced features, such as filters and constraints, cannot be
retrieved from the discovery tables. Features that rely on that
information are not supported here.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1616003977-90612-3-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore.c   |  45 ++--
 arch/x86/events/intel/uncore.h   |   3 +-
 arch/x86/events/intel/uncore_discovery.c | 126 ++-
 arch/x86/events/intel/uncore_discovery.h |  18 +++-
 4 files changed, 182 insertions(+), 10 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index d111370..dabc01f 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -10,7 +10,7 @@ static bool uncore_no_discover;
 module_param(uncore_no_discover, bool, 0);
 MODULE_PARM_DESC(uncore_no_discover, "Don't enable the Intel uncore PerfMon 
discovery mechanism "
 "(default: enable the discovery 
mechanism).");
-static struct intel_uncore_type *empty_uncore[] = { NULL, };
+struct intel_uncore_type *empty_uncore[] = { NULL, };
 struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
 struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 struct intel_uncore_type **uncore_mmio_uncores = empty_uncore;
@@ -834,6 +834,34 @@ static const struct attribute_group uncore_pmu_attr_group 
= {
.attrs = uncore_pmu_attrs,
 };
 
+static void uncore_get_pmu_name(struct intel_uncore_pmu *pmu)
+{
+   struct intel_uncore_type *type = pmu->type;
+
+   /*
+* No uncore block name in discovery table.
+* Use uncore_type_&typeid_&boxid as name.
+*/
+   if (!type->name) {
+   if (type->num_boxes == 1)
+   sprintf(pmu->name, "uncore_type_%u", type->type_id);
+   else {
+   sprintf(pmu->name, "uncore_type_%u_%d",
+   type->type_id, type->box_ids[pmu->pmu_idx]);
+   }
+   return;
+   }
+
+   if (type->num_boxes == 1) {
+   if (strlen(type->name) > 0)
+   sprintf(pmu->name, "uncore_%s", type->name);
+   else
+   sprintf(pmu->name, "uncore");
+   } else
+   sprintf(pmu->name, "uncore_%s_%d", type->name, pmu->pmu_idx);
+
+}
+
 static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
 {
int ret;
@@ -860,15 +888,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu 
*pmu)
pmu->pmu.attr_update = pmu->type->attr_update;
}
 
-   if (pmu->type->num_boxes == 1) {
-   if (strlen(pmu->type->name) > 0)
-   sprintf(pmu->name, "uncore_%s", pmu->type->name);
-   else
-   sprintf(pmu->name, "uncore");
-   } else {
-   sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
-   pmu->pmu_idx);
-   }
+   uncore_get_pmu_name(pmu);
 
ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
if (!ret)
@@ -909,6 +929,10 @@ static void uncore_type_exit(struct intel_uncore_type 
*type)
kfree(type->pmus);
type->pmus = NULL;
}
+   if (type->box_ids) {
+   kfree(type->box_ids);
+   type->box_ids = NULL;
+   }
kfree(type->events_group);
type->events_group = NULL;
 }
@@ -1643,6 +1667,7 @@ static const struct intel_uncore_init_fun snr_uncore_init 
__initconst = {
 };
 
 static const struct intel_uncore_init_fun generic_uncore_init __initconst = {
+   .cpu_init = intel_uncore_generic_uncore_cpu_init,
 };
 
 static const struct x86_cpu_id intel_uncore_match[] __initconst = {
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events

[tip: perf/core] perf/x86/intel/uncore: Parse uncore discovery tables

2021-04-02 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: edae1f06c2cda41edffc93de6aedc8ba8dc883c3
Gitweb:
https://git.kernel.org/tip/edae1f06c2cda41edffc93de6aedc8ba8dc883c3
Author:Kan Liang 
AuthorDate:Wed, 17 Mar 2021 10:59:33 -07:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 02 Apr 2021 10:04:54 +02:00

perf/x86/intel/uncore: Parse uncore discovery tables

A self-describing mechanism for the uncore PerfMon hardware has been
introduced with the latest Intel platforms. By reading through an MMIO
page worth of information, perf can 'discover' all the standard uncore
PerfMon registers in a machine.

The discovery mechanism relies on BIOS support. With a proper BIOS,
a PCI device with the unique capability ID 0x23 can be found on each
die. Perf can retrieve the information of all available uncore PerfMons
from the device via MMIO. The information is composed of one global
discovery table and several unit discovery tables.
- The global discovery table includes global uncore information of the
  die, e.g., the address of the global control register, the offset of
  the global status register, the number of uncore units, the offset of
  unit discovery tables, etc.
- The unit discovery table includes generic uncore unit information,
  e.g., the access type, the counter width, the address of counters,
  the address of the counter control, the unit ID, the unit type, etc.
  The unit is also called "box" in the code.
Perf can provide basic uncore support based on this information
with the following patches.

To locate the PCI device with the discovery tables, check the generic
PCI ID first. If it doesn't match, go through the entire PCI device tree
and locate the device with the unique capability ID.
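
For illustration, the search amounts to walking the PCI tree and probing
for the extended capability ID 0x23; a rough sketch, not the patch's
exact probe logic:

	#define UNCORE_EXT_CAP_ID_DISCOVERY	0x23

	static void find_discovery_devices(void)
	{
		struct pci_dev *dev = NULL;
		int pos;

		while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, dev))) {
			pos = pci_find_next_ext_capability(dev, 0,
							   UNCORE_EXT_CAP_ID_DISCOVERY);
			if (pos)
				pr_info("uncore discovery capability on %s\n",
					pci_name(dev));
		}
	}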

The uncore information is similar among dies. To save parsing time and
space, only completely parse and store the discovery tables on the first
die and the first box of each die. The parsed information is stored in
an
RB tree structure, intel_uncore_discovery_type. The size of the stored
discovery tables varies among platforms. It's around 4KB for a Sapphire
Rapids server.

If a BIOS doesn't support the 'discovery' mechanism, the uncore driver
will exit with -ENODEV. There is nothing changed.

Add a module parameter to disable the discovery feature. If a BIOS gets
the discovery tables wrong, users can have an option to disable the
feature. For the current patchset, the uncore driver will exit with
-ENODEV. In the future, it may fall back to the hardcoded uncore driver
on a known platform.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1616003977-90612-2-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/Makefile   |   2 +-
 arch/x86/events/intel/uncore.c   |  31 +-
 arch/x86/events/intel/uncore_discovery.c | 318 ++-
 arch/x86/events/intel/uncore_discovery.h | 105 +++-
 4 files changed, 448 insertions(+), 8 deletions(-)
 create mode 100644 arch/x86/events/intel/uncore_discovery.c
 create mode 100644 arch/x86/events/intel/uncore_discovery.h

diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile
index e67a588..10bde6c 100644
--- a/arch/x86/events/intel/Makefile
+++ b/arch/x86/events/intel/Makefile
@@ -3,6 +3,6 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o
 obj-$(CONFIG_CPU_SUP_INTEL)+= ds.o knc.o
 obj-$(CONFIG_CPU_SUP_INTEL)+= lbr.o p4.o p6.o pt.o
 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o
-intel-uncore-objs  := uncore.o uncore_nhmex.o uncore_snb.o 
uncore_snbep.o
+intel-uncore-objs  := uncore.o uncore_nhmex.o uncore_snb.o 
uncore_snbep.o uncore_discovery.o
 obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o
 intel-cstate-objs  := cstate.o
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 33c8180..d111370 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -4,7 +4,12 @@
 #include 
 #include 
 #include "uncore.h"
+#include "uncore_discovery.h"
 
+static bool uncore_no_discover;
+module_param(uncore_no_discover, bool, 0);
+MODULE_PARM_DESC(uncore_no_discover, "Don't enable the Intel uncore PerfMon 
discovery mechanism "
+"(default: enable the discovery 
mechanism).");
 static struct intel_uncore_type *empty_uncore[] = { NULL, };
 struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
 struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
@@ -1637,6 +1642,9 @@ static const struct intel_uncore_init_fun snr_uncore_init 
__initconst = {
.mmio_init = snr_uncore_mmio_init,
 };
 
+static const struct intel_uncore_init_fun generic_uncore_init __initconst = {
+};
+
 static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP,  &nhm_uncore_init),

[tip: perf/core] perf/x86/intel/uncore: Generic support for the PCI type of uncore blocks

2021-04-02 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 42839ef4a20a4bda415974ff0e7d85ff540fffa4
Gitweb:
https://git.kernel.org/tip/42839ef4a20a4bda415974ff0e7d85ff540fffa4
Author:Kan Liang 
AuthorDate:Wed, 17 Mar 2021 10:59:36 -07:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 02 Apr 2021 10:04:55 +02:00

perf/x86/intel/uncore: Generic support for the PCI type of uncore blocks

The discovery table provides the generic uncore block information
for the PCI type of uncore blocks, which is good enough to provide
basic uncore support.

The PCI BUS and DEVFN information can be retrieved from the box control
field. Introduce the uncore_pci_pmus_register() to register all the
PCICFG type of uncore blocks. The old PCI probe/remove way is dropped.

The PCI BUS and DEVFN information are different among dies. Add box_ctls
to store the box control field of each die.

Add a new BUS notifier for the PCI type of uncore block to support
hotplug. If a device is hot removed, the corresponding registered PMU
has to be unregistered. Perf cannot locate the PMU by searching a const
pci_device_id table, because the discovery tables don't provide such
information. Introduce uncore_pci_find_dev_pmu_from_types() to search
the whole uncore_pci_uncores for the PMU.

Implement generic support for the PCI type of uncore block.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1616003977-90612-5-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore.c   | 91 +--
 arch/x86/events/intel/uncore.h   |  6 +-
 arch/x86/events/intel/uncore_discovery.c | 80 -
 arch/x86/events/intel/uncore_discovery.h |  7 ++-
 4 files changed, 177 insertions(+), 7 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 391fa7c..3109082 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1032,10 +1032,37 @@ static int uncore_pci_get_dev_die_info(struct pci_dev 
*pdev, int *die)
return 0;
 }
 
+static struct intel_uncore_pmu *
+uncore_pci_find_dev_pmu_from_types(struct pci_dev *pdev)
+{
+   struct intel_uncore_type **types = uncore_pci_uncores;
+   struct intel_uncore_type *type;
+   u64 box_ctl;
+   int i, die;
+
+   for (; *types; types++) {
+   type = *types;
+   for (die = 0; die < __uncore_max_dies; die++) {
+   for (i = 0; i < type->num_boxes; i++) {
+   if (!type->box_ctls[die])
+   continue;
+   box_ctl = type->box_ctls[die] + 
type->pci_offsets[i];
+   if (pdev->devfn == 
UNCORE_DISCOVERY_PCI_DEVFN(box_ctl) &&
+   pdev->bus->number == 
UNCORE_DISCOVERY_PCI_BUS(box_ctl) &&
+   pci_domain_nr(pdev->bus) == 
UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl))
+   return &type->pmus[i];
+   }
+   }
+   }
+
+   return NULL;
+}
+
 /*
  * Find the PMU of a PCI device.
  * @pdev: The PCI device.
  * @ids: The ID table of the available PCI devices with a PMU.
+ *   If NULL, search the whole uncore_pci_uncores.
  */
 static struct intel_uncore_pmu *
 uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
@@ -1045,6 +1072,9 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const 
struct pci_device_id *ids)
kernel_ulong_t data;
unsigned int devfn;
 
+   if (!ids)
+   return uncore_pci_find_dev_pmu_from_types(pdev);
+
while (ids && ids->vendor) {
if ((ids->vendor == pdev->vendor) &&
(ids->device == pdev->device)) {
@@ -1283,6 +1313,48 @@ static void uncore_pci_sub_driver_init(void)
uncore_pci_sub_driver = NULL;
 }
 
+static int uncore_pci_bus_notify(struct notifier_block *nb,
+unsigned long action, void *data)
+{
+   return uncore_bus_notify(nb, action, data, NULL);
+}
+
+static struct notifier_block uncore_pci_notifier = {
+   .notifier_call = uncore_pci_bus_notify,
+};
+
+
+static void uncore_pci_pmus_register(void)
+{
+   struct intel_uncore_type **types = uncore_pci_uncores;
+   struct intel_uncore_type *type;
+   struct intel_uncore_pmu *pmu;
+   struct pci_dev *pdev;
+   u64 box_ctl;
+   int i, die;
+
+   for (; *types; types++) {
+   type = *types;
+   for (die = 0; die < __uncore_max_dies; die++) {
+   for (i = 0; i < type->num_boxes; i++) {
+   if (!type->box_ctls[die])
+   continue;
+   box_ctl = type->box_ctls[die] + 
type->pci_offsets[i];
+ 

[tip: perf/core] perf/x86/intel/uncore: Rename uncore_notifier to uncore_pci_sub_notifier

2021-04-02 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 6477dc3934775f82a571fac469fd8c348e611095
Gitweb:
https://git.kernel.org/tip/6477dc3934775f82a571fac469fd8c348e611095
Author:Kan Liang 
AuthorDate:Wed, 17 Mar 2021 10:59:35 -07:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 02 Apr 2021 10:04:54 +02:00

perf/x86/intel/uncore: Rename uncore_notifier to uncore_pci_sub_notifier

Perf will use a similar method to the PCI sub driver to register
the PMUs for the PCI type of uncore blocks. The method requires a BUS
notifier to support hotplug. The current BUS notifier cannot be reused,
because it searches a const id_table for the corresponding registered
PMU. The PCI type of uncore blocks in the discovery tables doesn't
provide an id_table.

Factor out uncore_bus_notify() and add a pointer to an id_table as a
parameter. The uncore_bus_notify() will be reused in the following
patch.

The current BUS notifier is only used by the PCI sub driver. Its name is
too generic. Rename it to uncore_pci_sub_notifier, which is specific for
the PCI sub driver.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1616003977-90612-4-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index dabc01f..391fa7c 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1203,7 +1203,8 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 }
 
 static int uncore_bus_notify(struct notifier_block *nb,
-unsigned long action, void *data)
+unsigned long action, void *data,
+const struct pci_device_id *ids)
 {
struct device *dev = data;
struct pci_dev *pdev = to_pci_dev(dev);
@@ -1214,7 +1215,7 @@ static int uncore_bus_notify(struct notifier_block *nb,
if (action != BUS_NOTIFY_DEL_DEVICE)
return NOTIFY_DONE;
 
-   pmu = uncore_pci_find_dev_pmu(pdev, uncore_pci_sub_driver->id_table);
+   pmu = uncore_pci_find_dev_pmu(pdev, ids);
if (!pmu)
return NOTIFY_DONE;
 
@@ -1226,8 +1227,15 @@ static int uncore_bus_notify(struct notifier_block *nb,
return NOTIFY_OK;
 }
 
-static struct notifier_block uncore_notifier = {
-   .notifier_call = uncore_bus_notify,
+static int uncore_pci_sub_bus_notify(struct notifier_block *nb,
+unsigned long action, void *data)
+{
+   return uncore_bus_notify(nb, action, data,
+uncore_pci_sub_driver->id_table);
+}
+
+static struct notifier_block uncore_pci_sub_notifier = {
+   .notifier_call = uncore_pci_sub_bus_notify,
 };
 
 static void uncore_pci_sub_driver_init(void)
@@ -1268,7 +1276,7 @@ static void uncore_pci_sub_driver_init(void)
ids++;
}
 
-   if (notify && bus_register_notifier(&pci_bus_type, &uncore_notifier))
+   if (notify && bus_register_notifier(&pci_bus_type, 
&uncore_pci_sub_notifier))
notify = false;
 
if (!notify)
@@ -1319,7 +1327,7 @@ static void uncore_pci_exit(void)
if (pcidrv_registered) {
pcidrv_registered = false;
if (uncore_pci_sub_driver)
-   bus_unregister_notifier(&pci_bus_type, 
&uncore_notifier);
+   bus_unregister_notifier(&pci_bus_type, 
&uncore_pci_sub_notifier);
pci_unregister_driver(uncore_pci_driver);
uncore_types_exit(uncore_pci_uncores);
kfree(uncore_extra_pci_dev);


[tip: perf/core] perf/x86/intel/uncore: Generic support for the MMIO type of uncore blocks

2021-04-02 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: c4c55e362a521d763356b9e02bc9a4348c71a471
Gitweb:
https://git.kernel.org/tip/c4c55e362a521d763356b9e02bc9a4348c71a471
Author:Kan Liang 
AuthorDate:Wed, 17 Mar 2021 10:59:37 -07:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 02 Apr 2021 10:04:55 +02:00

perf/x86/intel/uncore: Generic support for the MMIO type of uncore blocks

The discovery table provides the generic uncore block information
for the MMIO type of uncore blocks, which is good enough to provide
basic uncore support.

The box control field is composed of the BAR address and box control
offset. When initializing the uncore blocks, perf should ioremap the
address from the box control field.

Implement the generic support for the MMIO type of uncore block.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1616003977-90612-6-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore.c   |  1 +-
 arch/x86/events/intel/uncore.h   |  1 +-
 arch/x86/events/intel/uncore_discovery.c | 98 +++-
 arch/x86/events/intel/uncore_discovery.h |  1 +-
 4 files changed, 101 insertions(+)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 3109082..35b3470 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1755,6 +1755,7 @@ static const struct intel_uncore_init_fun snr_uncore_init 
__initconst = {
 static const struct intel_uncore_init_fun generic_uncore_init __initconst = {
.cpu_init = intel_uncore_generic_uncore_cpu_init,
.pci_init = intel_uncore_generic_uncore_pci_init,
+   .mmio_init = intel_uncore_generic_uncore_mmio_init,
 };
 
 static const struct x86_cpu_id intel_uncore_match[] __initconst = {
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 76fc898..549cfb2 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -70,6 +70,7 @@ struct intel_uncore_type {
union {
unsigned *msr_offsets;
unsigned *pci_offsets;
+   unsigned *mmio_offsets;
};
unsigned *box_ids;
struct event_constraint unconstrainted;
diff --git a/arch/x86/events/intel/uncore_discovery.c 
b/arch/x86/events/intel/uncore_discovery.c
index 784d7b4..aba9bff 100644
--- a/arch/x86/events/intel/uncore_discovery.c
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -442,6 +442,90 @@ static struct intel_uncore_ops generic_uncore_pci_ops = {
.read_counter   = intel_generic_uncore_pci_read_counter,
 };
 
+#define UNCORE_GENERIC_MMIO_SIZE   0x4000
+
+static unsigned int generic_uncore_mmio_box_ctl(struct intel_uncore_box *box)
+{
+   struct intel_uncore_type *type = box->pmu->type;
+
+   if (!type->box_ctls || !type->box_ctls[box->dieid] || 
!type->mmio_offsets)
+   return 0;
+
+   return type->box_ctls[box->dieid] + 
type->mmio_offsets[box->pmu->pmu_idx];
+}
+
+static void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box)
+{
+   unsigned int box_ctl = generic_uncore_mmio_box_ctl(box);
+   struct intel_uncore_type *type = box->pmu->type;
+   resource_size_t addr;
+
+   if (!box_ctl) {
+   pr_warn("Uncore type %d box %d: Invalid box control address.\n",
+   type->type_id, type->box_ids[box->pmu->pmu_idx]);
+   return;
+   }
+
+   addr = box_ctl;
+   box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE);
+   if (!box->io_addr) {
+   pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n",
+   type->type_id, type->box_ids[box->pmu->pmu_idx],
+   (unsigned long long)addr);
+   return;
+   }
+
+   writel(GENERIC_PMON_BOX_CTL_INT, box->io_addr);
+}
+
+static void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box)
+{
+   if (!box->io_addr)
+   return;
+
+   writel(GENERIC_PMON_BOX_CTL_FRZ, box->io_addr);
+}
+
+static void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box)
+{
+   if (!box->io_addr)
+   return;
+
+   writel(0, box->io_addr);
+}
+
+static void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box 
*box,
+struct perf_event *event)
+{
+   struct hw_perf_event *hwc = &event->hw;
+
+   if (!box->io_addr)
+   return;
+
+   writel(hwc->config, box->io_addr + hwc->config_base);
+}
+
+static void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box 
*box,
+ struct perf_event *event)
+{
+   struct hw_perf_event *hwc = &event->hw;
+
+   if (!box->io_addr)
+   return;
+
+   writel(0, box->io_addr + hwc->config_base);
+}
+
+static struct intel_uncore_op

[tip: perf/urgent] perf/x86/intel: Fix a crash caused by zero PEBS status

2021-03-17 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/urgent branch of tip:

Commit-ID: d88d05a9e0b6d9356e97129d4ff9942d765f46ea
Gitweb:
https://git.kernel.org/tip/d88d05a9e0b6d9356e97129d4ff9942d765f46ea
Author:Kan Liang 
AuthorDate:Fri, 12 Mar 2021 05:21:37 -08:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 16 Mar 2021 21:44:39 +01:00

perf/x86/intel: Fix a crash caused by zero PEBS status

A repeatable crash can be triggered by the perf_fuzzer on some Haswell
system.
https://lore.kernel.org/lkml/7170d3b-c17f-1ded-52aa-cc6d9ae99...@maine.edu/

For some old CPUs (HSW and earlier), the PEBS status in a PEBS record
may be mistakenly set to 0. To minimize the impact of the defect, the
commit referenced in the Fixes tag was introduced to avoid dropping the
PEBS record in some cases. It adds a check in intel_pmu_drain_pebs_nhm()
and updates the local pebs_status accordingly. However, it doesn't
correct the PEBS status in the PEBS record itself, which may trigger the
crash, especially for large PEBS.

It's possible that all the PEBS records in a large PEBS have the PEBS
status 0. If so, the first get_next_pebs_record_by_bit() in
__intel_pmu_pebs_event() returns NULL, so 'at' is NULL. Since it's a
large PEBS, the 'count' parameter must be > 1. The second
get_next_pebs_record_by_bit() will then crash.

Besides the local pebs_status, correct the PEBS status in the PEBS
record as well.

Fixes: 01330d7288e0 ("perf/x86: Allow zero PEBS status with only single active 
event")
Reported-by: Vince Weaver 
Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: sta...@vger.kernel.org
Link: 
https://lkml.kernel.org/r/161298-140216-1-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/ds.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 7ebae18..d32b302 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2010,7 +2010,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs 
*iregs, struct perf_sample_d
 */
if (!pebs_status && cpuc->pebs_enabled &&
!(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
-   pebs_status = cpuc->pebs_enabled;
+   pebs_status = p->status = cpuc->pebs_enabled;
 
bit = find_first_bit((unsigned long *)&pebs_status,
x86_pmu.max_pebs_events);


[tip: perf/urgent] perf/x86/intel: Fix unchecked MSR access error caused by VLBR_EVENT

2021-03-17 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/urgent branch of tip:

Commit-ID: 2dc0572f2cef87425147658698dce2600b799bd3
Gitweb:
https://git.kernel.org/tip/2dc0572f2cef87425147658698dce2600b799bd3
Author:Kan Liang 
AuthorDate:Fri, 12 Mar 2021 05:21:38 -08:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 16 Mar 2021 21:44:39 +01:00

perf/x86/intel: Fix unchecked MSR access error caused by VLBR_EVENT

On a Haswell machine, the perf_fuzzer managed to trigger this message:

[117248.075892] unchecked MSR access error: WRMSR to 0x3f1 (tried to
write 0x0400) at rIP: 0x8106e4f4
(native_write_msr+0x4/0x20)
[117248.089957] Call Trace:
[117248.092685]  intel_pmu_pebs_enable_all+0x31/0x40
[117248.097737]  intel_pmu_enable_all+0xa/0x10
[117248.102210]  __perf_event_task_sched_in+0x2df/0x2f0
[117248.107511]  finish_task_switch.isra.0+0x15f/0x280
[117248.112765]  schedule_tail+0xc/0x40
[117248.116562]  ret_from_fork+0x8/0x30

A fake event called VLBR_EVENT may use bit 58 of PEBS_ENABLE if
precise_ip is set. Bit 58 is reserved by the HW. Accessing the bit
causes the unchecked MSR access error.

The fake event doesn't support PEBS. The case should be rejected.

Fixes: 097e4311cda9 ("perf/x86: Add constraint to create guest LBR event 
without hw counter")
Reported-by: Vince Weaver 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: sta...@vger.kernel.org
Link: 
https://lkml.kernel.org/r/161298-140216-2-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 7bbb5bb..37ce384 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3659,6 +3659,9 @@ static int intel_pmu_hw_config(struct perf_event *event)
return ret;
 
if (event->attr.precise_ip) {
+   if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == 
INTEL_FIXED_VLBR_EVENT)
+   return -EINVAL;
+
if (!(event->attr.freq || (event->attr.wakeup_events && 
!event->attr.watermark))) {
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
if (!(event->attr.sample_type &


[tip: perf/urgent] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR

2021-03-06 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/urgent branch of tip:

Commit-ID: afbef30149587ad46f4780b1e0cc5e219745ce90
Gitweb:
https://git.kernel.org/tip/afbef30149587ad46f4780b1e0cc5e219745ce90
Author:Kan Liang 
AuthorDate:Mon, 30 Nov 2020 11:38:41 -08:00
Committer: Ingo Molnar 
CommitterDate: Sat, 06 Mar 2021 12:52:44 +01:00

perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR

Supplying a PID/TID for large PEBS requires flushing the PEBS buffer on
a context switch.

For normal LBRs, a context switch can flip the address space, and LBR
entries are not tagged with an identifier, so we need to wipe the LBR,
even for per-cpu events.

For LBR callstack, saving/restoring the stack is required during a
context switch.

Set PERF_ATTACH_SCHED_CB for the event with large PEBS & LBR.

Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context 
switches")
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Link: https://lkml.kernel.org/r/20201130193842.10569-2-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 5bac48d..7bbb5bb 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3662,8 +3662,10 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (!(event->attr.freq || (event->attr.wakeup_events && 
!event->attr.watermark))) {
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
if (!(event->attr.sample_type &
- ~intel_pmu_large_pebs_flags(event)))
+ ~intel_pmu_large_pebs_flags(event))) {
event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
+   event->attach_state |= PERF_ATTACH_SCHED_CB;
+   }
}
if (x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
@@ -3676,6 +3678,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
ret = intel_pmu_setup_lbr_filter(event);
if (ret)
return ret;
+   event->attach_state |= PERF_ATTACH_SCHED_CB;
 
/*
 * BTS is set up earlier in this path, so don't account twice


[tip: perf/urgent] perf/core: Flush PMU internal buffers for per-CPU events

2021-03-06 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/urgent branch of tip:

Commit-ID: a5398bffc01fe044848c5024e5e867e407f239b8
Gitweb:
https://git.kernel.org/tip/a5398bffc01fe044848c5024e5e867e407f239b8
Author:Kan Liang 
AuthorDate:Mon, 30 Nov 2020 11:38:40 -08:00
Committer: Ingo Molnar 
CommitterDate: Sat, 06 Mar 2021 12:52:39 +01:00

perf/core: Flush PMU internal buffers for per-CPU events

Sometimes the PMU internal buffers have to be flushed for per-CPU events
during a context switch, e.g., large PEBS. Otherwise, the perf tool may
report samples in locations that do not belong to the process in which
the samples are processed, because PEBS does not tag samples with PID/TID.

The current code only flushes the buffers for a per-task event. It
doesn't check per-CPU events.

Add a new event state flag, PERF_ATTACH_SCHED_CB, to indicate that the
PMU internal buffers have to be flushed for this event during a context
switch.

Add sched_cb_entry and perf_sched_cb_usages back to track the PMU/cpuctx
which needs to be flushed.

This patch only needs to invoke sched_task() for per-CPU events.
The per-task events have been handled in perf_event_context_sched_in/out
already.

Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context 
switches")
Reported-by: Gabriel Marin 
Originally-by: Namhyung Kim 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Link: https://lkml.kernel.org/r/20201130193842.10569-1-kan.li...@linux.intel.com
---
 include/linux/perf_event.h |  2 ++-
 kernel/events/core.c   | 42 +
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fab42cf..3f7f89e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -606,6 +606,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_TASK   0x04
 #define PERF_ATTACH_TASK_DATA  0x08
 #define PERF_ATTACH_ITRACE 0x10
+#define PERF_ATTACH_SCHED_CB   0x20
 
 struct perf_cgroup;
 struct perf_buffer;
@@ -872,6 +873,7 @@ struct perf_cpu_context {
struct list_headcgrp_cpuctx_entry;
 #endif
 
+   struct list_headsched_cb_entry;
int sched_cb_usage;
 
int online;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0aeca5f..03db40f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -3461,11 +3462,16 @@ unlock:
}
 }
 
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
 void perf_sched_cb_dec(struct pmu *pmu)
 {
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-   --cpuctx->sched_cb_usage;
+   this_cpu_dec(perf_sched_cb_usages);
+
+   if (!--cpuctx->sched_cb_usage)
+   list_del(&cpuctx->sched_cb_entry);
 }
 
 
@@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
 {
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-   cpuctx->sched_cb_usage++;
+   if (!cpuctx->sched_cb_usage++)
+   list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+   this_cpu_inc(perf_sched_cb_usages);
 }
 
 /*
@@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct 
perf_cpu_context *cpuctx, bool sched_in
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
+static void perf_pmu_sched_task(struct task_struct *prev,
+   struct task_struct *next,
+   bool sched_in)
+{
+   struct perf_cpu_context *cpuctx;
+
+   if (prev == next)
+   return;
+
+   list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), 
sched_cb_entry) {
+   /* will be handled in perf_event_context_sched_in/out */
+   if (cpuctx->task_ctx)
+   continue;
+
+   __perf_pmu_sched_task(cpuctx, sched_in);
+   }
+}
+
 static void perf_event_switch(struct task_struct *task,
  struct task_struct *next_prev, bool sched_in);
 
@@ -3524,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
int ctxn;
 
+   if (__this_cpu_read(perf_sched_cb_usages))
+   perf_pmu_sched_task(task, next, false);
+
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
 
@@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
+
+   if (__this_cpu_read(perf_sched_cb_u

[tip: perf/urgent] perf/core: Flush PMU internal buffers for per-CPU events

2021-03-01 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/urgent branch of tip:

Commit-ID: e748d3716e0e581401630d36d3ef0fc8fa8f830d
Gitweb:
https://git.kernel.org/tip/e748d3716e0e581401630d36d3ef0fc8fa8f830d
Author:Kan Liang 
AuthorDate:Mon, 30 Nov 2020 11:38:40 -08:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 01 Mar 2021 11:02:18 +01:00

perf/core: Flush PMU internal buffers for per-CPU events

Sometimes the PMU internal buffers have to be flushed for per-CPU events
during a context switch, e.g., large PEBS. Otherwise, the perf tool may
report samples in locations that do not belong to the process in which
the samples are processed, because PEBS does not tag samples with PID/TID.

The current code only flushes the buffers for a per-task event. It
doesn't check per-CPU events.

Add a new event state flag, PERF_ATTACH_SCHED_CB, to indicate that the
PMU internal buffers have to be flushed for this event during a context
switch.

Add sched_cb_entry and perf_sched_cb_usages back to track the PMU/cpuctx
which needs to be flushed.

This patch only needs to invoke sched_task() for per-CPU events.
The per-task events have been handled in perf_event_context_sched_in/out
already.

Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context 
switches")
Reported-by: Gabriel Marin 
Originally-by: Namhyung Kim 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20201130193842.10569-1-kan.li...@linux.intel.com
---
 include/linux/perf_event.h |  2 ++-
 kernel/events/core.c   | 42 +
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fab42cf..3f7f89e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -606,6 +606,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_TASK   0x04
 #define PERF_ATTACH_TASK_DATA  0x08
 #define PERF_ATTACH_ITRACE 0x10
+#define PERF_ATTACH_SCHED_CB   0x20
 
 struct perf_cgroup;
 struct perf_buffer;
@@ -872,6 +873,7 @@ struct perf_cpu_context {
struct list_headcgrp_cpuctx_entry;
 #endif
 
+   struct list_headsched_cb_entry;
int sched_cb_usage;
 
int online;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0aeca5f..03db40f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -3461,11 +3462,16 @@ unlock:
}
 }
 
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
 void perf_sched_cb_dec(struct pmu *pmu)
 {
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-   --cpuctx->sched_cb_usage;
+   this_cpu_dec(perf_sched_cb_usages);
+
+   if (!--cpuctx->sched_cb_usage)
+   list_del(&cpuctx->sched_cb_entry);
 }
 
 
@@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
 {
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-   cpuctx->sched_cb_usage++;
+   if (!cpuctx->sched_cb_usage++)
+   list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+   this_cpu_inc(perf_sched_cb_usages);
 }
 
 /*
@@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct 
perf_cpu_context *cpuctx, bool sched_in
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
+static void perf_pmu_sched_task(struct task_struct *prev,
+   struct task_struct *next,
+   bool sched_in)
+{
+   struct perf_cpu_context *cpuctx;
+
+   if (prev == next)
+   return;
+
+   list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), 
sched_cb_entry) {
+   /* will be handled in perf_event_context_sched_in/out */
+   if (cpuctx->task_ctx)
+   continue;
+
+   __perf_pmu_sched_task(cpuctx, sched_in);
+   }
+}
+
 static void perf_event_switch(struct task_struct *task,
  struct task_struct *next_prev, bool sched_in);
 
@@ -3524,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
int ctxn;
 
+   if (__this_cpu_read(perf_sched_cb_usages))
+   perf_pmu_sched_task(task, next, false);
+
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
 
@@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
+
+   if (__this_cpu_read(perf_sched_cb_usages))
+   p

[tip: perf/urgent] perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR

2021-03-01 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/urgent branch of tip:

Commit-ID: a8abc881981762631a22568d5e4b2c0ce4aeb15c
Gitweb:
https://git.kernel.org/tip/a8abc881981762631a22568d5e4b2c0ce4aeb15c
Author:Kan Liang 
AuthorDate:Mon, 30 Nov 2020 11:38:41 -08:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 01 Mar 2021 11:02:19 +01:00

perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR

Supplying a PID/TID for large PEBS requires flushing the PEBS buffer on
a context switch.

For normal LBRs, a context switch can flip the address space, and LBR
entries are not tagged with an identifier, so we need to wipe the LBR,
even for per-cpu events.

For LBR callstack, saving/restoring the stack is required during a
context switch.

Set PERF_ATTACH_SCHED_CB for the event with large PEBS & LBR.

Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context 
switches")
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20201130193842.10569-2-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 5bac48d..7bbb5bb 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3662,8 +3662,10 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (!(event->attr.freq || (event->attr.wakeup_events && 
!event->attr.watermark))) {
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
if (!(event->attr.sample_type &
- ~intel_pmu_large_pebs_flags(event)))
+ ~intel_pmu_large_pebs_flags(event))) {
event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
+   event->attach_state |= PERF_ATTACH_SCHED_CB;
+   }
}
if (x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
@@ -3676,6 +3678,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
ret = intel_pmu_setup_lbr_filter(event);
if (ret)
return ret;
+   event->attach_state |= PERF_ATTACH_SCHED_CB;
 
/*
 * BTS is set up earlier in this path, so don't account twice


[tip: perf/core] perf/x86/intel: Add perf core PMU support for Sapphire Rapids

2021-02-02 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 61b985e3e775a3a75fda04ce7ef1b1aefc4758bc
Gitweb:
https://git.kernel.org/tip/61b985e3e775a3a75fda04ce7ef1b1aefc4758bc
Author:Kan Liang 
AuthorDate:Thu, 28 Jan 2021 14:40:10 -08:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 01 Feb 2021 15:31:37 +01:00

perf/x86/intel: Add perf core PMU support for Sapphire Rapids

Add perf core PMU support for the Intel Sapphire Rapids server, which is
the successor of the Intel Ice Lake server. The enabling code is based
on Ice Lake, but there are several new features introduced.

The event encoding is changed and simplified, e.g., the event codes
which are below 0x90 are restricted to counters 0-3. The event codes
above 0x90 are likely to have no restrictions. The event
constraints, extra_regs(), and hardware cache events table are changed
accordingly.
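
As a purely illustrative sketch (not the verbatim SPR constraint table),
the 'below 0x90 on counters 0-3' rule can be expressed with the
INTEL_EVENT_CONSTRAINT_RANGE() macro already used for the Ice Lake
table, where the last argument is the allowed-counter mask (0xf for GP
counters 0-3, 0xff for all eight):

	/* hypothetical entries, not copied from the actual SPR table */
	INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf),	/* restricted codes   */
	INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0xff),	/* unrestricted codes */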

A new Precise Distribution (PDist) facility is introduced, which
further minimizes the skid when a precise event is programmed on GP
counter 0. Enable the Precise Distribution (PDist) facility with the
:ppp event. For this facility to work, the period must be initialized
with a value larger than 127. Add spr_limit_period() to apply the limit
for the :ppp event.
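
A sketch of what such a limit hook boils down to (paraphrased, assuming
the u64-based limit_period() callback signature of this kernel
generation; treat it as a sketch rather than the verbatim hunk):

	static u64 spr_limit_period(struct perf_event *event, u64 left)
	{
		/* PDist (:ppp, precise_ip == 3) requires a period > 127 */
		if (event->attr.precise_ip == 3)
			return max(left, 128ULL);

		return left;
	}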

Two new data source fields, data block & address block, are added in the
PEBS Memory Info Record for the load latency event. To enable the
feature,
- An auxiliary event has to be enabled together with the load latency
  event on Sapphire Rapids. A new flag PMU_FL_MEM_LOADS_AUX is
  introduced to indicate the case. A new event, mem-loads-aux, is
  exposed to sysfs for the user tool.
  Add a check in hw_config(). If the auxiliary event is not detected,
  return a unique error, -ENODATA.
- The union perf_mem_data_src is extended to support the new fields.
- Ice Lake and earlier models do not support block information, but the
  fields may be set by HW on some machines. Add pebs_no_block to
  explicitly indicate the previous platforms which don't support the new
  block fields. Accesses to the new block fields are ignored on those
  platforms.

A new Store Latency facility is introduced, which leverages the PEBS
facility where it can provide additional information about sampled
stores. The additional information includes the data address, memory
auxiliary info (e.g. Data Source, STLB miss) and the latency of the
store access. To enable the facility, the new event (0x02cd) has to be
programmed on GP counter 0. A new flag PERF_X86_EVENT_PEBS_STLAT is
introduced to indicate the event. The store_latency_data() is introduced
to parse the memory auxiliary info.

The layout of the access latency field of the PEBS Memory Info Record
has been changed. Two latencies, instruction latency (bits 15:0) and
cache access latency (bits 47:32), are recorded.
- The cache access latency is similar to previous memory access latency.
  For loads, the latency starts by the actual cache access until the
  data is returned by the memory subsystem.
  For stores, the latency starts when the demand write accesses the L1
  data cache and lasts until the cacheline write is completed in the
  memory subsystem.
  The cache access latency is stored in the low 32 bits of the sample type
  PERF_SAMPLE_WEIGHT_STRUCT.
- The instruction latency starts by the dispatch of the load operation
  for execution and lasts until completion of the instruction it belongs
  to.
  Add a new flag PMU_FL_INSTR_LATENCY to indicate the instruction
  latency support. The instruction latency is stored in bits 47:32
  of the sample type PERF_SAMPLE_WEIGHT_STRUCT.
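
As a decoding illustration (assuming the little-endian
perf_sample_weight layout introduced by the PERF_SAMPLE_WEIGHT_STRUCT
patch; the helper name is hypothetical, and the var1_dw/var2_w fields
come from that layout, not from this hunk):

	/* split a packed SPR weight into its two latency factors */
	static inline void spr_decode_weight(__u64 weight,
					     __u32 *cache_lat, __u16 *instr_lat)
	{
		union perf_sample_weight w = { .full = weight };

		*cache_lat = w.var1_dw;		/* bits 31:0  */
		*instr_lat = w.var2_w;		/* bits 47:32 */
	}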

Extends the PERF_METRICS MSR to feature TMA method level 2 metrics. The
lower half of the register is the TMA level 1 metrics (legacy). The
upper half is also divided into four 8-bit fields for the new level 2
metrics. Expose all eight Topdown metrics events to user space.

The full description for the SPR features can be found at Intel
Architecture Instruction Set Extensions and Future Features
Programming Reference, 319433-041.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1611873611-156687-5-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c  | 307 -
 arch/x86/events/intel/ds.c| 118 ++-
 arch/x86/events/perf_event.h  |  12 +-
 arch/x86/include/asm/perf_event.h |   8 +-
 include/uapi/linux/perf_event.h   |  12 +-
 5 files changed, 443 insertions(+), 14 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 37830ac..58cd64e 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -275,6 +275,55 @@ static struct extra_reg intel_icl_extra_regs[] 
__read_mostly = {
EVENT_EXTRA_END
 };
 
+static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
+   INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3full, 
RSP_0),
+   INTEL_UEVENT_EXTRA_REG(0x012b, M

[tip: perf/core] perf/x86/intel: Support CPUID 10.ECX to disable fixed counters

2021-02-02 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 32451614da2a9cf4296f90d3606ac77814fb519d
Gitweb:
https://git.kernel.org/tip/32451614da2a9cf4296f90d3606ac77814fb519d
Author:Kan Liang 
AuthorDate:Thu, 28 Jan 2021 14:40:11 -08:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 01 Feb 2021 15:31:37 +01:00

perf/x86/intel: Support CPUID 10.ECX to disable fixed counters

With Architectural Performance Monitoring Version 5, the CPUID 10.ECX
CPU leaf indicates the fixed counter enumeration. This extends the
previous count to a bitmap, which allows disabling even the lower fixed
counters. It could be used by a hypervisor.

The existing intel_ctrl variable is used to remember the bitmask of the
counters. All code that reads all counters is fixed to check this extra
bitmask.
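
The check itself is small; a sketch of the helper this adds (paraphrased
rather than quoted):

	static inline bool fixed_counter_disabled(int i)
	{
		/* fixed counter i is treated as disabled when its bit and
		 * all higher bits are clear in the intel_ctrl bitmask */
		return !(x86_pmu.intel_ctrl >> (INTEL_PMC_IDX_FIXED + i));
	}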

Suggested-by: Peter Zijlstra (Intel) 
Originally-by: Andi Kleen 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1611873611-156687-6-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   |  8 +++-
 arch/x86/events/intel/core.c | 34 --
 arch/x86/events/perf_event.h |  5 +
 3 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index cf0a52c..6ddeed3 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -255,6 +255,8 @@ static bool check_hw_exists(void)
if (ret)
goto msr_fail;
for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
+   if (fixed_counter_disabled(i))
+   continue;
if (val & (0x03 << i*4)) {
bios_fail = 1;
val_fail = val;
@@ -1531,6 +1533,8 @@ void perf_event_print_debug(void)
cpu, idx, prev_left);
}
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+   if (fixed_counter_disabled(idx))
+   continue;
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
 
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -2012,7 +2016,9 @@ static int __init init_hw_perf_events(void)
pr_info("... generic registers:  %d\n", x86_pmu.num_counters);
pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
-   pr_info("... fixed-purpose events:   %d\n", 
x86_pmu.num_counters_fixed);
+   pr_info("... fixed-purpose events:   %lu\n",
+   hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1)
+   << INTEL_PMC_IDX_FIXED) & 
x86_pmu.intel_ctrl));
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
 
if (!x86_pmu.read)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 58cd64e..67a7246 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2723,8 +2723,11 @@ static void intel_pmu_reset(void)
wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
wrmsrl_safe(x86_pmu_event_addr(idx),  0ull);
}
-   for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
+   for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+   if (fixed_counter_disabled(idx))
+   continue;
wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+   }
 
if (ds)
ds->bts_index = ds->bts_buffer_base;
@@ -5042,7 +5045,7 @@ __init int intel_pmu_init(void)
union cpuid10_eax eax;
union cpuid10_ebx ebx;
struct event_constraint *c;
-   unsigned int unused;
+   unsigned int fixed_mask;
struct extra_reg *er;
bool pmem = false;
int version, i;
@@ -5064,7 +5067,7 @@ __init int intel_pmu_init(void)
 * Check whether the Architectural PerfMon supports
 * Branch Misses Retired hw_event or not.
 */
-   cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+   cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full);
if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
return -ENODEV;
 
@@ -5088,12 +5091,15 @@ __init int intel_pmu_init(void)
 * Quirk: v2 perfmon does not report fixed-purpose events, so
 * assume at least 3 events, when not running in a hypervisor:
 */
-   if (version > 1) {
+   if (version > 1 && version < 5) {
int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR);
 
x86_pmu.num_counters_fixed =
max((int)edx.split.num_counters_fixed, assume);
-   }
+
+   fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1;
+   } else if (version >= 5)
+   x86_pmu.num_counters_fixed = fls

[tip: perf/core] perf/core: Add PERF_SAMPLE_WEIGHT_STRUCT

2021-02-02 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 2a6c6b7d7ad346f0679d0963cb19b3f0ea7ef32c
Gitweb:
https://git.kernel.org/tip/2a6c6b7d7ad346f0679d0963cb19b3f0ea7ef32c
Author:Kan Liang 
AuthorDate:Thu, 28 Jan 2021 14:40:07 -08:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 01 Feb 2021 15:31:36 +01:00

perf/core: Add PERF_SAMPLE_WEIGHT_STRUCT

The current PERF_SAMPLE_WEIGHT sample type is very useful to express the
cost of an action represented by the sample. This allows the profiler
to scale the samples to be more informative to the programmer. It could
also help to locate a hotspot, e.g., when profiling by memory latencies,
the expensive loads appear higher up in the histograms. But the current
PERF_SAMPLE_WEIGHT sample type is solely determined by one factor. This
could be a problem if users want two or more factors to contribute to
the weight. For example, the Golden Cove core PMU can provide both the
instruction latency and the cache latency information as factors for
memory profiling.

For current X86 platforms, although meminfo::latency is defined as a
u64, only the lower 32 bits include valid data in practice (no
memory access could last longer than 4G cycles). The higher 32 bits can be used
to store new factors.

Add a new sample type, PERF_SAMPLE_WEIGHT_STRUCT, to indicate the new
sample weight structure. It shares the same space as the
PERF_SAMPLE_WEIGHT sample type.

Users can apply either the PERF_SAMPLE_WEIGHT sample type or the
PERF_SAMPLE_WEIGHT_STRUCT sample type to retrieve the sample weight, but
they cannot apply both sample types simultaneously.

Currently, only X86 and PowerPC use the PERF_SAMPLE_WEIGHT sample type.
- For PowerPC, there is nothing changed for the PERF_SAMPLE_WEIGHT
  sample type. There is no effect for the new PERF_SAMPLE_WEIGHT_STRUCT
  sample type. PowerPC can re-struct the weight field similarly later.
- For X86, the same value will be dumped for the PERF_SAMPLE_WEIGHT
  sample type or the PERF_SAMPLE_WEIGHT_STRUCT sample type for now.
  The following patches will apply the new factors for the
  PERF_SAMPLE_WEIGHT_STRUCT sample type.

The field in the union perf_sample_weight should be shared among
different architectures. A generic name is required, but it's hard to
abstract a name that applies to all architectures. For example, on X86,
the fields are to store all kinds of latency. While on PowerPC, it
stores MMCRA[TECX/TECM], which should not be latency. So a general name
prefix 'var$NUM' is used here.
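
For reference, the little-endian view of the resulting union looks
roughly like the following; consider it a sketch of the idea rather than
the authoritative uapi definition:

	union perf_sample_weight {
		__u64	full;			/* legacy PERF_SAMPLE_WEIGHT view */
		struct {
			__u32	var1_dw;	/* e.g. x86: cache access latency */
			__u16	var2_w;		/* e.g. x86: instruction latency  */
			__u16	var3_w;		/* spare for future factors       */
		};
	};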

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1611873611-156687-2-git-send-email-kan.li...@linux.intel.com
---
 arch/powerpc/perf/core-book3s.c |  2 +-
 arch/x86/events/intel/ds.c  | 17 ++---
 include/linux/perf_event.h  |  4 +--
 include/uapi/linux/perf_event.h | 42 ++--
 kernel/events/core.c| 11 +---
 5 files changed, 59 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 28206b1..869d999 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2195,7 +2195,7 @@ static void record_and_restart(struct perf_event *event, 
unsigned long val,
 
if (event->attr.sample_type & PERF_SAMPLE_WEIGHT &&
ppmu->get_mem_weight)
-   ppmu->get_mem_weight(&data.weight);
+   ppmu->get_mem_weight(&data.weight.full);
 
if (perf_event_overflow(event, &data, regs))
power_pmu_stop(event, 0);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 67dbc91..2f54b1f 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -960,7 +960,8 @@ static void adaptive_pebs_record_size_update(void)
 }
 
 #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC |   \
-   PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \
+   PERF_SAMPLE_PHYS_ADDR |  \
+   PERF_SAMPLE_WEIGHT_TYPE |\
PERF_SAMPLE_TRANSACTION |\
PERF_SAMPLE_DATA_PAGE_SIZE)
 
@@ -987,7 +988,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event 
*event)
gprs = (sample_type & PERF_SAMPLE_REGS_INTR) &&
   (attr->sample_regs_intr & PEBS_GP_REGS);
 
-   tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) &&
+   tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) &&
 ((attr->config & INTEL_ARCH_EVENT_MASK) ==
  x86_pmu.rtm_abort_event);
 
@@ -1369,8 +1370,8 @@ static void setup_pebs_fixed_sample_data(struct 
perf_event *event,
/*
 * Use latency for weight (only avail with PEB

[tip: perf/core] perf/x86/intel: Factor out intel_update_topdown_event()

2021-02-02 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 628d923a3c464db98c1c98bb1e0cd50804caf681
Gitweb:
https://git.kernel.org/tip/628d923a3c464db98c1c98bb1e0cd50804caf681
Author:Kan Liang 
AuthorDate:Thu, 28 Jan 2021 14:40:08 -08:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 01 Feb 2021 15:31:36 +01:00

perf/x86/intel: Factor out intel_update_topdown_event()

Similar to Ice Lake, Intel Sapphire Rapids server also supports the
topdown performance metrics feature. The difference is that Intel
Sapphire Rapids server extends the PERF_METRICS MSR to feature TMA
method level two metrics, which will introduce 8 metrics events. Current
icl_update_topdown_event() only checks the 4 level one metrics events.

Factor out intel_update_topdown_event() to facilitate the code sharing
between Ice Lake and Sapphire Rapids.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1611873611-156687-3-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 20 +---
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index fe94008..d07408d 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2325,8 +2325,8 @@ static void __icl_update_topdown_event(struct perf_event 
*event,
}
 }
 
-static void update_saved_topdown_regs(struct perf_event *event,
- u64 slots, u64 metrics)
+static void update_saved_topdown_regs(struct perf_event *event, u64 slots,
+ u64 metrics, int metric_end)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *other;
@@ -2335,7 +2335,7 @@ static void update_saved_topdown_regs(struct perf_event 
*event,
event->hw.saved_slots = slots;
event->hw.saved_metric = metrics;
 
-   for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) 
{
+   for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
if (!is_topdown_idx(idx))
continue;
other = cpuc->events[idx];
@@ -2350,7 +2350,8 @@ static void update_saved_topdown_regs(struct perf_event 
*event,
  * The PERF_METRICS and Fixed counter 3 are read separately. The values may be
  * modify by a NMI. PMU has to be disabled before calling this function.
  */
-static u64 icl_update_topdown_event(struct perf_event *event)
+
+static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *other;
@@ -2366,7 +2367,7 @@ static u64 icl_update_topdown_event(struct perf_event 
*event)
/* read PERF_METRICS */
rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics);
 
-   for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) 
{
+   for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
if (!is_topdown_idx(idx))
continue;
other = cpuc->events[idx];
@@ -2392,7 +2393,7 @@ static u64 icl_update_topdown_event(struct perf_event 
*event)
 * Don't need to reset the PERF_METRICS and Fixed counter 3.
 * Because the values will be restored in next schedule in.
 */
-   update_saved_topdown_regs(event, slots, metrics);
+   update_saved_topdown_regs(event, slots, metrics, metric_end);
reset = false;
}
 
@@ -2401,12 +2402,17 @@ static u64 icl_update_topdown_event(struct perf_event 
*event)
wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0);
wrmsrl(MSR_PERF_METRICS, 0);
if (event)
-   update_saved_topdown_regs(event, 0, 0);
+   update_saved_topdown_regs(event, 0, 0, metric_end);
}
 
return slots;
 }
 
+static u64 icl_update_topdown_event(struct perf_event *event)
+{
+   return intel_update_topdown_event(event, INTEL_PMC_IDX_TD_BE_BOUND);
+}
+
 static void intel_pmu_read_topdown_event(struct perf_event *event)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);


[tip: perf/core] perf/x86/intel: Filter unsupported Topdown metrics event

2021-02-02 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 1ab5f235c176e93adc4f75000aae6c50fea9db00
Gitweb:
https://git.kernel.org/tip/1ab5f235c176e93adc4f75000aae6c50fea9db00
Author:Kan Liang 
AuthorDate:Thu, 28 Jan 2021 14:40:09 -08:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 01 Feb 2021 15:31:36 +01:00

perf/x86/intel: Filter unsupported Topdown metrics event

Intel Sapphire Rapids server will introduce 8 metrics events. Intel
Ice Lake only supports 4 metrics events. A perf tool user may mistakenly
use the unsupported events via RAW format on Ice Lake. The user can
still get a value from the unsupported Topdown metrics event once the
following Sapphire Rapids enabling patch is applied.

To enable the 8 metrics events on Intel Sapphire Rapids, the
INTEL_TD_METRIC_MAX has to be updated, which impacts the
is_metric_event(). The is_metric_event() is a generic function.
On Ice Lake, the newly added SPR metrics events will be mistakenly
accepted as metric events on creation. At runtime, the unsupported
Topdown metrics events will be updated.

Add a variable num_topdown_events in x86_pmu to indicate the available
number of the Topdown metrics event on the platform. Apply the number
into is_metric_event(). Only the supported Topdown metrics events
should be created as metrics events.

Apply the num_topdown_events in icl_update_topdown_event() as well. The
function can be reused by the following patch.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1611873611-156687-4-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c  | 15 +--
 arch/x86/events/perf_event.h  |  1 +
 arch/x86/include/asm/perf_event.h | 10 --
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index d07408d..37830ac 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2410,7 +2410,8 @@ static u64 intel_update_topdown_event(struct perf_event 
*event, int metric_end)
 
 static u64 icl_update_topdown_event(struct perf_event *event)
 {
-   return intel_update_topdown_event(event, INTEL_PMC_IDX_TD_BE_BOUND);
+   return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE +
+x86_pmu.num_topdown_events - 
1);
 }
 
 static void intel_pmu_read_topdown_event(struct perf_event *event)
@@ -3468,6 +3469,15 @@ static int core_pmu_hw_config(struct perf_event *event)
return intel_pmu_bts_config(event);
 }
 
+#define INTEL_TD_METRIC_AVAILABLE_MAX  (INTEL_TD_METRIC_RETIRING + \
+((x86_pmu.num_topdown_events - 1) << 
8))
+
+static bool is_available_metric_event(struct perf_event *event)
+{
+   return is_metric_event(event) &&
+   event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX;
+}
+
 static int intel_pmu_hw_config(struct perf_event *event)
 {
int ret = x86_pmu_hw_config(event);
@@ -3541,7 +3551,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (event->attr.config & X86_ALL_EVENT_FLAGS)
return -EINVAL;
 
-   if (is_metric_event(event)) {
+   if (is_available_metric_event(event)) {
struct perf_event *leader = event->group_leader;
 
/* The metric events don't support sampling. */
@@ -5324,6 +5334,7 @@ __init int intel_pmu_init(void)
x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
x86_pmu.lbr_pt_coexist = true;
intel_pmu_pebs_data_source_skl(pmem);
+   x86_pmu.num_topdown_events = 4;
x86_pmu.update_topdown_event = icl_update_topdown_event;
x86_pmu.set_topdown_event_period = icl_set_topdown_event_period;
pr_cont("Icelake events, ");
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 978a16e..15343cc 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -775,6 +775,7 @@ struct x86_pmu {
/*
 * Intel perf metrics
 */
+   int num_topdown_events;
u64 (*update_topdown_event)(struct perf_event *event);
int (*set_topdown_event_period)(struct perf_event *event);
 
diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index e2a4c78..7c2c302 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -280,8 +280,14 @@ struct x86_pmu_capability {
 #define INTEL_TD_METRIC_BAD_SPEC   0x8100  /* Bad speculation 
metric */
 #define INTEL_TD_METRIC_FE_BOUND   0x8200  /* FE bound metric */
 #define INTEL_TD_METRIC_BE_BOUND   0x8300  /* BE bound metric */
-#define INTEL_TD_METRIC_MAX   

[tip: perf/core] perf/x86/intel/lbr: Fix the return type of get_lbr_cycles()

2020-12-09 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: f8129cd958b395575e5543ce25a8434874b04d3a
Gitweb:
https://git.kernel.org/tip/f8129cd958b395575e5543ce25a8434874b04d3a
Author:Kan Liang 
AuthorDate:Wed, 25 Nov 2020 13:37:20 -08:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 09 Dec 2020 17:08:58 +01:00

perf/x86/intel/lbr: Fix the return type of get_lbr_cycles()

The cycle count of a timed LBR is always 1 in perf record -D.

The cycle count is stored in the first 16 bits of the IA32_LBR_x_INFO
register, but get_lbr_cycles() returns a Boolean type.

Use u16 to replace the Boolean type.

Fixes: 47125db27e47 ("perf/x86/intel/lbr: Support Architectural LBR")
Reported-by: Stephane Eranian 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/20201125213720.15692-2-kan.li...@linux.intel.com
---
 arch/x86/events/intel/lbr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 8961653..e2b0efc 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -919,7 +919,7 @@ static __always_inline bool get_lbr_predicted(u64 info)
return !(info & LBR_INFO_MISPRED);
 }
 
-static __always_inline bool get_lbr_cycles(u64 info)
+static __always_inline u16 get_lbr_cycles(u64 info)
 {
if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
!(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID))


[tip: perf/core] perf/x86/intel: Fix rtm_abort_event encoding on Ice Lake

2020-12-09 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 46b72e1bf4fc571da0c29c6fb3e5b2a2107a4c26
Gitweb:
https://git.kernel.org/tip/46b72e1bf4fc571da0c29c6fb3e5b2a2107a4c26
Author:Kan Liang 
AuthorDate:Wed, 25 Nov 2020 13:37:19 -08:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 09 Dec 2020 17:08:57 +01:00

perf/x86/intel: Fix rtm_abort_event encoding on Ice Lake

According to the event list from icelake_core_v1.09.json, the encoding
of the RTM_RETIRED.ABORTED event on Ice Lake should be,
"EventCode": "0xc9",
"UMask": "0x04",
"EventName": "RTM_RETIRED.ABORTED",

Correct the wrong encoding.

Fixes: 6017608936c1 ("perf/x86/intel: Add Icelake support")
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/20201125213720.15692-1-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 546cc89..6c0d18f 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -5466,7 +5466,7 @@ __init int intel_pmu_init(void)
mem_attr = icl_events_attrs;
td_attr = icl_td_events_attrs;
tsx_attr = icl_tsx_events_attrs;
-   x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02);
+   x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
x86_pmu.lbr_pt_coexist = true;
intel_pmu_pebs_data_source_skl(pmem);
x86_pmu.update_topdown_event = icl_update_topdown_event;


[tip: perf/core] perf/x86/intel: Add Tremont Topdown support

2020-12-09 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: c2208046bba6842dc232a600dc5cafc2fca41078
Gitweb:
https://git.kernel.org/tip/c2208046bba6842dc232a600dc5cafc2fca41078
Author:Kan Liang 
AuthorDate:Tue, 08 Dec 2020 12:05:52 -08:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 09 Dec 2020 17:08:59 +01:00

perf/x86/intel: Add Tremont Topdown support

Tremont has four L1 Topdown events, TOPDOWN_FE_BOUND.ALL,
TOPDOWN_BAD_SPECULATION.ALL, TOPDOWN_BE_BOUND.ALL and
TOPDOWN_RETIRING.ALL. They are available on GP counters.

Export them to sysfs and facilitate the perf stat tool.

 $perf stat --topdown -- sleep 1

 Performance counter stats for 'sleep 1':

                retiring      bad speculation       frontend bound        backend bound
                   24.9%                16.8%                31.7%                26.6%

   1.001224610 seconds time elapsed

   0.00115 seconds user
   0.0 seconds sys

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1607457952-3519-1-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 6c0d18f..d4569bf 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -1901,6 +1901,19 @@ static __initconst const u64 tnt_hw_cache_extra_regs
},
 };
 
+EVENT_ATTR_STR(topdown-fe-bound,   td_fe_bound_tnt,
"event=0x71,umask=0x0");
+EVENT_ATTR_STR(topdown-retiring,   td_retiring_tnt,
"event=0xc2,umask=0x0");
+EVENT_ATTR_STR(topdown-bad-spec,   td_bad_spec_tnt,
"event=0x73,umask=0x6");
+EVENT_ATTR_STR(topdown-be-bound,   td_be_bound_tnt,
"event=0x74,umask=0x0");
+
+static struct attribute *tnt_events_attrs[] = {
+   EVENT_PTR(td_fe_bound_tnt),
+   EVENT_PTR(td_retiring_tnt),
+   EVENT_PTR(td_bad_spec_tnt),
+   EVENT_PTR(td_be_bound_tnt),
+   NULL,
+};
+
 static struct extra_reg intel_tnt_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 
0x800ff0ff9fffull, RSP_0),
@@ -5174,6 +5187,7 @@ __init int intel_pmu_init(void)
x86_pmu.lbr_pt_coexist = true;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.get_event_constraints = tnt_get_event_constraints;
+   td_attr = tnt_events_attrs;
extra_attr = slm_format_attr;
pr_cont("Tremont events, ");
name = "Tremont";


[tip: perf/core] perf/x86/intel: Add Rocket Lake CPU support

2020-10-29 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: b14d0db5b8c86507c9810c1c8162c7d4a3c656bd
Gitweb:
https://git.kernel.org/tip/b14d0db5b8c86507c9810c1c8162c7d4a3c656bd
Author:Kan Liang 
AuthorDate:Mon, 19 Oct 2020 08:35:25 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 29 Oct 2020 11:00:39 +01:00

perf/x86/intel: Add Rocket Lake CPU support

From the perspective of Intel PMU, Rocket Lake is the same as Ice Lake
and Tiger Lake. Share the perf code with them.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20201019153528.13850-1-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 7186098..4d70c7d 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -5436,6 +5436,7 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_ICELAKE:
case INTEL_FAM6_TIGERLAKE_L:
case INTEL_FAM6_TIGERLAKE:
+   case INTEL_FAM6_ROCKETLAKE:
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, 
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, 
sizeof(hw_cache_extra_regs));


[tip: perf/core] perf/x86/intel/uncore: Add Rocket Lake support

2020-10-29 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 43bc103a8044b9f7963aa1684efbdc9bd60939de
Gitweb:
https://git.kernel.org/tip/43bc103a8044b9f7963aa1684efbdc9bd60939de
Author:Kan Liang 
AuthorDate:Mon, 19 Oct 2020 08:35:28 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 29 Oct 2020 11:00:40 +01:00

perf/x86/intel/uncore: Add Rocket Lake support

For Rocket Lake, the MSR uncore units, e.g., CBOX, ARB and CLOCKBOX, are
the same as on Tiger Lake. Share the perf code with it.

For Rocket Lake and Tiger Lake, the 8th CBOX is not mapped into a
different MSR space anymore. Add rkl_uncore_msr_init_box() to replace
skl_uncore_msr_init_box().

The IMC uncore is similar to Ice Lake's. Add new PCIIDs of the IMC for
Rocket Lake.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20201019153528.13850-4-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore.c |  6 ++
 arch/x86/events/intel/uncore_snb.c | 20 +++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 86d012b..1db6a71 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1636,6 +1636,11 @@ static const struct intel_uncore_init_fun 
tgl_l_uncore_init __initconst = {
.mmio_init = tgl_l_uncore_mmio_init,
 };
 
+static const struct intel_uncore_init_fun rkl_uncore_init __initconst = {
+   .cpu_init = tgl_uncore_cpu_init,
+   .pci_init = skl_uncore_pci_init,
+};
+
 static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
@@ -1683,6 +1688,7 @@ static const struct x86_cpu_id intel_uncore_match[] 
__initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,   &icx_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,   &tgl_uncore_init),
+   X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE,  &rkl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,  &snr_uncore_init),
{},
 };
diff --git a/arch/x86/events/intel/uncore_snb.c 
b/arch/x86/events/intel/uncore_snb.c
index de3d962..6bbf54b 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -60,7 +60,8 @@
 #define PCI_DEVICE_ID_INTEL_TGL_U3_IMC 0x9a12
 #define PCI_DEVICE_ID_INTEL_TGL_U4_IMC 0x9a14
 #define PCI_DEVICE_ID_INTEL_TGL_H_IMC  0x9a36
-
+#define PCI_DEVICE_ID_INTEL_RKL_1_IMC  0x4c43
+#define PCI_DEVICE_ID_INTEL_RKL_2_IMC  0x4c53
 
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK0x00ff
@@ -405,6 +406,12 @@ static struct intel_uncore_type *tgl_msr_uncores[] = {
NULL,
 };
 
+static void rkl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+   if (box->pmu->pmu_idx == 0)
+   wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
 void tgl_uncore_cpu_init(void)
 {
uncore_msr_uncores = tgl_msr_uncores;
@@ -412,6 +419,7 @@ void tgl_uncore_cpu_init(void)
icl_uncore_cbox.ops = &skl_uncore_msr_ops;
icl_uncore_clockbox.ops = &skl_uncore_msr_ops;
snb_uncore_arb.ops = &skl_uncore_msr_ops;
+   skl_uncore_msr_ops.init_box = rkl_uncore_msr_init_box;
 }
 
 enum {
@@ -880,6 +888,14 @@ static const struct pci_device_id icl_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+   { /* IMC */
+   PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_1_IMC),
+   .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+   },
+   { /* IMC */
+   PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_2_IMC),
+   .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+   },
{ /* end: all zeroes */ },
 };
 
@@ -973,6 +989,8 @@ static const struct imc_uncore_pci_dev 
desktop_imc_pci_ids[] = {
IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver),
IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile 
*/
IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver),/* 10th Gen Core Mobile 
*/
+   IMC_DEV(RKL_1_IMC, &icl_uncore_pci_driver),
+   IMC_DEV(RKL_2_IMC, &icl_uncore_pci_driver),
{  /* end marker */ }
 };
 


[tip: perf/core] perf/x86/intel: Add event constraint for CYCLE_ACTIVITY.STALLS_MEM_ANY

2020-10-29 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 306e3e91edf1c6739a55312edd110d298ff498dd
Gitweb:
https://git.kernel.org/tip/306e3e91edf1c6739a55312edd110d298ff498dd
Author:Kan Liang 
AuthorDate:Mon, 19 Oct 2020 09:45:29 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 29 Oct 2020 11:00:41 +01:00

perf/x86/intel: Add event constraint for CYCLE_ACTIVITY.STALLS_MEM_ANY

The event CYCLE_ACTIVITY.STALLS_MEM_ANY (0x14a3) should be available on
all 8 GP counters on ICL, but it's only scheduled on the first four
counters due to the current ICL constraint table.

Add a line for the CYCLE_ACTIVITY.STALLS_MEM_ANY event in the ICL
constraint table.
Correct the comments for the CYCLE_ACTIVITY.CYCLES_MEM_ANY event.

Fixes: 6017608936c1 ("perf/x86/intel: Add Icelake support")
Reported-by: Andi Kleen 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/20201019164529.32154-1-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 4d70c7d..0e590c5 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -257,7 +257,8 @@ static struct event_constraint 
intel_icl_event_constraints[] = {
INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf),
INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf),
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff),  /* CYCLE_ACTIVITY.STALLS_TOTAL 
*/
-   INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff),  /* 
CYCLE_ACTIVITY.STALLS_MEM_ANY */
+   INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff),  /* 
CYCLE_ACTIVITY.CYCLES_MEM_ANY */
+   INTEL_UEVENT_CONSTRAINT(0x14a3, 0xff),  /* 
CYCLE_ACTIVITY.STALLS_MEM_ANY */
INTEL_EVENT_CONSTRAINT(0xa3, 0xf),  /* CYCLE_ACTIVITY.* */
INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf),
INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf),


[tip: perf/core] perf/x86/intel: Support PERF_SAMPLE_DATA_PAGE_SIZE

2020-10-29 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 76a5433f95f32d8a17c9f836be2084ed947c466b
Gitweb:
https://git.kernel.org/tip/76a5433f95f32d8a17c9f836be2084ed947c466b
Author:Kan Liang 
AuthorDate:Thu, 01 Oct 2020 06:57:47 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 29 Oct 2020 11:00:38 +01:00

perf/x86/intel: Support PERF_SAMPLE_DATA_PAGE_SIZE

The new sample type, PERF_SAMPLE_DATA_PAGE_SIZE, requires the virtual
address. Update the data->addr if the sample type is set.

The large PEBS is disabled with the sample type, because perf doesn't
support munmap tracking yet. The PEBS buffer for large PEBS cannot be
flushed for each munmap. A wrong page size may be calculated. The large
PEBS can be enabled later separately when munmap tracking is supported.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20201001135749.2804-3-kan.li...@linux.intel.com
---
 arch/x86/events/intel/ds.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 404315d..444e5f0 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -959,7 +959,8 @@ static void adaptive_pebs_record_size_update(void)
 
 #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC |   \
PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \
-   PERF_SAMPLE_TRANSACTION)
+   PERF_SAMPLE_TRANSACTION |\
+   PERF_SAMPLE_DATA_PAGE_SIZE)
 
 static u64 pebs_update_adaptive_cfg(struct perf_event *event)
 {
@@ -1335,6 +1336,10 @@ static u64 get_data_src(struct perf_event *event, u64 
aux)
return val;
 }
 
+#define PERF_SAMPLE_ADDR_TYPE  (PERF_SAMPLE_ADDR | \
+PERF_SAMPLE_PHYS_ADDR |\
+PERF_SAMPLE_DATA_PAGE_SIZE)
+
 static void setup_pebs_fixed_sample_data(struct perf_event *event,
   struct pt_regs *iregs, void *__pebs,
   struct perf_sample_data *data,
@@ -1449,7 +1454,7 @@ static void setup_pebs_fixed_sample_data(struct 
perf_event *event,
}
 
 
-   if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
+   if ((sample_type & PERF_SAMPLE_ADDR_TYPE) &&
x86_pmu.intel_cap.pebs_format >= 1)
data->addr = pebs->dla;
 
@@ -1577,7 +1582,7 @@ static void setup_pebs_adaptive_sample_data(struct 
perf_event *event,
if (sample_type & PERF_SAMPLE_DATA_SRC)
data->data_src.val = get_data_src(event, meminfo->aux);
 
-   if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR))
+   if (sample_type & PERF_SAMPLE_ADDR_TYPE)
data->addr = meminfo->address;
 
if (sample_type & PERF_SAMPLE_TRANSACTION)


[tip: perf/core] perf/x86/cstate: Add Rocket Lake CPU support

2020-10-29 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: cbea56395cba13173fffb9251cb23f146b51c792
Gitweb:
https://git.kernel.org/tip/cbea56395cba13173fffb9251cb23f146b51c792
Author:Kan Liang 
AuthorDate:Mon, 19 Oct 2020 08:35:26 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 29 Oct 2020 11:00:40 +01:00

perf/x86/cstate: Add Rocket Lake CPU support

From the perspective of Intel cstate residency counters, Rocket Lake is
the same as Ice Lake and Tiger Lake. Share the code with them. Update
the comments for Rocket Lake.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20201019153528.13850-2-kan.li...@linux.intel.com
---
 arch/x86/events/intel/cstate.c | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 442e1ed..a161a0b 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -51,46 +51,46 @@
  *perf code: 0x02
  *Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
  * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT
+ * TNT,RKL
  *Scope: Core
  * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
  *perf code: 0x03
  *Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
- * ICL,TGL
+ * ICL,TGL,RKL
  *Scope: Core
  * MSR_PKG_C2_RESIDENCY:  Package C2 Residency Counter.
  *perf code: 0x00
  *Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
- * KBL,CML,ICL,TGL,TNT
+ * KBL,CML,ICL,TGL,TNT,RKL
  *Scope: Package (physical package)
  * MSR_PKG_C3_RESIDENCY:  Package C3 Residency Counter.
  *perf code: 0x01
  *Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
- * GLM,CNL,KBL,CML,ICL,TGL,TNT
+ * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL
  *Scope: Package (physical package)
  * MSR_PKG_C6_RESIDENCY:  Package C6 Residency Counter.
  *perf code: 0x02
  *Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
  * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT
+ * TNT,RKL
  *Scope: Package (physical package)
  * MSR_PKG_C7_RESIDENCY:  Package C7 Residency Counter.
  *perf code: 0x03
  *Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL,
- * KBL,CML,ICL,TGL
+ * KBL,CML,ICL,TGL,RKL
  *Scope: Package (physical package)
  * MSR_PKG_C8_RESIDENCY:  Package C8 Residency Counter.
  *perf code: 0x04
- *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL
+ *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
  *Scope: Package (physical package)
  * MSR_PKG_C9_RESIDENCY:  Package C9 Residency Counter.
  *perf code: 0x05
- *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL
+ *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
  *Scope: Package (physical package)
  * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
  *perf code: 0x06
  *Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
- * TNT
+ * TNT,RKL
  *Scope: Package (physical package)
  *
  */
@@ -649,6 +649,7 @@ static const struct x86_cpu_id intel_cstates_match[] 
__initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,   &icl_cstates),
+   X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE,  &icl_cstates),
{ },
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);


[tip: perf/core] perf/x86/msr: Add Rocket Lake CPU support

2020-10-29 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 907a196fbc70a48338ee8512da32f70fd33c97eb
Gitweb:
https://git.kernel.org/tip/907a196fbc70a48338ee8512da32f70fd33c97eb
Author:Kan Liang 
AuthorDate:Mon, 19 Oct 2020 08:35:27 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 29 Oct 2020 11:00:40 +01:00

perf/x86/msr: Add Rocket Lake CPU support

Like Ice Lake and Tiger Lake, PPERF and SMI_COUNT MSRs are also
supported by Rocket Lake.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20201019153528.13850-3-kan.li...@linux.intel.com
---
 arch/x86/events/msr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 4be8f9c..680404c 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -99,6 +99,7 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_ICELAKE_D:
case INTEL_FAM6_TIGERLAKE_L:
case INTEL_FAM6_TIGERLAKE:
+   case INTEL_FAM6_ROCKETLAKE:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;


[tip: perf/core] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

2020-10-29 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 8d97e71811aaafe4abf611dc24822fd6e73df1a1
Gitweb:
https://git.kernel.org/tip/8d97e71811aaafe4abf611dc24822fd6e73df1a1
Author:Kan Liang 
AuthorDate:Thu, 01 Oct 2020 06:57:46 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 29 Oct 2020 11:00:38 +01:00

perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

Current perf can report both virtual addresses and physical addresses,
but not the MMU page size. Without the MMU page size of the accessed
page, users cannot decide whether to promote or demote large pages to
optimize memory usage.

Add a new sample type for the data MMU page size.

Current perf already has a facility to collect data virtual addresses.
A page walker is required to walk the page tables and calculate the
MMU page size from a given virtual address.

On some platforms, e.g., X86, the page walker is invoked in an NMI
handler. So the page walker must be NMI-safe and low overhead. Besides,
the page walker should work for both user and kernel virtual addresses.
The existing generic page walker, e.g., walk_page_range_novma(), is a
little bit complex and is not guaranteed to be NMI-safe. The
follow_page() helper only handles user virtual addresses.

Add a new function perf_get_page_size() to walk the page tables and
calculate the MMU page size. In the function:
- Interrupts have to be disabled to prevent any teardown of the page
  tables.
- For user space threads, current->mm is used for the page walk.
  For kernel threads and the like, current->mm is NULL, so init_mm
  is used for the page walk instead. The active_mm is not used here,
  because it can be NULL.
  Quote from Peter Zijlstra,
  "context_switch() can set prev->active_mm to NULL when it transfers it
   to @next. It does this before @current is updated. So an NMI that
   comes in between this active_mm swizzling and updating @current will
   see !active_mm."
- The MMU page size is calculated from the page table level.

The method should work for all architectures, but it has only been
verified on X86. Should there be an architecture which supports perf
and where the method doesn't work, it can be fixed later separately.
Reporting the wrong page size is not fatal for the architecture.

Some features still under discussion may impact the method in the future.
Quote from Dave Hansen,
  "There are lots of weird things folks are trying to do with the page
   tables, like Address Space Isolation.  For instance, if you get a
   perf NMI when running userspace, current->mm->pgd is *different* than
   the PGD that was in use when userspace was running. It's close enough
   today, but it might not stay that way."
If that case happens later, lots of consecutive page walk errors will
occur. The worst case is that lots of page-size '0' samples are
returned, which is not fatal.
The perf tool implements a check to detect this case. Once it triggers,
a kernel patch can be implemented accordingly.
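
As a rough user-space illustration of how the new sample type is
requested (a minimal sketch, assuming a kernel and uapi headers that
already carry this patch; the event choice and sample period are
arbitrary):

  /* Minimal sketch: open a sampling event that also asks for the data
   * MMU page size. Assumes <linux/perf_event.h> defines
   * PERF_SAMPLE_DATA_PAGE_SIZE, i.e. headers with this patch applied. */
  #include <linux/perf_event.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <string.h>
  #include <stdio.h>

  int main(void)
  {
      struct perf_event_attr attr;
      int fd;

      memset(&attr, 0, sizeof(attr));
      attr.size = sizeof(attr);
      attr.type = PERF_TYPE_HARDWARE;
      attr.config = PERF_COUNT_HW_CPU_CYCLES;  /* arbitrary sampling event */
      attr.sample_period = 100000;
      attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
                         PERF_SAMPLE_DATA_PAGE_SIZE;
      attr.disabled = 1;
      attr.exclude_kernel = 1;

      /* measure the calling thread on any CPU */
      fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
      if (fd < 0) {
          perror("perf_event_open");  /* e.g. EINVAL on older kernels */
          return 1;
      }
      /* Each PERF_RECORD_SAMPLE in the mmap'ed ring buffer now ends with
       * a u64 data_page_size field (0 if the page size could not be
       * resolved), per the perf_event.h record layout. */
      close(fd);
      return 0;
  }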

Suggested-by: Peter Zijlstra 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20201001135749.2804-2-kan.li...@linux.intel.com
---
 include/linux/perf_event.h  |   1 +-
 include/uapi/linux/perf_event.h |   4 +-
 kernel/events/core.c| 103 +++-
 3 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0c19d27..7e3785d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1034,6 +1034,7 @@ struct perf_sample_data {
 
u64 phys_addr;
u64 cgroup;
+   u64 data_page_size;
 } cacheline_aligned;
 
 /* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 077e7ee..cc6ea34 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -143,8 +143,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_PHYS_ADDR   = 1U << 19,
PERF_SAMPLE_AUX = 1U << 20,
PERF_SAMPLE_CGROUP  = 1U << 21,
+   PERF_SAMPLE_DATA_PAGE_SIZE  = 1U << 22,
 
-   PERF_SAMPLE_MAX = 1U << 22, /* non-ABI */
+   PERF_SAMPLE_MAX = 1U << 23, /* non-ABI */
 
__PERF_SAMPLE_CALLCHAIN_EARLY   = 1ULL << 63, /* non-ABI; 
internal use */
 };
@@ -896,6 +897,7 @@ enum perf_event_type {
 *  { u64   phys_addr;} && PERF_SAMPLE_PHYS_ADDR
 *  { u64   size;
 *char  data[size]; } && PERF_SAMPLE_AUX
+*  { u64   data_page_size;} && 
PERF_SAMPLE_DATA_PAGE_SIZE
 * };
 */
PERF_RECORD_SAMPLE  = 9,
diff --git a/kernel/events/core.c b/kernel/events/co

[tip: perf/core] powerpc/perf: Support PERF_SAMPLE_DATA_PAGE_SIZE

2020-10-29 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 4cb6a42e4c4bc1902644eced67563e7405d4588e
Gitweb:
https://git.kernel.org/tip/4cb6a42e4c4bc1902644eced67563e7405d4588e
Author:Kan Liang 
AuthorDate:Thu, 01 Oct 2020 06:57:48 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 29 Oct 2020 11:00:39 +01:00

powerpc/perf: Support PERF_SAMPLE_DATA_PAGE_SIZE

The new sample type, PERF_SAMPLE_DATA_PAGE_SIZE, requires the virtual
address. Update the data->addr if the sample type is set.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20201001135749.2804-4-kan.li...@linux.intel.com
---
 arch/powerpc/perf/core-book3s.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 78fe349..ce22bd2 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2065,6 +2065,9 @@ static struct pmu power_pmu = {
.sched_task = power_pmu_sched_task,
 };
 
+#define PERF_SAMPLE_ADDR_TYPE  (PERF_SAMPLE_ADDR | \
+   PERF_SAMPLE_PHYS_ADDR | \
+   PERF_SAMPLE_DATA_PAGE_SIZE)
 /*
  * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
@@ -2120,8 +2123,7 @@ static void record_and_restart(struct perf_event *event, 
unsigned long val,
 
perf_sample_data_init(&data, ~0ULL, event->hw.last_period);
 
-   if (event->attr.sample_type &
-   (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR))
+   if (event->attr.sample_type & PERF_SAMPLE_ADDR_TYPE)
perf_get_data_addr(event, regs, &data.addr);
 
if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) {


[tip: perf/core] perf/x86/intel: Check perf metrics feature for each CPU

2020-10-05 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 80a5ce116fc084e8a25d5a936617699e2931b611
Gitweb:
https://git.kernel.org/tip/80a5ce116fc084e8a25d5a936617699e2931b611
Author:Kan Liang 
AuthorDate:Thu, 01 Oct 2020 14:17:11 -07:00
Committer: Peter Zijlstra 
CommitterDate: Sat, 03 Oct 2020 16:30:56 +02:00

perf/x86/intel: Check perf metrics feature for each CPU

It might be possible that, on a given platform, different CPUs differ
in their support for the CPU metrics. In this case, writing the
GLOBAL_CTRL_EN_PERF_METRICS bit to the GLOBAL_CTRL register of a CPU
which doesn't support the TopDown perf metrics feature causes an MSR
access error.

Currently, the TopDown perf metrics feature is enumerated from the boot
CPU's PERF_CAPABILITIES MSR, which only indicates that the boot CPU
supports the feature.

Check the PERF_CAPABILITIES MSR for each CPU. If any CPU doesn't support
the perf metrics feature, disable the feature globally.
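
A rough way to see why boot-CPU-only enumeration is insufficient is to
dump IA32_PERF_CAPABILITIES (MSR 0x345) on every CPU and compare (a
user-space sketch, not kernel code; it needs root and the msr module,
and the "bit 15 == perf metrics" reading of union perf_capabilities is
my assumption -- verify against the kernel headers):

  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
      char path[64];
      uint64_t cap;
      int cpu, fd;

      for (cpu = 0; ; cpu++) {
          snprintf(path, sizeof(path), "/dev/cpu/%d/msr", cpu);
          fd = open(path, O_RDONLY);
          if (fd < 0)
              break;  /* no more CPUs, or msr module not loaded */
          if (pread(fd, &cap, sizeof(cap), 0x345) == sizeof(cap))
              printf("cpu%d: PERF_CAPABILITIES=%#llx perf_metrics=%d\n",
                     cpu, (unsigned long long)cap,
                     (int)((cap >> 15) & 1));
          close(fd);
      }
      return 0;
  }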

Fixes: 59a854e2f3b9 ("perf/x86/intel: Support TopDown metrics on Ice Lake")
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20201001211711.25708-1-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index bdf28d2..7186098 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4083,6 +4083,17 @@ static void intel_pmu_cpu_starting(int cpu)
if (x86_pmu.counter_freezing)
enable_counter_freeze();
 
+   /* Disable perf metrics if any added CPU doesn't support it. */
+   if (x86_pmu.intel_cap.perf_metrics) {
+   union perf_capabilities perf_cap;
+
+   rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities);
+   if (!perf_cap.perf_metrics) {
+   x86_pmu.intel_cap.perf_metrics = 0;
+   x86_pmu.intel_ctrl &= ~(1ULL << 
GLOBAL_CTRL_EN_PERF_METRICS);
+   }
+   }
+
if (!cpuc->shared_regs)
return;
 


[tip: perf/core] perf/x86/intel/uncore: Update Ice Lake uncore units

2020-09-30 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 8f5d41f3a0f495435c88ebba8fc150c931c10fef
Gitweb:
https://git.kernel.org/tip/8f5d41f3a0f495435c88ebba8fc150c931c10fef
Author:Kan Liang 
AuthorDate:Fri, 25 Sep 2020 06:49:04 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 29 Sep 2020 09:57:01 +02:00

perf/x86/intel/uncore: Update Ice Lake uncore units

There are some updates for the Ice Lake model-specific uncore
performance monitors. (The updates can be found in the 10th Generation
Intel Core Processor Families Specification Update, Revision 004,
ICL068.)

1) Counter 0 of the ARB uncore unit is not available for software use.
2) The global 'enable bit' (bit 29) and 'freeze bit' (bit 31) of
   MSR_UNC_PERF_GLOBAL_CTRL cannot be used to control counter behavior.
   The local enable bit in the event select MSR has to be used instead.

Accesses to the modified bits/registers are ignored by the hardware, so
users may observe inaccurate results with the current code.

The changes to MSR_UNC_PERF_GLOBAL_CTRL imply that groups cannot be
read atomically anymore. Although the error of the result for a group
becomes a bit bigger, it is still far lower than not using a group, so
group support is kept. Only the *_box() related implementation is
removed.

Since counter 0 of the ARB uncore unit is not available, update the MSR
address for the ARB uncore unit.

There is no change for the IMC uncore unit, which only includes
free-running counters.

Fixes: 6e394376ee89 ("perf/x86/intel/uncore: Add Intel Icelake uncore support")
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200925134905.8839-2-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore_snb.c | 29 +
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/arch/x86/events/intel/uncore_snb.c 
b/arch/x86/events/intel/uncore_snb.c
index d2d43b6..2bdfcf8 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -126,6 +126,10 @@
 #define ICL_UNC_CBO_0_PER_CTR0 0x702
 #define ICL_UNC_CBO_MSR_OFFSET 0x8
 
+/* ICL ARB register */
+#define ICL_UNC_ARB_PER_CTR0x3b1
+#define ICL_UNC_ARB_PERFEVTSEL 0x3b3
+
 DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
 DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
 DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
@@ -313,6 +317,12 @@ void skl_uncore_cpu_init(void)
snb_uncore_arb.ops = &skl_uncore_msr_ops;
 }
 
+static struct intel_uncore_ops icl_uncore_msr_ops = {
+   .disable_event  = snb_uncore_msr_disable_event,
+   .enable_event   = snb_uncore_msr_enable_event,
+   .read_counter   = uncore_msr_read_counter,
+};
+
 static struct intel_uncore_type icl_uncore_cbox = {
.name   = "cbox",
.num_counters   = 4,
@@ -321,7 +331,7 @@ static struct intel_uncore_type icl_uncore_cbox = {
.event_ctl  = SNB_UNC_CBO_0_PERFEVTSEL0,
.event_mask = SNB_UNC_RAW_EVENT_MASK,
.msr_offset = ICL_UNC_CBO_MSR_OFFSET,
-   .ops= &skl_uncore_msr_ops,
+   .ops= &icl_uncore_msr_ops,
.format_group   = &snb_uncore_format_group,
 };
 
@@ -350,13 +360,25 @@ static struct intel_uncore_type icl_uncore_clockbox = {
.single_fixed   = 1,
.event_mask = SNB_UNC_CTL_EV_SEL_MASK,
.format_group   = &icl_uncore_clock_format_group,
-   .ops= &skl_uncore_msr_ops,
+   .ops= &icl_uncore_msr_ops,
.event_descs= icl_uncore_events,
 };
 
+static struct intel_uncore_type icl_uncore_arb = {
+   .name   = "arb",
+   .num_counters   = 1,
+   .num_boxes  = 1,
+   .perf_ctr_bits  = 44,
+   .perf_ctr   = ICL_UNC_ARB_PER_CTR,
+   .event_ctl  = ICL_UNC_ARB_PERFEVTSEL,
+   .event_mask = SNB_UNC_RAW_EVENT_MASK,
+   .ops= &icl_uncore_msr_ops,
+   .format_group   = &snb_uncore_format_group,
+};
+
 static struct intel_uncore_type *icl_msr_uncores[] = {
&icl_uncore_cbox,
-   &snb_uncore_arb,
+   &icl_uncore_arb,
&icl_uncore_clockbox,
NULL,
 };
@@ -374,7 +396,6 @@ void icl_uncore_cpu_init(void)
 {
uncore_msr_uncores = icl_msr_uncores;
icl_uncore_cbox.num_boxes = icl_get_cbox_num();
-   snb_uncore_arb.ops = &skl_uncore_msr_ops;
 }
 
 static struct intel_uncore_type *tgl_msr_uncores[] = {


[tip: perf/core] perf/x86/intel/uncore: Reduce the number of CBOX counters

2020-09-30 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: ee139385432e919f4d1f59b80edbc073cdae1391
Gitweb:
https://git.kernel.org/tip/ee139385432e919f4d1f59b80edbc073cdae1391
Author:Kan Liang 
AuthorDate:Fri, 25 Sep 2020 06:49:05 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 29 Sep 2020 09:57:01 +02:00

perf/x86/intel/uncore: Reduce the number of CBOX counters

An oops is triggered by the fuzz test.

[  327.853081] unchecked MSR access error: RDMSR from 0x70c at rIP:
0xc082c820 (uncore_msr_read_counter+0x10/0x50 [intel_uncore])
[  327.853083] Call Trace:
[  327.853085]  
[  327.853089]  uncore_pmu_event_start+0x85/0x170 [intel_uncore]
[  327.853093]  uncore_pmu_event_add+0x1a4/0x410 [intel_uncore]
[  327.853097]  ? event_sched_in.isra.118+0xca/0x240

There are 2 GP counters for each CBOX, but the current code claims 4
counters. Accessing the invalid registers triggers the oops.

Fixes: 6e394376ee89 ("perf/x86/intel/uncore: Add Intel Icelake uncore support")
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200925134905.8839-3-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore_snb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/uncore_snb.c 
b/arch/x86/events/intel/uncore_snb.c
index 2bdfcf8..de3d962 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -325,7 +325,7 @@ static struct intel_uncore_ops icl_uncore_msr_ops = {
 
 static struct intel_uncore_type icl_uncore_cbox = {
.name   = "cbox",
-   .num_counters   = 4,
+   .num_counters   = 2,
.perf_ctr_bits  = 44,
.perf_ctr   = ICL_UNC_CBO_0_PER_CTR0,
.event_ctl  = SNB_UNC_CBO_0_PERFEVTSEL0,


[tip: perf/core] perf/x86/intel/uncore: Split the Ice Lake and Tiger Lake MSR uncore support

2020-09-30 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 8abbcfefb5f7afabab4578bedd7cd400800cb039
Gitweb:
https://git.kernel.org/tip/8abbcfefb5f7afabab4578bedd7cd400800cb039
Author:Kan Liang 
AuthorDate:Fri, 25 Sep 2020 06:49:03 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 29 Sep 2020 09:57:00 +02:00

perf/x86/intel/uncore: Split the Ice Lake and Tiger Lake MSR uncore support

Previously, the MSR uncore support for Ice Lake and Tiger Lake was
identical and the code path was shared. However, with a recent update,
the global MSR_UNC_PERF_GLOBAL_CTRL register and the ARB uncore unit
have changed for Ice Lake. Split the Ice Lake and Tiger Lake MSR uncore
support.

The changes only impact the MSR ops() and the ARB uncore unit. Other
code can still be shared between Ice Lake and Tiger Lake.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200925134905.8839-1-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore.c |  4 ++--
 arch/x86/events/intel/uncore.h |  1 +
 arch/x86/events/intel/uncore_snb.c | 16 
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index ce0a5ba..86d012b 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1627,12 +1627,12 @@ static const struct intel_uncore_init_fun 
icl_uncore_init __initconst = {
 };
 
 static const struct intel_uncore_init_fun tgl_uncore_init __initconst = {
-   .cpu_init = icl_uncore_cpu_init,
+   .cpu_init = tgl_uncore_cpu_init,
.mmio_init = tgl_uncore_mmio_init,
 };
 
 static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = {
-   .cpu_init = icl_uncore_cpu_init,
+   .cpu_init = tgl_uncore_cpu_init,
.mmio_init = tgl_l_uncore_mmio_init,
 };
 
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index df544bc..83d2a7d 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -568,6 +568,7 @@ void snb_uncore_cpu_init(void);
 void nhm_uncore_cpu_init(void);
 void skl_uncore_cpu_init(void);
 void icl_uncore_cpu_init(void);
+void tgl_uncore_cpu_init(void);
 void tgl_uncore_mmio_init(void);
 void tgl_l_uncore_mmio_init(void);
 int snb_pci2phy_map_init(int devid);
diff --git a/arch/x86/events/intel/uncore_snb.c 
b/arch/x86/events/intel/uncore_snb.c
index cb94ba8..d2d43b6 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -377,6 +377,22 @@ void icl_uncore_cpu_init(void)
snb_uncore_arb.ops = &skl_uncore_msr_ops;
 }
 
+static struct intel_uncore_type *tgl_msr_uncores[] = {
+   &icl_uncore_cbox,
+   &snb_uncore_arb,
+   &icl_uncore_clockbox,
+   NULL,
+};
+
+void tgl_uncore_cpu_init(void)
+{
+   uncore_msr_uncores = tgl_msr_uncores;
+   icl_uncore_cbox.num_boxes = icl_get_cbox_num();
+   icl_uncore_cbox.ops = &skl_uncore_msr_ops;
+   icl_uncore_clockbox.ops = &skl_uncore_msr_ops;
+   snb_uncore_arb.ops = &skl_uncore_msr_ops;
+}
+
 enum {
SNB_PCI_UNCORE_IMC,
 };


[tip: perf/core] perf/x86/intel: Fix Ice Lake event constraint table

2020-09-30 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 010cb00265f150bf82b23c02ad1fb87ce5c781e1
Gitweb:
https://git.kernel.org/tip/010cb00265f150bf82b23c02ad1fb87ce5c781e1
Author:Kan Liang 
AuthorDate:Mon, 28 Sep 2020 06:47:26 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 29 Sep 2020 09:57:02 +02:00

perf/x86/intel: Fix Ice Lake event constraint table

An error occurs when sampling the non-PEBS INST_RETIRED.PREC_DIST
(0x01c0) event.

  perf record -e cpu/event=0xc0,umask=0x01/ -- sleep 1
  Error:
  The sys_perf_event_open() syscall returned with 22 (Invalid argument)
  for event (cpu/event=0xc0,umask=0x01/).
  /bin/dmesg | grep -i perf may provide additional information.

The idxmsk64 of the event is set to 0, so the event can never be
successfully scheduled.

The event should be limited to fixed counter 0.

Fixes: 6017608936c1 ("perf/x86/intel: Add Icelake support")
Reported-by: Yi, Ammy 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/20200928134726.13090-1-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 75dea67..bdf28d2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -243,7 +243,7 @@ static struct extra_reg intel_skl_extra_regs[] 
__read_mostly = {
 
 static struct event_constraint intel_icl_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0),  /* INST_RETIRED.ANY */
-   INTEL_UEVENT_CONSTRAINT(0x1c0, 0),  /* INST_RETIRED.PREC_DIST */
+   FIXED_EVENT_CONSTRAINT(0x01c0, 0),  /* INST_RETIRED.PREC_DIST */
FIXED_EVENT_CONSTRAINT(0x003c, 1),  /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2),  /* CPU_CLK_UNHALTED.REF */
FIXED_EVENT_CONSTRAINT(0x0400, 3),  /* SLOTS */


[tip: perf/core] perf/x86/msr: Add Jasper Lake support

2020-09-30 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: c3bb8a9fa31b99f5b7d2e45cd0a10db91349f4c9
Gitweb:
https://git.kernel.org/tip/c3bb8a9fa31b99f5b7d2e45cd0a10db91349f4c9
Author:Kan Liang 
AuthorDate:Mon, 28 Sep 2020 05:30:42 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 29 Sep 2020 09:57:02 +02:00

perf/x86/msr: Add Jasper Lake support

The Jasper Lake processor is also based on the Tremont
microarchitecture. From the perspective of perf MSR support, nothing
has changed compared with Elkhart Lake.
Share the code path with Elkhart Lake.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1601296242-32763-2-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/msr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index a949f6f..4be8f9c 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -78,6 +78,7 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
case INTEL_FAM6_ATOM_TREMONT_D:
case INTEL_FAM6_ATOM_TREMONT:
+   case INTEL_FAM6_ATOM_TREMONT_L:
 
case INTEL_FAM6_XEON_PHI_KNL:
case INTEL_FAM6_XEON_PHI_KNM:


[tip: perf/core] perf/x86/intel: Add Jasper Lake support

2020-09-30 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: dbfd638889a0396f5fe14ff3cc2263ec1e1cac62
Gitweb:
https://git.kernel.org/tip/dbfd638889a0396f5fe14ff3cc2263ec1e1cac62
Author:Kan Liang 
AuthorDate:Mon, 28 Sep 2020 05:30:41 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 29 Sep 2020 09:57:01 +02:00

perf/x86/intel: Add Jasper Lake support

The Jasper Lake processor is also based on the Tremont
microarchitecture. From the perspective of the Intel PMU, nothing has
changed compared with Elkhart Lake.
Share the perf code with Elkhart Lake.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1601296242-32763-1-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index c72e490..75dea67 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -5135,6 +5135,7 @@ __init int intel_pmu_init(void)
 
case INTEL_FAM6_ATOM_TREMONT_D:
case INTEL_FAM6_ATOM_TREMONT:
+   case INTEL_FAM6_ATOM_TREMONT_L:
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
   sizeof(hw_cache_event_ids));


[tip: perf/core] perf/x86/intel/uncore: Fix the scale of the IMC free-running events

2020-09-30 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 8191016a026b8dfbb14dea64efc8e723ee99fe65
Gitweb:
https://git.kernel.org/tip/8191016a026b8dfbb14dea64efc8e723ee99fe65
Author:Kan Liang 
AuthorDate:Mon, 28 Sep 2020 06:32:40 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 29 Sep 2020 09:57:02 +02:00

perf/x86/intel/uncore: Fix the scale of the IMC free-running events

The "MiB" result of the IMC free-running bandwidth events,
uncore_imc_free_running/read/ and uncore_imc_free_running/write/ are 16
times too small.

The "MiB" value equals the raw IMC free-running bandwidth counter value
times a "scale" which is inaccurate.

The IMC free-running bandwidth events should be incremented per 64B
cache line, not DWs (4 bytes). The "scale" should be 6.103515625e-5.
Fix the "scale" for both Snow Ridge and Ice Lake.

Fixes: 2b3b76b5ec67 ("perf/x86/intel/uncore: Add Ice Lake server uncore 
support")
Fixes: ee49532b38dd ("perf/x86/intel/uncore: Add IMC uncore support for Snow 
Ridge")
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200928133240.12977-1-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore_snbep.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/events/intel/uncore_snbep.c 
b/arch/x86/events/intel/uncore_snbep.c
index 3f1e75f..7bdb182 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -4807,10 +4807,10 @@ static struct uncore_event_desc 
snr_uncore_imc_freerunning_events[] = {
INTEL_UNCORE_EVENT_DESC(dclk,   "event=0xff,umask=0x10"),
 
INTEL_UNCORE_EVENT_DESC(read,   "event=0xff,umask=0x20"),
-   INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"),
+   INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"),
INTEL_UNCORE_EVENT_DESC(read.unit,  "MiB"),
INTEL_UNCORE_EVENT_DESC(write,  "event=0xff,umask=0x21"),
-   INTEL_UNCORE_EVENT_DESC(write.scale,"3.814697266e-6"),
+   INTEL_UNCORE_EVENT_DESC(write.scale,"6.103515625e-5"),
INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"),
{ /* end: all zeroes */ },
 };
@@ -5268,17 +5268,17 @@ static struct uncore_event_desc 
icx_uncore_imc_freerunning_events[] = {
INTEL_UNCORE_EVENT_DESC(dclk,   
"event=0xff,umask=0x10"),
 
INTEL_UNCORE_EVENT_DESC(read,   
"event=0xff,umask=0x20"),
-   INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"),
+   INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"),
INTEL_UNCORE_EVENT_DESC(read.unit,  "MiB"),
INTEL_UNCORE_EVENT_DESC(write,  
"event=0xff,umask=0x21"),
-   INTEL_UNCORE_EVENT_DESC(write.scale,"3.814697266e-6"),
+   INTEL_UNCORE_EVENT_DESC(write.scale,"6.103515625e-5"),
INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"),
 
INTEL_UNCORE_EVENT_DESC(ddrt_read,  
"event=0xff,umask=0x30"),
-   INTEL_UNCORE_EVENT_DESC(ddrt_read.scale,"3.814697266e-6"),
+   INTEL_UNCORE_EVENT_DESC(ddrt_read.scale,"6.103515625e-5"),
INTEL_UNCORE_EVENT_DESC(ddrt_read.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(ddrt_write, 
"event=0xff,umask=0x31"),
-   INTEL_UNCORE_EVENT_DESC(ddrt_write.scale,   "3.814697266e-6"),
+   INTEL_UNCORE_EVENT_DESC(ddrt_write.scale,   "6.103515625e-5"),
INTEL_UNCORE_EVENT_DESC(ddrt_write.unit,"MiB"),
{ /* end: all zeroes */ },
 };


[tip: perf/core] perf/x86/intel/uncore: Factor out uncore_pci_find_dev_pmu()

2020-09-25 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 8ed2ccaa3fa990be61619a61b9bc3914eefdc18f
Gitweb:
https://git.kernel.org/tip/8ed2ccaa3fa990be61619a61b9bc3914eefdc18f
Author:Kan Liang 
AuthorDate:Mon, 14 Sep 2020 07:34:16 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 24 Sep 2020 15:55:50 +02:00

perf/x86/intel/uncore: Factor out uncore_pci_find_dev_pmu()

When an uncore PCI sub driver gets a remove notification, the
corresponding PMU has to be retrieved and unregistered. The code that
finds the corresponding PMU by comparing against the pci_device_id
table can be shared.

Factor out uncore_pci_find_dev_pmu(), which will be used later.

There is no functional change.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1600094060-82746-3-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore.c | 48 ++---
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index e14b03f..f6ff1b9 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1008,6 +1008,37 @@ static int uncore_pci_get_dev_die_info(struct pci_dev 
*pdev,
 
return 0;
 }
+
+/*
+ * Find the PMU of a PCI device.
+ * @pdev: The PCI device.
+ * @ids: The ID table of the available PCI devices with a PMU.
+ */
+static struct intel_uncore_pmu *
+uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
+{
+   struct intel_uncore_pmu *pmu = NULL;
+   struct intel_uncore_type *type;
+   kernel_ulong_t data;
+   unsigned int devfn;
+
+   while (ids && ids->vendor) {
+   if ((ids->vendor == pdev->vendor) &&
+   (ids->device == pdev->device)) {
+   data = ids->driver_data;
+   devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(data),
+ UNCORE_PCI_DEV_FUNC(data));
+   if (devfn == pdev->devfn) {
+   type = 
uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(data)];
+   pmu = &type->pmus[UNCORE_PCI_DEV_IDX(data)];
+   break;
+   }
+   }
+   ids++;
+   }
+   return pmu;
+}
+
 /*
  * add a pci uncore device
  */
@@ -1039,21 +1070,8 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
 */
if (id->driver_data & ~0x) {
struct pci_driver *pci_drv = pdev->driver;
-   const struct pci_device_id *ids = pci_drv->id_table;
-   unsigned int devfn;
-
-   while (ids && ids->vendor) {
-   if ((ids->vendor == pdev->vendor) &&
-   (ids->device == pdev->device)) {
-   devfn = 
PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data),
- 
UNCORE_PCI_DEV_FUNC(ids->driver_data));
-   if (devfn == pdev->devfn) {
-   pmu = 
&type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)];
-   break;
-   }
-   }
-   ids++;
-   }
+
+   pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table);
if (pmu == NULL)
return -ENODEV;
} else {


[tip: perf/core] perf/x86/intel/uncore: Factor out uncore_pci_get_dev_die_info()

2020-09-25 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: fe6507338d635f283e9618b5eaa35f503a8c375b
Gitweb:
https://git.kernel.org/tip/fe6507338d635f283e9618b5eaa35f503a8c375b
Author:Kan Liang 
AuthorDate:Mon, 14 Sep 2020 07:34:15 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 24 Sep 2020 15:55:50 +02:00

perf/x86/intel/uncore: Factor out uncore_pci_get_dev_die_info()

The socket and die information is required to register/unregister a PMU
in the uncore PCI sub driver. The code that gets the socket and die
information from a bus number can be shared.

Factor out uncore_pci_get_dev_die_info(), which will be used later.

There is no functional change.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1600094060-82746-2-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore.c | 31 +++
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index d5c6d3b..e14b03f 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -989,6 +989,26 @@ uncore_types_init(struct intel_uncore_type **types, bool 
setid)
 }
 
 /*
+ * Get the die information of a PCI device.
+ * @pdev: The PCI device.
+ * @phys_id: The physical socket id which the device maps to.
+ * @die: The die id which the device maps to.
+ */
+static int uncore_pci_get_dev_die_info(struct pci_dev *pdev,
+  int *phys_id, int *die)
+{
+   *phys_id = uncore_pcibus_to_physid(pdev->bus);
+   if (*phys_id < 0)
+   return -ENODEV;
+
+   *die = (topology_max_die_per_package() > 1) ? *phys_id :
+   topology_phys_to_logical_pkg(*phys_id);
+   if (*die < 0)
+   return -EINVAL;
+
+   return 0;
+}
+/*
  * add a pci uncore device
  */
 static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id 
*id)
@@ -998,14 +1018,9 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
struct intel_uncore_box *box;
int phys_id, die, ret;
 
-   phys_id = uncore_pcibus_to_physid(pdev->bus);
-   if (phys_id < 0)
-   return -ENODEV;
-
-   die = (topology_max_die_per_package() > 1) ? phys_id :
-   topology_phys_to_logical_pkg(phys_id);
-   if (die < 0)
-   return -EINVAL;
+   ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die);
+   if (ret)
+   return ret;
 
if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
int idx = UNCORE_PCI_DEV_IDX(id->driver_data);


[tip: perf/core] perf/x86/intel/uncore: Factor out uncore_pci_pmu_unregister()

2020-09-25 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: cdcce92a3a03bccbb0b4a0342fc7e279fc507bc3
Gitweb:
https://git.kernel.org/tip/cdcce92a3a03bccbb0b4a0342fc7e279fc507bc3
Author:Kan Liang 
AuthorDate:Mon, 14 Sep 2020 07:34:18 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 24 Sep 2020 15:55:51 +02:00

perf/x86/intel/uncore: Factor out uncore_pci_pmu_unregister()

The PMU unregistration in the uncore PCI sub driver is similar to the
normal PMU unregistration for a PCI device. The code to unregister a
PCI PMU can be shared.

Factor out uncore_pci_pmu_unregister(), which will be used later.

Use uncore_pci_get_dev_die_info() to replace the code which retrieves
the socket and die information.

The pci_set_drvdata() call is not included in
uncore_pci_pmu_unregister() either, because the uncore PCI sub driver
will not touch the private driver data pointer of the device.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1600094060-82746-5-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore.c | 35 +++--
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 6c6f8b3..747d237 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1137,18 +1137,38 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
return ret;
 }
 
+/*
+ * Unregister the PMU of a PCI device
+ * @pmu: The corresponding PMU is unregistered.
+ * @phys_id: The physical socket id which the device maps to.
+ * @die: The die id which the device maps to.
+ */
+static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu,
+ int phys_id, int die)
+{
+   struct intel_uncore_box *box = pmu->boxes[die];
+
+   if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
+   return;
+
+   pmu->boxes[die] = NULL;
+   if (atomic_dec_return(&pmu->activeboxes) == 0)
+   uncore_pmu_unregister(pmu);
+   uncore_box_exit(box);
+   kfree(box);
+}
+
 static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box;
struct intel_uncore_pmu *pmu;
int i, phys_id, die;
 
-   phys_id = uncore_pcibus_to_physid(pdev->bus);
+   if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+   return;
 
box = pci_get_drvdata(pdev);
if (!box) {
-   die = (topology_max_die_per_package() > 1) ? phys_id :
-   topology_phys_to_logical_pkg(phys_id);
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
if (uncore_extra_pci_dev[die].dev[i] == pdev) {
uncore_extra_pci_dev[die].dev[i] = NULL;
@@ -1160,15 +1180,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
}
 
pmu = box->pmu;
-   if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
-   return;
 
pci_set_drvdata(pdev, NULL);
-   pmu->boxes[box->dieid] = NULL;
-   if (atomic_dec_return(&pmu->activeboxes) == 0)
-   uncore_pmu_unregister(pmu);
-   uncore_box_exit(box);
-   kfree(box);
+
+   uncore_pci_pmu_unregister(pmu, phys_id, die);
 }
 
 static int __init uncore_pci_init(void)


[tip: perf/core] perf/x86/intel/uncore: Generic support for the PCI sub driver

2020-09-25 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 95a7fc77443328ac8b68378df8e137a044ece5e8
Gitweb:
https://git.kernel.org/tip/95a7fc77443328ac8b68378df8e137a044ece5e8
Author:Kan Liang 
AuthorDate:Mon, 14 Sep 2020 07:34:19 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 24 Sep 2020 15:55:51 +02:00

perf/x86/intel/uncore: Generic support for the PCI sub driver

Some uncore counters may be located in the configuration space of a PCI
device which already has a bound driver. Currently, the uncore driver
cannot register a PCI uncore PMU for these counters, because, to
register a PCI uncore PMU, the uncore driver must be bound to the
device. However, one device can only have one bound driver.

Add an uncore PCI sub driver to support such devices.

The sub driver doesn't own the device. During initialization, the sub
driver searches for the device via pci_get_device() and registers the
corresponding PMU for the device. It also registers a PCI bus notifier,
which notifies the sub driver once the device is removed, so that the
sub driver can unregister the PMU accordingly.

The sub driver only searches for the devices defined in its id table.
The id table varies between platforms and will be implemented in the
following platform-specific patch.

Suggested-by: Bjorn Helgaas 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1600094060-82746-6-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore.c | 81 +-
 arch/x86/events/intel/uncore.h |  1 +-
 2 files changed, 82 insertions(+)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 747d237..ce0a5ba 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -12,6 +12,8 @@ struct intel_uncore_type **uncore_mmio_uncores = empty_uncore;
 
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
+/* The PCI driver for the device which the uncore doesn't own. */
+struct pci_driver *uncore_pci_sub_driver;
 /* pci bus to socket mapping */
 DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
 struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
@@ -1186,6 +1188,80 @@ static void uncore_pci_remove(struct pci_dev *pdev)
uncore_pci_pmu_unregister(pmu, phys_id, die);
 }
 
+static int uncore_bus_notify(struct notifier_block *nb,
+unsigned long action, void *data)
+{
+   struct device *dev = data;
+   struct pci_dev *pdev = to_pci_dev(dev);
+   struct intel_uncore_pmu *pmu;
+   int phys_id, die;
+
+   /* Unregister the PMU when the device is going to be deleted. */
+   if (action != BUS_NOTIFY_DEL_DEVICE)
+   return NOTIFY_DONE;
+
+   pmu = uncore_pci_find_dev_pmu(pdev, uncore_pci_sub_driver->id_table);
+   if (!pmu)
+   return NOTIFY_DONE;
+
+   if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+   return NOTIFY_DONE;
+
+   uncore_pci_pmu_unregister(pmu, phys_id, die);
+
+   return NOTIFY_OK;
+}
+
+static struct notifier_block uncore_notifier = {
+   .notifier_call = uncore_bus_notify,
+};
+
+static void uncore_pci_sub_driver_init(void)
+{
+   const struct pci_device_id *ids = uncore_pci_sub_driver->id_table;
+   struct intel_uncore_type *type;
+   struct intel_uncore_pmu *pmu;
+   struct pci_dev *pci_sub_dev;
+   bool notify = false;
+   unsigned int devfn;
+   int phys_id, die;
+
+   while (ids && ids->vendor) {
+   pci_sub_dev = NULL;
+   type = 
uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(ids->driver_data)];
+   /*
+* Search the available device, and register the
+* corresponding PMU.
+*/
+   while ((pci_sub_dev = pci_get_device(PCI_VENDOR_ID_INTEL,
+ids->device, 
pci_sub_dev))) {
+   devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data),
+ 
UNCORE_PCI_DEV_FUNC(ids->driver_data));
+   if (devfn != pci_sub_dev->devfn)
+   continue;
+
+   pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)];
+   if (!pmu)
+   continue;
+
+   if (uncore_pci_get_dev_die_info(pci_sub_dev,
+   &phys_id, &die))
+   continue;
+
+   if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu,
+phys_id, die))
+   notify = true;
+   }
+   ids++;
+   }
+
+   if (notify && bus_register_notifier(&pci_bus_type, &uncore_notifier))
+   notif

[tip: perf/core] perf/x86/intel/uncore: Support PCIe3 unit on Snow Ridge

2020-09-25 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: a3b1e8451d3fd54fe0df661c2c4f983932b3c0bc
Gitweb:
https://git.kernel.org/tip/a3b1e8451d3fd54fe0df661c2c4f983932b3c0bc
Author:Kan Liang 
AuthorDate:Mon, 14 Sep 2020 07:34:20 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 24 Sep 2020 15:55:52 +02:00

perf/x86/intel/uncore: Support PCIe3 unit on Snow Ridge

The Snow Ridge integrated PCIe3 uncore unit can be used to collect
performance data, e.g. utilization, between PCIe devices, plugged into
the PCIe port, and the components (in M2IOSF) responsible for
translating and managing requests to/from the device. The performance
data is very useful for analyzing the performance of PCIe devices.

The device with the PCIe3 uncore PMON units is owned by the portdrv_pci
driver. Create a PCI sub driver for the PCIe3 uncore PMON units.

Here are some differences between the PCIe3 uncore unit and other
uncore PCI units.
- There may be several Root Ports on a system, but the uncore counters
  only exist in Root Port A. A user can configure the channel mask to
  collect data from the other Root Ports.
- The event format of the PCIe3 uncore unit is the same as IIO unit of
  SKX.
- The Control Register of PCIe3 uncore unit is 64 bits.
- The offset of each counter is 8, which is the same as the M2M unit
  of SNR.
- New MSR addresses for unit control, counter and counter config.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1600094060-82746-7-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore_snbep.c | 53 +++-
 1 file changed, 53 insertions(+)

diff --git a/arch/x86/events/intel/uncore_snbep.c 
b/arch/x86/events/intel/uncore_snbep.c
index 62e88ad..495056f 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -393,6 +393,11 @@
 #define SNR_M2M_PCI_PMON_BOX_CTL   0x438
 #define SNR_M2M_PCI_PMON_UMASK_EXT 0xff
 
+/* SNR PCIE3 */
+#define SNR_PCIE3_PCI_PMON_CTL00x508
+#define SNR_PCIE3_PCI_PMON_CTR00x4e8
+#define SNR_PCIE3_PCI_PMON_BOX_CTL 0x4e0
+
 /* SNR IMC */
 #define SNR_IMC_MMIO_PMON_FIXED_CTL0x54
 #define SNR_IMC_MMIO_PMON_FIXED_CTR0x38
@@ -4551,12 +4556,46 @@ static struct intel_uncore_type snr_uncore_m2m = {
.format_group   = &snr_m2m_uncore_format_group,
 };
 
+static void snr_uncore_pci_enable_event(struct intel_uncore_box *box, struct 
perf_event *event)
+{
+   struct pci_dev *pdev = box->pci_dev;
+   struct hw_perf_event *hwc = &event->hw;
+
+   pci_write_config_dword(pdev, hwc->config_base, (u32)(hwc->config | 
SNBEP_PMON_CTL_EN));
+   pci_write_config_dword(pdev, hwc->config_base + 4, (u32)(hwc->config >> 
32));
+}
+
+static struct intel_uncore_ops snr_pcie3_uncore_pci_ops = {
+   .init_box   = snr_m2m_uncore_pci_init_box,
+   .disable_box= snbep_uncore_pci_disable_box,
+   .enable_box = snbep_uncore_pci_enable_box,
+   .disable_event  = snbep_uncore_pci_disable_event,
+   .enable_event   = snr_uncore_pci_enable_event,
+   .read_counter   = snbep_uncore_pci_read_counter,
+};
+
+static struct intel_uncore_type snr_uncore_pcie3 = {
+   .name   = "pcie3",
+   .num_counters   = 4,
+   .num_boxes  = 1,
+   .perf_ctr_bits  = 48,
+   .perf_ctr   = SNR_PCIE3_PCI_PMON_CTR0,
+   .event_ctl  = SNR_PCIE3_PCI_PMON_CTL0,
+   .event_mask = SKX_IIO_PMON_RAW_EVENT_MASK,
+   .event_mask_ext = SKX_IIO_PMON_RAW_EVENT_MASK_EXT,
+   .box_ctl= SNR_PCIE3_PCI_PMON_BOX_CTL,
+   .ops= &snr_pcie3_uncore_pci_ops,
+   .format_group   = &skx_uncore_iio_format_group,
+};
+
 enum {
SNR_PCI_UNCORE_M2M,
+   SNR_PCI_UNCORE_PCIE3,
 };
 
 static struct intel_uncore_type *snr_pci_uncores[] = {
[SNR_PCI_UNCORE_M2M]= &snr_uncore_m2m,
+   [SNR_PCI_UNCORE_PCIE3]  = &snr_uncore_pcie3,
NULL,
 };
 
@@ -4573,6 +4612,19 @@ static struct pci_driver snr_uncore_pci_driver = {
.id_table   = snr_uncore_pci_ids,
 };
 
+static const struct pci_device_id snr_uncore_pci_sub_ids[] = {
+   { /* PCIe3 RP */
+   PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x334a),
+   .driver_data = UNCORE_PCI_DEV_FULL_DATA(4, 0, 
SNR_PCI_UNCORE_PCIE3, 0),
+   },
+   { /* end: all zeroes */ }
+};
+
+static struct pci_driver snr_uncore_pci_sub_driver = {
+   .name   = "snr_uncore_sub",
+   .id_table   = snr_uncore_pci_sub_ids,
+};
+
 int snr_uncore_pci_init(void)
 {
/* SNR UBOX DID */
@@ -4584,6 +4636,7 @@ int snr_uncore_pci_init(void)
 
uncore_pci_uncores = snr_pci_uncores;
uncore_pci_driver = &snr_uncore_pci_driver;
+   uncore_pci_sub_driver = &snr_uncore_pci_sub_driver;
return 0;
 }
 


[tip: perf/core] perf/x86/intel/uncore: Factor out uncore_pci_pmu_register()

2020-09-25 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 16fa64315c1bd2a61fb20d6aa9a542dd5bf52971
Gitweb:
https://git.kernel.org/tip/16fa64315c1bd2a61fb20d6aa9a542dd5bf52971
Author:Kan Liang 
AuthorDate:Mon, 14 Sep 2020 07:34:17 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 24 Sep 2020 15:55:51 +02:00

perf/x86/intel/uncore: Factor out uncore_pci_pmu_register()

The PMU registration in the uncore PCI sub driver is similar to the
normal PMU registration for a PCI device. The code to register a PCI
PMU can be shared.

Factor out uncore_pci_pmu_register(), which will be used later.

The pci_set_drvdata() is not included in uncore_pci_pmu_register(). The
uncore PCI sub driver doesn't own the PCI device. It will not touch the
private driver data pointer for the device.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1600094060-82746-4-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/uncore.c | 82 -
 1 file changed, 51 insertions(+), 31 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index f6ff1b9..6c6f8b3 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1040,13 +1040,61 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const 
struct pci_device_id *ids)
 }
 
 /*
+ * Register the PMU for a PCI device
+ * @pdev: The PCI device.
+ * @type: The corresponding PMU type of the device.
+ * @pmu: The corresponding PMU of the device.
+ * @phys_id: The physical socket id which the device maps to.
+ * @die: The die id which the device maps to.
+ */
+static int uncore_pci_pmu_register(struct pci_dev *pdev,
+  struct intel_uncore_type *type,
+  struct intel_uncore_pmu *pmu,
+  int phys_id, int die)
+{
+   struct intel_uncore_box *box;
+   int ret;
+
+   if (WARN_ON_ONCE(pmu->boxes[die] != NULL))
+   return -EINVAL;
+
+   box = uncore_alloc_box(type, NUMA_NO_NODE);
+   if (!box)
+   return -ENOMEM;
+
+   if (pmu->func_id < 0)
+   pmu->func_id = pdev->devfn;
+   else
+   WARN_ON_ONCE(pmu->func_id != pdev->devfn);
+
+   atomic_inc(&box->refcnt);
+   box->pci_phys_id = phys_id;
+   box->dieid = die;
+   box->pci_dev = pdev;
+   box->pmu = pmu;
+   uncore_box_init(box);
+
+   pmu->boxes[die] = box;
+   if (atomic_inc_return(&pmu->activeboxes) > 1)
+   return 0;
+
+   /* First active box registers the pmu */
+   ret = uncore_pmu_register(pmu);
+   if (ret) {
+   pmu->boxes[die] = NULL;
+   uncore_box_exit(box);
+   kfree(box);
+   }
+   return ret;
+}
+
+/*
  * add a pci uncore device
  */
 static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id 
*id)
 {
struct intel_uncore_type *type;
struct intel_uncore_pmu *pmu = NULL;
-   struct intel_uncore_box *box;
int phys_id, die, ret;
 
ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die);
@@ -1082,38 +1130,10 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
}
 
-   if (WARN_ON_ONCE(pmu->boxes[die] != NULL))
-   return -EINVAL;
-
-   box = uncore_alloc_box(type, NUMA_NO_NODE);
-   if (!box)
-   return -ENOMEM;
-
-   if (pmu->func_id < 0)
-   pmu->func_id = pdev->devfn;
-   else
-   WARN_ON_ONCE(pmu->func_id != pdev->devfn);
-
-   atomic_inc(&box->refcnt);
-   box->pci_phys_id = phys_id;
-   box->dieid = die;
-   box->pci_dev = pdev;
-   box->pmu = pmu;
-   uncore_box_init(box);
-   pci_set_drvdata(pdev, box);
+   ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die);
 
-   pmu->boxes[die] = box;
-   if (atomic_inc_return(&pmu->activeboxes) > 1)
-   return 0;
+   pci_set_drvdata(pdev, pmu->boxes[die]);
 
-   /* First active box registers the pmu */
-   ret = uncore_pmu_register(pmu);
-   if (ret) {
-   pci_set_drvdata(pdev, NULL);
-   pmu->boxes[die] = NULL;
-   uncore_box_exit(box);
-   kfree(box);
-   }
return ret;
 }
 


[tip: perf/core] perf/core: Pull pmu::sched_task() into perf_event_context_sched_out()

2020-09-11 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 44fae179ce73a26733d9e2d346da4e1a1cb94647
Gitweb:
https://git.kernel.org/tip/44fae179ce73a26733d9e2d346da4e1a1cb94647
Author:Kan Liang 
AuthorDate:Fri, 21 Aug 2020 12:57:53 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 10 Sep 2020 11:19:34 +02:00

perf/core: Pull pmu::sched_task() into perf_event_context_sched_out()

The pmu::sched_task() is a context switch callback. It passes the
cpuctx->task_ctx as a parameter to the lower code. To find the
cpuctx->task_ctx, the current code iterates a cpuctx list.
The same context will be iterated in perf_event_context_sched_out()
soon. Sharing the cpuctx->task_ctx avoids the unnecessary iteration of
the cpuctx list.

The pmu::sched_task() is also required for the optimization case for
equivalent contexts.

The task_ctx_sched_out() will eventually disable and reenable the PMU
when scheduling out events. Adding perf_pmu_disable() and
perf_pmu_enable() around task_ctx_sched_out() doesn't break anything.

Drop the cpuctx->ctx.lock for the pmu::sched_task(). The lock is for the
per-CPU context, which is not necessary for the per-task context
schedule.

No one uses sched_cb_entry, perf_sched_cb_usages, sched_cb_list, and
perf_pmu_sched_task() any more.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200821195754.20159-2-kan.li...@linux.intel.com
---
 include/linux/perf_event.h |  1 +-
 kernel/events/core.c   | 47 +
 2 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 46a3974..0c19d27 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -872,7 +872,6 @@ struct perf_cpu_context {
struct list_headcgrp_cpuctx_entry;
 #endif
 
-   struct list_headsched_cb_entry;
int sched_cb_usage;
 
int online;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3f5fec4..45edb85 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -382,7 +382,6 @@ static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -3384,10 +3383,12 @@ static void perf_event_context_sched_out(struct 
task_struct *task, int ctxn,
struct perf_event_context *parent, *next_parent;
struct perf_cpu_context *cpuctx;
int do_switch = 1;
+   struct pmu *pmu;
 
if (likely(!ctx))
return;
 
+   pmu = ctx->pmu;
cpuctx = __get_cpu_context(ctx);
if (!cpuctx->task_ctx)
return;
@@ -3417,11 +3418,15 @@ static void perf_event_context_sched_out(struct 
task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
-   struct pmu *pmu = ctx->pmu;
 
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
 
+   perf_pmu_disable(pmu);
+
+   if (cpuctx->sched_cb_usage && pmu->sched_task)
+   pmu->sched_task(ctx, false);
+
/*
 * PMU specific parts of task perf context can require
 * additional synchronization. As an example of such
@@ -3433,6 +3438,8 @@ static void perf_event_context_sched_out(struct 
task_struct *task, int ctxn,
else
swap(ctx->task_ctx_data, 
next_ctx->task_ctx_data);
 
+   perf_pmu_enable(pmu);
+
/*
 * RCU_INIT_POINTER here is safe because we've not
 * modified the ctx and the above modification of
@@ -3455,21 +3462,22 @@ unlock:
 
if (do_switch) {
raw_spin_lock(&ctx->lock);
+   perf_pmu_disable(pmu);
+
+   if (cpuctx->sched_cb_usage && pmu->sched_task)
+   pmu->sched_task(ctx, false);
task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+
+   perf_pmu_enable(pmu);
raw_spin_unlock(&ctx->lock);
}
 }
 
-static DEFINE_PER_CPU(struct list_head, sched_cb_list);
-
 void perf_sched_cb_dec(struct pmu *pmu)
 {
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-   this_cpu_dec(perf_sched_cb_usages);
-
-   if (!--cpuctx->sched_cb_usage)
-   list_del(&cpuctx->sched_cb_entry);
+   --cpuctx->sched_cb_usage;
 }

[tip: perf/core] perf/core: Pull pmu::sched_task() into perf_event_context_sched_in()

2020-09-11 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 556cccad389717d6eb4f5a24b45ff41cad3aaabf
Gitweb:
https://git.kernel.org/tip/556cccad389717d6eb4f5a24b45ff41cad3aaabf
Author:Kan Liang 
AuthorDate:Fri, 21 Aug 2020 12:57:52 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 10 Sep 2020 11:19:34 +02:00

perf/core: Pull pmu::sched_task() into perf_event_context_sched_in()

The pmu::sched_task() is a context switch callback. It passes the
cpuctx->task_ctx as a parameter to the lower code. To find the
cpuctx->task_ctx, the current code iterates a cpuctx list.

The same context was just iterated in perf_event_context_sched_in(),
which is invoked right before the pmu::sched_task().

Reusing the cpuctx->task_ctx from perf_event_context_sched_in() avoids
the unnecessary iteration of the cpuctx list.

Both pmu::sched_task() and perf_event_context_sched_in() have to
disable the PMU. Pulling pmu::sched_task() into
perf_event_context_sched_in() also saves the overhead of the extra PMU
disable and reenable.

The new and old tasks may have equivalent contexts. The current code
optimizes this case by swapping the contexts, which avoids the
rescheduling. For this case, pmu::sched_task() is still required, e.g.,
to restore the LBR content.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200821195754.20159-1-kan.li...@linux.intel.com
---
 kernel/events/core.c | 51 ++-
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 57efe3b..3f5fec4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3491,30 +3491,36 @@ void perf_sched_cb_inc(struct pmu *pmu)
  * PEBS requires this to provide PID/TID information. This requires we flush
  * all queued PEBS records before we context switch to a new task.
  */
+static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool 
sched_in)
+{
+   struct pmu *pmu;
+
+   pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+
+   if (WARN_ON_ONCE(!pmu->sched_task))
+   return;
+
+   perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+   perf_pmu_disable(pmu);
+
+   pmu->sched_task(cpuctx->task_ctx, sched_in);
+
+   perf_pmu_enable(pmu);
+   perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+
 static void perf_pmu_sched_task(struct task_struct *prev,
struct task_struct *next,
bool sched_in)
 {
struct perf_cpu_context *cpuctx;
-   struct pmu *pmu;
 
if (prev == next)
return;
 
-   list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), 
sched_cb_entry) {
-   pmu = cpuctx->ctx.pmu; /* software PMUs will not have 
sched_task */
-
-   if (WARN_ON_ONCE(!pmu->sched_task))
-   continue;
-
-   perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-   perf_pmu_disable(pmu);
-
-   pmu->sched_task(cpuctx->task_ctx, sched_in);
+   list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), 
sched_cb_entry)
+   __perf_pmu_sched_task(cpuctx, sched_in);
 
-   perf_pmu_enable(pmu);
-   perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-   }
 }
 
 static void perf_event_switch(struct task_struct *task,
@@ -3773,10 +3779,14 @@ static void perf_event_context_sched_in(struct 
perf_event_context *ctx,
struct task_struct *task)
 {
struct perf_cpu_context *cpuctx;
+   struct pmu *pmu = ctx->pmu;
 
cpuctx = __get_cpu_context(ctx);
-   if (cpuctx->task_ctx == ctx)
+   if (cpuctx->task_ctx == ctx) {
+   if (cpuctx->sched_cb_usage)
+   __perf_pmu_sched_task(cpuctx, true);
return;
+   }
 
perf_ctx_lock(cpuctx, ctx);
/*
@@ -3786,7 +3796,7 @@ static void perf_event_context_sched_in(struct 
perf_event_context *ctx,
if (!ctx->nr_events)
goto unlock;
 
-   perf_pmu_disable(ctx->pmu);
+   perf_pmu_disable(pmu);
/*
 * We want to keep the following priority order:
 * cpu pinned (that don't need to move), task pinned,
@@ -3798,7 +3808,11 @@ static void perf_event_context_sched_in(struct 
perf_event_context *ctx,
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
-   perf_pmu_enable(ctx->pmu);
+
+   if (cpuctx->sched_cb_usage && pmu->sched_task)
+   pmu->sched_task(cpuctx->task_ctx, true);
+
+   perf_pmu_enable(pmu);
 
 unlock:
perf_ctx_unlock(cpuctx, ctx);
@@ -3841,9 +3855,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 
if (atomic_read(&nr_switch_events))
perf_ev

[tip: perf/core] perf/x86/intel/ds: Fix x86_pmu_stop warning for large PEBS

2020-09-11 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 35d1ce6bec133679ff16325d335217f108b84871
Gitweb:
https://git.kernel.org/tip/35d1ce6bec133679ff16325d335217f108b84871
Author:Kan Liang 
AuthorDate:Wed, 02 Sep 2020 14:06:49 -07:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 10 Sep 2020 11:19:33 +02:00

perf/x86/intel/ds: Fix x86_pmu_stop warning for large PEBS

A warning as below may be triggered when sampling with large PEBS.

[  410.411250] perf: interrupt took too long (72145 > 71975), lowering
kernel.perf_event_max_sample_rate to 2000
[  410.724923] [ cut here ]
[  410.729822] WARNING: CPU: 0 PID: 16397 at arch/x86/events/core.c:1422
x86_pmu_stop+0x95/0xa0
[  410.933811]  x86_pmu_del+0x50/0x150
[  410.937304]  event_sched_out.isra.0+0xbc/0x210
[  410.941751]  group_sched_out.part.0+0x53/0xd0
[  410.946111]  ctx_sched_out+0x193/0x270
[  410.949862]  __perf_event_task_sched_out+0x32c/0x890
[  410.954827]  ? set_next_entity+0x98/0x2d0
[  410.958841]  __schedule+0x592/0x9c0
[  410.962332]  schedule+0x5f/0xd0
[  410.965477]  exit_to_usermode_loop+0x73/0x120
[  410.969837]  prepare_exit_to_usermode+0xcd/0xf0
[  410.974369]  ret_from_intr+0x2a/0x3a
[  410.977946] RIP: 0033:0x40123c
[  411.079661] ---[ end trace bc83adaea7bb664a ]---

In the non-overflow context, e.g., context switch, with large PEBS, perf
may stop an event twice. An example is below.

  //max_samples_per_tick is adjusted to 2
  //NMI is triggered
  intel_pmu_handle_irq()
 handle_pmi_common()
   drain_pebs()
 __intel_pmu_pebs_event()
   perf_event_overflow()
 __perf_event_account_interrupt()
   hwc->interrupts = 1
   return 0
  //A context switch happens right after the NMI.
  //In the same tick, the perf_throttled_seq is not changed.
  perf_event_task_sched_out()
 perf_pmu_sched_task()
   intel_pmu_drain_pebs_buffer()
 __intel_pmu_pebs_event()
   perf_event_overflow()
 __perf_event_account_interrupt()
   ++hwc->interrupts >= max_samples_per_tick
   return 1
   x86_pmu_stop();  # First stop
 perf_event_context_sched_out()
   task_ctx_sched_out()
 ctx_sched_out()
   event_sched_out()
 x86_pmu_del()
   x86_pmu_stop();  # Second stop and trigger the warning

Perf should only invoke the perf_event_overflow() in the overflow
context.

Current drain_pebs() is called from:
- handle_pmi_common()   -- overflow context
- intel_pmu_pebs_sched_task()   -- non-overflow context
- intel_pmu_pebs_disable()  -- non-overflow context
- intel_pmu_auto_reload_read()  -- possible overflow context
  With PERF_SAMPLE_READ + PERF_FORMAT_GROUP, the function may be
  invoked in the NMI handler. But, before calling the function, the
  PEBS buffer has already been drained. The __intel_pmu_pebs_event()
  will not be called in the possible overflow context.

To fix the issue, an indicator is required to distinguish between the
overflow context aka handle_pmi_common() and other cases.
The dummy regs pointer can be used as the indicator.

In the non-overflow context, perf should treat the last record the same
as other PEBS records, and not invoke the generic overflow handler.
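
A sketch of the intended last-record handling in __intel_pmu_pebs_event(),
consistent with the description above (abbreviated, illustration only):

	setup_sample(event, iregs, at, &data, regs);
	if (iregs == &dummy_iregs) {
		/*
		 * Non-overflow context (the caller passed a NULL iregs):
		 * treat the last record like any other PEBS record and
		 * do not invoke the generic overflow handler.
		 */
		perf_event_output(event, &data, regs);
	} else {
		/* Overflow context (NMI): the overflow handler may throttle. */
		if (perf_event_overflow(event, &data, regs))
			x86_pmu_stop(event, 0);
	}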

Fixes: 21509084f999 ("perf/x86/intel: Handle multiple records in the PEBS 
buffer")
Reported-by: Like Xu 
Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Tested-by: Like Xu 
Link: https://lkml.kernel.org/r/20200902210649.2743-1-kan.li...@linux.intel.com
---
 arch/x86/events/intel/ds.c | 32 
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 86848c5..404315d 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -670,9 +670,7 @@ unlock:
 
 static inline void intel_pmu_drain_pebs_buffer(void)
 {
-   struct pt_regs regs;
-
-   x86_pmu.drain_pebs(®s);
+   x86_pmu.drain_pebs(NULL);
 }
 
 /*
@@ -1737,6 +1735,7 @@ static void __intel_pmu_pebs_event(struct perf_event 
*event,
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
void *at = get_next_pebs_record_by_bit(base, top, bit);
+   struct pt_regs dummy_iregs;
 
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
/*
@@ -1749,6 +1748,9 @@ static void __intel_pmu_pebs_event(struct perf_event 
*event,
} else if (!intel_pmu_save_and_restart(event))
return;
 
+   if (!iregs)
+   iregs = &dummy_iregs;
+
while (count > 1) {
setup_sample(event, iregs, at, &data, regs);
perf_event_output(event, &data, regs);
@@ -1758,16 +1760,22 @@ static void __intel_pmu_pebs_event(struct perf_event 
*event,
}
 
setup_sample(e

[tip: perf/core] perf/x86/intel: Move BTS index to 47

2020-08-19 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: d39fcc32893dac2d02900d99c38276a00cc54d60
Gitweb:
https://git.kernel.org/tip/d39fcc32893dac2d02900d99c38276a00cc54d60
Author:Kan Liang 
AuthorDate:Thu, 23 Jul 2020 10:11:07 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 18 Aug 2020 16:34:35 +02:00

perf/x86/intel: Move BTS index to 47

Bit 48 in PERF_GLOBAL_STATUS is used to indicate the overflow status of
the PERF_METRICS counters.

Move the BTS index to bit 47.
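
For reference (assuming INTEL_PMC_IDX_FIXED is 32, as in the current code):

	old: INTEL_PMC_IDX_FIXED_BTS = INTEL_PMC_IDX_FIXED + 16 = 48  /* clashes with the PERF_METRICS overflow bit */
	new: INTEL_PMC_IDX_FIXED_BTS = INTEL_PMC_IDX_FIXED + 15 = 47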

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200723171117.9918-5-kan.li...@linux.intel.com
---
 arch/x86/include/asm/perf_event.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index fe8110a..58419e5 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -238,11 +238,11 @@ struct x86_pmu_capability {
 /*
  * We model BTS tracing as another fixed-mode PMC.
  *
- * We choose a value in the middle of the fixed event range, since lower
+ * We choose the value 47 for the fixed index of BTS, since lower
  * values are used by actual fixed events and higher values are used
  * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
  */
-#define INTEL_PMC_IDX_FIXED_BTS(INTEL_PMC_IDX_FIXED + 
16)
+#define INTEL_PMC_IDX_FIXED_BTS(INTEL_PMC_IDX_FIXED + 
15)
 
 #define GLOBAL_STATUS_COND_CHG BIT_ULL(63)
 #define GLOBAL_STATUS_BUFFER_OVF_BIT   62


[tip: perf/core] perf/x86: Use event_base_rdpmc for the RDPMC userspace support

2020-08-19 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 75608cb02ea5dd997990e2998eca3670cb71a18c
Gitweb:
https://git.kernel.org/tip/75608cb02ea5dd997990e2998eca3670cb71a18c
Author:Kan Liang 
AuthorDate:Thu, 23 Jul 2020 10:11:04 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 18 Aug 2020 16:34:34 +02:00

perf/x86: Use event_base_rdpmc for the RDPMC userspace support

The RDPMC index is always re-calculated for the RDPMC userspace support,
which is unnecessary.

The RDPMC index value is stored in the variable event_base_rdpmc for
the kernel usage, which can be used for RDPMC userspace support as well.
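
For context, userspace consumes this index through the perf mmap page. A
minimal sketch (illustration only; it assumes the event has already been
opened and mmap()ed and that cap_user_rdpmc is set):

	#include <linux/perf_event.h>
	#include <stdint.h>

	static inline uint64_t rdpmc(uint32_t counter)
	{
		uint32_t lo, hi;

		asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
		return lo | ((uint64_t)hi << 32);
	}

	/* pc->index is x86_pmu_event_idx(), i.e. event_base_rdpmc + 1; 0 means no RDPMC. */
	static uint64_t read_count(volatile struct perf_event_mmap_page *pc)
	{
		uint64_t count;
		uint32_t seq, idx;

		do {
			seq = pc->lock;
			__sync_synchronize();
			idx = pc->index;
			count = pc->offset;
			if (idx)
				count += rdpmc(idx - 1);
			__sync_synchronize();
		} while (pc->lock != seq);

		return count;
	}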

Suggested-by: Peter Zijlstra 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200723171117.9918-2-kan.li...@linux.intel.com
---
 arch/x86/events/core.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 1cbf57d..8e108ea 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2208,17 +2208,12 @@ static void x86_pmu_event_unmapped(struct perf_event 
*event, struct mm_struct *m
 
 static int x86_pmu_event_idx(struct perf_event *event)
 {
-   int idx = event->hw.idx;
+   struct hw_perf_event *hwc = &event->hw;
 
-   if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+   if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED))
return 0;
 
-   if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
-   idx -= INTEL_PMC_IDX_FIXED;
-   idx |= 1 << 30;
-   }
-
-   return idx + 1;
+   return hwc->event_base_rdpmc + 1;
 }
 
 static ssize_t get_attr_rdpmc(struct device *cdev,


[tip: perf/core] perf/x86/intel: Name the global status bit in NMI handler

2020-08-19 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 60a2a271cf05cf046c522e1d7f62116b4bcb32a2
Gitweb:
https://git.kernel.org/tip/60a2a271cf05cf046c522e1d7f62116b4bcb32a2
Author:Kan Liang 
AuthorDate:Thu, 23 Jul 2020 10:11:05 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 18 Aug 2020 16:34:34 +02:00

perf/x86/intel: Name the global status bit in NMI handler

Magic numbers are used in the current NMI handler for the global status
bits. Use meaningful names to replace the magic numbers and improve the
readability of the code.

Remove a Tab for all GLOBAL_STATUS_* and INTEL_PMC_IDX_FIXED_BTS macros
to reduce the length of the line.

Suggested-by: Peter Zijlstra 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200723171117.9918-3-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c  |  4 ++--
 arch/x86/include/asm/perf_event.h | 22 --
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 5096347..ac1408f 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2389,7 +2389,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 
status)
/*
 * PEBS overflow sets bit 62 in the global status register
 */
-   if (__test_and_clear_bit(62, (unsigned long *)&status)) {
+   if (__test_and_clear_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, (unsigned long 
*)&status)) {
u64 pebs_enabled = cpuc->pebs_enabled;
 
handled++;
@@ -2410,7 +2410,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 
status)
/*
 * Intel PT
 */
-   if (__test_and_clear_bit(55, (unsigned long *)&status)) {
+   if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned 
long *)&status)) {
handled++;
if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() &&
perf_guest_cbs->handle_intel_pt_intr))
diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index 0c1b137..fd3eba6 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -225,16 +225,18 @@ struct x86_pmu_capability {
  * values are used by actual fixed events and higher values are used
  * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
  */
-#define INTEL_PMC_IDX_FIXED_BTS
(INTEL_PMC_IDX_FIXED + 16)
-
-#define GLOBAL_STATUS_COND_CHG BIT_ULL(63)
-#define GLOBAL_STATUS_BUFFER_OVF   BIT_ULL(62)
-#define GLOBAL_STATUS_UNC_OVF  BIT_ULL(61)
-#define GLOBAL_STATUS_ASIF BIT_ULL(60)
-#define GLOBAL_STATUS_COUNTERS_FROZEN  BIT_ULL(59)
-#define GLOBAL_STATUS_LBRS_FROZEN_BIT  58
-#define GLOBAL_STATUS_LBRS_FROZEN  
BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT)
-#define GLOBAL_STATUS_TRACE_TOPAPMIBIT_ULL(55)
+#define INTEL_PMC_IDX_FIXED_BTS(INTEL_PMC_IDX_FIXED + 
16)
+
+#define GLOBAL_STATUS_COND_CHG BIT_ULL(63)
+#define GLOBAL_STATUS_BUFFER_OVF_BIT   62
+#define GLOBAL_STATUS_BUFFER_OVF   
BIT_ULL(GLOBAL_STATUS_BUFFER_OVF_BIT)
+#define GLOBAL_STATUS_UNC_OVF  BIT_ULL(61)
+#define GLOBAL_STATUS_ASIF BIT_ULL(60)
+#define GLOBAL_STATUS_COUNTERS_FROZEN  BIT_ULL(59)
+#define GLOBAL_STATUS_LBRS_FROZEN_BIT  58
+#define GLOBAL_STATUS_LBRS_FROZEN  
BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT)
+#define GLOBAL_STATUS_TRACE_TOPAPMI_BIT55
+#define GLOBAL_STATUS_TRACE_TOPAPMI
BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT)
 
 /*
  * We model guest LBR event tracing as another fixed-mode PMC like BTS.


[tip: perf/core] perf/x86/intel: Generic support for hardware TopDown metrics

2020-08-19 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 7b2c05a15d29d0570a0d21da1e4fd5cbc85cbf13
Gitweb:
https://git.kernel.org/tip/7b2c05a15d29d0570a0d21da1e4fd5cbc85cbf13
Author:Kan Liang 
AuthorDate:Thu, 23 Jul 2020 10:11:11 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 18 Aug 2020 16:34:36 +02:00

perf/x86/intel: Generic support for hardware TopDown metrics

Intro
=

The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf already supports the method.

The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, multiplexing will probably be triggered,
which impacts the accuracy.

To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.

Events
==

The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.

For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.

When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
  MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
  and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
  metrics events. The sharing between multiple users of the same metric
  without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
  TopDown events, which avoids unnecessary writes to the fixed control
  register.
- Disable the PMU when reading the metrics event. The metrics MSR and
  the fixed counter 3 are read separately. The values may be modified by
  an NMI.

None of the four metric events supports sampling. Since they are handled
specially during the event update, a flag, PERF_X86_EVENT_TOPDOWN, is
introduced to indicate this case.

The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.

Groups
==

The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
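
For example, with a perf tool and CPU that expose these events, a TopDown
group can be requested with the slots event leading (illustrative command
only):

	perf stat -e '{slots,topdown-retiring,topdown-bad-spec,topdown-fe-bound,topdown-be-bound}' -a sleep 1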

NMI
==

The METRICS-related register may overflow, in which case bit 48 of the
STATUS register is set. If so, PERF_METRICS and Fixed Counter 3 must be
reset. The patch also updates all active slots and metrics events in the
NMI handler.

The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.

RDPMC
==

RDPMC is temporarily disabled. A later patch will enable it.

Suggested-by: Peter Zijlstra 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.li...@linux.intel.com
---
 arch/x86/events/core.c|  63 ---
 arch/x86/events/intel/core.c  | 124 +++--
 arch/x86/events/perf_event.h  |  37 +-
 arch/x86/include/asm/msr-index.h  |   1 +-
 arch/x86/include/asm/perf_event.h |  47 +++-
 5 files changed, 257 insertions(+), 15 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 8e108ea..53fcf0a 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -76,6 +76,9 @@ u64 x86_perf_event_update(struct perf_event *event)
if (unlikely(!hwc->event_base))
return 0;
 
+   if (unlikely(is_topdown_count(event)) && x86_pmu.update_topdown_event)
+   return x86_pmu.update_topdown_event(event);
+
/*
 * Careful: an NMI might modify the previous event value.
 *
@@ -1031,6 +1034,42 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int 
n, int *assign)
return unsched ? -EINVAL : 0;
 }
 
+static int add_nr_metric_event(struct cpu_hw_events *cpuc,
+  struct perf_event *event)
+{
+   if (is_metric_event(event)) {
+   if (cpuc->n_metric == INTEL_TD_METRIC_NUM)
+   return -EINVAL;
+   cpuc->n_metric

[tip: perf/core] perf/core: Add a new PERF_EV_CAP_SIBLING event capability

2020-08-19 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 9f0c4fa111dc909ca545c45ea20ec84da555ce16
Gitweb:
https://git.kernel.org/tip/9f0c4fa111dc909ca545c45ea20ec84da555ce16
Author:Kan Liang 
AuthorDate:Thu, 23 Jul 2020 10:11:10 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 18 Aug 2020 16:34:36 +02:00

perf/core: Add a new PERF_EV_CAP_SIBLING event capability

Current perf assumes that events in a group are independent. Closing an
event doesn't impact the value of the other events in the same group.
If the closed event is a member, the other events keep running as a
group after the closure. If the closed event is the leader, the other
events keep running as singleton events.

Add PERF_EV_CAP_SIBLING to allow events to indicate they require being
part of a group, and when the leader dies they cannot exist
independently.
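
As a usage sketch (the actual consumer is added in a later patch of this
series; the is_metric_event() check below is an assumption):

	/* At event init time in a PMU driver: tie a metric event to its leader. */
	if (is_metric_event(event))
		event->event_caps |= PERF_EV_CAP_SIBLING;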

Suggested-by: Peter Zijlstra 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200723171117.9918-8-kan.li...@linux.intel.com
---
 include/linux/perf_event.h |  4 -
 kernel/events/core.c   | 38 -
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 04a49cc..6048650 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -576,9 +576,13 @@ typedef void (*perf_overflow_handler_t)(struct perf_event 
*,
  * PERF_EV_CAP_SOFTWARE: Is a software event.
  * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read
  * from any CPU in the package where it is active.
+ * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
+ * cannot be a group leader. If an event with this flag is detached from the
+ * group it is scheduled out and moved into an unrecoverable ERROR state.
  */
 #define PERF_EV_CAP_SOFTWARE   BIT(0)
 #define PERF_EV_CAP_READ_ACTIVE_PKGBIT(1)
+#define PERF_EV_CAP_SIBLINGBIT(2)
 
 #define SWEVENT_HLIST_BITS 8
 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5bfe8e3..57efe3b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2133,8 +2133,24 @@ static inline struct list_head *get_event_list(struct 
perf_event *event)
return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
 }
 
+/*
+ * Events that have PERF_EV_CAP_SIBLING require being part of a group and
+ * cannot exist on their own, schedule them out and move them into the ERROR
+ * state. Also see _perf_event_enable(), it will not be able to recover
+ * this ERROR state.
+ */
+static inline void perf_remove_sibling_event(struct perf_event *event)
+{
+   struct perf_event_context *ctx = event->ctx;
+   struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+   event_sched_out(event, cpuctx, ctx);
+   perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+}
+
 static void perf_group_detach(struct perf_event *event)
 {
+   struct perf_event *leader = event->group_leader;
struct perf_event *sibling, *tmp;
struct perf_event_context *ctx = event->ctx;
 
@@ -2153,7 +2169,7 @@ static void perf_group_detach(struct perf_event *event)
/*
 * If this is a sibling, remove it from its group.
 */
-   if (event->group_leader != event) {
+   if (leader != event) {
list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
goto out;
@@ -2166,6 +2182,9 @@ static void perf_group_detach(struct perf_event *event)
 */
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, 
sibling_list) {
 
+   if (sibling->event_caps & PERF_EV_CAP_SIBLING)
+   perf_remove_sibling_event(sibling);
+
sibling->group_leader = sibling;
list_del_init(&sibling->sibling_list);
 
@@ -2183,10 +2202,10 @@ static void perf_group_detach(struct perf_event *event)
}
 
 out:
-   perf_event__header_size(event->group_leader);
-
-   for_each_sibling_event(tmp, event->group_leader)
+   for_each_sibling_event(tmp, leader)
perf_event__header_size(tmp);
+
+   perf_event__header_size(leader);
 }
 
 static bool is_orphaned_event(struct perf_event *event)
@@ -2979,6 +2998,7 @@ static void _perf_event_enable(struct perf_event *event)
raw_spin_lock_irq(&ctx->lock);
if (event->state >= PERF_EVENT_STATE_INACTIVE ||
event->state <  PERF_EVENT_STATE_ERROR) {
+out:
raw_spin_unlock_irq(&ctx->lock);
return;
}
@@ -2990,8 +3010,16 @@ static void _perf_event_enable(struct perf_event *event)
 * has gone back into error state, as distinct from the task having
 * been scheduled away before the cross-call arrived.
 */
-   if (event->

[tip: perf/core] perf/x86/intel: Introduce the fourth fixed counter

2020-08-19 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 6f7225099d5f3ec3019f380a0da2b456b7796cb0
Gitweb:
https://git.kernel.org/tip/6f7225099d5f3ec3019f380a0da2b456b7796cb0
Author:Kan Liang 
AuthorDate:Thu, 23 Jul 2020 10:11:06 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 18 Aug 2020 16:34:35 +02:00

perf/x86/intel: Introduce the fourth fixed counter

The fourth fixed counter, TOPDOWN.SLOTS, is introduced in Ice Lake to
measure the level 1 TopDown events.

Add MSR address and macros for the new fixed counter, which will be used
in a later patch.

Add comments to explain the event encoding rules for the fixed counters.
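
A worked example of the encoding rules, consistent with the comment added
below and with the event constraint tables elsewhere in this series:

	fixed counter 0, Instr_Retired.Any:      event=0xc0, umask=0x00  (real event code)
	fixed counter 1, CPU_CLK_Unhalted.Core:  event=0x3c, umask=0x00  (real event code)
	fixed counter 2, CPU_CLK_Unhalted.Ref:   event=0x00, umask=0x03  (pseudo: index 2 + 1)
	fixed counter 3, TOPDOWN.SLOTS:          event=0x00, umask=0x04  (pseudo: index 3 + 1)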

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200723171117.9918-4-kan.li...@linux.intel.com
---
 arch/x86/include/asm/perf_event.h | 23 ---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index fd3eba6..fe8110a 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -197,12 +197,24 @@ struct x86_pmu_capability {
  */
 
 /*
- * All 3 fixed-mode PMCs are configured via this single MSR:
+ * All the fixed-mode PMCs are configured via this single MSR:
  */
 #define MSR_ARCH_PERFMON_FIXED_CTR_CTRL0x38d
 
 /*
- * The counts are available in three separate MSRs:
+ * There is no event-code assigned to the fixed-mode PMCs.
+ *
+ * For a fixed-mode PMC, which has an equivalent event on a general-purpose
+ * PMC, the event-code of the equivalent event is used for the fixed-mode PMC,
+ * e.g., Instr_Retired.Any and CPU_CLK_Unhalted.Core.
+ *
+ * For a fixed-mode PMC, which doesn't have an equivalent event, a
+ * pseudo-encoding is used, e.g., CPU_CLK_Unhalted.Ref and TOPDOWN.SLOTS.
+ * The pseudo event-code for a fixed-mode PMC must be 0x00.
+ * The pseudo umask-code is 0xX. The X equals the index of the fixed
+ * counter + 1, e.g., the fixed counter 2 has the pseudo-encoding 0x0300.
+ *
+ * The counts are available in separate MSRs:
  */
 
 /* Instr_Retired.Any: */
@@ -213,11 +225,16 @@ struct x86_pmu_capability {
 #define MSR_ARCH_PERFMON_FIXED_CTR10x30a
 #define INTEL_PMC_IDX_FIXED_CPU_CYCLES (INTEL_PMC_IDX_FIXED + 1)
 
-/* CPU_CLK_Unhalted.Ref: */
+/* CPU_CLK_Unhalted.Ref: event=0x00,umask=0x3 (pseudo-encoding) */
 #define MSR_ARCH_PERFMON_FIXED_CTR20x30b
 #define INTEL_PMC_IDX_FIXED_REF_CYCLES (INTEL_PMC_IDX_FIXED + 2)
 #define INTEL_PMC_MSK_FIXED_REF_CYCLES (1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES)
 
+/* TOPDOWN.SLOTS: event=0x00,umask=0x4 (pseudo-encoding) */
+#define MSR_ARCH_PERFMON_FIXED_CTR30x30c
+#define INTEL_PMC_IDX_FIXED_SLOTS  (INTEL_PMC_IDX_FIXED + 3)
+#define INTEL_PMC_MSK_FIXED_SLOTS  (1ULL << INTEL_PMC_IDX_FIXED_SLOTS)
+
 /*
  * We model BTS tracing as another fixed-mode PMC.
  *


[tip: perf/core] perf/x86/intel: Use switch in intel_pmu_disable/enable_event

2020-08-19 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 58da7dbe6f036fefe504a4bb452afbd39bba73f7
Gitweb:
https://git.kernel.org/tip/58da7dbe6f036fefe504a4bb452afbd39bba73f7
Author:Kan Liang 
AuthorDate:Thu, 23 Jul 2020 10:11:09 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 18 Aug 2020 16:34:36 +02:00

perf/x86/intel: Use switch in intel_pmu_disable/enable_event

Currently, an if-else chain is used in intel_pmu_disable/enable_event to
check the type of an event. It works well, but as more and more types
are added later, e.g., perf metrics, the if-else chain may impair the
readability of the code compared to a switch statement.

There is no harm in using a switch statement to replace the if-else
here. Also, some optimizing compilers may compile a switch statement
into a jump table, which is more efficient than an if-else chain for a
large number of cases. The performance gain may not be observable for
now, because there are only five cases, but the benefits may show up as
more types are added in the future.

Use switch to replace the if-else in the intel_pmu_disable/enable_event.

If the idx is invalid, print a warning.

For the INTEL_PMC_IDX_FIXED_BTS case in intel_pmu_disable_event, there is
no need to check event->attr.precise_ip; simply return from that case.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200723171117.9918-7-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c | 36 +++
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index ac1408f..76eab81 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2180,17 +2180,28 @@ static void intel_pmu_disable_event(struct perf_event 
*event)
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
 
-   if (idx < INTEL_PMC_IDX_FIXED) {
+   switch (idx) {
+   case 0 ... INTEL_PMC_IDX_FIXED - 1:
intel_clear_masks(event, idx);
x86_pmu_disable_event(event);
-   } else if (idx < INTEL_PMC_IDX_FIXED_BTS) {
+   break;
+   case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
intel_clear_masks(event, idx);
intel_pmu_disable_fixed(event);
-   } else if (idx == INTEL_PMC_IDX_FIXED_BTS) {
+   break;
+   case INTEL_PMC_IDX_FIXED_BTS:
intel_pmu_disable_bts();
intel_pmu_drain_bts_buffer();
-   } else if (idx == INTEL_PMC_IDX_FIXED_VLBR)
+   return;
+   case INTEL_PMC_IDX_FIXED_VLBR:
intel_clear_masks(event, idx);
+   break;
+   default:
+   intel_clear_masks(event, idx);
+   pr_warn("Failed to disable the event with invalid index %d\n",
+   idx);
+   return;
+   }
 
/*
 * Needs to be called after x86_pmu_disable_event,
@@ -2262,18 +2273,27 @@ static void intel_pmu_enable_event(struct perf_event 
*event)
if (unlikely(event->attr.precise_ip))
intel_pmu_pebs_enable(event);
 
-   if (idx < INTEL_PMC_IDX_FIXED) {
+   switch (idx) {
+   case 0 ... INTEL_PMC_IDX_FIXED - 1:
intel_set_masks(event, idx);
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
-   } else if (idx < INTEL_PMC_IDX_FIXED_BTS) {
+   break;
+   case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
intel_set_masks(event, idx);
intel_pmu_enable_fixed(event);
-   } else if (idx == INTEL_PMC_IDX_FIXED_BTS) {
+   break;
+   case INTEL_PMC_IDX_FIXED_BTS:
if (!__this_cpu_read(cpu_hw_events.enabled))
return;
intel_pmu_enable_bts(hwc->config);
-   } else if (idx == INTEL_PMC_IDX_FIXED_VLBR)
+   break;
+   case INTEL_PMC_IDX_FIXED_VLBR:
intel_set_masks(event, idx);
+   break;
+   default:
+   pr_warn("Failed to enable the event with invalid index %d\n",
+   idx);
+   }
 }
 
 static void intel_pmu_add_event(struct perf_event *event)


[tip: perf/core] perf/x86/intel: Fix the name of perf METRICS

2020-08-19 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: bbdbde2a415d9f479803266cae6fb0c1a9f6c80e
Gitweb:
https://git.kernel.org/tip/bbdbde2a415d9f479803266cae6fb0c1a9f6c80e
Author:Kan Liang 
AuthorDate:Thu, 23 Jul 2020 10:11:08 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 18 Aug 2020 16:34:35 +02:00

perf/x86/intel: Fix the name of perf METRICS

Bit 15 of the PERF_CAPABILITIES MSR indicates that the perf METRICS
feature is supported. The perf METRICS is not a PEBS feature.

Rename pebs_metrics_available to perf_metrics.

The bit is not used in the current code. It will be used in a later
patch.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200723171117.9918-6-kan.li...@linux.intel.com
---
 arch/x86/events/perf_event.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 7b68ab5..5d453da 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -537,7 +537,7 @@ union perf_capabilities {
 */
u64 full_width_write:1;
u64 pebs_baseline:1;
-   u64 pebs_metrics_available:1;
+   u64 perf_metrics:1;
u64 pebs_output_pt_available:1;
};
u64 capabilities;


[tip: perf/core] perf/x86: Add a macro for RDPMC offset of fixed counters

2020-08-19 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 0e2e45e2ded4988f5641115fd996c75dc32e4be3
Gitweb:
https://git.kernel.org/tip/0e2e45e2ded4988f5641115fd996c75dc32e4be3
Author:Kan Liang 
AuthorDate:Thu, 23 Jul 2020 10:11:12 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 18 Aug 2020 16:34:36 +02:00

perf/x86: Add a macro for RDPMC offset of fixed counters

The RDPMC base offset of the fixed counters is hard-coded. Use a
meaningful name to replace the magic number and improve the readability
of the code.
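
With the macro, the RDPMC index of a fixed counter is formed as follows
(fixed counter 1 used as an example):

	hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | INTEL_PMC_FIXED_RDPMC_BASE
	                      = 1 | (1 << 30) = 0x40000001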

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200723171117.9918-10-kan.li...@linux.intel.com
---
 arch/x86/events/core.c| 3 ++-
 arch/x86/include/asm/perf_event.h | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 53fcf0a..ebf723f 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1151,7 +1151,8 @@ static inline void x86_assign_hw_event(struct perf_event 
*event,
hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 +
(idx - INTEL_PMC_IDX_FIXED);
-   hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | 1<<30;
+   hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) |
+   INTEL_PMC_FIXED_RDPMC_BASE;
break;
 
default:
diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index 000cab7..964ba31 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -196,6 +196,9 @@ struct x86_pmu_capability {
  * Fixed-purpose performance events:
  */
 
+/* RDPMC offset for Fixed PMCs */
+#define INTEL_PMC_FIXED_RDPMC_BASE (1 << 30)
+
 /*
  * All the fixed-mode PMCs are configured via this single MSR:
  */


[tip: perf/core] perf/x86/intel: Support TopDown metrics on Ice Lake

2020-08-19 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 59a854e2f3b90ad2cc7368ae392de40b981ad51d
Gitweb:
https://git.kernel.org/tip/59a854e2f3b90ad2cc7368ae392de40b981ad51d
Author:Kan Liang 
AuthorDate:Thu, 23 Jul 2020 10:11:13 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 18 Aug 2020 16:34:37 +02:00

perf/x86/intel: Support TopDown metrics on Ice Lake

Ice Lake supports the hardware TopDown metrics feature, which can free
up the scarce GP counters.

Update the event constraints for the metric events. The metric counters
do not exist in hardware; they are mapped to a dummy offset. Sharing the
same metric between multiple users without multiplexing is not allowed.

Implement set_topdown_event_period for Ice Lake. The values in
PERF_METRICS MSR are derived from the fixed counter 3. Both registers
should start from zero.

Implement update_topdown_event for Ice Lake. The metric is reported by
multiplying the metric (fraction) with slots. To maintain accurate
measurements, both registers are cleared for each update. The fixed
counter 3 should always be cleared before the PERF_METRICS.
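
A worked example of the calculation (numbers chosen for illustration
only): if the retiring byte of PERF_METRICS reads 0x40 and the slots
count is 1,000,000, then

	retiring slots = slots * metric / 0xff = 1,000,000 * 64 / 255 ≈ 250,980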

Implement td_attr for the new metrics events and the new slots fixed
counter. Make them visible to the perf user tools.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200723171117.9918-11-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c  | 118 +-
 arch/x86/events/perf_event.h  |  13 +++-
 arch/x86/include/asm/msr-index.h  |   2 +-
 arch/x86/include/asm/perf_event.h |   2 +-
 4 files changed, 135 insertions(+)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 4a43668..db83334 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -247,6 +247,10 @@ static struct event_constraint 
intel_icl_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x003c, 1),  /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2),  /* CPU_CLK_UNHALTED.REF */
FIXED_EVENT_CONSTRAINT(0x0400, 3),  /* SLOTS */
+   METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0),
+   METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1),
+   METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2),
+   METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3),
INTEL_EVENT_CONSTRAINT_RANGE(0x03, 0x0a, 0xf),
INTEL_EVENT_CONSTRAINT_RANGE(0x1f, 0x28, 0xf),
INTEL_EVENT_CONSTRAINT(0x32, 0xf),  /* SW_PREFETCH_ACCESS.* */
@@ -309,6 +313,12 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles, 
td_recovery_bubbles,
 EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale,
"4", "2");
 
+EVENT_ATTR_STR(slots,  slots,  "event=0x00,umask=0x4");
+EVENT_ATTR_STR(topdown-retiring,   td_retiring,
"event=0x00,umask=0x80");
+EVENT_ATTR_STR(topdown-bad-spec,   td_bad_spec,
"event=0x00,umask=0x81");
+EVENT_ATTR_STR(topdown-fe-bound,   td_fe_bound,
"event=0x00,umask=0x82");
+EVENT_ATTR_STR(topdown-be-bound,   td_be_bound,
"event=0x00,umask=0x83");
+
 static struct attribute *snb_events_attrs[] = {
EVENT_PTR(td_slots_issued),
EVENT_PTR(td_slots_retired),
@@ -2232,6 +2242,99 @@ static void intel_pmu_del_event(struct perf_event *event)
intel_pmu_pebs_del(event);
 }
 
+static int icl_set_topdown_event_period(struct perf_event *event)
+{
+   struct hw_perf_event *hwc = &event->hw;
+   s64 left = local64_read(&hwc->period_left);
+
+   /*
+* The values in PERF_METRICS MSR are derived from fixed counter 3.
+* Software should start both registers, PERF_METRICS and fixed
+* counter 3, from zero.
+* Clear PERF_METRICS and Fixed counter 3 in initialization.
+* After that, both MSRs will be cleared for each read.
+* Don't need to clear them again.
+*/
+   if (left == x86_pmu.max_period) {
+   wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0);
+   wrmsrl(MSR_PERF_METRICS, 0);
+   local64_set(&hwc->period_left, 0);
+   }
+
+   perf_event_update_userpage(event);
+
+   return 0;
+}
+
+static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx)
+{
+   u32 val;
+
+   /*
+* The metric is reported as an 8bit integer fraction
+* suming up to 0xff.
+* slots-in-metric = (Metric / 0xff) * slots
+*/
+   val = (metric >> ((idx - INTEL_PMC_IDX_METRIC_BASE) * 8)) & 0xff;
+   return  mul_u64_u32_div(slots, val, 0xff);
+}
+
+static void __icl_update_topdown_event(struct perf_event *event,
+  u64 slots, u64 metrics)
+{
+   int idx = event->hw.idx;
+   u64 delta;
+
+   if (is_metric_idx(idx))
+   delta = icl_get_metrics_event_value(metrics, slots, idx);
+   else
+   delta = slots;
+
+   loc

[tip: perf/core] perf/x86/intel: Support per-thread RDPMC TopDown metrics

2020-08-19 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 2cb5383b30d47c446ec7d884cd80f93ffcc31817
Gitweb:
https://git.kernel.org/tip/2cb5383b30d47c446ec7d884cd80f93ffcc31817
Author:Kan Liang 
AuthorDate:Thu, 23 Jul 2020 10:11:14 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 18 Aug 2020 16:34:37 +02:00

perf/x86/intel: Support per-thread RDPMC TopDown metrics

Starting from Ice Lake, the TopDown metrics are directly available as
fixed counters and do not require generic counters. Also, the TopDown
metrics can be collected per thread. Extend the RDPMC usage to support
per-thread TopDown metrics.

The RDPMC index of the PERF_METRICS will be output if RDPMC users ask
for the RDPMC index of the metrics events.

To support per thread RDPMC TopDown, the metrics and slots counters have
to be saved/restored during the context switching.

The last_period and period_left are not used in the counting mode. Use
the fields for saved_metric and saved_slots.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200723171117.9918-12-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   |  5 +-
 arch/x86/events/intel/core.c | 90 ++-
 include/linux/perf_event.h   | 29 +++
 3 files changed, 102 insertions(+), 22 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index ebf723f..0f3d015 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2257,7 +2257,10 @@ static int x86_pmu_event_idx(struct perf_event *event)
if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED))
return 0;
 
-   return hwc->event_base_rdpmc + 1;
+   if (is_metric_idx(hwc->idx))
+   return INTEL_PMC_FIXED_RDPMC_METRICS + 1;
+   else
+   return hwc->event_base_rdpmc + 1;
 }
 
 static ssize_t get_attr_rdpmc(struct device *cdev,
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index db83334..c72e490 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2258,7 +2258,13 @@ static int icl_set_topdown_event_period(struct 
perf_event *event)
if (left == x86_pmu.max_period) {
wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0);
wrmsrl(MSR_PERF_METRICS, 0);
-   local64_set(&hwc->period_left, 0);
+   hwc->saved_slots = 0;
+   hwc->saved_metric = 0;
+   }
+
+   if ((hwc->saved_slots) && is_slots_event(event)) {
+   wrmsrl(MSR_CORE_PERF_FIXED_CTR3, hwc->saved_slots);
+   wrmsrl(MSR_PERF_METRICS, hwc->saved_metric);
}
 
perf_event_update_userpage(event);
@@ -2279,7 +2285,7 @@ static inline u64 icl_get_metrics_event_value(u64 metric, 
u64 slots, int idx)
return  mul_u64_u32_div(slots, val, 0xff);
 }
 
-static void __icl_update_topdown_event(struct perf_event *event,
+static u64 icl_get_topdown_value(struct perf_event *event,
   u64 slots, u64 metrics)
 {
int idx = event->hw.idx;
@@ -2290,7 +2296,50 @@ static void __icl_update_topdown_event(struct perf_event 
*event,
else
delta = slots;
 
-   local64_add(delta, &event->count);
+   return delta;
+}
+
+static void __icl_update_topdown_event(struct perf_event *event,
+  u64 slots, u64 metrics,
+  u64 last_slots, u64 last_metrics)
+{
+   u64 delta, last = 0;
+
+   delta = icl_get_topdown_value(event, slots, metrics);
+   if (last_slots)
+   last = icl_get_topdown_value(event, last_slots, last_metrics);
+
+   /*
+* The 8bit integer fraction of metric may be not accurate,
+* especially when the changes is very small.
+* For example, if only a few bad_spec happens, the fraction
+* may be reduced from 1 to 0. If so, the bad_spec event value
+* will be 0 which is definitely less than the last value.
+* Avoid update event->count for this case.
+*/
+   if (delta > last) {
+   delta -= last;
+   local64_add(delta, &event->count);
+   }
+}
+
+static void update_saved_topdown_regs(struct perf_event *event,
+ u64 slots, u64 metrics)
+{
+   struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+   struct perf_event *other;
+   int idx;
+
+   event->hw.saved_slots = slots;
+   event->hw.saved_metric = metrics;
+
+   for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) 
{
+   if (!is_topdown_idx(idx))
+   continue;
+   other = cpuc->events[idx];
+   other->hw.saved_slots = slots;
+   other->hw.saved_metric = metrics;
+   }
 }
 
 /*
@@ -2304,6 +2353,7 @@ static u64 icl_update_topdown_event(struct perf_event 
*event)
st

[tip: x86/urgent] x86/fpu/xstate: Fix an xstate size check warning with architectural LBRs

2020-08-06 Thread tip-bot2 for Kan Liang
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 76d10256a97a7cab72b123d54b766a3c17da658c
Gitweb:
https://git.kernel.org/tip/76d10256a97a7cab72b123d54b766a3c17da658c
Author:Kan Liang 
AuthorDate:Mon, 20 Jul 2020 06:50:51 -07:00
Committer: Ingo Molnar 
CommitterDate: Fri, 07 Aug 2020 01:32:00 +02:00

x86/fpu/xstate: Fix an xstate size check warning with architectural LBRs

An xstate size check warning is triggered on machines which support
Architectural LBRs.

XSAVE consistency problem, dumping leaves
WARNING: CPU: 0 PID: 0 at arch/x86/kernel/fpu/xstate.c:649 
fpu__init_system_xstate+0x4d4/0xd0e
Modules linked in:
CPU: 0 PID: 0 Comm: swapper Not tainted intel-arch_lbr+
RIP: 0010:fpu__init_system_xstate+0x4d4/0xd0e

The xstate size check routine, init_xstate_size(), compares the size
retrieved from the hardware with the size of task->fpu, which is
calculated by the software.

The size from the hardware is the total size of the enabled xstates in
XCR0 | IA32_XSS. Architectural LBR state is a dynamic supervisor
feature, which sets the corresponding bit in the IA32_XSS at boot time.
The size from the hardware includes the size of the Architectural LBR
state.

However, a dynamic supervisor feature doesn't allocate a buffer in the
task->fpu. The size of task->fpu doesn't include the size of the
Architectural LBR state. The mismatch will trigger the warning.

Three options as below were considered to fix the issue:

- Correct the size from the hardware by subtracting the size of the
  dynamic supervisor features.
  The purpose of the check is to compare the size the CPU reports with
  the size of the XSAVE buffer calculated by the software. If the
  software mucks with the number from the hardware, the check loses its
  value.
  This is not a good option.

- Prevent the hardware from counting the size of the dynamic supervisor
  feature by temporarily removing the corresponding bits in IA32_XSS.
  Two extra MSR writes are required to flip the IA32_XSS. The option is
  not pretty, but it is workable. The check is only called once at early
  boot time, so synchronization and context switching are not a concern.
  This option is implemented here.

- Remove the check entirely, because the check hasn't found any real
  problems. This option may be an alternative to option 2.
  This option is not implemented here.

Add a new function, get_xsaves_size_no_dynamic(), which retrieves the
total size without the dynamic supervisor features from the hardware.
The size will be used to compare with the size of task->fpu.

Fixes: f0dccc9da4c0 ("x86/fpu/xstate: Support dynamic supervisor feature for 
LBR")
Reported-by: Chang S. Bae 
Signed-off-by: Kan Liang 
Signed-off-by: Ingo Molnar 
Reviewed-by: Dave Hansen 
Link: 
https://lore.kernel.org/r/1595253051-75374-1-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/kernel/fpu/xstate.c | 33 -
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index be2a68a..6073e34 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -611,6 +611,10 @@ static void check_xstate_against_struct(int nr)
  * This essentially double-checks what the cpu told us about
  * how large the XSAVE buffer needs to be.  We are recalculating
  * it to be safe.
+ *
+ * Dynamic XSAVE features allocate their own buffers and are not
+ * covered by these checks. Only the size of the buffer for task->fpu
+ * is checked here.
  */
 static void do_extra_xstate_size_checks(void)
 {
@@ -673,6 +677,33 @@ static unsigned int __init get_xsaves_size(void)
return ebx;
 }
 
+/*
+ * Get the total size of the enabled xstates without the dynamic supervisor
+ * features.
+ */
+static unsigned int __init get_xsaves_size_no_dynamic(void)
+{
+   u64 mask = xfeatures_mask_dynamic();
+   unsigned int size;
+
+   if (!mask)
+   return get_xsaves_size();
+
+   /* Disable dynamic features. */
+   wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
+
+   /*
+* Ask the hardware what size is required of the buffer.
+* This is the size required for the task->fpu buffer.
+*/
+   size = get_xsaves_size();
+
+   /* Re-enable dynamic features so XSAVES will work on them again. */
+   wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
+
+   return size;
+}
+
 static unsigned int __init get_xsave_size(void)
 {
unsigned int eax, ebx, ecx, edx;
@@ -710,7 +741,7 @@ static int __init init_xstate_size(void)
xsave_size = get_xsave_size();
 
if (boot_cpu_has(X86_FEATURE_XSAVES))
-   possible_xstate_size = get_xsaves_size();
+   possible_xstate_size = get_xsaves_size_no_dynamic();
else
possible_xstate_size = xsave_size;
 


[tip: x86/urgent] x86/fpu/xstate: Fix an xstate size check warning with architectural LBRs

2020-08-06 Thread tip-bot2 for Kan Liang
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: ec8602b79088b0f3556d9c7a3a05313bc4e4a96f
Gitweb:
https://git.kernel.org/tip/ec8602b79088b0f3556d9c7a3a05313bc4e4a96f
Author:Kan Liang 
AuthorDate:Mon, 20 Jul 2020 06:50:51 -07:00
Committer: Ingo Molnar 
CommitterDate: Thu, 06 Aug 2020 17:11:59 +02:00

x86/fpu/xstate: Fix an xstate size check warning with architectural LBRs

An xstate size check warning is triggered on machines which support
Architectural LBRs.

XSAVE consistency problem, dumping leaves
WARNING: CPU: 0 PID: 0 at arch/x86/kernel/fpu/xstate.c:649 
fpu__init_system_xstate+0x4d4/0xd0e
Modules linked in:
CPU: 0 PID: 0 Comm: swapper Not tainted intel-arch_lbr+
RIP: 0010:fpu__init_system_xstate+0x4d4/0xd0e

The xstate size check routine, init_xstate_size(), compares the size
retrieved from the hardware with the size of task->fpu, which is
calculated by the software.

The size from the hardware is the total size of the enabled xstates in
XCR0 | IA32_XSS. Architectural LBR state is a dynamic supervisor
feature, which sets the corresponding bit in the IA32_XSS at boot time.
The size from the hardware includes the size of the Architectural LBR
state.

However, a dynamic supervisor feature doesn't allocate a buffer in the
task->fpu. The size of task->fpu doesn't include the size of the
Architectural LBR state. The mismatch will trigger the warning.

Three options as below were considered to fix the issue:

- Correct the size from the hardware by subtracting the size of the
  dynamic supervisor features.
  The purpose of the check is to compare the size the CPU reports with
  the size of the XSAVE buffer calculated by the software. If the
  software mucks with the number from the hardware, the check loses its
  value.
  This is not a good option.

- Prevent the hardware from counting the size of the dynamic supervisor
  feature by temporarily removing the corresponding bits in IA32_XSS.
  Two extra MSR writes are required to flip the IA32_XSS. The option is
  not pretty, but it is workable. The check is only called once at early
  boot time, so synchronization and context switching are not a concern.
  This option is implemented here.

- Remove the check entirely, because the check hasn't found any real
  problems. This option may be an alternative to option 2.
  This option is not implemented here.

Add a new function, get_xsaves_size_no_dynamic(), which retrieves the
total size without the dynamic supervisor features from the hardware.
The size will be used to compare with the size of task->fpu.

Fixes: f0dccc9da4c0 ("x86/fpu/xstate: Support dynamic supervisor feature for 
LBR")
Reported-by: Chang S. Bae 
Signed-off-by: Kan Liang 
Signed-off-by: Ingo Molnar 
Reviewed-by: Dave Hansen 
Link: 
https://lore.kernel.org/r/1595253051-75374-1-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/kernel/fpu/xstate.c | 33 -
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index be2a68a..6073e34 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -611,6 +611,10 @@ static void check_xstate_against_struct(int nr)
  * This essentially double-checks what the cpu told us about
  * how large the XSAVE buffer needs to be.  We are recalculating
  * it to be safe.
+ *
+ * Dynamic XSAVE features allocate their own buffers and are not
+ * covered by these checks. Only the size of the buffer for task->fpu
+ * is checked here.
  */
 static void do_extra_xstate_size_checks(void)
 {
@@ -673,6 +677,33 @@ static unsigned int __init get_xsaves_size(void)
return ebx;
 }
 
+/*
+ * Get the total size of the enabled xstates without the dynamic supervisor
+ * features.
+ */
+static unsigned int __init get_xsaves_size_no_dynamic(void)
+{
+   u64 mask = xfeatures_mask_dynamic();
+   unsigned int size;
+
+   if (!mask)
+   return get_xsaves_size();
+
+   /* Disable dynamic features. */
+   wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
+
+   /*
+* Ask the hardware what size is required of the buffer.
+* This is the size required for the task->fpu buffer.
+*/
+   size = get_xsaves_size();
+
+   /* Re-enable dynamic features so XSAVES will work on them again. */
+   wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
+
+   return size;
+}
+
 static unsigned int __init get_xsave_size(void)
 {
unsigned int eax, ebx, ecx, edx;
@@ -710,7 +741,7 @@ static int __init init_xstate_size(void)
xsave_size = get_xsave_size();
 
if (boot_cpu_has(X86_FEATURE_XSAVES))
-   possible_xstate_size = get_xsaves_size();
+   possible_xstate_size = get_xsaves_size_no_dynamic();
else
possible_xstate_size = xsave_size;
 


[tip: perf/core] x86/fpu: Use proper mask to replace full instruction mask

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: a063bf249b9f8d8004f282031781322c1b527d13
Gitweb:
https://git.kernel.org/tip/a063bf249b9f8d8004f282031781322c1b527d13
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:25 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:56 +02:00

x86/fpu: Use proper mask to replace full instruction mask

When saving xstate to a kernel/user XSAVE area with the XSAVE family of
instructions, the current code applies the 'full' instruction mask (-1),
which tries to XSAVE all possible features. This method relies on
hardware to trim 'all possible' down to what is enabled in the
hardware. The code works well for now. However, there will be a problem
if some features are enabled in hardware but are not suitable to be
saved into all kernel XSAVE buffers, like task->fpu, due to performance
considerations.

One such example is the Last Branch Records (LBR) state. The LBR state
only contains valuable information when LBR is explicitly enabled by
the perf subsystem, and the size of an LBR state is large (808 bytes
for now). To avoid both CPU overhead and space overhead at each context
switch, the LBR state should not be saved into task->fpu like other
state components. It should be saved/restored on demand when LBR is
enabled in the perf subsystem. Current copy_xregs_to_* will trigger a
buffer overflow for such cases.

Three sites use the '-1' instruction mask which must be updated.

Two are saving/restoring the xstate to/from a kernel-allocated XSAVE
buffer and can use 'xfeatures_mask_all', which will save/restore all of
the features present in a normal task FPU buffer.

The last one saves the register state directly to a user buffer. It could
also use 'xfeatures_mask_all'. Just as it was with the '-1' argument,
any supervisor states in the mask will be filtered out by the hardware
and not saved to the buffer.  But, to be more explicit about what is
expected to be saved, use xfeatures_mask_user() for the instruction
mask.

KVM includes the header file fpu/internal.h. To avoid an 'undefined
xfeatures_mask_all' compile error, move copy_fpregs_to_fpstate() to
fpu/core.c and export it, because:
- The xfeatures_mask_all is indirectly used via copy_fpregs_to_fpstate()
  by KVM. The function which is directly used by other modules should be
  exported.
- The copy_fpregs_to_fpstate() is a function, while xfeatures_mask_all
  is a variable for the "internal" FPU state. It's safer to export a
  function than a variable, which may be implicitly changed by others.
- The copy_fpregs_to_fpstate() is a big function with many checks. The
  removal of the inline keyword should not impact the performance.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Dave Hansen 
Link: 
https://lkml.kernel.org/r/1593780569-62993-20-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/include/asm/fpu/internal.h | 47 
 arch/x86/kernel/fpu/core.c  | 39 +++-
 2 files changed, 46 insertions(+), 40 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 42159f4..d3724dc 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -274,7 +274,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
  */
 static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
 {
-   u64 mask = -1;
+   u64 mask = xfeatures_mask_all;
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
@@ -320,7 +320,7 @@ static inline void copy_kernel_to_xregs_booting(struct 
xregs_state *xstate)
  */
 static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
 {
-   u64 mask = -1;
+   u64 mask = xfeatures_mask_all;
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
@@ -356,6 +356,9 @@ static inline void copy_kernel_to_xregs(struct xregs_state 
*xstate, u64 mask)
  */
 static inline int copy_xregs_to_user(struct xregs_state __user *buf)
 {
+   u64 mask = xfeatures_mask_user();
+   u32 lmask = mask;
+   u32 hmask = mask >> 32;
int err;
 
/*
@@ -367,7 +370,7 @@ static inline int copy_xregs_to_user(struct xregs_state 
__user *buf)
return -EFAULT;
 
stac();
-   XSTATE_OP(XSAVE, buf, -1, -1, err);
+   XSTATE_OP(XSAVE, buf, lmask, hmask, err);
clac();
 
return err;
@@ -408,43 +411,7 @@ static inline int copy_kernel_to_xregs_err(struct 
xregs_state *xstate, u64 mask)
return err;
 }
 
-/*
- * These must be called with preempt disabled. Returns
- * 'true' if the FPU state is still intact and we can
- * keep registers active.
- *
- * The legacy FNSAVE instruction cleared all FPU state
- * unconditionally, so registers are essentially destroyed.
- * Modern FPU state can be kept in registers, if there are
- * no pending FP exceptions.
- 

[tip: perf/core] perf/x86/intel/lbr: Mark the {rd,wr}lbr_{to,from} wrappers __always_inline

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 020d91e5f32da4f4b929b3a6e680135fd526107c
Gitweb:
https://git.kernel.org/tip/020d91e5f32da4f4b929b3a6e680135fd526107c
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:17 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:53 +02:00

perf/x86/intel/lbr: Mark the {rd,wr}lbr_{to,from} wrappers __always_inline

The {rd,wr}lbr_{to,from} wrappers are invoked in hot paths, e.g. context
switch and NMI handler. They should always be inlined to achieve better
performance. However, CONFIG_OPTIMIZE_INLINING allows the compiler to
uninline functions marked 'inline'.

Mark the {rd,wr}lbr_{to,from} wrappers as __always_inline to force
inline the wrappers.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1593780569-62993-12-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/lbr.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index b8baaf1..21f4f07 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -332,18 +332,18 @@ static u64 lbr_from_signext_quirk_rd(u64 val)
return val;
 }
 
-static inline void wrlbr_from(unsigned int idx, u64 val)
+static __always_inline void wrlbr_from(unsigned int idx, u64 val)
 {
val = lbr_from_signext_quirk_wr(val);
wrmsrl(x86_pmu.lbr_from + idx, val);
 }
 
-static inline void wrlbr_to(unsigned int idx, u64 val)
+static __always_inline void wrlbr_to(unsigned int idx, u64 val)
 {
wrmsrl(x86_pmu.lbr_to + idx, val);
 }
 
-static inline u64 rdlbr_from(unsigned int idx)
+static __always_inline u64 rdlbr_from(unsigned int idx)
 {
u64 val;
 
@@ -352,7 +352,7 @@ static inline u64 rdlbr_from(unsigned int idx)
return lbr_from_signext_quirk_rd(val);
 }
 
-static inline u64 rdlbr_to(unsigned int idx)
+static __always_inline u64 rdlbr_to(unsigned int idx)
 {
u64 val;
 


[tip: perf/core] x86/fpu/xstate: Support dynamic supervisor feature for LBR

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: f0dccc9da4c0fda049e99326f85db8c242fd781f
Gitweb:
https://git.kernel.org/tip/f0dccc9da4c0fda049e99326f85db8c242fd781f
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:26 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:56 +02:00

x86/fpu/xstate: Support dynamic supervisor feature for LBR

Last Branch Records (LBR) registers are used to log taken branches and
other control flows. In perf with call stack mode, LBR information is
used to reconstruct a call stack. To get the complete call stack, perf
has to save/restore all LBR registers during a context switch. Due to
the large number of LBR registers (e.g., the current platform has 96 LBR
registers), this process causes high CPU overhead. To reduce
the CPU overhead during a context switch, an LBR state component that
contains all the LBR related registers is introduced in hardware. All
LBR registers can be saved/restored together using one XSAVES/XRSTORS
instruction.

However, the kernel should not save/restore the LBR state component at
each context switch, like other state components, because of the
following unique features of LBR:
- The LBR state component only contains valuable information when LBR
  is enabled in the perf subsystem, but for most of the time, LBR is
  disabled.
- The size of the LBR state component is huge. For the current
  platform, it's 808 bytes.
If the kernel saves/restores the LBR state at each context switch, for
most of the time, it is just a waste of space and cycles.

To efficiently support the LBR state component, it is desired to have:
- only context-switch the LBR when the LBR feature is enabled in perf.
- only allocate an LBR-specific XSAVE buffer on demand.
  (Besides the LBR state, a legacy region and an XSAVE header have to be
   included in the buffer as well. There is a total of (808+576) byte
   overhead for the LBR-specific XSAVE buffer. The overhead is only
   incurred while perf is actively using LBRs, so on average it is still
   a space saving compared with paying the constant 808 bytes of overhead
   for every task, all the time, on systems that support architectural
   LBR.)
- be able to use XSAVES/XRSTORS for accessing LBR at run time.
  However, the IA32_XSS should not be adjusted at run time.
  (The XCR0 | IA32_XSS are used to determine the requested-feature
  bitmap (RFBM) of XSAVES.)

A solution, called dynamic supervisor feature, is introduced to address
this issue, which
- does not allocate a buffer in each task->fpu;
- does not save/restore a state component at each context switch;
- sets the bit corresponding to the dynamic supervisor feature in
  IA32_XSS at boot time, and avoids setting it at run time.
- dynamically allocates a specific buffer for a state component
  on demand, e.g. only allocates LBR-specific XSAVE buffer when LBR is
  enabled in perf. (Note: The buffer has to include the LBR state
  component, a legacy region and a XSAVE header space.)
  (Implemented in a later patch)
- saves/restores a state component on demand, e.g. manually invokes
  the XSAVES/XRSTORS instruction to save/restore the LBR state
  to/from the buffer when perf is active and a call stack is required.
  (Implemented in a later patch)

A new mask XFEATURE_MASK_DYNAMIC and a helper xfeatures_mask_dynamic()
are introduced to indicate the dynamic supervisor feature. On systems
which support Architecture LBR, LBR is the only dynamic supervisor
feature for now. On previous systems, no dynamic supervisor feature is
available.
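
A minimal sketch of the new mask and helper, consistent with the
description above (the xstate.h hunk is not shown in the truncated diff
below); LBR is assumed to be the only dynamic supervisor feature:

	/* Sketch: only systems with Architectural LBR have a dynamic
	 * supervisor feature for now. */
	#define XFEATURE_MASK_DYNAMIC	(XFEATURE_MASK_LBR)

	static inline u64 xfeatures_mask_dynamic(void)
	{
		if (!boot_cpu_has(X86_FEATURE_ARCH_LBR))
			return XFEATURE_MASK_DYNAMIC & ~XFEATURE_MASK_LBR;

		return XFEATURE_MASK_DYNAMIC;
	}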

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Dave Hansen 
Link: 
https://lkml.kernel.org/r/1593780569-62993-21-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/include/asm/fpu/types.h  |  7 +++-
 arch/x86/include/asm/fpu/xstate.h | 30 ++-
 arch/x86/kernel/fpu/xstate.c  | 15 ++-
 3 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index f098f6c..132e9cc 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -114,6 +114,12 @@ enum xfeature {
XFEATURE_Hi16_ZMM,
XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
XFEATURE_PKRU,
+   XFEATURE_RSRVD_COMP_10,
+   XFEATURE_RSRVD_COMP_11,
+   XFEATURE_RSRVD_COMP_12,
+   XFEATURE_RSRVD_COMP_13,
+   XFEATURE_RSRVD_COMP_14,
+   XFEATURE_LBR,
 
XFEATURE_MAX,
 };
@@ -128,6 +134,7 @@ enum xfeature {
 #define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM)
 #define XFEATURE_MASK_PT   (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR)
 #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
+#define XFEATURE_MASK_LBR  (1 << XFEATURE_LBR)
 
 #define XFEATURE_MASK_FPSSE(XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
 #define XFEATURE_MASK_AVX512  

[tip: perf/core] perf/x86/intel/lbr: Create kmem_cache for the LBR context data

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 33cad284497cf40f55ad6029c06011de3538ebed
Gitweb:
https://git.kernel.org/tip/33cad284497cf40f55ad6029c06011de3538ebed
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:23 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:55 +02:00

perf/x86/intel/lbr: Create kmem_cache for the LBR context data

A new kmem_cache method is introduced to allocate the PMU specific data
task_ctx_data, which requires the PMU specific code to create a
kmem_cache.

Currently, the task_ctx_data is only used by the Intel LBR call stack
feature, which was introduced with Haswell. The kmem_cache should only
be created for Haswell and later platforms. There is no alignment
requirement for the existing platforms.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1593780569-62993-18-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/lbr.c | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index e4e249a..e784c1d 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1531,9 +1531,17 @@ void __init intel_pmu_lbr_init_snb(void)
 */
 }
 
+static inline struct kmem_cache *
+create_lbr_kmem_cache(size_t size, size_t align)
+{
+   return kmem_cache_create("x86_lbr", size, align, 0, NULL);
+}
+
 /* haswell */
 void intel_pmu_lbr_init_hsw(void)
 {
+   size_t size = sizeof(struct x86_perf_task_context);
+
x86_pmu.lbr_nr   = 16;
x86_pmu.lbr_tos  = MSR_LBR_TOS;
x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
@@ -1542,6 +1550,8 @@ void intel_pmu_lbr_init_hsw(void)
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
 
+   x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+
if (lbr_from_signext_quirk_needed())
static_branch_enable(&lbr_from_quirk_key);
 }
@@ -1549,6 +1559,8 @@ void intel_pmu_lbr_init_hsw(void)
 /* skylake */
 __init void intel_pmu_lbr_init_skl(void)
 {
+   size_t size = sizeof(struct x86_perf_task_context);
+
x86_pmu.lbr_nr   = 32;
x86_pmu.lbr_tos  = MSR_LBR_TOS;
x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
@@ -1558,6 +1570,8 @@ __init void intel_pmu_lbr_init_skl(void)
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
 
+   x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+
/*
 * SW branch filter usage:
 * - support syscall, sysret capture.
@@ -1631,6 +1645,7 @@ void __init intel_pmu_arch_lbr_init(void)
union cpuid28_ebx ebx;
union cpuid28_ecx ecx;
unsigned int unused_edx;
+   size_t size;
u64 lbr_nr;
 
/* Arch LBR Capabilities */
@@ -1655,8 +1670,10 @@ void __init intel_pmu_arch_lbr_init(void)
x86_pmu.lbr_br_type = ecx.split.lbr_br_type;
x86_pmu.lbr_nr = lbr_nr;
 
-   x86_get_pmu()->task_ctx_size = sizeof(struct 
x86_perf_task_context_arch_lbr) +
-  lbr_nr * sizeof(struct lbr_entry);
+   size = sizeof(struct x86_perf_task_context_arch_lbr) +
+  lbr_nr * sizeof(struct lbr_entry);
+   x86_get_pmu()->task_ctx_size = size;
+   x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
 
x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0;
x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0;


[tip: perf/core] perf/x86/intel/lbr: Support XSAVES for arch LBR read

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: c085fb8774671e83f6199a8e838fbc0e57094029
Gitweb:
https://git.kernel.org/tip/c085fb8774671e83f6199a8e838fbc0e57094029
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:29 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:57 +02:00

perf/x86/intel/lbr: Support XSAVES for arch LBR read

Reading the LBR registers in a perf NMI handler for a non-PEBS event
causes a high overhead because the number of LBR registers is huge.
To reduce the overhead, use the XSAVES instruction instead of reading
the LBR registers one by one.

The XSAVES buffer used for the LBR read has to be per-CPU because
lbr_read() is invoked from the NMI handler. The existing task_ctx_data
buffer cannot be used because it is per-task and is only allocated for
the LBR call stack mode. A new lbr_xsave pointer is introduced in the
cpu_hw_events as an XSAVES buffer for the LBR read.

The XSAVES buffer should be allocated only when LBR is used by a
non-PEBS event on the CPU, because the total size of the lbr_xsave is
not small (~1.4KB: the 808-byte LBR state plus the 576 bytes of legacy
region and XSAVE header).

The XSAVES buffer is allocated when a non-PEBS event is added, but it
is lazily released in x86_release_hardware() when perf releases the
entire PMU hardware resource, because perf may schedule the event
frequently, e.g. under a high context-switch rate. The lazy release
reduces the overhead of repeatedly allocating and freeing the buffer.

If the lbr_xsave allocation fails, fall back to the normal Arch LBR
lbr_read().

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Dave Hansen 
Link: 
https://lkml.kernel.org/r/1593780569-62993-24-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/core.c   |  1 +-
 arch/x86/events/intel/lbr.c  | 40 ++-
 arch/x86/events/perf_event.h |  7 ++-
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 6b1228a..1cbf57d 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -358,6 +358,7 @@ void x86_release_hardware(void)
if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
release_pmc_hardware();
release_ds_buffers();
+   release_lbr_buffers();
mutex_unlock(&pmc_reserve_mutex);
}
 }
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index cb1a049..63f58bd 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -658,6 +658,7 @@ static inline bool branch_user_callstack(unsigned br_sel)
 
 void intel_pmu_lbr_add(struct perf_event *event)
 {
+   struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
if (!x86_pmu.lbr_nr)
@@ -695,6 +696,29 @@ void intel_pmu_lbr_add(struct perf_event *event)
perf_sched_cb_inc(event->ctx->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
intel_pmu_lbr_reset();
+
+   if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
+   kmem_cache && !cpuc->lbr_xsave &&
+   (cpuc->lbr_users != cpuc->lbr_pebs_users))
+   cpuc->lbr_xsave = kmem_cache_alloc(kmem_cache, GFP_KERNEL);
+}
+
+void release_lbr_buffers(void)
+{
+   struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache;
+   struct cpu_hw_events *cpuc;
+   int cpu;
+
+   if (!static_cpu_has(X86_FEATURE_ARCH_LBR))
+   return;
+
+   for_each_possible_cpu(cpu) {
+   cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
+   if (kmem_cache && cpuc->lbr_xsave) {
+   kmem_cache_free(kmem_cache, cpuc->lbr_xsave);
+   cpuc->lbr_xsave = NULL;
+   }
+   }
 }
 
 void intel_pmu_lbr_del(struct perf_event *event)
@@ -945,6 +969,19 @@ static void intel_pmu_arch_lbr_read(struct cpu_hw_events 
*cpuc)
intel_pmu_store_lbr(cpuc, NULL);
 }
 
+static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc)
+{
+   struct x86_perf_task_context_arch_lbr_xsave *xsave = cpuc->lbr_xsave;
+
+   if (!xsave) {
+   intel_pmu_store_lbr(cpuc, NULL);
+   return;
+   }
+   copy_dynamic_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR);
+
+   intel_pmu_store_lbr(cpuc, xsave->lbr.entries);
+}
+
 void intel_pmu_lbr_read(void)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1767,14 +1804,15 @@ void __init intel_pmu_arch_lbr_init(void)
x86_pmu.lbr_ctl_map = NULL;
 
x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset;
-   x86_pmu.lbr_read = intel_pmu_arch_lbr_read;
if (arch_lbr_xsave) {
x86_pmu.lbr_save = intel_pmu_arch_lbr_xsaves;
x86_pmu.lbr_restore = intel_pmu_arch_lbr_xrstors;
+   x86_pmu.lbr_read = intel_pmu_arch_lbr_read_xsave;
pr_cont("XSAVE 

[tip: perf/core] perf/x86/intel/lbr: Support XSAVES/XRSTORS for LBR context switch

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: ce711ea3cab9ad325d849792d442848e553095b8
Gitweb:
https://git.kernel.org/tip/ce711ea3cab9ad325d849792d442848e553095b8
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:28 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:56 +02:00

perf/x86/intel/lbr: Support XSAVES/XRSTORS for LBR context switch

In the LBR call stack mode, LBR information is used to reconstruct a
call stack. To get the complete call stack, perf has to save/restore
all LBR registers during a context switch. Due to the large number of
LBR registers, this process causes high CPU overhead. To reduce the
CPU overhead during a context switch, use the XSAVES/XRSTORS
instructions.

Every XSAVE area must follow a canonical format: the legacy region, an
XSAVE header and the extended region. Although the LBR information is
only kept in the extended region, a space for the legacy region and
XSAVE header is still required. Add a new dedicated structure for LBR
XSAVES support.
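
A minimal sketch of such a dedicated structure, assuming the field
names used elsewhere in this series (opt, xsave, lbr) and that the new
alignment macro is called XSAVE_ALIGNMENT; the perf_event.h hunk with
the real definition is not shown in the truncated diff below:

	struct x86_perf_task_context_arch_lbr_xsave {
		struct x86_perf_task_context_opt	opt;

		union {
			struct xregs_state		xsave;
			struct {
				struct fxregs_state	i387;
				struct xstate_header	header;
				struct arch_lbr_state	lbr;
			} __attribute__ ((packed, aligned (XSAVE_ALIGNMENT)));
		};
	};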

Before enabling XSAVES support, the size of the LBR state has to be
sanity checked, because:
- the size of the software structure is calculated from the max number
of the LBR depth, which is enumerated by the CPUID leaf for Arch LBR.
The size of the LBR state is enumerated by the CPUID leaf for XSAVE
support of Arch LBR. If the values from the two CPUID leaves are not
consistent, it may trigger a buffer overflow. For example, a hypervisor
may unconsciously set inconsistent values for the two emulated CPUID.
- unlike other state components, the size of an LBR state depends on the
max number of LBRs, which may vary from generation to generation.

Expose the function xfeature_size() for the sanity check.
The LBR XSAVES support will be disabled if the size of the LBR state
enumerated by CPUID doesn't match the size of the software structure.
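
A sketch of that check, with a hypothetical helper name (the actual
check sits in the truncated lbr.c hunk below):

	/* Sketch: disable LBR XSAVES support when the LBR state size
	 * enumerated by CPUID doesn't match the software structure. */
	static bool arch_lbr_xsave_usable(void)
	{
		if (!boot_cpu_has(X86_FEATURE_XSAVES))
			return false;

		return xfeature_size(XFEATURE_LBR) == get_lbr_state_size();
	}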

The XSAVE instruction requires 64-byte alignment for state buffers. A
new macro is added to reflect the alignment requirement. A 64-byte
aligned kmem_cache is created for architecture LBR.

Currently, the structure for each state component is maintained in
fpu/types.h. The structure for the new LBR state component should be
maintained in the same place. Move structure lbr_entry to fpu/types.h as
well for broader sharing.

Add dedicated lbr_save/lbr_restore functions for LBR XSAVES support,
which invokes the corresponding xstate helpers to XSAVES/XRSTORS LBR
information at the context switch when the call stack mode is enabled.
Since the XSAVES/XRSTORS instructions will eventually be invoked, the
dedicated functions are named with an '_xsaves'/'_xrstors' suffix.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Dave Hansen 
Link: 
https://lkml.kernel.org/r/1593780569-62993-23-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/lbr.c   | 79 --
 arch/x86/events/perf_event.h  | 21 -
 arch/x86/include/asm/fpu/types.h  | 20 -
 arch/x86/include/asm/fpu/xstate.h |  3 +-
 arch/x86/include/asm/perf_event.h |  4 +--
 arch/x86/kernel/fpu/xstate.c  |  2 +-
 6 files changed, 119 insertions(+), 10 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 3ad5289..cb1a049 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -483,6 +483,17 @@ static void intel_pmu_arch_lbr_restore(void *ctx)
}
 }
 
+/*
+ * Restore the Architecture LBR state from the xsave area in the perf
+ * context data for the task via the XRSTORS instruction.
+ */
+static void intel_pmu_arch_lbr_xrstors(void *ctx)
+{
+   struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;
+
+   copy_kernel_to_dynamic_supervisor(&task_ctx->xsave, XFEATURE_MASK_LBR);
+}
+
 static __always_inline bool lbr_is_reset_in_cstate(void *ctx)
 {
if (static_cpu_has(X86_FEATURE_ARCH_LBR))
@@ -557,6 +568,17 @@ static void intel_pmu_arch_lbr_save(void *ctx)
entries[x86_pmu.lbr_nr - 1].from = 0;
 }
 
+/*
+ * Save the Architecture LBR state to the xsave area in the perf
+ * context data for the task via the XSAVES instruction.
+ */
+static void intel_pmu_arch_lbr_xsaves(void *ctx)
+{
+   struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;
+
+   copy_dynamic_supervisor_to_kernel(&task_ctx->xsave, XFEATURE_MASK_LBR);
+}
+
 static void __intel_pmu_lbr_save(void *ctx)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1639,12 +1661,40 @@ void intel_pmu_lbr_init_knl(void)
x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS;
 }
 
+/*
+ * LBR state size is variable based on the max number of registers.
+ * This calculates the expected state size, which should match
+ * what the hardware enumerates for the size of XFEATURE_LBR.
+ */
+static inline unsigned int get_lbr_state_size(void)
+{
+   return sizeof(struct arch_lbr_state) +

[tip: perf/core] perf/core: Use kmem_cache to allocate the PMU specific data

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 217c2a633ebb36f1cc6d249f4ef2e4a809d46818
Gitweb:
https://git.kernel.org/tip/217c2a633ebb36f1cc6d249f4ef2e4a809d46818
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:22 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:55 +02:00

perf/core: Use kmem_cache to allocate the PMU specific data

Currently, the PMU specific data task_ctx_data is allocated by the
function kzalloc() in the perf generic code. Since there is no specific
alignment requirement for the task_ctx_data yet, the method works well
for now. However, there will be a problem once a specific alignment
requirement is introduced in future features, e.g., the Architecture LBR
XSAVE feature requires 64-byte alignment. If the specific alignment
requirement is not fulfilled, the XSAVE family of instructions will fail
to save/restore the xstate to/from the task_ctx_data.

The function kzalloc() itself only guarantees a natural alignment. A
new method to allocate the task_ctx_data has to be introduced, which
has to meet the requirements as below:
- must be a generic method that can be used by different architectures,
  because the allocation of the task_ctx_data is implemented in the
  perf generic code;
- must be an alignment-guarantee method (The alignment requirement is
  not changed after the boot);
- must be able to allocate/free a buffer (smaller than a page size)
  dynamically;
- should not cause extra CPU overhead or space overhead.

Several options were considered as below:
- One option is to allocate a larger buffer for task_ctx_data. E.g.,
ptr = kmalloc(size + alignment, GFP_KERNEL);
ptr = PTR_ALIGN(ptr, alignment);
  This option causes space overhead.
- Another option is to allocate the task_ctx_data in the PMU specific
  code. To do so, several function pointers have to be added. As a
  result, both the generic structure and the PMU specific structure
  will become bigger. Besides, extra function calls are added when
  allocating/freeing the buffer. This option will increase both the
  space overhead and CPU overhead.
- The third option is to use a kmem_cache to allocate a buffer for the
  task_ctx_data. The kmem_cache can be created with a specific alignment
  requirement by the PMU at boot time. A new pointer for kmem_cache has
  to be added in the generic struct pmu, which would be used to
  dynamically allocate a buffer for the task_ctx_data at run time.
  Although the new pointer is added to the struct pmu, the existing
  variable task_ctx_size is not required anymore. The size of the
  generic structure is kept the same.

The third option which meets all the aforementioned requirements is used
to replace kzalloc() for the PMU specific data allocation. A later patch
will remove the kzalloc() method and the related variables.
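
A usage sketch of the third option from the PMU side; the "x86_lbr"
cache name and the 64-byte alignment are taken from other patches in
this series:

	/* At PMU boot time: create a cache with the required alignment. */
	pmu->task_ctx_cache = kmem_cache_create("x86_lbr", size, 64, 0, NULL);

	/* At run time, the generic code allocates/frees aligned buffers. */
	task_ctx_data = kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
	kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);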

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1593780569-62993-17-git-send-email-kan.li...@linux.intel.com
---
 include/linux/perf_event.h | 5 +
 kernel/events/core.c   | 8 +++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 46fe5cf..09915ae 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -425,6 +425,11 @@ struct pmu {
size_t  task_ctx_size;
 
/*
+* Kmem cache of PMU specific data
+*/
+   struct kmem_cache   *task_ctx_cache;
+
+   /*
 * PMU specific parts of task perf event context (i.e. 
ctx->task_ctx_data)
 * can be synchronized using this function. See Intel LBR callstack 
support
 * implementation and Perf core context switch handling callbacks for 
usage
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7509040..30d9b31 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1240,12 +1240,18 @@ static void get_ctx(struct perf_event_context *ctx)
 
 static void *alloc_task_ctx_data(struct pmu *pmu)
 {
+   if (pmu->task_ctx_cache)
+   return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
+
return kzalloc(pmu->task_ctx_size, GFP_KERNEL);
 }
 
 static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
 {
-   kfree(task_ctx_data);
+   if (pmu->task_ctx_cache && task_ctx_data)
+   kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
+   else
+   kfree(task_ctx_data);
 }
 
 static void free_ctx(struct rcu_head *head)


[tip: perf/core] x86/fpu/xstate: Add helpers for LBR dynamic supervisor feature

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 50f408d96d4d1a945d2c50c5fd8ed400883edf0e
Gitweb:
https://git.kernel.org/tip/50f408d96d4d1a945d2c50c5fd8ed400883edf0e
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:27 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:56 +02:00

x86/fpu/xstate: Add helpers for LBR dynamic supervisor feature

The perf subsystem will only need to save/restore the LBR state.
However, the existing helpers save all supported supervisor states to a
kernel buffer, which is unnecessary. Two helpers are introduced to
save/restore only the requested dynamic supervisor states. The supervisor
features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
XFEATURE_MASK_SUPERVISOR_UNSUPPORTED masks cannot be saved/restored using
these helpers.

The helpers will be used in the following patch.
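
A usage sketch, mirroring how the LBR code later in this series calls
the helpers; xsave_buf stands for a hypothetical pointer to a 64-byte
aligned LBR XSAVE buffer:

	/* Save only the LBR state component (plus the mandatory legacy
	 * region and XSAVE header) into the buffer ... */
	copy_dynamic_supervisor_to_kernel(&xsave_buf->xsave, XFEATURE_MASK_LBR);

	/* ... and restore it from the buffer later. */
	copy_kernel_to_dynamic_supervisor(&xsave_buf->xsave, XFEATURE_MASK_LBR);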

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Dave Hansen 
Link: 
https://lkml.kernel.org/r/1593780569-62993-22-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/include/asm/fpu/xstate.h |  3 +-
 arch/x86/kernel/fpu/xstate.c  | 72 ++-
 2 files changed, 75 insertions(+)

diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 040c4d4..c029fce 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -106,6 +106,9 @@ int copy_xstate_to_user(void __user *ubuf, struct 
xregs_state *xsave, unsigned i
 int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
 int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
 void copy_supervisor_to_kernel(struct xregs_state *xsave);
+void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask);
+void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask);
+
 
 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
 int validate_user_xstate_header(const struct xstate_header *hdr);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index dcf0624..b0c22b7 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1361,6 +1361,78 @@ void copy_supervisor_to_kernel(struct xregs_state 
*xstate)
}
 }
 
+/**
+ * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to
+ *   an xsave area
+ * @xstate: A pointer to an xsave area
+ * @mask: Represent the dynamic supervisor features saved into the xsave area
+ *
+ * Only the dynamic supervisor states sets in the mask are saved into the xsave
+ * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic
+ * supervisor feature). Besides the dynamic supervisor states, the legacy
+ * region and XSAVE header are also saved into the xsave area. The supervisor
+ * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
+ * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved.
+ *
+ * The xsave area must be 64-bytes aligned.
+ */
+void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask)
+{
+   u64 dynamic_mask = xfeatures_mask_dynamic() & mask;
+   u32 lmask, hmask;
+   int err;
+
+   if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES)))
+   return;
+
+   if (WARN_ON_FPU(!dynamic_mask))
+   return;
+
+   lmask = dynamic_mask;
+   hmask = dynamic_mask >> 32;
+
+   XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
+
+   /* Should never fault when copying to a kernel buffer */
+   WARN_ON_FPU(err);
+}
+
+/**
+ * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from
+ *   an xsave area
+ * @xstate: A pointer to an xsave area
+ * @mask: Represent the dynamic supervisor features restored from the xsave 
area
+ *
+ * Only the dynamic supervisor states sets in the mask are restored from the
+ * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of
+ * dynamic supervisor feature). Besides the dynamic supervisor states, the
+ * legacy region and XSAVE header are also restored from the xsave area. The
+ * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
+ * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored.
+ *
+ * The xsave area must be 64-bytes aligned.
+ */
+void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask)
+{
+   u64 dynamic_mask = xfeatures_mask_dynamic() & mask;
+   u32 lmask, hmask;
+   int err;
+
+   if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES)))
+   return;
+
+   if (WARN_ON_FPU(!dynamic_mask))
+   return;
+
+   lmask = dynamic_mask;
+   hmask = dynamic_mask >> 32;
+
+   XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+
+   /* Should never fault when copying from a kernel buffer */
+   WARN_ON_FPU(err);
+}
+
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
 /*
  * Report the 

[tip: perf/core] perf/x86/intel/lbr: Factor out intel_pmu_store_lbr

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 631618a0dca31dc23dcce38cf345c6139bd8a1e9
Gitweb:
https://git.kernel.org/tip/631618a0dca31dc23dcce38cf345c6139bd8a1e9
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:19 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:54 +02:00

perf/x86/intel/lbr: Factor out intel_pmu_store_lbr

The way to store the LBR information from a PEBS LBR record can be
reused in Architecture LBR, because
- The LBR information is stored like a stack. Entry 0 is always the
  youngest branch.
- The layout of the LBR INFO MSR is similar.

The LBR information may be retrieved from either the LBR registers
(non-PEBS event) or a buffer (PEBS event). Extend rdlbr_*() to support
both methods.

Explicitly check for invalid entries (all 0s), which avoids unnecessary
MSR accesses when using a non-PEBS event. For a PEBS event, the check
should slightly improve the performance as well. The invalid entries are
cut, so intel_pmu_lbr_filter() doesn't need to check for and filter them
out.

The function cannot be shared with the current model-specific LBR read,
because the LBRs grow in the opposite direction.
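
A sketch of the shared store loop described above, using the names from
the hunk below (the LBR_INFO bit decoding is elided):

	for (i = 0; i < x86_pmu.lbr_nr; i++) {
		lbr = entries ? &entries[i] : NULL;
		from = rdlbr_from(i, lbr);

		/* Invalid entry (0s): cut the stack here, nothing to filter. */
		if (!from)
			break;

		cpuc->lbr_entries[i].from = from;
		cpuc->lbr_entries[i].to   = rdlbr_to(i, lbr);
		/* decode rdlbr_info(i, lbr): mispred, in_tx, abort, cycles */
	}
	cpuc->lbr_stack.nr = i;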

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1593780569-62993-14-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/lbr.c | 82 
 1 file changed, 56 insertions(+), 26 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index d3d129c..0d7a859 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -348,28 +348,37 @@ static __always_inline void wrlbr_info(unsigned int idx, 
u64 val)
wrmsrl(x86_pmu.lbr_info + idx, val);
 }
 
-static __always_inline u64 rdlbr_from(unsigned int idx)
+static __always_inline u64 rdlbr_from(unsigned int idx, struct lbr_entry *lbr)
 {
u64 val;
 
+   if (lbr)
+   return lbr->from;
+
rdmsrl(x86_pmu.lbr_from + idx, val);
 
return lbr_from_signext_quirk_rd(val);
 }
 
-static __always_inline u64 rdlbr_to(unsigned int idx)
+static __always_inline u64 rdlbr_to(unsigned int idx, struct lbr_entry *lbr)
 {
u64 val;
 
+   if (lbr)
+   return lbr->to;
+
rdmsrl(x86_pmu.lbr_to + idx, val);
 
return val;
 }
 
-static __always_inline u64 rdlbr_info(unsigned int idx)
+static __always_inline u64 rdlbr_info(unsigned int idx, struct lbr_entry *lbr)
 {
u64 val;
 
+   if (lbr)
+   return lbr->info;
+
rdmsrl(x86_pmu.lbr_info + idx, val);
 
return val;
@@ -387,16 +396,16 @@ wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool 
need_info)
 static inline bool
 rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info)
 {
-   u64 from = rdlbr_from(idx);
+   u64 from = rdlbr_from(idx, NULL);
 
/* Don't read invalid entry */
if (!from)
return false;
 
lbr->from = from;
-   lbr->to = rdlbr_to(idx);
+   lbr->to = rdlbr_to(idx, NULL);
if (need_info)
-   lbr->info = rdlbr_info(idx);
+   lbr->info = rdlbr_info(idx, NULL);
 
return true;
 }
@@ -432,7 +441,7 @@ void intel_pmu_lbr_restore(void *ctx)
 
 static __always_inline bool lbr_is_reset_in_cstate(void *ctx)
 {
-   return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos);
+   return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL);
 }
 
 static void __intel_pmu_lbr_restore(void *ctx)
@@ -709,8 +718,8 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
u16 cycles = 0;
int lbr_flags = lbr_desc[lbr_format];
 
-   from = rdlbr_from(lbr_idx);
-   to   = rdlbr_to(lbr_idx);
+   from = rdlbr_from(lbr_idx, NULL);
+   to   = rdlbr_to(lbr_idx, NULL);
 
/*
 * Read LBR call stack entries
@@ -722,7 +731,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
if (lbr_format == LBR_FORMAT_INFO && need_info) {
u64 info;
 
-   info = rdlbr_info(lbr_idx);
+   info = rdlbr_info(lbr_idx, NULL);
mis = !!(info & LBR_INFO_MISPRED);
pred = !mis;
in_tx = !!(info & LBR_INFO_IN_TX);
@@ -777,6 +786,42 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
cpuc->lbr_stack.hw_idx = tos;
 }
 
+static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
+   struct lbr_entry *entries)
+{
+   struct perf_branch_entry *e;
+   struct lbr_entry *lbr;
+   u64 from, to, info;
+   int i;
+
+   for (i = 0; i < x86_pmu.lbr_nr; i++) {
+   lbr = entries ? &entries[i] : NULL;
+   e = &cpuc->lbr_entries[i];
+
+   from = rdlbr_from(i, lbr);
+   

[tip: perf/core] perf/x86/intel/lbr: Factor out a new struct for generic optimization

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 530bfff6480307d210734222a54d56af7f908957
Gitweb:
https://git.kernel.org/tip/530bfff6480307d210734222a54d56af7f908957
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:11 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:52 +02:00

perf/x86/intel/lbr: Factor out a new struct for generic optimization

To reduce the overhead of a context switch with LBR enabled, some
generic optimizations were introduced, e.g. avoiding restoring the LBRs
if no one else touched them. The generic optimizations can also be used
by Architecture LBR later. Currently, the fields for the generic
optimizations are part of structure x86_perf_task_context, which will be
deprecated by Architecture LBR. A new structure should be introduced
for the common fields of the generic optimizations, which can be shared
between Architecture LBR and model-specific LBR.
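
A minimal sketch of the new structure, assuming plain int fields (the
perf_event.h hunk is not shown in the truncated diff below):

	struct x86_perf_task_context_opt {
		int lbr_callstack_users;
		int lbr_stack_state;
		int log_id;
	};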

Both 'valid_lbrs' and 'tos' are also used by the generic optimizations,
but they are not moved into the new structure, because Architecture LBR
is stack-like: the 'valid_lbrs' field, which records the index of the
valid LBR, is not required anymore, and the TOS MSR will be removed.

LBR registers may be cleared in the deep Cstate. If so, the generic
optimizations should not be applied. Perf has to unconditionally
restore the LBR registers. A generic function is required to detect the
reset due to the deep Cstate. lbr_is_reset_in_cstate() is introduced.
Currently, for the model-specific LBR, the TOS MSR is used to detect the
reset. There will be another method introduced for Architecture LBR
later.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1593780569-62993-6-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/lbr.c  | 38 +++
 arch/x86/events/perf_event.h | 10 ++---
 2 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index b2b8dc9..bba9939 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -355,33 +355,37 @@ void intel_pmu_lbr_restore(void *ctx)
wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
 }
 
+static __always_inline bool
+lbr_is_reset_in_cstate(struct x86_perf_task_context *task_ctx)
+{
+   return !rdlbr_from(task_ctx->tos);
+}
+
 static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-   u64 tos;
 
-   if (task_ctx->lbr_callstack_users == 0 ||
-   task_ctx->lbr_stack_state == LBR_NONE) {
+   if (task_ctx->opt.lbr_callstack_users == 0 ||
+   task_ctx->opt.lbr_stack_state == LBR_NONE) {
intel_pmu_lbr_reset();
return;
}
 
-   tos = task_ctx->tos;
/*
 * Does not restore the LBR registers, if
 * - No one else touched them, and
-* - Did not enter C6
+* - Was not cleared in Cstate
 */
if ((task_ctx == cpuc->last_task_ctx) &&
-   (task_ctx->log_id == cpuc->last_log_id) &&
-   rdlbr_from(tos)) {
-   task_ctx->lbr_stack_state = LBR_NONE;
+   (task_ctx->opt.log_id == cpuc->last_log_id) &&
+   !lbr_is_reset_in_cstate(task_ctx)) {
+   task_ctx->opt.lbr_stack_state = LBR_NONE;
return;
}
 
x86_pmu.lbr_restore(task_ctx);
 
-   task_ctx->lbr_stack_state = LBR_NONE;
+   task_ctx->opt.lbr_stack_state = LBR_NONE;
 }
 
 void intel_pmu_lbr_save(void *ctx)
@@ -415,17 +419,17 @@ static void __intel_pmu_lbr_save(struct 
x86_perf_task_context *task_ctx)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
-   if (task_ctx->lbr_callstack_users == 0) {
-   task_ctx->lbr_stack_state = LBR_NONE;
+   if (task_ctx->opt.lbr_callstack_users == 0) {
+   task_ctx->opt.lbr_stack_state = LBR_NONE;
return;
}
 
x86_pmu.lbr_save(task_ctx);
 
-   task_ctx->lbr_stack_state = LBR_VALID;
+   task_ctx->opt.lbr_stack_state = LBR_VALID;
 
cpuc->last_task_ctx = task_ctx;
-   cpuc->last_log_id = ++task_ctx->log_id;
+   cpuc->last_log_id = ++task_ctx->opt.log_id;
 }
 
 void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
@@ -447,8 +451,8 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context 
*prev,
if (!prev_ctx_data || !next_ctx_data)
return;
 
-   swap(prev_ctx_data->lbr_callstack_users,
-next_ctx_data->lbr_callstack_users);
+   swap(prev_ctx_data->opt.lbr_callstack_users,
+next_ctx_data->opt.lbr_callstack_users);
 }
 
 void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
@@ -503,7 +507,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
 
if (branch_user_callstack(cpuc->br_se

[tip: perf/core] perf/x86/intel/lbr: Add the function pointers for LBR save and restore

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 799571bf38fc2b4b744fa448184b5915739b10fd
Gitweb:
https://git.kernel.org/tip/799571bf38fc2b4b744fa448184b5915739b10fd
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:10 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:52 +02:00

perf/x86/intel/lbr: Add the function pointers for LBR save and restore

The MSRs of Architectural LBR are different from previous model-specific
LBR. Perf has to implement different functions to save and restore them.

The function pointers for LBR save and restore are introduced. Perf
should initialize the corresponding functions at boot time.

The generic optimizations, e.g. avoiding restoring the LBRs if no one
else touched them, still apply to Architectural LBRs. The related code is
not moved into the model-specific functions.

Current model-specific LBR functions are set as default.
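
The new x86_pmu members would look roughly like this (the perf_event.h
change is only summarized in the diffstat below):

	void	(*lbr_save)(void *ctx);
	void	(*lbr_restore)(void *ctx);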

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1593780569-62993-5-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c |  4 ++-
 arch/x86/events/intel/lbr.c  | 79 +--
 arch/x86/events/perf_event.h |  6 +++-
 3 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 6414b47..50cb3c6 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3981,6 +3981,8 @@ static __initconst const struct x86_pmu core_pmu = {
 
.lbr_reset  = intel_pmu_lbr_reset_64,
.lbr_read   = intel_pmu_lbr_read_64,
+   .lbr_save   = intel_pmu_lbr_save,
+   .lbr_restore= intel_pmu_lbr_restore,
 };
 
 static __initconst const struct x86_pmu intel_pmu = {
@@ -4029,6 +4031,8 @@ static __initconst const struct x86_pmu intel_pmu = {
 
.lbr_reset  = intel_pmu_lbr_reset_64,
.lbr_read   = intel_pmu_lbr_read_64,
+   .lbr_save   = intel_pmu_lbr_save,
+   .lbr_restore= intel_pmu_lbr_restore,
 };
 
 static __init void intel_clovertown_quirk(void)
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index b8943f4..b2b8dc9 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -323,31 +323,13 @@ static inline u64 rdlbr_to(unsigned int idx)
return val;
 }
 
-static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
+void intel_pmu_lbr_restore(void *ctx)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+   struct x86_perf_task_context *task_ctx = ctx;
int i;
unsigned lbr_idx, mask;
-   u64 tos;
-
-   if (task_ctx->lbr_callstack_users == 0 ||
-   task_ctx->lbr_stack_state == LBR_NONE) {
-   intel_pmu_lbr_reset();
-   return;
-   }
-
-   tos = task_ctx->tos;
-   /*
-* Does not restore the LBR registers, if
-* - No one else touched them, and
-* - Did not enter C6
-*/
-   if ((task_ctx == cpuc->last_task_ctx) &&
-   (task_ctx->log_id == cpuc->last_log_id) &&
-   rdlbr_from(tos)) {
-   task_ctx->lbr_stack_state = LBR_NONE;
-   return;
-   }
+   u64 tos = task_ctx->tos;
 
mask = x86_pmu.lbr_nr - 1;
for (i = 0; i < task_ctx->valid_lbrs; i++) {
@@ -368,24 +350,48 @@ static void __intel_pmu_lbr_restore(struct 
x86_perf_task_context *task_ctx)
}
 
wrmsrl(x86_pmu.lbr_tos, tos);
-   task_ctx->lbr_stack_state = LBR_NONE;
 
if (cpuc->lbr_select)
wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
 }
 
-static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-   unsigned lbr_idx, mask;
-   u64 tos, from;
-   int i;
+   u64 tos;
 
-   if (task_ctx->lbr_callstack_users == 0) {
+   if (task_ctx->lbr_callstack_users == 0 ||
+   task_ctx->lbr_stack_state == LBR_NONE) {
+   intel_pmu_lbr_reset();
+   return;
+   }
+
+   tos = task_ctx->tos;
+   /*
+* Does not restore the LBR registers, if
+* - No one else touched them, and
+* - Did not enter C6
+*/
+   if ((task_ctx == cpuc->last_task_ctx) &&
+   (task_ctx->log_id == cpuc->last_log_id) &&
+   rdlbr_from(tos)) {
task_ctx->lbr_stack_state = LBR_NONE;
return;
}
 
+   x86_pmu.lbr_restore(task_ctx);
+
+   task_ctx->lbr_stack_state = LBR_NONE;
+}
+
+void intel_pmu_lbr_save(void *ctx)
+{
+   struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+   struct x86_perf_task_context *task_ctx = ctx;
+   unsigned lbr_idx, mask;
+  

[tip: perf/core] perf/x86: Expose CPUID enumeration bits for arch LBR

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: af6cf129706b2f79e12f97e62d977e7f653cdfd1
Gitweb:
https://git.kernel.org/tip/af6cf129706b2f79e12f97e62d977e7f653cdfd1
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:14 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:53 +02:00

perf/x86: Expose CPUID enumeration bits for arch LBR

The LBR capabilities of Architecture LBR are retrieved from the CPUID
enumeration once at boot time. The capabilities have to be saved for
future usage.

Several new fields are added into structure x86_pmu to indicate the
capabilities. The fields will be used in the following patches.
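
A sketch of how a later patch consumes these unions and fields at boot
time (cf. the intel_pmu_arch_lbr_init() context visible earlier in this
thread):

	union cpuid28_eax eax;
	union cpuid28_ebx ebx;
	union cpuid28_ecx ecx;
	unsigned int unused_edx;

	/* Arch LBR capabilities are enumerated in CPUID leaf 0x1c (28). */
	cpuid(28, &eax.full, &ebx.full, &ecx.full, &unused_edx);

	x86_pmu.lbr_depth_mask   = eax.split.lbr_depth_mask;
	x86_pmu.lbr_deep_c_reset = eax.split.lbr_deep_c_reset;
	x86_pmu.lbr_lip          = eax.split.lbr_lip;
	x86_pmu.lbr_cpl          = ebx.split.lbr_cpl;
	x86_pmu.lbr_filter       = ebx.split.lbr_filter;
	x86_pmu.lbr_call_stack   = ebx.split.lbr_call_stack;
	x86_pmu.lbr_mispred      = ecx.split.lbr_mispred;
	x86_pmu.lbr_timed_lbr    = ecx.split.lbr_timed_lbr;
	x86_pmu.lbr_br_type      = ecx.split.lbr_br_type;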

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1593780569-62993-9-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/perf_event.h  | 13 ++-
 arch/x86/include/asm/perf_event.h | 40 ++-
 2 files changed, 53 insertions(+)

diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 7dbf148..cc81177 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -693,6 +693,19 @@ struct x86_pmu {
boollbr_double_abort;  /* duplicated lbr aborts */
boollbr_pt_coexist;/* (LBR|BTS) may coexist 
with PT */
 
+   /*
+* Intel Architectural LBR CPUID Enumeration
+*/
+   unsigned intlbr_depth_mask:8;
+   unsigned intlbr_deep_c_reset:1;
+   unsigned intlbr_lip:1;
+   unsigned intlbr_cpl:1;
+   unsigned intlbr_filter:1;
+   unsigned intlbr_call_stack:1;
+   unsigned intlbr_mispred:1;
+   unsigned intlbr_timed_lbr:1;
+   unsigned intlbr_br_type:1;
+
void(*lbr_reset)(void);
void(*lbr_read)(struct cpu_hw_events *cpuc);
void(*lbr_save)(void *ctx);
diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index 2df7073..9ffce7d 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -142,6 +142,46 @@ union cpuid10_edx {
unsigned int full;
 };
 
+/*
+ * Intel Architectural LBR CPUID detection/enumeration details:
+ */
+union cpuid28_eax {
+   struct {
+   /* Supported LBR depth values */
+   unsigned intlbr_depth_mask:8;
+   unsigned intreserved:22;
+   /* Deep C-state Reset */
+   unsigned intlbr_deep_c_reset:1;
+   /* IP values contain LIP */
+   unsigned intlbr_lip:1;
+   } split;
+   unsigned intfull;
+};
+
+union cpuid28_ebx {
+   struct {
+   /* CPL Filtering Supported */
+   unsigned intlbr_cpl:1;
+   /* Branch Filtering Supported */
+   unsigned intlbr_filter:1;
+   /* Call-stack Mode Supported */
+   unsigned intlbr_call_stack:1;
+   } split;
+   unsigned intfull;
+};
+
+union cpuid28_ecx {
+   struct {
+   /* Mispredict Bit Supported */
+   unsigned intlbr_mispred:1;
+   /* Timed LBRs Supported */
+   unsigned intlbr_timed_lbr:1;
+   /* Branch Type Field Supported */
+   unsigned intlbr_br_type:1;
+   } split;
+   unsigned intfull;
+};
+
 struct x86_pmu_capability {
int version;
int num_counters_gp;


[tip: perf/core] perf/core: Factor out functions to allocate/free the task_ctx_data

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: ff9ff926889dd8026b4ba55266a010c27f68604f
Gitweb:
https://git.kernel.org/tip/ff9ff926889dd8026b4ba55266a010c27f68604f
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:21 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:54 +02:00

perf/core: Factor out functions to allocate/free the task_ctx_data

The method to allocate/free the task_ctx_data is going to be changed in
the following patch. Currently, the task_ctx_data is allocated/freed in
several different places. To avoid repeatedly modifying the same code
in several different places, alloc_task_ctx_data() and
free_task_ctx_data() are factored out to allocate/free the
task_ctx_data. The modification only needs to be applied once.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1593780569-62993-16-git-send-email-kan.li...@linux.intel.com
---
 kernel/events/core.c | 21 +++--
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9b8f925..7509040 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1238,12 +1238,22 @@ static void get_ctx(struct perf_event_context *ctx)
refcount_inc(&ctx->refcount);
 }
 
+static void *alloc_task_ctx_data(struct pmu *pmu)
+{
+   return kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+}
+
+static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
+{
+   kfree(task_ctx_data);
+}
+
 static void free_ctx(struct rcu_head *head)
 {
struct perf_event_context *ctx;
 
ctx = container_of(head, struct perf_event_context, rcu_head);
-   kfree(ctx->task_ctx_data);
+   free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
kfree(ctx);
 }
 
@@ -4471,7 +4481,7 @@ find_get_context(struct pmu *pmu, struct task_struct 
*task,
goto errout;
 
if (event->attach_state & PERF_ATTACH_TASK_DATA) {
-   task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+   task_ctx_data = alloc_task_ctx_data(pmu);
if (!task_ctx_data) {
err = -ENOMEM;
goto errout;
@@ -4529,11 +4539,11 @@ retry:
}
}
 
-   kfree(task_ctx_data);
+   free_task_ctx_data(pmu, task_ctx_data);
return ctx;
 
 errout:
-   kfree(task_ctx_data);
+   free_task_ctx_data(pmu, task_ctx_data);
return ERR_PTR(err);
 }
 
@@ -12497,8 +12507,7 @@ inherit_event(struct perf_event *parent_event,
!child_ctx->task_ctx_data) {
struct pmu *pmu = child_event->pmu;
 
-   child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
-  GFP_KERNEL);
+   child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
if (!child_ctx->task_ctx_data) {
free_event(child_event);
return ERR_PTR(-ENOMEM);


[tip: perf/core] perf/x86/intel/lbr: Add a function pointer for LBR reset

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 9f354a726cb1d4eb00a0784a27eaa0a3283cff71
Gitweb:
https://git.kernel.org/tip/9f354a726cb1d4eb00a0784a27eaa0a3283cff71
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:08 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:51 +02:00

perf/x86/intel/lbr: Add a function pointer for LBR reset

The method to reset Architectural LBRs is different from previous
model-specific LBR. Perf has to implement a different function.

A function pointer is introduced for LBR reset. The enum of
LBR_FORMAT_* is also moved to perf_event.h. Perf should initialize the
corresponding functions at boot time, and avoid checking lbr_format at
run time.

The current 64-bit LBR reset function is set as default.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1593780569-62993-3-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c |  7 +++
 arch/x86/events/intel/lbr.c  | 20 +++-
 arch/x86/events/perf_event.h | 17 +
 3 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 582ddff..fe49e99 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3978,6 +3978,8 @@ static __initconst const struct x86_pmu core_pmu = {
.cpu_dead   = intel_pmu_cpu_dead,
 
.check_period   = intel_pmu_check_period,
+
+   .lbr_reset  = intel_pmu_lbr_reset_64,
 };
 
 static __initconst const struct x86_pmu intel_pmu = {
@@ -4023,6 +4025,8 @@ static __initconst const struct x86_pmu intel_pmu = {
.check_period   = intel_pmu_check_period,
 
.aux_output_match   = intel_pmu_aux_output_match,
+
+   .lbr_reset  = intel_pmu_lbr_reset_64,
 };
 
 static __init void intel_clovertown_quirk(void)
@@ -4649,6 +4653,9 @@ __init int intel_pmu_init(void)
x86_pmu.intel_cap.capabilities = capabilities;
}
 
+   if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+   x86_pmu.lbr_reset = intel_pmu_lbr_reset_32;
+
intel_ds_init();
 
x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs 
last */
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index d03de75..7af27a7 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -8,17 +8,6 @@
 
 #include "../perf_event.h"
 
-enum {
-   LBR_FORMAT_32   = 0x00,
-   LBR_FORMAT_LIP  = 0x01,
-   LBR_FORMAT_EIP  = 0x02,
-   LBR_FORMAT_EIP_FLAGS= 0x03,
-   LBR_FORMAT_EIP_FLAGS2   = 0x04,
-   LBR_FORMAT_INFO = 0x05,
-   LBR_FORMAT_TIME = 0x06,
-   LBR_FORMAT_MAX_KNOWN= LBR_FORMAT_TIME,
-};
-
 static const enum {
LBR_EIP_FLAGS   = 1,
LBR_TSX = 2,
@@ -194,7 +183,7 @@ static void __intel_pmu_lbr_disable(void)
wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 }
 
-static void intel_pmu_lbr_reset_32(void)
+void intel_pmu_lbr_reset_32(void)
 {
int i;
 
@@ -202,7 +191,7 @@ static void intel_pmu_lbr_reset_32(void)
wrmsrl(x86_pmu.lbr_from + i, 0);
 }
 
-static void intel_pmu_lbr_reset_64(void)
+void intel_pmu_lbr_reset_64(void)
 {
int i;
 
@@ -221,10 +210,7 @@ void intel_pmu_lbr_reset(void)
if (!x86_pmu.lbr_nr)
return;
 
-   if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
-   intel_pmu_lbr_reset_32();
-   else
-   intel_pmu_lbr_reset_64();
+   x86_pmu.lbr_reset();
 
cpuc->last_task_ctx = NULL;
cpuc->last_log_id = 0;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 8147596..5c1ad43 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -180,6 +180,17 @@ struct x86_perf_task_context;
 #define MAX_LBR_ENTRIES32
 
 enum {
+   LBR_FORMAT_32   = 0x00,
+   LBR_FORMAT_LIP  = 0x01,
+   LBR_FORMAT_EIP  = 0x02,
+   LBR_FORMAT_EIP_FLAGS= 0x03,
+   LBR_FORMAT_EIP_FLAGS2   = 0x04,
+   LBR_FORMAT_INFO = 0x05,
+   LBR_FORMAT_TIME = 0x06,
+   LBR_FORMAT_MAX_KNOWN= LBR_FORMAT_TIME,
+};
+
+enum {
X86_PERF_KFREE_SHARED = 0,
X86_PERF_KFREE_EXCL   = 1,
X86_PERF_KFREE_MAX
@@ -682,6 +693,8 @@ struct x86_pmu {
boollbr_double_abort;  /* duplicated lbr aborts */
boollbr_pt_coexist;/* (LBR|BTS) may coexist 
with PT */
 
+   void(*lbr_reset)(void);
+
/*
 * Intel PT/LBR/BTS are exclusive
 */
@@ -1058,6 +1071,10 @@ u64 lbr_from_signext_quirk_wr(u64 val);
 
 void intel_pmu_lbr_reset(void);
 
+void intel_pmu_lbr_reset_32(void);
+
+void intel_pmu_lbr_reset_64(v

[tip: perf/core] perf/x86/intel/lbr: Factor out rdlbr_all() and wrlbr_all()

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: fda1f99f34a8f0975086bcfef34da865009995c1
Gitweb:
https://git.kernel.org/tip/fda1f99f34a8f0975086bcfef34da865009995c1
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:18 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:54 +02:00

perf/x86/intel/lbr: Factor out rdlbr_all() and wrlbr_all()

The previous model-specific LBR and Architecture LBR (legacy way) use a
similar method to save/restore the LBR information, which directly
accesses the LBR registers. The code which reads/writes a set of LBR
registers can be shared between them.

Factor out two functions which are used to read/write a set of LBR
registers.

Add lbr_info into structure x86_pmu, and use it to replace the hardcoded
LBR INFO MSR, because the LBR INFO MSR address of the previous
model-specific LBR is different from Architecture LBR. The MSR address
should be assigned at boot time. For now, only Skylake and later
platforms have the LBR INFO MSR.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1593780569-62993-13-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/lbr.c  | 66 ++-
 arch/x86/events/perf_event.h |  2 +-
 2 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 21f4f07..d3d129c 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -237,7 +237,7 @@ void intel_pmu_lbr_reset_64(void)
wrmsrl(x86_pmu.lbr_from + i, 0);
wrmsrl(x86_pmu.lbr_to   + i, 0);
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-   wrmsrl(MSR_LBR_INFO_0 + i, 0);
+   wrmsrl(x86_pmu.lbr_info + i, 0);
}
 }
 
@@ -343,6 +343,11 @@ static __always_inline void wrlbr_to(unsigned int idx, u64 
val)
wrmsrl(x86_pmu.lbr_to + idx, val);
 }
 
+static __always_inline void wrlbr_info(unsigned int idx, u64 val)
+{
+   wrmsrl(x86_pmu.lbr_info + idx, val);
+}
+
 static __always_inline u64 rdlbr_from(unsigned int idx)
 {
u64 val;
@@ -361,8 +366,44 @@ static __always_inline u64 rdlbr_to(unsigned int idx)
return val;
 }
 
+static __always_inline u64 rdlbr_info(unsigned int idx)
+{
+   u64 val;
+
+   rdmsrl(x86_pmu.lbr_info + idx, val);
+
+   return val;
+}
+
+static inline void
+wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info)
+{
+   wrlbr_from(idx, lbr->from);
+   wrlbr_to(idx, lbr->to);
+   if (need_info)
+   wrlbr_info(idx, lbr->info);
+}
+
+static inline bool
+rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info)
+{
+   u64 from = rdlbr_from(idx);
+
+   /* Don't read invalid entry */
+   if (!from)
+   return false;
+
+   lbr->from = from;
+   lbr->to = rdlbr_to(idx);
+   if (need_info)
+   lbr->info = rdlbr_info(idx);
+
+   return true;
+}
+
 void intel_pmu_lbr_restore(void *ctx)
 {
+   bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx = ctx;
int i;
@@ -372,11 +413,7 @@ void intel_pmu_lbr_restore(void *ctx)
mask = x86_pmu.lbr_nr - 1;
for (i = 0; i < task_ctx->valid_lbrs; i++) {
lbr_idx = (tos - i) & mask;
-   wrlbr_from(lbr_idx, task_ctx->lbr[i].from);
-   wrlbr_to(lbr_idx, task_ctx->lbr[i].to);
-
-   if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-   wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info);
+   wrlbr_all(&task_ctx->lbr[i], lbr_idx, need_info);
}
 
for (; i < x86_pmu.lbr_nr; i++) {
@@ -384,7 +421,7 @@ void intel_pmu_lbr_restore(void *ctx)
wrlbr_from(lbr_idx, 0);
wrlbr_to(lbr_idx, 0);
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-   wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0);
+   wrlbr_info(lbr_idx, 0);
}
 
wrmsrl(x86_pmu.lbr_tos, tos);
@@ -427,23 +464,19 @@ static void __intel_pmu_lbr_restore(void *ctx)
 
 void intel_pmu_lbr_save(void *ctx)
 {
+   bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx = ctx;
unsigned lbr_idx, mask;
-   u64 tos, from;
+   u64 tos;
int i;
 
mask = x86_pmu.lbr_nr - 1;
tos = intel_pmu_lbr_tos();
for (i = 0; i < x86_pmu.lbr_nr; i++) {
lbr_idx = (tos - i) & mask;
-   from = rdlbr_from(lbr_idx);
-   if (!from)
+   if (!rdlbr_all(&task_ctx->lbr[i], lbr_idx, need_info))

[tip: perf/core] perf/x86/intel/lbr: Add a function pointer for LBR read

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: c301b1d80ed5b806834fe0f739f028f65fb4fb16
Gitweb:
https://git.kernel.org/tip/c301b1d80ed5b806834fe0f739f028f65fb4fb16
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:09 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:51 +02:00

perf/x86/intel/lbr: Add a function pointer for LBR read

The method to read Architectural LBRs is different from previous
model-specific LBR. Perf has to implement a different function.

A function pointer for LBR read is introduced. Perf should initialize
the corresponding function at boot time, and avoid checking lbr_format
at run time.

The current 64-bit LBR read function is set as default.

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1593780569-62993-4-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/core.c |  6 +-
 arch/x86/events/intel/lbr.c  |  9 +++--
 arch/x86/events/perf_event.h |  5 +
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index fe49e99..6414b47 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3980,6 +3980,7 @@ static __initconst const struct x86_pmu core_pmu = {
.check_period   = intel_pmu_check_period,
 
.lbr_reset  = intel_pmu_lbr_reset_64,
+   .lbr_read   = intel_pmu_lbr_read_64,
 };
 
 static __initconst const struct x86_pmu intel_pmu = {
@@ -4027,6 +4028,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.aux_output_match   = intel_pmu_aux_output_match,
 
.lbr_reset  = intel_pmu_lbr_reset_64,
+   .lbr_read   = intel_pmu_lbr_read_64,
 };
 
 static __init void intel_clovertown_quirk(void)
@@ -4653,8 +4655,10 @@ __init int intel_pmu_init(void)
x86_pmu.intel_cap.capabilities = capabilities;
}
 
-   if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+   if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) {
x86_pmu.lbr_reset = intel_pmu_lbr_reset_32;
+   x86_pmu.lbr_read = intel_pmu_lbr_read_32;
+   }
 
intel_ds_init();
 
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 7af27a7..b8943f4 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -562,7 +562,7 @@ void intel_pmu_lbr_disable_all(void)
__intel_pmu_lbr_disable();
 }
 
-static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 {
unsigned long mask = x86_pmu.lbr_nr - 1;
u64 tos = intel_pmu_lbr_tos();
@@ -599,7 +599,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
  * is the same as the linear address, allowing us to merge the LIP and EIP
  * LBR formats.
  */
-static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 {
bool need_info = false, call_stack = false;
unsigned long mask = x86_pmu.lbr_nr - 1;
@@ -704,10 +704,7 @@ void intel_pmu_lbr_read(void)
cpuc->lbr_users == cpuc->lbr_pebs_users)
return;
 
-   if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
-   intel_pmu_lbr_read_32(cpuc);
-   else
-   intel_pmu_lbr_read_64(cpuc);
+   x86_pmu.lbr_read(cpuc);
 
intel_pmu_lbr_filter(cpuc);
 }
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 5c1ad43..312d27f 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -694,6 +694,7 @@ struct x86_pmu {
bool            lbr_pt_coexist;         /* (LBR|BTS) may coexist with PT */

void            (*lbr_reset)(void);
+   void            (*lbr_read)(struct cpu_hw_events *cpuc);
 
/*
 * Intel PT/LBR/BTS are exclusive
@@ -1085,6 +1086,10 @@ void intel_pmu_lbr_disable_all(void);
 
 void intel_pmu_lbr_read(void);
 
+void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc);
+
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc);
+
 void intel_pmu_lbr_init_core(void);
 
 void intel_pmu_lbr_init_nhm(void);


[tip: perf/core] x86/cpufeatures: Add Architectural LBRs feature bit

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: bd657aa3dd8514e62486ce7f90b5e484c18d684d
Gitweb:
https://git.kernel.org/tip/bd657aa3dd8514e62486ce7f90b5e484c18d684d
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:07 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:51 +02:00

x86/cpufeatures: Add Architectural LBRs feature bit

CPUID.(EAX=07H, ECX=0):EDX[19] indicates whether an Intel CPU supports
Architectural LBRs.

The "X86_FEATURE_..., word 18" is already mirrored from CPUID
"0x0007:0 (EDX)". Add X86_FEATURE_ARCH_LBR under the "word 18"
section.

The feature will appear as "arch_lbr" in /proc/cpuinfo.

The Architectural Last Branch Records (LBR) feature enables recording
of software path history by logging taken branches and other control
flows. The feature will be supported in the perf_events subsystem.
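
As a user-space illustration (assuming a GCC/Clang toolchain that
provides <cpuid.h>), the bit described above can be probed directly;
the kernel itself checks boot_cpu_has(X86_FEATURE_ARCH_LBR) rather
than raw CPUID:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
                return 1;       /* CPUID leaf 7 not supported */

        /* CPUID.(EAX=07H, ECX=0):EDX[19] == Architectural LBRs */
        printf("arch_lbr: %s\n", (edx & (1u << 19)) ? "yes" : "no");
        return 0;
}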

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Dave Hansen 
Link: 
https://lkml.kernel.org/r/1593780569-62993-2-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 02dabc9..72ba4c5 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -366,6 +366,7 @@
 #define X86_FEATURE_MD_CLEAR   (18*32+10) /* VERW clears CPU buffers */
#define X86_FEATURE_TSX_FORCE_ABORT    (18*32+13) /* "" TSX_FORCE_ABORT */
#define X86_FEATURE_PCONFIG            (18*32+18) /* Intel PCONFIG */
+#define X86_FEATURE_ARCH_LBR           (18*32+19) /* Intel ARCH LBR */
#define X86_FEATURE_SPEC_CTRL          (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
#define X86_FEATURE_INTEL_STIBP        (18*32+27) /* "" Single Thread Indirect Branch Predictors */
 #define X86_FEATURE_FLUSH_L1D  (18*32+28) /* Flush L1D cache */


[tip: perf/core] x86/msr-index: Add bunch of MSRs for Arch LBR

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: d6a162a41bfd2ff9ea4cbb338d3df6a3f9b7e89f
Gitweb:
https://git.kernel.org/tip/d6a162a41bfd2ff9ea4cbb338d3df6a3f9b7e89f
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:13 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:52 +02:00

x86/msr-index: Add bunch of MSRs for Arch LBR

Add the Arch LBR related MSRs and the new LBR_INFO bits to msr-index.h.
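
For illustration, a small stand-alone sketch of how the new LBR_INFO
bit fields decode an info word. The DEMO_* macros mirror the values in
the diff below (the cycle-count mask is assumed to be the low 16 bits),
and the sample info value is fabricated:

#include <stdint.h>
#include <stdio.h>

#define DEMO_LBR_INFO_MISPRED           (1ULL << 63)
#define DEMO_LBR_INFO_BR_TYPE_OFFSET    56
#define DEMO_LBR_INFO_BR_TYPE           (0xfULL << DEMO_LBR_INFO_BR_TYPE_OFFSET)
#define DEMO_LBR_INFO_CYCLES            0xffffULL

int main(void)
{
        uint64_t info = (1ULL << 63) | (3ULL << 56) | 42;       /* made-up record */

        printf("mispredicted: %d\n", !!(info & DEMO_LBR_INFO_MISPRED));
        printf("branch type : %u\n",
               (unsigned int)((info & DEMO_LBR_INFO_BR_TYPE) >>
                              DEMO_LBR_INFO_BR_TYPE_OFFSET));
        printf("cycle count : %u\n", (unsigned int)(info & DEMO_LBR_INFO_CYCLES));
        return 0;
}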

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1593780569-62993-8-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/include/asm/msr-index.h | 16 
 1 file changed, 16 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e8370e6..bdc07fc 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -158,7 +158,23 @@
 #define LBR_INFO_MISPRED   BIT_ULL(63)
 #define LBR_INFO_IN_TX BIT_ULL(62)
 #define LBR_INFO_ABORT BIT_ULL(61)
+#define LBR_INFO_CYC_CNT_VALID BIT_ULL(60)
#define LBR_INFO_CYCLES        0xffff
+#define LBR_INFO_BR_TYPE_OFFSET        56
+#define LBR_INFO_BR_TYPE   (0xfull << LBR_INFO_BR_TYPE_OFFSET)
+
+#define MSR_ARCH_LBR_CTL   0x14ce
+#define ARCH_LBR_CTL_LBREN BIT(0)
+#define ARCH_LBR_CTL_CPL_OFFSET1
+#define ARCH_LBR_CTL_CPL   (0x3ull << ARCH_LBR_CTL_CPL_OFFSET)
+#define ARCH_LBR_CTL_STACK_OFFSET  3
+#define ARCH_LBR_CTL_STACK (0x1ull << ARCH_LBR_CTL_STACK_OFFSET)
+#define ARCH_LBR_CTL_FILTER_OFFSET 16
+#define ARCH_LBR_CTL_FILTER(0x7full << ARCH_LBR_CTL_FILTER_OFFSET)
+#define MSR_ARCH_LBR_DEPTH 0x14cf
+#define MSR_ARCH_LBR_FROM_00x1500
+#define MSR_ARCH_LBR_TO_0  0x1600
+#define MSR_ARCH_LBR_INFO_00x1200
 
 #define MSR_IA32_PEBS_ENABLE   0x03f1
 #define MSR_PEBS_DATA_CFG  0x03f2


[tip: perf/core] perf/x86/intel/lbr: Unify the stored format of LBR information

2020-07-08 Thread tip-bot2 for Kan Liang
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 5624986dc61b81a77fb6136bc232593483d1c254
Gitweb:
https://git.kernel.org/tip/5624986dc61b81a77fb6136bc232593483d1c254
Author:Kan Liang 
AuthorDate:Fri, 03 Jul 2020 05:49:16 -07:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 08 Jul 2020 11:38:53 +02:00

perf/x86/intel/lbr: Unify the stored format of LBR information

The LBR information currently stored in struct x86_perf_task_context
uses a different format from the PEBS LBR record and Architectural
LBR, which prevents sharing common code.

Use the format of the PEBS LBR record as the unified format, and
replace pebs_lbr_entry with the generic name lbr_entry.
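
A stand-alone sketch of what the unified layout buys: a flat array of
three-word entries (from, to, info) that the PEBS drain path and the
task-context save/restore path can walk the same way. The struct below
mirrors the generic lbr_entry; the sample values are made up:

#include <stdint.h>
#include <stdio.h>

struct demo_lbr_entry {                 /* unified format: from, to, info */
        uint64_t from, to, info;
};

static void print_lbrs(const struct demo_lbr_entry *lbr, int nr)
{
        /* Same indexing whether the data came from a PEBS record or from MSRs. */
        for (int i = 0; i < nr; i++)
                printf("%2d: %#llx -> %#llx (info %#llx)\n", i,
                       (unsigned long long)lbr[i].from,
                       (unsigned long long)lbr[i].to,
                       (unsigned long long)lbr[i].info);
}

int main(void)
{
        struct demo_lbr_entry sample[2] = {
                { 0x401000, 0x402000, 0 },      /* fabricated branch records */
                { 0x402010, 0x401080, 0 },
        };

        print_lbrs(sample, 2);
        return 0;
}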

Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/1593780569-62993-11-git-send-email-kan.li...@linux.intel.com
---
 arch/x86/events/intel/ds.c|  6 +++---
 arch/x86/events/intel/lbr.c   | 20 ++--
 arch/x86/events/perf_event.h  |  6 ++
 arch/x86/include/asm/perf_event.h |  6 +-
 4 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index dc43cc1..86848c5 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -954,7 +954,7 @@ static void adaptive_pebs_record_size_update(void)
if (pebs_data_cfg & PEBS_DATACFG_XMMS)
sz += sizeof(struct pebs_xmm);
if (pebs_data_cfg & PEBS_DATACFG_LBRS)
-   sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry);
+   sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry);
 
cpuc->pebs_record_size = sz;
 }
@@ -1595,10 +1595,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
}
 
if (format_size & PEBS_DATACFG_LBRS) {
-   struct pebs_lbr *lbr = next_record;
+   struct lbr_entry *lbr = next_record;
int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT)
& 0xff) + 1;
-   next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry);
+   next_record = next_record + num_lbr * sizeof(struct lbr_entry);
 
if (has_branch_stack(event)) {
intel_pmu_store_pebs_lbrs(lbr);
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 7742562..b8baaf1 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -372,11 +372,11 @@ void intel_pmu_lbr_restore(void *ctx)
mask = x86_pmu.lbr_nr - 1;
for (i = 0; i < task_ctx->valid_lbrs; i++) {
lbr_idx = (tos - i) & mask;
-   wrlbr_from(lbr_idx, task_ctx->lbr_from[i]);
-   wrlbr_to  (lbr_idx, task_ctx->lbr_to[i]);
+   wrlbr_from(lbr_idx, task_ctx->lbr[i].from);
+   wrlbr_to(lbr_idx, task_ctx->lbr[i].to);
 
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-   wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
+   wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info);
}
 
for (; i < x86_pmu.lbr_nr; i++) {
@@ -440,10 +440,10 @@ void intel_pmu_lbr_save(void *ctx)
from = rdlbr_from(lbr_idx);
if (!from)
break;
-   task_ctx->lbr_from[i] = from;
-   task_ctx->lbr_to[i]   = rdlbr_to(lbr_idx);
+   task_ctx->lbr[i].from = from;
+   task_ctx->lbr[i].to = rdlbr_to(lbr_idx);
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-   rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
+   rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info);
}
task_ctx->valid_lbrs = i;
task_ctx->tos = tos;
@@ -1179,7 +1179,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
}
 }
 
-void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr)
+void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr)
 {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int i;
@@ -1193,11 +1193,11 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr)
cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos();
 
for (i = 0; i < x86_pmu.lbr_nr; i++) {
-   u64 info = lbr->lbr[i].info;
+   u64 info = lbr[i].info;
struct perf_branch_entry *e = &cpuc->lbr_entries[i];
 
-   e->from = lbr->lbr[i].from;
-   e->to   = lbr->lbr[i].to;
+   e->from = lbr[i].from;
+   e->to   = lbr[i].to;
e->mispred  = !!(info & LBR_INFO_MISPRED);
e->predicted= !(info & LBR_INFO_MISPRED);
e->in_tx= !!(info & LBR_INFO_IN_TX);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index ba
