[PATCH 4/5] MIPS: perf: Allocate per-core counters on demand

2018-04-03 Thread Matt Redfearn
Previously, when performance counters were per-core rather than
per-thread, the number available was divided by 2 on detection, and the
counters used by each thread in a core were "swizzled" to ensure
separation. However, this solution is suboptimal since it relies on a
couple of assumptions:
a) Always having 2 VPEs / core (number of counters was divided by 2)
b) Always having a number of counters implemented in the core that is
   divisible by 2. For instance if an SoC implementation had a single
   counter and 2 VPEs per core, then this logic would fail and no
   performance counters would be available.
The mechanism also does not allow for one VPE in a core using more than
its allocation of the per-core counters to count multiple events, even
though other VPEs may not be using them.

Fix this situation by instead allocating (and releasing) per-core
performance counters when they are requested. This approach removes the
above assumptions and fixes the shortcomings.

In order to do this:
Add additional logic to mipsxx_pmu_alloc_counter() to detect if a
sibling is using a per-core counter, and to allocate a per-core counter
in all sibling CPUs.
Similarly, add a mipsxx_pmu_free_counter() function to release a
per-core counter in all sibling CPUs when it is finished with.
A new spinlock, core_counters_lock, is introduced to ensure exclusivity
when allocating and releasing per-core counters.
Since counters are now allocated per-core on demand, rather than being
reserved per-thread at boot, all of the "swizzling" of counters is
removed.

The upshot is that in an SoC with 2 counters / thread, counters are
reported as:
Performance counters: mips/interAptiv PMU enabled, 2 32-bit counters
available to each CPU, irq 18

Running an instance of a test program on each of 2 threads in a
core, both threads can use their 2 counters to count 2 events:

taskset 4 perf stat -e instructions:u,branches:u ./test_prog & taskset 8
perf stat -e instructions:u,branches:u ./test_prog

 Performance counter stats for './test_prog':

 30002  instructions:u
 1  branches:u

   0.005164264 seconds time elapsed
 Performance counter stats for './test_prog':

 30002  instructions:u
 1  branches:u

   0.006139975 seconds time elapsed

In an SoC with 2 counters / core (which can be forced by setting
cpu_has_mipsmt_pertccounters = 0), counters are reported as:
Performance counters: mips/interAptiv PMU enabled, 2 32-bit counters
available to each core, irq 18

Running an instance of a test program on each of 2 threads in a
core, now only one thread manages to secure the performance counters to
count 2 events. The other thread does not get any counters.

taskset 4 perf stat -e instructions:u,branches:u ./test_prog & taskset 8
perf stat -e instructions:u,branches:u ./test_prog

 Performance counter stats for './test_prog':

instructions:u
branches:u

   0.005179533 seconds time elapsed

 Performance counter stats for './test_prog':

 30002  instructions:u
 1  branches:u

   0.005179467 seconds time elapsed

Signed-off-by: Matt Redfearn 
---

 arch/mips/kernel/perf_event_mipsxx.c | 130 ---
 1 file changed, 88 insertions(+), 42 deletions(-)

diff --git a/arch/mips/kernel/perf_event_mipsxx.c 
b/arch/mips/kernel/perf_event_mipsxx.c
index 782a1b6c6352..bedb0d5ff3f2 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -131,6 +131,8 @@ static struct mips_pmu mipspmu;
 #ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
 static int cpu_has_mipsmt_pertccounters;
 
+static DEFINE_SPINLOCK(core_counters_lock);
+
 static DEFINE_RWLOCK(pmuint_rwlock);
 
 #if defined(CONFIG_CPU_BMIPS5000)
@@ -141,20 +143,6 @@ static DEFINE_RWLOCK(pmuint_rwlock);
 0 : cpu_vpe_id(&current_cpu_data))
 #endif
 
-/* Copied from op_model_mipsxx.c */
-static unsigned int vpe_shift(void)
-{
-   if (num_possible_cpus() > 1)
-   return 1;
-
-   return 0;
-}
-
-static unsigned int counters_total_to_per_cpu(unsigned int counters)
-{
-   return counters >> vpe_shift();
-}
-
 #else /* !CONFIG_MIPS_PERF_SHARED_TC_COUNTERS */
 #define vpe_id()   0
 
@@ -165,17 +153,8 @@ static void pause_local_counters(void);
 static irqreturn_t mipsxx_pmu_handle_irq(int, void *);
 static int mipsxx_pmu_handle_shared_irq(void);
 
-static unsigned int mipsxx_pmu_swizzle_perf_idx(unsigned int idx)
-{
-   if (vpe_id() == 1)
-   idx = (idx + 2) & 3;
-   return idx;
-}
-
 static u64 mipsxx_pmu_read_counter(unsigned int idx)
 {
-   idx = mipsxx_pmu_swizzle_perf_idx(idx);
-
switch (idx) {
case 0:
/*
@@ -197,8 +176,6 @@ static u64 mipsxx_pmu_read_counter(unsigned int idx)
 
 static u64 mipsxx_pmu_read_counter_64(unsigned int idx)
 {
-   idx = mipsxx_pmu_swizzle_perf_idx(idx);
-
switch (idx) {
  

Re: [PATCH 4/5] MIPS: perf: Allocate per-core counters on demand

2018-04-03 Thread kbuild test robot
Hi Matt,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/perf/core]
[also build test ERROR on v4.16 next-20180403]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Matt-Redfearn/MIPS-perf-MT-fixes-and-improvements/20180404-011026
config: mips-gpr_defconfig (attached as .config)
compiler: mipsel-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=mips 

All error/warnings (new ones prefixed by >>):

   In file included from include/linux/kernel.h:14:0,
from include/linux/cpumask.h:10,
from arch/mips/kernel/perf_event_mipsxx.c:18:
   arch/mips/kernel/perf_event_mipsxx.c: In function 'mipsxx_pmu_free_counter':
>> arch/mips/kernel/perf_event_mipsxx.c:365:42: error: 'cpu' undeclared (first 
>> use in this function)
 pr_debug("CPU%d released counter %d\n", cpu, hwc->idx);
 ^
   include/linux/printk.h:136:17: note: in definition of macro 'no_printk'
  printk(fmt, ##__VA_ARGS__);  \
^~~
>> arch/mips/kernel/perf_event_mipsxx.c:365:2: note: in expansion of macro 
>> 'pr_debug'
 pr_debug("CPU%d released counter %d\n", cpu, hwc->idx);
 ^~~~
   arch/mips/kernel/perf_event_mipsxx.c:365:42: note: each undeclared 
identifier is reported only once for each function it appears in
 pr_debug("CPU%d released counter %d\n", cpu, hwc->idx);
 ^
   include/linux/printk.h:136:17: note: in definition of macro 'no_printk'
  printk(fmt, ##__VA_ARGS__);  \
^~~
>> arch/mips/kernel/perf_event_mipsxx.c:365:2: note: in expansion of macro 
>> 'pr_debug'
 pr_debug("CPU%d released counter %d\n", cpu, hwc->idx);
 ^~~~
   arch/mips/kernel/perf_event_mipsxx.c: In function 'mipsxx_pmu_enable_event':
   arch/mips/kernel/perf_event_mipsxx.c:386:22: error: expected expression 
before ')' token
 } else if (range > V) {
 ^
   arch/mips/kernel/perf_event_mipsxx.c: In function 
'mipspmu_perf_event_encode':
   arch/mips/kernel/perf_event_mipsxx.c:718:28: error: 'const struct 
mips_perf_event' has no member named 'range'
  return ((unsigned int)pev->range << 24) |
   ^~

vim +/cpu +365 arch/mips/kernel/perf_event_mipsxx.c

   339  
   340  static void mipsxx_pmu_free_counter(struct cpu_hw_events *cpuc,
   341  struct hw_perf_event *hwc)
   342  {
   343  #ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
   344  int sibling_cpu, cpu = smp_processor_id();
   345  
   346  /* When counters are per-core, free them in all sibling CPUs */
   347  if (!cpu_has_mipsmt_pertccounters) {
   348  struct cpu_hw_events *sibling_cpuc;
   349  unsigned long flags;
   350  
   351  spin_lock_irqsave(&core_counters_lock, flags);
   352  
   353  for_each_cpu(sibling_cpu, &cpu_sibling_map[cpu]) {
   354  sibling_cpuc = per_cpu_ptr(&cpu_hw_events, 
sibling_cpu);
   355  
   356  clear_bit(hwc->idx, sibling_cpuc->used_mask);
   357  pr_debug("CPU%d released core counter %d\n",
   358   sibling_cpu, hwc->idx);
   359  }
   360  
   361  spin_unlock_irqrestore(&core_counters_lock, flags);
   362  return;
   363  }
   364  #endif
 > 365  pr_debug("CPU%d released counter %d\n", cpu, hwc->idx);
   366  clear_bit(hwc->idx, cpuc->used_mask);
   367  }
   368  

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip