Re: [PATCH 3/3] perf stat: Add --per-numa aggregation support

2019-09-02 Thread Jiri Olsa
On Mon, Sep 02, 2019 at 12:43:29PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Mon, Sep 02, 2019 at 06:13:17PM +0300, Alexey Budankov escreveu:
> > 
> > On 02.09.2019 15:12, Jiri Olsa wrote:
> > > Adding new --per-numa option to aggregate counts per NUMA
> > > nodes for system-wide mode measurements.
> > > 
> > > You can specify --per-numa in live mode:
> > > 
> > >   # perf stat  -a -I 1000 -e cycles --per-numa
> > >   #   time numa   cpus counts unit events
> > 
> > It might be better to have 'node' instead of 'numa' in both the
> > option name ('--per-node') and the table header, like this:
> 
> Agreed

ok, will change

jirka

> 
> > 
> > #           time node   cpus             counts unit events
> >      1.000542550 0        20          6,202,097      cycles
> >      1.000542550 1        20            639,559      cycles
> >      2.002040063 0        20          7,412,495      cycles
> >      2.002040063 1        20          2,185,577      cycles
> >      3.003451699 0        20          6,508,917      cycles
> >      3.003451699 1        20            765,607      cycles
> >    ...
> > 
> > BR,
> > Alexey
> 
> -- 
> 
> - Arnaldo


Re: [PATCH 3/3] perf stat: Add --per-numa aggregation support

2019-09-02 Thread Arnaldo Carvalho de Melo
Em Mon, Sep 02, 2019 at 06:13:17PM +0300, Alexey Budankov escreveu:
> 
> On 02.09.2019 15:12, Jiri Olsa wrote:
> > Adding new --per-numa option to aggregate counts per NUMA
> > nodes for system-wide mode measurements.
> > 
> > You can specify --per-numa in live mode:
> > 
> >   # perf stat  -a -I 1000 -e cycles --per-numa
> >   #   time numa   cpus counts unit events
> 
> It might be better to have 'node' instead of 'numa' in both the
> option name ('--per-node') and the table header, like this:

Agreed

> 
> #           time node   cpus             counts unit events
>      1.000542550 0        20          6,202,097      cycles
>      1.000542550 1        20            639,559      cycles
>      2.002040063 0        20          7,412,495      cycles
>      2.002040063 1        20          2,185,577      cycles
>      3.003451699 0        20          6,508,917      cycles
>      3.003451699 1        20            765,607      cycles
>    ...
> 
> BR,
> Alexey

-- 

- Arnaldo


Re: [PATCH 3/3] perf stat: Add --per-numa aggregation support

2019-09-02 Thread Alexey Budankov


On 02.09.2019 15:12, Jiri Olsa wrote:
> Adding new --per-numa option to aggregate counts per NUMA
> nodes for system-wide mode measurements.
> 
> You can specify --per-numa in live mode:
> 
>   # perf stat  -a -I 1000 -e cycles --per-numa
>   #   time numa   cpus counts unit events

It might be better to have 'node' instead of 'numa' in both the
option name ('--per-node') and the table header, like this:

#           time node   cpus             counts unit events
     1.000542550 0        20          6,202,097      cycles
     1.000542550 1        20            639,559      cycles
     2.002040063 0        20          7,412,495      cycles
     2.002040063 1        20          2,185,577      cycles
     3.003451699 0        20          6,508,917      cycles
     3.003451699 1        20            765,607      cycles
   ...

BR,
Alexey


[PATCH 3/3] perf stat: Add --per-numa aggregation support

2019-09-02 Thread Jiri Olsa
Adding new --per-numa option to aggregate counts per NUMA
nodes for system-wide mode measurements.

You can specify --per-numa in live mode:

  # perf stat  -a -I 1000 -e cycles --per-numa
  #   time numa   cpus counts unit events
   1.000542550 N0   20  6,202,097  cycles
   1.000542550 N1   20639,559  cycles
   2.002040063 N0   20  7,412,495  cycles
   2.002040063 N1   20  2,185,577  cycles
   3.003451699 N0   20  6,508,917  cycles
   3.003451699 N1   20765,607  cycles
  ...

Or in the record/report stat session:

  # perf stat record -a -I 1000 -e cycles
  #   time counts unit events
   1.000536937 10,008,468  cycles
   2.002090152  9,578,539  cycles
   3.003625233  7,647,869  cycles
   4.005135036  7,032,086  cycles
  ^C 4.340902364  3,923,893  cycles

  # perf stat report --per-numa
  #   time numa   cpus counts unit events
   1.000536937 N0   20  9,355,086  cycles
   1.000536937 N1   20653,382  cycles
   2.002090152 N0   20  7,712,838  cycles
   2.002090152 N1   20  1,865,701  cycles
   3.003625233 N0   20  6,604,441  cycles
   3.003625233 N1   20  1,043,428  cycles
   4.005135036 N0   20  6,350,522  cycles
   4.005135036 N1   20681,564  cycles
   4.340902364 N0   20  3,403,188  cycles
   4.340902364 N1   20520,705  cycles

Link: http://lkml.kernel.org/n/tip-h57ftv8vmqrgzz3kdvlvh...@git.kernel.org
Signed-off-by: Jiri Olsa 
---
 tools/perf/Documentation/perf-stat.txt |  5 +++
 tools/perf/builtin-stat.c  | 52 ++
 tools/perf/util/cpumap.c   | 18 +
 tools/perf/util/cpumap.h   |  3 ++
 tools/perf/util/stat-display.c | 15 
 tools/perf/util/stat.c |  1 +
 tools/perf/util/stat.h |  1 +
 7 files changed, 95 insertions(+)

diff --git a/tools/perf/Documentation/perf-stat.txt 
b/tools/perf/Documentation/perf-stat.txt
index 930c51c01201..74299dc2ffd1 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -217,6 +217,11 @@ core number and the number of online logical processors on 
that physical process
 Aggregate counts per monitored threads, when monitoring threads (-t option)
 or processes (-p option).
 
+--per-numa::
+Aggregate counts per NUMA nodes for system-wide mode measurements. This
+is a useful mode to detect imbalance between NUMA nodes. To enable this
+mode, use --per-numa in addition to -a. (system-wide).
+
 -D msecs::
 --delay msecs::
 After starting the program, wait msecs before measuring. This is useful to
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 5bc0c570b7b6..5c30e9e3de19 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -790,6 +790,8 @@ static struct option stat_options[] = {
 		     "aggregate counts per physical processor core", AGGR_CORE),
 	OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode,
 		     "aggregate counts per thread", AGGR_THREAD),
+	OPT_SET_UINT(0, "per-numa", &stat_config.aggr_mode,
+		     "aggregate counts per numa node", AGGR_NUMA),
 	OPT_UINTEGER('D', "delay", &stat_config.initial_delay,
 		     "ms to wait before starting measurement after program start"),
 	OPT_CALLBACK_NOOPT(0, "metric-only", &stat_config.metric_only, NULL,
@@ -822,6 +824,12 @@ static int perf_stat__get_core(struct perf_stat_config 
*config __maybe_unused,
return cpu_map__get_core(map, cpu, NULL);
 }
 
+static int perf_stat__get_numa(struct perf_stat_config *config __maybe_unused,
+  struct perf_cpu_map *map, int cpu)
+{
+   return cpu_map__get_numa(map, cpu, NULL);
+}
+
 static int perf_stat__get_aggr(struct perf_stat_config *config,
   aggr_get_id_t get_id, struct perf_cpu_map *map, 
int idx)
 {
@@ -856,6 +864,12 @@ static int perf_stat__get_core_cached(struct 
perf_stat_config *config,
return perf_stat__get_aggr(config, perf_stat__get_core, map, idx);
 }
 
+static int perf_stat__get_numa_cached(struct perf_stat_config *config,
+ struct perf_cpu_map *map, int idx)
+{
+   return perf_stat__get_aggr(config, perf_stat__get_numa, map, idx);
+}
+
 static bool term_percore_set(void)
 {
struct evsel *counter;
@@ -894,6 +908,13 @@ static int perf_stat_init_aggr_mode(void)
}
stat_config.aggr_get_id = perf_stat__get_core_cached;
break;
+   case AGGR_NUMA:
+   if