[RESEND PATCH v5 4/4] perf: Update .gitignore file

2021-04-19 Thread alexander . antonov
From: Alexander Antonov 

After a "make -C tools/perf", git reports the following untracked file:
perf-iostat

Add this generated file to perf's .gitignore file.

Acked-by: Namhyung Kim 
Signed-off-by: Alexander Antonov 
---
 tools/perf/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index f3f84781fd74..e555e9729758 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -20,6 +20,7 @@ perf.data.old
 output.svg
 perf-archive
 perf-with-kcore
+perf-iostat
 tags
 TAGS
 cscope*
-- 
2.21.3



[RESEND PATCH v5 1/4] perf stat: Basic support for iostat in perf

2021-04-19 Thread alexander . antonov
From: Alexander Antonov 

Add basic flow for a new iostat mode in perf. Mode is intended to
provide four I/O performance metrics per each PCIe root port: Inbound Read,
Inbound Write, Outbound Read, Outbound Write.

The actual code to compute the metrics and attribute it to
root port is in follow-on patches.

Acked-by: Namhyung Kim 
Signed-off-by: Alexander Antonov 
---
 tools/perf/builtin-stat.c  | 21 +-
 tools/perf/util/Build  |  1 +
 tools/perf/util/iostat.c   | 53 ++
 tools/perf/util/iostat.h   | 47 ++
 tools/perf/util/stat-display.c | 40 ++---
 tools/perf/util/stat-shadow.c  |  5 +++-
 tools/perf/util/stat.h |  1 +
 7 files changed, 156 insertions(+), 12 deletions(-)
 create mode 100644 tools/perf/util/iostat.c
 create mode 100644 tools/perf/util/iostat.h

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 2a2c15cac80a..ba5b31aab86b 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -68,6 +68,7 @@
 #include "util/affinity.h"
 #include "util/pfm.h"
 #include "util/bpf_counter.h"
+#include "util/iostat.h"
 #include "asm/bug.h"
 
 #include 
@@ -212,7 +213,8 @@ static struct perf_stat_config stat_config = {
.walltime_nsecs_stats   = &walltime_nsecs_stats,
.big_num= true,
.ctl_fd = -1,
-   .ctl_fd_ack = -1
+   .ctl_fd_ack = -1,
+   .iostat_run = false,
 };
 
 static bool cpus_map_matched(struct evsel *a, struct evsel *b)
@@ -1268,6 +1270,9 @@ static struct option stat_options[] = {
 "\t\t\t  Optionally send control command completion 
('ack\\n') to ack-fd descriptor.\n"
 "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened 
and used as ctl-fd / ack-fd.",
  parse_control_option),
+   OPT_CALLBACK_OPTARG(0, "iostat", &evsel_list, &stat_config, "default",
+   "measure I/O performance metrics provided by 
arch/platform",
+   iostat_parse),
OPT_END()
 };
 
@@ -2341,6 +2346,17 @@ int cmd_stat(int argc, const char **argv)
goto out;
}
 
+   if (stat_config.iostat_run) {
+   status = iostat_prepare(evsel_list, &stat_config);
+   if (status)
+   goto out;
+   if (iostat_mode == IOSTAT_LIST) {
+   iostat_list(evsel_list, &stat_config);
+   goto out;
+   } else if (verbose)
+   iostat_list(evsel_list, &stat_config);
+   }
+
if (add_default_attributes())
goto out;
 
@@ -2516,6 +2532,9 @@ int cmd_stat(int argc, const char **argv)
perf_stat__exit_aggr_mode();
evlist__free_stats(evsel_list);
 out:
+   if (stat_config.iostat_run)
+   iostat_release(evsel_list);
+
zfree(&stat_config.walltime_run);
 
if (smi_cost && smi_reset)
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index e3e12f9d4733..7dd815712d60 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -102,6 +102,7 @@ perf-y += rwsem.o
 perf-y += thread-stack.o
 perf-y += spark.o
 perf-y += topdown.o
+perf-y += iostat.o
 perf-y += stream.o
 perf-$(CONFIG_AUXTRACE) += auxtrace.o
 perf-$(CONFIG_AUXTRACE) += intel-pt-decoder/
diff --git a/tools/perf/util/iostat.c b/tools/perf/util/iostat.c
new file mode 100644
index ..57dd49da28fe
--- /dev/null
+++ b/tools/perf/util/iostat.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "util/iostat.h"
+#include "util/debug.h"
+
+enum iostat_mode_t iostat_mode = IOSTAT_NONE;
+
+__weak int iostat_prepare(struct evlist *evlist __maybe_unused,
+ struct perf_stat_config *config __maybe_unused)
+{
+   return -1;
+}
+
+__weak int iostat_parse(const struct option *opt __maybe_unused,
+const char *str __maybe_unused,
+int unset __maybe_unused)
+{
+   pr_err("iostat mode is not supported on current platform\n");
+   return -1;
+}
+
+__weak void iostat_list(struct evlist *evlist __maybe_unused,
+  struct perf_stat_config *config __maybe_unused)
+{
+}
+
+__weak void iostat_release(struct evlist *evlist __maybe_unused)
+{
+}
+
+__weak void iostat_print_header_prefix(struct perf_stat_config *config 
__maybe_unused)
+{
+}
+
+__weak void iostat_print_metric(struct perf_stat_config *config __maybe_unused,
+   struct evsel *evsel __maybe_unused,
+   struct perf_stat_output_ctx *out __maybe_unused)
+{
+}
+
+__weak void iostat_prefix(struct evlist *evlist _

[RESEND PATCH v5 2/4] perf stat: Helper functions for PCIe root ports list in iostat mode

2021-04-19 Thread alexander . antonov
From: Alexander Antonov 

Introduce helper functions to control PCIe root ports list.
These helpers will be used in the follow-up patch.

Acked-by: Namhyung Kim 
Signed-off-by: Alexander Antonov 
---
 tools/perf/arch/x86/util/iostat.c | 110 ++
 1 file changed, 110 insertions(+)
 create mode 100644 tools/perf/arch/x86/util/iostat.c

diff --git a/tools/perf/arch/x86/util/iostat.c 
b/tools/perf/arch/x86/util/iostat.c
new file mode 100644
index ..c4471f8efa5e
--- /dev/null
+++ b/tools/perf/arch/x86/util/iostat.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * perf iostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "util/cpumap.h"
+#include "util/debug.h"
+#include "util/iostat.h"
+#include "util/counts.h"
+#include "path.h"
+
+struct iio_root_port {
+   u32 domain;
+   u8 bus;
+   u8 die;
+   u8 pmu_idx;
+   int idx;
+};
+
+struct iio_root_ports_list {
+   struct iio_root_port **rps;
+   int nr_entries;
+};
+
+static void iio_root_port_show(FILE *output,
+  const struct iio_root_port * const rp)
+{
+   if (output && rp)
+   fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n",
+   rp->die, rp->pmu_idx, rp->domain, rp->bus);
+}
+
+static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus,
+  u8 die, u8 pmu_idx)
+{
+   struct iio_root_port *p = calloc(1, sizeof(*p));
+
+   if (p) {
+   p->domain = domain;
+   p->bus = bus;
+   p->die = die;
+   p->pmu_idx = pmu_idx;
+   }
+   return p;
+}
+
+static void iio_root_ports_list_free(struct iio_root_ports_list *list)
+{
+   int idx;
+
+   if (list) {
+   for (idx = 0; idx < list->nr_entries; idx++)
+   free(list->rps[idx]);
+   free(list->rps);
+   free(list);
+   }
+}
+
+static struct iio_root_port *iio_root_port_find_by_notation(
+   const struct iio_root_ports_list * const list, u32 domain, u8 bus)
+{
+   int idx;
+   struct iio_root_port *rp;
+
+   if (list) {
+   for (idx = 0; idx < list->nr_entries; idx++) {
+   rp = list->rps[idx];
+   if (rp && rp->domain == domain && rp->bus == bus)
+   return rp;
+   }
+   }
+   return NULL;
+}
+
+static int iio_root_ports_list_insert(struct iio_root_ports_list *list,
+ struct iio_root_port * const rp)
+{
+   struct iio_root_port **tmp_buf;
+
+   if (list && rp) {
+   rp->idx = list->nr_entries++;
+   tmp_buf = realloc(list->rps,
+ list->nr_entries * sizeof(*list->rps));
+   if (!tmp_buf) {
+   pr_err("Failed to realloc memory\n");
+   return -ENOMEM;
+   }
+   tmp_buf[rp->idx] = rp;
+   list->rps = tmp_buf;
+   }
+   return 0;
+}
-- 
2.21.3



[RESEND PATCH v5 3/4] perf stat: Enable iostat mode for x86 platforms

2021-04-19 Thread alexander . antonov
From: Alexander Antonov 

This functionality is based on recently introduced sysfs attributes
for Intel® Xeon® Scalable processor family (code name Skylake-SP):
Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to
IIO PMON mapping")

Mode is intended to provide four I/O performance metrics in MB per each
PCIe root port:
 - Inbound Read: I/O devices below root port read from the host memory
 - Inbound Write: I/O devices below root port write to the host memory
 - Outbound Read: CPU reads from I/O devices below root port
 - Outbound Write: CPU writes to I/O devices below root port

Each metric requires only one uncore event which increments at every 4B
transfer in corresponding direction. The formulas to compute metrics
are generic:
#EventCount * 4B / (1024 * 1024)

Acked-by: Namhyung Kim 
Signed-off-by: Alexander Antonov 
---
 tools/perf/Documentation/perf-iostat.txt |  88 ++
 tools/perf/Makefile.perf |   5 +-
 tools/perf/arch/x86/util/Build   |   1 +
 tools/perf/arch/x86/util/iostat.c| 360 +++
 tools/perf/command-list.txt  |   1 +
 tools/perf/perf-iostat.sh|  12 +
 6 files changed, 466 insertions(+), 1 deletion(-)
 create mode 100644 tools/perf/Documentation/perf-iostat.txt
 create mode 100644 tools/perf/perf-iostat.sh

diff --git a/tools/perf/Documentation/perf-iostat.txt 
b/tools/perf/Documentation/perf-iostat.txt
new file mode 100644
index ..165176944031
--- /dev/null
+++ b/tools/perf/Documentation/perf-iostat.txt
@@ -0,0 +1,88 @@
+perf-iostat(1)
+===
+
+NAME
+
+perf-iostat - Show I/O performance metrics
+
+SYNOPSIS
+
+[verse]
+'perf iostat' list
+'perf iostat' <ports> -- <command> [<options>]
+
+DESCRIPTION
+---
+Mode is intended to provide four I/O performance metrics per each PCIe root 
port:
+
+- Inbound Read   - I/O devices below root port read from the host memory, in MB
+
+- Inbound Write  - I/O devices below root port write to the host memory, in MB
+
+- Outbound Read  - CPU reads from I/O devices below root port, in MB
+
+- Outbound Write - CPU writes to I/O devices below root port, in MB
+
+OPTIONS
+---
+<command>...::
+   Any command you can specify in a shell.
+
+list::
+   List all PCIe root ports.
+
+<ports>::
+   Select the root ports for monitoring. Comma-separated list is supported.
+
+EXAMPLES
+
+
+1. List all PCIe root ports (example for 2-S platform):
+
+   $ perf iostat list
+   S0-uncore_iio_0<:00>
+   S1-uncore_iio_0<:80>
+   S0-uncore_iio_1<:17>
+   S1-uncore_iio_1<:85>
+   S0-uncore_iio_2<:3a>
+   S1-uncore_iio_2<:ae>
+   S0-uncore_iio_3<:5d>
+   S1-uncore_iio_3<:d7>
+
+2. Collect metrics for all PCIe root ports:
+
+   $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s
+
+Performance counter stats for 'system wide':
+
+  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB)
+   :00102  
  3
+   :80000  
  0
+   :17   352552   430  
 21
+   :85000  
  0
+   :3a300  
  0
+   :ae000  
  0
+   :5d000  
  0
+   :d7000  
  0
+
+3. Collect metrics for comma-separated list of PCIe root ports:
+
+   $ perf iostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M 
oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s
+
+Performance counter stats for 'system wide':
+
+  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB)
+   :17   358559   440  
 22
+   :3a320  
  0
+
+197.081983474 seconds time elapsed
+
+SEE ALSO
+
+linkperf:perf-stat[1]
\ No newline at end of file
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 090fb9d62665..6240fbb1646e 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -283,6 +283,7 @@ SCRIPT_SH =
 
 SCRIPT_SH += perf-archive.sh
 SCRIPT_SH += perf-with-kcore.sh
+SCRIPT_SH 

[RESEND PATCH v5 0/4] perf stat: Introduce iostat mode to provide I/O performance metrics

2021-04-19 Thread alexander . antonov
From: Alexander Antonov 

Resending V5 with added Acked-by: Namhyung Kim  tag.

Thanks,
Alexander

The previous version can be found at:
v4: 
https://lkml.kernel.org/r/20210203135830.38568-1-alexander.anto...@linux.intel.com/
Changes in this revision are:
v4 -> v5:
- Addressed comments from Namhyung Kim:
  1. Removed AGGR_PCIE_PORT aggregation mode
  2. Added iostat_prepare() function
  3. Moved implementation specific fprintf() calls to separate x86-related 
function
  4. Fixed code-related issues
- Moved __weak iostat's functions to separate util/iostat.c file

The previous version can be found at:
v3: 
https://lkml.kernel.org/r/20210126080619.30275-1-alexander.anto...@linux.intel.com/
Changes in this revision are:
v3 -> v4:
- Addressed comment from Namhyung Kim:
  1. Removed NULL-termination of root ports list

The previous version can be found at:
v2: 
https://lkml.kernel.org/r/20201223130320.3930-1-alexander.anto...@linux.intel.com

Changes in this revision are:
v2 -> v3:
- Addressed comments from Namhyung Kim:
  1. Removed perf_device pointer from evsel structure. Use priv field instead
  2. Renamed 'iiostat' to 'iostat'
  3. Renamed 'show' mode to 'list' mode
  4. Renamed iiostat_delete_root_ports() to iiostat_release() and
 iostat_show_root_ports() to iostat_list()

The previous version can be found at:
v1: 
https://lkml.kernel.org/r/20201210090340.14358-1-alexander.anto...@linux.intel.com

Changes in this revision are:
v1 -> v2:
- Addressed comment from Arnaldo Carvalho de Melo:
  1. Using 'perf iiostat' subcommand instead of 'perf stat --iiostat':
- Added perf-iiostat.sh script to use short command
- Updated manual pages to get help for 'perf iiostat'
- Added 'perf-iiostat' to perf's gitignore file

Mode is intended to provide four I/O performance metrics in MB per each
root port:
 - Inbound Read:   I/O devices below root port read from the host memory
 - Inbound Write:  I/O devices below root port write to the host memory
 - Outbound Read:  CPU reads from I/O devices below root port
 - Outbound Write: CPU writes to I/O devices below root port

Each metric requires only one uncore event which increments at every 4B
transfer in corresponding direction. The formulas to compute metrics
are generic:
#EventCount * 4B / (1024 * 1024)

Note: iostat introduces new perf data aggregation mode - per PCIe root port
hence -e and -M options are not supported.

Usage examples:

1. List all PCIe root ports (example for 2-S platform):
   $ perf iostat list
   S0-uncore_iio_0<:00>
   S1-uncore_iio_0<:80>
   S0-uncore_iio_1<:17>
   S1-uncore_iio_1<:85>
   S0-uncore_iio_2<:3a>
   S1-uncore_iio_2<:ae>
   S0-uncore_iio_3<:5d>
   S1-uncore_iio_3<:d7>

2. Collect metrics for all PCIe root ports:
   $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
   357708+0 records in
   357707+0 records out
   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s

Performance counter stats for 'system wide':

  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB) 
   :00102   
 3 
   :80000   
 0 
   :17   352552   430   
21 
   :85000   
 0 
   :3a300   
 0 
   :ae000   
 0 
   :5d000   
 0 
   :d7000   
 0

3. Collect metrics for comma separated list of PCIe root ports:
   $ perf iostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M 
oflag=direct
   357708+0 records in
   357707+0 records out
   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s

Performance counter stats for 'system wide':

  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB) 
   :17   358559   44    0   
    22 
   :3a320   
 0 

197.081983474 seconds time elapsed

Alexander Antonov (4):
  perf stat: Basic support for iostat in perf
  perf stat: Helper functions for PCIe root ports list in iostat mode
  perf stat: Enable iostat mode for x86 platforms
  perf: Update .gitignore file

 tools/perf/.gitignore|   

[tip: perf/core] perf/x86/intel/uncore: Enable IIO stacks to PMON mapping for multi-segment SKX

2021-04-02 Thread tip-bot2 for Alexander Antonov
The following commit has been merged into the perf/core branch of tip:

Commit-ID: cface0326a6c2ae5c8f47bd466f07624b3e348a7
Gitweb:
https://git.kernel.org/tip/cface0326a6c2ae5c8f47bd466f07624b3e348a7
Author:Alexander Antonov 
AuthorDate:Tue, 23 Mar 2021 18:05:07 +03:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 02 Apr 2021 10:04:55 +02:00

perf/x86/intel/uncore: Enable IIO stacks to PMON mapping for multi-segment SKX

IIO stacks to PMON mapping on Skylake servers is exposed through the previously
introduced attributes /sys/devices/uncore_iio_<pmu_idx>/dieX, where dieX is a
file which holds "Segment:Root Bus" for PCIe root port which can
be monitored by that IIO PMON block. These sysfs attributes are disabled
for multiple segment topologies except VMD domains which start at 0x1.
This patch removes the limitation and enables IIO stacks to PMON mapping
for multi-segment Skylake servers by introducing segment-aware
intel_uncore_topology structure and attributing the topology configuration
to the segment in skx_iio_get_topology() function.

Reported-by: kernel test robot 
Signed-off-by: Alexander Antonov 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Kan Liang 
Reviewed-by: Andi Kleen 
Tested-by: Kyle Meyer 
Link: 
https://lkml.kernel.org/r/20210323150507.2013-1-alexander.anto...@linux.intel.com
---
 arch/x86/events/intel/uncore.c   | 12 +-
 arch/x86/events/intel/uncore.h   |  9 +++-
 arch/x86/events/intel/uncore_snbep.c | 60 ---
 3 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 35b3470..a2b68bb 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -53,6 +53,18 @@ int uncore_pcibus_to_dieid(struct pci_bus *bus)
return die_id;
 }
 
+int uncore_die_to_segment(int die)
+{
+   struct pci_bus *bus = NULL;
+
+   /* Find first pci bus which attributes to specified die. */
+   while ((bus = pci_find_next_bus(bus)) &&
+  (die != uncore_pcibus_to_dieid(bus)))
+   ;
+
+   return bus ? pci_domain_nr(bus) : -EINVAL;
+}
+
 static void uncore_free_pcibus_map(void)
 {
struct pci2phy_map *map, *tmp;
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 549cfb2..96569dc 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -42,6 +42,7 @@ struct intel_uncore_pmu;
 struct intel_uncore_box;
 struct uncore_event_desc;
 struct freerunning_counters;
+struct intel_uncore_topology;
 
 struct intel_uncore_type {
const char *name;
@@ -87,7 +88,7 @@ struct intel_uncore_type {
 * to identify which platform component each PMON block of that type is
 * supposed to monitor.
 */
-   u64 *topology;
+   struct intel_uncore_topology *topology;
/*
 * Optional callbacks for managing mapping of Uncore units to PMONs
 */
@@ -176,6 +177,11 @@ struct freerunning_counters {
unsigned *box_offsets;
 };
 
+struct intel_uncore_topology {
+   u64 configuration;
+   int segment;
+};
+
 struct pci2phy_map {
struct list_head list;
int segment;
@@ -184,6 +190,7 @@ struct pci2phy_map {
 
 struct pci2phy_map *__find_pci2phy_map(int segment);
 int uncore_pcibus_to_dieid(struct pci_bus *bus);
+int uncore_die_to_segment(int die);
 
 ssize_t uncore_event_show(struct device *dev,
  struct device_attribute *attr, char *buf);
diff --git a/arch/x86/events/intel/uncore_snbep.c 
b/arch/x86/events/intel/uncore_snbep.c
index b79951d..acc3c0e 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3684,7 +3684,8 @@ static struct intel_uncore_ops skx_uncore_iio_ops = {
 
 static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die)
 {
-   return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE);
+   return pmu->type->topology[die].configuration >>
+  (pmu->pmu_idx * BUS_NUM_STRIDE);
 }
 
 static umode_t
@@ -3697,19 +3698,14 @@ skx_iio_mapping_visible(struct kobject *kobj, struct 
attribute *attr, int die)
 }
 
 static ssize_t skx_iio_mapping_show(struct device *dev,
-   struct device_attribute *attr, char *buf)
+   struct device_attribute *attr, char *buf)
 {
-   struct pci_bus *bus = pci_find_next_bus(NULL);
-   struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev);
+   struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev);
struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
long die = (long)ea->var;
 
-   /*
-* Current implementation is for single segment configuration hence it's
-* safe to take the segment value from the first available root bus.
-*/
-   

[PATCH v5 4/4] perf: Update .gitignore file

2021-03-24 Thread Alexander Antonov
After a "make -C tools/perf", git reports the following untracked file:
perf-iostat

Add this generated file to perf's .gitignore file.

Signed-off-by: Alexander Antonov 
---
 tools/perf/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index f3f84781fd74..e555e9729758 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -20,6 +20,7 @@ perf.data.old
 output.svg
 perf-archive
 perf-with-kcore
+perf-iostat
 tags
 TAGS
 cscope*
-- 
2.19.1



[PATCH v5 3/4] perf stat: Enable iostat mode for x86 platforms

2021-03-24 Thread Alexander Antonov
This functionality is based on recently introduced sysfs attributes
for Intel® Xeon® Scalable processor family (code name Skylake-SP):
Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to
IIO PMON mapping")

Mode is intended to provide four I/O performance metrics in MB per each
PCIe root port:
 - Inbound Read: I/O devices below root port read from the host memory
 - Inbound Write: I/O devices below root port write to the host memory
 - Outbound Read: CPU reads from I/O devices below root port
 - Outbound Write: CPU writes to I/O devices below root port

Each metric requires only one uncore event which increments at every 4B
transfer in corresponding direction. The formulas to compute metrics
are generic:
#EventCount * 4B / (1024 * 1024)

Signed-off-by: Alexander Antonov 
---
 tools/perf/Documentation/perf-iostat.txt |  88 ++
 tools/perf/Makefile.perf |   5 +-
 tools/perf/arch/x86/util/Build   |   1 +
 tools/perf/arch/x86/util/iostat.c| 360 +++
 tools/perf/command-list.txt  |   1 +
 tools/perf/perf-iostat.sh|  12 +
 6 files changed, 466 insertions(+), 1 deletion(-)
 create mode 100644 tools/perf/Documentation/perf-iostat.txt
 create mode 100644 tools/perf/perf-iostat.sh

diff --git a/tools/perf/Documentation/perf-iostat.txt 
b/tools/perf/Documentation/perf-iostat.txt
new file mode 100644
index ..165176944031
--- /dev/null
+++ b/tools/perf/Documentation/perf-iostat.txt
@@ -0,0 +1,88 @@
+perf-iostat(1)
+===
+
+NAME
+
+perf-iostat - Show I/O performance metrics
+
+SYNOPSIS
+
+[verse]
+'perf iostat' list
+'perf iostat' <ports> -- <command> [<options>]
+
+DESCRIPTION
+---
+Mode is intended to provide four I/O performance metrics per each PCIe root 
port:
+
+- Inbound Read   - I/O devices below root port read from the host memory, in MB
+
+- Inbound Write  - I/O devices below root port write to the host memory, in MB
+
+- Outbound Read  - CPU reads from I/O devices below root port, in MB
+
+- Outbound Write - CPU writes to I/O devices below root port, in MB
+
+OPTIONS
+---
+<command>...::
+   Any command you can specify in a shell.
+
+list::
+   List all PCIe root ports.
+
+<ports>::
+   Select the root ports for monitoring. Comma-separated list is supported.
+
+EXAMPLES
+
+
+1. List all PCIe root ports (example for 2-S platform):
+
+   $ perf iostat list
+   S0-uncore_iio_0<:00>
+   S1-uncore_iio_0<:80>
+   S0-uncore_iio_1<:17>
+   S1-uncore_iio_1<:85>
+   S0-uncore_iio_2<:3a>
+   S1-uncore_iio_2<:ae>
+   S0-uncore_iio_3<:5d>
+   S1-uncore_iio_3<:d7>
+
+2. Collect metrics for all PCIe root ports:
+
+   $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s
+
+Performance counter stats for 'system wide':
+
+  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB)
+   :00102  
  3
+   :80000  
  0
+   :17   352552   430  
 21
+   :85000  
  0
+   :3a300  
  0
+   :ae000  
  0
+   :5d000  
  0
+   :d7000  
  0
+
+3. Collect metrics for comma-separated list of PCIe root ports:
+
+   $ perf iostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M 
oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s
+
+Performance counter stats for 'system wide':
+
+  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB)
+   :17   358559   440  
 22
+   :3a320  
  0
+
+197.081983474 seconds time elapsed
+
+SEE ALSO
+
+linkperf:perf-stat[1]
\ No newline at end of file
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index a7d768fdc8a1..3b3a452f4862 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -283,6 +283,7 @@ SCRIPT_SH =
 
 SCRIPT_SH += perf-archive.sh
 SCRIPT_SH += perf-with-kcore.sh
+SCRIPT_SH += perf-iostat.sh
 
 grep-libs = $(filter -l%,$(1))

[PATCH v5 0/4] perf stat: Introduce iostat mode to provide I/O performance metrics

2021-03-24 Thread Alexander Antonov
The previous version can be found at:
v4: 
https://lkml.kernel.org/r/20210203135830.38568-1-alexander.anto...@linux.intel.com/
Changes in this revision are:
v4 -> v5:
- Addressed comments from Namhyung Kim:
  1. Removed AGGR_PCIE_PORT aggregation mode
  2. Added iostat_prepare() function
  3. Moved implementation specific fprintf() calls to separate x86-related 
function
  4. Fixed code-related issues
- Moved __weak iostat's functions to separate util/iostat.c file

The previous version can be found at:
v3: 
https://lkml.kernel.org/r/20210126080619.30275-1-alexander.anto...@linux.intel.com/
Changes in this revision are:
v3 -> v4:
- Addressed comment from Namhyung Kim:
  1. Removed NULL-termination of root ports list

The previous version can be found at:
v2: 
https://lkml.kernel.org/r/20201223130320.3930-1-alexander.anto...@linux.intel.com

Changes in this revision are:
v2 -> v3:
- Addressed comments from Namhyung Kim:
  1. Removed perf_device pointer from evsel structure. Use priv field instead
  2. Renamed 'iiostat' to 'iostat'
  3. Renamed 'show' mode to 'list' mode
  4. Renamed iiostat_delete_root_ports() to iiostat_release() and
 iostat_show_root_ports() to iostat_list()

The previous version can be found at:
v1: 
https://lkml.kernel.org/r/20201210090340.14358-1-alexander.anto...@linux.intel.com

Changes in this revision are:
v1 -> v2:
- Addressed comment from Arnaldo Carvalho de Melo:
  1. Using 'perf iiostat' subcommand instead of 'perf stat --iiostat':
- Added perf-iiostat.sh script to use short command
- Updated manual pages to get help for 'perf iiostat'
- Added 'perf-iiostat' to perf's gitignore file

Mode is intended to provide four I/O performance metrics in MB per each
root port:
 - Inbound Read:   I/O devices below root port read from the host memory
 - Inbound Write:  I/O devices below root port write to the host memory
 - Outbound Read:  CPU reads from I/O devices below root port
 - Outbound Write: CPU writes to I/O devices below root port

Each metric requires only one uncore event which increments at every 4B
transfer in corresponding direction. The formulas to compute metrics
are generic:
#EventCount * 4B / (1024 * 1024)

Note: iostat introduces new perf data aggregation mode - per PCIe root port
hence -e and -M options are not supported.

Usage examples:

1. List all PCIe root ports (example for 2-S platform):
   $ perf iostat list
   S0-uncore_iio_0<:00>
   S1-uncore_iio_0<:80>
   S0-uncore_iio_1<:17>
   S1-uncore_iio_1<:85>
   S0-uncore_iio_2<:3a>
   S1-uncore_iio_2<:ae>
   S0-uncore_iio_3<:5d>
   S1-uncore_iio_3<:d7>

2. Collect metrics for all PCIe root ports:
   $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
   357708+0 records in
   357707+0 records out
   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s

Performance counter stats for 'system wide':

  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB) 
   :00102   
 3 
   :80000   
 0 
   :17   352552   430   
21 
   :85000   
 0 
   :3a300   
 0 
   :ae000   
 0 
   :5d000   
 0 
   :d7000   
 0

3. Collect metrics for comma separated list of PCIe root ports:
   $ perf iostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M 
oflag=direct
   357708+0 records in
   357707+0 records out
   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s

Performance counter stats for 'system wide':

  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB) 
   :17   358559   44    0   
    22 
   :3a320   
 0 

197.081983474 seconds time elapsed

Alexander Antonov (4):
  perf stat: Basic support for iostat in perf
  perf stat: Helper functions for PCIe root ports list in iostat mode
  perf stat: Enable iostat mode for x86 platforms
  perf: Update .gitignore file

 tools/perf/.gitignore|   1 +
 tools/perf/Documentation/perf-iostat.txt |  88 +
 tools/perf/Makefile.perf

[PATCH v5 1/4] perf stat: Basic support for iostat in perf

2021-03-24 Thread Alexander Antonov
Add basic flow for a new iostat mode in perf. Mode is intended to
provide four I/O performance metrics per each PCIe root port: Inbound Read,
Inbound Write, Outbound Read, Outbound Write.

The actual code to compute the metrics and attribute it to
root port is in follow-on patches.

Signed-off-by: Alexander Antonov 
---
 tools/perf/builtin-stat.c  | 21 +-
 tools/perf/util/Build  |  1 +
 tools/perf/util/iostat.c   | 53 ++
 tools/perf/util/iostat.h   | 47 ++
 tools/perf/util/stat-display.c | 40 ++---
 tools/perf/util/stat-shadow.c  |  5 +++-
 tools/perf/util/stat.h |  1 +
 7 files changed, 156 insertions(+), 12 deletions(-)
 create mode 100644 tools/perf/util/iostat.c
 create mode 100644 tools/perf/util/iostat.h

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 2e2e4a8345ea..4cef64ce9261 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -68,6 +68,7 @@
 #include "util/affinity.h"
 #include "util/pfm.h"
 #include "util/bpf_counter.h"
+#include "util/iostat.h"
 #include "asm/bug.h"
 
 #include 
@@ -212,7 +213,8 @@ static struct perf_stat_config stat_config = {
.walltime_nsecs_stats   = &walltime_nsecs_stats,
.big_num= true,
.ctl_fd = -1,
-   .ctl_fd_ack = -1
+   .ctl_fd_ack = -1,
+   .iostat_run = false,
 };
 
 static bool cpus_map_matched(struct evsel *a, struct evsel *b)
@@ -1247,6 +1249,9 @@ static struct option stat_options[] = {
 "\t\t\t  Optionally send control command completion 
('ack\\n') to ack-fd descriptor.\n"
 "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened 
and used as ctl-fd / ack-fd.",
  parse_control_option),
+   OPT_CALLBACK_OPTARG(0, "iostat", &evsel_list, &stat_config, "default",
+   "measure I/O performance metrics provided by 
arch/platform",
+   iostat_parse),
OPT_END()
 };
 
@@ -2320,6 +2325,17 @@ int cmd_stat(int argc, const char **argv)
goto out;
}
 
+   if (stat_config.iostat_run) {
+   status = iostat_prepare(evsel_list, &stat_config);
+   if (status)
+   goto out;
+   if (iostat_mode == IOSTAT_LIST) {
+   iostat_list(evsel_list, &stat_config);
+   goto out;
+   } else if (verbose)
+   iostat_list(evsel_list, &stat_config);
+   }
+
if (add_default_attributes())
goto out;
 
@@ -2495,6 +2511,9 @@ int cmd_stat(int argc, const char **argv)
perf_stat__exit_aggr_mode();
evlist__free_stats(evsel_list);
 out:
+   if (stat_config.iostat_run)
+   iostat_release(evsel_list);
+
zfree(&stat_config.walltime_run);
 
if (smi_cost && smi_reset)
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index e3e12f9d4733..7dd815712d60 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -102,6 +102,7 @@ perf-y += rwsem.o
 perf-y += thread-stack.o
 perf-y += spark.o
 perf-y += topdown.o
+perf-y += iostat.o
 perf-y += stream.o
 perf-$(CONFIG_AUXTRACE) += auxtrace.o
 perf-$(CONFIG_AUXTRACE) += intel-pt-decoder/
diff --git a/tools/perf/util/iostat.c b/tools/perf/util/iostat.c
new file mode 100644
index ..57dd49da28fe
--- /dev/null
+++ b/tools/perf/util/iostat.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "util/iostat.h"
+#include "util/debug.h"
+
+enum iostat_mode_t iostat_mode = IOSTAT_NONE;
+
+__weak int iostat_prepare(struct evlist *evlist __maybe_unused,
+ struct perf_stat_config *config __maybe_unused)
+{
+   return -1;
+}
+
+__weak int iostat_parse(const struct option *opt __maybe_unused,
+const char *str __maybe_unused,
+int unset __maybe_unused)
+{
+   pr_err("iostat mode is not supported on current platform\n");
+   return -1;
+}
+
+__weak void iostat_list(struct evlist *evlist __maybe_unused,
+  struct perf_stat_config *config __maybe_unused)
+{
+}
+
+__weak void iostat_release(struct evlist *evlist __maybe_unused)
+{
+}
+
+__weak void iostat_print_header_prefix(struct perf_stat_config *config 
__maybe_unused)
+{
+}
+
+__weak void iostat_print_metric(struct perf_stat_config *config __maybe_unused,
+   struct evsel *evsel __maybe_unused,
+   struct perf_stat_output_ctx *out __maybe_unused)
+{
+}
+
+__weak void iostat_prefix(struct evlist *evlist __maybe_unused,
+ struct pe

[PATCH v5 2/4] perf stat: Helper functions for PCIe root ports list in iostat mode

2021-03-24 Thread Alexander Antonov
Introduce helper functions to control PCIe root ports list.
These helpers will be used in the follow-up patch.

Signed-off-by: Alexander Antonov 
---
 tools/perf/arch/x86/util/iostat.c | 110 ++
 1 file changed, 110 insertions(+)
 create mode 100644 tools/perf/arch/x86/util/iostat.c

diff --git a/tools/perf/arch/x86/util/iostat.c 
b/tools/perf/arch/x86/util/iostat.c
new file mode 100644
index ..c4471f8efa5e
--- /dev/null
+++ b/tools/perf/arch/x86/util/iostat.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * perf iostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "util/cpumap.h"
+#include "util/debug.h"
+#include "util/iostat.h"
+#include "util/counts.h"
+#include "path.h"
+
+struct iio_root_port {
+   u32 domain;
+   u8 bus;
+   u8 die;
+   u8 pmu_idx;
+   int idx;
+};
+
+struct iio_root_ports_list {
+   struct iio_root_port **rps;
+   int nr_entries;
+};
+
+static void iio_root_port_show(FILE *output,
+  const struct iio_root_port * const rp)
+{
+   if (output && rp)
+   fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n",
+   rp->die, rp->pmu_idx, rp->domain, rp->bus);
+}
+
+static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus,
+  u8 die, u8 pmu_idx)
+{
+   struct iio_root_port *p = calloc(1, sizeof(*p));
+
+   if (p) {
+   p->domain = domain;
+   p->bus = bus;
+   p->die = die;
+   p->pmu_idx = pmu_idx;
+   }
+   return p;
+}
+
+static void iio_root_ports_list_free(struct iio_root_ports_list *list)
+{
+   int idx;
+
+   if (list) {
+   for (idx = 0; idx < list->nr_entries; idx++)
+   free(list->rps[idx]);
+   free(list->rps);
+   free(list);
+   }
+}
+
+static struct iio_root_port *iio_root_port_find_by_notation(
+   const struct iio_root_ports_list * const list, u32 domain, u8 bus)
+{
+   int idx;
+   struct iio_root_port *rp;
+
+   if (list) {
+   for (idx = 0; idx < list->nr_entries; idx++) {
+   rp = list->rps[idx];
+   if (rp && rp->domain == domain && rp->bus == bus)
+   return rp;
+   }
+   }
+   return NULL;
+}
+
+static int iio_root_ports_list_insert(struct iio_root_ports_list *list,
+ struct iio_root_port * const rp)
+{
+   struct iio_root_port **tmp_buf;
+
+   if (list && rp) {
+   rp->idx = list->nr_entries++;
+   tmp_buf = realloc(list->rps,
+ list->nr_entries * sizeof(*list->rps));
+   if (!tmp_buf) {
+   pr_err("Failed to realloc memory\n");
+   return -ENOMEM;
+   }
+   tmp_buf[rp->idx] = rp;
+   list->rps = tmp_buf;
+   }
+   return 0;
+}
-- 
2.19.1



[PATCH] perf/x86/intel/uncore: Enable IIO stacks to PMON mapping for multi-segment SKX

2021-03-23 Thread Alexander Antonov
IIO stacks to PMON mapping on Skylake servers is exposed through introduced
early attributes /sys/devices/uncore_iio_/dieX, where dieX is a
file which holds "Segment:Root Bus" for PCIe root port which can
be monitored by that IIO PMON block. These sysfs attributes are disabled
for multiple segment topologies except VMD domains which start at 0x1.
This patch removes the limitation and enables IIO stacks to PMON mapping
for multi-segment Skylake servers by introducing segment-aware
intel_uncore_topology structure and attributing the topology configuration
to the segment in skx_iio_get_topology() function.

Reported-by: kernel test robot 
Tested-by: Kyle Meyer 
Reviewed-by: Andi Kleen 
Reviewed-by: Kan Liang 
Signed-off-by: Alexander Antonov 
---
 arch/x86/events/intel/uncore.c   | 12 ++
 arch/x86/events/intel/uncore.h   |  9 -
 arch/x86/events/intel/uncore_snbep.c | 60 +---
 3 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 33c8180d5a87..0c066d9aa17a 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -48,6 +48,18 @@ int uncore_pcibus_to_dieid(struct pci_bus *bus)
return die_id;
 }
 
+int uncore_die_to_segment(int die)
+{
+   struct pci_bus *bus = NULL;
+
+   /* Find first pci bus which attributes to specified die. */
+   while ((bus = pci_find_next_bus(bus)) &&
+  (die != uncore_pcibus_to_dieid(bus)))
+   ;
+
+   return bus ? pci_domain_nr(bus) : -EINVAL;
+}
+
 static void uncore_free_pcibus_map(void)
 {
struct pci2phy_map *map, *tmp;
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index a3c6e1643ad2..be2095ec458c 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -42,6 +42,7 @@ struct intel_uncore_pmu;
 struct intel_uncore_box;
 struct uncore_event_desc;
 struct freerunning_counters;
+struct intel_uncore_topology;
 
 struct intel_uncore_type {
const char *name;
@@ -80,7 +81,7 @@ struct intel_uncore_type {
 * to identify which platform component each PMON block of that type is
 * supposed to monitor.
 */
-   u64 *topology;
+   struct intel_uncore_topology *topology;
/*
 * Optional callbacks for managing mapping of Uncore units to PMONs
 */
@@ -169,6 +170,11 @@ struct freerunning_counters {
unsigned *box_offsets;
 };
 
+struct intel_uncore_topology {
+   u64 configuration;
+   int segment;
+};
+
 struct pci2phy_map {
struct list_head list;
int segment;
@@ -177,6 +183,7 @@ struct pci2phy_map {
 
 struct pci2phy_map *__find_pci2phy_map(int segment);
 int uncore_pcibus_to_dieid(struct pci_bus *bus);
+int uncore_die_to_segment(int die);
 
 ssize_t uncore_event_show(struct device *dev,
  struct device_attribute *attr, char *buf);
diff --git a/arch/x86/events/intel/uncore_snbep.c 
b/arch/x86/events/intel/uncore_snbep.c
index b79951d0707c..acc3c0e52f4d 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3684,7 +3684,8 @@ static struct intel_uncore_ops skx_uncore_iio_ops = {
 
 static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die)
 {
-   return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE);
+   return pmu->type->topology[die].configuration >>
+  (pmu->pmu_idx * BUS_NUM_STRIDE);
 }
 
 static umode_t
@@ -3697,19 +3698,14 @@ skx_iio_mapping_visible(struct kobject *kobj, struct 
attribute *attr, int die)
 }
 
 static ssize_t skx_iio_mapping_show(struct device *dev,
-   struct device_attribute *attr, char *buf)
+   struct device_attribute *attr, char *buf)
 {
-   struct pci_bus *bus = pci_find_next_bus(NULL);
-   struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev);
+   struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev);
struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
long die = (long)ea->var;
 
-   /*
-* Current implementation is for single segment configuration hence it's
-* safe to take the segment value from the first available root bus.
-*/
-   return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus),
-  skx_iio_stack(uncore_pmu, die));
+   return sprintf(buf, "%04x:%02x\n", pmu->type->topology[die].segment,
+  skx_iio_stack(pmu, die));
 }
 
 static int skx_msr_cpu_bus_read(int cpu, u64 *topology)
@@ -3746,34 +3742,32 @@ static int die_to_cpu(int die)
 
 static int skx_iio_get_topology(struct intel_uncore_type *type)
 {
-   int i, ret;
-   struct pci_bus *bus = NULL;
-
-   /*
-* Verified single-segment environ

Re: [PATCH v4 4/5] perf stat: Enable iostat mode for x86 platforms

2021-03-10 Thread Alexander Antonov



On 3/9/2021 10:51 AM, liuqi (BA) wrote:

Hi Alexander,

On 2021/2/3 21:58, Alexander Antonov wrote:

This functionality is based on recently introduced sysfs attributes
for Intel® Xeon® Scalable processor family (code name Skylake-SP):
Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to
IIO PMON mapping")

Mode is intended to provide four I/O performance metrics in MB per each
PCIe root port:
  - Inbound Read: I/O devices below root port read from the host memory
  - Inbound Write: I/O devices below root port write to the host memory
  - Outbound Read: CPU reads from I/O devices below root port
  - Outbound Write: CPU writes to I/O devices below root port

Each metric requires only one uncore event which increments at every 4B
transfer in the corresponding direction. The formulas to compute metrics
are generic:
 #EventCount * 4B / (1024 * 1024)

Signed-off-by: Alexander Antonov
---
  tools/perf/Documentation/perf-iostat.txt |  88 ++
  tools/perf/Makefile.perf |   5 +-
  tools/perf/arch/x86/util/Build   |   1 +
  tools/perf/arch/x86/util/iostat.c    | 345 +++
  tools/perf/command-list.txt  |   1 +
  tools/perf/perf-iostat.sh    |  12 +
  6 files changed, 451 insertions(+), 1 deletion(-)
  create mode 100644 tools/perf/Documentation/perf-iostat.txt
  create mode 100644 tools/perf/perf-iostat.sh

diff --git a/tools/perf/Documentation/perf-iostat.txt 
b/tools/perf/Documentation/perf-iostat.txt

new file mode 100644
index ..165176944031
--- /dev/null
+++ b/tools/perf/Documentation/perf-iostat.txt
@@ -0,0 +1,88 @@
+perf-iostat(1)
+===
+
+NAME
+
+perf-iostat - Show I/O performance metrics
+
+SYNOPSIS
+
+[verse]
+'perf iostat' list
+'perf iostat'  --  []
+
+DESCRIPTION
+---
+Mode is intended to provide four I/O performance metrics per each 
PCIe root port:

+
+- Inbound Read   - I/O devices below root port read from the host 
memory, in MB

+
+- Inbound Write  - I/O devices below root port write to the host 
memory, in MB

+
+- Outbound Read  - CPU reads from I/O devices below root port, in MB
+
+- Outbound Write - CPU writes to I/O devices below root port, in MB
+
+OPTIONS
+---
+...::
+    Any command you can specify in a shell.
+
+list::
+    List all PCIe root ports.


I noticed that the "iostat" command and the cmd_iostat() callback function are 
not registered in cmd_struct in perf.c. So I think "perf iostat list" 
may not work properly.


I also tested this patchset on an x86 platform, and here is the log:

root@ubuntu:/home/lq# ./perf iostat list
perf: 'iostat' is not a perf-command. See 'perf --help'.
root@ubuntu:/home/lq# ./perf stat --iostat
^C
 Performance counter stats for 'system wide':

   port Inbound Read(MB)    Inbound Write(MB) Outbound 
Read(MB)   Outbound Write(MB)

:00    0 0    0  0
:80    0 0    0  0
:17    0 0    0  0
:85    0 0    0  0
:3a    0 0    0  0
:ae    0 0    0  0
:5d    0 0    0  0
:d7    0 0    0  0

   0.611303832 seconds time elapsed


root@ubuntu:/home/lq# ./perf stat --iostat=:17
^C
 Performance counter stats for 'system wide':

   port Inbound Read(MB)    Inbound Write(MB) Outbound 
Read(MB)   Outbound Write(MB)

:17    0 0    0  0

   0.521317572 seconds time elapsed

So how does following perf iostat list work, did I miss something?

Thanks,
Qi


Hello,

The 'iostat' mode uses the alias mechanism in perf, the same as 'perf archive',
and in this case you don't need to add a function callback into cmd_struct.
For example, the command 'perf iostat list' will be converted to
'perf stat --iostat=list'.

After building the perf tool you should have two shell scripts in tools/perf
directory and one of them is executable, for example:
# make -C tools/perf
# ls -l tools/perf/perf-iostat*
-rwxr-xr-x 1 root root 290 Mar 10 18:17 perf-iostat
-rw-r--r-- 1 root root 290 Feb  3 15:14 perf-iostat.sh

It should be possible to run 'perf iostat' from build directory:
# cd tools/perf
# ./perf iostat list
S0-uncore_iio_0<:00>
S1-uncore_iio_0<:80>
S0-uncore_iio_1<:17>
S1-uncore_iio_1<:85>
S0-uncore_iio_2<:3a>
S1-uncore_iio_2<:ae>
S0-uncore_iio_3<:5d>
S1-uncore_iio_3<:d7>

Also you can copy 'perf-iostat' to ~/libexec/perf-core/ or just 

Re: [PATCH v4 3/5] perf stat: Helper functions for PCIe root ports list in iostat mode

2021-02-08 Thread Alexander Antonov



On 2/4/2021 3:32 PM, Namhyung Kim wrote:

On Wed, Feb 3, 2021 at 10:58 PM Alexander Antonov
 wrote:

Introduce helper functions to control PCIe root ports list.
These helpers will be used in the follow-up patch.

Signed-off-by: Alexander Antonov 
---
  tools/perf/arch/x86/util/iostat.c | 124 ++
  1 file changed, 124 insertions(+)
  create mode 100644 tools/perf/arch/x86/util/iostat.c

diff --git a/tools/perf/arch/x86/util/iostat.c 
b/tools/perf/arch/x86/util/iostat.c
new file mode 100644
index ..961e540106e6
--- /dev/null
+++ b/tools/perf/arch/x86/util/iostat.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * perf iostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "util/cpumap.h"
+#include "util/debug.h"
+#include "util/iostat.h"
+#include "util/counts.h"
+#include "path.h"
+
+struct iio_root_port {
+   u32 domain;
+   u8 bus;
+   u8 die;
+   u8 pmu_idx;
+   int idx;
+};
+
+struct iio_root_ports_list {
+   struct iio_root_port **rps;
+   int nr_entries;
+};
+
+static void iio_root_port_show(FILE *output,
+  const struct iio_root_port * const rp)
+{
+   if (output && rp)
+   fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n",
+   rp->die, rp->pmu_idx, rp->domain, rp->bus);
+}
+
+static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus,
+  u8 die, u8 pmu_idx)
+{
+   struct iio_root_port *p = calloc(1, sizeof(*p));
+
+   if (p) {
+   p->domain = domain;
+   p->bus = bus;
+   p->die = die;
+   p->pmu_idx = pmu_idx;
+   }
+   return p;
+}
+
+static struct iio_root_ports_list *iio_root_ports_list_new(void)
+{
+   struct iio_root_ports_list *list = calloc(1, sizeof(*list));
+
+   if (list) {
+   list->rps = calloc(1, sizeof(struct iio_root_port *));

This seems unnecessary now.

Thanks,
Namhyung



Yes, you are right. Will be fixed.

Thank you,
Alexander

+   if (!list->rps) {
+   free(list);
+   list = NULL;
+   }
+   }
+   return list;
+}
+
+static void iio_root_ports_list_free(struct iio_root_ports_list *list)
+{
+   int idx;
+
+   if (list) {
+   for (idx = 0; idx < list->nr_entries; idx++)
+   free(list->rps[idx]);
+   free(list->rps);
+   free(list);
+   }
+}
+
+static struct iio_root_port *iio_root_port_find_by_notation(
+   const struct iio_root_ports_list * const list, u32 domain, u8 bus)
+{
+   int idx;
+   struct iio_root_port *rp;
+
+   if (list) {
+   for (idx = 0; idx < list->nr_entries; idx++) {
+   rp = list->rps[idx];
+   if (rp && rp->domain == domain && rp->bus == bus)
+   return rp;
+   }
+   }
+   return NULL;
+}
+
+static int iio_root_ports_list_insert(struct iio_root_ports_list *list,
+ struct iio_root_port * const rp)
+{
+   struct iio_root_port **tmp_buf;
+
+   if (list && rp) {
+   rp->idx = list->nr_entries++;
+   tmp_buf = realloc(list->rps,
+ list->nr_entries * sizeof(*list->rps));
+   if (!tmp_buf) {
+   pr_err("Failed to realloc memory\n");
+   return -ENOMEM;
+   }
+   tmp_buf[rp->idx] = rp;
+   list->rps = tmp_buf;
+   }
+   return 0;
+}
--
2.19.1



Re: [PATCH v4 2/5] perf stat: Basic support for iostat in perf

2021-02-08 Thread Alexander Antonov



On 2/4/2021 3:22 PM, Namhyung Kim wrote:

On Wed, Feb 3, 2021 at 10:58 PM Alexander Antonov
 wrote:

Add basic flow for a new iostat mode in perf. Mode is intended to
provide four I/O performance metrics per each PCIe root port: Inbound Read,
Inbound Write, Outbound Read, Outbound Write.

The actual code to compute the metrics and attribute it to
root port is in follow-on patches.

Signed-off-by: Alexander Antonov 
---
  tools/perf/builtin-stat.c  | 31 ++
  tools/perf/util/iostat.h   | 32 +++
  tools/perf/util/stat-display.c | 40 +-
  tools/perf/util/stat-shadow.c  | 11 +-
  tools/perf/util/stat.h |  1 +
  5 files changed, 113 insertions(+), 2 deletions(-)
  create mode 100644 tools/perf/util/iostat.h

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 60fdb6a0805f..66c913692120 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -65,6 +65,7 @@
  #include "util/target.h"
  #include "util/time-utils.h"
  #include "util/top.h"
+#include "util/iostat.h"
  #include "asm/bug.h"

  #include 
@@ -186,6 +187,7 @@ static struct perf_stat_config stat_config = {
 .metric_only_len= METRIC_ONLY_LEN,
 .walltime_nsecs_stats   = &walltime_nsecs_stats,
 .big_num= true,
+   .iostat_run = false,
  };

  static inline void diff_timespec(struct timespec *r, struct timespec *a,
@@ -723,6 +725,14 @@ static int parse_metric_groups(const struct option *opt,
 return metricgroup__parse_groups(opt, str, &stat_config.metric_events);
  }

+__weak int iostat_parse(const struct option *opt __maybe_unused,
+const char *str __maybe_unused,
+int unset __maybe_unused)
+{
+   pr_err("iostat mode is not supported\n");
+   return -1;
+}
+
  static struct option stat_options[] = {
 OPT_BOOLEAN('T', "transaction", &transaction_run,
 "hardware transaction statistics"),
@@ -803,6 +813,8 @@ static struct option stat_options[] = {
 OPT_CALLBACK('M', "metrics", &evsel_list, "metric/metric group list",
  "monitor specified metrics or metric groups (separated by 
,)",
  parse_metric_groups),
+   OPT_CALLBACK_OPTARG(0, "iostat", &evsel_list, &stat_config, "root port",
+   "measure PCIe metrics per root port", iostat_parse),

Can we make the help string and default argument more generic?
Something like "measure IO metrics provided by arch/platform"
and the default value being "default". :)


Do you mean using "default" instead of "root port"?
What about the faceless "I/O unit"? :)

 OPT_END()
  };

@@ -1131,6 +1143,12 @@ __weak void arch_topdown_group_warn(void)
  {
  }

+__weak int iostat_list(struct evlist *evlist __maybe_unused,
+   struct perf_stat_config *config __maybe_unused)
+{
+   return 0;
+}
+
  /*
   * Add default attributes, if there were no attributes specified or
   * if -d/--detailed, -d -d or -d -d -d is used:
@@ -1682,6 +1700,10 @@ static void setup_system_wide(int forks)
 }
  }

+__weak void iostat_release(struct evlist *evlist __maybe_unused)
+{
+}
+
  int cmd_stat(int argc, const char **argv)
  {
 const char * const stat_usage[] = {
@@ -1858,6 +1880,12 @@ int cmd_stat(int argc, const char **argv)
 goto out;
 }

+   if (stat_config.iostat_run) {
+   status = iostat_list(evsel_list, &stat_config);

I think it's unnatural to call iostat_list() unconditionally here.
How about this?

 status = iostat_prepare(...);
 if (status < 0)
 goto out;

 if (status == IOSTAT_LIST)
 iostat_list(...);
 else
 ...

I think it's applicable.
In the case of the 'list' option we will just print the list of root ports and exit.
Listing of root ports is also available in verbose mode. In this case we will
print the list and start the collection.



+   if (status || !stat_config.iostat_run)
+   goto out;
+   }
+
 if (add_default_attributes())
 goto out;

@@ -2008,6 +2036,9 @@ int cmd_stat(int argc, const char **argv)
 perf_stat__exit_aggr_mode();
 perf_evlist__free_stats(evsel_list);
  out:
+   if (stat_config.iostat_run)
+   iostat_release(evsel_list);
+
 zfree(&stat_config.walltime_run);

 if (smi_cost && smi_reset)
diff --git a/tools/perf/util/iostat.h b/tools/perf/util/iostat.h
new file mode 100644
index ..b34ebedfd5e6
--- /dev/null
+++ b/tools/perf/util/iostat.h
@@ -0,0 +1,32 @

Re: [PATCH v4 1/5] perf stat: Add AGGR_PCIE_PORT mode

2021-02-08 Thread Alexander Antonov

On 2/4/2021 3:07 PM, Namhyung Kim wrote:

Hello,

On Wed, Feb 3, 2021 at 10:58 PM Alexander Antonov
 wrote:

Adding AGGR_PCIE_PORT mode to be able to distinguish aggr_mode
for root ports in following patches.

I'm not sure adding the AGGR_PCIE_PORT is the right way.
In my understanding, the aggr mode is to specify how we aggregate
counter values of a single event from different cpus.  But this seems
to aggregate counter values from different events.  Also the new
mode is basically the same as AGGR_GLOBAL.

As you will add stat_config.iostat_run to distinguish the iostat
command, probably we just want to use the global aggr mode
(and it's the default!) and get rid of the AGGR_PCIE_PORT.

Thoughts?

Thanks,
Namhyung

Hello Namhyung,

Actually, you are right. We aggregate counter values from different events of a
single IIO stack (PCIe root port) to calculate metrics for this IO stack.
But the reason is to prevent the use of the '-e' and '-M' options in 'iostat' mode,
because they can make a mess of the output and confuse users.


There is an idea to use your suggestion for this part:

status = iostat_prepare(...);
if (status < 0)
    goto out;
if (status == IOSTAT_LIST)
    iostat_list(...);
else
    ...

So, we can check if evlist is empty inside iostat_prepare(). If not, print
a warning, for example, "The -e and -M options are not supported. All chosen
events/metrics will be dropped". Then we can free the evlist by using
evlist__delete(), create a new one by using evlist__new() and fill the evlist.

In this case the body of iostat_prepare() function would be:

iostat_prepare()
{
    If (!is_evlist_empty) {
        pr_warning();
        evlist__delete();
        evlist__new()
    }

    iostat_event_group();
}

This will allow us to get rid of AGGR_PCIE_PORT.
What do you think?

Thank you,
Alexander


[PATCH v4 5/5] perf: Update .gitignore file

2021-02-03 Thread Alexander Antonov
After a "make -C tools/perf", git reports the following untracked file:
perf-iostat

Add this generated file to perf's .gitignore file.

Signed-off-by: Alexander Antonov 
---
 tools/perf/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index bf1252dc2cb0..421f27e2b9af 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -19,6 +19,7 @@ perf.data.old
 output.svg
 perf-archive
 perf-with-kcore
+perf-iostat
 tags
 TAGS
 cscope*
-- 
2.19.1



[PATCH v4 4/5] perf stat: Enable iostat mode for x86 platforms

2021-02-03 Thread Alexander Antonov
This functionality is based on recently introduced sysfs attributes
for Intel® Xeon® Scalable processor family (code name Skylake-SP):
Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to
IIO PMON mapping")

Mode is intended to provide four I/O performance metrics in MB per each
PCIe root port:
 - Inbound Read: I/O devices below root port read from the host memory
 - Inbound Write: I/O devices below root port write to the host memory
 - Outbound Read: CPU reads from I/O devices below root port
 - Outbound Write: CPU writes to I/O devices below root port

Each metric requires only one uncore event which increments at every 4B
transfer in the corresponding direction. The formulas to compute metrics
are generic:
#EventCount * 4B / (1024 * 1024)

Signed-off-by: Alexander Antonov 
---
 tools/perf/Documentation/perf-iostat.txt |  88 ++
 tools/perf/Makefile.perf |   5 +-
 tools/perf/arch/x86/util/Build   |   1 +
 tools/perf/arch/x86/util/iostat.c| 345 +++
 tools/perf/command-list.txt  |   1 +
 tools/perf/perf-iostat.sh|  12 +
 6 files changed, 451 insertions(+), 1 deletion(-)
 create mode 100644 tools/perf/Documentation/perf-iostat.txt
 create mode 100644 tools/perf/perf-iostat.sh

diff --git a/tools/perf/Documentation/perf-iostat.txt 
b/tools/perf/Documentation/perf-iostat.txt
new file mode 100644
index ..165176944031
--- /dev/null
+++ b/tools/perf/Documentation/perf-iostat.txt
@@ -0,0 +1,88 @@
+perf-iostat(1)
+===
+
+NAME
+
+perf-iostat - Show I/O performance metrics
+
+SYNOPSIS
+
+[verse]
+'perf iostat' list
+'perf iostat'  --  []
+
+DESCRIPTION
+---
+Mode is intended to provide four I/O performance metrics per each PCIe root 
port:
+
+- Inbound Read   - I/O devices below root port read from the host memory, in MB
+
+- Inbound Write  - I/O devices below root port write to the host memory, in MB
+
+- Outbound Read  - CPU reads from I/O devices below root port, in MB
+
+- Outbound Write - CPU writes to I/O devices below root port, in MB
+
+OPTIONS
+---
+...::
+   Any command you can specify in a shell.
+
+list::
+   List all PCIe root ports.
+
+::
+   Select the root ports for monitoring. Comma-separated list is supported.
+
+EXAMPLES
+
+
+1. List all PCIe root ports (example for 2-S platform):
+
+   $ perf iostat list
+   S0-uncore_iio_0<:00>
+   S1-uncore_iio_0<:80>
+   S0-uncore_iio_1<:17>
+   S1-uncore_iio_1<:85>
+   S0-uncore_iio_2<:3a>
+   S1-uncore_iio_2<:ae>
+   S0-uncore_iio_3<:5d>
+   S1-uncore_iio_3<:d7>
+
+2. Collect metrics for all PCIe root ports:
+
+   $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s
+
+Performance counter stats for 'system wide':
+
+  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB)
+   :00102  
  3
+   :80000  
  0
+   :17   352552   430  
 21
+   :85000  
  0
+   :3a300  
  0
+   :ae000  
  0
+   :5d000  
  0
+   :d7000  
  0
+
+3. Collect metrics for comma-separated list of PCIe root ports:
+
+   $ perf iostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M 
oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s
+
+Performance counter stats for 'system wide':
+
+  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB)
+   :17   358559   440  
 22
+   :3a320  
  0
+
+197.081983474 seconds time elapsed
+
+SEE ALSO
+
+linkperf:perf-stat[1]
\ No newline at end of file
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 902c792f326a..b4ab48cc01e3 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -267,6 +267,7 @@ SCRIPT_SH =
 
 SCRIPT_SH += perf-archive.sh
 SCRIPT_SH += perf-with-kcore.sh
+SCRIPT_SH += perf-iostat.sh
 
 grep-libs = $(filter -l%,$(1))

[PATCH v4 3/5] perf stat: Helper functions for PCIe root ports list in iostat mode

2021-02-03 Thread Alexander Antonov
Introduce helper functions to control PCIe root ports list.
These helpers will be used in the follow-up patch.

Signed-off-by: Alexander Antonov 
---
 tools/perf/arch/x86/util/iostat.c | 124 ++
 1 file changed, 124 insertions(+)
 create mode 100644 tools/perf/arch/x86/util/iostat.c

diff --git a/tools/perf/arch/x86/util/iostat.c 
b/tools/perf/arch/x86/util/iostat.c
new file mode 100644
index ..961e540106e6
--- /dev/null
+++ b/tools/perf/arch/x86/util/iostat.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * perf iostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "util/cpumap.h"
+#include "util/debug.h"
+#include "util/iostat.h"
+#include "util/counts.h"
+#include "path.h"
+
+struct iio_root_port {
+   u32 domain;
+   u8 bus;
+   u8 die;
+   u8 pmu_idx;
+   int idx;
+};
+
+struct iio_root_ports_list {
+   struct iio_root_port **rps;
+   int nr_entries;
+};
+
+static void iio_root_port_show(FILE *output,
+  const struct iio_root_port * const rp)
+{
+   if (output && rp)
+   fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n",
+   rp->die, rp->pmu_idx, rp->domain, rp->bus);
+}
+
+static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus,
+  u8 die, u8 pmu_idx)
+{
+   struct iio_root_port *p = calloc(1, sizeof(*p));
+
+   if (p) {
+   p->domain = domain;
+   p->bus = bus;
+   p->die = die;
+   p->pmu_idx = pmu_idx;
+   }
+   return p;
+}
+
+static struct iio_root_ports_list *iio_root_ports_list_new(void)
+{
+   struct iio_root_ports_list *list = calloc(1, sizeof(*list));
+
+   if (list) {
+   list->rps = calloc(1, sizeof(struct iio_root_port *));
+   if (!list->rps) {
+   free(list);
+   list = NULL;
+   }
+   }
+   return list;
+}
+
+static void iio_root_ports_list_free(struct iio_root_ports_list *list)
+{
+   int idx;
+
+   if (list) {
+   for (idx = 0; idx < list->nr_entries; idx++)
+   free(list->rps[idx]);
+   free(list->rps);
+   free(list);
+   }
+}
+
+static struct iio_root_port *iio_root_port_find_by_notation(
+   const struct iio_root_ports_list * const list, u32 domain, u8 bus)
+{
+   int idx;
+   struct iio_root_port *rp;
+
+   if (list) {
+   for (idx = 0; idx < list->nr_entries; idx++) {
+   rp = list->rps[idx];
+   if (rp && rp->domain == domain && rp->bus == bus)
+   return rp;
+   }
+   }
+   return NULL;
+}
+
+static int iio_root_ports_list_insert(struct iio_root_ports_list *list,
+ struct iio_root_port * const rp)
+{
+   struct iio_root_port **tmp_buf;
+
+   if (list && rp) {
+   rp->idx = list->nr_entries++;
+   tmp_buf = realloc(list->rps,
+ list->nr_entries * sizeof(*list->rps));
+   if (!tmp_buf) {
+   pr_err("Failed to realloc memory\n");
+   return -ENOMEM;
+   }
+   tmp_buf[rp->idx] = rp;
+   list->rps = tmp_buf;
+   }
+   return 0;
+}
-- 
2.19.1



[PATCH v4 2/5] perf stat: Basic support for iostat in perf

2021-02-03 Thread Alexander Antonov
Add basic flow for a new iostat mode in perf. Mode is intended to
provide four I/O performance metrics per each PCIe root port: Inbound Read,
Inbound Write, Outbound Read, Outbound Write.

The actual code to compute the metrics and attribute it to
root port is in follow-on patches.

Signed-off-by: Alexander Antonov 
---
 tools/perf/builtin-stat.c  | 31 ++
 tools/perf/util/iostat.h   | 32 +++
 tools/perf/util/stat-display.c | 40 +-
 tools/perf/util/stat-shadow.c  | 11 +-
 tools/perf/util/stat.h |  1 +
 5 files changed, 113 insertions(+), 2 deletions(-)
 create mode 100644 tools/perf/util/iostat.h

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 60fdb6a0805f..66c913692120 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -65,6 +65,7 @@
 #include "util/target.h"
 #include "util/time-utils.h"
 #include "util/top.h"
+#include "util/iostat.h"
 #include "asm/bug.h"
 
 #include 
@@ -186,6 +187,7 @@ static struct perf_stat_config stat_config = {
.metric_only_len= METRIC_ONLY_LEN,
.walltime_nsecs_stats   = &walltime_nsecs_stats,
.big_num= true,
+   .iostat_run = false,
 };
 
 static inline void diff_timespec(struct timespec *r, struct timespec *a,
@@ -723,6 +725,14 @@ static int parse_metric_groups(const struct option *opt,
return metricgroup__parse_groups(opt, str, &stat_config.metric_events);
 }
 
+__weak int iostat_parse(const struct option *opt __maybe_unused,
+const char *str __maybe_unused,
+int unset __maybe_unused)
+{
+   pr_err("iostat mode is not supported\n");
+   return -1;
+}
+
 static struct option stat_options[] = {
OPT_BOOLEAN('T', "transaction", &transaction_run,
"hardware transaction statistics"),
@@ -803,6 +813,8 @@ static struct option stat_options[] = {
OPT_CALLBACK('M', "metrics", &evsel_list, "metric/metric group list",
 "monitor specified metrics or metric groups (separated by 
,)",
 parse_metric_groups),
+   OPT_CALLBACK_OPTARG(0, "iostat", &evsel_list, &stat_config, "root port",
+   "measure PCIe metrics per root port", iostat_parse),
OPT_END()
 };
 
@@ -1131,6 +1143,12 @@ __weak void arch_topdown_group_warn(void)
 {
 }
 
+__weak int iostat_list(struct evlist *evlist __maybe_unused,
+   struct perf_stat_config *config __maybe_unused)
+{
+   return 0;
+}
+
 /*
  * Add default attributes, if there were no attributes specified or
  * if -d/--detailed, -d -d or -d -d -d is used:
@@ -1682,6 +1700,10 @@ static void setup_system_wide(int forks)
}
 }
 
+__weak void iostat_release(struct evlist *evlist __maybe_unused)
+{
+}
+
 int cmd_stat(int argc, const char **argv)
 {
const char * const stat_usage[] = {
@@ -1858,6 +1880,12 @@ int cmd_stat(int argc, const char **argv)
goto out;
}
 
+   if (stat_config.iostat_run) {
+   status = iostat_list(evsel_list, &stat_config);
+   if (status || !stat_config.iostat_run)
+   goto out;
+   }
+
if (add_default_attributes())
goto out;
 
@@ -2008,6 +2036,9 @@ int cmd_stat(int argc, const char **argv)
perf_stat__exit_aggr_mode();
perf_evlist__free_stats(evsel_list);
 out:
+   if (stat_config.iostat_run)
+   iostat_release(evsel_list);
+
zfree(&stat_config.walltime_run);
 
if (smi_cost && smi_reset)
diff --git a/tools/perf/util/iostat.h b/tools/perf/util/iostat.h
new file mode 100644
index ..b34ebedfd5e6
--- /dev/null
+++ b/tools/perf/util/iostat.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * perf iostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov 
+ */
+
+#ifndef _IOSTAT_H
+#define _IOSTAT_H
+
+#include 
+#include "util/stat.h"
+#include "util/parse-events.h"
+#include "util/evlist.h"
+
+struct option;
+struct perf_stat_config;
+struct evlist;
+struct timespec;
+
+int iostat_parse(const struct option *opt, const char *str,
+int unset __maybe_unused);
+void iostat_prefix(struct perf_stat_config *config, struct evlist *evlist,
+  char *prefix, struct timespec *ts);
+void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel,
+struct perf_stat_output_ctx *out);
+int iostat_list(struct evlist *evlist, struct perf_stat_config *config);
+void iostat_release(struct evlist *evlist);
+
+#endif /* _IOSTAT_H */
diff --git a/tools/p

[PATCH v4 0/5] perf stat: Introduce iostat mode to provide I/O performance metrics

2021-02-03 Thread Alexander Antonov
The previous version can be found at:
v3: 
https://lkml.kernel.org/r/20210126080619.30275-1-alexander.anto...@linux.intel.com/
Changes in this revision are:
v3 -> v4:
- Addressed comment from Namhyung Kim:
   1. Removed NULL-termination of root ports list

The previous version can be found at:
v2: 
https://lkml.kernel.org/r/20201223130320.3930-1-alexander.anto...@linux.intel.com

Changes in this revision are:
v2 -> v3:
- Addressed comments from Namhyung Kim:
  1. Removed perf_device pointer from evsel structure. Use priv field instead
  2. Renamed 'iiostat' to 'iostat'
  3. Renamed 'show' mode to 'list' mode
  4. Renamed iiostat_delete_root_ports() to iostat_release() and
     iiostat_show_root_ports() to iostat_list()

The previous version can be found at:
v1: 
https://lkml.kernel.org/r/20201210090340.14358-1-alexander.anto...@linux.intel.com

Changes in this revision are:
v1 -> v2:
- Addressed comment from Arnaldo Carvalho de Melo:
  1. Using 'perf iiostat' subcommand instead of 'perf stat --iiostat':
- Added perf-iiostat.sh script to use short command
- Updated manual pages to get help for 'perf iiostat'
- Added 'perf-iiostat' to perf's gitignore file

Mode is intended to provide four I/O performance metrics in MB per each
root port:
 - Inbound Read:   I/O devices below root port read from the host memory
 - Inbound Write:  I/O devices below root port write to the host memory
 - Outbound Read:  CPU reads from I/O devices below root port
 - Outbound Write: CPU writes to I/O devices below root port

Each metric requires only one uncore event which increments at every 4B
transfer in corresponding direction. The formulas to compute metrics
are generic:
#EventCount * 4B / (1024 * 1024)

Note: iostat introduces a new perf data aggregation mode (per PCIe root port);
hence the -e and -M options are not supported.

Usage examples:

1. List all PCIe root ports (example for 2-S platform):
   $ perf iostat list
   S0-uncore_iio_0<0000:00>
   S1-uncore_iio_0<0000:80>
   S0-uncore_iio_1<0000:17>
   S1-uncore_iio_1<0000:85>
   S0-uncore_iio_2<0000:3a>
   S1-uncore_iio_2<0000:ae>
   S0-uncore_iio_3<0000:5d>
   S1-uncore_iio_3<0000:d7>

2. Collect metrics for all PCIe root ports:
   $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
   357708+0 records in
   357707+0 records out
   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s

Performance counter stats for 'system wide':

   port             Inbound Read(MB)    Inbound Write(MB)    Outbound Read(MB)   Outbound Write(MB)
0000:00                    1                    0                    2                    3
0000:80                    0                    0                    0                    0
0000:17               352552                   43                    0                   21
0000:85                    0                    0                    0                    0
0000:3a                    3                    0                    0                    0
0000:ae                    0                    0                    0                    0
0000:5d                    0                    0                    0                    0
0000:d7                    0                    0                    0                    0

3. Collect metrics for comma separated list of PCIe root ports:
   $ perf iostat 0000:17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
   357708+0 records in
   357707+0 records out
   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s

Performance counter stats for 'system wide':

   port             Inbound Read(MB)    Inbound Write(MB)    Outbound Read(MB)   Outbound Write(MB)
0000:17               358559                   44                    0                   22
0000:3a                    3                    2                    0                    0

197.081983474 seconds time elapsed


Alexander Antonov (5):
  perf stat: Add AGGR_PCIE_PORT mode
  perf stat: Basic support for iostat in perf
  perf stat: Helper functions for PCIe root ports list in iostat mode
  perf stat: Enable iostat mode for x86 platforms
  perf: Update .gitignore file

 tools/perf/.gitignore |   1 +
 tools/perf/Documentation/perf-iostat.txt  |  88 
 tools/perf/Makefile.perf  |   5 +-
 tools/perf/arch/x86/util/Build|   1 +
 tools/perf/arch/x86/util/iostat.c | 469 ++
 tools/perf/builtin-stat.c |  36 +-
 tools/perf/command-list.txt   |   1 +
 tools/perf/perf-iostat.sh |  12 +
 tools/perf/util/iostat.h  |  32 ++
 .../scripting-engines/trace-event-python.c|   3 +-
 tools/perf/

[PATCH v4 1/5] perf stat: Add AGGR_PCIE_PORT mode

2021-02-03 Thread Alexander Antonov
Add AGGR_PCIE_PORT mode to be able to distinguish aggr_mode
for root ports in the following patches.

Signed-off-by: Alexander Antonov 
---
 tools/perf/builtin-stat.c   |  5 -
 .../util/scripting-engines/trace-event-python.c |  3 ++-
 tools/perf/util/stat-display.c  | 13 +++--
 tools/perf/util/stat.c  |  4 +++-
 tools/perf/util/stat.h  |  1 +
 5 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 468fc49420ce..60fdb6a0805f 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -908,6 +908,7 @@ static int perf_stat_init_aggr_mode(void)
break;
case AGGR_GLOBAL:
case AGGR_THREAD:
+   case AGGR_PCIE_PORT:
case AGGR_UNSET:
default:
break;
@@ -1072,6 +1073,7 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat 
*st)
case AGGR_NONE:
case AGGR_GLOBAL:
case AGGR_THREAD:
+   case AGGR_PCIE_PORT:
case AGGR_UNSET:
default:
break;
@@ -1844,7 +1846,8 @@ int cmd_stat(int argc, const char **argv)
 * --per-thread is aggregated per thread, we dont mix it with cpu mode
 */
if (((stat_config.aggr_mode != AGGR_GLOBAL &&
- stat_config.aggr_mode != AGGR_THREAD) || nr_cgroups) &&
+ stat_config.aggr_mode != AGGR_THREAD &&
+ stat_config.aggr_mode != AGGR_PCIE_PORT) || nr_cgroups) &&
!target__has_cpu(&target)) {
fprintf(stderr, "both cgroup and no-aggregation "
"modes only available in system-wide mode\n");
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c 
b/tools/perf/util/scripting-engines/trace-event-python.c
index 5d341efc3237..e604c199f493 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -1396,7 +1396,8 @@ static void python_process_stat(struct perf_stat_config 
*config,
struct perf_cpu_map *cpus = counter->core.cpus;
int cpu, thread;
 
-   if (config->aggr_mode == AGGR_GLOBAL) {
+   if (config->aggr_mode == AGGR_GLOBAL ||
+   config->aggr_mode == AGGR_PCIE_PORT) {
process_stat(counter, -1, -1, tstamp,
 &counter->counts->aggr);
return;
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index ed3b0ac2f785..db1bec115d0b 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -123,6 +123,7 @@ static void aggr_printout(struct perf_stat_config *config,
config->csv_sep);
break;
case AGGR_GLOBAL:
+   case AGGR_PCIE_PORT:
case AGGR_UNSET:
default:
break;
@@ -322,7 +323,8 @@ static int first_shadow_cpu(struct perf_stat_config *config,
if (config->aggr_mode == AGGR_NONE)
return id;
 
-   if (config->aggr_mode == AGGR_GLOBAL)
+   if (config->aggr_mode == AGGR_GLOBAL ||
+   config->aggr_mode == AGGR_PCIE_PORT)
return 0;
 
for (i = 0; i < perf_evsel__nr_cpus(evsel); i++) {
@@ -416,6 +418,7 @@ static void printout(struct perf_stat_config *config, int 
id, int nr,
if (config->csv_output && !config->metric_only) {
static int aggr_fields[] = {
[AGGR_GLOBAL] = 0,
+   [AGGR_PCIE_PORT] = 0,
[AGGR_THREAD] = 1,
[AGGR_NONE] = 1,
[AGGR_SOCKET] = 2,
@@ -899,6 +902,7 @@ static int aggr_header_lens[] = {
[AGGR_NONE] = 6,
[AGGR_THREAD] = 24,
[AGGR_GLOBAL] = 0,
+   [AGGR_PCIE_PORT] = 0,
 };
 
 static const char *aggr_header_csv[] = {
@@ -907,7 +911,8 @@ static const char *aggr_header_csv[] = {
[AGGR_SOCKET]   =   "socket,cpus",
[AGGR_NONE] =   "cpu,",
[AGGR_THREAD]   =   "comm-pid,",
-   [AGGR_GLOBAL]   =   ""
+   [AGGR_GLOBAL]   =   "",
+   [AGGR_PCIE_PORT] =  "port,"
 };
 
 static void print_metric_headers(struct perf_stat_config *config,
@@ -990,6 +995,8 @@ static void print_interval(struct perf_stat_config *config,
if (!metric_only)
fprintf(output, "  counts %*s 
events\n", unit_width, "unit");
break;
+   case AGGR_PCIE_PORT:
+   break;
case AGGR_GLOBAL:
default:
fprintf(output, "#   time");
@@ -1214,6 +122

Re: [PATCH v3 3/5] perf stat: Helper functions for PCIe root ports list in iostat mode

2021-02-01 Thread Alexander Antonov



On 1/29/2021 11:26 AM, Namhyung Kim wrote:

Hello,

On Tue, Jan 26, 2021 at 5:06 PM Alexander Antonov
 wrote:

Introduce helper functions to control PCIe root ports list.
These helpers will be used in the follow-up patch.

Signed-off-by: Alexander Antonov 
---

[SNIP]

+static int iio_root_ports_list_insert(struct iio_root_ports_list *list,
+ struct iio_root_port * const rp)
+{
+   struct iio_root_port **tmp_buf;
+
+   if (list && rp) {
+   rp->idx = list->nr_entries++;
+   /* One more for NULL.*/
+   tmp_buf = realloc(list->rps,
+ (list->nr_entries + 1) * sizeof(*list->rps));

Why is this +1 needed since you already have the number of
entries in the list?

Thanks,
Namhyung


Hello,

My first approach to iterating through the root ports list used a
NULL-terminated array.

It seems I simply forgot to remove this code. I will fix it.

Thank you,
Alexander




+   if (!tmp_buf) {
+   pr_err("Failed to realloc memory\n");
+   return -ENOMEM;
+   }
+   tmp_buf[rp->idx] = rp;
+   tmp_buf[list->nr_entries] = NULL;
+   list->rps = tmp_buf;
+   }
+   return 0;
+}
--
2.19.1



[PATCH v3 3/5] perf stat: Helper functions for PCIe root ports list in iostat mode

2021-01-26 Thread Alexander Antonov
Introduce helper functions to control PCIe root ports list.
These helpers will be used in the follow-up patch.

Signed-off-by: Alexander Antonov 
---
 tools/perf/arch/x86/util/iostat.c | 127 ++
 1 file changed, 127 insertions(+)
 create mode 100644 tools/perf/arch/x86/util/iostat.c

diff --git a/tools/perf/arch/x86/util/iostat.c 
b/tools/perf/arch/x86/util/iostat.c
new file mode 100644
index ..3ef727f9da63
--- /dev/null
+++ b/tools/perf/arch/x86/util/iostat.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * perf iostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "util/cpumap.h"
+#include "util/debug.h"
+#include "util/iostat.h"
+#include "util/counts.h"
+#include "path.h"
+
+struct iio_root_port {
+   u32 domain;
+   u8 bus;
+   u8 die;
+   u8 pmu_idx;
+   int idx;
+};
+
+struct iio_root_ports_list {
+   struct iio_root_port **rps;
+   int nr_entries;
+};
+
+static void iio_root_port_show(FILE *output,
+  const struct iio_root_port * const rp)
+{
+   if (output && rp)
+   fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n",
+   rp->die, rp->pmu_idx, rp->domain, rp->bus);
+}
+
+static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus,
+  u8 die, u8 pmu_idx)
+{
+   struct iio_root_port *p = calloc(1, sizeof(*p));
+
+   if (p) {
+   p->domain = domain;
+   p->bus = bus;
+   p->die = die;
+   p->pmu_idx = pmu_idx;
+   }
+   return p;
+}
+
+static struct iio_root_ports_list *iio_root_ports_list_new(void)
+{
+   struct iio_root_ports_list *list = calloc(1, sizeof(*list));
+
+   if (list) {
+   list->rps = calloc(1, sizeof(struct iio_root_port *));
+   if (!list->rps) {
+   free(list);
+   list = NULL;
+   }
+   }
+
+   return list;
+}
+
+static void iio_root_ports_list_free(struct iio_root_ports_list *list)
+{
+   int idx;
+
+   if (list) {
+   for (idx = 0; idx < list->nr_entries; idx++)
+   free(list->rps[idx]);
+   free(list->rps);
+   free(list);
+   }
+}
+
+static struct iio_root_port *iio_root_port_find_by_notation(
+   const struct iio_root_ports_list * const list, u32 domain, u8 bus)
+{
+   int idx;
+   struct iio_root_port *rp;
+
+   if (list) {
+   for (idx = 0; idx < list->nr_entries; idx++) {
+   rp = list->rps[idx];
+   if (rp && rp->domain == domain && rp->bus == bus)
+   return rp;
+   }
+   }
+   return NULL;
+}
+
+static int iio_root_ports_list_insert(struct iio_root_ports_list *list,
+ struct iio_root_port * const rp)
+{
+   struct iio_root_port **tmp_buf;
+
+   if (list && rp) {
+   rp->idx = list->nr_entries++;
+   /* One more for NULL.*/
+   tmp_buf = realloc(list->rps,
+ (list->nr_entries + 1) * sizeof(*list->rps));
+   if (!tmp_buf) {
+   pr_err("Failed to realloc memory\n");
+   return -ENOMEM;
+   }
+   tmp_buf[rp->idx] = rp;
+   tmp_buf[list->nr_entries] = NULL;
+   list->rps = tmp_buf;
+   }
+   return 0;
+}
-- 
2.19.1



[PATCH v3 5/5] perf: Update .gitignore file

2021-01-26 Thread Alexander Antonov
After a "make -C tools/perf", git reports the following untracked file:
perf-iostat

Add this generated file to perf's .gitignore file.

Signed-off-by: Alexander Antonov 
---
 tools/perf/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index bf1252dc2cb0..421f27e2b9af 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -19,6 +19,7 @@ perf.data.old
 output.svg
 perf-archive
 perf-with-kcore
+perf-iostat
 tags
 TAGS
 cscope*
-- 
2.19.1



[PATCH v3 4/5] perf stat: Enable iostat mode for x86 platforms

2021-01-26 Thread Alexander Antonov
This functionality is based on recently introduced sysfs attributes
for Intel® Xeon® Scalable processor family (code name Skylake-SP):
Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to
IIO PMON mapping")

Mode is intended to provide four I/O performance metrics in MB per each
PCIe root port:
 - Inbound Read: I/O devices below root port read from the host memory
 - Inbound Write: I/O devices below root port write to the host memory
 - Outbound Read: CPU reads from I/O devices below root port
 - Outbound Write: CPU writes to I/O devices below root port

Each metric requires only one uncore event which increments at every 4B
transfer in corresponding direction. The formulas to compute metrics
are generic:
#EventCount * 4B / (1024 * 1024)

Signed-off-by: Alexander Antonov 
---
 tools/perf/Documentation/perf-iostat.txt |  88 ++
 tools/perf/Makefile.perf |   5 +-
 tools/perf/arch/x86/util/Build   |   1 +
 tools/perf/arch/x86/util/iostat.c| 345 +++
 tools/perf/command-list.txt  |   1 +
 tools/perf/perf-iostat.sh|  12 +
 6 files changed, 451 insertions(+), 1 deletion(-)
 create mode 100644 tools/perf/Documentation/perf-iostat.txt
 create mode 100644 tools/perf/perf-iostat.sh

diff --git a/tools/perf/Documentation/perf-iostat.txt 
b/tools/perf/Documentation/perf-iostat.txt
new file mode 100644
index ..165176944031
--- /dev/null
+++ b/tools/perf/Documentation/perf-iostat.txt
@@ -0,0 +1,88 @@
+perf-iostat(1)
+===
+
+NAME
+
+perf-iostat - Show I/O performance metrics
+
+SYNOPSIS
+
+[verse]
+'perf iostat' list
+'perf iostat'  --  []
+
+DESCRIPTION
+---
+Mode is intended to provide four I/O performance metrics per each PCIe root 
port:
+
+- Inbound Read   - I/O devices below root port read from the host memory, in MB
+
+- Inbound Write  - I/O devices below root port write to the host memory, in MB
+
+- Outbound Read  - CPU reads from I/O devices below root port, in MB
+
+- Outbound Write - CPU writes to I/O devices below root port, in MB
+
+OPTIONS
+---
+...::
+   Any command you can specify in a shell.
+
+list::
+   List all PCIe root ports.
+
+::
+   Select the root ports for monitoring. Comma-separated list is supported.
+
+EXAMPLES
+
+
+1. List all PCIe root ports (example for 2-S platform):
+
+   $ perf iostat list
+   S0-uncore_iio_0<:00>
+   S1-uncore_iio_0<:80>
+   S0-uncore_iio_1<:17>
+   S1-uncore_iio_1<:85>
+   S0-uncore_iio_2<:3a>
+   S1-uncore_iio_2<:ae>
+   S0-uncore_iio_3<:5d>
+   S1-uncore_iio_3<:d7>
+
+2. Collect metrics for all PCIe root ports:
+
+   $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s
+
+Performance counter stats for 'system wide':
+
+  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB)
+   :00102  
  3
+   :80000  
  0
+   :17   352552   430  
 21
+   :85000  
  0
+   :3a300  
  0
+   :ae000  
  0
+   :5d000  
  0
+   :d7000  
  0
+
+3. Collect metrics for comma-separated list of PCIe root ports:
+
+   $ perf iostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M 
oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s
+
+Performance counter stats for 'system wide':
+
+  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB)
+   :17   358559   440  
 22
+   :3a320  
  0
+
+197.081983474 seconds time elapsed
+
+SEE ALSO
+
+linkperf:perf-stat[1]
\ No newline at end of file
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 902c792f326a..b4ab48cc01e3 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -267,6 +267,7 @@ SCRIPT_SH =
 
 SCRIPT_SH += perf-archive.sh
 SCRIPT_SH += perf-with-kcore.sh
+SCRIPT_SH += perf-iostat.sh
 
 grep-libs = $(filter -l%,$(1))

[PATCH v3 2/5] perf stat: Basic support for iostat in perf

2021-01-26 Thread Alexander Antonov
Add basic flow for a new iostat mode in perf. Mode is intended to
provide four I/O performance metrics per each PCIe root port: Inbound Read,
Inbound Write, Outbound Read, Outbound Write.

The actual code to compute the metrics and attribute it to
root port is in follow-on patches.

Signed-off-by: Alexander Antonov 
---
 tools/perf/builtin-stat.c  | 31 ++
 tools/perf/util/iostat.h   | 32 +++
 tools/perf/util/stat-display.c | 40 +-
 tools/perf/util/stat-shadow.c  | 11 +-
 tools/perf/util/stat.h |  1 +
 5 files changed, 113 insertions(+), 2 deletions(-)
 create mode 100644 tools/perf/util/iostat.h

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 60fdb6a0805f..66c913692120 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -65,6 +65,7 @@
 #include "util/target.h"
 #include "util/time-utils.h"
 #include "util/top.h"
+#include "util/iostat.h"
 #include "asm/bug.h"
 
 #include 
@@ -186,6 +187,7 @@ static struct perf_stat_config stat_config = {
.metric_only_len= METRIC_ONLY_LEN,
.walltime_nsecs_stats   = &walltime_nsecs_stats,
.big_num= true,
+   .iostat_run = false,
 };
 
 static inline void diff_timespec(struct timespec *r, struct timespec *a,
@@ -723,6 +725,14 @@ static int parse_metric_groups(const struct option *opt,
return metricgroup__parse_groups(opt, str, &stat_config.metric_events);
 }
 
+__weak int iostat_parse(const struct option *opt __maybe_unused,
+const char *str __maybe_unused,
+int unset __maybe_unused)
+{
+   pr_err("iostat mode is not supported\n");
+   return -1;
+}
+
 static struct option stat_options[] = {
OPT_BOOLEAN('T', "transaction", &transaction_run,
"hardware transaction statistics"),
@@ -803,6 +813,8 @@ static struct option stat_options[] = {
OPT_CALLBACK('M', "metrics", &evsel_list, "metric/metric group list",
 "monitor specified metrics or metric groups (separated by 
,)",
 parse_metric_groups),
+   OPT_CALLBACK_OPTARG(0, "iostat", &evsel_list, &stat_config, "root port",
+   "measure PCIe metrics per root port", iostat_parse),
OPT_END()
 };
 
@@ -1131,6 +1143,12 @@ __weak void arch_topdown_group_warn(void)
 {
 }
 
+__weak int iostat_list(struct evlist *evlist __maybe_unused,
+   struct perf_stat_config *config __maybe_unused)
+{
+   return 0;
+}
+
 /*
  * Add default attributes, if there were no attributes specified or
  * if -d/--detailed, -d -d or -d -d -d is used:
@@ -1682,6 +1700,10 @@ static void setup_system_wide(int forks)
}
 }
 
+__weak void iostat_release(struct evlist *evlist __maybe_unused)
+{
+}
+
 int cmd_stat(int argc, const char **argv)
 {
const char * const stat_usage[] = {
@@ -1858,6 +1880,12 @@ int cmd_stat(int argc, const char **argv)
goto out;
}
 
+   if (stat_config.iostat_run) {
+   status = iostat_list(evsel_list, &stat_config);
+   if (status || !stat_config.iostat_run)
+   goto out;
+   }
+
if (add_default_attributes())
goto out;
 
@@ -2008,6 +2036,9 @@ int cmd_stat(int argc, const char **argv)
perf_stat__exit_aggr_mode();
perf_evlist__free_stats(evsel_list);
 out:
+   if (stat_config.iostat_run)
+   iostat_release(evsel_list);
+
zfree(&stat_config.walltime_run);
 
if (smi_cost && smi_reset)
diff --git a/tools/perf/util/iostat.h b/tools/perf/util/iostat.h
new file mode 100644
index ..b34ebedfd5e6
--- /dev/null
+++ b/tools/perf/util/iostat.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * perf iostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov 
+ */
+
+#ifndef _IOSTAT_H
+#define _IOSTAT_H
+
+#include 
+#include "util/stat.h"
+#include "util/parse-events.h"
+#include "util/evlist.h"
+
+struct option;
+struct perf_stat_config;
+struct evlist;
+struct timespec;
+
+int iostat_parse(const struct option *opt, const char *str,
+int unset __maybe_unused);
+void iostat_prefix(struct perf_stat_config *config, struct evlist *evlist,
+  char *prefix, struct timespec *ts);
+void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel,
+struct perf_stat_output_ctx *out);
+int iostat_list(struct evlist *evlist, struct perf_stat_config *config);
+void iostat_release(struct evlist *evlist);
+
+#endif /* _IOSTAT_H */
diff --git a/tools/p

[PATCH v3 0/5] perf stat: Introduce iostat mode to provide I/O performance metrics

2021-01-26 Thread Alexander Antonov
The previous version can be found at:
v2: 
https://lkml.kernel.org/r/20201223130320.3930-1-alexander.anto...@linux.intel.com

Changes in this revision are:
v2 -> v3:
- Addressed comments from Namhyung Kim:
  1. Removed perf_device pointer from evsel structure. Use priv field instead
  2. Renamed 'iiostat' to 'iostat'
  3. Renamed 'show' mode to 'list' mode
  4. Renamed iiostat_delete_root_ports() to iostat_release() and
     iiostat_show_root_ports() to iostat_list()


The previous version can be found at:
v1: 
https://lkml.kernel.org/r/20201210090340.14358-1-alexander.anto...@linux.intel.com

Changes in this revision are:
v1 -> v2:
- Addressed comment from Arnaldo Carvalho de Melo:
  1. Using 'perf iiostat' subcommand instead of 'perf stat --iiostat':
- Added perf-iiostat.sh script to use short command
- Updated manual pages to get help for 'perf iiostat'
- Added 'perf-iiostat' to perf's gitignore file

Mode is intended to provide four I/O performance metrics in MB per each
root port:
 - Inbound Read:   I/O devices below root port read from the host memory
 - Inbound Write:  I/O devices below root port write to the host memory
 - Outbound Read:  CPU reads from I/O devices below root port
 - Outbound Write: CPU writes to I/O devices below root port

Each metric requires only one uncore event which increments at every 4B
transfer in corresponding direction. The formulas to compute metrics
are generic:
#EventCount * 4B / (1024 * 1024)

Note: iostat introduces a new perf data aggregation mode (per PCIe root port);
hence the -e and -M options are not supported.

Usage examples:

1. List all PCIe root ports (example for 2-S platform):
   $ perf iostat list
   S0-uncore_iio_0<0000:00>
   S1-uncore_iio_0<0000:80>
   S0-uncore_iio_1<0000:17>
   S1-uncore_iio_1<0000:85>
   S0-uncore_iio_2<0000:3a>
   S1-uncore_iio_2<0000:ae>
   S0-uncore_iio_3<0000:5d>
   S1-uncore_iio_3<0000:d7>

2. Collect metrics for all PCIe root ports:
   $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
   357708+0 records in
   357707+0 records out
   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s

Performance counter stats for 'system wide':

   port             Inbound Read(MB)    Inbound Write(MB)    Outbound Read(MB)   Outbound Write(MB)
0000:00                    1                    0                    2                    3
0000:80                    0                    0                    0                    0
0000:17               352552                   43                    0                   21
0000:85                    0                    0                    0                    0
0000:3a                    3                    0                    0                    0
0000:ae                    0                    0                    0                    0
0000:5d                    0                    0                    0                    0
0000:d7                    0                    0                    0                    0

3. Collect metrics for comma separated list of PCIe root ports:
   $ perf iostat 0000:17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
   357708+0 records in
   357707+0 records out
   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s

Performance counter stats for 'system wide':

   port             Inbound Read(MB)    Inbound Write(MB)    Outbound Read(MB)   Outbound Write(MB)
0000:17               358559                   44                    0                   22
0000:3a                    3                    2                    0                    0

197.081983474 seconds time elapsed

Alexander Antonov (5):
  perf stat: Add AGGR_PCIE_PORT mode
  perf stat: Basic support for iostat in perf
  perf stat: Helper functions for PCIe root ports list in iostat mode
  perf stat: Enable iostat mode for x86 platforms
  perf: Update .gitignore file

 tools/perf/.gitignore |   1 +
 tools/perf/Documentation/perf-iostat.txt  |  88 
 tools/perf/Makefile.perf  |   5 +-
 tools/perf/arch/x86/util/Build|   1 +
 tools/perf/arch/x86/util/iostat.c | 472 ++
 tools/perf/builtin-stat.c |  36 +-
 tools/perf/command-list.txt   |   1 +
 tools/perf/perf-iostat.sh |  12 +
 tools/perf/util/iostat.h  |  32 ++
 .../scripting-engines/trace-event-python.c|   3 +-
 tools/perf/util/stat-display.c|  53 +-
 tools/perf/util/stat-shadow.c |  11 +-
 tools/perf/util/stat.c|   4 +-
 tools/perf/util/stat.h|   2 +
 14 files changed, 713 insertions(+), 8 dele

[PATCH v3 1/5] perf stat: Add AGGR_PCIE_PORT mode

2021-01-26 Thread Alexander Antonov
Add AGGR_PCIE_PORT mode to be able to distinguish aggr_mode
for root ports in the following patches.

Signed-off-by: Alexander Antonov 
---
 tools/perf/builtin-stat.c   |  5 -
 .../util/scripting-engines/trace-event-python.c |  3 ++-
 tools/perf/util/stat-display.c  | 13 +++--
 tools/perf/util/stat.c  |  4 +++-
 tools/perf/util/stat.h  |  1 +
 5 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 468fc49420ce..60fdb6a0805f 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -908,6 +908,7 @@ static int perf_stat_init_aggr_mode(void)
break;
case AGGR_GLOBAL:
case AGGR_THREAD:
+   case AGGR_PCIE_PORT:
case AGGR_UNSET:
default:
break;
@@ -1072,6 +1073,7 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat 
*st)
case AGGR_NONE:
case AGGR_GLOBAL:
case AGGR_THREAD:
+   case AGGR_PCIE_PORT:
case AGGR_UNSET:
default:
break;
@@ -1844,7 +1846,8 @@ int cmd_stat(int argc, const char **argv)
 * --per-thread is aggregated per thread, we dont mix it with cpu mode
 */
if (((stat_config.aggr_mode != AGGR_GLOBAL &&
- stat_config.aggr_mode != AGGR_THREAD) || nr_cgroups) &&
+ stat_config.aggr_mode != AGGR_THREAD &&
+ stat_config.aggr_mode != AGGR_PCIE_PORT) || nr_cgroups) &&
!target__has_cpu(&target)) {
fprintf(stderr, "both cgroup and no-aggregation "
"modes only available in system-wide mode\n");
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c 
b/tools/perf/util/scripting-engines/trace-event-python.c
index 5d341efc3237..e604c199f493 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -1396,7 +1396,8 @@ static void python_process_stat(struct perf_stat_config 
*config,
struct perf_cpu_map *cpus = counter->core.cpus;
int cpu, thread;
 
-   if (config->aggr_mode == AGGR_GLOBAL) {
+   if (config->aggr_mode == AGGR_GLOBAL ||
+   config->aggr_mode == AGGR_PCIE_PORT) {
process_stat(counter, -1, -1, tstamp,
 &counter->counts->aggr);
return;
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index ed3b0ac2f785..db1bec115d0b 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -123,6 +123,7 @@ static void aggr_printout(struct perf_stat_config *config,
config->csv_sep);
break;
case AGGR_GLOBAL:
+   case AGGR_PCIE_PORT:
case AGGR_UNSET:
default:
break;
@@ -322,7 +323,8 @@ static int first_shadow_cpu(struct perf_stat_config *config,
if (config->aggr_mode == AGGR_NONE)
return id;
 
-   if (config->aggr_mode == AGGR_GLOBAL)
+   if (config->aggr_mode == AGGR_GLOBAL ||
+   config->aggr_mode == AGGR_PCIE_PORT)
return 0;
 
for (i = 0; i < perf_evsel__nr_cpus(evsel); i++) {
@@ -416,6 +418,7 @@ static void printout(struct perf_stat_config *config, int 
id, int nr,
if (config->csv_output && !config->metric_only) {
static int aggr_fields[] = {
[AGGR_GLOBAL] = 0,
+   [AGGR_PCIE_PORT] = 0,
[AGGR_THREAD] = 1,
[AGGR_NONE] = 1,
[AGGR_SOCKET] = 2,
@@ -899,6 +902,7 @@ static int aggr_header_lens[] = {
[AGGR_NONE] = 6,
[AGGR_THREAD] = 24,
[AGGR_GLOBAL] = 0,
+   [AGGR_PCIE_PORT] = 0,
 };
 
 static const char *aggr_header_csv[] = {
@@ -907,7 +911,8 @@ static const char *aggr_header_csv[] = {
[AGGR_SOCKET]   =   "socket,cpus",
[AGGR_NONE] =   "cpu,",
[AGGR_THREAD]   =   "comm-pid,",
-   [AGGR_GLOBAL]   =   ""
+   [AGGR_GLOBAL]   =   "",
+   [AGGR_PCIE_PORT] =  "port,"
 };
 
 static void print_metric_headers(struct perf_stat_config *config,
@@ -990,6 +995,8 @@ static void print_interval(struct perf_stat_config *config,
if (!metric_only)
fprintf(output, "  counts %*s 
events\n", unit_width, "unit");
break;
+   case AGGR_PCIE_PORT:
+   break;
case AGGR_GLOBAL:
default:
fprintf(output, "#   time");
@@ -1214,6 +122

Re: [PATCH v2 5/6] perf stat: Enable iiostat mode for x86 platforms

2021-01-15 Thread Alexander Antonov



On 1/15/2021 10:33 AM, Namhyung Kim wrote:

On Fri, Jan 15, 2021 at 1:41 AM Alexander Antonov
 wrote:

On 1/14/2021 6:39 AM, Namhyung Kim wrote:

On Wed, Jan 13, 2021 at 9:08 PM Alexander Antonov
 wrote:

On 1/6/2021 12:02 PM, Namhyung Kim wrote:

On Wed, Dec 23, 2020 at 10:03 PM Alexander Antonov

diff --git a/tools/perf/perf-iiostat.sh b/tools/perf/perf-iiostat.sh
new file mode 100644
index ..2c5168d2550b
--- /dev/null
+++ b/tools/perf/perf-iiostat.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# perf iiostat
+# Alexander Antonov 
+
+if [[ "$1" == "show" ]] || [[ "$1" =~ ([a-f0-9A-F]{1,}):([a-f0-9A-F]{1,2})(,)? 
]]; then
+DELIMITER="="
+else
+DELIMITER=" "
+fi
+
+perf stat --iiostat$DELIMITER$*

Why is this needed?

Thanks,
Namhyung

Arnaldo raised a question about the format of the 'perf stat --iiostat'
subcommand and explained how it can be changed to 'perf iiostat' through
the aliases mechanism in perf.

Yeah, I know that.  What I'm asking is the DELIMITER part.

Thanks,
Namhyung

I'm using DELIMITER to resolve two different cases for format of iiostat
command:
The first one is the command with an option for iiostat mode, for example:
'perf iiostat show' which should be converted to 'perf stat
--iiostat=show' or
'perf iiostat :ae,:5d' to 'perf stat --iiostat=:ae,:5d'.
The second is the command without any option for iiostat: 'perf iiostat
-I 1000'
should be converted to 'perf stat --iiostat -I 1000'.

Can't we simply use a whitespace ?

We need to use the equal sign to pass arguments to iiostat mode.

Thanks,
Alexander


Re: [PATCH v2 5/6] perf stat: Enable iiostat mode for x86 platforms

2021-01-14 Thread Alexander Antonov



On 1/14/2021 6:39 AM, Namhyung Kim wrote:

On Wed, Jan 13, 2021 at 9:08 PM Alexander Antonov
 wrote:


On 1/6/2021 12:02 PM, Namhyung Kim wrote:

On Wed, Dec 23, 2020 at 10:03 PM Alexander Antonov
 wrote:

This functionality is based on recently introduced sysfs attributes
for Intel® Xeon® Scalable processor family (code name Skylake-SP):
Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to
IIO PMON mapping")

Mode is intended to provide four I/O performance metrics in MB per each
IIO stack:
   - Inbound Read: I/O devices below IIO stack read from the host memory
   - Inbound Write: I/O devices below IIO stack write to the host memory
   - Outbound Read: CPU reads from I/O devices below IIO stack
   - Outbound Write: CPU writes to I/O devices below IIO stack

Each metric requires only one IIO event which increments at every 4B
transfer in corresponding direction. The formulas to compute metrics
are generic:
  #EventCount * 4B / (1024 * 1024)

Hmm.. maybe we can do this with JSON metrics, no?

Do you mean to add metrics to the *-metrics.json file?
It looks possible, but in that case the JSON file would have to be updated
for each newly enabled platform even though the calculations stay the same.
I would prefer to leave it as is, because perf will work without any changes
to the userspace part once IIO sysfs attributes are added for new platforms.

OK.


Signed-off-by: Alexander Antonov 
---

[SNIP]

diff --git a/tools/perf/perf-iiostat.sh b/tools/perf/perf-iiostat.sh
new file mode 100644
index ..2c5168d2550b
--- /dev/null
+++ b/tools/perf/perf-iiostat.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# perf iiostat
+# Alexander Antonov 
+
+if [[ "$1" == "show" ]] || [[ "$1" =~ ([a-f0-9A-F]{1,}):([a-f0-9A-F]{1,2})(,)? 
]]; then
+DELIMITER="="
+else
+DELIMITER=" "
+fi
+
+perf stat --iiostat$DELIMITER$*

Why is this needed?

Thanks,
Namhyung

Arnaldo raised a question about the format of the 'perf stat --iiostat'
subcommand and explained how it can be changed to 'perf iiostat' through
the aliases mechanism in perf.

Yeah, I know that.  What I'm asking is the DELIMITER part.

Thanks,
Namhyung
I'm using DELIMITER to resolve two different cases for format of iiostat 
command:

The first one is the command with an option for iiostat mode, for example:
'perf iiostat show' which should be converted to 'perf stat 
--iiostat=show' or

'perf iiostat :ae,:5d' to 'perf stat --iiostat=:ae,:5d'.
The second is the command without any option for iiostat: 'perf iiostat 
-I 1000'

should be converted to 'perf stat --iiostat -I 1000'.

Thanks,
Alexander


Re: [PATCH v2 3/6] perf stat: Basic support for iiostat in perf

2021-01-14 Thread Alexander Antonov



On 1/14/2021 6:34 AM, Namhyung Kim wrote:

Hello,

On Wed, Jan 13, 2021 at 8:34 PM Alexander Antonov
 wrote:


On 1/6/2021 11:56 AM, Namhyung Kim wrote:

On Wed, Dec 23, 2020 at 10:03 PM Alexander Antonov
 wrote:

Add basic flow for a new iiostat mode in perf. Mode is intended to
provide four I/O performance metrics per each IIO stack: Inbound Read,
Inbound Write, Outbound Read, Outbound Write.

It seems like a generic analysis and other archs can extend it later..
Then we can make it a bit more general.. at least, names? :)

I'm not sure that I fully understand you. Do you mean to rename metrics?
The mode is intended to provide PCIe metrics which are applicable to
other archs as well.
Actually, I suppose we can rename 'iiostat' to 'pciestat' or something
like this to make it a bit more general, because the name 'IIO'
(Integrated I/O stack) is Intel specific and it can be named in a
different way on other platforms. In this case the code has to be
updated in the same way as well.

Maybe just 'iostat' ?

Yeah, it looks better :)




The actual code to compute the metrics and attribute it to
evsel::perf_device is in follow-on patches.

Signed-off-by: Alexander Antonov 
---
   tools/perf/builtin-stat.c  | 33 -
   tools/perf/util/iiostat.h  | 33 +
   tools/perf/util/stat-display.c | 38 +-
   tools/perf/util/stat-shadow.c  | 11 +-
   tools/perf/util/stat.h |  1 +
   5 files changed, 113 insertions(+), 3 deletions(-)
   create mode 100644 tools/perf/util/iiostat.h

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 72f9d0aa3f96..14c3da136927 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -67,6 +67,7 @@
   #include "util/top.h"
   #include "util/affinity.h"
   #include "util/pfm.h"
+#include "util/iiostat.h"
   #include "asm/bug.h"

   #include 
@@ -198,7 +199,8 @@ static struct perf_stat_config stat_config = {
  .walltime_nsecs_stats   = &walltime_nsecs_stats,
  .big_num= true,
  .ctl_fd = -1,
-   .ctl_fd_ack = -1
+   .ctl_fd_ack = -1,
+   .iiostat_run= false,
   };

   static bool cpus_map_matched(struct evsel *a, struct evsel *b)
@@ -1073,6 +1075,14 @@ static int parse_stat_cgroups(const struct option *opt,
  return parse_cgroups(opt, str, unset);
   }

+__weak int iiostat_parse(const struct option *opt __maybe_unused,
+const char *str __maybe_unused,
+int unset __maybe_unused)
+{
+   pr_err("iiostat mode is not supported\n");
+   return -1;
+}
+
   static struct option stat_options[] = {
  OPT_BOOLEAN('T', "transaction", &transaction_run,
  "hardware transaction statistics"),
@@ -1185,6 +1195,8 @@ static struct option stat_options[] = {
   "\t\t\t  Optionally send control command completion 
('ack\\n') to ack-fd descriptor.\n"
   "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened 
and used as ctl-fd / ack-fd.",
parse_control_option),
+   OPT_CALLBACK_OPTARG(0, "iiostat", &evsel_list, &stat_config, "root 
port",
+   "measure PCIe metrics per IIO stack", 
iiostat_parse),
  OPT_END()
   };

@@ -1509,6 +1521,12 @@ static int perf_stat_init_aggr_mode_file(struct 
perf_stat *st)
  return 0;
   }

+__weak int iiostat_show_root_ports(struct evlist *evlist __maybe_unused,
+  struct perf_stat_config *config 
__maybe_unused)
+{
+   return 0;
+}

I think it's too specific, maybe iiostat_prepare() ?

What do you think about iiostat_show_root_ports() -> iiostat_show()?

I'm ok with it, I thought it needs some initialization work there.


+
   /*
* Add default attributes, if there were no attributes specified or
* if -d/--detailed, -d -d or -d -d -d is used:
@@ -2054,6 +2072,10 @@ static void setup_system_wide(int forks)
  }
   }

+__weak void iiostat_delete_root_ports(struct evlist *evlist __maybe_unused)
+{
+}

Same here..

I suggest to rename iiostat_delete_root_ports() -> iiostat_release().
What do you think?

Looks good.


+
   int cmd_stat(int argc, const char **argv)
   {
  const char * const stat_usage[] = {
@@ -2230,6 +2252,12 @@ int cmd_stat(int argc, const char **argv)
  goto out;
  }

+   if (stat_config.iiostat_run) {
+   status = iiostat_show_root_ports(evsel_list, &stat_config);
+   if (status || !stat_config.iiostat_run)
+   goto out;
+   }
+
  if (add_default_attributes(

Re: [PATCH v2 5/6] perf stat: Enable iiostat mode for x86 platforms

2021-01-13 Thread Alexander Antonov



On 1/6/2021 12:02 PM, Namhyung Kim wrote:

On Wed, Dec 23, 2020 at 10:03 PM Alexander Antonov
 wrote:

This functionality is based on recently introduced sysfs attributes
for Intel® Xeon® Scalable processor family (code name Skylake-SP):
Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to
IIO PMON mapping")

Mode is intended to provide four I/O performance metrics in MB per each
IIO stack:
  - Inbound Read: I/O devices below IIO stack read from the host memory
  - Inbound Write: I/O devices below IIO stack write to the host memory
  - Outbound Read: CPU reads from I/O devices below IIO stack
  - Outbound Write: CPU writes to I/O devices below IIO stack

Each metric requires only one IIO event which increments at every 4B
transfer in corresponding direction. The formulas to compute metrics
are generic:
 #EventCount * 4B / (1024 * 1024)

Hmm.. maybe we can do this with JSON metrics, no?

Do you mean to add metrics to the *-metrics.json file?
It looks possible, but in that case the JSON file would have to be updated
for each newly enabled platform even though the calculations stay the same.
I would prefer to leave it as is, because perf will work without any changes
to the userspace part once IIO sysfs attributes are added for new platforms.



Signed-off-by: Alexander Antonov 
---
  tools/perf/Documentation/perf-iiostat.txt |  89 ++
  tools/perf/Makefile.perf  |   5 +-
  tools/perf/arch/x86/util/Build|   1 +
  tools/perf/arch/x86/util/iiostat.c| 337 ++
  tools/perf/command-list.txt   |   1 +
  tools/perf/perf-iiostat.sh|  12 +
  6 files changed, 444 insertions(+), 1 deletion(-)
  create mode 100644 tools/perf/Documentation/perf-iiostat.txt
  create mode 100644 tools/perf/perf-iiostat.sh

diff --git a/tools/perf/Documentation/perf-iiostat.txt 
b/tools/perf/Documentation/perf-iiostat.txt
new file mode 100644
index ..38b5697b0d85
--- /dev/null
+++ b/tools/perf/Documentation/perf-iiostat.txt
@@ -0,0 +1,89 @@
+perf-iiostat(1)
+===
+
+NAME
+
+perf-iiostat - Show I/O performance metrics
+
+SYNOPSIS
+
+[verse]
+'perf iiostat' show
+'perf iiostat'  --  []
+
+DESCRIPTION
+---
+Mode is intended to provide four I/O performance metrics per each IIO
+stack (PCIe root port):
+
+- Inbound Read   - I/O devices below IIO stack read from the host memory, in MB
+
+- Inbound Write  - I/O devices below IIO stack write to the host memory, in MB
+
+- Outbound Read  - CPU reads from I/O devices below IIO stack, in MB
+
+- Outbound Write - CPU writes to I/O devices below IIO stack, in MB
+
+OPTIONS
+---
+...::
+   Any command you can specify in a shell.
+
+show::
+   List all IIO stacks.

I'd prefer 'list' for this, but not a strong opinion..

The 'list' is fine for me as well.



+
+::
+   Select the root ports for monitoring. Comma-separated list is supported.
+
+EXAMPLES
+
+
+1. List all IIO stacks (example for 2-S platform):
+
+   $ perf iiostat show
+   S0-uncore_iio_0<:00>
+   S1-uncore_iio_0<:80>
+   S0-uncore_iio_1<:17>
+   S1-uncore_iio_1<:85>
+   S0-uncore_iio_2<:3a>
+   S1-uncore_iio_2<:ae>
+   S0-uncore_iio_3<:5d>
+   S1-uncore_iio_3<:d7>
+
+2. Collect metrics for all I/O stacks:
+
+   $ perf iiostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s
+
+Performance counter stats for 'system wide':
+
+  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB)
+   :00102  
  3
+   :80000  
  0
+   :17   352552   430  
 21
+   :85000  
  0
+   :3a300  
  0
+   :ae000  
  0
+   :5d000  
  0
+   :d7000  
  0
+
+3. Collect metrics for comma-separated list of I/O stacks:
+
+   $ perf iiostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M 
oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s
+
+Performance counter stats for 'system wide':
+
+  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB)
+

Re: [PATCH v2 3/6] perf stat: Basic support for iiostat in perf

2021-01-13 Thread Alexander Antonov



On 1/6/2021 11:56 AM, Namhyung Kim wrote:

On Wed, Dec 23, 2020 at 10:03 PM Alexander Antonov
 wrote:

Add basic flow for a new iiostat mode in perf. Mode is intended to
provide four I/O performance metrics per each IIO stack: Inbound Read,
Inbound Write, Outbound Read, Outbound Write.

It seems like a generic analysis and other archs can extend it later..
Then we can make it a bit more general.. at least, names? :)

I'm not sure that I fully understand you. Do you mean to rename metrics?
The mode is intended to provide PCIe metrics which are applicable to
other archs as well.
Actually, I suppose we can rename 'iiostat' to 'pciestat' or something
like this to make it a bit more general, because the name 'IIO'
(Integrated I/O stack) is Intel specific and it can be named in a
different way on other platforms. In this case the code has to be
updated in the same way as well.



The actual code to compute the metrics and attribute it to
evsel::perf_device is in follow-on patches.

Signed-off-by: Alexander Antonov 
---
  tools/perf/builtin-stat.c  | 33 -
  tools/perf/util/iiostat.h  | 33 +
  tools/perf/util/stat-display.c | 38 +-
  tools/perf/util/stat-shadow.c  | 11 +-
  tools/perf/util/stat.h |  1 +
  5 files changed, 113 insertions(+), 3 deletions(-)
  create mode 100644 tools/perf/util/iiostat.h

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 72f9d0aa3f96..14c3da136927 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -67,6 +67,7 @@
  #include "util/top.h"
  #include "util/affinity.h"
  #include "util/pfm.h"
+#include "util/iiostat.h"
  #include "asm/bug.h"

  #include 
@@ -198,7 +199,8 @@ static struct perf_stat_config stat_config = {
 .walltime_nsecs_stats   = &walltime_nsecs_stats,
 .big_num= true,
 .ctl_fd = -1,
-   .ctl_fd_ack = -1
+   .ctl_fd_ack = -1,
+   .iiostat_run= false,
  };

  static bool cpus_map_matched(struct evsel *a, struct evsel *b)
@@ -1073,6 +1075,14 @@ static int parse_stat_cgroups(const struct option *opt,
 return parse_cgroups(opt, str, unset);
  }

+__weak int iiostat_parse(const struct option *opt __maybe_unused,
+const char *str __maybe_unused,
+int unset __maybe_unused)
+{
+   pr_err("iiostat mode is not supported\n");
+   return -1;
+}
+
  static struct option stat_options[] = {
 OPT_BOOLEAN('T', "transaction", &transaction_run,
 "hardware transaction statistics"),
@@ -1185,6 +1195,8 @@ static struct option stat_options[] = {
  "\t\t\t  Optionally send control command completion ('ack\\n') 
to ack-fd descriptor.\n"
  "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and 
used as ctl-fd / ack-fd.",
   parse_control_option),
+   OPT_CALLBACK_OPTARG(0, "iiostat", &evsel_list, &stat_config, "root 
port",
+   "measure PCIe metrics per IIO stack", 
iiostat_parse),
 OPT_END()
  };

@@ -1509,6 +1521,12 @@ static int perf_stat_init_aggr_mode_file(struct 
perf_stat *st)
 return 0;
  }

+__weak int iiostat_show_root_ports(struct evlist *evlist __maybe_unused,
+  struct perf_stat_config *config 
__maybe_unused)
+{
+   return 0;
+}

I think it's too specific, maybe iiostat_prepare() ?

What do you think about iiostat_show_root_ports() -> iiostat_show()?



+
  /*
   * Add default attributes, if there were no attributes specified or
   * if -d/--detailed, -d -d or -d -d -d is used:
@@ -2054,6 +2072,10 @@ static void setup_system_wide(int forks)
 }
  }

+__weak void iiostat_delete_root_ports(struct evlist *evlist __maybe_unused)
+{
+}

Same here..

I suggest to rename iiostat_delete_root_ports() -> iiostat_release().
What do you think?



+
  int cmd_stat(int argc, const char **argv)
  {
 const char * const stat_usage[] = {
@@ -2230,6 +2252,12 @@ int cmd_stat(int argc, const char **argv)
 goto out;
 }

+   if (stat_config.iiostat_run) {
+   status = iiostat_show_root_ports(evsel_list, &stat_config);
+   if (status || !stat_config.iiostat_run)
+   goto out;
+   }
+
 if (add_default_attributes())
 goto out;

@@ -2406,6 +2434,9 @@ int cmd_stat(int argc, const char **argv)
 perf_stat__exit_aggr_mode();
 perf_evlist__free_stats(evsel_list);
  out:
+   if (stat_config.iiostat_run)
+   iiostat_delete_root_ports(evsel_list);
+
 z

Re: [PATCH v2 2/6] perf evsel: Introduce an observed performance device

2021-01-13 Thread Alexander Antonov



On 1/6/2021 11:44 AM, Namhyung Kim wrote:

Hi,

On Wed, Dec 23, 2020 at 10:03 PM Alexander Antonov
 wrote:

Adding evsel::perf_device void pointer.

For performance monitoring purposes, an evsel can have a related device.
These changes allow to attribute, for example, I/O performance metrics
to IIO stack.

Signed-off-by: Alexander Antonov 
---
  tools/perf/util/evsel.h | 1 +
  1 file changed, 1 insertion(+)

diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 79a860d8e3ee..c346920f477a 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -127,6 +127,7 @@ struct evsel {
  * See also evsel__has_callchain().
  */
 __u64   synth_sample_type;
+   void*perf_device;

Maybe we can use the existing 'priv' field.

Thanks,
Namhyung


Hello Namhyung,

Looks like the 'priv' field isn't used in this case. I suppose it can be
re-used in iiostat mode.

Thanks,
Alexander



  };

  struct perf_missing_features {
--
2.19.1



[PATCH v2 2/6] perf evsel: Introduce an observed performance device

2020-12-23 Thread Alexander Antonov
Adding evsel::perf_device void pointer.

For performance monitoring purposes, an evsel can have a related device.
These changes allow to attribute, for example, I/O performance metrics
to IIO stack.

Signed-off-by: Alexander Antonov 
---
 tools/perf/util/evsel.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 79a860d8e3ee..c346920f477a 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -127,6 +127,7 @@ struct evsel {
 * See also evsel__has_callchain().
 */
__u64   synth_sample_type;
+   void*perf_device;
 };
 
 struct perf_missing_features {
-- 
2.19.1



[PATCH v2 1/6] perf stat: Add AGGR_IIO_STACK mode

2020-12-23 Thread Alexander Antonov
Adding AGGR_IIO_STACK mode to be able to distinguish aggr_mode
for IIO stacks in following patches.

Signed-off-by: Alexander Antonov 
---
 tools/perf/builtin-stat.c   |  7 +--
 .../util/scripting-engines/trace-event-python.c |  2 +-
 tools/perf/util/stat-display.c  | 13 +++--
 tools/perf/util/stat.c  |  3 ++-
 tools/perf/util/stat.h  |  1 +
 5 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f15b2f8aa14d..72f9d0aa3f96 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -913,7 +913,7 @@ static int __run_perf_stat(int argc, const char **argv, int 
run_idx)
init_stats(&walltime_nsecs_stats);
update_stats(&walltime_nsecs_stats, t1 - t0);
 
-   if (stat_config.aggr_mode == AGGR_GLOBAL)
+   if (stat_config.aggr_mode == AGGR_GLOBAL || 
stat_config.aggr_mode == AGGR_IIO_STACK)
perf_evlist__save_aggr_prev_raw_counts(evsel_list);
 
perf_evlist__copy_prev_raw_counts(evsel_list);
@@ -1309,6 +1309,7 @@ static int perf_stat_init_aggr_mode(void)
break;
case AGGR_GLOBAL:
case AGGR_THREAD:
+   case AGGR_IIO_STACK:
case AGGR_UNSET:
default:
break;
@@ -1499,6 +1500,7 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat 
*st)
case AGGR_NONE:
case AGGR_GLOBAL:
case AGGR_THREAD:
+   case AGGR_IIO_STACK:
case AGGR_UNSET:
default:
break;
@@ -2216,7 +2218,8 @@ int cmd_stat(int argc, const char **argv)
 * --per-thread is aggregated per thread, we dont mix it with cpu mode
 */
if (((stat_config.aggr_mode != AGGR_GLOBAL &&
- stat_config.aggr_mode != AGGR_THREAD) || nr_cgroups) &&
+ stat_config.aggr_mode != AGGR_THREAD &&
+ stat_config.aggr_mode != AGGR_IIO_STACK) || nr_cgroups) &&
!target__has_cpu(&target)) {
fprintf(stderr, "both cgroup and no-aggregation "
"modes only available in system-wide mode\n");
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c 
b/tools/perf/util/scripting-engines/trace-event-python.c
index c83c2c6564e0..e8b472faeae4 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -1401,7 +1401,7 @@ static void python_process_stat(struct perf_stat_config 
*config,
struct perf_cpu_map *cpus = counter->core.cpus;
int cpu, thread;
 
-   if (config->aggr_mode == AGGR_GLOBAL) {
+   if (config->aggr_mode == AGGR_GLOBAL || config->aggr_mode == 
AGGR_IIO_STACK) {
process_stat(counter, -1, -1, tstamp,
 &counter->counts->aggr);
return;
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 4b57c0c07632..3bfcdb80443a 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -133,6 +133,7 @@ static void aggr_printout(struct perf_stat_config *config,
config->csv_sep);
break;
case AGGR_GLOBAL:
+   case AGGR_IIO_STACK:
case AGGR_UNSET:
default:
break;
@@ -330,7 +331,7 @@ static int first_shadow_cpu(struct perf_stat_config *config,
if (config->aggr_mode == AGGR_NONE)
return id;
 
-   if (config->aggr_mode == AGGR_GLOBAL)
+   if (config->aggr_mode == AGGR_GLOBAL || config->aggr_mode == 
AGGR_IIO_STACK)
return 0;
 
for (i = 0; i < evsel__nr_cpus(evsel); i++) {
@@ -424,6 +425,7 @@ static void printout(struct perf_stat_config *config, int 
id, int nr,
if (config->csv_output && !config->metric_only) {
static int aggr_fields[] = {
[AGGR_GLOBAL] = 0,
+   [AGGR_IIO_STACK] = 0,
[AGGR_THREAD] = 1,
[AGGR_NONE] = 1,
[AGGR_SOCKET] = 2,
@@ -906,6 +908,7 @@ static int aggr_header_lens[] = {
[AGGR_NONE] = 6,
[AGGR_THREAD] = 24,
[AGGR_GLOBAL] = 0,
+   [AGGR_IIO_STACK] = 0,
 };
 
 static const char *aggr_header_csv[] = {
@@ -914,7 +917,8 @@ static const char *aggr_header_csv[] = {
[AGGR_SOCKET]   =   "socket,cpus",
[AGGR_NONE] =   "cpu,",
[AGGR_THREAD]   =   "comm-pid,",
-   [AGGR_GLOBAL]   =   ""
+   [AGGR_GLOBAL]   =   "",
+   [AGGR_IIO_STACK] =  "port,"
 };
 
 static void print_metric_headers(struct perf_stat_config *c

[PATCH v2 6/6] perf: Update .gitignore file

2020-12-23 Thread Alexander Antonov
After a "make -C tools/perf", git reports the following untracked file:
perf-iiostat

Add this generated file to perf's .gitignore file.

Signed-off-by: Alexander Antonov 
---
 tools/perf/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index f3f84781fd74..ab826736e677 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -20,6 +20,7 @@ perf.data.old
 output.svg
 perf-archive
 perf-with-kcore
+perf-iiostat
 tags
 TAGS
 cscope*
-- 
2.19.1



[PATCH v2 5/6] perf stat: Enable iiostat mode for x86 platforms

2020-12-23 Thread Alexander Antonov
This functionality is based on recently introduced sysfs attributes
for Intel® Xeon® Scalable processor family (code name Skylake-SP):
Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to
IIO PMON mapping")

Mode is intended to provide four I/O performance metrics in MB per each
IIO stack:
 - Inbound Read: I/O devices below IIO stack read from the host memory
 - Inbound Write: I/O devices below IIO stack write to the host memory
 - Outbound Read: CPU reads from I/O devices below IIO stack
 - Outbound Write: CPU writes to I/O devices below IIO stack

Each metric requires only one IIO event which increments at every 4B
transfer in corresponding direction. The formulas to compute metrics
are generic:
#EventCount * 4B / (1024 * 1024)

Signed-off-by: Alexander Antonov 
---
 tools/perf/Documentation/perf-iiostat.txt |  89 ++
 tools/perf/Makefile.perf  |   5 +-
 tools/perf/arch/x86/util/Build|   1 +
 tools/perf/arch/x86/util/iiostat.c| 337 ++
 tools/perf/command-list.txt   |   1 +
 tools/perf/perf-iiostat.sh|  12 +
 6 files changed, 444 insertions(+), 1 deletion(-)
 create mode 100644 tools/perf/Documentation/perf-iiostat.txt
 create mode 100644 tools/perf/perf-iiostat.sh

diff --git a/tools/perf/Documentation/perf-iiostat.txt 
b/tools/perf/Documentation/perf-iiostat.txt
new file mode 100644
index ..38b5697b0d85
--- /dev/null
+++ b/tools/perf/Documentation/perf-iiostat.txt
@@ -0,0 +1,89 @@
+perf-iiostat(1)
+===
+
+NAME
+
+perf-iiostat - Show I/O performance metrics
+
+SYNOPSIS
+
+[verse]
+'perf iiostat' show
+'perf iiostat'  --  []
+
+DESCRIPTION
+---
+Mode is intended to provide four I/O performance metrics per each IIO
+stack (PCIe root port):
+
+- Inbound Read   - I/O devices below IIO stack read from the host memory, in MB
+
+- Inbound Write  - I/O devices below IIO stack write to the host memory, in MB
+
+- Outbound Read  - CPU reads from I/O devices below IIO stack, in MB
+
+- Outbound Write - CPU writes to I/O devices below IIO stack, in MB
+
+OPTIONS
+---
+...::
+   Any command you can specify in a shell.
+
+show::
+   List all IIO stacks.
+
+::
+   Select the root ports for monitoring. Comma-separated list is supported.
+
+EXAMPLES
+
+
+1. List all IIO stacks (example for 2-S platform):
+
+   $ perf iiostat show
+   S0-uncore_iio_0<:00>
+   S1-uncore_iio_0<:80>
+   S0-uncore_iio_1<:17>
+   S1-uncore_iio_1<:85>
+   S0-uncore_iio_2<:3a>
+   S1-uncore_iio_2<:ae>
+   S0-uncore_iio_3<:5d>
+   S1-uncore_iio_3<:d7>
+
+2. Collect metrics for all I/O stacks:
+
+   $ perf iiostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s
+
+Performance counter stats for 'system wide':
+
+  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB) 
+   :00102  
  3 
+   :80000  
  0 
+   :17   352552   430  
 21 
+   :85000  
  0 
+   :3a300  
  0 
+   :ae000  
  0 
+   :5d000  
  0 
+   :d7000  
  0
+
+3. Collect metrics for comma-separated list of I/O stacks:
+
+   $ perf iiostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M 
oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s
+
+Performance counter stats for 'system wide':
+
+  port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB) 
+   :17   358559   440  
 22 
+   :3a320  
  0 
+
+197.081983474 seconds time elapsed
+
+SEE ALSO
+
+linkperf:perf-stat[1]
\ No newline at end of file
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 7ce3f2e8b9c7..c16c14a304a9 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -280,6 +280,7 @@ SCRIPT_SH =
 
 SCRIPT_SH += perf-archive.sh
 SCRIPT_SH += perf-with-kcore.sh
+SCRIPT_SH += perf-iiostat.s

[PATCH v2 4/6] perf stat: Helper functions for IIO stacks list in iiostat mode

2020-12-23 Thread Alexander Antonov
Introduce helper functions to control IIO stacks list.
These helpers will be used in the follow-up patch.

Signed-off-by: Alexander Antonov 
---
 tools/perf/arch/x86/util/iiostat.c | 125 +
 1 file changed, 125 insertions(+)
 create mode 100644 tools/perf/arch/x86/util/iiostat.c

diff --git a/tools/perf/arch/x86/util/iiostat.c 
b/tools/perf/arch/x86/util/iiostat.c
new file mode 100644
index ..98b9707b4827
--- /dev/null
+++ b/tools/perf/arch/x86/util/iiostat.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * perf iiostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "util/cpumap.h"
+#include "util/debug.h"
+#include "util/iiostat.h"
+#include "util/counts.h"
+#include "path.h"
+
+struct iio_root_port {
+   u32 domain;
+   u8 bus;
+   u8 die;
+   u8 pmu_idx;
+   int idx;
+};
+
+struct iio_root_ports_list {
+   struct iio_root_port **rps;
+   int nr_entries;
+};
+
+static void iio_root_port_show(FILE *output, const struct iio_root_port * 
const rp)
+{
+   if (output && rp)
+   fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n",
+   rp->die, rp->pmu_idx, rp->domain, rp->bus);
+}
+
+static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus, u8 die, u8 
pmu_idx)
+{
+   struct iio_root_port *p = calloc(1, sizeof(*p));
+
+   if (p) {
+   p->domain = domain;
+   p->bus = bus;
+   p->die = die;
+   p->pmu_idx = pmu_idx;
+   }
+   return p;
+}
+
+static struct iio_root_ports_list *iio_root_ports_list_new(void)
+{
+   struct iio_root_ports_list *list = calloc(1, sizeof(*list));
+
+   if (list) {
+   list->rps = calloc(1, sizeof(struct iio_root_port *));
+   if (!list->rps) {
+   free(list);
+   list = NULL;
+   }
+   }
+
+   return list;
+}
+
+static void iio_root_ports_list_free(struct iio_root_ports_list *list)
+{
+   int idx;
+
+   if (list) {
+   for (idx = 0; idx < list->nr_entries; idx++)
+   free(list->rps[idx]);
+   free(list->rps);
+   free(list);
+   }
+}
+
+static
+struct iio_root_port *iio_root_port_find_by_notation(const struct 
iio_root_ports_list * const list,
+u32 domain, u8 bus)
+{
+   int idx;
+   struct iio_root_port *rp;
+
+   if (list) {
+   for (idx = 0; idx < list->nr_entries; idx++) {
+   rp = list->rps[idx];
+   if (rp && rp->domain == domain && rp->bus == bus)
+   return rp;
+   }
+   }
+   return NULL;
+}
+
+static int iio_root_ports_list_insert(struct iio_root_ports_list *list,
+ struct iio_root_port * const rp)
+{
+   struct iio_root_port **tmp_buf;
+
+   if (list && rp) {
+   rp->idx = list->nr_entries++;
+   /* One more for NULL.*/
+   tmp_buf = realloc(list->rps, (list->nr_entries + 1) * 
sizeof(*list->rps));
+   if (!tmp_buf) {
+   pr_err("Failed to realloc memory\n");
+   return -ENOMEM;
+   }
+   tmp_buf[rp->idx] = rp;
+   tmp_buf[list->nr_entries] = NULL;
+   list->rps = tmp_buf;
+   }
+   return 0;
+}
-- 
2.19.1



[PATCH v2 0/6] perf stat: Introduce iiostat mode to provide I/O performance metrics

2020-12-23 Thread Alexander Antonov
The previous version can be found at:
v1: 
https://lkml.kernel.org/r/20201210090340.14358-1-alexander.anto...@linux.intel.com

Changes in this revision are:
v1 -> v2:
  1. Using 'perf iiostat' subcommand instead of 'perf stat --iiostat':
- Added perf-iiostat.sh script to use short command
- Updated manual pages to get help for 'perf iiostat'
- Added 'perf-iiostat' to perf's gitignore file

Mode is intended to provide four I/O performance metrics in MB per each
IIO stack:
 - Inbound Read:   I/O devices below IIO stack read from the host memory
 - Inbound Write:  I/O devices below IIO stack write to the host memory
 - Outbound Read:  CPU reads from I/O devices below IIO stack
 - Outbound Write: CPU writes to I/O devices below IIO stack

Each metric requires only one IIO event which increments at every 4B
transfer in corresponding direction. The formulas to compute metrics
are generic:
#EventCount * 4B / (1024 * 1024)

Note: iiostat introduces a new perf data aggregation mode - per I/O stack -
hence the -e and -M options are not supported.

Usage examples:

1. List all IIO stacks (example for 2-S platform):
   $ perf iiostat show
   S0-uncore_iio_0<0000:00>
   S1-uncore_iio_0<0000:80>
   S0-uncore_iio_1<0000:17>
   S1-uncore_iio_1<0000:85>
   S0-uncore_iio_2<0000:3a>
   S1-uncore_iio_2<0000:ae>
   S0-uncore_iio_3<0000:5d>
   S1-uncore_iio_3<0000:d7>

2. Collect metrics for all I/O stacks:
   $ perf iiostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
   357708+0 records in
   357707+0 records out
   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s

Performance counter stats for 'system wide':

   port    Inbound Read(MB)   Inbound Write(MB)   Outbound Read(MB)  Outbound Write(MB)
0000:00                   1                   0                   2                   3
0000:80                   0                   0                   0                   0
0000:17              352552                  43                   0                  21
0000:85                   0                   0                   0                   0
0000:3a                   3                   0                   0                   0
0000:ae                   0                   0                   0                   0
0000:5d                   0                   0                   0                   0
0000:d7                   0                   0                   0                   0

3. Collect metrics for comma separated list of I/O stacks:
   $ perf iiostat 0000:17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
   357708+0 records in
   357707+0 records out
   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s

Performance counter stats for 'system wide':

   port    Inbound Read(MB)   Inbound Write(MB)   Outbound Read(MB)  Outbound Write(MB)
0000:17              358559                  44                   0                  22
0000:3a                   3                   2                   0                   0

197.081983474 seconds time elapsed


Alexander Antonov (6):
  perf stat: Add AGGR_IIO_STACK mode
  perf evsel: Introduce an observed performance device
  perf stat: Basic support for iiostat in perf
  perf stat: Helper functions for IIO stacks list in iiostat mode
  perf stat: Enable iiostat mode for x86 platforms
  perf: Update .gitignore file

 tools/perf/.gitignore |   1 +
 tools/perf/Documentation/perf-iiostat.txt |  89 
 tools/perf/Makefile.perf  |   5 +-
 tools/perf/arch/x86/util/Build|   1 +
 tools/perf/arch/x86/util/iiostat.c| 462 ++
 tools/perf/builtin-stat.c |  40 +-
 tools/perf/command-list.txt   |   1 +
 tools/perf/perf-iiostat.sh|  12 +
 tools/perf/util/evsel.h   |   1 +
 tools/perf/util/iiostat.h |  33 ++
 .../scripting-engines/trace-event-python.c|   2 +-
 tools/perf/util/stat-display.c|  51 +-
 tools/perf/util/stat-shadow.c |  11 +-
 tools/perf/util/stat.c|   3 +-
 tools/perf/util/stat.h|   2 +
 15 files changed, 704 insertions(+), 10 deletions(-)
 create mode 100644 tools/perf/Documentation/perf-iiostat.txt
 create mode 100644 tools/perf/arch/x86/util/iiostat.c
 create mode 100644 tools/perf/perf-iiostat.sh
 create mode 100644 tools/perf/util/iiostat.h


base-commit: 644bf4b0f7acde641d3db200b4db66977e96c3bd
-- 
2.19.1



[PATCH v2 3/6] perf stat: Basic support for iiostat in perf

2020-12-23 Thread Alexander Antonov
Add basic flow for a new iiostat mode in perf. The mode is intended to
provide four I/O performance metrics for each IIO stack: Inbound Read,
Inbound Write, Outbound Read, Outbound Write.

The actual code to compute the metrics and attribute them to
evsel::perf_device is in follow-on patches.

Signed-off-by: Alexander Antonov 
---
 tools/perf/builtin-stat.c  | 33 -
 tools/perf/util/iiostat.h  | 33 +
 tools/perf/util/stat-display.c | 38 +-
 tools/perf/util/stat-shadow.c  | 11 +-
 tools/perf/util/stat.h |  1 +
 5 files changed, 113 insertions(+), 3 deletions(-)
 create mode 100644 tools/perf/util/iiostat.h

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 72f9d0aa3f96..14c3da136927 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -67,6 +67,7 @@
 #include "util/top.h"
 #include "util/affinity.h"
 #include "util/pfm.h"
+#include "util/iiostat.h"
 #include "asm/bug.h"
 
 #include 
@@ -198,7 +199,8 @@ static struct perf_stat_config stat_config = {
.walltime_nsecs_stats   = &walltime_nsecs_stats,
.big_num= true,
.ctl_fd = -1,
-   .ctl_fd_ack = -1
+   .ctl_fd_ack = -1,
+   .iiostat_run= false,
 };
 
 static bool cpus_map_matched(struct evsel *a, struct evsel *b)
@@ -1073,6 +1075,14 @@ static int parse_stat_cgroups(const struct option *opt,
return parse_cgroups(opt, str, unset);
 }
 
+__weak int iiostat_parse(const struct option *opt __maybe_unused,
+const char *str __maybe_unused,
+int unset __maybe_unused)
+{
+   pr_err("iiostat mode is not supported\n");
+   return -1;
+}
+
 static struct option stat_options[] = {
OPT_BOOLEAN('T', "transaction", &transaction_run,
"hardware transaction statistics"),
@@ -1185,6 +1195,8 @@ static struct option stat_options[] = {
 "\t\t\t  Optionally send control command completion 
('ack\\n') to ack-fd descriptor.\n"
 "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened 
and used as ctl-fd / ack-fd.",
  parse_control_option),
+   OPT_CALLBACK_OPTARG(0, "iiostat", &evsel_list, &stat_config, "root 
port",
+   "measure PCIe metrics per IIO stack", 
iiostat_parse),
OPT_END()
 };
 
@@ -1509,6 +1521,12 @@ static int perf_stat_init_aggr_mode_file(struct 
perf_stat *st)
return 0;
 }
 
+__weak int iiostat_show_root_ports(struct evlist *evlist __maybe_unused,
+  struct perf_stat_config *config 
__maybe_unused)
+{
+   return 0;
+}
+
 /*
  * Add default attributes, if there were no attributes specified or
  * if -d/--detailed, -d -d or -d -d -d is used:
@@ -2054,6 +2072,10 @@ static void setup_system_wide(int forks)
}
 }
 
+__weak void iiostat_delete_root_ports(struct evlist *evlist __maybe_unused)
+{
+}
+
 int cmd_stat(int argc, const char **argv)
 {
const char * const stat_usage[] = {
@@ -2230,6 +2252,12 @@ int cmd_stat(int argc, const char **argv)
goto out;
}
 
+   if (stat_config.iiostat_run) {
+   status = iiostat_show_root_ports(evsel_list, &stat_config);
+   if (status || !stat_config.iiostat_run)
+   goto out;
+   }
+
if (add_default_attributes())
goto out;
 
@@ -2406,6 +2434,9 @@ int cmd_stat(int argc, const char **argv)
perf_stat__exit_aggr_mode();
perf_evlist__free_stats(evsel_list);
 out:
+   if (stat_config.iiostat_run)
+   iiostat_delete_root_ports(evsel_list);
+
zfree(&stat_config.walltime_run);
 
if (smi_cost && smi_reset)
diff --git a/tools/perf/util/iiostat.h b/tools/perf/util/iiostat.h
new file mode 100644
index ..8d4226df9975
--- /dev/null
+++ b/tools/perf/util/iiostat.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * perf iiostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov 
+ */
+
+#ifndef _IIOSTAT_H
+#define _IIOSTAT_H
+
+#include 
+#include "util/stat.h"
+#include "util/parse-events.h"
+#include "util/evlist.h"
+
+struct option;
+struct perf_stat_config;
+struct evlist;
+struct timespec;
+
+int iiostat_parse(const struct option *opt, const char *str,
+ int unset __maybe_unused);
+void iiostat_prefix(struct perf_stat_config *config, struct evlist *evlist,
+   char *prefix, struct timespec *ts);
+void iiostat_print_metric(struct perf_stat_config *config, struct evsel *evsel,
+ struct perf_stat

Re: [PATCH 0/5] perf stat: Introduce --iiostat mode to provide I/O performance metrics

2020-12-20 Thread Alexander Antonov

On 12/15/2020 4:58 PM, Arnaldo Carvalho de Melo wrote:

Em Mon, Dec 14, 2020 at 07:04:30PM -0800, Andi Kleen escreveu:

My first thought was: Why not have a 'perf iiostat' subcommand?
  

Same would apply to a lot of options in perf stat.
  

I guess you could add some aliases to "perf" that give shortcuts
for common perf stat command lines.

Yeah, and we have a mechanism for that, that was exercised only in the
'perf archive' case:

~/libexec/perf-core/perf-archive

I tried this and it works:

[root@five ~]# ls -la ~/bin/perf
lrwxrwxrwx. 1 root root 19 Feb 18  2020 /root/bin/perf -> /home/acme/bin/perf
[root@five ~]# vim ~acme/libexec/perf-core/perf-cgtop
[root@five ~]# chmod +x ~acme/libexec/perf-core/perf-cgtop
[root@five ~]# cat ~acme/libexec/perf-core/perf-cgtop
perf top --hierarchy --all-cgroups -s cgroup,dso,sym $*
[root@five ~]# perf cgtop
[root@five ~]#

Use 'e' to expand/collapse the current level (+ -> -), 'E'/'C' to
expand/collapse all levels.

'perf help' doesn't show it, which is a shame, I'll add support for it
to traverse ~/libexec/perf-core/perf-* and get the first non interpreter
comment line as a description for the command, so to add a new one is
just a matter of dropping a shell + man page, no need to change the perf
binary.


To test that '$*' at the end:

[root@five ~]# perf cgtop -U

I.e.:

[acme@five perf]$ perf top -h -U

  Usage: perf top []

 -U, --hide_user_symbols
   hide user symbols

[acme@five perf]$

And it works, just kernel level samples grouped in an hierarchy, first
cgroup, then dso, then the symbol.

Also, using this with the 'P' hotkey:

[root@five ~]# perf cgtop --percent-limit 1

Shows how it looks like:

[root@five ~]# cat perf.hist.0
-  86.77%/user.slice/user-1000.slice/session-2.scope
-  36.18%[kernel]
   2.24%[k] unmap_page_range
   1.15%[k] clear_page_rep
   1.10%[k] add_mm_counter_fast
   1.03%[k] alloc_set_pte
   1.03%[k] handle_mm_fault
-  17.65%libc-2.32.so
   2.04%[.] _int_malloc
   1.82%[.] __memmove_avx_unaligned_erms
   1.48%[.] __strlen_avx2
   1.13%[.] _int_free
   1.12%[.] malloc
-   8.09%make
   1.65%[.] jhash_string
   1.05%[.] hash_find_slot
-   6.90%ld-2.32.so
   2.03%[.] do_lookup_x
   1.49%[.] _dl_lookup_symbol_x
-   4.78%cc1
-   4.60%libperl.so.5.32.0
-   2.86%bash
-   1.98%libselinux.so.1
-   1.61%libpython2.7.so.1.0
-   1.06%libpcre2-8.so.0.10.0
-   9.17%/user.slice/user-1000.slice/session-4.scope
-   4.66%perf
-   2.40%libc-2.32.so
-   1.82%[kernel]
-   4.04%/
-   4.02%[kernel]
[root@five ~]#

So 'perf iiostat' would become:

[root@five ~]# cat ~acme/libexec/perf-core/perf-iiostat
perf stat --iiostat $*
[root@five ~]#

There are parameters to that '--iiostat' in the current patchset that
may complicate this, though; with some changes I guess we get what we want.

- Arnaldo


Hello Arnaldo,
Sorry for delayed response.

This is an interesting approach to get a shorter command. Thank you for the
explanation. I will update the patchset.

- Alexander


[PATCH 5/5] perf stat: Enable --iiostat mode for x86 platforms

2020-12-10 Thread Alexander Antonov
This functionality is based on recently introduced sysfs attributes
for Intel® Xeon® Scalable processor family (code name Skylake-SP):
Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to
IIO PMON mapping")

Mode is intended to provide four I/O performance metrics in MB per each
IIO stack:
 - Inbound Read: I/O devices below IIO stack read from the host memory
 - Inbound Write: I/O devices below IIO stack write to the host memory
 - Outbound Read: CPU reads from I/O devices below IIO stack
 - Outbound Write: CPU writes to I/O devices below IIO stack

Each metric requires only one IIO event, which increments at every 4B
transfer in the corresponding direction. The formula to compute each
metric is generic:
#EventCount * 4B / (1024 * 1024)

Signed-off-by: Alexander Antonov 
---
 tools/perf/Documentation/perf-stat.txt |  31 +++
 tools/perf/arch/x86/util/Build |   1 +
 tools/perf/arch/x86/util/iiostat.c | 335 +
 3 files changed, 367 insertions(+)

diff --git a/tools/perf/Documentation/perf-stat.txt 
b/tools/perf/Documentation/perf-stat.txt
index 5d4a673d7621..2c066f7e0681 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -121,6 +121,37 @@ to activate system-wide monitoring. Default is to count on 
all CPUs.
 -A::
 --no-aggr::
 Do not aggregate counts across all monitored CPUs.
+--iiostat::
+Mode is intended to provide four I/O performance metrics per each IIO
+stack (PCIe root port):
+--Inbound Read(MB)   - I/O devices below IIO stack read from the host 
memory, in MB
+--Inbound Write(MB)  - I/O devices below IIO stack write to the host 
memory, in MB
+--Outbound Read(MB)  - CPU reads from I/O devices below IIO stack, in MB
+--Outbound Write(MB) - CPU writes to I/O devices below IIO stack, in MB
+
+Sample output:
+
+Show all IIO stacks on 2-S platform:
+  $ perf stat --iiostat=show
+S0-uncore_iio_0<:00>
+S1-uncore_iio_0<:80>
+S0-uncore_iio_1<:17>
+S1-uncore_iio_1<:85>
+S0-uncore_iio_2<:3a>
+S1-uncore_iio_2<:ae>
+S0-uncore_iio_3<:5d>
+S1-uncore_iio_3<:d7>
+
+Print metrics for requested IIO stacks, multiple comma-separated list 
supported.
+  $ perf stat --iiostat=:17 -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M 
oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 213.997 s, 1.8 GB/s
+
+   Performance counter stats for 'system wide':
+
+ port Inbound Read(MB)Inbound Write(MB)Outbound 
Read(MB)   Outbound Write(MB)
+  :17   358559   440   
22
 
 -n::
 --null::
diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build
index 347c39b960eb..6fa275d3d897 100644
--- a/tools/perf/arch/x86/util/Build
+++ b/tools/perf/arch/x86/util/Build
@@ -6,6 +6,7 @@ perf-y += perf_regs.o
 perf-y += topdown.o
 perf-y += machine.o
 perf-y += event.o
+perf-y += iiostat.o
 
 perf-$(CONFIG_DWARF) += dwarf-regs.o
 perf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o
diff --git a/tools/perf/arch/x86/util/iiostat.c 
b/tools/perf/arch/x86/util/iiostat.c
index 70f93a96723f..44342a111746 100644
--- a/tools/perf/arch/x86/util/iiostat.c
+++ b/tools/perf/arch/x86/util/iiostat.c
@@ -27,6 +27,44 @@
 #include "util/counts.h"
 #include "path.h"
 
+#ifndef MAX_PATH
+#define MAX_PATH 1024
+#endif
+
+#define UNCORE_IIO_PMU_PATH"devices/uncore_iio_%d"
+#define SYSFS_UNCORE_PMU_PATH  "%s/"UNCORE_IIO_PMU_PATH
+#define PLATFORM_MAPPING_PATH  UNCORE_IIO_PMU_PATH"/die%d"
+
+enum iiostat_mode_t {
+   IIOSTAT_NONE= -1,
+   IIOSTAT_RUN = 0,
+   IIOSTAT_SHOW= 1
+};
+
+static enum iiostat_mode_t iiostat_mode = IIOSTAT_NONE;
+
+/*
+ * Each metric requiries only one IIO event which increments at every 4B 
transfer
+ * in corresponding direction. The formulas to compute metrics are generic:
+ * #EventCount * 4B / (1024 * 1024)
+ */
+static const char * const iiostat_metrics[] = {
+   "Inbound Read(MB)",
+   "Inbound Write(MB)",
+   "Outbound Read(MB)",
+   "Outbound Write(MB)",
+};
+
+static inline int iiostat_metrics_count(void)
+{
+   return sizeof(iiostat_metrics) / sizeof(char *);
+}
+
+static const char *iiostat_metric_by_idx(int idx)
+{
+   return *(iiostat_metrics + idx % iiostat_metrics_count());
+}
+
 struct iio_root_port {
u32 domain;
u8 bus;
@@ -123,3 +161,300 @@ static int iio_root_ports_list_insert(struct 
iio_root_ports_list *list,
}
return 0;
 }
+
+static int uncore_pmu_iio_platform_mapping(u8 pmu_idx, struct 
iio_root_ports_list * const list)
+{
+   char *buf;
+   char path[MAX_PATH];
+   

[PATCH 4/5] perf stat: Helper functions for IIO stacks list in iiostat mode

2020-12-10 Thread Alexander Antonov
Introduce helper functions to control IIO stacks list.
These helpers will be used in the follow-up patch.

Signed-off-by: Alexander Antonov 
---
 tools/perf/arch/x86/util/iiostat.c | 125 +
 1 file changed, 125 insertions(+)
 create mode 100644 tools/perf/arch/x86/util/iiostat.c

diff --git a/tools/perf/arch/x86/util/iiostat.c 
b/tools/perf/arch/x86/util/iiostat.c
new file mode 100644
index ..70f93a96723f
--- /dev/null
+++ b/tools/perf/arch/x86/util/iiostat.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * perf stat --iiostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "util/cpumap.h"
+#include "util/debug.h"
+#include "util/iiostat.h"
+#include "util/counts.h"
+#include "path.h"
+
+struct iio_root_port {
+   u32 domain;
+   u8 bus;
+   u8 die;
+   u8 pmu_idx;
+   int idx;
+};
+
+struct iio_root_ports_list {
+   struct iio_root_port **rps;
+   int nr_entries;
+};
+
+static void iio_root_port_show(FILE *output, const struct iio_root_port * 
const rp)
+{
+   if (output && rp)
+   fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n",
+   rp->die, rp->pmu_idx, rp->domain, rp->bus);
+}
+
+static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus, u8 die, u8 
pmu_idx)
+{
+   struct iio_root_port *p = calloc(1, sizeof(*p));
+
+   if (p) {
+   p->domain = domain;
+   p->bus = bus;
+   p->die = die;
+   p->pmu_idx = pmu_idx;
+   }
+   return p;
+}
+
+static struct iio_root_ports_list *iio_root_ports_list_new(void)
+{
+   struct iio_root_ports_list *list = calloc(1, sizeof(*list));
+
+   if (list) {
+   list->rps = calloc(1, sizeof(struct iio_root_port *));
+   if (!list->rps) {
+   free(list);
+   list = NULL;
+   }
+   }
+
+   return list;
+}
+
+static void iio_root_ports_list_free(struct iio_root_ports_list *list)
+{
+   int idx;
+
+   if (list) {
+   for (idx = 0; idx < list->nr_entries; idx++)
+   free(list->rps[idx]);
+   free(list->rps);
+   free(list);
+   }
+}
+
+static
+struct iio_root_port *iio_root_port_find_by_notation(const struct 
iio_root_ports_list * const list,
+u32 domain, u8 bus)
+{
+   int idx;
+   struct iio_root_port *rp;
+
+   if (list) {
+   for (idx = 0; idx < list->nr_entries; idx++) {
+   rp = list->rps[idx];
+   if (rp && rp->domain == domain && rp->bus == bus)
+   return rp;
+   }
+   }
+   return NULL;
+}
+
+static int iio_root_ports_list_insert(struct iio_root_ports_list *list,
+ struct iio_root_port * const rp)
+{
+   struct iio_root_port **tmp_buf;
+
+   if (list && rp) {
+   rp->idx = list->nr_entries++;
+   /* One more for NULL.*/
+   tmp_buf = realloc(list->rps, (list->nr_entries + 1) * 
sizeof(*list->rps));
+   if (!tmp_buf) {
+   pr_err("Failed to realloc memory\n");
+   return -ENOMEM;
+   }
+   tmp_buf[rp->idx] = rp;
+   tmp_buf[list->nr_entries] = NULL;
+   list->rps = tmp_buf;
+   }
+   return 0;
+}
-- 
2.19.1



[PATCH 3/5] perf stat: Basic support for iiostat in perf stat

2020-12-10 Thread Alexander Antonov
Add basic flow for a new --iiostat mode in perf stat. The mode is intended to
provide four I/O performance metrics for each IIO stack: Inbound Read,
Inbound Write, Outbound Read, Outbound Write.

The actual code to compute the metrics and attribute them to
evsel::perf_device is in follow-on patches.

Signed-off-by: Alexander Antonov 
---
 tools/perf/builtin-stat.c  | 33 -
 tools/perf/util/iiostat.h  | 33 +
 tools/perf/util/stat-display.c | 38 +-
 tools/perf/util/stat-shadow.c  | 11 +-
 tools/perf/util/stat.h |  1 +
 5 files changed, 113 insertions(+), 3 deletions(-)
 create mode 100644 tools/perf/util/iiostat.h

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index e3ff55de4f7a..c8168cfe202b 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -67,6 +67,7 @@
 #include "util/top.h"
 #include "util/affinity.h"
 #include "util/pfm.h"
+#include "util/iiostat.h"
 #include "asm/bug.h"
 
 #include 
@@ -198,7 +199,8 @@ static struct perf_stat_config stat_config = {
.walltime_nsecs_stats   = &walltime_nsecs_stats,
.big_num= true,
.ctl_fd = -1,
-   .ctl_fd_ack = -1
+   .ctl_fd_ack = -1,
+   .iiostat_run= false,
 };
 
 static bool cpus_map_matched(struct evsel *a, struct evsel *b)
@@ -1073,6 +1075,14 @@ static int parse_stat_cgroups(const struct option *opt,
return parse_cgroups(opt, str, unset);
 }
 
+__weak int iiostat_parse(const struct option *opt __maybe_unused,
+const char *str __maybe_unused,
+int unset __maybe_unused)
+{
+   pr_err("--iiostat mode is not supported\n");
+   return -1;
+}
+
 static struct option stat_options[] = {
OPT_BOOLEAN('T', "transaction", &transaction_run,
"hardware transaction statistics"),
@@ -1185,6 +1195,8 @@ static struct option stat_options[] = {
 "\t\t\t  Optionally send control command completion 
('ack\\n') to ack-fd descriptor.\n"
 "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened 
and used as ctl-fd / ack-fd.",
  parse_control_option),
+   OPT_CALLBACK_OPTARG(0, "iiostat", &evsel_list, &stat_config, "root 
port",
+   "measure PCIe metrics per IIO stack", 
iiostat_parse),
OPT_END()
 };
 
@@ -1509,6 +1521,12 @@ static int perf_stat_init_aggr_mode_file(struct 
perf_stat *st)
return 0;
 }
 
+__weak int iiostat_show_root_ports(struct evlist *evlist __maybe_unused,
+  struct perf_stat_config *config 
__maybe_unused)
+{
+   return 0;
+}
+
 /*
  * Add default attributes, if there were no attributes specified or
  * if -d/--detailed, -d -d or -d -d -d is used:
@@ -2054,6 +2072,10 @@ static void setup_system_wide(int forks)
}
 }
 
+__weak void iiostat_delete_root_ports(struct evlist *evlist __maybe_unused)
+{
+}
+
 int cmd_stat(int argc, const char **argv)
 {
const char * const stat_usage[] = {
@@ -2230,6 +2252,12 @@ int cmd_stat(int argc, const char **argv)
goto out;
}
 
+   if (stat_config.iiostat_run) {
+   status = iiostat_show_root_ports(evsel_list, &stat_config);
+   if (status || !stat_config.iiostat_run)
+   goto out;
+   }
+
if (add_default_attributes())
goto out;
 
@@ -2406,6 +2434,9 @@ int cmd_stat(int argc, const char **argv)
perf_stat__exit_aggr_mode();
perf_evlist__free_stats(evsel_list);
 out:
+   if (stat_config.iiostat_run)
+   iiostat_delete_root_ports(evsel_list);
+
zfree(&stat_config.walltime_run);
 
if (smi_cost && smi_reset)
diff --git a/tools/perf/util/iiostat.h b/tools/perf/util/iiostat.h
new file mode 100644
index ..6a905b2b40b9
--- /dev/null
+++ b/tools/perf/util/iiostat.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * perf stat --iiostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov 
+ */
+
+#ifndef _IIOSTAT_H
+#define _IIOSTAT_H
+
+#include 
+#include "util/stat.h"
+#include "util/parse-events.h"
+#include "util/evlist.h"
+
+struct option;
+struct perf_stat_config;
+struct evlist;
+struct timespec;
+
+int iiostat_parse(const struct option *opt, const char *str,
+ int unset __maybe_unused);
+void iiostat_prefix(struct perf_stat_config *config, struct evlist *evlist,
+   char *prefix, struct timespec *ts);
+void iiostat_print_metric(struct perf_stat_config *config, struct evsel *evsel,
+ st

[PATCH 2/5] perf evsel: Introduce an observed performance device

2020-12-10 Thread Alexander Antonov
Add the evsel::perf_device void pointer.

For performance monitoring purposes, an evsel can have a related device.
This change makes it possible to attribute, for example, I/O performance
metrics to an IIO stack.

Signed-off-by: Alexander Antonov 
---
 tools/perf/util/evsel.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 79a860d8e3ee..c346920f477a 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -127,6 +127,7 @@ struct evsel {
 * See also evsel__has_callchain().
 */
__u64   synth_sample_type;
+   void*perf_device;
 };
 
 struct perf_missing_features {
-- 
2.19.1



[PATCH 1/5] perf stat: Add AGGR_IIO_STACK mode

2020-12-10 Thread Alexander Antonov
Add the AGGR_IIO_STACK mode to be able to distinguish the aggr_mode
for IIO stacks in the following patches.

Signed-off-by: Alexander Antonov 
---
 tools/perf/builtin-stat.c   |  5 -
 .../util/scripting-engines/trace-event-python.c |  2 +-
 tools/perf/util/stat-display.c  | 13 +++--
 tools/perf/util/stat.c  |  3 ++-
 tools/perf/util/stat.h  |  1 +
 5 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f15b2f8aa14d..e3ff55de4f7a 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1309,6 +1309,7 @@ static int perf_stat_init_aggr_mode(void)
break;
case AGGR_GLOBAL:
case AGGR_THREAD:
+   case AGGR_IIO_STACK:
case AGGR_UNSET:
default:
break;
@@ -1499,6 +1500,7 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat 
*st)
case AGGR_NONE:
case AGGR_GLOBAL:
case AGGR_THREAD:
+   case AGGR_IIO_STACK:
case AGGR_UNSET:
default:
break;
@@ -2216,7 +2218,8 @@ int cmd_stat(int argc, const char **argv)
 * --per-thread is aggregated per thread, we dont mix it with cpu mode
 */
if (((stat_config.aggr_mode != AGGR_GLOBAL &&
- stat_config.aggr_mode != AGGR_THREAD) || nr_cgroups) &&
+ stat_config.aggr_mode != AGGR_THREAD &&
+ stat_config.aggr_mode != AGGR_IIO_STACK) || nr_cgroups) &&
!target__has_cpu(&target)) {
fprintf(stderr, "both cgroup and no-aggregation "
"modes only available in system-wide mode\n");
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c 
b/tools/perf/util/scripting-engines/trace-event-python.c
index c83c2c6564e0..e8b472faeae4 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -1401,7 +1401,7 @@ static void python_process_stat(struct perf_stat_config 
*config,
struct perf_cpu_map *cpus = counter->core.cpus;
int cpu, thread;
 
-   if (config->aggr_mode == AGGR_GLOBAL) {
+   if (config->aggr_mode == AGGR_GLOBAL || config->aggr_mode == 
AGGR_IIO_STACK) {
process_stat(counter, -1, -1, tstamp,
 &counter->counts->aggr);
return;
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 4b57c0c07632..3bfcdb80443a 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -133,6 +133,7 @@ static void aggr_printout(struct perf_stat_config *config,
config->csv_sep);
break;
case AGGR_GLOBAL:
+   case AGGR_IIO_STACK:
case AGGR_UNSET:
default:
break;
@@ -330,7 +331,7 @@ static int first_shadow_cpu(struct perf_stat_config *config,
if (config->aggr_mode == AGGR_NONE)
return id;
 
-   if (config->aggr_mode == AGGR_GLOBAL)
+   if (config->aggr_mode == AGGR_GLOBAL || config->aggr_mode == 
AGGR_IIO_STACK)
return 0;
 
for (i = 0; i < evsel__nr_cpus(evsel); i++) {
@@ -424,6 +425,7 @@ static void printout(struct perf_stat_config *config, int 
id, int nr,
if (config->csv_output && !config->metric_only) {
static int aggr_fields[] = {
[AGGR_GLOBAL] = 0,
+   [AGGR_IIO_STACK] = 0,
[AGGR_THREAD] = 1,
[AGGR_NONE] = 1,
[AGGR_SOCKET] = 2,
@@ -906,6 +908,7 @@ static int aggr_header_lens[] = {
[AGGR_NONE] = 6,
[AGGR_THREAD] = 24,
[AGGR_GLOBAL] = 0,
+   [AGGR_IIO_STACK] = 0,
 };
 
 static const char *aggr_header_csv[] = {
@@ -914,7 +917,8 @@ static const char *aggr_header_csv[] = {
[AGGR_SOCKET]   =   "socket,cpus",
[AGGR_NONE] =   "cpu,",
[AGGR_THREAD]   =   "comm-pid,",
-   [AGGR_GLOBAL]   =   ""
+   [AGGR_GLOBAL]   =   "",
+   [AGGR_IIO_STACK] =  "port,"
 };
 
 static void print_metric_headers(struct perf_stat_config *config,
@@ -1001,6 +1005,9 @@ static void print_interval(struct perf_stat_config 
*config,
if (!metric_only)
fprintf(output, "  counts %*s 
events\n", unit_width, "unit");
break;
+   case AGGR_IIO_STACK:
+   fprintf(output, "#   timeport");
+   break;
case AGGR_GLOBAL:
default:
 

[PATCH 0/5] perf stat: Introduce --iiostat mode to provide I/O performance metrics

2020-12-10 Thread Alexander Antonov
Mode is intended to provide four I/O performance metrics in MB per each
IIO stack:
 - Inbound Read: I/O devices below IIO stack read from the host memory
 - Inbound Write: I/O devices below IIO stack write to the host memory
 - Outbound Read: CPU reads from I/O devices below IIO stack
 - Outbound Write: CPU writes to I/O devices below IIO stack

Each metric requires only one IIO event, which increments at every 4B
transfer in the corresponding direction. The formula to compute each
metric is generic:
#EventCount * 4B / (1024 * 1024)

Note: --iiostat introduces a new perf data aggregation mode - per I/O stack -
hence the -e and -M options are not supported.

Usage examples:

1. List all IIO stacks (example for 2-S platform):
   $ perf stat --iiostat=show
   S0-uncore_iio_0<0000:00>
   S1-uncore_iio_0<0000:80>
   S0-uncore_iio_1<0000:17>
   S1-uncore_iio_1<0000:85>
   S0-uncore_iio_2<0000:3a>
   S1-uncore_iio_2<0000:ae>
   S0-uncore_iio_3<0000:5d>
   S1-uncore_iio_3<0000:d7>

2. Collect metrics for all I/O stacks:
   $ perf stat --iiostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
   357708+0 records in
   357707+0 records out
   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s

Performance counter stats for 'system wide':

   port    Inbound Read(MB)   Inbound Write(MB)   Outbound Read(MB)  Outbound Write(MB)
0000:00                   1                   0                   2                   3
0000:80                   0                   0                   0                   0
0000:17              352552                  43                   0                  21
0000:85                   0                   0                   0                   0
0000:3a                   3                   0                   0                   0
0000:ae                   0                   0                   0                   0
0000:5d                   0                   0                   0                   0
0000:d7                   0                   0                   0                   0

3. Collect metrics for comma separated list of I/O stacks:
   $ perf stat --iiostat=0000:17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
   357708+0 records in
   357707+0 records out
   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s

Performance counter stats for 'system wide':

   port    Inbound Read(MB)   Inbound Write(MB)   Outbound Read(MB)  Outbound Write(MB)
0000:17              358559                  44                   0                  22
0000:3a                   3                   2                   0                   0

197.081983474 seconds time elapsed

Alexander Antonov (5):
  perf stat: Add AGGR_IIO_STACK mode
  perf evsel: Introduce an observed performance device
  perf stat: Basic support for iiostat in perf stat
  perf stat: Helper functions for IIO stacks list in iiostat mode
  perf stat: Enable --iiostat mode for x86 platforms

 tools/perf/Documentation/perf-stat.txt|  31 ++
 tools/perf/arch/x86/util/Build|   1 +
 tools/perf/arch/x86/util/iiostat.c| 460 ++
 tools/perf/builtin-stat.c |  38 +-
 tools/perf/util/evsel.h   |   1 +
 tools/perf/util/iiostat.h |  33 ++
 .../scripting-engines/trace-event-python.c|   2 +-
 tools/perf/util/stat-display.c|  51 +-
 tools/perf/util/stat-shadow.c |  11 +-
 tools/perf/util/stat.c|   3 +-
 tools/perf/util/stat.h|   2 +
 11 files changed, 625 insertions(+), 8 deletions(-)
 create mode 100644 tools/perf/arch/x86/util/iiostat.c
 create mode 100644 tools/perf/util/iiostat.h


base-commit: 644bf4b0f7acde641d3db200b4db66977e96c3bd
-- 
2.19.1



Re: [PATCH] perf/x86/intel/uncore: Fix for iio mapping on Skylake Server

2020-10-13 Thread Alexander Antonov

Hello Kyle,

Currently we do not have plans to support the Uncore units to IIO PMON
mapping on multiple segment platforms, due to the variety of such platforms.
It would be great if you could describe your case, i.e. how you configure
segments on your platform. It will help to cover your configuration and
determine a common approach for the mapping algorithm.
Thanks,
Alexander

On 10/09/2020 05:11 PM, Meyer, Kyle wrote:

Hello Alexander,

Do you plan on supporting multiple segment platforms?

Thanks,
Kyle Meyer


From: alexander.anto...@linux.intel.com 
Sent: Monday, September 28, 2020 5:21 AM
To: pet...@infradead.org; linux-kernel@vger.kernel.org; x...@kernel.org
Cc: alexander.shish...@linux.intel.com; kan.li...@linux.intel.com; 
alexey.budan...@linux.intel.com; a...@linux.intel.com; a...@kernel.org; 
mi...@redhat.com; alexander.anto...@linux.intel.com; Meyer, Kyle; Anderson, Russ
Subject: [PATCH] perf/x86/intel/uncore: Fix for iio mapping on Skylake Server

From: Alexander Antonov 

The recently introduced attributes /sys/devices/uncore_iio_<pmu_idx>/die* are
initialized by skx_iio_set_mapping(). However, on some platforms (for example,
multiple segment platforms) skx_iio_get_topology() returns -EPERM before the
list of attributes in skx_iio_mapping_group has been initialized.
As a result, the list remains NULL. Thus the warning
"sysfs: (bin_)attrs not set by subsystem for group: uncore_iio_*/" appears
and uncore_iio pmus are not available in sysfs. Clear IIO attr_update
to properly handle the cases when topology information cannot be
retrieved.

Fixes: bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping")
Reported-by: Kyle Meyer 
Suggested-by: Kan Liang 
Reviewed-by: Alexei Budankov 
Reviewed-by: Kan Liang 
Signed-off-by: Alexander Antonov 
---
  arch/x86/events/intel/uncore_snbep.c | 7 +--
  1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/intel/uncore_snbep.c 
b/arch/x86/events/intel/uncore_snbep.c
index 62e88ad919ff..ccfa1d6b6aa0 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3749,7 +3749,9 @@ static int skx_iio_set_mapping(struct intel_uncore_type 
*type)

 ret = skx_iio_get_topology(type);
 if (ret)
-   return ret;
+   goto clear_attr_update;
+
+   ret = -ENOMEM;

 /* One more for NULL. */
 attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL);
@@ -3781,8 +3783,9 @@ static int skx_iio_set_mapping(struct intel_uncore_type 
*type)
 kfree(eas);
 kfree(attrs);
 kfree(type->topology);
+clear_attr_update:
 type->attr_update = NULL;
-   return -ENOMEM;
+   return ret;
  }

  static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)

base-commit: a1b8638ba1320e6684aa98233c15255eb803fac7
--
2.19.1





[tip: perf/core] perf/x86/intel/uncore: Fix for iio mapping on Skylake Server

2020-09-30 Thread tip-bot2 for Alexander Antonov
The following commit has been merged into the perf/core branch of tip:

Commit-ID: f797f05d917ffef94249ee0aec4c14a5b50517b2
Gitweb:
https://git.kernel.org/tip/f797f05d917ffef94249ee0aec4c14a5b50517b2
Author:Alexander Antonov 
AuthorDate:Mon, 28 Sep 2020 13:21:33 +03:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 29 Sep 2020 09:57:02 +02:00

perf/x86/intel/uncore: Fix for iio mapping on Skylake Server

The attributes /sys/devices/uncore_iio_<pmu_idx>/die*, introduced earlier, are
initialized by skx_iio_set_mapping(); however, for example, for multiple
segment platforms skx_iio_get_topology() returns -EPERM before the list of
attributes in skx_iio_mapping_group has been initialized.
As a result the list is NULL. Thus the warning
"sysfs: (bin_)attrs not set by subsystem for group: uncore_iio_*/" appears
and uncore_iio pmus are not available in sysfs. Clear IIO attr_update
to properly handle the cases when topology information cannot be
retrieved.

Fixes: bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON 
mapping")
Reported-by: Kyle Meyer 
Suggested-by: Kan Liang 
Signed-off-by: Alexander Antonov 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Alexei Budankov 
Reviewed-by: Kan Liang 
Link: 
https://lkml.kernel.org/r/20200928102133.61041-1-alexander.anto...@linux.intel.com
---
 arch/x86/events/intel/uncore_snbep.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/intel/uncore_snbep.c 
b/arch/x86/events/intel/uncore_snbep.c
index 495056f..3f1e75f 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3754,7 +3754,9 @@ static int skx_iio_set_mapping(struct intel_uncore_type 
*type)
 
ret = skx_iio_get_topology(type);
if (ret)
-   return ret;
+   goto clear_attr_update;
+
+   ret = -ENOMEM;
 
/* One more for NULL. */
attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL);
@@ -3786,8 +3788,9 @@ err:
kfree(eas);
kfree(attrs);
kfree(type->topology);
+clear_attr_update:
type->attr_update = NULL;
-   return -ENOMEM;
+   return ret;
 }
 
 static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)


[PATCH] perf/x86/intel/uncore: Fix for iio mapping on Skylake Server

2020-09-28 Thread alexander . antonov
From: Alexander Antonov 

The attributes /sys/devices/uncore_iio_<pmu_idx>/die*, introduced earlier, are
initialized by skx_iio_set_mapping(); however, for example, for multiple
segment platforms skx_iio_get_topology() returns -EPERM before the list of
attributes in skx_iio_mapping_group has been initialized.
As a result the list is NULL. Thus the warning
"sysfs: (bin_)attrs not set by subsystem for group: uncore_iio_*/" appears
and uncore_iio pmus are not available in sysfs. Clear IIO attr_update
to properly handle the cases when topology information cannot be
retrieved.

Fixes: bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON 
mapping")
Reported-by: Kyle Meyer 
Suggested-by: Kan Liang 
Reviewed-by: Alexei Budankov 
Reviewed-by: Kan Liang 
Signed-off-by: Alexander Antonov 
---
 arch/x86/events/intel/uncore_snbep.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/intel/uncore_snbep.c 
b/arch/x86/events/intel/uncore_snbep.c
index 62e88ad919ff..ccfa1d6b6aa0 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -3749,7 +3749,9 @@ static int skx_iio_set_mapping(struct intel_uncore_type 
*type)
 
ret = skx_iio_get_topology(type);
if (ret)
-   return ret;
+   goto clear_attr_update;
+
+   ret = -ENOMEM;
 
/* One more for NULL. */
attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL);
@@ -3781,8 +3783,9 @@ static int skx_iio_set_mapping(struct intel_uncore_type 
*type)
kfree(eas);
kfree(attrs);
kfree(type->topology);
+clear_attr_update:
type->attr_update = NULL;
-   return -ENOMEM;
+   return ret;
 }
 
 static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)

base-commit: a1b8638ba1320e6684aa98233c15255eb803fac7
-- 
2.19.1



[PATCH v10 1/3] perf/x86/intel/uncore: Expose an Uncore unit to PMON mapping

2020-06-01 Thread alexander . antonov
From: Roman Sudarikov 

Each Uncore unit type, by its nature, can be mapped to its own context -
which platform component each PMON block of that type is supposed to
monitor.

Intel® Xeon® Scalable processor family (code name Skylake-SP) makes
significant changes in the integrated I/O (IIO) architecture. The new
solution introduces IIO stacks which are responsible for managing traffic
between the PCIe domain and the Mesh domain. Each IIO stack has its own
PMON block and can handle either DMI port, x16 PCIe root port, MCP-Link
or various built-in accelerators. IIO PMON blocks allow concurrent
monitoring of I/O flows up to 4 x4 bifurcation within each IIO stack.

Software is supposed to program required perf counters within each IIO
stack and gather performance data. The tricky thing here is that IIO PMON
reports data per IIO stack but users have no idea what IIO stacks are -
they only know devices which are connected to the platform.

Understanding the IIO stack concept to find which IIO stack a particular
IO device is connected to, or identifying an IIO PMON block to program
for monitoring a specific IIO stack, assumes a lot of implicit knowledge
about the given Intel server platform architecture.

Usage example:
ls /sys/devices/uncore_<type>_<pmu_idx>/die*

Co-developed-by: Alexander Antonov 
Signed-off-by: Alexander Antonov 
Signed-off-by: Roman Sudarikov 
Reviewed-by: Kan Liang 
Reviewed-by: Alexander Shishkin 
---
 arch/x86/events/intel/uncore.c |  8 
 arch/x86/events/intel/uncore.h | 12 
 2 files changed, 20 insertions(+)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index cf76d6631afa..b71e8f7529a4 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -843,10 +843,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu 
*pmu)
.read   = uncore_pmu_event_read,
.module = THIS_MODULE,
.capabilities   = PERF_PMU_CAP_NO_EXCLUDE,
+   .attr_update= pmu->type->attr_update,
};
} else {
pmu->pmu = *pmu->type->pmu;
pmu->pmu.attr_groups = pmu->type->attr_groups;
+   pmu->pmu.attr_update = pmu->type->attr_update;
}
 
if (pmu->type->num_boxes == 1) {
@@ -887,6 +889,9 @@ static void uncore_type_exit(struct intel_uncore_type *type)
struct intel_uncore_pmu *pmu = type->pmus;
int i;
 
+   if (type->cleanup_mapping)
+   type->cleanup_mapping(type);
+
if (pmu) {
for (i = 0; i < type->num_boxes; i++, pmu++) {
uncore_pmu_unregister(pmu);
@@ -954,6 +959,9 @@ static int __init uncore_type_init(struct intel_uncore_type 
*type, bool setid)
 
type->pmu_group = &uncore_pmu_attr_group;
 
+   if (type->set_mapping)
+   type->set_mapping(type);
+
return 0;
 
 err:
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 0da4a4605536..8f2b77d27513 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -72,7 +72,19 @@ struct intel_uncore_type {
struct uncore_event_desc *event_descs;
struct freerunning_counters *freerunning;
const struct attribute_group *attr_groups[4];
+   const struct attribute_group **attr_update;
struct pmu *pmu; /* for custom pmu ops */
+   /*
+* Uncore PMU would store relevant platform topology configuration here
+* to identify which platform component each PMON block of that type is
+* supposed to monitor.
+*/
+   u64 *topology;
+   /*
+* Optional callbacks for managing mapping of Uncore units to PMONs
+*/
+   int (*set_mapping)(struct intel_uncore_type *type);
+   void (*cleanup_mapping)(struct intel_uncore_type *type);
 };
 
 #define pmu_group attr_groups[0]
-- 
2.19.1



[PATCH v10 2/3] perf/x86/intel/uncore: Wrap the max dies calculation into an accessor

2020-06-01 Thread alexander . antonov
From: Roman Sudarikov 

Add an accessor that returns the number of dies on the platform.

Co-developed-by: Alexander Antonov 
Signed-off-by: Alexander Antonov 
Signed-off-by: Roman Sudarikov 
Reviewed-by: Kan Liang 
Reviewed-by: Alexander Shishkin 
---
 arch/x86/events/intel/uncore.c | 13 +++--
 arch/x86/events/intel/uncore.h |  3 +++
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index b71e8f7529a4..e4f37dc83cf0 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -16,7 +16,7 @@ struct pci_driver *uncore_pci_driver;
 DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
 struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_extra_dev *uncore_extra_pci_dev;
-static int max_dies;
+int __uncore_max_dies;
 
 /* mask of cpus that collect uncore events */
 static cpumask_t uncore_cpu_mask;
@@ -108,7 +108,7 @@ struct intel_uncore_box *uncore_pmu_to_box(struct 
intel_uncore_pmu *pmu, int cpu
 * The unsigned check also catches the '-1' return value for non
 * existent mappings in the topology map.
 */
-   return dieid < max_dies ? pmu->boxes[dieid] : NULL;
+   return dieid < uncore_max_dies() ? pmu->boxes[dieid] : NULL;
 }
 
 u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event 
*event)
@@ -879,7 +879,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu)
 {
int die;
 
-   for (die = 0; die < max_dies; die++)
+   for (die = 0; die < uncore_max_dies(); die++)
kfree(pmu->boxes[die]);
kfree(pmu->boxes);
 }
@@ -920,7 +920,7 @@ static int __init uncore_type_init(struct intel_uncore_type 
*type, bool setid)
if (!pmus)
return -ENOMEM;
 
-   size = max_dies * sizeof(struct intel_uncore_box *);
+   size = uncore_max_dies() * sizeof(struct intel_uncore_box *);
 
for (i = 0; i < type->num_boxes; i++) {
pmus[i].func_id = setid ? i : -1;
@@ -1120,7 +1120,7 @@ static int __init uncore_pci_init(void)
size_t size;
int ret;
 
-   size = max_dies * sizeof(struct pci_extra_dev);
+   size = uncore_max_dies() * sizeof(struct pci_extra_dev);
uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL);
if (!uncore_extra_pci_dev) {
ret = -ENOMEM;
@@ -1547,7 +1547,8 @@ static int __init intel_uncore_init(void)
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
return -ENODEV;
 
-   max_dies = topology_max_packages() * topology_max_die_per_package();
+   __uncore_max_dies =
+   topology_max_packages() * topology_max_die_per_package();
 
uncore_init = (struct intel_uncore_init_fun *)id->driver_data;
if (uncore_init->pci_init) {
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 8f2b77d27513..8621b66c49ef 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -181,6 +181,9 @@ int uncore_pcibus_to_physid(struct pci_bus *bus);
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf);
 
+extern int __uncore_max_dies;
+#define uncore_max_dies()  (__uncore_max_dies)
+
 #define INTEL_UNCORE_EVENT_DESC(_name, _config)\
 {  \
.attr   = __ATTR(_name, 0444, uncore_event_show, NULL), \
-- 
2.19.1



[PATCH v10 3/3] perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping

2020-06-01 Thread alexander . antonov
From: Roman Sudarikov 

The current version supports the server line starting with the Intel® Xeon®
Processor Scalable Family and introduces mapping for IIO Uncore units only.
Other units can be added on demand.

IIO stack to PMON mapping is exposed through:
/sys/devices/uncore_iio_<pmu_idx>/dieX
where dieX is a file which holds "Segment:Root Bus" for the PCIe root port
which can be monitored by that IIO PMON block.

Details are explained in Documentation/ABI/testing/sysfs-devices-mapping

Reported-by: kbuild test robot 
Co-developed-by: Alexander Antonov 
Signed-off-by: Alexander Antonov 
Signed-off-by: Roman Sudarikov 
Reviewed-by: Kan Liang 
Reviewed-by: Alexander Shishkin 
---
 .../ABI/testing/sysfs-devices-mapping |  33 +++
 arch/x86/events/intel/uncore.h|   9 +
 arch/x86/events/intel/uncore_snbep.c  | 191 ++
 3 files changed, 233 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-devices-mapping

diff --git a/Documentation/ABI/testing/sysfs-devices-mapping 
b/Documentation/ABI/testing/sysfs-devices-mapping
new file mode 100644
index 000000000000..490ccfd67f12
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-mapping
@@ -0,0 +1,33 @@
+What:   /sys/devices/uncore_iio_x/dieX
+Date:   February 2020
+Contact:Roman Sudarikov 
+Description:
+Each IIO stack (PCIe root port) has its own IIO PMON block, so
+each dieX file (where X is die number) holds "Segment:Root Bus"
+for PCIe root port, which can be monitored by that IIO PMON
+block.
+For example, on 4-die Xeon platform with up to 6 IIO stacks per
+die and, therefore, 6 IIO PMON blocks per die, the mapping of
+IIO PMON block 0 exposes as the following:
+
+$ ls /sys/devices/uncore_iio_0/die*
+-r--r--r-- /sys/devices/uncore_iio_0/die0
+-r--r--r-- /sys/devices/uncore_iio_0/die1
+-r--r--r-- /sys/devices/uncore_iio_0/die2
+-r--r--r-- /sys/devices/uncore_iio_0/die3
+
+$ tail /sys/devices/uncore_iio_0/die*
+==> /sys/devices/uncore_iio_0/die0 <==
+0000:00
+==> /sys/devices/uncore_iio_0/die1 <==
+0000:40
+==> /sys/devices/uncore_iio_0/die2 <==
+0000:80
+==> /sys/devices/uncore_iio_0/die3 <==
+0000:c0
+
+Which means:
+IIO PMU 0 on die 0 belongs to PCI RP on bus 0x00, domain 0x0000
+IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000
+IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000
+IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 8621b66c49ef..61a7eaa81224 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -181,6 +181,15 @@ int uncore_pcibus_to_physid(struct pci_bus *bus);
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf);
 
+static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev)
+{
+   return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu);
+}
+
+#define to_device_attribute(n) container_of(n, struct device_attribute, attr)
+#define to_dev_ext_attribute(n)container_of(n, struct 
dev_ext_attribute, attr)
+#define attr_to_ext_attr(n)to_dev_ext_attribute(to_device_attribute(n))
+
 extern int __uncore_max_dies;
 #define uncore_max_dies()  (__uncore_max_dies)
 
diff --git a/arch/x86/events/intel/uncore_snbep.c 
b/arch/x86/events/intel/uncore_snbep.c
index 07652fa20ebb..0b64d2d85ad8 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -273,6 +273,30 @@
 #define SKX_CPUNODEID  0xc0
 #define SKX_GIDNIDMAP  0xd4
 
+/*
+ * The CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR
+ * that BIOS programmed. MSR has package scope.
+ * |  Bit  |  Default  |  Description
+ * | [63]  |00h| VALID - When set, indicates the CPU bus
+ *   numbers have been initialized. (RO)
+ * |[62:48]|---| Reserved
+ * |[47:40]|00h| BUS_NUM_5 — Return the bus number BIOS assigned
+ *   CPUBUSNO(5). (RO)
+ * |[39:32]|00h| BUS_NUM_4 — Return the bus number BIOS assigned
+ *   CPUBUSNO(4). (RO)
+ * |[31:24]|00h| BUS_NUM_3 — Return the bus number BIOS assigned
+ *   CPUBUSNO(3). (RO)
+ * |[23:16]|00h| BUS_NUM_2 — Return the bus number BIOS assigned
+ *   CPUBUSNO(2). (RO)
+ * |[15:8] |00h| BUS_NUM_1 — Return the bus number BIOS assigned
+ *   

[PATCH v10 0/3] perf x86: Exposing IO stack to IO PMON mapping through sysfs

2020-06-01 Thread alexander . antonov
From: Alexander Antonov 

The previous version can be found at:
v9: 
https://lkml.kernel.org/r/20200525080554.21313-1-alexander.anto...@linux.intel.com/
Changes in this revision are:
v9 -> v10:
- Addressed comment from CI Test Service:
  1. Fixed coding style issues (old style declaration)

The previous version can be found at:
v8: 
https://lkml.kernel.org/r/20200320073110.4761-1-roman.sudari...@linux.intel.com/
Changes in this revision are:
v8 -> v9:
- Addressed comments from Alexander Shishkin:
  1. Improved comments and commit messages
  2. Replacing "0444" with the S_IRUGO results in the following checkpatch
 warning: "Symbolic permissions 'S_IRUGO' are not preferred. Consider using
 octal permissions '0444'". Thus keeping 0444 for now.
 Also see: https://lkml.org/lkml/2016/8/2/1945

The previous version can be found at:
v7: 
https://lkml.kernel.org/r/20200303135418.9621-1-roman.sudari...@linux.intel.com/
Changes in this revision are:
v7 -> v8:
- Addressed comments from Kan Liang:
  1. Fixed coding style issues (gotos in error path, comments style)

The previous version can be found at:
v6: 
https://lkml.kernel.org/r/20200213150148.5627-1-roman.sudari...@linux.intel.com/

Changes in this revision are:
v6 -> v7:
- Addressed comments from Greg Kroah-Hartman:
  1. Added proper handling of load/unload path
  2. Simplified the mapping attribute show procedure by using the segment value
 of the first available root bus for all mapping attributes, which is safe
 because the current implementation supports single-segment configurations only
  3. Fixed coding style issues (extra lines, gotos in error path, macros etc)

The previous version can be found at:
v5: 
https://lkml.kernel.org/r/20200211161549.19828-1-roman.sudari...@linux.intel.com/

Changes in this revision are:
v5 -> v6:
  1. Changed the mapping attribute name to "dieX"
  2. Called sysfs_attr_init() prior to dynamically creating the mapping attrs
  3. Removed redundant "empty" attribute
  4. Got an agreement on the mapping attribute format

The previous version can be found at:
v4: 
https://lkml.kernel.org/r/20200117133759.5729-1-roman.sudari...@linux.intel.com/

Changes in this revision are:
v4 -> v5:
- Addressed comments from Greg Kroah-Hartman:
  1. Using the attr_update flow for newly introduced optional attributes
  2. No subfolder, optional attributes are created the same level as 'cpumask'
  3. No symlinks, optional attributes are created as files
  4. Single file for each IIO PMON block to node mapping
  5. Added Documentation/ABI/sysfs-devices-mapping

The previous version can be found at:
v3: 
https://lkml.kernel.org/r/20200113135444.12027-1-roman.sudari...@linux.intel.com

Changes in this revision are:
v3 -> v4:
- Addressed comments from Greg Kroah-Hartman:
  1. Reworked handling of newly introduced attribute.
  2. Required Documentation update is expected in the follow up patchset


The previous version can be found at:
v2: 
https://lkml.kernel.org/r/20191210091451.6054-1-roman.sudari...@linux.intel.com

Changes in this revision are:
v2 -> v3:
  1. Addressed comments from Peter and Kan

The previous version can be found at:
v1: 
https://lkml.kernel.org/r/20191126163630.17300-1-roman.sudari...@linux.intel.com

Changes in this revision are:
v1 -> v2:
  1. Fixed process related issues;
  2. This patch set includes kernel support for IIO stack to PMON mapping;
  3. Stephane raised concerns regarding output format which may require
code changes in the user space part of the feature only. We will continue
output format discussion in the context of user space update.

Intel® Xeon® Scalable processor family (code name Skylake-SP) makes
significant changes in the integrated I/O (IIO) architecture. The new
solution introduces IIO stacks which are responsible for managing traffic
between the PCIe domain and the Mesh domain. Each IIO stack has its own
PMON block and can handle either DMI port, x16 PCIe root port, MCP-Link
or various built-in accelerators. IIO PMON blocks allow concurrent
monitoring of I/O flows up to 4 x4 bifurcation within each IIO stack.

Software is supposed to program required perf counters within each IIO
stack and gather performance data. The tricky thing here is that IIO PMON
reports data per IIO stack but users have no idea what IIO stacks are -
they only know devices which are connected to the platform.

Understanding the IIO stack concept to find which IIO stack a particular
IO device is connected to, or identifying an IIO PMON block to program
for monitoring a specific IIO stack, assumes a lot of implicit knowledge
about the given Intel server platform architecture.

This patch set introduces:
1. An infrastructure for exposing an Uncore unit to Uncore PMON mapping
   through sysfs-backend;
2. A new --iiostat mode in perf stat to provide I/O performance metrics
   per I/O device.

Usage examples:

1. List all d

[RESEND PATCH v9 1/3] perf/x86/intel/uncore: Expose an Uncore unit to PMON mapping

2020-05-25 Thread alexander . antonov
From: Roman Sudarikov 

Each Uncore unit type, by its nature, can be mapped to its own context -
which platform component each PMON block of that type is supposed to
monitor.

Intel® Xeon® Scalable processor family (code name Skylake-SP) makes
significant changes in the integrated I/O (IIO) architecture. The new
solution introduces IIO stacks which are responsible for managing traffic
between the PCIe domain and the Mesh domain. Each IIO stack has its own
PMON block and can handle either DMI port, x16 PCIe root port, MCP-Link
or various built-in accelerators. IIO PMON blocks allow concurrent
monitoring of I/O flows up to 4 x4 bifurcation within each IIO stack.

Software is supposed to program required perf counters within each IIO
stack and gather performance data. The tricky thing here is that IIO PMON
reports data per IIO stack but users have no idea what IIO stacks are -
they only know devices which are connected to the platform.

Understanding the IIO stack concept to find which IIO stack a particular
IO device is connected to, or identifying an IIO PMON block to program
for monitoring a specific IIO stack, assumes a lot of implicit knowledge
about the given Intel server platform architecture.

Usage example:
ls /sys/devices/uncore_<type>_<pmu_idx>/die*

Co-developed-by: Alexander Antonov 
Signed-off-by: Alexander Antonov 
Signed-off-by: Roman Sudarikov 
Reviewed-by: Kan Liang 
Reviewed-by: Alexander Shishkin 
---
 arch/x86/events/intel/uncore.c |  8 
 arch/x86/events/intel/uncore.h | 12 
 2 files changed, 20 insertions(+)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index cf76d6631afa..b71e8f7529a4 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -843,10 +843,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu 
*pmu)
.read   = uncore_pmu_event_read,
.module = THIS_MODULE,
.capabilities   = PERF_PMU_CAP_NO_EXCLUDE,
+   .attr_update= pmu->type->attr_update,
};
} else {
pmu->pmu = *pmu->type->pmu;
pmu->pmu.attr_groups = pmu->type->attr_groups;
+   pmu->pmu.attr_update = pmu->type->attr_update;
}
 
if (pmu->type->num_boxes == 1) {
@@ -887,6 +889,9 @@ static void uncore_type_exit(struct intel_uncore_type *type)
struct intel_uncore_pmu *pmu = type->pmus;
int i;
 
+   if (type->cleanup_mapping)
+   type->cleanup_mapping(type);
+
if (pmu) {
for (i = 0; i < type->num_boxes; i++, pmu++) {
uncore_pmu_unregister(pmu);
@@ -954,6 +959,9 @@ static int __init uncore_type_init(struct intel_uncore_type 
*type, bool setid)
 
type->pmu_group = &uncore_pmu_attr_group;
 
+   if (type->set_mapping)
+   type->set_mapping(type);
+
return 0;
 
 err:
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 0da4a4605536..8f2b77d27513 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -72,7 +72,19 @@ struct intel_uncore_type {
struct uncore_event_desc *event_descs;
struct freerunning_counters *freerunning;
const struct attribute_group *attr_groups[4];
+   const struct attribute_group **attr_update;
struct pmu *pmu; /* for custom pmu ops */
+   /*
+* Uncore PMU would store relevant platform topology configuration here
+* to identify which platform component each PMON block of that type is
+* supposed to monitor.
+*/
+   u64 *topology;
+   /*
+* Optional callbacks for managing mapping of Uncore units to PMONs
+*/
+   int (*set_mapping)(struct intel_uncore_type *type);
+   void (*cleanup_mapping)(struct intel_uncore_type *type);
 };
 
 #define pmu_group attr_groups[0]
-- 
2.19.1



[RESEND PATCH v9 0/3] perf x86: Exposing IO stack to IO PMON mapping through sysfs

2020-05-25 Thread alexander . antonov
From: Alexander Antonov 

The previous version can be found at:
v8: 
https://lkml.kernel.org/r/20200320073110.4761-1-roman.sudari...@linux.intel.com/
Changes in this revision are:
v8 -> v9:
- Addressed comments from Alexander Shishkin:
  1. Improved comments and commit messages
  2. Replacing "0444" with the S_IRUGO results in the following checkpatch
 warning: "Symbolic permissions 'S_IRUGO' are not preferred. Consider using
 octal permissions '0444'". Thus keeping 0444 for now.
 Also see: https://lkml.org/lkml/2016/8/2/1945

The previous version can be found at:
v7: 
https://lkml.kernel.org/r/20200303135418.9621-1-roman.sudari...@linux.intel.com/
Changes in this revision are:
v7 -> v8:
- Addressed comments from Kan Liang:
  1. Fixed coding style issues (gotos in error path, comments style)

The previous version can be found at:
v6: 
https://lkml.kernel.org/r/20200213150148.5627-1-roman.sudari...@linux.intel.com/

Changes in this revision are:
v6 -> v7:
- Addressed comments from Greg Kroah-Hartman:
  1. Added proper handling of load/unload path
  2. Simplified the mapping attribute show procedure by using the segment value
 of the first available root bus for all mapping attributes, which is safe
 because the current implementation supports single-segment configurations only
  3. Fixed coding style issues (extra lines, gotos in error path, macros etc)

The previous version can be found at:
v5: 
https://lkml.kernel.org/r/20200211161549.19828-1-roman.sudari...@linux.intel.com/

Changes in this revision are:
v5 -> v6:
  1. Changed the mapping attribute name to "dieX"
  2. Called sysfs_attr_init() prior to dynamically creating the mapping attrs
  3. Removed redundant "empty" attribute
  4. Got an agreement on the mapping attribute format

The previous version can be found at:
v4: 
https://lkml.kernel.org/r/20200117133759.5729-1-roman.sudari...@linux.intel.com/

Changes in this revision are:
v4 -> v5:
- Addressed comments from Greg Kroah-Hartman:
  1. Using the attr_update flow for newly introduced optional attributes
  2. No subfolder, optional attributes are created the same level as 'cpumask'
  3. No symlinks, optional attributes are created as files
  4. Single file for each IIO PMON block to node mapping
  5. Added Documentation/ABI/sysfs-devices-mapping

The previous version can be found at:
v3: 
https://lkml.kernel.org/r/20200113135444.12027-1-roman.sudari...@linux.intel.com

Changes in this revision are:
v3 -> v4:
- Addressed comments from Greg Kroah-Hartman:
  1. Reworked handling of newly introduced attribute.
  2. Required Documentation update is expected in the follow up patchset


The previous version can be found at:
v2: 
https://lkml.kernel.org/r/20191210091451.6054-1-roman.sudari...@linux.intel.com

Changes in this revision are:
v2 -> v3:
  1. Addressed comments from Peter and Kan

The previous version can be found at:
v1: 
https://lkml.kernel.org/r/20191126163630.17300-1-roman.sudari...@linux.intel.com

Changes in this revision are:
v1 -> v2:
  1. Fixed process related issues;
  2. This patch set includes kernel support for IIO stack to PMON mapping;
  3. Stephane raised concerns regarding output format which may require
code changes in the user space part of the feature only. We will continue
output format discussion in the context of user space update.

Intel® Xeon® Scalable processor family (code name Skylake-SP) makes
significant changes in the integrated I/O (IIO) architecture. The new
solution introduces IIO stacks which are responsible for managing traffic
between the PCIe domain and the Mesh domain. Each IIO stack has its own
PMON block and can handle either DMI port, x16 PCIe root port, MCP-Link
or various built-in accelerators. IIO PMON blocks allow concurrent
monitoring of I/O flows up to 4 x4 bifurcation within each IIO stack.

Software is supposed to program required perf counters within each IIO
stack and gather performance data. The tricky thing here is that IIO PMON
reports data per IIO stack but users have no idea what IIO stacks are -
they only know devices which are connected to the platform.

Understanding the IIO stack concept to find which IIO stack a particular
IO device is connected to, or identifying an IIO PMON block to program
for monitoring a specific IIO stack, assumes a lot of implicit knowledge
about the given Intel server platform architecture.

This patch set introduces:
1. An infrastructure for exposing an Uncore unit to Uncore PMON mapping
   through sysfs-backend;
2. A new --iiostat mode in perf stat to provide I/O performance metrics
   per I/O device.

Usage examples:

1. List all devices below IIO stacks
  ./perf stat --iiostat=show

Sample output w/o libpci:

S0-RootPort0-uncore_iio_0<00:00.0>
S1-RootPort0-uncore_iio_0<81:00.0>
S0-RootPort1-uncore_iio_1<18:00.0>
S1-RootPort1-uncore_iio_1<86:00.0>
S1-RootPort1-un

[RESEND PATCH v9 3/3] perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping

2020-05-25 Thread alexander . antonov
From: Roman Sudarikov 

The current version supports the server line starting with the Intel® Xeon®
Processor Scalable Family and introduces mapping for IIO Uncore units only.
Other units can be added on demand.

IIO stack to PMON mapping is exposed through:
/sys/devices/uncore_iio_<pmu_idx>/dieX
where dieX is a file which holds "Segment:Root Bus" for the PCIe root port
which can be monitored by that IIO PMON block.

Details are explained in Documentation/ABI/testing/sysfs-devices-mapping

Co-developed-by: Alexander Antonov 
Signed-off-by: Alexander Antonov 
Signed-off-by: Roman Sudarikov 
Reviewed-by: Kan Liang 
Reviewed-by: Alexander Shishkin 
---
 .../ABI/testing/sysfs-devices-mapping |  33 +++
 arch/x86/events/intel/uncore.h|   9 +
 arch/x86/events/intel/uncore_snbep.c  | 191 ++
 3 files changed, 233 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-devices-mapping

diff --git a/Documentation/ABI/testing/sysfs-devices-mapping 
b/Documentation/ABI/testing/sysfs-devices-mapping
new file mode 100644
index 000000000000..490ccfd67f12
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-mapping
@@ -0,0 +1,33 @@
+What:   /sys/devices/uncore_iio_x/dieX
+Date:   February 2020
+Contact:Roman Sudarikov 
+Description:
+Each IIO stack (PCIe root port) has its own IIO PMON block, so
+each dieX file (where X is die number) holds "Segment:Root Bus"
+for PCIe root port, which can be monitored by that IIO PMON
+block.
+For example, on 4-die Xeon platform with up to 6 IIO stacks per
+die and, therefore, 6 IIO PMON blocks per die, the mapping of
+IIO PMON block 0 exposes as the following:
+
+$ ls /sys/devices/uncore_iio_0/die*
+-r--r--r-- /sys/devices/uncore_iio_0/die0
+-r--r--r-- /sys/devices/uncore_iio_0/die1
+-r--r--r-- /sys/devices/uncore_iio_0/die2
+-r--r--r-- /sys/devices/uncore_iio_0/die3
+
+$ tail /sys/devices/uncore_iio_0/die*
+==> /sys/devices/uncore_iio_0/die0 <==
+0000:00
+==> /sys/devices/uncore_iio_0/die1 <==
+0000:40
+==> /sys/devices/uncore_iio_0/die2 <==
+0000:80
+==> /sys/devices/uncore_iio_0/die3 <==
+0000:c0
+
+Which means:
+IIO PMU 0 on die 0 belongs to PCI RP on bus 0x00, domain 0x0000
+IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000
+IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000
+IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 8621b66c49ef..61a7eaa81224 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -181,6 +181,15 @@ int uncore_pcibus_to_physid(struct pci_bus *bus);
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf);
 
+static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev)
+{
+   return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu);
+}
+
+#define to_device_attribute(n) container_of(n, struct device_attribute, attr)
+#define to_dev_ext_attribute(n) container_of(n, struct dev_ext_attribute, attr)
+#define attr_to_ext_attr(n) to_dev_ext_attribute(to_device_attribute(n))
+
 extern int __uncore_max_dies;
 #define uncore_max_dies()  (__uncore_max_dies)
 
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 07652fa20ebb..8cd3539028ae 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -273,6 +273,30 @@
 #define SKX_CPUNODEID  0xc0
 #define SKX_GIDNIDMAP  0xd4
 
+/*
+ * The CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR
+ * that BIOS programmed. MSR has package scope.
+ * |  Bit  |  Default  |  Description
+ * | [63]  |00h| VALID - When set, indicates the CPU bus
+ *   numbers have been initialized. (RO)
+ * |[62:48]|---| Reserved
+ * |[47:40]|00h| BUS_NUM_5 — Return the bus number BIOS assigned
+ *   CPUBUSNO(5). (RO)
+ * |[39:32]|00h| BUS_NUM_4 — Return the bus number BIOS assigned
+ *   CPUBUSNO(4). (RO)
+ * |[31:24]|00h| BUS_NUM_3 — Return the bus number BIOS assigned
+ *   CPUBUSNO(3). (RO)
+ * |[23:16]|00h| BUS_NUM_2 — Return the bus number BIOS assigned
+ *   CPUBUSNO(2). (RO)
+ * |[15:8] |00h| BUS_NUM_1 — Return the bus number BIOS assigned
+ *   CPUBUSNO(1). (RO)
+ * 

[RESEND PATCH v9 2/3] perf/x86/intel/uncore: Wrap the max dies calculation into an accessor

2020-05-25 Thread alexander . antonov
From: Roman Sudarikov 

Add an accessor that returns the number of dies on the platform.

Co-developed-by: Alexander Antonov 
Signed-off-by: Alexander Antonov 
Signed-off-by: Roman Sudarikov 
Reviewed-by: Kan Liang 
Reviewed-by: Alexander Shishkin 
---
 arch/x86/events/intel/uncore.c | 13 +++--
 arch/x86/events/intel/uncore.h |  3 +++
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index b71e8f7529a4..e4f37dc83cf0 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -16,7 +16,7 @@ struct pci_driver *uncore_pci_driver;
 DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
 struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_extra_dev *uncore_extra_pci_dev;
-static int max_dies;
+int __uncore_max_dies;
 
 /* mask of cpus that collect uncore events */
 static cpumask_t uncore_cpu_mask;
@@ -108,7 +108,7 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu
 * The unsigned check also catches the '-1' return value for non
 * existent mappings in the topology map.
 */
-   return dieid < max_dies ? pmu->boxes[dieid] : NULL;
+   return dieid < uncore_max_dies() ? pmu->boxes[dieid] : NULL;
 }
 
 u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
@@ -879,7 +879,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu)
 {
int die;
 
-   for (die = 0; die < max_dies; die++)
+   for (die = 0; die < uncore_max_dies(); die++)
kfree(pmu->boxes[die]);
kfree(pmu->boxes);
 }
@@ -920,7 +920,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
if (!pmus)
return -ENOMEM;
 
-   size = max_dies * sizeof(struct intel_uncore_box *);
+   size = uncore_max_dies() * sizeof(struct intel_uncore_box *);
 
for (i = 0; i < type->num_boxes; i++) {
pmus[i].func_id = setid ? i : -1;
@@ -1120,7 +1120,7 @@ static int __init uncore_pci_init(void)
size_t size;
int ret;
 
-   size = max_dies * sizeof(struct pci_extra_dev);
+   size = uncore_max_dies() * sizeof(struct pci_extra_dev);
uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL);
if (!uncore_extra_pci_dev) {
ret = -ENOMEM;
@@ -1547,7 +1547,8 @@ static int __init intel_uncore_init(void)
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
return -ENODEV;
 
-   max_dies = topology_max_packages() * topology_max_die_per_package();
+   __uncore_max_dies =
+   topology_max_packages() * topology_max_die_per_package();
 
uncore_init = (struct intel_uncore_init_fun *)id->driver_data;
if (uncore_init->pci_init) {
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 8f2b77d27513..8621b66c49ef 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -181,6 +181,9 @@ int uncore_pcibus_to_physid(struct pci_bus *bus);
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf);
 
+extern int __uncore_max_dies;
+#define uncore_max_dies()  (__uncore_max_dies)
+
 #define INTEL_UNCORE_EVENT_DESC(_name, _config)\
 {  \
.attr   = __ATTR(_name, 0444, uncore_event_show, NULL), \
-- 
2.19.1