[RESEND PATCH v5 4/4] perf: Update .gitignore file
From: Alexander Antonov After a "make -C tools/perf", git reports the following untracked file: perf-iostat Add this generated file to perf's .gitignore file. Acked-by: Namhyung Kim Signed-off-by: Alexander Antonov --- tools/perf/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index f3f84781fd74..e555e9729758 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -20,6 +20,7 @@ perf.data.old output.svg perf-archive perf-with-kcore +perf-iostat tags TAGS cscope* -- 2.21.3
[RESEND PATCH v5 1/4] perf stat: Basic support for iostat in perf
From: Alexander Antonov Add basic flow for a new iostat mode in perf. Mode is intended to provide four I/O performance metrics per each PCIe root port: Inbound Read, Inbound Write, Outbound Read, Outbound Write. The actual code to compute the metrics and attribute it to root port is in follow-on patches. Acked-by: Namhyung Kim Signed-off-by: Alexander Antonov --- tools/perf/builtin-stat.c | 21 +- tools/perf/util/Build | 1 + tools/perf/util/iostat.c | 53 ++ tools/perf/util/iostat.h | 47 ++ tools/perf/util/stat-display.c | 40 ++--- tools/perf/util/stat-shadow.c | 5 +++- tools/perf/util/stat.h | 1 + 7 files changed, 156 insertions(+), 12 deletions(-) create mode 100644 tools/perf/util/iostat.c create mode 100644 tools/perf/util/iostat.h diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 2a2c15cac80a..ba5b31aab86b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -68,6 +68,7 @@ #include "util/affinity.h" #include "util/pfm.h" #include "util/bpf_counter.h" +#include "util/iostat.h" #include "asm/bug.h" #include @@ -212,7 +213,8 @@ static struct perf_stat_config stat_config = { .walltime_nsecs_stats = &walltime_nsecs_stats, .big_num= true, .ctl_fd = -1, - .ctl_fd_ack = -1 + .ctl_fd_ack = -1, + .iostat_run = false, }; static bool cpus_map_matched(struct evsel *a, struct evsel *b) @@ -1268,6 +1270,9 @@ static struct option stat_options[] = { "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n" "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.", parse_control_option), + OPT_CALLBACK_OPTARG(0, "iostat", &evsel_list, &stat_config, "default", + "measure I/O performance metrics provided by arch/platform", + iostat_parse), OPT_END() }; @@ -2341,6 +2346,17 @@ int cmd_stat(int argc, const char **argv) goto out; } + if (stat_config.iostat_run) { + status = iostat_prepare(evsel_list, &stat_config); + if (status) + goto out; + if (iostat_mode == IOSTAT_LIST) { + 
iostat_list(evsel_list, &stat_config); + goto out; + } else if (verbose) + iostat_list(evsel_list, &stat_config); + } + if (add_default_attributes()) goto out; @@ -2516,6 +2532,9 @@ int cmd_stat(int argc, const char **argv) perf_stat__exit_aggr_mode(); evlist__free_stats(evsel_list); out: + if (stat_config.iostat_run) + iostat_release(evsel_list); + zfree(&stat_config.walltime_run); if (smi_cost && smi_reset) diff --git a/tools/perf/util/Build b/tools/perf/util/Build index e3e12f9d4733..7dd815712d60 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -102,6 +102,7 @@ perf-y += rwsem.o perf-y += thread-stack.o perf-y += spark.o perf-y += topdown.o +perf-y += iostat.o perf-y += stream.o perf-$(CONFIG_AUXTRACE) += auxtrace.o perf-$(CONFIG_AUXTRACE) += intel-pt-decoder/ diff --git a/tools/perf/util/iostat.c b/tools/perf/util/iostat.c new file mode 100644 index ..57dd49da28fe --- /dev/null +++ b/tools/perf/util/iostat.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "util/iostat.h" +#include "util/debug.h" + +enum iostat_mode_t iostat_mode = IOSTAT_NONE; + +__weak int iostat_prepare(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused) +{ + return -1; +} + +__weak int iostat_parse(const struct option *opt __maybe_unused, +const char *str __maybe_unused, +int unset __maybe_unused) +{ + pr_err("iostat mode is not supported on current platform\n"); + return -1; +} + +__weak void iostat_list(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused) +{ +} + +__weak void iostat_release(struct evlist *evlist __maybe_unused) +{ +} + +__weak void iostat_print_header_prefix(struct perf_stat_config *config __maybe_unused) +{ +} + +__weak void iostat_print_metric(struct perf_stat_config *config __maybe_unused, + struct evsel *evsel __maybe_unused, + struct perf_stat_output_ctx *out __maybe_unused) +{ +} + +__weak void iostat_prefix(struct evlist *evlist _
[RESEND PATCH v5 2/4] perf stat: Helper functions for PCIe root ports list in iostat mode
From: Alexander Antonov Introduce helper functions to control PCIe root ports list. These helpers will be used in the follow-up patch. Acked-by: Namhyung Kim Signed-off-by: Alexander Antonov --- tools/perf/arch/x86/util/iostat.c | 110 ++ 1 file changed, 110 insertions(+) create mode 100644 tools/perf/arch/x86/util/iostat.c diff --git a/tools/perf/arch/x86/util/iostat.c b/tools/perf/arch/x86/util/iostat.c new file mode 100644 index ..c4471f8efa5e --- /dev/null +++ b/tools/perf/arch/x86/util/iostat.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * perf iostat + * + * Copyright (C) 2020, Intel Corporation + * + * Authors: Alexander Antonov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util/cpumap.h" +#include "util/debug.h" +#include "util/iostat.h" +#include "util/counts.h" +#include "path.h" + +struct iio_root_port { + u32 domain; + u8 bus; + u8 die; + u8 pmu_idx; + int idx; +}; + +struct iio_root_ports_list { + struct iio_root_port **rps; + int nr_entries; +}; + +static void iio_root_port_show(FILE *output, + const struct iio_root_port * const rp) +{ + if (output && rp) + fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n", + rp->die, rp->pmu_idx, rp->domain, rp->bus); +} + +static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus, + u8 die, u8 pmu_idx) +{ + struct iio_root_port *p = calloc(1, sizeof(*p)); + + if (p) { + p->domain = domain; + p->bus = bus; + p->die = die; + p->pmu_idx = pmu_idx; + } + return p; +} + +static void iio_root_ports_list_free(struct iio_root_ports_list *list) +{ + int idx; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) + free(list->rps[idx]); + free(list->rps); + free(list); + } +} + +static struct iio_root_port *iio_root_port_find_by_notation( + const struct iio_root_ports_list * const list, u32 domain, u8 bus) +{ + int idx; + struct iio_root_port *rp; + + if (list) { + for (idx = 0; idx < 
list->nr_entries; idx++) { + rp = list->rps[idx]; + if (rp && rp->domain == domain && rp->bus == bus) + return rp; + } + } + return NULL; +} + +static int iio_root_ports_list_insert(struct iio_root_ports_list *list, + struct iio_root_port * const rp) +{ + struct iio_root_port **tmp_buf; + + if (list && rp) { + rp->idx = list->nr_entries++; + tmp_buf = realloc(list->rps, + list->nr_entries * sizeof(*list->rps)); + if (!tmp_buf) { + pr_err("Failed to realloc memory\n"); + return -ENOMEM; + } + tmp_buf[rp->idx] = rp; + list->rps = tmp_buf; + } + return 0; +} -- 2.21.3
[RESEND PATCH v5 3/4] perf stat: Enable iostat mode for x86 platforms
From: Alexander Antonov This functionality is based on recently introduced sysfs attributes for Intel® Xeon® Scalable processor family (code name Skylake-SP): Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping") Mode is intended to provide four I/O performance metrics in MB per each PCIe root port: - Inbound Read: I/O devices below root port read from the host memory - Inbound Write: I/O devices below root port write to the host memory - Outbound Read: CPU reads from I/O devices below root port - Outbound Write: CPU writes to I/O devices below root port Each metric requires only one uncore event which increments at every 4B transfer in the corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Acked-by: Namhyung Kim Signed-off-by: Alexander Antonov --- tools/perf/Documentation/perf-iostat.txt | 88 ++ tools/perf/Makefile.perf | 5 +- tools/perf/arch/x86/util/Build | 1 + tools/perf/arch/x86/util/iostat.c| 360 +++ tools/perf/command-list.txt | 1 + tools/perf/perf-iostat.sh| 12 + 6 files changed, 466 insertions(+), 1 deletion(-) create mode 100644 tools/perf/Documentation/perf-iostat.txt create mode 100644 tools/perf/perf-iostat.sh diff --git a/tools/perf/Documentation/perf-iostat.txt b/tools/perf/Documentation/perf-iostat.txt new file mode 100644 index ..165176944031 --- /dev/null +++ b/tools/perf/Documentation/perf-iostat.txt @@ -0,0 +1,88 @@ +perf-iostat(1) +=== + +NAME + +perf-iostat - Show I/O performance metrics + +SYNOPSIS + +[verse] +'perf iostat' list +'perf iostat' -- [] + +DESCRIPTION +--- +Mode is intended to provide four I/O performance metrics per each PCIe root port: + +- Inbound Read - I/O devices below root port read from the host memory, in MB + +- Inbound Write - I/O devices below root port write to the host memory, in MB + +- Outbound Read - CPU reads from I/O devices below root port, in MB + +- Outbound Write - CPU writes to I/O devices below root port, in MB + +OPTIONS 
+--- +...:: + Any command you can specify in a shell. + +list:: + List all PCIe root ports. + +:: + Select the root ports for monitoring. Comma-separated list is supported. + +EXAMPLES + + +1. List all PCIe root ports (example for 2-S platform): + + $ perf iostat list + S0-uncore_iio_0<:00> + S1-uncore_iio_0<:80> + S0-uncore_iio_1<:17> + S1-uncore_iio_1<:85> + S0-uncore_iio_2<:3a> + S1-uncore_iio_2<:ae> + S0-uncore_iio_3<:5d> + S1-uncore_iio_3<:d7> + +2. Collect metrics for all PCIe root ports: + + $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s + +Performance counter stats for 'system wide': + + port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) + :00102 3 + :80000 0 + :17 352552 430 21 + :85000 0 + :3a300 0 + :ae000 0 + :5d000 0 + :d7000 0 + +3. Collect metrics for comma-separated list of PCIe root ports: + + $ perf iostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s + +Performance counter stats for 'system wide': + + port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) + :17 358559 440 22 + :3a320 0 + +197.081983474 seconds time elapsed + +SEE ALSO + +linkperf:perf-stat[1] \ No newline at end of file diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 090fb9d62665..6240fbb1646e 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -283,6 +283,7 @@ SCRIPT_SH = SCRIPT_SH += perf-archive.sh SCRIPT_SH += perf-with-kcore.sh +SCRIPT_SH
[RESEND PATCH v5 0/4] perf stat: Introduce iostat mode to provide I/O performance metrics
From: Alexander Antonov Resending V5 with added Acked-by: Namhyung Kim tag. Thanks, Alexander The previous version can be found at: v4: https://lkml.kernel.org/r/20210203135830.38568-1-alexander.anto...@linux.intel.com/ Changes in this revision are: v4 -> v5: - Addressed comments from Namhyung Kim: 1. Removed AGGR_PCIE_PORT aggregation mode 2. Added iostat_prepare() function 3. Moved implementation specific fprintf() calls to separate x86-related function 4. Fixed code-related issues - Moved __weak iostat's functions to separate util/iostat.c file The previous version can be found at: v3: https://lkml.kernel.org/r/20210126080619.30275-1-alexander.anto...@linux.intel.com/ Changes in this revision are: v3 -> v4: - Addressed comment from Namhyung Kim: 1. Removed NULL-termination of root ports list The previous version can be found at: v2: https://lkml.kernel.org/r/20201223130320.3930-1-alexander.anto...@linux.intel.com Changes in this revision are: v2 -> v3: - Addressed comments from Namhyung Kim: 1. Removed perf_device pointer from evsel structure. Use priv field instead 2. Renamed 'iiostat' to 'iostat' 3. Renamed 'show' mode to 'list' mode 4. Renamed iiostat_delete_root_ports() to iiostat_release() and iostat_show_root_ports() to iostat_list() The previous version can be found at: v1: https://lkml.kernel.org/r/20201210090340.14358-1-alexander.anto...@linux.intel.com Changes in this revision are: v1 -> v2: - Addressed comment from Arnaldo Carvalho de Melo: 1. 
Using 'perf iiostat' subcommand instead of 'perf stat --iiostat': - Added perf-iiostat.sh script to use short command - Updated manual pages to get help for 'perf iiostat' - Added 'perf-iiostat' to perf's gitignore file Mode is intended to provide four I/O performance metrics in MB per each root port: - Inbound Read: I/O devices below root port read from the host memory - Inbound Write: I/O devices below root port write to the host memory - Outbound Read: CPU reads from I/O devices below root port - Outbound Write: CPU writes to I/O devices below root port Each metric requires only one uncore event which increments at every 4B transfer in the corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Note: iostat introduces new perf data aggregation mode - per PCIe root port hence -e and -M options are not supported. Usage examples: 1. List all PCIe root ports (example for 2-S platform): $ perf iostat list S0-uncore_iio_0<:00> S1-uncore_iio_0<:80> S0-uncore_iio_1<:17> S1-uncore_iio_1<:85> S0-uncore_iio_2<:3a> S1-uncore_iio_2<:ae> S0-uncore_iio_3<:5d> S1-uncore_iio_3<:d7> 2. Collect metrics for all PCIe root ports: $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct 357708+0 records in 357707+0 records out 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s Performance counter stats for 'system wide': port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) :00102 3 :80000 0 :17 352552 430 21 :85000 0 :3a300 0 :ae000 0 :5d000 0 :d7000 0 3. 
Collect metrics for comma separated list of PCIe root ports: $ perf iostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct 357708+0 records in 357707+0 records out 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s Performance counter stats for 'system wide': port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) :17 358559 44 0 22 :3a320 0 197.081983474 seconds time elapsed Alexander Antonov (4): perf stat: Basic support for iostat in perf perf stat: Helper functions for PCIe root ports list in iostat mode perf stat: Enable iostat mode for x86 platforms perf: Update .gitignore file tools/perf/.gitignore|
[tip: perf/core] perf/x86/intel/uncore: Enable IIO stacks to PMON mapping for multi-segment SKX
The following commit has been merged into the perf/core branch of tip: Commit-ID: cface0326a6c2ae5c8f47bd466f07624b3e348a7 Gitweb: https://git.kernel.org/tip/cface0326a6c2ae5c8f47bd466f07624b3e348a7 Author:Alexander Antonov AuthorDate:Tue, 23 Mar 2021 18:05:07 +03:00 Committer: Peter Zijlstra CommitterDate: Fri, 02 Apr 2021 10:04:55 +02:00 perf/x86/intel/uncore: Enable IIO stacks to PMON mapping for multi-segment SKX IIO stacks to PMON mapping on Skylake servers is exposed through introduced early attributes /sys/devices/uncore_iio_/dieX, where dieX is a file which holds "Segment:Root Bus" for PCIe root port which can be monitored by that IIO PMON block. These sysfs attributes are disabled for multiple segment topologies except VMD domains which start at 0x1. This patch removes the limitation and enables IIO stacks to PMON mapping for multi-segment Skylake servers by introducing segment-aware intel_uncore_topology structure and attributing the topology configuration to the segment in skx_iio_get_topology() function. Reported-by: kernel test robot Signed-off-by: Alexander Antonov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Reviewed-by: Andi Kleen Tested-by: Kyle Meyer Link: https://lkml.kernel.org/r/20210323150507.2013-1-alexander.anto...@linux.intel.com --- arch/x86/events/intel/uncore.c | 12 +- arch/x86/events/intel/uncore.h | 9 +++- arch/x86/events/intel/uncore_snbep.c | 60 --- 3 files changed, 47 insertions(+), 34 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 35b3470..a2b68bb 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -53,6 +53,18 @@ int uncore_pcibus_to_dieid(struct pci_bus *bus) return die_id; } +int uncore_die_to_segment(int die) +{ + struct pci_bus *bus = NULL; + + /* Find first pci bus which attributes to specified die. */ + while ((bus = pci_find_next_bus(bus)) && + (die != uncore_pcibus_to_dieid(bus))) + ; + + return bus ? 
pci_domain_nr(bus) : -EINVAL; +} + static void uncore_free_pcibus_map(void) { struct pci2phy_map *map, *tmp; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 549cfb2..96569dc 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -42,6 +42,7 @@ struct intel_uncore_pmu; struct intel_uncore_box; struct uncore_event_desc; struct freerunning_counters; +struct intel_uncore_topology; struct intel_uncore_type { const char *name; @@ -87,7 +88,7 @@ struct intel_uncore_type { * to identify which platform component each PMON block of that type is * supposed to monitor. */ - u64 *topology; + struct intel_uncore_topology *topology; /* * Optional callbacks for managing mapping of Uncore units to PMONs */ @@ -176,6 +177,11 @@ struct freerunning_counters { unsigned *box_offsets; }; +struct intel_uncore_topology { + u64 configuration; + int segment; +}; + struct pci2phy_map { struct list_head list; int segment; @@ -184,6 +190,7 @@ struct pci2phy_map { struct pci2phy_map *__find_pci2phy_map(int segment); int uncore_pcibus_to_dieid(struct pci_bus *bus); +int uncore_die_to_segment(int die); ssize_t uncore_event_show(struct device *dev, struct device_attribute *attr, char *buf); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index b79951d..acc3c0e 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3684,7 +3684,8 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die) { - return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE); + return pmu->type->topology[die].configuration >> + (pmu->pmu_idx * BUS_NUM_STRIDE); } static umode_t @@ -3697,19 +3698,14 @@ skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) } static ssize_t skx_iio_mapping_show(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute 
*attr, char *buf) { - struct pci_bus *bus = pci_find_next_bus(NULL); - struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev); + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev); struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); long die = (long)ea->var; - /* -* Current implementation is for single segment configuration hence it's -* safe to take the segment value from the first available root bus. -*/ -
[PATCH v5 4/4] perf: Update .gitignore file
After a "make -C tools/perf", git reports the following untracked file: perf-iostat Add this generated file to perf's .gitignore file. Signed-off-by: Alexander Antonov --- tools/perf/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index f3f84781fd74..e555e9729758 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -20,6 +20,7 @@ perf.data.old output.svg perf-archive perf-with-kcore +perf-iostat tags TAGS cscope* -- 2.19.1
[PATCH v5 3/4] perf stat: Enable iostat mode for x86 platforms
This functionality is based on recently introduced sysfs attributes for Intel® Xeon® Scalable processor family (code name Skylake-SP): Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping") Mode is intended to provide four I/O performance metrics in MB per each PCIe root port: - Inbound Read: I/O devices below root port read from the host memory - Inbound Write: I/O devices below root port write to the host memory - Outbound Read: CPU reads from I/O devices below root port - Outbound Write: CPU writes to I/O devices below root port Each metric requires only one uncore event which increments at every 4B transfer in the corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Signed-off-by: Alexander Antonov --- tools/perf/Documentation/perf-iostat.txt | 88 ++ tools/perf/Makefile.perf | 5 +- tools/perf/arch/x86/util/Build | 1 + tools/perf/arch/x86/util/iostat.c| 360 +++ tools/perf/command-list.txt | 1 + tools/perf/perf-iostat.sh| 12 + 6 files changed, 466 insertions(+), 1 deletion(-) create mode 100644 tools/perf/Documentation/perf-iostat.txt create mode 100644 tools/perf/perf-iostat.sh diff --git a/tools/perf/Documentation/perf-iostat.txt b/tools/perf/Documentation/perf-iostat.txt new file mode 100644 index ..165176944031 --- /dev/null +++ b/tools/perf/Documentation/perf-iostat.txt @@ -0,0 +1,88 @@ +perf-iostat(1) +=== + +NAME + +perf-iostat - Show I/O performance metrics + +SYNOPSIS + +[verse] +'perf iostat' list +'perf iostat' -- [] + +DESCRIPTION +--- +Mode is intended to provide four I/O performance metrics per each PCIe root port: + +- Inbound Read - I/O devices below root port read from the host memory, in MB + +- Inbound Write - I/O devices below root port write to the host memory, in MB + +- Outbound Read - CPU reads from I/O devices below root port, in MB + +- Outbound Write - CPU writes to I/O devices below root port, in MB + +OPTIONS +--- +...:: + Any command you can specify in a 
shell. + +list:: + List all PCIe root ports. + +:: + Select the root ports for monitoring. Comma-separated list is supported. + +EXAMPLES + + +1. List all PCIe root ports (example for 2-S platform): + + $ perf iostat list + S0-uncore_iio_0<:00> + S1-uncore_iio_0<:80> + S0-uncore_iio_1<:17> + S1-uncore_iio_1<:85> + S0-uncore_iio_2<:3a> + S1-uncore_iio_2<:ae> + S0-uncore_iio_3<:5d> + S1-uncore_iio_3<:d7> + +2. Collect metrics for all PCIe root ports: + + $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s + +Performance counter stats for 'system wide': + + port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) + :00102 3 + :80000 0 + :17 352552 430 21 + :85000 0 + :3a300 0 + :ae000 0 + :5d000 0 + :d7000 0 + +3. Collect metrics for comma-separated list of PCIe root ports: + + $ perf iostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s + +Performance counter stats for 'system wide': + + port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) + :17 358559 440 22 + :3a320 0 + +197.081983474 seconds time elapsed + +SEE ALSO + +linkperf:perf-stat[1] \ No newline at end of file diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index a7d768fdc8a1..3b3a452f4862 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -283,6 +283,7 @@ SCRIPT_SH = SCRIPT_SH += perf-archive.sh SCRIPT_SH += perf-with-kcore.sh +SCRIPT_SH += perf-iostat.sh grep-libs = $(filter -l%,$(1))
[PATCH v5 0/4] perf stat: Introduce iostat mode to provide I/O performance metrics
The previous version can be found at: v4: https://lkml.kernel.org/r/20210203135830.38568-1-alexander.anto...@linux.intel.com/ Changes in this revision are: v4 -> v5: - Addressed comments from Namhyung Kim: 1. Removed AGGR_PCIE_PORT aggregation mode 2. Added iostat_prepare() function 3. Moved implementation specific fprintf() calls to separate x86-related function 4. Fixed code-related issues - Moved __weak iostat's functions to separate util/iostat.c file The previous version can be found at: v3: https://lkml.kernel.org/r/20210126080619.30275-1-alexander.anto...@linux.intel.com/ Changes in this revision are: v3 -> v4: - Addressed comment from Namhyung Kim: 1. Removed NULL-termination of root ports list The previous version can be found at: v2: https://lkml.kernel.org/r/20201223130320.3930-1-alexander.anto...@linux.intel.com Changes in this revision are: v2 -> v3: - Addressed comments from Namhyung Kim: 1. Removed perf_device pointer from evsel structure. Use priv field instead 2. Renamed 'iiostat' to 'iostat' 3. Renamed 'show' mode to 'list' mode 4. Renamed iiostat_delete_root_ports() to iiostat_release() and iostat_show_root_ports() to iostat_list() The previous version can be found at: v1: https://lkml.kernel.org/r/20201210090340.14358-1-alexander.anto...@linux.intel.com Changes in this revision are: v1 -> v2: - Addressed comment from Arnaldo Carvalho de Melo: 1. 
Using 'perf iiostat' subcommand instead of 'perf stat --iiostat': - Added perf-iiostat.sh script to use short command - Updated manual pages to get help for 'perf iiostat' - Added 'perf-iiostat' to perf's gitignore file Mode is intended to provide four I/O performance metrics in MB per each root port: - Inbound Read: I/O devices below root port read from the host memory - Inbound Write: I/O devices below root port write to the host memory - Outbound Read: CPU reads from I/O devices below root port - Outbound Write: CPU writes to I/O devices below root port Each metric requires only one uncore event which increments at every 4B transfer in the corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Note: iostat introduces new perf data aggregation mode - per PCIe root port hence -e and -M options are not supported. Usage examples: 1. List all PCIe root ports (example for 2-S platform): $ perf iostat list S0-uncore_iio_0<:00> S1-uncore_iio_0<:80> S0-uncore_iio_1<:17> S1-uncore_iio_1<:85> S0-uncore_iio_2<:3a> S1-uncore_iio_2<:ae> S0-uncore_iio_3<:5d> S1-uncore_iio_3<:d7> 2. Collect metrics for all PCIe root ports: $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct 357708+0 records in 357707+0 records out 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s Performance counter stats for 'system wide': port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) :00102 3 :80000 0 :17 352552 430 21 :85000 0 :3a300 0 :ae000 0 :5d000 0 :d7000 0 3. 
Collect metrics for comma separated list of PCIe root ports: $ perf iostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct 357708+0 records in 357707+0 records out 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s Performance counter stats for 'system wide': port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) :17 358559 44 0 22 :3a320 0 197.081983474 seconds time elapsed Alexander Antonov (4): perf stat: Basic support for iostat in perf perf stat: Helper functions for PCIe root ports list in iostat mode perf stat: Enable iostat mode for x86 platforms perf: Update .gitignore file tools/perf/.gitignore| 1 + tools/perf/Documentation/perf-iostat.txt | 88 + tools/perf/Makefile.perf
[PATCH v5 1/4] perf stat: Basic support for iostat in perf
Add basic flow for a new iostat mode in perf. Mode is intended to provide four I/O performance metrics per each PCIe root port: Inbound Read, Inbound Write, Outbound Read, Outbound Write. The actual code to compute the metrics and attribute it to root port is in follow-on patches. Signed-off-by: Alexander Antonov --- tools/perf/builtin-stat.c | 21 +- tools/perf/util/Build | 1 + tools/perf/util/iostat.c | 53 ++ tools/perf/util/iostat.h | 47 ++ tools/perf/util/stat-display.c | 40 ++--- tools/perf/util/stat-shadow.c | 5 +++- tools/perf/util/stat.h | 1 + 7 files changed, 156 insertions(+), 12 deletions(-) create mode 100644 tools/perf/util/iostat.c create mode 100644 tools/perf/util/iostat.h diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 2e2e4a8345ea..4cef64ce9261 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -68,6 +68,7 @@ #include "util/affinity.h" #include "util/pfm.h" #include "util/bpf_counter.h" +#include "util/iostat.h" #include "asm/bug.h" #include @@ -212,7 +213,8 @@ static struct perf_stat_config stat_config = { .walltime_nsecs_stats = &walltime_nsecs_stats, .big_num= true, .ctl_fd = -1, - .ctl_fd_ack = -1 + .ctl_fd_ack = -1, + .iostat_run = false, }; static bool cpus_map_matched(struct evsel *a, struct evsel *b) @@ -1247,6 +1249,9 @@ static struct option stat_options[] = { "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n" "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.", parse_control_option), + OPT_CALLBACK_OPTARG(0, "iostat", &evsel_list, &stat_config, "default", + "measure I/O performance metrics provided by arch/platform", + iostat_parse), OPT_END() }; @@ -2320,6 +2325,17 @@ int cmd_stat(int argc, const char **argv) goto out; } + if (stat_config.iostat_run) { + status = iostat_prepare(evsel_list, &stat_config); + if (status) + goto out; + if (iostat_mode == IOSTAT_LIST) { + iostat_list(evsel_list, &stat_config); + goto out; + 
} else if (verbose) + iostat_list(evsel_list, &stat_config); + } + if (add_default_attributes()) goto out; @@ -2495,6 +2511,9 @@ int cmd_stat(int argc, const char **argv) perf_stat__exit_aggr_mode(); evlist__free_stats(evsel_list); out: + if (stat_config.iostat_run) + iostat_release(evsel_list); + zfree(&stat_config.walltime_run); if (smi_cost && smi_reset) diff --git a/tools/perf/util/Build b/tools/perf/util/Build index e3e12f9d4733..7dd815712d60 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -102,6 +102,7 @@ perf-y += rwsem.o perf-y += thread-stack.o perf-y += spark.o perf-y += topdown.o +perf-y += iostat.o perf-y += stream.o perf-$(CONFIG_AUXTRACE) += auxtrace.o perf-$(CONFIG_AUXTRACE) += intel-pt-decoder/ diff --git a/tools/perf/util/iostat.c b/tools/perf/util/iostat.c new file mode 100644 index ..57dd49da28fe --- /dev/null +++ b/tools/perf/util/iostat.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "util/iostat.h" +#include "util/debug.h" + +enum iostat_mode_t iostat_mode = IOSTAT_NONE; + +__weak int iostat_prepare(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused) +{ + return -1; +} + +__weak int iostat_parse(const struct option *opt __maybe_unused, +const char *str __maybe_unused, +int unset __maybe_unused) +{ + pr_err("iostat mode is not supported on current platform\n"); + return -1; +} + +__weak void iostat_list(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused) +{ +} + +__weak void iostat_release(struct evlist *evlist __maybe_unused) +{ +} + +__weak void iostat_print_header_prefix(struct perf_stat_config *config __maybe_unused) +{ +} + +__weak void iostat_print_metric(struct perf_stat_config *config __maybe_unused, + struct evsel *evsel __maybe_unused, + struct perf_stat_output_ctx *out __maybe_unused) +{ +} + +__weak void iostat_prefix(struct evlist *evlist __maybe_unused, + struct pe
[PATCH v5 2/4] perf stat: Helper functions for PCIe root ports list in iostat mode
Introduce helper functions to control PCIe root ports list. These helpers will be used in the follow-up patch. Signed-off-by: Alexander Antonov --- tools/perf/arch/x86/util/iostat.c | 110 ++ 1 file changed, 110 insertions(+) create mode 100644 tools/perf/arch/x86/util/iostat.c diff --git a/tools/perf/arch/x86/util/iostat.c b/tools/perf/arch/x86/util/iostat.c new file mode 100644 index ..c4471f8efa5e --- /dev/null +++ b/tools/perf/arch/x86/util/iostat.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * perf iostat + * + * Copyright (C) 2020, Intel Corporation + * + * Authors: Alexander Antonov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util/cpumap.h" +#include "util/debug.h" +#include "util/iostat.h" +#include "util/counts.h" +#include "path.h" + +struct iio_root_port { + u32 domain; + u8 bus; + u8 die; + u8 pmu_idx; + int idx; +}; + +struct iio_root_ports_list { + struct iio_root_port **rps; + int nr_entries; +}; + +static void iio_root_port_show(FILE *output, + const struct iio_root_port * const rp) +{ + if (output && rp) + fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n", + rp->die, rp->pmu_idx, rp->domain, rp->bus); +} + +static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus, + u8 die, u8 pmu_idx) +{ + struct iio_root_port *p = calloc(1, sizeof(*p)); + + if (p) { + p->domain = domain; + p->bus = bus; + p->die = die; + p->pmu_idx = pmu_idx; + } + return p; +} + +static void iio_root_ports_list_free(struct iio_root_ports_list *list) +{ + int idx; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) + free(list->rps[idx]); + free(list->rps); + free(list); + } +} + +static struct iio_root_port *iio_root_port_find_by_notation( + const struct iio_root_ports_list * const list, u32 domain, u8 bus) +{ + int idx; + struct iio_root_port *rp; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) { + rp = 
list->rps[idx]; + if (rp && rp->domain == domain && rp->bus == bus) + return rp; + } + } + return NULL; +} + +static int iio_root_ports_list_insert(struct iio_root_ports_list *list, + struct iio_root_port * const rp) +{ + struct iio_root_port **tmp_buf; + + if (list && rp) { + rp->idx = list->nr_entries++; + tmp_buf = realloc(list->rps, + list->nr_entries * sizeof(*list->rps)); + if (!tmp_buf) { + pr_err("Failed to realloc memory\n"); + return -ENOMEM; + } + tmp_buf[rp->idx] = rp; + list->rps = tmp_buf; + } + return 0; +} -- 2.19.1
[PATCH] perf/x86/intel/uncore: Enable IIO stacks to PMON mapping for multi-segment SKX
IIO stacks to PMON mapping on Skylake servers is exposed through introduced early attributes /sys/devices/uncore_iio_/dieX, where dieX is a file which holds "Segment:Root Bus" for PCIe root port which can be monitored by that IIO PMON block. These sysfs attributes are disabled for multiple segment topologies except VMD domains which start at 0x1. This patch removes the limitation and enables IIO stacks to PMON mapping for multi-segment Skylake servers by introducing segment-aware intel_uncore_topology structure and attributing the topology configuration to the segment in skx_iio_get_topology() function. Reported-by: kernel test robot Tested-by: Kyle Meyer Reviewed-by: Andi Kleen Reviewed-by: Kan Liang Signed-off-by: Alexander Antonov --- arch/x86/events/intel/uncore.c | 12 ++ arch/x86/events/intel/uncore.h | 9 - arch/x86/events/intel/uncore_snbep.c | 60 +--- 3 files changed, 47 insertions(+), 34 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 33c8180d5a87..0c066d9aa17a 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -48,6 +48,18 @@ int uncore_pcibus_to_dieid(struct pci_bus *bus) return die_id; } +int uncore_die_to_segment(int die) +{ + struct pci_bus *bus = NULL; + + /* Find first pci bus which attributes to specified die. */ + while ((bus = pci_find_next_bus(bus)) && + (die != uncore_pcibus_to_dieid(bus))) + ; + + return bus ? 
pci_domain_nr(bus) : -EINVAL; +} + static void uncore_free_pcibus_map(void) { struct pci2phy_map *map, *tmp; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index a3c6e1643ad2..be2095ec458c 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -42,6 +42,7 @@ struct intel_uncore_pmu; struct intel_uncore_box; struct uncore_event_desc; struct freerunning_counters; +struct intel_uncore_topology; struct intel_uncore_type { const char *name; @@ -80,7 +81,7 @@ struct intel_uncore_type { * to identify which platform component each PMON block of that type is * supposed to monitor. */ - u64 *topology; + struct intel_uncore_topology *topology; /* * Optional callbacks for managing mapping of Uncore units to PMONs */ @@ -169,6 +170,11 @@ struct freerunning_counters { unsigned *box_offsets; }; +struct intel_uncore_topology { + u64 configuration; + int segment; +}; + struct pci2phy_map { struct list_head list; int segment; @@ -177,6 +183,7 @@ struct pci2phy_map { struct pci2phy_map *__find_pci2phy_map(int segment); int uncore_pcibus_to_dieid(struct pci_bus *bus); +int uncore_die_to_segment(int die); ssize_t uncore_event_show(struct device *dev, struct device_attribute *attr, char *buf); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index b79951d0707c..acc3c0e52f4d 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3684,7 +3684,8 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die) { - return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE); + return pmu->type->topology[die].configuration >> + (pmu->pmu_idx * BUS_NUM_STRIDE); } static umode_t @@ -3697,19 +3698,14 @@ skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) } static ssize_t skx_iio_mapping_show(struct device *dev, - struct device_attribute *attr, char *buf) + struct 
device_attribute *attr, char *buf) { - struct pci_bus *bus = pci_find_next_bus(NULL); - struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev); + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev); struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); long die = (long)ea->var; - /* -* Current implementation is for single segment configuration hence it's -* safe to take the segment value from the first available root bus. -*/ - return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus), - skx_iio_stack(uncore_pmu, die)); + return sprintf(buf, "%04x:%02x\n", pmu->type->topology[die].segment, + skx_iio_stack(pmu, die)); } static int skx_msr_cpu_bus_read(int cpu, u64 *topology) @@ -3746,34 +3742,32 @@ static int die_to_cpu(int die) static int skx_iio_get_topology(struct intel_uncore_type *type) { - int i, ret; - struct pci_bus *bus = NULL; - - /* -* Verified single-segment environ
Re: [PATCH v4 4/5] perf stat: Enable iostat mode for x86 platforms
On 3/9/2021 10:51 AM, liuqi (BA) wrote: Hi Alexander, On 2021/2/3 21:58, Alexander Antonov wrote: This functionality is based on recently introduced sysfs attributes for Intel® Xeon® Scalable processor family (code name Skylake-SP): Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping") Mode is intended to provide four I/O performance metrics in MB per each PCIe root port: - Inbound Read: I/O devices below root port read from the host memory - Inbound Write: I/O devices below root port write to the host memory - Outbound Read: CPU reads from I/O devices below root port - Outbound Write: CPU writes to I/O devices below root port Each metric requires only one uncore event which increments at every 4B transfer in corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Signed-off-by: Alexander Antonov --- tools/perf/Documentation/perf-iostat.txt | 88 ++ tools/perf/Makefile.perf | 5 +- tools/perf/arch/x86/util/Build | 1 + tools/perf/arch/x86/util/iostat.c | 345 +++ tools/perf/command-list.txt | 1 + tools/perf/perf-iostat.sh | 12 + 6 files changed, 451 insertions(+), 1 deletion(-) create mode 100644 tools/perf/Documentation/perf-iostat.txt create mode 100644 tools/perf/perf-iostat.sh diff --git a/tools/perf/Documentation/perf-iostat.txt b/tools/perf/Documentation/perf-iostat.txt new file mode 100644 index ..165176944031 --- /dev/null +++ b/tools/perf/Documentation/perf-iostat.txt @@ -0,0 +1,88 @@ +perf-iostat(1) +=== + +NAME + +perf-iostat - Show I/O performance metrics + +SYNOPSIS + +[verse] +'perf iostat' list +'perf iostat' -- [] + +DESCRIPTION +--- +Mode is intended to provide four I/O performance metrics per each PCIe root port: + +- Inbound Read - I/O devices below root port read from the host memory, in MB + +- Inbound Write - I/O devices below root port write to the host memory, in MB + +- Outbound Read - CPU reads from I/O devices below root port, in MB + +- Outbound Write - CPU 
writes to I/O devices below root port, in MB + +OPTIONS +--- +...:: + Any command you can specify in a shell. + +list:: + List all PCIe root ports. I noticed that the "iostat" command and the cmd_iostat() callback function are not registered in cmd_struct in perf.c. So I think "perf iostat list" perhaps can not work properly. I also tested this patchset on an x86 platform, and here is the log: root@ubuntu:/home/lq# ./perf iostat list perf: 'iostat' is not a perf-command. See 'perf --help'. root@ubuntu:/home/lq# ./perf stat --iostat ^C Performance counter stats for 'system wide': port Inbound Read(MB) Inbound Write(MB) Outbound Read(MB) Outbound Write(MB) :00 0 0 0 0 :80 0 0 0 0 :17 0 0 0 0 :85 0 0 0 0 :3a 0 0 0 0 :ae 0 0 0 0 :5d 0 0 0 0 :d7 0 0 0 0 0.611303832 seconds time elapsed root@ubuntu:/home/lq# ./perf stat --iostat=:17 ^C Performance counter stats for 'system wide': port Inbound Read(MB) Inbound Write(MB) Outbound Read(MB) Outbound Write(MB) :17 0 0 0 0 0.521317572 seconds time elapsed So how does the following perf iostat list work, did I miss something? Thanks, Qi Hello, The 'iostat' mode uses the alias mechanism in perf, the same as 'perf archive', and in this case you don't need to add a function callback into cmd_struct. For example, the command 'perf iostat list' will be converted to 'perf stat --iostat=list'. After building the perf tool you should have two shell scripts in the tools/perf directory and one of them is executable, for example: # make -C tools/perf # ls -l tools/perf/perf-iostat* -rwxr-xr-x 1 root root 290 Mar 10 18:17 perf-iostat -rw-r--r-- 1 root root 290 Feb 3 15:14 perf-iostat.sh It should be possible to run 'perf iostat' from the build directory: # cd tools/perf # ./perf iostat list S0-uncore_iio_0<:00> S1-uncore_iio_0<:80> S0-uncore_iio_1<:17> S1-uncore_iio_1<:85> S0-uncore_iio_2<:3a> S1-uncore_iio_2<:ae> S0-uncore_iio_3<:5d> S1-uncore_iio_3<:d7> Also you can copy 'perf-iostat' to ~/libexec/perf-core/ or just
Re: [PATCH v4 3/5] perf stat: Helper functions for PCIe root ports list in iostat mode
On 2/4/2021 3:32 PM, Namhyung Kim wrote: On Wed, Feb 3, 2021 at 10:58 PM Alexander Antonov wrote: Introduce helper functions to control PCIe root ports list. These helpers will be used in the follow-up patch. Signed-off-by: Alexander Antonov --- tools/perf/arch/x86/util/iostat.c | 124 ++ 1 file changed, 124 insertions(+) create mode 100644 tools/perf/arch/x86/util/iostat.c diff --git a/tools/perf/arch/x86/util/iostat.c b/tools/perf/arch/x86/util/iostat.c new file mode 100644 index ..961e540106e6 --- /dev/null +++ b/tools/perf/arch/x86/util/iostat.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * perf iostat + * + * Copyright (C) 2020, Intel Corporation + * + * Authors: Alexander Antonov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util/cpumap.h" +#include "util/debug.h" +#include "util/iostat.h" +#include "util/counts.h" +#include "path.h" + +struct iio_root_port { + u32 domain; + u8 bus; + u8 die; + u8 pmu_idx; + int idx; +}; + +struct iio_root_ports_list { + struct iio_root_port **rps; + int nr_entries; +}; + +static void iio_root_port_show(FILE *output, + const struct iio_root_port * const rp) +{ + if (output && rp) + fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n", + rp->die, rp->pmu_idx, rp->domain, rp->bus); +} + +static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus, + u8 die, u8 pmu_idx) +{ + struct iio_root_port *p = calloc(1, sizeof(*p)); + + if (p) { + p->domain = domain; + p->bus = bus; + p->die = die; + p->pmu_idx = pmu_idx; + } + return p; +} + +static struct iio_root_ports_list *iio_root_ports_list_new(void) +{ + struct iio_root_ports_list *list = calloc(1, sizeof(*list)); + + if (list) { + list->rps = calloc(1, sizeof(struct iio_root_port *)); This seems unnecessary now. Thanks, Namhyung Yes, you are right. Will be fixed. 
Thank you, Alexander + if (!list->rps) { + free(list); + list = NULL; + } + } + return list; +} + +static void iio_root_ports_list_free(struct iio_root_ports_list *list) +{ + int idx; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) + free(list->rps[idx]); + free(list->rps); + free(list); + } +} + +static struct iio_root_port *iio_root_port_find_by_notation( + const struct iio_root_ports_list * const list, u32 domain, u8 bus) +{ + int idx; + struct iio_root_port *rp; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) { + rp = list->rps[idx]; + if (rp && rp->domain == domain && rp->bus == bus) + return rp; + } + } + return NULL; +} + +static int iio_root_ports_list_insert(struct iio_root_ports_list *list, + struct iio_root_port * const rp) +{ + struct iio_root_port **tmp_buf; + + if (list && rp) { + rp->idx = list->nr_entries++; + tmp_buf = realloc(list->rps, + list->nr_entries * sizeof(*list->rps)); + if (!tmp_buf) { + pr_err("Failed to realloc memory\n"); + return -ENOMEM; + } + tmp_buf[rp->idx] = rp; + list->rps = tmp_buf; + } + return 0; +} -- 2.19.1
Re: [PATCH v4 2/5] perf stat: Basic support for iostat in perf
On 2/4/2021 3:22 PM, Namhyung Kim wrote: On Wed, Feb 3, 2021 at 10:58 PM Alexander Antonov wrote: Add basic flow for a new iostat mode in perf. Mode is intended to provide four I/O performance metrics per each PCIe root port: Inbound Read, Inbound Write, Outbound Read, Outbound Write. The actual code to compute the metrics and attribute it to root port is in follow-on patches. Signed-off-by: Alexander Antonov --- tools/perf/builtin-stat.c | 31 ++ tools/perf/util/iostat.h | 32 +++ tools/perf/util/stat-display.c | 40 +- tools/perf/util/stat-shadow.c | 11 +- tools/perf/util/stat.h | 1 + 5 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 tools/perf/util/iostat.h diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 60fdb6a0805f..66c913692120 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -65,6 +65,7 @@ #include "util/target.h" #include "util/time-utils.h" #include "util/top.h" +#include "util/iostat.h" #include "asm/bug.h" #include @@ -186,6 +187,7 @@ static struct perf_stat_config stat_config = { .metric_only_len= METRIC_ONLY_LEN, .walltime_nsecs_stats = &walltime_nsecs_stats, .big_num= true, + .iostat_run = false, }; static inline void diff_timespec(struct timespec *r, struct timespec *a, @@ -723,6 +725,14 @@ static int parse_metric_groups(const struct option *opt, return metricgroup__parse_groups(opt, str, &stat_config.metric_events); } +__weak int iostat_parse(const struct option *opt __maybe_unused, +const char *str __maybe_unused, +int unset __maybe_unused) +{ + pr_err("iostat mode is not supported\n"); + return -1; +} + static struct option stat_options[] = { OPT_BOOLEAN('T', "transaction", &transaction_run, "hardware transaction statistics"), @@ -803,6 +813,8 @@ static struct option stat_options[] = { OPT_CALLBACK('M', "metrics", &evsel_list, "metric/metric group list", "monitor specified metrics or metric groups (separated by ,)", parse_metric_groups), + OPT_CALLBACK_OPTARG(0, "iostat", 
&evsel_list, &stat_config, "root port", + "measure PCIe metrics per root port", iostat_parse), Can we make the help string and default argument more generic? Something like "measure IO metrics provided by arch/platform" and the default value being "default". :) Do you mean using "default" instead of "root port"? What about the faceless "I/O unit"? :) OPT_END() }; @@ -1131,6 +1143,12 @@ __weak void arch_topdown_group_warn(void) { } +__weak int iostat_list(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused) +{ + return 0; +} + /* * Add default attributes, if there were no attributes specified or * if -d/--detailed, -d -d or -d -d -d is used: @@ -1682,6 +1700,10 @@ static void setup_system_wide(int forks) } } +__weak void iostat_release(struct evlist *evlist __maybe_unused) +{ +} + int cmd_stat(int argc, const char **argv) { const char * const stat_usage[] = { @@ -1858,6 +1880,12 @@ int cmd_stat(int argc, const char **argv) goto out; } + if (stat_config.iostat_run) { + status = iostat_list(evsel_list, &stat_config); I think it's unnatural to call iostat_list() unconditionally here. How about this? status = iostat_prepare(...); if (status < 0) goto out; if (status == IOSTAT_LIST) iostat_list(...); else ... I think it's applicable. In case of 'list' option we will just print list of root ports and exit. Also listing of root ports is available in verbose mode. In this case we will print list and start the collection. + if (status || !stat_config.iostat_run) + goto out; + } + if (add_default_attributes()) goto out; @@ -2008,6 +2036,9 @@ int cmd_stat(int argc, const char **argv) perf_stat__exit_aggr_mode(); perf_evlist__free_stats(evsel_list); out: + if (stat_config.iostat_run) + iostat_release(evsel_list); + zfree(&stat_config.walltime_run); if (smi_cost && smi_reset) diff --git a/tools/perf/util/iostat.h b/tools/perf/util/iostat.h new file mode 100644 index ..b34ebedfd5e6 --- /dev/null +++ b/tools/perf/util/iostat.h @@ -0,0 +1,32 @
Re: [PATCH v4 1/5] perf stat: Add AGGR_PCIE_PORT mode
On 2/4/2021 3:07 PM, Namhyung Kim wrote: Hello, On Wed, Feb 3, 2021 at 10:58 PM Alexander Antonov wrote: Adding AGGR_PCIE_PORT mode to be able to distinguish aggr_mode for root ports in following patches. I'm not sure adding the AGGR_PCIE_PORT is the right way. In my understanding, the aggr mode is to specify how we aggregate counter values of a single event from different cpus. But this seems to aggregate counter values from different events. Also the new mode is basically the same as AGGR_GLOBAL. As you will add stat_config.iostat_run to distinguish the iostat command, probably we just want to use the global aggr mode (and it's the default!) and get rid of the AGGR_PCIE_PORT. Thoughts? Thanks, Namhyung Hello Namhyung, Actually, you are right. We aggregate counter values from different events of a single IIO stack (PCIe root port) to calculate metrics for this IO stack. But the reason is to prevent the use of the '-e' and '-M' options in 'iostat' mode, because they can make a mess of the output and confuse users. There is an idea to use your suggestion for this part: status = iostat_prepare(...); if (status < 0) goto out; if (status == IOSTAT_LIST) iostat_list(...); else ... So, we can check if evlist is empty inside iostat_prepare(). If not, print a warning, for example, "The -e and -M options are not supported. All chosen events/metrics will be dropped". Then we can free the evlist by using evlist__delete(), create a new one by using evlist__new() and fill the evlist. In this case the body of the iostat_prepare() function would be: iostat_prepare() { if (!is_evlist_empty) { pr_warning(); evlist__delete(); evlist__new() } iostat_event_group(); } It will allow us to get rid of the AGGR_PCIE_PORT. What do you think? Thank you, Alexander
[PATCH v4 5/5] perf: Update .gitignore file
After a "make -C tools/perf", git reports the following untracked file: perf-iostat Add this generated file to perf's .gitignore file. Signed-off-by: Alexander Antonov --- tools/perf/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index bf1252dc2cb0..421f27e2b9af 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -19,6 +19,7 @@ perf.data.old output.svg perf-archive perf-with-kcore +perf-iostat tags TAGS cscope* -- 2.19.1
[PATCH v4 4/5] perf stat: Enable iostat mode for x86 platforms
This functionality is based on recently introduced sysfs attributes for Intel® Xeon® Scalable processor family (code name Skylake-SP): Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping") Mode is intended to provide four I/O performance metrics in MB per each PCIe root port: - Inbound Read: I/O devices below root port read from the host memory - Inbound Write: I/O devices below root port write to the host memory - Outbound Read: CPU reads from I/O devices below root port - Outbound Write: CPU writes to I/O devices below root port Each metric requires only one uncore event which increments at every 4B transfer in corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Signed-off-by: Alexander Antonov --- tools/perf/Documentation/perf-iostat.txt | 88 ++ tools/perf/Makefile.perf | 5 +- tools/perf/arch/x86/util/Build | 1 + tools/perf/arch/x86/util/iostat.c| 345 +++ tools/perf/command-list.txt | 1 + tools/perf/perf-iostat.sh| 12 + 6 files changed, 451 insertions(+), 1 deletion(-) create mode 100644 tools/perf/Documentation/perf-iostat.txt create mode 100644 tools/perf/perf-iostat.sh diff --git a/tools/perf/Documentation/perf-iostat.txt b/tools/perf/Documentation/perf-iostat.txt new file mode 100644 index ..165176944031 --- /dev/null +++ b/tools/perf/Documentation/perf-iostat.txt @@ -0,0 +1,88 @@ +perf-iostat(1) +=== + +NAME + +perf-iostat - Show I/O performance metrics + +SYNOPSIS + +[verse] +'perf iostat' list +'perf iostat' -- [] + +DESCRIPTION +--- +Mode is intended to provide four I/O performance metrics per each PCIe root port: + +- Inbound Read - I/O devices below root port read from the host memory, in MB + +- Inbound Write - I/O devices below root port write to the host memory, in MB + +- Outbound Read - CPU reads from I/O devices below root port, in MB + +- Outbound Write - CPU writes to I/O devices below root port, in MB + +OPTIONS +--- +...:: + Any command you can specify in a 
shell. + +list:: + List all PCIe root ports. + +:: + Select the root ports for monitoring. Comma-separated list is supported. + +EXAMPLES + + +1. List all PCIe root ports (example for 2-S platform): + + $ perf iostat list + S0-uncore_iio_0<:00> + S1-uncore_iio_0<:80> + S0-uncore_iio_1<:17> + S1-uncore_iio_1<:85> + S0-uncore_iio_2<:3a> + S1-uncore_iio_2<:ae> + S0-uncore_iio_3<:5d> + S1-uncore_iio_3<:d7> + +2. Collect metrics for all PCIe root ports: + + $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s + +Performance counter stats for 'system wide': + + port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) + :00102 3 + :80000 0 + :17 352552 430 21 + :85000 0 + :3a300 0 + :ae000 0 + :5d000 0 + :d7000 0 + +3. Collect metrics for comma-separated list of PCIe root ports: + + $ perf iostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s + +Performance counter stats for 'system wide': + + port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) + :17 358559 440 22 + :3a320 0 + +197.081983474 seconds time elapsed + +SEE ALSO + +linkperf:perf-stat[1] \ No newline at end of file diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 902c792f326a..b4ab48cc01e3 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -267,6 +267,7 @@ SCRIPT_SH = SCRIPT_SH += perf-archive.sh SCRIPT_SH += perf-with-kcore.sh +SCRIPT_SH += perf-iostat.sh grep-libs = $(filter -l%,$(1))
[PATCH v4 3/5] perf stat: Helper functions for PCIe root ports list in iostat mode
Introduce helper functions to control PCIe root ports list. These helpers will be used in the follow-up patch. Signed-off-by: Alexander Antonov --- tools/perf/arch/x86/util/iostat.c | 124 ++ 1 file changed, 124 insertions(+) create mode 100644 tools/perf/arch/x86/util/iostat.c diff --git a/tools/perf/arch/x86/util/iostat.c b/tools/perf/arch/x86/util/iostat.c new file mode 100644 index ..961e540106e6 --- /dev/null +++ b/tools/perf/arch/x86/util/iostat.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * perf iostat + * + * Copyright (C) 2020, Intel Corporation + * + * Authors: Alexander Antonov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util/cpumap.h" +#include "util/debug.h" +#include "util/iostat.h" +#include "util/counts.h" +#include "path.h" + +struct iio_root_port { + u32 domain; + u8 bus; + u8 die; + u8 pmu_idx; + int idx; +}; + +struct iio_root_ports_list { + struct iio_root_port **rps; + int nr_entries; +}; + +static void iio_root_port_show(FILE *output, + const struct iio_root_port * const rp) +{ + if (output && rp) + fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n", + rp->die, rp->pmu_idx, rp->domain, rp->bus); +} + +static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus, + u8 die, u8 pmu_idx) +{ + struct iio_root_port *p = calloc(1, sizeof(*p)); + + if (p) { + p->domain = domain; + p->bus = bus; + p->die = die; + p->pmu_idx = pmu_idx; + } + return p; +} + +static struct iio_root_ports_list *iio_root_ports_list_new(void) +{ + struct iio_root_ports_list *list = calloc(1, sizeof(*list)); + + if (list) { + list->rps = calloc(1, sizeof(struct iio_root_port *)); + if (!list->rps) { + free(list); + list = NULL; + } + } + return list; +} + +static void iio_root_ports_list_free(struct iio_root_ports_list *list) +{ + int idx; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) + free(list->rps[idx]); + free(list->rps); 
+ free(list); + } +} + +static struct iio_root_port *iio_root_port_find_by_notation( + const struct iio_root_ports_list * const list, u32 domain, u8 bus) +{ + int idx; + struct iio_root_port *rp; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) { + rp = list->rps[idx]; + if (rp && rp->domain == domain && rp->bus == bus) + return rp; + } + } + return NULL; +} + +static int iio_root_ports_list_insert(struct iio_root_ports_list *list, + struct iio_root_port * const rp) +{ + struct iio_root_port **tmp_buf; + + if (list && rp) { + rp->idx = list->nr_entries++; + tmp_buf = realloc(list->rps, + list->nr_entries * sizeof(*list->rps)); + if (!tmp_buf) { + pr_err("Failed to realloc memory\n"); + return -ENOMEM; + } + tmp_buf[rp->idx] = rp; + list->rps = tmp_buf; + } + return 0; +} -- 2.19.1
[PATCH v4 2/5] perf stat: Basic support for iostat in perf
Add basic flow for a new iostat mode in perf. Mode is intended to provide four I/O performance metrics per each PCIe root port: Inbound Read, Inbound Write, Outbound Read, Outbound Write. The actual code to compute the metrics and attribute it to root port is in follow-on patches. Signed-off-by: Alexander Antonov --- tools/perf/builtin-stat.c | 31 ++ tools/perf/util/iostat.h | 32 +++ tools/perf/util/stat-display.c | 40 +- tools/perf/util/stat-shadow.c | 11 +- tools/perf/util/stat.h | 1 + 5 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 tools/perf/util/iostat.h diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 60fdb6a0805f..66c913692120 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -65,6 +65,7 @@ #include "util/target.h" #include "util/time-utils.h" #include "util/top.h" +#include "util/iostat.h" #include "asm/bug.h" #include @@ -186,6 +187,7 @@ static struct perf_stat_config stat_config = { .metric_only_len= METRIC_ONLY_LEN, .walltime_nsecs_stats = &walltime_nsecs_stats, .big_num= true, + .iostat_run = false, }; static inline void diff_timespec(struct timespec *r, struct timespec *a, @@ -723,6 +725,14 @@ static int parse_metric_groups(const struct option *opt, return metricgroup__parse_groups(opt, str, &stat_config.metric_events); } +__weak int iostat_parse(const struct option *opt __maybe_unused, +const char *str __maybe_unused, +int unset __maybe_unused) +{ + pr_err("iostat mode is not supported\n"); + return -1; +} + static struct option stat_options[] = { OPT_BOOLEAN('T', "transaction", &transaction_run, "hardware transaction statistics"), @@ -803,6 +813,8 @@ static struct option stat_options[] = { OPT_CALLBACK('M', "metrics", &evsel_list, "metric/metric group list", "monitor specified metrics or metric groups (separated by ,)", parse_metric_groups), + OPT_CALLBACK_OPTARG(0, "iostat", &evsel_list, &stat_config, "root port", + "measure PCIe metrics per root port", iostat_parse), OPT_END() }; 
@@ -1131,6 +1143,12 @@ __weak void arch_topdown_group_warn(void) { } +__weak int iostat_list(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused) +{ + return 0; +} + /* * Add default attributes, if there were no attributes specified or * if -d/--detailed, -d -d or -d -d -d is used: @@ -1682,6 +1700,10 @@ static void setup_system_wide(int forks) } } +__weak void iostat_release(struct evlist *evlist __maybe_unused) +{ +} + int cmd_stat(int argc, const char **argv) { const char * const stat_usage[] = { @@ -1858,6 +1880,12 @@ int cmd_stat(int argc, const char **argv) goto out; } + if (stat_config.iostat_run) { + status = iostat_list(evsel_list, &stat_config); + if (status || !stat_config.iostat_run) + goto out; + } + if (add_default_attributes()) goto out; @@ -2008,6 +2036,9 @@ int cmd_stat(int argc, const char **argv) perf_stat__exit_aggr_mode(); perf_evlist__free_stats(evsel_list); out: + if (stat_config.iostat_run) + iostat_release(evsel_list); + zfree(&stat_config.walltime_run); if (smi_cost && smi_reset) diff --git a/tools/perf/util/iostat.h b/tools/perf/util/iostat.h new file mode 100644 index ..b34ebedfd5e6 --- /dev/null +++ b/tools/perf/util/iostat.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * perf iostat + * + * Copyright (C) 2020, Intel Corporation + * + * Authors: Alexander Antonov + */ + +#ifndef _IOSTAT_H +#define _IOSTAT_H + +#include +#include "util/stat.h" +#include "util/parse-events.h" +#include "util/evlist.h" + +struct option; +struct perf_stat_config; +struct evlist; +struct timespec; + +int iostat_parse(const struct option *opt, const char *str, +int unset __maybe_unused); +void iostat_prefix(struct perf_stat_config *config, struct evlist *evlist, + char *prefix, struct timespec *ts); +void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel, +struct perf_stat_output_ctx *out); +int iostat_list(struct evlist *evlist, struct perf_stat_config *config); +void 
iostat_release(struct evlist *evlist); + +#endif /* _IOSTAT_H */ diff --git a/tools/p
[PATCH v4 0/5] perf stat: Introduce iostat mode to provide I/O performance metrics
The previous version can be found at: v3: https://lkml.kernel.org/r/20210126080619.30275-1-alexander.anto...@linux.intel.com/ Changes in this revision are: v3 -> v4: - Addressed comment from Namhyung Kim: 1. Removed NULL-termination of root ports list The previous version can be found at: v2: https://lkml.kernel.org/r/20201223130320.3930-1-alexander.anto...@linux.intel.com Changes in this revision are: v2 -> v3: - Addressed comments from Namhyung Kim: 1. Removed perf_device pointer from evsel structure. Use priv field instead 2. Renamed 'iiostat' to 'iostat' 3. Renamed 'show' mode to 'list' mode 4. Renamed iiostat_delete_root_ports() to iostat_release() and iiostat_show_root_ports() to iostat_list() The previous version can be found at: v1: https://lkml.kernel.org/r/20201210090340.14358-1-alexander.anto...@linux.intel.com Changes in this revision are: v1 -> v2: - Addressed comment from Arnaldo Carvalho de Melo: 1. Using 'perf iiostat' subcommand instead of 'perf stat --iiostat': - Added perf-iiostat.sh script to use short command - Updated manual pages to get help for 'perf iiostat' - Added 'perf-iiostat' to perf's gitignore file Mode is intended to provide four I/O performance metrics in MB per each root port: - Inbound Read: I/O devices below root port read from the host memory - Inbound Write: I/O devices below root port write to the host memory - Outbound Read: CPU reads from I/O devices below root port - Outbound Write: CPU writes to I/O devices below root port Each metric requires only one uncore event which increments at every 4B transfer in the corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Note: iostat introduces a new perf data aggregation mode - per PCIe root port - hence -e and -M options are not supported. Usage examples: 1. 
List all PCIe root ports (example for 2-S platform): $ perf iostat list S0-uncore_iio_0<:00> S1-uncore_iio_0<:80> S0-uncore_iio_1<:17> S1-uncore_iio_1<:85> S0-uncore_iio_2<:3a> S1-uncore_iio_2<:ae> S0-uncore_iio_3<:5d> S1-uncore_iio_3<:d7> 2. Collect metrics for all PCIe root ports: $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct 357708+0 records in 357707+0 records out 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s Performance counter stats for 'system wide': port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) :00102 3 :80000 0 :17 352552 430 21 :85000 0 :3a300 0 :ae000 0 :5d000 0 :d7000 0 3. Collect metrics for comma separated list of PCIe root ports: $ perf iostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct 357708+0 records in 357707+0 records out 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s Performance counter stats for 'system wide': port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) :17 358559 440 22 :3a320 0 197.081983474 seconds time elapsed Alexander Antonov (5): perf stat: Add AGGR_PCIE_PORT mode perf stat: Basic support for iostat in perf perf stat: Helper functions for PCIe root ports list in iostat mode perf stat: Enable iostat mode for x86 platforms perf: Update .gitignore file tools/perf/.gitignore | 1 + tools/perf/Documentation/perf-iostat.txt | 88 tools/perf/Makefile.perf | 5 +- tools/perf/arch/x86/util/Build| 1 + tools/perf/arch/x86/util/iostat.c | 469 ++ tools/perf/builtin-stat.c | 36 +- tools/perf/command-list.txt | 1 + tools/perf/perf-iostat.sh | 12 + tools/perf/util/iostat.h | 32 ++ .../scripting-engines/trace-event-python.c| 3 +- tools/perf/
[PATCH v4 1/5] perf stat: Add AGGR_PCIE_PORT mode
Adding AGGR_PCIE_PORT mode to be able to distinguish aggr_mode for root ports in following patches. Signed-off-by: Alexander Antonov --- tools/perf/builtin-stat.c | 5 - .../util/scripting-engines/trace-event-python.c | 3 ++- tools/perf/util/stat-display.c | 13 +++-- tools/perf/util/stat.c | 4 +++- tools/perf/util/stat.h | 1 + 5 files changed, 21 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 468fc49420ce..60fdb6a0805f 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -908,6 +908,7 @@ static int perf_stat_init_aggr_mode(void) break; case AGGR_GLOBAL: case AGGR_THREAD: + case AGGR_PCIE_PORT: case AGGR_UNSET: default: break; @@ -1072,6 +1073,7 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st) case AGGR_NONE: case AGGR_GLOBAL: case AGGR_THREAD: + case AGGR_PCIE_PORT: case AGGR_UNSET: default: break; @@ -1844,7 +1846,8 @@ int cmd_stat(int argc, const char **argv) * --per-thread is aggregated per thread, we dont mix it with cpu mode */ if (((stat_config.aggr_mode != AGGR_GLOBAL && - stat_config.aggr_mode != AGGR_THREAD) || nr_cgroups) && + stat_config.aggr_mode != AGGR_THREAD && + stat_config.aggr_mode != AGGR_PCIE_PORT) || nr_cgroups) && !target__has_cpu(&target)) { fprintf(stderr, "both cgroup and no-aggregation " "modes only available in system-wide mode\n"); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 5d341efc3237..e604c199f493 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1396,7 +1396,8 @@ static void python_process_stat(struct perf_stat_config *config, struct perf_cpu_map *cpus = counter->core.cpus; int cpu, thread; - if (config->aggr_mode == AGGR_GLOBAL) { + if (config->aggr_mode == AGGR_GLOBAL || + config->aggr_mode == AGGR_PCIE_PORT) { process_stat(counter, -1, -1, tstamp, &counter->counts->aggr); 
return; diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index ed3b0ac2f785..db1bec115d0b 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -123,6 +123,7 @@ static void aggr_printout(struct perf_stat_config *config, config->csv_sep); break; case AGGR_GLOBAL: + case AGGR_PCIE_PORT: case AGGR_UNSET: default: break; @@ -322,7 +323,8 @@ static int first_shadow_cpu(struct perf_stat_config *config, if (config->aggr_mode == AGGR_NONE) return id; - if (config->aggr_mode == AGGR_GLOBAL) + if (config->aggr_mode == AGGR_GLOBAL || + config->aggr_mode == AGGR_PCIE_PORT) return 0; for (i = 0; i < perf_evsel__nr_cpus(evsel); i++) { @@ -416,6 +418,7 @@ static void printout(struct perf_stat_config *config, int id, int nr, if (config->csv_output && !config->metric_only) { static int aggr_fields[] = { [AGGR_GLOBAL] = 0, + [AGGR_PCIE_PORT] = 0, [AGGR_THREAD] = 1, [AGGR_NONE] = 1, [AGGR_SOCKET] = 2, @@ -899,6 +902,7 @@ static int aggr_header_lens[] = { [AGGR_NONE] = 6, [AGGR_THREAD] = 24, [AGGR_GLOBAL] = 0, + [AGGR_PCIE_PORT] = 0, }; static const char *aggr_header_csv[] = { @@ -907,7 +911,8 @@ static const char *aggr_header_csv[] = { [AGGR_SOCKET] = "socket,cpus", [AGGR_NONE] = "cpu,", [AGGR_THREAD] = "comm-pid,", - [AGGR_GLOBAL] = "" + [AGGR_GLOBAL] = "", + [AGGR_PCIE_PORT] = "port," }; static void print_metric_headers(struct perf_stat_config *config, @@ -990,6 +995,8 @@ static void print_interval(struct perf_stat_config *config, if (!metric_only) fprintf(output, " counts %*s events\n", unit_width, "unit"); break; + case AGGR_PCIE_PORT: + break; case AGGR_GLOBAL: default: fprintf(output, "# time"); @@ -1214,6 +122
Re: [PATCH v3 3/5] perf stat: Helper functions for PCIe root ports list in iostat mode
On 1/29/2021 11:26 AM, Namhyung Kim wrote: Hello, On Tue, Jan 26, 2021 at 5:06 PM Alexander Antonov wrote: Introduce helper functions to control PCIe root ports list. These helpers will be used in the follow-up patch. Signed-off-by: Alexander Antonov --- [SNIP] +static int iio_root_ports_list_insert(struct iio_root_ports_list *list, + struct iio_root_port * const rp) +{ + struct iio_root_port **tmp_buf; + + if (list && rp) { + rp->idx = list->nr_entries++; + /* One more for NULL.*/ + tmp_buf = realloc(list->rps, + (list->nr_entries + 1) * sizeof(*list->rps)); Why is this +1 needed since you already have the number of entries in the list? Thanks, Namhyung Hello, My first approach for iteration through root ports list was using NULL-terminated array. And seems like I just forgot to remove this code. I will fix it. Thank you, Alexander + if (!tmp_buf) { + pr_err("Failed to realloc memory\n"); + return -ENOMEM; + } + tmp_buf[rp->idx] = rp; + tmp_buf[list->nr_entries] = NULL; + list->rps = tmp_buf; + } + return 0; +} -- 2.19.1
[PATCH v3 3/5] perf stat: Helper functions for PCIe root ports list in iostat mode
Introduce helper functions to control PCIe root ports list. These helpers will be used in the follow-up patch. Signed-off-by: Alexander Antonov --- tools/perf/arch/x86/util/iostat.c | 127 ++ 1 file changed, 127 insertions(+) create mode 100644 tools/perf/arch/x86/util/iostat.c diff --git a/tools/perf/arch/x86/util/iostat.c b/tools/perf/arch/x86/util/iostat.c new file mode 100644 index ..3ef727f9da63 --- /dev/null +++ b/tools/perf/arch/x86/util/iostat.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * perf iostat + * + * Copyright (C) 2020, Intel Corporation + * + * Authors: Alexander Antonov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util/cpumap.h" +#include "util/debug.h" +#include "util/iostat.h" +#include "util/counts.h" +#include "path.h" + +struct iio_root_port { + u32 domain; + u8 bus; + u8 die; + u8 pmu_idx; + int idx; +}; + +struct iio_root_ports_list { + struct iio_root_port **rps; + int nr_entries; +}; + +static void iio_root_port_show(FILE *output, + const struct iio_root_port * const rp) +{ + if (output && rp) + fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n", + rp->die, rp->pmu_idx, rp->domain, rp->bus); +} + +static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus, + u8 die, u8 pmu_idx) +{ + struct iio_root_port *p = calloc(1, sizeof(*p)); + + if (p) { + p->domain = domain; + p->bus = bus; + p->die = die; + p->pmu_idx = pmu_idx; + } + return p; +} + +static struct iio_root_ports_list *iio_root_ports_list_new(void) +{ + struct iio_root_ports_list *list = calloc(1, sizeof(*list)); + + if (list) { + list->rps = calloc(1, sizeof(struct iio_root_port *)); + if (!list->rps) { + free(list); + list = NULL; + } + } + + return list; +} + +static void iio_root_ports_list_free(struct iio_root_ports_list *list) +{ + int idx; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) + free(list->rps[idx]); + 
free(list->rps); + free(list); + } +} + +static struct iio_root_port *iio_root_port_find_by_notation( + const struct iio_root_ports_list * const list, u32 domain, u8 bus) +{ + int idx; + struct iio_root_port *rp; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) { + rp = list->rps[idx]; + if (rp && rp->domain == domain && rp->bus == bus) + return rp; + } + } + return NULL; +} + +static int iio_root_ports_list_insert(struct iio_root_ports_list *list, + struct iio_root_port * const rp) +{ + struct iio_root_port **tmp_buf; + + if (list && rp) { + rp->idx = list->nr_entries++; + /* One more for NULL.*/ + tmp_buf = realloc(list->rps, + (list->nr_entries + 1) * sizeof(*list->rps)); + if (!tmp_buf) { + pr_err("Failed to realloc memory\n"); + return -ENOMEM; + } + tmp_buf[rp->idx] = rp; + tmp_buf[list->nr_entries] = NULL; + list->rps = tmp_buf; + } + return 0; +} -- 2.19.1
[PATCH v3 5/5] perf: Update .gitignore file
After a "make -C tools/perf", git reports the following untracked file: perf-iostat Add this generated file to perf's .gitignore file. Signed-off-by: Alexander Antonov --- tools/perf/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index bf1252dc2cb0..421f27e2b9af 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -19,6 +19,7 @@ perf.data.old output.svg perf-archive perf-with-kcore +perf-iostat tags TAGS cscope* -- 2.19.1
[PATCH v3 4/5] perf stat: Enable iostat mode for x86 platforms
This functionality is based on recently introduced sysfs attributes for Intel® Xeon® Scalable processor family (code name Skylake-SP): Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping") Mode is intended to provide four I/O performance metrics in MB per each PCIe root port: - Inbound Read: I/O devices below root port read from the host memory - Inbound Write: I/O devices below root port write to the host memory - Outbound Read: CPU reads from I/O devices below root port - Outbound Write: CPU writes to I/O devices below root port Each metric requiries only one uncore event which increments at every 4B transfer in corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Signed-off-by: Alexander Antonov --- tools/perf/Documentation/perf-iostat.txt | 88 ++ tools/perf/Makefile.perf | 5 +- tools/perf/arch/x86/util/Build | 1 + tools/perf/arch/x86/util/iostat.c| 345 +++ tools/perf/command-list.txt | 1 + tools/perf/perf-iostat.sh| 12 + 6 files changed, 451 insertions(+), 1 deletion(-) create mode 100644 tools/perf/Documentation/perf-iostat.txt create mode 100644 tools/perf/perf-iostat.sh diff --git a/tools/perf/Documentation/perf-iostat.txt b/tools/perf/Documentation/perf-iostat.txt new file mode 100644 index ..165176944031 --- /dev/null +++ b/tools/perf/Documentation/perf-iostat.txt @@ -0,0 +1,88 @@ +perf-iostat(1) +=== + +NAME + +perf-iostat - Show I/O performance metrics + +SYNOPSIS + +[verse] +'perf iostat' list +'perf iostat' -- [] + +DESCRIPTION +--- +Mode is intended to provide four I/O performance metrics per each PCIe root port: + +- Inbound Read - I/O devices below root port read from the host memory, in MB + +- Inbound Write - I/O devices below root port write to the host memory, in MB + +- Outbound Read - CPU reads from I/O devices below root port, in MB + +- Outbound Write - CPU writes to I/O devices below root port, in MB + +OPTIONS +--- +...:: + Any command you can specify in a 
shell. + +list:: + List all PCIe root ports. + +:: + Select the root ports for monitoring. Comma-separated list is supported. + +EXAMPLES + + +1. List all PCIe root ports (example for 2-S platform): + + $ perf iostat list + S0-uncore_iio_0<:00> + S1-uncore_iio_0<:80> + S0-uncore_iio_1<:17> + S1-uncore_iio_1<:85> + S0-uncore_iio_2<:3a> + S1-uncore_iio_2<:ae> + S0-uncore_iio_3<:5d> + S1-uncore_iio_3<:d7> + +2. Collect metrics for all PCIe root ports: + + $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s + +Performance counter stats for 'system wide': + + port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) + :00102 3 + :80000 0 + :17 352552 430 21 + :85000 0 + :3a300 0 + :ae000 0 + :5d000 0 + :d7000 0 + +3. Collect metrics for comma-separated list of PCIe root ports: + + $ perf iostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s + +Performance counter stats for 'system wide': + + port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) + :17 358559 440 22 + :3a320 0 + +197.081983474 seconds time elapsed + +SEE ALSO + +linkperf:perf-stat[1] \ No newline at end of file diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 902c792f326a..b4ab48cc01e3 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -267,6 +267,7 @@ SCRIPT_SH = SCRIPT_SH += perf-archive.sh SCRIPT_SH += perf-with-kcore.sh +SCRIPT_SH += perf-iostat.sh grep-libs = $(filter -l%,$(1))
[PATCH v3 2/5] perf stat: Basic support for iostat in perf
Add basic flow for a new iostat mode in perf. Mode is intended to provide four I/O performance metrics per each PCIe root port: Inbound Read, Inbound Write, Outbound Read, Outbound Write. The actual code to compute the metrics and attribute it to root port is in follow-on patches. Signed-off-by: Alexander Antonov --- tools/perf/builtin-stat.c | 31 ++ tools/perf/util/iostat.h | 32 +++ tools/perf/util/stat-display.c | 40 +- tools/perf/util/stat-shadow.c | 11 +- tools/perf/util/stat.h | 1 + 5 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 tools/perf/util/iostat.h diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 60fdb6a0805f..66c913692120 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -65,6 +65,7 @@ #include "util/target.h" #include "util/time-utils.h" #include "util/top.h" +#include "util/iostat.h" #include "asm/bug.h" #include @@ -186,6 +187,7 @@ static struct perf_stat_config stat_config = { .metric_only_len= METRIC_ONLY_LEN, .walltime_nsecs_stats = &walltime_nsecs_stats, .big_num= true, + .iostat_run = false, }; static inline void diff_timespec(struct timespec *r, struct timespec *a, @@ -723,6 +725,14 @@ static int parse_metric_groups(const struct option *opt, return metricgroup__parse_groups(opt, str, &stat_config.metric_events); } +__weak int iostat_parse(const struct option *opt __maybe_unused, +const char *str __maybe_unused, +int unset __maybe_unused) +{ + pr_err("iostat mode is not supported\n"); + return -1; +} + static struct option stat_options[] = { OPT_BOOLEAN('T', "transaction", &transaction_run, "hardware transaction statistics"), @@ -803,6 +813,8 @@ static struct option stat_options[] = { OPT_CALLBACK('M', "metrics", &evsel_list, "metric/metric group list", "monitor specified metrics or metric groups (separated by ,)", parse_metric_groups), + OPT_CALLBACK_OPTARG(0, "iostat", &evsel_list, &stat_config, "root port", + "measure PCIe metrics per root port", iostat_parse), OPT_END() }; 
@@ -1131,6 +1143,12 @@ __weak void arch_topdown_group_warn(void) { } +__weak int iostat_list(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused) +{ + return 0; +} + /* * Add default attributes, if there were no attributes specified or * if -d/--detailed, -d -d or -d -d -d is used: @@ -1682,6 +1700,10 @@ static void setup_system_wide(int forks) } } +__weak void iostat_release(struct evlist *evlist __maybe_unused) +{ +} + int cmd_stat(int argc, const char **argv) { const char * const stat_usage[] = { @@ -1858,6 +1880,12 @@ int cmd_stat(int argc, const char **argv) goto out; } + if (stat_config.iostat_run) { + status = iostat_list(evsel_list, &stat_config); + if (status || !stat_config.iostat_run) + goto out; + } + if (add_default_attributes()) goto out; @@ -2008,6 +2036,9 @@ int cmd_stat(int argc, const char **argv) perf_stat__exit_aggr_mode(); perf_evlist__free_stats(evsel_list); out: + if (stat_config.iostat_run) + iostat_release(evsel_list); + zfree(&stat_config.walltime_run); if (smi_cost && smi_reset) diff --git a/tools/perf/util/iostat.h b/tools/perf/util/iostat.h new file mode 100644 index ..b34ebedfd5e6 --- /dev/null +++ b/tools/perf/util/iostat.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * perf iostat + * + * Copyright (C) 2020, Intel Corporation + * + * Authors: Alexander Antonov + */ + +#ifndef _IOSTAT_H +#define _IOSTAT_H + +#include +#include "util/stat.h" +#include "util/parse-events.h" +#include "util/evlist.h" + +struct option; +struct perf_stat_config; +struct evlist; +struct timespec; + +int iostat_parse(const struct option *opt, const char *str, +int unset __maybe_unused); +void iostat_prefix(struct perf_stat_config *config, struct evlist *evlist, + char *prefix, struct timespec *ts); +void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel, +struct perf_stat_output_ctx *out); +int iostat_list(struct evlist *evlist, struct perf_stat_config *config); +void 
iostat_release(struct evlist *evlist); + +#endif /* _IOSTAT_H */ diff --git a/tools/p
[PATCH v3 0/5] perf stat: Introduce iostat mode to provide I/O performance metrics
The previous version can be found at: v2: https://lkml.kernel.org/r/20201223130320.3930-1-alexander.anto...@linux.intel.com Changes in this revision are: v2 -> v3: - Addressed comments from Namhyung Kim: 1. Removed perf_device pointer from evsel structure. Use priv field instead 2. Renamed 'iiostat' to 'iostat' 3. Renamed 'show' mode to 'list' mode 4. Renamed iiostat_delete_root_ports() to iostat_release() and iiostat_show_root_ports() to iostat_list() The previous version can be found at: v1: https://lkml.kernel.org/r/20201210090340.14358-1-alexander.anto...@linux.intel.com Changes in this revision are: v1 -> v2: - Addressed comment from Arnaldo Carvalho de Melo: 1. Using 'perf iiostat' subcommand instead of 'perf stat --iiostat': - Added perf-iiostat.sh script to use short command - Updated manual pages to get help for 'perf iiostat' - Added 'perf-iiostat' to perf's gitignore file Mode is intended to provide four I/O performance metrics in MB per each root port: - Inbound Read: I/O devices below root port read from the host memory - Inbound Write: I/O devices below root port write to the host memory - Outbound Read: CPU reads from I/O devices below root port - Outbound Write: CPU writes to I/O devices below root port Each metric requires only one uncore event which increments at every 4B transfer in the corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Note: iostat introduces a new perf data aggregation mode - per PCIe root port - hence -e and -M options are not supported. Usage examples: 1. List all PCIe root ports (example for 2-S platform): $ perf iostat list S0-uncore_iio_0<:00> S1-uncore_iio_0<:80> S0-uncore_iio_1<:17> S1-uncore_iio_1<:85> S0-uncore_iio_2<:3a> S1-uncore_iio_2<:ae> S0-uncore_iio_3<:5d> S1-uncore_iio_3<:d7> 2. 
Collect metrics for all PCIe root ports: $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct 357708+0 records in 357707+0 records out 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s Performance counter stats for 'system wide': port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) :00102 3 :80000 0 :17 352552 430 21 :85000 0 :3a300 0 :ae000 0 :5d000 0 :d7000 0 3. Collect metrics for comma separated list of PCIe root ports: $ perf iostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct 357708+0 records in 357707+0 records out 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s Performance counter stats for 'system wide': port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) :17 358559 440 22 :3a320 0 197.081983474 seconds time elapsed Alexander Antonov (5): perf stat: Add AGGR_PCIE_PORT mode perf stat: Basic support for iostat in perf perf stat: Helper functions for PCIe root ports list in iostat mode perf stat: Enable iostat mode for x86 platforms perf: Update .gitignore file tools/perf/.gitignore | 1 + tools/perf/Documentation/perf-iostat.txt | 88 tools/perf/Makefile.perf | 5 +- tools/perf/arch/x86/util/Build| 1 + tools/perf/arch/x86/util/iostat.c | 472 ++ tools/perf/builtin-stat.c | 36 +- tools/perf/command-list.txt | 1 + tools/perf/perf-iostat.sh | 12 + tools/perf/util/iostat.h | 32 ++ .../scripting-engines/trace-event-python.c| 3 +- tools/perf/util/stat-display.c| 53 +- tools/perf/util/stat-shadow.c | 11 +- tools/perf/util/stat.c| 4 +- tools/perf/util/stat.h| 2 + 14 files changed, 713 insertions(+), 8 dele
[PATCH v3 1/5] perf stat: Add AGGR_PCIE_PORT mode
Adding AGGR_PCIE_PORT mode to be able to distinguish aggr_mode for root ports in following patches. Signed-off-by: Alexander Antonov --- tools/perf/builtin-stat.c | 5 - .../util/scripting-engines/trace-event-python.c | 3 ++- tools/perf/util/stat-display.c | 13 +++-- tools/perf/util/stat.c | 4 +++- tools/perf/util/stat.h | 1 + 5 files changed, 21 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 468fc49420ce..60fdb6a0805f 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -908,6 +908,7 @@ static int perf_stat_init_aggr_mode(void) break; case AGGR_GLOBAL: case AGGR_THREAD: + case AGGR_PCIE_PORT: case AGGR_UNSET: default: break; @@ -1072,6 +1073,7 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st) case AGGR_NONE: case AGGR_GLOBAL: case AGGR_THREAD: + case AGGR_PCIE_PORT: case AGGR_UNSET: default: break; @@ -1844,7 +1846,8 @@ int cmd_stat(int argc, const char **argv) * --per-thread is aggregated per thread, we dont mix it with cpu mode */ if (((stat_config.aggr_mode != AGGR_GLOBAL && - stat_config.aggr_mode != AGGR_THREAD) || nr_cgroups) && + stat_config.aggr_mode != AGGR_THREAD && + stat_config.aggr_mode != AGGR_PCIE_PORT) || nr_cgroups) && !target__has_cpu(&target)) { fprintf(stderr, "both cgroup and no-aggregation " "modes only available in system-wide mode\n"); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 5d341efc3237..e604c199f493 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1396,7 +1396,8 @@ static void python_process_stat(struct perf_stat_config *config, struct perf_cpu_map *cpus = counter->core.cpus; int cpu, thread; - if (config->aggr_mode == AGGR_GLOBAL) { + if (config->aggr_mode == AGGR_GLOBAL || + config->aggr_mode == AGGR_PCIE_PORT) { process_stat(counter, -1, -1, tstamp, &counter->counts->aggr); 
return; diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index ed3b0ac2f785..db1bec115d0b 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -123,6 +123,7 @@ static void aggr_printout(struct perf_stat_config *config, config->csv_sep); break; case AGGR_GLOBAL: + case AGGR_PCIE_PORT: case AGGR_UNSET: default: break; @@ -322,7 +323,8 @@ static int first_shadow_cpu(struct perf_stat_config *config, if (config->aggr_mode == AGGR_NONE) return id; - if (config->aggr_mode == AGGR_GLOBAL) + if (config->aggr_mode == AGGR_GLOBAL || + config->aggr_mode == AGGR_PCIE_PORT) return 0; for (i = 0; i < perf_evsel__nr_cpus(evsel); i++) { @@ -416,6 +418,7 @@ static void printout(struct perf_stat_config *config, int id, int nr, if (config->csv_output && !config->metric_only) { static int aggr_fields[] = { [AGGR_GLOBAL] = 0, + [AGGR_PCIE_PORT] = 0, [AGGR_THREAD] = 1, [AGGR_NONE] = 1, [AGGR_SOCKET] = 2, @@ -899,6 +902,7 @@ static int aggr_header_lens[] = { [AGGR_NONE] = 6, [AGGR_THREAD] = 24, [AGGR_GLOBAL] = 0, + [AGGR_PCIE_PORT] = 0, }; static const char *aggr_header_csv[] = { @@ -907,7 +911,8 @@ static const char *aggr_header_csv[] = { [AGGR_SOCKET] = "socket,cpus", [AGGR_NONE] = "cpu,", [AGGR_THREAD] = "comm-pid,", - [AGGR_GLOBAL] = "" + [AGGR_GLOBAL] = "", + [AGGR_PCIE_PORT] = "port," }; static void print_metric_headers(struct perf_stat_config *config, @@ -990,6 +995,8 @@ static void print_interval(struct perf_stat_config *config, if (!metric_only) fprintf(output, " counts %*s events\n", unit_width, "unit"); break; + case AGGR_PCIE_PORT: + break; case AGGR_GLOBAL: default: fprintf(output, "# time"); @@ -1214,6 +122
Re: [PATCH v2 5/6] perf stat: Enable iiostat mode for x86 platforms
On 1/15/2021 10:33 AM, Namhyung Kim wrote: On Fri, Jan 15, 2021 at 1:41 AM Alexander Antonov wrote: On 1/14/2021 6:39 AM, Namhyung Kim wrote: On Wed, Jan 13, 2021 at 9:08 PM Alexander Antonov wrote: On 1/6/2021 12:02 PM, Namhyung Kim wrote: On Wed, Dec 23, 2020 at 10:03 PM Alexander Antonov diff --git a/tools/perf/perf-iiostat.sh b/tools/perf/perf-iiostat.sh new file mode 100644 index ..2c5168d2550b --- /dev/null +++ b/tools/perf/perf-iiostat.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# perf iiostat +# Alexander Antonov + +if [[ "$1" == "show" ]] || [[ "$1" =~ ([a-f0-9A-F]{1,}):([a-f0-9A-F]{1,2})(,)? ]]; then +DELIMITER="=" +else +DELIMITER=" " +fi + +perf stat --iiostat$DELIMITER$* Why is this needed? Thanks, Namhyung Arnaldo raised a question about the format of the 'perf stat --iiostat' subcommand and explained how it can be changed to 'perf iiostat' through the aliases mechanism in perf. Yeah, I know that. What I'm asking is the DELIMITER part. Thanks, Namhyung I'm using DELIMITER to handle two different forms of the iiostat command: The first one is the command with an option for iiostat mode, for example: 'perf iiostat show', which should be converted to 'perf stat --iiostat=show', or 'perf iiostat :ae,:5d' to 'perf stat --iiostat=:ae,:5d'. The second is the command without any option for iiostat: 'perf iiostat -I 1000' should be converted to 'perf stat --iiostat -I 1000'. Can't we simply use a whitespace? We need to use the equal sign to pass arguments to iiostat mode. Thanks, Alexander
Re: [PATCH v2 5/6] perf stat: Enable iiostat mode for x86 platforms
On 1/14/2021 6:39 AM, Namhyung Kim wrote: On Wed, Jan 13, 2021 at 9:08 PM Alexander Antonov wrote: On 1/6/2021 12:02 PM, Namhyung Kim wrote: On Wed, Dec 23, 2020 at 10:03 PM Alexander Antonov wrote: This functionality is based on recently introduced sysfs attributes for Intel® Xeon® Scalable processor family (code name Skylake-SP): Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping") Mode is intended to provide four I/O performance metrics in MB per each IIO stack: - Inbound Read: I/O devices below IIO stack read from the host memory - Inbound Write: I/O devices below IIO stack write to the host memory - Outbound Read: CPU reads from I/O devices below IIO stack - Outbound Write: CPU writes to I/O devices below IIO stack Each metric requiries only one IIO event which increments at every 4B transfer in corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Hmm.. maybe we can do this with JSON metrics, no? Do you mean to add metrics to *-metrics.json file? Looks like it's possible but in this case JSON file should be updated for each new enabled platform and calculations will be the same. I would prefer to leave it as is because perf will work without changing of userspace part once IIO sysfs attributes are added for new platforms. OK. Signed-off-by: Alexander Antonov --- [SNIP] diff --git a/tools/perf/perf-iiostat.sh b/tools/perf/perf-iiostat.sh new file mode 100644 index ..2c5168d2550b --- /dev/null +++ b/tools/perf/perf-iiostat.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# perf iiostat +# Alexander Antonov + +if [[ "$1" == "show" ]] || [[ "$1" =~ ([a-f0-9A-F]{1,}):([a-f0-9A-F]{1,2})(,)? ]]; then +DELIMITER="=" +else +DELIMITER=" " +fi + +perf stat --iiostat$DELIMITER$* Why is this needed? 
Thanks, Namhyung Arnaldo raised a question about the format of the 'perf stat --iiostat' subcommand and explained how it can be changed to 'perf iiostat' through the aliases mechanism in perf. Yeah, I know that. What I'm asking is the DELIMITER part. Thanks, Namhyung I'm using DELIMITER to resolve two different cases for the format of the iiostat command: The first one is the command with an option for iiostat mode, for example: 'perf iiostat show' which should be converted to 'perf stat --iiostat=show' or 'perf iiostat :ae,:5d' to 'perf stat --iiostat=:ae,:5d'. The second is the command without any option for iiostat: 'perf iiostat -I 1000' should be converted to 'perf stat --iiostat -I 1000'. Thanks, Alexander
Re: [PATCH v2 3/6] perf stat: Basic support for iiostat in perf
On 1/14/2021 6:34 AM, Namhyung Kim wrote: Hello, On Wed, Jan 13, 2021 at 8:34 PM Alexander Antonov wrote: On 1/6/2021 11:56 AM, Namhyung Kim wrote: On Wed, Dec 23, 2020 at 10:03 PM Alexander Antonov wrote: Add basic flow for a new iiostat mode in perf. Mode is intended to provide four I/O performance metrics per each IIO stack: Inbound Read, Inbound Write, Outbound Read, Outbound Write. It seems like a generic analysis and other archs can extend it later.. Then we can make it a bit more general.. at least, names? :) I'm not sure that I fully understand you. Do you mean to rename metrics? The mode is intended to provide PCIe metrics which are appliable for other archs as well. Actually, I suppose we can rename 'iiostat' to 'pciestat' or something like this to make it a bit more general because the name 'IIO' (Integrated I/O stack) is Intel specific and it can be named in different way on other platforms. In this case the code has to be updated in the same way as well. Maybe just 'iostat' ? Yeah, it looks better :) The actual code to compute the metrics and attribute it to evsel::perf_device is in follow-on patches. 
Signed-off-by: Alexander Antonov --- tools/perf/builtin-stat.c | 33 - tools/perf/util/iiostat.h | 33 + tools/perf/util/stat-display.c | 38 +- tools/perf/util/stat-shadow.c | 11 +- tools/perf/util/stat.h | 1 + 5 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 tools/perf/util/iiostat.h diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 72f9d0aa3f96..14c3da136927 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -67,6 +67,7 @@ #include "util/top.h" #include "util/affinity.h" #include "util/pfm.h" +#include "util/iiostat.h" #include "asm/bug.h" #include @@ -198,7 +199,8 @@ static struct perf_stat_config stat_config = { .walltime_nsecs_stats = &walltime_nsecs_stats, .big_num= true, .ctl_fd = -1, - .ctl_fd_ack = -1 + .ctl_fd_ack = -1, + .iiostat_run= false, }; static bool cpus_map_matched(struct evsel *a, struct evsel *b) @@ -1073,6 +1075,14 @@ static int parse_stat_cgroups(const struct option *opt, return parse_cgroups(opt, str, unset); } +__weak int iiostat_parse(const struct option *opt __maybe_unused, +const char *str __maybe_unused, +int unset __maybe_unused) +{ + pr_err("iiostat mode is not supported\n"); + return -1; +} + static struct option stat_options[] = { OPT_BOOLEAN('T', "transaction", &transaction_run, "hardware transaction statistics"), @@ -1185,6 +1195,8 @@ static struct option stat_options[] = { "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n" "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.", parse_control_option), + OPT_CALLBACK_OPTARG(0, "iiostat", &evsel_list, &stat_config, "root port", + "measure PCIe metrics per IIO stack", iiostat_parse), OPT_END() }; @@ -1509,6 +1521,12 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st) return 0; } +__weak int iiostat_show_root_ports(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused) +{ + return 0; +} I think it's too 
specific, maybe iiostat_prepare() ? What do you think about iiostat_show_root_ports() -> iiostat_show()? I'm ok with it, I thought it needs some initialization work there. + /* * Add default attributes, if there were no attributes specified or * if -d/--detailed, -d -d or -d -d -d is used: @@ -2054,6 +2072,10 @@ static void setup_system_wide(int forks) } } +__weak void iiostat_delete_root_ports(struct evlist *evlist __maybe_unused) +{ +} Same here.. I suggest to rename iiostat_delete_root_ports() -> iiostat_release(). What do you think? Looks good. + int cmd_stat(int argc, const char **argv) { const char * const stat_usage[] = { @@ -2230,6 +2252,12 @@ int cmd_stat(int argc, const char **argv) goto out; } + if (stat_config.iiostat_run) { + status = iiostat_show_root_ports(evsel_list, &stat_config); + if (status || !stat_config.iiostat_run) + goto out; + } + if (add_default_attributes(
Re: [PATCH v2 5/6] perf stat: Enable iiostat mode for x86 platforms
On 1/6/2021 12:02 PM, Namhyung Kim wrote: On Wed, Dec 23, 2020 at 10:03 PM Alexander Antonov wrote: This functionality is based on recently introduced sysfs attributes for Intel® Xeon® Scalable processor family (code name Skylake-SP): Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping") Mode is intended to provide four I/O performance metrics in MB per each IIO stack: - Inbound Read: I/O devices below IIO stack read from the host memory - Inbound Write: I/O devices below IIO stack write to the host memory - Outbound Read: CPU reads from I/O devices below IIO stack - Outbound Write: CPU writes to I/O devices below IIO stack Each metric requiries only one IIO event which increments at every 4B transfer in corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Hmm.. maybe we can do this with JSON metrics, no? Do you mean to add metrics to *-metrics.json file? Looks like it's possible but in this case JSON file should be updated for each new enabled platform and calculations will be the same. I would prefer to leave it as is because perf will work without changing of userspace part once IIO sysfs attributes are added for new platforms. 
Signed-off-by: Alexander Antonov --- tools/perf/Documentation/perf-iiostat.txt | 89 ++ tools/perf/Makefile.perf | 5 +- tools/perf/arch/x86/util/Build| 1 + tools/perf/arch/x86/util/iiostat.c| 337 ++ tools/perf/command-list.txt | 1 + tools/perf/perf-iiostat.sh| 12 + 6 files changed, 444 insertions(+), 1 deletion(-) create mode 100644 tools/perf/Documentation/perf-iiostat.txt create mode 100644 tools/perf/perf-iiostat.sh diff --git a/tools/perf/Documentation/perf-iiostat.txt b/tools/perf/Documentation/perf-iiostat.txt new file mode 100644 index ..38b5697b0d85 --- /dev/null +++ b/tools/perf/Documentation/perf-iiostat.txt @@ -0,0 +1,89 @@ +perf-iiostat(1) +=== + +NAME + +perf-iiostat - Show I/O performance metrics + +SYNOPSIS + +[verse] +'perf iiostat' show +'perf iiostat' -- [] + +DESCRIPTION +--- +Mode is intended to provide four I/O performance metrics per each IIO +stack (PCIe root port): + +- Inbound Read - I/O devices below IIO stack read from the host memory, in MB + +- Inbound Write - I/O devices below IIO stack write to the host memory, in MB + +- Outbound Read - CPU reads from I/O devices below IIO stack, in MB + +- Outbound Write - CPU writes to I/O devices below IIO stack, in MB + +OPTIONS +--- +...:: + Any command you can specify in a shell. + +show:: + List all IIO stacks. I'd prefer 'list' for this, but not a strong opinion.. The 'list' is fine for me as well. + +:: + Select the root ports for monitoring. Comma-separated list is supported. + +EXAMPLES + + +1. List all IIO stacks (example for 2-S platform): + + $ perf iiostat show + S0-uncore_iio_0<:00> + S1-uncore_iio_0<:80> + S0-uncore_iio_1<:17> + S1-uncore_iio_1<:85> + S0-uncore_iio_2<:3a> + S1-uncore_iio_2<:ae> + S0-uncore_iio_3<:5d> + S1-uncore_iio_3<:d7> + +2. 
Collect metrics for all I/O stacks: + + $ perf iiostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s + +Performance counter stats for 'system wide': + + port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) + :00102 3 + :80000 0 + :17 352552 430 21 + :85000 0 + :3a300 0 + :ae000 0 + :5d000 0 + :d7000 0 + +3. Collect metrics for comma-separated list of I/O stacks: + + $ perf iiostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s + +Performance counter stats for 'system wide': + + port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) +
Re: [PATCH v2 3/6] perf stat: Basic support for iiostat in perf
On 1/6/2021 11:56 AM, Namhyung Kim wrote: On Wed, Dec 23, 2020 at 10:03 PM Alexander Antonov wrote: Add basic flow for a new iiostat mode in perf. Mode is intended to provide four I/O performance metrics per each IIO stack: Inbound Read, Inbound Write, Outbound Read, Outbound Write. It seems like a generic analysis and other archs can extend it later.. Then we can make it a bit more general.. at least, names? :) I'm not sure that I fully understand you. Do you mean to rename metrics? The mode is intended to provide PCIe metrics which are appliable for other archs as well. Actually, I suppose we can rename 'iiostat' to 'pciestat' or something like this to make it a bit more general because the name 'IIO' (Integrated I/O stack) is Intel specific and it can be named in different way on other platforms. In this case the code has to be updated in the same way as well. The actual code to compute the metrics and attribute it to evsel::perf_device is in follow-on patches. Signed-off-by: Alexander Antonov --- tools/perf/builtin-stat.c | 33 - tools/perf/util/iiostat.h | 33 + tools/perf/util/stat-display.c | 38 +- tools/perf/util/stat-shadow.c | 11 +- tools/perf/util/stat.h | 1 + 5 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 tools/perf/util/iiostat.h diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 72f9d0aa3f96..14c3da136927 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -67,6 +67,7 @@ #include "util/top.h" #include "util/affinity.h" #include "util/pfm.h" +#include "util/iiostat.h" #include "asm/bug.h" #include @@ -198,7 +199,8 @@ static struct perf_stat_config stat_config = { .walltime_nsecs_stats = &walltime_nsecs_stats, .big_num= true, .ctl_fd = -1, - .ctl_fd_ack = -1 + .ctl_fd_ack = -1, + .iiostat_run= false, }; static bool cpus_map_matched(struct evsel *a, struct evsel *b) @@ -1073,6 +1075,14 @@ static int parse_stat_cgroups(const struct option *opt, return parse_cgroups(opt, str, unset); } 
+__weak int iiostat_parse(const struct option *opt __maybe_unused, +const char *str __maybe_unused, +int unset __maybe_unused) +{ + pr_err("iiostat mode is not supported\n"); + return -1; +} + static struct option stat_options[] = { OPT_BOOLEAN('T', "transaction", &transaction_run, "hardware transaction statistics"), @@ -1185,6 +1195,8 @@ static struct option stat_options[] = { "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n" "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.", parse_control_option), + OPT_CALLBACK_OPTARG(0, "iiostat", &evsel_list, &stat_config, "root port", + "measure PCIe metrics per IIO stack", iiostat_parse), OPT_END() }; @@ -1509,6 +1521,12 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st) return 0; } +__weak int iiostat_show_root_ports(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused) +{ + return 0; +} I think it's too specific, maybe iiostat_prepare() ? What do you think about iiostat_show_root_ports() -> iiostat_show()? + /* * Add default attributes, if there were no attributes specified or * if -d/--detailed, -d -d or -d -d -d is used: @@ -2054,6 +2072,10 @@ static void setup_system_wide(int forks) } } +__weak void iiostat_delete_root_ports(struct evlist *evlist __maybe_unused) +{ +} Same here.. I suggest to rename iiostat_delete_root_ports() -> iiostat_release(). What do you think? 
+ int cmd_stat(int argc, const char **argv) { const char * const stat_usage[] = { @@ -2230,6 +2252,12 @@ int cmd_stat(int argc, const char **argv) goto out; } + if (stat_config.iiostat_run) { + status = iiostat_show_root_ports(evsel_list, &stat_config); + if (status || !stat_config.iiostat_run) + goto out; + } + if (add_default_attributes()) goto out; @@ -2406,6 +2434,9 @@ int cmd_stat(int argc, const char **argv) perf_stat__exit_aggr_mode(); perf_evlist__free_stats(evsel_list); out: + if (stat_config.iiostat_run) + iiostat_delete_root_ports(evsel_list); + z
Re: [PATCH v2 2/6] perf evsel: Introduce an observed performance device
On 1/6/2021 11:44 AM, Namhyung Kim wrote: Hi, On Wed, Dec 23, 2020 at 10:03 PM Alexander Antonov wrote: Adding evsel::perf_device void pointer. For performance monitoring purposes, an evsel can have a related device. These changes allow to attribute, for example, I/O performance metrics to IIO stack. Signed-off-by: Alexander Antonov --- tools/perf/util/evsel.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 79a860d8e3ee..c346920f477a 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -127,6 +127,7 @@ struct evsel { * See also evsel__has_callchain(). */ __u64 synth_sample_type; + void*perf_device; Maybe we can use the existing 'priv' field. Thanks, Namhyung Hello Namhyung, Looks like the 'priv' field isn't used in this case. I suppose it can be re-used in iiostat mode. Thanks, Alexander }; struct perf_missing_features { -- 2.19.1
[PATCH v2 2/6] perf evsel: Introduce an observed performance device
Add an evsel::perf_device void pointer. For performance monitoring purposes, an evsel can have a related device. These changes allow attributing, for example, I/O performance metrics to an IIO stack. Signed-off-by: Alexander Antonov --- tools/perf/util/evsel.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 79a860d8e3ee..c346920f477a 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -127,6 +127,7 @@ struct evsel { * See also evsel__has_callchain(). */ __u64 synth_sample_type; + void*perf_device; }; struct perf_missing_features { -- 2.19.1
[PATCH v2 1/6] perf stat: Add AGGR_IIO_STACK mode
Adding AGGR_IIO_STACK mode to be able to distinguish aggr_mode for IIO stacks in following patches. Signed-off-by: Alexander Antonov --- tools/perf/builtin-stat.c | 7 +-- .../util/scripting-engines/trace-event-python.c | 2 +- tools/perf/util/stat-display.c | 13 +++-- tools/perf/util/stat.c | 3 ++- tools/perf/util/stat.h | 1 + 5 files changed, 20 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f15b2f8aa14d..72f9d0aa3f96 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -913,7 +913,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) init_stats(&walltime_nsecs_stats); update_stats(&walltime_nsecs_stats, t1 - t0); - if (stat_config.aggr_mode == AGGR_GLOBAL) + if (stat_config.aggr_mode == AGGR_GLOBAL || stat_config.aggr_mode == AGGR_IIO_STACK) perf_evlist__save_aggr_prev_raw_counts(evsel_list); perf_evlist__copy_prev_raw_counts(evsel_list); @@ -1309,6 +1309,7 @@ static int perf_stat_init_aggr_mode(void) break; case AGGR_GLOBAL: case AGGR_THREAD: + case AGGR_IIO_STACK: case AGGR_UNSET: default: break; @@ -1499,6 +1500,7 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st) case AGGR_NONE: case AGGR_GLOBAL: case AGGR_THREAD: + case AGGR_IIO_STACK: case AGGR_UNSET: default: break; @@ -2216,7 +2218,8 @@ int cmd_stat(int argc, const char **argv) * --per-thread is aggregated per thread, we dont mix it with cpu mode */ if (((stat_config.aggr_mode != AGGR_GLOBAL && - stat_config.aggr_mode != AGGR_THREAD) || nr_cgroups) && + stat_config.aggr_mode != AGGR_THREAD && + stat_config.aggr_mode != AGGR_IIO_STACK) || nr_cgroups) && !target__has_cpu(&target)) { fprintf(stderr, "both cgroup and no-aggregation " "modes only available in system-wide mode\n"); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index c83c2c6564e0..e8b472faeae4 100644 --- 
a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1401,7 +1401,7 @@ static void python_process_stat(struct perf_stat_config *config, struct perf_cpu_map *cpus = counter->core.cpus; int cpu, thread; - if (config->aggr_mode == AGGR_GLOBAL) { + if (config->aggr_mode == AGGR_GLOBAL || config->aggr_mode == AGGR_IIO_STACK) { process_stat(counter, -1, -1, tstamp, &counter->counts->aggr); return; diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 4b57c0c07632..3bfcdb80443a 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -133,6 +133,7 @@ static void aggr_printout(struct perf_stat_config *config, config->csv_sep); break; case AGGR_GLOBAL: + case AGGR_IIO_STACK: case AGGR_UNSET: default: break; @@ -330,7 +331,7 @@ static int first_shadow_cpu(struct perf_stat_config *config, if (config->aggr_mode == AGGR_NONE) return id; - if (config->aggr_mode == AGGR_GLOBAL) + if (config->aggr_mode == AGGR_GLOBAL || config->aggr_mode == AGGR_IIO_STACK) return 0; for (i = 0; i < evsel__nr_cpus(evsel); i++) { @@ -424,6 +425,7 @@ static void printout(struct perf_stat_config *config, int id, int nr, if (config->csv_output && !config->metric_only) { static int aggr_fields[] = { [AGGR_GLOBAL] = 0, + [AGGR_IIO_STACK] = 0, [AGGR_THREAD] = 1, [AGGR_NONE] = 1, [AGGR_SOCKET] = 2, @@ -906,6 +908,7 @@ static int aggr_header_lens[] = { [AGGR_NONE] = 6, [AGGR_THREAD] = 24, [AGGR_GLOBAL] = 0, + [AGGR_IIO_STACK] = 0, }; static const char *aggr_header_csv[] = { @@ -914,7 +917,8 @@ static const char *aggr_header_csv[] = { [AGGR_SOCKET] = "socket,cpus", [AGGR_NONE] = "cpu,", [AGGR_THREAD] = "comm-pid,", - [AGGR_GLOBAL] = "" + [AGGR_GLOBAL] = "", + [AGGR_IIO_STACK] = "port," }; static void print_metric_headers(struct perf_stat_config *c
[PATCH v2 6/6] perf: Update .gitignore file
After a "make -C tools/perf", git reports the following untracked file: perf-iiostat Add this generated file to perf's .gitignore file. Signed-off-by: Alexander Antonov --- tools/perf/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index f3f84781fd74..ab826736e677 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -20,6 +20,7 @@ perf.data.old output.svg perf-archive perf-with-kcore +perf-iiostat tags TAGS cscope* -- 2.19.1
[PATCH v2 5/6] perf stat: Enable iiostat mode for x86 platforms
This functionality is based on recently introduced sysfs attributes for Intel® Xeon® Scalable processor family (code name Skylake-SP): Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping") Mode is intended to provide four I/O performance metrics in MB per each IIO stack: - Inbound Read: I/O devices below IIO stack read from the host memory - Inbound Write: I/O devices below IIO stack write to the host memory - Outbound Read: CPU reads from I/O devices below IIO stack - Outbound Write: CPU writes to I/O devices below IIO stack Each metric requiries only one IIO event which increments at every 4B transfer in corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Signed-off-by: Alexander Antonov --- tools/perf/Documentation/perf-iiostat.txt | 89 ++ tools/perf/Makefile.perf | 5 +- tools/perf/arch/x86/util/Build| 1 + tools/perf/arch/x86/util/iiostat.c| 337 ++ tools/perf/command-list.txt | 1 + tools/perf/perf-iiostat.sh| 12 + 6 files changed, 444 insertions(+), 1 deletion(-) create mode 100644 tools/perf/Documentation/perf-iiostat.txt create mode 100644 tools/perf/perf-iiostat.sh diff --git a/tools/perf/Documentation/perf-iiostat.txt b/tools/perf/Documentation/perf-iiostat.txt new file mode 100644 index ..38b5697b0d85 --- /dev/null +++ b/tools/perf/Documentation/perf-iiostat.txt @@ -0,0 +1,89 @@ +perf-iiostat(1) +=== + +NAME + +perf-iiostat - Show I/O performance metrics + +SYNOPSIS + +[verse] +'perf iiostat' show +'perf iiostat' -- [] + +DESCRIPTION +--- +Mode is intended to provide four I/O performance metrics per each IIO +stack (PCIe root port): + +- Inbound Read - I/O devices below IIO stack read from the host memory, in MB + +- Inbound Write - I/O devices below IIO stack write to the host memory, in MB + +- Outbound Read - CPU reads from I/O devices below IIO stack, in MB + +- Outbound Write - CPU writes to I/O devices below IIO stack, in MB + +OPTIONS +--- +...:: + Any command you 
can specify in a shell. + +show:: + List all IIO stacks. + +:: + Select the root ports for monitoring. Comma-separated list is supported. + +EXAMPLES + + +1. List all IIO stacks (example for 2-S platform): + + $ perf iiostat show + S0-uncore_iio_0<:00> + S1-uncore_iio_0<:80> + S0-uncore_iio_1<:17> + S1-uncore_iio_1<:85> + S0-uncore_iio_2<:3a> + S1-uncore_iio_2<:ae> + S0-uncore_iio_3<:5d> + S1-uncore_iio_3<:d7> + +2. Collect metrics for all I/O stacks: + + $ perf iiostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s + +Performance counter stats for 'system wide': + + port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) + :00102 3 + :80000 0 + :17 352552 430 21 + :85000 0 + :3a300 0 + :ae000 0 + :5d000 0 + :d7000 0 + +3. Collect metrics for comma-separated list of I/O stacks: + + $ perf iiostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s + +Performance counter stats for 'system wide': + + port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) + :17 358559 440 22 + :3a320 0 + +197.081983474 seconds time elapsed + +SEE ALSO + +linkperf:perf-stat[1] \ No newline at end of file diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 7ce3f2e8b9c7..c16c14a304a9 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -280,6 +280,7 @@ SCRIPT_SH = SCRIPT_SH += perf-archive.sh SCRIPT_SH += perf-with-kcore.sh +SCRIPT_SH += perf-iiostat.s
[PATCH v2 4/6] perf stat: Helper functions for IIO stacks list in iiostat mode
Introduce helper functions to control IIO stacks list. These helpers will be used in the follow-up patch. Signed-off-by: Alexander Antonov --- tools/perf/arch/x86/util/iiostat.c | 125 + 1 file changed, 125 insertions(+) create mode 100644 tools/perf/arch/x86/util/iiostat.c diff --git a/tools/perf/arch/x86/util/iiostat.c b/tools/perf/arch/x86/util/iiostat.c new file mode 100644 index ..98b9707b4827 --- /dev/null +++ b/tools/perf/arch/x86/util/iiostat.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * perf iiostat + * + * Copyright (C) 2020, Intel Corporation + * + * Authors: Alexander Antonov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util/cpumap.h" +#include "util/debug.h" +#include "util/iiostat.h" +#include "util/counts.h" +#include "path.h" + +struct iio_root_port { + u32 domain; + u8 bus; + u8 die; + u8 pmu_idx; + int idx; +}; + +struct iio_root_ports_list { + struct iio_root_port **rps; + int nr_entries; +}; + +static void iio_root_port_show(FILE *output, const struct iio_root_port * const rp) +{ + if (output && rp) + fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n", + rp->die, rp->pmu_idx, rp->domain, rp->bus); +} + +static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus, u8 die, u8 pmu_idx) +{ + struct iio_root_port *p = calloc(1, sizeof(*p)); + + if (p) { + p->domain = domain; + p->bus = bus; + p->die = die; + p->pmu_idx = pmu_idx; + } + return p; +} + +static struct iio_root_ports_list *iio_root_ports_list_new(void) +{ + struct iio_root_ports_list *list = calloc(1, sizeof(*list)); + + if (list) { + list->rps = calloc(1, sizeof(struct iio_root_port *)); + if (!list->rps) { + free(list); + list = NULL; + } + } + + return list; +} + +static void iio_root_ports_list_free(struct iio_root_ports_list *list) +{ + int idx; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) + free(list->rps[idx]); + free(list->rps); 
+ free(list); + } +} + +static +struct iio_root_port *iio_root_port_find_by_notation(const struct iio_root_ports_list * const list, +u32 domain, u8 bus) +{ + int idx; + struct iio_root_port *rp; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) { + rp = list->rps[idx]; + if (rp && rp->domain == domain && rp->bus == bus) + return rp; + } + } + return NULL; +} + +static int iio_root_ports_list_insert(struct iio_root_ports_list *list, + struct iio_root_port * const rp) +{ + struct iio_root_port **tmp_buf; + + if (list && rp) { + rp->idx = list->nr_entries++; + /* One more for NULL.*/ + tmp_buf = realloc(list->rps, (list->nr_entries + 1) * sizeof(*list->rps)); + if (!tmp_buf) { + pr_err("Failed to realloc memory\n"); + return -ENOMEM; + } + tmp_buf[rp->idx] = rp; + tmp_buf[list->nr_entries] = NULL; + list->rps = tmp_buf; + } + return 0; +} -- 2.19.1
[PATCH v2 0/6] perf stat: Introduce iiostat mode to provide I/O performance metrics
The previous version can be found at: v1: https://lkml.kernel.org/r/20201210090340.14358-1-alexander.anto...@linux.intel.com Changes in this revision are: v1 -> v2: 1. Using 'perf iiostat' subcommand instead of 'perf stat --iiostat': - Added perf-iiostat.sh script to use short command - Updated manual pages to get help for 'perf iiostat' - Added 'perf-iiostat' to perf's gitignore file Mode is intended to provide four I/O performance metrics in MB per each IIO stack: - Inbound Read: I/O devices below IIO stack read from the host memory - Inbound Write: I/O devices below IIO stack write to the host memory - Outbound Read: CPU reads from I/O devices below IIO stack - Outbound Write: CPU writes to I/O devices below IIO stack Each metric requires only one IIO event which increments at every 4B transfer in the corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Note: iiostat introduces a new perf data aggregation mode - per I/O stack hence -e and -M options are not supported. Usage examples: 1. List all IIO stacks (example for 2-S platform): $ perf iiostat show S0-uncore_iio_0<:00> S1-uncore_iio_0<:80> S0-uncore_iio_1<:17> S1-uncore_iio_1<:85> S0-uncore_iio_2<:3a> S1-uncore_iio_2<:ae> S0-uncore_iio_3<:5d> S1-uncore_iio_3<:d7> 2. Collect metrics for all I/O stacks: $ perf iiostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct 357708+0 records in 357707+0 records out 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s Performance counter stats for 'system wide': port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) :00102 3 :80000 0 :17 352552 430 21 :85000 0 :3a300 0 :ae000 0 :5d000 0 :d7000 0 3.
Collect metrics for comma separated list of I/O stacks: $ perf iiostat :17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct 357708+0 records in 357707+0 records out 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s Performance counter stats for 'system wide': port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) :17 358559 440 22 :3a 3 20 0 197.081983474 seconds time elapsed Alexander Antonov (6): perf stat: Add AGGR_IIO_STACK mode perf evsel: Introduce an observed performance device perf stat: Basic support for iiostat in perf perf stat: Helper functions for IIO stacks list in iiostat mode perf stat: Enable iiostat mode for x86 platforms perf: Update .gitignore file tools/perf/.gitignore | 1 + tools/perf/Documentation/perf-iiostat.txt | 89 tools/perf/Makefile.perf | 5 +- tools/perf/arch/x86/util/Build| 1 + tools/perf/arch/x86/util/iiostat.c| 462 ++ tools/perf/builtin-stat.c | 40 +- tools/perf/command-list.txt | 1 + tools/perf/perf-iiostat.sh| 12 + tools/perf/util/evsel.h | 1 + tools/perf/util/iiostat.h | 33 ++ .../scripting-engines/trace-event-python.c| 2 +- tools/perf/util/stat-display.c| 51 +- tools/perf/util/stat-shadow.c | 11 +- tools/perf/util/stat.c| 3 +- tools/perf/util/stat.h| 2 + 15 files changed, 704 insertions(+), 10 deletions(-) create mode 100644 tools/perf/Documentation/perf-iiostat.txt create mode 100644 tools/perf/arch/x86/util/iiostat.c create mode 100644 tools/perf/perf-iiostat.sh create mode 100644 tools/perf/util/iiostat.h base-commit: 644bf4b0f7acde641d3db200b4db66977e96c3bd -- 2.19.1
[PATCH v2 3/6] perf stat: Basic support for iiostat in perf
Add basic flow for a new iiostat mode in perf. Mode is intended to provide four I/O performance metrics per each IIO stack: Inbound Read, Inbound Write, Outbound Read, Outbound Write. The actual code to compute the metrics and attribute it to evsel::perf_device is in follow-on patches. Signed-off-by: Alexander Antonov --- tools/perf/builtin-stat.c | 33 - tools/perf/util/iiostat.h | 33 + tools/perf/util/stat-display.c | 38 +- tools/perf/util/stat-shadow.c | 11 +- tools/perf/util/stat.h | 1 + 5 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 tools/perf/util/iiostat.h diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 72f9d0aa3f96..14c3da136927 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -67,6 +67,7 @@ #include "util/top.h" #include "util/affinity.h" #include "util/pfm.h" +#include "util/iiostat.h" #include "asm/bug.h" #include @@ -198,7 +199,8 @@ static struct perf_stat_config stat_config = { .walltime_nsecs_stats = &walltime_nsecs_stats, .big_num= true, .ctl_fd = -1, - .ctl_fd_ack = -1 + .ctl_fd_ack = -1, + .iiostat_run= false, }; static bool cpus_map_matched(struct evsel *a, struct evsel *b) @@ -1073,6 +1075,14 @@ static int parse_stat_cgroups(const struct option *opt, return parse_cgroups(opt, str, unset); } +__weak int iiostat_parse(const struct option *opt __maybe_unused, +const char *str __maybe_unused, +int unset __maybe_unused) +{ + pr_err("iiostat mode is not supported\n"); + return -1; +} + static struct option stat_options[] = { OPT_BOOLEAN('T', "transaction", &transaction_run, "hardware transaction statistics"), @@ -1185,6 +1195,8 @@ static struct option stat_options[] = { "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n" "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.", parse_control_option), + OPT_CALLBACK_OPTARG(0, "iiostat", &evsel_list, &stat_config, "root port", + "measure PCIe metrics per IIO stack", 
iiostat_parse), OPT_END() }; @@ -1509,6 +1521,12 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st) return 0; } +__weak int iiostat_show_root_ports(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused) +{ + return 0; +} + /* * Add default attributes, if there were no attributes specified or * if -d/--detailed, -d -d or -d -d -d is used: @@ -2054,6 +2072,10 @@ static void setup_system_wide(int forks) } } +__weak void iiostat_delete_root_ports(struct evlist *evlist __maybe_unused) +{ +} + int cmd_stat(int argc, const char **argv) { const char * const stat_usage[] = { @@ -2230,6 +2252,12 @@ int cmd_stat(int argc, const char **argv) goto out; } + if (stat_config.iiostat_run) { + status = iiostat_show_root_ports(evsel_list, &stat_config); + if (status || !stat_config.iiostat_run) + goto out; + } + if (add_default_attributes()) goto out; @@ -2406,6 +2434,9 @@ int cmd_stat(int argc, const char **argv) perf_stat__exit_aggr_mode(); perf_evlist__free_stats(evsel_list); out: + if (stat_config.iiostat_run) + iiostat_delete_root_ports(evsel_list); + zfree(&stat_config.walltime_run); if (smi_cost && smi_reset) diff --git a/tools/perf/util/iiostat.h b/tools/perf/util/iiostat.h new file mode 100644 index ..8d4226df9975 --- /dev/null +++ b/tools/perf/util/iiostat.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * perf iiostat + * + * Copyright (C) 2020, Intel Corporation + * + * Authors: Alexander Antonov + */ + +#ifndef _IIOSTAT_H +#define _IIOSTAT_H + +#include +#include "util/stat.h" +#include "util/parse-events.h" +#include "util/evlist.h" + +struct option; +struct perf_stat_config; +struct evlist; +struct timespec; + +int iiostat_parse(const struct option *opt, const char *str, + int unset __maybe_unused); +void iiostat_prefix(struct perf_stat_config *config, struct evlist *evlist, + char *prefix, struct timespec *ts); +void iiostat_print_metric(struct perf_stat_config *config, struct evsel *evsel, + struct 
perf_stat
Re: [PATCH 0/5] perf stat: Introduce --iiostat mode to provide I/O performance metrics
On 12/15/2020 4:58 PM, Arnaldo Carvalho de Melo wrote: Em Mon, Dec 14, 2020 at 07:04:30PM -0800, Andi Kleen escreveu: My first thought was: Why not have a 'perf iiostat' subcommand? Same would apply to a lot of options in perf stat. I guess you could add some aliases to "perf" that give shortcuts for common perf stat command lines. Yeah, and we have a mechanism for that, that was exercised only in the 'perf archive' case: ~/libexec/perf-core/perf-archive I tried this and it works: [root@five ~]# ls -la ~/bin/perf lrwxrwxrwx. 1 root root 19 Feb 18 2020 /root/bin/perf -> /home/acme/bin/perf [root@five ~]# vim ~acme/libexec/perf-core/perf-cgtop [root@five ~]# chmod +x ~acme/libexec/perf-core/perf-cgtop [root@five ~]# cat ~acme/libexec/perf-core/perf-cgtop perf top --hierarchy --all-cgroups -s cgroup,dso,sym $* [root@five ~]# perf cgtop [root@five ~]# use 'e' to expand collapse the current level (+ -> -), 'E'/'C' to expand/collapse all levels. 'perf help' doesn't show it, which is a shame, I'll add support for it to traverse ~/libexec/perf-core/perf-* and get the first non interpreter comment line as a description for the command, so to add a new one is just a matter of dropping a shell + man page, no need to change the perf binary. To test that '$*' at the end: [root@five ~]# perf cgtop -U I.e.: [acme@five perf]$ perf top -h -U Usage: perf top [] -U, --hide_user_symbols hide user symbols [acme@five perf]$ And it works, just kernel level samples grouped in an hierarchy, first cgroup, then dso, then the symbol. Also, using this with the 'P' hotkey: [root@five ~]# perf cgtop --percent-limit 1 Shows how it looks like: [root@five ~]# cat perf.hist.0 - 86.77%/user.slice/user-1000.slice/session-2.scope - 36.18%[kernel] 2.24%[k] unmap_page_range 1.15%[k] clear_page_rep 1.10%[k] add_mm_counter_fast 1.03%[k] alloc_set_pte 1.03%[k] handle_mm_fault - 17.65%libc-2.32.so 2.04%[.] _int_malloc 1.82%[.] __memmove_avx_unaligned_erms 1.48%[.] __strlen_avx2 1.13%[.] _int_free 1.12%[.] 
malloc - 8.09%make 1.65%[.] jhash_string 1.05%[.] hash_find_slot - 6.90%ld-2.32.so 2.03%[.] do_lookup_x 1.49%[.] _dl_lookup_symbol_x - 4.78%cc1 - 4.60%libperl.so.5.32.0 - 2.86%bash - 1.98%libselinux.so.1 - 1.61%libpython2.7.so.1.0 - 1.06%libpcre2-8.so.0.10.0 - 9.17%/user.slice/user-1000.slice/session-4.scope - 4.66%perf - 2.40%libc-2.32.so - 1.82%[kernel] - 4.04%/ - 4.02%[kernel] [root@five ~]# So 'perf iiostat' would become: [root@five ~]# cat ~acme/libexec/perf-core/perf-iiostat perf stat --iiostat $* [root@five ~]# There are parameters to that '--iiostat' in the current patchset that may complicates this tho, with some changes I guess we get what we want. - Arnaldo Hello Arnaldo, Sorry for delayed response. This is the interesting approach to get shorter command. Thank you for the explanation. I will update the patchset. - Alexander
[PATCH 5/5] perf stat: Enable --iiostat mode for x86 platforms
This functionality is based on recently introduced sysfs attributes for Intel® Xeon® Scalable processor family (code name Skylake-SP): Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping") Mode is intended to provide four I/O performance metrics in MB per each IIO stack: - Inbound Read: I/O devices below IIO stack read from the host memory - Inbound Write: I/O devices below IIO stack write to the host memory - Outbound Read: CPU reads from I/O devices below IIO stack - Outbound Write: CPU writes to I/O devices below IIO stack Each metric requires only one IIO event which increments at every 4B transfer in the corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Signed-off-by: Alexander Antonov --- tools/perf/Documentation/perf-stat.txt | 31 +++ tools/perf/arch/x86/util/Build | 1 + tools/perf/arch/x86/util/iiostat.c | 335 + 3 files changed, 367 insertions(+) diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 5d4a673d7621..2c066f7e0681 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -121,6 +121,37 @@ to activate system-wide monitoring. Default is to count on all CPUs. -A:: --no-aggr:: Do not aggregate counts across all monitored CPUs. 
+--iiostat:: +Mode is intended to provide four I/O performance metrics per each IIO +stack (PCIe root port): +--Inbound Read(MB) - I/O devices below IIO stack read from the host memory, in MB +--Inbound Write(MB) - I/O devices below IIO stack write to the host memory, in MB +--Outbound Read(MB) - CPU reads from I/O devices below IIO stack, in MB +--Outbound Write(MB) - CPU writes to I/O devices below IIO stack, in MB + +Sample output: + +Show all IIO stacks on 2-S platform: + $ perf stat --iiostat=show +S0-uncore_iio_0<:00> +S1-uncore_iio_0<:80> +S0-uncore_iio_1<:17> +S1-uncore_iio_1<:85> +S0-uncore_iio_2<:3a> +S1-uncore_iio_2<:ae> +S0-uncore_iio_3<:5d> +S1-uncore_iio_3<:d7> + +Print metrics for requested IIO stacks, multiple comma-separated list supported. + $ perf stat --iiostat=:17 -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 213.997 s, 1.8 GB/s + + Performance counter stats for 'system wide': + + port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) + :17 358559 440 22 -n:: --null:: diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index 347c39b960eb..6fa275d3d897 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -6,6 +6,7 @@ perf-y += perf_regs.o perf-y += topdown.o perf-y += machine.o perf-y += event.o +perf-y += iiostat.o perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o diff --git a/tools/perf/arch/x86/util/iiostat.c b/tools/perf/arch/x86/util/iiostat.c index 70f93a96723f..44342a111746 100644 --- a/tools/perf/arch/x86/util/iiostat.c +++ b/tools/perf/arch/x86/util/iiostat.c @@ -27,6 +27,44 @@ #include "util/counts.h" #include "path.h" +#ifndef MAX_PATH +#define MAX_PATH 1024 +#endif + +#define UNCORE_IIO_PMU_PATH"devices/uncore_iio_%d" +#define SYSFS_UNCORE_PMU_PATH "%s/"UNCORE_IIO_PMU_PATH +#define PLATFORM_MAPPING_PATH UNCORE_IIO_PMU_PATH"/die%d" 
+ +enum iiostat_mode_t { + IIOSTAT_NONE= -1, + IIOSTAT_RUN = 0, + IIOSTAT_SHOW= 1 +}; + +static enum iiostat_mode_t iiostat_mode = IIOSTAT_NONE; + +/* + * Each metric requiries only one IIO event which increments at every 4B transfer + * in corresponding direction. The formulas to compute metrics are generic: + * #EventCount * 4B / (1024 * 1024) + */ +static const char * const iiostat_metrics[] = { + "Inbound Read(MB)", + "Inbound Write(MB)", + "Outbound Read(MB)", + "Outbound Write(MB)", +}; + +static inline int iiostat_metrics_count(void) +{ + return sizeof(iiostat_metrics) / sizeof(char *); +} + +static const char *iiostat_metric_by_idx(int idx) +{ + return *(iiostat_metrics + idx % iiostat_metrics_count()); +} + struct iio_root_port { u32 domain; u8 bus; @@ -123,3 +161,300 @@ static int iio_root_ports_list_insert(struct iio_root_ports_list *list, } return 0; } + +static int uncore_pmu_iio_platform_mapping(u8 pmu_idx, struct iio_root_ports_list * const list) +{ + char *buf; + char path[MAX_PATH]; +
[PATCH 4/5] perf stat: Helper functions for IIO stacks list in iiostat mode
Introduce helper functions to control IIO stacks list. These helpers will be used in the follow-up patch. Signed-off-by: Alexander Antonov --- tools/perf/arch/x86/util/iiostat.c | 125 + 1 file changed, 125 insertions(+) create mode 100644 tools/perf/arch/x86/util/iiostat.c diff --git a/tools/perf/arch/x86/util/iiostat.c b/tools/perf/arch/x86/util/iiostat.c new file mode 100644 index ..70f93a96723f --- /dev/null +++ b/tools/perf/arch/x86/util/iiostat.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * perf stat --iiostat + * + * Copyright (C) 2020, Intel Corporation + * + * Authors: Alexander Antonov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util/cpumap.h" +#include "util/debug.h" +#include "util/iiostat.h" +#include "util/counts.h" +#include "path.h" + +struct iio_root_port { + u32 domain; + u8 bus; + u8 die; + u8 pmu_idx; + int idx; +}; + +struct iio_root_ports_list { + struct iio_root_port **rps; + int nr_entries; +}; + +static void iio_root_port_show(FILE *output, const struct iio_root_port * const rp) +{ + if (output && rp) + fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n", + rp->die, rp->pmu_idx, rp->domain, rp->bus); +} + +static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus, u8 die, u8 pmu_idx) +{ + struct iio_root_port *p = calloc(1, sizeof(*p)); + + if (p) { + p->domain = domain; + p->bus = bus; + p->die = die; + p->pmu_idx = pmu_idx; + } + return p; +} + +static struct iio_root_ports_list *iio_root_ports_list_new(void) +{ + struct iio_root_ports_list *list = calloc(1, sizeof(*list)); + + if (list) { + list->rps = calloc(1, sizeof(struct iio_root_port *)); + if (!list->rps) { + free(list); + list = NULL; + } + } + + return list; +} + +static void iio_root_ports_list_free(struct iio_root_ports_list *list) +{ + int idx; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) + free(list->rps[idx]); + 
free(list->rps); + free(list); + } +} + +static +struct iio_root_port *iio_root_port_find_by_notation(const struct iio_root_ports_list * const list, +u32 domain, u8 bus) +{ + int idx; + struct iio_root_port *rp; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) { + rp = list->rps[idx]; + if (rp && rp->domain == domain && rp->bus == bus) + return rp; + } + } + return NULL; +} + +static int iio_root_ports_list_insert(struct iio_root_ports_list *list, + struct iio_root_port * const rp) +{ + struct iio_root_port **tmp_buf; + + if (list && rp) { + rp->idx = list->nr_entries++; + /* One more for NULL.*/ + tmp_buf = realloc(list->rps, (list->nr_entries + 1) * sizeof(*list->rps)); + if (!tmp_buf) { + pr_err("Failed to realloc memory\n"); + return -ENOMEM; + } + tmp_buf[rp->idx] = rp; + tmp_buf[list->nr_entries] = NULL; + list->rps = tmp_buf; + } + return 0; +} -- 2.19.1
[PATCH 3/5] perf stat: Basic support for iiostat in perf stat
Add basic flow for a new --iiostat mode in perf stat. Mode is intended to provide four I/O performance metrics per each IIO stack: Inbound Read, Inbound Write, Outbound Read, Outbound Write. The actual code to compute the metrics and attribute it to evsel::perf_device is in follow-on patches. Signed-off-by: Alexander Antonov --- tools/perf/builtin-stat.c | 33 - tools/perf/util/iiostat.h | 33 + tools/perf/util/stat-display.c | 38 +- tools/perf/util/stat-shadow.c | 11 +- tools/perf/util/stat.h | 1 + 5 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 tools/perf/util/iiostat.h diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e3ff55de4f7a..c8168cfe202b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -67,6 +67,7 @@ #include "util/top.h" #include "util/affinity.h" #include "util/pfm.h" +#include "util/iiostat.h" #include "asm/bug.h" #include @@ -198,7 +199,8 @@ static struct perf_stat_config stat_config = { .walltime_nsecs_stats = &walltime_nsecs_stats, .big_num= true, .ctl_fd = -1, - .ctl_fd_ack = -1 + .ctl_fd_ack = -1, + .iiostat_run= false, }; static bool cpus_map_matched(struct evsel *a, struct evsel *b) @@ -1073,6 +1075,14 @@ static int parse_stat_cgroups(const struct option *opt, return parse_cgroups(opt, str, unset); } +__weak int iiostat_parse(const struct option *opt __maybe_unused, +const char *str __maybe_unused, +int unset __maybe_unused) +{ + pr_err("--iiostat mode is not supported\n"); + return -1; +} + static struct option stat_options[] = { OPT_BOOLEAN('T', "transaction", &transaction_run, "hardware transaction statistics"), @@ -1185,6 +1195,8 @@ static struct option stat_options[] = { "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n" "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.", parse_control_option), + OPT_CALLBACK_OPTARG(0, "iiostat", &evsel_list, &stat_config, "root port", + "measure PCIe metrics per IIO 
stack", iiostat_parse), OPT_END() }; @@ -1509,6 +1521,12 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st) return 0; } +__weak int iiostat_show_root_ports(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused) +{ + return 0; +} + /* * Add default attributes, if there were no attributes specified or * if -d/--detailed, -d -d or -d -d -d is used: @@ -2054,6 +2072,10 @@ static void setup_system_wide(int forks) } } +__weak void iiostat_delete_root_ports(struct evlist *evlist __maybe_unused) +{ +} + int cmd_stat(int argc, const char **argv) { const char * const stat_usage[] = { @@ -2230,6 +2252,12 @@ int cmd_stat(int argc, const char **argv) goto out; } + if (stat_config.iiostat_run) { + status = iiostat_show_root_ports(evsel_list, &stat_config); + if (status || !stat_config.iiostat_run) + goto out; + } + if (add_default_attributes()) goto out; @@ -2406,6 +2434,9 @@ int cmd_stat(int argc, const char **argv) perf_stat__exit_aggr_mode(); perf_evlist__free_stats(evsel_list); out: + if (stat_config.iiostat_run) + iiostat_delete_root_ports(evsel_list); + zfree(&stat_config.walltime_run); if (smi_cost && smi_reset) diff --git a/tools/perf/util/iiostat.h b/tools/perf/util/iiostat.h new file mode 100644 index ..6a905b2b40b9 --- /dev/null +++ b/tools/perf/util/iiostat.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * perf stat --iiostat + * + * Copyright (C) 2020, Intel Corporation + * + * Authors: Alexander Antonov + */ + +#ifndef _IIOSTAT_H +#define _IIOSTAT_H + +#include +#include "util/stat.h" +#include "util/parse-events.h" +#include "util/evlist.h" + +struct option; +struct perf_stat_config; +struct evlist; +struct timespec; + +int iiostat_parse(const struct option *opt, const char *str, + int unset __maybe_unused); +void iiostat_prefix(struct perf_stat_config *config, struct evlist *evlist, + char *prefix, struct timespec *ts); +void iiostat_print_metric(struct perf_stat_config *config, struct evsel *evsel, 
+ st
[PATCH 2/5] perf evsel: Introduce an observed performance device
Add an evsel::perf_device void pointer. For performance monitoring purposes, an evsel can have a related device. This change allows attributing, for example, I/O performance metrics to an IIO stack. Signed-off-by: Alexander Antonov --- tools/perf/util/evsel.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 79a860d8e3ee..c346920f477a 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -127,6 +127,7 @@ struct evsel { * See also evsel__has_callchain(). */ __u64 synth_sample_type; + void*perf_device; }; struct perf_missing_features { -- 2.19.1
[PATCH 1/5] perf stat: Add AGGR_IIO_STACK mode
Adding AGGR_IIO_STACK mode to be able to distinguish aggr_mode for IIO stacks in following patches. Signed-off-by: Alexander Antonov --- tools/perf/builtin-stat.c | 5 - .../util/scripting-engines/trace-event-python.c | 2 +- tools/perf/util/stat-display.c | 13 +++-- tools/perf/util/stat.c | 3 ++- tools/perf/util/stat.h | 1 + 5 files changed, 19 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f15b2f8aa14d..e3ff55de4f7a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1309,6 +1309,7 @@ static int perf_stat_init_aggr_mode(void) break; case AGGR_GLOBAL: case AGGR_THREAD: + case AGGR_IIO_STACK: case AGGR_UNSET: default: break; @@ -1499,6 +1500,7 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st) case AGGR_NONE: case AGGR_GLOBAL: case AGGR_THREAD: + case AGGR_IIO_STACK: case AGGR_UNSET: default: break; @@ -2216,7 +2218,8 @@ int cmd_stat(int argc, const char **argv) * --per-thread is aggregated per thread, we dont mix it with cpu mode */ if (((stat_config.aggr_mode != AGGR_GLOBAL && - stat_config.aggr_mode != AGGR_THREAD) || nr_cgroups) && + stat_config.aggr_mode != AGGR_THREAD && + stat_config.aggr_mode != AGGR_IIO_STACK) || nr_cgroups) && !target__has_cpu(&target)) { fprintf(stderr, "both cgroup and no-aggregation " "modes only available in system-wide mode\n"); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index c83c2c6564e0..e8b472faeae4 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1401,7 +1401,7 @@ static void python_process_stat(struct perf_stat_config *config, struct perf_cpu_map *cpus = counter->core.cpus; int cpu, thread; - if (config->aggr_mode == AGGR_GLOBAL) { + if (config->aggr_mode == AGGR_GLOBAL || config->aggr_mode == AGGR_IIO_STACK) { process_stat(counter, -1, -1, tstamp, &counter->counts->aggr); 
return; diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 4b57c0c07632..3bfcdb80443a 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -133,6 +133,7 @@ static void aggr_printout(struct perf_stat_config *config, config->csv_sep); break; case AGGR_GLOBAL: + case AGGR_IIO_STACK: case AGGR_UNSET: default: break; @@ -330,7 +331,7 @@ static int first_shadow_cpu(struct perf_stat_config *config, if (config->aggr_mode == AGGR_NONE) return id; - if (config->aggr_mode == AGGR_GLOBAL) + if (config->aggr_mode == AGGR_GLOBAL || config->aggr_mode == AGGR_IIO_STACK) return 0; for (i = 0; i < evsel__nr_cpus(evsel); i++) { @@ -424,6 +425,7 @@ static void printout(struct perf_stat_config *config, int id, int nr, if (config->csv_output && !config->metric_only) { static int aggr_fields[] = { [AGGR_GLOBAL] = 0, + [AGGR_IIO_STACK] = 0, [AGGR_THREAD] = 1, [AGGR_NONE] = 1, [AGGR_SOCKET] = 2, @@ -906,6 +908,7 @@ static int aggr_header_lens[] = { [AGGR_NONE] = 6, [AGGR_THREAD] = 24, [AGGR_GLOBAL] = 0, + [AGGR_IIO_STACK] = 0, }; static const char *aggr_header_csv[] = { @@ -914,7 +917,8 @@ static const char *aggr_header_csv[] = { [AGGR_SOCKET] = "socket,cpus", [AGGR_NONE] = "cpu,", [AGGR_THREAD] = "comm-pid,", - [AGGR_GLOBAL] = "" + [AGGR_GLOBAL] = "", + [AGGR_IIO_STACK] = "port," }; static void print_metric_headers(struct perf_stat_config *config, @@ -1001,6 +1005,9 @@ static void print_interval(struct perf_stat_config *config, if (!metric_only) fprintf(output, " counts %*s events\n", unit_width, "unit"); break; + case AGGR_IIO_STACK: + fprintf(output, "# timeport"); + break; case AGGR_GLOBAL: default:
[PATCH 0/5] perf stat: Introduce --iiostat mode to provide I/O performance metrics
Mode is intended to provide four I/O performance metrics in MB per each IIO stack: - Inbound Read: I/O devices below IIO stack read from the host memory - Inbound Write: I/O devices below IIO stack write to the host memory - Outbound Read: CPU reads from I/O devices below IIO stack - Outbound Write: CPU writes to I/O devices below IIO stack Each metric requires only one IIO event which increments at every 4B transfer in the corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Note: --iiostat introduces a new perf data aggregation mode - per I/O stack - hence the -e and -M options are not supported. Usage examples: 1. List all IIO stacks (example for 2-S platform): $ perf stat --iiostat=show S0-uncore_iio_0<:00> S1-uncore_iio_0<:80> S0-uncore_iio_1<:17> S1-uncore_iio_1<:85> S0-uncore_iio_2<:3a> S1-uncore_iio_2<:ae> S0-uncore_iio_3<:5d> S1-uncore_iio_3<:d7> 2. Collect metrics for all I/O stacks: $ perf stat --iiostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct 357708+0 records in 357707+0 records out 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s Performance counter stats for 'system wide': port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) :00102 3 :80000 0 :17 352552 430 21 :85000 0 :3a300 0 :ae000 0 :5d000 0 :d7000 0 3. 
Collect metrics for comma separated list of I/O stacks: $ perf stat --iiostat=:17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct 357708+0 records in 357707+0 records out 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s Performance counter stats for 'system wide': port Inbound Read(MB)Inbound Write(MB)Outbound Read(MB) Outbound Write(MB) :17 358559 440 22 :3a320 0 197.081983474 seconds time elapsed Alexander Antonov (5): perf stat: Add AGGR_IIO_STACK mode perf evsel: Introduce an observed performance device perf stat: Basic support for iiostat in perf stat perf stat: Helper functions for IIO stacks list in iiostat mode perf stat: Enable --iiostat mode for x86 platforms tools/perf/Documentation/perf-stat.txt| 31 ++ tools/perf/arch/x86/util/Build| 1 + tools/perf/arch/x86/util/iiostat.c| 460 ++ tools/perf/builtin-stat.c | 38 +- tools/perf/util/evsel.h | 1 + tools/perf/util/iiostat.h | 33 ++ .../scripting-engines/trace-event-python.c| 2 +- tools/perf/util/stat-display.c| 51 +- tools/perf/util/stat-shadow.c | 11 +- tools/perf/util/stat.c| 3 +- tools/perf/util/stat.h| 2 + 11 files changed, 625 insertions(+), 8 deletions(-) create mode 100644 tools/perf/arch/x86/util/iiostat.c create mode 100644 tools/perf/util/iiostat.h base-commit: 644bf4b0f7acde641d3db200b4db66977e96c3bd -- 2.19.1
Re: [PATCH] perf/x86/intel/uncore: Fix for iio mapping on Skylake Server
Hello Kyle, Currently we do not have plans on supporting the Uncore units to IIO PMON mapping on multiple segment platforms due to a variety of such platforms. It would be great if you describe your case, I mean how you configure segments on your platform. It will help to cover your configuration and determine a common approach for the mapping algorithm. Thanks, Alexander On 10/09/2020 05:11 PM, Meyer, Kyle wrote: Hello Alexander, Do you plan on supporting multiple segment platforms? Thanks, Kyle Meyer From: alexander.anto...@linux.intel.com Sent: Monday, September 28, 2020 5:21 AM To: pet...@infradead.org; linux-kernel@vger.kernel.org; x...@kernel.org Cc: alexander.shish...@linux.intel.com; kan.li...@linux.intel.com; alexey.budan...@linux.intel.com; a...@linux.intel.com; a...@kernel.org; mi...@redhat.com; alexander.anto...@linux.intel.com; Meyer, Kyle; Anderson, Russ Subject: [PATCH] perf/x86/intel/uncore: Fix for iio mapping on Skylake Server From: Alexander Antonov Introduced early attributes /sys/devices/uncore_iio_/die* are initialized by skx_iio_set_mapping(), however, for example, for multiple segment platforms skx_iio_get_topology() returns -EPERM before a list of attributes in skx_iio_mapping_group will have been initialized. As a result the list is being NULL. Thus the warning "sysfs: (bin_)attrs not set by subsystem for group: uncore_iio_*/" appears and uncore_iio pmus are not available in sysfs. Clear IIO attr_update to properly handle the cases when topology information cannot be retrieved. 
Fixes: bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping") Reported-by: Kyle Meyer Suggested-by: Kan Liang Reviewed-by: Alexei Budankov Reviewed-by: Kan Liang Signed-off-by: Alexander Antonov --- arch/x86/events/intel/uncore_snbep.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 62e88ad919ff..ccfa1d6b6aa0 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3749,7 +3749,9 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type) ret = skx_iio_get_topology(type); if (ret) - return ret; + goto clear_attr_update; + + ret = -ENOMEM; /* One more for NULL. */ attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); @@ -3781,8 +3783,9 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type) kfree(eas); kfree(attrs); kfree(type->topology); +clear_attr_update: type->attr_update = NULL; - return -ENOMEM; + return ret; } static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) base-commit: a1b8638ba1320e6684aa98233c15255eb803fac7 -- 2.19.1
[tip: perf/core] perf/x86/intel/uncore: Fix for iio mapping on Skylake Server
The following commit has been merged into the perf/core branch of tip: Commit-ID: f797f05d917ffef94249ee0aec4c14a5b50517b2 Gitweb: https://git.kernel.org/tip/f797f05d917ffef94249ee0aec4c14a5b50517b2 Author:Alexander Antonov AuthorDate:Mon, 28 Sep 2020 13:21:33 +03:00 Committer: Peter Zijlstra CommitterDate: Tue, 29 Sep 2020 09:57:02 +02:00 perf/x86/intel/uncore: Fix for iio mapping on Skylake Server Introduced early attributes /sys/devices/uncore_iio_/die* are initialized by skx_iio_set_mapping(), however, for example, for multiple segment platforms skx_iio_get_topology() returns -EPERM before a list of attributes in skx_iio_mapping_group will have been initialized. As a result the list is being NULL. Thus the warning "sysfs: (bin_)attrs not set by subsystem for group: uncore_iio_*/" appears and uncore_iio pmus are not available in sysfs. Clear IIO attr_update to properly handle the cases when topology information cannot be retrieved. Fixes: bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping") Reported-by: Kyle Meyer Suggested-by: Kan Liang Signed-off-by: Alexander Antonov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alexei Budankov Reviewed-by: Kan Liang Link: https://lkml.kernel.org/r/20200928102133.61041-1-alexander.anto...@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 495056f..3f1e75f 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3754,7 +3754,9 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type) ret = skx_iio_get_topology(type); if (ret) - return ret; + goto clear_attr_update; + + ret = -ENOMEM; /* One more for NULL. 
*/ attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); @@ -3786,8 +3788,9 @@ err: kfree(eas); kfree(attrs); kfree(type->topology); +clear_attr_update: type->attr_update = NULL; - return -ENOMEM; + return ret; } static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
[PATCH] perf/x86/intel/uncore: Fix for iio mapping on Skylake Server
From: Alexander Antonov The previously introduced attributes /sys/devices/uncore_iio_/die* are initialized by skx_iio_set_mapping(). However, on multiple-segment platforms, for example, skx_iio_get_topology() returns -EPERM before the list of attributes in skx_iio_mapping_group has been initialized. As a result the list is NULL. Thus the warning "sysfs: (bin_)attrs not set by subsystem for group: uncore_iio_*/" appears and uncore_iio pmus are not available in sysfs. Clear IIO attr_update to properly handle the cases when topology information cannot be retrieved. Fixes: bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping") Reported-by: Kyle Meyer Suggested-by: Kan Liang Reviewed-by: Alexei Budankov Reviewed-by: Kan Liang Signed-off-by: Alexander Antonov --- arch/x86/events/intel/uncore_snbep.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 62e88ad919ff..ccfa1d6b6aa0 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3749,7 +3749,9 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type) ret = skx_iio_get_topology(type); if (ret) - return ret; + goto clear_attr_update; + + ret = -ENOMEM; /* One more for NULL. */ attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); @@ -3781,8 +3783,9 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type) kfree(eas); kfree(attrs); kfree(type->topology); +clear_attr_update: type->attr_update = NULL; - return -ENOMEM; + return ret; } static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) base-commit: a1b8638ba1320e6684aa98233c15255eb803fac7 -- 2.19.1
[PATCH v10 1/3] perf/x86/intel/uncore: Expose an Uncore unit to PMON mapping
From: Roman Sudarikov Each Uncore unit type, by its nature, can be mapped to its own context - which platform component each PMON block of that type is supposed to monitor. Intel® Xeon® Scalable processor family (code name Skylake-SP) makes significant changes in the integrated I/O (IIO) architecture. The new solution introduces IIO stacks which are responsible for managing traffic between the PCIe domain and the Mesh domain. Each IIO stack has its own PMON block and can handle either DMI port, x16 PCIe root port, MCP-Link or various built-in accelerators. IIO PMON blocks allow concurrent monitoring of I/O flows up to 4 x4 bifurcation within each IIO stack. Software is supposed to program required perf counters within each IIO stack and gather performance data. The tricky thing here is that IIO PMON reports data per IIO stack but users have no idea what IIO stacks are - they only know devices which are connected to the platform. Understanding IIO stack concept to find which IIO stack that particular IO device is connected to, or to identify an IIO PMON block to program for monitoring specific IIO stack assumes a lot of implicit knowledge about given Intel server platform architecture. 
Usage example: ls /sys/devices/uncore__/die* Co-developed-by: Alexander Antonov Signed-off-by: Alexander Antonov Signed-off-by: Roman Sudarikov Reviewed-by: Kan Liang Reviewed-by: Alexander Shishkin --- arch/x86/events/intel/uncore.c | 8 arch/x86/events/intel/uncore.h | 12 2 files changed, 20 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index cf76d6631afa..b71e8f7529a4 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -843,10 +843,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .read = uncore_pmu_event_read, .module = THIS_MODULE, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .attr_update= pmu->type->attr_update, }; } else { pmu->pmu = *pmu->type->pmu; pmu->pmu.attr_groups = pmu->type->attr_groups; + pmu->pmu.attr_update = pmu->type->attr_update; } if (pmu->type->num_boxes == 1) { @@ -887,6 +889,9 @@ static void uncore_type_exit(struct intel_uncore_type *type) struct intel_uncore_pmu *pmu = type->pmus; int i; + if (type->cleanup_mapping) + type->cleanup_mapping(type); + if (pmu) { for (i = 0; i < type->num_boxes; i++, pmu++) { uncore_pmu_unregister(pmu); @@ -954,6 +959,9 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) type->pmu_group = &uncore_pmu_attr_group; + if (type->set_mapping) + type->set_mapping(type); + return 0; err: diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 0da4a4605536..8f2b77d27513 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -72,7 +72,19 @@ struct intel_uncore_type { struct uncore_event_desc *event_descs; struct freerunning_counters *freerunning; const struct attribute_group *attr_groups[4]; + const struct attribute_group **attr_update; struct pmu *pmu; /* for custom pmu ops */ + /* +* Uncore PMU would store relevant platform topology configuration here +* to identify which platform component each PMON block of that type is +* supposed to 
monitor. +*/ + u64 *topology; + /* +* Optional callbacks for managing mapping of Uncore units to PMONs +*/ + int (*set_mapping)(struct intel_uncore_type *type); + void (*cleanup_mapping)(struct intel_uncore_type *type); }; #define pmu_group attr_groups[0] -- 2.19.1
[PATCH v10 2/3] perf/x86/intel/uncore: Wrap the max dies calculation into an accessor
From: Roman Sudarikov The accessor to return number of dies on the platform. Co-developed-by: Alexander Antonov Signed-off-by: Alexander Antonov Signed-off-by: Roman Sudarikov Reviewed-by: Kan Liang Reviewed-by: Alexander Shishkin --- arch/x86/events/intel/uncore.c | 13 +++-- arch/x86/events/intel/uncore.h | 3 +++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index b71e8f7529a4..e4f37dc83cf0 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -16,7 +16,7 @@ struct pci_driver *uncore_pci_driver; DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_extra_dev *uncore_extra_pci_dev; -static int max_dies; +int __uncore_max_dies; /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -108,7 +108,7 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu * The unsigned check also catches the '-1' return value for non * existent mappings in the topology map. */ - return dieid < max_dies ? pmu->boxes[dieid] : NULL; + return dieid < uncore_max_dies() ? pmu->boxes[dieid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -879,7 +879,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int die; - for (die = 0; die < max_dies; die++) + for (die = 0; die < uncore_max_dies(); die++) kfree(pmu->boxes[die]); kfree(pmu->boxes); } @@ -920,7 +920,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) if (!pmus) return -ENOMEM; - size = max_dies * sizeof(struct intel_uncore_box *); + size = uncore_max_dies() * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { pmus[i].func_id = setid ? 
i : -1; @@ -1120,7 +1120,7 @@ static int __init uncore_pci_init(void) size_t size; int ret; - size = max_dies * sizeof(struct pci_extra_dev); + size = uncore_max_dies() * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if (!uncore_extra_pci_dev) { ret = -ENOMEM; @@ -1547,7 +1547,8 @@ static int __init intel_uncore_init(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; - max_dies = topology_max_packages() * topology_max_die_per_package(); + __uncore_max_dies = + topology_max_packages() * topology_max_die_per_package(); uncore_init = (struct intel_uncore_init_fun *)id->driver_data; if (uncore_init->pci_init) { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 8f2b77d27513..8621b66c49ef 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -181,6 +181,9 @@ int uncore_pcibus_to_physid(struct pci_bus *bus); ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); +extern int __uncore_max_dies; +#define uncore_max_dies() (__uncore_max_dies) + #define INTEL_UNCORE_EVENT_DESC(_name, _config)\ { \ .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ -- 2.19.1
[PATCH v10 3/3] perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping
From: Roman Sudarikov Current version supports a server line starting with the Intel® Xeon® Processor Scalable Family and introduces mapping for IIO Uncore units only. Other units can be added on demand. IIO stack to PMON mapping is exposed through: /sys/devices/uncore_iio_<pmu_idx>/dieX where dieX is a file which holds "Segment:Root Bus" for PCIe root port, which can be monitored by that IIO PMON block. Details are explained in Documentation/ABI/testing/sysfs-devices-mapping Reported-by: kbuild test robot Co-developed-by: Alexander Antonov Signed-off-by: Alexander Antonov Signed-off-by: Roman Sudarikov Reviewed-by: Kan Liang Reviewed-by: Alexander Shishkin --- .../ABI/testing/sysfs-devices-mapping | 33 +++ arch/x86/events/intel/uncore.h| 9 + arch/x86/events/intel/uncore_snbep.c | 191 ++ 3 files changed, 233 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-devices-mapping diff --git a/Documentation/ABI/testing/sysfs-devices-mapping b/Documentation/ABI/testing/sysfs-devices-mapping new file mode 100644 index 000000000000..490ccfd67f12 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-mapping @@ -0,0 +1,33 @@ +What: /sys/devices/uncore_iio_x/dieX +Date: February 2020 +Contact:Roman Sudarikov +Description: +Each IIO stack (PCIe root port) has its own IIO PMON block, so +each dieX file (where X is die number) holds "Segment:Root Bus" +for PCIe root port, which can be monitored by that IIO PMON +block. 
+For example, on 4-die Xeon platform with up to 6 IIO stacks per +die and, therefore, 6 IIO PMON blocks per die, the mapping of +IIO PMON block 0 exposes as the following: + +$ ls /sys/devices/uncore_iio_0/die* +-r--r--r-- /sys/devices/uncore_iio_0/die0 +-r--r--r-- /sys/devices/uncore_iio_0/die1 +-r--r--r-- /sys/devices/uncore_iio_0/die2 +-r--r--r-- /sys/devices/uncore_iio_0/die3 + +$ tail /sys/devices/uncore_iio_0/die* +==> /sys/devices/uncore_iio_0/die0 <== +0000:00 +==> /sys/devices/uncore_iio_0/die1 <== +0000:40 +==> /sys/devices/uncore_iio_0/die2 <== +0000:80 +==> /sys/devices/uncore_iio_0/die3 <== +0000:c0 + +Which means: +IIO PMU 0 on die 0 belongs to PCI RP on bus 0x00, domain 0x0000 +IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000 +IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000 +IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000 diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 8621b66c49ef..61a7eaa81224 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -181,6 +181,15 @@ int uncore_pcibus_to_physid(struct pci_bus *bus); ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); +static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev) +{ + return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu); +} + +#define to_device_attribute(n) container_of(n, struct device_attribute, attr) +#define to_dev_ext_attribute(n)container_of(n, struct dev_ext_attribute, attr) +#define attr_to_ext_attr(n)to_dev_ext_attribute(to_device_attribute(n)) + extern int __uncore_max_dies; #define uncore_max_dies() (__uncore_max_dies) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 07652fa20ebb..0b64d2d85ad8 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -273,6 +273,30 @@ #define SKX_CPUNODEID 0xc0 #define SKX_GIDNIDMAP 0xd4 +/* + * The 
CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR + * that BIOS programmed. MSR has package scope. + * | Bit | Default | Description + * | [63] |00h| VALID - When set, indicates the CPU bus + * numbers have been initialized. (RO) + * |[62:48]|---| Reserved + * |[47:40]|00h| BUS_NUM_5 — Return the bus number BIOS assigned + * CPUBUSNO(5). (RO) + * |[39:32]|00h| BUS_NUM_4 — Return the bus number BIOS assigned + * CPUBUSNO(4). (RO) + * |[31:24]|00h| BUS_NUM_3 — Return the bus number BIOS assigned + * CPUBUSNO(3). (RO) + * |[23:16]|00h| BUS_NUM_2 — Return the bus number BIOS assigned + * CPUBUSNO(2). (RO) + * |[15:8] |00h| BUS_NUM_1 — Return the bus number BIOS assigned + *
[PATCH v10 0/3] perf x86: Exposing IO stack to IO PMON mapping through sysfs
From: Alexander Antonov The previous version can be found at: v9: https://lkml.kernel.org/r/20200525080554.21313-1-alexander.anto...@linux.intel.com/ Changes in this revision are: v9 -> v10: - Addressed comment from CI Test Service: 1. Fixed coding style issues (old style declaration) The previous version can be found at: v8: https://lkml.kernel.org/r/20200320073110.4761-1-roman.sudari...@linux.intel.com/ Changes in this revision are: v8 -> v9: - Addressed comments from Alexander Shishkin: 1. Improved comments and commit messages 2. Replacing "0444" with S_IRUGO results in the following checkpatch warning: "Symbolic permissions 'S_IRUGO' are not preferred. Consider using octal permissions '0444'". Thus keeping 0444 for now. Also see: https://lkml.org/lkml/2016/8/2/1945 The previous version can be found at: v7: https://lkml.kernel.org/r/20200303135418.9621-1-roman.sudari...@linux.intel.com/ Changes in this revision are: v7 -> v8: - Addressed comments from Kan Liang: 1. Fixed coding style issues (gotos in error path, comments style) The previous version can be found at: v6: https://lkml.kernel.org/r/20200213150148.5627-1-roman.sudari...@linux.intel.com/ Changes in this revision are: v6 -> v7: - Addressed comments from Greg Kroah-Hartman: 1. Added proper handling of load/unload path 2. Simplified the mapping attribute show procedure by using the segment value of the first available root bus for all mapping attributes, which is safe because the current implementation supports single-segment configurations only 3. Fixed coding style issues (extra lines, gotos in error path, macros etc) The previous version can be found at: v5: https://lkml.kernel.org/r/20200211161549.19828-1-roman.sudari...@linux.intel.com/ Changes in this revision are: v5 -> v6: 1. Changed the mapping attribute name to "dieX" 2. Called sysfs_attr_init() prior to dynamically creating the mapping attrs 3. Removed redundant "empty" attribute 4. 
Got an agreement on the mapping attribute format The previous version can be found at: v4: https://lkml.kernel.org/r/20200117133759.5729-1-roman.sudari...@linux.intel.com/ Changes in this revision are: v4 -> v5: - Addressed comments from Greg Kroah-Hartman: 1. Using the attr_update flow for newly introduced optional attributes 2. No subfolder, optional attributes are created the same level as 'cpumask' 3. No symlinks, optional attributes are created as files 4. Single file for each IIO PMON block to node mapping 5. Added Documentation/ABI/sysfs-devices-mapping The previous version can be found at: v3: https://lkml.kernel.org/r/20200113135444.12027-1-roman.sudari...@linux.intel.com Changes in this revision are: v3 -> v4: - Addressed comments from Greg Kroah-Hartman: 1. Reworked handling of newly introduced attribute. 2. Required Documentation update is expected in the follow up patchset The previous version can be found at: v2: https://lkml.kernel.org/r/20191210091451.6054-1-roman.sudari...@linux.intel.com Changes in this revision are: v2 -> v3: 1. Addressed comments from Peter and Kan The previous version can be found at: v1: https://lkml.kernel.org/r/20191126163630.17300-1-roman.sudari...@linux.intel.com Changes in this revision are: v1 -> v2: 1. Fixed process related issues; 2. This patch set includes kernel support for IIO stack to PMON mapping; 3. Stephane raised concerns regarding output format which may require code changes in the user space part of the feature only. We will continue output format discussion in the context of user space update. Intel® Xeon® Scalable processor family (code name Skylake-SP) makes significant changes in the integrated I/O (IIO) architecture. The new solution introduces IIO stacks which are responsible for managing traffic between the PCIe domain and the Mesh domain. Each IIO stack has its own PMON block and can handle either DMI port, x16 PCIe root port, MCP-Link or various built-in accelerators. 
IIO PMON blocks allow concurrent monitoring of I/O flows up to 4 x4 bifurcation within each IIO stack. Software is supposed to program required perf counters within each IIO stack and gather performance data. The tricky thing here is that IIO PMON reports data per IIO stack but users have no idea what IIO stacks are - they only know devices which are connected to the platform. Understanding IIO stack concept to find which IIO stack that particular IO device is connected to, or to identify an IIO PMON block to program for monitoring specific IIO stack assumes a lot of implicit knowledge about given Intel server platform architecture. This patch set introduces: 1. An infrastructure for exposing an Uncore unit to Uncore PMON mapping through sysfs-backend; 2. A new --iiostat mode in perf stat to provide I/O performance metrics per I/O device. Usage examples: 1. List all d
[RESEND PATCH v9 1/3] perf/x86/intel/uncore: Expose an Uncore unit to PMON mapping
From: Roman Sudarikov Each Uncore unit type, by its nature, can be mapped to its own context - which platform component each PMON block of that type is supposed to monitor. Intel® Xeon® Scalable processor family (code name Skylake-SP) makes significant changes in the integrated I/O (IIO) architecture. The new solution introduces IIO stacks which are responsible for managing traffic between the PCIe domain and the Mesh domain. Each IIO stack has its own PMON block and can handle either DMI port, x16 PCIe root port, MCP-Link or various built-in accelerators. IIO PMON blocks allow concurrent monitoring of I/O flows up to 4 x4 bifurcation within each IIO stack. Software is supposed to program required perf counters within each IIO stack and gather performance data. The tricky thing here is that IIO PMON reports data per IIO stack but users have no idea what IIO stacks are - they only know devices which are connected to the platform. Understanding IIO stack concept to find which IIO stack that particular IO device is connected to, or to identify an IIO PMON block to program for monitoring specific IIO stack assumes a lot of implicit knowledge about given Intel server platform architecture. 
Usage example: ls /sys/devices/uncore_<type>_<pmu_idx>/die* Co-developed-by: Alexander Antonov Signed-off-by: Alexander Antonov Signed-off-by: Roman Sudarikov Reviewed-by: Kan Liang Reviewed-by: Alexander Shishkin --- arch/x86/events/intel/uncore.c | 8 arch/x86/events/intel/uncore.h | 12 2 files changed, 20 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index cf76d6631afa..b71e8f7529a4 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -843,10 +843,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .read = uncore_pmu_event_read, .module = THIS_MODULE, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .attr_update= pmu->type->attr_update, }; } else { pmu->pmu = *pmu->type->pmu; pmu->pmu.attr_groups = pmu->type->attr_groups; + pmu->pmu.attr_update = pmu->type->attr_update; } if (pmu->type->num_boxes == 1) { @@ -887,6 +889,9 @@ static void uncore_type_exit(struct intel_uncore_type *type) struct intel_uncore_pmu *pmu = type->pmus; int i; + if (type->cleanup_mapping) + type->cleanup_mapping(type); + if (pmu) { for (i = 0; i < type->num_boxes; i++, pmu++) { uncore_pmu_unregister(pmu); @@ -954,6 +959,9 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) type->pmu_group = &uncore_pmu_attr_group; + if (type->set_mapping) + type->set_mapping(type); + return 0; err: diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 0da4a4605536..8f2b77d27513 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -72,7 +72,19 @@ struct intel_uncore_type { struct uncore_event_desc *event_descs; struct freerunning_counters *freerunning; const struct attribute_group *attr_groups[4]; + const struct attribute_group **attr_update; struct pmu *pmu; /* for custom pmu ops */ + /* +* Uncore PMU would store relevant platform topology configuration here +* to identify which platform component each PMON block of that type is +* supposed to 
monitor. +*/ + u64 *topology; + /* +* Optional callbacks for managing mapping of Uncore units to PMONs +*/ + int (*set_mapping)(struct intel_uncore_type *type); + void (*cleanup_mapping)(struct intel_uncore_type *type); }; #define pmu_group attr_groups[0] -- 2.19.1
[RESEND PATCH v9 0/3] perf x86: Exposing IO stack to IO PMON mapping through sysfs
From: Alexander Antonov The previous version can be found at: v8: https://lkml.kernel.org/r/20200320073110.4761-1-roman.sudari...@linux.intel.com/ Changes in this revision are: v8 -> v9: - Addressed comments from Alexander Shishkin: 1. Improved comments and commit messages 2. Replacing "0444" with S_IRUGO results in the following checkpatch warning: "Symbolic permissions 'S_IRUGO' are not preferred. Consider using octal permissions '0444'". Thus keeping 0444 for now. Also see: https://lkml.org/lkml/2016/8/2/1945 The previous version can be found at: v7: https://lkml.kernel.org/r/20200303135418.9621-1-roman.sudari...@linux.intel.com/ Changes in this revision are: v7 -> v8: - Addressed comments from Kan Liang: 1. Fixed coding style issues (gotos in error path, comments style) The previous version can be found at: v6: https://lkml.kernel.org/r/20200213150148.5627-1-roman.sudari...@linux.intel.com/ Changes in this revision are: v6 -> v7: - Addressed comments from Greg Kroah-Hartman: 1. Added proper handling of load/unload path 2. Simplified the mapping attribute show procedure by using the segment value of the first available root bus for all mapping attributes, which is safe because the current implementation supports single-segment configurations only 3. Fixed coding style issues (extra lines, gotos in error path, macros etc) The previous version can be found at: v5: https://lkml.kernel.org/r/20200211161549.19828-1-roman.sudari...@linux.intel.com/ Changes in this revision are: v5 -> v6: 1. Changed the mapping attribute name to "dieX" 2. Called sysfs_attr_init() prior to dynamically creating the mapping attrs 3. Removed redundant "empty" attribute 4. Got an agreement on the mapping attribute format The previous version can be found at: v4: https://lkml.kernel.org/r/20200117133759.5729-1-roman.sudari...@linux.intel.com/ Changes in this revision are: v4 -> v5: - Addressed comments from Greg Kroah-Hartman: 1. 
Using the attr_update flow for newly introduced optional attributes 2. No subfolder, optional attributes are created the same level as 'cpumask' 3. No symlinks, optional attributes are created as files 4. Single file for each IIO PMON block to node mapping 5. Added Documentation/ABI/sysfs-devices-mapping The previous version can be found at: v3: https://lkml.kernel.org/r/20200113135444.12027-1-roman.sudari...@linux.intel.com Changes in this revision are: v3 -> v4: - Addressed comments from Greg Kroah-Hartman: 1. Reworked handling of newly introduced attribute. 2. Required Documentation update is expected in the follow up patchset The previous version can be found at: v2: https://lkml.kernel.org/r/20191210091451.6054-1-roman.sudari...@linux.intel.com Changes in this revision are: v2 -> v3: 1. Addressed comments from Peter and Kan The previous version can be found at: v1: https://lkml.kernel.org/r/20191126163630.17300-1-roman.sudari...@linux.intel.com Changes in this revision are: v1 -> v2: 1. Fixed process related issues; 2. This patch set includes kernel support for IIO stack to PMON mapping; 3. Stephane raised concerns regarding output format which may require code changes in the user space part of the feature only. We will continue output format discussion in the context of user space update. Intel® Xeon® Scalable processor family (code name Skylake-SP) makes significant changes in the integrated I/O (IIO) architecture. The new solution introduces IIO stacks which are responsible for managing traffic between the PCIe domain and the Mesh domain. Each IIO stack has its own PMON block and can handle either DMI port, x16 PCIe root port, MCP-Link or various built-in accelerators. IIO PMON blocks allow concurrent monitoring of I/O flows up to 4 x4 bifurcation within each IIO stack. Software is supposed to program required perf counters within each IIO stack and gather performance data. 
The tricky thing here is that IIO PMON reports data per IIO stack but users have no idea what IIO stacks are - they only know devices which are connected to the platform. Understanding IIO stack concept to find which IIO stack that particular IO device is connected to, or to identify an IIO PMON block to program for monitoring specific IIO stack assumes a lot of implicit knowledge about given Intel server platform architecture. This patch set introduces: 1. An infrastructure for exposing an Uncore unit to Uncore PMON mapping through sysfs-backend; 2. A new --iiostat mode in perf stat to provide I/O performance metrics per I/O device. Usage examples: 1. List all devices below IIO stacks ./perf stat --iiostat=show Sample output w/o libpci: S0-RootPort0-uncore_iio_0<00:00.0> S1-RootPort0-uncore_iio_0<81:00.0> S0-RootPort1-uncore_iio_1<18:00.0> S1-RootPort1-uncore_iio_1<86:00.0> S1-RootPort1-un
[RESEND PATCH v9 3/3] perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping
From: Roman Sudarikov Current version supports a server line starting with the Intel® Xeon® Processor Scalable Family and introduces mapping for IIO Uncore units only. Other units can be added on demand. IIO stack to PMON mapping is exposed through: /sys/devices/uncore_iio_<pmu_idx>/dieX where dieX is a file which holds "Segment:Root Bus" for PCIe root port, which can be monitored by that IIO PMON block. Details are explained in Documentation/ABI/testing/sysfs-devices-mapping Co-developed-by: Alexander Antonov Signed-off-by: Alexander Antonov Signed-off-by: Roman Sudarikov Reviewed-by: Kan Liang Reviewed-by: Alexander Shishkin --- .../ABI/testing/sysfs-devices-mapping | 33 +++ arch/x86/events/intel/uncore.h| 9 + arch/x86/events/intel/uncore_snbep.c | 191 ++ 3 files changed, 233 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-devices-mapping diff --git a/Documentation/ABI/testing/sysfs-devices-mapping b/Documentation/ABI/testing/sysfs-devices-mapping new file mode 100644 index 000000000000..490ccfd67f12 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-mapping @@ -0,0 +1,33 @@ +What: /sys/devices/uncore_iio_x/dieX +Date: February 2020 +Contact:Roman Sudarikov +Description: +Each IIO stack (PCIe root port) has its own IIO PMON block, so +each dieX file (where X is die number) holds "Segment:Root Bus" +for PCIe root port, which can be monitored by that IIO PMON +block. 
+For example, on 4-die Xeon platform with up to 6 IIO stacks per +die and, therefore, 6 IIO PMON blocks per die, the mapping of +IIO PMON block 0 exposes as the following: + +$ ls /sys/devices/uncore_iio_0/die* +-r--r--r-- /sys/devices/uncore_iio_0/die0 +-r--r--r-- /sys/devices/uncore_iio_0/die1 +-r--r--r-- /sys/devices/uncore_iio_0/die2 +-r--r--r-- /sys/devices/uncore_iio_0/die3 + +$ tail /sys/devices/uncore_iio_0/die* +==> /sys/devices/uncore_iio_0/die0 <== +0000:00 +==> /sys/devices/uncore_iio_0/die1 <== +0000:40 +==> /sys/devices/uncore_iio_0/die2 <== +0000:80 +==> /sys/devices/uncore_iio_0/die3 <== +0000:c0 + +Which means: +IIO PMU 0 on die 0 belongs to PCI RP on bus 0x00, domain 0x0000 +IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000 +IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000 +IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000 diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 8621b66c49ef..61a7eaa81224 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -181,6 +181,15 @@ int uncore_pcibus_to_physid(struct pci_bus *bus); ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); +static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev) +{ + return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu); +} + +#define to_device_attribute(n) container_of(n, struct device_attribute, attr) +#define to_dev_ext_attribute(n)container_of(n, struct dev_ext_attribute, attr) +#define attr_to_ext_attr(n)to_dev_ext_attribute(to_device_attribute(n)) + extern int __uncore_max_dies; #define uncore_max_dies() (__uncore_max_dies) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 07652fa20ebb..8cd3539028ae 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -273,6 +273,30 @@ #define SKX_CPUNODEID 0xc0 #define SKX_GIDNIDMAP 0xd4 +/* + * The 
CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR + * that BIOS programmed. MSR has package scope. + * | Bit | Default | Description + * | [63] |00h| VALID - When set, indicates the CPU bus + * numbers have been initialized. (RO) + * |[62:48]|---| Reserved + * |[47:40]|00h| BUS_NUM_5 — Return the bus number BIOS assigned + * CPUBUSNO(5). (RO) + * |[39:32]|00h| BUS_NUM_4 — Return the bus number BIOS assigned + * CPUBUSNO(4). (RO) + * |[31:24]|00h| BUS_NUM_3 — Return the bus number BIOS assigned + * CPUBUSNO(3). (RO) + * |[23:16]|00h| BUS_NUM_2 — Return the bus number BIOS assigned + * CPUBUSNO(2). (RO) + * |[15:8] |00h| BUS_NUM_1 — Return the bus number BIOS assigned + * CPUBUSNO(1). (RO) + *
[RESEND PATCH v9 2/3] perf/x86/intel/uncore: Wrap the max dies calculation into an accessor
From: Roman Sudarikov The accessor to return number of dies on the platform. Co-developed-by: Alexander Antonov Signed-off-by: Alexander Antonov Signed-off-by: Roman Sudarikov Reviewed-by: Kan Liang Reviewed-by: Alexander Shishkin --- arch/x86/events/intel/uncore.c | 13 +++-- arch/x86/events/intel/uncore.h | 3 +++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index b71e8f7529a4..e4f37dc83cf0 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -16,7 +16,7 @@ struct pci_driver *uncore_pci_driver; DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_extra_dev *uncore_extra_pci_dev; -static int max_dies; +int __uncore_max_dies; /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -108,7 +108,7 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu * The unsigned check also catches the '-1' return value for non * existent mappings in the topology map. */ - return dieid < max_dies ? pmu->boxes[dieid] : NULL; + return dieid < uncore_max_dies() ? pmu->boxes[dieid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -879,7 +879,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int die; - for (die = 0; die < max_dies; die++) + for (die = 0; die < uncore_max_dies(); die++) kfree(pmu->boxes[die]); kfree(pmu->boxes); } @@ -920,7 +920,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) if (!pmus) return -ENOMEM; - size = max_dies * sizeof(struct intel_uncore_box *); + size = uncore_max_dies() * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { pmus[i].func_id = setid ? 
i : -1; @@ -1120,7 +1120,7 @@ static int __init uncore_pci_init(void) size_t size; int ret; - size = max_dies * sizeof(struct pci_extra_dev); + size = uncore_max_dies() * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if (!uncore_extra_pci_dev) { ret = -ENOMEM; @@ -1547,7 +1547,8 @@ static int __init intel_uncore_init(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; - max_dies = topology_max_packages() * topology_max_die_per_package(); + __uncore_max_dies = + topology_max_packages() * topology_max_die_per_package(); uncore_init = (struct intel_uncore_init_fun *)id->driver_data; if (uncore_init->pci_init) { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 8f2b77d27513..8621b66c49ef 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -181,6 +181,9 @@ int uncore_pcibus_to_physid(struct pci_bus *bus); ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); +extern int __uncore_max_dies; +#define uncore_max_dies() (__uncore_max_dies) + #define INTEL_UNCORE_EVENT_DESC(_name, _config)\ { \ .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ -- 2.19.1