Currently, there is no mechanism to filter events based on containers. perf -G can be used, but it will not filter events for the containers created after perf is invoked, making it difficult to assess/analyze performance issues of multiple containers at once. This limitation can be overcome if there is a standard kernel identifier for containers.
This patch introduces a container identifier entry field in perf sample data to identify or distinguish sample data of different containers. It uses the cgroup namespace inode number of a given task as its container identifier (cid). Alternatively, the inode number of the pid namespace can also be used as cid. This patch assumes each container is created with its own cgroup namespace. Suggested-by: Ananth N Mavinakayanahalli <ana...@linux.vnet.ibm.com> Signed-off-by: Hari Bathini <hbath...@linux.vnet.ibm.com> --- Changes from v1: 1. Updated PERF_RECORD_SAMPLE comment. 2. Fixed compile issue with CONFIG_CGROUPS=n Will post the manpage update as and when this gets in.. include/linux/perf_event.h | 4 ++++ include/uapi/linux/perf_event.h | 4 +++- kernel/events/core.c | 23 +++++++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2b6b43c..4d553ee 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -908,6 +908,10 @@ struct perf_sample_data { struct perf_regs regs_intr; u64 stack_user_size; + struct { + u32 cid; + u32 reserved; + } cid_entry; } ____cacheline_aligned; /* default value for data source */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index c66a485..826b799 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -139,8 +139,9 @@ enum perf_event_sample_format { PERF_SAMPLE_IDENTIFIER = 1U << 16, PERF_SAMPLE_TRANSACTION = 1U << 17, PERF_SAMPLE_REGS_INTR = 1U << 18, + PERF_SAMPLE_CID = 1U << 19, - PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ }; /* @@ -773,6 +774,7 @@ enum perf_event_type { * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR + * { u32 cid, res; } && PERF_SAMPLE_CID * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c 
b/kernel/events/core.c index 3cfabdf..465febd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5776,6 +5776,9 @@ void perf_output_sample(struct perf_output_handle *handle, } } + if (sample_type & PERF_SAMPLE_CID) + perf_output_put(handle, data->cid_entry); + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -5909,6 +5912,26 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + + if (sample_type & PERF_SAMPLE_CID) { + int size = sizeof(u64); + + /* Container identifier for a given task */ +#ifdef CONFIG_CGROUPS + /* + * Use the task's cgroup namespace inode number. + */ + data->cid_entry.cid = current->nsproxy->cgroup_ns->ns.inum; +#else + /* + * If cgroup namespace is not enabled, + * all tasks have the same cid. + */ + data->cid_entry.cid = 0xffffffffUL; +#endif + data->cid_entry.reserved = 0; + header->size += size; + } } static void __always_inline