This RFC patch supports filtering container-specific events when the perf tool is executed inside a container.
Unlike previous approaches, this approach lets the user decide what is a container through a set of kernel configs. The main reason for such an approach is the lack of a container-unique identifier in the kernel and of a clear definition of what constitutes a container; any combination of the namespaces can be considered a container. Previous approaches mandated at least a PID namespace, a cgroup namespace, or a perf namespace (newly introduced to support container-aware tracing) to be part of a container. However, based on the discussions on LKML, mandating a namespace to be part of a container is not acceptable.

Hence, this patch lets the user define a container through a set of kernel configs. This patch restricts the filtering of events to perf hardware events with sample type set to PERF_SAMPLE_IDENTIFIER. Further, this patch piggybacks on the cgroups support, i.e., the patch expects processes inside a container to be grouped into a single perf_event cgroup. However, if the approach of the user deciding what is a container is acceptable, then the filtering will be extended to other events and will further be decoupled from grouping the processes into a perf_event cgroup.

Limitation:
- Two different definitions of a container cannot co-exist.

Links to earlier approaches:
- https://lwn.net/Articles/695601/
- https://lwn.net/Articles/691298/
- https://lkml.org/lkml/2015/7/15/192

Patch is based on the 4.8 kernel.

Signed-off-by: Aravinda Prasad <aravi...@linux.vnet.ibm.com>
---
 init/Kconfig         | 64 ++++++++++++++++++++++++++++++++
 kernel/events/core.c | 99 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 148 insertions(+), 15 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index cac3f09..48568f0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1720,6 +1720,70 @@ config DEBUG_PERF_USE_VMALLOC
 
 	 Say N if unsure.
 
+config PERF_NS_TRACE
+	default n
+	bool "Container-aware tracing support"
+	depends on CGROUP_PERF && NAMESPACES
+	help
+	  Enable tracing support inside a container.
+
+	  This allows filtering of container-specific events, without
+	  any change in the user interface, when perf is invoked
+	  within a container.
+
+	  As the kernel has no concept of a container, the user should
+	  select from the choices below to let the kernel identify a container.
+
+	  Say N if unsure.
+
+if PERF_NS_TRACE
+
+menu "Select the namespaces with which containers are created"
+
+config UTS_NS_TRACE
+	bool "UTS namespace"
+	depends on UTS_NS
+	default n
+	help
+	  Select if containers are created with UTS namespace
+
+config IPC_NS_TRACE
+	bool "IPC namespace"
+	depends on IPC_NS
+	default n
+	help
+	  Select if containers are created with IPC namespace
+
+config MNT_NS_TRACE
+	bool "Mount namespace"
+	default n
+	help
+	  Select if containers are created with mount namespace
+
+config PID_NS_TRACE
+	bool "PID namespace"
+	default y
+	depends on PID_NS
+	help
+	  Select if containers are created with PID namespace
+
+config NET_NS_TRACE
+	bool "Network namespace"
+	depends on NET_NS
+	default n
+	help
+	  Select if containers are created with NET namespace
+
+config CGROUPS_NS_TRACE
+	bool "Cgroup namespace"
+	default y
+	help
+	  Select if containers are created with cgroup namespace
+
+endmenu
+
+endif #PERF_NS_TRACE
+
 endmenu
 
 config VM_EVENT_COUNTERS
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fc9bb22..5920c9c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -802,23 +802,86 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
 	rcu_read_unlock();
 }
 
+#ifdef CONFIG_PERF_NS_TRACE
+/*
+ * Return true if current is outside the initial instance of every
+ * namespace selected through the *_NS_TRACE configs, i.e. it runs in
+ * a "container". Returns false when no namespace is selected.
+ */
+static inline bool is_container(void)
+{
+	bool flag = false;
+
+#ifdef CONFIG_PID_NS_TRACE
+	if (task_active_pid_ns(current) == &init_pid_ns)
+		return false;
+	flag = true;
+#endif
+#ifdef CONFIG_UTS_NS_TRACE
+	if (current->nsproxy->uts_ns == &init_uts_ns)
+		return false;
+	flag = true;
+#endif
+#ifdef CONFIG_IPC_NS_TRACE
+	if (current->nsproxy->ipc_ns == &init_ipc_ns)
+		return false;
+	flag = true;
+#endif
+#ifdef CONFIG_MNT_NS_TRACE
+	if (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns)
+		return false;
+	flag = true;
+#endif
+#ifdef CONFIG_NET_NS_TRACE
+	if (current->nsproxy->net_ns == &init_net)
+		return false;
+	flag = true;
+#endif
+#ifdef CONFIG_CGROUPS_NS_TRACE
+	if (current->nsproxy->cgroup_ns == &init_cgroup_ns)
+		return false;
+	flag = true;
+#endif
+	return flag;
+}
+#endif /* CONFIG_PERF_NS_TRACE */
+
 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 				      struct perf_event_attr *attr,
 				      struct perf_event *group_leader)
 {
 	struct perf_cgroup *cgrp;
 	struct cgroup_subsys_state *css;
-	struct fd f = fdget(fd);
+	struct fd f;
 	int ret = 0;
 
-	if (!f.file)
-		return -EBADF;
+	if (fd != -1) {
+		f = fdget(fd);
+		if (!f.file)
+			return -EBADF;
 
-	css = css_tryget_online_from_dir(f.file->f_path.dentry,
-					 &perf_event_cgrp_subsys);
-	if (IS_ERR(css)) {
-		ret = PTR_ERR(css);
-		goto out;
+		css = css_tryget_online_from_dir(f.file->f_path.dentry,
+						 &perf_event_cgrp_subsys);
+		if (IS_ERR(css)) {
+			ret = PTR_ERR(css);
+			fdput(f);
+			return ret;
+		}
+#ifdef CONFIG_PERF_NS_TRACE
+	} else if (event->attach_state & PERF_ATTACH_TASK) {
+		/* Tracing on a PID. No need to set event->cgrp */
+		return ret;
+	} else if (is_container()) {
+		css = task_css(current, perf_event_cgrp_id);
+		if (!css || !css_tryget_online(css))
+			return -ENOENT;
+#endif /* CONFIG_PERF_NS_TRACE */
+	} else {
+		/*
+		 * No cgroup fd and not inside a container: don't set
+		 * event->cgrp as all the events should be included
+		 */
+		return ret;
 	}
 
 	cgrp = container_of(css, struct perf_cgroup, css);
@@ -833,8 +896,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 		perf_detach_cgroup(event);
 		ret = -EINVAL;
 	}
-out:
-	fdput(f);
+	if (fd != -1)
+		fdput(f);
+
 	return ret;
 }
 
@@ -9059,11 +9123,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (!has_branch_stack(event))
 		event->attr.branch_sample_type = 0;
 
-	if (cgroup_fd != -1) {
-		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
-		if (err)
-			goto err_ns;
-	}
+	err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+	if (err)
+		goto err_ns;
 
 	pmu = perf_init_event(event);
 	if (!pmu)
@@ -9404,6 +9466,13 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EACCES;
 	}
 
+#ifdef CONFIG_PERF_NS_TRACE
+	if (is_container() && !(attr.type == PERF_TYPE_HARDWARE &&
+		attr.sample_type == PERF_SAMPLE_IDENTIFIER)) {
+		return -EACCES;
+	}
+#endif
+
 	if (attr.freq) {
 		if (attr.sample_freq > sysctl_perf_event_sample_rate)
 			return -EINVAL;