This RFC patch adds support for filtering container-specific events
when the perf tool is executed inside a container.

Unlike previous approaches, this approach lets the user
decide what is a container through a set of kernel configs.
The main reason for this approach is the lack of a
container-unique identifier in the kernel and of a clear
definition of what constitutes a container; any combination
of namespaces can be considered a container.

Previous approaches mandated at least a PID namespace, a
cgroup namespace, or a perf namespace (newly introduced
to support container-aware tracing) to be part of a container.
However, based on the discussions in LKML, mandating a
namespace to be a part of a container is not acceptable.
Hence, this patch lets the user define a container
through a set of kernel configs.

This patch restricts the filtering of events to perf hardware
events with sample type set to PERF_SAMPLE_IDENTIFIER.
Further, this patch piggybacks on the cgroups support, i.e.,
the patch expects processes inside a container to be grouped
into a single perf_event cgroup.

However, if the approach of letting the user decide what a
container is proves acceptable, the filtering will be extended
to other events and decoupled from grouping the processes
into a perf_event cgroup.

Limitation:
  - Two different definitions of a container cannot co-exist.

Links to earlier approaches:
  - https://lwn.net/Articles/695601/
  - https://lwn.net/Articles/691298/
  - https://lkml.org/lkml/2015/7/15/192

Patch is based on 4.8 kernel

Signed-off-by: Aravinda Prasad <aravi...@linux.vnet.ibm.com>
---
 init/Kconfig         |   64 ++++++++++++++++++++++++++++++++
 kernel/events/core.c |   99 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 148 insertions(+), 15 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index cac3f09..48568f0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1720,6 +1720,70 @@ config DEBUG_PERF_USE_VMALLOC
 
         Say N if unsure.
 
+config PERF_NS_TRACE
+       default n
+       bool "Container-aware tracing support"
+       depends on CGROUPS && NAMESPACES
+       help
+        Enable tracing support inside a container.
+
+        This allows filtering of container-specific events, without
+        any change in the user interface, when perf is invoked
+        within a container.
+
+        As the kernel has no concept of a container, the user should
+        select from the options below to let the kernel identify a container.
+
+        Say N if unsure.
+
+if PERF_NS_TRACE
+
+menu "Select the namespaces with which containers are created"
+
+config UTS_NS_TRACE
+       bool "UTS namespace"
+       depends on UTS_NS
+       default n
+       help
+        Select if containers are created with a UTS namespace.
+
+config IPC_NS_TRACE
+       bool "IPC namespace"
+       depends on IPC_NS
+       default n
+       help
+        Select if containers are created with an IPC namespace.
+
+config MNT_NS_TRACE
+       bool "Mount namespace"
+       default n
+       help
+        Select if containers are created with a mount namespace.
+
+config PID_NS_TRACE
+       bool "PID Namespaces"
+       default y
+       depends on PID_NS
+       help
+        Select if containers are created with a PID namespace.
+
+config NET_NS_TRACE
+       bool "Network namespace"
+       depends on NET_NS
+       default n
+       help
+        Select if containers are created with a network namespace.
+
+config CGROUPS_NS_TRACE
+       bool "Cgroup namespace"
+       default y
+       help
+        Select if containers are created with a cgroup namespace.
+
+endmenu
+
+endif #PERF_NS_TRACE
+
 endmenu
 
 config VM_EVENT_COUNTERS
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fc9bb22..5920c9c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -802,23 +802,86 @@ static inline void perf_cgroup_sched_in(struct 
task_struct *prev,
        rcu_read_unlock();
 }
 
+#ifdef CONFIG_PERF_NS_TRACE
+static inline bool is_container(void) /* is current outside the init namespace of every kind selected by the *_NS_TRACE configs? */
+{
+       bool flag = 0; /* stays 0 (not a container) when no *_NS_TRACE option is enabled */
+#ifdef CONFIG_PID_NS_TRACE
+       if (task_active_pid_ns(current) == &init_pid_ns)
+               return 0; /* still in the initial PID namespace */
+       else
+               flag = 1;
+#endif
+#ifdef CONFIG_UTS_NS_TRACE
+       if (current->nsproxy->uts_ns == &init_uts_ns)
+               return 0; /* still in the initial UTS namespace */
+       else
+               flag = 1;
+#endif
+#ifdef CONFIG_IPC_NS_TRACE
+       if (current->nsproxy->ipc_ns == &init_ipc_ns)
+               return 0; /* still in the initial IPC namespace */
+       else
+               flag = 1;
+#endif
+#ifdef CONFIG_MNT_NS_TRACE
+       if (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns)
+               return 0; /* shares init_task's mount namespace */
+       else
+               flag = 1;
+#endif
+#ifdef CONFIG_NET_NS_TRACE
+       if (current->nsproxy->net_ns == &init_net)
+               return 0; /* still in the initial network namespace */
+       else
+               flag = 1;
+#endif
+#ifdef CONFIG_CGROUPS_NS_TRACE
+       if (current->nsproxy->cgroup_ns == &init_cgroup_ns)
+               return 0; /* still in the initial cgroup namespace */
+       else
+               flag = 1;
+#endif
+       return flag; /* 1 only if at least one check was compiled in and none matched an init namespace */
+}
+#endif /* #ifdef CONFIG_PERF_NS_TRACE */
+
 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
 {
        struct perf_cgroup *cgrp;
        struct cgroup_subsys_state *css;
-       struct fd f = fdget(fd);
+       struct fd f;
        int ret = 0;
 
-       if (!f.file)
-               return -EBADF;
+       if (fd != -1) {
+               f = fdget(fd);
+               if (!f.file)
+                       return -EBADF;
 
-       css = css_tryget_online_from_dir(f.file->f_path.dentry,
-                                        &perf_event_cgrp_subsys);
-       if (IS_ERR(css)) {
-               ret = PTR_ERR(css);
-               goto out;
+               css = css_tryget_online_from_dir(f.file->f_path.dentry,
+                                                &perf_event_cgrp_subsys);
+               if (IS_ERR(css)) {
+                       ret = PTR_ERR(css);
+                       fdput(f);
+                       return ret;
+               }
+#ifdef CONFIG_PERF_NS_TRACE
+       } else if (event->attach_state == PERF_ATTACH_TASK) {
+               /* Tracing on a PID. No need to set event->cgrp */
+               return ret;
+       } else if (is_container()) {
+               css = task_css(current, perf_event_cgrp_id);
+               if (!css || !css_tryget_online(css))
+                       return -ENOENT;
+       } else {
+               /*
+                * perf invoked from global context and hence don't set
+                * event->cgrp as all the events should be included
+                */
+               return ret;
+#endif /* #ifdef CONFIG_PERF_NS_TRACE */
        }
 
        cgrp = container_of(css, struct perf_cgroup, css);
@@ -833,8 +896,9 @@ static inline int perf_cgroup_connect(int fd, struct 
perf_event *event,
                perf_detach_cgroup(event);
                ret = -EINVAL;
        }
-out:
-       fdput(f);
+       if (fd != -1)
+               fdput(f);
+
        return ret;
 }
 
@@ -9059,11 +9123,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        if (!has_branch_stack(event))
                event->attr.branch_sample_type = 0;
 
-       if (cgroup_fd != -1) {
-               err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
-               if (err)
-                       goto err_ns;
-       }
+       err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+       if (err)
+               goto err_ns;
 
        pmu = perf_init_event(event);
        if (!pmu)
@@ -9404,6 +9466,13 @@ SYSCALL_DEFINE5(perf_event_open,
                        return -EACCES;
        }
 
+#ifdef CONFIG_PERF_NS_TRACE
+       if (is_container() && !(attr.type == PERF_TYPE_HARDWARE &&
+                       attr.sample_type == PERF_SAMPLE_IDENTIFIER)) {
+               return -EACCES;
+       }
+#endif
+
        if (attr.freq) {
                if (attr.sample_freq > sysctl_perf_event_sample_rate)
                        return -EINVAL;

Reply via email to