Allow unprivileged users to trace their own processes' syscalls using
perf trace, similar to strace without the intrusive overhead of ptrace().

Currently, perf trace requires CAP_PERFMON or paranoid level ≤ 1 even
though the kernel has existing infrastructure (TRACE_EVENT_FL_CAP_ANY)
specifically designed to mark syscall tracepoints as safe for
unprivileged access. To fix this:

1. Loosen the condition in perf_event_open() which requires privileges
for all events with exclude_kernel=0. This allows perf_event_open() to
bypass the paranoid check for task-attached tracepoint events. Ensure
that sample types which can expose kernel addresses to unprivileged
users are blocked.

2. Make the format and id tracefs files world-readable only for tracepoints
with TRACE_EVENT_FL_CAP_ANY, allowing unprivileged users to see syscall
tracepoint ids without exposing sensitive information.

Also add a check to perf_trace_event_perm() to ensure only
TRACE_EVENT_FL_CAP_ANY events can be traced by unprivileged users.

Example usage after this change:
  $ perf trace ls          # works as unprivileged user
  $ perf trace             # system-wide, still requires privileges
  $ perf trace -p 1234     # requires ptrace permission on pid 1234

Assisted-by: Claude:claude-sonnet-4.5
Signed-off-by: Anubhav Shelat <[email protected]>
---
Changes in v2:
- Add check to block sample types that bypass KASLR, suggested by
  sashiko.
- Link to v1: https://lore.kernel.org/linux-perf-users/[email protected]/
---
 kernel/events/core.c            | 22 +++++++++++++++++++---
 kernel/trace/trace_event_perf.c | 12 +++++++++++-
 kernel/trace/trace_events.c     |  8 ++++++--
 3 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 89b40e439717..db8c674704b2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -13834,9 +13834,25 @@ SYSCALL_DEFINE5(perf_event_open,
                return err;
 
        if (!attr.exclude_kernel) {
-               err = perf_allow_kernel();
-               if (err)
-                       return err;
+               bool tp_bypass = false;
+
+               if (attr.type == PERF_TYPE_TRACEPOINT && pid != -1) {
+                       /*
+                        * Block sample types that expose kernel addresses to
+                        * prevent KASLR bypass
+                        */
+                       u64 kaddr_leak = PERF_SAMPLE_CALLCHAIN |
+                                        PERF_SAMPLE_BRANCH_STACK |
+                                        PERF_SAMPLE_ADDR;
+
+                       tp_bypass = !(attr.sample_type & kaddr_leak);
+               }
+
+               if (!tp_bypass) {
+                       err = perf_allow_kernel();
+                       if (err)
+                               return err;
+               }
        }
 
        if (attr.namespaces) {
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a6bb7577e8c5..e8347df7ede5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -73,8 +73,18 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
        }
 
        /* No tracing, just counting, so no obvious leak */
-       if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
+       if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) {
+               /*
+                * Only allow CAP_ANY tracepoints for unprivileged
+                * task-attached events in case kernel context is exposed.
+                */
+               if (!p_event->attr.exclude_kernel && !perfmon_capable()) {
+                       if (!(p_event->attach_state == PERF_ATTACH_TASK &&
+                             (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)))
+                               return -EACCES;
+               }
                return 0;
+       }
 
        /* Some events are ok to be traced by non-root users... */
        if (p_event->attach_state == PERF_ATTACH_TASK) {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 249d1cba72c0..6250b2529376 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3051,7 +3051,9 @@ static int event_callback(const char *name, umode_t *mode, void **data,
        struct trace_event_call *call = file->event_call;
 
        if (strcmp(name, "format") == 0) {
-               *mode = TRACE_MODE_READ;
+               *mode = (call->flags & TRACE_EVENT_FL_CAP_ANY) ?
+                       (TRACE_MODE_READ | 0004) :
+                       TRACE_MODE_READ;
                *fops = &ftrace_event_format_fops;
                return 1;
        }
@@ -3087,7 +3089,9 @@ static int event_callback(const char *name, umode_t *mode, void **data,
 #ifdef CONFIG_PERF_EVENTS
        if (call->event.type && call->class->reg &&
            strcmp(name, "id") == 0) {
-               *mode = TRACE_MODE_READ;
+               *mode = (call->flags & TRACE_EVENT_FL_CAP_ANY) ?
+               (TRACE_MODE_READ | 0004) :
+               TRACE_MODE_READ;
                *data = (void *)(long)call->event.type;
                *fops = &ftrace_event_id_fops;
                return 1;
-- 
2.53.0


Reply via email to