Allow unprivileged users to trace their own processes' syscalls using perf trace, similar to strace but without the intrusive overhead of ptrace().
Currently, perf trace requires CAP_PERFMON or a perf_event_paranoid level ≤ 1 even though the kernel has existing infrastructure (TRACE_EVENT_FL_CAP_ANY) specifically designed to mark syscall tracepoints as safe for unprivileged access. To fix this: 1. Loosen the condition in perf_event_open() that requires privileges for all events with exclude_kernel=0. This allows perf_event_open() to bypass the paranoid check for task-attached tracepoint events. Ensure that sample types which can expose kernel addresses to unprivileged users are blocked. 2. Make the format and id tracefs files world-readable only for tracepoints with TRACE_EVENT_FL_CAP_ANY, allowing unprivileged users to see syscall tracepoint ids without exposing sensitive information. Also add a check to perf_trace_event_perm() to ensure only TRACE_EVENT_FL_CAP_ANY events can be traced by unprivileged users. Example usage after this change: $ perf trace ls # works as unprivileged user $ perf trace # system-wide, still requires privileges $ perf trace -p 1234 # requires ptrace permission on pid 1234 Assisted-by: Claude:claude-sonnet-4.5 Signed-off-by: Anubhav Shelat <[email protected]> --- Changes in v2: - Add check to block sample types that bypass KASLR, suggested by sashiko.
- Link to v1: https://lore.kernel.org/linux-perf-users/[email protected]/ --- kernel/events/core.c | 22 +++++++++++++++++++--- kernel/trace/trace_event_perf.c | 12 +++++++++++- kernel/trace/trace_events.c | 8 ++++++-- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 89b40e439717..db8c674704b2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -13834,9 +13834,25 @@ SYSCALL_DEFINE5(perf_event_open, return err; if (!attr.exclude_kernel) { - err = perf_allow_kernel(); - if (err) - return err; + bool tp_bypass = false; + + if (attr.type == PERF_TYPE_TRACEPOINT && pid != -1) { + /* + * Block sample types that expose kernel addresses to + * prevent KASLR bypass + */ + u64 kaddr_leak = PERF_SAMPLE_CALLCHAIN | + PERF_SAMPLE_BRANCH_STACK | + PERF_SAMPLE_ADDR; + + tp_bypass = !(attr.sample_type & kaddr_leak); + } + + if (!tp_bypass) { + err = perf_allow_kernel(); + if (err) + return err; + } } if (attr.namespaces) { diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a6bb7577e8c5..e8347df7ede5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -73,8 +73,18 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, } /* No tracing, just counting, so no obvious leak */ - if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) + if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) { + /* + * Only allow CAP_ANY tracepoints for unprivileged + * task-attached events in case kernel context is exposed. + */ + if (!p_event->attr.exclude_kernel && !perfmon_capable()) { + if (!(p_event->attach_state == PERF_ATTACH_TASK && + (tp_event->flags & TRACE_EVENT_FL_CAP_ANY))) + return -EACCES; + } return 0; + } /* Some events are ok to be traced by non-root users... 
*/ if (p_event->attach_state == PERF_ATTACH_TASK) { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 249d1cba72c0..6250b2529376 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3051,7 +3051,9 @@ static int event_callback(const char *name, umode_t *mode, void **data, struct trace_event_call *call = file->event_call; if (strcmp(name, "format") == 0) { - *mode = TRACE_MODE_READ; + *mode = (call->flags & TRACE_EVENT_FL_CAP_ANY) ? + (TRACE_MODE_READ | 0004) : + TRACE_MODE_READ; *fops = &ftrace_event_format_fops; return 1; } @@ -3087,7 +3089,9 @@ static int event_callback(const char *name, umode_t *mode, void **data, #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg && strcmp(name, "id") == 0) { - *mode = TRACE_MODE_READ; + *mode = (call->flags & TRACE_EVENT_FL_CAP_ANY) ? + (TRACE_MODE_READ | 0004) : + TRACE_MODE_READ; *data = (void *)(long)call->event.type; *fops = &ftrace_event_id_fops; return 1; -- 2.53.0
