Allow unprivileged users to trace their own processes' syscalls using perf trace, similar to strace without the intrusive overhead of ptrace().
Currently, perf trace requires CAP_PERFMON or paranoid level ≤ 1 even though the kernel has existing infrastructure (TRACE_EVENT_FL_CAP_ANY) specifically designed to mark syscall tracepoints as safe for unprivileged access. To fix this: 1. Loosen the condition in perf_event_open() which requires privileges for all events with exclude_kernel=0. This allows perf_event_open() to bypass the paranoid check for task-attached tracepoint events. Ensure that sample types which can expose kernel addresses to unprivileged users are blocked. Ensure the PERF_SECURITY_KERNEL LSM hook is preserved. 2. Make the format and id tracefs files world-readable only for tracepoints with TRACE_EVENT_FL_CAP_ANY, allowing unprivileged users to see syscall tracepoint ids without exposing sensitive information. 3. Add a check to perf_trace_event_perm() to block PERF_SAMPLE_IP on kernel tracepoints for unprivileged users to prevent KASLR bypass. We do this here rather than in kaddr_leak because perf_trace_event_perm() can distinguish between kernel tracepoints and uprobe tracepoints, where the IP is a safe user space address and is necessary for uprobe functionality. 4. Restrict pure counting events (no PERF_SAMPLE_RAW) to TRACE_EVENT_FL_CAP_ANY tracepoints preventing unprivileged users from counting internal kernel tracepoints while preserving current behavior for exclude_kernel=1 events. Example usage after this change: $ perf trace ls # works as unprivileged user $ perf trace # system-wide, still requires privileges $ perf trace -p 1234 # requires ptrace permission on pid 1234 Assisted-by: Claude:claude-sonnet-4.5 Signed-off-by: Anubhav Shelat <[email protected]> --- kernel/events/core.c | 28 +++++++++++++++++++++++++--- kernel/trace/trace_event_perf.c | 21 ++++++++++++++++++++- kernel/trace/trace_events.c | 16 ++++++++++++++-- 3 files changed, 59 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 7935d5663944..ff2d1e9a0b79 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -13873,9 +13873,31 @@ SYSCALL_DEFINE5(perf_event_open, return err; if (!attr.exclude_kernel) { - err = perf_allow_kernel(); - if (err) - return err; + bool tp_bypass = false; + + /* Check unprivileged tracepoints */ + if (attr.type == PERF_TYPE_TRACEPOINT && pid != -1) { + /* + * Block sample types that expose kernel addresses to + * prevent KASLR bypass + */ + u64 kaddr_leak = PERF_SAMPLE_CALLCHAIN | + PERF_SAMPLE_BRANCH_STACK | + PERF_SAMPLE_ADDR | + PERF_SAMPLE_REGS_INTR; + + tp_bypass = !(attr.sample_type & kaddr_leak); + } + + if (!tp_bypass) { + err = perf_allow_kernel(); + if (err) + return err; + } else { + err = security_perf_event_open(PERF_SECURITY_KERNEL); + if (err) + return err; + } } if (attr.namespaces) { diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a6bb7577e8c5..466007ed2869 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -72,9 +72,28 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, return -EINVAL; } + /* + * PERF_SAMPLE_IP on kernel tracepoints exposes a kernel text + * address, weakening KASLR. Block for unprivileged users unless + * the tracepoint is a uprobe (userspace IP, safe to expose). + */ + if ((p_event->attr.sample_type & PERF_SAMPLE_IP) && + !p_event->attr.exclude_kernel && + !(tp_event->flags & TRACE_EVENT_FL_UPROBE) && + sysctl_perf_event_paranoid > 1 && !perfmon_capable()) + return -EACCES; + /* No tracing, just counting, so no obvious leak */ - if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) + if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) { + /* Prevent unprivileged users from counting kernel tracepoints */ + if (!p_event->attr.exclude_kernel && + sysctl_perf_event_paranoid > 1 && !perfmon_capable()) { + if (!(p_event->attach_state == PERF_ATTACH_TASK && + (tp_event->flags & TRACE_EVENT_FL_CAP_ANY))) + return -EACCES; + } return 0; + } /* Some events are ok to be traced by non-root users... */ if (p_event->attach_state == PERF_ATTACH_TASK) { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c46e623e7e0d..cbd07e2ec528 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3050,7 +3050,13 @@ static int event_callback(const char *name, umode_t *mode, void **data, struct trace_event_call *call = file->event_call; if (strcmp(name, "format") == 0) { - *mode = TRACE_MODE_READ; + /* + * Make format tracefs file world readable for tracepoints with + * TRACE_EVENT_FL_CAP_ANY + */ + *mode = (call->flags & TRACE_EVENT_FL_CAP_ANY) ? + (TRACE_MODE_READ | 0004) : + TRACE_MODE_READ; *fops = &ftrace_event_format_fops; return 1; } @@ -3086,7 +3092,13 @@ static int event_callback(const char *name, umode_t *mode, void **data, #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg && strcmp(name, "id") == 0) { - *mode = TRACE_MODE_READ; + /* + * Make id tracefs file world readable for tracepoints with + * TRACE_EVENT_FL_CAP_ANY + */ + *mode = (call->flags & TRACE_EVENT_FL_CAP_ANY) ? + (TRACE_MODE_READ | 0004) : + TRACE_MODE_READ; *data = (void *)(long)call->event.type; *fops = &ftrace_event_id_fops; return 1; -- 2.54.0
