seccomp_nack_syscall() calls syscall_rollback(), which means that the syscall exit path sees the original syscall number as the return value.
This confuses audit_syscall_exit(), trace_syscall_exit(), and ptrace,
causing them to report completely bogus syscall exit events.

Add a new SYSCALL_WORK_SECCOMP_EXIT flag set by seccomp_nack_syscall(),
and change syscall_exit_work() to return early if this flag is set.
After all, this syscall was never actually executed.

Note that syscall_exit_work() has to clear SYSCALL_WORK_SECCOMP_EXIT for
the !force_coredump case, and that is why we actually need the new flag:
seccomp_nack_syscall() can't just clear SYSCALL_AUDIT/TRACEPOINT/TRACE.

Reported-by: Max Ver <[email protected]>
Closes: https://lore.kernel.org/all/cabjjbfjo+p3ja1r0gjuzrcepqb1fab3kqxyhc_psfoqo21y...@mail.gmail.com/
Signed-off-by: Oleg Nesterov <[email protected]>
---
 include/linux/entry-common.h | 9 ++++++++-
 include/linux/thread_info.h  | 2 ++
 kernel/seccomp.c             | 4 ++++
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 535da46c3ee9..403802eed387 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -34,7 +34,8 @@
 				 SYSCALL_WORK_SYSCALL_TRACE |		\
 				 SYSCALL_WORK_SYSCALL_AUDIT |		\
 				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
-				 SYSCALL_WORK_SYSCALL_EXIT_TRAP)
+				 SYSCALL_WORK_SYSCALL_EXIT_TRAP |	\
+				 SYSCALL_WORK_SECCOMP_EXIT)
 
 /**
  * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
@@ -235,6 +236,12 @@ static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned lon
 		}
 	}
 
+	if (work & SYSCALL_WORK_SECCOMP_EXIT) {
+		/* Rejected by seccomp, no valid syscall exit state */
+		clear_syscall_work(SECCOMP_EXIT);
+		return;
+	}
+
 	audit_syscall_exit(regs);
 
 	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 051e42902690..167c850ae16e 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -40,6 +40,7 @@ enum {
 #ifdef CONFIG_GENERIC_ENTRY
 enum syscall_work_bit {
 	SYSCALL_WORK_BIT_SECCOMP,
+	SYSCALL_WORK_BIT_SECCOMP_EXIT,
 	SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT,
 	SYSCALL_WORK_BIT_SYSCALL_TRACE,
 	SYSCALL_WORK_BIT_SYSCALL_EMU,
@@ -50,6 +51,7 @@ enum syscall_work_bit {
 };
 
 #define SYSCALL_WORK_SECCOMP		BIT(SYSCALL_WORK_BIT_SECCOMP)
+#define SYSCALL_WORK_SECCOMP_EXIT	BIT(SYSCALL_WORK_BIT_SECCOMP_EXIT)
 #define SYSCALL_WORK_SYSCALL_TRACEPOINT	BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
 #define SYSCALL_WORK_SYSCALL_TRACE	BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
 #define SYSCALL_WORK_SYSCALL_EMU	BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index cb8dd78791cd..35703dceb6d2 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1262,6 +1262,10 @@ static void seccomp_nack_syscall(int this_syscall, int data, bool force_coredump
 	syscall_rollback(current, current_pt_regs());
 	/* Let the filter pass back 16 bits of data. */
 	force_sig_seccomp(this_syscall, data, force_coredump);
+#ifdef CONFIG_GENERIC_ENTRY
+	/* No valid syscall exit state after syscall_rollback() */
+	set_syscall_work(SECCOMP_EXIT);
+#endif
 }
 
 static int __seccomp_filter(int this_syscall, const bool recheck_after_trace)
-- 
2.52.0

