seccomp_nack_syscall() calls syscall_rollback(), which means that the
syscall exit path sees the original syscall number as the return value.

This confuses audit_syscall_exit(), trace_syscall_exit(), and ptrace,
causing them to report completely bogus syscall exit events.

Add a new SYSCALL_WORK_SECCOMP_EXIT flag set by seccomp_nack_syscall(),
and change syscall_exit_work() to return early if this flag is set. After
all, this syscall was never actually executed.

Note that syscall_exit_work() has to clear SYSCALL_WORK_SECCOMP_EXIT for
the !force_coredump case, and that is why we actually need the new flag:
seccomp_nack_syscall() can't just clear SYSCALL_AUDIT/TRACEPOINT/TRACE.

Reported-by: Max Ver <[email protected]>
Closes: https://lore.kernel.org/all/cabjjbfjo+p3ja1r0gjuzrcepqb1fab3kqxyhc_psfoqo21y...@mail.gmail.com/
Signed-off-by: Oleg Nesterov <[email protected]>
---
 include/linux/entry-common.h | 9 ++++++++-
 include/linux/thread_info.h  | 2 ++
 kernel/seccomp.c             | 4 ++++
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 535da46c3ee9..403802eed387 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -34,7 +34,8 @@
                                 SYSCALL_WORK_SYSCALL_TRACE |           \
                                 SYSCALL_WORK_SYSCALL_AUDIT |           \
                                 SYSCALL_WORK_SYSCALL_USER_DISPATCH |   \
-                                SYSCALL_WORK_SYSCALL_EXIT_TRAP)
+                                SYSCALL_WORK_SYSCALL_EXIT_TRAP |       \
+                                SYSCALL_WORK_SECCOMP_EXIT)
 
 /**
  * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
@@ -235,6 +236,12 @@ static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned lon
                }
        }
 
+       if (work & SYSCALL_WORK_SECCOMP_EXIT) {
+               /* Rejected by seccomp, no valid syscall exit state */
+               clear_syscall_work(SECCOMP_EXIT);
+               return;
+       }
+
        audit_syscall_exit(regs);
 
        if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 051e42902690..167c850ae16e 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -40,6 +40,7 @@ enum {
 #ifdef CONFIG_GENERIC_ENTRY
 enum syscall_work_bit {
        SYSCALL_WORK_BIT_SECCOMP,
+       SYSCALL_WORK_BIT_SECCOMP_EXIT,
        SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT,
        SYSCALL_WORK_BIT_SYSCALL_TRACE,
        SYSCALL_WORK_BIT_SYSCALL_EMU,
@@ -50,6 +51,7 @@ enum syscall_work_bit {
 };
 
 #define SYSCALL_WORK_SECCOMP                   BIT(SYSCALL_WORK_BIT_SECCOMP)
+#define SYSCALL_WORK_SECCOMP_EXIT              BIT(SYSCALL_WORK_BIT_SECCOMP_EXIT)
 #define SYSCALL_WORK_SYSCALL_TRACEPOINT                BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
 #define SYSCALL_WORK_SYSCALL_TRACE             BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
 #define SYSCALL_WORK_SYSCALL_EMU               BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index cb8dd78791cd..35703dceb6d2 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1262,6 +1262,10 @@ static void seccomp_nack_syscall(int this_syscall, int data, bool force_coredump
        syscall_rollback(current, current_pt_regs());
        /* Let the filter pass back 16 bits of data. */
        force_sig_seccomp(this_syscall, data, force_coredump);
+#ifdef CONFIG_GENERIC_ENTRY
+       /* No valid syscall exit state after syscall_rollback() */
+       set_syscall_work(SECCOMP_EXIT);
+#endif
 }
 
 static int __seccomp_filter(int this_syscall, const bool recheck_after_trace)
-- 
2.52.0


Reply via email to