On Mon, Jul 22, 2019 at 12:17:16PM -0700, Andy Lutomirski wrote: > On Mon, Jul 22, 2019 at 11:39 AM Kees Cook <keesc...@chromium.org> wrote: > > > > On Mon, Jul 22, 2019 at 08:31:32PM +0200, Thomas Gleixner wrote: > > > On Mon, 22 Jul 2019, Kees Cook wrote: > > > > Just so I'm understanding: the vDSO change introduced code to make an > > > > actual syscall on i386, which for most seccomp filters would be > > > > rejected? > > > > > > No. The old x86 specific VDSO implementation had a fallback syscall as > > > well, i.e. clock_gettime(). On 32bit clock_gettime() uses the y2038 > > > endangered timespec. > > > > > > So when the VDSO was made generic we changed the internal data structures > > > to be 2038 safe right away. As a consequence the fallback syscall is not > > > clock_gettime(), it's clock_gettime64(). which seems to surprise seccomp. > > > > Okay, it's didn't add a syscall, it just changed it. Results are the > > same: conservative filters suddenly start breaking due to the different > > call. (And now I see why Andy's alias suggestion would help...) > > > > I'm not sure which direction to do with this. It seems like an alias > > list is a large hammer for this case, and a "seccomp-bypass when calling > > from vDSO" solution seems too fragile? > > > > I don't like the seccomp bypass at all. If someone uses seccomp to > disallow all clock_gettime() variants, there shouldn't be a back door > to learn the time. > > Here's the restart_syscall() logic that makes me want aliases: we have > different syscall numbers for restart_syscall() on 32-bit and 64-bit. > The logic to decide which one to use is dubious at best. I'd like to > introduce a restart_syscall2() that is identical to restart_syscall() > except that it has the same number on both variants.
I've built a straw-man for this idea... but I have to say I don't like it. This can lead to really unexpected behaviors if someone were to have differing filters for the two syscalls. For example, let's say someone was doing a paranoid audit of 2038-unsafe clock usage and marked clock_gettime() with RET_KILL and marked clock_gettime64() with RET_LOG. This aliasing would make clock_gettime64() trigger with RET_KILL... There's no sense of "default" in BPF so we can't check for "and they weren't expecting it", and we can't check for exact matches since the filter might be using a balance tree to bucket the allowed syscalls. Anyway, here it is in case it sparks some more thoughts... diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c index 9242b28418d5..1e82bd43286c 100644 --- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -13,6 +13,7 @@ */ #undef CONFIG_64BIT #undef CONFIG_X86_64 +#undef CONFIG_COMPAT #undef CONFIG_PGTABLE_LEVELS #undef CONFIG_ILLEGAL_POINTER_VALUE #undef CONFIG_SPARSEMEM_VMEMMAP diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h index 2bd1338de236..5d0da5ee1b61 100644 --- a/arch/x86/include/asm/seccomp.h +++ b/arch/x86/include/asm/seccomp.h @@ -6,6 +6,12 @@ #ifdef CONFIG_X86_32 #define __NR_seccomp_sigreturn __NR_sigreturn +#define seccomp_syscall_alias(arch, syscall) \ + ({ \ + (syscall) == __NR_clock_gettime64 ? \ + __NR_clock_gettime : \ + -1; \ + }) #endif #ifdef CONFIG_COMPAT @@ -14,6 +20,13 @@ #define __NR_seccomp_write_32 __NR_ia32_write #define __NR_seccomp_exit_32 __NR_ia32_exit #define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn +#define seccomp_syscall_alias(arch, syscall) \ + ({ \ + (arch) != AUDIT_ARCH_I386 ? -1 : \ + (syscall) == __NR_ia32_clock_gettime64 ? \ + __NR_ia32_clock_gettime : \ + -1; \ + }) #endif #include <asm-generic/seccomp.h> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 84868d37b35d..06f5ca81d756 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -83,6 +83,14 @@ static inline int seccomp_mode(struct seccomp *s) #ifdef CONFIG_SECCOMP_FILTER extern void put_seccomp_filter(struct task_struct *tsk); extern void get_seccomp_filter(struct task_struct *tsk); +/* + * Allow architectures to provide syscall aliases for special cases + * when attempting to make invisible transitions to new syscalls that + * have no arguments (e.g. clock_gettime64, restart_syscall). + */ +# ifndef seccomp_syscall_alias +# define seccomp_syscall_alias(arch, syscall) (-1) +# endif #else /* CONFIG_SECCOMP_FILTER */ static inline void put_seccomp_filter(struct task_struct *tsk) { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index dba52a7db5e8..5153c6155d9a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -807,6 +807,26 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION_FULL; + /* Handle syscall aliases when result is not SECCOMP_RET_ALLOW. */ + if (unlikely(action != SECCOMP_RET_ALLOW)) { + int alias; + + alias = seccomp_syscall_alias(sd->arch, sd->nr); + if (unlikely(alias != -1)) { + /* Use sd_local for an aliased syscall. */ + if (sd != &sd_local) { + sd_local = *sd; + sd = &sd_local; + } + sd_local.nr = alias; + + /* Run again, with the alias, accepting the results. */ + filter_ret = seccomp_run_filters(sd, &match); + data = filter_ret & SECCOMP_RET_DATA; + action = filter_ret & SECCOMP_RET_ACTION_FULL; + } + } + switch (action) { case SECCOMP_RET_ERRNO: /* Set low-order bits as an errno, capped at MAX_ERRNO. */ diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 6ef7f16c4cf5..778619d145ea 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -3480,6 +3519,55 @@ TEST(seccomp_get_notif_sizes) EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp)); } +#if defined(__i386__) +TEST(seccomp_syscall_alias) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), +#ifdef __NR_sigreturn + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 6, 0), +#endif + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 5, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 4, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 3, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 2, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 1, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_gettime, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + unsigned char buffer[128]; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel refused to install seccomp filter!"); + } + + /* We allow clock_gettime explicitly. */ + EXPECT_EQ(0, syscall(__NR_clock_gettime, CLOCK_REALTIME, &buffer)) { + TH_LOG("Failed __NR_clock_gettime!?"); + } + + /* With aliases, clock_gettime64 should be allowed too. */ + EXPECT_EQ(0, syscall(__NR_clock_gettime64, CLOCK_REALTIME, &buffer)) { + TH_LOG("Failed __NR_clock_gettime64!"); + } + + syscall(__NR_exit, 0); +} +#endif + /* * TODO: * - add microbenchmarks -- Kees Cook