Re: [5.2 REGRESSION] Generic vDSO breaks seccomp-enabled userspace on i386

Kees Cook Mon, 22 Jul 2019 16:28:44 -0700

On Mon, Jul 22, 2019 at 12:17:16PM -0700, Andy Lutomirski wrote:
> On Mon, Jul 22, 2019 at 11:39 AM Kees Cook <[email protected]> wrote:
> >
> > On Mon, Jul 22, 2019 at 08:31:32PM +0200, Thomas Gleixner wrote:
> > > On Mon, 22 Jul 2019, Kees Cook wrote:
> > > > Just so I'm understanding: the vDSO change introduced code to make an
> > > > actual syscall on i386, which for most seccomp filters would be 
> > > > rejected?
> > >
> > > No. The old x86 specific VDSO implementation had a fallback syscall as
> > > well, i.e. clock_gettime(). On 32bit clock_gettime() uses the y2038
> > > endangered timespec.
> > >
> > > So when the VDSO was made generic we changed the internal data structures
> > > to be 2038 safe right away. As a consequence the fallback syscall is not
> > > clock_gettime(), it's clock_gettime64(). which seems to surprise seccomp.
> >
> > Okay, it's didn't add a syscall, it just changed it. Results are the
> > same: conservative filters suddenly start breaking due to the different
> > call. (And now I see why Andy's alias suggestion would help...)
> >
> > I'm not sure which direction to do with this. It seems like an alias
> > list is a large hammer for this case, and a "seccomp-bypass when calling
> > from vDSO" solution seems too fragile?
> >
> 
> I don't like the seccomp bypass at all.  If someone uses seccomp to
> disallow all clock_gettime() variants, there shouldn't be a back door
> to learn the time.
> 
> Here's the restart_syscall() logic that makes me want aliases: we have
> different syscall numbers for restart_syscall() on 32-bit and 64-bit.
> The logic to decide which one to use is dubious at best.  I'd like to
> introduce a restart_syscall2() that is identical to restart_syscall()
> except that it has the same number on both variants.


I've built a straw-man for this idea... but I have to say I don't
like it. This can lead to really unexpected behaviors if someone
were to have differing filters for the two syscalls. For example,
let's say someone was doing a paranoid audit of 2038-unsafe clock usage
and marked clock_gettime() with RET_KILL and marked clock_gettime64()
with RET_LOG. This aliasing would make clock_gettime64() trigger with
RET_KILL...

There's no sense of "default" in BPF so we can't check for "and they
weren't expecting it", and we can't check for exact matches since the
filter might be using a balance tree to bucket the allowed syscalls.

Anyway, here it is in case it sparks some more thoughts...


diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c 
b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
index 9242b28418d5..1e82bd43286c 100644
--- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
@@ -13,6 +13,7 @@
  */
 #undef CONFIG_64BIT
 #undef CONFIG_X86_64
+#undef CONFIG_COMPAT
 #undef CONFIG_PGTABLE_LEVELS
 #undef CONFIG_ILLEGAL_POINTER_VALUE
 #undef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index 2bd1338de236..5d0da5ee1b61 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -6,6 +6,12 @@
 
 #ifdef CONFIG_X86_32
 #define __NR_seccomp_sigreturn         __NR_sigreturn
+#define seccomp_syscall_alias(arch, syscall)                   \
+       ({                                                      \
+               (syscall) == __NR_clock_gettime64 ?             \
+                       __NR_clock_gettime :                    \
+                       -1;                                     \
+       })
 #endif
 
 #ifdef CONFIG_COMPAT
@@ -14,6 +20,13 @@
 #define __NR_seccomp_write_32          __NR_ia32_write
 #define __NR_seccomp_exit_32           __NR_ia32_exit
 #define __NR_seccomp_sigreturn_32      __NR_ia32_sigreturn
+#define seccomp_syscall_alias(arch, syscall)                   \
+       ({                                                      \
+               (arch) != AUDIT_ARCH_I386 ? -1 :                \
+               (syscall) == __NR_ia32_clock_gettime64 ?        \
+                       __NR_ia32_clock_gettime :               \
+                       -1;                                     \
+       })
 #endif
 
 #include <asm-generic/seccomp.h>
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 84868d37b35d..06f5ca81d756 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -83,6 +83,14 @@ static inline int seccomp_mode(struct seccomp *s)
 #ifdef CONFIG_SECCOMP_FILTER
 extern void put_seccomp_filter(struct task_struct *tsk);
 extern void get_seccomp_filter(struct task_struct *tsk);
+/*
+ * Allow architectures to provide syscall aliases for special cases
+ * when attempting to make invisible transitions to new syscalls that
+ * have no arguments (e.g. clock_gettime64, restart_syscall).
+ */
+# ifndef seccomp_syscall_alias
+#  define seccomp_syscall_alias(arch, syscall)         (-1)
+# endif
 #else  /* CONFIG_SECCOMP_FILTER */
 static inline void put_seccomp_filter(struct task_struct *tsk)
 {
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index dba52a7db5e8..5153c6155d9a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -807,6 +807,26 @@ static int __seccomp_filter(int this_syscall, const struct 
seccomp_data *sd,
        data = filter_ret & SECCOMP_RET_DATA;
        action = filter_ret & SECCOMP_RET_ACTION_FULL;
 
+       /* Handle syscall aliases when result is not SECCOMP_RET_ALLOW. */
+       if (unlikely(action != SECCOMP_RET_ALLOW)) {
+               int alias;
+
+               alias = seccomp_syscall_alias(sd->arch, sd->nr);
+               if (unlikely(alias != -1)) {
+                       /* Use sd_local for an aliased syscall. */
+                       if (sd != &sd_local) {
+                               sd_local = *sd;
+                               sd = &sd_local;
+                       }
+                       sd_local.nr = alias;
+
+                       /* Run again, with the alias, accepting the results. */
+                       filter_ret = seccomp_run_filters(sd, &match);
+                       data = filter_ret & SECCOMP_RET_DATA;
+                       action = filter_ret & SECCOMP_RET_ACTION_FULL;
+               }
+       }
+
        switch (action) {
        case SECCOMP_RET_ERRNO:
                /* Set low-order bits as an errno, capped at MAX_ERRNO. */
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c 
b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 6ef7f16c4cf5..778619d145ea 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -3480,6 +3519,55 @@ TEST(seccomp_get_notif_sizes)
        EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
 }
 
+#if defined(__i386__)
+TEST(seccomp_syscall_alias)
+{
+       struct sock_filter filter[] = {
+               BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+                       offsetof(struct seccomp_data, nr)),
+#ifdef __NR_sigreturn
+               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 6, 0),
+#endif
+               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 5, 0),
+               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 4, 0),
+               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 3, 0),
+               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 2, 0),
+               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 1, 0),
+               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_gettime, 0, 1),
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+       };
+       struct sock_fprog prog = {
+               .len = (unsigned short)ARRAY_SIZE(filter),
+               .filter = filter,
+       };
+       unsigned char buffer[128];
+       long ret;
+
+       ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+       ASSERT_EQ(0, ret) {
+               TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+       }
+
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+       ASSERT_EQ(0, ret) {
+               TH_LOG("Kernel refused to install seccomp filter!");
+       }
+
+       /* We allow clock_gettime explicitly. */
+       EXPECT_EQ(0, syscall(__NR_clock_gettime, CLOCK_REALTIME, &buffer)) {
+               TH_LOG("Failed __NR_clock_gettime!?");
+       }
+
+       /* With aliases, clock_gettime64 should be allowed too. */
+       EXPECT_EQ(0, syscall(__NR_clock_gettime64, CLOCK_REALTIME, &buffer)) {
+               TH_LOG("Failed __NR_clock_gettime64!");
+       }
+
+       syscall(__NR_exit, 0);
+}
+#endif
+
 /*
  * TODO:
  * - add microbenchmarks

-- 
Kees Cook

Re: [5.2 REGRESSION] Generic vDSO breaks seccomp-enabled userspace on i386

Reply via email to