[tip:x86/entry] x86/syscalls: Split the x32 syscalls into their own table
Commit-ID: 6365b842aae4490ebfafadfc6bb27a6d3cc54757 Gitweb: https://git.kernel.org/tip/6365b842aae4490ebfafadfc6bb27a6d3cc54757 Author: Andy Lutomirski AuthorDate: Wed, 3 Jul 2019 13:34:04 -0700 Committer: Thomas Gleixner CommitDate: Mon, 22 Jul 2019 10:31:23 +0200 x86/syscalls: Split the x32 syscalls into their own table For unfortunate historical reasons, the x32 syscalls and the x86_64 syscalls are not all numbered the same. As an example, ioctl() is nr 16 on x86_64 but 514 on x32. This has potentially nasty consequences, since it means that there are two valid RAX values to do ioctl(2) and two invalid RAX values. The valid values are 16 (i.e. ioctl(2) using the x86_64 ABI) and (514 | 0x4000) (i.e. ioctl(2) using the x32 ABI). The invalid values are 514 and (16 | 0x4000). 514 will enter the "COMPAT_SYSCALL_DEFINE3(ioctl, ...)" entry point with in_compat_syscall() and in_x32_syscall() returning false, whereas (16 | 0x4000) will enter the native entry point with in_compat_syscall() and in_x32_syscall() returning true. Both are bogus, and both will exercise code paths in the kernel and in any running seccomp filters that really ought to be unreachable. Splitting out the x32 syscalls into their own tables, allows both bogus invocations to return -ENOSYS. I've checked glibc, musl, and Bionic, and all of them appear to call syscalls with their correct numbers, so this change should have no effect on them. There is an added benefit going forward: new syscalls that need special handling on x32 can share the same number on x32 and x86_64. This means that the special syscall range 512-547 can be treated as a legacy wart instead of something that may need to be extended in the future. Also add a selftest to verify the new behavior. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/208024256b764312598f014ebfb0a42472c19354.1562185330.git.l...@kernel.org --- arch/x86/entry/common.c | 13 ++-- arch/x86/entry/syscall_64.c | 25 +++ arch/x86/entry/syscalls/syscalltbl.sh | 31 + arch/x86/include/asm/syscall.h | 4 ++ arch/x86/include/asm/unistd.h | 6 -- arch/x86/kernel/asm-offsets_64.c| 20 ++ tools/testing/selftests/x86/Makefile| 2 +- tools/testing/selftests/x86/syscall_numbering.c | 89 + 8 files changed, 163 insertions(+), 27 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 536b574b6161..3f8e22615812 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -285,15 +285,16 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) nr = syscall_trace_enter(regs); - /* -* NB: Native and x32 syscalls are dispatched from the same -* table. The only functional difference is the x32 bit in -* regs->orig_ax, which changes the behavior of some syscalls. 
-*/ - nr &= __SYSCALL_MASK; if (likely(nr < NR_syscalls)) { nr = array_index_nospec(nr, NR_syscalls); regs->ax = sys_call_table[nr](regs); +#ifdef CONFIG_X86_X32_ABI + } else if (likely((nr & __X32_SYSCALL_BIT) && + (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) { + nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT, + X32_NR_syscalls); + regs->ax = x32_sys_call_table[nr](regs); +#endif } syscall_return_slowpath(regs); diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index d5252bc1e380..b1bf31713374 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -10,10 +10,13 @@ /* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ extern asmlinkage long sys_ni_syscall(const struct pt_regs *); #define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); +#define __SYSCALL_X32(nr, sym, qual) __SYSCALL_64(nr, sym, qual) #include #undef __SYSCALL_64 +#undef __SYSCALL_X32 #define __SYSCALL_64(nr, sym, qual) [nr] = sym, +#define __SYSCALL_X32(nr, sym, qual) asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* @@ -23,3 +26,25 @@ asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { [0 ... __NR_syscall_max] = _ni_syscall, #include }; + +#undef __SYSCALL_64 +#undef __SYSCALL_X32 + +#ifdef CONFIG_X86_X32_ABI + +#define __SYSCALL_64(nr, sym, qual) +#define __SYSCALL_X32(nr, sym, qual) [nr] = sym, + +asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_syscall_x32_max+1] = { + /* +* Smells like a compiler bug -- it doesn't work +* when the & below is removed. +*/ + [0 ... __NR_syscall_x32_max] = _ni_syscall, +#include
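
To make the numbering hazard above concrete, here is a minimal userspace sketch (an illustration, not part of the patch; the constants are the real ABI values: __NR_ioctl is 16 on x86_64, 514 on x32, and the x32 flag bit is 0x40000000):

#define _GNU_SOURCE
#include <unistd.h>

#define X32_BIT	0x40000000UL

long ioctl_64(int fd, unsigned long req, void *arg)
{
	return syscall(16, fd, req, arg);		/* ioctl(2) via the x86_64 ABI */
}

long ioctl_x32(int fd, unsigned long req, void *arg)
{
	return syscall(514 | X32_BIT, fd, req, arg);	/* ioctl(2) via the x32 ABI */
}

With the split tables, the two bogus encodings -- syscall(514, ...) and syscall(16 | X32_BIT, ...) -- now return -ENOSYS instead of reaching the wrong entry point with a misleading in_compat_syscall()/in_x32_syscall() state.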
[tip:x86/entry] x86/syscalls: Disallow compat entries for all types of 64-bit syscalls
Commit-ID: f85a8573ceb225e606fcf38a9320782316f47c71 Gitweb: https://git.kernel.org/tip/f85a8573ceb225e606fcf38a9320782316f47c71 Author: Andy Lutomirski AuthorDate: Wed, 3 Jul 2019 13:34:03 -0700 Committer: Thomas Gleixner CommitDate: Mon, 22 Jul 2019 10:31:22 +0200 x86/syscalls: Disallow compat entries for all types of 64-bit syscalls A "compat" entry in the syscall tables means to use a different entry on 32-bit and 64-bit builds. This only makes sense for syscalls that exist in the first place in 32-bit builds, so disallow it for anything other than i386. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/4b7565954c5a06530ac01d98cb1592538fd8ae51.1562185330.git.l...@kernel.org --- arch/x86/entry/syscalls/syscalltbl.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 94fcd1951aca..53c8c1a9adf9 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -27,8 +27,8 @@ emit() { compat="$4" umlentry="" -if [ "$abi" = "64" -a -n "$compat" ]; then - echo "a compat entry for a 64-bit syscall makes no sense" >&2 +if [ "$abi" != "I386" -a -n "$compat" ]; then + echo "a compat entry ($abi: $compat) for a 64-bit syscall makes no sense" >&2 exit 1 fi
[tip:x86/entry] x86/syscalls: Use the compat versions of rt_sigsuspend() and rt_sigprocmask()
Commit-ID: a8d03c3f300eefff3b5c14798409e4b43e37dd9b Gitweb: https://git.kernel.org/tip/a8d03c3f300eefff3b5c14798409e4b43e37dd9b Author: Andy Lutomirski AuthorDate: Wed, 3 Jul 2019 13:34:02 -0700 Committer: Thomas Gleixner CommitDate: Mon, 22 Jul 2019 10:31:22 +0200 x86/syscalls: Use the compat versions of rt_sigsuspend() and rt_sigprocmask() I'm working on some code that detects at build time if there's a COMPAT_SYSCALL_DEFINE() that is not referenced in the x86 syscall tables. It catches three offenders: rt_sigsuspend(), rt_sigprocmask(), and sendfile64(). For rt_sigsuspend() and rt_sigprocmask(), the only potential difference between the native and compat versions is that the compat version converts the sigset_t, but, on little endian architectures, the conversion is a no-op. This is why they both currently work on x86. To make the code more consistent, and to make the upcoming patches work, rewire x86 to use the compat vesions. sendfile64() is more complicated, and will be addressed separately. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/51643ac3157b5921eae0e172a8a0b1d953e68ebb.1562185330.git.l...@kernel.org --- arch/x86/entry/syscalls/syscall_32.tbl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index c00019abd076..3fe02546aed3 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -186,11 +186,11 @@ 172i386prctl sys_prctl __ia32_sys_prctl 173i386rt_sigreturnsys_rt_sigreturn sys32_rt_sigreturn 174i386rt_sigactionsys_rt_sigaction __ia32_compat_sys_rt_sigaction -175i386rt_sigprocmask sys_rt_sigprocmask __ia32_sys_rt_sigprocmask +175i386rt_sigprocmask sys_rt_sigprocmask __ia32_compat_sys_rt_sigprocmask 176i386rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending 177i386rt_sigtimedwait sys_rt_sigtimedwait_time32 __ia32_compat_sys_rt_sigtimedwait_time32 178i386rt_sigqueueinfo sys_rt_sigqueueinfo __ia32_compat_sys_rt_sigqueueinfo -179i386rt_sigsuspend sys_rt_sigsuspend __ia32_sys_rt_sigsuspend +179i386rt_sigsuspend sys_rt_sigsuspend __ia32_compat_sys_rt_sigsuspend 180i386pread64 sys_pread64 __ia32_compat_sys_x86_pread 181i386pwrite64sys_pwrite64 __ia32_compat_sys_x86_pwrite 182i386chown sys_chown16 __ia32_sys_chown16
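
Why the compat sigset conversion is a no-op on little-endian can be seen with a small sketch (simplified illustration; the types below are stand-ins for the kernel's sigset_t/compat_sigset_t, not the real definitions):

#include <stdint.h>

typedef struct { uint64_t sig[1]; } ksigset64_t;	/* native x86_64 view */
typedef struct { uint32_t sig[2]; } ksigset32_t;	/* i386/compat view   */

static void compat_to_native(ksigset64_t *dst, const ksigset32_t *src)
{
	/*
	 * Word 0 carries signals 1-32, word 1 carries signals 33-64.  On a
	 * little-endian machine the assembled 64-bit value has exactly the
	 * same byte layout as the 32-bit pair, so treating the user buffer
	 * as either type (which is what wiring up the native entry did)
	 * yields identical results.
	 */
	dst->sig[0] = (uint64_t)src->sig[0] | ((uint64_t)src->sig[1] << 32);
}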
[tip:x86/entry] x86/syscalls: Make __X32_SYSCALL_BIT be unsigned long
Commit-ID:  45e29d119e9923ff14dfb840e3482bef1667bbfb
Gitweb:     https://git.kernel.org/tip/45e29d119e9923ff14dfb840e3482bef1667bbfb
Author:     Andy Lutomirski
AuthorDate: Wed, 3 Jul 2019 13:34:05 -0700
Committer:  Thomas Gleixner
CommitDate: Mon, 22 Jul 2019 10:31:22 +0200

x86/syscalls: Make __X32_SYSCALL_BIT be unsigned long

Currently, it's an int. This is bizarre. Fortunately, the code using it
still works: ~__X32_SYSCALL_BIT is also int, so, if nr is unsigned long,
then C kindly sign-extends the ~__X32_SYSCALL_BIT part, and it actually
results in the desired value. This is far more subtle than it deserves
to be.

Syscall numbers are, for all practical purposes, unsigned long, so make
__X32_SYSCALL_BIT be unsigned long.

Signed-off-by: Andy Lutomirski
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/99b0d83ad891c67105470a1a6b63243fd63a5061.1562185330.git.l...@kernel.org
---
 arch/x86/include/uapi/asm/unistd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/uapi/asm/unistd.h b/arch/x86/include/uapi/asm/unistd.h
index 30d7d04d72d6..196fdd02b8b1 100644
--- a/arch/x86/include/uapi/asm/unistd.h
+++ b/arch/x86/include/uapi/asm/unistd.h
@@ -3,7 +3,7 @@
 #define _UAPI_ASM_X86_UNISTD_H

 /* x32 syscall flag bit */
-#define __X32_SYSCALL_BIT	0x40000000
+#define __X32_SYSCALL_BIT	0x40000000UL

 #ifndef __KERNEL__
 # ifdef __i386__
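
A compilable sketch of the subtlety the changelog describes (illustration only; it prints the masked syscall number with the old and new definitions):

#include <stdio.h>

int main(void)
{
	unsigned long nr = 514UL | 0x40000000UL;	/* ioctl(2) via the x32 ABI */

	int           bit_int = 0x40000000;		/* old definition: int */
	unsigned long bit_ul  = 0x40000000UL;		/* new definition */

	/*
	 * ~bit_int is a negative int; when combined with the unsigned long
	 * nr it is sign-extended to 0xffffffffbfffffff, so only bit 30 is
	 * cleared and the result happens to be right.  An *unsigned int*
	 * constant would have zero-extended and wrongly cleared the upper
	 * 32 bits of nr as well.
	 */
	printf("%#lx\n", nr & ~bit_int);	/* 0x202 == 514 */
	printf("%#lx\n", nr & ~bit_ul);		/* 0x202 == 514 */
	return 0;
}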
[tip:x86/apic] x86/apic: Initialize TPR to block interrupts 16-31
Commit-ID: 229b969b3d38bc28bcd55841ee7ca9a9afb922f3 Gitweb: https://git.kernel.org/tip/229b969b3d38bc28bcd55841ee7ca9a9afb922f3 Author: Andy Lutomirski AuthorDate: Sun, 14 Jul 2019 08:23:14 -0700 Committer: Thomas Gleixner CommitDate: Mon, 22 Jul 2019 10:12:32 +0200 x86/apic: Initialize TPR to block interrupts 16-31 The APIC, per spec, is fundamentally confused and thinks that interrupt vectors 16-31 are valid. This makes no sense -- the CPU reserves vectors 0-31 for exceptions (faults, traps, etc). Obviously, no device should actually produce an interrupt with vector 16-31, but robustness can be improved by setting the APIC TPR class to 1, which will prevent delivery of an interrupt with a vector below 32. Note: This is *not* intended as a security measure against attackers who control malicious hardware. Any PCI or similar hardware that can be controlled by an attacker MUST be behind a functional IOMMU that remaps interrupts. The purpose of this change is to reduce the chance that a certain class of device malfunctions crashes the kernel in hard-to-debug ways. Suggested-by: Andrew Cooper Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/dc04a9f8b234d7b0956a8d2560b8945bcd9c4bf7.1563117760.git.l...@kernel.org --- arch/x86/kernel/apic/apic.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f5291362da1a..84032bf81476 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1561,11 +1561,14 @@ static void setup_local_APIC(void) #endif /* -* Set Task Priority to 'accept all'. We never change this -* later on. +* Set Task Priority to 'accept all except vectors 0-31'. An APIC +* vector in the 16-31 range could be delivered if TPR == 0, but we +* would think it's an exception and terrible things will happen. We +* never change this later on. */ value = apic_read(APIC_TASKPRI); value &= ~APIC_TPRI_MASK; + value |= 0x10; apic_write(APIC_TASKPRI, value); apic_pending_intr_clear();
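
For reference, the architectural rule behind the 0x10 write (my illustration, not code from the patch): bits 7:4 of TPR form the task-priority class, and interrupts whose vector class is less than or equal to that class are held back, so a TPR of 0x10 inhibits vectors 0x00-0x1f.

/* Assumed helper for illustration: would delivery of 'vector' be inhibited? */
static int tpr_inhibits_vector(unsigned int tpr, unsigned int vector)
{
	unsigned int task_class   = (tpr    >> 4) & 0xf;
	unsigned int vector_class = (vector >> 4) & 0xf;

	return vector_class <= task_class;	/* tpr == 0x10 blocks vectors 0-31 */
}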
[tip:x86/urgent] Revert "x86/ptrace: Prevent ptrace from clearing the FS/GS selector" and fix the test
Commit-ID: c7ca0b614513afba57824cae68447f9c32b1ee61 Gitweb: https://git.kernel.org/tip/c7ca0b614513afba57824cae68447f9c32b1ee61 Author: Andy Lutomirski AuthorDate: Mon, 15 Jul 2019 07:21:44 -0700 Committer: Thomas Gleixner CommitDate: Mon, 15 Jul 2019 17:12:31 +0200 Revert "x86/ptrace: Prevent ptrace from clearing the FS/GS selector" and fix the test This reverts commit 48f5e52e916b55fb73754833efbacc7f8081a159. The ptrace ABI change was a prerequisite to the proposed design for FSGSBASE. Since FSGSBASE support has been reverted, and since I'm not convinced that the ABI was ever adequately tested, revert the ABI change as well. This also modifies the test case so that it tests the preexisting behavior. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/fca39c478ea7fb15bc76fe8a36bd180810a067f6.1563200250.git.l...@kernel.org --- arch/x86/kernel/ptrace.c | 14 -- tools/testing/selftests/x86/fsgsbase.c | 22 -- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 71691a8310e7..0fdbe89d0754 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -369,12 +369,22 @@ static int putreg(struct task_struct *child, case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; - x86_fsbase_write_task(child, value); + /* +* When changing the FS base, use do_arch_prctl_64() +* to set the index to zero and to set the base +* as requested. +*/ + if (child->thread.fsbase != value) + return do_arch_prctl_64(child, ARCH_SET_FS, value); return 0; case offsetof(struct user_regs_struct,gs_base): + /* +* Exactly the same here as the %fs handling above. +*/ if (value >= TASK_SIZE_MAX) return -EIO; - x86_gsbase_write_task(child, value); + if (child->thread.gsbase != value) + return do_arch_prctl_64(child, ARCH_SET_GS, value); return 0; #endif } diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 5ab4c60c100e..15a329da59fa 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -489,25 +489,11 @@ static void test_ptrace_write_gsbase(void) * selector value is changed or not by the GSBASE write in * a ptracer. */ - if (gs != *shared_scratch) { - nerrs++; - printf("[FAIL]\tGS changed to %lx\n", gs); - - /* -* On older kernels, poking a nonzero value into the -* base would zero the selector. On newer kernels, -* this behavior has changed -- poking the base -* changes only the base and, if FSGSBASE is not -* available, this may have no effect. -*/ - if (gs == 0) - printf("\tNote: this is expected behavior on older kernels.\n"); - } else if (have_fsgsbase && (base != 0xFF)) { - nerrs++; - printf("[FAIL]\tGSBASE changed to %lx\n", base); + if (gs == 0 && base == 0xFF) { + printf("[OK]\tGS was reset as expected\n"); } else { - printf("[OK]\tGS remained 0x%hx%s", *shared_scratch, have_fsgsbase ? " and GSBASE changed to 0xFF" : ""); - printf("\n"); + nerrs++; + printf("[FAIL]\tGS=0x%lx, GSBASE=0x%lx (should be 0, 0xFF)\n", gs, base); } }
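
For context, this is the user-visible interface the revert affects (a hypothetical ptracer snippet, shown only to identify the request involved): with the old behavior restored, a nonzero poke of gs_base goes through the ARCH_SET_GS path and therefore also clears the GS selector.

#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

/* Hypothetical helper: write the tracee's GS base via PTRACE_POKEUSER. */
static long poke_gsbase(pid_t child, unsigned long base)
{
	return ptrace(PTRACE_POKEUSER, child,
		      offsetof(struct user_regs_struct, gs_base), base);
}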
[tip:x86/cpu] selftests/x86/fsgsbase: Fix some test case bugs
Commit-ID: 697096b1f458fb81212d1c82d7846e932455 Gitweb: https://git.kernel.org/tip/697096b1f458fb81212d1c82d7846e932455 Author: Andy Lutomirski AuthorDate: Tue, 2 Jul 2019 20:43:04 -0700 Committer: Thomas Gleixner CommitDate: Wed, 3 Jul 2019 16:24:56 +0200 selftests/x86/fsgsbase: Fix some test case bugs This refactors do_unexpected_base() to clean up some code. It also fixes the following bugs in test_ptrace_write_gsbase(): - Incorrect printf() format string caused crashes. - Hardcoded 0x7 for the gs selector was not reliably correct. It also documents the fact that the test is expected to fail on old kernels. Fixes: a87730cc3acc ("selftests/x86/fsgsbase: Test ptracer-induced GSBASE write with FSGSBASE") Fixes: 1b6858d5a2eb ("selftests/x86/fsgsbase: Test ptracer-induced GSBASE write") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: "BaeChang Seok" Cc: Borislav Petkov Cc: Peter Zijlstra Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: H. Peter Anvin Cc: "BaeChang Seok" Link: https://lkml.kernel.org/r/bab29c84f2475e2c30ddb00f1b877fcd7f4f96a8.1562125333.git.l...@kernel.org --- tools/testing/selftests/x86/fsgsbase.c | 74 ++ 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 21fd4f94b5b0..5ab4c60c100e 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -35,6 +35,8 @@ static volatile sig_atomic_t want_segv; static volatile unsigned long segv_addr; +static unsigned short *shared_scratch; + static int nerrs; static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), @@ -242,16 +244,11 @@ static void do_remote_base() static __thread int set_thread_area_entry_number = -1; -static void do_unexpected_base(void) +static unsigned short load_gs(void) { /* -* The goal here is to try to arrange for GS == 0, GSBASE != -* 0, and for the the kernel the think that GSBASE == 0. -* -* To make the test as reliable as possible, this uses -* explicit descriptors. (This is not the only way. This -* could use ARCH_SET_GS with a low, nonzero base, but the -* relevant side effect of ARCH_SET_GS could change.) +* Sets GS != 0 and GSBASE != 0 but arranges for the kernel to think +* that GSBASE == 0 (i.e. thread.gsbase == 0). */ /* Step 1: tell the kernel that we have GSBASE == 0. */ @@ -271,8 +268,9 @@ static void do_unexpected_base(void) .useable = 0 }; if (syscall(SYS_modify_ldt, 1, , sizeof(desc)) == 0) { - printf("\tother thread: using LDT slot 0\n"); + printf("\tusing LDT slot 0\n"); asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7)); + return 0x7; } else { /* No modify_ldt for us (configured out, perhaps) */ @@ -294,20 +292,15 @@ static void do_unexpected_base(void) if (ret != 0) { printf("[NOTE]\tcould not create a segment -- test won't do anything\n"); - return; + return 0; } - printf("\tother thread: using GDT slot %d\n", desc.entry_number); + printf("\tusing GDT slot %d\n", desc.entry_number); set_thread_area_entry_number = desc.entry_number; - asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)((desc.entry_number << 3) | 0x3))); + unsigned short gs = (unsigned short)((desc.entry_number << 3) | 0x3); + asm volatile ("mov %0, %%gs" : : "rm" (gs)); + return gs; } - - /* -* Step 3: set the selector back to zero. On AMD chips, this will -* preserve GSBASE. 
-*/ - - asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); } void test_wrbase(unsigned short index, unsigned long base) @@ -346,12 +339,19 @@ static void *threadproc(void *ctx) if (ftx == 3) return NULL; - if (ftx == 1) + if (ftx == 1) { do_remote_base(); - else if (ftx == 2) - do_unexpected_base(); - else + } else if (ftx == 2) { + /* +* On AMD chips, this causes GSBASE != 0, GS == 0, and +* thread.gsbase == 0. +*/ + + load_gs(); + asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); + } else { errx(1, "helper thread got bad command"); + } ftx = 0; syscall(SYS_futex, , FUTEX_WAKE, 0, NULL, NULL, 0); @@ -453,12 +453,7 @@ static void
[tip:x86/cpu] x86/entry/64: Fix and clean up paranoid_exit
Commit-ID: 539bca535decb11a0861b6205c6684b8e908589b Gitweb: https://git.kernel.org/tip/539bca535decb11a0861b6205c6684b8e908589b Author: Andy Lutomirski AuthorDate: Mon, 1 Jul 2019 20:43:21 -0700 Committer: Thomas Gleixner CommitDate: Tue, 2 Jul 2019 08:45:20 +0200 x86/entry/64: Fix and clean up paranoid_exit paranoid_exit needs to restore CR3 before GSBASE. Doing it in the opposite order crashes if the exception came from a context with user GSBASE and user CR3 -- RESTORE_CR3 cannot resture user CR3 if run with user GSBASE. This results in infinitely recursing exceptions if user code does SYSENTER with TF set if both FSGSBASE and PTI are enabled. The old code worked if user code just set TF without SYSENTER because #DB from user mode is special cased in idtentry and paranoid_exit doesn't run. Fix it by cleaning up the spaghetti code. All that paranoid_exit needs to do is to disable IRQs, handle IRQ tracing, then restore CR3, and restore GSBASE. Simply do those actions in that order. Fixes: 708078f65721 ("x86/entry/64: Handle FSGSBASE enabled paranoid entry/exit") Reported-by: Vegard Nossum Signed-off-by: Chang S. Bae Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Peter Zijlstra Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: Ravi Shankar Cc: H. Peter Anvin Link: https://lkml.kernel.org/r/59725ceb08977359489fbed979716949ad45f616.1562035429.git.l...@kernel.org --- arch/x86/entry/entry_64.S | 33 + 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 54b1b0468b2b..670306f588bf 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1256,31 +1256,32 @@ END(paranoid_entry) ENTRY(paranoid_exit) UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF_DEBUG - /* Handle GS depending on FSGSBASE availability */ - ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "nop",X86_FEATURE_FSGSBASE + /* +* The order of operations is important. IRQ tracing requires +* kernel GSBASE and CR3. RESTORE_CR3 requires kernel GS base. +* +* NB to anyone to tries to optimize this code: this code does +* not execute at all for exceptions coming from user mode. Those +* exceptions go through error_exit instead. +*/ + TRACE_IRQS_IRETQ_DEBUG + RESTORE_CR3 scratch_reg=%rax save_reg=%r14 + + /* Handle the three GSBASE cases. */ + ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE /* With FSGSBASE enabled, unconditionally restore GSBASE */ wrgsbase%rbx - jmp .Lparanoid_exit_no_swapgs; + jmp restore_regs_and_return_to_kernel .Lparanoid_exit_checkgs: /* On non-FSGSBASE systems, conditionally do SWAPGS */ testl %ebx, %ebx - jnz .Lparanoid_exit_no_swapgs - TRACE_IRQS_IRETQ - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 - SWAPGS_UNSAFE_STACK - jmp .Lparanoid_exit_restore - -.Lparanoid_exit_no_swapgs: - TRACE_IRQS_IRETQ_DEBUG - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 + jnz restore_regs_and_return_to_kernel -.Lparanoid_exit_restore: + /* We are returning to a context with user GSBASE. */ + SWAPGS_UNSAFE_STACK jmp restore_regs_and_return_to_kernel END(paranoid_exit)
[tip:x86/cpu] x86/entry/64: Don't compile ignore_sysret if 32-bit emulation is enabled
Commit-ID: dffb3f9db6b593f3ed6ab4c8d8f10e0aa6aa7a88 Gitweb: https://git.kernel.org/tip/dffb3f9db6b593f3ed6ab4c8d8f10e0aa6aa7a88 Author: Andy Lutomirski AuthorDate: Mon, 1 Jul 2019 20:43:20 -0700 Committer: Thomas Gleixner CommitDate: Tue, 2 Jul 2019 08:45:20 +0200 x86/entry/64: Don't compile ignore_sysret if 32-bit emulation is enabled It's only used if !CONFIG_IA32_EMULATION, so disable it in normal configs. This will save a few bytes of text and reduce confusion. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: "BaeChang Seok" Cc: Borislav Petkov Cc: Peter Zijlstra Cc: "Bae, Chang Seok" Link: https://lkml.kernel.org/r/0f7dafa72fe7194689de5ee8cfe5d83509fabcf5.1562035429.git.l...@kernel.org --- arch/x86/entry/entry_64.S | 6 ++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 7f9f5119d6b1..54b1b0468b2b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1743,11 +1743,17 @@ nmi_restore: iretq END(nmi) +#ifndef CONFIG_IA32_EMULATION +/* + * This handles SYSCALL from 32-bit code. There is no way to program + * MSRs to fully disable 32-bit SYSCALL. + */ ENTRY(ignore_sysret) UNWIND_HINT_EMPTY mov $-ENOSYS, %eax sysret END(ignore_sysret) +#endif ENTRY(rewind_stack_do_exit) UNWIND_HINT_FUNC
[tip:x86/cpu] selftests/x86: Test SYSCALL and SYSENTER manually with TF set
Commit-ID: 9402eaf4c11f0b892eda7b2bcb4654ab34ce34f9 Gitweb: https://git.kernel.org/tip/9402eaf4c11f0b892eda7b2bcb4654ab34ce34f9 Author: Andy Lutomirski AuthorDate: Mon, 1 Jul 2019 20:43:19 -0700 Committer: Thomas Gleixner CommitDate: Tue, 2 Jul 2019 08:45:20 +0200 selftests/x86: Test SYSCALL and SYSENTER manually with TF set Make sure that both variants of the nasty TF-in-compat-syscall are exercised regardless of what vendor's CPU is running the tests. Also change the intentional signal after SYSCALL to use ud2, which is a lot more comprehensible. This crashes the kernel due to an FSGSBASE bug right now. This test *also* detects a bug in KVM when run on an Intel host. KVM people, feel free to use it to help debug. There's a bunch of code in this test to warn instead of going into an infinite looping when the bug gets triggered. Reported-by: Vegard Nossum Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: "BaeChang Seok" Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Paolo Bonzini Cc: k...@vger.kernel.org Cc: "Bae, Chang Seok" Link: https://lkml.kernel.org/r/5f5de10441ab2e3005538b4c33be9b1965d1bb63.1562035429.git.l...@kernel.org --- tools/testing/selftests/x86/Makefile| 5 +- tools/testing/selftests/x86/syscall_arg_fault.c | 112 ++-- 2 files changed, 110 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 186520198de7..fa07d526fe39 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -12,8 +12,9 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie) TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ check_initial_reg_state sigreturn iopl mpx-mini-test ioperm \ - protection_keys test_vdso test_vsyscall mov_ss_trap -TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ + protection_keys test_vdso test_vsyscall mov_ss_trap \ + syscall_arg_fault +TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c index 4e25d38c8bbd..bc0ecc2e862e 100644 --- a/tools/testing/selftests/x86/syscall_arg_fault.c +++ b/tools/testing/selftests/x86/syscall_arg_fault.c @@ -15,9 +15,30 @@ #include #include +#ifdef __x86_64__ +# define WIDTH "q" +#else +# define WIDTH "l" +#endif + /* Our sigaltstack scratch space. 
*/ static unsigned char altstack_data[SIGSTKSZ]; +static unsigned long get_eflags(void) +{ + unsigned long eflags; + asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags)); + return eflags; +} + +static void set_eflags(unsigned long eflags) +{ + asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH + : : "rm" (eflags) : "flags"); +} + +#define X86_EFLAGS_TF (1UL << 8) + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { @@ -35,13 +56,22 @@ static sigjmp_buf jmpbuf; static volatile sig_atomic_t n_errs; +#ifdef __x86_64__ +#define REG_AX REG_RAX +#define REG_IP REG_RIP +#else +#define REG_AX REG_EAX +#define REG_IP REG_EIP +#endif + static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t*)ctx_void; + long ax = (long)ctx->uc_mcontext.gregs[REG_AX]; - if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) { - printf("[FAIL]\tAX had the wrong value: 0x%x\n", - ctx->uc_mcontext.gregs[REG_EAX]); + if (ax != -EFAULT && ax != -ENOSYS) { + printf("[FAIL]\tAX had the wrong value: 0x%lx\n", + (unsigned long)ax); n_errs++; } else { printf("[OK]\tSeems okay\n"); @@ -50,9 +80,42 @@ static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void) siglongjmp(jmpbuf, 1); } +static volatile sig_atomic_t sigtrap_consecutive_syscalls; + +static void sigtrap(int sig, siginfo_t *info, void *ctx_void) +{ + /* +* KVM has some bugs that can cause us to stop making progress. +* detect them and complain, but don't infinite loop or fail the +* test. +*/ + + ucontext_t *ctx = (ucontext_t*)ctx_void; + unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP]; + + if (*ip == 0x340f || *ip == 0x050f) { + /* The trap was on SYSCALL or SYSENTER */ + sigtrap_consecutive_syscalls++; + if (sigtrap_consecutive_syscalls > 3) { + printf("[WARN]\tGot stuck
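
The core trick of the test can be summarized in a few lines (sketch reusing get_eflags()/set_eflags() and X86_EFLAGS_TF from the patch above; it assumes a SIGTRAP handler is installed, and the real test issues SYSCALL/SYSENTER from inline asm rather than through syscall(2)):

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>

static void single_step_one_syscall(void)
{
	set_eflags(get_eflags() | X86_EFLAGS_TF);	/* arm the trap flag */
	syscall(SYS_getpid);				/* SIGTRAP after every instruction from here */
	set_eflags(get_eflags() & ~X86_EFLAGS_TF);	/* disarm it again */
}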
[tip:x86/entry] selftests/x86: Add a test for process_vm_readv() on the vsyscall page
Commit-ID: 7f0a5e0755832301e7b010eab46fb715c483ba60 Gitweb: https://git.kernel.org/tip/7f0a5e0755832301e7b010eab46fb715c483ba60 Author: Andy Lutomirski AuthorDate: Wed, 26 Jun 2019 21:45:09 -0700 Committer: Thomas Gleixner CommitDate: Fri, 28 Jun 2019 00:04:40 +0200 selftests/x86: Add a test for process_vm_readv() on the vsyscall page get_gate_page() is a piece of somewhat alarming code to make get_user_pages() work on the vsyscall page. Test it via process_vm_readv(). Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Cc: Florian Weimer Cc: Jann Horn Cc: Borislav Petkov Cc: Kernel Hardening Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/0fe34229a9330e8f9de9765967939cc4f1cf26b1.1561610354.git.l...@kernel.org --- tools/testing/selftests/x86/test_vsyscall.c | 35 + 1 file changed, 35 insertions(+) diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index 34a1d35995ef..4602326b8f5b 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef __x86_64__ # define VSYS(x) (x) @@ -459,6 +460,38 @@ static int test_vsys_x(void) return 0; } +static int test_process_vm_readv(void) +{ +#ifdef __x86_64__ + char buf[4096]; + struct iovec local, remote; + int ret; + + printf("[RUN]\tprocess_vm_readv() from vsyscall page\n"); + + local.iov_base = buf; + local.iov_len = 4096; + remote.iov_base = (void *)0xff60; + remote.iov_len = 4096; + ret = process_vm_readv(getpid(), , 1, , 1, 0); + if (ret != 4096) { + printf("[OK]\tprocess_vm_readv() failed (ret = %d, errno = %d)\n", ret, errno); + return 0; + } + + if (vsyscall_map_r) { + if (!memcmp(buf, (const void *)0xff60, 4096)) { + printf("[OK]\tIt worked and read correct data\n"); + } else { + printf("[FAIL]\tIt worked but returned incorrect data\n"); + return 1; + } + } +#endif + + return 0; +} + #ifdef __x86_64__ #define X86_EFLAGS_TF (1UL << 8) static volatile sig_atomic_t num_vsyscall_traps; @@ -533,6 +566,8 @@ int main(int argc, char **argv) nerrs += test_vsys_r(); nerrs += test_vsys_x(); + nerrs += test_process_vm_readv(); + #ifdef __x86_64__ nerrs += test_emulation(); #endif
[tip:x86/entry] x86/vsyscall: Add __ro_after_init to global variables
Commit-ID: 441cedab2dfca18fe4983cbc795de04536ed421e Gitweb: https://git.kernel.org/tip/441cedab2dfca18fe4983cbc795de04536ed421e Author: Andy Lutomirski AuthorDate: Wed, 26 Jun 2019 21:45:08 -0700 Committer: Thomas Gleixner CommitDate: Fri, 28 Jun 2019 00:04:40 +0200 x86/vsyscall: Add __ro_after_init to global variables The vDSO is only configurable by command-line options, so make its global variables __ro_after_init. This seems highly unlikely to ever stop an exploit, but it's nicer anyway. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Cc: Florian Weimer Cc: Jann Horn Cc: Borislav Petkov Cc: Kernel Hardening Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/a386925835e49d319e70c4d7404b1f6c3c2e3702.1561610354.git.l...@kernel.org --- arch/x86/entry/vsyscall/vsyscall_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 9c58ab807aeb..07003f3f1bfc 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -42,7 +42,7 @@ #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" -static enum { EMULATE, XONLY, NONE } vsyscall_mode = +static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init = #ifdef CONFIG_LEGACY_VSYSCALL_NONE NONE; #elif defined(CONFIG_LEGACY_VSYSCALL_XONLY) @@ -305,7 +305,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma) static const struct vm_operations_struct gate_vma_ops = { .name = gate_vma_name, }; -static struct vm_area_struct gate_vma = { +static struct vm_area_struct gate_vma __ro_after_init = { .vm_start = VSYSCALL_ADDR, .vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_page_prot = PAGE_READONLY_EXEC,
[tip:x86/entry] x86/vsyscall: Change the default vsyscall mode to xonly
Commit-ID: 625b7b7f79c66626fb2b7687fc1a58309a57edd5 Gitweb: https://git.kernel.org/tip/625b7b7f79c66626fb2b7687fc1a58309a57edd5 Author: Andy Lutomirski AuthorDate: Wed, 26 Jun 2019 21:45:07 -0700 Committer: Thomas Gleixner CommitDate: Fri, 28 Jun 2019 00:04:39 +0200 x86/vsyscall: Change the default vsyscall mode to xonly The use case for full emulation over xonly is very esoteric, e.g. magic instrumentation tools. Change the default to the safer xonly mode. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Cc: Florian Weimer Cc: Jann Horn Cc: Borislav Petkov Cc: Kernel Hardening Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/30539f8072d2376b9c9efcc07e6ed0d6bf20e882.1561610354.git.l...@kernel.org --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0182d2c67590..32028edc1b0e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2285,7 +2285,7 @@ config COMPAT_VDSO choice prompt "vsyscall table for legacy applications" depends on X86_64 - default LEGACY_VSYSCALL_EMULATE + default LEGACY_VSYSCALL_XONLY help Legacy user code that does not know how to find the vDSO expects to be able to issue three syscalls by calling fixed addresses in
[tip:x86/entry] selftests/x86/vsyscall: Verify that vsyscall=none blocks execution
Commit-ID: b0386979867168575118501104f3d135067eab4f Gitweb: https://git.kernel.org/tip/b0386979867168575118501104f3d135067eab4f Author: Andy Lutomirski AuthorDate: Wed, 26 Jun 2019 21:45:06 -0700 Committer: Thomas Gleixner CommitDate: Fri, 28 Jun 2019 00:04:39 +0200 selftests/x86/vsyscall: Verify that vsyscall=none blocks execution If vsyscall=none accidentally still allowed vsyscalls, the test wouldn't fail. Fix it. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Cc: Florian Weimer Cc: Jann Horn Cc: Borislav Petkov Cc: Kernel Hardening Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/b413397c804265f8865f3e70b14b09485ea7c314.1561610354.git.l...@kernel.org --- tools/testing/selftests/x86/test_vsyscall.c | 76 - 1 file changed, 52 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index 4c9a8d76dba0..34a1d35995ef 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -49,21 +49,21 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), } /* vsyscalls and vDSO */ -bool should_read_vsyscall = false; +bool vsyscall_map_r = false, vsyscall_map_x = false; typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz); -gtod_t vgtod = (gtod_t)VSYS(0xff60); +const gtod_t vgtod = (gtod_t)VSYS(0xff60); gtod_t vdso_gtod; typedef int (*vgettime_t)(clockid_t, struct timespec *); vgettime_t vdso_gettime; typedef long (*time_func_t)(time_t *t); -time_func_t vtime = (time_func_t)VSYS(0xff600400); +const time_func_t vtime = (time_func_t)VSYS(0xff600400); time_func_t vdso_time; typedef long (*getcpu_t)(unsigned *, unsigned *, void *); -getcpu_t vgetcpu = (getcpu_t)VSYS(0xff600800); +const getcpu_t vgetcpu = (getcpu_t)VSYS(0xff600800); getcpu_t vdso_getcpu; static void init_vdso(void) @@ -107,7 +107,7 @@ static int init_vsys(void) maps = fopen("/proc/self/maps", "r"); if (!maps) { printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n"); - should_read_vsyscall = true; + vsyscall_map_r = true; return 0; } @@ -133,12 +133,8 @@ static int init_vsys(void) } printf("\tvsyscall permissions are %c-%c\n", r, x); - should_read_vsyscall = (r == 'r'); - if (x != 'x') { - vgtod = NULL; - vtime = NULL; - vgetcpu = NULL; - } + vsyscall_map_r = (r == 'r'); + vsyscall_map_x = (x == 'x'); found = true; break; @@ -148,10 +144,8 @@ static int init_vsys(void) if (!found) { printf("\tno vsyscall map in /proc/self/maps\n"); - should_read_vsyscall = false; - vgtod = NULL; - vtime = NULL; - vgetcpu = NULL; + vsyscall_map_r = false; + vsyscall_map_x = false; } return nerrs; @@ -242,7 +236,7 @@ static int test_gtod(void) err(1, "syscall gettimeofday"); if (vdso_gtod) ret_vdso = vdso_gtod(_vdso, _vdso); - if (vgtod) + if (vsyscall_map_x) ret_vsys = vgtod(_vsys, _vsys); if (sys_gtod(_sys2, _sys) != 0) err(1, "syscall gettimeofday"); @@ -256,7 +250,7 @@ static int test_gtod(void) } } - if (vgtod) { + if (vsyscall_map_x) { if (ret_vsys == 0) { nerrs += check_gtod(_sys1, _sys2, _sys, "vsyscall", _vsys, _vsys); } else { @@ -277,7 +271,7 @@ static int test_time(void) { t_sys1 = sys_time(_sys1); if (vdso_time) t_vdso = vdso_time(_vdso); - if (vtime) + if (vsyscall_map_x) t_vsys = vtime(_vsys); t_sys2 = sys_time(_sys2); if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) { @@ -298,7 +292,7 @@ static int test_time(void) { } } - if (vtime) { + if (vsyscall_map_x) { if (t_vsys < 0 || t_vsys != t2_vsys) { 
printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys); nerrs++; @@ -334,7 +328,7 @@ static int test_getcpu(int cpu) ret_sys = sys_getcpu(_sys, _sys, 0); if (vdso_getcpu) ret_vdso = vdso_getcpu(_vdso, _vdso, 0); - if (vgetcpu) + if (vsyscall_map_x) ret_vsys = vgetcpu(_vsys, _vsys, 0); if (ret_sys == 0) { @@ -373,7 +367,7 @@ static int test_getcpu(int cpu) } } - if (vgetcpu) { + if
[tip:x86/entry] x86/vsyscall: Document odd SIGSEGV error code for vsyscalls
Commit-ID: e0a446ce394a7915f2ffc03f9bb610c5ac4dbbf1 Gitweb: https://git.kernel.org/tip/e0a446ce394a7915f2ffc03f9bb610c5ac4dbbf1 Author: Andy Lutomirski AuthorDate: Wed, 26 Jun 2019 21:45:05 -0700 Committer: Thomas Gleixner CommitDate: Fri, 28 Jun 2019 00:04:39 +0200 x86/vsyscall: Document odd SIGSEGV error code for vsyscalls Even if vsyscall=none, user page faults on the vsyscall page are reported as though the PROT bit in the error code was set. Add a comment explaining why this is probably okay and display the value in the test case. While at it, explain why the behavior is correct with respect to PKRU. Modify also the selftest to print the odd error code so that there is a way to demonstrate the odd behaviour. If anyone really cares about more accurate emulation, the behaviour could be changed. But that needs a real good justification. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Cc: Florian Weimer Cc: Jann Horn Cc: Borislav Petkov Cc: Kernel Hardening Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/75c91855fd850649ace162eec5495a1354221aaa.1561610354.git.l...@kernel.org --- arch/x86/mm/fault.c | 7 +++ tools/testing/selftests/x86/test_vsyscall.c | 9 - 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 288a5462076f..58e4f1f00bbc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -710,6 +710,10 @@ static void set_signal_archinfo(unsigned long address, * To avoid leaking information about the kernel page * table layout, pretend that user-mode accesses to * kernel addresses are always protection faults. +* +* NB: This means that failed vsyscalls with vsyscall=none +* will have the PROT bit. This doesn't leak any +* information and does not appear to cause any problems. */ if (address >= TASK_SIZE_MAX) error_code |= X86_PF_PROT; @@ -1375,6 +1379,9 @@ void do_user_addr_fault(struct pt_regs *regs, * * The vsyscall page does not have a "real" VMA, so do this * emulation before we go searching for VMAs. +* +* PKRU never rejects instruction fetches, so we don't need +* to consider the PF_PK bit. */ if (is_vsyscall_vaddr(address)) { if (emulate_vsyscall(hw_error_code, regs, address)) diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index 0b4f1cc2291c..4c9a8d76dba0 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -183,9 +183,13 @@ static inline long sys_getcpu(unsigned * cpu, unsigned * node, } static jmp_buf jmpbuf; +static volatile unsigned long segv_err; static void sigsegv(int sig, siginfo_t *info, void *ctx_void) { + ucontext_t *ctx = (ucontext_t *)ctx_void; + + segv_err = ctx->uc_mcontext.gregs[REG_ERR]; siglongjmp(jmpbuf, 1); } @@ -416,8 +420,11 @@ static int test_vsys_r(void) } else if (!can_read && should_read_vsyscall) { printf("[FAIL]\tWe don't have read access, but we should\n"); return 1; + } else if (can_read) { + printf("[OK]\tWe have read access\n"); } else { - printf("[OK]\tgot expected result\n"); + printf("[OK]\tWe do not have read access: #PF(0x%lx)\n", + segv_err); } #endif
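
For readers decoding the #PF(0x...) value the test now prints, these are the relevant page-fault error-code bits (standard x86 values; the names mirror the kernel's X86_PF_* definitions):

#define X86_PF_PROT	0x01	/* fault on a present (or pretended-present) page */
#define X86_PF_WRITE	0x02	/* write access */
#define X86_PF_USER	0x04	/* fault taken in user mode */
#define X86_PF_RSVD	0x08	/* reserved page-table bit set */
#define X86_PF_INSTR	0x10	/* instruction fetch */

/* A denied user read of the vsyscall page with vsyscall=none shows up as
 * 0x5 (PROT|USER), per the fault-code fudging described above. */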
[tip:x86/entry] x86/vsyscall: Add a new vsyscall=xonly mode
Commit-ID: bd49e16e3339f052fae05fb3e955c5db0c9c6445 Gitweb: https://git.kernel.org/tip/bd49e16e3339f052fae05fb3e955c5db0c9c6445 Author: Andy Lutomirski AuthorDate: Wed, 26 Jun 2019 21:45:03 -0700 Committer: Thomas Gleixner CommitDate: Fri, 28 Jun 2019 00:04:38 +0200 x86/vsyscall: Add a new vsyscall=xonly mode With vsyscall emulation on, a readable vsyscall page is still exposed that contains syscall instructions that validly implement the vsyscalls. This is required because certain dynamic binary instrumentation tools attempt to read the call targets of call instructions in the instrumented code. If the instrumented code uses vsyscalls, then the vsyscall page needs to contain readable code. Unfortunately, leaving readable memory at a deterministic address can be used to help various ASLR bypasses, so some hardening value can be gained by disallowing vsyscall reads. Given how rarely the vsyscall page needs to be readable, add a mechanism to make the vsyscall page be execute only. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Cc: Florian Weimer Cc: Jann Horn Cc: Borislav Petkov Cc: Kernel Hardening Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/d17655777c21bc09a7af1bbcf74e6f2b69a51152.1561610354.git.l...@kernel.org --- Documentation/admin-guide/kernel-parameters.txt | 7 +- arch/x86/Kconfig| 33 ++--- arch/x86/entry/vsyscall/vsyscall_64.c | 16 ++-- 3 files changed, 44 insertions(+), 12 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0082d1e56999..be8c3a680afa 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5100,7 +5100,12 @@ targets for exploits that can control RIP. emulate [default] Vsyscalls turn into traps and are - emulated reasonably safely. + emulated reasonably safely. The vsyscall + page is readable. + + xonly Vsyscalls turn into traps and are + emulated reasonably safely. The vsyscall + page is not readable. noneVsyscalls don't work at all. This makes them quite hard to use for exploits but diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2bbbd4d1ba31..0182d2c67590 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2293,23 +2293,38 @@ choice it can be used to assist security vulnerability exploitation. This setting can be changed at boot time via the kernel command - line parameter vsyscall=[emulate|none]. + line parameter vsyscall=[emulate|xonly|none]. On a system with recent enough glibc (2.14 or newer) and no static binaries, you can say None without a performance penalty to improve security. - If unsure, select "Emulate". + If unsure, select "Emulate execution only". config LEGACY_VSYSCALL_EMULATE - bool "Emulate" + bool "Full emulation" help - The kernel traps and emulates calls into the fixed - vsyscall address mapping. This makes the mapping - non-executable, but it still contains known contents, - which could be used in certain rare security vulnerability - exploits. This configuration is recommended when userspace - still uses the vsyscall area. + The kernel traps and emulates calls into the fixed vsyscall + address mapping. This makes the mapping non-executable, but + it still contains readable known contents, which could be + used in certain rare security vulnerability exploits. This + configuration is recommended when using legacy userspace + that still uses vsyscalls along with legacy binary + instrumentation tools that require code to be readable. 
+ + An example of this type of legacy userspace is running + Pin on an old binary that still uses vsyscalls. + + config LEGACY_VSYSCALL_XONLY + bool "Emulate execution only" + help + The kernel traps and emulates calls into the fixed vsyscall + address mapping and does not allow reads. This + configuration is recommended when userspace might use the + legacy vsyscall area but support for legacy binary + instrumentation of legacy code is not needed. It mitigates + certain uses of the vsyscall area as an
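
The practical difference between the modes can be illustrated with a short userspace sketch (an assumption-laden illustration, not from the patch; 0xffffffffff600000 is the fixed vsyscall gettimeofday entry):

#include <sys/time.h>

typedef long (*vsys_gtod_t)(struct timeval *tv, struct timezone *tz);

void xonly_demo(void)
{
	struct timeval tv;
	unsigned char byte;

	/* Executing the page: trapped and emulated, still works under xonly. */
	((vsys_gtod_t)0xffffffffff600000)(&tv, 0);

	/* Reading the page: allowed under emulate, SIGSEGV under xonly. */
	byte = *(volatile unsigned char *)0xffffffffff600000;
	(void)byte;
}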
[tip:x86/entry] x86/vsyscall: Show something useful on a read fault
Commit-ID: 918ce325098a4eef99daad7b6796da33cebaf03a Gitweb: https://git.kernel.org/tip/918ce325098a4eef99daad7b6796da33cebaf03a Author: Andy Lutomirski AuthorDate: Wed, 26 Jun 2019 21:45:04 -0700 Committer: Thomas Gleixner CommitDate: Fri, 28 Jun 2019 00:04:39 +0200 x86/vsyscall: Show something useful on a read fault Just segfaulting the application when it tries to read the vsyscall page in xonly mode is not helpful for those who need to debug it. Emit a hint. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Cc: Florian Weimer Cc: Jann Horn Link: https://lkml.kernel.org/r/8016afffe0eab497be32017ad7f6f7030dc3ba66.1561610354.git.l...@kernel.org --- arch/x86/entry/vsyscall/vsyscall_64.c | 19 ++- arch/x86/include/asm/vsyscall.h | 6 -- arch/x86/mm/fault.c | 11 +-- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index fedd7628f3a6..9c58ab807aeb 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -117,7 +117,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) } } -bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +bool emulate_vsyscall(unsigned long error_code, + struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; @@ -126,6 +127,22 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) long ret; unsigned long orig_dx; + /* Write faults or kernel-privilege faults never get fixed up. */ + if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) + return false; + + if (!(error_code & X86_PF_INSTR)) { + /* Failed vsyscall read */ + if (vsyscall_mode == EMULATE) + return false; + + /* +* User code tried and failed to read the vsyscall page. +*/ + warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround"); + return false; + } + /* * No point in checking CS -- the only way to get here is a user mode * trap to a high address, which means that we're in 64-bit user code. diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index b986b2ca688a..ab60a71a8dcb 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -13,10 +13,12 @@ extern void set_vsyscall_pgtable_user_bits(pgd_t *root); * Called on instruction fetch fault in vsyscall page. * Returns true if handled. */ -extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); +extern bool emulate_vsyscall(unsigned long error_code, +struct pt_regs *regs, unsigned long address); #else static inline void map_vsyscall(void) {} -static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +static inline bool emulate_vsyscall(unsigned long error_code, + struct pt_regs *regs, unsigned long address) { return false; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 46df4c6aae46..288a5462076f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1369,16 +1369,15 @@ void do_user_addr_fault(struct pt_regs *regs, #ifdef CONFIG_X86_64 /* -* Instruction fetch faults in the vsyscall page might need -* emulation. The vsyscall page is at a high address -* (>PAGE_OFFSET), but is considered to be part of the user -* address space. +* Faults in the vsyscall page might need emulation. The +* vsyscall page is at a high address (>PAGE_OFFSET), but is +* considered to be part of the user address space. 
* * The vsyscall page does not have a "real" VMA, so do this * emulation before we go searching for VMAs. */ - if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { - if (emulate_vsyscall(regs, address)) + if (is_vsyscall_vaddr(address)) { + if (emulate_vsyscall(hw_error_code, regs, address)) return; } #endif
[tip:x86/entry] Documentation/admin: Remove the vsyscall=native documentation
Commit-ID: d974ffcfb7447db5f29a4b662a3eaf99a4e1109e Gitweb: https://git.kernel.org/tip/d974ffcfb7447db5f29a4b662a3eaf99a4e1109e Author: Andy Lutomirski AuthorDate: Wed, 26 Jun 2019 21:45:02 -0700 Committer: Thomas Gleixner CommitDate: Fri, 28 Jun 2019 00:04:38 +0200 Documentation/admin: Remove the vsyscall=native documentation The vsyscall=native feature is gone -- remove the docs. Fixes: 076ca272a14c ("x86/vsyscall/64: Drop "native" vsyscalls") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Kees Cook Cc: Florian Weimer Cc: Jann Horn Cc: sta...@vger.kernel.org Cc: Borislav Petkov Cc: Kernel Hardening Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/d77c7105eb4c57c1a95a95b6a5b8ba194a18e764.1561610354.git.l...@kernel.org --- Documentation/admin-guide/kernel-parameters.txt | 6 -- 1 file changed, 6 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..0082d1e56999 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5102,12 +5102,6 @@ emulate [default] Vsyscalls turn into traps and are emulated reasonably safely. - native Vsyscalls are native syscall instructions. - This is a little bit faster than trapping - and makes a few dynamic recompilers work - better than they would in emulation mode. - It also makes exploits much easier to write. - noneVsyscalls don't work at all. This makes them quite hard to use for exploits but might break your system.
[tip:timers/vdso] x86/vdso: Give the [ph]vclock_page declarations real types
Commit-ID: ecf9db3d1f1a8fd2c335148891c3b044e9ce0628 Gitweb: https://git.kernel.org/tip/ecf9db3d1f1a8fd2c335148891c3b044e9ce0628 Author: Andy Lutomirski AuthorDate: Sat, 22 Jun 2019 15:08:18 -0700 Committer: Thomas Gleixner CommitDate: Mon, 24 Jun 2019 01:21:31 +0200 x86/vdso: Give the [ph]vclock_page declarations real types Clean up the vDSO code a bit by giving pvclock_page and hvclock_page their actual types instead of u8[PAGE_SIZE]. This shouldn't materially affect the generated code. Heavily based on a patch from Linus. [ tglx: Adapted to the unified VDSO code ] Co-developed-by: Linus Torvalds Signed-off-by: Linus Torvalds Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/6920c5188f8658001af1fc56fd35b815706d300c.1561241273.git.l...@kernel.org --- arch/x86/include/asm/vdso/gettimeofday.h | 36 +--- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index f92752d6cbcf..5b63f1f78a1f 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -26,13 +26,33 @@ #define VDSO_HAS_CLOCK_GETRES 1 +/* + * Declare the memory-mapped vclock data pages. These come from hypervisors. + * If we ever reintroduce something like direct access to an MMIO clock like + * the HPET again, it will go here as well. + * + * A load from any of these pages will segfault if the clock in question is + * disabled, so appropriate compiler barriers and checks need to be used + * to prevent stray loads. + * + * These declarations MUST NOT be const. The compiler will assume that + * an extern const variable has genuinely constant contents, and the + * resulting code won't work, since the whole point is that these pages + * change over time, possibly while we're accessing them. + */ + #ifdef CONFIG_PARAVIRT_CLOCK -extern u8 pvclock_page[PAGE_SIZE] +/* + * This is the vCPU 0 pvclock page. We only use pvclock from the vDSO + * if the hypervisor tells us that all vCPUs can get valid data from the + * vCPU 0 page. + */ +extern struct pvclock_vsyscall_time_info pvclock_page __attribute__((visibility("hidden"))); #endif #ifdef CONFIG_HYPERV_TSCPAGE -extern u8 hvclock_page[PAGE_SIZE] +extern struct ms_hyperv_tsc_page hvclock_page __attribute__((visibility("hidden"))); #endif @@ -131,14 +151,9 @@ clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) #endif #ifdef CONFIG_PARAVIRT_CLOCK -static const struct pvclock_vsyscall_time_info *get_pvti0(void) -{ - return (const struct pvclock_vsyscall_time_info *)_page; -} - static u64 vread_pvclock(void) { - const struct pvclock_vcpu_time_info *pvti = _pvti0()->pvti; + const struct pvclock_vcpu_time_info *pvti = _page.pvti; u32 version; u64 ret; @@ -180,10 +195,7 @@ static u64 vread_pvclock(void) #ifdef CONFIG_HYPERV_TSCPAGE static u64 vread_hvclock(void) { - const struct ms_hyperv_tsc_page *tsc_pg = - (const struct ms_hyperv_tsc_page *)_page; - - return hv_read_tsc_page(tsc_pg); + return hv_read_tsc_page(_page); } #endif
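
The "MUST NOT be const" rule in the new comment can be illustrated with a tiny example (my illustration, not kernel code; rest_a_while() is an assumed external function):

extern const unsigned long tick;	/* WRONG for memory a hypervisor rewrites */
extern void rest_a_while(void);

unsigned long read_around_call(void)
{
	unsigned long a, b;

	a = tick;
	rest_a_while();		/* non-const: the compiler must reload afterwards */
	b = tick;		/* const: it may legally reuse 'a' instead */
	return b - a;		/* ...and fold this to 0 */
}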
[tip:x86/cpu] x86/cpu: Enable FSGSBASE on 64bit by default and add a chicken bit
Commit-ID: 2032f1f96ee0da600633c6c627b9c0a2e0f8b8a6 Gitweb: https://git.kernel.org/tip/2032f1f96ee0da600633c6c627b9c0a2e0f8b8a6 Author: Andy Lutomirski AuthorDate: Wed, 8 May 2019 03:02:31 -0700 Committer: Thomas Gleixner CommitDate: Sat, 22 Jun 2019 11:38:56 +0200 x86/cpu: Enable FSGSBASE on 64bit by default and add a chicken bit Now that FSGSBASE is fully supported, remove unsafe_fsgsbase, enable FSGSBASE by default, and add nofsgsbase to disable it. Signed-off-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Reviewed-by: Andi Kleen Cc: Ravi Shankar Cc: H. Peter Anvin Link: https://lkml.kernel.org/r/1557309753-24073-17-git-send-email-chang.seok@intel.com --- Documentation/admin-guide/kernel-parameters.txt | 3 +-- arch/x86/kernel/cpu/common.c| 32 +++-- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b0fa5273b0fc..35bc3c3574c6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2857,8 +2857,7 @@ no5lvl [X86-64] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. - unsafe_fsgsbase [X86] Allow FSGSBASE instructions. This will be - replaced with a nofsgsbase flag. + nofsgsbase [X86] Disables FSGSBASE instructions. no_console_suspend [HW] Never suspend the console diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 71defe2d1b7c..1305f16b6105 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -366,21 +366,21 @@ out: cr4_clear_bits(X86_CR4_UMIP); } -/* - * Temporary hack: FSGSBASE is unsafe until a few kernel code paths are - * updated. This allows us to get the kernel ready incrementally. - * - * Once all the pieces are in place, these will go away and be replaced with - * a nofsgsbase chicken flag. - */ -static bool unsafe_fsgsbase; - -static __init int setup_unsafe_fsgsbase(char *arg) +static __init int x86_nofsgsbase_setup(char *arg) { - unsafe_fsgsbase = true; + /* Require an exact match without trailing characters. */ + if (strlen(arg)) + return 0; + + /* Do not emit a message if the feature is not present. */ + if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_FSGSBASE); + pr_info("FSGSBASE disabled via kernel command line\n"); return 1; } -__setup("unsafe_fsgsbase", setup_unsafe_fsgsbase); +__setup("nofsgsbase", x86_nofsgsbase_setup); /* * Protection Keys are not available in 32-bit mode. @@ -1387,12 +1387,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_umip(c); /* Enable FSGSBASE instructions if available. */ - if (cpu_has(c, X86_FEATURE_FSGSBASE)) { - if (unsafe_fsgsbase) - cr4_set_bits(X86_CR4_FSGSBASE); - else - clear_cpu_cap(c, X86_FEATURE_FSGSBASE); - } + if (cpu_has(c, X86_FEATURE_FSGSBASE)) + cr4_set_bits(X86_CR4_FSGSBASE); /* * The vendor-specific functions might have changed features.
[tip:x86/cpu] selftests/x86/fsgsbase: Test RD/WRGSBASE
Commit-ID: 9ad75a0922e1533b08f3d1451bd908d19e5db41e Gitweb: https://git.kernel.org/tip/9ad75a0922e1533b08f3d1451bd908d19e5db41e Author: Andy Lutomirski AuthorDate: Wed, 8 May 2019 03:02:29 -0700 Committer: Thomas Gleixner CommitDate: Sat, 22 Jun 2019 11:38:55 +0200 selftests/x86/fsgsbase: Test RD/WRGSBASE This validates that GS and GSBASE are independently preserved across context switches. [ chang: Use FSGSBASE instructions directly instead of .byte ] Signed-off-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Reviewed-by: Andi Kleen Cc: Ravi Shankar Cc: H. Peter Anvin Link: https://lkml.kernel.org/r/1557309753-24073-15-git-send-email-chang.seok@intel.com --- tools/testing/selftests/x86/fsgsbase.c | 102 - 1 file changed, 99 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index b02ddce49bbb..afd029897c79 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -26,6 +26,7 @@ #include #include #include +#include #ifndef __x86_64__ # error This test is 64-bit only @@ -74,6 +75,43 @@ static void sigsegv(int sig, siginfo_t *si, void *ctx_void) } +static jmp_buf jmpbuf; + +static void sigill(int sig, siginfo_t *si, void *ctx_void) +{ + siglongjmp(jmpbuf, 1); +} + +static bool have_fsgsbase; + +static inline unsigned long rdgsbase(void) +{ + unsigned long gsbase; + + asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory"); + + return gsbase; +} + +static inline unsigned long rdfsbase(void) +{ + unsigned long fsbase; + + asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); + + return fsbase; +} + +static inline void wrgsbase(unsigned long gsbase) +{ + asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); +} + +static inline void wrfsbase(unsigned long fsbase) +{ + asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory"); +} + enum which_base { FS, GS }; static unsigned long read_base(enum which_base which) @@ -202,14 +240,16 @@ static void do_remote_base() to_set, hard_zero ? " and clear gs" : "", sel); } -void do_unexpected_base(void) +static __thread int set_thread_area_entry_number = -1; + +static void do_unexpected_base(void) { /* * The goal here is to try to arrange for GS == 0, GSBASE != * 0, and for the the kernel the think that GSBASE == 0. * * To make the test as reliable as possible, this uses -* explicit descriptorss. (This is not the only way. This +* explicit descriptors. (This is not the only way. This * could use ARCH_SET_GS with a low, nonzero base, but the * relevant side effect of ARCH_SET_GS could change.) 
*/ @@ -242,7 +282,7 @@ void do_unexpected_base(void) MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0); memcpy(low_desc, , sizeof(desc)); - low_desc->entry_number = -1; + low_desc->entry_number = set_thread_area_entry_number; /* 32-bit set_thread_area */ long ret; @@ -257,6 +297,8 @@ void do_unexpected_base(void) return; } printf("\tother thread: using GDT slot %d\n", desc.entry_number); + set_thread_area_entry_number = desc.entry_number; + asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)((desc.entry_number << 3) | 0x3))); } @@ -268,6 +310,34 @@ void do_unexpected_base(void) asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); } +void test_wrbase(unsigned short index, unsigned long base) +{ + unsigned short newindex; + unsigned long newbase; + + printf("[RUN]\tGS = 0x%hx, GSBASE = 0x%lx\n", index, base); + + asm volatile ("mov %0, %%gs" : : "rm" (index)); + wrgsbase(base); + + remote_base = 0; + ftx = 1; + syscall(SYS_futex, , FUTEX_WAKE, 0, NULL, NULL, 0); + while (ftx != 0) + syscall(SYS_futex, , FUTEX_WAIT, 1, NULL, NULL, 0); + + asm volatile ("mov %%gs, %0" : "=rm" (newindex)); + newbase = rdgsbase(); + + if (newindex == index && newbase == base) { + printf("[OK]\tIndex and base were preserved\n"); + } else { + printf("[FAIL]\tAfter switch, GS = 0x%hx and GSBASE = 0x%lx\n", + newindex, newbase); + nerrs++; + } +} + static void *threadproc(void *ctx) { while (1) { @@ -439,6 +509,17 @@ int main() { pthread_t thread; + /* Probe FSGSBASE */ + sethandler(SIGILL, sigill, 0); + if (sigsetjmp(jmpbuf, 1) == 0) { + rdfsbase(); + have_fsgsbase = true; + printf("\tFSGSBASE instructions are enabled\n"); + } else { +
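The selftest's probe can be distilled into a standalone program: execute RDFSBASE and catch the SIGILL that results if the kernel has not set CR4.FSGSBASE. This is a sketch of the same technique, assuming an assembler recent enough to know the FSGSBASE mnemonics:

#include <setjmp.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>

static sigjmp_buf jmpbuf;

static void sigill(int sig)
{
	(void)sig;
	siglongjmp(jmpbuf, 1);
}

static inline unsigned long rdfsbase(void)
{
	unsigned long fsbase;

	asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory");

	return fsbase;
}

int main(void)
{
	bool have_fsgsbase = false;

	signal(SIGILL, sigill);

	/* sigsetjmp(..., 1) saves the signal mask so siglongjmp() restores it. */
	if (sigsetjmp(jmpbuf, 1) == 0) {
		rdfsbase();
		have_fsgsbase = true;
	}

	printf("FSGSBASE instructions %s enabled by the kernel\n",
	       have_fsgsbase ? "are" : "are NOT");

	return have_fsgsbase ? 0 : 1;
}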
[tip:x86/cpu] x86/process/64: Use FSGSBASE in switch_to() if available
Commit-ID: 1ab5f3f7fe3d7548b4361b68c1fed140c6841af9 Gitweb: https://git.kernel.org/tip/1ab5f3f7fe3d7548b4361b68c1fed140c6841af9 Author: Andy Lutomirski AuthorDate: Wed, 8 May 2019 03:02:22 -0700 Committer: Thomas Gleixner CommitDate: Sat, 22 Jun 2019 11:38:52 +0200 x86/process/64: Use FSGSBASE in switch_to() if available With the new FSGSBASE instructions, FS and GSBASE can be efficiently read and written in __switch_to(). Use that capability to preserve the full state. This will enable user code to do whatever it wants with the new instructions without any kernel-induced gotchas. (There can still be architectural gotchas: movl %gs,%eax; movl %eax,%gs may change GSBASE if WRGSBASE was used, but users are expected to read the CPU manual before doing things like that.) This is a considerable speedup. It seems to save about 100 cycles per context switch compared to the baseline 4.6-rc1 behavior on a Skylake laptop. [ chang: 5~10% performance improvements were seen with a context switch benchmark that ran threads with different FS/GSBASE values (to the baseline 4.16). Minor edit on the changelog. ] [ tglx: Massage changelog ] Signed-off-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Reviewed-by: Andi Kleen Cc: Ravi Shankar Cc: H. Peter Anvin Link: https://lkml.kernel.org/r/1557309753-24073-8-git-send-email-chang.seok@intel.com --- arch/x86/kernel/process_64.c | 34 -- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c34ee0f72378..59013f480b86 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -244,8 +244,18 @@ static __always_inline void save_fsgs(struct task_struct *task) { savesegment(fs, task->thread.fsindex); savesegment(gs, task->thread.gsindex); - save_base_legacy(task, task->thread.fsindex, FS); - save_base_legacy(task, task->thread.gsindex, GS); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* +* If FSGSBASE is enabled, we can't make any useful guesses +* about the base, and user code expects us to save the current +* value. Fortunately, reading the base directly is efficient. +*/ + task->thread.fsbase = rdfsbase(); + task->thread.gsbase = __rdgsbase_inactive(); + } else { + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); + } } #if IS_ENABLED(CONFIG_KVM) @@ -324,10 +334,22 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* Update the FS and GS selectors if they could have changed. */ + if (unlikely(prev->fsindex || next->fsindex)) + loadseg(FS, next->fsindex); + if (unlikely(prev->gsindex || next->gsindex)) + loadseg(GS, next->gsindex); + + /* Update the bases. */ + wrfsbase(next->fsbase); + __wrgsbase_inactive(next->gsbase); + } else { + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); + } } static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
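The user-visible effect can be checked with a small program on a kernel that has FSGSBASE enabled: write GSBASE with WRGSBASE, provoke some context switches, and verify the value survives. This is an illustrative sketch, not one of the kernel selftests; on a kernel without CR4.FSGSBASE set, the WRGSBASE below raises SIGILL, and the 0x123456 value is an arbitrary canonical address chosen for the demo.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static inline void wrgsbase(unsigned long gsbase)
{
	asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory");
}

static inline unsigned long rdgsbase(void)
{
	unsigned long gsbase;

	asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory");

	return gsbase;
}

int main(void)
{
	unsigned long want = 0x123456;	/* arbitrary canonical user value */
	unsigned long got;
	int i;

	/* Raises SIGILL if the kernel has not enabled CR4.FSGSBASE. */
	wrgsbase(want);

	/* Provoke a few context switches. */
	for (i = 0; i < 1000; i++)
		sched_yield();
	usleep(10000);

	got = rdgsbase();
	printf("GSBASE written 0x%lx, read back 0x%lx: %s\n",
	       want, got, got == want ? "preserved" : "lost");

	return got == want ? 0 : 1;
}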
[tip:x86/cpu] x86/cpu: Add 'unsafe_fsgsbase' to enable CR4.FSGSBASE
Commit-ID: b64ed19b93c368be0fb6acf05377e8e3a694c92b Gitweb: https://git.kernel.org/tip/b64ed19b93c368be0fb6acf05377e8e3a694c92b Author: Andy Lutomirski AuthorDate: Wed, 8 May 2019 03:02:18 -0700 Committer: Thomas Gleixner CommitDate: Sat, 22 Jun 2019 11:38:51 +0200 x86/cpu: Add 'unsafe_fsgsbase' to enable CR4.FSGSBASE This is temporary. It will allow the next few patches to be tested incrementally. Setting unsafe_fsgsbase is a root hole. Don't do it. Signed-off-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Reviewed-by: Andi Kleen Reviewed-by: Andy Lutomirski Cc: Ravi Shankar Cc: Andrew Morton Cc: Randy Dunlap Cc: H. Peter Anvin Link: https://lkml.kernel.org/r/1557309753-24073-4-git-send-email-chang.seok@intel.com --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ arch/x86/kernel/cpu/common.c| 24 2 files changed, 27 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..b0fa5273b0fc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2857,6 +2857,9 @@ no5lvl [X86-64] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. + unsafe_fsgsbase [X86] Allow FSGSBASE instructions. This will be + replaced with a nofsgsbase flag. + no_console_suspend [HW] Never suspend the console Disable suspending of consoles during suspend and diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index dad20bc891d5..71defe2d1b7c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -366,6 +366,22 @@ out: cr4_clear_bits(X86_CR4_UMIP); } +/* + * Temporary hack: FSGSBASE is unsafe until a few kernel code paths are + * updated. This allows us to get the kernel ready incrementally. + * + * Once all the pieces are in place, these will go away and be replaced with + * a nofsgsbase chicken flag. + */ +static bool unsafe_fsgsbase; + +static __init int setup_unsafe_fsgsbase(char *arg) +{ + unsafe_fsgsbase = true; + return 1; +} +__setup("unsafe_fsgsbase", setup_unsafe_fsgsbase); + /* * Protection Keys are not available in 32-bit mode. */ @@ -1370,6 +1386,14 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smap(c); setup_umip(c); + /* Enable FSGSBASE instructions if available. */ + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { + if (unsafe_fsgsbase) + cr4_set_bits(X86_CR4_FSGSBASE); + else + clear_cpu_cap(c, X86_FEATURE_FSGSBASE); + } + /* * The vendor-specific functions might have changed features. * Now we do "generic changes."
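Whether the chicken flag took effect is visible from userspace: when the capability is cleared, the fsgsbase flag disappears from /proc/cpuinfo even on hardware that reports it via CPUID. A minimal check, assuming the usual "flags" line format of /proc/cpuinfo:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[8192];
	FILE *f = fopen("/proc/cpuinfo", "r");

	if (!f) {
		perror("/proc/cpuinfo");
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "flags", 5) == 0) {
			printf("fsgsbase %s enabled by this kernel\n",
			       strstr(line, " fsgsbase") ? "is" : "is not");
			fclose(f);
			return 0;
		}
	}

	fclose(f);
	return 1;
}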
[tip:x86/urgent] x86/speculation/mds: Improve CPU buffer clear documentation
Commit-ID: 9d8d0294e78a164d407133dea05caf4b84247d6a Gitweb: https://git.kernel.org/tip/9d8d0294e78a164d407133dea05caf4b84247d6a Author: Andy Lutomirski AuthorDate: Tue, 14 May 2019 13:24:40 -0700 Committer: Ingo Molnar CommitDate: Thu, 16 May 2019 09:05:12 +0200 x86/speculation/mds: Improve CPU buffer clear documentation On x86_64, all returns to usermode go through prepare_exit_to_usermode(), with the sole exception of do_nmi(). This even includes machine checks -- this was added several years ago to support MCE recovery. Update the documentation. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: Greg Kroah-Hartman Cc: Jon Masters Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sta...@vger.kernel.org Fixes: 04dcbdb80578 ("x86/speculation/mds: Clear CPU buffers on exit to user") Link: http://lkml.kernel.org/r/999fa9e126ba6a48e9d214d2f18dbde5c62ac55c.1557865329.git.l...@kernel.org Signed-off-by: Ingo Molnar --- Documentation/x86/mds.rst | 39 +++ 1 file changed, 7 insertions(+), 32 deletions(-) diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst index 0dc812bb9249..5d4330be200f 100644 --- a/Documentation/x86/mds.rst +++ b/Documentation/x86/mds.rst @@ -142,38 +142,13 @@ Mitigation points mds_user_clear. The mitigation is invoked in prepare_exit_to_usermode() which covers - most of the kernel to user space transitions. There are a few exceptions - which are not invoking prepare_exit_to_usermode() on return to user - space. These exceptions use the paranoid exit code. - - - Non Maskable Interrupt (NMI): - - Access to sensible data like keys, credentials in the NMI context is - mostly theoretical: The CPU can do prefetching or execute a - misspeculated code path and thereby fetching data which might end up - leaking through a buffer. - - But for mounting other attacks the kernel stack address of the task is - already valuable information. So in full mitigation mode, the NMI is - mitigated on the return from do_nmi() to provide almost complete - coverage. - - - Machine Check Exception (#MC): - - Another corner case is a #MC which hits between the CPU buffer clear - invocation and the actual return to user. As this still is in kernel - space it takes the paranoid exit path which does not clear the CPU - buffers. So the #MC handler repopulates the buffers to some - extent. Machine checks are not reliably controllable and the window is - extremly small so mitigation would just tick a checkbox that this - theoretical corner case is covered. To keep the amount of special - cases small, ignore #MC. - - - Debug Exception (#DB): - - This takes the paranoid exit path only when the INT1 breakpoint is in - kernel space. #DB on a user space address takes the regular exit path, - so no extra mitigation required. + all but one of the kernel to user space transitions. The exception + is when we return from a Non Maskable Interrupt (NMI), which is + handled directly in do_nmi(). + + (The reason that NMI is special is that prepare_exit_to_usermode() can +enable IRQs. In NMI context, NMIs are blocked, and we don't want to +enable IRQs with NMIs blocked.) 2. C-State transition
[tip:x86/urgent] x86/speculation/mds: Revert CPU buffer clear on double fault exit
Commit-ID: 88640e1dcd089879530a49a8d212d1814678dfe7 Gitweb: https://git.kernel.org/tip/88640e1dcd089879530a49a8d212d1814678dfe7 Author: Andy Lutomirski AuthorDate: Tue, 14 May 2019 13:24:39 -0700 Committer: Ingo Molnar CommitDate: Thu, 16 May 2019 09:05:11 +0200 x86/speculation/mds: Revert CPU buffer clear on double fault exit The double fault ESPFIX path doesn't return to user mode at all -- it returns back to the kernel by simulating a #GP fault. prepare_exit_to_usermode() will run on the way out of general_protection before running user code. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: Greg Kroah-Hartman Cc: Jon Masters Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sta...@vger.kernel.org Fixes: 04dcbdb80578 ("x86/speculation/mds: Clear CPU buffers on exit to user") Link: http://lkml.kernel.org/r/ac97612445c0a44ee10374f6ea79c222fe22a5c4.1557865329.git.l...@kernel.org Signed-off-by: Ingo Molnar --- Documentation/x86/mds.rst | 7 --- arch/x86/kernel/traps.c | 8 2 files changed, 15 deletions(-) diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst index 534e9baa4e1d..0dc812bb9249 100644 --- a/Documentation/x86/mds.rst +++ b/Documentation/x86/mds.rst @@ -158,13 +158,6 @@ Mitigation points mitigated on the return from do_nmi() to provide almost complete coverage. - - Double fault (#DF): - - A double fault is usually fatal, but the ESPFIX workaround, which can - be triggered from user space through modify_ldt(2) is a recoverable - double fault. #DF uses the paranoid exit path, so explicit mitigation - in the double fault handler is required. - - Machine Check Exception (#MC): Another corner case is a #MC which hits between the CPU buffer clear diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7de466eb960b..8b6d03e55d2f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -58,7 +58,6 @@ #include #include #include -#include #include #include #include @@ -368,13 +367,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) regs->ip = (unsigned long)general_protection; regs->sp = (unsigned long)>orig_ax; - /* -* This situation can be triggered by userspace via -* modify_ldt(2) and the return does not take the regular -* user space exit, so a CPU buffer clear is required when -* MDS mitigation is enabled. -*/ - mds_user_clear_cpu_buffers(); return; } #endif
[tip:x86/mm] x86/mm: Introduce temporary mm structs
Commit-ID: cefa929c034eb5d9c15c50088235a0093a219687 Gitweb: https://git.kernel.org/tip/cefa929c034eb5d9c15c50088235a0093a219687 Author: Andy Lutomirski AuthorDate: Thu, 25 Apr 2019 17:11:23 -0700 Committer: Ingo Molnar CommitDate: Tue, 30 Apr 2019 12:37:50 +0200 x86/mm: Introduce temporary mm structs Using a dedicated page-table for temporary PTEs prevents other cores from using - even speculatively - these PTEs, thereby providing two benefits: (1) Security hardening: an attacker that gains kernel memory writing abilities cannot easily overwrite sensitive data. (2) Avoiding TLB shootdowns: the PTEs do not need to be flushed in remote page-tables. To do so a temporary mm_struct can be used. Mappings which are private for this mm can be set in the userspace part of the address-space. During the whole time in which the temporary mm is loaded, interrupts must be disabled. The first use-case for temporary mm struct, which will follow, is for poking the kernel text. [ Commit message was written by Nadav Amit ] Tested-by: Masami Hiramatsu Signed-off-by: Andy Lutomirski Signed-off-by: Nadav Amit Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-4-na...@vmware.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 33 + 1 file changed, 33 insertions(+) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 19d18fae6ec6..24dc3b810970 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -356,4 +356,37 @@ static inline unsigned long __get_current_cr3_fast(void) return cr3; } +typedef struct { + struct mm_struct *mm; +} temp_mm_state_t; + +/* + * Using a temporary mm allows to set temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the + * mapping is torn down. + * + * Context: The temporary mm needs to be used exclusively by a single core. To + * harden security IRQs must be disabled while the temporary mm is + * loaded, thereby preventing interrupt handler bugs from overriding + * the kernel memory protection. + */ +static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) +{ + temp_mm_state_t temp_state; + + lockdep_assert_irqs_disabled(); + temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, mm, current); + return temp_state; +} + +static inline void unuse_temporary_mm(temp_mm_state_t prev_state) +{ + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(NULL, prev_state.mm, current); +} + #endif /* _ASM_X86_MMU_CONTEXT_H */
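As a usage illustration only (kernel context, not a standalone program), the text-poking use-case mentioned above would look roughly like this. poking_mm and poke_write() are hypothetical names used for the sketch; the real implementation lands in a later patch.

#include <linux/irqflags.h>
#include <linux/string.h>
#include <asm/mmu_context.h>

static void poke_write(struct mm_struct *poking_mm, void *dst,
		       const void *src, size_t len)
{
	temp_mm_state_t prev;
	unsigned long flags;

	/* The temporary mm may only be used with interrupts disabled. */
	local_irq_save(flags);
	prev = use_temporary_mm(poking_mm);

	/* dst is mapped writably only in poking_mm, not in the regular kernel mappings. */
	memcpy(dst, src, len);

	unuse_temporary_mm(prev);
	local_irq_restore(flags);
}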
[tip:x86/irq] x86/irq/64: Remap the IRQ stack with guard pages
Commit-ID: 18b7a6bef62de1d598fbff23b52114b7775ecf00 Gitweb: https://git.kernel.org/tip/18b7a6bef62de1d598fbff23b52114b7775ecf00 Author: Andy Lutomirski AuthorDate: Sun, 14 Apr 2019 18:00:07 +0200 Committer: Borislav Petkov CommitDate: Wed, 17 Apr 2019 15:40:57 +0200 x86/irq/64: Remap the IRQ stack with guard pages The IRQ stack lives in percpu space, so an IRQ handler that overflows it will overwrite other data structures. Use vmap() to remap the IRQ stack so that it will have the usual guard pages that vmap()/vmalloc() allocations have. With this, the kernel will panic immediately on an IRQ stack overflow. [ tglx: Move the map code to a proper place and invoke it only when a CPU is about to be brought online. No point in installing the map at early boot for all possible CPUs. Fail the CPU bringup if the vmap() fails as done for all other preparatory stages in CPU hotplug. ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Nicolai Stange Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160146.363733...@linutronix.de --- arch/x86/kernel/irq_64.c | 30 ++ 1 file changed, 30 insertions(+) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index c0f89d136b80..f107eb2021f6 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -91,6 +91,35 @@ bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) return true; } +#ifdef CONFIG_VMAP_STACK +/* + * VMAP the backing store with guard pages + */ +static int map_irq_stack(unsigned int cpu) +{ + char *stack = (char *)per_cpu_ptr(_stack_backing_store, cpu); + struct page *pages[IRQ_STACK_SIZE / PAGE_SIZE]; + void *va; + int i; + + for (i = 0; i < IRQ_STACK_SIZE / PAGE_SIZE; i++) { + phys_addr_t pa = per_cpu_ptr_to_phys(stack + (i << PAGE_SHIFT)); + + pages[i] = pfn_to_page(pa >> PAGE_SHIFT); + } + + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + if (!va) + return -ENOMEM; + + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; +} +#else +/* + * If VMAP stacks are disabled due to KASAN, just use the per cpu + * backing store without guard pages. + */ static int map_irq_stack(unsigned int cpu) { void *va = per_cpu_ptr(_stack_backing_store, cpu); @@ -98,6 +127,7 @@ static int map_irq_stack(unsigned int cpu) per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; return 0; } +#endif int irq_init_percpu_irqstack(unsigned int cpu) {
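The effect of a guard page can be demonstrated with a userspace analog: mmap a region, turn the lowest page into a PROT_NONE guard, and watch the first out-of-bounds access fault immediately instead of silently corrupting neighbouring data. A minimal sketch (sizes and messages are arbitrary):

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static void on_segv(int sig)
{
	static const char msg[] = "guard page hit: overflow trapped immediately\n";

	(void)sig;
	write(1, msg, sizeof(msg) - 1);
	_exit(0);
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *region, *stack_bottom;

	/* Two pages of "stack" plus one guard page below them. */
	region = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (region == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* The lowest page becomes the guard: any access to it faults. */
	if (mprotect(region, page, PROT_NONE)) {
		perror("mprotect");
		return 1;
	}

	signal(SIGSEGV, on_segv);

	stack_bottom = region + page;		/* usable "stack" starts here */
	memset(stack_bottom, 0, 2 * page);	/* in bounds, no fault */
	stack_bottom[-1] = 0;			/* "overflow" into the guard page */

	puts("guard page NOT hit (unexpected)");
	return 1;
}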
[tip:x86/irq] x86/irq/64: Split the IRQ stack into its own pages
Commit-ID: e6401c13093173aad709a5c6de00cf8d692ee786 Gitweb: https://git.kernel.org/tip/e6401c13093173aad709a5c6de00cf8d692ee786 Author: Andy Lutomirski AuthorDate: Sun, 14 Apr 2019 18:00:06 +0200 Committer: Borislav Petkov CommitDate: Wed, 17 Apr 2019 15:37:02 +0200 x86/irq/64: Split the IRQ stack into its own pages Currently, the IRQ stack is hardcoded as the first page of the percpu area, and the stack canary lives on the IRQ stack. The former gets in the way of adding an IRQ stack guard page, and the latter is a potential weakness in the stack canary mechanism. Split the IRQ stack into its own private percpu pages. [ tglx: Make 64 and 32 bit share struct irq_stack ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Ard Biesheuvel Cc: Boris Ostrovsky Cc: Brijesh Singh Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: Feng Tang Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jan Beulich Cc: Jiri Kosina Cc: Joerg Roedel Cc: Jordan Borgner Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Konrad Rzeszutek Wilk Cc: Maran Wilson Cc: Masahiro Yamada Cc: Michal Hocko Cc: Mike Rapoport Cc: Nick Desaulniers Cc: Nicolai Stange Cc: Peter Zijlstra Cc: Pu Wen Cc: "Rafael Ávila de Espíndola" Cc: Sean Christopherson Cc: Stefano Stabellini Cc: Vlastimil Babka Cc: x86-ml Cc: xen-de...@lists.xenproject.org Link: https://lkml.kernel.org/r/20190414160146.267376...@linutronix.de --- arch/x86/entry/entry_64.S | 4 ++-- arch/x86/include/asm/processor.h | 32 ++-- arch/x86/include/asm/stackprotector.h | 6 +++--- arch/x86/kernel/asm-offsets_64.c | 2 +- arch/x86/kernel/cpu/common.c | 8 arch/x86/kernel/head_64.S | 2 +- arch/x86/kernel/irq_64.c | 5 - arch/x86/kernel/setup_percpu.c| 5 - arch/x86/kernel/vmlinux.lds.S | 7 --- arch/x86/tools/relocs.c | 2 +- arch/x86/xen/xen-head.S | 10 +- 11 files changed, 39 insertions(+), 44 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 726abbe6c6d8..cfe4d6ea258d 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -298,7 +298,7 @@ ENTRY(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movqTASK_stack_canary(%rsi), %rbx - movq%rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset + movq%rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset #endif #ifdef CONFIG_RETPOLINE @@ -430,7 +430,7 @@ END(irq_entries_start) * it before we actually move ourselves to the IRQ stack. */ - movq\old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) + movq\old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8) movqPER_CPU_VAR(hardirq_stack_ptr), %rsp #ifdef CONFIG_DEBUG_ENTRY diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5e3dd4e2136d..7e99ef67bff0 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -367,6 +367,13 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); #define __KERNEL_TSS_LIMIT \ (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1) +/* Per CPU interrupt stacks */ +struct irq_stack { + charstack[IRQ_STACK_SIZE]; +} __aligned(IRQ_STACK_SIZE); + +DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); + #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #else @@ -375,28 +382,24 @@ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #endif #ifdef CONFIG_X86_64 -union irq_stack_union { - char irq_stack[IRQ_STACK_SIZE]; +struct fixed_percpu_data { /* * GCC hardcodes the stack canary as %gs:40. 
Since the * irq_stack is the object at %gs:0, we reserve the bottom * 48 bytes of the irq stack for the canary. */ - struct { - char gs_base[40]; - unsigned long stack_canary; - }; + chargs_base[40]; + unsigned long stack_canary; }; -DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; -DECLARE_INIT_PER_CPU(irq_stack_union); +DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible; +DECLARE_INIT_PER_CPU(fixed_percpu_data); static inline unsigned long cpu_kernelmode_gs_base(int cpu) { - return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu); + return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -DECLARE_PER_CPU(char *, hardirq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern asmlinkage void ignore_sysret(void); @@ -418,14 +421,7 @@ struct stack_canary { }; DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif -/* - * per-CPU IRQ handling stacks - */ -struct irq_stack {
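The %gs:40 canary convention mentioned in the comment has a direct userspace counterpart: on x86_64, GCC and glibc keep the stack-protector canary at %fs:0x28, which is the same byte offset 40. A small sketch that reads it, relying on glibc's TCB layout (an implementation detail, not a stable ABI):

#include <stdio.h>

static unsigned long read_user_canary(void)
{
	unsigned long canary;

	/*
	 * GCC and glibc on x86_64 keep the stack-protector canary at
	 * %fs:0x28, i.e. byte offset 40 into the TCB, mirroring the
	 * kernel's canary at %gs:40 in fixed_percpu_data.
	 */
	asm("mov %%fs:0x28, %0" : "=r" (canary));

	return canary;
}

int main(void)
{
	printf("userspace stack canary lives at %%fs:40; value 0x%lx\n",
	       read_user_canary());
	return 0;
}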
[tip:x86/irq] x86/irq/64: Remove a hardcoded irq_stack_union access
Commit-ID: 4f44b8f0b33b7111216f0fad353315f796b81617 Gitweb: https://git.kernel.org/tip/4f44b8f0b33b7111216f0fad353315f796b81617 Author: Andy Lutomirski AuthorDate: Sun, 14 Apr 2019 17:59:40 +0200 Committer: Borislav Petkov CommitDate: Wed, 17 Apr 2019 12:31:38 +0200 x86/irq/64: Remove a hardcoded irq_stack_union access stack_overflow_check() is using both irq_stack_ptr and irq_stack_union to find the IRQ stack. That's going to break when vmapped irq stacks are introduced. Change it to just use irq_stack_ptr. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Nicolai Stange Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160143.872549...@linutronix.de --- arch/x86/kernel/irq_64.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index b50ac9c7397b..f6dcc8fea5c0 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -55,9 +55,8 @@ static inline void stack_overflow_check(struct pt_regs *regs) regs->sp <= curbase + THREAD_SIZE) return; - irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) + - STACK_TOP_MARGIN; irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr); + irq_stack_top = irq_stack_bottom - IRQ_STACK_SIZE + STACK_TOP_MARGIN; if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) return;
[tip:x86/irq] x86/dumpstack: Fix off-by-one errors in stack identification
Commit-ID: fa33215422fd415a07ec2a00e9f1acdaf0fa8e94 Gitweb: https://git.kernel.org/tip/fa33215422fd415a07ec2a00e9f1acdaf0fa8e94 Author: Andy Lutomirski AuthorDate: Sun, 14 Apr 2019 17:59:39 +0200 Committer: Borislav Petkov CommitDate: Wed, 17 Apr 2019 12:26:50 +0200 x86/dumpstack: Fix off-by-one errors in stack identification The get_stack_info() function is off-by-one when checking whether an address is on a IRQ stack or a IST stack. This prevents an overflowed IRQ or IST stack from being dumped properly. [ tglx: Do the same for 32-bit ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160143.785651...@linutronix.de --- arch/x86/kernel/dumpstack_32.c | 4 ++-- arch/x86/kernel/dumpstack_64.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index cd53f3030e40..d305440ebe9c 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -41,7 +41,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -66,7 +66,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 5cdb9e84da57..90f0fa88cbb3 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -65,7 +65,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) begin = end - (exception_stack_sizes[k] / sizeof(long)); regs = (struct pt_regs *)end - 1; - if (stack <= begin || stack >= end) + if (stack < begin || stack >= end) continue; info->type = STACK_TYPE_EXCEPTION + k; @@ -88,7 +88,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack >= end) return false; info->type = STACK_TYPE_IRQ;
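The off-by-one is easy to see in isolation. The sketch below models the 32-bit in_hardirq_stack() hunk: with the old check, a stack pointer sitting exactly at the lowest word of the stack, i.e. a completely full (overflowed) stack, is wrongly reported as not being on that stack.

#include <stdbool.h>
#include <stdio.h>

#define STACK_WORDS 4

static unsigned long irq_stack[STACK_WORDS];

/* Old check: rejects a pointer at the lowest word, so a completely
 * full stack is reported as "not on the IRQ stack". */
static bool in_stack_old(unsigned long *stack)
{
	unsigned long *begin = irq_stack;
	unsigned long *end = irq_stack + STACK_WORDS;

	return !(stack <= begin || stack > end);
}

/* Fixed check from the patch: 'begin' itself is a valid stack pointer. */
static bool in_stack_new(unsigned long *stack)
{
	unsigned long *begin = irq_stack;
	unsigned long *end = irq_stack + STACK_WORDS;

	return !(stack < begin || stack > end);
}

int main(void)
{
	unsigned long *full = irq_stack;	/* stack grew all the way down */

	printf("full stack: old check says %d, fixed check says %d\n",
	       in_stack_old(full), in_stack_new(full));
	return 0;
}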
[tip:x86/urgent] x86/uaccess: Don't leak the AC flag into __put_user() value evaluation
Commit-ID: 2a418cf3f5f1caf911af288e978d61c9844b0695 Gitweb: https://git.kernel.org/tip/2a418cf3f5f1caf911af288e978d61c9844b0695 Author: Andy Lutomirski AuthorDate: Fri, 22 Feb 2019 17:17:04 -0800 Committer: Borislav Petkov CommitDate: Mon, 25 Feb 2019 20:17:05 +0100 x86/uaccess: Don't leak the AC flag into __put_user() value evaluation When calling __put_user(foo(), ptr), the __put_user() macro would call foo() in between __uaccess_begin() and __uaccess_end(). If that code were buggy, then those bugs would be run without SMAP protection. Fortunately, there seem to be few instances of the problem in the kernel. Nevertheless, __put_user() should be fixed to avoid doing this. Therefore, evaluate __put_user()'s argument before setting AC. This issue was noticed when an objtool hack by Peter Zijlstra complained about genregs_get() and I compared the assembly output to the C source. [ bp: Massage commit message and fixed up whitespace. ] Fixes: 11f1a4b9755f ("x86: reorganize SMAP handling in user space accesses") Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Acked-by: Linus Torvalds Cc: Peter Zijlstra Cc: Brian Gerst Cc: Josh Poimboeuf Cc: Denys Vlasenko Cc: sta...@vger.kernel.org Link: http://lkml.kernel.org/r/20190225125231.845656...@infradead.org --- arch/x86/include/asm/uaccess.h | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a77445d1b034..28376aa2d053 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -284,7 +284,7 @@ do { \ __put_user_goto(x, ptr, "l", "k", "ir", label); \ break; \ case 8: \ - __put_user_goto_u64((__typeof__(*ptr))(x), ptr, label); \ + __put_user_goto_u64(x, ptr, label); \ break; \ default:\ __put_user_bad(); \ @@ -431,8 +431,10 @@ do { \ ({ \ __label__ __pu_label; \ int __pu_err = -EFAULT; \ + __typeof__(*(ptr)) __pu_val;\ + __pu_val = x; \ __uaccess_begin(); \ - __put_user_size((x), (ptr), (size), __pu_label);\ + __put_user_size(__pu_val, (ptr), (size), __pu_label); \ __pu_err = 0; \ __pu_label:\ __uaccess_end();\
[tip:x86/urgent] x86/uaccess: Don't leak the AC flag into __put_user() value evaluation
Commit-ID: 1ee2bd5e09195d5476daefec5c64ba597a0a9920 Gitweb: https://git.kernel.org/tip/1ee2bd5e09195d5476daefec5c64ba597a0a9920 Author: Andy Lutomirski AuthorDate: Fri, 22 Feb 2019 17:17:04 -0800 Committer: Borislav Petkov CommitDate: Mon, 25 Feb 2019 18:55:04 +0100 x86/uaccess: Don't leak the AC flag into __put_user() value evaluation When calling __put_user(foo(), ptr), the __put_user() macro would call foo() in between __uaccess_begin() and __uaccess_end(). If that code were buggy, then those bugs would be run without SMAP protection. Fortunately, there seem to be few instances of the problem in the kernel. Nevertheless, __put_user() should be fixed to avoid doing this. Therefore, evaluate __put_user()'s argument before setting AC. This issue was noticed when an objtool hack by Peter Zijlstra complained about genregs_get() and I compared the assembly output to the C source. [ bp: Massage commit message. ] Fixes: 11f1a4b9755f ("x86: reorganize SMAP handling in user space accesses") Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Brian Gerst Cc: Josh Poimboeuf Cc: Denys Vlasenko Cc: sta...@vger.kernel.org Link: http://lkml.kernel.org/r/20190225125231.845656...@infradead.org --- arch/x86/include/asm/uaccess.h | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a77445d1b034..d7688efacf29 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -284,7 +284,7 @@ do { \ __put_user_goto(x, ptr, "l", "k", "ir", label); \ break; \ case 8: \ - __put_user_goto_u64((__typeof__(*ptr))(x), ptr, label); \ + __put_user_goto_u64(x, ptr, label); \ break; \ default:\ __put_user_bad(); \ @@ -431,8 +431,10 @@ do { \ ({ \ __label__ __pu_label; \ int __pu_err = -EFAULT; \ + __typeof__(*(ptr)) __pu_val;\ + __pu_val = x; \ __uaccess_begin(); \ - __put_user_size((x), (ptr), (size), __pu_label);\ + __put_user_size(__pu_val, (ptr), (size), __pu_label); \ __pu_err = 0; \ __pu_label:\ __uaccess_end();\
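A userspace model of the macro bug may help: if the argument expression is expanded between the begin/end pair, any code it runs executes while the protection is relaxed. The put_user_* macros and the ac_flag variable below are stand-ins invented for illustration, not kernel code; the fixed shape mirrors the patch by evaluating the argument into a local first.

#include <stdio.h>

static int ac_flag;	/* stands in for EFLAGS.AC, i.e. SMAP being open */

#define uaccess_begin()	(ac_flag = 1)
#define uaccess_end()	(ac_flag = 0)

/* Buggy shape: the caller's expression x is evaluated inside the window. */
#define put_user_buggy(x, ptr) do {		\
	uaccess_begin();			\
	*(ptr) = (x);				\
	uaccess_end();				\
} while (0)

/* Fixed shape, mirroring the patch: evaluate x into a local first. */
#define put_user_fixed(x, ptr) do {		\
	__typeof__(*(ptr)) __pu_val = (x);	\
	uaccess_begin();			\
	*(ptr) = __pu_val;			\
	uaccess_end();				\
} while (0)

static int foo(void)
{
	printf("foo() ran with AC analog = %d\n", ac_flag);
	return 42;
}

int main(void)
{
	int dst;

	put_user_buggy(foo(), &dst);	/* prints 1: foo() ran "unprotected" */
	put_user_fixed(foo(), &dst);	/* prints 0 */
	return 0;
}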
[tip:x86/mm] x86/vsyscall/64: Use X86_PF constants in the simulated #PF error code
Commit-ID: af2ebdcf044039e89da3cd44c0f04dea317020c5 Gitweb: https://git.kernel.org/tip/af2ebdcf044039e89da3cd44c0f04dea317020c5 Author: Andy Lutomirski AuthorDate: Wed, 21 Nov 2018 15:11:26 -0800 Committer: Ingo Molnar CommitDate: Thu, 22 Nov 2018 09:24:27 +0100 x86/vsyscall/64: Use X86_PF constants in the simulated #PF error code Rather than hardcoding 6 with a comment, use the defined constants. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/e023f20352b0d05a8b0205629897917262d2ad68.1542841400.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 85fd85d52ffd..d78bcc03e60e 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -102,7 +102,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { struct thread_struct *thread = >thread; - thread->error_code = 6; /* user fault, no page, write */ + thread->error_code = X86_PF_USER | X86_PF_WRITE; thread->cr2 = ptr; thread->trap_nr = X86_TRAP_PF;
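For reference, the page fault error code bits spell out why the old literal was 6: X86_PF_USER (4) plus X86_PF_WRITE (2), with X86_PF_PROT (1) clear meaning the page was not present. A tiny sketch with the bit values as used in arch/x86:

#include <stdio.h>

#define X86_PF_PROT	0x1	/* fault on a present page */
#define X86_PF_WRITE	0x2	/* the access was a write */
#define X86_PF_USER	0x4	/* the fault came from user mode */

int main(void)
{
	unsigned long code = X86_PF_USER | X86_PF_WRITE;

	printf("X86_PF_USER | X86_PF_WRITE = %lu (the old hardcoded value)\n", code);
	return 0;
}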
[tip:x86/mm] x86/oops: Show the correct CS value in show_regs()
Commit-ID: d38bc89c72e7235ac889ae64fe7828e2e61a18af Gitweb: https://git.kernel.org/tip/d38bc89c72e7235ac889ae64fe7828e2e61a18af Author: Andy Lutomirski AuthorDate: Wed, 21 Nov 2018 15:11:24 -0800 Committer: Ingo Molnar CommitDate: Thu, 22 Nov 2018 09:23:01 +0100 x86/oops: Show the correct CS value in show_regs() show_regs() shows the CS in the CPU register instead of the value in regs. This means that we'll probably print "CS: 0010" almost all the time regardless of what was actually in CS when the kernel malfunctioned. This gives a particularly confusing result if we OOPSed due to an implicit supervisor access from user mode. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/4e36812b6e1e95236a812021d35cbf22746b5af6.1542841400.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0e0b4288a4b2..2b8e6324fa20 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -66,7 +66,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; - unsigned int ds, cs, es; + unsigned int ds, es; show_iret_regs(regs); @@ -98,7 +98,6 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) } asm("movl %%ds,%0" : "=r" (ds)); - asm("movl %%cs,%0" : "=r" (cs)); asm("movl %%es,%0" : "=r" (es)); asm("movl %%fs,%0" : "=r" (fsindex)); asm("movl %%gs,%0" : "=r" (gsindex)); @@ -114,7 +113,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); - printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + printk(KERN_DEFAULT "CS: %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds, es, cr0); printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

[tip:x86/mm] x86/fault: Decode page fault OOPSes better
Commit-ID: a1a371c468f7238b7826fde55786b02377faf8e2 Gitweb: https://git.kernel.org/tip/a1a371c468f7238b7826fde55786b02377faf8e2 Author: Andy Lutomirski AuthorDate: Wed, 21 Nov 2018 15:11:25 -0800 Committer: Ingo Molnar CommitDate: Thu, 22 Nov 2018 09:24:28 +0100 x86/fault: Decode page fault OOPSes better One of Linus' favorite hobbies seems to be looking at OOPSes and decoding the error code in his head. This is not one of my favorite hobbies :) Teach the page fault OOPS hander to decode the error code. If it's a !USER fault from user mode, print an explicit note to that effect and print out the addresses of various tables that might cause such an error. With this patch applied, if I intentionally point the LDT at 0x0 and run the x86 selftests, I get: BUG: unable to handle kernel NULL pointer dereference at HW error: normal kernel read fault This was a system access from user code IDT: 0xfe00 (limit=0xfff) GDT: 0xfe001000 (limit=0x7f) LDTR: 0x50 -- base=0x0 limit=0xfff7 TR: 0x40 -- base=0xfe003000 limit=0x206f PGD 8456e067 P4D 8456e067 PUD 4623067 PMD 0 SMP PTI CPU: 0 PID: 153 Comm: ldt_gdt_64 Not tainted 4.19.0+ #1317 Hardware name: ... RIP: 0033:0x401454 Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/11212acb25980cd1b3030875cd9502414fbb214d.1542841400.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 84 + 1 file changed, 84 insertions(+) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ca38bd0472f2..f5efbdba2b6d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -27,6 +27,7 @@ #include /* struct vm86 */ #include/* vma_pkey() */ #include/* efi_recover_from_page_fault()*/ +#include /* store_idt(), ... */ #define CREATE_TRACE_POINTS #include @@ -571,10 +572,53 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } +static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) +{ + u32 offset = (index >> 3) * sizeof(struct desc_struct); + unsigned long addr; + struct ldttss_desc desc; + + if (index == 0) { + pr_alert("%s: NULL\n", name); + return; + } + + if (offset + sizeof(struct ldttss_desc) >= gdt->size) { + pr_alert("%s: 0x%hx -- out of bounds\n", name, index); + return; + } + + if (probe_kernel_read(, (void *)(gdt->address + offset), + sizeof(struct ldttss_desc))) { + pr_alert("%s: 0x%hx -- GDT entry is not readable\n", +name, index); + return; + } + + addr = desc.base0 | (desc.base1 << 16) | (desc.base2 << 24); +#ifdef CONFIG_X86_64 + addr |= ((u64)desc.base3 << 32); +#endif + pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n", +name, index, addr, (desc.limit0 | (desc.limit1 << 16))); +} + +static void errstr(unsigned long ec, char *buf, unsigned long mask, + const char *txt) +{ + if (ec & mask) { + if (buf[0]) + strcat(buf, " "); + strcat(buf, txt); + } +} + static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { + char errtxt[64]; + if (!oops_may_print()) return; @@ -602,6 +646,46 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, address < PAGE_SIZE ? 
"NULL pointer dereference" : "paging request", (void *)address); + errtxt[0] = 0; + errstr(error_code, errtxt, X86_PF_PROT, "PROT"); + errstr(error_code, errtxt, X86_PF_WRITE, "WRITE"); + errstr(error_code, errtxt, X86_PF_USER, "USER"); + errstr(error_code, errtxt, X86_PF_RSVD, "RSVD"); + errstr(error_code, errtxt, X86_PF_INSTR, "INSTR"); + errstr(error_code, errtxt, X86_PF_PK, "PK"); + pr_alert("HW error: %s\n", error_code ? errtxt : +"normal kernel read fault"); + if (!(error_code & X86_PF_USER) && user_mode(regs)) { + struct desc_ptr idt, gdt; + u16 ldtr, tr; + + pr_alert("This was a system access from user code\n"); + + /* +* This can happen for quite a few reasons. The more obvious +* ones are faults accessing the GDT, or LDT. Perhaps +* surprisingly, if the CPU tries to deliver a benign or +* contributory exception from user code and gets a page fault +
[tip:x86/mm] x86/fault: Don't try to recover from an implicit supervisor access
Commit-ID: ebb53e2597e2dc7637ab213df006e99681b6ee25 Gitweb: https://git.kernel.org/tip/ebb53e2597e2dc7637ab213df006e99681b6ee25 Author: Andy Lutomirski AuthorDate: Wed, 21 Nov 2018 15:11:23 -0800 Committer: Ingo Molnar CommitDate: Thu, 22 Nov 2018 09:23:00 +0100 x86/fault: Don't try to recover from an implicit supervisor access This avoids a situation in which we attempt to apply various fixups that are not intended to handle implicit supervisor accesses from user mode if we screw up in a way that causes this type of fault. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/f151d72ff352265f3274c5ab3a4105090f49.1542841400.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 82881bc5feef..ca38bd0472f2 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -653,6 +653,15 @@ no_context(struct pt_regs *regs, unsigned long error_code, unsigned long flags; int sig; + if (user_mode(regs)) { + /* +* This is an implicit supervisor-mode access from user +* mode. Bypass all the kernel-mode recovery code and just +* OOPS. +*/ + goto oops; + } + /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { /* @@ -738,6 +747,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (IS_ENABLED(CONFIG_EFI)) efi_recover_from_page_fault(address); +oops: /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice:
[tip:x86/mm] x86/fault: Remove sw_error_code
Commit-ID: 0ed32f1aa66ee758e6c8164f549f7ff9d399a20e Gitweb: https://git.kernel.org/tip/0ed32f1aa66ee758e6c8164f549f7ff9d399a20e Author: Andy Lutomirski AuthorDate: Wed, 21 Nov 2018 15:11:22 -0800 Committer: Ingo Molnar CommitDate: Thu, 22 Nov 2018 09:22:59 +0100 x86/fault: Remove sw_error_code All of the fault handling code now corrently checks user_mode(regs) as needed, and nothing depends on the X86_PF_USER bit being munged. Get rid of the sw_error code and use hw_error_code everywhere. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/078f5b8ae6e8c79ff8ee7345b5c476c45003e5ac.1542841400.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 50 +++--- 1 file changed, 11 insertions(+), 39 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b898a38093a3..82881bc5feef 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1217,7 +1217,6 @@ void do_user_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { - unsigned long sw_error_code; struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; @@ -1262,13 +1261,6 @@ void do_user_addr_fault(struct pt_regs *regs, return; } - /* -* hw_error_code is literally the "page fault error code" passed to -* the kernel directly from the hardware. But, we will shortly be -* modifying it in software, so give it a new name. -*/ - sw_error_code = hw_error_code; - /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. @@ -1278,26 +1270,6 @@ void do_user_addr_fault(struct pt_regs *regs, */ if (user_mode(regs)) { local_irq_enable(); - /* -* Up to this point, X86_PF_USER set in hw_error_code -* indicated a user-mode access. But, after this, -* X86_PF_USER in sw_error_code will indicate either -* that, *or* an implicit kernel(supervisor)-mode access -* which originated from user mode. -*/ - if (!(hw_error_code & X86_PF_USER)) { - /* -* The CPU was in user mode, but the CPU says -* the fault was not a user-mode access. -* Must be an implicit kernel-mode access, -* which we do not expect to happen in the -* user address space. -*/ - pr_warn_once("kernel-mode error from user-mode: %lx\n", - hw_error_code); - - sw_error_code |= X86_PF_USER; - } flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) @@ -1306,9 +1278,9 @@ void do_user_addr_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (sw_error_code & X86_PF_WRITE) + if (hw_error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (sw_error_code & X86_PF_INSTR) + if (hw_error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; #ifdef CONFIG_X86_64 @@ -1321,7 +1293,7 @@ void do_user_addr_fault(struct pt_regs *regs, * The vsyscall page does not have a "real" VMA, so do this * emulation before we go searching for VMAs. */ - if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { + if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { if (emulate_vsyscall(regs, address)) return; } @@ -1345,7 +1317,7 @@ void do_user_addr_fault(struct pt_regs *regs, * Fault from code in kernel from * which we do not expect faults. 
*/ - bad_area_nosemaphore(regs, sw_error_code, address); + bad_area_nosemaphore(regs, hw_error_code, address); return; } retry: @@ -1361,17 +1333,17 @@ retry: vma = find_vma(mm, address); if (unlikely(!vma)) { - bad_area(regs, sw_error_code, address); + bad_area(regs, hw_error_code, address); return; } if (likely(vma->vm_start <= address)) goto good_area; if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, sw_error_code, address); + bad_area(regs, hw_error_code, address); return;
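Since this series keeps referring to the raw hw_error_code bits, here is a small stand-alone decoder of the x86 page-fault error code layout (the bit values follow the architecture's definition; the helper itself is just an illustration, not kernel code):

#include <stdio.h>

#define X86_PF_PROT   0x01  /* 0: not-present page, 1: protection violation */
#define X86_PF_WRITE  0x02  /* write access */
#define X86_PF_USER   0x04  /* CPU was at user privilege (CPL 3) */
#define X86_PF_RSVD   0x08  /* reserved bit set in a paging entry */
#define X86_PF_INSTR  0x10  /* instruction fetch */
#define X86_PF_PK     0x20  /* protection-keys violation */

static void decode_pf_error_code(unsigned long code)
{
    printf("error_code %#lx:%s%s%s%s%s%s\n", code,
           (code & X86_PF_PROT)  ? " PROT"  : " !present",
           (code & X86_PF_WRITE) ? " WRITE" : " read",
           (code & X86_PF_USER)  ? " USER"  : " supervisor",
           (code & X86_PF_RSVD)  ? " RSVD"  : "",
           (code & X86_PF_INSTR) ? " INSTR" : "",
           (code & X86_PF_PK)    ? " PK"    : "");
}

int main(void)
{
    decode_pf_error_code(0x06); /* user-mode write to a not-present page */
    decode_pf_error_code(0x11); /* supervisor instruction fetch, protection fault */
    return 0;
}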
[tip:x86/mm] x86/fault: Don't set thread.cr2, etc before OOPSing
Commit-ID: 1ad33f5aec20f53785dbad44c6fb3b204aefd921 Gitweb: https://git.kernel.org/tip/1ad33f5aec20f53785dbad44c6fb3b204aefd921 Author: Andy Lutomirski AuthorDate: Mon, 19 Nov 2018 14:45:32 -0800 Committer: Ingo Molnar CommitDate: Tue, 20 Nov 2018 08:44:30 +0100 x86/fault: Don't set thread.cr2, etc before OOPSing The fault handling code sets the cr2, trap_nr, and error_code fields in thread_struct before OOPSing. No one reads those fields during an OOPS, so remove the code to set them. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/d418022aa0fad9cb40467aa7acaf4e95be50ee96.1542667307.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 8 1 file changed, 8 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b5ec1ca2f4a0..b898a38093a3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -621,10 +621,6 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, tsk->comm, address); dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code; - if (__die("Bad pagetable", regs, error_code)) sig = 0; @@ -753,10 +749,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (task_stack_end_corrupted(tsk)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); - tsk->thread.cr2 = address; - tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code; - sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0;
[tip:x86/mm] x86/fault: Make error_code sanitization more robust
Commit-ID: e49d3cbef0176c182b86206185f137a87f16ab91 Gitweb: https://git.kernel.org/tip/e49d3cbef0176c182b86206185f137a87f16ab91 Author: Andy Lutomirski AuthorDate: Mon, 19 Nov 2018 14:45:31 -0800 Committer: Ingo Molnar CommitDate: Tue, 20 Nov 2018 08:44:29 +0100 x86/fault: Make error_code sanitization more robust The error code in a page fault on a kernel address indicates whether that address is mapped, which should not be revealed in a signal. The normal code path for a page fault on a kernel address sanitizes the bit, but the paths for vsyscall emulation and SIGBUS do not. Both are harmless, but for subtle reasons. SIGBUS is never sent for a kernel address, and vsyscall emulation will never fault on a kernel address per se because it will fail an access_ok() check instead. Make the code more robust by adding a helper that sets the relevant fields and sanitizing the error code in the helper. This also cleans up the code -- we had three copies of roughly the same thing. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/b31159bd55bd0c4fa061a20dfd6c429c094bebaa.1542667307.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 30 +- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3c9aed03d18e..b5ec1ca2f4a0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -631,6 +631,24 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, oops_end(flags, regs, sig); } +static void set_signal_archinfo(unsigned long address, + unsigned long error_code) +{ + struct task_struct *tsk = current; + + /* +* To avoid leaking information about the kernel page +* table layout, pretend that user-mode accesses to +* kernel addresses are always protection faults. +*/ + if (address >= TASK_SIZE_MAX) + error_code |= X86_PF_PROT; + + tsk->thread.trap_nr = X86_TRAP_PF; + tsk->thread.error_code = error_code | X86_PF_USER; + tsk->thread.cr2 = address; +} + static noinline void no_context(struct pt_regs *regs, unsigned long error_code, unsigned long address, int signal, int si_code) @@ -656,9 +674,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, * faulting through the emulate_vsyscall() logic. */ if (current->thread.sig_on_uaccess_err && signal) { - tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code | X86_PF_USER; - tsk->thread.cr2 = address; + set_signal_archinfo(address, error_code); /* XXX: hwpoison faults will set the wrong code. */ force_sig_fault(signal, si_code, (void __user *)address, @@ -821,9 +837,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_PF; + set_signal_archinfo(address, error_code); if (si_code == SEGV_PKUERR) force_sig_pkuerr((void __user *)address, pkey); @@ -937,9 +951,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, if (is_prefetch(regs, error_code, address)) return; - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_PF; + set_signal_archinfo(address, error_code); #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
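The sanitization rule the new set_signal_archinfo() helper centralizes fits in a few lines. The sketch below models it in user space; FAKE_TASK_SIZE_MAX is a placeholder value, not the kernel's constant:

#include <stdio.h>

#define X86_PF_PROT 0x01
#define X86_PF_USER 0x04
#define FAKE_TASK_SIZE_MAX 0x7ffffffff000UL  /* illustrative only */

static unsigned long sanitize_error_code(unsigned long address,
                                         unsigned long error_code)
{
    /* Hide whether a kernel address was mapped: report it as a
     * protection fault either way. */
    if (address >= FAKE_TASK_SIZE_MAX)
        error_code |= X86_PF_PROT;

    /* The signal always describes a user-visible access. */
    return error_code | X86_PF_USER;
}

int main(void)
{
    printf("%#lx\n", sanitize_error_code(0xffffffff81000000UL, 0x0)); /* -> PROT|USER */
    printf("%#lx\n", sanitize_error_code(0x00007f0000001000UL, 0x2)); /* -> WRITE|USER */
    return 0;
}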
[tip:x86/mm] x86/fault: Improve the condition for signalling vs OOPSing
Commit-ID: 6ea59b074f15e7ef4b042a108950861b383e7b02 Gitweb: https://git.kernel.org/tip/6ea59b074f15e7ef4b042a108950861b383e7b02 Author: Andy Lutomirski AuthorDate: Mon, 19 Nov 2018 14:45:30 -0800 Committer: Ingo Molnar CommitDate: Tue, 20 Nov 2018 08:44:29 +0100 x86/fault: Improve the condition for signalling vs OOPSing __bad_area_nosemaphore() currently checks the X86_PF_USER bit in the error code to decide whether to send a signal or to treat the fault as a kernel error. This can cause somewhat erratic behavior. The straightforward cases where the CPL agrees with the hardware USER bit are all correct, but the other cases are confusing. - A user instruction accessing a kernel address with supervisor privilege (e.g. a descriptor table access failed). The USER bit will be clear, and we OOPS. This is correct, because it indicates a kernel bug, not a user error. - A user instruction accessing a user address with supervisor privilege (e.g. a descriptor table was incorrectly pointing at user memory). __bad_area_nosemaphore() will be passed a modified error code with the user bit set, and we will send a signal. Sending the signal will work (because the regs and the entry frame genuinely come from user mode), but we really ought to OOPS, as this event indicates a severe kernel bug. - A kernel instruction with user privilege (i.e. WRUSS). This should OOPS or get fixed up. The current code would instead try send a signal and malfunction. Change the logic: a signal should be sent if the faulting context is user mode *and* the access has user privilege. Otherwise it's either a kernel mode fault or a failed implicit access, either of which should end up in no_context(). Note to -stable maintainers: don't backport this unless you backport CET. The bug it fixes is unobservable in current kernels unless something is extremely wrong. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/10e509c43893170e262e82027ea399130ae81159.1542667307.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7a69b66cf071..3c9aed03d18e 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -794,7 +794,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ - if (error_code & X86_PF_USER) { + if (user_mode(regs) && (error_code & X86_PF_USER)) { /* * It's possible to have interrupts off here: */
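The new rule is compact enough to state as a predicate. The truth table below (a user-space model, not kernel code) covers the combinations the changelog walks through:

#include <stdbool.h>
#include <stdio.h>

#define X86_PF_USER 0x04

static bool should_signal(bool regs_user_mode, unsigned long error_code)
{
    /* Signal only when the faulting context is user mode AND the access
     * itself had user privilege; everything else goes to no_context(). */
    return regs_user_mode && (error_code & X86_PF_USER);
}

int main(void)
{
    printf("user CPL, USER access       -> %s\n",
           should_signal(true,  X86_PF_USER) ? "signal" : "oops/no_context");
    printf("user CPL, supervisor access -> %s\n",
           should_signal(true,  0)           ? "signal" : "oops/no_context");
    printf("kernel CPL, USER access     -> %s\n",
           should_signal(false, X86_PF_USER) ? "signal" : "oops/no_context");
    printf("kernel CPL, supervisor      -> %s\n",
           should_signal(false, 0)           ? "signal" : "oops/no_context");
    return 0;
}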
[tip:x86/mm] x86/fault: Fix SMAP #PF handling buglet for implicit supervisor accesses
Commit-ID: e50928d7213e72ee95507221a89ed07d2bb6517b Gitweb: https://git.kernel.org/tip/e50928d7213e72ee95507221a89ed07d2bb6517b Author: Andy Lutomirski AuthorDate: Mon, 19 Nov 2018 14:45:29 -0800 Committer: Ingo Molnar CommitDate: Tue, 20 Nov 2018 08:44:29 +0100 x86/fault: Fix SMAP #PF handling buglet for implicit supervisor accesses Currently, if a user program somehow triggers an implicit supervisor access to a user address (e.g. if the kernel somehow sets LDTR to a user address), it will be incorrectly detected as a SMAP violation if AC is clear and SMAP is enabled. This is incorrect -- the error has nothing to do with SMAP. Fix the condition so that only accesses with the hardware USER bit set are diagnosed as SMAP violations. With the logic fixed, an implicit supervisor access to a user address will hit the code lower in the function that is intended to handle it even if SMAP is enabled. That logic is still a bit buggy, and later patches will clean it up. I *think* this code is still correct for WRUSS, and I've added a comment to that effect. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/d1d1b2e66ef31f884dba172084486ea9423ddcdb.1542667307.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9d092ab74f18..7a69b66cf071 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1235,12 +1235,15 @@ void do_user_addr_fault(struct pt_regs *regs, pgtable_bad(regs, hw_error_code, address); /* -* If SMAP is on, check for invalid kernel (supervisor) -* access to user pages in the user address space. +* If SMAP is on, check for invalid kernel (supervisor) access to user +* pages in the user address space. The odd case here is WRUSS, +* which, according to the preliminary documentation, does not respect +* SMAP and will have the USER bit set so, in all cases, SMAP +* enforcement appears to be consistent with the USER bit. */ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && !(hw_error_code & X86_PF_USER) && -(user_mode(regs) || !(regs->flags & X86_EFLAGS_AC +!(regs->flags & X86_EFLAGS_AC))) { bad_area_nosemaphore(regs, hw_error_code, address); return;
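After this fix the SMAP test depends only on the hardware USER bit and EFLAGS.AC, not on the faulting CPL. A stand-alone sketch of the resulting predicate (user-space model; the constants mirror the x86 definitions):

#include <stdbool.h>
#include <stdio.h>

#define X86_PF_USER    0x04
#define X86_EFLAGS_AC  0x40000  /* EFLAGS bit 18 */

static bool smap_violation_check(bool smap_enabled,
                                 unsigned long hw_error_code,
                                 unsigned long eflags)
{
    /* Supervisor-privilege access (hardware USER bit clear) with
     * EFLAGS.AC clear is flagged; the faulting CPL is not consulted. */
    return smap_enabled &&
           !(hw_error_code & X86_PF_USER) &&
           !(eflags & X86_EFLAGS_AC);
}

int main(void)
{
    /* Supervisor access to a user page with AC clear: flagged. */
    printf("%d\n", smap_violation_check(true, 0x0, 0x2));
    /* Same access inside a uaccess region (AC set): not a SMAP fault. */
    printf("%d\n", smap_violation_check(true, 0x0, 0x2 | X86_EFLAGS_AC));
    return 0;
}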
[tip:x86/mm] x86/cpufeatures, x86/fault: Mark SMAP as disabled when configured out
Commit-ID: dae0a10593007d049ea71601357ac41d4f247ee9 Gitweb: https://git.kernel.org/tip/dae0a10593007d049ea71601357ac41d4f247ee9 Author: Andy Lutomirski AuthorDate: Mon, 19 Nov 2018 14:45:27 -0800 Committer: Ingo Molnar CommitDate: Tue, 20 Nov 2018 08:44:28 +0100 x86/cpufeatures, x86/fault: Mark SMAP as disabled when configured out Add X86_FEATURE_SMAP to the disabled features mask as appropriate and use cpu_feature_enabled() in the fault code. This lets us get rid of a redundant IS_ENABLED(CONFIG_X86_SMAP). Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/fe93332eded3d702f0b0b4cf83928d6830739ba3.1542667307.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/disabled-features.h | 8 +++- arch/x86/mm/fault.c | 5 + 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 33833d1909af..a5ea841cc6d2 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -16,6 +16,12 @@ # define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31)) #endif +#ifdef CONFIG_X86_SMAP +# define DISABLE_SMAP 0 +#else +# define DISABLE_SMAP (1<<(X86_FEATURE_SMAP & 31)) +#endif + #ifdef CONFIG_X86_INTEL_UMIP # define DISABLE_UMIP 0 #else @@ -68,7 +74,7 @@ #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) #define DISABLED_MASK8 0 -#define DISABLED_MASK9 (DISABLE_MPX) +#define DISABLED_MASK9 (DISABLE_MPX|DISABLE_SMAP) #define DISABLED_MASK100 #define DISABLED_MASK110 #define DISABLED_MASK120 diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8624cb7d8d65..39e39cd42097 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1150,10 +1150,7 @@ static int fault_in_kernel_space(unsigned long address) static inline bool smap_violation(int error_code, struct pt_regs *regs) { - if (!IS_ENABLED(CONFIG_X86_SMAP)) - return false; - - if (!static_cpu_has(X86_FEATURE_SMAP)) + if (!cpu_feature_enabled(X86_FEATURE_SMAP)) return false; if (error_code & X86_PF_USER)
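The DISABLE_SMAP hunk follows the usual disabled-features pattern: a Kconfig-dependent constant mask lets the feature check collapse to a compile-time false when the feature is configured out. A simplified stand-alone sketch (the bit number and helper names are made up for illustration):

#include <stdbool.h>
#include <stdio.h>

#define FEATURE_SMAP_BIT 20            /* illustrative bit number */

#ifdef CONFIG_X86_SMAP
# define DISABLE_SMAP 0
#else
# define DISABLE_SMAP (1u << FEATURE_SMAP_BIT)
#endif

#define DISABLED_MASK (DISABLE_SMAP /* | other configured-out features */)

static bool cpu_has(unsigned int bit, unsigned int runtime_mask)
{
    if (DISABLED_MASK & (1u << bit))    /* constant-folded when disabled */
        return false;
    return runtime_mask & (1u << bit);  /* otherwise consult CPUID-derived state */
}

int main(void)
{
    unsigned int cpuid_features = 1u << FEATURE_SMAP_BIT;  /* pretend the CPU has SMAP */

    printf("SMAP usable: %d\n", cpu_has(FEATURE_SMAP_BIT, cpuid_features));
    return 0;
}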
[tip:x86/mm] x86/fault: Fold smap_violation() into do_user_addr_fault()
Commit-ID: a15781b536293edc32bf374233f3b8ad77c3f72b Gitweb: https://git.kernel.org/tip/a15781b536293edc32bf374233f3b8ad77c3f72b Author: Andy Lutomirski AuthorDate: Mon, 19 Nov 2018 14:45:28 -0800 Committer: Ingo Molnar CommitDate: Tue, 20 Nov 2018 08:44:28 +0100 x86/fault: Fold smap_violation() into do_user_addr_fault() smap_violation() has a single caller, and the contents are a bit nonsensical. I'm going to fix it, but first let's fold it into its caller for ease of comprehension. In this particular case, the user_mode(regs) check is incorrect -- it will cause false positives in the case of a user-initiated kernel-privileged access. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/806c366f6ca861152398ce2c01744d59d9aceb6d.1542667307.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 23 ++- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 39e39cd42097..9d092ab74f18 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1148,20 +1148,6 @@ static int fault_in_kernel_space(unsigned long address) return address >= TASK_SIZE_MAX; } -static inline bool smap_violation(int error_code, struct pt_regs *regs) -{ - if (!cpu_feature_enabled(X86_FEATURE_SMAP)) - return false; - - if (error_code & X86_PF_USER) - return false; - - if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) - return false; - - return true; -} - /* * Called for all faults where 'address' is part of the kernel address * space. Might get called for faults that originate from *code* that @@ -1249,10 +1235,13 @@ void do_user_addr_fault(struct pt_regs *regs, pgtable_bad(regs, hw_error_code, address); /* -* Check for invalid kernel (supervisor) access to user -* pages in the user address space. +* If SMAP is on, check for invalid kernel (supervisor) +* access to user pages in the user address space. */ - if (unlikely(smap_violation(hw_error_code, regs))) { + if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && +!(hw_error_code & X86_PF_USER) && +(user_mode(regs) || !(regs->flags & X86_EFLAGS_AC + { bad_area_nosemaphore(regs, hw_error_code, address); return; }
[tip:x86/mm] x86/fault: Check user_mode(regs) when avoiding an mmap_sem deadlock
Commit-ID: 6344be608c039f3a787f1144c46fcb04c0f76561 Gitweb: https://git.kernel.org/tip/6344be608c039f3a787f1144c46fcb04c0f76561 Author: Andy Lutomirski AuthorDate: Mon, 19 Nov 2018 14:45:25 -0800 Committer: Ingo Molnar CommitDate: Tue, 20 Nov 2018 08:44:27 +0100 x86/fault: Check user_mode(regs) when avoiding an mmap_sem deadlock The fault-handling code that takes mmap_sem needs to avoid a deadlock that could occur if the kernel took a bad (OOPS-worthy) page fault on a user address while holding mmap_sem. This can only happen if the faulting instruction was in the kernel (i.e. user_mode(regs)). Rather than checking the sw_error_code (which will have the USER bit set if the fault was a USER-permission access *or* if user_mode(regs)), just check user_mode(regs) directly. The old code would have malfunctioned if the kernel executed a bogus WRUSS instruction while holding mmap_sem. Fortunately, that is extremely unlikely in current kernels, which don't use WRUSS. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/4b89b542e8ceba9bd6abde2f386afed6d99244a9.1542667307.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 29525cf21100..8624cb7d8d65 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1344,13 +1344,10 @@ void do_user_addr_fault(struct pt_regs *regs, * Only do the expensive exception table search when we might be at * risk of a deadlock. This happens if we * 1. Failed to acquire mmap_sem, and -* 2. The access did not originate in userspace. Note: either the -*hardware or earlier page fault code may set X86_PF_USER -*in sw_error_code. +* 2. The access did not originate in userspace. */ if (unlikely(!down_read_trylock(>mmap_sem))) { - if (!(sw_error_code & X86_PF_USER) && - !search_exception_tables(regs->ip)) { + if (!user_mode(regs) && !search_exception_tables(regs->ip)) { /* * Fault from code in kernel from * which we do not expect faults.
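The pattern being tightened here is the familiar "trylock, and only do the expensive check when contended" dance. A user-space sketch with pthread rwlocks; user_mode_stub() and search_exception_tables_stub() are stand-ins for the kernel helpers:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

static bool user_mode_stub(void) { return true; }                /* stand-in */
static bool search_exception_tables_stub(void) { return false; } /* stand-in */

static int handle_fault(void)
{
    if (pthread_rwlock_tryrdlock(&mmap_sem) != 0) {
        /* Contended: a kernel-mode fault with no fixup would deadlock
         * against the writer we are about to wait on, so bail out first. */
        if (!user_mode_stub() && !search_exception_tables_stub())
            return -1;  /* would be bad_area_nosemaphore() */

        pthread_rwlock_rdlock(&mmap_sem);  /* user fault: safe to block */
    }

    /* ... find_vma() and friends would run here ... */
    pthread_rwlock_unlock(&mmap_sem);
    return 0;
}

int main(void)
{
    printf("%d\n", handle_fault());
    return 0;
}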
[tip:x86/urgent] x86/entry/64: Further improve paranoid_entry comments
Commit-ID: ae852495be365f6be433dde6629d3f0316f8efde Gitweb: https://git.kernel.org/tip/ae852495be365f6be433dde6629d3f0316f8efde Author: Andy Lutomirski AuthorDate: Sun, 14 Oct 2018 11:38:18 -0700 Committer: Ingo Molnar CommitDate: Wed, 17 Oct 2018 12:30:27 +0200 x86/entry/64: Further improve paranoid_entry comments Commit: 16561f27f94e ("x86/entry: Add some paranoid entry/exit CR3 handling comments") ... added some comments. This improves them a bit: - When I first read the new comments, it was unclear to me whether they were referring to the case where paranoid_entry interrupted other entry code or where paranoid_entry was itself interrupted. Clarify it. - Remove the EBX comment. We no longer use EBX as a SWAPGS indicator. Signed-off-by: Andy Lutomirski Acked-by: Thomas Gleixner Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/c47daa1888dc2298e7e1d3f82bd76b776ea33393.1539542111.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 10 -- 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1d9b4a300c8c..f95dcb209fdf 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1189,15 +1189,13 @@ ENTRY(paranoid_entry) 1: /* * Always stash CR3 in %r14. This value will be restored, -* verbatim, at exit. Needed if kernel is interrupted -* after switching to the user CR3 value but before -* returning to userspace. +* verbatim, at exit. Needed if paranoid_entry interrupted +* another entry that already switched to the user CR3 value +* but has not yet returned to userspace. * * This is also why CS (stashed in the "iret frame" by the * hardware at entry) can not be used: this may be a return -* to kernel code, but with a user CR3 value. The %ebx flag -* for SWAPGS is also unusable for CR3 because there is a -* window with a user GS and a kernel CR3. +* to kernel code, but with a user CR3 value. */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
[tip:x86/urgent] x86/entry/64: Further improve paranoid_entry comments
Commit-ID: 0cd6fa95a4a44ff2d649957423946c19eb98e825 Gitweb: https://git.kernel.org/tip/0cd6fa95a4a44ff2d649957423946c19eb98e825 Author: Andy Lutomirski AuthorDate: Sun, 14 Oct 2018 11:38:18 -0700 Committer: Ingo Molnar CommitDate: Tue, 16 Oct 2018 08:19:34 +0200 x86/entry/64: Further improve paranoid_entry comments Commit: 16561f27f94e ("x86/entry: Add some paranoid entry/exit CR3 handling comments") ... added some comments. This improves them a bit: - When I first read the new comments, it was unclear to me whether they were referring to the case where paranoid_entry interrupted other entry code or where paranoid_entry was itself interrupted. Clarify it. - Remove the EBX comment. We no longer use EBX as a SWAPGS indicator. Signed-off-by: Andy Lutomirski Acked-by: Thomas Gleixner Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/c47daa1888dc2298e7e1d3f82bd76b776ea33393.1539542111.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 10 -- 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1d9b4a300c8c..f95dcb209fdf 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1189,15 +1189,13 @@ ENTRY(paranoid_entry) 1: /* * Always stash CR3 in %r14. This value will be restored, -* verbatim, at exit. Needed if kernel is interrupted -* after switching to the user CR3 value but before -* returning to userspace. +* verbatim, at exit. Needed if paranoid_entry interrupted +* another entry that already switched to the user CR3 value +* but has not yet returned to userspace. * * This is also why CS (stashed in the "iret frame" by the * hardware at entry) can not be used: this may be a return -* to kernel code, but with a user CR3 value. The %ebx flag -* for SWAPGS is also unusable for CR3 because there is a -* window with a user GS and a kernel CR3. +* to kernel code, but with a user CR3 value. */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
[tip:x86/asm] x86/fsgsbase/64: Fix ptrace() to read the FS/GS base accurately
Commit-ID: 07e1d88adaaeab247b300926f78cc3f950dbeda3 Gitweb: https://git.kernel.org/tip/07e1d88adaaeab247b300926f78cc3f950dbeda3 Author: Andy Lutomirski AuthorDate: Tue, 18 Sep 2018 16:08:52 -0700 Committer: Ingo Molnar CommitDate: Mon, 8 Oct 2018 10:41:08 +0200 x86/fsgsbase/64: Fix ptrace() to read the FS/GS base accurately On 64-bit kernels ptrace can read the FS/GS base using the register access APIs (PTRACE_PEEKUSER, etc.) or PTRACE_ARCH_PRCTL. Make both of these mechanisms return the actual FS/GS base. This will improve debuggability by providing the correct information to ptracer such as GDB. [ chang: Rebased and revised patch description. ] [ mingo: Revised the changelog some more. ] Signed-off-by: Andy Lutomirski Signed-off-by: Chang S. Bae Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Markus T Metzger Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537312139-5580-2-git-send-email-chang.seok@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 62 1 file changed, 52 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e2ee403865eb..3acbf45cb7fb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "tls.h" @@ -342,6 +343,49 @@ static int set_segment_reg(struct task_struct *task, return 0; } +static unsigned long task_seg_base(struct task_struct *task, + unsigned short selector) +{ + unsigned short idx = selector >> 3; + unsigned long base; + + if (likely((selector & SEGMENT_TI_MASK) == 0)) { + if (unlikely(idx >= GDT_ENTRIES)) + return 0; + + /* +* There are no user segments in the GDT with nonzero bases +* other than the TLS segments. +*/ + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return 0; + + idx -= GDT_ENTRY_TLS_MIN; + base = get_desc_base(>thread.tls_array[idx]); + } else { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + struct ldt_struct *ldt; + + /* +* If performance here mattered, we could protect the LDT +* with RCU. This is a slow path, though, so we can just +* take the mutex. +*/ + mutex_lock(>mm->context.lock); + ldt = task->mm->context.ldt; + if (unlikely(idx >= ldt->nr_entries)) + base = 0; + else + base = get_desc_base(ldt->entries + idx); + mutex_unlock(>mm->context.lock); +#else + base = 0; +#endif + } + + return base; +} + #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) @@ -435,18 +479,16 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset) #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct, fs_base): { - /* -* XXX: This will not behave as expected if called on -* current or if fsindex != 0. -*/ - return task->thread.fsbase; + if (task->thread.fsindex == 0) + return task->thread.fsbase; + else + return task_seg_base(task, task->thread.fsindex); } case offsetof(struct user_regs_struct, gs_base): { - /* -* XXX: This will not behave as expected if called on -* current or if fsindex != 0. -*/ - return task->thread.gsbase; + if (task->thread.gsindex == 0) + return task->thread.gsbase; + else + return task_seg_base(task, task->thread.gsindex); } #endif }
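The user-visible effect is easiest to see from a tracer. A minimal x86-64 example reading a stopped tracee's FS base with PTRACE_PEEKUSER (error handling kept to a minimum):

#include <errno.h>
#include <signal.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t child = fork();

    if (child == 0) {
        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
        raise(SIGSTOP);                  /* let the parent inspect us */
        _exit(0);
    }

    waitpid(child, NULL, 0);             /* tracee is now stopped */

    errno = 0;
    long fsbase = ptrace(PTRACE_PEEKUSER, child,
                         (void *)offsetof(struct user_regs_struct, fs_base),
                         NULL);
    if (errno == 0)
        printf("tracee fs_base = %#lx\n", (unsigned long)fsbase);

    ptrace(PTRACE_DETACH, child, NULL, NULL);
    waitpid(child, NULL, 0);
    return 0;
}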
[tip:x86/vdso] x86/vdso: Rearrange do_hres() to improve code generation
Commit-ID: 99c19e6a8fe4a95fa0dac191207a1d40461b1604 Gitweb: https://git.kernel.org/tip/99c19e6a8fe4a95fa0dac191207a1d40461b1604 Author: Andy Lutomirski AuthorDate: Fri, 5 Oct 2018 11:02:43 -0700 Committer: Thomas Gleixner CommitDate: Fri, 5 Oct 2018 21:03:23 +0200 x86/vdso: Rearrange do_hres() to improve code generation vgetcyc() is full of barriers, so fetching values out of the vvar page before vgetcyc() for use after vgetcyc() results in poor code generation. Put vgetcyc() first to avoid this problem. Also, pull the tv_sec division into the loop and put all the ts writes together. The old code wrote ts->tv_sec on each iteration before the syscall fallback check and then added in the offset afterwards, which forced the compiler to pointlessly copy base->sec to ts->tv_sec on each iteration. The new version seems to generate sensible code. Saves several cycles. With this patch applied, the result is faster than before the clock_gettime() rewrite. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/3c05644d010b72216aa286a6d20b5078d5fae5cd.1538762487.git.l...@kernel.org --- arch/x86/entry/vdso/vclock_gettime.c | 12 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 18c8a78d1ec9..007b3fe9d727 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -142,23 +142,27 @@ notrace static inline u64 vgetcyc(int mode) notrace static int do_hres(clockid_t clk, struct timespec *ts) { struct vgtod_ts *base = >basetime[clk]; - u64 cycles, last, ns; + u64 cycles, last, sec, ns; unsigned int seq; do { seq = gtod_read_begin(gtod); - ts->tv_sec = base->sec; + cycles = vgetcyc(gtod->vclock_mode); ns = base->nsec; last = gtod->cycle_last; - cycles = vgetcyc(gtod->vclock_mode); if (unlikely((s64)cycles < 0)) return vdso_fallback_gettime(clk, ts); if (cycles > last) ns += (cycles - last) * gtod->mult; ns >>= gtod->shift; + sec = base->sec; } while (unlikely(gtod_read_retry(gtod, seq))); - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, ); + /* +* Do this outside the loop: a race inside the loop could result +* in __iter_div_u64_rem() being extremely slow. +*/ + ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, ); ts->tv_nsec = ns; return 0;
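For readers not familiar with the vDSO reader, the shape after this patch is: a seqcount-style retry loop that only fills locals, then one division and one set of stores to the caller's timespec. A stand-alone user-space model (all names are stand-ins; the real code lives in vclock_gettime.c):

#include <stdatomic.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

static struct {
    atomic_uint seq;
    unsigned long long base_sec;
    unsigned long long base_nsec;    /* like base->nsec: may exceed 1e9 */
} snap = { 0, 1700000000ULL, 1500000000ULL };

static void read_time(long long *tv_sec, long long *tv_nsec)
{
    unsigned long long sec, ns;
    unsigned int seq;

    do {
        seq = atomic_load(&snap.seq);
        ns  = snap.base_nsec;        /* the real code adds (cycles - last) * mult */
        sec = snap.base_sec;
    } while ((seq & 1) || seq != atomic_load(&snap.seq));

    /* One division and one set of stores, outside the retry loop. */
    *tv_sec  = (long long)(sec + ns / NSEC_PER_SEC);
    *tv_nsec = (long long)(ns % NSEC_PER_SEC);
}

int main(void)
{
    long long s, n;

    read_time(&s, &n);
    printf("%lld.%09lld\n", s, n);   /* 1700000001.500000000 */
    return 0;
}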
[tip:x86/vdso] x86/vdso: Document vgtod_ts better
Commit-ID: bcc4a62a73cb65327d7268fbfa3a786d603f52dc Gitweb: https://git.kernel.org/tip/bcc4a62a73cb65327d7268fbfa3a786d603f52dc Author: Andy Lutomirski AuthorDate: Thu, 4 Oct 2018 14:44:45 -0700 Committer: Ingo Molnar CommitDate: Fri, 5 Oct 2018 10:12:18 +0200 x86/vdso: Document vgtod_ts better After reading do_hres() and do_coarse() and scratching my head a bit, I figured out why the arithmetic is strange. Document it. Signed-off-by: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f66f53d81150bbad47d7b282c9207a71a3ce1c16.1538689401.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/vgtod.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index d17b092b9f1b..69d05c6d47f5 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -13,6 +13,15 @@ typedef u64 gtod_long_t; typedef unsigned long gtod_long_t; #endif +/* + * There is one of these objects in the vvar page for each + * vDSO-accelerated clockid. For high-resolution clocks, this encodes + * the time corresponding to vsyscall_gtod_data.cycle_last. For coarse + * clocks, this encodes the actual time. + * + * To confuse the reader, for high-resolution clocks, nsec is left-shifted + * by vsyscall_gtod_data.shift. + */ struct vgtod_ts { u64 sec; u64 nsec;
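As a rough illustration of the arithmetic the new comment documents (field and helper names below are stand-ins, not the exact vDSO code): coarse clocks store the answer directly, while high-resolution clocks store a pre-shifted nsec that still needs the scaled cycle delta added and the shift removed.

#include <stdint.h>

struct vgtod_ts_sketch {
	uint64_t sec;
	uint64_t nsec;	/* hres clocks: ns << shift; coarse clocks: plain ns */
};

/* Coarse clock: the stored value is already the answer. */
static uint64_t coarse_nsec(const struct vgtod_ts_sketch *b)
{
	return b->nsec;
}

/* High-resolution clock: add the scaled cycle delta, then shift down. */
static uint64_t hres_nsec(const struct vgtod_ts_sketch *b,
			  uint64_t cycles, uint64_t cycle_last,
			  uint32_t mult, uint32_t shift)
{
	uint64_t ns = b->nsec;

	if (cycles > cycle_last)
		ns += (cycles - cycle_last) * mult;
	return ns >> shift;
}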
[tip:x86/vdso] x86/vdso: Remove "memory" clobbers in the vDSO syscall fallbacks
Commit-ID: 89fe0a1f1c694a3b0b3cfa8c0952d603753f36df Gitweb: https://git.kernel.org/tip/89fe0a1f1c694a3b0b3cfa8c0952d603753f36df Author: Andy Lutomirski AuthorDate: Thu, 4 Oct 2018 14:44:43 -0700 Committer: Ingo Molnar CommitDate: Fri, 5 Oct 2018 10:12:18 +0200 x86/vdso: Remove "memory" clobbers in the vDSO syscall fallbacks When a vDSO clock function falls back to the syscall, no special barriers or ordering is needed, and the syscall fallbacks don't clobber any memory that is not explicitly listed in the asm constraints. Remove the "memory" clobber. This causes minor changes to the generated code, but otherwise has no obvious performance impact. I think it's nice to have, though, since it may help the optimizer in the future. Signed-off-by: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3a7438f5fb2422ed881683d2ccffd7f987b2dc44.1538689401.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vclock_gettime.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index b7ccbff26a3f..18c8a78d1ec9 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -45,7 +45,7 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) long ret; asm ("syscall" : "=a" (ret), "=m" (*ts) : "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : -"memory", "rcx", "r11"); +"rcx", "r11"); return ret; } @@ -62,7 +62,7 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) "mov %%edx, %%ebx \n" : "=a" (ret), "=m" (*ts) : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts) - : "memory", "edx"); + : "edx"); return ret; }
[tip:x86/urgent] x86/vdso: Fix vDSO syscall fallback asm constraint regression
Commit-ID: 02e425668f5c9deb42787d10001a3b605993ad15 Gitweb: https://git.kernel.org/tip/02e425668f5c9deb42787d10001a3b605993ad15 Author: Andy Lutomirski AuthorDate: Wed, 3 Oct 2018 16:23:49 -0700 Committer: Ingo Molnar CommitDate: Thu, 4 Oct 2018 08:17:50 +0200 x86/vdso: Fix vDSO syscall fallback asm constraint regression When I added the missing memory outputs, I failed to update the index of the first argument (ebx) on 32-bit builds, which broke the fallbacks. Somehow I must have screwed up my testing or gotten lucky. Add another test to cover gettimeofday() as well. Signed-off-by: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sta...@vger.kernel.org Fixes: 715bd9d12f84 ("x86/vdso: Fix asm constraints on vDSO syscall fallbacks") Link: http://lkml.kernel.org/r/21bd45ab04b6d838278fa5bebfa9163eceffa13c.1538608971.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vclock_gettime.c| 8 ++-- tools/testing/selftests/x86/test_vdso.c | 73 + 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 134e2d2e8add..e48ca3afa091 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -68,11 +68,11 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) asm ( "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" + "mov %[clock], %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" : "=a" (ret), "=m" (*ts) - : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) + : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts) : "memory", "edx"); return ret; } @@ -83,11 +83,11 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) asm ( "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" + "mov %[tv], %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" : "=a" (ret), "=m" (*tv), "=m" (*tz) - : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) + : "0" (__NR_gettimeofday), [tv] "g" (tv), "c" (tz) : "memory", "edx"); return ret; } diff --git a/tools/testing/selftests/x86/test_vdso.c b/tools/testing/selftests/x86/test_vdso.c index 49f7294fb382..35edd61d1663 100644 --- a/tools/testing/selftests/x86/test_vdso.c +++ b/tools/testing/selftests/x86/test_vdso.c @@ -36,6 +36,10 @@ typedef int (*vgettime_t)(clockid_t, struct timespec *); vgettime_t vdso_clock_gettime; +typedef long (*vgtod_t)(struct timeval *tv, struct timezone *tz); + +vgtod_t vdso_gettimeofday; + typedef long (*getcpu_t)(unsigned *, unsigned *, void *); getcpu_t vgetcpu; @@ -104,6 +108,11 @@ static void fill_function_pointers() vdso_clock_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime"); if (!vdso_clock_gettime) printf("Warning: failed to find clock_gettime in vDSO\n"); + + vdso_gettimeofday = (vgtod_t)dlsym(vdso, "__vdso_gettimeofday"); + if (!vdso_gettimeofday) + printf("Warning: failed to find gettimeofday in vDSO\n"); + } static long sys_getcpu(unsigned * cpu, unsigned * node, @@ -117,6 +126,11 @@ static inline int sys_clock_gettime(clockid_t id, struct timespec *ts) return syscall(__NR_clock_gettime, id, ts); } +static inline int sys_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + return syscall(__NR_gettimeofday, tv, tz); +} + static void test_getcpu(void) { printf("[RUN]\tTesting getcpu...\n"); @@ -177,6 +191,14 @@ static bool ts_leq(const struct timespec *a, const struct timespec *b) return a->tv_nsec <= b->tv_nsec; } +static bool tv_leq(const struct timeval *a, const struct timeval *b) +{ + if (a->tv_sec != b->tv_sec) + 
return a->tv_sec < b->tv_sec; + else + return a->tv_usec <= b->tv_usec; +} + static char const * const clocknames[] = { [0] = "CLOCK_REALTIME", [1] = "CLOCK_MONOTONIC", @@ -248,11 +270,62 @@ static void test_clock_gettime(void) test_one_clock_gettime(INT_MAX, "invalid"); } +static void test_gettimeofday(void) +{ + struct timeval start, vdso, end; + struct timezone sys_tz, vdso_tz; + int vdso_ret, end_ret; + + if (!vdso_gettimeofday) + return; + + printf("[RUN]\tTesting gettimeofday...\n"); + + if (sys_gettimeofday(&start, &sys_tz) < 0) { + printf("[FAIL]\tsys_gettimeofday failed (%d)\n", errno); + nerrs++; + return; + } + + vdso_ret = vdso_gettimeofday(&vdso, &vdso_tz); + end_ret = sys_gettimeofday(&end, NULL); + + if (vdso_ret != 0 || end_ret != 0) { +
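The structure of the new test is the usual "sandwich" check: take a syscall timestamp, then the vDSO timestamp, then another syscall timestamp, and require the vDSO value to fall between them. A condensed stand-alone sketch of that check (helper names mirror the test, but this is not the selftest verbatim; the raw syscall is used so the reference timestamps cannot themselves come from the vDSO):

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/syscall.h>

typedef long (*vgtod_t)(struct timeval *, struct timezone *);

static bool tv_leq(const struct timeval *a, const struct timeval *b)
{
	if (a->tv_sec != b->tv_sec)
		return a->tv_sec < b->tv_sec;
	return a->tv_usec <= b->tv_usec;
}

/* vdso_gtod is the pointer obtained with dlsym(), as in the test above. */
static int check_gettimeofday_sandwich(vgtod_t vdso_gtod)
{
	struct timeval start, vdso, end;

	syscall(SYS_gettimeofday, &start, NULL);	/* syscall before */
	vdso_gtod(&vdso, NULL);				/* vDSO under test */
	syscall(SYS_gettimeofday, &end, NULL);		/* syscall after */

	if (!tv_leq(&start, &vdso) || !tv_leq(&vdso, &end)) {
		printf("[FAIL]\tTimes are out of sequence\n");
		return 1;
	}
	return 0;
}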
[tip:x86/urgent] x86/vdso: Only enable vDSO retpolines when enabled and supported
Commit-ID: 4f166564014aba65ad6f15b612f6711fd0f117ee Gitweb: https://git.kernel.org/tip/4f166564014aba65ad6f15b612f6711fd0f117ee Author: Andy Lutomirski AuthorDate: Tue, 2 Oct 2018 21:26:50 -0700 Committer: Ingo Molnar CommitDate: Wed, 3 Oct 2018 08:26:14 +0200 x86/vdso: Only enable vDSO retpolines when enabled and supported When I fixed the vDSO build to use inline retpolines, I messed up the Makefile logic and made it unconditional. It should have depended on CONFIG_RETPOLINE and on the availability of compiler support. This broke the build on some older compilers. Reported-by: nikola.cipr...@linuxbox.cz Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: David Woodhouse Cc: Linus Torvalds Cc: Matt Rickard Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jason.vas.d...@gmail.com Cc: sta...@vger.kernel.org Fixes: 2e549b2ee0e3 ("x86/vdso: Fix vDSO build if a retpoline is emitted") Link: http://lkml.kernel.org/r/08a1f29f2c238dd1f493945e702a521f8a5aa3ae.1538540801.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/Makefile | 16 ++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index fa3f439f0a92..141d415a8c80 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -68,7 +68,13 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ -fno-omit-frame-pointer -foptimize-sibling-calls \ - -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO $(RETPOLINE_VDSO_CFLAGS) + -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO + +ifdef CONFIG_RETPOLINE +ifneq ($(RETPOLINE_VDSO_CFLAGS),) + CFL += $(RETPOLINE_VDSO_CFLAGS) +endif +endif $(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) @@ -138,7 +144,13 @@ KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING -KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) + +ifdef CONFIG_RETPOLINE +ifneq ($(RETPOLINE_VDSO_CFLAGS),) + KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) +endif +endif + $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) $(obj)/vdso32.so.dbg: FORCE \
[tip:x86/urgent] x86/vdso: Fix asm constraints on vDSO syscall fallbacks
Commit-ID: 715bd9d12f84d8f5cc8ad21d888f9bc304a8eb0b Gitweb: https://git.kernel.org/tip/715bd9d12f84d8f5cc8ad21d888f9bc304a8eb0b Author: Andy Lutomirski AuthorDate: Mon, 1 Oct 2018 12:52:15 -0700 Committer: Thomas Gleixner CommitDate: Tue, 2 Oct 2018 08:28:15 +0200 x86/vdso: Fix asm constraints on vDSO syscall fallbacks The syscall fallbacks in the vDSO have incorrect asm constraints. They are not marked as writing to their outputs -- instead, they are marked as clobbering "memory", which is useless. In particular, gcc is smart enough to know that the timespec parameter hasn't escaped, so a memory clobber doesn't clobber it. And passing a pointer as an asm *input* does not tell gcc that the pointed-to value is changed. Add in the fact that the asm instructions weren't volatile, and gcc was free to omit them entirely unless their sole output (the return value) is used. Which it is (phew!), but that stops happening with some upcoming patches. As a trivial example, the following code: void test_fallback(struct timespec *ts) { vdso_fallback_gettime(CLOCK_MONOTONIC, ts); } compiles to: 00c0 : c0: c3 retq To add insult to injury, the RCX and R11 clobbers on 64-bit builds were missing. The "memory" clobber is also unnecessary -- no ordering with respect to other memory operations is needed, but that's going to be fixed in a separate not-for-stable patch. Fixes: 2aae950b21e4 ("x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/2c0231690551989d2fafa60ed0e7b5cc8b403908.1538422295.git.l...@kernel.org --- arch/x86/entry/vdso/vclock_gettime.c | 18 ++ 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index f19856d95c60..134e2d2e8add 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -43,8 +43,9 @@ extern u8 hvclock_page notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); + asm ("syscall" : "=a" (ret), "=m" (*ts) : +"0" (__NR_clock_gettime), "D" (clock), "S" (ts) : +"memory", "rcx", "r11"); return ret; } @@ -52,8 +53,9 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + asm ("syscall" : "=a" (ret), "=m" (*tv), "=m" (*tz) : +"0" (__NR_gettimeofday), "D" (tv), "S" (tz) : +"memory", "rcx", "r11"); return ret; } @@ -64,12 +66,12 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; - asm( + asm ( "mov %%ebx, %%edx \n" "mov %2, %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" - : "=a" (ret) + : "=a" (ret), "=m" (*ts) : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) : "memory", "edx"); return ret; @@ -79,12 +81,12 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) { long ret; - asm( + asm ( "mov %%ebx, %%edx \n" "mov %2, %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" - : "=a" (ret) + : "=a" (ret), "=m" (*tv), "=m" (*tz) : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) : "memory", "edx"); return ret;
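The fix is entirely about what the asm statement promises to the compiler. A userspace sketch of the before and after constraint styles for the 64-bit fallback (illustrative, not the kernel code verbatim):

#include <time.h>
#include <sys/syscall.h>

/*
 * OLD (broken) style: *ts is passed only as an input pointer, so the
 * compiler does not know the asm writes through it, and rcx/r11, which the
 * SYSCALL instruction itself clobbers, are not declared at all.
 */
static long fallback_broken(long clock, struct timespec *ts)
{
	long ret;
	asm ("syscall" : "=a" (ret)
	     : "0" (SYS_clock_gettime), "D" (clock), "S" (ts)
	     : "memory");
	return ret;
}

/*
 * NEW (fixed) style: *ts is an explicit "=m" output and rcx/r11 are listed
 * as clobbers, so the compiler must assume *ts changes and must not keep
 * live values in those registers across the asm.
 */
static long fallback_fixed(long clock, struct timespec *ts)
{
	long ret;
	asm ("syscall" : "=a" (ret), "=m" (*ts)
	     : "0" (SYS_clock_gettime), "D" (clock), "S" (ts)
	     : "memory", "rcx", "r11");
	return ret;
}

The "memory" clobber is kept here because this commit keeps it; the follow-up patch above removes it once the explicit outputs make it redundant.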
[tip:x86/urgent] selftests/x86: Add clock_gettime() tests to test_vdso
Commit-ID: 7c03e7035ac1cf2a6165754e4f3a49c2f1977838 Gitweb: https://git.kernel.org/tip/7c03e7035ac1cf2a6165754e4f3a49c2f1977838 Author: Andy Lutomirski AuthorDate: Mon, 1 Oct 2018 12:52:16 -0700 Committer: Thomas Gleixner CommitDate: Tue, 2 Oct 2018 08:28:32 +0200 selftests/x86: Add clock_gettime() tests to test_vdso Now that the vDSO implementation of clock_gettime() is getting reworked, add a selftest for it. This tests that its output is consistent with the syscall version. This is marked for stable to serve as a test for commit 715bd9d12f84 ("x86/vdso: Fix asm constraints on vDSO syscall fallbacks") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/082399674de2619b2befd8c0dde49b260605b126.1538422295.git.l...@kernel.org --- tools/testing/selftests/x86/test_vdso.c | 99 + 1 file changed, 99 insertions(+) diff --git a/tools/testing/selftests/x86/test_vdso.c b/tools/testing/selftests/x86/test_vdso.c index 235259011704..49f7294fb382 100644 --- a/tools/testing/selftests/x86/test_vdso.c +++ b/tools/testing/selftests/x86/test_vdso.c @@ -17,6 +17,7 @@ #include #include #include +#include #ifndef SYS_getcpu # ifdef __x86_64__ @@ -31,6 +32,10 @@ int nerrs = 0; +typedef int (*vgettime_t)(clockid_t, struct timespec *); + +vgettime_t vdso_clock_gettime; + typedef long (*getcpu_t)(unsigned *, unsigned *, void *); getcpu_t vgetcpu; @@ -95,6 +100,10 @@ static void fill_function_pointers() printf("Warning: failed to find getcpu in vDSO\n"); vgetcpu = (getcpu_t) vsyscall_getcpu(); + + vdso_clock_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime"); + if (!vdso_clock_gettime) + printf("Warning: failed to find clock_gettime in vDSO\n"); } static long sys_getcpu(unsigned * cpu, unsigned * node, @@ -103,6 +112,11 @@ static long sys_getcpu(unsigned * cpu, unsigned * node, return syscall(__NR_getcpu, cpu, node, cache); } +static inline int sys_clock_gettime(clockid_t id, struct timespec *ts) +{ + return syscall(__NR_clock_gettime, id, ts); +} + static void test_getcpu(void) { printf("[RUN]\tTesting getcpu...\n"); @@ -155,10 +169,95 @@ static void test_getcpu(void) } } +static bool ts_leq(const struct timespec *a, const struct timespec *b) +{ + if (a->tv_sec != b->tv_sec) + return a->tv_sec < b->tv_sec; + else + return a->tv_nsec <= b->tv_nsec; +} + +static char const * const clocknames[] = { + [0] = "CLOCK_REALTIME", + [1] = "CLOCK_MONOTONIC", + [2] = "CLOCK_PROCESS_CPUTIME_ID", + [3] = "CLOCK_THREAD_CPUTIME_ID", + [4] = "CLOCK_MONOTONIC_RAW", + [5] = "CLOCK_REALTIME_COARSE", + [6] = "CLOCK_MONOTONIC_COARSE", + [7] = "CLOCK_BOOTTIME", + [8] = "CLOCK_REALTIME_ALARM", + [9] = "CLOCK_BOOTTIME_ALARM", + [10] = "CLOCK_SGI_CYCLE", + [11] = "CLOCK_TAI", +}; + +static void test_one_clock_gettime(int clock, const char *name) +{ + struct timespec start, vdso, end; + int vdso_ret, end_ret; + + printf("[RUN]\tTesting clock_gettime for clock %s (%d)...\n", name, clock); + + if (sys_clock_gettime(clock, ) < 0) { + if (errno == EINVAL) { + vdso_ret = vdso_clock_gettime(clock, ); + if (vdso_ret == -EINVAL) { + printf("[OK]\tNo such clock.\n"); + } else { + printf("[FAIL]\tNo such clock, but __vdso_clock_gettime returned %d\n", vdso_ret); + nerrs++; + } + } else { + printf("[WARN]\t clock_gettime(%d) syscall returned error %d\n", clock, errno); + } + return; + } + + vdso_ret = vdso_clock_gettime(clock, ); + end_ret = sys_clock_gettime(clock, ); + + if (vdso_ret != 0 || end_ret != 0) { + printf("[FAIL]\tvDSO returned %d, syscall errno=%d\n", + vdso_ret, 
errno); + nerrs++; + return; + } + + printf("\t%llu.%09ld %llu.%09ld %llu.%09ld\n", + (unsigned long long)start.tv_sec, start.tv_nsec, + (unsigned long long)vdso.tv_sec, vdso.tv_nsec, + (unsigned long long)end.tv_sec, end.tv_nsec); + + if (!ts_leq(&start, &vdso) || !ts_leq(&vdso, &end)) { + printf("[FAIL]\tTimes are out of sequence\n"); + nerrs++; + } +} + +static void test_clock_gettime(void) +{ + for (int clock = 0; clock < sizeof(clocknames) / sizeof(clocknames[0]); +clock++) { + test_one_clock_gettime(clock, clocknames[clock]); + } + + /* Also test some invalid clock ids */ + test_one_clock_gettime(-1, "invalid"); + test_one_clock_gettime(INT_MIN,
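For reference, resolving __vdso_clock_gettime() the way the selftest does relies on the vDSO already being mapped into every process, so dlopen() with RTLD_NOLOAD merely returns a handle to it. A sketch of that lookup (sonames and flags follow common practice and may differ in detail from the selftest; link with -ldl):

#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdio.h>
#include <time.h>

typedef int (*vgettime_t)(clockid_t, struct timespec *);

static vgettime_t find_vdso_clock_gettime(void)
{
	void *vdso = dlopen("linux-vdso.so.1",
			    RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);

	if (!vdso)
		vdso = dlopen("linux-gate.so.1",
			      RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
	if (!vdso) {
		printf("[SKIP]\tfailed to get a vDSO handle\n");
		return NULL;
	}
	return (vgettime_t)dlsym(vdso, "__vdso_clock_gettime");
}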
[tip:x86/pti] x86/pti/64: Remove the SYSCALL64 entry trampoline
Commit-ID: bf904d2762ee6fc1e4acfcb0772bbfb4a27ad8a6 Gitweb: https://git.kernel.org/tip/bf904d2762ee6fc1e4acfcb0772bbfb4a27ad8a6 Author: Andy Lutomirski AuthorDate: Mon, 3 Sep 2018 15:59:44 -0700 Committer: Thomas Gleixner CommitDate: Wed, 12 Sep 2018 21:33:53 +0200 x86/pti/64: Remove the SYSCALL64 entry trampoline The SYSCALL64 trampoline has a couple of nice properties: - The usual sequence of SWAPGS followed by two GS-relative accesses to set up RSP is somewhat slow because the GS-relative accesses need to wait for SWAPGS to finish. The trampoline approach allows RIP-relative accesses to set up RSP, which avoids the stall. - The trampoline avoids any percpu access before CR3 is set up, which means that no percpu memory needs to be mapped in the user page tables. This prevents using Meltdown to read any percpu memory outside the cpu_entry_area and prevents using timing leaks to directly locate the percpu areas. The downsides of using a trampoline may outweigh the upsides, however. It adds an extra non-contiguous I$ cache line to system calls, and it forces an indirect jump to transfer control back to the normal kernel text after CR3 is set up. The latter is because x86 lacks a 64-bit direct jump instruction that could jump from the trampoline to the entry text. With retpolines enabled, the indirect jump is extremely slow. Change the code to map the percpu TSS into the user page tables to allow the non-trampoline SYSCALL64 path to work under PTI. This does not add a new direct information leak, since the TSS is readable by Meltdown from the cpu_entry_area alias regardless. It does allow a timing attack to locate the percpu area, but KASLR is more or less a lost cause against local attack on CPUs vulnerable to Meltdown regardless. As far as I'm concerned, on current hardware, KASLR is only useful to mitigate remote attacks that try to attack the kernel without first gaining RCE against a vulnerable user process. On Skylake, with CONFIG_RETPOLINE=y and KPTI on, this reduces syscall overhead from ~237ns to ~228ns. There is a possible alternative approach: Move the trampoline within 2G of the entry text and make a separate copy for each CPU. This would allow a direct jump to rejoin the normal entry path. There are pro's and con's for this approach: + It avoids a pipeline stall - It executes from an extra page and read from another extra page during the syscall. The latter is because it needs to use a relative addressing mode to find sp1 -- it's the same *cacheline*, but accessed using an alias, so it's an extra TLB entry. - Slightly more memory. This would be one page per CPU for a simple implementation and 64-ish bytes per CPU or one page per node for a more complex implementation. - More code complexity. The current approach is chosen for simplicity and because the alternative does not provide a significant benefit, which makes it worth. 
[ tglx: Added the alternative discussion to the changelog ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Josh Poimboeuf Cc: Joerg Roedel Cc: Jiri Olsa Cc: Andi Kleen Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/8c7c6e483612c3e4e10ca89495dc160b1aa66878.1536015544.git.l...@kernel.org --- arch/x86/entry/entry_64.S | 69 +-- arch/x86/include/asm/cpu_entry_area.h | 2 - arch/x86/include/asm/sections.h | 1 - arch/x86/kernel/asm-offsets.c | 2 - arch/x86/kernel/cpu/common.c | 13 +-- arch/x86/kernel/kprobes/core.c| 10 + arch/x86/kernel/vmlinux.lds.S | 10 - arch/x86/mm/cpu_entry_area.c | 36 -- arch/x86/mm/pti.c | 33 - 9 files changed, 37 insertions(+), 139 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 7e82e553183a..0d728142467f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -142,67 +142,6 @@ END(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ - .pushsection .entry_trampoline, "ax" - -/* - * The code in here gets remapped into cpu_entry_area's trampoline. This means - * that the assembler and linker have the wrong idea as to where this code - * lives (and, in fact, it's mapped more than once, so it's not even at a - * fixed address). So we can't reference any symbols outside the entry - * trampoline and expect it to work. - * - * Instead, we carefully abuse %rip-relative addressing. - * _entry_trampoline(%rip) refers to the start of the remapped) entry - * trampoline. We can thus find cpu_entry_area with this macro: - */ - -#define CPU_ENTRY_AREA \ - _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) - -/* The top word of the
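The ~237ns to ~228ns figure quoted above is per-syscall overhead. A rough sketch of how such a number can be measured (this is not the benchmark behind the changelog; the methodology here is assumed): time a tight loop over a trivial syscall and divide by the iteration count.

#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	const long iters = 10 * 1000 * 1000;
	struct timespec a, b;

	clock_gettime(CLOCK_MONOTONIC, &a);
	for (long i = 0; i < iters; i++)
		syscall(SYS_getppid);	/* forced syscall; not cached by libc */
	clock_gettime(CLOCK_MONOTONIC, &b);

	double ns = (b.tv_sec - a.tv_sec) * 1e9 + (b.tv_nsec - a.tv_nsec);
	printf("%.1f ns per syscall (incl. loop overhead)\n", ns / iters);
	return 0;
}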
[tip:x86/pti] x86/pti/64: Remove the SYSCALL64 entry trampoline
Commit-ID: 86635715ee4228ded59f662dab36e9732b9c978f Gitweb: https://git.kernel.org/tip/86635715ee4228ded59f662dab36e9732b9c978f Author: Andy Lutomirski AuthorDate: Mon, 3 Sep 2018 15:59:44 -0700 Committer: Thomas Gleixner CommitDate: Sat, 8 Sep 2018 11:53:16 +0200 x86/pti/64: Remove the SYSCALL64 entry trampoline The SYSCALL64 trampoline has a couple of nice properties: - The usual sequence of SWAPGS followed by two GS-relative accesses to set up RSP is somewhat slow because the GS-relative accesses need to wait for SWAPGS to finish. The trampoline approach allows RIP-relative accesses to set up RSP, which avoids the stall. - The trampoline avoids any percpu access before CR3 is set up, which means that no percpu memory needs to be mapped in the user page tables. This prevents using Meltdown to read any percpu memory outside the cpu_entry_area and prevents using timing leaks to directly locate the percpu areas. The downsides of using a trampoline may outweigh the upsides, however. It adds an extra non-contiguous I$ cache line to system calls, and it forces an indirect jump to transfer control back to the normal kernel text after CR3 is set up. The latter is because x86 lacks a 64-bit direct jump instruction that could jump from the trampoline to the entry text. With retpolines enabled, the indirect jump is extremely slow. Change the code to map the percpu TSS into the user page tables to allow the non-trampoline SYSCALL64 path to work under PTI. This does not add a new direct information leak, since the TSS is readable by Meltdown from the cpu_entry_area alias regardless. It does allow a timing attack to locate the percpu area, but KASLR is more or less a lost cause against local attack on CPUs vulnerable to Meltdown regardless. As far as I'm concerned, on current hardware, KASLR is only useful to mitigate remote attacks that try to attack the kernel without first gaining RCE against a vulnerable user process. On Skylake, with CONFIG_RETPOLINE=y and KPTI on, this reduces syscall overhead from ~237ns to ~228ns. There is a possible alternative approach: Move the trampoline within 2G of the entry text and make a separate copy for each CPU. This would allow a direct jump to rejoin the normal entry path. There are pros and cons to this approach: + It avoids a pipeline stall - It executes from an extra page and reads from another extra page during the syscall. The latter is because it needs to use a relative addressing mode to find sp1 -- it's the same *cacheline*, but accessed using an alias, so it's an extra TLB entry. - Slightly more memory. This would be one page per CPU for a simple implementation and 64-ish bytes per CPU or one page per node for a more complex implementation. - More code complexity. The current approach is chosen for simplicity and because the alternative does not provide a significant benefit, which makes it not worth the extra complexity.
[ tglx: Added the alternative discussion to the changelog ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Josh Poimboeuf Cc: Joerg Roedel Cc: Jiri Olsa Cc: Andi Kleen Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/8c7c6e483612c3e4e10ca89495dc160b1aa66878.1536015544.git.l...@kernel.org --- arch/x86/entry/entry_64.S | 69 +-- arch/x86/include/asm/cpu_entry_area.h | 2 - arch/x86/include/asm/sections.h | 1 - arch/x86/kernel/asm-offsets.c | 2 - arch/x86/kernel/cpu/common.c | 11 +- arch/x86/kernel/kprobes/core.c| 10 + arch/x86/kernel/vmlinux.lds.S | 10 - arch/x86/mm/cpu_entry_area.c | 36 -- arch/x86/mm/pti.c | 33 - 9 files changed, 36 insertions(+), 138 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 7e82e553183a..0d728142467f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -142,67 +142,6 @@ END(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ - .pushsection .entry_trampoline, "ax" - -/* - * The code in here gets remapped into cpu_entry_area's trampoline. This means - * that the assembler and linker have the wrong idea as to where this code - * lives (and, in fact, it's mapped more than once, so it's not even at a - * fixed address). So we can't reference any symbols outside the entry - * trampoline and expect it to work. - * - * Instead, we carefully abuse %rip-relative addressing. - * _entry_trampoline(%rip) refers to the start of the remapped) entry - * trampoline. We can thus find cpu_entry_area with this macro: - */ - -#define CPU_ENTRY_AREA \ - _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) - -/* The top word of the
[tip:x86/pti] x86/pti/64: Remove the SYSCALL64 entry trampoline
Commit-ID: 344347941aba1ec906ff50b4cdb8178c906746f2 Gitweb: https://git.kernel.org/tip/344347941aba1ec906ff50b4cdb8178c906746f2 Author: Andy Lutomirski AuthorDate: Mon, 3 Sep 2018 15:59:44 -0700 Committer: Thomas Gleixner CommitDate: Sat, 8 Sep 2018 11:20:12 +0200 x86/pti/64: Remove the SYSCALL64 entry trampoline The SYSCALL64 trampoline has a couple of nice properties: - The usual sequence of SWAPGS followed by two GS-relative accesses to set up RSP is somewhat slow because the GS-relative accesses need to wait for SWAPGS to finish. The trampoline approach allows RIP-relative accesses to set up RSP, which avoids the stall. - The trampoline avoids any percpu access before CR3 is set up, which means that no percpu memory needs to be mapped in the user page tables. This prevents using Meltdown to read any percpu memory outside the cpu_entry_area and prevents using timing leaks to directly locate the percpu areas. The downsides of using a trampoline may outweigh the upsides, however. It adds an extra non-contiguous I$ cache line to system calls, and it forces an indirect jump to transfer control back to the normal kernel text after CR3 is set up. The latter is because x86 lacks a 64-bit direct jump instruction that could jump from the trampoline to the entry text. With retpolines enabled, the indirect jump is extremely slow. Change the code to map the percpu TSS into the user page tables to allow the non-trampoline SYSCALL64 path to work under PTI. This does not add a new direct information leak, since the TSS is readable by Meltdown from the cpu_entry_area alias regardless. It does allow a timing attack to locate the percpu area, but KASLR is more or less a lost cause against local attack on CPUs vulnerable to Meltdown regardless. As far as I'm concerned, on current hardware, KASLR is only useful to mitigate remote attacks that try to attack the kernel without first gaining RCE against a vulnerable user process. On Skylake, with CONFIG_RETPOLINE=y and KPTI on, this reduces syscall overhead from ~237ns to ~228ns. There is a possible alternative approach: Move the trampoline within 2G of the entry text and make a separate copy for each CPU. This would allow a direct jump to rejoin the normal entry path. There are pros and cons to this approach: + It avoids a pipeline stall - It executes from an extra page and reads from another extra page during the syscall. The latter is because it needs to use a relative addressing mode to find sp1 -- it's the same *cacheline*, but accessed using an alias, so it's an extra TLB entry. - Slightly more memory. This would be one page per CPU for a simple implementation and 64-ish bytes per CPU or one page per node for a more complex implementation. - More code complexity. The current approach is chosen for simplicity and because the alternative does not provide a significant benefit, which makes it not worth the extra complexity.
[ tglx: Added the alternative discussion to the changelog ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Josh Poimboeuf Cc: Joerg Roedel Cc: Jiri Olsa Cc: Andi Kleen Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/8c7c6e483612c3e4e10ca89495dc160b1aa66878.1536015544.git.l...@kernel.org --- arch/x86/entry/entry_64.S | 69 +-- arch/x86/include/asm/cpu_entry_area.h | 2 - arch/x86/include/asm/sections.h | 1 - arch/x86/kernel/asm-offsets.c | 2 - arch/x86/kernel/cpu/common.c | 11 +- arch/x86/kernel/kprobes/core.c| 10 + arch/x86/kernel/vmlinux.lds.S | 10 - arch/x86/mm/cpu_entry_area.c | 36 -- arch/x86/mm/pti.c | 33 - 9 files changed, 36 insertions(+), 138 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 7e82e553183a..0d728142467f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -142,67 +142,6 @@ END(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ - .pushsection .entry_trampoline, "ax" - -/* - * The code in here gets remapped into cpu_entry_area's trampoline. This means - * that the assembler and linker have the wrong idea as to where this code - * lives (and, in fact, it's mapped more than once, so it's not even at a - * fixed address). So we can't reference any symbols outside the entry - * trampoline and expect it to work. - * - * Instead, we carefully abuse %rip-relative addressing. - * _entry_trampoline(%rip) refers to the start of the remapped) entry - * trampoline. We can thus find cpu_entry_area with this macro: - */ - -#define CPU_ENTRY_AREA \ - _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) - -/* The top word of the
[tip:x86/pti] x86/entry/64: Document idtentry
Commit-ID: bd7b1f7cbf9cb35dab8e1b99145d07afc5b7a132 Gitweb: https://git.kernel.org/tip/bd7b1f7cbf9cb35dab8e1b99145d07afc5b7a132 Author: Andy Lutomirski AuthorDate: Mon, 3 Sep 2018 15:59:42 -0700 Committer: Thomas Gleixner CommitDate: Sat, 8 Sep 2018 11:20:11 +0200 x86/entry/64: Document idtentry The idtentry macro is complicated and magical. Document what it does to help future readers and to allow future patches to adjust the code and docs at the same time. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Josh Poimboeuf Cc: Joerg Roedel Cc: Jiri Olsa Cc: Andi Kleen Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/6e56c3ad94879e41afe345750bc28ccc0e820ea8.1536015544.git.l...@kernel.org --- arch/x86/entry/entry_64.S | 36 arch/x86/kernel/traps.c | 4 2 files changed, 40 insertions(+) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 957dfb693ecc..ce6af4460e9c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -900,6 +900,42 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) +/** + * idtentry - Generate an IDT entry stub + * @sym: Name of the generated entry point + * @do_sym:C function to be called + * @has_error_code:True if this IDT vector has an error code on the stack + * @paranoid: non-zero means that this vector may be invoked from + * kernel mode with user GSBASE and/or user CR3. + * 2 is special -- see below. + * @shift_ist: Set to an IST index if entries from kernel mode should + * decrement the IST stack so that nested entries get a + * fresh stack. (This is for #DB, which has a nasty habit + * of recursing.) + * + * idtentry generates an IDT stub that sets up a usable kernel context, + * creates struct pt_regs, and calls @do_sym. The stub has the following + * special behaviors: + * + * On an entry from user mode, the stub switches from the trampoline or + * IST stack to the normal thread stack. On an exit to user mode, the + * normal exit-to-usermode path is invoked. + * + * On an exit to kernel mode, if @paranoid == 0, we check for preemption, + * whereas we omit the preemption check if @paranoid != 0. This is purely + * because the implementation is simpler this way. The kernel only needs + * to check for asynchronous kernel preemption when IRQ handlers return. + * + * If @paranoid == 0, then the stub will handle IRET faults by pretending + * that the fault came from user mode. It will handle gs_change faults by + * pretending that the fault happened with kernel GSBASE. Since this handling + * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have + * @paranoid == 0. This special handling will do the wrong thing for + * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0. + * + * @paranoid == 2 is special: the stub will never switch stacks. This is for + * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. 
+ */ .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e6db475164ed..1a90821c0b74 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -383,6 +383,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) * we won't enable interupts or schedule before we invoke * general_protection, so nothing will clobber the stack * frame we just set up. +* +* We will enter general_protection with kernel GSBASE, +* which is what the stub expects, given that the faulting +* RIP will be the IRET instruction. */ regs->ip = (unsigned long)general_protection; regs->sp = (unsigned long)>orig_ax;
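The user-mode behaviour documented above is easy to observe from an ordinary process: a fault raised in ring 3 enters the kernel through the matching idtentry stub and leaves through the normal exit-to-usermode path, which for #GP means the task receives SIGSEGV. A small illustrative program (not part of the patch) that provokes #GP with a privileged instruction:

/*
 * Illustration: HLT is privileged, so executing it in user mode raises
 * #GP; the kernel enters via the general_protection idtentry stub and
 * the process sees SIGSEGV on the way back out.
 * Build: gcc -O2 -o gp_demo gp_demo.c
 */
#include <signal.h>
#include <unistd.h>

static void handler(int sig)
{
	static const char msg[] = "caught SIGSEGV from user-mode #GP\n";

	(void)sig;
	write(STDOUT_FILENO, msg, sizeof(msg) - 1);	/* async-signal-safe */
	_exit(0);					/* returning would re-run HLT */
}

int main(void)
{
	signal(SIGSEGV, handler);
	asm volatile("hlt");	/* privileged instruction in ring 3 -> #GP */
	return 1;		/* not reached */
}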
[tip:x86/pti] x86/entry/64: Use the TSS sp2 slot for SYSCALL/SYSRET scratch space
Commit-ID: 98f05b5138f0a9b56022295cc1387e635b25635d Gitweb: https://git.kernel.org/tip/98f05b5138f0a9b56022295cc1387e635b25635d Author: Andy Lutomirski AuthorDate: Mon, 3 Sep 2018 15:59:43 -0700 Committer: Thomas Gleixner CommitDate: Sat, 8 Sep 2018 11:20:11 +0200 x86/entry/64: Use the TSS sp2 slot for SYSCALL/SYSRET scratch space In the non-trampoline SYSCALL64 path, a percpu variable is used to temporarily store the user RSP value. Instead of a separate variable, use the otherwise unused sp2 slot in the TSS. This will improve cache locality, as the sp1 slot is already used in the same code to find the kernel stack. It will also simplify a future change to make the non-trampoline path work in PTI mode. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Josh Poimboeuf Cc: Joerg Roedel Cc: Jiri Olsa Cc: Andi Kleen Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/08e769a0023dbad4bac6f34f3631dbaf8ad59f4f.1536015544.git.l...@kernel.org --- arch/x86/entry/entry_64.S| 16 +--- arch/x86/include/asm/processor.h | 6 ++ arch/x86/kernel/asm-offsets.c| 3 ++- arch/x86/kernel/process_64.c | 2 -- arch/x86/xen/xen-asm_64.S| 8 +--- 5 files changed, 22 insertions(+), 13 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ce6af4460e9c..7e82e553183a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -215,18 +215,20 @@ ENTRY(entry_SYSCALL_64) /* * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it * is not required to switch CR3. +* +* tss.sp2 is scratch space. */ - movq%rsp, PER_CPU_VAR(rsp_scratch) + movq%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) movqPER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ - pushq $__USER_DS /* pt_regs->ss */ - pushq PER_CPU_VAR(rsp_scratch)/* pt_regs->sp */ - pushq %r11/* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx/* pt_regs->ip */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */ + pushq %r11/* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx/* pt_regs->ip */ GLOBAL(entry_SYSCALL_64_after_hwframe) - pushq %rax/* pt_regs->orig_ax */ + pushq %rax/* pt_regs->orig_ax */ PUSH_AND_CLEAR_REGS rax=$-ENOSYS diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d53c54b842da..b2bb1d691efc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -315,7 +315,13 @@ struct x86_hw_tss { */ u64 sp1; + /* +* Since Linux does not use ring 2, the 'sp2' slot is unused by +* hardware. entry_SYSCALL_64 uses it as scratch space to stash +* the user RSP value. 
+*/ u64 sp2; + u64 reserved2; u64 ist[7]; u32 reserved3; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 01de31db300d..fc2e90d3429a 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -105,7 +105,8 @@ void common(void) { DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1))); - /* Offset for sp0 and sp1 into the tss_struct */ + /* Offset for fields in tss_struct */ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); + OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a451bc374b9b..0fa7aa19f09e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -59,8 +59,6 @@ #include #endif -__visible DEFINE_PER_CPU(unsigned long, rsp_scratch); - /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) { diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 417b339e5c8e..bb1c2da0381d 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -91,13 +91,15 @@ ENTRY(xen_iret) ENTRY(xen_sysret64) /* * We're already on the usermode stack at this point, but -* still
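The sp2 trick relies on the hardware-defined TSS layout: sp2 sits directly after sp1, and asm-offsets.c turns the field offsets into the TSS_sp0/TSS_sp1/TSS_sp2 constants used from assembly. As a rough userspace illustration (the struct below is a partial mirror of the layout implied by the hunk above; the leading reserved word and sp0 are assumptions, not taken from this patch), printing offsetof() values shows the kind of constants asm-offsets generates:

/*
 * Partial userspace mirror of the 64-bit hardware TSS, packed like the
 * kernel's structure. With this layout it prints 4, 12 and 20 for
 * sp0, sp1 and sp2.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct hw_tss_mirror {
	uint32_t reserved1;	/* assumed leading reserved word */
	uint64_t sp0;
	uint64_t sp1;
	uint64_t sp2;		/* unused by hardware; entry_SYSCALL_64 scratch */
	uint64_t reserved2;
	uint64_t ist[7];
} __attribute__((packed));

int main(void)
{
	printf("TSS_sp0 = %zu\n", offsetof(struct hw_tss_mirror, sp0));
	printf("TSS_sp1 = %zu\n", offsetof(struct hw_tss_mirror, sp1));
	printf("TSS_sp2 = %zu\n", offsetof(struct hw_tss_mirror, sp2));
	return 0;
}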
[tip:x86/urgent] x86/nmi: Fix NMI uaccess race against CR3 switching
Commit-ID: 4012e77a903d114f915fc607d6d2ed54a3d6c9b1 Gitweb: https://git.kernel.org/tip/4012e77a903d114f915fc607d6d2ed54a3d6c9b1 Author: Andy Lutomirski AuthorDate: Wed, 29 Aug 2018 08:47:18 -0700 Committer: Thomas Gleixner CommitDate: Fri, 31 Aug 2018 17:08:22 +0200 x86/nmi: Fix NMI uaccess race against CR3 switching A NMI can hit in the middle of context switching or in the middle of switch_mm_irqs_off(). In either case, CR3 might not match current->mm, which could cause copy_from_user_nmi() and friends to read the wrong memory. Fix it by adding a new nmi_uaccess_okay() helper and checking it in copy_from_user_nmi() and in __copy_from_user_nmi()'s callers. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Rik van Riel Cc: Nadav Amit Cc: Borislav Petkov Cc: Jann Horn Cc: Peter Zijlstra Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/dd956eba16646fd0b15c3c0741269dfd84452dac.1535557289.git.l...@kernel.org --- arch/x86/events/core.c | 2 +- arch/x86/include/asm/tlbflush.h | 40 arch/x86/lib/usercopy.c | 5 + arch/x86/mm/tlb.c | 7 +++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 5f4829f10129..dfb2f7c0d019 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs perf_callchain_store(entry, regs->ip); - if (!current->mm) + if (!nmi_uaccess_okay()) return; if (perf_callchain_user32(regs, entry)) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 29c9da6c62fc..58ce5288878e 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -175,8 +175,16 @@ struct tlb_state { * are on. This means that it may not match current->active_mm, * which will contain the previous user mm when we're in lazy TLB * mode even if we've already switched back to swapper_pg_dir. +* +* During switch_mm_irqs_off(), loaded_mm will be set to +* LOADED_MM_SWITCHING during the brief interrupts-off window +* when CR3 and loaded_mm would otherwise be inconsistent. This +* is for nmi_uaccess_okay()'s benefit. */ struct mm_struct *loaded_mm; + +#define LOADED_MM_SWITCHING ((struct mm_struct *)1) + u16 loaded_mm_asid; u16 next_asid; /* last user mm's ctx id */ @@ -246,6 +254,38 @@ struct tlb_state { }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); +/* + * Blindly accessing user memory from NMI context can be dangerous + * if we're in the middle of switching the current user task or + * switching the loaded mm. It can also be dangerous if we + * interrupted some kernel code that was temporarily using a + * different mm. + */ +static inline bool nmi_uaccess_okay(void) +{ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + struct mm_struct *current_mm = current->mm; + + VM_WARN_ON_ONCE(!loaded_mm); + + /* +* The condition we want to check is +* current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, +* if we're running in a VM with shadow paging, and nmi_uaccess_okay() +* is supposed to be reasonably fast. +* +* Instead, we check the almost equivalent but somewhat conservative +* condition below, and we rely on the fact that switch_mm_irqs_off() +* sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3. +*/ + if (loaded_mm != current_mm) + return false; + + VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); + + return true; +} + /* Initialize cr4 shadow for this CPU. 
*/ static inline void cr4_init_shadow(void) { diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index c8c6ad0d58b8..3f435d7fca5e 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -7,6 +7,8 @@ #include #include +#include + /* * We rely on the nested NMI work to allow atomic faults from the NMI path; the * nested NMI paths are careful to preserve CR2. @@ -19,6 +21,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) if (__range_not_ok(from, n, TASK_SIZE)) return n; + if (!nmi_uaccess_okay()) + return n; + /* * Even though this function is typically called from NMI/IRQ context * disable pagefaults so that its behaviour is consistent even when diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 9517d1b2a281..e96b99eb800c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -305,6 +305,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, choose_new_asid(next,
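The conservative check is safe because of ordering: switch_mm_irqs_off() publishes the LOADED_MM_SWITCHING sentinel before CR3 is written, so an NMI arriving mid-switch sees the sentinel and backs off instead of trusting a half-updated state. A loose userspace analogue of that publish-then-commit pattern (illustrative only, unrelated to the kernel implementation; a periodic SIGALRM stands in for the NMI):

/*
 * Sketch of the sentinel pattern: the "switcher" stores a sentinel into
 * the published pointer before committing the new context, and the
 * asynchronous handler refuses to act while the sentinel is visible.
 * Build: gcc -O2 -o sentinel_demo sentinel_demo.c
 */
#include <signal.h>
#include <stdio.h>
#include <sys/time.h>

#define SWITCHING ((void *)1)			/* analogue of LOADED_MM_SWITCHING */

static void * volatile loaded_ctx;		/* analogue of cpu_tlbstate.loaded_mm */
static volatile sig_atomic_t allowed, refused;
static int ctx_a, ctx_b;			/* two dummy contexts */

static void nmi_like_handler(int sig)
{
	(void)sig;
	if (loaded_ctx == SWITCHING)		/* analogue of !nmi_uaccess_okay() */
		refused++;
	else
		allowed++;
}

int main(void)
{
	struct itimerval it = {
		.it_interval = { .tv_usec = 50 },
		.it_value = { .tv_usec = 50 },
	};

	loaded_ctx = &ctx_a;
	signal(SIGALRM, nmi_like_handler);
	setitimer(ITIMER_REAL, &it, NULL);	/* "NMI" every 50 microseconds */

	for (int i = 0; i < 5000000; i++) {
		loaded_ctx = SWITCHING;			/* publish sentinel first */
		loaded_ctx = (i & 1) ? &ctx_b : &ctx_a;	/* then commit the switch */
	}

	it.it_interval.tv_usec = it.it_value.tv_usec = 0;
	setitimer(ITIMER_REAL, &it, NULL);
	printf("allowed=%d refused=%d\n", (int)allowed, (int)refused);
	return 0;
}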
[tip:x86/urgent] x86/nmi: Fix NMI uaccess race against CR3 switching
Commit-ID: 16f54a362e4083218ac8d67a4879532c6eef2d98 Gitweb: https://git.kernel.org/tip/16f54a362e4083218ac8d67a4879532c6eef2d98 Author: Andy Lutomirski AuthorDate: Wed, 29 Aug 2018 08:47:18 -0700 Committer: Thomas Gleixner CommitDate: Thu, 30 Aug 2018 16:31:19 +0200 x86/nmi: Fix NMI uaccess race against CR3 switching A NMI can hit in the middle of context switching or in the middle of switch_mm_irqs_off(). In either case, CR3 might not match current->mm, which could cause copy_from_user_nmi() and friends to read the wrong memory. Fix it by adding a new nmi_uaccess_okay() helper and checking it in copy_from_user_nmi() and in __copy_from_user_nmi()'s callers. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Rik van Riel Cc: Nadav Amit Cc: Borislav Petkov Cc: Jann Horn Cc: Peter Zijlstra Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/dd956eba16646fd0b15c3c0741269dfd84452dac.1535557289.git.l...@kernel.org --- arch/x86/events/core.c | 2 +- arch/x86/include/asm/tlbflush.h | 40 arch/x86/lib/usercopy.c | 5 + arch/x86/mm/tlb.c | 7 +++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 5f4829f10129..dfb2f7c0d019 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs perf_callchain_store(entry, regs->ip); - if (!current->mm) + if (!nmi_uaccess_okay()) return; if (perf_callchain_user32(regs, entry)) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 29c9da6c62fc..58ce5288878e 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -175,8 +175,16 @@ struct tlb_state { * are on. This means that it may not match current->active_mm, * which will contain the previous user mm when we're in lazy TLB * mode even if we've already switched back to swapper_pg_dir. +* +* During switch_mm_irqs_off(), loaded_mm will be set to +* LOADED_MM_SWITCHING during the brief interrupts-off window +* when CR3 and loaded_mm would otherwise be inconsistent. This +* is for nmi_uaccess_okay()'s benefit. */ struct mm_struct *loaded_mm; + +#define LOADED_MM_SWITCHING ((struct mm_struct *)1) + u16 loaded_mm_asid; u16 next_asid; /* last user mm's ctx id */ @@ -246,6 +254,38 @@ struct tlb_state { }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); +/* + * Blindly accessing user memory from NMI context can be dangerous + * if we're in the middle of switching the current user task or + * switching the loaded mm. It can also be dangerous if we + * interrupted some kernel code that was temporarily using a + * different mm. + */ +static inline bool nmi_uaccess_okay(void) +{ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + struct mm_struct *current_mm = current->mm; + + VM_WARN_ON_ONCE(!loaded_mm); + + /* +* The condition we want to check is +* current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, +* if we're running in a VM with shadow paging, and nmi_uaccess_okay() +* is supposed to be reasonably fast. +* +* Instead, we check the almost equivalent but somewhat conservative +* condition below, and we rely on the fact that switch_mm_irqs_off() +* sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3. +*/ + if (loaded_mm != current_mm) + return false; + + VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); + + return true; +} + /* Initialize cr4 shadow for this CPU. 
*/ static inline void cr4_init_shadow(void) { diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index c8c6ad0d58b8..3f435d7fca5e 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -7,6 +7,8 @@ #include #include +#include + /* * We rely on the nested NMI work to allow atomic faults from the NMI path; the * nested NMI paths are careful to preserve CR2. @@ -19,6 +21,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) if (__range_not_ok(from, n, TASK_SIZE)) return n; + if (!nmi_uaccess_okay()) + return n; + /* * Even though this function is typically called from NMI/IRQ context * disable pagefaults so that its behaviour is consistent even when diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 9517d1b2a281..e96b99eb800c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -305,6 +305,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, choose_new_asid(next,
[tip:x86/urgent] x86/vdso: Fix vDSO build if a retpoline is emitted
Commit-ID: 2e549b2ee0e358bc758480e716b881f9cabedb6a Gitweb: https://git.kernel.org/tip/2e549b2ee0e358bc758480e716b881f9cabedb6a Author: Andy Lutomirski AuthorDate: Thu, 16 Aug 2018 12:41:15 -0700 Committer: Thomas Gleixner CommitDate: Mon, 20 Aug 2018 18:04:41 +0200 x86/vdso: Fix vDSO build if a retpoline is emitted Currently, if the vDSO ends up containing an indirect branch or call, GCC will emit the "external thunk" style of retpoline, and it will fail to link. Fix it by building the vDSO with inline retpoline thunks. I haven't seen any reports of this triggering on an unpatched kernel. Fixes: commit 76b043848fd2 ("x86/retpoline: Add initial retpoline support") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Matt Rickard Cc: Borislav Petkov Cc: Jason Vas Dias Cc: David Woodhouse Cc: Peter Zijlstra Cc: Andi Kleen Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/c76538cd3afbe19c6246c2d1715bc6a60bd63985.1534448381.git.l...@kernel.org --- Makefile | 4 arch/x86/entry/vdso/Makefile | 6 -- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index a0650bf79606..7bab2e90e4e1 100644 --- a/Makefile +++ b/Makefile @@ -507,9 +507,13 @@ KBUILD_AFLAGS += $(call cc-option, -no-integrated-as) endif RETPOLINE_CFLAGS_GCC := -mindirect-branch=thunk-extern -mindirect-branch-register +RETPOLINE_VDSO_CFLAGS_GCC := -mindirect-branch=thunk-inline -mindirect-branch-register RETPOLINE_CFLAGS_CLANG := -mretpoline-external-thunk +RETPOLINE_VDSO_CFLAGS_CLANG := -mretpoline RETPOLINE_CFLAGS := $(call cc-option,$(RETPOLINE_CFLAGS_GCC),$(call cc-option,$(RETPOLINE_CFLAGS_CLANG))) +RETPOLINE_VDSO_CFLAGS := $(call cc-option,$(RETPOLINE_VDSO_CFLAGS_GCC),$(call cc-option,$(RETPOLINE_VDSO_CFLAGS_CLANG))) export RETPOLINE_CFLAGS +export RETPOLINE_VDSO_CFLAGS KBUILD_CFLAGS += $(call cc-option,-fno-PIE) KBUILD_AFLAGS += $(call cc-option,-fno-PIE) diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 9f695f517747..fa3f439f0a92 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -68,9 +68,9 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ -fno-omit-frame-pointer -foptimize-sibling-calls \ - -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO + -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO $(RETPOLINE_VDSO_CFLAGS) -$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) # # vDSO code runs in userspace and -pg doesn't help with profiling anyway. @@ -132,11 +132,13 @@ KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) $(obj)/vdso32.so.dbg: FORCE \
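Why the default retpoline flavour cannot be used for the vDSO is easy to see with a toy example: with -mindirect-branch=thunk-extern, GCC turns an indirect call into a call to an external __x86_indirect_thunk_* symbol, which the kernel provides for normal kernel objects but which nothing can resolve inside the standalone vDSO image, whereas -mindirect-branch=thunk-inline emits the thunk sequence in place so no external symbol is needed. A small sketch with an indirect call site (ordinary userspace C, not vDSO code; the flag behaviour is per GCC's retpoline support as used in the Makefile hunks above):

/*
 * Indirect call demo. Compiled normally it just runs; compiled with
 *   gcc -O2 -mindirect-branch=thunk-extern -mindirect-branch-register -c retp_demo.c
 * the object references an external __x86_indirect_thunk_* symbol
 * (inspect with: nm retp_demo.o), while -mindirect-branch=thunk-inline
 * keeps the thunk inside the object.
 */
#include <stdio.h>

static int add_one(int x)
{
	return x + 1;
}

int main(void)
{
	/* volatile stops GCC from turning this back into a direct call */
	int (*volatile op)(int) = add_one;

	printf("%d\n", op(41));
	return 0;
}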
[tip:x86/urgent] x86/entry/64: Remove %ebx handling from error_entry/exit
Commit-ID: b3681dd548d06deb2e1573890829dff4b15abf46 Gitweb: https://git.kernel.org/tip/b3681dd548d06deb2e1573890829dff4b15abf46 Author: Andy Lutomirski AuthorDate: Sun, 22 Jul 2018 11:05:09 -0700 Committer: Ingo Molnar CommitDate: Tue, 24 Jul 2018 10:07:36 +0200 x86/entry/64: Remove %ebx handling from error_entry/exit error_entry and error_exit communicate the user vs. kernel status of the frame using %ebx. This is unnecessary -- the information is in regs->cs. Just use regs->cs. This makes error_entry simpler and makes error_exit more robust. It also fixes a nasty bug. Before all the Spectre nonsense, the xen_failsafe_callback entry point returned like this: ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS ENCODE_FRAME_POINTER jmp error_exit And it did not go through error_entry. This was bogus: RBX contained garbage, and error_exit expected a flag in RBX. Fortunately, it generally contained *nonzero* garbage, so the correct code path was used. As part of the Spectre fixes, code was added to clear RBX to mitigate certain speculation attacks. Now, depending on kernel configuration, RBX got zeroed and, when running some Wine workloads, the kernel crashes. This was introduced by: commit 3ac6d8c787b8 ("x86/entry/64: Clear registers for exceptions/interrupts, to reduce speculation attack surface") With this patch applied, RBX is no longer needed as a flag, and the problem goes away. I suspect that malicious userspace could use this bug to crash the kernel even without the offending patch applied, though. [ Historical note: I wrote this patch as a cleanup before I was aware of the bug it fixed. ] [ Note to stable maintainers: this should probably get applied to all kernels. If you're nervous about that, a more conservative fix to add xorl %ebx,%ebx; incl %ebx before the jump to error_exit should also fix the problem. ] Reported-and-tested-by: M. Vefa Bicakci Signed-off-by: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Dominik Brodowski Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sta...@vger.kernel.org Cc: xen-de...@lists.xenproject.org Fixes: 3ac6d8c787b8 ("x86/entry/64: Clear registers for exceptions/interrupts, to reduce speculation attack surface") Link: http://lkml.kernel.org/r/b5010a090d3586b2d6e06c7ad3ec5542d1241c45.1532282627.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 18 -- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 73a522d53b53..8ae7ffda8f98 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -981,7 +981,7 @@ ENTRY(\sym) call\do_sym - jmp error_exit /* %ebx: no swapgs flag */ + jmp error_exit .endif END(\sym) .endm @@ -1222,7 +1222,6 @@ END(paranoid_exit) /* * Save all registers in pt_regs, and switch GS if needed. - * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) UNWIND_HINT_FUNC @@ -1269,7 +1268,6 @@ ENTRY(error_entry) * for these here too. */ .Lerror_kernelspace: - incl%ebx leaqnative_irq_return_iret(%rip), %rcx cmpq%rcx, RIP+8(%rsp) je .Lerror_bad_iret @@ -1303,28 +1301,20 @@ ENTRY(error_entry) /* * Pretend that the exception came from user mode: set up pt_regs -* as if we faulted immediately after IRET and clear EBX so that -* error_exit knows that we will be returning to user mode. +* as if we faulted immediately after IRET. 
*/ mov %rsp, %rdi callfixup_bad_iret mov %rax, %rsp - decl%ebx jmp .Lerror_entry_from_usermode_after_swapgs END(error_entry) - -/* - * On entry, EBX is a "return to kernel mode" flag: - * 1: already in kernel mode, don't need SWAPGS - * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode - */ ENTRY(error_exit) UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - testl %ebx, %ebx - jnz retint_kernel + testb $3, CS(%rsp) + jz retint_kernel jmp retint_user END(error_exit)
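The replacement check deserves a word: the low two bits of the saved CS selector are the privilege level of the interrupted context, so "testb $3, CS(%rsp)" answers the same user-vs-kernel question that %ebx used to encode. A rough C equivalent of that predicate (a sketch of the idea; the kernel's user_mode() helper works the same way):

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * The CPL lives in the low two bits of a code segment selector:
     * 0 means the exception came from kernel mode, 3 from user mode.
     */
    static bool came_from_user_mode(uint64_t saved_cs)
    {
            return (saved_cs & 3) != 0;
    }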
[tip:x86/urgent] selftests/x86/sigreturn: Do minor cleanups
Commit-ID: e8a445dea219c32727016af14f847d2e8f7ebec8 Gitweb: https://git.kernel.org/tip/e8a445dea219c32727016af14f847d2e8f7ebec8 Author: Andy Lutomirski AuthorDate: Tue, 26 Jun 2018 22:17:18 -0700 Committer: Ingo Molnar CommitDate: Wed, 27 Jun 2018 09:36:56 +0200 selftests/x86/sigreturn: Do minor cleanups We have short names for the requested and resulting register values. Use them instead of spelling out the whole register entry for each case. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bb3bc1f923a2f6fe7912d22a1068fe29d6033d38.1530076529.git.l...@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/sigreturn.c | 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c index 2559e2c01793..4d9dc3f2fd70 100644 --- a/tools/testing/selftests/x86/sigreturn.c +++ b/tools/testing/selftests/x86/sigreturn.c @@ -610,6 +610,7 @@ static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss) */ for (int i = 0; i < NGREG; i++) { greg_t req = requested_regs[i], res = resulting_regs[i]; + if (i == REG_TRAPNO || i == REG_IP) continue; /* don't care */ @@ -673,18 +674,18 @@ static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss) #endif /* Sanity check on the kernel */ - if (i == REG_CX && requested_regs[i] != resulting_regs[i]) { + if (i == REG_CX && req != res) { printf("[FAIL]\tCX (saved SP) mismatch: requested 0x%llx; got 0x%llx\n", - (unsigned long long)requested_regs[i], - (unsigned long long)resulting_regs[i]); + (unsigned long long)req, + (unsigned long long)res); nerrs++; continue; } - if (requested_regs[i] != resulting_regs[i] && !ignore_reg) { + if (req != res && !ignore_reg) { printf("[FAIL]\tReg %d mismatch: requested 0x%llx; got 0x%llx\n", - i, (unsigned long long)requested_regs[i], - (unsigned long long)resulting_regs[i]); + i, (unsigned long long)req, + (unsigned long long)res); nerrs++; } }
[tip:x86/urgent] selftests/x86/sigreturn/64: Fix spurious failures on AMD CPUs
Commit-ID: ec348020566009d3da9b99f07c05814d13969c78 Gitweb: https://git.kernel.org/tip/ec348020566009d3da9b99f07c05814d13969c78 Author: Andy Lutomirski AuthorDate: Tue, 26 Jun 2018 22:17:17 -0700 Committer: Ingo Molnar CommitDate: Wed, 27 Jun 2018 09:36:56 +0200 selftests/x86/sigreturn/64: Fix spurious failures on AMD CPUs When I wrote the sigreturn test, I didn't realize that AMD's busted IRET behavior was different from Intel's busted IRET behavior: On AMD CPUs, the CPU leaks the high 32 bits of the kernel stack pointer to certain userspace contexts. Gee, thanks. There's very little the kernel can do about it. Modify the test so it passes. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/86e7fd3564497f657de30a36da4505799eebef01.1530076529.git.l...@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/sigreturn.c | 46 + 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c index 246145b84a12..2559e2c01793 100644 --- a/tools/testing/selftests/x86/sigreturn.c +++ b/tools/testing/selftests/x86/sigreturn.c @@ -612,19 +612,38 @@ static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss) greg_t req = requested_regs[i], res = resulting_regs[i]; if (i == REG_TRAPNO || i == REG_IP) continue; /* don't care */ - if (i == REG_SP) { - printf("\tSP: %llx -> %llx\n", (unsigned long long)req, - (unsigned long long)res); + if (i == REG_SP) { /* -* In many circumstances, the high 32 bits of rsp -* are zeroed. For example, we could be a real -* 32-bit program, or we could hit any of a number -* of poorly-documented IRET or segmented ESP -* oddities. If this happens, it's okay. +* If we were using a 16-bit stack segment, then +* the kernel is a bit stuck: IRET only restores +* the low 16 bits of ESP/RSP if SS is 16-bit. +* The kernel uses a hack to restore bits 31:16, +* but that hack doesn't help with bits 63:32. +* On Intel CPUs, bits 63:32 end up zeroed, and, on +* AMD CPUs, they leak the high bits of the kernel +* espfix64 stack pointer. There's very little that +* the kernel can do about it. +* +* Similarly, if we are returning to a 32-bit context, +* the CPU will often lose the high 32 bits of RSP. */ - if (res == (req & 0x)) - continue; /* OK; not expected to work */ + + if (res == req) + continue; + + if (cs_bits != 64 && ((res ^ req) & 0x) == 0) { + printf("[NOTE]\tSP: %llx -> %llx\n", + (unsigned long long)req, + (unsigned long long)res); + continue; + } + + printf("[FAIL]\tSP mismatch: requested 0x%llx; got 0x%llx\n", + (unsigned long long)requested_regs[i], + (unsigned long long)resulting_regs[i]); + nerrs++; + continue; } bool ignore_reg = false; @@ -663,13 +682,6 @@ static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss) } if (requested_regs[i] != resulting_regs[i] && !ignore_reg) { - /* -* SP is particularly interesting here. The -* usual cause of failures is that we hit the -* nasty IRET case of returning to a 16-bit SS, -* in which case bits 16:31 of the *kernel* -* stack pointer persist in ESP. -*/ printf("[FAIL]\tReg %d mismatch: requested 0x%llx; got 0x%llx\n", i, (unsigned long long)requested_regs[i], (unsigned long long)resulting_regs[i]);
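Restated as a standalone helper, the acceptance rule the updated test applies to SP looks like this (a sketch mirroring the check in the diff above, not the selftest's actual code):

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * An exact match always passes.  For non-64-bit code segments, a mismatch
     * confined to bits 63:32 is reported as a note rather than a failure,
     * since IRET may zero those bits (Intel) or leak kernel espfix64 stack
     * bits into them (AMD).  Everything else is a real failure.
     */
    static bool sp_mismatch_is_fatal(int cs_bits, uint64_t req, uint64_t res)
    {
            if (res == req)
                    return false;
            if (cs_bits != 64 && ((res ^ req) & 0xffffffffULL) == 0)
                    return false;
            return true;
    }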
[tip:x86/urgent] x86/entry/64/compat: Fix "x86/entry/64/compat: Preserve r8-r11 in int $0x80"
Commit-ID: 22cd978e598618e82c3c3348d2069184f6884182 Gitweb: https://git.kernel.org/tip/22cd978e598618e82c3c3348d2069184f6884182 Author: Andy Lutomirski AuthorDate: Tue, 26 Jun 2018 22:45:52 -0700 Committer: Ingo Molnar CommitDate: Wed, 27 Jun 2018 09:35:40 +0200 x86/entry/64/compat: Fix "x86/entry/64/compat: Preserve r8-r11 in int $0x80" Commit: 8bb2610bc496 ("x86/entry/64/compat: Preserve r8-r11 in int $0x80") was busted: my original patch had a minor conflict with some of the nospec changes, but "git apply" is very clever and silently accepted the patch by making the same changes to a different function in the same file. There was obviously a huge offset, but "git apply" for some reason doesn't feel any need to say so. Move the changes to the correct function. Now the test_syscall_vdso_32 selftests passes. If anyone cares to observe the original problem, try applying the patch at: https://lore.kernel.org/lkml/d4c4d9985fbe64f8c9e19291886453914b48caee.1523975710.git.l...@kernel.org/raw to the kernel at 316d097c4cd4e7f2ef50c40cff2db266593c4ec4: - "git am" and "git apply" accept the patch without any complaints at all - "patch -p1" at least prints out a message about the huge offset. Reported-by: zhijianx...@intel.com Signed-off-by: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Greg Kroah-Hartman Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sta...@vger.kernel.org #v4.17+ Fixes: 8bb2610bc496 ("x86/entry/64/compat: Preserve r8-r11 in int $0x80") Link: http://lkml.kernel.org/r/6012b922485401bc42676e804171ded262fc2ef2.1530078306.git.l...@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 9de7f1e1dede..7d0df78db727 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -84,13 +84,13 @@ ENTRY(entry_SYSENTER_compat) pushq %rdx/* pt_regs->dx */ pushq %rcx/* pt_regs->cx */ pushq $-ENOSYS/* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 */ + pushq $0 /* pt_regs->r8 = 0 */ xorl%r8d, %r8d /* nospec r8 */ - pushq %r9 /* pt_regs->r9 */ + pushq $0 /* pt_regs->r9 = 0 */ xorl%r9d, %r9d /* nospec r9 */ - pushq %r10/* pt_regs->r10 */ + pushq $0 /* pt_regs->r10 = 0 */ xorl%r10d, %r10d/* nospec r10 */ - pushq %r11/* pt_regs->r11 */ + pushq $0 /* pt_regs->r11 = 0 */ xorl%r11d, %r11d/* nospec r11 */ pushq %rbx/* pt_regs->rbx */ xorl%ebx, %ebx /* nospec rbx */ @@ -374,13 +374,13 @@ ENTRY(entry_INT80_compat) pushq %rcx/* pt_regs->cx */ xorl%ecx, %ecx /* nospec cx */ pushq $-ENOSYS/* pt_regs->ax */ - pushq $0 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r8 */ xorl%r8d, %r8d /* nospec r8 */ - pushq $0 /* pt_regs->r9 = 0 */ + pushq %r9 /* pt_regs->r9 */ xorl%r9d, %r9d /* nospec r9 */ - pushq $0 /* pt_regs->r10 = 0 */ + pushq %r10/* pt_regs->r10*/ xorl%r10d, %r10d/* nospec r10 */ - pushq $0 /* pt_regs->r11 = 0 */ + pushq %r11/* pt_regs->r11 */ xorl%r11d, %r11d/* nospec r11 */ pushq %rbx/* pt_regs->rbx */ xorl%ebx, %ebx /* nospec rbx */
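The property being restored here -- that int $0x80 preserves r8-r11 -- can be probed from userspace. A hypothetical sketch, not the test_syscall_vdso_32 selftest itself; it assumes an x86_64 build and a kernel with 32-bit emulation enabled:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            const uint64_t pat = 0x1122334455667788ULL;
            register uint64_t r8  asm("r8")  = pat;
            register uint64_t r9  asm("r9")  = pat + 1;
            register uint64_t r10 asm("r10") = pat + 2;
            register uint64_t r11 asm("r11") = pat + 3;
            unsigned long ax = 20;  /* __NR_getpid in the 32-bit ABI */

            /* Invoke a harmless compat syscall; only eax should change. */
            asm volatile ("int $0x80"
                          : "+a" (ax), "+r" (r8), "+r" (r9), "+r" (r10), "+r" (r11)
                          :
                          : "memory");

            printf("r8-r11 %s across int $0x80\n",
                   (r8 == pat && r9 == pat + 1 && r10 == pat + 2 && r11 == pat + 3)
                   ? "preserved" : "clobbered");
            return 0;
    }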