Signed-off-by: Brian Gerst <brge...@gmail.com> --- arch/x86/entry/calling.h | 10 +---- arch/x86/entry/common.c | 56 ++++++++++++++++++++++++++- arch/x86/entry/entry_64.S | 71 ++-------------------------------- arch/x86/include/asm/syscall.h | 2 +- 4 files changed, 60 insertions(+), 79 deletions(-)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 98e4d8886f11c..904477d3e388f 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -147,27 +147,19 @@ For 32-bit we have the following conventions - kernel is built with .endm -.macro POP_REGS pop_rdi=1 skip_r11rcx=0 +.macro POP_REGS pop_rdi=1 popq %r15 popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx - .if \skip_r11rcx - popq %rsi - .else popq %r11 - .endif popq %r10 popq %r9 popq %r8 popq %rax - .if \skip_r11rcx - popq %rsi - .else popq %rcx - .endif popq %rdx popq %rsi .if \pop_rdi diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 54ad1890aefca..9e01445f6679c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -365,9 +365,11 @@ __visible noinstr void syscall_return_slowpath(struct pt_regs *regs) } #ifdef CONFIG_X86_64 -__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) +__visible noinstr bool do_syscall_64(unsigned long nr, struct pt_regs *regs) { struct thread_info *ti; + long rip; + unsigned int shift_rip; check_user_regs(regs); @@ -394,6 +396,58 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) instrumentation_end(); exit_to_user_mode(); + + /* + * Check that the register state is valid for using SYSRET to exit + * to userspace. Otherwise use the slower IRET exit path. + */ + + /* SYSRET requires RCX == RIP and R11 = EFLAGS */ + if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) + return false; + + /* CS and SS must match values set in MSR_STAR */ + if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) + return false; + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. + * + * Change top bits to match most significant bit (47th or 56th bit + * depending on paging mode) in the address. + */ + shift_rip = (64 - __VIRTUAL_MASK_SHIFT + 1); + rip = (long) regs->ip; + rip <<= shift_rip; + rip >>= shift_rip; + if (unlikely((unsigned long) rip != regs->ip)) + return false; + + /* + * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot + * restore RF properly. If the slowpath sets it for whatever reason, we + * need to restore it correctly. + * + * SYSRET can restore TF, but unlike IRET, restoring TF results in a + * trap from userspace immediately after SYSRET. This would cause an + * infinite loop whenever #DB happens with register state that satisfies + * the opportunistic SYSRET conditions. For example, single-stepping + * this user code: + * + * movq $stuck_here, %rcx + * pushfq + * popq %r11 + * stuck_here: + * + * would never get past 'stuck_here'. + */ + if (unlikely(regs->flags & (X86_EFLAGS_RF|X86_EFLAGS_TF))) + return false; + + /* Use SYSRET to exit to userspace */ + return true; } #endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index fb729f4c4fbc2..b8025a62ac5e8 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -117,80 +117,15 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) movq %rsp, %rsi call do_syscall_64 /* returns with IRQs disabled */ - /* - * Try to use SYSRET instead of IRET if we're returning to - * a completely clean 64-bit userspace context. If we're not, - * go to the slow exit path. - */ - movq RCX(%rsp), %rcx - movq RIP(%rsp), %r11 - - cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ - jne swapgs_restore_regs_and_return_to_usermode - - /* - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. - * - * If width of "canonical tail" ever becomes variable, this will need - * to be updated to remain correct on both old and new CPUs. - * - * Change top bits to match most significant bit (47th or 56th bit - * depending on paging mode) in the address. - */ -#ifdef CONFIG_X86_5LEVEL - ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \ - "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57 -#else - shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx - sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx -#endif - - /* If this changed %rcx, it was not canonical */ - cmpq %rcx, %r11 - jne swapgs_restore_regs_and_return_to_usermode - - cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ - jne swapgs_restore_regs_and_return_to_usermode - - movq R11(%rsp), %r11 - cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ - jne swapgs_restore_regs_and_return_to_usermode - - /* - * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot - * restore RF properly. If the slowpath sets it for whatever reason, we - * need to restore it correctly. - * - * SYSRET can restore TF, but unlike IRET, restoring TF results in a - * trap from userspace immediately after SYSRET. This would cause an - * infinite loop whenever #DB happens with register state that satisfies - * the opportunistic SYSRET conditions. For example, single-stepping - * this user code: - * - * movq $stuck_here, %rcx - * pushfq - * popq %r11 - * stuck_here: - * - * would never get past 'stuck_here'. - */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz swapgs_restore_regs_and_return_to_usermode - - /* nothing to check for RSP */ - - cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ - jne swapgs_restore_regs_and_return_to_usermode + testb %al, %al /* Is SYSRET allowed? */ + jz swapgs_restore_regs_and_return_to_usermode /* * We win! This label is here just for ease of understanding * perf profiles. Nothing jumps here. */ syscall_return_via_sysret: - /* rcx and r11 are already restored (see code above) */ - POP_REGS pop_rdi=0 skip_r11rcx=1 + POP_REGS pop_rdi=0 /* * Now all regs are restored except RSP and RDI. diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 7cbf733d11afd..766f9b9736185 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -160,7 +160,7 @@ static inline int syscall_get_arch(struct task_struct *task) ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } -void do_syscall_64(unsigned long nr, struct pt_regs *regs); +bool do_syscall_64(unsigned long nr, struct pt_regs *regs); void do_int80_syscall_32(struct pt_regs *regs); long do_fast_syscall_32(struct pt_regs *regs); -- 2.26.2