On Sat, Mar 05, 2016 at 09:52:22PM -0800, Andy Lutomirski wrote:
> Right after SYSENTER, we can get a #DB or NMI.  On x86_32, there's no IST,
> so the exception handler is invoked on the temporary SYSENTER stack.
> 
> Because the SYSENTER stack is very small, we have a fixup to switch
> off the stack quickly when this happens.  The old fixup had several issues:
> 
> 1. It checked the interrupt frame's CS and EIP.  This wasn't
>    obviously correct on Xen or if vm86 mode was in use [1].
> 
> 2. In the NMI handler, it did some frightening digging into the
>    stack frame.  I'm not convinced this digging was correct.
> 
> 3. The fixup didn't switch stacks and then switch back.  Instead, it
>    synthesized a brand new stack frame that would redirect the IRET
>    back to the SYSENTER code.  That frame was highly questionable.
>    For one thing, if NMI nested inside #DB, we would effectively
>    abort the #DB prologue, which was probably safe but was
>    frightening.  For another, the code used PUSHFL to write the
>    FLAGS portion of the frame, which was simply bogus -- by the time
>    PUSHFL was called, at least TF, NT, VM, and all of the arithmetic
>    flags were clobbered.
> 
> Simplify this considerably.  Instead of looking at the saved frame
> to see where we came from, check the hardware ESP register against
> the SYSENTER stack directly.  Malicious user code cannot spoof the
> kernel ESP register, and by moving the check after SAVE_ALL, we can
> use normal PER_CPU accesses to find all the relevant addresses.
> 
> With this patch applied, the improved syscall_nt_32 test finally
> passes on 32-bit kernels.
> 
> [1] It isn't obviously correct, but it is nonetheless safe from vm86
>     shenanigans as far as I can tell.  A user can't point EIP at
>     entry_SYSENTER_32 while in vm86 mode because entry_SYSENTER_32,
>     like all kernel addresses, is greater than 0xffff and would thus
>     violate the CS segment limit.
> 
> Signed-off-by: Andy Lutomirski <l...@kernel.org>
> ---
>  arch/x86/entry/entry_32.S        | 114 
> ++++++++++++++++++---------------------
>  arch/x86/kernel/asm-offsets_32.c |   5 ++
>  2 files changed, 56 insertions(+), 63 deletions(-)
> 
> diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
> index 7700cf695d8c..ad9a85e62128 100644
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -987,51 +987,48 @@ error_code:
>       jmp     ret_from_exception
>  END(page_fault)
>  
> -/*
> - * Debug traps and NMI can happen at the one SYSENTER instruction
> - * that sets up the real kernel stack. Check here, since we can't
> - * allow the wrong stack to be used.
> - *
> - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
> - * already pushed 3 words if it hits on the sysenter instruction:
> - * eflags, cs and eip.
> - *
> - * We just load the right stack, and push the three (known) values
> - * by hand onto the new stack - while updating the return eip past
> - * the instruction that would have done it for sysenter.
> - */
> -.macro FIX_STACK offset ok label
> -     cmpw    $__KERNEL_CS, 4(%esp)
> -     jne     \ok
> -\label:
> -     movl    TSS_sysenter_sp0 + \offset(%esp), %esp
> -     pushfl
> -     pushl   $__KERNEL_CS
> -     pushl   $sysenter_past_esp
> -.endm
> -
>  ENTRY(debug)
> +     /*
> +      * #DB can happen at the first instruction of
> +      * entry_SYSENTER_32 or in Xen's SYSENTER prologue.  If this
> +      * happens, then we will be running on a very small stack.  We
> +      * need to detect this condition and switch to the thread
> +      * stack before calling any C code at all.
> +      *
> +      * If you edit this code, keep in mind that NMIs can happen in here.
> +      */

Btw, I think it is a bit more readable if this comment is over ENTRY(debug) like
the one over ENTRY(nmi) below.

>       ASM_CLAC
> -     cmpl    $entry_SYSENTER_32, (%esp)
> -     jne     debug_stack_correct
> -     FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
> -debug_stack_correct:
>       pushl   $-1                             # mark this as an int
>       SAVE_ALL
> -     TRACE_IRQS_OFF
>       xorl    %edx, %edx                      # error code 0
>       movl    %esp, %eax                      # pt_regs pointer
> +
> +     /* Are we currently on the SYSENTER stack? */
> +     PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
> +     subl    %eax, %ecx              /* ecx = (end of SYENTER_stack) - esp */
                                                         ^^^^^^^
Just a typo for the committer of this patch to fix:     SYSENTER_stack

> +     cmpl    $SIZEOF_SYSENTER_stack, %ecx
> +     jb      .Ldebug_from_sysenter_stack
> +
> +     TRACE_IRQS_OFF
> +     call    do_debug
> +     jmp     ret_from_exception
> +
> +.Ldebug_from_sysenter_stack:
> +     /* We're on the SYSENTER stack.  Switch off. */
> +     movl    %esp, %ebp
> +     movl    PER_CPU_VAR(cpu_current_top_of_stack), %esp
> +     TRACE_IRQS_OFF
>       call    do_debug
> +     movl    %ebp, %esp
>       jmp     ret_from_exception
>  END(debug)
>  
>  /*
> - * NMI is doubly nasty. It can happen _while_ we're handling
> - * a debug fault, and the debug fault hasn't yet been able to
> - * clear up the stack. So we first check whether we got  an
> - * NMI on the sysenter entry path, but after that we need to
> - * check whether we got an NMI on the debug path where the debug
> - * fault happened on the sysenter path.
> + * NMI is doubly nasty.  It can happen on the first instruction of
> + * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
> + * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
> + * switched stacks.  We handle both conditions by simply checking whether we
> + * interrupted kernel code running on the SYSENTER stack.
>   */
>  ENTRY(nmi)
>       ASM_CLAC
> @@ -1042,41 +1039,32 @@ ENTRY(nmi)
>       popl    %eax
>       je      nmi_espfix_stack
>  #endif
> -     cmpl    $entry_SYSENTER_32, (%esp)
> -     je      nmi_stack_fixup
> -     pushl   %eax
> -     movl    %esp, %eax
> -     /*
> -      * Do not access memory above the end of our stack page,
> -      * it might not exist.
> -      */
> -     andl    $(THREAD_SIZE-1), %eax
> -     cmpl    $(THREAD_SIZE-20), %eax
> -     popl    %eax
> -     jae     nmi_stack_correct
> -     cmpl    $entry_SYSENTER_32, 12(%esp)
> -     je      nmi_debug_stack_check
> -nmi_stack_correct:
> -     pushl   %eax
> +
> +     pushl   %eax                            # pt_regs->orig_ax
>       SAVE_ALL
>       xorl    %edx, %edx                      # zero error code
>       movl    %esp, %eax                      # pt_regs pointer
> +
> +     /* Are we currently on the SYSENTER stack? */
> +     PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
> +     subl    %eax, %ecx              /* ecx = (end of SYENTER_stack) - esp */

Typo copied :)

> +     cmpl    $SIZEOF_SYSENTER_stack, %ecx
> +     jb      .Lnmi_from_sysenter_stack
> +
> +     /* Not on SYSENTER stack. */
>       call    do_nmi
>       jmp     restore_all_notrace
>  
> -nmi_stack_fixup:
> -     FIX_STACK 12, nmi_stack_correct, 1
> -     jmp     nmi_stack_correct
> -
> -nmi_debug_stack_check:
> -     cmpw    $__KERNEL_CS, 16(%esp)
> -     jne     nmi_stack_correct
> -     cmpl    $debug, (%esp)
> -     jb      nmi_stack_correct
> -     cmpl    $debug_esp_fix_insn, (%esp)
> -     ja      nmi_stack_correct
> -     FIX_STACK 24, nmi_stack_correct, 1
> -     jmp     nmi_stack_correct
> +.Lnmi_from_sysenter_stack:
> +     /*
> +      * We're on the SYSENTER stack.  Switch off.  No one (not even debug)
> +      * is using the thread stack right now, so it's safe for us to use it.
> +      */
> +     movl    %esp, %ebp
> +     movl    PER_CPU_VAR(cpu_current_top_of_stack), %esp
> +     call    do_nmi
> +     movl    %ebp, %esp
> +     jmp     restore_all_notrace
>  
>  #ifdef CONFIG_X86_ESPFIX32
>  nmi_espfix_stack:

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

Reply via email to