Re: [PATCH v2 09/10] x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup

2016-03-07 Thread Borislav Petkov
On Sat, Mar 05, 2016 at 09:52:22PM -0800, Andy Lutomirski wrote:
> Right after SYSENTER, we can get a #DB or NMI.  On x86_32, there's no IST,
> so the exception handler is invoked on the temporary SYSENTER stack.
> 
> Because the SYSENTER stack is very small, we have a fixup to switch
> off the stack quickly when this happens.  The old fixup had several issues:
> 
> 1. It checked the interrupt frame's CS and EIP.  This wasn't
>obviously correct on Xen or if vm86 mode was in use [1].
> 
> 2. In the NMI handler, it did some frightening digging into the
>stack frame.  I'm not convinced this digging was correct.
> 
> 3. The fixup didn't switch stacks and then switch back.  Instead, it
>synthesized a brand new stack frame that would redirect the IRET
>back to the SYSENTER code.  That frame was highly questionable.
>For one thing, if NMI nested inside #DB, we would effectively
>abort the #DB prologue, which was probably safe but was
>frightening.  For another, the code used PUSHFL to write the
>FLAGS portion of the frame, which was simply bogus -- by the time
>PUSHFL was called, at least TF, NT, VM, and all of the arithmetic
>flags were clobbered.
> 
> Simplify this considerably.  Instead of looking at the saved frame
> to see where we came from, check the hardware ESP register against
> the SYSENTER stack directly.  Malicious user code cannot spoof the
> kernel ESP register, and by moving the check after SAVE_ALL, we can
> use normal PER_CPU accesses to find all the relevant addresses.
> 
> With this patch applied, the improved syscall_nt_32 test finally
> passes on 32-bit kernels.
> 
> [1] It isn't obviously correct, but it is nonetheless safe from vm86
> shenanigans as far as I can tell.  A user can't point EIP at
> entry_SYSENTER_32 while in vm86 mode because entry_SYSENTER_32,
> like all kernel addresses, is greater than 0x and would thus
> violate the CS segment limit.
> 
> Signed-off-by: Andy Lutomirski 
> ---
>  arch/x86/entry/entry_32.S| 114 
> ++-
>  arch/x86/kernel/asm-offsets_32.c |   5 ++
>  2 files changed, 56 insertions(+), 63 deletions(-)
> 
> diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
> index 7700cf695d8c..ad9a85e62128 100644
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -987,51 +987,48 @@ error_code:
>   jmp ret_from_exception
>  END(page_fault)
>  
> -/*
> - * Debug traps and NMI can happen at the one SYSENTER instruction
> - * that sets up the real kernel stack. Check here, since we can't
> - * allow the wrong stack to be used.
> - *
> - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
> - * already pushed 3 words if it hits on the sysenter instruction:
> - * eflags, cs and eip.
> - *
> - * We just load the right stack, and push the three (known) values
> - * by hand onto the new stack - while updating the return eip past
> - * the instruction that would have done it for sysenter.
> - */
> -.macro FIX_STACK offset ok label
> - cmpw$__KERNEL_CS, 4(%esp)
> - jne \ok
> -\label:
> - movlTSS_sysenter_sp0 + \offset(%esp), %esp
> - pushfl
> - pushl   $__KERNEL_CS
> - pushl   $sysenter_past_esp
> -.endm
> -
>  ENTRY(debug)
> + /*
> +  * #DB can happen at the first instruction of
> +  * entry_SYSENTER_32 or in Xen's SYSENTER prologue.  If this
> +  * happens, then we will be running on a very small stack.  We
> +  * need to detect this condition and switch to the thread
> +  * stack before calling any C code at all.
> +  *
> +  * If you edit this code, keep in mind that NMIs can happen in here.
> +  */

Btw, I think it is a bit more readable if this comment is over ENTRY(debug) like
the one over ENTRY(nmi) below.

>   ASM_CLAC
> - cmpl$entry_SYSENTER_32, (%esp)
> - jne debug_stack_correct
> - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
> -debug_stack_correct:
>   pushl   $-1 # mark this as an int
>   SAVE_ALL
> - TRACE_IRQS_OFF
>   xorl%edx, %edx  # error code 0
>   movl%esp, %eax  # pt_regs pointer
> +
> + /* Are we currently on the SYSENTER stack? */
> + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
> + subl%eax, %ecx  /* ecx = (end of SYENTER_stack) - esp */
 ^^^
Just a typo for the committer of this patch to fix: SYSENTER_stack

> + cmpl$SIZEOF_SYSENTER_stack, %ecx
> + jb  .Ldebug_from_sysenter_stack
> +
> + TRACE_IRQS_OFF
> + calldo_debug
> + jmp ret_from_exception
> +
> +.Ldebug_from_sysenter_stack:
> + /* We're on the SYSENTER stack.  Switch off. */
> + movl%esp, %ebp
> + movlPER_CPU_VAR(cpu_current_top_of_stack), %esp
> + 

Re: [PATCH v2 09/10] x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup

2016-03-07 Thread Borislav Petkov
On Sat, Mar 05, 2016 at 09:52:22PM -0800, Andy Lutomirski wrote:
> Right after SYSENTER, we can get a #DB or NMI.  On x86_32, there's no IST,
> so the exception handler is invoked on the temporary SYSENTER stack.
> 
> Because the SYSENTER stack is very small, we have a fixup to switch
> off the stack quickly when this happens.  The old fixup had several issues:
> 
> 1. It checked the interrupt frame's CS and EIP.  This wasn't
>obviously correct on Xen or if vm86 mode was in use [1].
> 
> 2. In the NMI handler, it did some frightening digging into the
>stack frame.  I'm not convinced this digging was correct.
> 
> 3. The fixup didn't switch stacks and then switch back.  Instead, it
>synthesized a brand new stack frame that would redirect the IRET
>back to the SYSENTER code.  That frame was highly questionable.
>For one thing, if NMI nested inside #DB, we would effectively
>abort the #DB prologue, which was probably safe but was
>frightening.  For another, the code used PUSHFL to write the
>FLAGS portion of the frame, which was simply bogus -- by the time
>PUSHFL was called, at least TF, NT, VM, and all of the arithmetic
>flags were clobbered.
> 
> Simplify this considerably.  Instead of looking at the saved frame
> to see where we came from, check the hardware ESP register against
> the SYSENTER stack directly.  Malicious user code cannot spoof the
> kernel ESP register, and by moving the check after SAVE_ALL, we can
> use normal PER_CPU accesses to find all the relevant addresses.
> 
> With this patch applied, the improved syscall_nt_32 test finally
> passes on 32-bit kernels.
> 
> [1] It isn't obviously correct, but it is nonetheless safe from vm86
> shenanigans as far as I can tell.  A user can't point EIP at
> entry_SYSENTER_32 while in vm86 mode because entry_SYSENTER_32,
> like all kernel addresses, is greater than 0x and would thus
> violate the CS segment limit.
> 
> Signed-off-by: Andy Lutomirski 
> ---
>  arch/x86/entry/entry_32.S| 114 
> ++-
>  arch/x86/kernel/asm-offsets_32.c |   5 ++
>  2 files changed, 56 insertions(+), 63 deletions(-)
> 
> diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
> index 7700cf695d8c..ad9a85e62128 100644
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -987,51 +987,48 @@ error_code:
>   jmp ret_from_exception
>  END(page_fault)
>  
> -/*
> - * Debug traps and NMI can happen at the one SYSENTER instruction
> - * that sets up the real kernel stack. Check here, since we can't
> - * allow the wrong stack to be used.
> - *
> - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
> - * already pushed 3 words if it hits on the sysenter instruction:
> - * eflags, cs and eip.
> - *
> - * We just load the right stack, and push the three (known) values
> - * by hand onto the new stack - while updating the return eip past
> - * the instruction that would have done it for sysenter.
> - */
> -.macro FIX_STACK offset ok label
> - cmpw$__KERNEL_CS, 4(%esp)
> - jne \ok
> -\label:
> - movlTSS_sysenter_sp0 + \offset(%esp), %esp
> - pushfl
> - pushl   $__KERNEL_CS
> - pushl   $sysenter_past_esp
> -.endm
> -
>  ENTRY(debug)
> + /*
> +  * #DB can happen at the first instruction of
> +  * entry_SYSENTER_32 or in Xen's SYSENTER prologue.  If this
> +  * happens, then we will be running on a very small stack.  We
> +  * need to detect this condition and switch to the thread
> +  * stack before calling any C code at all.
> +  *
> +  * If you edit this code, keep in mind that NMIs can happen in here.
> +  */

Btw, I think it is a bit more readable if this comment is over ENTRY(debug) like
the one over ENTRY(nmi) below.

>   ASM_CLAC
> - cmpl$entry_SYSENTER_32, (%esp)
> - jne debug_stack_correct
> - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
> -debug_stack_correct:
>   pushl   $-1 # mark this as an int
>   SAVE_ALL
> - TRACE_IRQS_OFF
>   xorl%edx, %edx  # error code 0
>   movl%esp, %eax  # pt_regs pointer
> +
> + /* Are we currently on the SYSENTER stack? */
> + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
> + subl%eax, %ecx  /* ecx = (end of SYENTER_stack) - esp */
 ^^^
Just a typo for the committer of this patch to fix: SYSENTER_stack

> + cmpl$SIZEOF_SYSENTER_stack, %ecx
> + jb  .Ldebug_from_sysenter_stack
> +
> + TRACE_IRQS_OFF
> + calldo_debug
> + jmp ret_from_exception
> +
> +.Ldebug_from_sysenter_stack:
> + /* We're on the SYSENTER stack.  Switch off. */
> + movl%esp, %ebp
> + movlPER_CPU_VAR(cpu_current_top_of_stack), %esp
> + TRACE_IRQS_OFF
> 

[PATCH v2 09/10] x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup

2016-03-05 Thread Andy Lutomirski
Right after SYSENTER, we can get a #DB or NMI.  On x86_32, there's no IST,
so the exception handler is invoked on the temporary SYSENTER stack.

Because the SYSENTER stack is very small, we have a fixup to switch
off the stack quickly when this happens.  The old fixup had several issues:

1. It checked the interrupt frame's CS and EIP.  This wasn't
   obviously correct on Xen or if vm86 mode was in use [1].

2. In the NMI handler, it did some frightening digging into the
   stack frame.  I'm not convinced this digging was correct.

3. The fixup didn't switch stacks and then switch back.  Instead, it
   synthesized a brand new stack frame that would redirect the IRET
   back to the SYSENTER code.  That frame was highly questionable.
   For one thing, if NMI nested inside #DB, we would effectively
   abort the #DB prologue, which was probably safe but was
   frightening.  For another, the code used PUSHFL to write the
   FLAGS portion of the frame, which was simply bogus -- by the time
   PUSHFL was called, at least TF, NT, VM, and all of the arithmetic
   flags were clobbered.

Simplify this considerably.  Instead of looking at the saved frame
to see where we came from, check the hardware ESP register against
the SYSENTER stack directly.  Malicious user code cannot spoof the
kernel ESP register, and by moving the check after SAVE_ALL, we can
use normal PER_CPU accesses to find all the relevant addresses.

With this patch applied, the improved syscall_nt_32 test finally
passes on 32-bit kernels.

[1] It isn't obviously correct, but it is nonetheless safe from vm86
shenanigans as far as I can tell.  A user can't point EIP at
entry_SYSENTER_32 while in vm86 mode because entry_SYSENTER_32,
like all kernel addresses, is greater than 0x and would thus
violate the CS segment limit.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/entry/entry_32.S| 114 ++-
 arch/x86/kernel/asm-offsets_32.c |   5 ++
 2 files changed, 56 insertions(+), 63 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 7700cf695d8c..ad9a85e62128 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -987,51 +987,48 @@ error_code:
jmp ret_from_exception
 END(page_fault)
 
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-.macro FIX_STACK offset ok label
-   cmpw$__KERNEL_CS, 4(%esp)
-   jne \ok
-\label:
-   movlTSS_sysenter_sp0 + \offset(%esp), %esp
-   pushfl
-   pushl   $__KERNEL_CS
-   pushl   $sysenter_past_esp
-.endm
-
 ENTRY(debug)
+   /*
+* #DB can happen at the first instruction of
+* entry_SYSENTER_32 or in Xen's SYSENTER prologue.  If this
+* happens, then we will be running on a very small stack.  We
+* need to detect this condition and switch to the thread
+* stack before calling any C code at all.
+*
+* If you edit this code, keep in mind that NMIs can happen in here.
+*/
ASM_CLAC
-   cmpl$entry_SYSENTER_32, (%esp)
-   jne debug_stack_correct
-   FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
-debug_stack_correct:
pushl   $-1 # mark this as an int
SAVE_ALL
-   TRACE_IRQS_OFF
xorl%edx, %edx  # error code 0
movl%esp, %eax  # pt_regs pointer
+
+   /* Are we currently on the SYSENTER stack? */
+   PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+   subl%eax, %ecx  /* ecx = (end of SYENTER_stack) - esp */
+   cmpl$SIZEOF_SYSENTER_stack, %ecx
+   jb  .Ldebug_from_sysenter_stack
+
+   TRACE_IRQS_OFF
+   calldo_debug
+   jmp ret_from_exception
+
+.Ldebug_from_sysenter_stack:
+   /* We're on the SYSENTER stack.  Switch off. */
+   movl%esp, %ebp
+   movlPER_CPU_VAR(cpu_current_top_of_stack), %esp
+   TRACE_IRQS_OFF
calldo_debug
+   movl%ebp, %esp
jmp ret_from_exception
 END(debug)
 
 /*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got  an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter 

[PATCH v2 09/10] x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup

2016-03-05 Thread Andy Lutomirski
Right after SYSENTER, we can get a #DB or NMI.  On x86_32, there's no IST,
so the exception handler is invoked on the temporary SYSENTER stack.

Because the SYSENTER stack is very small, we have a fixup to switch
off the stack quickly when this happens.  The old fixup had several issues:

1. It checked the interrupt frame's CS and EIP.  This wasn't
   obviously correct on Xen or if vm86 mode was in use [1].

2. In the NMI handler, it did some frightening digging into the
   stack frame.  I'm not convinced this digging was correct.

3. The fixup didn't switch stacks and then switch back.  Instead, it
   synthesized a brand new stack frame that would redirect the IRET
   back to the SYSENTER code.  That frame was highly questionable.
   For one thing, if NMI nested inside #DB, we would effectively
   abort the #DB prologue, which was probably safe but was
   frightening.  For another, the code used PUSHFL to write the
   FLAGS portion of the frame, which was simply bogus -- by the time
   PUSHFL was called, at least TF, NT, VM, and all of the arithmetic
   flags were clobbered.

Simplify this considerably.  Instead of looking at the saved frame
to see where we came from, check the hardware ESP register against
the SYSENTER stack directly.  Malicious user code cannot spoof the
kernel ESP register, and by moving the check after SAVE_ALL, we can
use normal PER_CPU accesses to find all the relevant addresses.

With this patch applied, the improved syscall_nt_32 test finally
passes on 32-bit kernels.

[1] It isn't obviously correct, but it is nonetheless safe from vm86
shenanigans as far as I can tell.  A user can't point EIP at
entry_SYSENTER_32 while in vm86 mode because entry_SYSENTER_32,
like all kernel addresses, is greater than 0x and would thus
violate the CS segment limit.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/entry/entry_32.S| 114 ++-
 arch/x86/kernel/asm-offsets_32.c |   5 ++
 2 files changed, 56 insertions(+), 63 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 7700cf695d8c..ad9a85e62128 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -987,51 +987,48 @@ error_code:
jmp ret_from_exception
 END(page_fault)
 
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-.macro FIX_STACK offset ok label
-   cmpw$__KERNEL_CS, 4(%esp)
-   jne \ok
-\label:
-   movlTSS_sysenter_sp0 + \offset(%esp), %esp
-   pushfl
-   pushl   $__KERNEL_CS
-   pushl   $sysenter_past_esp
-.endm
-
 ENTRY(debug)
+   /*
+* #DB can happen at the first instruction of
+* entry_SYSENTER_32 or in Xen's SYSENTER prologue.  If this
+* happens, then we will be running on a very small stack.  We
+* need to detect this condition and switch to the thread
+* stack before calling any C code at all.
+*
+* If you edit this code, keep in mind that NMIs can happen in here.
+*/
ASM_CLAC
-   cmpl$entry_SYSENTER_32, (%esp)
-   jne debug_stack_correct
-   FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
-debug_stack_correct:
pushl   $-1 # mark this as an int
SAVE_ALL
-   TRACE_IRQS_OFF
xorl%edx, %edx  # error code 0
movl%esp, %eax  # pt_regs pointer
+
+   /* Are we currently on the SYSENTER stack? */
+   PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+   subl%eax, %ecx  /* ecx = (end of SYENTER_stack) - esp */
+   cmpl$SIZEOF_SYSENTER_stack, %ecx
+   jb  .Ldebug_from_sysenter_stack
+
+   TRACE_IRQS_OFF
+   calldo_debug
+   jmp ret_from_exception
+
+.Ldebug_from_sysenter_stack:
+   /* We're on the SYSENTER stack.  Switch off. */
+   movl%esp, %ebp
+   movlPER_CPU_VAR(cpu_current_top_of_stack), %esp
+   TRACE_IRQS_OFF
calldo_debug
+   movl%ebp, %esp
jmp ret_from_exception
 END(debug)
 
 /*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got  an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
+ * NMI is