3.14-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Andy Lutomirski <l...@kernel.org>

commit 9b6e6a8334d56354853f9c255d1395c2ba570e0a upstream.

Returning to userspace is tricky: IRET can fail, and ESPFIX can
rearrange the stack prior to IRET.

The NMI nesting fixup relies on a precise stack layout and
atomic IRET.  Rather than trying to teach the NMI nesting fixup
to handle ESPFIX and failed IRET, punt: run NMIs that came from
user mode on the normal kernel stack.

This will make some nested NMIs visible to C code, but the C
code is okay with that.

As a side effect, this should speed up perf: it eliminates an
RDMSR when NMIs come from user mode.

Signed-off-by: Andy Lutomirski <l...@kernel.org>
Reviewed-by: Steven Rostedt <rost...@goodmis.org>
Reviewed-by: Borislav Petkov <b...@suse.de>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: sta...@vger.kernel.org
Signed-off-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gre...@linuxfoundation.org>

---
 arch/x86/kernel/entry_64.S |   77 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 73 insertions(+), 4 deletions(-)

--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1715,19 +1715,88 @@ ENTRY(nmi)
         * a nested NMI that updated the copy interrupt stack frame, a
         * jump will be made to the repeat_nmi code that will handle the second
         * NMI.
+        *
+        * However, espfix prevents us from directly returning to userspace
+        * with a single IRET instruction.  Similarly, IRET to user mode
+        * can fault.  We therefore handle NMIs from user space like
+        * other IST entries.
         */
 
        /* Use %rdx as out temp variable throughout */
        pushq_cfi %rdx
        CFI_REL_OFFSET rdx, 0
 
+       testb   $3, CS-RIP+8(%rsp)
+       jz      .Lnmi_from_kernel
+
+       /*
+        * NMI from user mode.  We need to run on the thread stack, but we
+        * can't go through the normal entry paths: NMIs are masked, and
+        * we don't want to enable interrupts, because then we'll end
+        * up in an awkward situation in which IRQs are on but NMIs
+        * are off.
+        */
+       SWAPGS
+       cld
+       movq    %rsp, %rdx
+       movq    PER_CPU_VAR(kernel_stack), %rsp
+       addq    $KERNEL_STACK_OFFSET, %rsp
+       pushq   5*8(%rdx)       /* pt_regs->ss */
+       pushq   4*8(%rdx)       /* pt_regs->rsp */
+       pushq   3*8(%rdx)       /* pt_regs->flags */
+       pushq   2*8(%rdx)       /* pt_regs->cs */
+       pushq   1*8(%rdx)       /* pt_regs->rip */
+       pushq   $-1             /* pt_regs->orig_ax */
+       pushq   %rdi            /* pt_regs->di */
+       pushq   %rsi            /* pt_regs->si */
+       pushq   (%rdx)          /* pt_regs->dx */
+       pushq   %rcx            /* pt_regs->cx */
+       pushq   %rax            /* pt_regs->ax */
+       pushq   %r8             /* pt_regs->r8 */
+       pushq   %r9             /* pt_regs->r9 */
+       pushq   %r10            /* pt_regs->r10 */
+       pushq   %r11            /* pt_regs->r11 */
+       pushq   %rbx            /* pt_regs->rbx */
+       pushq   %rbp            /* pt_regs->rbp */
+       pushq   %r12            /* pt_regs->r12 */
+       pushq   %r13            /* pt_regs->r13 */
+       pushq   %r14            /* pt_regs->r14 */
+       pushq   %r15            /* pt_regs->r15 */
+
+       /*
+        * At this point we no longer need to worry about stack damage
+        * due to nesting -- we're on the normal thread stack and we're
+        * done with the NMI stack.
+        */
+       movq    %rsp, %rdi
+       movq    $-1, %rsi
+       call    do_nmi
+
+       /*
+        * Return back to user mode.  We must *not* do the normal exit
+        * work, because we don't want to enable interrupts.  Fortunately,
+        * do_nmi doesn't modify pt_regs.
+        */
+       SWAPGS
+
        /*
-        * If %cs was not the kernel segment, then the NMI triggered in user
-        * space, which means it is definitely not nested.
+        * Open-code the entire return process for compatibility with varying
+        * register layouts across different kernel versions.
         */
-       cmpl $__KERNEL_CS, 16(%rsp)
-       jne first_nmi
+       addq    $6*8, %rsp      /* skip bx, bp, and r12-r15 */
+       popq    %r11            /* pt_regs->r11 */
+       popq    %r10            /* pt_regs->r10 */
+       popq    %r9             /* pt_regs->r9 */
+       popq    %r8             /* pt_regs->r8 */
+       popq    %rax            /* pt_regs->ax */
+       popq    %rcx            /* pt_regs->cx */
+       popq    %rdx            /* pt_regs->dx */
+       popq    %rsi            /* pt_regs->si */
+       popq    %rdi            /* pt_regs->di */
+       addq    $8, %rsp        /* skip orig_ax */
+       INTERRUPT_RETURN
 
+.Lnmi_from_kernel:
        /*
         * Check the special variable on the stack to see if NMIs are
         * executing.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to