Commit-ID:  a0d14b8909de55139b8702fe0c7e80b69763dcfb
Gitweb:     https://git.kernel.org/tip/a0d14b8909de55139b8702fe0c7e80b69763dcfb
Author:     Peter Zijlstra <pet...@infradead.org>
AuthorDate: Thu, 11 Jul 2019 13:40:59 +0200
Committer:  Thomas Gleixner <t...@linutronix.de>
CommitDate: Wed, 17 Jul 2019 23:17:38 +0200

x86/mm, tracing: Fix CR2 corruption

Despite the current efforts to read CR2 before tracing happens, there still
exist a number of possible holes:

  idtentry page_fault             do_page_fault           has_error_code=1
    call error_entry
      TRACE_IRQS_OFF
        call trace_hardirqs_off*
          #PF // modifies CR2

      CALL_enter_from_user_mode
        __context_tracking_exit()
          trace_user_exit(0)
            #PF // modifies CR2

    call do_page_fault
      address = read_cr2(); /* whoopsie */

And similarly for i386.

Fix it by pulling the CR2 read into the entry code, before any of that
stuff gets a chance to run and ruin things.

Reported-by: He Zhe <zhe...@windriver.com>
Reported-by: Eiichi Tsukata <de...@etsukata.com>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Signed-off-by: Thomas Gleixner <t...@linutronix.de>
Reviewed-by: Andy Lutomirski <l...@kernel.org>
Cc: b...@alien8.de
Cc: rost...@goodmis.org
Cc: torva...@linux-foundation.org
Cc: h...@zytor.com
Cc: dave.han...@linux.intel.com
Cc: jgr...@suse.com
Cc: j...@joelfernandes.org
Link: https://lkml.kernel.org/r/20190711114336.116812...@infradead.org

Debugged-by: Steven Rostedt <rost...@goodmis.org>
---
 arch/x86/entry/entry_32.S       | 25 ++++++++++++++++++++++---
 arch/x86/entry/entry_64.S       | 35 ++++++++++++++++++-----------------
 arch/x86/include/asm/kvm_para.h |  2 +-
 arch/x86/include/asm/traps.h    |  4 ++--
 arch/x86/kernel/kvm.c           |  8 ++++----
 arch/x86/kernel/traps.c         |  6 +-----
 arch/x86/mm/fault.c             | 30 +++++++++++-------------------
 7 files changed, 59 insertions(+), 51 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 4d4b6100f0e8..2bb986f305ac 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1443,9 +1443,28 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR,
 
 ENTRY(page_fault)
        ASM_CLAC
-       pushl   $do_page_fault
-       ALIGN
-       jmp common_exception
+       pushl   $0; /* %gs's slot on the stack */
+
+       SAVE_ALL switch_stacks=1 skip_gs=1
+
+       ENCODE_FRAME_POINTER
+       UNWIND_ESPFIX_STACK
+
+       /* fixup %gs */
+       GS_TO_REG %ecx
+       REG_TO_PTGS %ecx
+       SET_KERNEL_GS %ecx
+
+       GET_CR2_INTO(%ecx)                      # might clobber %eax
+
+       /* fixup orig %eax */
+       movl    PT_ORIG_EAX(%esp), %edx         # get the error code
+       movl    $-1, PT_ORIG_EAX(%esp)          # no syscall to restart
+
+       TRACE_IRQS_OFF
+       movl    %esp, %eax                      # pt_regs pointer
+       call    do_page_fault
+       jmp     ret_from_exception
 END(page_fault)
 
 common_exception:
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 95ae05f0edf2..7cb2e1f1ec09 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -864,7 +864,7 @@ apicinterrupt IRQ_WORK_VECTOR                       irq_work_interrupt              smp_irq_work_interrupt
  */
 #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
 
-.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, 
ist_offset=0
+.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, 
shift_ist=-1, ist_offset=0
 
        .if \paranoid
        call    paranoid_entry
@@ -874,12 +874,21 @@ apicinterrupt IRQ_WORK_VECTOR                     irq_work_interrupt              smp_irq_work_interrupt
        .endif
        UNWIND_HINT_REGS
 
-       .if \paranoid
+       .if \read_cr2
+       GET_CR2_INTO(%rdx);                     /* can clobber %rax */
+       .endif
+
        .if \shift_ist != -1
        TRACE_IRQS_OFF_DEBUG                    /* reload IDT in case of 
recursion */
        .else
        TRACE_IRQS_OFF
        .endif
+
+       .if \paranoid == 0
+       testb   $3, CS(%rsp)
+       jz      .Lfrom_kernel_no_context_tracking_\@
+       CALL_enter_from_user_mode
+.Lfrom_kernel_no_context_tracking_\@:
        .endif
 
        movq    %rsp, %rdi                      /* pt_regs pointer */
@@ -923,6 +932,7 @@ apicinterrupt IRQ_WORK_VECTOR                       irq_work_interrupt              smp_irq_work_interrupt
  *                     fresh stack.  (This is for #DB, which has a nasty habit
  *                     of recursing.)
  * @create_gap:                create a 6-word stack gap when coming from 
kernel mode.
+ * @read_cr2:          load CR2 into the 3rd argument; done before calling any 
C code
  *
  * idtentry generates an IDT stub that sets up a usable kernel context,
  * creates struct pt_regs, and calls @do_sym.  The stub has the following
@@ -947,7 +957,7 @@ apicinterrupt IRQ_WORK_VECTOR                       irq_work_interrupt              smp_irq_work_interrupt
  * @paranoid == 2 is special: the stub will never switch stacks.  This is for
  * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
  */
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 
ist_offset=0 create_gap=0
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 
ist_offset=0 create_gap=0 read_cr2=0
 ENTRY(\sym)
        UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 
@@ -985,7 +995,7 @@ ENTRY(\sym)
 .Lfrom_usermode_no_gap_\@:
        .endif
 
-       idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, 
\ist_offset
+       idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, 
\shift_ist, \ist_offset
 
        .if \paranoid == 1
        /*
@@ -994,7 +1004,7 @@ ENTRY(\sym)
         * run in real process context if user_mode(regs).
         */
 .Lfrom_usermode_switch_stack_\@:
-       idtentry_part \do_sym, \has_error_code, paranoid=0
+       idtentry_part \do_sym, \has_error_code, \read_cr2, paranoid=0
        .endif
 
 _ASM_NOKPROBE(\sym)
@@ -1006,7 +1016,7 @@ idtentry overflow                 do_overflow                     has_error_code=0
 idtentry bounds                                do_bounds                       
has_error_code=0
 idtentry invalid_op                    do_invalid_op                   
has_error_code=0
 idtentry device_not_available          do_device_not_available         
has_error_code=0
-idtentry double_fault                  do_double_fault                 
has_error_code=1 paranoid=2
+idtentry double_fault                  do_double_fault                 
has_error_code=1 paranoid=2 read_cr2=1
 idtentry coprocessor_segment_overrun   do_coprocessor_segment_overrun  
has_error_code=0
 idtentry invalid_TSS                   do_invalid_TSS                  
has_error_code=1
 idtentry segment_not_present           do_segment_not_present          
has_error_code=1
@@ -1179,10 +1189,10 @@ idtentry xenint3                do_int3                 has_error_code=0
 #endif
 
 idtentry general_protection    do_general_protection   has_error_code=1
-idtentry page_fault            do_page_fault           has_error_code=1
+idtentry page_fault            do_page_fault           has_error_code=1        
read_cr2=1
 
 #ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault      do_async_page_fault     has_error_code=1
+idtentry async_page_fault      do_async_page_fault     has_error_code=1        
read_cr2=1
 #endif
 
 #ifdef CONFIG_X86_MCE
@@ -1281,18 +1291,9 @@ ENTRY(error_entry)
        movq    %rax, %rsp                      /* switch stack */
        ENCODE_FRAME_POINTER
        pushq   %r12
-
-       /*
-        * We need to tell lockdep that IRQs are off.  We can't do this until
-        * we fix gsbase, and we should do it before enter_from_user_mode
-        * (which can take locks).
-        */
-       TRACE_IRQS_OFF
-       CALL_enter_from_user_mode
        ret
 
 .Lerror_entry_done:
-       TRACE_IRQS_OFF
        ret
 
        /*
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 5ed3cf1c3934..9b4df6eaa11a 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -92,7 +92,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
-void do_async_page_fault(struct pt_regs *regs, unsigned long error_code);
+void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, 
unsigned long address);
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 void __init kvm_spinlock_init(void);
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 7d6f3f3fad78..5dd1674ddf4c 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -74,14 +74,14 @@ dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code);
 dotraplinkage void do_segment_not_present(struct pt_regs *regs, long 
error_code);
 dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code);
 #ifdef CONFIG_X86_64
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code);
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, 
unsigned long address);
 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
 asmlinkage __visible notrace
 struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s);
 void __init trap_init(void);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *regs, long 
error_code);
-dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long 
error_code);
+dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long 
error_code, unsigned long address);
 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long 
error_code);
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code);
 dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 82caf01b63dd..3231440d6253 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -242,23 +242,23 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
 NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
 
 dotraplinkage void
-do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned 
long address)
 {
        enum ctx_state prev_state;
 
        switch (kvm_read_and_reset_pf_reason()) {
        default:
-               do_page_fault(regs, error_code);
+               do_page_fault(regs, error_code, address);
                break;
        case KVM_PV_REASON_PAGE_NOT_PRESENT:
                /* page is swapped out by the host. */
                prev_state = exception_enter();
-               kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
+               kvm_async_pf_task_wait((u32)address, !user_mode(regs));
                exception_exit(prev_state);
                break;
        case KVM_PV_REASON_PAGE_READY:
                rcu_irq_enter();
-               kvm_async_pf_task_wake((u32)read_cr2());
+               kvm_async_pf_task_wake((u32)address);
                rcu_irq_exit();
                break;
        }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 87095a477154..4bb0f8447112 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -313,13 +313,10 @@ __visible void __noreturn handle_stack_overflow(const char *message,
 
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, 
unsigned long cr2)
 {
        static const char str[] = "double fault";
        struct task_struct *tsk = current;
-#ifdef CONFIG_VMAP_STACK
-       unsigned long cr2;
-#endif
 
 #ifdef CONFIG_X86_ESPFIX64
        extern unsigned char native_irq_return_iret[];
@@ -415,7 +412,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
         * stack even if the actual trigger for the double fault was
         * something else.
         */
-       cr2 = read_cr2();
        if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
                handle_stack_overflow("kernel stack overflow (double-fault)", 
regs, cr2);
 #endif
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 794f364cb882..0799cc79efd3 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1507,9 +1507,8 @@ good_area:
 NOKPROBE_SYMBOL(do_user_addr_fault);
 
 /*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
+ * Explicitly marked noinline such that the function tracer sees this as the
+ * page_fault entry point.
  */
 static noinline void
 __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
@@ -1528,33 +1527,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
 }
 NOKPROBE_SYMBOL(__do_page_fault);
 
-static nokprobe_inline void
-trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
-                        unsigned long error_code)
+static __always_inline void
+trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
+                        unsigned long address)
 {
+       if (!trace_pagefault_enabled())
+               return;
+
        if (user_mode(regs))
                trace_page_fault_user(address, regs, error_code);
        else
                trace_page_fault_kernel(address, regs, error_code);
 }
 
-/*
- * We must have this function blacklisted from kprobes, tagged with notrace
- * and call read_cr2() before calling anything else. To avoid calling any
- * kind of tracing machinery before we've observed the CR2 value.
- *
- * exception_{enter,exit}() contains all sorts of tracepoints.
- */
-dotraplinkage void notrace
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
+dotraplinkage void
+do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long 
address)
 {
-       unsigned long address = read_cr2(); /* Get the faulting address */
        enum ctx_state prev_state;
 
        prev_state = exception_enter();
-       if (trace_pagefault_enabled())
-               trace_page_fault_entries(address, regs, error_code);
-
+       trace_page_fault_entries(regs, error_code, address);
        __do_page_fault(regs, error_code, address);
        exception_exit(prev_state);
 }

Reply via email to