Commit-ID:  4012e77a903d114f915fc607d6d2ed54a3d6c9b1
Gitweb:     https://git.kernel.org/tip/4012e77a903d114f915fc607d6d2ed54a3d6c9b1
Author:     Andy Lutomirski <l...@kernel.org>
AuthorDate: Wed, 29 Aug 2018 08:47:18 -0700
Committer:  Thomas Gleixner <t...@linutronix.de>
CommitDate: Fri, 31 Aug 2018 17:08:22 +0200

x86/nmi: Fix NMI uaccess race against CR3 switching

A NMI can hit in the middle of context switching or in the middle of
switch_mm_irqs_off().  In either case, CR3 might not match current->mm,
which could cause copy_from_user_nmi() and friends to read the wrong
memory.

Fix it by adding a new nmi_uaccess_okay() helper and checking it in
copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.

Signed-off-by: Andy Lutomirski <l...@kernel.org>
Signed-off-by: Thomas Gleixner <t...@linutronix.de>
Reviewed-by: Rik van Riel <r...@surriel.com>
Cc: Nadav Amit <nadav.a...@gmail.com>
Cc: Borislav Petkov <b...@alien8.de>
Cc: Jann Horn <ja...@google.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: sta...@vger.kernel.org
Link: 
https://lkml.kernel.org/r/dd956eba16646fd0b15c3c0741269dfd84452dac.1535557289.git.l...@kernel.org


---
 arch/x86/events/core.c          |  2 +-
 arch/x86/include/asm/tlbflush.h | 40 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/lib/usercopy.c         |  5 +++++
 arch/x86/mm/tlb.c               |  7 +++++++
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5f4829f10129..dfb2f7c0d019 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx 
*entry, struct pt_regs *regs
 
        perf_callchain_store(entry, regs->ip);
 
-       if (!current->mm)
+       if (!nmi_uaccess_okay())
                return;
 
        if (perf_callchain_user32(regs, entry))
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 29c9da6c62fc..58ce5288878e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -175,8 +175,16 @@ struct tlb_state {
         * are on.  This means that it may not match current->active_mm,
         * which will contain the previous user mm when we're in lazy TLB
         * mode even if we've already switched back to swapper_pg_dir.
+        *
+        * During switch_mm_irqs_off(), loaded_mm will be set to
+        * LOADED_MM_SWITCHING during the brief interrupts-off window
+        * when CR3 and loaded_mm would otherwise be inconsistent.  This
+        * is for nmi_uaccess_okay()'s benefit.
         */
        struct mm_struct *loaded_mm;
+
+#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
+
        u16 loaded_mm_asid;
        u16 next_asid;
        /* last user mm's ctx id */
@@ -246,6 +254,38 @@ struct tlb_state {
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm.  It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+static inline bool nmi_uaccess_okay(void)
+{
+       struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+       struct mm_struct *current_mm = current->mm;
+
+       VM_WARN_ON_ONCE(!loaded_mm);
+
+       /*
+        * The condition we want to check is
+        * current_mm->pgd == __va(read_cr3_pa()).  This may be slow, though,
+        * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+        * is supposed to be reasonably fast.
+        *
+        * Instead, we check the almost equivalent but somewhat conservative
+        * condition below, and we rely on the fact that switch_mm_irqs_off()
+        * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+        */
+       if (loaded_mm != current_mm)
+               return false;
+
+       VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+
+       return true;
+}
+
 /* Initialize cr4 shadow for this CPU. */
 static inline void cr4_init_shadow(void)
 {
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index c8c6ad0d58b8..3f435d7fca5e 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -7,6 +7,8 @@
 #include <linux/uaccess.h>
 #include <linux/export.h>
 
+#include <asm/tlbflush.h>
+
 /*
  * We rely on the nested NMI work to allow atomic faults from the NMI path; the
  * nested NMI paths are careful to preserve CR2.
@@ -19,6 +21,9 @@ copy_from_user_nmi(void *to, const void __user *from, 
unsigned long n)
        if (__range_not_ok(from, n, TASK_SIZE))
                return n;
 
+       if (!nmi_uaccess_okay())
+               return n;
+
        /*
         * Even though this function is typically called from NMI/IRQ context
         * disable pagefaults so that its behaviour is consistent even when
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 9517d1b2a281..e96b99eb800c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -305,6 +305,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
 
                choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
+               /* Let nmi_uaccess_okay() know that we're changing CR3. */
+               this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+               barrier();
+
                if (need_flush) {
                        this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, 
next->context.ctx_id);
                        this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, 
next_tlb_gen);
@@ -335,6 +339,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
                if (next != &init_mm)
                        this_cpu_write(cpu_tlbstate.last_ctx_id, 
next->context.ctx_id);
 
+               /* Make sure we write CR3 before loaded_mm. */
+               barrier();
+
                this_cpu_write(cpu_tlbstate.loaded_mm, next);
                this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
        }

Reply via email to