Linus,

Please pull the latest x86-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86-urgent-for-linus

   # HEAD: 1f161f67a272cc4f29f27934dd3f74cb657eb5c4 x86/microcode: Do the family check first

A laundry list of fixes:

 - fix reboot breakage on some PCID-enabled systems
 - fix crashes/hangs on some PCID-enabled systems
 - fix microcode loading on certain older CPUs
 - various unwinder fixes
 - extend an APIC quirk to more hardware systems and disable an APIC-related
   warning on virtualized systems
 - various Hyper-V fixes
 - a macro definition robustness fix (see the short illustration after this list)
 - remove jprobes IRQ disabling
 - various mem-encryption fixes
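
Side note on the alt_max_short() change in this batch: it relies on the
standard branchless max() identity, where the result of a comparison is
turned into an all-zeros or all-ones mask.  A minimal, standalone C sketch
of that identity (illustration only, not part of the patch; the helper
name is made up):

  #include <assert.h>

  /* Branchless max(): the mask is all-ones when a < b, all-zeros otherwise. */
  static unsigned int branchless_max(unsigned int a, unsigned int b)
  {
          /*
           * In C, (a < b) evaluates to 0 or 1, so a single negation builds
           * the mask.  gas evaluates "true" as -1, which is why the kernel
           * macro carries the additional "-" (see the alternative.h hunk
           * below).
           */
          return a ^ ((a ^ b) & -(unsigned int)(a < b));
  }

  int main(void)
  {
          assert(branchless_max(3, 7) == 7);
          assert(branchless_max(7, 3) == 7);
          assert(branchless_max(5, 5) == 5);
          return 0;
  }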

 Thanks,

        Ingo

------------------>
Andy Lutomirski (2):
      x86/mm/64: Fix reboot interaction with CR4.PCIDE
      x86/mm: Flush more aggressively in lazy TLB mode

Borislav Petkov (1):
      x86/microcode: Do the family check first

Josh Poimboeuf (5):
      kprobes/x86: Set up frame pointer in kprobe trampoline
      x86/unwind: Fix dereference of untrusted pointer
      x86/unwind: Use MSB for frame pointer encoding on 32-bit
      x86/unwind: Align stack pointer in unwinder dump
      x86/unwind: Disable unwinder warnings on 32-bit

Len Brown (1):
      x86/apic: Update TSC_DEADLINE quirk with additional SKX stepping

Marcelo Henrique Cerri (1):
      x86/hyperv: Fix hypercalls with extended CPU ranges for TLB flushing

Masami Hiramatsu (1):
      kprobes/x86: Remove IRQ disabling from jprobe handlers

Mathias Krause (1):
      x86/alternatives: Fix alt_max_short macro to really be a max()

Paolo Bonzini (1):
      x86/apic: Silence "FW_BUG TSC_DEADLINE disabled due to Errata" on hypervisors

Tom Lendacky (1):
      x86/mm: Disable various instrumentations of mm/mem_encrypt.c and mm/tlb.c

Vitaly Kuznetsov (2):
      x86/hyperv: Clear vCPU banks between calls to avoid flushing unneeded vCPUs
      x86/hyperv: Don't use percpu areas for pcpu_flush/pcpu_flush_ex structures


 arch/x86/entry/entry_32.S              |   4 +-
 arch/x86/hyperv/hv_init.c              |   5 ++
 arch/x86/hyperv/mmu.c                  |  57 +++++++++---
 arch/x86/include/asm/alternative-asm.h |   4 +-
 arch/x86/include/asm/alternative.h     |   6 +-
 arch/x86/include/asm/mmu_context.h     |   8 +-
 arch/x86/include/asm/mshyperv.h        |   1 +
 arch/x86/include/asm/tlbflush.h        |  24 ++++++
 arch/x86/kernel/apic/apic.c            |  15 +++-
 arch/x86/kernel/cpu/microcode/core.c   |  27 ++++--
 arch/x86/kernel/kprobes/common.h       |  13 ++-
 arch/x86/kernel/kprobes/core.c         |   2 -
 arch/x86/kernel/reboot.c               |   4 +
 arch/x86/kernel/unwind_frame.c         |  38 +++++++-
 arch/x86/mm/Makefile                   |  11 ++-
 arch/x86/mm/tlb.c                      | 153 ++++++++++++++++++++++++---------
 16 files changed, 284 insertions(+), 88 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 8a13d468635a..50e0d2bc4528 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -176,7 +176,7 @@
 /*
  * This is a sneaky trick to help the unwinder find pt_regs on the stack.  The
  * frame pointer is replaced with an encoded pointer to pt_regs.  The encoding
- * is just setting the LSB, which makes it an invalid stack address and is also
+ * is just clearing the MSB, which makes it an invalid stack address and is also
  * a signal to the unwinder that it's a pt_regs pointer in disguise.
  *
  * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
@@ -185,7 +185,7 @@
 .macro ENCODE_FRAME_POINTER
 #ifdef CONFIG_FRAME_POINTER
        mov %esp, %ebp
-       orl $0x1, %ebp
+       andl $0x7fffffff, %ebp
 #endif
 .endm
 
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 1a8eb550c40f..a5db63f728a2 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -85,6 +85,8 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
 u32 *hv_vp_index;
 EXPORT_SYMBOL_GPL(hv_vp_index);
 
+u32 hv_max_vp_index;
+
 static int hv_cpu_init(unsigned int cpu)
 {
        u64 msr_vp_index;
@@ -93,6 +95,9 @@ static int hv_cpu_init(unsigned int cpu)
 
        hv_vp_index[smp_processor_id()] = msr_vp_index;
 
+       if (msr_vp_index > hv_max_vp_index)
+               hv_max_vp_index = msr_vp_index;
+
        return 0;
 }
 
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 39e7f6e50919..9cc9e1c1e2db 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -36,9 +36,9 @@ struct hv_flush_pcpu_ex {
 /* Each gva in gva_list encodes up to 4096 pages to flush */
 #define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
 
-static struct hv_flush_pcpu __percpu *pcpu_flush;
+static struct hv_flush_pcpu __percpu **pcpu_flush;
 
-static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex;
+static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex;
 
 /*
  * Fills in gva_list starting from offset. Returns the number of items added.
@@ -76,6 +76,18 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
 {
        int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
 
+       /* valid_bank_mask can represent up to 64 banks */
+       if (hv_max_vp_index / 64 >= 64)
+               return 0;
+
+       /*
+        * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex
+        * structs are not cleared between calls, we risk flushing unneeded
+        * vCPUs otherwise.
+        */
+       for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
+               flush->hv_vp_set.bank_contents[vcpu_bank] = 0;
+
        /*
         * Some banks may end up being empty but this is acceptable.
         */
@@ -83,11 +95,6 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush,
                vcpu = hv_cpu_number_to_vp_number(cpu);
                vcpu_bank = vcpu / 64;
                vcpu_offset = vcpu % 64;
-
-               /* valid_bank_mask can represent up to 64 banks */
-               if (vcpu_bank >= 64)
-                       return 0;
-
                __set_bit(vcpu_offset, (unsigned long *)
                          &flush->hv_vp_set.bank_contents[vcpu_bank]);
                if (vcpu_bank >= nr_bank)
@@ -102,6 +109,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
                                    const struct flush_tlb_info *info)
 {
        int cpu, vcpu, gva_n, max_gvas;
+       struct hv_flush_pcpu **flush_pcpu;
        struct hv_flush_pcpu *flush;
        u64 status = U64_MAX;
        unsigned long flags;
@@ -116,7 +124,17 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
 
        local_irq_save(flags);
 
-       flush = this_cpu_ptr(pcpu_flush);
+       flush_pcpu = this_cpu_ptr(pcpu_flush);
+
+       if (unlikely(!*flush_pcpu))
+               *flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+
+       flush = *flush_pcpu;
+
+       if (unlikely(!flush)) {
+               local_irq_restore(flags);
+               goto do_native;
+       }
 
        if (info->mm) {
                flush->address_space = virt_to_phys(info->mm->pgd);
@@ -173,6 +191,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
                                       const struct flush_tlb_info *info)
 {
        int nr_bank = 0, max_gvas, gva_n;
+       struct hv_flush_pcpu_ex **flush_pcpu;
        struct hv_flush_pcpu_ex *flush;
        u64 status = U64_MAX;
        unsigned long flags;
@@ -187,7 +206,17 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
 
        local_irq_save(flags);
 
-       flush = this_cpu_ptr(pcpu_flush_ex);
+       flush_pcpu = this_cpu_ptr(pcpu_flush_ex);
+
+       if (unlikely(!*flush_pcpu))
+               *flush_pcpu = page_address(alloc_page(GFP_ATOMIC));
+
+       flush = *flush_pcpu;
+
+       if (unlikely(!flush)) {
+               local_irq_restore(flags);
+               goto do_native;
+       }
 
        if (info->mm) {
                flush->address_space = virt_to_phys(info->mm->pgd);
@@ -222,18 +251,18 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
                flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
                status = hv_do_rep_hypercall(
                        HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
-                       0, nr_bank + 2, flush, NULL);
+                       0, nr_bank, flush, NULL);
        } else if (info->end &&
                   ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
                status = hv_do_rep_hypercall(
                        HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
-                       0, nr_bank + 2, flush, NULL);
+                       0, nr_bank, flush, NULL);
        } else {
                gva_n = fill_gva_list(flush->gva_list, nr_bank,
                                      info->start, info->end);
                status = hv_do_rep_hypercall(
                        HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
-                       gva_n, nr_bank + 2, flush, NULL);
+                       gva_n, nr_bank, flush, NULL);
        }
 
        local_irq_restore(flags);
@@ -266,7 +295,7 @@ void hyper_alloc_mmu(void)
                return;
 
        if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
-               pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+               pcpu_flush = alloc_percpu(struct hv_flush_pcpu *);
        else
-               pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+               pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *);
 }
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index e7636bac7372..6c98821fef5e 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -62,8 +62,10 @@
 #define new_len2               145f-144f
 
 /*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
  * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas uses a "true" value of -1.
  */
 #define alt_max_short(a, b)    ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
 
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index c096624137ae..ccbe24e697c4 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -103,12 +103,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
        alt_end_marker ":\n"
 
 /*
- * max without conditionals. Idea adapted from:
+ * gas compatible max based on the idea from:
  * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
  *
- * The additional "-" is needed because gas works with s32s.
+ * The additional "-" is needed because gas uses a "true" value of -1.
  */
-#define alt_max_short(a, b)    "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
+#define alt_max_short(a, b)    "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))"
 
 /*
  * Pad the second replacement alternative with additional NOPs if it is
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index c120b5db178a..3c856a15b98e 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -126,13 +126,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
        DEBUG_LOCKS_WARN_ON(preemptible());
 }
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-       int cpu = smp_processor_id();
-
-       if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
-               cpumask_clear_cpu(cpu, mm_cpumask(mm));
-}
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
 
 static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 738503e1f80c..530f448fddaf 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -289,6 +289,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
  * to this information.
  */
 extern u32 *hv_vp_index;
+extern u32 hv_max_vp_index;
 
 /**
  * hv_cpu_number_to_vp_number() - Map CPU to VP.
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4893abf7f74f..d362161d3291 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -83,6 +83,13 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 #endif
 
 /*
+ * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point
+ * to init_mm when we switch to a kernel thread (e.g. the idle thread).  If
+ * it's false, then we immediately switch CR3 when entering a kernel thread.
+ */
+DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+
+/*
  * 6 because 6 should be plenty and struct tlb_state will fit in
  * two cache lines.
  */
@@ -105,6 +112,23 @@ struct tlb_state {
        u16 next_asid;
 
        /*
+        * We can be in one of several states:
+        *
+        *  - Actively using an mm.  Our CPU's bit will be set in
+        *    mm_cpumask(loaded_mm) and is_lazy == false;
+        *
+        *  - Not using a real mm.  loaded_mm == &init_mm.  Our CPU's bit
+        *    will not be set in mm_cpumask(&init_mm) and is_lazy == false.
+        *
+        *  - Lazily using a real mm.  loaded_mm != &init_mm, our bit
+        *    is set in mm_cpumask(loaded_mm), but is_lazy == true.
+        *    We're heuristically guessing that the CR3 load we
+        *    skipped more than makes up for the overhead added by
+        *    lazy mode.
+        */
+       bool is_lazy;
+
+       /*
         * Access to this CR4 shadow and to H/W CR4 is protected by
         * disabling interrupts when modifying either one.
         */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d705c769f77d..ff891772c9f8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -573,11 +573,21 @@ static u32 bdx_deadline_rev(void)
        return ~0U;
 }
 
+static u32 skx_deadline_rev(void)
+{
+       switch (boot_cpu_data.x86_mask) {
+       case 0x03: return 0x01000136;
+       case 0x04: return 0x02000014;
+       }
+
+       return ~0U;
+}
+
 static const struct x86_cpu_id deadline_match[] = {
        DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X,        hsx_deadline_rev),
        DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X,      0x0b000020),
        DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev),
-       DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X,        0x02000014),
+       DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X,        skx_deadline_rev),
 
        DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE,     0x22),
        DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT,      0x20),
@@ -600,7 +610,8 @@ static void apic_check_deadline_errata(void)
        const struct x86_cpu_id *m;
        u32 rev;
 
-       if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+       if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
+           boot_cpu_has(X86_FEATURE_HYPERVISOR))
                return;
 
        m = x86_match_cpu(deadline_match);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 86e8f0b2537b..c4fa4a85d4cb 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -122,9 +122,6 @@ static bool __init check_loader_disabled_bsp(void)
        bool *res = &dis_ucode_ldr;
 #endif
 
-       if (!have_cpuid_p())
-               return *res;
-
        /*
         * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
         * completely accurate as xen pv guests don't see that CPUID bit set but
@@ -166,24 +163,36 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name)
 void __init load_ucode_bsp(void)
 {
        unsigned int cpuid_1_eax;
+       bool intel = true;
 
-       if (check_loader_disabled_bsp())
+       if (!have_cpuid_p())
                return;
 
        cpuid_1_eax = native_cpuid_eax(1);
 
        switch (x86_cpuid_vendor()) {
        case X86_VENDOR_INTEL:
-               if (x86_family(cpuid_1_eax) >= 6)
-                       load_ucode_intel_bsp();
+               if (x86_family(cpuid_1_eax) < 6)
+                       return;
                break;
+
        case X86_VENDOR_AMD:
-               if (x86_family(cpuid_1_eax) >= 0x10)
-                       load_ucode_amd_bsp(cpuid_1_eax);
+               if (x86_family(cpuid_1_eax) < 0x10)
+                       return;
+               intel = false;
                break;
+
        default:
-               break;
+               return;
        }
+
+       if (check_loader_disabled_bsp())
+               return;
+
+       if (intel)
+               load_ucode_intel_bsp();
+       else
+               load_ucode_amd_bsp(cpuid_1_eax);
 }
 
 static bool check_loader_disabled_ap(void)
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index db2182d63ed0..3fc0f9a794cb 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -3,6 +3,15 @@
 
 /* Kprobes and Optprobes common header */
 
+#include <asm/asm.h>
+
+#ifdef CONFIG_FRAME_POINTER
+# define SAVE_RBP_STRING "     push %" _ASM_BP "\n" \
+                        "      mov  %" _ASM_SP ", %" _ASM_BP "\n"
+#else
+# define SAVE_RBP_STRING "     push %" _ASM_BP "\n"
+#endif
+
 #ifdef CONFIG_X86_64
 #define SAVE_REGS_STRING                       \
        /* Skip cs, ip, orig_ax. */             \
@@ -17,7 +26,7 @@
        "       pushq %r10\n"                   \
        "       pushq %r11\n"                   \
        "       pushq %rbx\n"                   \
-       "       pushq %rbp\n"                   \
+       SAVE_RBP_STRING                         \
        "       pushq %r12\n"                   \
        "       pushq %r13\n"                   \
        "       pushq %r14\n"                   \
@@ -48,7 +57,7 @@
        "       pushl %es\n"                    \
        "       pushl %ds\n"                    \
        "       pushl %eax\n"                   \
-       "       pushl %ebp\n"                   \
+       SAVE_RBP_STRING                         \
        "       pushl %edi\n"                   \
        "       pushl %esi\n"                   \
        "       pushl %edx\n"                   \
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index f0153714ddac..0742491cbb73 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -1080,8 +1080,6 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
         * raw stack chunk with redzones:
         */
        __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr));
-       regs->flags &= ~X86_EFLAGS_IF;
-       trace_hardirqs_off();
        regs->ip = (unsigned long)(jp->entry);
 
        /*
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 54180fa6f66f..add33f600531 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -105,6 +105,10 @@ void __noreturn machine_real_restart(unsigned int type)
        load_cr3(initial_page_table);
 #else
        write_cr3(real_mode_header->trampoline_pgd);
+
+       /* Exiting long mode will fail if CR4.PCIDE is set. */
+       if (static_cpu_has(X86_FEATURE_PCID))
+               cr4_clear_bits(X86_CR4_PCIDE);
 #endif
 
        /* Jump to the identity-mapped low memory code */
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index d145a0b1f529..3dc26f95d46e 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -44,7 +44,8 @@ static void unwind_dump(struct unwind_state *state)
                        state->stack_info.type, state->stack_info.next_sp,
                        state->stack_mask, state->graph_idx);
 
-       for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+       for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp;
+            sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
                if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
                        break;
 
@@ -174,6 +175,7 @@ static bool is_last_task_frame(struct unwind_state *state)
  * This determines if the frame pointer actually contains an encoded pointer to
  * pt_regs on the stack.  See ENCODE_FRAME_POINTER.
  */
+#ifdef CONFIG_X86_64
 static struct pt_regs *decode_frame_pointer(unsigned long *bp)
 {
        unsigned long regs = (unsigned long)bp;
@@ -183,6 +185,23 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp)
 
        return (struct pt_regs *)(regs & ~0x1);
 }
+#else
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+       unsigned long regs = (unsigned long)bp;
+
+       if (regs & 0x80000000)
+               return NULL;
+
+       return (struct pt_regs *)(regs | 0x80000000);
+}
+#endif
+
+#ifdef CONFIG_X86_32
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long))
+#else
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs))
+#endif
 
 static bool update_stack_state(struct unwind_state *state,
                               unsigned long *next_bp)
@@ -202,7 +221,7 @@ static bool update_stack_state(struct unwind_state *state,
        regs = decode_frame_pointer(next_bp);
        if (regs) {
                frame = (unsigned long *)regs;
-               len = regs_size(regs);
+               len = KERNEL_REGS_SIZE;
                state->got_irq = true;
        } else {
                frame = next_bp;
@@ -226,6 +245,14 @@ static bool update_stack_state(struct unwind_state *state,
            frame < prev_frame_end)
                return false;
 
+       /*
+        * On 32-bit with user mode regs, make sure the last two regs are safe
+        * to access:
+        */
+       if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) &&
+           !on_stack(info, frame, len + 2*sizeof(long)))
+               return false;
+
        /* Move state to the next frame: */
        if (regs) {
                state->regs = regs;
@@ -328,6 +355,13 @@ bool unwind_next_frame(struct unwind_state *state)
            state->regs->sp < (unsigned long)task_pt_regs(state->task))
                goto the_end;
 
+       /*
+        * There are some known frame pointer issues on 32-bit.  Disable
+        * unwinder warnings on 32-bit until it gets objtool support.
+        */
+       if (IS_ENABLED(CONFIG_X86_32))
+               goto the_end;
+
        if (state->regs) {
                printk_deferred_once(KERN_WARNING
                        "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' 
value %p\n",
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 72bf8c01c6e3..e1f095884386 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,12 @@
-# Kernel does not boot with instrumentation of tlb.c.
-KCOV_INSTRUMENT_tlb.o  := n
+# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c
+KCOV_INSTRUMENT_tlb.o          := n
+KCOV_INSTRUMENT_mem_encrypt.o  := n
+
+KASAN_SANITIZE_mem_encrypt.o   := n
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_mem_encrypt.o    = -pg
+endif
 
 obj-y  :=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
            pat.o pgtable.o physaddr.o setup_nx.o tlb.o
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 49d9778376d7..658bf0090565 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,8 @@
 
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
+DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+
 static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
                            u16 *new_asid, bool *need_flush)
 {
@@ -80,7 +82,7 @@ void leave_mm(int cpu)
                return;
 
        /* Warn if we're not lazy. */
-       WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
+       WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
 
        switch_mm(NULL, &init_mm, NULL);
 }
@@ -142,45 +144,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                __flush_tlb_all();
        }
 #endif
+       this_cpu_write(cpu_tlbstate.is_lazy, false);
 
        if (real_prev == next) {
                VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
                          next->context.ctx_id);
 
-               if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
-                       /*
-                        * There's nothing to do: we weren't lazy, and we
-                        * aren't changing our mm.  We don't need to flush
-                        * anything, nor do we need to update CR3, CR4, or
-                        * LDTR.
-                        */
-                       return;
-               }
-
-               /* Resume remote flushes and then read tlb_gen. */
-               cpumask_set_cpu(cpu, mm_cpumask(next));
-               next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-
-               if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
-                   next_tlb_gen) {
-                       /*
-                        * Ideally, we'd have a flush_tlb() variant that
-                        * takes the known CR3 value as input.  This would
-                        * be faster on Xen PV and on hypothetical CPUs
-                        * on which INVPCID is fast.
-                        */
-                       this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
-                                      next_tlb_gen);
-                       write_cr3(build_cr3(next, prev_asid));
-                       trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
-                                       TLB_FLUSH_ALL);
-               }
-
                /*
-                * We just exited lazy mode, which means that CR4 and/or LDTR
-                * may be stale.  (Changes to the required CR4 and LDTR states
-                * are not reflected in tlb_gen.)
+                * We don't currently support having a real mm loaded without
+                * our cpu set in mm_cpumask().  We have all the bookkeeping
+                * in place to figure out whether we would need to flush
+                * if our cpu were cleared in mm_cpumask(), but we don't
+                * currently use it.
                 */
+               if (WARN_ON_ONCE(real_prev != &init_mm &&
+                                !cpumask_test_cpu(cpu, mm_cpumask(next))))
+                       cpumask_set_cpu(cpu, mm_cpumask(next));
+
+               return;
        } else {
                u16 new_asid;
                bool need_flush;
@@ -199,10 +180,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                }
 
                /* Stop remote flushes for the previous mm */
-               if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
-                       cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
-
-               VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+               VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+                               real_prev != &init_mm);
+               cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
 
                /*
                 * Start remote flushes and then read tlb_gen.
@@ -233,6 +213,37 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 }
 
 /*
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm.  Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row.  It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ */
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+       if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+               return;
+
+       if (static_branch_unlikely(&tlb_use_lazy_mode)) {
+               /*
+                * There's a significant optimization that may be possible
+                * here.  We have accurate enough TLB flush tracking that we
+                * don't need to maintain coherence of TLB per se when we're
+                * lazy.  We do, however, need to maintain coherence of
+                * paging-structure caches.  We could, in principle, leave our
+                * old mm loaded and only switch to init_mm when
+                * tlb_remove_page() happens.
+                */
+               this_cpu_write(cpu_tlbstate.is_lazy, true);
+       } else {
+               switch_mm(NULL, &init_mm, NULL);
+       }
+}
+
+/*
  * Call this when reinitializing a CPU.  It fixes the following potential
  * problems:
  *
@@ -303,16 +314,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
        /* This code cannot presently handle being reentered. */
        VM_WARN_ON(!irqs_disabled());
 
+       if (unlikely(loaded_mm == &init_mm))
+               return;
+
        VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
                   loaded_mm->context.ctx_id);
 
-       if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+       if (this_cpu_read(cpu_tlbstate.is_lazy)) {
                /*
-                * We're in lazy mode -- don't flush.  We can get here on
-                * remote flushes due to races and on local flushes if a
-                * kernel thread coincidentally flushes the mm it's lazily
-                * still using.
+                * We're in lazy mode.  We need to at least flush our
+                * paging-structure cache to avoid speculatively reading
+                * garbage into our TLB.  Since switching to init_mm is barely
+                * slower than a minimal flush, just switch to init_mm.
                 */
+               switch_mm_irqs_off(NULL, &init_mm, NULL);
                return;
        }
 
@@ -611,3 +626,57 @@ static int __init create_tlb_single_page_flush_ceiling(void)
        return 0;
 }
 late_initcall(create_tlb_single_page_flush_ceiling);
+
+static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf,
+                                size_t count, loff_t *ppos)
+{
+       char buf[2];
+
+       buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0';
+       buf[1] = '\n';
+
+       return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t tlblazy_write_file(struct file *file,
+                const char __user *user_buf, size_t count, loff_t *ppos)
+{
+       bool val;
+
+       if (kstrtobool_from_user(user_buf, count, &val))
+               return -EINVAL;
+
+       if (val)
+               static_branch_enable(&tlb_use_lazy_mode);
+       else
+               static_branch_disable(&tlb_use_lazy_mode);
+
+       return count;
+}
+
+static const struct file_operations fops_tlblazy = {
+       .read = tlblazy_read_file,
+       .write = tlblazy_write_file,
+       .llseek = default_llseek,
+};
+
+static int __init init_tlb_use_lazy_mode(void)
+{
+       if (boot_cpu_has(X86_FEATURE_PCID)) {
+               /*
+                * Heuristic: with PCID on, switching to and from
+                * init_mm is reasonably fast, but remote flush IPIs
+                * as expensive as ever, so turn off lazy TLB mode.
+                *
+                * We can't do this in setup_pcid() because static keys
+                * haven't been initialized yet, and it would blow up
+                * badly.
+                */
+               static_branch_disable(&tlb_use_lazy_mode);
+       }
+
+       debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR,
+                           arch_debugfs_dir, NULL, &fops_tlblazy);
+       return 0;
+}
+late_initcall(init_tlb_use_lazy_mode);
