The TSS is a fairly juicy target for exploits, and, now that the TSS
is in the cpu_entry_area, it's no longer protected by kASLR.  Make it
read-only on x86_64.

On x86_32, it can't be RO because it's written by the CPU during task
switches, and we use a task gate for double faults.  I'd also be
nervous about errata if we tried to make it RO even on configurations
without double fault handling.

Signed-off-by: Andy Lutomirski <l...@kernel.org>
---
 arch/x86/entry/entry_32.S          |  4 ++--
 arch/x86/entry/entry_64.S          |  8 ++++----
 arch/x86/include/asm/fixmap.h      | 13 +++++++++----
 arch/x86/include/asm/processor.h   | 17 ++++++++---------
 arch/x86/include/asm/switch_to.h   |  4 ++--
 arch/x86/include/asm/thread_info.h |  2 +-
 arch/x86/kernel/asm-offsets.c      |  6 ++----
 arch/x86/kernel/asm-offsets_32.c   |  4 ++--
 arch/x86/kernel/cpu/common.c       | 23 ++++++++++++++++-------
 arch/x86/kernel/ioport.c           |  2 +-
 arch/x86/kernel/process.c          |  6 +++---
 arch/x86/kernel/process_32.c       |  2 +-
 arch/x86/kernel/process_64.c       |  2 +-
 arch/x86/kernel/traps.c            |  4 ++--
 arch/x86/lib/delay.c               |  4 ++--
 arch/x86/xen/enlighten_pv.c        |  2 +-
 16 files changed, 57 insertions(+), 46 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3629bcbf85a2..bd8b57a5c874 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -942,7 +942,7 @@ ENTRY(debug)
 
        /* Are we currently on the SYSENTER stack? */
        movl    PER_CPU_VAR(cpu_entry_area), %ecx
-       addl    $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + 
SIZEOF_SYSENTER_stack, %ecx
+       addl    $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
        subl    %eax, %ecx      /* ecx = (end of SYSENTER_stack) - esp */
        cmpl    $SIZEOF_SYSENTER_stack, %ecx
        jb      .Ldebug_from_sysenter_stack
@@ -986,7 +986,7 @@ ENTRY(nmi)
 
        /* Are we currently on the SYSENTER stack? */
        movl    PER_CPU_VAR(cpu_entry_area), %ecx
-       addl    $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + 
SIZEOF_SYSENTER_stack, %ecx
+       addl    $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
        subl    %eax, %ecx      /* ecx = (end of SYSENTER_stack) - esp */
        cmpl    $SIZEOF_SYSENTER_stack, %ecx
        jb      .Lnmi_from_sysenter_stack
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 28f4e7553c26..0b0735030328 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -158,7 +158,7 @@ END(native_usergs_sysret64)
        _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
 
 /* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH    CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \
+#define RSP_SCRATCH    CPU_ENTRY_AREA_SYSENTER_stack + \
                        SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
 
 ENTRY(entry_SYSCALL_64_trampoline)
@@ -394,7 +394,7 @@ syscall_return_via_sysret:
         * Save old stack pointer and switch to trampoline stack.
         */
        movq    %rsp, %rdi
-       movq    PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
+       movq    PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
 
        pushq   RSP-RDI(%rdi)   /* RSP */
        pushq   (%rdi)          /* RDI */
@@ -722,7 +722,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
         * Save old stack pointer and switch to trampoline stack.
         */
        movq    %rsp, %rdi
-       movq    PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
+       movq    PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
 
        /* Copy the IRET frame to the trampoline stack. */
        pushq   6*8(%rdi)       /* SS */
@@ -937,7 +937,7 @@ apicinterrupt IRQ_WORK_VECTOR                       
irq_work_interrupt              smp_irq_work_interrupt
 /*
  * Exception entry points.
  */
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
 
 /*
  * Switch to the thread stack.  This is called with the IRET frame and
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 9a4caed665fd..3c291d0ebfcd 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -56,9 +56,14 @@ struct cpu_entry_area {
        char gdt[PAGE_SIZE];
 
        /*
-        * The GDT is just below cpu_tss and thus serves (on x86_64) as a
-        * a read-only guard page for the SYSENTER stack at the bottom
-        * of the TSS region.
+        * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
+        * a a read-only guard page.
+        */
+       struct SYSENTER_stack_page SYSENTER_stack_page;
+
+       /*
+        * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
+        * we need task switches to work, and task switches write to the TSS.
         */
        struct tss_struct tss;
 
@@ -239,7 +244,7 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int 
cpu)
 
 static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
 {
-       return &get_cpu_entry_area((cpu))->tss.SYSENTER_stack;
+       return &get_cpu_entry_area((cpu))->SYSENTER_stack_page.stack;
 }
 
 extern void setup_cpu_entry_areas(void);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 38fa358fce2d..c9aaa43313d1 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -341,13 +341,11 @@ struct SYSENTER_stack {
        unsigned long           words[64];
 };
 
-struct tss_struct {
-       /*
-        * Space for the temporary SYSENTER stack, used for SYSENTER
-        * and the entry trampoline as well.
-        */
-       struct SYSENTER_stack   SYSENTER_stack;
+struct SYSENTER_stack_page {
+       struct SYSENTER_stack stack;
+} __aligned(PAGE_SIZE);
 
+struct tss_struct {
        /*
         * The fixed hardware portion.  This must not cross a page boundary
         * at risk of violating the SDM's advice and potentially triggering
@@ -364,7 +362,7 @@ struct tss_struct {
        unsigned long           io_bitmap[IO_BITMAP_LONGS + 1];
 } __aligned(PAGE_SIZE);
 
-DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
 
 /*
  * sizeof(unsigned long) coming from an extra "long" at the end
@@ -379,7 +377,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss);
 #ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
 #else
-#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1
+/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
+#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
 #endif
 
 /*
@@ -539,7 +538,7 @@ static inline void native_set_iopl_mask(unsigned mask)
 static inline void
 native_load_sp0(unsigned long sp0)
 {
-       this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+       this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
 }
 
 static inline void native_swapgs(void)
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index b0a1aecb365f..3d529ebce27b 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -79,10 +79,10 @@ do {                                                        
                \
 static inline void refresh_sysenter_cs(struct thread_struct *thread)
 {
        /* Only happens when SEP is enabled, no need to test "SEP"arately: */
-       if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+       if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == 
thread->sysenter_cs))
                return;
 
-       this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+       this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
        wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
 }
 #endif
diff --git a/arch/x86/include/asm/thread_info.h 
b/arch/x86/include/asm/thread_info.h
index 44a04999791e..00223333821a 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * 
const stack,
 #else /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss + TSS_sp1)
+# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
 #endif
 
 #endif
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 46c0995344aa..b8baf3db5a12 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -94,10 +94,8 @@ void common(void) {
        BLANK();
        DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
 
-       OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack);
-       DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
-
-       /* Layout info for cpu_entry_area */
        OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
        OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, 
entry_trampoline);
+       OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, 
SYSENTER_stack_page);
+       DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
 }
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 52ce4ea16e53..7d20d9c0b3d6 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -47,8 +47,8 @@ void foo(void)
        BLANK();
 
        /* Offset from the sysenter stack to tss.sp0 */
-       DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
-              offsetofend(struct tss_struct, SYSENTER_stack));
+       DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, 
tss.x86_tss.sp0) -
+              offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
 
 #ifdef CONFIG_CC_STACKPROTECTOR
        BLANK();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c0f11a684acf..f74645c4cd9a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -518,31 +518,40 @@ static const unsigned int 
exception_stack_sizes[N_EXCEPTION_STACKS] = {
 
 static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
        [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+
 #endif
 
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, 
SYSENTER_stack_storage);
+
 /* Setup the fixmap mappings only once per-processor */
 static void __init setup_cpu_entry_area(int cpu)
 {
 #ifdef CONFIG_X86_64
        extern char _entry_trampoline[];
 
-       /* On 64-bit systems, we use a read-only fixmap GDT. */
+       /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
        pgprot_t gdt_prot = PAGE_KERNEL_RO;
+       pgprot_t tss_prot = PAGE_KERNEL_RO;
 #else
        /*
         * On native 32-bit systems, the GDT cannot be read-only because
         * our double fault handler uses a task gate, and entering through
         * a task gate needs to change an available TSS to busy.  If the GDT
-        * is read-only, that will triple fault.
+        * is read-only, that will triple fault.  The TSS cannot be read-only
+        * because the CPU writes to it on task switches.
         *
         * On Xen PV, the GDT must be read-only because the hypervisor requires
         * it.
         */
        pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
                PAGE_KERNEL_RO : PAGE_KERNEL;
+       pgprot_t tss_prot = PAGE_KERNEL;
 #endif
 
        __set_fixmap(get_cpu_entry_area_index(cpu, gdt), 
get_cpu_gdt_paddr(cpu), gdt_prot);
+       set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, 
SYSENTER_stack_page),
+                               per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
+                               PAGE_KERNEL);
 
        /*
         * The Intel SDM says (Volume 3, 7.2.1):
@@ -565,9 +574,9 @@ static void __init setup_cpu_entry_area(int cpu)
                      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
        BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
        set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
-                               &per_cpu(cpu_tss, cpu),
+                               &per_cpu(cpu_tss_rw, cpu),
                                sizeof(struct tss_struct) / PAGE_SIZE,
-                               PAGE_KERNEL);
+                               tss_prot);
 
 #ifdef CONFIG_X86_32
        per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
@@ -1339,7 +1348,7 @@ void enable_sep_cpu(void)
                return;
 
        cpu = get_cpu();
-       tss = &per_cpu(cpu_tss, cpu);
+       tss = &per_cpu(cpu_tss_rw, cpu);
 
        /*
         * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@@ -1609,7 +1618,7 @@ void cpu_init(void)
        if (cpu)
                load_ucode_ap();
 
-       t = &per_cpu(cpu_tss, cpu);
+       t = &per_cpu(cpu_tss_rw, cpu);
        oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
@@ -1701,7 +1710,7 @@ void cpu_init(void)
 {
        int cpu = smp_processor_id();
        struct task_struct *curr = current;
-       struct tss_struct *t = &per_cpu(cpu_tss, cpu);
+       struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
 
        wait_for_master_cpu(cpu);
 
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 3feb648781c4..2f723301eb58 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long 
num, int turn_on)
         * because the ->io_bitmap_max value must match the bitmap
         * contents:
         */
-       tss = &per_cpu(cpu_tss, get_cpu());
+       tss = &per_cpu(cpu_tss_rw, get_cpu());
 
        if (turn_on)
                bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 298be43d63de..aed9d94bd46f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -47,7 +47,7 @@
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
        .x86_tss = {
                /*
                 * .sp0 is only used when entering ring 0 from a lower
@@ -82,7 +82,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, 
cpu_tss) = {
        .io_bitmap              = { [0 ... IO_BITMAP_LONGS] = ~0 },
 #endif
 };
-EXPORT_PER_CPU_SYMBOL(cpu_tss);
+EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
 
 DEFINE_PER_CPU(bool, __tss_limit_invalid);
 EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@@ -111,7 +111,7 @@ void exit_thread(struct task_struct *tsk)
        struct fpu *fpu = &t->fpu;
 
        if (bp) {
-               struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
+               struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
 
                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 45bf0c5f93e1..5224c6099184 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct 
*next_p)
        struct fpu *prev_fpu = &prev->fpu;
        struct fpu *next_fpu = &next->fpu;
        int cpu = smp_processor_id();
-       struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+       struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
 
        /* never put a printk in __switch_to... printk() calls wake_up*() 
indirectly */
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bafe65b08697..2678b0bc99d9 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -400,7 +400,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct 
*next_p)
        struct fpu *prev_fpu = &prev->fpu;
        struct fpu *next_fpu = &next->fpu;
        int cpu = smp_processor_id();
-       struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+       struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
 
        WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
                     this_cpu_read(irq_count) != -1);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b70aec60ebbd..554f14d87575 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -365,7 +365,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, 
long error_code)
                regs->cs == __KERNEL_CS &&
                regs->ip == (unsigned long)native_irq_return_iret)
        {
-               struct pt_regs *gpregs = (struct pt_regs 
*)this_cpu_read(cpu_tss.x86_tss.sp0) - 1;
+               struct pt_regs *gpregs = (struct pt_regs 
*)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
 
                /*
                 * regs->sp points to the failing IRET frame on the
@@ -655,7 +655,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack 
*s)
         * exception came from the IRET target.
         */
        struct bad_iret_stack *new_stack =
-               (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1;
+               (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) 
- 1;
 
        /* Copy the IRET target to the new stack. */
        memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 553f8fd23cc4..4846eff7e4c8 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops)
                delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
 
                /*
-                * Use cpu_tss as a cacheline-aligned, seldomly
+                * Use cpu_tss_rw as a cacheline-aligned, seldomly
                 * accessed per-cpu variable as the monitor target.
                 */
-               __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
+               __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
 
                /*
                 * AMD, like Intel, supports the EAX hint and EAX=0xf
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 5b2b3f3f6531..b0ac47dd7b4a 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -818,7 +818,7 @@ static void xen_load_sp0(unsigned long sp0)
        mcs = xen_mc_entry(0);
        MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
        xen_mc_issue(PARAVIRT_LAZY_CPU);
-       this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+       this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
 }
 
 void xen_set_iopl_mask(unsigned mask)
-- 
2.13.6

Reply via email to