[tip: x86/misc] selftests/x86: Add a missing .note.GNU-stack section to thunks_32.S

2021-03-18 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/misc branch of tip:

Commit-ID: f706bb59204ba1c47e896b456c97977fc97b7964
Gitweb:
https://git.kernel.org/tip/f706bb59204ba1c47e896b456c97977fc97b7964
Author: Andy Lutomirski
AuthorDate: Thu, 04 Mar 2021 09:01:55 -08:00
Committer: Borislav Petkov 
CommitterDate: Thu, 18 Mar 2021 11:05:14 +01:00

selftests/x86: Add a missing .note.GNU-stack section to thunks_32.S

test_syscall_vdso_32 ended up with an executable stack because the asm
was missing the annotation that says that it is modern and doesn't need
an executable stack. Add the annotation.

This was missed in commit aeaaf005da1d ("selftests/x86: Add missing
.note.GNU-stack sections").

Fixes: aeaaf005da1d ("selftests/x86: Add missing .note.GNU-stack sections")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/487ed5348a43c031b816fa7e9efedb75dc324299.1614877299.git.l...@kernel.org
---
 tools/testing/selftests/x86/thunks_32.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/testing/selftests/x86/thunks_32.S 
b/tools/testing/selftests/x86/thunks_32.S
index a71d92d..f3f56e6 100644
--- a/tools/testing/selftests/x86/thunks_32.S
+++ b/tools/testing/selftests/x86/thunks_32.S
@@ -45,3 +45,5 @@ call64_from_32:
ret
 
 .size call64_from_32, .-call64_from_32
+
+.section .note.GNU-stack,"",%progbits


[tip: x86/core] x86/stackprotector/32: Make the canary into a regular percpu variable

2021-03-08 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/core branch of tip:

Commit-ID: 3fb0fdb3bbe7aed495109b3296b06c2409734023
Gitweb:
https://git.kernel.org/tip/3fb0fdb3bbe7aed495109b3296b06c2409734023
Author: Andy Lutomirski
AuthorDate: Sat, 13 Feb 2021 11:19:44 -08:00
Committer: Borislav Petkov 
CommitterDate: Mon, 08 Mar 2021 13:19:05 +01:00

x86/stackprotector/32: Make the canary into a regular percpu variable

On 32-bit kernels, the stackprotector canary is quite nasty -- it is
stored at %gs:(20), which is nasty because 32-bit kernels use %fs for
percpu storage.  It's even nastier because it means that whether %gs
contains userspace state or kernel state while running kernel code
depends on whether stackprotector is enabled (this is
CONFIG_X86_32_LAZY_GS), and this setting radically changes the way
that segment selectors work.  Supporting both variants is a
maintenance and testing mess.

Merely rearranging so that percpu and the stack canary
share the same segment would be messy as the 32-bit percpu address
layout isn't currently compatible with putting a variable at a fixed
offset.

Fortunately, GCC 8.1 added options that allow the stack canary to be
accessed as %fs:__stack_chk_guard, effectively turning it into an ordinary
percpu variable.  This lets us get rid of all of the code to manage the
stack canary GDT descriptor and the CONFIG_X86_32_LAZY_GS mess.

(That name is special.  We could use any symbol we want for the
 %fs-relative mode, but for CONFIG_SMP=n, gcc refuses to let us use any
 name other than __stack_chk_guard.)

Forcibly disable stackprotector on older compilers that don't support
the new options and turn the stack canary into a percpu variable. The
"lazy GS" approach is now used for all 32-bit configurations.

Also make load_gs_index() work on 32-bit kernels. On 64-bit kernels,
it loads the GS selector and updates the user GSBASE accordingly. (This
is unchanged.) On 32-bit kernels, it loads the GS selector and updates
GSBASE, which is now always the user base. This means that the overall
effect is the same on 32-bit and 64-bit, which avoids some ifdeffery.

 [ bp: Massage commit message. ]

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/c0ff7dba14041c7e5d1cae5d4df052f03759bef3.1613243844.git.l...@kernel.org
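
To make the end state concrete, here is a minimal sketch (not the actual
patch hunks; the declarations are assumed to mirror what the patch adds) of
the canary as an ordinary percpu variable once GCC emits
%fs:__stack_chk_guard accesses for it:

#include <linux/percpu-defs.h>
#include <linux/random.h>

/* The canary is now just a percpu variable; no GDT descriptor games. */
DECLARE_PER_CPU(unsigned long, __stack_chk_guard);

/* Illustrative only: refresh the canary for the current CPU. */
static inline void stack_canary_sketch(void)
{
	this_cpu_write(__stack_chk_guard, get_random_canary());
}
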
---
 arch/x86/Kconfig  |  7 +--
 arch/x86/Makefile |  8 ++-
 arch/x86/entry/entry_32.S | 56 +---
 arch/x86/include/asm/processor.h  | 15 +---
 arch/x86/include/asm/ptrace.h |  5 +-
 arch/x86/include/asm/segment.h| 30 ++--
 arch/x86/include/asm/stackprotector.h | 79 --
 arch/x86/include/asm/suspend_32.h |  6 +--
 arch/x86/kernel/asm-offsets_32.c  |  5 +-
 arch/x86/kernel/cpu/common.c  |  5 +-
 arch/x86/kernel/doublefault_32.c  |  4 +-
 arch/x86/kernel/head_32.S | 18 +-
 arch/x86/kernel/setup_percpu.c|  1 +-
 arch/x86/kernel/tls.c |  8 +--
 arch/x86/lib/insn-eval.c  |  4 +-
 arch/x86/platform/pvh/head.S  | 14 +
 arch/x86/power/cpu.c  |  6 +--
 arch/x86/xen/enlighten_pv.c   |  1 +-
 scripts/gcc-x86_32-has-stack-protector.sh |  6 +-
 19 files changed, 60 insertions(+), 218 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2792879..10cc619 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -360,10 +360,6 @@ config X86_64_SMP
def_bool y
depends on X86_64 && SMP
 
-config X86_32_LAZY_GS
-   def_bool y
-   depends on X86_32 && !STACKPROTECTOR
-
 config ARCH_SUPPORTS_UPROBES
def_bool y
 
@@ -386,7 +382,8 @@ config CC_HAS_SANE_STACKPROTECTOR
default $(success,$(srctree)/scripts/gcc-x86_32-has-stack-protector.sh 
$(CC))
help
   We have to make sure stack protector is unconditionally disabled if
-  the compiler produces broken code.
+  the compiler produces broken code or if it does not let us control
+  the segment on 32-bit kernels.
 
 menu "Processor type and features"
 
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2d6d5a2..952f534 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -79,6 +79,14 @@ ifeq ($(CONFIG_X86_32),y)
 
 # temporary until string.h is fixed
 KBUILD_CFLAGS += -ffreestanding
+
+   ifeq ($(CONFIG_STACKPROTECTOR),y)
+   ifeq ($(CONFIG_SMP),y)
+   KBUILD_CFLAGS += -mstack-protector-guard-reg=fs 
-mstack-protector-guard-symbol=__stack_chk_guard
+   else
+   KBUILD_CFLAGS += -mstack-protector-guard=global
+   endif
+   endif
 else
 BITS := 64
 UTS_MACHINE := x86_64
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index df8c017..eb0cb66 100644
--- 

[tip: x86/core] x86/entry/32: Remove leftover macros after stackprotector cleanups

2021-03-08 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/core branch of tip:

Commit-ID: d0962f2b24c99889a386f0658c71535f56358f77
Gitweb:
https://git.kernel.org/tip/d0962f2b24c99889a386f0658c71535f56358f77
Author: Andy Lutomirski
AuthorDate: Sat, 13 Feb 2021 11:19:45 -08:00
Committer: Borislav Petkov 
CommitterDate: Mon, 08 Mar 2021 13:27:31 +01:00

x86/entry/32: Remove leftover macros after stackprotector cleanups

Now that nonlazy-GS mode is gone, remove the macros from entry_32.S
that obfuscated^Wabstracted GS handling.  The assembled output is
identical before and after this patch.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/b1543116f0f0e68f1763d90d5f7fcec27885dff5.1613243844.git.l...@kernel.org
---
 arch/x86/entry/entry_32.S | 43 +-
 1 file changed, 2 insertions(+), 41 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index eb0cb66..bee9101 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -53,35 +53,6 @@
 
 #define PTI_SWITCH_MASK (1 << PAGE_SHIFT)
 
-/*
- * User gs save/restore
- *
- * This is leftover junk from CONFIG_X86_32_LAZY_GS.  A subsequent patch
- * will remove it entirely.
- */
- /* unfortunately push/pop can't be no-op */
-.macro PUSH_GS
-   pushl   $0
-.endm
-.macro POP_GS pop=0
-   addl$(4 + \pop), %esp
-.endm
-.macro POP_GS_EX
-.endm
-
- /* all the rest are no-op */
-.macro PTGS_TO_GS
-.endm
-.macro PTGS_TO_GS_EX
-.endm
-.macro GS_TO_REG reg
-.endm
-.macro REG_TO_PTGS reg
-.endm
-.macro SET_KERNEL_GS reg
-.endm
-
-
 /* Unconditionally switch to user cr3 */
 .macro SWITCH_TO_USER_CR3 scratch_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
@@ -234,7 +205,7 @@
 .macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
cld
 .if \skip_gs == 0
-   PUSH_GS
+   pushl   $0
 .endif
pushl   %fs
 
@@ -259,9 +230,6 @@
movl$(__USER_DS), %edx
movl%edx, %ds
movl%edx, %es
-.if \skip_gs == 0
-   SET_KERNEL_GS %edx
-.endif
/* Switch to kernel stack if necessary */
 .if \switch_stacks > 0
SWITCH_TO_KERNEL_STACK
@@ -300,7 +268,7 @@
 1: popl%ds
 2: popl%es
 3: popl%fs
-   POP_GS \pop
+   addl$(4 + \pop), %esp   /* pop the unused "gs" slot */
IRET_FRAME
 .pushsection .fixup, "ax"
 4: movl$0, (%esp)
@@ -313,7 +281,6 @@
_ASM_EXTABLE(1b, 4b)
_ASM_EXTABLE(2b, 5b)
_ASM_EXTABLE(3b, 6b)
-   POP_GS_EX
 .endm
 
 .macro RESTORE_ALL_NMI cr3_reg:req pop=0
@@ -928,7 +895,6 @@ SYM_FUNC_START(entry_SYSENTER_32)
movlPT_EIP(%esp), %edx  /* pt_regs->ip */
movlPT_OLDESP(%esp), %ecx   /* pt_regs->sp */
 1: mov PT_FS(%esp), %fs
-   PTGS_TO_GS
 
popl%ebx/* pt_regs->bx */
addl$2*4, %esp  /* skip pt_regs->cx and pt_regs->dx */
@@ -964,7 +930,6 @@ SYM_FUNC_START(entry_SYSENTER_32)
jmp 1b
 .popsection
_ASM_EXTABLE(1b, 2b)
-   PTGS_TO_GS_EX
 
 .Lsysenter_fix_flags:
pushl   $X86_EFLAGS_FIXED
@@ -1106,11 +1071,7 @@ SYM_CODE_START_LOCAL_NOALIGN(handle_exception)
SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
ENCODE_FRAME_POINTER
 
-   /* fixup %gs */
-   GS_TO_REG %ecx
movlPT_GS(%esp), %edi   # get the function address
-   REG_TO_PTGS %ecx
-   SET_KERNEL_GS %ecx
 
/* fixup orig %eax */
movlPT_ORIG_EAX(%esp), %edx # get the error code


[tip: x86/urgent] x86/entry: Fix entry/exit mismatch on failed fast 32-bit syscalls

2021-03-06 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 5d5675df792ff67e74a500c4c94db0f99e6a10ef
Gitweb:
https://git.kernel.org/tip/5d5675df792ff67e74a500c4c94db0f99e6a10ef
Author: Andy Lutomirski
AuthorDate: Thu, 04 Mar 2021 11:05:54 -08:00
Committer: Borislav Petkov 
CommitterDate: Sat, 06 Mar 2021 13:10:06 +01:00

x86/entry: Fix entry/exit mismatch on failed fast 32-bit syscalls

On a 32-bit fast syscall that fails to read its arguments from user
memory, the kernel currently does syscall exit work but not
syscall entry work.  This confuses audit and ptrace.  For example:

$ ./tools/testing/selftests/x86/syscall_arg_fault_32
...
strace: pid 264258: entering, ptrace_syscall_info.op == 2
...

This is a minimal fix intended for ease of backporting.  A more
complete cleanup is coming.

Fixes: 0b085e68f407 ("x86/entry: Consolidate 32/64 bit syscall entry")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Signed-off-by: Borislav Petkov 
Cc: sta...@vger.kernel.org
Link: 
https://lore.kernel.org/r/8c82296ddf803b91f8d1e5eac89e5803ba54ab0e.1614884673.git.l...@kernel.org
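
For context, a rough sketch of the control flow around the fix (heavily
simplified and not the exact kernel code; the function name is made up, the
helpers are the generic entry ones from <linux/entry-common.h>): the user's
EBP is fetched from the vDSO stack frame *before* the syscall entry work
runs, so when that fetch faults, no entry work has happened yet and the
error path must not run syscall exit work.

#include <linux/entry-common.h>
#include <linux/uaccess.h>

static bool fast_syscall_32_sketch(struct pt_regs *regs)
{
	/* Interrupts enabled, but no audit/ptrace/seccomp entry work yet. */
	syscall_enter_from_user_mode_prepare(regs);

	/* Fetch EBP from where the vDSO stashed it. */
	if (get_user(*(u32 *)&regs->bp,
		     (u32 __user *)(unsigned long)(u32)regs->sp)) {
		regs->ax = -EFAULT;

		/* Entry work never ran, so take the plain irqentry exit. */
		local_irq_disable();
		irqentry_exit_to_user_mode(regs);
		return false;
	}

	/* The actual syscall entry work (audit, ptrace, ...) happens here, */
	syscall_enter_from_user_mode_work(regs, regs->orig_ax);
	/* ... followed by syscall dispatch and syscall_exit_to_user_mode(). */
	return true;
}
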
---
 arch/x86/entry/common.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index a2433ae..4efd39a 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -128,7 +128,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs 
*regs)
regs->ax = -EFAULT;
 
instrumentation_end();
-   syscall_exit_to_user_mode(regs);
+   local_irq_disable();
+   irqentry_exit_to_user_mode(regs);
return false;
}
 


[tip: x86/urgent] x86/entry: Fix entry/exit mismatch on failed fast 32-bit syscalls

2021-03-06 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: e59ba7bf71a09e474198741563e0e587ae43d1c7
Gitweb:
https://git.kernel.org/tip/e59ba7bf71a09e474198741563e0e587ae43d1c7
Author: Andy Lutomirski
AuthorDate: Thu, 04 Mar 2021 11:05:54 -08:00
Committer: Borislav Petkov 
CommitterDate: Sat, 06 Mar 2021 11:37:00 +01:00

x86/entry: Fix entry/exit mismatch on failed fast 32-bit syscalls

On a 32-bit fast syscall that fails to read its arguments from user
memory, the kernel currently does syscall exit work but not
syscall entry work.  This confuses audit and ptrace.  For example:

$ ./tools/testing/selftests/x86/syscall_arg_fault_32
...
strace: pid 264258: entering, ptrace_syscall_info.op == 2
...

This is a minimal fix intended for ease of backporting.  A more
complete cleanup is coming.

Fixes: 0b085e68f407 ("x86/entry: Consolidate 32/64 bit syscall entry")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Cc: sta...@vger.kernel.org
Link: 
https://lore.kernel.org/r/8c82296ddf803b91f8d1e5eac89e5803ba54ab0e.1614884673.git.l...@kernel.org

---
 arch/x86/entry/common.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index a2433ae..4efd39a 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -128,7 +128,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs 
*regs)
regs->ax = -EFAULT;
 
instrumentation_end();
-   syscall_exit_to_user_mode(regs);
+   local_irq_disable();
+   irqentry_exit_to_user_mode(regs);
return false;
}
 


[tip: x86/urgent] x86/entry: Fix entry/exit mismatch on failed fast 32-bit syscalls

2021-03-05 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: dabf017539988a9bfc40a38dbafd35c501bacc44
Gitweb:
https://git.kernel.org/tip/dabf017539988a9bfc40a38dbafd35c501bacc44
Author: Andy Lutomirski
AuthorDate: Thu, 04 Mar 2021 11:05:54 -08:00
Committer: Thomas Gleixner 
CommitterDate: Fri, 05 Mar 2021 11:10:13 +01:00

x86/entry: Fix entry/exit mismatch on failed fast 32-bit syscalls

On a 32-bit fast syscall that fails to read its arguments from user
memory, the kernel currently does syscall exit work but not
syscall entry work.  This confuses audit and ptrace.  For example:

$ ./tools/testing/selftests/x86/syscall_arg_fault_32
...
strace: pid 264258: entering, ptrace_syscall_info.op == 2
...

This is a minimal fix intended for ease of backporting.  A more
complete cleanup is coming.

Fixes: 0b085e68f407 ("x86/entry: Consolidate 32/64 bit syscall entry")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Cc: sta...@vger.kernel.org
Link: 
https://lore.kernel.org/r/8c82296ddf803b91f8d1e5eac89e5803ba54ab0e.1614884673.git.l...@kernel.org

---
 arch/x86/entry/common.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index a2433ae..4efd39a 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -128,7 +128,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs 
*regs)
regs->ax = -EFAULT;
 
instrumentation_end();
-   syscall_exit_to_user_mode(regs);
+   local_irq_disable();
+   irqentry_exit_to_user_mode(regs);
return false;
}
 


[tip: x86/mm] x86/fault: Fix AMD erratum #91 errata fixup for user code

2021-02-10 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/mm branch of tip:

Commit-ID: 35f1c89b0cce247bf0213df243ed902989b1dcda
Gitweb:
https://git.kernel.org/tip/35f1c89b0cce247bf0213df243ed902989b1dcda
Author: Andy Lutomirski
AuthorDate: Tue, 09 Feb 2021 18:33:33 -08:00
Committer: Borislav Petkov 
CommitterDate: Wed, 10 Feb 2021 13:11:41 +01:00

x86/fault: Fix AMD erratum #91 errata fixup for user code

The recent rework of probe_kernel_address() and its conversion to
get_kernel_nofault() inadvertently broke is_prefetch(). Before this
change, probe_kernel_address() was used as a sloppy "read user or
kernel memory" helper, but it doesn't do that any more. The new
get_kernel_nofault() reads *kernel* memory only, which completely broke
is_prefetch() for user access.

Adjust the code to use the correct accessor based on access mode.  As a
bonus, the open-coded address bounds check is no longer needed, since the
accessor helpers (get_user() / get_kernel_nofault()) do the right thing
all by themselves.

 [ bp: Massage commit message. ]

Fixes: eab0c6089b68 ("maccess: unify the probe kernel arch hooks")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Reviewed-by: Christoph Hellwig 
Cc: sta...@vger.kernel.org
Link: 
https://lkml.kernel.org/r/b91f7f92f3367d2d3a88eec3b09c6aab1b2dc8ef.1612924255.git.l...@kernel.org
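
A compact sketch of the accessor selection described above (illustrative
only, not the full patch; read_insn_byte() is a made-up helper name):

#include <linux/uaccess.h>
#include <linux/ptrace.h>

/* Read one opcode byte from the faulting context's instruction stream. */
static int read_insn_byte(struct pt_regs *regs, const void *ip,
			  unsigned char *byte)
{
	if (user_mode(regs))
		/* User memory: honors access_ok()/SMAP, faults cleanly. */
		return get_user(*byte, (const unsigned char __user *)ip);

	/* Kernel memory only -- all that get_kernel_nofault() can read. */
	return get_kernel_nofault(*byte, (const unsigned char *)ip);
}
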
---
 arch/x86/mm/fault.c | 27 +--
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f1f1b5a..441c3e9 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -54,7 +54,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
  * 32-bit mode:
  *
  *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- *   Check that here and ignore it.
+ *   Check that here and ignore it.  This is AMD erratum #91.
  *
  * 64-bit mode:
  *
@@ -83,11 +83,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char 
*instr,
 #ifdef CONFIG_X86_64
case 0x40:
/*
-* In AMD64 long mode 0x40..0x4F are valid REX prefixes
-* Need to figure out under what instruction mode the
-* instruction was issued. Could check the LDT for lm,
-* but for now it's good enough to assume that long
-* mode only uses well known segments or kernel.
+* In 64-bit mode 0x40..0x4F are valid REX prefixes
 */
return (!user_mode(regs) || user_64bit_mode(regs));
 #endif
@@ -127,20 +123,31 @@ is_prefetch(struct pt_regs *regs, unsigned long 
error_code, unsigned long addr)
instr = (void *)convert_ip_to_linear(current, regs);
max_instr = instr + 15;
 
-   if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
-   return 0;
+   /*
+* This code has historically always bailed out if IP points to a
+* not-present page (e.g. due to a race).  No one has ever
+* complained about this.
+*/
+   pagefault_disable();
 
while (instr < max_instr) {
unsigned char opcode;
 
-   if (get_kernel_nofault(opcode, instr))
-   break;
+   if (user_mode(regs)) {
+   if (get_user(opcode, instr))
+   break;
+   } else {
+   if (get_kernel_nofault(opcode, instr))
+   break;
+   }
 
instr++;
 
if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
break;
}
+
+   pagefault_enable();
return prefetch;
 }
 


[tip: x86/mm] x86/fault: Split the OOPS code out from no_context()

2021-02-10 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/mm branch of tip:

Commit-ID: 2cc624b0a7e68ba8957b18600181f7d5b0f3e1b6
Gitweb:
https://git.kernel.org/tip/2cc624b0a7e68ba8957b18600181f7d5b0f3e1b6
Author: Andy Lutomirski
AuthorDate: Tue, 09 Feb 2021 18:33:41 -08:00
Committer: Borislav Petkov 
CommitterDate: Wed, 10 Feb 2021 14:33:36 +01:00

x86/fault: Split the OOPS code out from no_context()

Not all callers of no_context() want to run exception fixups.
Separate the OOPS code out from the fixup code in no_context().

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/450f8d8eabafb83a5df349108c8e5ea83a2f939d.1612924255.git.l...@kernel.org
---
 arch/x86/mm/fault.c | 116 ++-
 1 file changed, 62 insertions(+), 54 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index cbb1a97..dbf6a94 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -655,53 +655,20 @@ static void set_signal_archinfo(unsigned long address,
 }
 
 static noinline void
-no_context(struct pt_regs *regs, unsigned long error_code,
-  unsigned long address, int signal, int si_code)
+page_fault_oops(struct pt_regs *regs, unsigned long error_code,
+   unsigned long address)
 {
-   struct task_struct *tsk = current;
unsigned long flags;
int sig;
 
if (user_mode(regs)) {
/*
-* This is an implicit supervisor-mode access from user
-* mode.  Bypass all the kernel-mode recovery code and just
-* OOPS.
+* Implicit kernel access from user mode?  Skip the stack
+* overflow and EFI special cases.
 */
goto oops;
}
 
-   /* Are we prepared to handle this kernel fault? */
-   if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
-   /*
-* Any interrupt that takes a fault gets the fixup. This makes
-* the below recursive fault logic only apply to a faults from
-* task context.
-*/
-   if (in_interrupt())
-   return;
-
-   /*
-* Per the above we're !in_interrupt(), aka. task context.
-*
-* In this case we need to make sure we're not recursively
-* faulting through the emulate_vsyscall() logic.
-*/
-   if (current->thread.sig_on_uaccess_err && signal) {
-   sanitize_error_code(address, &error_code);
-
-   set_signal_archinfo(address, error_code);
-
-   /* XXX: hwpoison faults will set the wrong code. */
-   force_sig_fault(signal, si_code, (void __user 
*)address);
-   }
-
-   /*
-* Barring that, we can do the fixup and be happy.
-*/
-   return;
-   }
-
 #ifdef CONFIG_VMAP_STACK
/*
 * Stack overflow?  During boot, we can fault near the initial
@@ -709,8 +676,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 * that we're in vmalloc space to avoid this.
 */
if (is_vmalloc_addr((void *)address) &&
-   (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
-address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+   (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
+address - ((unsigned long)current->stack + THREAD_SIZE) < 
PAGE_SIZE)) {
unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void 
*);
/*
 * We're likely to be running with very little stack space
@@ -734,20 +701,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 #endif
 
/*
-* 32-bit:
-*
-*   Valid to do another page fault here, because if this fault
-*   had been triggered by is_prefetch fixup_exception would have
-*   handled it.
-*
-* 64-bit:
-*
-*   Hall of shame of CPU/BIOS bugs.
-*/
-   if (is_prefetch(regs, error_code, address))
-   return;
-
-   /*
 * Buggy firmware could access regions which might page fault, try to
 * recover from such faults.
 */
@@ -763,7 +716,7 @@ oops:
 
show_fault_oops(regs, error_code, address);
 
-   if (task_stack_end_corrupted(tsk))
+   if (task_stack_end_corrupted(current))
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
 
sig = SIGKILL;
@@ -776,6 +729,61 @@ oops:
oops_end(flags, regs, sig);
 }
 
+static noinline void
+no_context(struct pt_regs *regs, unsigned long error_code,
+  unsigned long address, int signal, int si_code)
+{
+   if (user_mode(regs)) {
+   /*
+

[tip: x86/mm] x86/fault: Skip the AMD erratum #91 workaround on unaffected CPUs

2021-02-10 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/mm branch of tip:

Commit-ID: d24df8ecf9b6f81029f520ae7158a8670a28d70b
Gitweb:
https://git.kernel.org/tip/d24df8ecf9b6f81029f520ae7158a8670a28d70b
Author: Andy Lutomirski
AuthorDate: Tue, 09 Feb 2021 18:33:34 -08:00
Committer: Borislav Petkov 
CommitterDate: Wed, 10 Feb 2021 13:38:12 +01:00

x86/fault: Skip the AMD erratum #91 workaround on unaffected CPUs

According to the Revision Guide for AMD Athlon™ 64 and AMD Opteron™
Processors, only early revisions of family 0xF are affected. This will
avoid unnecessarily fetching instruction bytes before sending SIGSEGV to
user programs.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/477173b7784bc28afb3e53d76ae5ef143917e8dd.1612924255.git.l...@kernel.org
---
 arch/x86/mm/fault.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 441c3e9..818902b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -106,6 +106,15 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char 
*instr,
}
 }
 
+static bool is_amd_k8_pre_npt(void)
+{
+   struct cpuinfo_x86 *c = &boot_cpu_data;
+
+   return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
+   c->x86_vendor == X86_VENDOR_AMD &&
+   c->x86 == 0xf && c->x86_model < 0x40);
+}
+
 static int
 is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 {
@@ -113,6 +122,10 @@ is_prefetch(struct pt_regs *regs, unsigned long 
error_code, unsigned long addr)
unsigned char *instr;
int prefetch = 0;
 
+   /* Erratum #91 affects AMD K8, pre-NPT CPUs */
+   if (!is_amd_k8_pre_npt())
+   return 0;
+
/*
 * If it was a exec (instruction fetch) fault on NX page, then
 * do not ignore the fault:


[tip: x86/mm] x86/fault/32: Move is_f00f_bug() to do_kern_addr_fault()

2021-02-10 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/mm branch of tip:

Commit-ID: f42a40fd53fb5c77bae67d917d66078dbaa46bc2
Gitweb:
https://git.kernel.org/tip/f42a40fd53fb5c77bae67d917d66078dbaa46bc2
Author: Andy Lutomirski
AuthorDate: Tue, 09 Feb 2021 18:33:36 -08:00
Committer: Borislav Petkov 
CommitterDate: Wed, 10 Feb 2021 14:11:07 +01:00

x86/fault/32: Move is_f00f_bug() to do_kern_addr_fault()

bad_area() and its relatives are called from many places in fault.c, and
exactly one of them wants the F00F workaround.

__bad_area_nosemaphore() no longer contains any kernel fault code, which
prepares for further cleanups.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/e9668729a48ce6754022b0a4415631e8ebdd00e7.1612924255.git.l...@kernel.org
---
 arch/x86/mm/fault.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 91cf7a6..3ffed00 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -482,10 +482,12 @@ static int is_errata100(struct pt_regs *regs, unsigned 
long address)
 }
 
 /* Pentium F0 0F C7 C8 bug workaround: */
-static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
+  unsigned long address)
 {
 #ifdef CONFIG_X86_F00F_BUG
-   if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) {
+   if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) &&
+   idt_is_f00f_address(address)) {
handle_invalid_op(regs);
return 1;
}
@@ -853,9 +855,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long 
error_code,
return;
}
 
-   if (is_f00f_bug(regs, address))
-   return;
-
no_context(regs, error_code, address, SIGSEGV, si_code);
 }
 
@@ -1195,6 +1194,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long 
hw_error_code,
}
 #endif
 
+   if (is_f00f_bug(regs, hw_error_code, address))
+   return;
+
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;


[tip: x86/mm] x86/fault: Document the locking in the fault_signal_pending() path

2021-02-10 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/mm branch of tip:

Commit-ID: ef2544fb3f6457b79fc73cea39dafd67ee0f2824
Gitweb:
https://git.kernel.org/tip/ef2544fb3f6457b79fc73cea39dafd67ee0f2824
Author: Andy Lutomirski
AuthorDate: Tue, 09 Feb 2021 18:33:37 -08:00
Committer: Borislav Petkov 
CommitterDate: Wed, 10 Feb 2021 14:12:07 +01:00

x86/fault: Document the locking in the fault_signal_pending() path

If fault_signal_pending() returns true, then the core mm has unlocked the
mm for us.  Add a comment to help future readers of this code.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/c56de3d103f40e6304437b150aa7b215530d23f7.1612924255.git.l...@kernel.org
---
 arch/x86/mm/fault.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3ffed00..013910b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1380,8 +1380,11 @@ good_area:
 */
fault = handle_mm_fault(vma, address, flags, regs);
 
-   /* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
+   /*
+* Quick path to respond to signals.  The core mm code
+* has unlocked the mm for us if we get here.
+*/
if (!user_mode(regs))
no_context(regs, error_code, address, SIGBUS,
   BUS_ADRERR);


[tip: x86/mm] x86/fault: Correct a few user vs kernel checks wrt WRUSS

2021-02-10 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/mm branch of tip:

Commit-ID: 56e62cd28aaae2fcbec8af67b05843c47c6da170
Gitweb:
https://git.kernel.org/tip/56e62cd28aaae2fcbec8af67b05843c47c6da170
Author: Andy Lutomirski
AuthorDate: Tue, 09 Feb 2021 18:33:38 -08:00
Committer: Borislav Petkov 
CommitterDate: Wed, 10 Feb 2021 14:13:32 +01:00

x86/fault: Correct a few user vs kernel checks wrt WRUSS

In general, page fault errors for WRUSS should be just like get_user(),
etc.  Fix three bugs in this area:

There is a comment that says that, if the kernel can't handle a page fault
on a user address due to OOM, the OOM-kill-and-retry logic would be
skipped.  The code checked kernel *privilege*, not kernel mode, so it
missed WRUSS.  This means that the kernel would malfunction if it got OOM
on a WRUSS fault -- this would be a kernel-mode, user-privilege fault, and
the OOM killer would be invoked and the handler would retry the faulting
instruction.

A failed user access from kernel while a fatal signal is pending should
fail even if the instruction in question was WRUSS.

do_sigbus() should not send SIGBUS for WRUSS -- it should handle it like
any other kernel mode failure.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/a7b7bcea730bd4069e6b7e629236bb2cf526c2fb.1612924255.git.l...@kernel.org
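
To illustrate the distinction these fixes rely on (an assumed example, not
a patch hunk): a fault caused by WRUSS carries a *user-privilege* error
code even though the CPU was executing kernel code, so "was the kernel
running?" has to be answered from the saved register state, not from the
error code.

#include <linux/ptrace.h>
#include <asm/trap_pf.h>

static bool fault_was_in_kernel_mode(struct pt_regs *regs,
				     unsigned long error_code)
{
	/*
	 * Wrong for WRUSS: X86_PF_USER describes the privilege of the
	 * access, which is "user" for WRUSS even from kernel code.
	 *
	 *	return !(error_code & X86_PF_USER);
	 */

	/* Correct: look at the CS/flags of the faulting context. */
	return !user_mode(regs);
}
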
---
 arch/x86/mm/fault.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 013910b..b110484 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -945,7 +945,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, 
unsigned long address,
  vm_fault_t fault)
 {
/* Kernel mode? Handle exceptions or die: */
-   if (!(error_code & X86_PF_USER)) {
+   if (!user_mode(regs)) {
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
@@ -1217,7 +1217,14 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long 
hw_error_code,
 }
 NOKPROBE_SYMBOL(do_kern_addr_fault);
 
-/* Handle faults in the user portion of the address space */
+/*
+ * Handle faults in the user portion of the address space.  Nothing in here
+ * should check X86_PF_USER without a specific justification: for almost
+ * all purposes, we should treat a normal kernel access to user memory
+ * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
+ * The one exception is AC flag handling, which is, per the x86
+ * architecture, special for WRUSS.
+ */
 static inline
 void do_user_addr_fault(struct pt_regs *regs,
unsigned long error_code,
@@ -1406,14 +1413,14 @@ good_area:
if (likely(!(fault & VM_FAULT_ERROR)))
return;
 
-   if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
+   if (fatal_signal_pending(current) && !user_mode(regs)) {
no_context(regs, error_code, address, 0, 0);
return;
}
 
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
-   if (!(error_code & X86_PF_USER)) {
+   if (!user_mode(regs)) {
no_context(regs, error_code, address,
   SIGSEGV, SEGV_MAPERR);
return;


[tip: x86/mm] x86/fault: Fold mm_fault_error() into do_user_addr_fault()

2021-02-10 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/mm branch of tip:

Commit-ID: ec352711ceba890ea3a0c182c2d49c86c1a5e30e
Gitweb:
https://git.kernel.org/tip/ec352711ceba890ea3a0c182c2d49c86c1a5e30e
Author: Andy Lutomirski
AuthorDate: Tue, 09 Feb 2021 18:33:35 -08:00
Committer: Borislav Petkov 
CommitterDate: Wed, 10 Feb 2021 14:10:07 +01:00

x86/fault: Fold mm_fault_error() into do_user_addr_fault()

mm_fault_error() is logically just the end of do_user_addr_fault().
Combine the functions.  This makes the code easier to read.

Most of the churn here is from renaming hw_error_code to error_code in
do_user_addr_fault().

This makes no difference at all to the generated code (objdump -dr) as
compared to changing noinline to __always_inline in the definition of
mm_fault_error().

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/dedc4d9c9b047e51ce38b991bd23971a28af4e7b.1612924255.git.l...@kernel.org
---
 arch/x86/mm/fault.c | 97 
 1 file changed, 45 insertions(+), 52 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 818902b..91cf7a6 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -981,40 +981,6 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, 
unsigned long address,
force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
 }
 
-static noinline void
-mm_fault_error(struct pt_regs *regs, unsigned long error_code,
-  unsigned long address, vm_fault_t fault)
-{
-   if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
-   no_context(regs, error_code, address, 0, 0);
-   return;
-   }
-
-   if (fault & VM_FAULT_OOM) {
-   /* Kernel mode? Handle exceptions or die: */
-   if (!(error_code & X86_PF_USER)) {
-   no_context(regs, error_code, address,
-  SIGSEGV, SEGV_MAPERR);
-   return;
-   }
-
-   /*
-* We ran out of memory, call the OOM killer, and return the
-* userspace (which will retry the fault, or kill us if we got
-* oom-killed):
-*/
-   pagefault_out_of_memory();
-   } else {
-   if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
-VM_FAULT_HWPOISON_LARGE))
-   do_sigbus(regs, error_code, address, fault);
-   else if (fault & VM_FAULT_SIGSEGV)
-   bad_area_nosemaphore(regs, error_code, address);
-   else
-   BUG();
-   }
-}
-
 static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
 {
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
@@ -1252,7 +1218,7 @@ NOKPROBE_SYMBOL(do_kern_addr_fault);
 /* Handle faults in the user portion of the address space */
 static inline
 void do_user_addr_fault(struct pt_regs *regs,
-   unsigned long hw_error_code,
+   unsigned long error_code,
unsigned long address)
 {
struct vm_area_struct *vma;
@@ -1272,8 +1238,8 @@ void do_user_addr_fault(struct pt_regs *regs,
 * Reserved bits are never expected to be set on
 * entries in the user portion of the page tables.
 */
-   if (unlikely(hw_error_code & X86_PF_RSVD))
-   pgtable_bad(regs, hw_error_code, address);
+   if (unlikely(error_code & X86_PF_RSVD))
+   pgtable_bad(regs, error_code, address);
 
/*
 * If SMAP is on, check for invalid kernel (supervisor) access to user
@@ -1283,10 +1249,10 @@ void do_user_addr_fault(struct pt_regs *regs,
 * enforcement appears to be consistent with the USER bit.
 */
if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
-!(hw_error_code & X86_PF_USER) &&
+!(error_code & X86_PF_USER) &&
 !(regs->flags & X86_EFLAGS_AC)))
{
-   bad_area_nosemaphore(regs, hw_error_code, address);
+   bad_area_nosemaphore(regs, error_code, address);
return;
}
 
@@ -1295,7 +1261,7 @@ void do_user_addr_fault(struct pt_regs *regs,
 * in a region with pagefaults disabled then we must not take the fault
 */
if (unlikely(faulthandler_disabled() || !mm)) {
-   bad_area_nosemaphore(regs, hw_error_code, address);
+   bad_area_nosemaphore(regs, error_code, address);
return;
}
 
@@ -1316,9 +1282,9 @@ void do_user_addr_fault(struct pt_regs *regs,
 
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
-   if (hw_error_code & X86_PF_WRITE)
+   if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
-   if (hw_error_code & X86_PF_INSTR)
+   if 

[tip: x86/mm] x86/fault: Improve kernel-executing-user-memory handling

2021-02-10 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/mm branch of tip:

Commit-ID: 03c81ea3331658f613bb2913d33764a4e0410cbd
Gitweb:
https://git.kernel.org/tip/03c81ea3331658f613bb2913d33764a4e0410cbd
Author: Andy Lutomirski
AuthorDate: Tue, 09 Feb 2021 18:33:39 -08:00
Committer: Borislav Petkov 
CommitterDate: Wed, 10 Feb 2021 14:20:54 +01:00

x86/fault: Improve kernel-executing-user-memory handling

Right now, the case of the kernel trying to execute from user memory
is treated more or less just like the kernel getting a page fault on a
user access. In the failure path, it checks for erratum #93, tries to
otherwise fix up the error, and then oopses.

If it manages to jump to the user address space, with or without SMEP,
it should not try to resolve the page fault. This is an error, pure and
simple. Rearrange the code so that this case is caught early, check for
erratum #93, and bail out.

 [ bp: Massage commit message. ]

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/ab8719c7afb8bd501c4eee0e36493150fbbe5f6a.1612924255.git.l...@kernel.org
---
 arch/x86/mm/fault.c | 21 ++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b110484..cbb1a97 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -447,6 +447,9 @@ static int is_errata93(struct pt_regs *regs, unsigned long 
address)
|| boot_cpu_data.x86 != 0xf)
return 0;
 
+   if (user_mode(regs))
+   return 0;
+
if (address != regs->ip)
return 0;
 
@@ -744,9 +747,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
if (is_prefetch(regs, error_code, address))
return;
 
-   if (is_errata93(regs, address))
-   return;
-
/*
 * Buggy firmware could access regions which might page fault, try to
 * recover from such faults.
@@ -1239,6 +1239,21 @@ void do_user_addr_fault(struct pt_regs *regs,
tsk = current;
mm = tsk->mm;
 
+   if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == 
X86_PF_INSTR)) {
+   /*
+* Whoops, this is kernel mode code trying to execute from
+* user memory.  Unless this is AMD erratum #93, which
+* corrupts RIP such that it looks like a user address,
+* this is unrecoverable.  Don't even try to look up the
+* VMA.
+*/
+   if (is_errata93(regs, address))
+   return;
+
+   bad_area_nosemaphore(regs, error_code, address);
+   return;
+   }
+
/* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
return;


[tip: x86/mm] x86/fault: Bypass no_context() for implicit kernel faults from usermode

2021-02-10 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/mm branch of tip:

Commit-ID: 5042d40a264c8a508d58ed71e4c07b05175b3635
Gitweb:
https://git.kernel.org/tip/5042d40a264c8a508d58ed71e4c07b05175b3635
Author: Andy Lutomirski
AuthorDate: Tue, 09 Feb 2021 18:33:42 -08:00
Committer: Borislav Petkov 
CommitterDate: Wed, 10 Feb 2021 14:39:52 +01:00

x86/fault: Bypass no_context() for implicit kernel faults from usermode

Drop an indentation level and remove the last user_mode(regs) == true
caller of no_context() by directly OOPSing for implicit kernel faults
from usermode.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/6e3d1129494a8de1e59d28012286e3a292a2296e.1612924255.git.l...@kernel.org
---
 arch/x86/mm/fault.c | 59 +++-
 1 file changed, 32 insertions(+), 27 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index dbf6a94..187975b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -826,44 +826,49 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned 
long error_code,
 {
struct task_struct *tsk = current;
 
-   /* User mode accesses just cause a SIGSEGV */
-   if (user_mode(regs) && (error_code & X86_PF_USER)) {
-   /*
-* It's possible to have interrupts off here:
-*/
-   local_irq_enable();
+   if (!user_mode(regs)) {
+   no_context(regs, error_code, address, pkey, si_code);
+   return;
+   }
 
-   /*
-* Valid to do another page fault here because this one came
-* from user space:
-*/
-   if (is_prefetch(regs, error_code, address))
-   return;
+   if (!(error_code & X86_PF_USER)) {
+   /* Implicit user access to kernel memory -- just oops */
+   page_fault_oops(regs, error_code, address);
+   return;
+   }
 
-   if (is_errata100(regs, address))
-   return;
+   /*
+* User mode accesses just cause a SIGSEGV.
+* It's possible to have interrupts off here:
+*/
+   local_irq_enable();
 
-   sanitize_error_code(address, &error_code);
+   /*
+* Valid to do another page fault here because this one came
+* from user space:
+*/
+   if (is_prefetch(regs, error_code, address))
+   return;
 
-   if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, 
address))
-   return;
+   if (is_errata100(regs, address))
+   return;
 
-   if (likely(show_unhandled_signals))
-   show_signal_msg(regs, error_code, address, tsk);
+   sanitize_error_code(address, &error_code);
 
-   set_signal_archinfo(address, error_code);
+   if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
+   return;
 
-   if (si_code == SEGV_PKUERR)
-   force_sig_pkuerr((void __user *)address, pkey);
+   if (likely(show_unhandled_signals))
+   show_signal_msg(regs, error_code, address, tsk);
 
-   force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+   set_signal_archinfo(address, error_code);
 
-   local_irq_disable();
+   if (si_code == SEGV_PKUERR)
+   force_sig_pkuerr((void __user *)address, pkey);
 
-   return;
-   }
+   force_sig_fault(SIGSEGV, si_code, (void __user *)address);
 
-   no_context(regs, error_code, address, SIGSEGV, si_code);
+   local_irq_disable();
 }
 
 static noinline void


[tip: x86/mm] x86/{fault,efi}: Fix and rename efi_recover_from_page_fault()

2021-02-10 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/mm branch of tip:

Commit-ID: c46f52231e79af025e2c89e889d69ec20a4c024f
Gitweb:
https://git.kernel.org/tip/c46f52231e79af025e2c89e889d69ec20a4c024f
Author: Andy Lutomirski
AuthorDate: Tue, 09 Feb 2021 18:33:46 -08:00
Committer: Borislav Petkov 
CommitterDate: Wed, 10 Feb 2021 18:39:23 +01:00

x86/{fault,efi}: Fix and rename efi_recover_from_page_fault()

efi_recover_from_page_fault() doesn't recover -- it does a special EFI
mini-oops.  Rename it to make it clear that it crashes.

While renaming it, I noticed a blatant bug: a page fault oops in a
different thread happening concurrently with an EFI runtime service call
would be misinterpreted as an EFI page fault.  Fix that.

This isn't quite exact. The situation could be improved by using a
special CS for calls into EFI.

 [ bp: Massage commit message and simplify in interrupt check. ]

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/f43b1e80830dc78ed60ed8b0826f4f189254570c.1612924255.git.l...@kernel.org
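
As a rough sketch of the tightened check (simplified; efi_rts_work and
EFI_NONE are the existing EFI runtime-work bookkeeping, and the helper
below only illustrates the idea rather than quoting the patch):

#include <linux/efi.h>
#include <linux/preempt.h>
#include <linux/workqueue.h>

static bool fault_is_from_efi_runtime_service(void)
{
	/* An interrupt/NMI during an EFI call is still an ordinary oops. */
	if (in_interrupt())
		return false;

	/*
	 * Only the thread actually running the EFI runtime worker counts;
	 * another thread oopsing while an EFI call is merely in flight
	 * must not be misinterpreted as an EFI failure.
	 */
	return READ_ONCE(efi_rts_work.efi_rts_id) != EFI_NONE &&
	       current_work() == &efi_rts_work.work;
}
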
---
 arch/x86/include/asm/efi.h |  2 +-
 arch/x86/mm/fault.c| 11 ++-
 arch/x86/platform/efi/quirks.c | 16 
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index c98f783..4b7706d 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -150,7 +150,7 @@ extern void __init efi_apply_memmap_quirks(void);
 extern int __init efi_reuse_config(u64 tables, int nr_tables);
 extern void efi_delete_dummy_variable(void);
 extern void efi_switch_mm(struct mm_struct *mm);
-extern void efi_recover_from_page_fault(unsigned long phys_addr);
+extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr);
 extern void efi_free_boot_services(void);
 
 /* kexec external ABI */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 1c3054b..7b3a125 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -16,7 +16,7 @@
 #include <linux/prefetch.h>		/* prefetchw			*/
 #include <linux/context_tracking.h>	/* exception_enter(), ...	*/
 #include <linux/uaccess.h>		/* faulthandler_disabled()	*/
-#include <linux/efi.h>			/* efi_recover_from_page_fault()*/
+#include <linux/efi.h>			/* efi_crash_gracefully_on_page_fault()*/
 #include <linux/mm_types.h>
 
 #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
@@ -25,7 +25,7 @@
 #include <asm/vsyscall.h>		/* emulate_vsyscall		*/
 #include <asm/vm86.h>			/* struct vm86			*/
 #include <asm/mmu_context.h>		/* vma_pkey()			*/
-#include <asm/efi.h>			/* efi_recover_from_page_fault()*/
+#include <asm/efi.h>			/* efi_crash_gracefully_on_page_fault()*/
 #include <asm/desc.h>			/* store_idt(), ...		*/
 #include <asm/cpu_entry_area.h>		/* exception stack		*/
 #include <asm/pgtable_areas.h>		/* VMALLOC_START, ...		*/
@@ -701,11 +701,12 @@ page_fault_oops(struct pt_regs *regs, unsigned long 
error_code,
 #endif
 
/*
-* Buggy firmware could access regions which might page fault, try to
-* recover from such faults.
+* Buggy firmware could access regions which might page fault.  If
+* this happens, EFI has a special OOPS path that will try to
+* avoid hanging the system.
 */
if (IS_ENABLED(CONFIG_EFI))
-   efi_recover_from_page_fault(address);
+   efi_crash_gracefully_on_page_fault(address);
 
 oops:
/*
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 5a40fe4..67d93a2 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -687,15 +687,25 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, 
void *kbuff,
  * @return: Returns, if the page fault is not handled. This function
  * will never return if the page fault is handled successfully.
  */
-void efi_recover_from_page_fault(unsigned long phys_addr)
+void efi_crash_gracefully_on_page_fault(unsigned long phys_addr)
 {
if (!IS_ENABLED(CONFIG_X86_64))
return;
 
/*
+* If we get an interrupt/NMI while processing an EFI runtime service
+* then this is a regular OOPS, not an EFI failure.
+*/
+   if (in_interrupt())
+   return;
+
+   /*
 * Make sure that an efi runtime service caused the page fault.
+* READ_ONCE() because we might be OOPSing in a different thread,
+* and we don't want to trip KTSAN while trying to OOPS.
 */
-   if (efi_rts_work.efi_rts_id == EFI_NONE)
+   if (READ_ONCE(efi_rts_work.efi_rts_id) == EFI_NONE ||
+   current_work() != &efi_rts_work.work)
return;
 
/*
@@ -747,6 +757,4 @@ void efi_recover_from_page_fault(unsigned long phys_addr)
set_current_state(TASK_IDLE);
schedule();
}
-
-   return;
 }


[tip: x86/mm] x86/fault: Rename no_context() to kernelmode_fixup_or_oops()

2021-02-10 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/mm branch of tip:

Commit-ID: 6456a2a69ee16ad402f26d272d0b67ce1d25061f
Gitweb:
https://git.kernel.org/tip/6456a2a69ee16ad402f26d272d0b67ce1d25061f
Author: Andy Lutomirski
AuthorDate: Tue, 09 Feb 2021 18:33:43 -08:00
Committer: Borislav Petkov 
CommitterDate: Wed, 10 Feb 2021 14:41:19 +01:00

x86/fault: Rename no_context() to kernelmode_fixup_or_oops()

The name no_context() has never been very clear.  It's only called for
faults from kernel mode, so rename it and change the no-longer-useful
user_mode(regs) check to a WARN_ON_ONCE.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/c21940efe676024bb4bc721f7d70c29c420e127e.1612924255.git.l...@kernel.org
---
 arch/x86/mm/fault.c | 28 ++--
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 187975b..3566a59 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -730,17 +730,10 @@ oops:
 }
 
 static noinline void
-no_context(struct pt_regs *regs, unsigned long error_code,
-  unsigned long address, int signal, int si_code)
+kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
+unsigned long address, int signal, int si_code)
 {
-   if (user_mode(regs)) {
-   /*
-* This is an implicit supervisor-mode access from user
-* mode.  Bypass all the kernel-mode recovery code and just
-* OOPS.
-*/
-   goto oops;
-   }
+   WARN_ON_ONCE(user_mode(regs));
 
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
@@ -780,7 +773,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
if (is_prefetch(regs, error_code, address))
return;
 
-oops:
page_fault_oops(regs, error_code, address);
 }
 
@@ -827,7 +819,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long 
error_code,
struct task_struct *tsk = current;
 
if (!user_mode(regs)) {
-   no_context(regs, error_code, address, pkey, si_code);
+   kernelmode_fixup_or_oops(regs, error_code, address, pkey, 
si_code);
return;
}
 
@@ -959,7 +951,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, 
unsigned long address,
 {
/* Kernel mode? Handle exceptions or die: */
if (!user_mode(regs)) {
-   no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+   kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, 
BUS_ADRERR);
return;
}
 
@@ -1421,8 +1413,8 @@ good_area:
 * has unlocked the mm for us if we get here.
 */
if (!user_mode(regs))
-   no_context(regs, error_code, address, SIGBUS,
-  BUS_ADRERR);
+   kernelmode_fixup_or_oops(regs, error_code, address,
+SIGBUS, BUS_ADRERR);
return;
}
 
@@ -1442,15 +1434,15 @@ good_area:
return;
 
if (fatal_signal_pending(current) && !user_mode(regs)) {
-   no_context(regs, error_code, address, 0, 0);
+   kernelmode_fixup_or_oops(regs, error_code, address, 0, 0);
return;
}
 
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
if (!user_mode(regs)) {
-   no_context(regs, error_code, address,
-  SIGSEGV, SEGV_MAPERR);
+   kernelmode_fixup_or_oops(regs, error_code, address,
+SIGSEGV, SEGV_MAPERR);
return;
}
 


[tip: x86/mm] x86/fault: Don't look for extable entries for SMEP violations

2021-02-10 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/mm branch of tip:

Commit-ID: 66fcd98883816dba3b66da20b5fc86fa410638b5
Gitweb:
https://git.kernel.org/tip/66fcd98883816dba3b66da20b5fc86fa410638b5
Author: Andy Lutomirski
AuthorDate: Tue, 09 Feb 2021 18:33:44 -08:00
Committer: Borislav Petkov 
CommitterDate: Wed, 10 Feb 2021 14:45:39 +01:00

x86/fault: Don't look for extable entries for SMEP violations

If the kernel gets a SMEP violation or a fault that would have been a
SMEP violation if it had SMEP support, it shouldn't run fixups. Just
OOPS.

 [ bp: Massage commit message. ]

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/46160d8babce2abf1d6daa052146002efa24ac56.1612924255.git.l...@kernel.org
---
 arch/x86/mm/fault.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3566a59..1a0cfed 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1250,12 +1250,12 @@ void do_user_addr_fault(struct pt_regs *regs,
 * user memory.  Unless this is AMD erratum #93, which
 * corrupts RIP such that it looks like a user address,
 * this is unrecoverable.  Don't even try to look up the
-* VMA.
+* VMA or look for extable entries.
 */
if (is_errata93(regs, address))
return;
 
-   bad_area_nosemaphore(regs, error_code, address);
+   page_fault_oops(regs, error_code, address);
return;
}
 


[tip: x86/mm] x86/fault: Don't run fixups for SMAP violations

2021-02-10 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/mm branch of tip:

Commit-ID: ca247283781d754216395a41c5e8be8ec79a5f1c
Gitweb:
https://git.kernel.org/tip/ca247283781d754216395a41c5e8be8ec79a5f1c
Author: Andy Lutomirski
AuthorDate: Tue, 09 Feb 2021 18:33:45 -08:00
Committer: Borislav Petkov 
CommitterDate: Wed, 10 Feb 2021 16:27:57 +01:00

x86/fault: Don't run fixups for SMAP violations

A SMAP-violating kernel access is not a recoverable condition.  Imagine
kernel code that, outside of a uaccess region, dereferences a pointer to
the user range by accident.  If SMAP is on, this will reliably fault, and
the fault could then be silently fixed up as if it were an intentional
user access.  This makes it easy for bugs to be
overlooked if code is inadequately tested both with and without SMAP.

This was discovered because BPF can generate invalid accesses to user
memory, but those warnings only got printed if SMAP was off. Make it so
that this type of error will be discovered with SMAP on as well.

 [ bp: Massage commit message. ]

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/66a02343624b1ff46f02a838c497fc05c1a871b3.1612924255.git.l...@kernel.org
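
To illustrate the difference between the two kinds of access (an assumed
example, not from the patch; both function names are made up):

#include <linux/uaccess.h>

/* Intentional user access: uaccess region (STAC/CLAC), extable fixup,
 * and a clean -EFAULT on a bad pointer. */
static int read_user_value(int __user *uptr, int *out)
{
	return get_user(*out, uptr);
}

/* Accidental user access from plain kernel code: no uaccess region and
 * no fixup.  With SMAP this faults with EFLAGS.AC clear and, after this
 * change, oopses instead of being quietly "fixed up". */
static int broken_read_user_value(int __user *uptr, int *out)
{
	*out = *(int __force *)uptr;	/* bug, shown for illustration only */
	return 0;
}
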
---
 arch/x86/mm/fault.c |  9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 1a0cfed..1c3054b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1279,9 +1279,12 @@ void do_user_addr_fault(struct pt_regs *regs,
 */
if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
 !(error_code & X86_PF_USER) &&
-!(regs->flags & X86_EFLAGS_AC)))
-   {
-   bad_area_nosemaphore(regs, error_code, address);
+!(regs->flags & X86_EFLAGS_AC))) {
+   /*
+* No extable entry here.  This was a kernel access to an
+* invalid pointer.  get_kernel_nofault() will not get here.
+*/
+   page_fault_oops(regs, error_code, address);
return;
}
 


[tip: x86/cleanups] x86/ptrace: Clean up PTRACE_GETREGS/PTRACE_PUTREGS regset selection

2021-02-04 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/cleanups branch of tip:

Commit-ID: f22fecaf39c30acce701ffc3e9875020ba31f1f5
Gitweb:
https://git.kernel.org/tip/f22fecaf39c30acce701ffc3e9875020ba31f1f5
Author: Andy Lutomirski
AuthorDate: Wed, 03 Feb 2021 10:09:58 -08:00
Committer: Borislav Petkov 
CommitterDate: Thu, 04 Feb 2021 12:33:15 +01:00

x86/ptrace: Clean up PTRACE_GETREGS/PTRACE_PUTREGS regset selection

task_user_regset_view() has nonsensical semantics, but those semantics
appear to be relied on by existing users of PTRACE_GETREGSET and
PTRACE_SETREGSET.  (See added comments below for details.)

It shouldn't be used for PTRACE_GETREGS or PTRACE_SETREGS, though. A
native 64-bit ptrace() call and an x32 ptrace() call using GETREGS
or SETREGS want the 64-bit regset views, and a 32-bit ptrace() call
(native or compat) should use the 32-bit regset.

task_user_regset_view() almost does this except that it will
malfunction if a ptracer is itself ptraced and the outer ptracer
modifies CS on entry to a ptrace() syscall.  Hopefully that has never
happened.  (The compat ptrace() code already hardcoded the 32-bit
regset, so this change has no effect on that path.)

Improve the situation and deobfuscate the code by hardcoding the
64-bit view in the x32 ptrace() and selecting the view based on the
kernel config in the native ptrace().

I tried to figure out the history behind this API. I naïvely assumed
that PTRACE_GETREGSET and PTRACE_SETREGSET were ancient APIs that
predated compat, but no. They were introduced by

  2225a122ae26 ("ptrace: Add support for generic 
PTRACE_GETREGSET/PTRACE_SETREGSET")

in 2010, and they are simply a poor design.  ELF core dumps have the
ELF e_machine field and a bunch of register sets in ELF notes, and the
pair (e_machine, NT_XXX) indicates the format of the regset blob.  But
the new PTRACE_GET/SETREGSET API coopted the NT_XXX numbering without
any way to specify which e_machine was in effect.  This is especially
bad on x86, where a process can freely switch between 32-bit and
64-bit mode, and, in fact, the PTRACE_SETREGSET call itself can cause
this switch to happen.  Oops.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/9daa791d0c7eaebd59c5bc2b2af1b0e7bebe707d.1612375698.git.l...@kernel.org
---
 arch/x86/kernel/ptrace.c | 46 ---
 1 file changed, 38 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index bedca01..87a4143 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -704,6 +704,9 @@ void ptrace_disable(struct task_struct *child)
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
 static const struct user_regset_view user_x86_32_view; /* Initialized below. */
 #endif
+#ifdef CONFIG_X86_64
+static const struct user_regset_view user_x86_64_view; /* Initialized below. */
+#endif
 
 long arch_ptrace(struct task_struct *child, long request,
 unsigned long addr, unsigned long data)
@@ -711,6 +714,14 @@ long arch_ptrace(struct task_struct *child, long request,
int ret;
unsigned long __user *datap = (unsigned long __user *)data;
 
+#ifdef CONFIG_X86_64
+   /* This is native 64-bit ptrace() */
+   const struct user_regset_view *regset_view = &user_x86_64_view;
+#else
+   /* This is native 32-bit ptrace() */
+   const struct user_regset_view *regset_view = &user_x86_32_view;
+#endif
+
switch (request) {
/* read the word at location addr in the USER area. */
case PTRACE_PEEKUSR: {
@@ -749,28 +760,28 @@ long arch_ptrace(struct task_struct *child, long request,
 
case PTRACE_GETREGS:/* Get all gp regs from the child. */
return copy_regset_to_user(child,
-  task_user_regset_view(current),
+  regset_view,
   REGSET_GENERAL,
   0, sizeof(struct user_regs_struct),
   datap);
 
case PTRACE_SETREGS:/* Set all gp regs in the child. */
return copy_regset_from_user(child,
-task_user_regset_view(current),
+regset_view,
 REGSET_GENERAL,
 0, sizeof(struct user_regs_struct),
 datap);
 
case PTRACE_GETFPREGS:  /* Get the child FPU state. */
return copy_regset_to_user(child,
-  task_user_regset_view(current),
+  regset_view,
   REGSET_FP,
   0, sizeof(struct user_i387_struct),
   

[tip: x86/cleanups] x86/vm86/32: Remove VM86_SCREEN_BITMAP support

2021-01-21 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/cleanups branch of tip:

Commit-ID: 8ece53ef7f428ee3f8eab936268b1a3fe2725e6b
Gitweb:
https://git.kernel.org/tip/8ece53ef7f428ee3f8eab936268b1a3fe2725e6b
Author: Andy Lutomirski
AuthorDate: Tue, 19 Jan 2021 09:40:55 -08:00
Committer: Borislav Petkov 
CommitterDate: Thu, 21 Jan 2021 20:08:53 +01:00

x86/vm86/32: Remove VM86_SCREEN_BITMAP support

The implementation was rather buggy.  It unconditionally marked PTEs
read-only, even for VM_SHARED mappings.  I'm not sure whether this is
actually a problem, but it certainly seems unwise.  More importantly, it
released the mmap lock before flushing the TLB, which could allow a racing
CoW operation to falsely believe that the underlying memory was not
writable.

I can't find any users at all of this mechanism, so just remove it.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Acked-by: Stas Sergeev 
Link: 
https://lkml.kernel.org/r/f3086de0babcab36f69949b5780bde851f719bc8.1611078018.git.l...@kernel.org
---
 arch/x86/include/asm/vm86.h  |  1 +-
 arch/x86/include/uapi/asm/vm86.h |  4 +-
 arch/x86/kernel/vm86_32.c| 62 +++
 arch/x86/mm/fault.c  | 30 +---
 4 files changed, 16 insertions(+), 81 deletions(-)

diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 26efbec..9e8ac50 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -36,7 +36,6 @@ struct vm86 {
unsigned long saved_sp0;
 
unsigned long flags;
-   unsigned long screen_bitmap;
unsigned long cpu_type;
struct revectored_struct int_revectored;
struct revectored_struct int21_revectored;
diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h
index d2ee4e3..18909b8 100644
--- a/arch/x86/include/uapi/asm/vm86.h
+++ b/arch/x86/include/uapi/asm/vm86.h
@@ -97,7 +97,7 @@ struct revectored_struct {
 struct vm86_struct {
struct vm86_regs regs;
unsigned long flags;
-   unsigned long screen_bitmap;
+   unsigned long screen_bitmap;/* unused, preserved by vm86() 
*/
unsigned long cpu_type;
struct revectored_struct int_revectored;
struct revectored_struct int21_revectored;
@@ -106,7 +106,7 @@ struct vm86_struct {
 /*
  * flags masks
  */
-#define VM86_SCREEN_BITMAP 0x0001
+#define VM86_SCREEN_BITMAP 0x0001/* no longer supported */
 
 struct vm86plus_info_struct {
unsigned long force_return_for_pic:1;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 764573d..e5a7a10 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -134,7 +134,11 @@ void save_v86_state(struct kernel_vm86_regs *regs, int 
retval)
	unsafe_put_user(regs->ds, &user->regs.ds, Efault_end);
	unsafe_put_user(regs->fs, &user->regs.fs, Efault_end);
	unsafe_put_user(regs->gs, &user->regs.gs, Efault_end);
-   unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end);
+
+   /*
+* Don't write screen_bitmap in case some user had a value there
+* and expected it to remain unchanged.
+*/
 
user_access_end();
 
@@ -160,49 +164,6 @@ Efault:
do_exit(SIGSEGV);
 }
 
-static void mark_screen_rdonly(struct mm_struct *mm)
-{
-   struct vm_area_struct *vma;
-   spinlock_t *ptl;
-   pgd_t *pgd;
-   p4d_t *p4d;
-   pud_t *pud;
-   pmd_t *pmd;
-   pte_t *pte;
-   int i;
-
-   mmap_write_lock(mm);
-   pgd = pgd_offset(mm, 0xA0000);
-   if (pgd_none_or_clear_bad(pgd))
-   goto out;
-   p4d = p4d_offset(pgd, 0xA0000);
-   if (p4d_none_or_clear_bad(p4d))
-   goto out;
-   pud = pud_offset(p4d, 0xA0000);
-   if (pud_none_or_clear_bad(pud))
-   goto out;
-   pmd = pmd_offset(pud, 0xA0000);
-
-   if (pmd_trans_huge(*pmd)) {
-   vma = find_vma(mm, 0xA0000);
-   split_huge_pmd(vma, pmd, 0xA0000);
-   }
-   if (pmd_none_or_clear_bad(pmd))
-   goto out;
-   pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
-   for (i = 0; i < 32; i++) {
-   if (pte_present(*pte))
-   set_pte(pte, pte_wrprotect(*pte));
-   pte++;
-   }
-   pte_unmap_unlock(pte, ptl);
-out:
-   mmap_write_unlock(mm);
-   flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
-}
-
-
-
 static int do_vm86_irq_handling(int subfunction, int irqnumber);
 static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
 
@@ -282,6 +243,15 @@ static long do_sys_vm86(struct vm86plus_struct __user 
*user_vm86, bool plus)
offsetof(struct vm86_struct, int_revectored)))
return -EFAULT;
 
+
+   /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */
+   if (v.flags & 

[tip: x86/urgent] x86/fpu: Add kernel_fpu_begin_mask() to selectively initialize state

2021-01-21 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: e45122893a9870813f9bd7b4add4f613e6f29008
Gitweb:
https://git.kernel.org/tip/e45122893a9870813f9bd7b4add4f613e6f29008
Author:Andy Lutomirski 
AuthorDate:Wed, 20 Jan 2021 21:09:48 -08:00
Committer: Borislav Petkov 
CommitterDate: Thu, 21 Jan 2021 12:07:28 +01:00

x86/fpu: Add kernel_fpu_begin_mask() to selectively initialize state

Currently, requesting kernel FPU access doesn't distinguish which parts of
the extended ("FPU") state are needed.  This is nice for simplicity, but
there are a few cases in which it's suboptimal:

 - The vast majority of in-kernel FPU users want XMM/YMM/ZMM state but do
   not use legacy 387 state.  These users want MXCSR initialized but don't
   care about the FPU control word.  Skipping FNINIT would save time.
   (Empirically, FNINIT is several times slower than LDMXCSR.)

 - Code that wants MMX doesn't want or need MXCSR initialized.
   _mmx_memcpy(), for example, can run before CR4.OSFXSR gets set, and
   initializing MXCSR will fail because LDMXCSR generates an #UD when the
   aforementioned CR4 bit is not set.

 - Any future in-kernel users of XFD (eXtended Feature Disable)-capable
   dynamic states will need special handling.

Add a more specific API that allows callers to specify exactly what they
want.
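
As a hedged usage sketch (the caller below is hypothetical and not part
of this patch): an SSE-only user can now ask for MXCSR initialization
alone and skip the comparatively slow FNINIT.

  /* Hypothetical in-kernel caller that only touches XMM state. */
  static void sse_only_op(void *dst, const void *src, size_t len)
  {
          kernel_fpu_begin_mask(KFPU_MXCSR);      /* no KFPU_387, no FNINIT */

          /* ... XMM-based processing of dst/src/len would go here ... */

          kernel_fpu_end();
  }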

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Tested-by: Krzysztof Piotr Olędzki 
Link: 
https://lkml.kernel.org/r/aff1cac8b8fc7ee900cf73e8f2369966621b053f.1611205691.git.l...@kernel.org
---
 arch/x86/include/asm/fpu/api.h | 15 +--
 arch/x86/kernel/fpu/core.c |  9 +
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index a5aba4a..67a4f1c 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -16,14 +16,25 @@
  * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
  * disables preemption so be careful if you intend to use it for long periods
  * of time.
- * If you intend to use the FPU in softirq you need to check first with
+ * If you intend to use the FPU in irq/softirq you need to check first with
  * irq_fpu_usable() if it is possible.
  */
-extern void kernel_fpu_begin(void);
+
+/* Kernel FPU states to initialize in kernel_fpu_begin_mask() */
+#define KFPU_387   _BITUL(0)   /* 387 state will be initialized */
+#define KFPU_MXCSR _BITUL(1)   /* MXCSR will be initialized */
+
+extern void kernel_fpu_begin_mask(unsigned int kfpu_mask);
 extern void kernel_fpu_end(void);
 extern bool irq_fpu_usable(void);
 extern void fpregs_mark_activate(void);
 
+/* Code that is unaware of kernel_fpu_begin_mask() can use this */
+static inline void kernel_fpu_begin(void)
+{
+   kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
+}
+
 /*
  * Use fpregs_lock() while editing CPU's FPU registers or fpu->state.
  * A context switch will (and softirq might) save CPU's FPU registers to
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index eb86a2b..571220a 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -121,7 +121,7 @@ int copy_fpregs_to_fpstate(struct fpu *fpu)
 }
 EXPORT_SYMBOL(copy_fpregs_to_fpstate);
 
-void kernel_fpu_begin(void)
+void kernel_fpu_begin_mask(unsigned int kfpu_mask)
 {
preempt_disable();
 
@@ -141,13 +141,14 @@ void kernel_fpu_begin(void)
}
__cpu_invalidate_fpregs_state();
 
-   if (boot_cpu_has(X86_FEATURE_XMM))
+   /* Put sane initial values into the control registers. */
+   if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
ldmxcsr(MXCSR_DEFAULT);
 
-   if (boot_cpu_has(X86_FEATURE_FPU))
+   if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
asm volatile ("fninit");
 }
-EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);
 
 void kernel_fpu_end(void)
 {


[tip: x86/urgent] x86/mmx: Use KFPU_387 for MMX string operations

2021-01-21 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 67de8dca50c027ca0fa3b62a488ee5035036a0da
Gitweb:
https://git.kernel.org/tip/67de8dca50c027ca0fa3b62a488ee5035036a0da
Author:Andy Lutomirski 
AuthorDate:Wed, 20 Jan 2021 21:09:49 -08:00
Committer: Borislav Petkov 
CommitterDate: Thu, 21 Jan 2021 13:39:36 +01:00

x86/mmx: Use KFPU_387 for MMX string operations

The default kernel_fpu_begin() doesn't work on systems that support XMM but
haven't yet enabled CR4.OSFXSR.  This causes crashes when _mmx_memcpy() is
called too early because LDMXCSR generates #UD when the aforementioned bit
is clear.

Fix it by using kernel_fpu_begin_mask(KFPU_387) explicitly.

Fixes: 7ad816762f9b ("x86/fpu: Reset MXCSR to default in kernel_fpu_begin()")
Reported-by: Krzysztof Mazur 
Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Tested-by: Krzysztof Piotr Olędzki 
Tested-by: Krzysztof Mazur 
Cc: 
Link: 
https://lkml.kernel.org/r/e7bf21855fe99e5f3baa27446e32623358f69e8d.1611205691.git.l...@kernel.org
---
 arch/x86/lib/mmx_32.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
index 4321fa0..419365c 100644
--- a/arch/x86/lib/mmx_32.c
+++ b/arch/x86/lib/mmx_32.c
@@ -26,6 +26,16 @@
 #include 
 #include 
 
+/*
+ * Use KFPU_387.  MMX instructions are not affected by MXCSR,
+ * but both AMD and Intel documentation states that even integer MMX
+ * operations will result in #MF if an exception is pending in FCW.
+ *
+ * EMMS is not needed afterwards because, after calling kernel_fpu_end(),
+ * any subsequent user of the 387 stack will reinitialize it using
+ * KFPU_387.
+ */
+
 void *_mmx_memcpy(void *to, const void *from, size_t len)
 {
void *p;
@@ -37,7 +47,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
p = to;
i = len >> 6; /* len/64 */
 
-   kernel_fpu_begin();
+   kernel_fpu_begin_mask(KFPU_387);
 
__asm__ __volatile__ (
"1: prefetch (%0)\n"/* This set is 28 bytes */
@@ -127,7 +137,7 @@ static void fast_clear_page(void *page)
 {
int i;
 
-   kernel_fpu_begin();
+   kernel_fpu_begin_mask(KFPU_387);
 
__asm__ __volatile__ (
"  pxor %%mm0, %%mm0\n" : :
@@ -160,7 +170,7 @@ static void fast_copy_page(void *to, void *from)
 {
int i;
 
-   kernel_fpu_begin();
+   kernel_fpu_begin_mask(KFPU_387);
 
/*
 * maybe the prefetch stuff can go before the expensive fnsave...
@@ -247,7 +257,7 @@ static void fast_clear_page(void *page)
 {
int i;
 
-   kernel_fpu_begin();
+   kernel_fpu_begin_mask(KFPU_387);
 
__asm__ __volatile__ (
"  pxor %%mm0, %%mm0\n" : :
@@ -282,7 +292,7 @@ static void fast_copy_page(void *to, void *from)
 {
int i;
 
-   kernel_fpu_begin();
+   kernel_fpu_begin_mask(KFPU_387);
 
__asm__ __volatile__ (
"1: prefetch (%0)\n"


[tip: x86/misc] selftests/x86: Use __builtin_ia32_read/writeeflags

2021-01-12 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/misc branch of tip:

Commit-ID: 9297e602adf8d5587d83941c48e4dbae46c8df5f
Gitweb:
https://git.kernel.org/tip/9297e602adf8d5587d83941c48e4dbae46c8df5f
Author:Andy Lutomirski 
AuthorDate:Mon, 02 Nov 2020 11:54:02 -08:00
Committer: Borislav Petkov 
CommitterDate: Tue, 12 Jan 2021 12:31:28 +01:00

selftests/x86: Use __builtin_ia32_read/writeeflags

The asm to read and write EFLAGS from userspace is horrible.  The
compiler builtins are now available on all supported compilers, so
use them instead.

(The compiler builtins are also unnecessarily ugly, but that's a
 more manageable level of ugliness.)
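
For reference, a minimal user-space example of the builtins this change
switches to (hypothetical test, 64-bit build assumed; the builtins are
provided by both GCC and Clang):

  #include <stdio.h>

  #define X86_EFLAGS_AC (1UL << 18)

  int main(void)
  {
          unsigned long flags = __builtin_ia32_readeflags_u64();

          /* Set AC, write EFLAGS back, then read it again to confirm. */
          __builtin_ia32_writeeflags_u64(flags | X86_EFLAGS_AC);
          printf("AC is %s\n",
                 (__builtin_ia32_readeflags_u64() & X86_EFLAGS_AC) ?
                 "set" : "clear");
          return 0;
  }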

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/aee4b1cdfc56083eb779ce927b7d3459aad2af76.1604346818.git.l...@kernel.org
---
 tools/testing/selftests/x86/helpers.h | 24 
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/tools/testing/selftests/x86/helpers.h 
b/tools/testing/selftests/x86/helpers.h
index f5ff2a2..4ef42c4 100644
--- a/tools/testing/selftests/x86/helpers.h
+++ b/tools/testing/selftests/x86/helpers.h
@@ -6,36 +6,20 @@
 
 static inline unsigned long get_eflags(void)
 {
-   unsigned long eflags;
-
-   asm volatile (
 #ifdef __x86_64__
-   "subq $128, %%rsp\n\t"
-   "pushfq\n\t"
-   "popq %0\n\t"
-   "addq $128, %%rsp"
+   return __builtin_ia32_readeflags_u64();
 #else
-   "pushfl\n\t"
-   "popl %0"
+   return __builtin_ia32_readeflags_u32();
 #endif
-   : "=r" (eflags) :: "memory");
-
-   return eflags;
 }
 
 static inline void set_eflags(unsigned long eflags)
 {
-   asm volatile (
 #ifdef __x86_64__
-   "subq $128, %%rsp\n\t"
-   "pushq %0\n\t"
-   "popfq\n\t"
-   "addq $128, %%rsp"
+   __builtin_ia32_writeeflags_u64(eflags);
 #else
-   "pushl %0\n\t"
-   "popfl"
+   __builtin_ia32_writeeflags_u32(eflags);
 #endif
-   :: "r" (eflags) : "flags", "memory");
 }
 
 #endif /* __SELFTESTS_X86_HELPERS_H */


[tip: x86/urgent] x86/membarrier: Get rid of a dubious optimization

2020-12-09 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: a493d1ca1a03b532871f1da27f8dbda2b28b04c4
Gitweb:
https://git.kernel.org/tip/a493d1ca1a03b532871f1da27f8dbda2b28b04c4
Author:Andy Lutomirski 
AuthorDate:Thu, 03 Dec 2020 21:07:03 -08:00
Committer: Thomas Gleixner 
CommitterDate: Wed, 09 Dec 2020 09:37:42 +01:00

x86/membarrier: Get rid of a dubious optimization

sync_core_before_usermode() had an incorrect optimization.  If the kernel
returns from an interrupt, it can get to usermode without IRET. It just has
to schedule to a different task in the same mm and do SYSRET.  Fortunately,
there were no callers of sync_core_before_usermode() that could have had
in_irq() or in_nmi() equal to true, because it's only ever called from the
scheduler.

While at it, clarify a related comment.

Fixes: 70216e18e519 ("membarrier: Provide core serializing command, 
*_SYNC_CORE")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Mathieu Desnoyers 
Cc: sta...@vger.kernel.org
Link: 
https://lore.kernel.org/r/5afc7632be1422f91eaf76111b5b8580a086.1607058304.git.l...@kernel.org

---
 arch/x86/include/asm/sync_core.h |  9 +
 arch/x86/mm/tlb.c| 10 --
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h
index 0fd4a9d..ab7382f 100644
--- a/arch/x86/include/asm/sync_core.h
+++ b/arch/x86/include/asm/sync_core.h
@@ -98,12 +98,13 @@ static inline void sync_core_before_usermode(void)
/* With PTI, we unconditionally serialize before running user code. */
if (static_cpu_has(X86_FEATURE_PTI))
return;
+
/*
-* Return from interrupt and NMI is done through iret, which is core
-* serializing.
+* Even if we're in an interrupt, we might reschedule before returning,
+* in which case we could switch to a different thread in the same mm
+* and return using SYSRET or SYSEXIT.  Instead of trying to keep
+* track of our need to sync the core, just sync right away.
 */
-   if (in_irq() || in_nmi())
-   return;
sync_core();
 }
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 11666ba..569ac1d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -474,8 +474,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
/*
 * The membarrier system call requires a full memory barrier and
 * core serialization before returning to user-space, after
-* storing to rq->curr. Writing to CR3 provides that full
-* memory barrier and core serializing instruction.
+* storing to rq->curr, when changing mm.  This is because
+* membarrier() sends IPIs to all CPUs that are in the target mm
+* to make them issue memory barriers.  However, if another CPU
+* switches to/from the target mm concurrently with
+* membarrier(), it can cause that CPU not to receive an IPI
+* when it really should issue a memory barrier.  Writing to CR3
+* provides that full memory barrier and core serializing
+* instruction.
 */
if (real_prev == next) {
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=


[tip: x86/urgent] membarrier: Execute SYNC_CORE on the calling thread

2020-12-09 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: e45cdc71d1fa5ac3a57b23acc31eb959e4f60135
Gitweb:
https://git.kernel.org/tip/e45cdc71d1fa5ac3a57b23acc31eb959e4f60135
Author:Andy Lutomirski 
AuthorDate:Thu, 03 Dec 2020 21:07:06 -08:00
Committer: Thomas Gleixner 
CommitterDate: Wed, 09 Dec 2020 09:37:43 +01:00

membarrier: Execute SYNC_CORE on the calling thread

membarrier()'s MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is documented as
syncing the core on all sibling threads but not necessarily the calling
thread.  This behavior is fundamentally buggy and cannot be used safely.

Suppose a user program has two threads.  Thread A is on CPU 0 and thread B
is on CPU 1.  Thread A modifies some text and calls
membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE).

Then thread B executes the modified code.  If, at any point after
membarrier() decides which CPUs to target, thread A could be preempted and
replaced by thread B on CPU 0.  This could even happen on exit from the
membarrier() syscall.  If this happens, thread B will end up running on CPU
0 without having synced.

In principle, this could be fixed by arranging for the scheduler to issue
sync_core_before_usermode() whenever switching between two threads in the
same mm if there is any possibility of a concurrent membarrier() call, but
this would have considerable overhead.  Instead, make membarrier() sync the
calling CPU as well.

As an optimization, this avoids an extra smp_mb() in the default
barrier-only mode and an extra rseq preempt on the caller.
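
For context, a hedged user-space sketch of the pattern SYNC_CORE exists
for (JIT-style cross-modifying code; the function pointer is a
placeholder, and the process is assumed to have registered with
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE beforehand):

  #include <linux/membarrier.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static void publish_new_code(void (*patch_code)(void))
  {
          patch_code();   /* rewrite executable memory */

          /*
           * After this call, every thread in the mm -- now including the
           * calling thread itself -- executes a core-serializing
           * instruction before it next runs user code, so any thread may
           * safely jump into the freshly written instructions.
           */
          syscall(__NR_membarrier,
                  MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0);
  }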

Fixes: 70216e18e519 ("membarrier: Provide core serializing command, 
*_SYNC_CORE")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Mathieu Desnoyers 
Link: 
https://lore.kernel.org/r/250ded637696d490c69bef1877148db86066881c.1607058304.git.l...@kernel.org

---
 kernel/sched/membarrier.c | 51 --
 1 file changed, 33 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 1c278df..9d8df34 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -194,7 +194,8 @@ static int membarrier_private_expedited(int flags, int 
cpu_id)
return -EPERM;
}
 
-   if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
+   if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
+   (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
return 0;
 
/*
@@ -213,8 +214,6 @@ static int membarrier_private_expedited(int flags, int 
cpu_id)
 
if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
goto out;
-   if (cpu_id == raw_smp_processor_id())
-   goto out;
rcu_read_lock();
p = rcu_dereference(cpu_rq(cpu_id)->curr);
if (!p || p->mm != mm) {
@@ -229,16 +228,6 @@ static int membarrier_private_expedited(int flags, int 
cpu_id)
for_each_online_cpu(cpu) {
struct task_struct *p;
 
-   /*
-* Skipping the current CPU is OK even through we can be
-* migrated at any point. The current CPU, at the point
-* where we read raw_smp_processor_id(), is ensured to
-* be in program order with respect to the caller
-* thread. Therefore, we can skip this CPU from the
-* iteration.
-*/
-   if (cpu == raw_smp_processor_id())
-   continue;
p = rcu_dereference(cpu_rq(cpu)->curr);
if (p && p->mm == mm)
__cpumask_set_cpu(cpu, tmpmask);
@@ -246,12 +235,38 @@ static int membarrier_private_expedited(int flags, int 
cpu_id)
rcu_read_unlock();
}
 
-   preempt_disable();
-   if (cpu_id >= 0)
+   if (cpu_id >= 0) {
+   /*
+* smp_call_function_single() will call ipi_func() if cpu_id
+* is the calling CPU.
+*/
smp_call_function_single(cpu_id, ipi_func, NULL, 1);
-   else
-   smp_call_function_many(tmpmask, ipi_func, NULL, 1);
-   preempt_enable();
+   } else {
+   /*
+* For regular membarrier, we can save a few cycles by
+* skipping the current cpu -- we're about to do smp_mb()
+* below, and if we migrate to a different cpu, this cpu
+* and the new cpu will execute a full barrier in the
+* scheduler.
+*
+* For SYNC_CORE, we do need a barrier on the current cpu --
+* otherwise, if we are migrated and replaced by a different
+* task in the same mm just before, during, 

[tip: x86/urgent] membarrier: Add an actual barrier before rseq_preempt()

2020-12-09 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 2ecedd7569080fd05c1a457e8af2165afecfa29f
Gitweb:
https://git.kernel.org/tip/2ecedd7569080fd05c1a457e8af2165afecfa29f
Author:Andy Lutomirski 
AuthorDate:Thu, 03 Dec 2020 21:07:04 -08:00
Committer: Thomas Gleixner 
CommitterDate: Wed, 09 Dec 2020 09:37:43 +01:00

membarrier: Add an actual barrier before rseq_preempt()

It seems that most RSEQ membarrier users will expect any stores done before
the membarrier() syscall to be visible to the target task(s).  While this
is extremely likely to be true in practice, nothing actually guarantees it
by a strict reading of the x86 manuals.  Rather than providing this
guarantee by accident and potentially causing a problem down the road, just
add an explicit barrier.

Fixes: 70216e18e519 ("membarrier: Provide core serializing command, 
*_SYNC_CORE")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Mathieu Desnoyers 
Cc: sta...@vger.kernel.org
Link: 
https://lore.kernel.org/r/d3e7197e034fa4852afcf370ca49c30496e58e40.1607058304.git.l...@kernel.org

---
 kernel/sched/membarrier.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index e23e74d..7d98ef5 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -40,6 +40,14 @@ static void ipi_mb(void *info)
 
 static void ipi_rseq(void *info)
 {
+   /*
+* Ensure that all stores done by the calling thread are visible
+* to the current task before the current task resumes.  We could
+* probably optimize this away on most architectures, but by the
+* time we've already sent an IPI, the cost of the extra smp_mb()
+* is negligible.
+*/
+   smp_mb();
rseq_preempt(current);
 }
 


[tip: x86/urgent] membarrier: Explicitly sync remote cores when SYNC_CORE is requested

2020-12-09 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 758c9373d84168dc7d039cf85a0e920046b17b41
Gitweb:
https://git.kernel.org/tip/758c9373d84168dc7d039cf85a0e920046b17b41
Author:Andy Lutomirski 
AuthorDate:Thu, 03 Dec 2020 21:07:05 -08:00
Committer: Thomas Gleixner 
CommitterDate: Wed, 09 Dec 2020 09:37:43 +01:00

membarrier: Explicitly sync remote cores when SYNC_CORE is requested

membarrier() does not explicitly sync_core() remote CPUs; instead, it
relies on the assumption that an IPI will result in a core sync.  On x86,
this may be true in practice, but it's not architecturally reliable.  In
particular, the SDM and APM do not appear to guarantee that interrupt
delivery is serializing.  While IRET does serialize, IPI return can
schedule, thereby switching to another task in the same mm that was
sleeping in a syscall.  The new task could then SYSRET back to usermode
without ever executing IRET.

Make this more robust by explicitly calling sync_core_before_usermode()
on remote cores.  (This also helps people who search the kernel tree for
instances of sync_core() and sync_core_before_usermode() -- one might be
surprised that the core membarrier code doesn't currently show up in a
such a search.)

Fixes: 70216e18e519 ("membarrier: Provide core serializing command, 
*_SYNC_CORE")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Mathieu Desnoyers 
Cc: sta...@vger.kernel.org
Link: 
https://lore.kernel.org/r/776b448d5f7bd6b12690707f5ed67bcda7f1d427.1607058304.git.l...@kernel.org

---
 kernel/sched/membarrier.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 7d98ef5..1c278df 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -38,6 +38,23 @@ static void ipi_mb(void *info)
smp_mb();   /* IPIs should be serializing but paranoid. */
 }
 
+static void ipi_sync_core(void *info)
+{
+   /*
+* The smp_mb() in membarrier after all the IPIs is supposed to
+* ensure that memory on remote CPUs that occur before the IPI
+* become visible to membarrier()'s caller -- see scenario B in
+* the big comment at the top of this file.
+*
+* A sync_core() would provide this guarantee, but
+* sync_core_before_usermode() might end up being deferred until
+* after membarrier()'s smp_mb().
+*/
+   smp_mb();   /* IPIs should be serializing but paranoid. */
+
+   sync_core_before_usermode();
+}
+
 static void ipi_rseq(void *info)
 {
/*
@@ -162,6 +179,7 @@ static int membarrier_private_expedited(int flags, int 
cpu_id)
	if (!(atomic_read(&mm->membarrier_state) &
  MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
+   ipi_func = ipi_sync_core;
} else if (flags == MEMBARRIER_FLAG_RSEQ) {
if (!IS_ENABLED(CONFIG_RSEQ))
return -EINVAL;


[tip: x86/misc] selftests/x86/fsgsbase: Fix GS == 1, 2, and 3 tests

2020-11-24 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/misc branch of tip:

Commit-ID: 716572b0003ef67a4889bd7d85baf5099c5a0248
Gitweb:
https://git.kernel.org/tip/716572b0003ef67a4889bd7d85baf5099c5a0248
Author:Andy Lutomirski 
AuthorDate:Mon, 02 Nov 2020 11:51:10 -08:00
Committer: Borislav Petkov 
CommitterDate: Tue, 24 Nov 2020 13:46:16 +01:00

selftests/x86/fsgsbase: Fix GS == 1, 2, and 3 tests

Setting GS to 1, 2, or 3 causes a nonsensical part of the IRET microcode
to change GS back to zero on a return from kernel mode to user mode. The
result is that these tests fail randomly depending on when interrupts
happen. Detect when this happens and let the test pass.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/7567fd44a1d60a9424f25b19a998f12149993b0d.1604346596.git.l...@kernel.org
---
 tools/testing/selftests/x86/fsgsbase.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/x86/fsgsbase.c 
b/tools/testing/selftests/x86/fsgsbase.c
index 7161cfc..8c780cc 100644
--- a/tools/testing/selftests/x86/fsgsbase.c
+++ b/tools/testing/selftests/x86/fsgsbase.c
@@ -392,8 +392,8 @@ static void set_gs_and_switch_to(unsigned long local,
local = read_base(GS);
 
/*
-* Signal delivery seems to mess up weird selectors.  Put it
-* back.
+* Signal delivery is quite likely to change a selector
+* of 1, 2, or 3 back to 0 due to IRET being defective.
 */
asm volatile ("mov %0, %%gs" : : "rm" (force_sel));
} else {
@@ -411,6 +411,14 @@ static void set_gs_and_switch_to(unsigned long local,
if (base == local && sel_pre_sched == sel_post_sched) {
printf("[OK]\tGS/BASE remained 0x%hx/0x%lx\n",
   sel_pre_sched, local);
+   } else if (base == local && sel_pre_sched >= 1 && sel_pre_sched <= 3 &&
+  sel_post_sched == 0) {
+   /*
+* IRET is misdesigned and will squash selectors 1, 2, or 3
+* to zero.  Don't fail the test just because this happened.
+*/
+   printf("[OK]\tGS/BASE changed from 0x%hx/0x%lx to 0x%hx/0x%lx 
because IRET is defective\n",
+  sel_pre_sched, local, sel_post_sched, base);
} else {
nerrs++;
printf("[FAIL]\tGS/BASE changed from 0x%hx/0x%lx to 
0x%hx/0x%lx\n",


[tip: x86/misc] selftests/x86: Add missing .note.GNU-stack sections

2020-11-24 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/misc branch of tip:

Commit-ID: aeaaf005da1de075929e56562dced4a58238efc4
Gitweb:
https://git.kernel.org/tip/aeaaf005da1de075929e56562dced4a58238efc4
Author:Andy Lutomirski 
AuthorDate:Mon, 02 Nov 2020 11:51:11 -08:00
Committer: Borislav Petkov 
CommitterDate: Tue, 24 Nov 2020 13:55:39 +01:00

selftests/x86: Add missing .note.GNU-stack sections

Several of the x86 selftests end up with executable stacks because
the asm was missing the annotation that says that they are modern
and don't need executable stacks.  Add the annotations.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/6f043c03e9e0e4557e1e975a63b07a4d18965a68.1604346596.git.l...@kernel.org
---
 tools/testing/selftests/x86/raw_syscall_helper_32.S | 2 ++
 tools/testing/selftests/x86/thunks.S| 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tools/testing/selftests/x86/raw_syscall_helper_32.S 
b/tools/testing/selftests/x86/raw_syscall_helper_32.S
index 94410fa..a10d36a 100644
--- a/tools/testing/selftests/x86/raw_syscall_helper_32.S
+++ b/tools/testing/selftests/x86/raw_syscall_helper_32.S
@@ -45,3 +45,5 @@ int80_and_ret:
 
.type int80_and_ret, @function
.size int80_and_ret, .-int80_and_ret
+
+.section .note.GNU-stack,"",%progbits
diff --git a/tools/testing/selftests/x86/thunks.S 
b/tools/testing/selftests/x86/thunks.S
index 1bb5d62..a2d47d8 100644
--- a/tools/testing/selftests/x86/thunks.S
+++ b/tools/testing/selftests/x86/thunks.S
@@ -57,3 +57,5 @@ call32_from_64:
ret
 
 .size call32_from_64, .-call32_from_64
+
+.section .note.GNU-stack,"",%progbits


[tip: x86/urgent] x86/syscalls: Document the fact that syscalls 512-547 are a legacy mistake

2020-10-14 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: c3b484c439b0bab7a698495f33ef16286a1000c4
Gitweb:
https://git.kernel.org/tip/c3b484c439b0bab7a698495f33ef16286a1000c4
Author:Andy Lutomirski 
AuthorDate:Sun, 11 Oct 2020 19:51:21 -07:00
Committer: Ingo Molnar 
CommitterDate: Wed, 14 Oct 2020 19:53:40 +02:00

x86/syscalls: Document the fact that syscalls 512-547 are a legacy mistake

Since this commit:

  6365b842aae4 ("x86/syscalls: Split the x32 syscalls into their own table")

there is no need for special x32-specific syscall numbers.  I forgot to
update the comments in syscall_64.tbl.  Add comments to make it clear to
future contributors that this range is a legacy wart.
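
For background (not part of the patch): x32 programs invoke syscalls with
__X32_SYSCALL_BIT (bit 30) ORed into the number, so the x32 rt_sigaction
entry above is reached as 0x40000000 + 512.

  /* Illustrative helper showing how an x32 syscall number is formed. */
  #define __X32_SYSCALL_BIT       0x40000000

  static inline long x32_syscall_nr(long nr)      /* e.g. 512..547 */
  {
          return __X32_SYSCALL_BIT | nr;
  }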

Reported-by: Jessica Clarke 
Signed-off-by: Andy Lutomirski 
Signed-off-by: Ingo Molnar 
Link: 
https://lore.kernel.org/r/6c56fb4ddd18fc60a238eb4d867e4b3d97c6351e.1602471055.git.l...@kernel.org
---
 arch/x86/entry/syscalls/syscall_64.tbl | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index f30d6ae..4adb5d2 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -363,10 +363,10 @@
 439common  faccessat2  sys_faccessat2
 
 #
-# x32-specific system call numbers start at 512 to avoid cache impact
-# for native 64-bit operation. The __x32_compat_sys stubs are created
-# on-the-fly for compat_sys_*() compatibility system calls if X86_X32
-# is defined.
+# Due to a historical design error, certain syscalls are numbered differently
+# in x32 as compared to native x86_64.  These syscalls have numbers 512-547.
+# Do not add new syscalls to this range.  Numbers 548 and above are available
+# for non-x32 use.
 #
 512x32 rt_sigactioncompat_sys_rt_sigaction
 513x32 rt_sigreturncompat_sys_x32_rt_sigreturn
@@ -404,3 +404,5 @@
 545x32 execveatcompat_sys_execveat
 546x32 preadv2 compat_sys_preadv64v2
 547x32 pwritev2compat_sys_pwritev64v2
+# This is the end of the legacy x32 range.  Numbers 548 and above are
+# not special and are not to be used for x32-specific syscalls.


[tip: x86/urgent] x86/debug: Allow a single level of #DB recursion

2020-09-04 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: d5c678aed5eddb944b8e7ce451b107b39245962d
Gitweb:
https://git.kernel.org/tip/d5c678aed5eddb944b8e7ce451b107b39245962d
Author:Andy Lutomirski 
AuthorDate:Wed, 02 Sep 2020 15:25:51 +02:00
Committer: Thomas Gleixner 
CommitterDate: Fri, 04 Sep 2020 15:09:29 +02:00

x86/debug: Allow a single level of #DB recursion

Trying to clear DR7 around a #DB from usermode malfunctions if the task
schedules when delivering SIGTRAP.

Rather than trying to define a special no-recursion region, just allow a
single level of recursion.  The same mechanism is used for NMI, and it
hasn't caused any problems yet.

Fixes: 9f58fdde95c9 ("x86/db: Split out dr6/7 handling")
Reported-by: Kyle Huey 
Debugged-by: Josh Poimboeuf 
Signed-off-by: Andy Lutomirski 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Tested-by: Daniel Thompson 
Cc: sta...@vger.kernel.org
Link: 
https://lkml.kernel.org/r/8b9bd05f187231df008d48cf818a6a311cbd5c98.1597882384.git.l...@kernel.org
Link: https://lore.kernel.org/r/20200902133200.726584...@infradead.org

---
 arch/x86/kernel/traps.c | 65 +++-
 1 file changed, 31 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1f66d2d..81a2fb7 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -729,20 +729,9 @@ static bool is_sysenter_singlestep(struct pt_regs *regs)
 #endif
 }
 
-static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
+static __always_inline unsigned long debug_read_clear_dr6(void)
 {
-   /*
-* Disable breakpoints during exception handling; recursive exceptions
-* are exceedingly 'fun'.
-*
-* Since this function is NOKPROBE, and that also applies to
-* HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
-* HW_BREAKPOINT_W on our stack)
-*
-* Entry text is excluded for HW_BP_X and cpu_entry_area, which
-* includes the entry stack is excluded for everything.
-*/
-   *dr7 = local_db_save();
+   unsigned long dr6;
 
/*
 * The Intel SDM says:
@@ -755,15 +744,12 @@ static __always_inline void debug_enter(unsigned long 
*dr6, unsigned long *dr7)
 *
 * Keep it simple: clear DR6 immediately.
 */
-   get_debugreg(*dr6, 6);
+   get_debugreg(dr6, 6);
set_debugreg(0, 6);
/* Filter out all the reserved bits which are preset to 1 */
-   *dr6 &= ~DR6_RESERVED;
-}
+   dr6 &= ~DR6_RESERVED;
 
-static __always_inline void debug_exit(unsigned long dr7)
-{
-   local_db_restore(dr7);
+   return dr6;
 }
 
 /*
@@ -863,6 +849,18 @@ out:
 static __always_inline void exc_debug_kernel(struct pt_regs *regs,
 unsigned long dr6)
 {
+   /*
+* Disable breakpoints during exception handling; recursive exceptions
+* are exceedingly 'fun'.
+*
+* Since this function is NOKPROBE, and that also applies to
+* HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
+* HW_BREAKPOINT_W on our stack)
+*
+* Entry text is excluded for HW_BP_X and cpu_entry_area, which
+* includes the entry stack is excluded for everything.
+*/
+   unsigned long dr7 = local_db_save();
bool irq_state = idtentry_enter_nmi(regs);
instrumentation_begin();
 
@@ -883,6 +881,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs 
*regs,
 
instrumentation_end();
idtentry_exit_nmi(regs, irq_state);
+
+   local_db_restore(dr7);
 }
 
 static __always_inline void exc_debug_user(struct pt_regs *regs,
@@ -894,6 +894,15 @@ static __always_inline void exc_debug_user(struct pt_regs 
*regs,
 */
WARN_ON_ONCE(!user_mode(regs));
 
+   /*
+* NB: We can't easily clear DR7 here because
+* idtentry_exit_to_usermode() can invoke ptrace, schedule, access
+* user memory, etc.  This means that a recursive #DB is possible.  If
+* this happens, that #DB will hit exc_debug_kernel() and clear DR7.
+* Since we're not on the IST stack right now, everything will be
+* fine.
+*/
+
irqentry_enter_from_user_mode(regs);
instrumentation_begin();
 
@@ -907,36 +916,24 @@ static __always_inline void exc_debug_user(struct pt_regs 
*regs,
 /* IST stack entry */
 DEFINE_IDTENTRY_DEBUG(exc_debug)
 {
-   unsigned long dr6, dr7;
-
-   debug_enter(&dr6, &dr7);
-   exc_debug_kernel(regs, dr6);
-   debug_exit(dr7);
+   exc_debug_kernel(regs, debug_read_clear_dr6());
 }
 
 /* User entry, runs on regular task stack */
 DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
 {
-   unsigned long dr6, dr7;
-
-   debug_enter(&dr6, &dr7);
-   exc_debug_user(regs, dr6);
-   debug_exit(dr7);
+   

[tip: x86/fsgsbase] selftests/x86/fsgsbase: Reap a forgotten child

2020-08-26 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/fsgsbase branch of tip:

Commit-ID: ab2dd173330a3f07142e68cd65682205036cd00f
Gitweb:
https://git.kernel.org/tip/ab2dd173330a3f07142e68cd65682205036cd00f
Author:Andy Lutomirski 
AuthorDate:Wed, 26 Aug 2020 10:00:45 -07:00
Committer: Ingo Molnar 
CommitterDate: Wed, 26 Aug 2020 20:54:17 +02:00

selftests/x86/fsgsbase: Reap a forgotten child

The ptrace() test forgot to reap its child.  Reap it.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Ingo Molnar 
Link: 
https://lore.kernel.org/r/e7700a503f30e79ab35a63103938a19893dbeff2.1598461151.git.l...@kernel.org
---
 tools/testing/selftests/x86/fsgsbase.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/testing/selftests/x86/fsgsbase.c 
b/tools/testing/selftests/x86/fsgsbase.c
index 9983195..0056e25 100644
--- a/tools/testing/selftests/x86/fsgsbase.c
+++ b/tools/testing/selftests/x86/fsgsbase.c
@@ -517,6 +517,9 @@ static void test_ptrace_write_gsbase(void)
 
 END:
ptrace(PTRACE_CONT, child, NULL, NULL);
+   wait(&status);
+   if (!WIFEXITED(status))
+   printf("[WARN]\tChild didn't exit cleanly.\n");
 }
 
 int main()


[tip: x86/fsgsbase] selftests/x86/fsgsbase: Test PTRACE_PEEKUSER for GSBASE with invalid LDT GS

2020-08-26 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/fsgsbase branch of tip:

Commit-ID: 1b9abd1755ad947d7c9913e92e7837b533124c90
Gitweb:
https://git.kernel.org/tip/1b9abd1755ad947d7c9913e92e7837b533124c90
Author:Andy Lutomirski 
AuthorDate:Wed, 26 Aug 2020 10:00:46 -07:00
Committer: Ingo Molnar 
CommitterDate: Wed, 26 Aug 2020 20:54:18 +02:00

selftests/x86/fsgsbase: Test PTRACE_PEEKUSER for GSBASE with invalid LDT GS

This tests commit:

  8ab49526b53d ("x86/fsgsbase/64: Fix NULL deref in 86_fsgsbase_read_task")

Unpatched kernels will OOPS.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Ingo Molnar 
Cc: sta...@vger.kernel.org
Link: 
https://lore.kernel.org/r/c618ae86d1f757e01b1a8e79869f553cb88acf9a.1598461151.git.l...@kernel.org
---
 tools/testing/selftests/x86/fsgsbase.c | 65 +-
 1 file changed, 65 insertions(+)

diff --git a/tools/testing/selftests/x86/fsgsbase.c 
b/tools/testing/selftests/x86/fsgsbase.c
index 0056e25..7161cfc 100644
--- a/tools/testing/selftests/x86/fsgsbase.c
+++ b/tools/testing/selftests/x86/fsgsbase.c
@@ -443,6 +443,68 @@ static void test_unexpected_base(void)
 
 #define USER_REGS_OFFSET(r) offsetof(struct user_regs_struct, r)
 
+static void test_ptrace_write_gs_read_base(void)
+{
+   int status;
+   pid_t child = fork();
+
+   if (child < 0)
+   err(1, "fork");
+
+   if (child == 0) {
+   printf("[RUN]\tPTRACE_POKE GS, read GSBASE back\n");
+
+   printf("[RUN]\tARCH_SET_GS to 1\n");
+   if (syscall(SYS_arch_prctl, ARCH_SET_GS, 1) != 0)
+   err(1, "ARCH_SET_GS");
+
+   if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0)
+   err(1, "PTRACE_TRACEME");
+
+   raise(SIGTRAP);
+   _exit(0);
+   }
+
+   wait(&status);
+
+   if (WSTOPSIG(status) == SIGTRAP) {
+   unsigned long base;
+   unsigned long gs_offset = USER_REGS_OFFSET(gs);
+   unsigned long base_offset = USER_REGS_OFFSET(gs_base);
+
+   /* Read the initial base.  It should be 1. */
+   base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL);
+   if (base == 1) {
+   printf("[OK]\tGSBASE started at 1\n");
+   } else {
+   nerrs++;
+   printf("[FAIL]\tGSBASE started at 0x%lx\n", base);
+   }
+
+   printf("[RUN]\tSet GS = 0x7, read GSBASE\n");
+
+   /* Poke an LDT selector into GS. */
+   if (ptrace(PTRACE_POKEUSER, child, gs_offset, 0x7) != 0)
+   err(1, "PTRACE_POKEUSER");
+
+   /* And read the base. */
+   base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL);
+
+   if (base == 0 || base == 1) {
+   printf("[OK]\tGSBASE reads as 0x%lx with invalid GS\n", 
base);
+   } else {
+   nerrs++;
+   printf("[FAIL]\tGSBASE=0x%lx (should be 0 or 1)\n", 
base);
+   }
+   }
+
+   ptrace(PTRACE_CONT, child, NULL, NULL);
+
+   wait(&status);
+   if (!WIFEXITED(status))
+   printf("[WARN]\tChild didn't exit cleanly.\n");
+}
+
 static void test_ptrace_write_gsbase(void)
 {
int status;
@@ -529,6 +591,9 @@ int main()
shared_scratch = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
  MAP_ANONYMOUS | MAP_SHARED, -1, 0);
 
+   /* Do these tests before we have an LDT. */
+   test_ptrace_write_gs_read_base();
+
/* Probe FSGSBASE */
sethandler(SIGILL, sigill, 0);
if (sigsetjmp(jmpbuf, 1) == 0) {


[tip: x86/urgent] x86/ioperm: Fix io bitmap invalidation on Xen PV

2020-07-18 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: cadfad870154e14f745ec845708bc17d166065f2
Gitweb:
https://git.kernel.org/tip/cadfad870154e14f745ec845708bc17d166065f2
Author:Andy Lutomirski 
AuthorDate:Fri, 17 Jul 2020 16:53:55 -07:00
Committer: Thomas Gleixner 
CommitterDate: Sat, 18 Jul 2020 12:31:49 +02:00

x86/ioperm: Fix io bitmap invalidation on Xen PV

tss_invalidate_io_bitmap() wasn't wired up properly through the pvop
machinery, so the TSS and Xen's io bitmap would get out of sync
whenever disabling a valid io bitmap.

Add a new pvop for tss_invalidate_io_bitmap() to fix it.

This is XSA-329.

Fixes: 22fe5b0439dd ("x86/ioperm: Move TSS bitmap update to exit to user work")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Juergen Gross 
Reviewed-by: Thomas Gleixner 
Cc: sta...@vger.kernel.org
Link: 
https://lkml.kernel.org/r/d53075590e1f91c19f8af705059d3ff99424c020.1595030016.git.l...@kernel.org

---
 arch/x86/include/asm/io_bitmap.h  | 16 
 arch/x86/include/asm/paravirt.h   |  5 +
 arch/x86/include/asm/paravirt_types.h |  1 +
 arch/x86/kernel/paravirt.c|  3 ++-
 arch/x86/kernel/process.c | 18 ++
 arch/x86/xen/enlighten_pv.c   | 12 
 6 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/io_bitmap.h b/arch/x86/include/asm/io_bitmap.h
index ac1a99f..7f080f5 100644
--- a/arch/x86/include/asm/io_bitmap.h
+++ b/arch/x86/include/asm/io_bitmap.h
@@ -19,12 +19,28 @@ struct task_struct;
 void io_bitmap_share(struct task_struct *tsk);
 void io_bitmap_exit(struct task_struct *tsk);
 
+static inline void native_tss_invalidate_io_bitmap(void)
+{
+   /*
+* Invalidate the I/O bitmap by moving io_bitmap_base outside the
+* TSS limit so any subsequent I/O access from user space will
+* trigger a #GP.
+*
+* This is correct even when VMEXIT rewrites the TSS limit
+* to 0x67 as the only requirement is that the base points
+* outside the limit.
+*/
+   this_cpu_write(cpu_tss_rw.x86_tss.io_bitmap_base,
+  IO_BITMAP_OFFSET_INVALID);
+}
+
 void native_tss_update_io_bitmap(void);
 
 #ifdef CONFIG_PARAVIRT_XXL
 #include 
 #else
 #define tss_update_io_bitmap native_tss_update_io_bitmap
+#define tss_invalidate_io_bitmap native_tss_invalidate_io_bitmap
 #endif
 
 #else
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 5ca5d29..3d2afec 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -302,6 +302,11 @@ static inline void write_idt_entry(gate_desc *dt, int 
entry, const gate_desc *g)
 }
 
 #ifdef CONFIG_X86_IOPL_IOPERM
+static inline void tss_invalidate_io_bitmap(void)
+{
+   PVOP_VCALL0(cpu.invalidate_io_bitmap);
+}
+
 static inline void tss_update_io_bitmap(void)
 {
PVOP_VCALL0(cpu.update_io_bitmap);
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 732f62e..8dfcb25 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -141,6 +141,7 @@ struct pv_cpu_ops {
void (*load_sp0)(unsigned long sp0);
 
 #ifdef CONFIG_X86_IOPL_IOPERM
+   void (*invalidate_io_bitmap)(void);
void (*update_io_bitmap)(void);
 #endif
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 674a7d6..de2138b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -324,7 +324,8 @@ struct paravirt_patch_template pv_ops = {
.cpu.swapgs = native_swapgs,
 
 #ifdef CONFIG_X86_IOPL_IOPERM
-   .cpu.update_io_bitmap   = native_tss_update_io_bitmap,
+   .cpu.invalidate_io_bitmap   = native_tss_invalidate_io_bitmap,
+   .cpu.update_io_bitmap   = native_tss_update_io_bitmap,
 #endif
 
.cpu.start_context_switch   = paravirt_nop,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f362ce0..fe67dbd 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -322,20 +322,6 @@ void arch_setup_new_exec(void)
 }
 
 #ifdef CONFIG_X86_IOPL_IOPERM
-static inline void tss_invalidate_io_bitmap(struct tss_struct *tss)
-{
-   /*
-* Invalidate the I/O bitmap by moving io_bitmap_base outside the
-* TSS limit so any subsequent I/O access from user space will
-* trigger a #GP.
-*
-* This is correct even when VMEXIT rewrites the TSS limit
-* to 0x67 as the only requirement is that the base points
-* outside the limit.
-*/
-   tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
-}
-
 static inline void switch_to_bitmap(unsigned long tifp)
 {
/*
@@ -346,7 +332,7 @@ static inline void switch_to_bitmap(unsigned long tifp)
 * user mode.
 */
if (tifp & _TIF_IO_BITMAP)
-  

[tip: x86/entry] x86/entry: Rename idtentry_enter/exit_cond_rcu() to idtentry_enter/exit()

2020-07-07 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/entry branch of tip:

Commit-ID: b037b09b9058d84882fa2c4db3806433e2b0f912
Gitweb:
https://git.kernel.org/tip/b037b09b9058d84882fa2c4db3806433e2b0f912
Author:Andy Lutomirski 
AuthorDate:Fri, 03 Jul 2020 10:02:58 -07:00
Committer: Thomas Gleixner 
CommitterDate: Mon, 06 Jul 2020 21:15:52 +02:00

x86/entry: Rename idtentry_enter/exit_cond_rcu() to idtentry_enter/exit()

They were originally called _cond_rcu because they were special versions
with conditional RCU handling.  Now they're the standard entry and exit
path, so the _cond_rcu part is just confusing.  Drop it.

Also change the signature to make them more extensible and more foolproof.

No functional change -- it's pure refactoring.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Acked-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/247fc67685263e0b673e1d7f808182d28ff80359.1593795633.git.l...@kernel.org

---
 arch/x86/entry/common.c | 50 +---
 arch/x86/include/asm/idtentry.h | 28 ++
 arch/x86/kernel/kvm.c   |  6 ++--
 arch/x86/kernel/traps.c |  6 ++--
 arch/x86/mm/fault.c |  6 ++--
 5 files changed, 53 insertions(+), 43 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index e83b3f1..0521546 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -559,8 +559,7 @@ SYSCALL_DEFINE0(ni_syscall)
 }
 
 /**
- * idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional
- *  RCU handling
+ * idtentry_enter - Handle state tracking on ordinary idtentries
  * @regs:  Pointer to pt_regs of interrupted context
  *
  * Invokes:
@@ -572,6 +571,9 @@ SYSCALL_DEFINE0(ni_syscall)
  *  - The hardirq tracer to keep the state consistent as low level ASM
  *entry disabled interrupts.
  *
+ * As a precondition, this requires that the entry came from user mode,
+ * idle, or a kernel context in which RCU is watching.
+ *
  * For kernel mode entries RCU handling is done conditional. If RCU is
  * watching then the only RCU requirement is to check whether the tick has
  * to be restarted. If RCU is not watching then rcu_irq_enter() has to be
@@ -585,18 +587,21 @@ SYSCALL_DEFINE0(ni_syscall)
  * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
  * would not be possible.
  *
- * Returns: True if RCU has been adjusted on a kernel entry
- * False otherwise
+ * Returns: An opaque object that must be passed to idtentry_exit()
  *
- * The return value must be fed into the rcu_exit argument of
- * idtentry_exit_cond_rcu().
+ * The return value must be fed into the state argument of
+ * idtentry_exit().
  */
-bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
+idtentry_state_t noinstr idtentry_enter(struct pt_regs *regs)
 {
+   idtentry_state_t ret = {
+   .exit_rcu = false,
+   };
+
if (user_mode(regs)) {
check_user_regs(regs);
enter_from_user_mode();
-   return false;
+   return ret;
}
 
/*
@@ -634,7 +639,8 @@ bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
trace_hardirqs_off_finish();
instrumentation_end();
 
-   return true;
+   ret.exit_rcu = true;
+   return ret;
}
 
/*
@@ -649,7 +655,7 @@ bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
trace_hardirqs_off();
instrumentation_end();
 
-   return false;
+   return ret;
 }
 
 static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched)
@@ -667,10 +673,9 @@ static void idtentry_exit_cond_resched(struct pt_regs 
*regs, bool may_sched)
 }
 
 /**
- * idtentry_exit_cond_rcu - Handle return from exception with conditional RCU
- * handling
+ * idtentry_exit - Handle return from exception that used idtentry_enter()
  * @regs:  Pointer to pt_regs (exception entry regs)
- * @rcu_exit:  Invoke rcu_irq_exit() if true
+ * @state: Return value from matching call to idtentry_enter()
  *
  * Depending on the return target (kernel/user) this runs the necessary
  * preemption and work checks if possible and reguired and returns to
@@ -679,10 +684,10 @@ static void idtentry_exit_cond_resched(struct pt_regs 
*regs, bool may_sched)
  * This is the last action before returning to the low level ASM code which
  * just needs to return to the appropriate context.
  *
- * Counterpart to idtentry_enter_cond_rcu(). The return value of the entry
- * function must be fed into the @rcu_exit argument.
+ * Counterpart to idtentry_enter(). The return value of the entry
+ * function must be fed into the @state argument.
  */
-void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit)
+void noinstr idtentry_exit(struct pt_regs *regs, idtentry_state_t state)
 {

[tip: x86/urgent] x86/entry/32: Fix #MC and #DB wiring on x86_32

2020-07-04 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 13cbc0cd4a30c815984ad88e3a2e5976493516a3
Gitweb:
https://git.kernel.org/tip/13cbc0cd4a30c815984ad88e3a2e5976493516a3
Author:Andy Lutomirski 
AuthorDate:Fri, 03 Jul 2020 10:02:56 -07:00
Committer: Thomas Gleixner 
CommitterDate: Sat, 04 Jul 2020 19:47:26 +02:00

x86/entry/32: Fix #MC and #DB wiring on x86_32

DEFINE_IDTENTRY_MCE and DEFINE_IDTENTRY_DEBUG were wired up as non-RAW
on x86_32, but the code expected them to be RAW.

Get rid of all the macro indirection for them on 32-bit and just use
DECLARE_IDTENTRY_RAW and DEFINE_IDTENTRY_RAW directly.

Also add a warning to make sure that we only hit the _kernel paths
in kernel mode.

Reported-by: Naresh Kamboju 
Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Acked-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/9e90a7ee8e72fd757db6d92e1e5ff16339c1ecf9.1593795633.git.l...@kernel.org

---
 arch/x86/include/asm/idtentry.h | 23 +--
 arch/x86/kernel/cpu/mce/core.c  |  4 +++-
 arch/x86/kernel/traps.c |  2 +-
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 94333ac..eeac6dc 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -353,10 +353,6 @@ static __always_inline void __##func(struct pt_regs *regs)
 
 #else  /* CONFIG_X86_64 */
 
-/* Maps to a regular IDTENTRY on 32bit for now */
-# define DECLARE_IDTENTRY_IST  DECLARE_IDTENTRY
-# define DEFINE_IDTENTRY_IST   DEFINE_IDTENTRY
-
 /**
  * DECLARE_IDTENTRY_DF - Declare functions for double fault 32bit variant
  * @vector:Vector number (ignored for C)
@@ -387,16 +383,18 @@ __visible noinstr void func(struct pt_regs *regs, 
\
 #endif /* !CONFIG_X86_64 */
 
 /* C-Code mapping */
+#define DECLARE_IDTENTRY_NMI   DECLARE_IDTENTRY_RAW
+#define DEFINE_IDTENTRY_NMIDEFINE_IDTENTRY_RAW
+
+#ifdef CONFIG_X86_64
 #define DECLARE_IDTENTRY_MCE   DECLARE_IDTENTRY_IST
 #define DEFINE_IDTENTRY_MCEDEFINE_IDTENTRY_IST
 #define DEFINE_IDTENTRY_MCE_USER   DEFINE_IDTENTRY_NOIST
 
-#define DECLARE_IDTENTRY_NMI   DECLARE_IDTENTRY_RAW
-#define DEFINE_IDTENTRY_NMIDEFINE_IDTENTRY_RAW
-
 #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST
 #define DEFINE_IDTENTRY_DEBUG  DEFINE_IDTENTRY_IST
 #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST
+#endif
 
 #else /* !__ASSEMBLY__ */
 
@@ -443,9 +441,6 @@ __visible noinstr void func(struct pt_regs *regs,   
\
 # define DECLARE_IDTENTRY_MCE(vector, func)\
DECLARE_IDTENTRY(vector, func)
 
-# define DECLARE_IDTENTRY_DEBUG(vector, func)  \
-   DECLARE_IDTENTRY(vector, func)
-
 /* No ASM emitted for DF as this goes through a C shim */
 # define DECLARE_IDTENTRY_DF(vector, func)
 
@@ -549,7 +544,11 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_BP,  exc_int3);
 DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF,exc_page_fault);
 
 #ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_64
 DECLARE_IDTENTRY_MCE(X86_TRAP_MC,  exc_machine_check);
+#else
+DECLARE_IDTENTRY_RAW(X86_TRAP_MC,  exc_machine_check);
+#endif
 #endif
 
 /* NMI */
@@ -559,7 +558,11 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
 #endif
 
 /* #DB */
+#ifdef CONFIG_X86_64
 DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB,exc_debug);
+#else
+DECLARE_IDTENTRY_RAW(X86_TRAP_DB,  exc_debug);
+#endif
 #ifdef CONFIG_XEN_PV
 DECLARE_IDTENTRY_RAW(X86_TRAP_DB,  xenpv_exc_debug);
 #endif
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index ce9120c..a6a90b5 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1901,6 +1901,8 @@ void (*machine_check_vector)(struct pt_regs *) = 
unexpected_machine_check;
 
 static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
 {
+   WARN_ON_ONCE(user_mode(regs));
+
/*
 * Only required when from kernel mode. See
 * mce_check_crashing_cpu() for details.
@@ -1954,7 +1956,7 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
 }
 #else
 /* 32bit unified entry point */
-DEFINE_IDTENTRY_MCE(exc_machine_check)
+DEFINE_IDTENTRY_RAW(exc_machine_check)
 {
unsigned long dr7;
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c17f9b5..6ed8cc5 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -925,7 +925,7 @@ DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
 }
 #else
 /* 32 bit does not have separate entry points. */
-DEFINE_IDTENTRY_DEBUG(exc_debug)
+DEFINE_IDTENTRY_RAW(exc_debug)
 {
unsigned long dr6, dr7;
 


[tip: x86/urgent] x86/ldt: Disable 16-bit segments on Xen PV

2020-07-04 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: cc801833a171163edb6385425349ba8903bd1b20
Gitweb:
https://git.kernel.org/tip/cc801833a171163edb6385425349ba8903bd1b20
Author:Andy Lutomirski 
AuthorDate:Fri, 03 Jul 2020 10:02:57 -07:00
Committer: Thomas Gleixner 
CommitterDate: Sat, 04 Jul 2020 19:47:26 +02:00

x86/ldt: Disable 16-bit segments on Xen PV

Xen PV doesn't implement ESPFIX64, so 16-bit segments don't work right
there.  Disable them.  Also, to help people diagnose this change in
behavior, print a warning the first time anyone tries to use a 16-bit
segment on a Xen PV guest that would otherwise have allowed it.

This gets us closer to having all x86 selftests pass on Xen PV.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Acked-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/92b2975459dfe5929ecf34c3896ad920bd9e3f2d.1593795633.git.l...@kernel.org

---
 arch/x86/kernel/ldt.c | 35 ++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 8748321..34e918a 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -29,6 +29,8 @@
 #include 
 #include 
 
+#include 
+
 /* This is a multiple of PAGE_SIZE. */
 #define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
 
@@ -543,6 +545,37 @@ static int read_default_ldt(void __user *ptr, unsigned 
long bytecount)
return bytecount;
 }
 
+static bool allow_16bit_segments(void)
+{
+   if (!IS_ENABLED(CONFIG_X86_16BIT))
+   return false;
+
+#ifdef CONFIG_XEN_PV
+   /*
+* Xen PV does not implement ESPFIX64, which means that 16-bit
+* segments will not work correctly.  Until either Xen PV implements
+* ESPFIX64 and can signal this fact to the guest or unless someone
+* provides compelling evidence that allowing broken 16-bit segments
+* is worthwhile, disallow 16-bit segments under Xen PV.
+*/
+   if (xen_pv_domain()) {
+   static DEFINE_MUTEX(xen_warning);
+   static bool warned;
+
+   mutex_lock(&xen_warning);
+   if (!warned) {
+   pr_info("Warning: 16-bit segments do not work correctly 
in a Xen PV guest\n");
+   warned = true;
+   }
+   mutex_unlock(&xen_warning);
+
+   return false;
+   }
+#endif
+
+   return true;
+}
+
 static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 {
struct mm_struct *mm = current->mm;
@@ -574,7 +607,7 @@ static int write_ldt(void __user *ptr, unsigned long 
bytecount, int oldmode)
/* The user wants to clear the entry. */
	memset(&ldt, 0, sizeof(ldt));
} else {
-   if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+   if (!ldt_info.seg_32bit && !allow_16bit_segments()) {
error = -EINVAL;
goto out;
}
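
A minimal user-space sketch (not part of the patch; the descriptor field
values are illustrative) of how the new check surfaces to callers:
modify_ldt() on a 16-bit descriptor is expected to fail with EINVAL on a
Xen PV guest, while bare metal with CONFIG_X86_16BIT=y still accepts it.

#include <asm/ldt.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>

int main(void)
{
	struct user_desc desc;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number = 0;
	desc.base_addr = 0;
	desc.limit = 0xffff;
	desc.seg_32bit = 0;		/* request a 16-bit segment */
	desc.contents = 0;		/* plain data segment */

	if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0)
		printf("16-bit segment rejected: %s\n", strerror(errno));
	else
		printf("16-bit segment installed\n");

	return 0;
}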


[tip: x86/urgent] x86/entry, selftests: Further improve user entry sanity checks

2020-07-04 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 3c73b81a9164d0c1b6379d6672d2772a9e95168e
Gitweb:
https://git.kernel.org/tip/3c73b81a9164d0c1b6379d6672d2772a9e95168e
Author:Andy Lutomirski 
AuthorDate:Fri, 03 Jul 2020 10:02:54 -07:00
Committer: Thomas Gleixner 
CommitterDate: Sat, 04 Jul 2020 19:47:25 +02:00

x86/entry, selftests: Further improve user entry sanity checks

Chasing down a Xen bug caused me to realize that the new entry sanity
checks are still fairly weak.  Add some more checks.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Acked-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/881de09e786ab93ce56ee4a2437ba2c308afe7a9.1593795633.git.l...@kernel.org

---
 arch/x86/entry/common.c  | 19 +++
 tools/testing/selftests/x86/syscall_nt.c | 11 +++
 2 files changed, 30 insertions(+)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index f392a8b..e83b3f1 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -49,6 +49,23 @@
 static void check_user_regs(struct pt_regs *regs)
 {
if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) {
+   /*
+* Make sure that the entry code gave us a sensible EFLAGS
+* register.  Native because we want to check the actual CPU
+* state, not the interrupt state as imagined by Xen.
+*/
+   unsigned long flags = native_save_fl();
+   WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF |
+ X86_EFLAGS_NT));
+
+   /* We think we came from user mode. Make sure pt_regs agrees. */
+   WARN_ON_ONCE(!user_mode(regs));
+
+   /*
+* All entries from user mode (except #DF) should be on the
+* normal thread stack and should have user pt_regs in the
+* correct location.
+*/
WARN_ON_ONCE(!on_thread_stack());
WARN_ON_ONCE(regs != task_pt_regs(current));
}
@@ -577,6 +594,7 @@ SYSCALL_DEFINE0(ni_syscall)
 bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
 {
if (user_mode(regs)) {
+   check_user_regs(regs);
enter_from_user_mode();
return false;
}
@@ -710,6 +728,7 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, 
bool rcu_exit)
  */
 void noinstr idtentry_enter_user(struct pt_regs *regs)
 {
+   check_user_regs(regs);
enter_from_user_mode();
 }
 
diff --git a/tools/testing/selftests/x86/syscall_nt.c 
b/tools/testing/selftests/x86/syscall_nt.c
index 970e5e1..a108b80 100644
--- a/tools/testing/selftests/x86/syscall_nt.c
+++ b/tools/testing/selftests/x86/syscall_nt.c
@@ -81,5 +81,16 @@ int main(void)
printf("[RUN]\tSet NT|AC|TF and issue a syscall\n");
do_it(X86_EFLAGS_NT | X86_EFLAGS_AC | X86_EFLAGS_TF);
 
+   /*
+* Now try DF.  This is evil and it's plausible that we will crash
+* glibc, but glibc would have to do something rather surprising
+* for this to happen.
+*/
+   printf("[RUN]\tSet DF and issue a syscall\n");
+   do_it(X86_EFLAGS_DF);
+
+   printf("[RUN]\tSet TF|DF and issue a syscall\n");
+   do_it(X86_EFLAGS_TF | X86_EFLAGS_DF);
+
return nerrs == 0 ? 0 : 1;
 }


[tip: x86/urgent] x86/entry/compat: Clear RAX high bits on Xen PV SYSENTER

2020-07-04 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: db5b2c5a90a111618f071d231a8b945cf522313e
Gitweb:
https://git.kernel.org/tip/db5b2c5a90a111618f071d231a8b945cf522313e
Author:Andy Lutomirski 
AuthorDate:Fri, 03 Jul 2020 10:02:53 -07:00
Committer: Thomas Gleixner 
CommitterDate: Sat, 04 Jul 2020 19:47:25 +02:00

x86/entry/compat: Clear RAX high bits on Xen PV SYSENTER

Move the clearing of the high bits of RAX after Xen PV joins the SYSENTER
path so that Xen PV doesn't skip it.

Arguably this code should be deleted instead, but that would belong in the
merge window.

Fixes: ffae641f5747 ("x86/entry/64/compat: Fix Xen PV SYSENTER frame setup")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Acked-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/9d33b3f3216dcab008070f1c28b6091ae7199969.1593795633.git.l...@kernel.org

---
 arch/x86/entry/entry_64_compat.S | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 381a6de..541fdaf 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -57,15 +57,6 @@ SYM_CODE_START(entry_SYSENTER_compat)
 
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
-   /*
-* User tracing code (ptrace or signal handlers) might assume that
-* the saved RAX contains a 32-bit number when we're invoking a 32-bit
-* syscall.  Just in case the high bits are nonzero, zero-extend
-* the syscall number.  (This could almost certainly be deleted
-* with no ill effects.)
-*/
-	movl	%eax, %eax
-
/* Construct struct pt_regs on stack */
pushq   $__USER32_DS/* pt_regs->ss */
pushq   $0  /* pt_regs->sp = 0 (placeholder) */
@@ -80,6 +71,16 @@ SYM_CODE_START(entry_SYSENTER_compat)
pushq   $__USER32_CS/* pt_regs->cs */
pushq   $0  /* pt_regs->ip = 0 (placeholder) */
 SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
+
+   /*
+* User tracing code (ptrace or signal handlers) might assume that
+* the saved RAX contains a 32-bit number when we're invoking a 32-bit
+* syscall.  Just in case the high bits are nonzero, zero-extend
+* the syscall number.  (This could almost certainly be deleted
+* with no ill effects.)
+*/
+	movl	%eax, %eax
+
pushq   %rax/* pt_regs->orig_ax */
pushq   %rdi/* pt_regs->di */
pushq   %rsi/* pt_regs->si */
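
As an aside, the movl %eax, %eax above is not a no-op: writes to a 32-bit
register architecturally zero bits 63:32 of the full 64-bit register, which
is what provides the zero-extension the comment talks about.  A tiny
user-space check of that rule (hypothetical, unrelated to the patch itself):

#include <stdio.h>

int main(void)
{
	unsigned long rax = 0xdeadbeef00000001UL;

	/* A 32-bit mov to the same register clears the upper 32 bits. */
	asm volatile ("movl %k0, %k0" : "+r" (rax));

	printf("%#lx\n", rax);	/* prints 0x1 */
	return 0;
}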


[tip: x86/urgent] x86/entry/xen: Route #DB correctly on Xen PV

2020-07-04 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: f41f0824224eb12ad84de8972962dd54be5abe3b
Gitweb:
https://git.kernel.org/tip/f41f0824224eb12ad84de8972962dd54be5abe3b
Author:Andy Lutomirski 
AuthorDate:Fri, 03 Jul 2020 10:02:55 -07:00
Committer: Thomas Gleixner 
CommitterDate: Sat, 04 Jul 2020 19:47:25 +02:00

x86/entry/xen: Route #DB correctly on Xen PV

On Xen PV, #DB doesn't use IST. It still needs to be correctly routed
depending on whether it came from user or kernel mode.

Get rid of DECLARE/DEFINE_IDTENTRY_XEN -- it was too hard to follow the
logic.  Instead, route #DB and NMI through DECLARE/DEFINE_IDTENTRY_RAW on
Xen, and do the right thing for #DB.  Also add more warnings to the
exc_debug* handlers to make this type of failure more obvious.

This fixes various forms of corruption that happen when usermode
triggers #DB on Xen PV.

Fixes: 4c0dcd8350a0 ("x86/entry: Implement user mode C entry points for #DB and 
#MCE")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Acked-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/4163e733cce0b41658e252c6c6b3464f33fdff17.1593795633.git.l...@kernel.org

---
 arch/x86/include/asm/idtentry.h | 24 ++--
 arch/x86/kernel/traps.c | 12 
 arch/x86/xen/enlighten_pv.c | 28 
 arch/x86/xen/xen-asm_64.S   |  5 ++---
 4 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index cf51c50..94333ac 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -398,18 +398,6 @@ __visible noinstr void func(struct pt_regs *regs,  
\
 #define DEFINE_IDTENTRY_DEBUG  DEFINE_IDTENTRY_IST
 #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST
 
-/**
- * DECLARE_IDTENTRY_XEN - Declare functions for XEN redirect IDT entry points
- * @vector:Vector number (ignored for C)
- * @func:  Function name of the entry point
- *
- * Used for xennmi and xendebug redirections. No DEFINE as this is all ASM
- * indirection magic.
- */
-#define DECLARE_IDTENTRY_XEN(vector, func) \
-   asmlinkage void xen_asm_exc_xen##func(void);\
-   asmlinkage void asm_exc_xen##func(void)
-
 #else /* !__ASSEMBLY__ */
 
 /*
@@ -469,10 +457,6 @@ __visible noinstr void func(struct pt_regs *regs,  
\
 /* No ASM code emitted for NMI */
 #define DECLARE_IDTENTRY_NMI(vector, func)
 
-/* XEN NMI and DB wrapper */
-#define DECLARE_IDTENTRY_XEN(vector, func) \
-   idtentry vector asm_exc_xen##func exc_##func has_error_code=0
-
 /*
  * ASM code to emit the common vector entry stubs where each stub is
  * packed into 8 bytes.
@@ -570,11 +554,15 @@ DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check);
 
 /* NMI */
 DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
-DECLARE_IDTENTRY_XEN(X86_TRAP_NMI, nmi);
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
+#endif
 
 /* #DB */
 DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB,exc_debug);
-DECLARE_IDTENTRY_XEN(X86_TRAP_DB,  debug);
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_RAW(X86_TRAP_DB,  xenpv_exc_debug);
+#endif
 
 /* #DF */
 DECLARE_IDTENTRY_DF(X86_TRAP_DF,   exc_double_fault);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index f9727b9..c17f9b5 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -866,6 +866,12 @@ static __always_inline void exc_debug_kernel(struct 
pt_regs *regs,
trace_hardirqs_off_finish();
 
/*
+* If something gets miswired and we end up here for a user mode
+* #DB, we will malfunction.
+*/
+   WARN_ON_ONCE(user_mode(regs));
+
+   /*
 * Catch SYSENTER with TF set and clear DR_STEP. If this hit a
 * watchpoint at the same time then that will still be handled.
 */
@@ -883,6 +889,12 @@ static __always_inline void exc_debug_kernel(struct 
pt_regs *regs,
 static __always_inline void exc_debug_user(struct pt_regs *regs,
   unsigned long dr6)
 {
+   /*
+* If something gets miswired and we end up here for a kernel mode
+* #DB, we will malfunction.
+*/
+   WARN_ON_ONCE(!user_mode(regs));
+
idtentry_enter_user(regs);
instrumentation_begin();
 
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index acc49fa..0d68948 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -598,6 +598,26 @@ static void xen_write_ldt_entry(struct desc_struct *dt, 
int entrynum,
 }
 
 #ifdef CONFIG_X86_64
+void noist_exc_debug(struct pt_regs *regs);
+
+DEFINE_IDTENTRY_RAW(xenpv_exc_nmi)
+{
+   /* On Xen PV, NMI doesn't use IST.  The C part is the same as native. */
+   exc_nmi(regs);
+}
+

[tip: x86/fsgsbase] selftests/x86/fsgsbase: Add a missing memory constraint

2020-07-01 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/fsgsbase branch of tip:

Commit-ID: 8e259031c67a5ea0666428edb64c89e8c6ebd18e
Gitweb:
https://git.kernel.org/tip/8e259031c67a5ea0666428edb64c89e8c6ebd18e
Author:Andy Lutomirski 
AuthorDate:Fri, 26 Jun 2020 10:24:28 -07:00
Committer: Thomas Gleixner 
CommitterDate: Wed, 01 Jul 2020 15:27:20 +02:00

selftests/x86/fsgsbase: Add a missing memory constraint

The manual call to set_thread_area() via int $0x80 was missing any
indication that the descriptor was a pointer, causing gcc to
occasionally generate wrong code.  Add the missing constraint.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/432968af67259ca92d68b774a731aff468eae610.1593192140.git.l...@kernel.org

---
 tools/testing/selftests/x86/fsgsbase.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/x86/fsgsbase.c 
b/tools/testing/selftests/x86/fsgsbase.c
index f47495d..9983195 100644
--- a/tools/testing/selftests/x86/fsgsbase.c
+++ b/tools/testing/selftests/x86/fsgsbase.c
@@ -285,7 +285,8 @@ static unsigned short load_gs(void)
/* 32-bit set_thread_area */
long ret;
asm volatile ("int $0x80"
- : "=a" (ret) : "a" (243), "b" (low_desc)
+ : "=a" (ret), "+m" (*low_desc)
+ : "a" (243), "b" (low_desc)
  : "r8", "r9", "r10", "r11");
memcpy(, low_desc, sizeof(desc));
munmap(low_desc, sizeof(desc));
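
The fix hinges on the "+m" (*low_desc) operand: with only the pointer value
passed in a register, the compiler is free to assume the pointed-to bytes
are never read by the asm and may drop or reorder the stores that fill in
the descriptor.  A stand-alone sketch of the corrected pattern (names are
illustrative, not copied from the selftest):

#include <asm/ldt.h>

long set_thread_area_int80(struct user_desc *desc)
{
	long ret;

	asm volatile ("int $0x80"
		      : "=a" (ret), "+m" (*desc)	/* the asm reads and may write *desc */
		      : "a" (243), "b" (desc)		/* 243 == __NR_set_thread_area (32-bit ABI) */
		      : "r8", "r9", "r10", "r11");
	return ret;
}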


[tip: x86/fsgsbase] selftests/x86/fsgsbase: Fix a comment in the ptrace_write_gsbase test

2020-07-01 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/fsgsbase branch of tip:

Commit-ID: 979c2c4247cafd8a91628a7306b6871efbd12fdb
Gitweb:
https://git.kernel.org/tip/979c2c4247cafd8a91628a7306b6871efbd12fdb
Author:Andy Lutomirski 
AuthorDate:Fri, 26 Jun 2020 10:24:27 -07:00
Committer: Thomas Gleixner 
CommitterDate: Wed, 01 Jul 2020 15:27:20 +02:00

selftests/x86/fsgsbase: Fix a comment in the ptrace_write_gsbase test

A comment was unclear.  Fix it.

Fixes: 5e7ec8578fa3 ("selftests/x86/fsgsbase: Test ptracer-induced GS base 
write with FSGSBASE")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/901034a91a40169ec84f1f699ea86704dff762e4.1593192140.git.l...@kernel.org

---
 tools/testing/selftests/x86/fsgsbase.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/x86/fsgsbase.c 
b/tools/testing/selftests/x86/fsgsbase.c
index 9a43498..f47495d 100644
--- a/tools/testing/selftests/x86/fsgsbase.c
+++ b/tools/testing/selftests/x86/fsgsbase.c
@@ -498,7 +498,8 @@ static void test_ptrace_write_gsbase(void)
 * base would zero the selector.  On newer kernels,
 * this behavior has changed -- poking the base
 * changes only the base and, if FSGSBASE is not
-* available, this may not effect.
+* available, this may have no effect once the tracee
+* is resumed.
 */
if (gs == 0)
printf("\tNote: this is expected behavior on 
older kernels.\n");


[tip: x86/fsgsbase] x86/fsgsbase: Fix Xen PV support

2020-07-01 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/fsgsbase branch of tip:

Commit-ID: d029bff60aa6c7eab281d52602b6a7a971615324
Gitweb:
https://git.kernel.org/tip/d029bff60aa6c7eab281d52602b6a7a971615324
Author:Andy Lutomirski 
AuthorDate:Fri, 26 Jun 2020 10:24:30 -07:00
Committer: Thomas Gleixner 
CommitterDate: Wed, 01 Jul 2020 15:27:20 +02:00

x86/fsgsbase: Fix Xen PV support

On Xen PV, SWAPGS doesn't work.  Teach __rdgsbase_inactive() and
__wrgsbase_inactive() to use rdmsrl()/wrmsrl() on Xen PV.  The Xen
pvop code will understand this and issue the correct hypercalls.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/f07c08f178fe9711915862b656722a207cd52c28.1593192140.git.l...@kernel.org

---
 arch/x86/kernel/process_64.c | 24 ++--
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index cb8e37d..e14476f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -163,9 +163,15 @@ static noinstr unsigned long __rdgsbase_inactive(void)
 
lockdep_assert_irqs_disabled();
 
-   native_swapgs();
-   gsbase = rdgsbase();
-   native_swapgs();
+   if (!static_cpu_has(X86_FEATURE_XENPV)) {
+   native_swapgs();
+   gsbase = rdgsbase();
+   native_swapgs();
+   } else {
+   instrumentation_begin();
+   rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
+   instrumentation_end();
+   }
 
return gsbase;
 }
@@ -182,9 +188,15 @@ static noinstr void __wrgsbase_inactive(unsigned long 
gsbase)
 {
lockdep_assert_irqs_disabled();
 
-   native_swapgs();
-   wrgsbase(gsbase);
-   native_swapgs();
+   if (!static_cpu_has(X86_FEATURE_XENPV)) {
+   native_swapgs();
+   wrgsbase(gsbase);
+   native_swapgs();
+   } else {
+   instrumentation_begin();
+   wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
+   instrumentation_end();
+   }
 }
 
 /*


[tip: x86/fsgsbase] x86/ptrace: Fix 32-bit PTRACE_SETREGS vs fsbase and gsbase

2020-07-01 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/fsgsbase branch of tip:

Commit-ID: 40c45904f818c1f6555294ca27afc5fda4f09e68
Gitweb:
https://git.kernel.org/tip/40c45904f818c1f6555294ca27afc5fda4f09e68
Author:Andy Lutomirski 
AuthorDate:Fri, 26 Jun 2020 10:24:29 -07:00
Committer: Thomas Gleixner 
CommitterDate: Wed, 01 Jul 2020 15:27:20 +02:00

x86/ptrace: Fix 32-bit PTRACE_SETREGS vs fsbase and gsbase

Debuggers expect that doing PTRACE_GETREGS, then poking at a tracee
and maybe letting it run for a while, then doing PTRACE_SETREGS will
put the tracee back where it was.  In the specific case of a 32-bit
tracer and tracee, the PTRACE_GETREGS/SETREGS data structure doesn't
have fs_base or gs_base fields, so FSBASE and GSBASE fields are
never stored anywhere.  Everything used to still work because
nonzero FS or GS would result in full reloads of the segment registers
when the tracee resumes, and the bases associated with FS==0 or
GS==0 are irrelevant to 32-bit code.

Adding FSGSBASE support broke this: when FSGSBASE is enabled, FSBASE
and GSBASE are now restored independently of FS and GS for all tasks
when context-switched in.  This means that, if a 32-bit tracer
restores a previous state using PTRACE_SETREGS but the tracee's
pre-restore and post-restore bases don't match, then the tracee is
resumed with the wrong base.

Fix it by explicitly loading the base when a 32-bit tracer pokes FS
or GS on a 64-bit kernel.

Also add a test case.

Fixes: 673903495c85 ("x86/process/64: Use FSBSBASE in switch_to() if available")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/229cc6a50ecbb701abd50fe4ddaf0eda98cd.1593192140.git.l...@kernel.org

---
 arch/x86/include/asm/fsgsbase.h|   2 +-
 arch/x86/kernel/process_64.c   |   4 +-
 arch/x86/kernel/ptrace.c   |  43 ++-
 tools/testing/selftests/x86/Makefile   |   2 +-
 tools/testing/selftests/x86/fsgsbase_restore.c | 245 -
 5 files changed, 280 insertions(+), 16 deletions(-)
 create mode 100644 tools/testing/selftests/x86/fsgsbase_restore.c

diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
index aefd537..d552646 100644
--- a/arch/x86/include/asm/fsgsbase.h
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -75,6 +75,8 @@ static inline void x86_fsbase_write_cpu(unsigned long fsbase)
 
 extern unsigned long x86_gsbase_read_cpu_inactive(void);
 extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
+extern unsigned long x86_fsgsbase_read_task(struct task_struct *task,
+   unsigned short selector);
 
 #endif /* CONFIG_X86_64 */
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index d618969..cb8e37d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -347,8 +347,8 @@ static __always_inline void x86_fsgsbase_load(struct 
thread_struct *prev,
}
 }
 
-static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
-   unsigned short selector)
+unsigned long x86_fsgsbase_read_task(struct task_struct *task,
+unsigned short selector)
 {
unsigned short idx = selector >> 3;
unsigned long base;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 1c7646c..3f00648 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -281,17 +281,9 @@ static int set_segment_reg(struct task_struct *task,
return -EIO;
 
/*
-* This function has some ABI oddities.
-*
-* A 32-bit ptracer probably expects that writing FS or GS will change
-* FSBASE or GSBASE respectively.  In the absence of FSGSBASE support,
-* this code indeed has that effect.  When FSGSBASE is added, this
-* will require a special case.
-*
-* For existing 64-bit ptracers, writing FS or GS *also* currently
-* changes the base if the selector is nonzero the next time the task
-* is run.  This behavior may not be needed, and trying to preserve it
-* when FSGSBASE is added would be complicated at best.
+* Writes to FS and GS will change the stored selector.  Whether
+* this changes the segment base as well depends on whether
+* FSGSBASE is enabled.
 */
 
switch (offset) {
@@ -867,14 +859,39 @@ long arch_ptrace(struct task_struct *child, long request,
 static int putreg32(struct task_struct *child, unsigned regno, u32 value)
 {
struct pt_regs *regs = task_pt_regs(child);
+   int ret;
 
switch (regno) {
 
SEG32(cs);
SEG32(ds);
SEG32(es);
-   SEG32(fs);
-   SEG32(gs);
+
+   /*
+* A 32-bit ptracer on a 64-bit kernel expects that writing
+* FS or GS will also update the base.  This is needed for
+* operations 
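
A hedged sketch of the round trip the changelog describes (this is not the
fsgsbase_restore selftest added by the patch; names and error handling are
illustrative): a debugger checkpoints the tracee with PTRACE_GETREGS, lets
it run, and expects PTRACE_SETREGS to put it back exactly where it was,
including the FS/GS bases implied by the saved selectors.

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>

void checkpoint_and_restore(pid_t tracee)
{
	struct user_regs_struct saved;

	ptrace(PTRACE_GETREGS, tracee, NULL, &saved);	/* checkpoint the registers */

	ptrace(PTRACE_CONT, tracee, NULL, NULL);	/* let the tracee run... */
	waitpid(tracee, NULL, 0);			/* ...until it stops again */

	ptrace(PTRACE_SETREGS, tracee, NULL, &saved);	/* must restore the old state */
}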

[tip: x86/urgent] selftests/x86/syscall_nt: Add more flag combinations

2020-07-01 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: e4ef7de160c6b12639c4fc49bcacb25b860ac76d
Gitweb:
https://git.kernel.org/tip/e4ef7de160c6b12639c4fc49bcacb25b860ac76d
Author:Andy Lutomirski 
AuthorDate:Fri, 26 Jun 2020 10:21:14 -07:00
Committer: Thomas Gleixner 
CommitterDate: Wed, 01 Jul 2020 10:00:26 +02:00

selftests/x86/syscall_nt: Add more flag combinations

Add EFLAGS.AC to the mix.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/12924e2fe2c5826568b7fc9436d85ca7f5eb1743.1593191971.git.l...@kernel.org

---
 tools/testing/selftests/x86/syscall_nt.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/tools/testing/selftests/x86/syscall_nt.c 
b/tools/testing/selftests/x86/syscall_nt.c
index 02309a1..f060534 100644
--- a/tools/testing/selftests/x86/syscall_nt.c
+++ b/tools/testing/selftests/x86/syscall_nt.c
@@ -73,6 +73,12 @@ int main(void)
printf("[RUN]\tSet NT and issue a syscall\n");
do_it(X86_EFLAGS_NT);
 
+   printf("[RUN]\tSet AC and issue a syscall\n");
+   do_it(X86_EFLAGS_AC);
+
+   printf("[RUN]\tSet NT|AC and issue a syscall\n");
+   do_it(X86_EFLAGS_NT | X86_EFLAGS_AC);
+
/*
 * Now try it again with TF set -- TF forces returns via IRET in all
 * cases except non-ptregs-using 64-bit full fast path syscalls.
@@ -80,8 +86,17 @@ int main(void)
 
sethandler(SIGTRAP, sigtrap, 0);
 
+   printf("[RUN]\tSet TF and issue a syscall\n");
+   do_it(X86_EFLAGS_TF);
+
printf("[RUN]\tSet NT|TF and issue a syscall\n");
do_it(X86_EFLAGS_NT | X86_EFLAGS_TF);
 
+   printf("[RUN]\tSet AC|TF and issue a syscall\n");
+   do_it(X86_EFLAGS_AC | X86_EFLAGS_TF);
+
+   printf("[RUN]\tSet NT|AC|TF and issue a syscall\n");
+   do_it(X86_EFLAGS_NT | X86_EFLAGS_AC | X86_EFLAGS_TF);
+
return nerrs == 0 ? 0 : 1;
 }


[tip: x86/urgent] x86/entry: Move SYSENTER's regs->sp and regs->flags fixups into C

2020-07-01 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: d1721250f3ffed9afba3e1fb729947cec64c5a8a
Gitweb:
https://git.kernel.org/tip/d1721250f3ffed9afba3e1fb729947cec64c5a8a
Author:Andy Lutomirski 
AuthorDate:Fri, 26 Jun 2020 10:21:12 -07:00
Committer: Thomas Gleixner 
CommitterDate: Wed, 01 Jul 2020 10:00:25 +02:00

x86/entry: Move SYSENTER's regs->sp and regs->flags fixups into C

The SYSENTER asm (32-bit and compat) contains fixups for regs->sp and
regs->flags.  Move the fixups into C and fix some comments while at it.

This is a valid cleanup all by itself, and it also simplifies the
subsequent patch that will fix Xen PV SYSENTER.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/fe62bef67eda7fac75b8f3dbafccf571dc4ece6b.1593191971.git.l...@kernel.org

---
 arch/x86/entry/common.c  | 12 
 arch/x86/entry/entry_32.S|  5 ++---
 arch/x86/entry/entry_64_compat.S | 11 +--
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index ed8ccc8..f392a8b 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -522,6 +522,18 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs 
*regs)
(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) 
== 0;
 #endif
 }
+
+/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
+__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
+{
+   /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
+   regs->sp = regs->bp;
+
+   /* SYSENTER clobbers EFLAGS.IF.  Assume it was set in usermode. */
+   regs->flags |= X86_EFLAGS_IF;
+
+   return do_fast_syscall_32(regs);
+}
 #endif
 
 SYSCALL_DEFINE0(ni_syscall)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 024d7d2..2d0bd5d 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -933,9 +933,8 @@ SYM_FUNC_START(entry_SYSENTER_32)
 
 .Lsysenter_past_esp:
pushl   $__USER_DS  /* pt_regs->ss */
-   pushl   %ebp/* pt_regs->sp (stashed in bp) */
+   pushl   $0  /* pt_regs->sp (placeholder) */
pushfl  /* pt_regs->flags (except IF = 0) */
-   orl $X86_EFLAGS_IF, (%esp)  /* Fix IF */
pushl   $__USER_CS  /* pt_regs->cs */
pushl   $0  /* pt_regs->ip = 0 (placeholder) */
pushl   %eax/* pt_regs->orig_ax */
@@ -965,7 +964,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
 .Lsysenter_flags_fixed:
 
	movl	%esp, %eax
-	call	do_fast_syscall_32
+	call	do_SYSENTER_32
/* XEN PV guests always use IRET path */
ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
"jmp .Lsyscall_32_done", X86_FEATURE_XENPV
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 0f974ae..7b9d815 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -68,16 +68,15 @@ SYM_CODE_START(entry_SYSENTER_compat)
 
/* Construct struct pt_regs on stack */
pushq   $__USER32_DS/* pt_regs->ss */
-   pushq   %rbp/* pt_regs->sp (stashed in bp) */
+   pushq   $0  /* pt_regs->sp = 0 (placeholder) */
 
/*
 * Push flags.  This is nasty.  First, interrupts are currently
-* off, but we need pt_regs->flags to have IF set.  Second, even
-* if TF was set when SYSENTER started, it's clear by now.  We fix
-* that later using TIF_SINGLESTEP.
+* off, but we need pt_regs->flags to have IF set.  Second, if TF
+* was set in usermode, it's still set, and we're singlestepping
+* through this code.  do_SYSENTER_32() will fix up IF.
 */
pushfq  /* pt_regs->flags (except IF = 0) */
-   orl $X86_EFLAGS_IF, (%rsp)  /* Fix saved flags */
pushq   $__USER32_CS/* pt_regs->cs */
pushq   $0  /* pt_regs->ip = 0 (placeholder) */
pushq   %rax/* pt_regs->orig_ax */
@@ -135,7 +134,7 @@ SYM_CODE_START(entry_SYSENTER_compat)
 .Lsysenter_flags_fixed:
 
	movq	%rsp, %rdi
-	call	do_fast_syscall_32
+	call	do_SYSENTER_32
/* XEN PV guests always use IRET path */
ALTERNATIVE "testl %eax, %eax; jz 
swapgs_restore_regs_and_return_to_usermode", \
"jmp swapgs_restore_regs_and_return_to_usermode", 
X86_FEATURE_XENPV


[tip: x86/urgent] selftests/x86: Consolidate and fix get/set_eflags() helpers

2020-07-01 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: cced0b24bb545bfe74fea96de84adc23c0146b05
Gitweb:
https://git.kernel.org/tip/cced0b24bb545bfe74fea96de84adc23c0146b05
Author:Andy Lutomirski 
AuthorDate:Fri, 26 Jun 2020 10:21:16 -07:00
Committer: Thomas Gleixner 
CommitterDate: Wed, 01 Jul 2020 10:00:27 +02:00

selftests/x86: Consolidate and fix get/set_eflags() helpers

There are several copies of get_eflags() and set_eflags() and they all are
buggy.  Consolidate them and fix them.  The fixes are:

Add memory clobbers.  These are probably unnecessary but they make sure
that the compiler doesn't move something past one of these calls when it
shouldn't.

Respect the redzone on x86_64.  No failure related to this has been
observed, but it's definitely a bug.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/982ce58ae8dea2f1e57093ee894760e35267e751.1593191971.git.l...@kernel.org

---
 tools/testing/selftests/x86/Makefile  |  4 +-
 tools/testing/selftests/x86/helpers.h | 41 ++-
 tools/testing/selftests/x86/single_step_syscall.c | 17 +--
 tools/testing/selftests/x86/syscall_arg_fault.c   | 21 +---
 tools/testing/selftests/x86/syscall_nt.c  | 20 +---
 tools/testing/selftests/x86/test_vsyscall.c   | 15 +-
 tools/testing/selftests/x86/unwind_vdso.c | 23 +
 7 files changed, 51 insertions(+), 90 deletions(-)
 create mode 100644 tools/testing/selftests/x86/helpers.h

diff --git a/tools/testing/selftests/x86/Makefile 
b/tools/testing/selftests/x86/Makefile
index 5f16821..d2796ea 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -70,10 +70,10 @@ all_64: $(BINARIES_64)
 
 EXTRA_CLEAN := $(BINARIES_32) $(BINARIES_64)
 
-$(BINARIES_32): $(OUTPUT)/%_32: %.c
+$(BINARIES_32): $(OUTPUT)/%_32: %.c helpers.h
$(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm
 
-$(BINARIES_64): $(OUTPUT)/%_64: %.c
+$(BINARIES_64): $(OUTPUT)/%_64: %.c helpers.h
$(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
 
 # x86_64 users should be encouraged to install 32-bit libraries
diff --git a/tools/testing/selftests/x86/helpers.h 
b/tools/testing/selftests/x86/helpers.h
new file mode 100644
index 000..f5ff2a2
--- /dev/null
+++ b/tools/testing/selftests/x86/helpers.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef __SELFTESTS_X86_HELPERS_H
+#define __SELFTESTS_X86_HELPERS_H
+
+#include 
+
+static inline unsigned long get_eflags(void)
+{
+   unsigned long eflags;
+
+   asm volatile (
+#ifdef __x86_64__
+   "subq $128, %%rsp\n\t"
+   "pushfq\n\t"
+   "popq %0\n\t"
+   "addq $128, %%rsp"
+#else
+   "pushfl\n\t"
+   "popl %0"
+#endif
+   : "=r" (eflags) :: "memory");
+
+   return eflags;
+}
+
+static inline void set_eflags(unsigned long eflags)
+{
+   asm volatile (
+#ifdef __x86_64__
+   "subq $128, %%rsp\n\t"
+   "pushq %0\n\t"
+   "popfq\n\t"
+   "addq $128, %%rsp"
+#else
+   "pushl %0\n\t"
+   "popfl"
+#endif
+   :: "r" (eflags) : "flags", "memory");
+}
+
+#endif /* __SELFTESTS_X86_HELPERS_H */
diff --git a/tools/testing/selftests/x86/single_step_syscall.c 
b/tools/testing/selftests/x86/single_step_syscall.c
index 1063328..120ac74 100644
--- a/tools/testing/selftests/x86/single_step_syscall.c
+++ b/tools/testing/selftests/x86/single_step_syscall.c
@@ -31,6 +31,8 @@
 #include 
 #include 
 
+#include "helpers.h"
+
 static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
   int flags)
 {
@@ -67,21 +69,6 @@ static unsigned char altstack_data[SIGSTKSZ];
 # define INT80_CLOBBERS
 #endif
 
-static unsigned long get_eflags(void)
-{
-   unsigned long eflags;
-   asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags));
-   return eflags;
-}
-
-static void set_eflags(unsigned long eflags)
-{
-   asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH
- : : "rm" (eflags) : "flags");
-}
-
-#define X86_EFLAGS_TF (1UL << 8)
-
 static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
 {
ucontext_t *ctx = (ucontext_t*)ctx_void;
diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c 
b/tools/testing/selftests/x86/syscall_arg_fault.c
index bc0ecc2..5b7abeb 100644
--- a/tools/testing/selftests/x86/syscall_arg_fault.c
+++ b/tools/testing/selftests/x86/syscall_arg_fault.c
@@ -15,30 +15,11 @@
 #include 
 #include 
 
-#ifdef __x86_64__
-# define WIDTH "q"
-#else
-# define WIDTH "l"
-#endif
+#include "helpers.h"
 
 /* Our sigaltstack scratch space. */
 static unsigned char altstack_data[SIGSTKSZ];
 
-static unsigned long get_eflags(void)
-{
-   unsigned long eflags;
-   asm volatile ("pushf" WIDTH 

[tip: x86/urgent] selftests/x86/syscall_nt: Clear weird flags after each test

2020-07-01 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: a61fa2799ef9bf6c4f54cf7295036577cececc72
Gitweb:
https://git.kernel.org/tip/a61fa2799ef9bf6c4f54cf7295036577cececc72
Author:Andy Lutomirski 
AuthorDate:Fri, 26 Jun 2020 10:21:15 -07:00
Committer: Thomas Gleixner 
CommitterDate: Wed, 01 Jul 2020 10:00:26 +02:00

selftests/x86/syscall_nt: Clear weird flags after each test

Clear the weird flags before logging to improve strace output --
logging results while, say, TF is set does no one any favors.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/907bfa5a42d4475b8245e18b67a04b13ca51ffdb.1593191971.git.l...@kernel.org

---
 tools/testing/selftests/x86/syscall_nt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/x86/syscall_nt.c 
b/tools/testing/selftests/x86/syscall_nt.c
index f060534..5fc82b9 100644
--- a/tools/testing/selftests/x86/syscall_nt.c
+++ b/tools/testing/selftests/x86/syscall_nt.c
@@ -59,6 +59,7 @@ static void do_it(unsigned long extraflags)
set_eflags(get_eflags() | extraflags);
syscall(SYS_getpid);
flags = get_eflags();
+   set_eflags(X86_EFLAGS_IF | X86_EFLAGS_FIXED);
if ((flags & extraflags) == extraflags) {
printf("[OK]\tThe syscall worked and flags are still set\n");
} else {


[tip: x86/urgent] x86/entry/64/compat: Fix Xen PV SYSENTER frame setup

2020-07-01 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: ffae641f57476369b4d503402b37ebe489d23395
Gitweb:
https://git.kernel.org/tip/ffae641f57476369b4d503402b37ebe489d23395
Author:Andy Lutomirski 
AuthorDate:Fri, 26 Jun 2020 10:21:13 -07:00
Committer: Thomas Gleixner 
CommitterDate: Wed, 01 Jul 2020 10:00:26 +02:00

x86/entry/64/compat: Fix Xen PV SYSENTER frame setup

The SYSENTER frame setup was nonsense.  It worked by accident because the
normal code into which the Xen asm jumped (entry_SYSENTER_32/compat) threw
away SP without touching the stack.  entry_SYSENTER_compat was recently
modified such that it relied on having a valid stack pointer, so now the
Xen asm needs to invoke it with a valid stack.

Fix it up like SYSCALL: use the Xen-provided frame and skip the bare
metal prologue.

Fixes: 1c3e5d3f60e2 ("x86/entry: Make entry_64_compat.S objtool clean")
Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Boris Ostrovsky 
Link: 
https://lkml.kernel.org/r/947880c41ade688ff4836f665d0c9fcaa9bd1201.1593191971.git.l...@kernel.org

---
 arch/x86/entry/entry_64_compat.S |  1 +
 arch/x86/xen/xen-asm_64.S| 20 
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 7b9d815..381a6de 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -79,6 +79,7 @@ SYM_CODE_START(entry_SYSENTER_compat)
pushfq  /* pt_regs->flags (except IF = 0) */
pushq   $__USER32_CS/* pt_regs->cs */
pushq   $0  /* pt_regs->ip = 0 (placeholder) */
+SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
pushq   %rax/* pt_regs->orig_ax */
pushq   %rdi/* pt_regs->di */
pushq   %rsi/* pt_regs->si */
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 5d252aa..e1e1c7e 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -161,10 +161,22 @@ SYM_FUNC_END(xen_syscall32_target)
 
 /* 32-bit compat sysenter target */
 SYM_FUNC_START(xen_sysenter_target)
-   mov 0*8(%rsp), %rcx
-   mov 1*8(%rsp), %r11
-   mov 5*8(%rsp), %rsp
-   jmp entry_SYSENTER_compat
+   /*
+* NB: Xen is polite and clears TF from EFLAGS for us.  This means
+* that we don't need to guard against single step exceptions here.
+*/
+   popq %rcx
+   popq %r11
+
+   /*
+* Neither Xen nor the kernel really knows what the old SS and
+* CS were.  The kernel expects __USER32_DS and __USER32_CS, so
+* report those values even though Xen will guess its own values.
+*/
+   movq $__USER32_DS, 4*8(%rsp)
+   movq $__USER32_CS, 1*8(%rsp)
+
+   jmp entry_SYSENTER_compat_after_hwframe
 SYM_FUNC_END(xen_sysenter_target)
 
 #else /* !CONFIG_IA32_EMULATION */


[tip: x86/urgent] x86/entry: Assert that syscalls are on the right stack

2020-07-01 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: c9c26150e61de441ab58b25c1f64afc049ee0fee
Gitweb:
https://git.kernel.org/tip/c9c26150e61de441ab58b25c1f64afc049ee0fee
Author:Andy Lutomirski 
AuthorDate:Fri, 26 Jun 2020 10:21:11 -07:00
Committer: Thomas Gleixner 
CommitterDate: Wed, 01 Jul 2020 10:00:25 +02:00

x86/entry: Assert that syscalls are on the right stack

Now that the entry stack is a full page, it's too easy to regress the
system call entry code and end up on the wrong stack without noticing.
Assert that all system calls (SYSCALL64, SYSCALL32, SYSENTER, and INT80)
are on the right stack and have pt_regs in the right place.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/52059e42bb0ab8551153d012d68f7be18d72ff8e.1593191971.git.l...@kernel.org

---
 arch/x86/entry/common.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index bd3f141..ed8ccc8 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -45,6 +45,15 @@
 #define CREATE_TRACE_POINTS
 #include 
 
+/* Check that the stack and regs on entry from user mode are sane. */
+static void check_user_regs(struct pt_regs *regs)
+{
+   if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) {
+   WARN_ON_ONCE(!on_thread_stack());
+   WARN_ON_ONCE(regs != task_pt_regs(current));
+   }
+}
+
 #ifdef CONFIG_CONTEXT_TRACKING
 /**
  * enter_from_user_mode - Establish state when coming from user mode
@@ -127,9 +136,6 @@ static long syscall_trace_enter(struct pt_regs *regs)
unsigned long ret = 0;
u32 work;
 
-   if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
-   BUG_ON(regs != task_pt_regs(current));
-
work = READ_ONCE(ti->flags);
 
if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
@@ -346,6 +352,8 @@ __visible noinstr void do_syscall_64(unsigned long nr, 
struct pt_regs *regs)
 {
struct thread_info *ti;
 
+   check_user_regs(regs);
+
enter_from_user_mode();
instrumentation_begin();
 
@@ -409,6 +417,8 @@ static void do_syscall_32_irqs_on(struct pt_regs *regs)
 /* Handles int $0x80 */
 __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
 {
+   check_user_regs(regs);
+
enter_from_user_mode();
instrumentation_begin();
 
@@ -460,6 +470,8 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs 
*regs)
vdso_image_32.sym_int80_landing_pad;
bool success;
 
+   check_user_regs(regs);
+
/*
 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.


[tip: x86/fsgsbase] selftests/x86: Add a syscall_arg_fault_64 test for negative GSBASE

2020-06-22 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/fsgsbase branch of tip:

Commit-ID: a5d25e01c8146ad8846da4760422e12242fceafe
Gitweb:
https://git.kernel.org/tip/a5d25e01c8146ad8846da4760422e12242fceafe
Author:Andy Lutomirski 
AuthorDate:Wed, 27 May 2020 16:02:36 -07:00
Committer: Borislav Petkov 
CommitterDate: Mon, 22 Jun 2020 18:56:36 +02:00

selftests/x86: Add a syscall_arg_fault_64 test for negative GSBASE

If the kernel erroneously allows WRGSBASE and user code writes a
negative value, paranoid_entry will get confused. Check for this by
writing a negative value to GSBASE and doing SYSENTER with TF set. A
successful run looks like:

[RUN]   SYSENTER with TF, invalid state, and GSBASE < 0
[SKIP]  Illegal instruction

A failed run causes a kernel hang, and I believe it's because we
double-fault and then get a never ending series of page faults and,
when we exhaust the double fault stack we double fault again,
starting the process over.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Link: 
https://lkml.kernel.org/r/f4f71efc91b9eae5e3dae21c9aee1c70cf5f370e.1590620529.git.l...@kernel.org
---
 tools/testing/selftests/x86/syscall_arg_fault.c | 26 -
 1 file changed, 26 insertions(+)

diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c 
b/tools/testing/selftests/x86/syscall_arg_fault.c
index bc0ecc2..62fba40 100644
--- a/tools/testing/selftests/x86/syscall_arg_fault.c
+++ b/tools/testing/selftests/x86/syscall_arg_fault.c
@@ -72,6 +72,7 @@ static void sigsegv_or_sigbus(int sig, siginfo_t *info, void 
*ctx_void)
if (ax != -EFAULT && ax != -ENOSYS) {
printf("[FAIL]\tAX had the wrong value: 0x%lx\n",
   (unsigned long)ax);
+   printf("\tIP = 0x%lx\n", (unsigned 
long)ctx->uc_mcontext.gregs[REG_IP]);
n_errs++;
} else {
printf("[OK]\tSeems okay\n");
@@ -226,5 +227,30 @@ int main()
}
set_eflags(get_eflags() & ~X86_EFLAGS_TF);
 
+#ifdef __x86_64__
+   printf("[RUN]\tSYSENTER with TF, invalid state, and GSBASE < 0\n");
+
+   if (sigsetjmp(jmpbuf, 1) == 0) {
+   sigtrap_consecutive_syscalls = 0;
+
+   asm volatile ("wrgsbase %%rax\n\t"
+ :: "a" (0xffffffffffffffffUL));
+
+   set_eflags(get_eflags() | X86_EFLAGS_TF);
+   asm volatile (
+   "movl $-1, %%eax\n\t"
+   "movl $-1, %%ebx\n\t"
+   "movl $-1, %%ecx\n\t"
+   "movl $-1, %%edx\n\t"
+   "movl $-1, %%esi\n\t"
+   "movl $-1, %%edi\n\t"
+   "movl $-1, %%ebp\n\t"
+   "movl $-1, %%esp\n\t"
+   "sysenter"
+   : : : "memory", "flags");
+   }
+   set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+#endif
+
return 0;
 }


[tip: x86/fsgsbase] x86/process/64: Use FSBSBASE in switch_to() if available

2020-06-18 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/fsgsbase branch of tip:

Commit-ID: 673903495c85137791d5820d690229efe09c8f7b
Gitweb:
https://git.kernel.org/tip/673903495c85137791d5820d690229efe09c8f7b
Author:Andy Lutomirski 
AuthorDate:Thu, 28 May 2020 16:13:51 -04:00
Committer: Thomas Gleixner 
CommitterDate: Thu, 18 Jun 2020 15:47:02 +02:00

x86/process/64: Use FSBSBASE in switch_to() if available

With the new FSGSBASE instructions, FSBASE and GSBASE can be efficiently read
and written in __switch_to().  Use that capability to preserve the full
state.

This will enable user code to do whatever it wants with the new
instructions without any kernel-induced gotchas.  (There can still be
architectural gotchas: movl %gs,%eax; movl %eax,%gs may change GSBASE if
WRGSBASE was used, but users are expected to read the CPU manual before
doing things like that.)

This is a considerable speedup.  It seems to save about 100 cycles
per context switch compared to the baseline 4.6-rc1 behavior on a
Skylake laptop. This is mostly due to avoiding the WRMSR operation.

[ chang: 5~10% performance improvements were seen with a context switch
  benchmark that ran threads with different FS/GSBASE values (to the
  baseline 4.16). Minor edit on the changelog. ]

[ tglx: Massage changelog ]

Signed-off-by: Andy Lutomirski 
Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Signed-off-by: Sasha Levin 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1557309753-24073-8-git-send-email-chang.seok@intel.com
Link: https://lkml.kernel.org/r/20200528201402.1708239-6-sas...@kernel.org


---
 arch/x86/kernel/process_64.c | 34 --
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ef2f755..8ccc587 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -236,8 +236,18 @@ static __always_inline void save_fsgs(struct task_struct 
*task)
 {
savesegment(fs, task->thread.fsindex);
savesegment(gs, task->thread.gsindex);
-   save_base_legacy(task, task->thread.fsindex, FS);
-   save_base_legacy(task, task->thread.gsindex, GS);
+   if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+   /*
+* If FSGSBASE is enabled, we can't make any useful guesses
+* about the base, and user code expects us to save the current
+* value.  Fortunately, reading the base directly is efficient.
+*/
+   task->thread.fsbase = rdfsbase();
+   task->thread.gsbase = __rdgsbase_inactive();
+   } else {
+   save_base_legacy(task, task->thread.fsindex, FS);
+   save_base_legacy(task, task->thread.gsindex, GS);
+   }
 }
 
 /*
@@ -319,10 +329,22 @@ static __always_inline void load_seg_legacy(unsigned 
short prev_index,
 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
  struct thread_struct *next)
 {
-   load_seg_legacy(prev->fsindex, prev->fsbase,
-   next->fsindex, next->fsbase, FS);
-   load_seg_legacy(prev->gsindex, prev->gsbase,
-   next->gsindex, next->gsbase, GS);
+   if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+   /* Update the FS and GS selectors if they could have changed. */
+   if (unlikely(prev->fsindex || next->fsindex))
+   loadseg(FS, next->fsindex);
+   if (unlikely(prev->gsindex || next->gsindex))
+   loadseg(GS, next->gsindex);
+
+   /* Update the bases. */
+   wrfsbase(next->fsbase);
+   __wrgsbase_inactive(next->gsbase);
+   } else {
+   load_seg_legacy(prev->fsindex, prev->fsbase,
+   next->fsindex, next->fsbase, FS);
+   load_seg_legacy(prev->gsindex, prev->gsbase,
+   next->gsindex, next->gsbase, GS);
+   }
 }
 
 static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
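
The "architectural gotcha" mentioned in the changelog can be seen from user
space once FSGSBASE is enabled.  A hypothetical demonstration (build with
-mfsgsbase on a kernel that exposes the feature; not part of the patch):
reloading the GS selector after WRGSBASE may replace the base with whatever
the descriptor table says.

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
	unsigned short sel;

	_writegsbase_u64(0x12345000);			/* set a recognizable base */

	asm volatile ("mov %%gs, %0" : "=r" (sel));
	asm volatile ("mov %0, %%gs" :: "r" (sel));	/* selector reload may clobber the base */

	printf("GSBASE after selector reload: %#llx\n",
	       (unsigned long long)_readgsbase_u64());
	return 0;
}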


[tip: x86/fsgsbase] x86/cpu: Enable FSGSBASE on 64bit by default and add a chicken bit

2020-06-18 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/fsgsbase branch of tip:

Commit-ID: b745cfba44c152c34363eea9e052367b6b1d652b
Gitweb:
https://git.kernel.org/tip/b745cfba44c152c34363eea9e052367b6b1d652b
Author:Andy Lutomirski 
AuthorDate:Thu, 28 May 2020 16:13:58 -04:00
Committer: Thomas Gleixner 
CommitterDate: Thu, 18 Jun 2020 15:47:05 +02:00

x86/cpu: Enable FSGSBASE on 64bit by default and add a chicken bit

Now that FSGSBASE is fully supported, remove unsafe_fsgsbase, enable
FSGSBASE by default, and add nofsgsbase to disable it.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Signed-off-by: Sasha Levin 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Andi Kleen 
Link: 
https://lkml.kernel.org/r/1557309753-24073-17-git-send-email-chang.seok@intel.com
Link: https://lkml.kernel.org/r/20200528201402.1708239-13-sas...@kernel.org


---
 Documentation/admin-guide/kernel-parameters.txt |  3 +--
 arch/x86/kernel/cpu/common.c| 32 +++-
 2 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 7308db7..8c0d045 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3079,8 +3079,7 @@
no5lvl  [X86-64] Disable 5-level paging mode. Forces
kernel to use 4-level paging instead.
 
-   unsafe_fsgsbase [X86] Allow FSGSBASE instructions.  This will be
-   replaced with a nofsgsbase flag.
+   nofsgsbase  [X86] Disables FSGSBASE instructions.
 
no_console_suspend
[HW] Never suspend the console
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7438a31..18857ce 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -441,21 +441,21 @@ static void __init setup_cr_pinning(void)
static_key_enable(_pinning.key);
 }
 
-/*
- * Temporary hack: FSGSBASE is unsafe until a few kernel code paths are
- * updated. This allows us to get the kernel ready incrementally.
- *
- * Once all the pieces are in place, these will go away and be replaced with
- * a nofsgsbase chicken flag.
- */
-static bool unsafe_fsgsbase;
-
-static __init int setup_unsafe_fsgsbase(char *arg)
+static __init int x86_nofsgsbase_setup(char *arg)
 {
-   unsafe_fsgsbase = true;
+   /* Require an exact match without trailing characters. */
+   if (strlen(arg))
+   return 0;
+
+   /* Do not emit a message if the feature is not present. */
+   if (!boot_cpu_has(X86_FEATURE_FSGSBASE))
+   return 1;
+
+   setup_clear_cpu_cap(X86_FEATURE_FSGSBASE);
+   pr_info("FSGSBASE disabled via kernel command line\n");
return 1;
 }
-__setup("unsafe_fsgsbase", setup_unsafe_fsgsbase);
+__setup("nofsgsbase", x86_nofsgsbase_setup);
 
 /*
  * Protection Keys are not available in 32-bit mode.
@@ -1512,12 +1512,8 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_umip(c);
 
/* Enable FSGSBASE instructions if available. */
-   if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
-   if (unsafe_fsgsbase)
-   cr4_set_bits(X86_CR4_FSGSBASE);
-   else
-   clear_cpu_cap(c, X86_FEATURE_FSGSBASE);
-   }
+   if (cpu_has(c, X86_FEATURE_FSGSBASE))
+   cr4_set_bits(X86_CR4_FSGSBASE);
 
/*
 * The vendor-specific functions might have changed features.
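
Once the kernel sets CR4.FSGSBASE, the unprivileged RDFSBASE/WRFSBASE
instructions become usable from user space.  A hypothetical user-space check
(not part of the patch; the HWCAP2 bit value is an assumption about the
enablement series, and the program must be built with -mfsgsbase):

#include <elf.h>
#include <sys/auxv.h>
#include <immintrin.h>
#include <stdio.h>

#ifndef HWCAP2_FSGSBASE
#define HWCAP2_FSGSBASE (1 << 1)	/* assumed bit, per the FSGSBASE enablement series */
#endif

int main(void)
{
	if (!(getauxval(AT_HWCAP2) & HWCAP2_FSGSBASE)) {
		printf("FSGSBASE not enabled (old kernel or booted with nofsgsbase)\n");
		return 0;
	}

	_writefsbase_u64(_readfsbase_u64());	/* round-trip the base; no fault expected */
	printf("FSBASE = %#llx\n", (unsigned long long)_readfsbase_u64());
	return 0;
}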


[tip: x86/fsgsbase] x86/cpu: Add 'unsafe_fsgsbase' to enable CR4.FSGSBASE

2020-06-18 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/fsgsbase branch of tip:

Commit-ID: dd649bd0b3aa012740059b1ba31ecad28a408f7f
Gitweb:
https://git.kernel.org/tip/dd649bd0b3aa012740059b1ba31ecad28a408f7f
Author:Andy Lutomirski 
AuthorDate:Thu, 28 May 2020 16:13:48 -04:00
Committer: Thomas Gleixner 
CommitterDate: Thu, 18 Jun 2020 15:46:59 +02:00

x86/cpu: Add 'unsafe_fsgsbase' to enable CR4.FSGSBASE

This is temporary.  It will allow the next few patches to be tested
incrementally.

Setting unsafe_fsgsbase is a root hole.  Don't do it.

Signed-off-by: Andy Lutomirski 
Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Signed-off-by: Sasha Levin 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Andi Kleen 
Reviewed-by: Andy Lutomirski 
Link: 
https://lkml.kernel.org/r/1557309753-24073-4-git-send-email-chang.seok@intel.com
Link: https://lkml.kernel.org/r/20200528201402.1708239-3-sas...@kernel.org


---
 Documentation/admin-guide/kernel-parameters.txt |  3 ++-
 arch/x86/kernel/cpu/common.c| 24 -
 2 files changed, 27 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index fb95fad..7308db7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3079,6 +3079,9 @@
no5lvl  [X86-64] Disable 5-level paging mode. Forces
kernel to use 4-level paging instead.
 
+   unsafe_fsgsbase [X86] Allow FSGSBASE instructions.  This will be
+   replaced with a nofsgsbase flag.
+
no_console_suspend
[HW] Never suspend the console
Disable suspending of consoles during suspend and
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 043d93c..7438a31 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -442,6 +442,22 @@ static void __init setup_cr_pinning(void)
 }
 
 /*
+ * Temporary hack: FSGSBASE is unsafe until a few kernel code paths are
+ * updated. This allows us to get the kernel ready incrementally.
+ *
+ * Once all the pieces are in place, these will go away and be replaced with
+ * a nofsgsbase chicken flag.
+ */
+static bool unsafe_fsgsbase;
+
+static __init int setup_unsafe_fsgsbase(char *arg)
+{
+   unsafe_fsgsbase = true;
+   return 1;
+}
+__setup("unsafe_fsgsbase", setup_unsafe_fsgsbase);
+
+/*
  * Protection Keys are not available in 32-bit mode.
  */
 static bool pku_disabled;
@@ -1495,6 +1511,14 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_smap(c);
setup_umip(c);
 
+   /* Enable FSGSBASE instructions if available. */
+   if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
+   if (unsafe_fsgsbase)
+   cr4_set_bits(X86_CR4_FSGSBASE);
+   else
+   clear_cpu_cap(c, X86_FEATURE_FSGSBASE);
+   }
+
/*
 * The vendor-specific functions might have changed features.
 * Now we do "generic changes."


[tip: x86/entry] x86/entry: Treat BUG/WARN as NMI-like entries

2020-06-12 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/entry branch of tip:

Commit-ID: 15a416e8aaa758b5534f64a3972dae05275bc225
Gitweb:
https://git.kernel.org/tip/15a416e8aaa758b5534f64a3972dae05275bc225
Author:Andy Lutomirski 
AuthorDate:Thu, 11 Jun 2020 20:26:38 -07:00
Committer: Thomas Gleixner 
CommitterDate: Fri, 12 Jun 2020 12:12:57 +02:00

x86/entry: Treat BUG/WARN as NMI-like entries

BUG/WARN are cleverly optimized using UD2 to handle the BUG/WARN out of
line in an exception fixup.

But if BUG or WARN is issued in a funny RCU context, then the
idtentry_enter...() path might helpfully WARN that the RCU context is
invalid, which results in infinite recursion.

Split the BUG/WARN handling into an nmi_enter()/nmi_exit() path in
exc_invalid_op() to increase the chance to survive the experience.

[ tglx: Make the declaration match the implementation ]

Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/f8fe40e0088749734b4435b554f73eee53dcf7a8.1591932307.git.l...@kernel.org

---
 arch/x86/include/asm/idtentry.h |  2 +-
 arch/x86/kernel/traps.c | 64 +++-
 arch/x86/mm/extable.c   | 15 ++--
 3 files changed, 52 insertions(+), 29 deletions(-)

diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index d203c54..2fc6b0c 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -543,7 +543,6 @@ SYM_CODE_END(spurious_entries_start)
 DECLARE_IDTENTRY(X86_TRAP_DE,  exc_divide_error);
 DECLARE_IDTENTRY(X86_TRAP_OF,  exc_overflow);
 DECLARE_IDTENTRY(X86_TRAP_BR,  exc_bounds);
-DECLARE_IDTENTRY(X86_TRAP_UD,  exc_invalid_op);
 DECLARE_IDTENTRY(X86_TRAP_NM,  exc_device_not_available);
 DECLARE_IDTENTRY(X86_TRAP_OLD_MF,  exc_coproc_segment_overrun);
 DECLARE_IDTENTRY(X86_TRAP_SPURIOUS,exc_spurious_interrupt_bug);
@@ -561,6 +560,7 @@ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP, 
exc_general_protection);
 DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC,exc_alignment_check);
 
 /* Raw exception entries which need extra work */
+DECLARE_IDTENTRY_RAW(X86_TRAP_UD,  exc_invalid_op);
 DECLARE_IDTENTRY_RAW(X86_TRAP_BP,  exc_int3);
 DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF,exc_page_fault);
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7febae3..af75109 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -97,24 +97,6 @@ int is_valid_bugaddr(unsigned long addr)
return ud == INSN_UD0 || ud == INSN_UD2;
 }
 
-int fixup_bug(struct pt_regs *regs, int trapnr)
-{
-   if (trapnr != X86_TRAP_UD)
-   return 0;
-
-   switch (report_bug(regs->ip, regs)) {
-   case BUG_TRAP_TYPE_NONE:
-   case BUG_TRAP_TYPE_BUG:
-   break;
-
-   case BUG_TRAP_TYPE_WARN:
-   regs->ip += LEN_UD2;
-   return 1;
-   }
-
-   return 0;
-}
-
 static nokprobe_inline int
 do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
  struct pt_regs *regs, long error_code)
@@ -190,13 +172,6 @@ static void do_error_trap(struct pt_regs *regs, long 
error_code, char *str,
 {
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 
-   /*
-* WARN*()s end up here; fix them up before we call the
-* notifier chain.
-*/
-   if (!user_mode(regs) && fixup_bug(regs, trapnr))
-   return;
-
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
NOTIFY_STOP) {
cond_local_irq_enable(regs);
@@ -241,9 +216,46 @@ static inline void handle_invalid_op(struct pt_regs *regs)
  ILL_ILLOPN, error_get_trap_addr(regs));
 }
 
-DEFINE_IDTENTRY(exc_invalid_op)
+DEFINE_IDTENTRY_RAW(exc_invalid_op)
 {
+   bool rcu_exit;
+
+   /*
+* Handle BUG/WARN like NMIs instead of like normal idtentries:
+* if we bugged/warned in a bad RCU context, for example, the last
+* thing we want is to BUG/WARN again in the idtentry code, ad
+* infinitum.
+*/
+   if (!user_mode(regs) && is_valid_bugaddr(regs->ip)) {
+   enum bug_trap_type type;
+
+   nmi_enter();
+   instrumentation_begin();
+   trace_hardirqs_off_finish();
+   type = report_bug(regs->ip, regs);
+   if (regs->flags & X86_EFLAGS_IF)
+   trace_hardirqs_on_prepare();
+   instrumentation_end();
+   nmi_exit();
+
+   if (type == BUG_TRAP_TYPE_WARN) {
+   /* Skip the ud2. */
+   regs->ip += LEN_UD2;
+   return;
+   }
+
+   /*
+* Else, if this was a BUG and report_bug returns or if this
+* was just a normal #UD, we want to continue 

[tip: x86/urgent] x86/syscalls: Revert "x86/syscalls: Make __X32_SYSCALL_BIT be unsigned long"

2020-05-26 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 700d3a5a664df267f01ec8887fd2d8ff98f67e7f
Gitweb:
https://git.kernel.org/tip/700d3a5a664df267f01ec8887fd2d8ff98f67e7f
Author:Andy Lutomirski 
AuthorDate:Fri, 08 May 2020 17:25:32 -07:00
Committer: Borislav Petkov 
CommitterDate: Tue, 26 May 2020 16:42:43 +02:00

x86/syscalls: Revert "x86/syscalls: Make __X32_SYSCALL_BIT be unsigned long"

Revert

  45e29d119e99 ("x86/syscalls: Make __X32_SYSCALL_BIT be unsigned long")

and add a comment to discourage someone else from making the same
mistake again.

It turns out that some user code fails to compile if __X32_SYSCALL_BIT
is unsigned long. See, for example [1] below.

 [ bp: Massage and do the same thing in the respective tools/ header. ]

Fixes: 45e29d119e99 ("x86/syscalls: Make __X32_SYSCALL_BIT be unsigned long")
Reported-by: Thorsten Glaser 
Signed-off-by: Andy Lutomirski 
Signed-off-by: Borislav Petkov 
Cc: sta...@kernel.org
Link: [1] https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=954294
Link: 
https://lkml.kernel.org/r/92e55442b744a5951fdc9cfee10badd0a5f7f828.1588983892.git.l...@kernel.org
---
 arch/x86/include/uapi/asm/unistd.h   | 11 +--
 tools/arch/x86/include/uapi/asm/unistd.h |  2 +-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/uapi/asm/unistd.h 
b/arch/x86/include/uapi/asm/unistd.h
index 196fdd0..be5e2e7 100644
--- a/arch/x86/include/uapi/asm/unistd.h
+++ b/arch/x86/include/uapi/asm/unistd.h
@@ -2,8 +2,15 @@
 #ifndef _UAPI_ASM_X86_UNISTD_H
 #define _UAPI_ASM_X86_UNISTD_H
 
-/* x32 syscall flag bit */
-#define __X32_SYSCALL_BIT  0x4000UL
+/*
+ * x32 syscall flag bit.  Some user programs expect syscall NR macros
+ * and __X32_SYSCALL_BIT to have type int, even though syscall numbers
+ * are, for practical purposes, unsigned long.
+ *
+ * Fortunately, expressions like (nr & ~__X32_SYSCALL_BIT) do the right
+ * thing regardless.
+ */
+#define __X32_SYSCALL_BIT  0x4000
 
 #ifndef __KERNEL__
 # ifdef __i386__
diff --git a/tools/arch/x86/include/uapi/asm/unistd.h 
b/tools/arch/x86/include/uapi/asm/unistd.h
index 196fdd0..30d7d04 100644
--- a/tools/arch/x86/include/uapi/asm/unistd.h
+++ b/tools/arch/x86/include/uapi/asm/unistd.h
@@ -3,7 +3,7 @@
 #define _UAPI_ASM_X86_UNISTD_H
 
 /* x32 syscall flag bit */
-#define __X32_SYSCALL_BIT  0x4000UL
+#define __X32_SYSCALL_BIT  0x4000
 
 #ifndef __KERNEL__
 # ifdef __i386__


[tip: x86/entry] x86/hw_breakpoint: Prevent data breakpoints on cpu_entry_area

2020-05-19 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/entry branch of tip:

Commit-ID: 3ea11ac991d594728e5df42f7eb1145072b9c2bc
Gitweb:
https://git.kernel.org/tip/3ea11ac991d594728e5df42f7eb1145072b9c2bc
Author:Andy Lutomirski 
AuthorDate:Mon, 24 Feb 2020 13:24:58 +01:00
Committer: Thomas Gleixner 
CommitterDate: Fri, 15 May 2020 20:03:03 +02:00

x86/hw_breakpoint: Prevent data breakpoints on cpu_entry_area

A data breakpoint near the top of an IST stack will cause unrecoverable
recursion.  A data breakpoint on the GDT, IDT, or TSS is terrifying.
Prevent either of these from happening.
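For illustration only (not part of the patch): after this change, a perf hardware breakpoint whose range overlaps the cpu_entry_area is rejected at setup time. A minimal sketch, assuming an x86_64 kernel where CPU_ENTRY_AREA_BASE is 0xfffffe0000000000 (per Documentation/x86/x86_64/mm.rst) and enough privilege to request kernel-address breakpoints; without that privilege the call fails earlier with EACCES/EPERM:

#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	struct perf_event_attr attr;
	/* assumed: CPU_ENTRY_AREA_BASE on x86_64 */
	unsigned long cea = 0xfffffe0000000000UL;
	long fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.size = sizeof(attr);
	attr.bp_type = HW_BREAKPOINT_W;
	attr.bp_addr = cea;
	attr.bp_len = HW_BREAKPOINT_LEN_8;

	fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu */,
		     -1 /* group_fd */, 0 /* flags */);
	if (fd < 0)
		printf("perf_event_open: %s (EINVAL expected with this patch)\n",
		       strerror(errno));
	return 0;
}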

Co-developed-by: Peter Zijlstra 
Signed-off-by: Andy Lutomirski 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Borislav Petkov 
Reviewed-by: Lai Jiangshan 
Reviewed-by: Alexandre Chartre 
Link: https://lkml.kernel.org/r/20200505134058.272448...@linutronix.de

---
 arch/x86/kernel/hw_breakpoint.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 4d8d53e..d42fc0e 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -227,10 +227,35 @@ int arch_check_bp_in_kernelspace(struct 
arch_hw_breakpoint *hw)
return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
 }
 
+/*
+ * Checks whether the range from addr to end, inclusive, overlaps the CPU
+ * entry area range.
+ */
+static inline bool within_cpu_entry_area(unsigned long addr, unsigned long end)
+{
+   return end >= CPU_ENTRY_AREA_BASE &&
+  addr < (CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_TOTAL_SIZE);
+}
+
 static int arch_build_bp_info(struct perf_event *bp,
  const struct perf_event_attr *attr,
  struct arch_hw_breakpoint *hw)
 {
+   unsigned long bp_end;
+
+   bp_end = attr->bp_addr + attr->bp_len - 1;
+   if (bp_end < attr->bp_addr)
+   return -EINVAL;
+
+   /*
+* Prevent any breakpoint of any type that overlaps the
+* cpu_entry_area.  This protects the IST stacks and also
+* reduces the chance that we ever find out what happens if
+* there's a data breakpoint on the GDT, IDT, or TSS.
+*/
+   if (within_cpu_entry_area(attr->bp_addr, bp_end))
+   return -EINVAL;
+
hw->address = attr->bp_addr;
hw->mask = 0;
 


[tip: x86/entry] x86/kvm: Handle async page faults directly through do_page_fault()

2020-05-19 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/entry branch of tip:

Commit-ID: ef68017eb5704eb2b0577c3aa6619e13caf2b59f
Gitweb:
https://git.kernel.org/tip/ef68017eb5704eb2b0577c3aa6619e13caf2b59f
Author:Andy Lutomirski 
AuthorDate:Fri, 28 Feb 2020 10:42:48 -08:00
Committer: Thomas Gleixner 
CommitterDate: Tue, 19 May 2020 15:53:57 +02:00

x86/kvm: Handle async page faults directly through do_page_fault()

KVM overloads #PF to indicate two types of not-actually-page-fault
events.  Right now, the KVM guest code intercepts them by modifying
the IDT and hooking the #PF vector.  This makes the already fragile
fault code even harder to understand, and it also pollutes call
traces with async_page_fault and do_async_page_fault for normal page
faults.

Clean it up by moving the logic into do_page_fault() using a static
branch.  This gets rid of the platform trap_init override mechanism
completely.
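The fault.c side of the change (not shown in full below) boils down to an early hand-off in do_page_fault(). A simplified sketch of that shape -- the comments and surrounding details differ in the actual hunk:

dotraplinkage void
do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
	      unsigned long address)
{
	prefetchw(&current->mm->mmap_sem);

	/*
	 * KVM delivers async page fault events through the #PF vector.
	 * When the static key is enabled, let KVM claim the event (CR2
	 * carries a token rather than a faulting address) and return
	 * early; otherwise this is a patched-out jump with no overhead.
	 */
	if (kvm_handle_async_pf(regs, (u32)address))
		return;

	trace_page_fault_entries(regs, hw_error_code, address);
	/* ... normal #PF handling continues as before ... */
}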

[ tglx: Fixed up 32bit, removed error code from the async functions and
massaged coding style ]

Signed-off-by: Andy Lutomirski 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Alexandre Chartre 
Acked-by: Paolo Bonzini 
Acked-by: Peter Zijlstra 
Link: https://lkml.kernel.org/r/20200505134059.169270...@linutronix.de


---
 arch/x86/entry/entry_32.S   |  8 +---
 arch/x86/entry/entry_64.S   |  4 +---
 arch/x86/include/asm/kvm_para.h | 19 ++--
 arch/x86/include/asm/x86_init.h |  2 +--
 arch/x86/kernel/kvm.c   | 39 +---
 arch/x86/kernel/traps.c |  2 +--
 arch/x86/kernel/x86_init.c  |  1 +-
 arch/x86/mm/fault.c | 19 -
 8 files changed, 57 insertions(+), 37 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index b67bae7..8ba0985 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1693,14 +1693,6 @@ SYM_CODE_START(general_protection)
jmp common_exception
 SYM_CODE_END(general_protection)
 
-#ifdef CONFIG_KVM_GUEST
-SYM_CODE_START(async_page_fault)
-   ASM_CLAC
-   pushl   $do_async_page_fault
-   jmp common_exception_read_cr2
-SYM_CODE_END(async_page_fault)
-#endif
-
 SYM_CODE_START(rewind_stack_do_exit)
/* Prevent any naive code from trying to unwind to our caller. */
xorl%ebp, %ebp
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3063aa9..9ab3ea6 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1202,10 +1202,6 @@ idtentry xendebug		do_debug		has_error_code=0
 idtentry general_protection	do_general_protection	has_error_code=1
 idtentry page_fault		do_page_fault		has_error_code=1	read_cr2=1
 
-#ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault	do_async_page_fault	has_error_code=1	read_cr2=1
-#endif
-
 #ifdef CONFIG_X86_MCE
 idtentry machine_check		do_mce			has_error_code=0	paranoid=1
 #endif
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 9b4df6e..5261363 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -91,8 +91,18 @@ unsigned int kvm_arch_para_hints(void);
 void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
-extern void kvm_disable_steal_time(void);
-void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, 
unsigned long address);
+void kvm_disable_steal_time(void);
+bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token);
+
+DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
+
+static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 
token)
+{
+	if (static_branch_unlikely(&kvm_async_pf_enabled))
+   return __kvm_handle_async_pf(regs, token);
+   else
+   return false;
+}
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 void __init kvm_spinlock_init(void);
@@ -130,6 +140,11 @@ static inline void kvm_disable_steal_time(void)
 {
return;
 }
+
+static inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
+{
+   return false;
+}
 #endif
 
 #endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 96d9cd2..6807153 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -50,14 +50,12 @@ struct x86_init_resources {
  * @pre_vector_init:   init code to run before interrupt vectors
  * are set up.
  * @intr_init: interrupt init code
- * @trap_init: platform specific trap setup
  * @intr_mode_select:  interrupt delivery mode selection
  * @intr_mode_init:interrupt delivery mode setup
  */
 struct x86_init_irqs {
void (*pre_vector_init)(void);
void (*intr_init)(void);
-   void (*trap_init)(void);
void 

[tip: x86/entry] syscalls/x86: Wire up COMPAT_SYSCALL_DEFINE0

2019-10-11 Thread tip-bot2 for Andy Lutomirski
The following commit has been merged into the x86/entry branch of tip:

Commit-ID: cf3b83e19d7c928e05a5d193c375463182c6029a
Gitweb:
https://git.kernel.org/tip/cf3b83e19d7c928e05a5d193c375463182c6029a
Author:Andy Lutomirski 
AuthorDate:Tue, 08 Oct 2019 15:40:46 -07:00
Committer: Ingo Molnar 
CommitterDate: Fri, 11 Oct 2019 12:49:18 +02:00

syscalls/x86: Wire up COMPAT_SYSCALL_DEFINE0

x86 has special handling for COMPAT_SYSCALL_DEFINEx, but there was
no override for COMPAT_SYSCALL_DEFINE0.  Wire it up so that we can
use it for rt_sigreturn.
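With the stubs added below, a zero-argument compat syscall definition such as COMPAT_SYSCALL_DEFINE0(rt_sigreturn) expands roughly as follows (IA32 path shown; the ALLOW_ERROR_INJECTION annotation and the x32 stub are omitted for brevity):

static long __se_compat_sys_rt_sigreturn(void);
static inline long __do_compat_sys_rt_sigreturn(void);

/* IA32 entry stub: takes pt_regs like every other x86 syscall stub, but
 * a 0-argument syscall passes nothing through. */
asmlinkage long __ia32_compat_sys_rt_sigreturn(const struct pt_regs *regs);
asmlinkage long __ia32_compat_sys_rt_sigreturn(const struct pt_regs *regs)
{
	return __se_compat_sys_rt_sigreturn();
}

static long __se_compat_sys_rt_sigreturn(void)
{
	return __do_compat_sys_rt_sigreturn();
}

static inline long __do_compat_sys_rt_sigreturn(void)
/* { ... body supplied at the definition site ... } */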

Signed-off-by: Andy Lutomirski 
Signed-off-by: Sami Tolvanen 
Cc: Borislav Petkov 
Cc: H . Peter Anvin 
Cc: Kees Cook 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Link: https://lkml.kernel.org/r/20191008224049.115427-3-samitolva...@google.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/syscall_wrapper.h | 32 +++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/syscall_wrapper.h 
b/arch/x86/include/asm/syscall_wrapper.h
index 90eb70d..3dab048 100644
--- a/arch/x86/include/asm/syscall_wrapper.h
+++ b/arch/x86/include/asm/syscall_wrapper.h
@@ -28,13 +28,21 @@
  * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this
  * case as well.
  */
+#define __IA32_COMPAT_SYS_STUB0(x, name)   \
+   asmlinkage long __ia32_compat_sys_##name(const struct pt_regs *regs);\
+   ALLOW_ERROR_INJECTION(__ia32_compat_sys_##name, ERRNO); \
+   asmlinkage long __ia32_compat_sys_##name(const struct pt_regs *regs)\
+   {   \
+   return __se_compat_sys_##name();\
+   }
+
 #define __IA32_COMPAT_SYS_STUBx(x, name, ...)  \
asmlinkage long __ia32_compat_sys##name(const struct pt_regs *regs);\
ALLOW_ERROR_INJECTION(__ia32_compat_sys##name, ERRNO);  \
asmlinkage long __ia32_compat_sys##name(const struct pt_regs *regs)\
{   \
return 
__se_compat_sys##name(SC_IA32_REGS_TO_ARGS(x,__VA_ARGS__));\
-   }   \
+   }
 
 #define __IA32_SYS_STUBx(x, name, ...) \
asmlinkage long __ia32_sys##name(const struct pt_regs *regs);   \
@@ -76,15 +84,24 @@
  * of the x86-64-style parameter ordering of x32 syscalls. The syscalls common
  * with x86_64 obviously do not need such care.
  */
+#define __X32_COMPAT_SYS_STUB0(x, name, ...)   \
+   asmlinkage long __x32_compat_sys_##name(const struct pt_regs *regs);\
+   ALLOW_ERROR_INJECTION(__x32_compat_sys_##name, ERRNO);  \
+   asmlinkage long __x32_compat_sys_##name(const struct pt_regs *regs)\
+   {   \
+   return __se_compat_sys_##name();\
+   }
+
 #define __X32_COMPAT_SYS_STUBx(x, name, ...)   \
asmlinkage long __x32_compat_sys##name(const struct pt_regs *regs);\
ALLOW_ERROR_INJECTION(__x32_compat_sys##name, ERRNO);   \
asmlinkage long __x32_compat_sys##name(const struct pt_regs *regs)\
{   \
return 
__se_compat_sys##name(SC_X86_64_REGS_TO_ARGS(x,__VA_ARGS__));\
-   }   \
+   }
 
 #else /* CONFIG_X86_X32 */
+#define __X32_COMPAT_SYS_STUB0(x, name)
 #define __X32_COMPAT_SYS_STUBx(x, name, ...)
 #endif /* CONFIG_X86_X32 */
 
@@ -95,6 +112,17 @@
  * mapping of registers to parameters, we need to generate stubs for each
  * of them.
  */
+#define COMPAT_SYSCALL_DEFINE0(name)   \
+   static long __se_compat_sys_##name(void);   \
+   static inline long __do_compat_sys_##name(void);\
+   __IA32_COMPAT_SYS_STUB0(x, name)\
+   __X32_COMPAT_SYS_STUB0(x, name) \
+   static long __se_compat_sys_##name(void)\
+   {   \
+   return __do_compat_sys_##name();\
+   }   \
+   static inline long __do_compat_sys_##name(void)
+
 #define COMPAT_SYSCALL_DEFINEx(x, name, ...)   
\
static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__));  
\
static inline long 
__do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\