Module Name: src
Committed By: martin
Date: Thu Mar 22 16:59:04 UTC 2018
Modified Files:
src/sys/arch/amd64/amd64 [netbsd-8]: amd64_trap.S db_machdep.c
genassym.cf locore.S machdep.c trap.c vector.S
src/sys/arch/amd64/conf [netbsd-8]: GENERIC kern.ldscript
src/sys/arch/amd64/include [netbsd-8]: frameasm.h param.h pmap.h
src/sys/arch/x86/conf [netbsd-8]: files.x86
src/sys/arch/x86/include [netbsd-8]: cpu.h pmap.h
src/sys/arch/x86/x86 [netbsd-8]: cpu.c pmap.c vm_machdep.c
x86_machdep.c
src/sys/arch/xen/conf [netbsd-8]: files.compat
Added Files:
src/sys/arch/x86/x86 [netbsd-8]: svs.c
Log Message:
Pull up the following revisions, requested by maxv in ticket #652:
sys/arch/amd64/amd64/amd64_trap.S upto 1.39 (partial, patch)
sys/arch/amd64/amd64/db_machdep.c 1.6 (patch)
sys/arch/amd64/amd64/genassym.cf 1.65,1.66,1.67 (patch)
sys/arch/amd64/amd64/locore.S upto 1.159 (partial, patch)
sys/arch/amd64/amd64/machdep.c 1.299-1.302 (patch)
sys/arch/amd64/amd64/trap.c upto 1.113 (partial, patch)
sys/arch/amd64/amd64/vector.S upto 1.61 (partial, patch)
sys/arch/amd64/conf/GENERIC 1.477,1.478 (patch)
sys/arch/amd64/conf/kern.ldscript 1.26 (patch)
sys/arch/amd64/include/frameasm.h upto 1.37 (partial, patch)
sys/arch/amd64/include/param.h 1.25 (patch)
sys/arch/amd64/include/pmap.h 1.41,1.43,1.44 (patch)
sys/arch/x86/conf/files.x86 1.91,1.93 (patch)
sys/arch/x86/include/cpu.h 1.88,1.89 (patch)
sys/arch/x86/include/pmap.h 1.75 (patch)
sys/arch/x86/x86/cpu.c 1.144,1.146,1.148,1.149 (patch)
sys/arch/x86/x86/pmap.c upto 1.289 (partial, patch)
sys/arch/x86/x86/vm_machdep.c 1.31,1.32 (patch)
sys/arch/x86/x86/x86_machdep.c 1.104,1.106,1.108 (patch)
sys/arch/x86/x86/svs.c 1.1-1.14
sys/arch/xen/conf/files.compat 1.30 (patch)
Backport SVS. Not enabled yet.
To generate a diff of this commit:
cvs rdiff -u -r1.5.6.1 -r1.5.6.2 src/sys/arch/amd64/amd64/amd64_trap.S
cvs rdiff -u -r1.4 -r1.4.30.1 src/sys/arch/amd64/amd64/db_machdep.c
cvs rdiff -u -r1.60.10.1 -r1.60.10.2 src/sys/arch/amd64/amd64/genassym.cf
cvs rdiff -u -r1.123.6.4 -r1.123.6.5 src/sys/arch/amd64/amd64/locore.S
cvs rdiff -u -r1.255.6.5 -r1.255.6.6 src/sys/arch/amd64/amd64/machdep.c
cvs rdiff -u -r1.96.4.1 -r1.96.4.2 src/sys/arch/amd64/amd64/trap.c
cvs rdiff -u -r1.49.2.1 -r1.49.2.2 src/sys/arch/amd64/amd64/vector.S
cvs rdiff -u -r1.459.2.5 -r1.459.2.6 src/sys/arch/amd64/conf/GENERIC
cvs rdiff -u -r1.22.6.2 -r1.22.6.3 src/sys/arch/amd64/conf/kern.ldscript
cvs rdiff -u -r1.20.32.1 -r1.20.32.2 src/sys/arch/amd64/include/frameasm.h
cvs rdiff -u -r1.21.6.1 -r1.21.6.2 src/sys/arch/amd64/include/param.h
cvs rdiff -u -r1.39 -r1.39.8.1 src/sys/arch/amd64/include/pmap.h
cvs rdiff -u -r1.88 -r1.88.6.1 src/sys/arch/x86/conf/files.x86
cvs rdiff -u -r1.71.2.3 -r1.71.2.4 src/sys/arch/x86/include/cpu.h
cvs rdiff -u -r1.64.6.1 -r1.64.6.2 src/sys/arch/x86/include/pmap.h
cvs rdiff -u -r1.130.2.4 -r1.130.2.5 src/sys/arch/x86/x86/cpu.c
cvs rdiff -u -r1.245.6.5 -r1.245.6.6 src/sys/arch/x86/x86/pmap.c
cvs rdiff -u -r0 -r1.14.2.2 src/sys/arch/x86/x86/svs.c
cvs rdiff -u -r1.28.6.2 -r1.28.6.3 src/sys/arch/x86/x86/vm_machdep.c
cvs rdiff -u -r1.91.4.1 -r1.91.4.2 src/sys/arch/x86/x86/x86_machdep.c
cvs rdiff -u -r1.25.8.1 -r1.25.8.2 src/sys/arch/xen/conf/files.compat
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/sys/arch/amd64/amd64/amd64_trap.S
diff -u src/sys/arch/amd64/amd64/amd64_trap.S:1.5.6.1 src/sys/arch/amd64/amd64/amd64_trap.S:1.5.6.2
--- src/sys/arch/amd64/amd64/amd64_trap.S:1.5.6.1 Wed Mar 7 14:50:56 2018
+++ src/sys/arch/amd64/amd64/amd64_trap.S Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: amd64_trap.S,v 1.5.6.1 2018/03/07 14:50:56 martin Exp $ */
+/* $NetBSD: amd64_trap.S,v 1.5.6.2 2018/03/22 16:59:03 martin Exp $ */
/*
* Copyright (c) 1998, 2007, 2008, 2017 The NetBSD Foundation, Inc.
@@ -95,13 +95,19 @@
#define PRE_TRAP
#endif
+#define TRAPENTRY \
+ INTRENTRY ; \
+ jmp .Lalltraps_noentry
+
#define TRAP_NJ(a) PRE_TRAP ; pushq $(a)
#define ZTRAP_NJ(a) PRE_TRAP ; pushq $0 ; pushq $(a)
-#define TRAP(a) TRAP_NJ(a) ; jmp _C_LABEL(alltraps)
-#define ZTRAP(a) ZTRAP_NJ(a) ; jmp _C_LABEL(alltraps)
+#define TRAP(a) TRAP_NJ(a) ; TRAPENTRY
+#define ZTRAP(a) ZTRAP_NJ(a) ; TRAPENTRY
.text
+ TEXT_USER_BEGIN
+
IDTVEC(trap00)
ZTRAP(T_DIVIDE)
IDTVEC_END(trap00)
@@ -128,6 +134,7 @@ IDTVEC(trap02)
ZTRAP_NJ(T_NMI)
subq $TF_REGSIZE,%rsp
INTR_SAVE_GPRS
+ SVS_ENTER_ALTSTACK
cld
movw %gs,TF_GS(%rsp)
movw %fs,TF_FS(%rsp)
@@ -143,6 +150,7 @@ IDTVEC(trap02)
movq %rsp,%rdi
incq CPUVAR(NTRAP)
call _C_LABEL(nmitrap)
+ SVS_LEAVE_ALTSTACK
swapgs
jmp .Lnmileave
@@ -150,6 +158,7 @@ IDTVEC(trap02)
movq %rsp,%rdi
incq CPUVAR(NTRAP)
call _C_LABEL(nmitrap)
+ SVS_LEAVE_ALTSTACK
.Lnmileave:
INTR_RESTORE_GPRS
@@ -221,6 +230,7 @@ IDTVEC(trap08)
TRAP_NJ(T_DOUBLEFLT)
subq $TF_REGSIZE,%rsp
INTR_SAVE_GPRS
+ SVS_ENTER_ALTSTACK
testb $SEL_UPL,TF_CS(%rsp)
jz 1f
swapgs
@@ -235,6 +245,7 @@ IDTVEC(trap08)
incq CPUVAR(NTRAP)
call _C_LABEL(doubletrap)
+ SVS_LEAVE_ALTSTACK
INTR_RESTORE_GPRS
testb $SEL_UPL,TF_CS(%rsp)
@@ -260,22 +271,22 @@ IDTVEC_END(trap10)
* equivalent of iret, if it does this code would be needed
* in order to copy the user segment registers into the fault frame.
*/
-#define check_swapgs alltraps
+#define kernuser_reenter alltraps
#endif
IDTVEC(trap11) /* #NP() Segment not present */
TRAP_NJ(T_SEGNPFLT)
- jmp check_swapgs
+ jmp kernuser_reenter
IDTVEC_END(trap11)
IDTVEC(trap12) /* #SS() Stack exception */
TRAP_NJ(T_STKFLT)
- jmp check_swapgs
+ jmp kernuser_reenter
IDTVEC_END(trap12)
IDTVEC(trap13) /* #GP() General protection */
TRAP_NJ(T_PROTFLT)
- jmp check_swapgs
+ jmp kernuser_reenter
IDTVEC_END(trap13)
IDTVEC(trap14)
@@ -352,68 +363,135 @@ IDTVEC(intrspurious)
jmp .Lalltraps_checkusr
IDTVEC_END(intrspurious)
-
+#ifndef kernuser_reenter
/*
- * trap() calls here when it detects a fault in INTRFASTEXIT (loading the
- * segment registers or during the iret itself). The address of the (possibly
- * reconstructed) user trap frame is passed as an argument.
- *
- * Typically the code will have raised a SIGSEGV which will be actioned
- * by the code below.
+ * We need to worry about traps in kernel mode while the kernel %gs isn't
+ * loaded. When such traps happen, we have CPL=0 and %gs=userland, and we
+ * must perform an additional swapgs to get %gs=kernel.
*/
- .type _C_LABEL(trap_return_fault_return), @function
-LABEL(trap_return_fault_return)
- mov %rdi,%rsp /* frame for user return */
-#ifdef DIAGNOSTIC
- /* We can't recover the saved %rbx, so suppress warning */
- movl CPUVAR(ILEVEL),%ebx
-#endif
- jmp .Lalltraps_checkusr
-END(trap_return_fault_return)
-#ifndef check_swapgs
+#define TF_SMALL(val, reg) (val - TF_REGSIZE)(reg)
+#define TF_SMALL_REGPUSHED(val, reg) (val - (TF_REGSIZE - 8))(reg)
+
/*
- * We need to worry about traps in kernel mode while the kernel %gs isn't
- * loaded. These are either faults on iretq during return to user or loads to
- * %gs.
+ * It is possible that we received a trap in kernel mode, but with the user
+ * context loaded. There are three cases where this can happen:
*
- * When such traps happen, we have CPL=0 and %gs=userland, and we must perform
- * an additional swapgs to get %gs=kernel.
+ * o Execution of IRETQ.
+ * o Reload of ES.
+ * o Reload of DS.
+ *
+ * When this happens, the kernel is re-entered in kernel mode, but the
+ * previous context is in kernel mode too.
+ *
+ * We have two iret frames in the stack. In the first one, we also pushed
+ * 'trapno' and 'err'. The 'rsp' field points to the outer iret frame:
+ *
+ * +---------------------------------------------------+
+ * | trapno | err | rip | cs=ring0 | rflags | rsp | ss |
+ * +-------------------------------------------|-------+
+ * |
+ * +---------------------------------+
+ * |
+ * | +------------------------------------+
+ * +--> | rip | cs=ring3 | rflags | rsp | ss |
+ * +------------------------------------+
+ *
+ * We perform a three-step procedure:
+ *
+ * o We update RSP to point to the outer frame. This outer frame is in the
+ * same stack as the current frame, and likely just after the current
+ * frame.
+ *
+ * o We push, in this outer frame, the 'err' and 'trapno' fields of the
+ * CURRENT frame.
+ *
+ * o We do a normal INTRENTRY. Now that RSP points to the outer frame,
+ * everything behaves as if we had received a trap from the outer frame,
+ * that is to say, from userland directly.
+ *
+ * Finally, we jump to 'calltrap' and handle the trap smoothly.
+ *
+ * Two notes regarding SVS:
+ *
+ * o With SVS, we will receive the trap while the user page tables are
+ * loaded. That's not a problem, we don't touch anything unmapped here.
+ *
+ * o With SVS, when the user page tables are loaded, the stack is really
+ * small, and can contain only one trapframe structure. Therefore, in
+ * intrfastexit, we must save the GPRs and pop their part of the stack
+ * right away. If we weren't doing that, and the reload of ES faulted for
+ * example, then the CPU would try to push an iret frame on the current
+ * stack (nested), and would double-fault because it touches the redzone
+ * below the stack (see the documentation in x86/x86/svs.c). By popping
+ * the GPR part of the stack, we leave enough stack for the CPU to push
+ * an iret frame, and for us to push one 8-byte register (%rdi) too.
*/
-NENTRY(check_swapgs)
- INTRENTRY_L(3f,1:)
-2:
+ _ALIGN_TEXT
+LABEL(kernuser_reenter)
+ testb $SEL_UPL,TF_SMALL(TF_CS, %rsp)
+ jz .Lkernelmode
+
+.Lnormal_entry:
+ INTRENTRY
sti
jmp calltrap
-3:
- /*
- * Trap in kernel mode.
- */
+
+.Lkernelmode:
+ /* We will clobber %rdi */
+ pushq %rdi
+
/* Case 1: fault on iretq? */
- movq TF_RIP(%rsp),%rax
- cmpw $0xcf48,(%rax) /* Faulting instruction is iretq ? */
- jne 5f /* Jump if not */
- movq TF_RSP(%rsp),%rax /* Must read %rsp, may be a pad word */
- testb $SEL_UPL,8(%rax) /* Check %cs of outer iret frame */
- je 2b /* jump if iret was to kernel */
- jmp 1b /* to user - must restore %gs */
+ leaq do_iret(%rip),%rdi
+ cmpq %rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp)
+ jne 5f
+ movq TF_SMALL_REGPUSHED(TF_RSP, %rsp),%rdi /* get %rsp */
+ testb $SEL_UPL,8(%rdi) /* check %cs of outer iret frame */
+ je .Lnormal_entry /* jump if iret was to kernel */
+ jmp .Lkernelmode_but_user /* to user - must restore %gs */
5:
- /* Case 2: move to %gs? */
- movw (%rax),%ax
- andb $070,%ah /* mask mod/rm from mod/reg/rm */
- cmpw $0x8e+050*256,%ax /* Any move to %gs (reg 5) */
- jne 2b /* No - normal kernel fault */
- jmp 1b /* Yes - restore %gs */
-END(check_swapgs)
+ /* Case 2: move to %es? */
+ leaq do_mov_es(%rip),%rdi
+ cmpq %rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp)
+ je .Lkernelmode_but_user
+
+ /* Case 3: move to %ds? */
+ leaq do_mov_ds(%rip),%rdi
+ cmpq %rdi,TF_SMALL_REGPUSHED(TF_RIP, %rsp)
+ je .Lkernelmode_but_user
+
+ /* None of the above cases: normal kernel fault */
+ popq %rdi
+ jmp .Lnormal_entry
+
+.Lkernelmode_but_user:
+ /*
+ * Here we have %rdi pushed on the stack, hence 8+.
+ */
+ movq %rsp,%rdi
+ movq TF_SMALL_REGPUSHED(TF_RSP, %rsp),%rsp
+
+ /* Push tf_err and tf_trapno */
+ pushq 8+8(%rdi) /* 8+8(%rdi) = current TF_ERR */
+ pushq 8+0(%rdi) /* 8+0(%rdi) = current TF_TRAPNO */
+
+ /* Restore %rdi */
+ movq (%rdi),%rdi
+
+ jmp .Lnormal_entry
+END(kernuser_reenter)
#endif
+ TEXT_USER_END
+
/*
* All traps go through here. Call the generic trap handler, and
* check for ASTs afterwards.
*/
NENTRY(alltraps)
INTRENTRY
+.Lalltraps_noentry:
STI(si)
calltrap:
Index: src/sys/arch/amd64/amd64/db_machdep.c
diff -u src/sys/arch/amd64/amd64/db_machdep.c:1.4 src/sys/arch/amd64/amd64/db_machdep.c:1.4.30.1
--- src/sys/arch/amd64/amd64/db_machdep.c:1.4 Wed Oct 3 17:43:22 2012
+++ src/sys/arch/amd64/amd64/db_machdep.c Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: db_machdep.c,v 1.4 2012/10/03 17:43:22 riastradh Exp $ */
+/* $NetBSD: db_machdep.c,v 1.4.30.1 2018/03/22 16:59:03 martin Exp $ */
/*
* Mach Operating System
@@ -26,7 +26,7 @@
* rights to redistribute these changes.
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: db_machdep.c,v 1.4 2012/10/03 17:43:22 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: db_machdep.c,v 1.4.30.1 2018/03/22 16:59:03 martin Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -213,11 +213,13 @@ db_frame_info(long *frame, db_addr_t cal
if (!strcmp(name, "trap")) {
*is_trap = TRAP;
narg = 0;
- } else if (!strcmp(name, "syscall")) {
+ } else if (!strcmp(name, "syscall") ||
+ !strcmp(name, "handle_syscall")) {
*is_trap = SYSCALL;
narg = 0;
} else if (name[0] == 'X') {
if (!strncmp(name, "Xintr", 5) ||
+ !strncmp(name, "Xhandle", 7) ||
!strncmp(name, "Xresume", 7) ||
!strncmp(name, "Xstray", 6) ||
!strncmp(name, "Xhold", 5) ||
Index: src/sys/arch/amd64/amd64/genassym.cf
diff -u src/sys/arch/amd64/amd64/genassym.cf:1.60.10.1 src/sys/arch/amd64/amd64/genassym.cf:1.60.10.2
--- src/sys/arch/amd64/amd64/genassym.cf:1.60.10.1 Tue Mar 13 15:47:44 2018
+++ src/sys/arch/amd64/amd64/genassym.cf Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-# $NetBSD: genassym.cf,v 1.60.10.1 2018/03/13 15:47:44 martin Exp $
+# $NetBSD: genassym.cf,v 1.60.10.2 2018/03/22 16:59:03 martin Exp $
#
# Copyright (c) 1998, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -236,6 +236,13 @@ define CPU_INFO_CURLDT offsetof(struct
define CPU_INFO_IDLELWP offsetof(struct cpu_info, ci_data.cpu_idlelwp)
define CPU_INFO_PMAP offsetof(struct cpu_info, ci_pmap)
define CPU_INFO_TSS offsetof(struct cpu_info, ci_tss)
+ifdef SVS
+define CPU_INFO_UPDIRPA offsetof(struct cpu_info, ci_svs_updirpa)
+define CPU_INFO_KPDIRPA offsetof(struct cpu_info, ci_svs_kpdirpa)
+define CPU_INFO_RSP0 offsetof(struct cpu_info, ci_svs_rsp0)
+define CPU_INFO_URSP0 offsetof(struct cpu_info, ci_svs_ursp0)
+define CPU_INFO_KRSP0 offsetof(struct cpu_info, ci_svs_krsp0)
+endif
define CPU_INFO_NSYSCALL offsetof(struct cpu_info, ci_data.cpu_nsyscall)
define CPU_INFO_NTRAP offsetof(struct cpu_info, ci_data.cpu_ntrap)
define CPU_INFO_NINTR offsetof(struct cpu_info, ci_data.cpu_nintr)
Index: src/sys/arch/amd64/amd64/locore.S
diff -u src/sys/arch/amd64/amd64/locore.S:1.123.6.4 src/sys/arch/amd64/amd64/locore.S:1.123.6.5
--- src/sys/arch/amd64/amd64/locore.S:1.123.6.4 Tue Mar 13 15:47:44 2018
+++ src/sys/arch/amd64/amd64/locore.S Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: locore.S,v 1.123.6.4 2018/03/13 15:47:44 martin Exp $ */
+/* $NetBSD: locore.S,v 1.123.6.5 2018/03/22 16:59:03 martin Exp $ */
/*
* Copyright-o-rama!
@@ -160,6 +160,7 @@
#include "opt_compat_netbsd32.h"
#include "opt_compat_ibcs2.h"
#include "opt_xen.h"
+#include "opt_svs.h"
#include "assym.h"
#include "lapic.h"
@@ -329,6 +330,9 @@
.globl _C_LABEL(bootinfo)
.globl _C_LABEL(biosbasemem)
.globl _C_LABEL(biosextmem)
+ .globl do_mov_es
+ .globl do_mov_ds
+ .globl do_iret
.type _C_LABEL(tablesize), @object
_C_LABEL(tablesize): .long TABLESIZE
@@ -1080,6 +1084,16 @@ ENTRY(cpu_switchto)
movq %rbp,PCB_RBP(%rax)
skip_save:
+#ifdef SVS
+ pushq %rdx
+ movb _C_LABEL(svs_enabled),%dl
+ testb %dl,%dl
+ jz .Lskip_svs
+ callq _C_LABEL(svs_lwp_switch)
+.Lskip_svs:
+ popq %rdx
+#endif
+
/* Switch to newlwp's stack. */
movq L_PCB(%r12),%r14
movq PCB_RSP(%r14),%rsp
@@ -1097,6 +1111,19 @@ skip_save:
jnz switch_return
/* Switch ring0 stack */
+#ifdef SVS
+ movb _C_LABEL(svs_enabled),%al
+ testb %al,%al
+ jz .Lno_svs_switch
+
+ movq CPUVAR(RSP0),%rax
+ movq CPUVAR(TSS),%rdi
+ movq %rax,TSS_RSP0(%rdi)
+ jmp .Lring0_switched
+
+.Lno_svs_switch:
+#endif
+
#ifndef XEN
movq PCB_RSP0(%r14),%rax
movq CPUVAR(TSS),%rdi
@@ -1105,6 +1132,7 @@ skip_save:
movq %r14,%rdi
callq _C_LABEL(x86_64_switch_context);
#endif
+.Lring0_switched:
/* Don't bother with the rest if switching to a system process. */
testl $LW_SYSTEM,L_FLAG(%r12)
@@ -1223,74 +1251,12 @@ ENTRY(savectx)
ret
END(savectx)
-IDTVEC(syscall32)
- sysret /* go away please */
-IDTVEC_END(syscall32)
-
/*
- * syscall()
- *
- * syscall insn entry.
+ * Syscall handler.
*/
-IDTVEC(syscall)
-#ifndef XEN
- /*
- * The user %rip is in %rcx and the user %rflags in %r11. The kernel %cs
- * and %ss are loaded, but nothing else is.
- *
- * The 'swapgs' instruction gives us access to cpu-specific memory where
- * we can save a user register and then read the LWP's kernel stack
- * pointer.
- *
- * This code doesn't seem to set %ds, this may not matter since it is
- * ignored in 64bit mode, OTOH the syscall instruction sets %ss and that
- * is ignored as well.
- */
- swapgs
- movq %r15,CPUVAR(SCRATCH)
- movq CPUVAR(CURLWP),%r15
- movq L_PCB(%r15),%r15
- movq PCB_RSP0(%r15),%r15 /* LWP's kernel stack pointer */
-
- /* Make stack look like an 'int nn' frame */
-#define SP(x) (x)-(TF_SS+8)(%r15)
- movq $(LSEL(LUDATA_SEL, SEL_UPL)),SP(TF_SS) /* user %ss */
- movq %rsp,SP(TF_RSP) /* user %rsp */
- movq %r11,SP(TF_RFLAGS) /* user %rflags */
- movq $(LSEL(LUCODE_SEL, SEL_UPL)),SP(TF_CS) /* user %cs */
- movq %rcx,SP(TF_RIP) /* user %rip */
-
- leaq SP(0),%rsp /* %rsp now valid after frame */
- movq CPUVAR(SCRATCH),%r15
-#undef SP
-
- movq $2,TF_ERR(%rsp) /* syscall instruction size */
- movq $T_ASTFLT,TF_TRAPNO(%rsp)
-
- movw %es,TF_ES(%rsp)
- sti
- INTR_SAVE_GPRS
- movw %fs,TF_FS(%rsp)
- movw %gs,TF_GS(%rsp)
- movw $(LSEL(LUDATA_SEL, SEL_UPL)),TF_DS(%rsp)
-#else
- /* Xen already switched to kernel stack */
- pushq %rsi
+NENTRY(handle_syscall)
STI(si)
- popq %rsi
- addq $0x10,%rsp /* gap to match cs:rip */
- pushq $2 /* error code */
- pushq $T_ASTFLT
- subq $TF_REGSIZE,%rsp
- INTR_SAVE_GPRS
- cld
- movw %fs,TF_FS(%rsp)
- movw %gs,TF_GS(%rsp)
- movw %es,TF_ES(%rsp)
- movw $(LSEL(LUDATA_SEL, SEL_UPL)),TF_DS(%rsp)
-#endif
-do_syscall:
movq CPUVAR(CURLWP),%r14
incq CPUVAR(NSYSCALL) /* count it atomically */
movq %rsp,L_MD_REGS(%r14) /* save pointer to frame */
@@ -1315,32 +1281,18 @@ do_syscall:
jne spl_error
#endif
+ /*
+ * Decide if we need to take a slow path. That's the case when we
+ * want to reload %cs and %ss on a 64bit LWP (MDL_IRET set), or when
+ * we're returning to a 32bit LWP (MDL_COMPAT32 set).
+ *
+ * In either case, we jump into intrfastexit and return to userland
+ * with the iret instruction.
+ */
testl $(MDL_IRET|MDL_COMPAT32),L_MD_FLAGS(%r14)
- INTR_RESTORE_GPRS
- movw TF_ES(%rsp),%es
- movw TF_DS(%rsp),%ds
- SWAPGS
- jnz 2f
-#ifndef XEN
- movq TF_RIP(%rsp),%rcx /* %rip for sysret */
- movq TF_RFLAGS(%rsp),%r11 /* %flags for sysret */
- movq TF_RSP(%rsp),%rsp
- sysretq
-#else
- addq $TF_RIP,%rsp
- pushq $256 /* VGCF_IN_SYSCALL */
- jmp HYPERVISOR_iret
-#endif
+ jnz intrfastexit
-/*
- * If the syscall might have modified some registers, or we are a 32bit
- * process we must return to user with an 'iret' instruction.
- * If the iret faults in kernel (assumed due to illegal register values)
- * then a SIGSEGV will be signalled.
- */
-2:
- addq $TF_RIP,%rsp
- iretq
+ jmp syscall_sysret
#ifdef DIAGNOSTIC
/* Report SPL error */
@@ -1372,7 +1324,7 @@ spl_error:
movq %rsp,%rdi
call _C_LABEL(trap)
jmp .Lsyscall_checkast /* re-check ASTs */
-IDTVEC_END(syscall)
+END(handle_syscall)
/*
* void lwp_trampoline(void);
@@ -1392,10 +1344,96 @@ NENTRY(lwp_trampoline)
END(lwp_trampoline)
/*
+ * Entry points of the 'syscall' instruction, 64bit and 32bit mode.
+ */
+
+#define SP(x) (x)-(TF_SS+8)(%rax)
+
+.macro SYSCALL_ENTRY name,is_svs
+IDTVEC(\name)
+#ifndef XEN
+ /*
+ * The user %rip is in %rcx and the user %rflags in %r11. The kernel %cs
+ * and %ss are loaded, but nothing else is.
+ *
+ * The 'swapgs' instruction gives us access to cpu-specific memory where
+ * we can save a user register and then read the LWP's kernel stack
+ * pointer.
+ *
+ * This code doesn't seem to set %ds, this may not matter since it is
+ * ignored in 64bit mode, OTOH the syscall instruction sets %ss and that
+ * is ignored as well.
+ */
+ swapgs
+
+ /* Get the LWP's kernel stack pointer in %rax */
+ .if \is_svs
+ movq %rax,SVS_UTLS+UTLS_SCRATCH
+ movq SVS_UTLS+UTLS_RSP0,%rax
+ .else
+ movq %rax,CPUVAR(SCRATCH)
+ movq CPUVAR(CURLWP),%rax
+ movq L_PCB(%rax),%rax
+ movq PCB_RSP0(%rax),%rax
+ .endif
+
+ /* Make stack look like an 'int nn' frame */
+ movq $(LSEL(LUDATA_SEL, SEL_UPL)),SP(TF_SS) /* user %ss */
+ movq %rsp,SP(TF_RSP) /* user %rsp */
+ movq %r11,SP(TF_RFLAGS) /* user %rflags */
+ movq $(LSEL(LUCODE_SEL, SEL_UPL)),SP(TF_CS) /* user %cs */
+ movq %rcx,SP(TF_RIP) /* user %rip */
+ leaq SP(0),%rsp /* %rsp now valid after frame */
+
+ /* Restore %rax */
+ .if \is_svs
+ movq SVS_UTLS+UTLS_SCRATCH,%rax
+ .else
+ movq CPUVAR(SCRATCH),%rax
+ .endif
+
+ movq $2,TF_ERR(%rsp) /* syscall instruction size */
+ movq $T_ASTFLT,TF_TRAPNO(%rsp)
+#else
+ /* Xen already switched to kernel stack */
+ addq $0x10,%rsp /* gap to match cs:rip */
+ pushq $2 /* error code */
+ pushq $T_ASTFLT
+ subq $TF_REGSIZE,%rsp
+ cld
+#endif
+ INTR_SAVE_GPRS
+ movw $(LSEL(LUDATA_SEL, SEL_UPL)),TF_DS(%rsp)
+ movw %es,TF_ES(%rsp)
+ movw %fs,TF_FS(%rsp)
+ movw %gs,TF_GS(%rsp)
+ .if \is_svs
+ SVS_ENTER
+ .endif
+ jmp handle_syscall
+IDTVEC_END(\name)
+.endm
+
+SYSCALL_ENTRY syscall,is_svs=0
+
+ TEXT_USER_BEGIN
+
+#ifdef SVS
+SYSCALL_ENTRY syscall_svs,is_svs=1
+#endif
+
+IDTVEC(syscall32)
+ sysret /* go away please */
+IDTVEC_END(syscall32)
+
+ TEXT_USER_END
+
+/*
* osyscall()
*
* Trap gate entry for int $80 syscall, also used by sigreturn.
*/
+ TEXT_USER_BEGIN
IDTVEC(osyscall)
#ifdef XEN
movq (%rsp),%rcx
@@ -1405,9 +1443,37 @@ IDTVEC(osyscall)
pushq $2 /* size of instruction for restart */
pushq $T_ASTFLT /* trap # for doing ASTs */
INTRENTRY
- STI(si)
- jmp do_syscall
+ jmp handle_syscall
IDTVEC_END(osyscall)
+ TEXT_USER_END
+
+/*
+ * Return to userland via 'sysret'.
+ */
+ TEXT_USER_BEGIN
+ _ALIGN_TEXT
+LABEL(syscall_sysret)
+ SVS_LEAVE
+
+ /* Set the default 64bit values in %ds and %es. */
+ movq $GSEL(GUDATA_SEL, SEL_UPL),%rax
+ movw %ax,%ds
+ movw %ax,%es
+
+ INTR_RESTORE_GPRS
+ SWAPGS
+#ifndef XEN
+ movq TF_RIP(%rsp),%rcx /* %rip for sysret */
+ movq TF_RFLAGS(%rsp),%r11 /* %flags for sysret */
+ movq TF_RSP(%rsp),%rsp
+ sysretq
+#else
+ addq $TF_RIP,%rsp
+ pushq $256 /* VGCF_IN_SYSCALL */
+ jmp HYPERVISOR_iret
+#endif
+END(syscall_sysret)
+ TEXT_USER_END
/*
* bool sse2_idlezero_page(void *pg)
@@ -1451,7 +1517,6 @@ END(sse2_idlezero_page)
*
* Zero a page without polluting the cache.
*/
-
ENTRY(pagezero)
movq $-PAGE_SIZE,%rdx
subq %rdx,%rdi
@@ -1471,15 +1536,92 @@ ENTRY(pagezero)
ret
END(pagezero)
+ TEXT_USER_BEGIN
+
+/*
+ * In intrfastexit, we advance %rsp at the beginning. We then access the
+ * segment registers in the trapframe with TF_BACKW (backwards). See the
+ * documentation in amd64_trap.S for an explanation.
+ */
+
+#define TF_BACKW(val, reg) (val - (TF_REGSIZE+16))(reg)
+
_ALIGN_TEXT
LABEL(intrfastexit)
- INTR_RESTORE_GPRS
- testq $SEL_UPL,TF_CS(%rsp)
- je 99f
NOT_XEN(cli;)
- movw TF_ES(%rsp),%es
- movw TF_DS(%rsp),%ds
+ SVS_LEAVE
+ INTR_RESTORE_GPRS
+ addq $(TF_REGSIZE+16),%rsp /* iret frame */
+
+ testb $SEL_UPL,TF_BACKW(TF_CS, %rsp)
+ jz .Lkexit
SWAPGS
-99: addq $TF_REGSIZE+16,%rsp
+do_mov_es:
+ movw TF_BACKW(TF_ES, %rsp),%es
+do_mov_ds:
+ movw TF_BACKW(TF_DS, %rsp),%ds
+
+.Lkexit:
+do_iret:
iretq
END(intrfastexit)
+
+ TEXT_USER_END
+
+#ifdef SVS
+ .globl svs_enter, svs_enter_end
+ .globl svs_enter_altstack, svs_enter_altstack_end
+ .globl svs_leave, svs_leave_end
+ .globl svs_leave_altstack, svs_leave_altstack_end
+ .globl nosvs_enter, nosvs_enter_end
+ .globl nosvs_enter_altstack, nosvs_enter_altstack_end
+ .globl nosvs_leave, nosvs_leave_end
+ .globl nosvs_leave_altstack, nosvs_leave_altstack_end
+
+LABEL(svs_enter)
+ movq SVS_UTLS+UTLS_KPDIRPA,%rax
+ movq %rax,%cr3
+ movq CPUVAR(KRSP0),%rsp
+LABEL(svs_enter_end)
+
+LABEL(svs_enter_altstack)
+ testb $SEL_UPL,TF_CS(%rsp)
+ jz 1234f
+ movq SVS_UTLS+UTLS_KPDIRPA,%rax
+ movq %rax,%cr3
+1234:
+LABEL(svs_enter_altstack_end)
+
+LABEL(svs_leave)
+ testb $SEL_UPL,TF_CS(%rsp)
+ jz 1234f
+ movq CPUVAR(URSP0),%rsp
+ movq CPUVAR(UPDIRPA),%rax
+ movq %rax,%cr3
+1234:
+LABEL(svs_leave_end)
+
+LABEL(svs_leave_altstack)
+ testb $SEL_UPL,TF_CS(%rsp)
+ jz 1234f
+ movq CPUVAR(UPDIRPA),%rax
+ movq %rax,%cr3
+1234:
+LABEL(svs_leave_altstack_end)
+
+LABEL(nosvs_enter)
+ NOSVS_ENTER
+LABEL(nosvs_enter_end)
+
+LABEL(nosvs_enter_altstack)
+ NOSVS_ENTER_ALTSTACK
+LABEL(nosvs_enter_altstack_end)
+
+LABEL(nosvs_leave)
+ NOSVS_LEAVE
+LABEL(nosvs_leave_end)
+
+LABEL(nosvs_leave_altstack)
+ NOSVS_LEAVE_ALTSTACK
+LABEL(nosvs_leave_altstack_end)
+#endif
Index: src/sys/arch/amd64/amd64/machdep.c
diff -u src/sys/arch/amd64/amd64/machdep.c:1.255.6.5 src/sys/arch/amd64/amd64/machdep.c:1.255.6.6
--- src/sys/arch/amd64/amd64/machdep.c:1.255.6.5 Fri Mar 16 13:17:56 2018
+++ src/sys/arch/amd64/amd64/machdep.c Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: machdep.c,v 1.255.6.5 2018/03/16 13:17:56 martin Exp $ */
+/* $NetBSD: machdep.c,v 1.255.6.6 2018/03/22 16:59:03 martin Exp $ */
/*-
* Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011
@@ -111,7 +111,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.255.6.5 2018/03/16 13:17:56 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.255.6.6 2018/03/22 16:59:03 martin Exp $");
/* #define XENDEBUG_LOW */
@@ -123,6 +123,7 @@ __KERNEL_RCSID(0, "$NetBSD: machdep.c,v
#include "opt_mtrr.h"
#include "opt_realmem.h"
#include "opt_xen.h"
+#include "opt_svs.h"
#ifndef XEN
#include "opt_physmem.h"
#endif
@@ -1544,6 +1545,9 @@ init_x86_64(paddr_t first_avail)
#endif /* XEN */
cpu_probe(&cpu_info_primary);
+#ifdef SVS
+ svs_init();
+#endif
cpu_init_msrs(&cpu_info_primary, true);
use_pae = 1; /* PAE always enabled in long mode */
Index: src/sys/arch/amd64/amd64/trap.c
diff -u src/sys/arch/amd64/amd64/trap.c:1.96.4.1 src/sys/arch/amd64/amd64/trap.c:1.96.4.2
--- src/sys/arch/amd64/amd64/trap.c:1.96.4.1 Wed Mar 7 14:50:56 2018
+++ src/sys/arch/amd64/amd64/trap.c Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: trap.c,v 1.96.4.1 2018/03/07 14:50:56 martin Exp $ */
+/* $NetBSD: trap.c,v 1.96.4.2 2018/03/22 16:59:03 martin Exp $ */
/*-
* Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
@@ -68,7 +68,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.96.4.1 2018/03/07 14:50:56 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.96.4.2 2018/03/22 16:59:03 martin Exp $");
#include "opt_ddb.h"
#include "opt_kgdb.h"
@@ -266,9 +266,6 @@ trap(struct trapframe *frame)
extern char fusuintrfailure[], kcopy_fault[];
extern char IDTVEC(osyscall)[];
extern char IDTVEC(syscall32)[];
-#ifndef XEN
- struct trapframe *vframe;
-#endif
ksiginfo_t ksi;
void *onfault;
int type, error;
@@ -357,92 +354,7 @@ copyfault:
return;
}
- /*
- * Check for failure during return to user mode.
- * This can happen loading invalid values into the segment
- * registers, or during the 'iret' itself.
- *
- * We do this by looking at the instruction we faulted on.
- * The specific instructions we recognize only happen when
- * returning from a trap, syscall, or interrupt.
- */
-
-kernelfault:
-#ifdef XEN
- /*
- * XXX: there has to be an equivalent 'problem'
- * but I (dsl) don't know exactly what happens!
- * For now panic the kernel.
- */
goto we_re_toast;
-#else
- KSI_INIT_TRAP(&ksi);
- ksi.ksi_signo = SIGSEGV;
- ksi.ksi_code = SEGV_ACCERR;
- ksi.ksi_trap = type;
-
- /* Get %rsp value before fault - there may be a pad word
- * below the trap frame. */
- vframe = (void *)frame->tf_rsp;
- if (frame->tf_rip == 0) {
- /*
- * Assume that if we jumped to null we
- * probably did it via a null function
- * pointer, so print the return address.
- */
- printf("kernel jumped to null; return addr was %p\n",
- *(void **)frame->tf_rsp);
- goto we_re_toast;
- }
- switch (*(uint16_t *)frame->tf_rip) {
- case 0xcf48: /* iretq */
- /*
- * The 'iretq' instruction faulted, so we have the
- * 'user' registers saved after the kernel
- * %rip:%cs:%fl:%rsp:%ss of the iret, and below that
- * the user %rip:%cs:%fl:%rsp:%ss the 'iret' was
- * processing.
- * We must copy the user register back over the
- * kernel fault frame to generate a normal stack
- * frame (eg for sending a SIGSEGV).
- */
- vframe = (void *)((char *)vframe
- - offsetof(struct trapframe, tf_rip));
- memmove(vframe, frame,
- offsetof(struct trapframe, tf_rip));
- /* Set the faulting address to the user %eip */
- ksi.ksi_addr = (void *)vframe->tf_rip;
- break;
- case 0x848e: /* mov 0xa8(%rsp),%es (8e 84 24 a8 00 00 00) */
- case 0x9c8e: /* mov 0xb0(%rsp),%ds (8e 9c 24 b0 00 00 00) */
-#ifdef USER_LDT
- case 0xa48e: /* mov 0xa0(%rsp),%fs (8e a4 24 a0 00 00 00) */
- case 0xac8e: /* mov 0x98(%rsp),%gs (8e ac 24 98 00 00 00) */
-#endif
- /*
- * We faulted loading one of the user segment registers.
- * The stack frame containing the user registers is
- * still valid and pointed to by tf_rsp.
- * Maybe we should check the iretq follows.
- */
- if (KERNELMODE(vframe->tf_cs, vframe->tf_eflags))
- goto we_re_toast;
- /* There is no valid address for the fault */
- break;
-
- default:
- goto we_re_toast;
- }
-
- /* XXX: worry about on-stack trampolines for nested
- * handlers?? */
- /* Save outer frame for any signal return */
- l->l_md.md_regs = vframe;
- (*p->p_emul->e_trapsignal)(l, &ksi);
- /* Return to user by reloading the user frame */
- trap_return_fault_return(vframe);
- /* NOTREACHED */
-#endif
case T_PROTFLT|T_USER: /* protection fault */
#if defined(COMPAT_NETBSD32) && defined(COMPAT_10)
@@ -700,7 +612,7 @@ faultcommon:
goto copyfault;
printf("uvm_fault(%p, 0x%lx, %d) -> %x\n",
map, va, ftype, error);
- goto kernelfault;
+ goto we_re_toast;
}
KSI_INIT_TRAP(&ksi);
Index: src/sys/arch/amd64/amd64/vector.S
diff -u src/sys/arch/amd64/amd64/vector.S:1.49.2.1 src/sys/arch/amd64/amd64/vector.S:1.49.2.2
--- src/sys/arch/amd64/amd64/vector.S:1.49.2.1 Wed Mar 7 14:50:56 2018
+++ src/sys/arch/amd64/amd64/vector.S Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: vector.S,v 1.49.2.1 2018/03/07 14:50:56 martin Exp $ */
+/* $NetBSD: vector.S,v 1.49.2.2 2018/03/22 16:59:03 martin Exp $ */
/*-
* Copyright (c) 1998, 2007, 2008 The NetBSD Foundation, Inc.
@@ -114,10 +114,7 @@ IDTVEC(recurse_lapic_ipi)
INTRENTRY
jmp 1f
IDTVEC_END(recurse_lapic_ipi)
-IDTVEC(intr_x2apic_ipi)
- pushq $0
- pushq $T_ASTFLT
- INTRENTRY
+IDTVEC(handle_x2apic_ipi)
movl $(MSR_X2APIC_BASE + MSR_X2APIC_EOI),%ecx
xorl %eax,%eax
xorl %edx,%edx
@@ -126,17 +123,14 @@ IDTVEC(intr_x2apic_ipi)
cmpl $IPL_HIGH,%ebx
jae 2f
jmp 1f
-IDTVEC_END(intr_x2apic_ipi)
-IDTVEC(intr_lapic_ipi)
- pushq $0
- pushq $T_ASTFLT
- INTRENTRY
+IDTVEC_END(handle_x2apic_ipi)
+IDTVEC(handle_lapic_ipi)
movq _C_LABEL(local_apic_va),%rbx
movl $0,LAPIC_EOI(%rbx)
movl CPUVAR(ILEVEL),%ebx
cmpl $IPL_HIGH,%ebx
jae 2f
-IDTVEC_END(intr_lapic_ipi)
+IDTVEC_END(handle_lapic_ipi)
IDTVEC(resume_lapic_ipi)
1:
incl CPUVAR(IDEPTH)
@@ -150,12 +144,23 @@ IDTVEC(resume_lapic_ipi)
INTRFASTEXIT
IDTVEC_END(resume_lapic_ipi)
-#if defined(DDB)
-IDTVEC(intrddb)
-1:
+ TEXT_USER_BEGIN
+IDTVEC(intr_x2apic_ipi)
pushq $0
- pushq $T_BPTFLT
+ pushq $T_ASTFLT
+ INTRENTRY
+ jmp _C_LABEL(Xhandle_x2apic_ipi)
+IDTVEC_END(intr_x2apic_ipi)
+IDTVEC(intr_lapic_ipi)
+ pushq $0
+ pushq $T_ASTFLT
INTRENTRY
+ jmp _C_LABEL(Xhandle_lapic_ipi)
+IDTVEC_END(intr_lapic_ipi)
+ TEXT_USER_END
+
+#if defined(DDB)
+IDTVEC(handle_intrddbipi)
movl $0xf,%eax
movq %rax,%cr8
movq _C_LABEL(local_apic_va),%rbx
@@ -165,13 +170,8 @@ IDTVEC(intrddb)
xorl %eax,%eax
movq %rax,%cr8
INTRFASTEXIT
-IDTVEC_END(intrddb)
-
-IDTVEC(x2apic_intrddb)
-1:
- pushq $0
- pushq $T_BPTFLT
- INTRENTRY
+IDTVEC_END(handle_intrddbipi)
+IDTVEC(handle_x2apic_intrddbipi)
movl $0xf,%eax
movq %rax,%cr8
movl $(MSR_X2APIC_BASE + MSR_X2APIC_EOI),%ecx
@@ -183,7 +183,23 @@ IDTVEC(x2apic_intrddb)
xorl %eax,%eax
movq %rax,%cr8
INTRFASTEXIT
+IDTVEC_END(handle_x2apic_intrddbipi)
+
+ TEXT_USER_BEGIN
+IDTVEC(intrddb)
+ pushq $0
+ pushq $T_BPTFLT
+ INTRENTRY
+ jmp _C_LABEL(Xhandle_intrddbipi)
+IDTVEC_END(intrddb)
+IDTVEC(x2apic_intrddb)
+ pushq $0
+ pushq $T_BPTFLT
+ INTRENTRY
+ jmp _C_LABEL(Xhandle_x2apic_intrddbipi)
IDTVEC_END(x2apic_intrddb)
+ TEXT_USER_END
+
#endif /* DDB */
#endif /* MULTIPROCESSOR */
@@ -197,10 +213,7 @@ IDTVEC(recurse_lapic_ltimer)
INTRENTRY
jmp 1f
IDTVEC_END(recurse_lapic_ltimer)
-IDTVEC(intr_x2apic_ltimer)
- pushq $0
- pushq $T_ASTFLT
- INTRENTRY
+IDTVEC(handle_x2apic_ltimer)
movl $(MSR_X2APIC_BASE + MSR_X2APIC_EOI),%ecx
xorl %eax,%eax
xorl %edx,%edx
@@ -209,11 +222,8 @@ IDTVEC(intr_x2apic_ltimer)
cmpl $IPL_CLOCK,%ebx
jae 2f
jmp 1f
-IDTVEC_END(intr_x2apic_ltimer)
-IDTVEC(intr_lapic_ltimer)
- pushq $0
- pushq $T_ASTFLT
- INTRENTRY
+IDTVEC_END(handle_x2apic_ltimer)
+IDTVEC(handle_lapic_ltimer)
movq _C_LABEL(local_apic_va),%rbx
movl $0,LAPIC_EOI(%rbx)
movl CPUVAR(ILEVEL),%ebx
@@ -234,33 +244,57 @@ IDTVEC(resume_lapic_ltimer)
orl $(1 << LIR_TIMER),CPUVAR(IPENDING)
INTRFASTEXIT
IDTVEC_END(resume_lapic_ltimer)
+
+ TEXT_USER_BEGIN
+IDTVEC(intr_x2apic_ltimer)
+ pushq $0
+ pushq $T_ASTFLT
+ INTRENTRY
+ jmp _C_LABEL(Xhandle_x2apic_ltimer)
+IDTVEC_END(intr_x2apic_ltimer)
+IDTVEC(intr_lapic_ltimer)
+ pushq $0
+ pushq $T_ASTFLT
+ INTRENTRY
+ jmp _C_LABEL(Xhandle_lapic_ltimer)
+IDTVEC_END(intr_lapic_ltimer)
+ TEXT_USER_END
+
#endif /* NLAPIC > 0 */
#ifndef XEN
/*
* TLB shootdown handler.
*/
-IDTVEC(intr_lapic_tlb)
- pushq $0
- pushq $T_ASTFLT
- INTRENTRY
+IDTVEC(handle_lapic_tlb)
movq _C_LABEL(local_apic_va),%rax
movl $0,LAPIC_EOI(%rax)
callq _C_LABEL(pmap_tlb_intr)
INTRFASTEXIT
-IDTVEC_END(intr_lapic_tlb)
-
-IDTVEC(intr_x2apic_tlb)
- pushq $0
- pushq $T_ASTFLT
- INTRENTRY
+IDTVEC_END(handle_lapic_tlb)
+IDTVEC(handle_x2apic_tlb)
movl $(MSR_X2APIC_BASE + MSR_X2APIC_EOI),%ecx
xorl %eax,%eax
xorl %edx,%edx
wrmsr
callq _C_LABEL(pmap_tlb_intr)
INTRFASTEXIT
+IDTVEC_END(handle_x2apic_tlb)
+
+ TEXT_USER_BEGIN
+IDTVEC(intr_lapic_tlb)
+ pushq $0
+ pushq $T_ASTFLT
+ INTRENTRY
+ jmp _C_LABEL(Xhandle_lapic_tlb)
+IDTVEC_END(intr_lapic_tlb)
+IDTVEC(intr_x2apic_tlb)
+ pushq $0
+ pushq $T_ASTFLT
+ INTRENTRY
+ jmp _C_LABEL(Xhandle_x2apic_tlb)
IDTVEC_END(intr_x2apic_tlb)
+ TEXT_USER_END
#endif /* !XEN */
@@ -269,7 +303,7 @@ IDTVEC_END(intr_x2apic_tlb)
#ifndef XEN
/*
- * This macro defines the generic stub code. Its arguments modifiy it
+ * This macro defines the generic stub code. Its arguments modify it
* for specific PICs.
*/
@@ -285,10 +319,7 @@ IDTVEC(resume_ ## name ## num) \
movq CPUVAR(ISOURCES) + (num) * 8,%r14 ;\
movl IS_MAXLEVEL(%r14),%ebx ;\
jmp 1f ;\
-IDTVEC(intr_ ## name ## num) ;\
- pushq $0 /* dummy error code */ ;\
- pushq $T_ASTFLT /* trap # for doing ASTs */ ;\
- INTRENTRY ;\
+IDTVEC(handle_ ## name ## num) ;\
movq CPUVAR(ISOURCES) + (num) * 8,%r14 ;\
mask(num) /* mask it in hardware */ ;\
early_ack(num) /* and allow other intrs */ ;\
@@ -339,7 +370,16 @@ IDTVEC(intr_ ## name ## num) ;\
9: \
unmask(num) ;\
late_ack(num) ;\
- INTRFASTEXIT
+ INTRFASTEXIT ;\
+IDTVEC_END(handle_ ## name ## num) ;\
+ TEXT_USER_BEGIN ;\
+IDTVEC(intr_ ## name ## num) ;\
+ pushq $0 /* dummy error code */ ;\
+ pushq $T_ASTFLT /* trap # for doing ASTs */ ;\
+ INTRENTRY ;\
+ jmp _C_LABEL(Xhandle_ ## name ## num) ;\
+IDTVEC_END(intr_ ## name ## num) ;\
+ TEXT_USER_END
#define ICUADDR IO_ICU1
Index: src/sys/arch/amd64/conf/GENERIC
diff -u src/sys/arch/amd64/conf/GENERIC:1.459.2.5 src/sys/arch/amd64/conf/GENERIC:1.459.2.6
--- src/sys/arch/amd64/conf/GENERIC:1.459.2.5 Sun Feb 11 21:17:34 2018
+++ src/sys/arch/amd64/conf/GENERIC Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-# $NetBSD: GENERIC,v 1.459.2.5 2018/02/11 21:17:34 snj Exp $
+# $NetBSD: GENERIC,v 1.459.2.6 2018/03/22 16:59:03 martin Exp $
#
# GENERIC machine description file
#
@@ -22,7 +22,7 @@ include "arch/amd64/conf/std.amd64"
options INCLUDE_CONFIG_FILE # embed config file in kernel binary
-#ident "GENERIC-$Revision: 1.459.2.5 $"
+#ident "GENERIC-$Revision: 1.459.2.6 $"
maxusers 64 # estimated number of users
@@ -73,6 +73,9 @@ options USERCONF # userconf(4) support
#options PIPE_SOCKETPAIR # smaller, but slower pipe(2)
options SYSCTL_INCLUDE_DESCR # Include sysctl descriptions in kernel
+# CPU-related options
+#options SVS # Separate Virtual Space
+
# CPU features
acpicpu* at cpu? # ACPI CPU (including frequency scaling)
coretemp* at cpu? # Intel on-die thermal sensor
Index: src/sys/arch/amd64/conf/kern.ldscript
diff -u src/sys/arch/amd64/conf/kern.ldscript:1.22.6.2 src/sys/arch/amd64/conf/kern.ldscript:1.22.6.3
--- src/sys/arch/amd64/conf/kern.ldscript:1.22.6.2 Tue Mar 6 10:17:11 2018
+++ src/sys/arch/amd64/conf/kern.ldscript Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: kern.ldscript,v 1.22.6.2 2018/03/06 10:17:11 martin Exp $ */
+/* $NetBSD: kern.ldscript,v 1.22.6.3 2018/03/22 16:59:03 martin Exp $ */
#include "assym.h"
@@ -15,6 +15,12 @@ SECTIONS
{
.text :
{
+ . = ALIGN(__PAGE_SIZE);
+ __text_user_start = . ;
+ *(.text.user)
+ . = ALIGN(__PAGE_SIZE);
+ __text_user_end = . ;
+
*(.text)
*(.text.*)
*(.stub)
Index: src/sys/arch/amd64/include/frameasm.h
diff -u src/sys/arch/amd64/include/frameasm.h:1.20.32.1 src/sys/arch/amd64/include/frameasm.h:1.20.32.2
--- src/sys/arch/amd64/include/frameasm.h:1.20.32.1 Wed Mar 7 14:50:57 2018
+++ src/sys/arch/amd64/include/frameasm.h Thu Mar 22 16:59:03 2018
@@ -1,10 +1,11 @@
-/* $NetBSD: frameasm.h,v 1.20.32.1 2018/03/07 14:50:57 martin Exp $ */
+/* $NetBSD: frameasm.h,v 1.20.32.2 2018/03/22 16:59:03 martin Exp $ */
#ifndef _AMD64_MACHINE_FRAMEASM_H
#define _AMD64_MACHINE_FRAMEASM_H
#ifdef _KERNEL_OPT
#include "opt_xen.h"
+#include "opt_svs.h"
#endif
/*
@@ -35,6 +36,19 @@
#define STI(temp_reg) sti
#endif /* XEN */
+#define HP_NAME_SVS_ENTER 5
+#define HP_NAME_SVS_LEAVE 6
+#define HP_NAME_SVS_ENTER_ALT 7
+#define HP_NAME_SVS_LEAVE_ALT 8
+
+#define HOTPATCH(name, size) \
+123: ; \
+ .pushsection .rodata.hotpatch, "a" ; \
+ .byte name ; \
+ .byte size ; \
+ .quad 123b ; \
+ .popsection
+
#define SWAPGS NOT_XEN(swapgs)
/*
@@ -74,21 +88,68 @@
movq TF_RBX(%rsp),%rbx ; \
movq TF_RAX(%rsp),%rax
-#define INTRENTRY_L(kernel_trap, usertrap) \
+#define TEXT_USER_BEGIN .pushsection .text.user, "ax"
+#define TEXT_USER_END .popsection
+
+#ifdef SVS
+
+/* XXX: put this somewhere else */
+#define SVS_UTLS 0xffffc00000000000 /* PMAP_PCPU_BASE */
+#define UTLS_KPDIRPA 0
+#define UTLS_SCRATCH 8
+#define UTLS_RSP0 16
+
+#define SVS_ENTER_BYTES 22
+#define NOSVS_ENTER \
+ .byte 0xEB, (SVS_ENTER_BYTES-2) /* jmp */ ; \
+ .fill (SVS_ENTER_BYTES-2),1,0xCC
+#define SVS_ENTER \
+ HOTPATCH(HP_NAME_SVS_ENTER, SVS_ENTER_BYTES) ; \
+ NOSVS_ENTER
+
+#define SVS_LEAVE_BYTES 31
+#define NOSVS_LEAVE \
+ .byte 0xEB, (SVS_LEAVE_BYTES-2) /* jmp */ ; \
+ .fill (SVS_LEAVE_BYTES-2),1,0xCC
+#define SVS_LEAVE \
+ HOTPATCH(HP_NAME_SVS_LEAVE, SVS_LEAVE_BYTES) ; \
+ NOSVS_LEAVE
+
+#define SVS_ENTER_ALT_BYTES 23
+#define NOSVS_ENTER_ALTSTACK \
+ .byte 0xEB, (SVS_ENTER_ALT_BYTES-2) /* jmp */ ; \
+ .fill (SVS_ENTER_ALT_BYTES-2),1,0xCC
+#define SVS_ENTER_ALTSTACK \
+ HOTPATCH(HP_NAME_SVS_ENTER_ALT, SVS_ENTER_ALT_BYTES) ; \
+ NOSVS_ENTER_ALTSTACK
+
+#define SVS_LEAVE_ALT_BYTES 22
+#define NOSVS_LEAVE_ALTSTACK \
+ .byte 0xEB, (SVS_LEAVE_ALT_BYTES-2) /* jmp */ ; \
+ .fill (SVS_LEAVE_ALT_BYTES-2),1,0xCC
+#define SVS_LEAVE_ALTSTACK \
+ HOTPATCH(HP_NAME_SVS_LEAVE_ALT, SVS_LEAVE_ALT_BYTES) ; \
+ NOSVS_LEAVE_ALTSTACK
+
+#else
+#define SVS_ENTER /* nothing */
+#define SVS_LEAVE /* nothing */
+#define SVS_ENTER_ALTSTACK /* nothing */
+#define SVS_LEAVE_ALTSTACK /* nothing */
+#endif
+
+#define INTRENTRY \
subq $TF_REGSIZE,%rsp ; \
INTR_SAVE_GPRS ; \
cld ; \
testb $SEL_UPL,TF_CS(%rsp) ; \
- je kernel_trap ; \
-usertrap ; \
+ je 98f ; \
SWAPGS ; \
+ SVS_ENTER ; \
movw %gs,TF_GS(%rsp) ; \
movw %fs,TF_FS(%rsp) ; \
movw %es,TF_ES(%rsp) ; \
- movw %ds,TF_DS(%rsp)
-
-#define INTRENTRY \
- INTRENTRY_L(98f,) ; \
+ movw %ds,TF_DS(%rsp) ; \
98:
#define INTRFASTEXIT \
Index: src/sys/arch/amd64/include/param.h
diff -u src/sys/arch/amd64/include/param.h:1.21.6.1 src/sys/arch/amd64/include/param.h:1.21.6.2
--- src/sys/arch/amd64/include/param.h:1.21.6.1 Fri Mar 16 13:17:56 2018
+++ src/sys/arch/amd64/include/param.h Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: param.h,v 1.21.6.1 2018/03/16 13:17:56 martin Exp $ */
+/* $NetBSD: param.h,v 1.21.6.2 2018/03/22 16:59:03 martin Exp $ */
#ifdef __x86_64__
@@ -53,9 +53,9 @@
#define SSIZE 1 /* initial stack size/NBPG */
#define SINCR 1 /* increment of stack/NBPG */
#ifdef DIAGNOSTIC
-#define UPAGES 4 /* pages of u-area (1 for redzone) */
+#define UPAGES 5 /* pages of u-area (1 for redzone) */
#else
-#define UPAGES 3 /* pages of u-area */
+#define UPAGES 4 /* pages of u-area */
#endif
#define USPACE (UPAGES * NBPG) /* total size of u-area */
Index: src/sys/arch/amd64/include/pmap.h
diff -u src/sys/arch/amd64/include/pmap.h:1.39 src/sys/arch/amd64/include/pmap.h:1.39.8.1
--- src/sys/arch/amd64/include/pmap.h:1.39 Fri Nov 11 12:06:31 2016
+++ src/sys/arch/amd64/include/pmap.h Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: pmap.h,v 1.39 2016/11/11 12:06:31 maxv Exp $ */
+/* $NetBSD: pmap.h,v 1.39.8.1 2018/03/22 16:59:03 martin Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -218,6 +218,12 @@
*/
#define NPTECL 8
+void svs_pmap_sync(struct pmap *, int);
+void svs_lwp_switch(struct lwp *, struct lwp *);
+void svs_pdir_switch(struct pmap *);
+void svs_init(void);
+extern bool svs_enabled;
+
#include <x86/pmap.h>
#ifndef XEN
Index: src/sys/arch/x86/conf/files.x86
diff -u src/sys/arch/x86/conf/files.x86:1.88 src/sys/arch/x86/conf/files.x86:1.88.6.1
--- src/sys/arch/x86/conf/files.x86:1.88 Fri Mar 10 14:40:56 2017
+++ src/sys/arch/x86/conf/files.x86 Thu Mar 22 16:59:03 2018
@@ -1,4 +1,4 @@
-# $NetBSD: files.x86,v 1.88 2017/03/10 14:40:56 maxv Exp $
+# $NetBSD: files.x86,v 1.88.6.1 2018/03/22 16:59:03 martin Exp $
# options for MP configuration through the MP spec
defflag opt_mpbios.h MPBIOS MPVERBOSE MPDEBUG MPBIOS_SCANPCI
@@ -16,6 +16,8 @@ defflag opt_pcifixup.h PCI_ADDR_FIXUP PC
# To be able to test for NetBSD/xen in shared files
defflag opt_xen.h DO_NOT_DEFINE
+defflag SVS
+
define cpubus { [apid = -1] }
define cpufeaturebus {}
define ioapicbus { [apid = -1] }
@@ -90,6 +92,7 @@ file arch/x86/x86/pmap.c machdep
file arch/x86/x86/pmap_tlb.c machdep
file arch/x86/x86/pmc.c machdep
file arch/x86/x86/procfs_machdep.c procfs
+file arch/x86/x86/svs.c machdep & svs
file arch/x86/x86/sys_machdep.c machdep
file arch/x86/x86/syscall.c machdep
file arch/x86/x86/tsc.c machdep
Index: src/sys/arch/x86/include/cpu.h
diff -u src/sys/arch/x86/include/cpu.h:1.71.2.3 src/sys/arch/x86/include/cpu.h:1.71.2.4
--- src/sys/arch/x86/include/cpu.h:1.71.2.3 Fri Mar 16 13:17:56 2018
+++ src/sys/arch/x86/include/cpu.h Thu Mar 22 16:59:04 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu.h,v 1.71.2.3 2018/03/16 13:17:56 martin Exp $ */
+/* $NetBSD: cpu.h,v 1.71.2.4 2018/03/22 16:59:04 martin Exp $ */
/*-
* Copyright (c) 1990 The Regents of the University of California.
@@ -47,6 +47,7 @@
#if defined(_KERNEL) || defined(_KMEMUSER)
#if defined(_KERNEL_OPT)
#include "opt_xen.h"
+#include "opt_svs.h"
#ifdef i386
#include "opt_user_ldt.h"
#include "opt_vm86.h"
@@ -197,6 +198,18 @@ struct cpu_info {
pd_entry_t * ci_pae_l3_pdir; /* VA pointer to L3 PD */
#endif
+#ifdef SVS
+ pd_entry_t * ci_svs_updir;
+ paddr_t ci_svs_updirpa;
+ paddr_t ci_svs_kpdirpa;
+ kmutex_t ci_svs_mtx;
+ pd_entry_t * ci_svs_rsp0_pte;
+ vaddr_t ci_svs_rsp0;
+ vaddr_t ci_svs_ursp0;
+ vaddr_t ci_svs_krsp0;
+ vaddr_t ci_svs_utls;
+#endif
+
#if defined(XEN) && (defined(PAE) || defined(__x86_64__))
/* Currently active user PGD (can't use rcr3() with Xen) */
pd_entry_t * ci_kpm_pdir; /* per-cpu PMD (va) */
@@ -342,6 +355,7 @@ void cpu_broadcast_halt(void);
void cpu_kick(struct cpu_info *);
void cpu_pcpuarea_init(struct cpu_info *);
+void cpu_svs_init(struct cpu_info *);
#define curcpu() x86_curcpu()
#define curlwp x86_curlwp()
Index: src/sys/arch/x86/include/pmap.h
diff -u src/sys/arch/x86/include/pmap.h:1.64.6.1 src/sys/arch/x86/include/pmap.h:1.64.6.2
--- src/sys/arch/x86/include/pmap.h:1.64.6.1 Fri Mar 16 13:17:56 2018
+++ src/sys/arch/x86/include/pmap.h Thu Mar 22 16:59:04 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: pmap.h,v 1.64.6.1 2018/03/16 13:17:56 martin Exp $ */
+/* $NetBSD: pmap.h,v 1.64.6.2 2018/03/22 16:59:04 martin Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -126,9 +126,13 @@ struct pcpu_entry {
uint8_t ist0[PAGE_SIZE];
uint8_t ist1[PAGE_SIZE];
uint8_t ist2[PAGE_SIZE];
+ uint8_t rsp0[2 * PAGE_SIZE];
} __packed;
struct pcpu_area {
+#ifdef SVS
+ uint8_t utls[PAGE_SIZE];
+#endif
uint8_t idt[PAGE_SIZE];
uint8_t ldt[PAGE_SIZE];
struct pcpu_entry ent[MAXCPUS];
Index: src/sys/arch/x86/x86/cpu.c
diff -u src/sys/arch/x86/x86/cpu.c:1.130.2.4 src/sys/arch/x86/x86/cpu.c:1.130.2.5
--- src/sys/arch/x86/x86/cpu.c:1.130.2.4 Fri Mar 16 13:17:56 2018
+++ src/sys/arch/x86/x86/cpu.c Thu Mar 22 16:59:04 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu.c,v 1.130.2.4 2018/03/16 13:17:56 martin Exp $ */
+/* $NetBSD: cpu.c,v 1.130.2.5 2018/03/22 16:59:04 martin Exp $ */
/*-
* Copyright (c) 2000-2012 NetBSD Foundation, Inc.
@@ -62,12 +62,13 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.130.2.4 2018/03/16 13:17:56 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.130.2.5 2018/03/22 16:59:04 martin Exp $");
#include "opt_ddb.h"
#include "opt_mpbios.h" /* for MPDEBUG */
#include "opt_mtrr.h"
#include "opt_multiprocessor.h"
+#include "opt_svs.h"
#include "lapic.h"
#include "ioapic.h"
@@ -410,6 +411,10 @@ cpu_attach(device_t parent, device_t sel
KASSERT(ci->ci_data.cpu_idlelwp != NULL);
}
+#ifdef SVS
+ cpu_svs_init(ci);
+#endif
+
pmap_reference(pmap_kernel());
ci->ci_pmap = pmap_kernel();
ci->ci_tlbstate = TLBSTATE_STALE;
@@ -597,6 +602,9 @@ cpu_init(struct cpu_info *ci)
* hardware supports it.
*/
if (cpu_feature[0] & CPUID_PGE)
+#ifdef SVS
+ if (!svs_enabled)
+#endif
cr4 |= CR4_PGE; /* enable global TLB caching */
/*
@@ -1071,7 +1079,7 @@ mp_cpu_start_cleanup(struct cpu_info *ci
#ifdef __x86_64__
typedef void (vector)(void);
-extern vector Xsyscall, Xsyscall32;
+extern vector Xsyscall, Xsyscall32, Xsyscall_svs;
#endif
void
@@ -1085,6 +1093,11 @@ cpu_init_msrs(struct cpu_info *ci, bool
wrmsr(MSR_CSTAR, (uint64_t)Xsyscall32);
wrmsr(MSR_SFMASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
+#ifdef SVS
+ if (svs_enabled)
+ wrmsr(MSR_LSTAR, (uint64_t)Xsyscall_svs);
+#endif
+
if (full) {
wrmsr(MSR_FSBASE, 0);
wrmsr(MSR_GSBASE, (uint64_t)ci);
@@ -1245,6 +1258,10 @@ x86_cpu_idle_halt(void)
void
cpu_load_pmap(struct pmap *pmap, struct pmap *oldpmap)
{
+#ifdef SVS
+ svs_pdir_switch(pmap);
+#endif
+
#ifdef PAE
struct cpu_info *ci = curcpu();
bool interrupts_enabled;
Index: src/sys/arch/x86/x86/pmap.c
diff -u src/sys/arch/x86/x86/pmap.c:1.245.6.5 src/sys/arch/x86/x86/pmap.c:1.245.6.6
--- src/sys/arch/x86/x86/pmap.c:1.245.6.5 Fri Mar 16 13:17:56 2018
+++ src/sys/arch/x86/x86/pmap.c Thu Mar 22 16:59:04 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: pmap.c,v 1.245.6.5 2018/03/16 13:17:56 martin Exp $ */
+/* $NetBSD: pmap.c,v 1.245.6.6 2018/03/22 16:59:04 martin Exp $ */
/*-
* Copyright (c) 2008, 2010, 2016, 2017 The NetBSD Foundation, Inc.
@@ -171,12 +171,13 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.245.6.5 2018/03/16 13:17:56 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.245.6.6 2018/03/22 16:59:04 martin Exp $");
#include "opt_user_ldt.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_xen.h"
+#include "opt_svs.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -2051,31 +2052,30 @@ pmap_free_ptp(struct pmap *pmap, struct
do {
index = pl_i(va, level + 1);
opde = pmap_pte_testset(&pdes[level - 1][index], 0);
-#if defined(XEN)
-# if defined(__x86_64__)
+
/*
- * If ptp is a L3 currently mapped in kernel space,
- * on any cpu, clear it before freeing
+ * On Xen-amd64 or SVS, we need to sync the top level page
+ * directory on each CPU.
*/
+#if defined(XEN) && defined(__x86_64__)
if (level == PTP_LEVELS - 1) {
- /*
- * Update the per-cpu PD on all cpus the current
- * pmap is active on
- */
xen_kpm_sync(pmap, index);
}
-# endif /*__x86_64__ */
+#elif defined(SVS)
+ if (svs_enabled && level == PTP_LEVELS - 1) {
+ svs_pmap_sync(pmap, index);
+ }
+#endif
+
invaladdr = level == 1 ? (vaddr_t)ptes :
(vaddr_t)pdes[level - 2];
pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
opde, TLBSHOOT_FREE_PTP1);
+
+#if defined(XEN)
pmap_tlb_shootnow();
-#else /* XEN */
- invaladdr = level == 1 ? (vaddr_t)ptes :
- (vaddr_t)pdes[level - 2];
- pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
- opde, TLBSHOOT_FREE_PTP1);
-#endif /* XEN */
+#endif
+
pmap_freepage(pmap, ptp, level);
if (level < PTP_LEVELS - 1) {
ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
@@ -2157,15 +2157,19 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t
pa = VM_PAGE_TO_PHYS(ptp);
pmap_pte_set(&pva[index], (pd_entry_t)
(pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
+
+ /*
+ * On Xen-amd64 or SVS, we need to sync the top level page
+ * directory on each CPU.
+ */
#if defined(XEN) && defined(__x86_64__)
if (i == PTP_LEVELS) {
-
- /*
- * Update the per-cpu PD on all cpus the current
- * pmap is active on
- */
xen_kpm_sync(pmap, index);
}
+#elif defined(SVS)
+ if (svs_enabled && i == PTP_LEVELS) {
+ svs_pmap_sync(pmap, index);
+ }
#endif
pmap_pte_flush();
pmap_stats_update(pmap, 1, 0);
Index: src/sys/arch/x86/x86/vm_machdep.c
diff -u src/sys/arch/x86/x86/vm_machdep.c:1.28.6.2 src/sys/arch/x86/x86/vm_machdep.c:1.28.6.3
--- src/sys/arch/x86/x86/vm_machdep.c:1.28.6.2 Sat Mar 17 11:23:18 2018
+++ src/sys/arch/x86/x86/vm_machdep.c Thu Mar 22 16:59:04 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: vm_machdep.c,v 1.28.6.2 2018/03/17 11:23:18 martin Exp $ */
+/* $NetBSD: vm_machdep.c,v 1.28.6.3 2018/03/22 16:59:04 martin Exp $ */
/*-
* Copyright (c) 1982, 1986 The Regents of the University of California.
@@ -80,7 +80,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vm_machdep.c,v 1.28.6.2 2018/03/17 11:23:18 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vm_machdep.c,v 1.28.6.3 2018/03/22 16:59:04 martin Exp $");
#include "opt_mtrr.h"
@@ -178,9 +178,16 @@ cpu_lwp_fork(struct lwp *l1, struct lwp
* returns normally.
*/
uv = uvm_lwp_getuarea(l2);
+ KASSERT(uv % PAGE_SIZE == 0);
#ifdef __x86_64__
- pcb2->pcb_rsp0 = (uv + USPACE - 16) & ~0xf;
+#ifdef SVS
+ pcb2->pcb_rsp0 = (uv + USPACE - PAGE_SIZE +
+ sizeof(struct trapframe));
+ KASSERT((pcb2->pcb_rsp0 & 0xF) == 0);
+#else
+ pcb2->pcb_rsp0 = (uv + USPACE - 16);
+#endif
tf = (struct trapframe *)pcb2->pcb_rsp0 - 1;
#else
pcb2->pcb_esp0 = (uv + USPACE - 16);
Index: src/sys/arch/x86/x86/x86_machdep.c
diff -u src/sys/arch/x86/x86/x86_machdep.c:1.91.4.1 src/sys/arch/x86/x86/x86_machdep.c:1.91.4.2
--- src/sys/arch/x86/x86/x86_machdep.c:1.91.4.1 Wed Jun 21 17:41:50 2017
+++ src/sys/arch/x86/x86/x86_machdep.c Thu Mar 22 16:59:04 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: x86_machdep.c,v 1.91.4.1 2017/06/21 17:41:50 snj Exp $ */
+/* $NetBSD: x86_machdep.c,v 1.91.4.2 2018/03/22 16:59:04 martin Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007 YAMAMOTO Takashi,
@@ -31,11 +31,12 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: x86_machdep.c,v 1.91.4.1 2017/06/21 17:41:50 snj Exp $");
+__KERNEL_RCSID(0, "$NetBSD: x86_machdep.c,v 1.91.4.2 2018/03/22 16:59:04 martin Exp $");
#include "opt_modular.h"
#include "opt_physmem.h"
#include "opt_splash.h"
+#include "opt_svs.h"
#include <sys/types.h>
#include <sys/param.h>
@@ -1179,6 +1180,22 @@ SYSCTL_SETUP(sysctl_machdep_setup, "sysc
NULL, 0, &use_pae, 0,
CTL_MACHDEP, CTL_CREATE, CTL_EOL);
+#ifdef SVS
+ int sysctl_machdep_svs_enabled(SYSCTLFN_ARGS);
+ const struct sysctlnode *svs_rnode = NULL;
+ sysctl_createv(clog, 0, NULL, &svs_rnode,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "svs", NULL,
+ NULL, 0, NULL, 0,
+ CTL_MACHDEP, CTL_CREATE);
+ sysctl_createv(clog, 0, &svs_rnode, &svs_rnode,
+ CTLFLAG_READWRITE,
+ CTLTYPE_BOOL, "enabled",
+ SYSCTL_DESCR("Whether the kernel uses SVS"),
+ sysctl_machdep_svs_enabled, 0, &svs_enabled, 0,
+ CTL_CREATE, CTL_EOL);
+#endif
+
/* None of these can ever change once the system has booted */
const_sysctl(clog, "fpu_present", CTLTYPE_INT, i386_fpu_present,
CPU_FPU_PRESENT);
Index: src/sys/arch/xen/conf/files.compat
diff -u src/sys/arch/xen/conf/files.compat:1.25.8.1 src/sys/arch/xen/conf/files.compat:1.25.8.2
--- src/sys/arch/xen/conf/files.compat:1.25.8.1 Tue Aug 1 23:18:30 2017
+++ src/sys/arch/xen/conf/files.compat Thu Mar 22 16:59:04 2018
@@ -1,4 +1,4 @@
-# $NetBSD: files.compat,v 1.25.8.1 2017/08/01 23:18:30 snj Exp $
+# $NetBSD: files.compat,v 1.25.8.2 2018/03/22 16:59:04 martin Exp $
# NetBSD: files.x86,v 1.10 2003/10/08 17:30:00 bouyer Exp
# options for MP configuration through the MP spec
@@ -29,6 +29,7 @@ defflag opt_pcifixup.h XXXOPT_PCIFIXUP
defflag opt_vm86.h XXXVM86
defflag opt_pmc.h XXXPMC
+defflag opt_svs.h XXXSVS
# User-settable LDT (used by WINE)
defflag opt_user_ldt.h XXXUSER_LDT
Added files:
Index: src/sys/arch/x86/x86/svs.c
diff -u /dev/null src/sys/arch/x86/x86/svs.c:1.14.2.2
--- /dev/null Thu Mar 22 16:59:04 2018
+++ src/sys/arch/x86/x86/svs.c Thu Mar 22 16:59:04 2018
@@ -0,0 +1,753 @@
+/* $NetBSD: svs.c,v 1.14.2.2 2018/03/22 16:59:04 martin Exp $ */
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: svs.c,v 1.14.2.2 2018/03/22 16:59:04 martin Exp $");
+
+#include "opt_svs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/cpu.h>
+#include <sys/sysctl.h>
+#include <sys/xcall.h>
+
+#include <x86/cputypes.h>
+#include <machine/cpuvar.h>
+#include <machine/frameasm.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_page.h>
+
+/*
+ * Separate Virtual Space
+ *
+ * A per-cpu L4 page is maintained in ci_svs_updirpa. During each context
+ * switch to a user pmap, the lower half of updirpa is populated with the
+ * entries containing the userland pages.
+ *
+ * ~~~~~~~~~~ The UTLS Page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * We use a special per-cpu page that we call UTLS, for User Thread Local
+ * Storage. Each CPU has one UTLS page. This page has two VAs:
+ *
+ * o When the user page tables are loaded in CR3, the VA to access this
+ * page is &pcpuarea->utls, defined as SVS_UTLS in assembly. This VA is
+ * _constant_ across CPUs, but in the user page tables this VA points to
+ * the physical page of the UTLS that is _local_ to the CPU.
+ *
+ * o When the kernel page tables are loaded in CR3, the VA to access this
+ * page is ci->ci_svs_utls.
+ *
+ * +----------------------------------------------------------------------+
+ * | CPU0 Local Data (Physical Page) |
+ * | +------------------+ +-------------+ |
+ * | | User Page Tables | SVS_UTLS ---------------------> | cpu0's UTLS | |
+ * | +------------------+ +-------------+ |
+ * +-------------------------------------------------------------^--------+
+ * |
+ * +----------+
+ * |
+ * +----------------------------------------------------------------------+ |
+ * | CPU1 Local Data (Physical Page) | |
+ * | +------------------+ +-------------+ | |
+ * | | User Page Tables | SVS_UTLS ---------------------> | cpu1's UTLS | | |
+ * | +------------------+ +-------------+ | |
+ * +-------------------------------------------------------------^--------+ |
+ * | |
+ * +------------------+ /----------------------+ |
+ * | Kern Page Tables | ci->ci_svs_utls |
+ * +------------------+ \---------------------------------+
+ *
+ * The goal of the UTLS page is to provide an area where we can store whatever
+ * we want, in a way that it is accessible both when the Kernel and when the
+ * User page tables are loaded in CR3.
+ *
+ * We store three 64-bit values in the UTLS page:
+ *
+ * o UTLS_KPDIRPA: the value we must put in CR3 in order to load the kernel
+ * page tables.
+ *
+ * o UTLS_SCRATCH: a dummy place where we temporarily store a value during
+ * the syscall entry procedure.
+ *
+ * o UTLS_RSP0: the value we must put in RSP in order to have a stack where
+ * we can push the register states. This is used only during the syscall
+ * entry procedure, because there the CPU does not automatically switch
+ * RSP (it does not use the TSS.rsp0 mechanism described below).
+ *
+ * ~~~~~~~~~~ The Stack Switching Mechanism Without SVS ~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * The kernel stack is per-lwp (pcb_rsp0). When doing a context switch between
+ * two user LWPs, the kernel updates TSS.rsp0 (which is per-cpu) to point to
+ * the stack of the new LWP. Then the execution continues. At some point, the
+ * user LWP we context-switched to will perform a syscall or will receive an
+ * interrupt. There, the CPU will automatically read TSS.rsp0 and use it as a
+ * stack. The kernel then pushes the register states on this stack, and
+ * executes in kernel mode normally.
+ *
+ * TSS.rsp0 is used by the CPU only during ring3->ring0 transitions. Therefore,
+ * when an interrupt is received while we were in kernel mode, the CPU does not
+ * read TSS.rsp0. Instead, it just uses the current stack.
+ *
+ * ~~~~~~~~~~ The Stack Switching Mechanism With SVS ~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * In the pcpu_area structure, pointed to by the "pcpuarea" variable, each CPU
+ * has a two-page rsp0 entry (pcpuarea->ent[cid].rsp0). These two pages do
+ * _not_ have associated physical addresses. They are only two VAs.
+ *
+ * The first page is unmapped and acts as a redzone. The second page is
+ * dynamically kentered onto the physical page that backs the highest page of
+ * the real per-lwp kernel stack; but pay close attention, it is kentered
+ * _only_ in the user page tables. That is to say, the VA of this second page
+ * is mapped when the user page tables are loaded, but not mapped when the
+ * kernel page tables are loaded.
+ *
+ * During a context switch, svs_lwp_switch() gets called first. This function
+ * does the kenter job described above, not in the kernel page tables (that
+ * are currently loaded), but in the user page tables (that are not loaded).
+ *
+ * VIRTUAL ADDRESSES PHYSICAL ADDRESSES
+ *
+ * +-----------------------------+
+ * | KERNEL PAGE TABLES |
+ * | +-------------------+ | +-------------------+
+ * | | pcb_rsp0 (page 0) | ------------------> | pcb_rsp0 (page 0) |
+ * | +-------------------+ | +-------------------+
+ * | | pcb_rsp0 (page 1) | ------------------> | pcb_rsp0 (page 1) |
+ * | +-------------------+ | +-------------------+
+ * | | pcb_rsp0 (page 2) | ------------------> | pcb_rsp0 (page 2) |
+ * | +-------------------+ | +-------------------+
+ * | | pcb_rsp0 (page 3) | ------------------> | pcb_rsp0 (page 3) |
+ * | +-------------------+ | +-> +-------------------+
+ * +-----------------------------+ |
+ * |
+ * +---------------------------------------+ |
+ * | USER PAGE TABLES | |
+ * | +----------------------------------+ | |
+ * | | pcpuarea->ent[cid].rsp0 (page 0) | | |
+ * | +----------------------------------+ | |
+ * | | pcpuarea->ent[cid].rsp0 (page 1) | ----+
+ * | +----------------------------------+ |
+ * +---------------------------------------+
+ *
+ * After svs_lwp_switch() gets called, we set pcpuarea->ent[cid].rsp0 (page 1)
+ * in TSS.rsp0. Later, when returning to userland on the lwp we context-
+ * switched to, we will load the user page tables and execute in userland
+ * normally.
+ *
+ * Next time an interrupt or syscall is received, the CPU will automatically
+ * use TSS.rsp0 as a stack. Here it is executing with the user page tables
+ * loaded, and therefore TSS.rsp0 is _mapped_.
+ *
+ * As part of the kernel entry procedure, we now switch CR3 to load the kernel
+ * page tables. Here, we are still using the stack pointer we set in TSS.rsp0.
+ *
+ * Remember that it was only one page of stack which was mapped only in the
+ * user page tables. We just switched to the kernel page tables, so we must
+ * update RSP to be the real per-lwp kernel stack (pcb_rsp0). And we do so,
+ * without touching the stack (since it is now unmapped, touching it would
+ * fault).
+ *
+ * After we updated RSP, we can continue execution exactly as in the non-SVS
+ * case. We don't need to copy the values the CPU pushed on TSS.rsp0: even if
+ * we updated RSP to a totally different VA, this VA points to the same
+ * physical page as TSS.rsp0. So in the end, the values the CPU pushed are
+ * still here even with the new RSP.
+ *
+ * Thanks to this double-kenter optimization, we don't need to copy the
+ * trapframe during each user<->kernel transition.
+ *
+ * ~~~~~~~~~~ Notes On Locking And Synchronization ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * o Touching ci_svs_updir without holding ci_svs_mtx first is *not*
+ * allowed.
+ *
+ * o pm_kernel_cpus contains the set of CPUs that have the pmap loaded
+ * in their CR3 register. It must *not* be replaced by pm_cpus.
+ *
+ * o When a context switch on the current CPU is made from a user LWP
+ * towards a kernel LWP, CR3 is not updated. Therefore, the pmap's
+ * pm_kernel_cpus still contains the current CPU. It implies that the
+ * remote CPUs that execute other threads of the user process we just
+ * left will keep synchronizing us against their changes.
+ *
+ * ~~~~~~~~~~ List Of Areas That Are Removed From Userland ~~~~~~~~~~~~~~~~~~~
+ *
+ * o PTE Space
+ * o Direct Map
+ * o Remote PCPU Areas
+ * o Kernel Heap
+ * o Kernel Image
+ *
+ * ~~~~~~~~~~ Todo List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * Ordered from highest priority to lowest:
+ *
+ * o The NMI stack is not double-entered. Therefore if we ever receive an NMI
+ * and leave it, the content of the stack will be visible to userland (via
+ * Meltdown). Normally we never leave NMIs, unless a privileged user
+ * launched PMCs. That's unlikely to happen, our PMC support is pretty
+ * minimal, and privileged only.
+ *
+ * o Narrow down the entry points: hide the 'jmp handler' instructions. This
+ * makes sense on GENERIC_KASLR kernels.
+ *
+ * o Right now there is only one global LDT, and that's not compatible with
+ * USER_LDT.
+ */
+
+bool svs_enabled __read_mostly = false;	/* whether the kernel uses SVS; exposed as sysctl machdep.svs.enabled */
+/*
+ * Contents of the per-CPU UTLS page (see "The UTLS Page" in the comment
+ * at the top of this file).  The field order follows the UTLS_KPDIRPA (0),
+ * UTLS_SCRATCH (8) and UTLS_RSP0 (16) offsets that frameasm.h uses from
+ * assembly -- this assumes paddr_t and vaddr_t are 8 bytes wide here;
+ * TODO confirm.
+ */
+struct svs_utls {
+ paddr_t kpdirpa;	/* value to put in CR3 to load the kernel page tables */
+ uint64_t scratch;	/* temporary storage used during the syscall entry procedure */
+ vaddr_t rsp0;	/* value to put in RSP during the syscall entry procedure */
+};
+
+/*
+ * Return the L1 page table that covers 'va' in the per-CPU user page
+ * directory of 'ci' (ci->ci_svs_updir), creating the missing levels on
+ * the way down.
+ *
+ * Every level whose entry is not valid gets a freshly allocated, zeroed
+ * page, entered as PG_V | PG_RW.  The returned pointer is the direct-map
+ * address of the L1 table; callers index it with pl1_i(va % NBPD_L2).
+ *
+ * Panics if a page table page cannot be allocated.
+ */
+static pd_entry_t *
+svs_tree_add(struct cpu_info *ci, vaddr_t va)
+{
+	/* NOTE(review): the two ptp_* arrays are presumably consumed by pl_i() -- confirm. */
+	extern const vaddr_t ptp_masks[];
+	extern const int ptp_shifts[];
+	extern const long nbpd[];
+	pd_entry_t *dstpde;
+	size_t i, pidx, mod;
+	struct vm_page *pg;
+	paddr_t pa;
+
+	dstpde = ci->ci_svs_updir;
+	mod = (size_t)-1;
+
+	for (i = PTP_LEVELS; i > 1; i--) {
+		/* Slot of 'va' in the table of level 'i'. */
+		pidx = pl_i(va % mod, i);
+
+		if (!pmap_valid_entry(dstpde[pidx])) {
+			pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+			if (pg == NULL)
+				panic("%s: failed to allocate PA for CPU %d",
+				    __func__, cpu_index(ci));
+			pa = VM_PAGE_TO_PHYS(pg);
+
+			dstpde[pidx] = PG_V | PG_RW | pa;
+		}
+
+		/* Descend: reach the next-level table through the direct map. */
+		pa = (paddr_t)(dstpde[pidx] & PG_FRAME);
+		dstpde = (pd_entry_t *)PMAP_DIRECT_MAP(pa);
+
+		/* From now on only the offset of 'va' below this level matters. */
+		mod = nbpd[i-1];
+	}
+
+	return dstpde;
+}
+
+/*
+ * Enter the kernel page at 'va' into the user page tables of 'ci'. The new
+ * entry is derived from the kernel's own mapping with PG_G removed; when the
+ * kernel maps 'va' with a large (PG_PS) page, an equivalent small-page entry
+ * is built by hand. Panics if 'va' is not mapped in the kernel or if the
+ * user page tables already hold an entry for it.
+ */
+static void
+svs_page_add(struct cpu_info *ci, vaddr_t va)
+{
+	pd_entry_t *upt, kl2e, newpte;
+	size_t uslot;
+
+	/* Create levels L4, L3 and L2, and locate the slot in the L1 page. */
+	upt = svs_tree_add(ci, va);
+	uslot = pl1_i(va % NBPD_L2);
+
+	kl2e = L2_BASE[pl2_i(va)];
+	if (!pmap_valid_entry(kl2e)) {
+		panic("%s: L2 page not mapped", __func__);
+	}
+
+	if (kl2e & PG_PS) {
+		/*
+		 * 'va' lies in a large page: compute the physical address of
+		 * the 4KB page manually and drop the large-page bits.
+		 */
+		const paddr_t pa =
+		    (kl2e & PG_2MFRAME) + (paddr_t)(va % NBPD_L2);
+
+		newpte = (kl2e & ~(PG_G|PG_PS|PG_2MFRAME)) | pa;
+	} else {
+		/* Normal page: reuse the kernel L1 entry. */
+		const pd_entry_t kl1e = L1_BASE[pl1_i(va)];
+
+		if (!pmap_valid_entry(kl1e)) {
+			panic("%s: L1 page not mapped", __func__);
+		}
+		newpte = kl1e & ~(PG_G);
+	}
+
+	if (pmap_valid_entry(upt[uslot])) {
+		panic("%s: L1 page already mapped", __func__);
+	}
+	upt[uslot] = newpte;
+}
+
+/*
+ * Prepare the per-CPU rsp0 slot of 'ci'. The page-table levels above L1 are
+ * created for the page that follows the redzone, and the address of its L1
+ * entry is remembered in ci_svs_rsp0_pte; the entry itself is left empty
+ * here and is filled in by svs_lwp_switch(). Panics if the entry is already
+ * valid.
+ */
+static void
+svs_rsp0_init(struct cpu_info *ci)
+{
+	const vaddr_t base = (vaddr_t)&pcpuarea->ent[cpu_index(ci)].rsp0;
+	/* The first page is a redzone, the stack page comes right after. */
+	const vaddr_t stackva = base + PAGE_SIZE;
+	pd_entry_t *l1;
+	size_t slot;
+
+	/* Create levels L4, L3 and L2. */
+	l1 = svs_tree_add(ci, stackva);
+
+	/* Locate the L1 entry; it must not be in use yet. */
+	slot = pl1_i(stackva % NBPD_L2);
+	if (pmap_valid_entry(l1[slot])) {
+		panic("%s: rsp0 page already mapped", __func__);
+	}
+
+	ci->ci_svs_rsp0_pte = (pt_entry_t *)&l1[slot];
+	ci->ci_svs_rsp0 = stackva + sizeof(struct trapframe);
+	ci->ci_svs_ursp0 = ci->ci_svs_rsp0 - sizeof(struct trapframe);
+	ci->ci_svs_krsp0 = 0;
+}
+
+/*
+ * Allocate the UTLS page of 'ci'. The page is entered in the user page
+ * tables at the address of pcpuarea->utls (valid, writable, pmap_pg_nx),
+ * and is also mapped at a freshly allocated kernel VA which is saved in
+ * ci_svs_utls. Finally the constant rsp0 field is initialised. Panics on
+ * any allocation failure or if one of the target entries is already valid.
+ */
+static void
+svs_utls_init(struct cpu_info *ci)
+{
+	const vaddr_t utlsva = (vaddr_t)&pcpuarea->utls;
+	struct vm_page *utlspg;
+	pd_entry_t *l1;
+	paddr_t utlspa;
+	vaddr_t kva;
+	size_t slot;
+
+	/* Create levels L4, L3 and L2 of the UTLS page. */
+	l1 = svs_tree_add(ci, utlsva);
+
+	/* Allocate the physical page backing the UTLS. */
+	utlspg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+	if (utlspg == NULL)
+		panic("%s: failed to allocate PA for CPU %d\n", __func__,
+		    cpu_index(ci));
+	utlspa = VM_PAGE_TO_PHYS(utlspg);
+
+	/* Neither the kernel nor the user tables may map utlsva yet. */
+	if (pmap_valid_entry(L1_BASE[pl1_i(utlsva)])) {
+		panic("%s: local page already mapped", __func__);
+	}
+	slot = pl1_i(utlsva % NBPD_L2);
+	if (pmap_valid_entry(l1[slot])) {
+		panic("%s: L1 page already mapped", __func__);
+	}
+	l1[slot] = PG_V | PG_RW | pmap_pg_nx | utlspa;
+
+	/*
+	 * Give the kernel its own window onto the same physical page, so
+	 * that the UTLS can be accessed in kernel mode via ci_svs_utls.
+	 */
+	kva = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
+	    UVM_KMF_VAONLY|UVM_KMF_NOWAIT);
+	if (kva == 0) {
+		panic("%s: unable to allocate VA\n", __func__);
+	}
+	pmap_kenter_pa(kva, utlspa, VM_PROT_READ|VM_PROT_WRITE, 0);
+	pmap_update(pmap_kernel());
+
+	ci->ci_svs_utls = kva;
+
+	/* Initialize the constant fields of the UTLS page. */
+	((struct svs_utls *)ci->ci_svs_utls)->rsp0 = ci->ci_svs_rsp0;
+}
+
+/*
+ * Enter every page of the range [va, va + size) into the user page tables
+ * of 'ci', one svs_page_add() call per page. 'size' is asserted to be a
+ * multiple of PAGE_SIZE.
+ */
+static void
+svs_range_add(struct cpu_info *ci, vaddr_t va, size_t size)
+{
+	size_t left;
+
+	KASSERT(size % PAGE_SIZE == 0);
+
+	for (left = size / PAGE_SIZE; left > 0; left--) {
+		svs_page_add(ci, va);
+		va += PAGE_SIZE;
+	}
+}
+
+/*
+ * Set up the SVS state of 'ci':
+ *  - allocate the per-CPU user L4 page and map it at a kernel VA
+ *    (ci_svs_updirpa / ci_svs_updir);
+ *  - record the kernel pmap's page directory in ci_svs_kpdirpa;
+ *  - initialise ci_svs_mtx;
+ *  - enter into the user page tables the IDT page, the LDT page, the part
+ *    of this CPU's pcpu entry that precedes 'rsp0', and the range
+ *    __text_user_start..__text_user_end;
+ *  - set up the rsp0 slot and the UTLS page.
+ * Panics if any allocation fails.
+ */
+void
+cpu_svs_init(struct cpu_info *ci)
+{
+	extern char __text_user_start;
+	extern char __text_user_end;
+	struct vm_page *pg;
+	cpuid_t cid;
+
+	/*
+	 * Look up the CPU index only after the assertion, so that a NULL
+	 * 'ci' is caught by the KASSERT rather than by the lookup itself.
+	 */
+	KASSERT(ci != NULL);
+	cid = cpu_index(ci);
+
+	pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+	if (pg == NULL)
+		panic("%s: failed to allocate L4 PA for CPU %d\n",
+		    __func__, cpu_index(ci));
+	ci->ci_svs_updirpa = VM_PAGE_TO_PHYS(pg);
+
+	ci->ci_svs_updir = (pt_entry_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
+	    UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
+	if (ci->ci_svs_updir == NULL)
+		panic("%s: failed to allocate L4 VA for CPU %d\n",
+		    __func__, cpu_index(ci));
+
+	pmap_kenter_pa((vaddr_t)ci->ci_svs_updir, ci->ci_svs_updirpa,
+	    VM_PROT_READ | VM_PROT_WRITE, 0);
+
+	pmap_update(pmap_kernel());
+
+	ci->ci_svs_kpdirpa = pmap_pdirpa(pmap_kernel(), 0);
+
+	mutex_init(&ci->ci_svs_mtx, MUTEX_DEFAULT, IPL_VM);
+
+	/* Pages that have to remain visible while the user tables are loaded. */
+	svs_page_add(ci, (vaddr_t)&pcpuarea->idt);
+	svs_page_add(ci, (vaddr_t)&pcpuarea->ldt);
+	svs_range_add(ci, (vaddr_t)&pcpuarea->ent[cid],
+	    offsetof(struct pcpu_entry, rsp0));
+	svs_range_add(ci, (vaddr_t)&__text_user_start,
+	    (vaddr_t)&__text_user_end - (vaddr_t)&__text_user_start);
+
+	svs_rsp0_init(ci);
+	svs_utls_init(ci);
+}
+
+/*
+ * Propagate slot 'index' of the page directory of 'pmap' to the user page
+ * directory of every CPU that is set in pmap->pm_kernel_cpus. The caller
+ * must hold the pmap lock with preemption disabled; 'pmap' must not be the
+ * kernel pmap and 'index' must be below 255.
+ */
+void
+svs_pmap_sync(struct pmap *pmap, int index)
+{
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+
+	KASSERT(pmap != NULL);
+	KASSERT(pmap != pmap_kernel());
+	KASSERT(mutex_owned(pmap->pm_lock));
+	KASSERT(kpreempt_disabled());
+	KASSERT(index < 255);
+
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		const cpuid_t cid = cpu_index(ci);
+
+		/* Cheap unlocked test first... */
+		if (kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
+			/* ...then test again under the per-CPU lock. */
+			mutex_enter(&ci->ci_svs_mtx);
+			if (kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
+				ci->ci_svs_updir[index] = pmap->pm_pdir[index];
+			}
+			mutex_exit(&ci->ci_svs_mtx);
+		}
+	}
+}
+
+/*
+ * Context-switch hook. Nothing is done when 'newlwp' is a system LWP
+ * (LW_SYSTEM). Otherwise ci_svs_krsp0 is set from the new LWP's pcb_rsp0,
+ * the UTLS scratch field is cleared, and the kernel L1 entry of the page
+ * containing pcb_rsp0 is copied into the per-CPU rsp0 entry of the user
+ * page tables. With DIAGNOSTIC, the state left by a non-system 'oldlwp' is
+ * verified first.
+ */
+void
+svs_lwp_switch(struct lwp *oldlwp, struct lwp *newlwp)
+{
+	struct cpu_info *ci = curcpu();
+	struct svs_utls *utls;
+	struct pcb *pcb;
+	uintptr_t rsp0;
+	vaddr_t va;
+
+	if (newlwp->l_flag & LW_SYSTEM) {
+		return;
+	}
+
+#ifdef DIAGNOSTIC
+	if (oldlwp != NULL && (oldlwp->l_flag & LW_SYSTEM) == 0) {
+		struct pcb *opcb = lwp_getpcb(oldlwp);
+		const uintptr_t orsp0 = opcb->pcb_rsp0;
+		const vaddr_t ova = rounddown(orsp0, PAGE_SIZE);
+
+		KASSERT(ci->ci_svs_krsp0 == orsp0 - sizeof(struct trapframe));
+		KASSERT(*ci->ci_svs_rsp0_pte == L1_BASE[pl1_i(ova)]);
+	}
+#endif
+
+	pcb = lwp_getpcb(newlwp);
+	rsp0 = pcb->pcb_rsp0;
+	va = rounddown(rsp0, PAGE_SIZE);
+
+	/* Update the kernel rsp0 in cpu_info. */
+	ci->ci_svs_krsp0 = rsp0 - sizeof(struct trapframe);
+	KASSERT((ci->ci_svs_krsp0 % PAGE_SIZE) ==
+	    (ci->ci_svs_ursp0 % PAGE_SIZE));
+
+	utls = (struct svs_utls *)ci->ci_svs_utls;
+	utls->scratch = 0;
+
+	/*
+	 * Enter the user rsp0. No TLB flush is needed here, since the user
+	 * page tables are not loaded.
+	 */
+	*ci->ci_svs_rsp0_pte = L1_BASE[pl1_i(va)];
+}
+
+/*
+ * Atomically read slot 'idx' of the page directory of 'pmap'.
+ *
+ * XXX: there is no plain atomic 64-bit fetch, so a compare-and-swap with
+ * identical expected and replacement values is used instead: it returns the
+ * current contents and never changes them.
+ */
+static inline pt_entry_t
+svs_pte_atomic_read(struct pmap *pmap, size_t idx)
+{
+	const uint64_t same = 666;
+
+	return atomic_cas_64(&pmap->pm_pdir[idx], same, same);
+}
+
+/*
+ * Make 'pmap' the current user pmap of this CPU: record its page directory
+ * in ci_svs_kpdirpa and in the UTLS page, then copy its first 255 top-level
+ * slots into the per-CPU user page directory under ci_svs_mtx.
+ *
+ * The pmap may be unlocked when we get here, which is why its entries are
+ * read atomically. A remote CPU modifying them concurrently is harmless:
+ * it will call svs_pmap_sync afterwards, and our user page directory will
+ * be synchronized properly.
+ */
+void
+svs_pdir_switch(struct pmap *pmap)
+{
+	struct cpu_info *ci = curcpu();
+	struct svs_utls *utls;
+	size_t slot;
+
+	KASSERT(kpreempt_disabled());
+	KASSERT(pmap != pmap_kernel());
+
+	ci->ci_svs_kpdirpa = pmap_pdirpa(pmap, 0);
+
+	/* Mirror the new value in the UTLS page. */
+	utls = (struct svs_utls *)ci->ci_svs_utls;
+	utls->kpdirpa = ci->ci_svs_kpdirpa;
+
+	mutex_enter(&ci->ci_svs_mtx);
+
+	/* User slots. */
+	for (slot = 0; slot < 255; slot++) {
+		ci->ci_svs_updir[slot] = svs_pte_atomic_read(pmap, slot);
+	}
+
+	mutex_exit(&ci->ci_svs_mtx);
+}
+
+/*
+ * Turn SVS on: set svs_enabled, then hotpatch the four SVS entry/exit
+ * sites (enter, enter-altstack, leave, leave-altstack) with the byte
+ * ranges svs_*..svs_*_end, inside a patch window.
+ */
+static void
+svs_enable(void)
+{
+	extern uint8_t svs_enter, svs_enter_end;
+	extern uint8_t svs_enter_altstack, svs_enter_altstack_end;
+	extern uint8_t svs_leave, svs_leave_end;
+	extern uint8_t svs_leave_altstack, svs_leave_altstack_end;
+	u_long psl, cr0;
+
+	svs_enabled = true;
+
+	x86_patch_window_open(&psl, &cr0);
+
+	x86_hotpatch(HP_NAME_SVS_ENTER, &svs_enter,
+	    (size_t)&svs_enter_end - (size_t)&svs_enter);
+	x86_hotpatch(HP_NAME_SVS_ENTER_ALT, &svs_enter_altstack,
+	    (size_t)&svs_enter_altstack_end - (size_t)&svs_enter_altstack);
+	x86_hotpatch(HP_NAME_SVS_LEAVE, &svs_leave,
+	    (size_t)&svs_leave_end - (size_t)&svs_leave);
+	x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, &svs_leave_altstack,
+	    (size_t)&svs_leave_altstack_end - (size_t)&svs_leave_altstack);
+
+	x86_patch_window_close(psl, cr0);
+}
+
+/*
+ * Hotpatch the four SVS entry/exit sites (enter, enter-altstack, leave,
+ * leave-altstack) with the byte ranges nosvs_*..nosvs_*_end, inside a
+ * patch window. svs_enabled is not touched here.
+ */
+static void
+svs_disable_hotpatch(void)
+{
+	extern uint8_t nosvs_enter, nosvs_enter_end;
+	extern uint8_t nosvs_enter_altstack, nosvs_enter_altstack_end;
+	extern uint8_t nosvs_leave, nosvs_leave_end;
+	extern uint8_t nosvs_leave_altstack, nosvs_leave_altstack_end;
+	u_long psl, cr0;
+
+	x86_patch_window_open(&psl, &cr0);
+
+	x86_hotpatch(HP_NAME_SVS_ENTER, &nosvs_enter,
+	    (size_t)&nosvs_enter_end - (size_t)&nosvs_enter);
+	x86_hotpatch(HP_NAME_SVS_ENTER_ALT, &nosvs_enter_altstack,
+	    (size_t)&nosvs_enter_altstack_end - (size_t)&nosvs_enter_altstack);
+	x86_hotpatch(HP_NAME_SVS_LEAVE, &nosvs_leave,
+	    (size_t)&nosvs_leave_end - (size_t)&nosvs_leave);
+	x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, &nosvs_leave_altstack,
+	    (size_t)&nosvs_leave_altstack_end - (size_t)&nosvs_leave_altstack);
+
+	x86_patch_window_close(psl, cr0);
+}
+
+static volatile unsigned long svs_cpu_barrier1 __cacheline_aligned; /* set to ncpu by svs_disable(); CPUs rendezvous on it before the hotpatch */
+static volatile unsigned long svs_cpu_barrier2 __cacheline_aligned; /* set to ncpu by svs_disable(); CPUs rendezvous on it after the hotpatch */
+typedef void (vector)(void); /* function type used to declare the Xsyscall entry point */
+
+/*
+ * Cross-call handler run on every CPU by svs_disable(); both arguments are
+ * unused. With interrupts disabled, each CPU:
+ *  1. waits on svs_cpu_barrier1 until all CPUs have arrived;
+ *  2. if it is the primary CPU, clears svs_enabled and hotpatches the
+ *     non-SVS entry/exit code;
+ *  3. restores the non-SVS syscall entry point and re-enables global pages;
+ *  4. waits on svs_cpu_barrier2 until all CPUs are done;
+ *  5. flushes caches and pipelines, then restores the saved flags.
+ */
+static void
+svs_disable_cpu(void *arg1, void *arg2)
+{
+	struct cpu_info *ci = curcpu();
+	extern vector Xsyscall;
+	u_long saved_psl;
+
+	saved_psl = x86_read_psl();
+	x86_disable_intr();
+
+	/* First rendezvous: everybody is here with interrupts off. */
+	atomic_dec_ulong(&svs_cpu_barrier1);
+	for (;;) {
+		if (atomic_cas_ulong(&svs_cpu_barrier1, 0, 0) == 0)
+			break;
+		x86_pause();
+	}
+
+	/* cpu0 is the one that does the hotpatch job */
+	if (ci == &cpu_info_primary) {
+		svs_enabled = false;
+		svs_disable_hotpatch();
+	}
+
+	/* put back the non-SVS syscall entry point */
+	wrmsr(MSR_LSTAR, (uint64_t)Xsyscall);
+
+	/* enable global pages */
+	if (cpu_feature[0] & CPUID_PGE)
+		lcr4(rcr4() | CR4_PGE);
+
+	/* Second rendezvous: everybody has finished the steps above. */
+	atomic_dec_ulong(&svs_cpu_barrier2);
+	for (;;) {
+		if (atomic_cas_ulong(&svs_cpu_barrier2, 0, 0) == 0)
+			break;
+		x86_pause();
+	}
+
+	/* Write back and invalidate cache, flush pipelines. */
+	wbinvd();
+	x86_flush();
+
+	x86_write_psl(saved_psl);
+}
+
+/*
+ * Disable SVS on all CPUs. Under cpu_lock, refuse with EOPNOTSUPP if any
+ * CPU is offline; otherwise arm both barriers with ncpu, broadcast
+ * svs_disable_cpu() to every CPU and wait for completion. Returns 0 on
+ * success.
+ */
+static int
+svs_disable(void)
+{
+	struct cpu_info *ci = NULL;
+	CPU_INFO_ITERATOR cii;
+	int error = 0;
+	uint64_t xc;
+
+	mutex_enter(&cpu_lock);
+
+	/*
+	 * We expect all the CPUs to be online.
+	 */
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		if (ci->ci_schedstate.spc_flags & SPCF_OFFLINE) {
+			printf("[!] cpu%d offline, SVS not disabled\n",
+			    cpu_index(ci));
+			error = EOPNOTSUPP;
+			break;
+		}
+	}
+
+	if (error == 0) {
+		svs_cpu_barrier1 = ncpu;
+		svs_cpu_barrier2 = ncpu;
+
+		printf("[+] Disabling SVS...");
+		xc = xc_broadcast(0, svs_disable_cpu, NULL, NULL);
+		xc_wait(xc);
+		printf(" done!\n");
+	}
+
+	mutex_exit(&cpu_lock);
+
+	return error;
+}
+
+int sysctl_machdep_svs_enabled(SYSCTLFN_ARGS);
+
+/*
+ * sysctl handler for the SVS on/off knob.
+ *
+ * A read returns the current value. A write of true is rejected with
+ * EINVAL, since SVS cannot be turned on at runtime; a write of false
+ * disables SVS if it is currently enabled and is a no-op otherwise.
+ * Returns 0 or an errno value.
+ *
+ * The backing variable svs_enabled is a bool, so it is accessed as a bool:
+ * reading it through an int pointer would fetch bytes beyond the object and
+ * leave garbage in the upper bytes of the local copy.
+ * NOTE(review): this assumes the node is registered as CTLTYPE_BOOL with
+ * sysctl_data pointing at svs_enabled -- confirm against the
+ * sysctl_createv() call site.
+ */
+int
+sysctl_machdep_svs_enabled(SYSCTLFN_ARGS)
+{
+	struct sysctlnode node;
+	int error;
+	bool val;
+
+	val = *(bool *)rnode->sysctl_data;
+
+	/* Let sysctl_lookup() operate on a private copy of the value. */
+	node = *rnode;
+	node.sysctl_data = &val;
+
+	error = sysctl_lookup(SYSCTLFN_CALL(&node));
+	if (error != 0 || newp == NULL)
+		return error;
+
+	if (val) {
+		/* Enabling at runtime is not supported. */
+		error = EINVAL;
+	} else {
+		if (svs_enabled)
+			error = svs_disable();
+		else
+			error = 0;
+	}
+
+	return error;
+}
+
+/*
+ * Enable SVS at boot, but only when cpu_vendor is CPUVENDOR_INTEL; on any
+ * other vendor this does nothing.
+ */
+void
+svs_init(void)
+{
+	if (cpu_vendor == CPUVENDOR_INTEL) {
+		svs_enable();
+	}
+}