Module Name: src
Committed By: riastradh
Date: Sun Jul 16 19:55:44 UTC 2023
Modified Files:
src/sys/arch/amd64/amd64: locore.S machdep.c
src/sys/arch/i386/i386: gdt.c locore.S machdep.c
src/sys/arch/x86/x86: cpu.c pmap.c
Log Message:
x86: Sprinkle extensive commentary about %fs/%gs initialization.
Plus some other side quests like the three-stage GDT metamorphosis
lifecycle.
No functional change intended.
To generate a diff of this commit:
cvs rdiff -u -r1.222 -r1.223 src/sys/arch/amd64/amd64/locore.S
cvs rdiff -u -r1.366 -r1.367 src/sys/arch/amd64/amd64/machdep.c
cvs rdiff -u -r1.73 -r1.74 src/sys/arch/i386/i386/gdt.c
cvs rdiff -u -r1.195 -r1.196 src/sys/arch/i386/i386/locore.S
cvs rdiff -u -r1.839 -r1.840 src/sys/arch/i386/i386/machdep.c
cvs rdiff -u -r1.208 -r1.209 src/sys/arch/x86/x86/cpu.c
cvs rdiff -u -r1.423 -r1.424 src/sys/arch/x86/x86/pmap.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/sys/arch/amd64/amd64/locore.S
diff -u src/sys/arch/amd64/amd64/locore.S:1.222 src/sys/arch/amd64/amd64/locore.S:1.223
--- src/sys/arch/amd64/amd64/locore.S:1.222 Sat Jun 24 05:31:04 2023
+++ src/sys/arch/amd64/amd64/locore.S Sun Jul 16 19:55:43 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: locore.S,v 1.222 2023/06/24 05:31:04 msaitoh Exp $ */
+/* $NetBSD: locore.S,v 1.223 2023/07/16 19:55:43 riastradh Exp $ */
/*
* Copyright-o-rama!
@@ -974,6 +974,20 @@ longmode_hi:
movl $VM_GUEST_XENPV, _C_LABEL(vm_guest)
+ /*
+ * Initialize cpu_info_primary.ci_self := &cpu_info_primary,
+ * and initialize some MSRs with
+ * cpu_init_msrs(&cpu_info_primary, full=true). This sets up
+ * SYSCALL/SYSRET (XXX why?) and %fs/%gs, which is needed for
+ * the %gs-relative addressing used by CPUVAR(...), curcpu(),
+ * and curlwp.
+ *
+ * XXX Is it necessary to set cpu_info_primary.ci_self here?
+ * Isn't it statically initialized in x86/cpu.c?
+ *
+ * XXX Why do we immediately clear the segment registers just
+ * afterward?
+ */
movq $cpu_info_primary,%rdi
movq %rdi,CPU_INFO_SELF(%rdi) /* ci->ci_self = ci */
movq $1,%rsi
Index: src/sys/arch/amd64/amd64/machdep.c
diff -u src/sys/arch/amd64/amd64/machdep.c:1.366 src/sys/arch/amd64/amd64/machdep.c:1.367
--- src/sys/arch/amd64/amd64/machdep.c:1.366 Wed Oct 26 23:38:06 2022
+++ src/sys/arch/amd64/amd64/machdep.c Sun Jul 16 19:55:43 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: machdep.c,v 1.366 2022/10/26 23:38:06 riastradh Exp $ */
+/* $NetBSD: machdep.c,v 1.367 2023/07/16 19:55:43 riastradh Exp $ */
/*
* Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011
@@ -110,7 +110,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.366 2022/10/26 23:38:06 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.367 2023/07/16 19:55:43 riastradh Exp $");
#include "opt_modular.h"
#include "opt_user_ldt.h"
@@ -1721,7 +1721,26 @@ init_x86_64(paddr_t first_avail)
#ifdef SVS
svs_init();
#endif
+
+ /*
+ * Initialize MSRs on cpu0:
+ *
+ * - Enables SYSCALL/SYSRET.
+ *
+ * - Sets up %fs and %gs so that %gs points to the current
+ * struct cpu_info as needed for CPUVAR(...), curcpu(), and
+ * curlwp.
+ *
+ * - Enables the no-execute bit if supported.
+ *
+ * Thus, after this point, CPUVAR(...), curcpu(), and curlwp
+ * will work on cpu0.
+ *
+ * Note: The call to cpu_init_msrs for secondary CPUs happens
+ * in cpu_hatch.
+ */
cpu_init_msrs(&cpu_info_primary, true);
+
#ifndef XENPV
cpu_speculation_init(&cpu_info_primary);
#endif
Index: src/sys/arch/i386/i386/gdt.c
diff -u src/sys/arch/i386/i386/gdt.c:1.73 src/sys/arch/i386/i386/gdt.c:1.74
--- src/sys/arch/i386/i386/gdt.c:1.73 Sat Aug 20 23:48:50 2022
+++ src/sys/arch/i386/i386/gdt.c Sun Jul 16 19:55:43 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: gdt.c,v 1.73 2022/08/20 23:48:50 riastradh Exp $ */
+/* $NetBSD: gdt.c,v 1.74 2023/07/16 19:55:43 riastradh Exp $ */
/*
* Copyright (c) 1996, 1997, 2009 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: gdt.c,v 1.73 2022/08/20 23:48:50 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: gdt.c,v 1.74 2023/07/16 19:55:43 riastradh Exp $");
#include "opt_multiprocessor.h"
#include "opt_xen.h"
@@ -115,8 +115,30 @@ setgdt(int slot, const void *base, size_
#endif
/*
- * Initialize the GDT. We already have a gdtstore, which was temporarily used
- * by the bootstrap code. Now, we allocate a new gdtstore, and put it in cpu0.
+ * gdt_init()
+ *
+ * Create a permanent Global Descriptor Table (GDT) for the
+ * primary CPU. This replaces the second temporary GDT that was
+ * allocated in pmap_bootstrap with pmap_bootstrap_valloc and
+ * pmap_bootstrap_palloc -- which in turn replaced the initial
+ * temporary GDT allocated on the stack early at boot and
+ * initialized with initgdt.
+ *
+ * 1. Allocate permanent space for the primary CPU's GDT with
+ * uvm_km(9).
+ *
+ * 2. Copy the temporary GDT's contents over. See initgdt for the
+ * original initialization; it was copied from the initial
+ * temporary GDT to the second temporary GDT in init386.
+ *
+ * 3. Make sure the GCPU_SEL segment descriptor points to
+ * &cpu_info_primary.
+ *
+ * XXX Is this necessary? It appears to be redundant with
+ * initgdt.
+ *
+ * 4. Load the permanent GDT address into the Global Descriptor
+ * Table Register (GDTR) with LGDT (via gdt_init_cpu).
*/
void
gdt_init(void)
Index: src/sys/arch/i386/i386/locore.S
diff -u src/sys/arch/i386/i386/locore.S:1.195 src/sys/arch/i386/i386/locore.S:1.196
--- src/sys/arch/i386/i386/locore.S:1.195 Sun May 14 09:05:38 2023
+++ src/sys/arch/i386/i386/locore.S Sun Jul 16 19:55:43 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: locore.S,v 1.195 2023/05/14 09:05:38 riastradh Exp $ */
+/* $NetBSD: locore.S,v 1.196 2023/07/16 19:55:43 riastradh Exp $ */
/*
* Copyright-o-rama!
@@ -128,7 +128,7 @@
*/
#include <machine/asm.h>
-__KERNEL_RCSID(0, "$NetBSD: locore.S,v 1.195 2023/05/14 09:05:38 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: locore.S,v 1.196 2023/07/16 19:55:43 riastradh Exp $");
#include "opt_copy_symtab.h"
#include "opt_ddb.h"
@@ -1071,6 +1071,31 @@ begin:
call _C_LABEL(multiboot2_post_reloc)
#endif
+ /*
+ * Initialize a temporary GDT (Global Descriptor Table) on the
+ * stack and make the segment registers use it.
+ *
+ * This creates a segment descriptor for the CPU-local segment
+ * and loads %fs with its segment selector to set up addressing
+ * for %fs. Thus, after this point, CPUVAR(...), curcpu(), and
+ * curlwp will work.
+ *
+ * Later, we will replace this temporary GDT on the stack by a
+ * permanent GDT allocated with uvm_km in gdt_init.
+ *
+ * XXX Intel recommends ensuring the GDT address is aligned on
+ * an 8-byte boundary for performance. Perhaps not an issue
+ * early at boot, but maybe worth doing?
+ *
+ * Intel 64 and IA-32 Architectures Software Developer's
+ * Manual, Volume 3: System Programming Guide, Order
+ * Number 325384, April 2022, Sec. 3.5.1 `Segment
+ * Descriptor Tables', p. 3-15:
+ *
+ * The base address of the GDT should be aligned
+ * on an eight-byte boundary to yield the best
+ * processor performance.
+ */
subl $NGDT*8, %esp /* space for temporary gdt */
pushl %esp
call _C_LABEL(initgdt)
Index: src/sys/arch/i386/i386/machdep.c
diff -u src/sys/arch/i386/i386/machdep.c:1.839 src/sys/arch/i386/i386/machdep.c:1.840
--- src/sys/arch/i386/i386/machdep.c:1.839 Wed Oct 26 23:38:07 2022
+++ src/sys/arch/i386/i386/machdep.c Sun Jul 16 19:55:43 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: machdep.c,v 1.839 2022/10/26 23:38:07 riastradh Exp $ */
+/* $NetBSD: machdep.c,v 1.840 2023/07/16 19:55:43 riastradh Exp $ */
/*
* Copyright (c) 1996, 1997, 1998, 2000, 2004, 2006, 2008, 2009, 2017
@@ -67,7 +67,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.839 2022/10/26 23:38:07 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.840 2023/07/16 19:55:43 riastradh Exp $");
#include "opt_beep.h"
#include "opt_compat_freebsd.h"
@@ -969,6 +969,41 @@ cpu_init_idt(struct cpu_info *ci)
lidt(&region);
}
+/*
+ * initgdt(tgdt)
+ *
+ * Initialize a temporary Global Descriptor Table (GDT) using
+ * storage space at tgdt.
+ *
+ * 1. Set up segment descriptors for our purposes, including a
+ * CPU-local segment descriptor pointing at &cpu_info_primary.
+ *
+ * 2. Load the address into the Global Descriptor Table Register.
+ *
+ * 3. Set up segment selectors for all the segment registers using
+ * it so that %fs-relative addressing works for the CPU-local
+ * data.
+ *
+ * After this point, CPUVAR(...), curcpu(), and curlwp will work.
+ *
+ * Eventually the kernel will switch to a second temporary GDT
+ * allocated with pmap_bootstrap_valloc in pmap_bootstrap, and
+ * then to a permanent GDT allocated with uvm_km(9) in gdt_init.
+ * But the first temporary GDT is needed now to get us going with
+ * early access to curcpu() and curlwp before we enter kernel
+ * main.
+ *
+ * XXX The purpose of each of the segment descriptors should be
+ * written down somewhere in a single place that can be cross-
+ * referenced.
+ *
+ * References:
+ *
+ * - Intel 64 and IA-32 Architectures Software Developer's Manual,
+ * Volume 3: System Programming Guide, Order Number 325384,
+ * April 2022, Sec. 3.5.1 `Segment Descriptor Tables',
+ * pp. 3-14 through 3-16.
+ */
void
initgdt(union descriptor *tgdt)
{
@@ -1165,7 +1200,15 @@ init386(paddr_t first_avail)
uvm_lwp_setuarea(&lwp0, lwp0uarea);
cpu_probe(&cpu_info_primary);
+
+ /*
+ * Initialize the no-execute bit on cpu0, if supported.
+ *
+ * Note: The call to cpu_init_msrs for secondary CPUs happens
+ * in cpu_hatch.
+ */
cpu_init_msrs(&cpu_info_primary, true);
+
#ifndef XENPV
cpu_speculation_init(&cpu_info_primary);
#endif
@@ -1332,7 +1375,25 @@ init386(paddr_t first_avail)
idt_vec_init_cpu_md(iv, cpu_index(&cpu_info_primary));
idt = (idt_descriptor_t *)iv->iv_idt;
-#ifndef XENPV
+#ifndef XENPV
+ /*
+ * Switch from the initial temporary GDT that was allocated on
+ * the stack by our caller, start. That temporary GDT will be
+ * popped off the stack when init386 returns before start calls
+ * main, so we need to use a second temporary GDT allocated in
+ * pmap_bootstrap with pmap_bootstrap_valloc/palloc to make
+ * sure at least the CPU-local data area, used by CPUVAR(...),
+ * curcpu(), and curlwp via %fs-relative addressing, will
+ * continue to work.
+ *
+ * Later, in gdt_init via cpu_startup, we will finally allocate
+ * a permanent GDT with uvm_km(9).
+ *
+ * The content of the second temporary GDT is the same as the
+ * content of the initial GDT, initialized in initgdt, except
+ * for the address of the LDT, since we are also switching to a
+ * new temporary LDT at a new address.
+ */
tgdt = gdtstore;
gdtstore = (union descriptor *)gdt_vaddr;
ldtstore = (union descriptor *)ldt_vaddr;
@@ -1390,10 +1451,22 @@ init386(paddr_t first_avail)
GSEL(GCODE_SEL, SEL_KPL));
#ifndef XENPV
+ /*
+ * Activate the second temporary GDT, allocated in
+ * pmap_bootstrap with pmap_bootstrap_valloc/palloc, and
+ * initialized with the content of the initial temporary GDT in
+ * initgdt, plus an updated LDT.
+ *
+ * This ensures the %fs-relative addressing for the CPU-local
+ * area used by CPUVAR(...), curcpu(), and curlwp will continue
+ * to work after init386 returns and the initial temporary GDT
+ * is popped off, before we call main and later create a
+ * permanent GDT in gdt_init via cpu_startup.
+ */
setregion(&region, gdtstore, NGDT * sizeof(gdtstore[0]) - 1);
lgdt(&region);
#endif
-
+
lldt(GSEL(GLDT_SEL, SEL_KPL));
cpu_init_idt(&cpu_info_primary);
Index: src/sys/arch/x86/x86/cpu.c
diff -u src/sys/arch/x86/x86/cpu.c:1.208 src/sys/arch/x86/x86/cpu.c:1.209
--- src/sys/arch/x86/x86/cpu.c:1.208 Fri Mar 3 14:40:00 2023
+++ src/sys/arch/x86/x86/cpu.c Sun Jul 16 19:55:43 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu.c,v 1.208 2023/03/03 14:40:00 riastradh Exp $ */
+/* $NetBSD: cpu.c,v 1.209 2023/07/16 19:55:43 riastradh Exp $ */
/*
* Copyright (c) 2000-2020 NetBSD Foundation, Inc.
@@ -62,7 +62,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.208 2023/03/03 14:40:00 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.209 2023/07/16 19:55:43 riastradh Exp $");
#include "opt_ddb.h"
#include "opt_mpbios.h" /* for MPDEBUG */
@@ -919,7 +919,29 @@ cpu_hatch(void *v)
* prevent a race against cpu0. See sys/conf/ssp.mk.
*/
+ /*
+ * Initialize MSRs on this CPU:
+ *
+ * - On amd64: Enables SYSCALL/SYSRET.
+ *
+ * - On amd64: Sets up %fs and %gs so that %gs points to the
+ * current struct cpu_info as needed for CPUVAR(...),
+ * curcpu(), and curlwp.
+ *
+ * (On i386, CPUVAR(...), curcpu(), and curlwp are made to
+ * work first by the configuration of segment descriptors in
+ * the Global Descriptor Table (GDT) in initgdt.)
+ *
+ * - Enables the no-execute bit if supported.
+ *
+ * Thus, after this point, CPUVAR(...), curcpu(), and curlwp
+ * will work on this CPU.
+ *
+ * Note: The call to cpu_init_msrs for cpu0 happens in
+ * init386/init_x86_64.
+ */
cpu_init_msrs(ci, true);
+
cpu_probe(ci);
cpu_speculation_init(ci);
#if NHYPERV > 0
@@ -1197,10 +1219,55 @@ typedef void (vector)(void);
extern vector Xsyscall, Xsyscall32, Xsyscall_svs;
#endif
+/*
+ * cpu_init_msrs(ci, full)
+ *
+ * Initialize some Model-Specific Registers (MSRs) on the current
+ * CPU, whose struct cpu_info pointer is ci, for:
+ *
+ * - SYSCALL/SYSRET.
+ * - %fs/%gs on amd64 if `full' is true; needed to make
+ * CPUVAR(...), curcpu(), and curlwp work. (We do this at boot,
+ * but skip it on ACPI wakeup.)
+ * - No-execute bit, if supported.
+ *
+ * References:
+ *
+ * - Intel 64 and IA-32 Architectures Software Developer's Manual,
+ * Volume 3: System Programming Guide, Order Number 325384,
+ * April 2022, Sec. 5.8.8 `Fast System Calls in 64-Bit Mode',
+ * pp. 5-22 through 5-23.
+ *
+ * - Intel 64 and IA-32 Architectures Software Developer's Manual,
+ * Volume 4: Model-Specific Registers, Order Number 335592,
+ * April 2022, Sec. 2.1 `Architectural MSRs', Table 2-2,
+ * pp. 2-60 through 2-61.
+ */
void
cpu_init_msrs(struct cpu_info *ci, bool full)
{
#ifdef __x86_64__
+ /*
+ * On amd64, set up the syscall target address registers
+ * for SYSCALL/SYSRET:
+ *
+ * - IA32_STAR, c000_0081h (MSR_STAR): System Call Target
+ * Address. Code and stack segment selectors for SYSRET
+ * (bits 48:63) and SYSCALL (bits 32:47).
+ *
+ * - IA32_LSTAR, c000_0082h (MSR_LSTAR): IA-32e Mode System
+ * Call Target Address. Target rip for SYSCALL when executed
+ * in 64-bit mode.
+ *
+ * - IA32_CSTAR, c000_0083h (MSR_CSTAR): IA-32e Mode System
+ * Call Target Address. Target rip for SYSCALL when executed
+ * in compatibility mode. (XXX Manual says this is `[n]ot
+ * used, as the SYSCALL instruction is not recognized in
+ * compatibility mode', so why do we set it?)
+ *
+ * - IA32_FMASK, c000_0084h (MSR_SFMASK): System Call Flag
+ * Mask. Mask for the RFLAGS register on SYSCALL.
+ */
wrmsr(MSR_STAR,
((uint64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
((uint64_t)LSEL(LSYSRETBASE_SEL, SEL_UPL) << 48));
@@ -1213,6 +1280,22 @@ cpu_init_msrs(struct cpu_info *ci, bool
wrmsr(MSR_LSTAR, (uint64_t)Xsyscall_svs);
#endif
+ /*
+ * On amd64 if `full' is true -- used at boot, but not on ACPI
+ * wakeup -- then additionally set up %fs and %gs:
+ *
+ * - IA32_FS_BASE, c000_0100h (MSR_FSBASE): Base address of
+ * %fs. Not used in NetBSD kernel, so zero it.
+ *
+ * - IA32_GS_BASE, c000_0101h (MSR_GSBASE): Base address of
+ * %gs. Used in NetBSD kernel by CPUVAR(...), curcpu(), and
+ * curlwp for access to the CPU-local area, so set it to ci.
+ *
+ * - IA32_KERNEL_GS_BASE, c000_0102h (MSR_KERNELGSBASE): Base
+ * address of what swapgs will leave in %gs when switching to
+ * userland. Zero for now; will be set to pcb->pcb_gs in
+ * cpu_switchto for user threads.
+ */
if (full) {
wrmsr(MSR_FSBASE, 0);
wrmsr(MSR_GSBASE, (uint64_t)ci);
@@ -1220,6 +1303,12 @@ cpu_init_msrs(struct cpu_info *ci, bool
}
#endif /* __x86_64__ */
+ /*
+ * If the no-execute bit is supported, enable it in:
+ *
+ * - IA32_EFER, c000_0080h (MSR_EFER): Extended Feature
+ * Enables.
+ */
if (cpu_feature[2] & CPUID_NOX)
wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NXE);
}
Index: src/sys/arch/x86/x86/pmap.c
diff -u src/sys/arch/x86/x86/pmap.c:1.423 src/sys/arch/x86/x86/pmap.c:1.424
--- src/sys/arch/x86/x86/pmap.c:1.423 Sat Sep 24 11:05:47 2022
+++ src/sys/arch/x86/x86/pmap.c Sun Jul 16 19:55:43 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: pmap.c,v 1.423 2022/09/24 11:05:47 riastradh Exp $ */
+/* $NetBSD: pmap.c,v 1.424 2023/07/16 19:55:43 riastradh Exp $ */
/*
* Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
@@ -130,7 +130,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.423 2022/09/24 11:05:47 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.424 2023/07/16 19:55:43 riastradh Exp $");
#include "opt_user_ldt.h"
#include "opt_lockdebug.h"
@@ -1351,7 +1351,19 @@ pmap_bootstrap(vaddr_t kva_start)
#endif
/*
- * Allocate space for the IDT, GDT and LDT.
+ * Allocate space for the Interrupt Descriptor Table (IDT),
+ * Global Descriptor Table (GDT), and Local Descriptor Table
+ * (LDT).
+ *
+ * Currently there is an initial temporary GDT allocated on the
+ * stack by the caller of init386/init_x86_64, which is (among
+ * other things) needed on i386 for %fs-relative addressing for
+ * CPU-local data (CPUVAR(...), curcpu(), curlwp). This
+ * initial temporary GDT will be popped off the stack before we
+ * can enter main, so we need to make sure there is space for a
+ * second temporary GDT to continue existing when we enter main
+ * before we allocate space for the permanent GDT with
+ * uvm_km(9) in gdt_init via cpu_startup and switch to that.
*/
idt_vaddr = pmap_bootstrap_valloc(1);
idt_paddr = pmap_bootstrap_palloc(1);