Module Name: src Committed By: riastradh Date: Sun Jul 16 19:55:44 UTC 2023
Modified Files: src/sys/arch/amd64/amd64: locore.S machdep.c src/sys/arch/i386/i386: gdt.c locore.S machdep.c src/sys/arch/x86/x86: cpu.c pmap.c Log Message: x86: Sprinkle extensive commentary about %fs/%gs initialization. Plus some other side quests like the three-stage GDT metamorphosis lifecycle. No functional change intended. To generate a diff of this commit: cvs rdiff -u -r1.222 -r1.223 src/sys/arch/amd64/amd64/locore.S cvs rdiff -u -r1.366 -r1.367 src/sys/arch/amd64/amd64/machdep.c cvs rdiff -u -r1.73 -r1.74 src/sys/arch/i386/i386/gdt.c cvs rdiff -u -r1.195 -r1.196 src/sys/arch/i386/i386/locore.S cvs rdiff -u -r1.839 -r1.840 src/sys/arch/i386/i386/machdep.c cvs rdiff -u -r1.208 -r1.209 src/sys/arch/x86/x86/cpu.c cvs rdiff -u -r1.423 -r1.424 src/sys/arch/x86/x86/pmap.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/sys/arch/amd64/amd64/locore.S diff -u src/sys/arch/amd64/amd64/locore.S:1.222 src/sys/arch/amd64/amd64/locore.S:1.223 --- src/sys/arch/amd64/amd64/locore.S:1.222 Sat Jun 24 05:31:04 2023 +++ src/sys/arch/amd64/amd64/locore.S Sun Jul 16 19:55:43 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: locore.S,v 1.222 2023/06/24 05:31:04 msaitoh Exp $ */ +/* $NetBSD: locore.S,v 1.223 2023/07/16 19:55:43 riastradh Exp $ */ /* * Copyright-o-rama! @@ -974,6 +974,20 @@ longmode_hi: movl $VM_GUEST_XENPV, _C_LABEL(vm_guest) + /* + * Initialize cpu_info_primary.ci_self := &cpu_info_primary, + * and initialize some MSRs with + * cpu_init_msrs(&cpu_info_primary, full=true). This sets up + * SYSCALL/SYSRET (XXX why?) and %fs/%gs, which is needed for + * the %gs-relative addressing used by CPUVAR(...), curcpu(), + * and curlwp. + * + * XXX Is it necessary to set cpu_info_primary.ci_self here? + * Isn't it statically initialized in x86/cpu.c? + * + * XXX Why do we immediately clear the segment registers just + * afterward? 
+ */ movq $cpu_info_primary,%rdi movq %rdi,CPU_INFO_SELF(%rdi) /* ci->ci_self = ci */ movq $1,%rsi Index: src/sys/arch/amd64/amd64/machdep.c diff -u src/sys/arch/amd64/amd64/machdep.c:1.366 src/sys/arch/amd64/amd64/machdep.c:1.367 --- src/sys/arch/amd64/amd64/machdep.c:1.366 Wed Oct 26 23:38:06 2022 +++ src/sys/arch/amd64/amd64/machdep.c Sun Jul 16 19:55:43 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: machdep.c,v 1.366 2022/10/26 23:38:06 riastradh Exp $ */ +/* $NetBSD: machdep.c,v 1.367 2023/07/16 19:55:43 riastradh Exp $ */ /* * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011 @@ -110,7 +110,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.366 2022/10/26 23:38:06 riastradh Exp $"); +__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.367 2023/07/16 19:55:43 riastradh Exp $"); #include "opt_modular.h" #include "opt_user_ldt.h" @@ -1721,7 +1721,26 @@ init_x86_64(paddr_t first_avail) #ifdef SVS svs_init(); #endif + + /* + * Initialize MSRs on cpu0: + * + * - Enables SYSCALL/SYSRET. + * + * - Sets up %fs and %gs so that %gs points to the current + * struct cpu_info as needed for CPUVAR(...), curcpu(), and + * curlwp. + * + * - Enables the no-execute bit if supported. + * + * Thus, after this point, CPUVAR(...), curcpu(), and curlwp + * will work on cpu0. + * + * Note: The call to cpu_init_msrs for secondary CPUs happens + * in cpu_hatch. + */ cpu_init_msrs(&cpu_info_primary, true); + #ifndef XENPV cpu_speculation_init(&cpu_info_primary); #endif Index: src/sys/arch/i386/i386/gdt.c diff -u src/sys/arch/i386/i386/gdt.c:1.73 src/sys/arch/i386/i386/gdt.c:1.74 --- src/sys/arch/i386/i386/gdt.c:1.73 Sat Aug 20 23:48:50 2022 +++ src/sys/arch/i386/i386/gdt.c Sun Jul 16 19:55:43 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: gdt.c,v 1.73 2022/08/20 23:48:50 riastradh Exp $ */ +/* $NetBSD: gdt.c,v 1.74 2023/07/16 19:55:43 riastradh Exp $ */ /* * Copyright (c) 1996, 1997, 2009 The NetBSD Foundation, Inc. 
@@ -30,7 +30,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: gdt.c,v 1.73 2022/08/20 23:48:50 riastradh Exp $"); +__KERNEL_RCSID(0, "$NetBSD: gdt.c,v 1.74 2023/07/16 19:55:43 riastradh Exp $"); #include "opt_multiprocessor.h" #include "opt_xen.h" @@ -115,8 +115,30 @@ setgdt(int slot, const void *base, size_ #endif /* - * Initialize the GDT. We already have a gdtstore, which was temporarily used - * by the bootstrap code. Now, we allocate a new gdtstore, and put it in cpu0. + * gdt_init() + * + * Create a permanent Global Descriptor Table (GDT) for the + * primary CPU. This replaces the second temporary GDT that was + * allocated in pmap_bootstrap with pmap_bootstrap_valloc and + * pmap_bootstrap_palloc -- which in turn replaced the initial + * temporary GDT allocated on the stack early at boot and + * initialized with initgdt. + * + * 1. Allocate permanent space for the primary CPU's GDT with + * uvm_km(9). + * + * 2. Copy the temporary GDT's contents over. See initgdt for the + * original initialization; it was copied from the initial + * temporary GDT to the second temporary GDT in init386. + * + * 3. Make sure the GCPU_SEL segment descriptor points to + * &cpu_info_primary. + * + * XXX Is this necessary? It appears to be redundant with + * initgdt. + * + * 4. Load the permanent GDT address into the Global Descriptor + * Table Register (GDTR) with LGDT (via gdt_init_cpu). */ void gdt_init(void) Index: src/sys/arch/i386/i386/locore.S diff -u src/sys/arch/i386/i386/locore.S:1.195 src/sys/arch/i386/i386/locore.S:1.196 --- src/sys/arch/i386/i386/locore.S:1.195 Sun May 14 09:05:38 2023 +++ src/sys/arch/i386/i386/locore.S Sun Jul 16 19:55:43 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: locore.S,v 1.195 2023/05/14 09:05:38 riastradh Exp $ */ +/* $NetBSD: locore.S,v 1.196 2023/07/16 19:55:43 riastradh Exp $ */ /* * Copyright-o-rama! 
@@ -128,7 +128,7 @@ */ #include <machine/asm.h> -__KERNEL_RCSID(0, "$NetBSD: locore.S,v 1.195 2023/05/14 09:05:38 riastradh Exp $"); +__KERNEL_RCSID(0, "$NetBSD: locore.S,v 1.196 2023/07/16 19:55:43 riastradh Exp $"); #include "opt_copy_symtab.h" #include "opt_ddb.h" @@ -1071,6 +1071,31 @@ begin: call _C_LABEL(multiboot2_post_reloc) #endif + /* + * Initialize a temporary GDT (Global Descriptor Table) on the + * stack and make the segment registers use it. + * + * This creates a segment descriptor for the CPU-local segment + * and loads %fs with its segment selector to set up addressing + * for %fs. Thus, after this point, CPUVAR(...), curcpu(), and + * curlwp will work. + * + * Later, we will replace this temporary GDT on the stack by a + * permanent GDT allocated with uvm_km in gdt_init. + * + * XXX Intel recommends ensuring the GDT address is aligned on + * an 8-byte boundary for performance. Perhaps not an issue + * early at boot, but maybe worth doing? + * + * Intel 64 and IA-32 Architectures Software Developer's + * Manual, Volume 3: System Programming Guide, Order + * Number 325384, April 2022, Sec. 3.5.1 `Segment + * Descriptor Tables', p. 3-15: + * + * The base address of the GDT should be aligned + * on an eight-byte boundary to yield the best + * processor performance. 
+ */ subl $NGDT*8, %esp /* space for temporary gdt */ pushl %esp call _C_LABEL(initgdt) Index: src/sys/arch/i386/i386/machdep.c diff -u src/sys/arch/i386/i386/machdep.c:1.839 src/sys/arch/i386/i386/machdep.c:1.840 --- src/sys/arch/i386/i386/machdep.c:1.839 Wed Oct 26 23:38:07 2022 +++ src/sys/arch/i386/i386/machdep.c Sun Jul 16 19:55:43 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: machdep.c,v 1.839 2022/10/26 23:38:07 riastradh Exp $ */ +/* $NetBSD: machdep.c,v 1.840 2023/07/16 19:55:43 riastradh Exp $ */ /* * Copyright (c) 1996, 1997, 1998, 2000, 2004, 2006, 2008, 2009, 2017 @@ -67,7 +67,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.839 2022/10/26 23:38:07 riastradh Exp $"); +__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.840 2023/07/16 19:55:43 riastradh Exp $"); #include "opt_beep.h" #include "opt_compat_freebsd.h" @@ -969,6 +969,41 @@ cpu_init_idt(struct cpu_info *ci) lidt(&region); } +/* + * initgdt(tgdt) + * + * Initialize a temporary Global Descriptor Table (GDT) using + * storage space at tgdt. + * + * 1. Set up segment descriptors for our purposes, including a + * CPU-local segment descriptor pointing at &cpu_info_primary. + * + * 2. Load the address into the Global Descriptor Table Register. + * + * 3. Set up segment selectors for all the segment registers using + * it so that %fs-relative addressing works for the CPU-local + * data. + * + * After this point, CPUVAR(...), curcpu(), and curlwp will work. + * + * Eventually the kernel will switch to a second temporary GDT + * allocated with pmap_bootstrap_valloc in pmap_bootstrap, and + * then to a permanent GDT allocated with uvm_km(9) in gdt_init. + * But the first temporary GDT is needed now to get us going with + * early access to curcpu() and curlwp before we enter kernel + * main. + * + * XXX The purpose of each of the segment descriptors should be + * written down somewhere in a single place that can be cross- + * referenced. 
+ * + * References: + * + * - Intel 64 and IA-32 Architectures Software Developer's Manual, + * Volume 3: System Programming Guide, Order Number 325384, + * April 2022, Sec. 3.5.1 `Segment Descriptor Tables', + * pp. 3-14 through 3-16. + */ void initgdt(union descriptor *tgdt) { @@ -1165,7 +1200,15 @@ init386(paddr_t first_avail) uvm_lwp_setuarea(&lwp0, lwp0uarea); cpu_probe(&cpu_info_primary); + + /* + * Initialize the no-execute bit on cpu0, if supported. + * + * Note: The call to cpu_init_msrs for secondary CPUs happens + * in cpu_hatch. + */ cpu_init_msrs(&cpu_info_primary, true); + #ifndef XENPV cpu_speculation_init(&cpu_info_primary); #endif @@ -1332,7 +1375,25 @@ init386(paddr_t first_avail) idt_vec_init_cpu_md(iv, cpu_index(&cpu_info_primary)); idt = (idt_descriptor_t *)iv->iv_idt; -#ifndef XENPV +#ifndef XENPV + /* + * Switch from the initial temporary GDT that was allocated on + * the stack by our caller, start. That temporary GDT will be + * popped off the stack when init386 returns before start calls + * main, so we need to use a second temporary GDT allocated in + * pmap_bootstrap with pmap_bootstrap_valloc/palloc to make + * sure at least the CPU-local data area, used by CPUVAR(...), + * curcpu(), and curlwp via %fs-relative addressing, will + * continue to work. + * + * Later, in gdt_init via cpu_startup, we will finally allocate + * a permanent GDT with uvm_km(9). + * + * The content of the second temporary GDT is the same as the + * content of the initial GDT, initialized in initgdt, except + * for the address of the LDT, which differs because we are + * also switching to a new temporary LDT at a new address. 
+ */ tgdt = gdtstore; gdtstore = (union descriptor *)gdt_vaddr; ldtstore = (union descriptor *)ldt_vaddr; @@ -1390,10 +1451,22 @@ init386(paddr_t first_avail) GSEL(GCODE_SEL, SEL_KPL)); #ifndef XENPV + /* + * Activate the second temporary GDT, allocated in + * pmap_bootstrap with pmap_bootstrap_valloc/palloc, and + * initialized with the content of the initial temporary GDT in + * initgdt, plus an updated LDT. + * + * This ensures the %fs-relative addressing for the CPU-local + * area used by CPUVAR(...), curcpu(), and curlwp will continue + * to work after init386 returns and the initial temporary GDT + * is popped off, before we call main and later create a + * permanent GDT in gdt_init via cpu_startup. + */ setregion(&region, gdtstore, NGDT * sizeof(gdtstore[0]) - 1); lgdt(&region); #endif - + lldt(GSEL(GLDT_SEL, SEL_KPL)); cpu_init_idt(&cpu_info_primary); Index: src/sys/arch/x86/x86/cpu.c diff -u src/sys/arch/x86/x86/cpu.c:1.208 src/sys/arch/x86/x86/cpu.c:1.209 --- src/sys/arch/x86/x86/cpu.c:1.208 Fri Mar 3 14:40:00 2023 +++ src/sys/arch/x86/x86/cpu.c Sun Jul 16 19:55:43 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: cpu.c,v 1.208 2023/03/03 14:40:00 riastradh Exp $ */ +/* $NetBSD: cpu.c,v 1.209 2023/07/16 19:55:43 riastradh Exp $ */ /* * Copyright (c) 2000-2020 NetBSD Foundation, Inc. @@ -62,7 +62,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.208 2023/03/03 14:40:00 riastradh Exp $"); +__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.209 2023/07/16 19:55:43 riastradh Exp $"); #include "opt_ddb.h" #include "opt_mpbios.h" /* for MPDEBUG */ @@ -919,7 +919,29 @@ cpu_hatch(void *v) * prevent a race against cpu0. See sys/conf/ssp.mk. */ + /* + * Initialize MSRs on this CPU: + * + * - On amd64: Enables SYSCALL/SYSRET. + * + * - On amd64: Sets up %fs and %gs so that %gs points to the + * current struct cpu_info as needed for CPUVAR(...), + * curcpu(), and curlwp. 
+ * + * (On i386, CPUVAR(...), curcpu(), and curlwp are made to + * work first by the configuration of segment descriptors in + * the Global Descriptor Table (GDT) in initgdt.) + * + * - Enables the no-execute bit if supported. + * + * Thus, after this point, CPUVAR(...), curcpu(), and curlwp + * will work on this CPU. + * + * Note: The call to cpu_init_msrs for cpu0 happens in + * init386/init_x86_64. + */ cpu_init_msrs(ci, true); + cpu_probe(ci); cpu_speculation_init(ci); #if NHYPERV > 0 @@ -1197,10 +1219,55 @@ typedef void (vector)(void); extern vector Xsyscall, Xsyscall32, Xsyscall_svs; #endif +/* + * cpu_init_msrs(ci, full) + * + * Initialize some Model-Specific Registers (MSRs) on the current + * CPU, whose struct cpu_info pointer is ci, for: + * + * - SYSCALL/SYSRET. + * - %fs/%gs on amd64 if `full' is true; needed to make + * CPUVAR(...), curcpu(), and curlwp work. (We do this at boot, + * but skip it on ACPI wakeup.) + * - No-execute bit, if supported. + * + * References: + * + * - Intel 64 and IA-32 Architectures Software Developer's Manual, + * Volume 3: System Programming Guide, Order Number 325384, + * April 2022, Sec. 5.8.8 `Fast System Calls in 64-Bit Mode', + * pp. 5-22 through 5-23. + * + * - Intel 64 and IA-32 Architectures Software Developer's Manual, + * Volume 4: Model-Specific Registers, Order Number 335592, + * April 2022, Sec. 2.1 `Architectural MSRs', Table 2-2, + * pp. 2-60 through 2-61. + */ void cpu_init_msrs(struct cpu_info *ci, bool full) { #ifdef __x86_64__ + /* + * On amd64, set up the syscall target address registers + * for SYSCALL/SYSRET: + * + * - IA32_STAR, c000_0081h (MSR_STAR): System Call Target + * Address. Code and stack segment selectors for SYSRET + * (bits 48:63) and SYSCALL (bits 32:47). + * + * - IA32_LSTAR, c000_0082h (MSR_LSTAR): IA-32e Mode System + * Call Target Address. Target rip for SYSCALL when executed + * in 64-bit mode. + * + * - IA32_CSTAR, c000_0083h (MSR_CSTAR): IA-32e Mode System + * Call Target Address. 
Target rip for SYSCALL when executed + * in compatibility mode. (XXX Manual says this is `[n]ot + * used, as the SYSCALL instruction is not recognized in + * compatibility mode', so why do we set it?) + * + * - IA32_FMASK, c000_0084h (MSR_SFMASK): System Call Flag + * Mask. Mask for the RFLAGS register on SYSCALL. + */ wrmsr(MSR_STAR, ((uint64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((uint64_t)LSEL(LSYSRETBASE_SEL, SEL_UPL) << 48)); @@ -1213,6 +1280,22 @@ cpu_init_msrs(struct cpu_info *ci, bool wrmsr(MSR_LSTAR, (uint64_t)Xsyscall_svs); #endif + /* + * On amd64 if `full' is true -- used at boot, but not on ACPI + * wakeup -- then additionally set up %fs and %gs: + * + * - IA32_FS_BASE, c000_0100h (MSR_FSBASE): Base address of + * %fs. Not used in NetBSD kernel, so zero it. + * + * - IA32_GS_BASE, c000_0101h (MSR_GSBASE): Base address of + * %gs. Used in NetBSD kernel by CPUVAR(...), curcpu(), and + * curlwp for access to the CPU-local area, so set it to ci. + * + * - IA32_KERNEL_GS_BASE, c000_0102h (MSR_KERNELGSBASE): Base + * address of what swapgs will leave in %gs when switching to + * userland. Zero for now; will be set to pcb->pcb_gs in + * cpu_switchto for user threads. + */ if (full) { wrmsr(MSR_FSBASE, 0); wrmsr(MSR_GSBASE, (uint64_t)ci); @@ -1220,6 +1303,12 @@ cpu_init_msrs(struct cpu_info *ci, bool } #endif /* __x86_64__ */ + /* + * If the no-execute bit is supported, enable it in: + * + * - IA32_EFER, c000_0080h (MSR_EFER): Extended Feature + * Enables. 
+ */ if (cpu_feature[2] & CPUID_NOX) wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NXE); } Index: src/sys/arch/x86/x86/pmap.c diff -u src/sys/arch/x86/x86/pmap.c:1.423 src/sys/arch/x86/x86/pmap.c:1.424 --- src/sys/arch/x86/x86/pmap.c:1.423 Sat Sep 24 11:05:47 2022 +++ src/sys/arch/x86/x86/pmap.c Sun Jul 16 19:55:43 2023 @@ -1,4 +1,4 @@ -/* $NetBSD: pmap.c,v 1.423 2022/09/24 11:05:47 riastradh Exp $ */ +/* $NetBSD: pmap.c,v 1.424 2023/07/16 19:55:43 riastradh Exp $ */ /* * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc. @@ -130,7 +130,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.423 2022/09/24 11:05:47 riastradh Exp $"); +__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.424 2023/07/16 19:55:43 riastradh Exp $"); #include "opt_user_ldt.h" #include "opt_lockdebug.h" @@ -1351,7 +1351,19 @@ pmap_bootstrap(vaddr_t kva_start) #endif /* - * Allocate space for the IDT, GDT and LDT. + * Allocate space for the Interrupt Descriptor Table (IDT), + * Global Descriptor Table (GDT), and Local Descriptor Table + * (LDT). + * + * Currently there is an initial temporary GDT allocated on the + * stack by the caller of init386/init_x86_64, which is (among + * other things) needed on i386 for %fs-relative addressing for + * CPU-local data (CPUVAR(...), curcpu(), curlwp). This + * initial temporary GDT will be popped off the stack before we + * can enter main, so we need to make sure there is space for a + * second temporary GDT to continue existing when we enter main + * before we allocate space for the permanent GDT with + * uvm_km(9) in gdt_init via cpu_startup and switch to that. */ idt_vaddr = pmap_bootstrap_valloc(1); idt_paddr = pmap_bootstrap_palloc(1);