On 12/11/2012 04:27 PM, Yinghai Lu wrote:
On Tue, Dec 11, 2012 at 3:57 PM, H. Peter Anvin <h...@zytor.com> wrote:
Well, we could invoke it on the bootloader page tables, but as you say
it may not be a good idea... depending on how much memory we may be
talking about.  One solution -- which I have to admit is starting to
sound really good -- is to set up a #PF handler which cycles through a
set of page tables and creates a "virtual identity map"... it does have
the advantage of making the entire physical address space available
without any additional funnies.

So that #PF handler will be working before
arch/x86/kernel/setup.c::setup_arch()/early_trap_init()?

early_trap_init() will then install another handler there for #PF.

For 64-bit, moving early_ioremap_init() ahead is very simple, like the attached patch.

But for 32-bit it looks like it is not that easy.


Here is an incomplete patch, for illustration purposes only, of what I mean by an early-mapping #PF handler. It creates a set of transient page tables on demand, which lets us access memory as if it were all mapped, while using only O(1) storage. The replacement policy is trivial: if we run out, we start over from scratch.
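
(To make the replacement policy concrete, here is a minimal standalone C sketch of the idea -- a fixed pool of page-table pages handed out on demand and thrown away wholesale once exhausted. Names and sizes here are illustrative only; the real code is in the head64.c hunk below.)

#define EARLY_PGT_PAGES	64			/* illustrative pool size */

static unsigned long pool[EARLY_PGT_PAGES][512];	/* one page table per slot */
static unsigned int next_pgt;

static void reset_pool(void)
{
	/* Throw every transient mapping away; later faults rebuild them lazily. */
	next_pgt = 0;
	/* ...the real code also clears the PGD entries and flushes the TLB here... */
}

static unsigned long *alloc_pgt(void)
{
	if (next_pgt >= EARLY_PGT_PAGES)
		reset_pool();			/* trivial policy: start over from scratch */
	return pool[next_pgt++];
}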

The "identity page tables" used during the transition to high virtual addresses are kind of magic; there is a bunch of extra aliases created, but the way it is done guarantees that the range we actually cares about is mapped correctly. The aliases don't matter and get scrubbed shortly thereafter anyway.

This should, obviously, be used on native only -- in particular Xen should instead rely on the initial page tables provided by the domain builder, which should map all physical memory.

Once the proper memory-map-aware page tables are built, we should turn this off by swapping to the newly built real init_level4_pgt instead.
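
(The switch-off itself should be nothing more than a CR3 reload once the real tables have been built -- roughly the following sketch; the exact call site is still to be decided:)

	/* Sketch: retire the early #PF-driven page tables once the real
	 * init_level4_pgt has been populated from the memory map. */
	write_cr3(__pa_symbol(init_level4_pgt));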

        -hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 766ea16..2d88344 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_PGTABLE_64_DEFS_H
 #define _ASM_X86_PGTABLE_64_DEFS_H
 
+#include <asm/sparsemem.h>
+
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 
@@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t;
 #define MODULES_END      _AC(0xffffffffff000000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
 
+#define EARLY_DYNAMIC_PAGE_TABLES      64
+
 #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 037df57..9443c77 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,11 +26,73 @@
 #include <asm/e820.h>
 #include <asm/bios_ebda.h>
 
-static void __init zap_identity_mappings(void)
+/*
+ * Manage page tables very early on.
+ */
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt = 2, early_pgt_resets = 0;
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
 {
-       pgd_t *pgd = pgd_offset_k(0UL);
-       pgd_clear(pgd);
-       __flush_tlb_all();
+       unsigned long i;
+
+       for (i = 0; i < PTRS_PER_PGD-1; i++)
+               early_level4_pgt[i].pgd = 0;
+
+       next_early_pgt = 0;
+       early_pgt_resets++;
+
+       __native_flush_tlb();
+}
+
+/* Create a new PMD entry */
+int __init early_make_pgtable(unsigned long address)
+{
+       unsigned long physaddr = address - __PAGE_OFFSET;
+       unsigned long i;
+       pgdval_t pgd, *pgd_p;
+       pudval_t *pud_p;
+       pmdval_t pmd, *pmd_p;
+
+       if (physaddr >= MAXMEM)
+               return -1;      /* Invalid address - puke */
+
+       i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
+       pgd_p = &early_level4_pgt[i].pgd;
+       pgd = *pgd_p;
+
+       /*
+        * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+        * critical -- __PAGE_OFFSET would point us back into the dynamic
+        * range and we might end up looping forever...
+        */
+       if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) {
+               pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map);
+       } else {
+               if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1)
+                       reset_early_page_tables();
+
+               pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+               for (i = 0; i < PTRS_PER_PUD; i++)
+                       pud_p[i] = 0;
+
+               *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + _KERNPG_TABLE;
+       }
+       i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+       pud_p += i;
+
+       pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+       pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
+       for (i = 0; i < PTRS_PER_PMD; i++) {
+               pmd_p[i] = pmd;
+               pmd += PMD_SIZE;
+       }
+
+       *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + _KERNPG_TABLE;
+
+       return 0;
 }
 
 /* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -70,12 +132,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
                                (__START_KERNEL & PGDIR_MASK)));
        BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
 
+       /* Kill off the identity-map trampoline */
+       reset_early_page_tables();
+
        /* clear bss before set_intr_gate with early_idt_handler */
        clear_bss();
 
-       /* Make NULL pointers segfault */
-       zap_identity_mappings();
-
+       /* XXX - this is wrong... we need to build page tables from scratch */
        max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 
        for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 94bf9cc..e13ff91 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
        .code64
        .globl startup_64
 startup_64:
-
        /*
         * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
         * and someone has loaded an identity mapped page table
         * for us.  These identity mapped page tables map all of the
         * kernel pages and possibly all of memory.
         *
-        * %esi holds a physical pointer to real_mode_data.
+        * %rsi holds a physical pointer to real_mode_data.
         *
         * We come here either directly from a 64bit bootloader, or from
         * arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
         * tables and then reload them.
         */
 
-       /* Compute the delta between the address I am compiled to run at and the
+       /*
+        * Compute the delta between the address I am compiled to run at and the
         * address I am actually running at.
         */
        leaq    _text(%rip), %rbp
@@ -78,53 +78,66 @@ startup_64:
        testl   %eax, %eax
        jnz     bad_address
 
-       /* Is the address too large? */
-       leaq    _text(%rip), %rdx
-       movq    $PGDIR_SIZE, %rax
-       cmpq    %rax, %rdx
-       jae     bad_address
-
-       /* Fixup the physical addresses in the page table
+       /*
+        * Is the address too large?
         */
-       addq    %rbp, init_level4_pgt + 0(%rip)
-       addq    %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
-       addq    %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
+       leaq    _text(%rip), %rax
+       shrq    $MAX_PHYSMEM_BITS, %rax
+       jnz     bad_address
 
-       addq    %rbp, level3_ident_pgt + 0(%rip)
+       /*
+        * Fixup the physical addresses in the page table
+        */
+       addq    %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
 
        addq    %rbp, level3_kernel_pgt + (510*8)(%rip)
        addq    %rbp, level3_kernel_pgt + (511*8)(%rip)
 
        addq    %rbp, level2_fixmap_pgt + (506*8)(%rip)
 
-       /* Add an Identity mapping if I am above 1G */
+       /*
+        * Set up the identity mapping for the switchover.  These
+        * entries should *NOT* have the global bit set!  This also
+        * creates a bunch of nonsense entries but that is fine --
+        * it avoids problems around wraparound.
+        */
        leaq    _text(%rip), %rdi
-       andq    $PMD_PAGE_MASK, %rdi
+       leaq    early_level4_pgt(%rip), %rbx
 
        movq    %rdi, %rax
-       shrq    $PUD_SHIFT, %rax
-       andq    $(PTRS_PER_PUD - 1), %rax
-       jz      ident_complete
+       shrq    $PGDIR_SHIFT, %rax
+
+       leaq    (4096 + _KERNPG_TABLE)(%rbx), %rdx
+       movq    %rdx, 0(%rbx,%rax,8)
+       movq    %rdx, 8(%rbx,%rax,8)
 
-       leaq    (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
-       leaq    level3_ident_pgt(%rip), %rbx
-       movq    %rdx, 0(%rbx, %rax, 8)
+       addq    $4096, %rdx
+       movq    %rdi, %rax
+       shrq    $PUD_SHIFT, %rax
+       andl    $(PTRS_PER_PUD-1), %eax
+       movq    %rdx, (4096+0)(%rbx,%rax,8)
+       movq    %rdx, (4096+8)(%rbx,%rax,8)
 
+       addq    $8192, %rbx
        movq    %rdi, %rax
-       shrq    $PMD_SHIFT, %rax
-       andq    $(PTRS_PER_PMD - 1), %rax
-       leaq    __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
-       leaq    level2_spare_pgt(%rip), %rbx
-       movq    %rdx, 0(%rbx, %rax, 8)
-ident_complete:
+       shrq    $PMD_SHIFT, %rdi
+       addq    $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
+       movl    $PTRS_PER_PMD, %ecx
 
+1:
+       andq    $(PTRS_PER_PMD - 1), %rdi
+       movq    %rax, (%rbx,%rdi,8)
+       incq    %rdi
+       addq    $PMD_SIZE, %rax
+       decl    %ecx
+       jnz     1b
+       
        /*
         * Fixup the kernel text+data virtual addresses. Note that
         * we might write invalid pmds, when the kernel is relocated
         * cleanup_highmap() fixes this up along with the mappings
         * beyond _end.
         */
-
        leaq    level2_kernel_pgt(%rip), %rdi
        leaq    4096(%rdi), %r8
        /* See if it is a valid page table entry */
@@ -149,7 +162,7 @@ ENTRY(secondary_startup_64)
         * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
         * and someone has loaded a mapped page table.
         *
-        * %esi holds a physical pointer to real_mode_data.
+        * %rsi holds a physical pointer to real_mode_data.
         *
         * We come here either from startup_64 (using physical addresses)
         * or from trampoline.S (using virtual addresses).
@@ -196,7 +209,7 @@ ENTRY(secondary_startup_64)
        movq    %rax, %cr0
 
        /* Setup a boot time stack */
-       movq stack_start(%rip),%rsp
+       movq stack_start(%rip), %rsp
 
        /* zero EFLAGS after setting rsp */
        pushq $0
@@ -236,31 +249,31 @@ ENTRY(secondary_startup_64)
        movl    initial_gs+4(%rip),%edx
        wrmsr   
 
-       /* esi is pointer to real mode structure with interesting info.
+       /* rsi is pointer to real mode structure with interesting info.
           pass it to C */
-       movl    %esi, %edi
+       movq    %rsi, %rdi
        
        /* Finally jump to run C code and to be on real kernel address
         * Since we are running on identity-mapped space we have to jump
         * to the full 64bit address, this is only possible as indirect
         * jump.  In addition we need to ensure %cs is set so we make this
-        * a far return.
+        * a far jump.
         */
-       movq    initial_code(%rip),%rax
        pushq   $0              # fake return address to stop unwinder
-       pushq   $__KERNEL_CS    # set correct cs
-       pushq   %rax            # target address in negative space
-       lretq
+       /* gas 2.22 is buggy and mis-assembles ljmpq */
+       rex64 ljmp *initial_code(%rip)
 
        /* SMP bootup changes these two */
        __REFDATA
-       .align  8
-       ENTRY(initial_code)
+       .balign 8
+       GLOBAL(initial_code)
        .quad   x86_64_start_kernel
-       ENTRY(initial_gs)
+       .word   __KERNEL_CS
+       .balign 8
+       GLOBAL(initial_gs)
        .quad   INIT_PER_CPU_VAR(irq_stack_union)
 
-       ENTRY(stack_start)
+       GLOBAL(stack_start)
        .quad  init_thread_union+THREAD_SIZE-8
        .word  0
        __FINITDATA
@@ -268,7 +281,7 @@ ENTRY(secondary_startup_64)
 bad_address:
        jmp bad_address
 
-       .section ".init.text","ax"
+       __INIT
        .globl early_idt_handlers
 early_idt_handlers:
        # 104(%rsp) %rflags
@@ -305,14 +318,22 @@ ENTRY(early_idt_handler)
        pushq %r11              #  0(%rsp)
 
        cmpl $__KERNEL_CS,96(%rsp)
-       jne 10f
+       jne 11f
 
+       cmpl $14,72(%rsp)       # Page fault?
+       jnz 10f
+       GET_CR2_INTO(%rdi)      # can clobber any volatile register if pv
+       call early_make_pgtable
+       andl %eax,%eax
+       jz 20f                  # All good
+
+10:
        leaq 88(%rsp),%rdi      # Pointer to %rip
        call early_fixup_exception
        andl %eax,%eax
        jnz 20f                 # Found an exception entry
 
-10:
+11:
 #ifdef CONFIG_EARLY_PRINTK
        GET_CR2_INTO(%r9)       # can clobber any volatile register if pv
        movl 80(%rsp),%r8d      # error code
@@ -334,7 +355,7 @@ ENTRY(early_idt_handler)
 1:     hlt
        jmp 1b
 
-20:    # Exception table entry found
+20:    # Exception table entry found or page table generated
        popq %r11
        popq %r10
        popq %r9
@@ -348,6 +369,8 @@ ENTRY(early_idt_handler)
        decl early_recursion_flag(%rip)
        INTERRUPT_RETURN
 
+       __INITDATA
+       
        .balign 4
 early_recursion_flag:
        .long 0
@@ -358,11 +381,10 @@ early_idt_msg:
 early_idt_ripmsg:
        .asciz "RIP %s\n"
 #endif /* CONFIG_EARLY_PRINTK */
-       .previous
 
 #define NEXT_PAGE(name) \
        .balign PAGE_SIZE; \
-ENTRY(name)
+GLOBAL(name)
 
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)                       \
@@ -372,46 +394,21 @@ ENTRY(name)
        i = i + 1 ;                                     \
        .endr
 
-       .data
-       /*
-        * This default setting generates an ident mapping at address 0x100000
-        * and a mapping for the kernel that precisely maps virtual address
-        * 0xffffffff80000000 to physical address 0x000000. (always using
-        * 2Mbyte large pages provided by PAE mode)
-        */
-NEXT_PAGE(init_level4_pgt)
-       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
-       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .org    init_level4_pgt + L4_START_KERNEL*8, 0
-       /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+       __INITDATA
+NEXT_PAGE(early_level4_pgt)
+       .fill   511,8,0
        .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 
-NEXT_PAGE(level3_ident_pgt)
-       .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .fill   511,8,0
+NEXT_PAGE(early_dynamic_pgts)
+       .fill   512*EARLY_DYNAMIC_PAGE_TABLES,8,0
 
+       .data
 NEXT_PAGE(level3_kernel_pgt)
        .fill   L3_START_KERNEL,8,0
        /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
        .quad   level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
        .quad   level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
 
-NEXT_PAGE(level2_fixmap_pgt)
-       .fill   506,8,0
-       .quad   level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
-       /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
-       .fill   5,8,0
-
-NEXT_PAGE(level1_fixmap_pgt)
-       .fill   512,8,0
-
-NEXT_PAGE(level2_ident_pgt)
-       /* Since I easily can, map the first 1G.
-        * Don't set NX because code runs from these pages.
-        */
-       PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
-
 NEXT_PAGE(level2_kernel_pgt)
        /*
         * 512 MB kernel mapping. We spend a full page on this pagetable
@@ -426,11 +423,13 @@ NEXT_PAGE(level2_kernel_pgt)
        PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
                KERNEL_IMAGE_SIZE/PMD_SIZE)
 
-NEXT_PAGE(level2_spare_pgt)
-       .fill   512, 8, 0
+NEXT_PAGE(level2_fixmap_pgt)
+       .fill   506,8,0
+       .quad   level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+       /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
+       .fill   5,8,0
 
 #undef PMDS
-#undef NEXT_PAGE
 
        .data
        .align 16
@@ -456,6 +455,7 @@ ENTRY(nmi_idt_table)
        .skip IDT_ENTRIES * 16
 
        __PAGE_ALIGNED_BSS
-       .align PAGE_SIZE
-ENTRY(empty_zero_page)
+NEXT_PAGE(empty_zero_page)
+       .skip PAGE_SIZE
+NEXT_PAGE(init_level4_pgt)
        .skip PAGE_SIZE
