On Tue, Nov 27, 2012 at 11:50:32PM -0800, Yinghai Lu wrote: > Currently, when the kernel is loaded above 1G, only [_text, _text+2M] is set > up with an extra ident page table. > That is not enough: some variables that could be used early are out of > that range, like the BRK for the early page table. > We need to set up the mapping for [_text, _end], including text/data/bss/brk... > > Also, the kernel is currently not allowed to be loaded above 512G; it thinks > that address is too big. > We need to add one extra spare page for level3 to point to that 512G range. > We need to check the _text range and set the level4 pg with that spare level3 page, > and set level3 with a level2 page to cover [_text, _end] with the extra mapping. > > Finally, to handle crossing a GB boundary, we need to add another > level2 spare page. To handle crossing a 512GB boundary, we need to > add another level3 spare page for the next 512G range. > > Tested with kexec-tools with local test code to force loading the kernel > across the 1G, 5G, 512G, and 513G boundaries. > > We need this to put a relocatable 64-bit bzImage high above 1G. > > -v4: add crossing-GB-boundary handling. > -v5: use spare pages from BRK, so we can save pages when the kernel is not > loaded above 1GB. > > Signed-off-by: Yinghai Lu <ying...@kernel.org> > Cc: "Eric W. 
Biederman" <ebied...@xmission.com> > --- > arch/x86/kernel/head_64.S | 203 > +++++++++++++++++++++++++++++++++++++++++---- > 1 files changed, 187 insertions(+), 16 deletions(-) > > diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S > index 94bf9cc..338799a 100644 > --- a/arch/x86/kernel/head_64.S > +++ b/arch/x86/kernel/head_64.S > @@ -20,6 +20,7 @@ > #include <asm/processor-flags.h> > #include <asm/percpu.h> > #include <asm/nops.h> > +#include <asm/setup.h> > > #ifdef CONFIG_PARAVIRT > #include <asm/asm-offsets.h> > @@ -42,6 +43,13 @@ L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET) > L4_START_KERNEL = pgd_index(__START_KERNEL_map) > L3_START_KERNEL = pud_index(__START_KERNEL_map) > > +/* two for level3, and two for level2 */ > +SPARE_MAP_SIZE = (4 * PAGE_SIZE) > +RESERVE_BRK(spare_map, SPARE_MAP_SIZE)
Perhaps 'spare_directory' ? Or 'spare_table' ? > + > +#define spare_page(x) (__brk_base + (x) * PAGE_SIZE) > +#define add_one_spare_page addq $PAGE_SIZE, _brk_end(%rip) > + > .text > __HEAD > .code64 > @@ -78,12 +86,6 @@ startup_64: > testl %eax, %eax > jnz bad_address > > - /* Is the address too large? */ > - leaq _text(%rip), %rdx > - movq $PGDIR_SIZE, %rax > - cmpq %rax, %rdx > - jae bad_address > - > /* Fixup the physical addresses in the page table > */ > addq %rbp, init_level4_pgt + 0(%rip) > @@ -97,25 +99,196 @@ startup_64: > > addq %rbp, level2_fixmap_pgt + (506*8)(%rip) > > - /* Add an Identity mapping if I am above 1G */ > + /* Add an Identity mapping if _end is above 1G */ > + leaq _end(%rip), %r9 > + decq %r9 > + cmp $PUD_SIZE, %r9 > + jl ident_complete > + > + /* Clear spare pages */ > + leaq __brk_base(%rip), %rdi > + xorq %rax, %rax > + movq $(SPARE_MAP_SIZE/8), %rcx > +1: decq %rcx > + movq %rax, (%rdi) > + leaq 8(%rdi), %rdi > + jnz 1b > + > + /* get end */ > + andq $PMD_PAGE_MASK, %r9 > + /* round start to 1G if it is below 1G */ > leaq _text(%rip), %rdi > andq $PMD_PAGE_MASK, %rdi > + cmp $PUD_SIZE, %rdi > + jg 1f > + movq $PUD_SIZE, %rdi > +1: > + /* get 512G index */ > + movq %r9, %r8 > + shrq $PGDIR_SHIFT, %r8 > + andq $(PTRS_PER_PGD - 1), %r8 > + movq %rdi, %rax > + shrq $PGDIR_SHIFT, %rax > + andq $(PTRS_PER_PGD - 1), %rax > + > + /* cross two 512G ? */ > + cmp %r8, %rax > + jne set_level3_other_512g > + > + /* all in first 512G ? */ > + cmp $0, %rax > + je skip_level3_spare > + > + /* same 512G other than first 512g */ > + /* > + * We need one level3, one or two level 2, > + * so use first one for level3. 
> + */ > + leaq (spare_page(0) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx > + leaq init_level4_pgt(%rip), %rbx > + movq %rdx, 0(%rbx, %rax, 8) > + addq $L4_PAGE_OFFSET, %rax > + movq %rdx, 0(%rbx, %rax, 8) > + /* one level3 in BRK */ > + add_one_spare_page > + > + /* get 1G index */ > + movq %r9, %r8 > + shrq $PUD_SHIFT, %r8 > + andq $(PTRS_PER_PUD - 1), %r8 > + movq %rdi, %rax > + shrq $PUD_SHIFT, %rax > + andq $(PTRS_PER_PUD - 1), %rax > + > + /* same 1G ? */ > + cmp %r8, %rax > + je set_level2_start_only_not_first_512g > + > + /* set level2 for end */ > + leaq spare_page(0)(%rip), %rbx > + leaq (spare_page(2) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx > + movq %rdx, 0(%rbx, %r8, 8) > + /* second one level2 in BRK */ > + add_one_spare_page > + > +set_level2_start_only_not_first_512g: > + leaq spare_page(0)(%rip), %rbx > + leaq (spare_page(1) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx > + movq %rdx, 0(%rbx, %rax, 8) > + /* first one level2 in BRK */ > + add_one_spare_page > + > + /* one spare level3 before level2*/ > + leaq spare_page(1)(%rip), %rbx > + jmp set_level2_spare > + > +set_level3_other_512g: > + /* > + * We need one or two level3, and two level2, > + * so use first two for level2. > + */ > + /* for level2 last on first 512g */ > + leaq level3_ident_pgt(%rip), %rcx > + /* start is in first 512G ? 
*/ > + cmp $0, %rax > + je set_level2_start_other_512g > > + /* Set level3 for _text */ > + leaq (spare_page(3) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx > + leaq init_level4_pgt(%rip), %rbx > + movq %rdx, 0(%rbx, %rax, 8) > + addq $L4_PAGE_OFFSET, %rax > + movq %rdx, 0(%rbx, %rax, 8) > + /* first one level3 in BRK */ > + add_one_spare_page > + > + /* for level2 last not on first 512G */ > + leaq spare_page(3)(%rip), %rcx > + > +set_level2_start_other_512g: > + /* always need to set level2 */ > movq %rdi, %rax > shrq $PUD_SHIFT, %rax > andq $(PTRS_PER_PUD - 1), %rax > - jz ident_complete > + movq %rcx, %rbx /* %rcx : level3 spare or level3_ident_pgt */ > + leaq (spare_page(0) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx > + movq %rdx, 0(%rbx, %rax, 8) > + /* first one level2 in BRK */ > + add_one_spare_page > > - leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), > %rdx > +set_level3_end_other_512g: > + leaq (spare_page(2) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx > + leaq init_level4_pgt(%rip), %rbx > + movq %rdx, 0(%rbx, %r8, 8) > + addq $L4_PAGE_OFFSET, %r8 > + movq %rdx, 0(%rbx, %r8, 8) > + /* second one level3 in BRK */ > + add_one_spare_page > + > + /* always need to set level2 */ > + movq %r9, %r8 > + shrq $PUD_SHIFT, %r8 > + andq $(PTRS_PER_PUD - 1), %r8 > + leaq spare_page(2)(%rip), %rbx > + leaq (spare_page(1) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx > + movq %rdx, 0(%rbx, %r8, 8) > + /* second one level2 in BRK */ > + add_one_spare_page > + > + /* no spare level3 before level2 */ > + leaq spare_page(0)(%rip), %rbx > + jmp set_level2_spare > + > +skip_level3_spare: > + /* We have one or two level2 */ > + /* get 1G index */ > + movq %r9, %r8 > + shrq $PUD_SHIFT, %r8 > + andq $(PTRS_PER_PUD - 1), %r8 > + movq %rdi, %rax > + shrq $PUD_SHIFT, %rax > + andq $(PTRS_PER_PUD - 1), %rax > + > + /* same 1G ? 
*/ > + cmp %r8, %rax > + je set_level2_start_only_first_512g > + > + /* set level2 without level3 spare */ > + leaq level3_ident_pgt(%rip), %rbx > + leaq (spare_page(1) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx > + movq %rdx, 0(%rbx, %r8, 8) > + /* second one level2 in BRK */ > + add_one_spare_page > + > +set_level2_start_only_first_512g: > + /* set level2 without level3 spare */ > leaq level3_ident_pgt(%rip), %rbx > + leaq (spare_page(0) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx > movq %rdx, 0(%rbx, %rax, 8) > + /* first one level2 in BRK */ > + add_one_spare_page > > + /* no spare level3 */ > + leaq spare_page(0)(%rip), %rbx > + > +set_level2_spare: > movq %rdi, %rax > shrq $PMD_SHIFT, %rax > andq $(PTRS_PER_PMD - 1), %rax > leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx > - leaq level2_spare_pgt(%rip), %rbx > - movq %rdx, 0(%rbx, %rax, 8) > + /* %rbx is set before */ > + movq %r9, %r8 > + shrq $PMD_SHIFT, %r8 > + andq $(PTRS_PER_PMD - 1), %r8 > + cmp %r8, %rax > + jl 1f > + addq $PTRS_PER_PMD, %r8 > +1: movq %rdx, 0(%rbx, %rax, 8) > + addq $PMD_SIZE, %rdx > + incq %rax > + cmp %r8, %rax > + jle 1b > + > ident_complete: > > /* > @@ -423,11 +596,9 @@ NEXT_PAGE(level2_kernel_pgt) > * If you want to increase this then increase MODULES_VADDR > * too.) 
> */ > - PMDS(0, __PAGE_KERNEL_LARGE_EXEC, > - KERNEL_IMAGE_SIZE/PMD_SIZE) > - > -NEXT_PAGE(level2_spare_pgt) > - .fill 512, 8, 0 > + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) > + /* hold the whole page */ > + .fill (PTRS_PER_PMD - (KERNEL_IMAGE_SIZE/PMD_SIZE)), 8, 0 > > #undef PMDS > #undef NEXT_PAGE > -- > 1.7.7 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/