Module Name:    src
Committed By:   ryo
Date:           Thu Oct  4 23:53:14 UTC 2018

Modified Files:
        src/sys/arch/aarch64/aarch64: aarch64_machdep.c locore.S pmap.c
        src/sys/arch/aarch64/conf: files.aarch64
        src/sys/arch/aarch64/include: pmap.h
Added Files:
        src/sys/arch/aarch64/aarch64: pmapboot.c

Log Message:
clean up locore, and change the way memory is mapped during boot.
- add functions pmapboot_enter() and bootpage_alloc() to adapt to various
  physical memory map layouts, especially 64-bit physical memory layouts.
  bootpage_alloc() allocates pagetable pages starting from round_page(_end[]).
- change the boot-time PA=VA identity mapping to cover only the required
  ranges (kernel image, UART device, and FDT blob), using L2_BLOCK (2 Mbyte).
- changing page permissions for the kernel image and creating the KSEG
  mapping are now done in cpu_kernel_vm_init() instead of in locore.
- optimize PTE entries with the PTE Contiguous bit; for now it is enabled
  for the devmap only.

reviewed by skrll@, thanks.


To generate a diff of this commit:
cvs rdiff -u -r1.11 -r1.12 src/sys/arch/aarch64/aarch64/aarch64_machdep.c
cvs rdiff -u -r1.27 -r1.28 src/sys/arch/aarch64/aarch64/locore.S
cvs rdiff -u -r1.25 -r1.26 src/sys/arch/aarch64/aarch64/pmap.c
cvs rdiff -u -r0 -r1.1 src/sys/arch/aarch64/aarch64/pmapboot.c
cvs rdiff -u -r1.4 -r1.5 src/sys/arch/aarch64/conf/files.aarch64
cvs rdiff -u -r1.11 -r1.12 src/sys/arch/aarch64/include/pmap.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/aarch64/aarch64/aarch64_machdep.c
diff -u src/sys/arch/aarch64/aarch64/aarch64_machdep.c:1.11 src/sys/arch/aarch64/aarch64/aarch64_machdep.c:1.12
--- src/sys/arch/aarch64/aarch64/aarch64_machdep.c:1.11	Sun Aug 26 18:15:49 2018
+++ src/sys/arch/aarch64/aarch64/aarch64_machdep.c	Thu Oct  4 23:53:13 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: aarch64_machdep.c,v 1.11 2018/08/26 18:15:49 ryo Exp $ */
+/* $NetBSD: aarch64_machdep.c,v 1.12 2018/10/04 23:53:13 ryo Exp $ */
 
 /*-
  * Copyright (c) 2014 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: aarch64_machdep.c,v 1.11 2018/08/26 18:15:49 ryo Exp $");
+__KERNEL_RCSID(1, "$NetBSD: aarch64_machdep.c,v 1.12 2018/10/04 23:53:13 ryo Exp $");
 
 #include "opt_arm_debug.h"
 #include "opt_ddb.h"
@@ -100,28 +100,57 @@ vaddr_t physical_end;
 /* filled in before cleaning bss. keep in .data */
 u_long kern_vtopdiff __attribute__((__section__(".data")));
 
+long kernend_extra;	/* extra memory allocated from round_page(_end[]) */
+
 void
 cpu_kernel_vm_init(uint64_t memory_start, uint64_t memory_size)
 {
-
 	extern char __kernel_text[];
 	extern char _end[];
+	extern char __data_start[];
+	extern char __rodata_start[];
 
 	vaddr_t kernstart = trunc_page((vaddr_t)__kernel_text);
 	vaddr_t kernend = round_page((vaddr_t)_end);
-
-	paddr_t	kernstart_phys = KERN_VTOPHYS(kernstart);
+	paddr_t kernstart_phys = KERN_VTOPHYS(kernstart);
 	paddr_t kernend_phys = KERN_VTOPHYS(kernend);
+	vaddr_t data_start = (vaddr_t)__data_start;
+	vaddr_t rodata_start = (vaddr_t)__rodata_start;
 
-	VPRINTF("%s: kernel phys start %lx end %lx\n", __func__,
-	    kernstart_phys, kernend_phys);
-
-        fdt_add_reserved_memory_range(kernstart_phys,
-	     kernend_phys - kernstart_phys);
+	/* add KSEG mappings of whole memory */
+	VPRINTF("Creating KSEG tables for 0x%016lx-0x%016lx\n",
+	    memory_start, memory_start + memory_size);
+	const pt_entry_t ksegattr =
+	    LX_BLKPAG_ATTR_NORMAL_WB |
+	    LX_BLKPAG_AP_RW |
+	    LX_BLKPAG_PXN |
+	    LX_BLKPAG_UXN;
+	pmapboot_enter(AARCH64_PA_TO_KVA(memory_start), memory_start,
+	    memory_size, L1_SIZE, ksegattr, PMAPBOOT_ENTER_NOOVERWRITE,
+	    bootpage_alloc, NULL);
 
 	/*
-	 * XXX whole bunch of stuff to map kernel correctly
+	 * at this point, whole kernel image is mapped as "rwx".
+	 * permission should be changed to:
+	 *
+	 *    text     rwx => r-x
+	 *    rodata   rwx => r--
+	 *    data     rwx => rw-
+	 *
+	 * kernel image has mapped by L2 block. (2Mbyte)
 	 */
+	pmapboot_protect(L2_TRUNC_BLOCK(kernstart),
+	    L2_TRUNC_BLOCK(data_start), VM_PROT_WRITE);
+	pmapboot_protect(L2_ROUND_BLOCK(rodata_start),
+	    L2_ROUND_BLOCK(kernend + kernend_extra), VM_PROT_EXECUTE);
+
+	aarch64_tlbi_all();
+
+
+	VPRINTF("%s: kernel phys start %lx end %lx+%lx\n", __func__,
+	    kernstart_phys, kernend_phys, kernend_extra);
+	fdt_add_reserved_memory_range(kernstart_phys,
+	     kernend_phys - kernstart_phys + kernend_extra);
 }
 
 
@@ -172,8 +201,8 @@ initarm_common(vaddr_t kvm_base, vsize_t
 
 	kernstart = trunc_page((vaddr_t)__kernel_text);
 	kernend = round_page((vaddr_t)_end);
-	kernstart_l2 = kernstart & -L2_SIZE;		/* trunk L2_SIZE(2M) */
-	kernend_l2 = (kernend + L2_SIZE - 1) & -L2_SIZE;/* round L2_SIZE(2M) */
+	kernstart_l2 = L2_TRUNC_BLOCK(kernstart);
+	kernend_l2 = L2_ROUND_BLOCK(kernend + kernend_extra);
 	kernelvmstart = kernend_l2;
 
 #ifdef MODULAR
@@ -212,13 +241,15 @@ initarm_common(vaddr_t kvm_base, vsize_t
 	    "kernel_start_l2       = 0x%016lx\n"
 	    "kernel_start          = 0x%016lx\n"
 	    "kernel_end            = 0x%016lx\n"
+	    "pagetables            = 0x%016lx\n"
+	    "pagetables_end        = 0x%016lx\n"
 	    "kernel_end_l2         = 0x%016lx\n"
 #ifdef MODULAR
 	    "module_start          = 0x%016lx\n"
 	    "module_end            = 0x%016lx\n"
 #endif
 	    "(kernel va area)\n"
-	    "(devmap va area)\n"
+	    "(devmap va area)      = 0x%016lx\n"
 	    "VM_MAX_KERNEL_ADDRESS = 0x%016lx\n"
 	    "------------------------------------------\n",
 	    kern_vtopdiff,
@@ -230,11 +261,14 @@ initarm_common(vaddr_t kvm_base, vsize_t
 	    kernstart_l2,
 	    kernstart,
 	    kernend,
+	    round_page(kernend),
+	    round_page(kernend) + kernend_extra,
 	    kernend_l2,
 #ifdef MODULAR
 	    module_start,
 	    module_end,
 #endif
+	    VM_KERNEL_IO_ADDRESS,
 	    VM_MAX_KERNEL_ADDRESS);
 
 	/*

Index: src/sys/arch/aarch64/aarch64/locore.S
diff -u src/sys/arch/aarch64/aarch64/locore.S:1.27 src/sys/arch/aarch64/aarch64/locore.S:1.28
--- src/sys/arch/aarch64/aarch64/locore.S:1.27	Thu Oct  4 09:09:29 2018
+++ src/sys/arch/aarch64/aarch64/locore.S	Thu Oct  4 23:53:13 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: locore.S,v 1.27 2018/10/04 09:09:29 ryo Exp $	*/
+/*	$NetBSD: locore.S,v 1.28 2018/10/04 23:53:13 ryo Exp $	*/
 
 /*
  * Copyright (c) 2017 Ryo Shimizu <[email protected]>
@@ -26,280 +26,331 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include "opt_arm_debug.h"
+#include "opt_console.h"
 #include "opt_cpuoptions.h"
-#include "opt_multiprocessor.h"
 #include "opt_ddb.h"
-#include "opt_arm_debug.h"
+#include "opt_fdt.h"
+#include "opt_multiprocessor.h"
 
 #include <aarch64/asm.h>
 #include <aarch64/hypervisor.h>
 #include "assym.h"
 
-RCSID("$NetBSD: locore.S,v 1.27 2018/10/04 09:09:29 ryo Exp $")
+RCSID("$NetBSD: locore.S,v 1.28 2018/10/04 23:53:13 ryo Exp $")
 
-/* #define DEBUG_LOCORE */
-/* #define DEBUG_MMU */
 
-#if (defined(VERBOSE_INIT_ARM) || defined(DEBUG_LOCORE)) && defined(EARLYCONS)
-#define VERBOSE_LOCORE
-#endif
+/*#define DEBUG_LOCORE			/* debug print */
+/*#define DEBUG_LOCORE_PRINT_LOCK	/* avoid mixing AP's output */
+/*#define DEBUG_MMU			/* dump MMU table */
 
 #define LOCORE_EL2
 
-#define PRINT(string)	bl xprint;.asciz string;.align 2
+#define BOOT_AP_STACKSIZE	256	/* size of temporally stack for APs */
+#define BOOTPAGE_ALLOC_MAX	(1024 * 1024)	/* reserved size from _end[] */
+
+#if (defined(VERBOSE_INIT_ARM) || defined(DEBUG_LOCORE)) && defined(EARLYCONS)
+#define VERBOSE_LOCORE
+#endif
 
 #ifdef VERBOSE_LOCORE
-#define VERBOSE(string)	PRINT(string)
+#define VPRINT(string)		PRINT(string)
 #else
-#define VERBOSE(string)
+#define VPRINT(string)
 #endif
 
+/* DPRINTREG macro use x19 internally. x0-x15 may be broken */
+#if (defined(DEBUG_LOCORE) && defined(EARLYCONS))
+#define DPRINT(string)		PRINT(string)
+#define DPRINTREG(str, reg)	mov x19,reg; PRINT(str); mov x0,x19; bl print_x0
+#define DPRINTSREG(str, reg)	mrs x19,reg; PRINT(str); mov x0,x19; bl print_x0
+#else
+#define DPRINT(string)
+#define DPRINTREG(str, reg)
+#define DPRINTSREG(str, reg)
+#endif
+
+#define PRINT(string)	bl xprint; .asciz string; .align 2
+
+
 /* load far effective address (pc relative) */
 .macro	ADDR, reg, addr
 	adrp	\reg, \addr
 	add	\reg, \reg, #:lo12:\addr
 .endm
 
-ENTRY_NP(aarch64_start)
-	/* Zero the BSS. The size must be aligned 16, usually it should be. */
-	ADDR	x0, __bss_start__
-	ADDR	x1, __bss_end__
-	b	2f
-1:	stp	xzr, xzr, [x0], #16
-2:	cmp	x0, x1
-	b.lo	1b
+	.text
+	.align	3
+ASENTRY_NP(aarch64_start)
+	/* keep lr & sp for return to bootloader if possible */
+	mov	x27, lr
+	mov	x28, sp
 
 	/* set stack pointer for boot */
 	ADDR	x0, bootstk
 	mov	sp, x0
 
-#ifdef VERBOSE_INIT_ARM
-	PRINT("boot NetBSD/evbarm (aarch64)\r\n")
-#endif
+	bl	clear_bss
 
-#ifdef DEBUG_LOCORE
-	PRINT("PC               = ")
-	bl	1f
-1:	mov	x0, lr
-	bl	print_x0
+	PRINT("boot NetBSD/aarch64\n")
 
-	PRINT("SP               = ")
 	bl	1f
-1:	mov	x0, sp
-	bl	print_x0
-
-	PRINT("CurrentEL        = ")
-	mrs	x0, CurrentEL
-	lsr	x0, x0, #2
-	bl	print_x0
-
-	cmp	x0, #2
-	bne	1f
-
+1:	DPRINTREG("PC               = ", lr)
+	DPRINTREG("SP               = ", sp)
+	mrs	x20, CurrentEL
+	lsr	x20, x20, #2
+	DPRINTREG("CurrentEL        = ", x20)
+	cmp	x20, #2
+	bcc	1f
 	/* EL2 registers can be accessed in EL2 or higher */
-	PRINT("SCTLR_EL2        = ")
-	mrs	x0, sctlr_el2
-	bl	print_x0
-
-	PRINT("HCR_EL2          = ")
-	mrs	x0, hcr_el2
-	bl	print_x0
+	DPRINTSREG("SCTLR_EL2        = ", sctlr_el2)
+	DPRINTSREG("HCR_EL2          = ", hcr_el2)
 1:
-
-	PRINT("CNTFREQ_EL0      = ")
-	mrs	x0, cntfrq_el0
-	bl	print_x0
-
-	PRINT("DAIF             = ")
-	mrs	x0, daif
-	bl	print_x0
-
-	PRINT("MPIDR_EL1        = ")
-	mrs	x0, mpidr_el1
-	bl	print_x0
-
-#if 0
-	PRINT("L2CTLR_EL1       = ")
-	mrs	x0, s3_1_c11_c0_2
-	bl	print_x0
-#endif
-
-	PRINT("ID_AA64MPFR0_EL1 = ")
-	mrs	x0, id_aa64pfr0_el1
-	bl	print_x0
-
-	PRINT("ID_AA64MPFR1_EL1 = ")
-	mrs	x0, id_aa64pfr1_el1
-	bl	print_x0
-
-	PRINT("ID_AA64ISAR0_EL1 = ")
-	mrs	x0, id_aa64isar0_el1
-	bl	print_x0
-
-	PRINT("ID_AA64ISAR1_EL1 = ")
-	mrs	x0, id_aa64isar1_el1
-	bl	print_x0
-
-
-	PRINT("ID_AA64MMFR0_EL1 = ")
-	mrs	x0, id_aa64mmfr0_el1
-	bl	print_x0
-
-	PRINT("ID_AA64MMFR1_EL1 = ")
-	mrs	x0, id_aa64mmfr1_el1
-	bl	print_x0
-#endif
+	DPRINTSREG("SPSR_EL1         = ", spsr_el1)
+	DPRINTSREG("CNTFREQ_EL0      = ", cntfrq_el0)
+	DPRINTSREG("SCTLR_EL1        = ", sctlr_el1)
+	DPRINTSREG("MIDR_EL1         = ", midr_el1)
+	DPRINTSREG("MPIDR_EL1        = ", mpidr_el1)
+	DPRINTSREG("ID_AA64MPFR0_EL1 = ", id_aa64pfr0_el1)
+	DPRINTSREG("ID_AA64MPFR1_EL1 = ", id_aa64pfr1_el1)
+	DPRINTSREG("ID_AA64ISAR0_EL1 = ", id_aa64isar0_el1)
+	DPRINTSREG("ID_AA64ISAR1_EL1 = ", id_aa64isar1_el1)
+	DPRINTSREG("ID_AA64MMFR0_EL1 = ", id_aa64mmfr0_el1)
+	DPRINTSREG("ID_AA64MMFR1_EL1 = ", id_aa64mmfr1_el1)
 
 
 #ifdef LOCORE_EL2
-	VERBOSE("Drop to EL1...")
+	VPRINT("Drop to EL1...")
 # include <aarch64/aarch64/locore_el2.S>
-	VERBOSE("OK\r\n")
-#ifdef DEBUG_LOCORE
-	PRINT("CurrentEL        = ")
-	mrs	x0, CurrentEL
-	lsr	x0, x0, #2
-	bl	print_x0
-#endif /* DEBUG_LOCORE */
+	VPRINT("OK\n")
+	mrs	x20, CurrentEL
+	lsr	x20, x20, #2
+	DPRINTREG("CurrentEL        = ", x20)
 #endif /* LOCORE_EL2 */
 
-#ifdef DEBUG_LOCORE
-	PRINT("DAIF             = ")
-	mrs	x0, daif
-	bl	print_x0
-#endif
 
 	bl	mmu_disable
-
 	bl	init_sysregs
+	bl	init_mmutable
+	cbnz	x0, aarch64_fatal
+	bl	save_ttbrs
 
-	bl	arm_boot_l0pt_init
-
-	VERBOSE("MMU Enable...")
+	VPRINT("MMU Enable...")
 	bl	mmu_enable
-	VERBOSE("OK\r\n")
+	VPRINT("OK\n")
 
-	/* set exception vector */
-	ldr	x2, =el1_vectors	/* el1_vectors is in kva */
-	msr	vbar_el1, x2
-
-#ifdef DEBUG_LOCORE
-	PRINT("SPSR_EL1        = ")
-	mrs	x0, spsr_el1
-	bl	print_x0
-
-	PRINT("DAIF            = ")
-	mrs	x0, daif
-	bl	print_x0
-
-	PRINT("VSTART          = ")
-	ldr	x0, =vstart	/* virtual address of vstart */
-	bl	print_x0
-#endif
+	ldr	x20, =vstart	/* virtual address of vstart */
+	DPRINTSREG("SPSR_EL1         = ", spsr_el1)
+	DPRINTSREG("DAIF             = ", daif)
+	DPRINTREG("vstart           = ", x20)
+	br	x20		/* jump to the kernel virtual address */
+
+aarch64_fatal:
+	PRINT("fatal error occured while booting\n")
+	/* return to bootloader. if switched from EL2 to EL1, It might fail */
+	mov	lr, x27
+	mov	sp, x28
+	ret
 
-	ldr	x0, =vstart	/* virtual address of vstart */
-	br	x0		/* jump to the kernel virtual address */
 
 /*
  * vstart is in kernel virtual address
  */
 vstart:
+	DPRINTREG("PC               = ", x20)
+
+	/* set exception vector */
+	ADDR	x0, _C_LABEL(el1_vectors)
+	msr	vbar_el1, x0
+
+	/* set lwp0 stack */
 	ADDR	x0, lwp0uspace
 	add	x0, x0, #(UPAGES * PAGE_SIZE)
 	sub	x0, x0, #TF_SIZE	/* lwp0space + USPACE - TF_SIZE */
 	mov	sp, x0			/* define lwp0 ksp bottom */
+	DPRINTREG("SP(lwp0,kvm)     = ", sp)
 
-#ifdef DEBUG_LOCORE
-	PRINT("VSP             = ")
-	mov	x0, sp
-	bl	print_x0
-#endif
+	/* lwp-private = NULL */
+	msr	tpidr_el0, xzr
 
-	msr	tpidr_el0, xzr		/* tpidr_el0 (for TLS) = NULL */
+	/* set curcpu() */
 	ADDR	x0, cpu_info_store	/* cpu_info_store is cpu_info[0] */
 	msr	tpidr_el1, x0		/* curcpu is cpu_info[0] */
+	DPRINTREG("curcpu           = ", x0);
+
 
 	mov	fp, #0			/* trace back starts here */
-	PRINT("initarm\r\n")
+	PRINT("initarm\n")
 	bl	_C_LABEL(initarm)	/* Off we go */
 
-	PRINT("main\r\n")
+	PRINT("main\n")
 	bl	_C_LABEL(main)		/* call main() */
 
 	adr	x0, .Lmainreturned
 	b	_C_LABEL(panic)
 	/* NOTREACHED */
-END(aarch64_start)
+ASEND(aarch64_start)
 
 .Lmainreturned:
 	.asciz	"main() returned"
+	.align 2
+
+
+ASENTRY_NP(clear_bss)
+	/* Zero the BSS. The size must be aligned 16, usually it should be. */
+	ADDR	x14, __bss_start__
+	ADDR	x15, __bss_end__
+	b	2f
+1:	stp	xzr, xzr, [x14], #16
+2:	cmp	x14, x15
+	b.lo	1b
+	ret
+ASEND(clear_bss)
+
+
+init_sysregs:
+	stp	x0, lr, [sp, #-16]!
+
+	/* Disable debug event */
+	msr	mdscr_el1, xzr
+
+	/* Clear context id register */
+	msr	contextidr_el1, xzr
+
+	/* No trap system register access, and Trap FP/SIMD access */
+	msr	cpacr_el1, xzr
+
+	/* allow to read CNTVCT_EL0 and CNTFRQ_EL0 from EL0 */
+	mrs	x0, cntkctl_el1
+	orr	x0, x0, #CNTKCTL_EL0VCTEN
+	msr	cntkctl_el1, x0
+
+	/* any exception not masked */
+	msr	daif, xzr
+
+	ldp	x0, lr, [sp], #16
+	ret
 
-	.align 3
-	.text
 
 #ifdef MULTIPROCESSOR
 
-#if defined(VERBOSE_LOCORE) || defined(DEBUG_LOCORE)
+#ifdef DEBUG_LOCORE
 /*
- * print "[CPU$x27] " (x27 as cpuindex)
- * XXX: max 4 digit
+ * atomic_ops doesn't work before MMU enabled, so using Peterson's algorithm.
+ * this is only used to serialize debug print and avoid mixing output.
+ * Not absolutely necessary.
+ *
+ * x27 for cpuindex.
  */
-ENTRY_NP(printcpu)
+locore_lock_enter:
+#ifdef DEBUG_LOCORE_PRINT_LOCK
+	mov	x3, xzr			/* x3 = level */
+levelloop:
+	/* lock_level[] and lock_turn[] are always accessed via PA(devmap) */
+	ADDR	x0, kern_vtopdiff
+	ldr	x0, [x0]
+	ldr	x4, =lock_level
+	sub	x4, x4, x0
+	ldr	x5, =lock_turn
+	sub	x5, x5, x0
+
+	strh	w3, [x4, x27, lsl #1]	/* lock_level[i] = level */
+	dsb	sy
+	strh	w27, [x5, x3, lsl #1]	/* lock_turn[level] = i */
+	dsb	sy
+waitloop:
+	dmb	sy
+	ldrh	w0, [x5, x3, lsl #1]	/* lock_turn[level] == i ? */
+	cmp	x27, x0
+	bne	nextlevel
+
+	mov	x2, xzr			/* k = 0 */
+levelcheck:
+	cmp	x2, x27
+	beq	levelcheck_next
+
+	dmb	sy
+	ldrsh	w0, [x4, x2, lsl #1]	/* lock_level[k] >= level */
+	cmp	w0, w3
+	bge	waitloop
+levelcheck_next:
+	add	x2, x2, #1		/* k++ */
+	cmp	x2, #MAXCPUS
+	bne	levelcheck
+nextlevel:
+	add	x3, x3, #1
+	cmp	x3, #(MAXCPUS - 1)
+	bne	levelloop
+#endif /* DEBUG_LOCORE_PRINT_LOCK */
+	ret
+
+
+locore_lock_exit:
+#ifdef DEBUG_LOCORE_PRINT_LOCK
+	/* lock_level[] and lock_turn[] are always accessed via PA(devmap) */
+	ADDR	x0, kern_vtopdiff
+	ldr	x0, [x0]
+	ldr	x1, =lock_level
+	sub	x1, x1, x0
+	mvn	x0, xzr
+	strh	w0, [x1, x27, lsl #1]	/* lock_level[i] = -1 */
+	dsb	sy
+#endif /* DEBUG_LOCORE_PRINT_LOCK */
+	ret
+
+
+/* print "[CPU$x27] " (x27 for cpuindex) */
+printcpu:
 	stp	x0, lr, [sp, #-16]!
-	stp	x25, x26, [sp, #-16]!
-	PRINT("[CPU")
-	mov	x26, x27		/* n = cpuindex */
-	mov	x25, xzr		/* zeropad = 0 */
-	mov	x1, #1000
-	udiv	x0, x26, x1		/* x0 = n / 1000 */
-	msub	x26, x0, x1, x26	/* n %= 1000 */
-	cbz	x0, 1f			/* if (x0 == 0) goto 1f */
-	add	x0, x0, #'0'
-	bl	uartputc
-	mov	x25, #1			/* zeropad = 1 */
-1:
-	mov	x1, #100
-	udiv	x0, x26, x1		/* x0 = n / 100 */
-	msub	x26, x0, x1, x26	/* n %= 100 */
-	adds	x25, x25, x0		/* if ((zeropad + x0) == 0) */
-	beq	1f			/*   goto 1f */
-	add	x0, x0, #'0'
-	bl	uartputc
-	mov	x25, #1			/* zeropad = 1 */
-1:
-	mov	x1, #10
-	udiv	x0, x26, x1		/* x0 = n / 10 */
-	msub	x26, x0, x1, x26	/* n %= 10 */
-	adds	x25, x25, x0		/* if ((zeropad + x0) == 0) */
-	beq	1f			/*   goto 1f */
-	add	x0, x0, #'0'
-	bl	uartputc
-1:
-	add	x0, x26, #'0'
-	bl	uartputc
-	PRINT("] ")
-	ldp	x25, x26, [sp], #16
+	PRINT("[CPU");			\
+	mov	x0, x27;		\
+	bl	_printdec_x0;		\
+	PRINT("] ");			\
 	ldp	x0, lr, [sp], #16
 	ret
-END(printcpu)
-#define PRINTCPU()	bl	printcpu
-#else
-#define PRINTCPU()
-#endif /* VERBOSE_LOCORE || DEBUG_LOCORE */
 
-#ifdef VERBOSE_LOCORE
-#define VERBOSE_PRINTCPU()	PRINTCPU()
-#else
-#define VERBOSE_PRINTCPU()
-#endif
+#define CPU_DPRINT(str)			\
+	bl	locore_lock_enter;	\
+	bl	printcpu;		\
+	DPRINT(str);			\
+	bl	locore_lock_exit
+
+/*
+ * CPU_DPRINTREG macro use x19 internally. x0-x15 may be broken.
+ * x27 for cpuindex.
+ */
+#define CPU_DPRINTREG(str,reg)		\
+	mov	x19, reg;		\
+	bl	locore_lock_enter;	\
+	bl	printcpu;		\
+	PRINT(str);			\
+	mov	x0, x19;		\
+	bl	print_x0;		\
+	bl	locore_lock_exit
+
+#define CPU_DPRINTSREG(str, reg)	\
+	mrs	x19, reg;		\
+	CPU_DPRINTREG(str, x19)
+
+#else /* DEBUG_LOCORE */
+
+#define CPU_DPRINT(str)
+#define CPU_DPRINTREG(str,reg)
+#define CPU_DPRINTSREG(str, reg)
+
+#endif /* DEBUG_LOCORE */
 
 ENTRY_NP(aarch64_mpstart)
-ENTRY_NP(cortex_mpstart)	/* compat arm */
+ENTRY_NP(cpu_mpstart)
 	mrs	x3, mpidr_el1
 	ldr	x0, =(MPIDR_AFF0|MPIDR_AFF1|MPIDR_AFF2|MPIDR_AFF3)
 	and	x3, x3, x0
 
-	ADDR	x0, cpu_mpidr
+	/*
+	 * resolve own cpuindex. my mpidr is stored in
+	 * extern uint64_t cpu_mpidr[MAXCPUS]
+	 */
+	ADDR	x0, _C_LABEL(cpu_mpidr)
 	mov	x1, xzr
 1:
 	add	x1, x1, #1
@@ -310,99 +361,71 @@ ENTRY_NP(cortex_mpstart)	/* compat arm *
 	bne	1b
 
 	mov	x27, x1			/* x27 = cpuindex */
-
 	mov	x0, #1
 	lsl	x28, x0, x27		/* x28 = 1 << cpuindex */
 
-	/* x27 = cpuindex, x28 = (1 << cpuindex) */
+
+	/*
+	 * x27 = cpuindex
+	 * x28 = (1 << cpuindex)
+	 */
 
 	/* set stack pointer for boot */
-#define BOOT_STACKSIZE	256
-	mov	x1, #BOOT_STACKSIZE
+	mov	x1, #BOOT_AP_STACKSIZE
 	mul	x1, x1, x27
 	ADDR	x0, bootstk_cpus
-	sub	sp, x0, x1	/* sp= bootstk_cpus-(BOOT_STACKSIZE*cpuindex) */
+	sub	sp, x0, x1  /* sp = bootstk_cpus-(BOOT_AP_STACKSIZE*cpuindex) */
 
-#ifdef DEBUG_LOCORE
-	PRINTCPU()
-	PRINT("PC               = ")
-	bl	1f
-1:	mov	x0, lr
-	bl	print_x0
 
-	PRINTCPU()
-	PRINT("SP               = ")
 	bl	1f
-1:	mov	x0, sp
-	bl	print_x0
+1:	CPU_DPRINTREG("PC               = ", lr)
+	CPU_DPRINTREG("SP               = ", sp)
+	mrs	x20, CurrentEL
+	lsr	x20, x20, #2
+	CPU_DPRINTREG("CurrentEL        = ", x20)
+	cmp	x20, #2
+	bcc	1f
+	/* EL2 registers can be accessed in EL2 or higher */
+	CPU_DPRINTSREG("SCTLR_EL2        = ", sctlr_el2)
+	CPU_DPRINTSREG("HCR_EL2          = ", hcr_el2)
+1:
+	CPU_DPRINTSREG("SPSR_EL1         = ", spsr_el1)
+	CPU_DPRINTSREG("SCTLR_EL1        = ", sctlr_el1)
+	CPU_DPRINTSREG("MIDR_EL1         = ", midr_el1)
+	CPU_DPRINTSREG("MPIDR_EL1        = ", mpidr_el1)
 
-	PRINTCPU()
-	PRINT("CurrentEL        = ")
-	mrs	x0, CurrentEL
-	lsr	x0, x0, #2
-	bl	print_x0
-#endif
 
 #ifdef LOCORE_EL2
-#ifdef DEBUG_LOCORE
-	VERBOSE_PRINTCPU()
-	VERBOSE("Drop to EL1...")
-#endif
+	CPU_DPRINT("Drop to EL1...\n")
 	bl	drop_to_el1
-#ifdef DEBUG_LOCORE
-	VERBOSE("OK\r\n")
-#endif
-#ifdef DEBUG_LOCORE
-	PRINTCPU()
-	PRINT("CurrentEL        = ")
-	mrs	x0, CurrentEL
-	lsr	x0, x0, #2
-	bl	print_x0
-#endif /* DEBUG_LOCORE */
+	CPU_DPRINT("Drop to EL1 OK\n")
+	mrs	x20, CurrentEL
+	lsr	x20, x20, #2
+	CPU_DPRINTREG("CurrentEL        = ", x20)
 #endif /* LOCORE_EL2 */
 
-	bl	mmu_disable
 
+	bl	mmu_disable
 	bl	init_sysregs
 
-#ifdef DEBUG_LOCORE
-	VERBOSE_PRINTCPU()
-	VERBOSE("MMU Enable...")
-#endif
+	CPU_DPRINT("MMU Enable...\n")
+	bl	load_ttbrs
 	bl	mmu_enable
-#ifdef DEBUG_LOCORE
-	VERBOSE("OK\r\n")
-#endif
+	CPU_DPRINT("MMU Enable OK\n")
 
 	/* jump to virtual address */
-	ldr	x0, =mp_vstart
-	br	x0
+	ldr	x20, =mp_vstart
+	br	x20
 
 mp_vstart:
+	CPU_DPRINTREG("PC               = ", x20)
+
 	/* set exception vector */
-	ADDR	x0, el1_vectors
+	ADDR	x0, _C_LABEL(el1_vectors)
 	msr	vbar_el1, x0
 
-#ifdef DEBUG_LOCORE
-	PRINTCPU()
-	PRINT("PC               = ")
-	bl	1f
-1:	mov	x0, lr
-	bl	print_x0
-
-	PRINTCPU()
-	PRINT("arm_cpu_hatched  = ")
-	ADDR	x0, _C_LABEL(arm_cpu_hatched)
-	ldr	w0, [x0]
-	bl	print_x0
-
-	PRINTCPU()
-	PRINT("my cpubit        = ")
-	mov	x0, x28
-	bl	print_x0
-#endif
-
-	msr	tpidr_el0, xzr		/* tpidr_el0 (for TLS) = NULL */
+	/* lwp-private = NULL */
+	msr	tpidr_el0, xzr
 
 	/* set curcpu(), and fill curcpu()->ci_{midr,mpidr} */
 	mov	x0, #CPU_INFO_SIZE
@@ -416,6 +439,8 @@ mp_vstart:
 	mrs	x1, mpidr_el1
 	str	x1, [x0, #CI_MPIDR]	/* curcpu()->ci_mpidr = mpidr_el1 */
 
+	CPU_DPRINTREG("arm_cpu_hatched |= ", x28)
+
 	/*
 	 * atomic_or_32(&arm_cpu_hatched, (1 << cpuindex))
 	 * to tell my activity to primary processor.
@@ -425,46 +450,17 @@ mp_vstart:
 	bl	_C_LABEL(atomic_or_32)	/* hatched! */
 	sev
 
-#ifdef DEBUG_LOCORE
-	PRINTCPU()
-	PRINT("arm_cpu_hatched -> ")
-	ADDR	x0, _C_LABEL(arm_cpu_hatched)
-	ldr	w0, [x0]
-	bl	print_x0
-#endif
-
-#ifdef DEBUG_LOCORE
-	PRINTCPU()
-	PRINT("Hatched.\r\n")
-#endif
-
 	/* wait for my bit of arm_cpu_mbox become true */
-	ADDR	x1, _C_LABEL(arm_cpu_mbox)
+	ADDR	x0, _C_LABEL(arm_cpu_mbox)
 1:
 	dmb	sy
-	ldr	x0, [x1]
-	tst	x0, x28
+	ldr	x20, [x0]
+	tst	x20, x28
 	bne	9f
 	wfe
 	b	1b
 9:
-
-#ifdef DEBUG_LOCORE
-	/* XXX: delay to prevent the mixing of console output */
-	mov	x0, #0x4000000
-	mul	x0, x0, x27	/* delay (cpuindex * 0x4000000) */
-1:	subs	x0, x0, #1
-	bne	1b
-
-	PRINTCPU()
-	PRINT("MBOX received\r\n")
-
-	PRINTCPU()
-	PRINT("arm_cpu_mbox  = ")
-	ADDR	x0, _C_LABEL(arm_cpu_mbox)
-	ldr	x0, [x0]
-	bl	print_x0
-#endif
+//	CPU_DPRINTREG("got arm_cpu_mbox = ", x20)
 
 	/* fill my cpu_info */
 	mrs	x0, tpidr_el1		/* curcpu() */
@@ -472,6 +468,7 @@ mp_vstart:
 	ldr	x1, [x0, #CI_IDLELWP]	/* x1 = curcpu()->ci_data.cpu_idlelwp */
 	str	x1, [x0, #CI_CURLWP]	/* curlwp is idlelwp */
 
+	/* get my stack from lwp */
 	ldr	x2, [x1, #L_PCB]	/* x2 = lwp_getpcb(idlelwp) */
 	add	x2, x2, #(UPAGES * PAGE_SIZE)
 	sub	sp, x2, #TF_SIZE	/* sp = pcb + USPACE - TF_SIZE */
@@ -484,8 +481,7 @@ mp_vstart:
 END(aarch64_mpstart)
 
 toomanycpus:
-	PRINTCPU()
-	PRINT("too many cpus, or MPIDR not exists in cpu_mpidr[]\r\n")
+	CPU_DPRINT("too many cpus, or MPIDR not exists in cpu_mpidr[]\n")
 1:	wfi
 	b	1b
 
@@ -500,50 +496,58 @@ END(aarch64_mpstart)
 
 #endif /* MULTIPROCESSOR */
 
+
 /*
  * xprint - print strings pointed by $PC(LR)
  *          and return to the end of string.
+ *          "\n" will be replaced "\r\n"
  * e.g.)
- *    bl        xprint        <- call
- *    .ascii    "Hello\r\n\0" <- wouldn't return here
+ *    bl        xprint      <- call
+ *    .ascii    "Hello\n\0" <- wouldn't return here
  *    .align    2
- *    nop                     <- return to here
+ *    nop                   <- return to here
  *
- * x0 is preserved despite being caller saved.
  */
-ENTRY_NP(xprint)
-	stp	x0, x19, [sp, #-16]!
+xprint:
+	mov	x0, lr
+	bl	_C_LABEL(uartputs)
+	add	x0, x0, #3
+	bic	lr, x0, #3
+	ret
 
-	mov	x19, lr
+/*
+ * uartputs(str) - print strings with replacing "\n" to "\r\n".
+ * returns the address after the end of the string. (x0 = next of '\0')
+ */
+ENTRY_NP(uartputs)
+	stp	x19, lr, [sp, #-16]!
+	mov	x19, x0
 	ldrb	w0, [x19], #1
-	cbz	w0, 2f
-
+	cbz	w0, 9f
 1:
+	cmp	x0, #'\n'
+	bne	2f
+	mov	x0, #'\r'
 	bl	uartputc
-	ldrb	w0, [x19], #1
-	cbnz	w0, 1b
-
+	mov	x0, #'\n'
 2:
-	add	x19, x19, #3
-	bic	lr, x19, #3
-	ldp	x0, x19, [sp], #16
-	ret
-END(xprint)
-
-ENTRY_NP(uartputs)
-	mov	x11, x0
-	ldrb	w0, [x11], #1
-	cbz	w0, 9f
-1:	bl	uartputc
-	ldrb	w0, [x11], #1
+	bl	uartputc
+	ldrb	w0, [x19], #1
 	cbnz	w0, 1b
 9:
-	mov	x0, x11
+	mov	x0, x19
+	ldp	x19, lr, [sp], #16
 	ret
 END(uartputs)
 
-/* x0 is preserved despite being caller saved. */
-ENTRY_NP(_print_x0)
+
+/*
+ * print x0 in 16 widths hexadecimal.
+ *
+ * x0 is preserved despite being caller saved.
+ * other caller saved registers will be broken.
+ */
+_print_x0:
 	stp	x0, lr, [sp, #-16]!
 	stp	x20, x21, [sp, #-16]!
 
@@ -563,392 +567,320 @@ ENTRY_NP(_print_x0)
 	ldp	x20, x21, [sp], #16
 	ldp	x0, lr, [sp], #16
 	ret
-END(_print_x0)
-
-/* Preserve x{0,1,2} despite them being caller saved */
-ENTRY_NP(print_x0)
-	stp	x0, lr, [sp, #-16]!
-	stp	x1, x2, [sp, #-16]!
-	bl	_print_x0
-	PRINT("\r\n")
-	ldp	x1, x2, [sp], #16
-	ldp	x0, lr, [sp], #16
-	ret
-END(print_x0)
 
-/* Preserve x{0,1,2} despite them being caller saved */
-ENTRY_NP(printn_x1)
-	stp	x0, lr, [sp, #-16]!
-	stp	x1, x2, [sp, #-16]!
-	mov	x0, x1
-	bl	_print_x0
-	ldp	x1, x2, [sp], #16
-	ldp	x0, lr, [sp], #16
-	ret
-END(printn_x1)
-
-/* Preserve x{0,1,2} despite them being caller saved */
-ENTRY_NP(print_x2)
-	stp	x0, lr, [sp, #-16]!
-	mov	x0, x2
-	bl	_print_x0
-	PRINT("\r\n")
-	ldp	x0, lr, [sp], #16
-	ret
-END(print_x2)
-
-ENTRY_NP(arm_boot_l0pt_init)
-	stp	x0, lr, [sp, #-16]!
-
-	/* Clean the page table */
-	ADDR	x0, mmutables_start
-	ADDR	x1, mmutables_end
-1:
-	stp	xzr, xzr, [x0], #16
-	stp	xzr, xzr, [x0], #16
-	stp	xzr, xzr, [x0], #16
-	stp	xzr, xzr, [x0], #16
-	cmp	x0, x1
-	b.lo	1b
-
-	VERBOSE("Creating VA=PA tables\r\n")
-
-	/* VA=PA table, link L0->L1 */
-	ADDR	x0, ttbr0_l0table
-	mov	x1, #0
-	ADDR	x2, ttbr0_l1table
-	bl	l0_settable
-
-	/* VA=PA L1 blocks */
-	ADDR	x0, ttbr0_l1table
-	mov	x1, #0			/* VA */
-	mov	x2, #0			/* PA */
-	mov	x3, #LX_BLKPAG_ATTR_DEVICE_MEM
-	mov	x4, #4			/* 4GB = whole 32bit */
-	bl	l1_setblocks
-
-	VERBOSE("Creating KSEG tables\r\n")
-
-	/* KSEG table, link L0->L1 */
-	ADDR	x0, ttbr1_l0table
-	mov	x1, #AARCH64_KSEG_START
-	ADDR	x2, ttbr1_l1table_kseg
-	bl	l0_settable
-
-	/* KSEG L1 blocks */
-	ADDR	x0, ttbr1_l1table_kseg
-	mov	x1, #AARCH64_KSEG_START
-	mov	x2, #0
-	mov	x3, #LX_BLKPAG_ATTR_NORMAL_WB
-	orr	x3, x3, #(LX_BLKPAG_PXN|LX_BLKPAG_UXN)
-	mov	x4, #Ln_ENTRIES		/* whole l1 table */
-	bl	l1_setblocks
-
-	VERBOSE("Creating KVA=PA tables\r\n")
-
-	/* KVA=PA table, link L0->L1 */
-	ADDR	x0, ttbr1_l0table
-	mov	x1, #VM_MIN_KERNEL_ADDRESS
-	ADDR	x2, ttbr1_l1table_kva
-	bl	l0_settable
-
-	/* KVA=PA table, link L1->L2 */
-	ADDR	x0, ttbr1_l1table_kva
-	mov	x1, #VM_MIN_KERNEL_ADDRESS
-	ADDR	x2, ttbr1_l2table_kva
-	bl	l1_settable
-
-	/* KVA=PA L2 blocks */
-	ADDR	x0, ttbr1_l2table_kva
-	adr	x2, start		/* physical addr. before MMU */
-	and	x2, x2, #L2_BLK_OA	/* L2 block size aligned (2MB) */
-	mov	x1, #VM_MIN_KERNEL_ADDRESS
-	mov	x3, #(LX_BLKPAG_ATTR_NORMAL_WB|LX_BLKPAG_UXN)
-
-	/* kernelsize = _end - start */
-	ldr	x1, =start
-	ldr	x4, =_end
-	sub	x4, x4, x1
-
-	/* round up kernelsize to L2_SIZE (2MB) */
-	add	x4, x4, #L2_SIZE
-	sub	x4, x4, #1
-	lsr	x4, x4, #L2_SHIFT
-	bl	l2_setblocks
-
-	/* map READONLY from VM_MIN_KERNEL_ADDRESS to __data_start */
-	VERBOSE("Set kernel text/rodata READONLY\r\n")
-	ldr	x3, =__data_start
-	ands	x0, x3, #(L2_SIZE - 1)
-	beq	1f
-	ldr	x1, =_erodata
-	and	x1, x1, #L2_ADDR_BITS	/* _erodata & L2_ADDR_BIT */
-	and	x0, x3, #L2_ADDR_BITS	/* __data_start & L2_ADDR_BIT */
-	cmp	x0, x1
-	bne	1f
-	/* __data_start and _erodata are in same L2 block */
-	PRINT("Warning: data section not aligned on size of L2 block\r\n")
-1:
-	/* x3 = l2pde_index(__data_start) */
-	and	x3, x3, #L2_ADDR_BITS
-	lsr	x3, x3, #L2_SHIFT
-
-	/* x2 = l2pde_inex(VM_MIN_KERNEL_ADDRESS) */
-	mov	x2, #VM_MIN_KERNEL_ADDRESS
-	and	x2, x2, #L2_ADDR_BITS
-	lsr	x2, x2, #L2_SHIFT
-
-	ADDR	x1, ttbr1_l2table_kva
-	b	9f
-1:
-	ldr	x0, [x1, x2, lsl #3]	/* x0 = l2table[x2] */
-	and	x0, x0, #~LX_BLKPAG_AP
-	orr	x0, x0, #LX_BLKPAG_AP_RO
-	str	x0, [x1, x2, lsl #3]	/* l2table[x2] = x0 */
-	add	x2, x2, #1
-9:
-	cmp	x2, x3
-	blo	1b
-
-
-	/* add eXecute Never bit from _rodata to _end */
-	VERBOSE("Set kernel rodata/data non-Executable\r\n")
-	ldr	x0, =__rodata_start
-	ands	x0, x0, #(L2_SIZE - 1)
-	beq	1f
-	PRINT("Warning: rodata section not aligned on size of L2 block\r\n")
-1:
-	/* x2 = l2pde_index(__rodata_start) */
-	ldr	x2, =__rodata_start
-	mov	x0, #(L2_SIZE - 1)
-	add	x2, x2, x0		/* round block */
-	and	x2, x2, #L2_ADDR_BITS
-	lsr	x2, x2, #L2_SHIFT
-
-	/* x3 = l2pde_inex(_end) */
-	ldr	x3, =_end
-	and	x3, x3, #L2_ADDR_BITS
-	lsr	x3, x3, #L2_SHIFT
+/*
+ * print x0 in decimal.
+ *
+ * x0 is preserved despite being caller saved.
+ * other caller saved registers will be broken.
+ */
+_printdec_x0:
+	stp	x0, lr, [sp, #-(16+32)]!
+	add	x8, sp, #(16+32)
 
-	ADDR	x1, ttbr1_l2table_kva
-	b	9f
+	strb	wzr, [x8, #-1]!
 1:
-	ldr	x0, [x1, x2, lsl #3]	/* x0 = l2table[x2] */
-	orr	x0, x0, #(LX_BLKPAG_UXN|LX_BLKPAG_PXN)
-	str	x0, [x1, x2, lsl #3]	/* l2table[x2] = x0 */
-	add	x2, x2, #1
-9:
-	cmp	x2, x3			/* including the L2 block of _end[] */
-	bls	1b
+	mov	x10, #10
+	udiv	x1, x0, x10	/* x1 = x0 / 10 */
+	msub	x3, x1, x10, x0	/* x3 = x0 % 10 */
+	mov	x0, x1
 
+	add	x3, x3, #'0'
+	strb	w3, [x8, #-1]!
+	cbnz	x0, 1b
 
-	VERBOSE("Creating devmap tables\r\n")
-	/* devmap=PA table, link L1->L2 */
-	ADDR	x0, ttbr1_l1table_kva
-	ldr	x1, .L_devmap_addr
-	ADDR	x2, ttbr1_l2table_devmap
-	bl	l1_settable
+	mov	x0, x8
+	bl	uartputs
 
-	ldp	x0, lr, [sp], #16
+	ldp	x0, lr, [sp], #(16+32)
 	ret
-END(arm_boot_l0pt_init)
-
-	.align 3
-.L_devmap_addr:
-	.quad	VM_KERNEL_IO_ADDRESS
 
 /*
- *	x0 = l0table
- *	x1 = vaddr
- *	x2 = l1table
+ * print x0 in 16 widths hexadecimal with crlf.
+ *
+ * x0 is preserved despite being caller saved.
+ * other caller saved registers will be broken.
  */
-ENTRY_NP(l0_settable)
+print_x0:
 	stp	x0, lr, [sp, #-16]!
-
-	and	x2, x2, #~PAGE_MASK
-	mov	x8, #L0_TABLE
-	orr	x2, x2, x8
-	and	x1, x1, #L0_ADDR_BITS
-	lsr	x1, x1, #L0_SHIFT
-	str	x2, [x0, x1, lsl #3]	/* l0table[x1] = x2 */
-
-#ifdef DEBUG_MMU
-	PRINT("L0 entry[")
-	bl printn_x1
-	PRINT("]=")
-	bl print_x2
-#endif
-
+	bl	_print_x0
+	PRINT("\n")
 	ldp	x0, lr, [sp], #16
 	ret
-END(l0_settable)
 
+#ifdef DEBUG_MMU
 /*
- *	x0 = l1table
- *	x1 = vaddr
- *	x2 = paddr
- *	x3 = attr
- *	x4 = N entries
+ * tinyprintf() supports only maximum 7 '%x', '%d' and '%s' formats.
+ * width and any modifiers are ignored. '\n' will be replaced to '\r\n'.
+ *
+ * '%x' will be always expanded 16 widths hexadicimal.
+ * e.g., tinyprintf("Hello %s %x\n", "World", 0x12345)
+ * outputs "Hello World 0000000000012345\r\n"
  */
-ENTRY_NP(l1_setblocks)
+tinyprintf:
 	stp	x0, lr, [sp, #-16]!
 	stp	x19, x20, [sp, #-16]!
-	stp	x21, x22, [sp, #-16]!
+	stp	x7, x8, [sp, #-16]!
+	stp	x5, x6, [sp, #-16]!
+	stp	x3, x4, [sp, #-16]!
+	stp	x1, x2, [sp, #-16]!
 
-	mov	x19, x0			/* l1table */
-	mov	x22, x4			/* N entries */
+	mov	x20, xzr
+	mov	x19, x0
+	ldrb	w0, [x19], #1
+	cbz	w0, tinyprintf_done
 
-	and	x21, x2, #L1_ADDR_BITS	/* PA[38:30] */
-	mov	x9, #L1_BLOCK
-	orr	x21, x21, x9
-	orr	x21, x21, x3		/* add in attr */
-	mov	x9, #(LX_BLKPAG_AF|LX_BLKPAG_AP_RW)
-	orr	x21, x21, x9
-#ifdef MULTIPROCESSOR
-	orr	x21, x21, #LX_BLKPAG_SH_IS
-#endif
-	and	x20, x1, #L1_ADDR_BITS	/* VA[38:30] */
-	lsr	x20, x20, #L1_SHIFT
+tinyprintf_loop:
+	cmp	x0, #'\n'
+	bne	1f
+	/* '\n' -> '\r', '\n' */
+	mov	x0, #'\r'
+	bl	uartputc
+	mov	x0, #'\n'
 1:
-	str	x21, [x19, x20, lsl #3]	/* l1table[x20] = x21 */
 
-#ifdef DEBUG_MMU
-	PRINT("L1 entry[")
-	mov	x1, x20
-	bl	printn_x1
-	PRINT("]=")
-	mov	x2, x21
-	bl	print_x2
-#endif
-	mov	x9, #L1_SIZE
-	add	x21, x21, x9
+	cmp	x0, #'%'
+	bne	tinyprintf_putc
+	cmp	x20, #8
+	bcs	tinyprintf_putc
+
+tinyprintf_fetch_fmt:
+	ldrb	w9, [x19], #1
+	cbz	w9, tinyprintf_done
+
+	/* width and modifier are ignored */
+	cmp	x9, #'h'
+	beq	tinyprintf_fetch_fmt
+	cmp	x9, #'l'
+	beq	tinyprintf_fetch_fmt
+	cmp	x9, #'j'
+	beq	tinyprintf_fetch_fmt
+	cmp	x9, #'t'
+	beq	tinyprintf_fetch_fmt
+	cmp	x9, #'z'
+	beq	tinyprintf_fetch_fmt
+	cmp	x9, #'0'
+	bcc	1f
+	cmp	x9, #'9'
+	bls	tinyprintf_fetch_fmt
+1:
+	ldr	x0, [sp, x20, lsl #3]	/* get Nth argument */
 	add	x20, x20, #1
-	subs	x22, x22, #1
-	bne	1b
 
-	ldp	x21, x22, [sp], #16
+	cmp	x9, #'x'
+	bne	5f
+	/* "%x" format */
+	bl	_print_x0
+	b	tinyprintf_next
+5:
+	cmp	x9, #'d'
+	bne	5f
+	/* "%d" format */
+	bl	_printdec_x0
+	b	tinyprintf_next
+5:
+	cmp	x9, #'s'
+	bne	5f
+	/* "%s" format */
+	bl	_C_LABEL(uartputs)
+	b	tinyprintf_next
+5:
+
+tinyprintf_putc:
+	bl	uartputc
+tinyprintf_next:
+	ldrb	w0, [x19], #1
+	cbnz	w0, tinyprintf_loop
+
+tinyprintf_done:
+	mov	x0, x19
+
+	ldp	x1, x2, [sp], #16
+	ldp	x3, x4, [sp], #16
+	ldp	x5, x6, [sp], #16
+	ldp	x7, x8, [sp], #16
 	ldp	x19, x20, [sp], #16
 	ldp	x0, lr, [sp], #16
 	ret
-END(l1_setblocks)
-
-/*
- *	x0 = l1table
- *	x1 = vaddr
- *	x2 = l2table
- */
-ENTRY_NP(l1_settable)
-	stp	x0, lr, [sp, #-16]!
+#endif /* defined(DEBUG_LOCORE) || defined(DEBUG_MMU) */
 
-	and	x2, x2, #~PAGE_MASK
-	mov	x8, #L1_TABLE
-	orr	x2, x2, x8
-	and	x1, x1, #L1_ADDR_BITS
-	lsr	x1, x1, #L1_SHIFT
-	str	x2, [x0, x1, lsl #3]	/* l1table[x1] = x2 */
 
-#ifdef DEBUG_MMU
-	PRINT("L1 entry[")
-	bl printn_x1
-	PRINT("]=")
-	bl print_x2
-#endif
+save_ttbrs:
+	/* save ttbr[01]_el1 for AP */
+	mrs	x0, ttbr0_el1
+	mrs	x1, ttbr1_el1
+	ADDR	x2, ttbr_save
+	stp	x0, x1, [x2]
+	ret
 
-	ldp	x0, lr, [sp], #16
+load_ttbrs:
+	/* load ttbr[01]_el1 */
+	ADDR	x2, ttbr_save
+	ldp	x0, x1, [x2]
+	msr	ttbr0_el1, x0
+	msr	ttbr1_el1, x1
 	ret
-END(l1_settable)
 
-/*
- *	x0 = l2table
- *	x1 = vaddr
- *	x2 = paddr
- *	x3 = attr
- *	x4 = N entries
- */
-ENTRY_NP(l2_setblocks)
-	stp	x0, lr, [sp, #-16]!
-	stp	x19, x20, [sp, #-16]!
-	stp	x21, x22, [sp, #-16]!
 
-	mov	x19, x0			/* l1table */
-	mov	x22, x4			/* N entries */
+init_mmutable:
+	stp	x26, lr, [sp, #-16]!
 
-	and	x21, x2, #L2_BLOCK_MASK
-	mov	x9, #L2_BLOCK
-	orr	x21, x21, x9
-	orr	x21, x21, x3		/* Add attr bits */
-	mov	x9, #(LX_BLKPAG_AF|LX_BLKPAG_AP_RW)
-	orr	x21, x21, x9
-#ifdef MULTIPROCESSOR
-	orr	x21, x21, #LX_BLKPAG_SH_IS
-#endif
-	and	x20, x1, #L2_ADDR_BITS
-	lsr	x20, x20, #L2_SHIFT
-1:
-	str	x21, [x19, x20, lsl #3]	/* l2table[x20] = x21 */
+	/* first allocated page must be kernel l0pt = ARM_BOOTSTRAP_LxPT */
+	bl	bootpage_alloc
+	cbz	x0, init_mmutable_error
+	msr	ttbr1_el1, x0
+
+	bl	bootpage_alloc
+	cbz	x0, init_mmutable_error
+	msr	ttbr0_el1, x0
 
 #ifdef DEBUG_MMU
-	PRINT("L2 entry[")
-	mov	x1, x20
-	bl	printn_x1
-	PRINT("]=")
-	mov	x2, x21
-	bl	print_x2
+	adr	x26, tinyprintf
+#else
+	mov	x26, xzr
 #endif
-	mov	x9, #L2_SIZE
-	add	x21, x21, x9
-	add	x20, x20, #1
-	subs	x22, x22, #1
-	bne	1b
-
-	ldp	x21, x22, [sp], #16
-	ldp	x19, x20, [sp], #16
-	ldp	x0, lr, [sp], #16
-	ret
-END(l2_setblocks)
-
-ENTRY_NP(init_sysregs)
-	stp	x0, lr, [sp, #-16]!
-
-	/* Disable debug event */
-	msr	mdscr_el1, xzr
 
-	/* Clear context id register */
-	msr	contextidr_el1, xzr
+	/*
+	 * int
+	 * pmapboot_enter(
+	 *     x0: vaddr_t va,
+	 *     x1: paddr_t pa,
+	 *     x2: psize_t size,
+	 *     x3: psize_t blocksize,  // L[123]_SIZE
+	 *     x4: pt_entry_t attr,    // pte attributes. LX_BLKPAG_*
+	 *     x5: flags,
+	 *     x6: pd_entry_t *(*physpage_allocator)(void),
+	 *     x7: void (*pr)(const char *, ...)
+	 *  );
+	 */
 
-	/* No trap system register access, and Trap FP/SIMD access */
-	msr	cpacr_el1, xzr
+#ifdef CONSADDR
+	VPRINT("Creating VA=PA tables for CONSADDR\n")
+	mov	x7, x26				/* pr func */
+	adr	x6, bootpage_alloc		/* allocator */
+	mov	x5, xzr				/* flags = 0 */
+	mov	x4, #LX_BLKPAG_ATTR_DEVICE_MEM|LX_BLKPAG_AP_RW	/* attr */
+	mov	x3, #L2_SIZE			/* blocksize */
+	mov	x2, #L2_SIZE			/* size */
+	ldr	x1, =CONSADDR			/* pa */
+	mov	x0, x1				/* va */
+	bl	pmapboot_enter
+	cbnz	x0, init_mmutable_error
+#elif defined(EARLYCONS)
+	/* CONSADDR is unknown, but need to map UART */
+	VPRINT("Creating VA=PA tables (0x00000000-0xffffffff)\n")
+	mov	x7, x26				/* pr func */
+	adr	x6, bootpage_alloc		/* allocator */
+	mov	x5, xzr				/* flags = 0 */
+	mov	x4, #LX_BLKPAG_ATTR_DEVICE_MEM|LX_BLKPAG_AP_RW	/* attr */
+	mov	x3, #L2_SIZE			/* blocksize */
+	mov	x2, #(1024*1024*1024*4)		/* size */
+	mov	x1, xzr				/* pa */
+	mov	x0, xzr				/* va */
+	bl	pmapboot_enter
+	cbnz	x0, init_mmutable_error
+#endif
+
+	/* identity mapping for kernel image */
+	VPRINT("Creating VA=PA tables for kernel image\n")
+	mov	x7, x26				/* pr func */
+	adr	x6, bootpage_alloc		/* allocator */
+	mov	x5, xzr				/* flags = 0 */
+	mov	x4, #LX_BLKPAG_ATTR_NORMAL_NC|LX_BLKPAG_AP_RW	/* attr */
+	mov	x3, #L2_SIZE			/* blocksize */
+	adr	x0, start			/* va = start */
+	ADDR	x2, _end
+	sub	x2, x2, x0			/* size = _end - start */
+	add	x2, x2, #BOOTPAGE_ALLOC_MAX	/* for bootpage_alloc() */
+	mov	x1, x0				/* pa */
+	bl	pmapboot_enter
+	cbnz	x0, init_mmutable_error
+
+#ifdef FDT
+	ADDR	x8, _C_LABEL(fdt_addr_r)
+	ldr	x8, [x8]
+
+	VPRINT("Creating VA=PA tables for FDT\n")
+	mov	x7, x26				/* pr func */
+	adr	x6, bootpage_alloc		/* allocator */
+	mov	x5, xzr				/* flags = 0 */
+	mov	x4, #LX_BLKPAG_ATTR_NORMAL_NC|LX_BLKPAG_AP_RW	/* attr */
+	mov	x3, #L2_SIZE			/* blocksize */
+	mov	x2, #L2_SIZE			/* size */
+	mov	x1, x8				/* pa */
+	mov	x0, x8				/* va */
+	bl	pmapboot_enter
+	cbnz	x0, init_mmutable_error
+#endif
+
+	VPRINT("Creating KVA=PA tables\n")
+	mov	x7, x26				/* pr func */
+	adr	x6, bootpage_alloc		/* allocator */
+	mov	x5, xzr				/* flags = 0 */
+	mov	x4, #LX_BLKPAG_ATTR_NORMAL_WB|LX_BLKPAG_AP_RW	/* attr */
+	orr	x4, x4, #LX_BLKPAG_UXN
+	mov	x3, #L2_SIZE			/* blocksize */
+	adr	x1, start			/* va = start */
+	ADDR	x2, _end
+	sub	x2, x2, x1			/* size = _end - start */
+	mov	x0, #VM_MIN_KERNEL_ADDRESS	/* va */
+	bl	pmapboot_enter
+	cbnz	x0, init_mmutable_error
 
-	/* allow to read CNTVCT_EL0 and CNTFRQ_EL0 from EL0 */
-	mrs	x0, cntkctl_el1
-	orr	x0, x0, #CNTKCTL_EL0VCTEN
-	msr	cntkctl_el1, x0
+	VPRINT("OK\n");
+	mov	x0, xzr
+	b	init_mmutable_done
+init_mmutable_error:
+	mvn	x0, xzr
+init_mmutable_done:
+	ldp	x26, lr, [sp], #16
+	ret
 
-	/* any exception not masked */
-	msr	daif, xzr
+/* return PA of allocated page */
+ENTRY_NP(bootpage_alloc)
+	/* x2 = kernend_extra */
+	ADDR	x3, kernend_extra
+	ldr	x2, [x3]
+	/* if (kernend_extra < 0) return NULL */
+	mov	x0, xzr
+	cmp	x2, xzr
+	bmi	bootpage_alloc_done
 
-	ldp	x0, lr, [sp], #16
+	/* x0 = PA of _end[] */
+	ADDR	x1, kern_vtopdiff
+	ldr	x1, [x1]
+	ldr	x0, =ARM_BOOTSTRAP_LxPT
+	sub	x0, x0, x1
+
+	/* x0 = ARM_BOOTSTRAP_LxPT + kernend_extra */
+	add	x0, x0, x2
+
+	/* kernend_extra += PAGE_SIZE; */
+	add	x2, x2, #PAGE_SIZE
+	str	x2, [x3]
+
+	/* clear allocated page */
+	mov	x1, x0
+	add	x2, x1, #PAGE_SIZE
+1:	stp	xzr, xzr, [x1], #16
+	cmp	x1, x2
+	bcc	1b
+bootpage_alloc_done:
 	ret
-END(init_sysregs)
+END(bootpage_alloc)
+
 
-ENTRY_NP(mmu_disable)
+mmu_disable:
 	dsb	sy
 	mrs	x0, sctlr_el1
 	bic	x0, x0, SCTLR_M		/* clear MMU enable bit */
 	msr	sctlr_el1, x0
 	isb
 	ret
-END(mmu_disable)
 
-ENTRY_NP(mmu_enable)
+mmu_enable:
 	dsb	sy
 
-	ADDR	x0, ttbr0_l0table
-	msr	ttbr0_el1, x0
-	ADDR	x0, ttbr1_l0table
-	msr	ttbr1_el1, x0
-	isb
-
 	/* Invalidate all TLB */
 	dsb	ishst
 #ifdef MULTIPROCESSOR
@@ -967,10 +899,6 @@ ENTRY_NP(mmu_enable)
 	ldr	x0, tcr_setting
 	mrs	x1, id_aa64mmfr0_el1
 	bfi	x0, x1, #32, #3
-#ifdef MULTIPROCESSOR
-	ldr	x1, tcr_setting_inner_shareable
-	orr	x0, x0, x1
-#endif
 	msr	tcr_el1, x0
 
 	/*
@@ -992,7 +920,7 @@ ENTRY_NP(mmu_enable)
 	isb
 
 	ret
-END(mmu_enable)
+
 
 	.align 3
 mair_setting:
@@ -1003,6 +931,13 @@ mair_setting:
 	    __SHIFTIN(MAIR_DEVICE_nGnRnE, MAIR_ATTR3))
 
 #define VIRT_BIT	48
+
+#ifdef MULTIPROCESSOR
+#define TCR_SHAREABLE	(TCR_SH0_INNER | TCR_SH1_INNER)
+#else
+#define TCR_SHAREABLE	(TCR_SH0_NONE | TCR_SH1_NONE)
+#endif
+
 tcr_setting:
 	.quad (						\
 	    __SHIFTIN(64 - VIRT_BIT, TCR_T1SZ) |	\
@@ -1012,11 +947,7 @@ tcr_setting:
 	    TCR_ORGN0_WB_WA |				\
 	    TCR_IRGN0_WB_WA |				\
 	    TCR_ORGN1_WB_WA |				\
-	    TCR_IRGN1_WB_WA)
-#ifdef MULTIPROCESSOR
-tcr_setting_inner_shareable:
-	.quad (TCR_SH0_INNER | TCR_SH1_INNER)
-#endif
+	    TCR_IRGN1_WB_WA) | TCR_SHAREABLE
 
 
 #ifdef AARCH64_ALIGNMENT_CHECK
@@ -1071,6 +1002,22 @@ sctlr_clear:
 	    SCTLR_A |       /* Alignment check enable */ \
 	    0)
 
+.L_devmap_addr:
+	.quad	VM_KERNEL_IO_ADDRESS
+
+	.data
+
+#ifdef DEBUG_LOCORE_PRINT_LOCK
+	.align 2
+lock_level:
+	.fill	MAXCPUS, 2, -1
+lock_turn:
+	.fill	(MAXCPUS - 1), 2, -1
+#endif /* DEBUG_LOCORE_PRINT_LOCK */
+
+	.align 3
+ttbr_save:
+	.space	8 * 2
 
 	.bss
 
@@ -1081,34 +1028,12 @@ _C_LABEL(lwp0uspace):
 bootstk:
 
 #ifdef MULTIPROCESSOR
-	.space	BOOT_STACKSIZE * (MAXCPUS - 1)
+	.space	BOOT_AP_STACKSIZE * (MAXCPUS - 1)
 bootstk_cpus:
 #endif
 
-
+	.section ".init_pagetable", "aw", %nobits
 	.align PGSHIFT
-mmutables_start:
-/*
- * PA == VA mapping using L1 1G block (whole 32bit)
- */
-ttbr0_l0table:
-	.space	PAGE_SIZE
-ttbr0_l1table:
-	.space	PAGE_SIZE
-
-/*
- * KVA    => PA mapping using L2 2MB block (kernelsize, max 2MB*512=2Gbyte)
- * DEVMAP => PA mapping using L2 2MB block (devmap size, max 2MB*512=2Gbyte)
- * KSEG   => PA mapping using L1 1GB block * 512
- */
-ttbr1_l0table:
-	.space	PAGE_SIZE
-ttbr1_l1table_kseg:
-	.space	PAGE_SIZE
-ttbr1_l1table_kva:
-	.space	PAGE_SIZE
-ttbr1_l2table_kva:
-	.space	PAGE_SIZE
-ttbr1_l2table_devmap:
-	.space	PAGE_SIZE
-mmutables_end:
+	.global ARM_BOOTSTRAP_LxPT
+ARM_BOOTSTRAP_LxPT:
+l0pt_kern:

Index: src/sys/arch/aarch64/aarch64/pmap.c
diff -u src/sys/arch/aarch64/aarch64/pmap.c:1.25 src/sys/arch/aarch64/aarch64/pmap.c:1.26
--- src/sys/arch/aarch64/aarch64/pmap.c:1.25	Thu Oct  4 09:09:29 2018
+++ src/sys/arch/aarch64/aarch64/pmap.c	Thu Oct  4 23:53:13 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap.c,v 1.25 2018/10/04 09:09:29 ryo Exp $	*/
+/*	$NetBSD: pmap.c,v 1.26 2018/10/04 23:53:13 ryo Exp $	*/
 
 /*
  * Copyright (c) 2017 Ryo Shimizu <[email protected]>
@@ -27,12 +27,13 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.25 2018/10/04 09:09:29 ryo Exp $");
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.26 2018/10/04 23:53:13 ryo Exp $");
 
 #include "opt_arm_debug.h"
 #include "opt_ddb.h"
-#include "opt_uvmhist.h"
+#include "opt_multiprocessor.h"
 #include "opt_pmap.h"
+#include "opt_uvmhist.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
@@ -260,55 +261,33 @@ pm_addr_check(struct pmap *pm, vaddr_t v
 
 static const struct pmap_devmap *pmap_devmap_table;
 
-/* XXX: for now, only support for devmap */
 static vsize_t
-_pmap_map_chunk(pd_entry_t *l2, vaddr_t va, paddr_t pa, vsize_t size,
+pmap_map_chunk(vaddr_t va, paddr_t pa, vsize_t size,
     vm_prot_t prot, u_int flags)
 {
-	pd_entry_t oldpte __debugused;
 	pt_entry_t attr;
-	vsize_t resid;
+	psize_t blocksize;
+	int rc;
 
-	oldpte = l2[l2pde_index(va)];
-	KDASSERT(!l2pde_valid(oldpte));
+	/* devmap always use L2 mapping */
+	blocksize = L2_SIZE;
 
 	attr = _pmap_pte_adjust_prot(L2_BLOCK, prot, VM_PROT_ALL, false);
 	attr = _pmap_pte_adjust_cacheflags(attr, flags | PMAP_DEV);
-#ifdef MULTIPROCESSOR
-	attr |= LX_BLKPAG_SH_IS;
-#endif
 	/* user cannot execute, and kernel follows the prot */
 	attr |= (LX_BLKPAG_UXN|LX_BLKPAG_PXN);
 	if (prot & VM_PROT_EXECUTE)
 		attr &= ~LX_BLKPAG_PXN;
 
-	resid = (size + (L2_SIZE - 1)) & ~(L2_SIZE - 1);
-	size = resid;
-
-	while (resid > 0) {
-		pt_entry_t pte;
+	rc = pmapboot_enter(va, pa, size, blocksize, attr,
+	    PMAPBOOT_ENTER_NOOVERWRITE, bootpage_alloc, NULL);
+	if (rc != 0)
+		panic("%s: pmapboot_enter failed. %lx is already mapped?\n",
+		    __func__, va);
 
-		pte = pa | attr;
+	aarch64_tlbi_by_va(va);
 
-		if (prot & VM_PROT_EXECUTE) {
-			pt_entry_t tpte;
-			/* need write permission to invalidate icache */
-			tpte = pte & ~(LX_BLKPAG_AF|LX_BLKPAG_AP);
-			tpte |= (LX_BLKPAG_AF|LX_BLKPAG_AP_RW);
-			tpte |= (LX_BLKPAG_UXN|LX_BLKPAG_PXN);
-			atomic_swap_64(&l2[l2pde_index(va)], tpte);
-			aarch64_tlbi_by_va(va);
-			cpu_icache_sync_range(va, L2_SIZE);
-		}
-		atomic_swap_64(&l2[l2pde_index(va)], pte);
-		aarch64_tlbi_by_va(va);
-
-		va += L2_SIZE;
-		pa += L2_SIZE;
-		resid -= L2_SIZE;
-	}
-
-	return size;
+	return ((va + size + blocksize - 1) & ~(blocksize - 1)) - va;
 }
 
 void
@@ -320,14 +299,11 @@ pmap_devmap_register(const struct pmap_d
 void
 pmap_devmap_bootstrap(const struct pmap_devmap *table)
 {
-	pd_entry_t *l0, *l1, *l2;
 	vaddr_t va;
 	int i;
 
 	pmap_devmap_register(table);
 
-	l0 = (void *)AARCH64_PA_TO_KVA(reg_ttbr1_el1_read());
-
 	VPRINTF("%s:\n", __func__);
 	for (i = 0; table[i].pd_size != 0; i++) {
 		VPRINTF(" devmap: pa %08lx-%08lx = va %016lx\n",
@@ -336,34 +312,16 @@ pmap_devmap_bootstrap(const struct pmap_
 		    table[i].pd_va);
 		va = table[i].pd_va;
 
+		KASSERT((VM_KERNEL_IO_ADDRESS <= va) &&
+		    (va < (VM_KERNEL_IO_ADDRESS + VM_KERNEL_IO_SIZE)));
+
 		/* update and check virtual_devmap_addr */
 		if ((virtual_devmap_addr == 0) ||
 		    (virtual_devmap_addr > va)) {
 			virtual_devmap_addr = va;
-
-			/* XXX: only one L2 table is allocated for devmap  */
-			if ((VM_MAX_KERNEL_ADDRESS - virtual_devmap_addr) >
-			    (L2_SIZE * Ln_ENTRIES)) {
-				panic("devmap va:%016lx out of range."
-				    " available devmap range is %016lx-%016lx",
-				    va,
-				    VM_MAX_KERNEL_ADDRESS -
-				    (L2_SIZE * Ln_ENTRIES),
-				    VM_MAX_KERNEL_ADDRESS);
-			}
 		}
 
-		l1 = (void *)l0pde_pa(l0[l0pde_index(va)]);
-		KASSERT(l1 != NULL);
-		l1 = (void *)AARCH64_PA_TO_KVA((paddr_t)l1);
-
-		l2 = (void *)l1pde_pa(l1[l1pde_index(va)]);
-		if (l2 == NULL)
-			panic("L2 table for devmap is not allocated");
-
-		l2 = (void *)AARCH64_PA_TO_KVA((paddr_t)l2);
-
-		_pmap_map_chunk(l2,
+		pmap_map_chunk(
 		    table[i].pd_va,
 		    table[i].pd_pa,
 		    table[i].pd_size,
@@ -2064,46 +2022,59 @@ pmap_kvattr(vaddr_t va, vm_prot_t prot)
 	return opte;
 }
 
-static void
-pmap_db_pte_print(pt_entry_t pte, int level, void (*pr)(const char *, ...))
+void
+pmap_db_pte_print(pt_entry_t pte, int level,
+    void (*pr)(const char *, ...) __printflike(1, 2))
 {
 	if (pte == 0) {
 		pr(" UNUSED\n");
+		return;
+	}
+
+	pr(" %s", (pte & LX_VALID) ? "VALID" : "**INVALID**");
+
+	if ((level == 0) ||
+	    ((level == 1) && l1pde_is_table(pte)) ||
+	    ((level == 2) && l2pde_is_table(pte))) {
+
+		/* L0/L1/L2 TABLE */
+		if ((level == 0) && ((pte & LX_TYPE) != LX_TYPE_TBL))
+			pr(" **ILLEGAL TYPE**"); /* L0 doesn't support block */
+		else
+			pr(" TABLE");
 
-	} else if (level == 0) {
-		/* L0 pde */
-		pr(", %s",
-		    l1pde_is_table(pte) ? "TABLE" : "***ILLEGAL TYPE***");
-		pr(", %s", l0pde_valid(pte) ? "VALID" : "***INVALID***");
+		pr(", PA=%lx", l0pde_pa(pte));
 
-		pr(", PA=%016lx", l0pde_pa(pte));
+		if (pte & LX_TBL_NSTABLE)
+			pr(", NSTABLE");
+		if (pte & LX_TBL_APTABLE)
+			pr(", APTABLE");
+		if (pte & LX_TBL_UXNTABLE)
+			pr(", UXNTABLE");
+		if (pte & LX_TBL_PXNTABLE)
+			pr(", PXNTABLE");
 
 	} else if (((level == 1) && l1pde_is_block(pte)) ||
 	    ((level == 2) && l2pde_is_block(pte)) ||
-		(level == 3)) {
+	    (level == 3)) {
 
+		/* L1/L2 BLOCK or L3 PAGE */
 		if (level == 3) {
-			pr(", %s",
-			    l3pte_is_page(pte) ? " PAGE" : "**ILLEGAL TYPE**");
-			pr(", %s",
-			    l3pte_valid(pte) ? "VALID" : "**INVALID**");
-		} else {
-			pr(", %s", l1pde_is_table(pte) ? "TABLE" : "BLOCK");
-			pr(", %s",
-			    l1pde_valid(pte) ? "VALID" : "**INVALID**");
-		}
+			pr(" %s", l3pte_is_page(pte) ?
+			    "PAGE" : "**ILLEGAL TYPE**");
+		} else
+			pr(" BLOCK");
 
-		pr(", PA=%016lx", l3pte_pa(pte));
+		pr(", PA=%lx", l3pte_pa(pte));
 
-		/* L[12] block, or L3 pte */
-		pr(", %s", (pte & LX_BLKPAG_UXN) ? "UXN" : "---");
-		pr(", %s", (pte & LX_BLKPAG_PXN) ? "PXN" : "---");
+		pr(", %s", (pte & LX_BLKPAG_UXN) ? "UXN" : "user-exec");
+		pr(", %s", (pte & LX_BLKPAG_PXN) ? "PXN" : "kernel-exec");
 
 		if (pte & LX_BLKPAG_CONTIG)
-			pr(",CONTIG");
+			pr(", CONTIG");
 
-		pr(", %s", (pte & LX_BLKPAG_NG) ? "NG" : "--");
-		pr(", %s", (pte & LX_BLKPAG_AF) ? "AF" : "--");
+		pr(", %s", (pte & LX_BLKPAG_NG) ? "NG" : "global");
+		pr(", %s", (pte & LX_BLKPAG_AF) ? "AF" : "*cannot-access*");
 
 		switch (pte & LX_BLKPAG_SH) {
 		case LX_BLKPAG_SH_NS:
@@ -2122,6 +2093,7 @@ pmap_db_pte_print(pt_entry_t pte, int le
 
 		pr(", %s", (pte & LX_BLKPAG_AP_RO) ? "RO" : "RW");
 		pr(", %s", (pte & LX_BLKPAG_APUSER) ? "EL0" : "EL1");
+		pr(", %s", (pte & LX_BLKPAG_NS) ? "NS" : "secure");
 
 		switch (pte & LX_BLKPAG_ATTR_MASK) {
 		case LX_BLKPAG_ATTR_NORMAL_WB:
@@ -2138,57 +2110,47 @@ pmap_db_pte_print(pt_entry_t pte, int le
 			break;
 		}
 
+		if (pte & LX_BLKPAG_OS_BOOT)
+			pr(", boot");
 		if (pte & LX_BLKPAG_OS_READ)
 			pr(", pmap_read");
 		if (pte & LX_BLKPAG_OS_WRITE)
 			pr(", pmap_write");
-		if ((pte & LX_BLKPAG_UXN) == 0)
-			pr(", user-executable");
-		if ((pte & LX_BLKPAG_PXN) == 0)
-			pr(", kernel-executable");
-
+		if (pte & LX_BLKPAG_OS_WIRED)
+			pr(", pmap_wired");
 	} else {
-		/* L1 and L2 pde */
-		pr(", %s", l1pde_is_table(pte) ? "TABLE" : "BLOCK");
-		pr(", %s", l1pde_valid(pte) ? "VALID" : "**INVALID**");
-		pr(", PA=%016lx", l1pde_pa(pte));
+		pr(" **ILLEGAL TYPE**");
 	}
 	pr("\n");
 }
 
-
 void
 pmap_db_pteinfo(vaddr_t va, void (*pr)(const char *, ...))
 {
-	struct pmap *pm;
 	struct vm_page *pg;
 	bool user;
-
-
-	if (VM_MAXUSER_ADDRESS > va) {
-		pm = curlwp->l_proc->p_vmspace->vm_map.pmap;
-		user = true;
-	} else {
-		pm = pmap_kernel();
-		user = false;
-	}
-
-
 	pd_entry_t *l0, *l1, *l2, *l3;
 	pd_entry_t pde;
 	pt_entry_t pte;
 	struct vm_page_md *md;
+	uint64_t ttbr;
 	paddr_t pa;
 	unsigned int idx;
 
+	if (va & TTBR_SEL_VA) {
+		user = false;
+		ttbr = reg_ttbr1_el1_read();
+	} else {
+		user = true;
+		ttbr = reg_ttbr0_el1_read();
+	}
+	pa = ttbr & TTBR_BADDR;
+	l0 = (pd_entry_t *)AARCH64_PA_TO_KVA(pa);
+
 	/*
 	 * traverse L0 -> L1 -> L2 -> L3 table
 	 */
-
-	l0 = pm->pm_l0table;
-
-	pr("TTBR%d=%016llx (%016llx)", user ? 0 : 1,
-	    pm->pm_l0table_pa, l0);
+	pr("TTBR%d=%016llx, pa=%016lx, va=%016lx", user ? 0 : 1, ttbr, l0);
 	pr(", input-va=%016llx,"
 	    " L0-index=%d, L1-index=%d, L2-index=%d, L3-index=%d\n",
 	    va,
@@ -2200,37 +2162,37 @@ pmap_db_pteinfo(vaddr_t va, void (*pr)(c
 	idx = l0pde_index(va);
 	pde = l0[idx];
 
-	pr("L0[%3d]=%016llx", idx, pde);
+	pr("L0[%3d]=%016llx:", idx, pde);
 	pmap_db_pte_print(pde, 0, pr);
 
 	if (!l0pde_valid(pde))
 		return;
 
-	l1 = (void *)AARCH64_PA_TO_KVA(l0pde_pa(pde));
+	l1 = (pd_entry_t *)AARCH64_PA_TO_KVA(l0pde_pa(pde));
 	idx = l1pde_index(va);
 	pde = l1[idx];
 
-	pr(" L1[%3d]=%016llx", idx, pde);
+	pr(" L1[%3d]=%016llx:", idx, pde);
 	pmap_db_pte_print(pde, 1, pr);
 
 	if (!l1pde_valid(pde) || l1pde_is_block(pde))
 		return;
 
-	l2 = (void *)AARCH64_PA_TO_KVA(l1pde_pa(pde));
+	l2 = (pd_entry_t *)AARCH64_PA_TO_KVA(l1pde_pa(pde));
 	idx = l2pde_index(va);
 	pde = l2[idx];
 
-	pr("  L2[%3d]=%016llx", idx, pde);
+	pr("  L2[%3d]=%016llx:", idx, pde);
 	pmap_db_pte_print(pde, 2, pr);
 
 	if (!l2pde_valid(pde) || l2pde_is_block(pde))
 		return;
 
-	l3 = (void *)AARCH64_PA_TO_KVA(l2pde_pa(pde));
+	l3 = (pd_entry_t *)AARCH64_PA_TO_KVA(l2pde_pa(pde));
 	idx = l3pte_index(va);
 	pte = l3[idx];
 
-	pr("   L3[%3d]=%016llx", idx, pte);
+	pr("   L3[%3d]=%016llx:", idx, pte);
 	pmap_db_pte_print(pte, 3, pr);
 
 	pa = l3pte_pa(pte);

Index: src/sys/arch/aarch64/conf/files.aarch64
diff -u src/sys/arch/aarch64/conf/files.aarch64:1.4 src/sys/arch/aarch64/conf/files.aarch64:1.5
--- src/sys/arch/aarch64/conf/files.aarch64:1.4	Fri Sep 21 16:53:20 2018
+++ src/sys/arch/aarch64/conf/files.aarch64	Thu Oct  4 23:53:14 2018
@@ -1,4 +1,4 @@
-#	$NetBSD: files.aarch64,v 1.4 2018/09/21 16:53:20 jakllsch Exp $
+#	$NetBSD: files.aarch64,v 1.5 2018/10/04 23:53:14 ryo Exp $
 
 defflag opt_cpuoptions.h	AARCH64_ALIGNMENT_CHECK
 defflag opt_cpuoptions.h	AARCH64_EL0_STACK_ALIGNMENT_CHECK
@@ -105,6 +105,7 @@ file	arch/aarch64/aarch64/vm_machdep.c
 
 # pmap
 file	arch/aarch64/aarch64/pmap.c
+file	arch/aarch64/aarch64/pmapboot.c
 file	arch/aarch64/aarch64/pmap_page.S
 #file	uvm/pmap/pmap_pvt.c
 

Index: src/sys/arch/aarch64/include/pmap.h
diff -u src/sys/arch/aarch64/include/pmap.h:1.11 src/sys/arch/aarch64/include/pmap.h:1.12
--- src/sys/arch/aarch64/include/pmap.h:1.11	Thu Oct  4 09:09:29 2018
+++ src/sys/arch/aarch64/include/pmap.h	Thu Oct  4 23:53:14 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: pmap.h,v 1.11 2018/10/04 09:09:29 ryo Exp $ */
+/* $NetBSD: pmap.h,v 1.12 2018/10/04 23:53:14 ryo Exp $ */
 
 /*-
  * Copyright (c) 2014 The NetBSD Foundation, Inc.
@@ -125,10 +125,23 @@ void pmap_bootstrap(vaddr_t, vaddr_t);
 bool pmap_fault_fixup(struct pmap *, vaddr_t, vm_prot_t, bool user);
 
 /* for ddb */
-void pmap_db_pteinfo(vaddr_t, void (*)(const char *, ...));
+void pmap_db_pteinfo(vaddr_t, void (*)(const char *, ...) __printflike(1, 2));
 pt_entry_t *kvtopte(vaddr_t);
 pt_entry_t pmap_kvattr(vaddr_t, vm_prot_t);
 
+/* locore.S */
+pd_entry_t *bootpage_alloc(void);
+
+/* pmapboot.c */
+int pmapboot_enter(vaddr_t, paddr_t, psize_t, psize_t,
+    pt_entry_t, uint64_t, pd_entry_t *(*)(void),
+    void (*pr)(const char *, ...) __printflike(1, 2));
+#define PMAPBOOT_ENTER_NOBLOCK		0x00000001
+#define PMAPBOOT_ENTER_NOOVERWRITE	0x00000002
+int pmapboot_protect(vaddr_t, vaddr_t, vm_prot_t);
+void pmap_db_pte_print(pt_entry_t, int,
+    void (*pr)(const char *, ...) __printflike(1, 2));
+
 /* Hooks for the pool allocator */
 paddr_t vtophys(vaddr_t);
 #define VTOPHYS_FAILED		((paddr_t)-1L)	/* POOL_PADDR_INVALID */

Added files:

Index: src/sys/arch/aarch64/aarch64/pmapboot.c
diff -u /dev/null src/sys/arch/aarch64/aarch64/pmapboot.c:1.1
--- /dev/null	Thu Oct  4 23:53:14 2018
+++ src/sys/arch/aarch64/aarch64/pmapboot.c	Thu Oct  4 23:53:13 2018
@@ -0,0 +1,420 @@
+/*	$NetBSD: pmapboot.c,v 1.1 2018/10/04 23:53:13 ryo Exp $	*/
+
+/*
+ * Copyright (c) 2018 Ryo Shimizu <[email protected]>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: pmapboot.c,v 1.1 2018/10/04 23:53:13 ryo Exp $");
+
+#include "opt_arm_debug.h"
+#include "opt_ddb.h"
+#include "opt_multiprocessor.h"
+#include "opt_pmap.h"
+
+#include <sys/param.h>
+#include <sys/types.h>
+
+#include <uvm/uvm.h>
+
+#include <aarch64/armreg.h>
+#include <aarch64/cpufunc.h>
+#include <aarch64/pmap.h>
+#include <aarch64/pte.h>
+
+
+#define OPTIMIZE_TLB_CONTIG
+
+
+static void
+pmapboot_protect_entry(pt_entry_t *pte, vm_prot_t clrprot)
+{
+	if (clrprot & VM_PROT_READ)
+		*pte &= ~LX_BLKPAG_AF;
+	if (clrprot & VM_PROT_WRITE) {
+		*pte &= ~LX_BLKPAG_AP;
+		*pte |= LX_BLKPAG_AP_RO;
+	}
+	if (clrprot & VM_PROT_EXECUTE)
+		*pte |= LX_BLKPAG_UXN|LX_BLKPAG_PXN;
+}
+
+/*
+ * like pmap_protect(), but does not depend on struct pmap.
+ * this works before pmap_bootstrap().
+ * the protections specified by 'clrprot' (a bitwise combination
+ * of VM_PROT_{READ,WRITE,EXECUTE}) will be dropped from a pte entry.
+ *
+ * requires KSEG (cached) mappings because the TLB entries are already cached.
+ */
+int
+pmapboot_protect(vaddr_t sva, vaddr_t eva, vm_prot_t clrprot)
+{
+	int idx;
+	vaddr_t va;
+	paddr_t pa;
+	pd_entry_t *l0, *l1, *l2, *l3;
+
+	for (va = sva; va < eva;) {
+		/*
+		 * 0x0000xxxxxxxxxxxx -> l0 = (ttbr0_el1 & TTBR_BADDR)
+		 * 0xffffxxxxxxxxxxxx -> l0 = (ttbr1_el1 & TTBR_BADDR)
+		 */
+		if (va & TTBR_SEL_VA)
+			pa = (reg_ttbr1_el1_read() & TTBR_BADDR);
+		else
+			pa = (reg_ttbr0_el1_read() & TTBR_BADDR);
+		l0 = (pd_entry_t *)AARCH64_PA_TO_KVA(pa);
+
+		idx = l0pde_index(va);
+		if (!l0pde_valid(l0[idx]))
+			return -1;
+		pa = l0pde_pa(l0[idx]);
+		l1 = (pd_entry_t *)AARCH64_PA_TO_KVA(pa);
+
+		idx = l1pde_index(va);
+		if (!l1pde_valid(l1[idx]))
+			return -1;
+		if (l1pde_is_block(l1[idx])) {
+			pmapboot_protect_entry(&l1[idx], clrprot);
+			va += L1_SIZE;
+			continue;
+		}
+		pa = l1pde_pa(l1[idx]);
+		l2 = (pd_entry_t *)AARCH64_PA_TO_KVA(pa);
+
+		idx = l2pde_index(va);
+		if (!l2pde_valid(l2[idx]))
+			return -1;
+		if (l2pde_is_block(l2[idx])) {
+			pmapboot_protect_entry(&l2[idx], clrprot);
+			va += L2_SIZE;
+			continue;
+		}
+		pa = l2pde_pa(l2[idx]);
+		l3 = (pd_entry_t *)AARCH64_PA_TO_KVA(pa);
+
+		idx = l3pte_index(va);
+		if (!l3pte_valid(l3[idx]))
+			return -1;
+		if (!l3pte_is_page(l3[idx]))
+			return -1;
+
+		pmapboot_protect_entry(&l3[idx], clrprot);
+		va += L3_SIZE;
+	}
+
+	return 0;
+}
+
+
+/*
+ * these functions will be called from locore with the MMU disabled.
+ * the load address varies depending on the bootloader, so
+ * absolute addressing cannot be used to refer to text/data/bss.
+ *
+ * the (*pr) function may be a minimal printf (when provided from locore).
+ * it supports at most 7 arguments, and only '%d', '%x', and '%s' formats.
+ */
+
+static void
+pmapboot_pte_print(pt_entry_t pte, int level,
+    void (*pr)(const char *, ...) __printflike(1, 2))
+{
+#ifdef DDB
+	pmap_db_pte_print(pte, level, pr);
+#else
+	__USE(level);
+	pr(" PA=%016lx\n", lxpde_pa(pte));
+#endif
+}
+
+#ifdef OPTIMIZE_TLB_CONTIG
+static inline bool
+tlb_contiguous_p(vaddr_t addr, vaddr_t start, vaddr_t end, vsize_t blocksize)
+{
+	/*
+	 * when using 4KB granule, 16 adjacent and aligned entries can be
+	 * unified to one TLB cache entry.
+	 * in other size of granule, not supported.
+	 */
+	if (((addr & ~((blocksize << 4) - 1)) >= start) &&
+	    ((addr | ((blocksize << 4) - 1)) <= end))
+		return true;
+	return false;
+}
+#endif /* OPTIMIZE_TLB_CONTIG */
+
+
+#ifdef VERBOSE_INIT_ARM
+#define VPRINTF(fmt, args...)	\
+	while (pr != NULL) { pr(fmt, ## args); break; }
+#define VPRINT_PTE(pte, l)	\
+	while (pr != NULL) { pmapboot_pte_print((pte), (l), pr); break; }
+#else
+#define VPRINTF(fmt, args...)	__nothing
+#define VPRINT_PTE(pte, l)	__nothing
+#endif
+
+/*
+ * pmapboot_enter() accesses pagetables by physical address.
+ * this should be called while identity mapping (VA=PA) available.
+ */
+int
+pmapboot_enter(vaddr_t va, paddr_t pa, psize_t size, psize_t blocksize,
+    pt_entry_t attr, uint64_t flags, pd_entry_t *(*physpage_allocator)(void),
+    void (*pr)(const char *, ...) __printflike(1, 2))
+{
+	int level, idx0, idx1, idx2, idx3, nskip = 0;
+	int ttbr __unused;
+	vaddr_t va_end;
+	pd_entry_t *l0, *l1, *l2, *l3, pte;
+	bool noblock, nooverwrite;
+#ifdef OPTIMIZE_TLB_CONTIG
+	vaddr_t va_start;
+	pd_entry_t *ll;
+	int i, llidx;
+#endif
+
+	switch (blocksize) {
+	case L1_SIZE:
+		level = 1;
+		break;
+	case L2_SIZE:
+		level = 2;
+		break;
+	case L3_SIZE:
+		level = 3;
+		break;
+	default:
+		return -1;
+	}
+
+	noblock = flags & PMAPBOOT_ENTER_NOBLOCK;
+	nooverwrite = flags & PMAPBOOT_ENTER_NOOVERWRITE;
+
+	VPRINTF("pmapboot_enter: va=0x%lx, pa=0x%lx, size=0x%lx, "
+	    "blocksize=0x%lx, attr=0x%016lx, "
+	    "noblock=%d, nooverwrite=%d\n",
+	    va, pa, size, blocksize, attr, noblock, nooverwrite);
+
+	va_end = (va + size - 1) & ~(blocksize - 1);
+	pa &= ~(blocksize - 1);
+	va &= ~(blocksize - 1);
+#ifdef OPTIMIZE_TLB_CONTIG
+	va_start = va;
+#endif
+
+	attr |= LX_BLKPAG_OS_BOOT;
+
+	while (va <= va_end) {
+		/*
+		 * 0x0000xxxxxxxxxxxx -> l0 = (ttbr0_el1 & TTBR_BADDR)
+		 * 0xffffxxxxxxxxxxxx -> l0 = (ttbr1_el1 & TTBR_BADDR)
+		 */
+		if (va & TTBR_SEL_VA) {
+			l0 = (pd_entry_t *)(reg_ttbr1_el1_read() & TTBR_BADDR);
+			ttbr = 1;
+		} else {
+			l0 = (pd_entry_t *)(reg_ttbr0_el1_read() & TTBR_BADDR);
+			ttbr = 0;
+		}
+
+#ifdef OPTIMIZE_TLB_CONTIG
+		ll = NULL;
+		llidx = -1;
+#endif
+
+		idx0 = l0pde_index(va);
+		if (l0[idx0] == 0) {
+			l1 = physpage_allocator();
+			if (l1 == NULL) {
+				VPRINTF("pmapboot_enter: cannot allocate L1 page\n");
+				return -1;
+			}
+
+			pte = (uint64_t)l1 | L0_TABLE;
+			l0[idx0] = pte;
+			VPRINTF("TTBR%d[%d] (new)\t= %016lx:", ttbr, idx0, pte);
+			VPRINT_PTE(pte, 0);
+		} else {
+			l1 = (uint64_t *)(l0[idx0] & LX_TBL_PA);
+		}
+
+		idx1 = l1pde_index(va);
+		if (level == 1) {
+			if (noblock)
+				goto nextblk;
+			if (nooverwrite && l1pde_valid(l1[idx1])) {
+				nskip++;
+				goto nextblk;
+			}
+
+			pte = pa |
+			    L1_BLOCK |
+			    LX_BLKPAG_AF |
+#ifdef MULTIPROCESSOR
+			    LX_BLKPAG_SH_IS |
+#endif
+			    attr;
+#ifdef OPTIMIZE_TLB_CONTIG
+			if (tlb_contiguous_p(va, va_start, va_end, blocksize))
+				pte |= LX_BLKPAG_CONTIG;
+			ll = l1;
+			llidx = idx1;
+#endif
+			l1[idx1] = pte;
+			VPRINTF("TTBR%d[%d][%d]\t= %016lx:", ttbr,
+			    idx0, idx1, pte);
+			VPRINT_PTE(pte, 1);
+			goto nextblk;
+		}
+
+		if (!l1pde_valid(l1[idx1])) {
+			l2 = physpage_allocator();
+			if (l2 == NULL) {
+				VPRINTF("pmapboot_enter: cannot allocate L2 page\n");
+				return -1;
+			}
+
+			pte = (uint64_t)l2 | L1_TABLE;
+			l1[idx1] = pte;
+			VPRINTF("TTBR%d[%d][%d] (new)\t= %016lx:", ttbr,
+			    idx0, idx1, pte);
+			VPRINT_PTE(pte, 1);
+		} else {
+			l2 = (uint64_t *)(l1[idx1] & LX_TBL_PA);
+		}
+
+		idx2 = l2pde_index(va);
+		if (level == 2) {
+			if (noblock)
+				goto nextblk;
+			if (nooverwrite && l2pde_valid(l2[idx2])) {
+				nskip++;
+				goto nextblk;
+			}
+
+			pte = pa |
+			    L2_BLOCK |
+			    LX_BLKPAG_AF |
+#ifdef MULTIPROCESSOR
+			    LX_BLKPAG_SH_IS |
+#endif
+			    attr;
+#ifdef OPTIMIZE_TLB_CONTIG
+			if (tlb_contiguous_p(va, va_start, va_end, blocksize))
+				pte |= LX_BLKPAG_CONTIG;
+			ll = l2;
+			llidx = idx2;
+#endif
+			l2[idx2] = pte;
+			VPRINTF("TTBR%d[%d][%d][%d]\t= %016lx:", ttbr,
+			    idx0, idx1, idx2, pte);
+			VPRINT_PTE(pte, 2);
+			goto nextblk;
+		}
+
+		if (!l2pde_valid(l2[idx2])) {
+			l3 = physpage_allocator();
+			if (l3 == NULL) {
+				VPRINTF("pmapboot_enter: cannot allocate L3 page\n");
+				return -1;
+			}
+
+			pte = (uint64_t)l3 | L2_TABLE;
+			l2[idx2] = pte;
+			VPRINTF("TTBR%d[%d][%d][%d] (new)\t= %016lx:", ttbr,
+			    idx0, idx1, idx2, pte);
+			VPRINT_PTE(pte, 2);
+		} else {
+			l3 = (uint64_t *)(l2[idx2] & LX_TBL_PA);
+		}
+
+		idx3 = l3pte_index(va);
+		if (noblock)
+			goto nextblk;
+		if (nooverwrite && l3pte_valid(l3[idx3])) {
+			nskip++;
+			goto nextblk;
+		}
+
+		pte = pa |
+		    L3_PAGE |
+		    LX_BLKPAG_AF |
+#ifdef MULTIPROCESSOR
+		    LX_BLKPAG_SH_IS |
+#endif
+		    attr;
+#ifdef OPTIMIZE_TLB_CONTIG
+		if (tlb_contiguous_p(va, va_start, va_end, blocksize))
+			pte |= LX_BLKPAG_CONTIG;
+		ll = l3;
+		llidx = idx3;
+#endif
+		l3[idx3] = pte;
+		VPRINTF("TTBR%d[%d][%d][%d][%d]\t= %lx:", ttbr,
+		    idx0, idx1, idx2, idx3, pte);
+		VPRINT_PTE(pte, 3);
+
+ nextblk:
+#ifdef OPTIMIZE_TLB_CONTIG
+		/*
+		 * when overwrite pte, also contiguous bit before/after
+		 * this pte should be cleared.
+		 */
+		if ((ll != NULL) && (va == va_start) &&
+		    ((llidx & 15) != 0)) {
+			/* clear CONTIG flag in front of this pte entry */
+			for (i = (llidx & ~15); i < llidx; i++) {
+				ll[i] &= ~LX_BLKPAG_CONTIG;
+			}
+		}
+		if ((ll != NULL) && (va == va_end) &&
+		    ((llidx & 15) != 15)) {
+			/* clear CONTIG flag in back of this pte entry */
+			for (i = (llidx + 1); i < ((llidx + 16) & ~15); i++) {
+				ll[i] &= ~LX_BLKPAG_CONTIG;
+			}
+		}
+#endif
+		switch (level) {
+		case 1:
+			va += L1_SIZE;
+			pa += L1_SIZE;
+			break;
+		case 2:
+			va += L2_SIZE;
+			pa += L2_SIZE;
+			break;
+		case 3:
+			va += L3_SIZE;
+			pa += L3_SIZE;
+			break;
+		}
+	}
+
+	return nskip;
+}

Reply via email to