Hi Logan, Logan Gunthorpe <log...@deltatee.com> 於 2019年1月10日 週四 上午5:07寫道: > > This patch implements sparsemem support for risc-v which helps pave the > way for memory hotplug and eventually P2P support. > > We introduce Kconfig options for virtual and physical address bits which > are used to calculate the size of the vmemmap and set the > MAX_PHYSMEM_BITS. > > The vmemmap is located directly before the VMALLOC region and sized > such that we can allocate enough pages to populate all the virtual > address space in the system (similar to the way it's done in arm64). > > During initialization, call memblocks_present() and sparse_init(), > and provide a stub for vmemmap_populate() (all of which is similar to > arm64). > > Signed-off-by: Logan Gunthorpe <log...@deltatee.com> > Reviewed-by: Palmer Dabbelt <pal...@sifive.com> > Cc: Albert Ou <a...@eecs.berkeley.edu> > Cc: Andrew Waterman <and...@sifive.com> > Cc: Olof Johansson <o...@lixom.net> > Cc: Michael Clark <michaeljcl...@mac.com> > Cc: Rob Herring <r...@kernel.org> > Cc: Zong Li <z...@andestech.com> > --- > arch/riscv/Kconfig | 23 +++++++++++++++++++++++ > arch/riscv/include/asm/pgtable.h | 21 +++++++++++++++++---- > arch/riscv/include/asm/sparsemem.h | 11 +++++++++++ > arch/riscv/kernel/setup.c | 4 +++- > arch/riscv/mm/init.c | 8 ++++++++ > 5 files changed, 62 insertions(+), 5 deletions(-) > create mode 100644 arch/riscv/include/asm/sparsemem.h > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index e0d7d61779a6..bd659327bc6b 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -54,12 +54,32 @@ config ZONE_DMA32 > bool > default y if 64BIT > > +config VA_BITS > + int > + default 32 if 32BIT > + default 39 if 64BIT > + > +config PA_BITS > + int > + default 34 if 32BIT > + default 56 if 64BIT > + > config PAGE_OFFSET > hex > default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB > default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB > default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB > > 
+config ARCH_FLATMEM_ENABLE > + def_bool y > + > +config ARCH_SPARSEMEM_ENABLE > + def_bool y > + select SPARSEMEM_VMEMMAP_ENABLE > + > +config ARCH_SELECT_MEMORY_MODEL > + def_bool ARCH_SPARSEMEM_ENABLE > + > config STACKTRACE_SUPPORT > def_bool y > > @@ -94,6 +114,9 @@ config PGTABLE_LEVELS > config HAVE_KPROBES > def_bool n > > +config HAVE_ARCH_PFN_VALID > + def_bool y > + > menu "Platform type" > > choice > diff --git a/arch/riscv/include/asm/pgtable.h > b/arch/riscv/include/asm/pgtable.h > index 16301966d65b..e1162336f5ea 100644 > --- a/arch/riscv/include/asm/pgtable.h > +++ b/arch/riscv/include/asm/pgtable.h > @@ -89,6 +89,23 @@ extern pgd_t swapper_pg_dir[]; > #define __S110 PAGE_SHARED_EXEC > #define __S111 PAGE_SHARED_EXEC > > +#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) > +#define VMALLOC_END (PAGE_OFFSET - 1) > +#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) > + > +/* > + * Roughly size the vmemmap space to be large enough to fit enough > + * struct pages to map half the virtual address space. Then > + * position vmemmap directly below the VMALLOC region. > + */ > +#define VMEMMAP_SHIFT \ > + (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) > +#define VMEMMAP_SIZE (1UL << VMEMMAP_SHIFT) > +#define VMEMMAP_END (VMALLOC_START - 1) > +#define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) > + > +#define vmemmap ((struct page *)VMEMMAP_START) > + > /* > * ZERO_PAGE is a global shared page that is always zero, > * used for zero-mapped memory areas, etc. > @@ -411,10 +428,6 @@ static inline void pgtable_cache_init(void) > /* No page table caches to initialize */ > } > > -#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) > -#define VMALLOC_END (PAGE_OFFSET - 1) > -#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) > - > /* > * Task size is 0x40000000000 for RV64 or 0xb800000 for RV32. > * Note that PGDIR_SIZE must evenly divide TASK_SIZE. 
> diff --git a/arch/riscv/include/asm/sparsemem.h > b/arch/riscv/include/asm/sparsemem.h > new file mode 100644 > index 000000000000..b58ba2d9ed6e > --- /dev/null > +++ b/arch/riscv/include/asm/sparsemem.h > @@ -0,0 +1,11 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > + > +#ifndef __ASM_SPARSEMEM_H > +#define __ASM_SPARSEMEM_H > + > +#ifdef CONFIG_SPARSEMEM > +#define MAX_PHYSMEM_BITS CONFIG_PA_BITS > +#define SECTION_SIZE_BITS 27 > +#endif /* CONFIG_SPARSEMEM */ > + > +#endif /* __ASM_SPARSEMEM_H */ > diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c > index fc8006a042eb..98f39adefb1a 100644 > --- a/arch/riscv/kernel/setup.c > +++ b/arch/riscv/kernel/setup.c > @@ -193,6 +193,9 @@ static void __init setup_bootmem(void) > PFN_PHYS(end_pfn - start_pfn), > &memblock.memory, 0); > } > + > + memblocks_present(); > + sparse_init(); > }
I just applied this patch to Linux kernel 5.2. I used a DTS with two memory nodes and a hole between them: memory@80000000 { device_type = "memory"; reg = <0x0 0x80000000 0x0 0x40000000>; }; memory@180000000 { device_type = "memory"; reg = <0x1 0x80000000 0x0 0x40000000>; }; With this setup, the kernel fails to boot. Did I miss anything? [ 0.000000] Sorting __ex_table... [ 0.000000] BUG: Bad page state in process swapper pfn:180001 [ 0.000000] page:ffffffcf05400038 refcount:0 mapcount:94371937 mapping:00000000ffffffff index:0x4000000000000000 [ 0.000000] anon [ 0.000000] flags: 0x0() [ 0.000000] raw: 0000000000000000 0000000000000000 0000000000000000 00000000ffffffff [ 0.000000] raw: 4000000000000000 ffffffcf05a00060 0000000005a00060 [ 0.000000] page dumped because: non-NULL mapping [ 0.000000] Modules linked in: [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.2.0-00001-g737d8214d9a9 #3 [ 0.000000] Call Trace: [ 0.000000] [<ffffffe00017759c>] walk_stackframe+0x0/0xa0 [ 0.000000] [<ffffffe00017769c>] show_stack+0x2a/0x34 [ 0.000000] [<ffffffe00070c53e>] dump_stack+0x62/0x7c [ 0.000000] [<ffffffe0002330ae>] bad_page+0xca/0x120 [ 0.000000] [<ffffffe00023313c>] free_pages_check_bad+0x38/0x7a [ 0.000000] [<ffffffe00023368a>] __free_pages_ok+0x496/0x4ba [ 0.000000] [<ffffffe000234a82>] __free_pages.part.4+0xe/0x22 [ 0.000000] [<ffffffe000234c9e>] __free_pages_core+0x9a/0xa6 [ 0.000000] [<ffffffe000009b0a>] memblock_free_pages+0x12/0x1a [ 0.000000] [<ffffffe00000b496>] memblock_free_all+0x144/0x1a8 [ 0.000000] [<ffffffe00000274a>] mem_init+0x28/0x36 [ 0.000000] [<ffffffe0000008a0>] start_kernel+0x1bc/0x360 [ 0.000000] [<ffffffe000000074>] clear_bss_done+0x34/0x38 [ 0.000000] Disabling lock debugging due to kernel taint [ 0.000000] BUG: Bad page state in process swapper pfn:180002 [ 0.000000] page:ffffffcf05400070 refcount:0 mapcount:94371993 mapping:00000000ffffffff index:0x4000000000000000 [ 0.000000] anon [ 0.000000] flags: 0x0() [ 0.000000] raw: 0000000000000000 0000000000000000 
0000000000000000 00000000ffffffff [ 0.000000] raw: 4000000000000000 ffffffcf05a00098 0000000005a00098 [ 0.000000] page dumped because: non-NULL mapping [ 0.000000] Modules linked in: [ 0.000000] CPU: 0 PID: 0 Comm: swapper Tainted: G B 5.2.0-00001-g737d8214d9a9 #3 [ 0.000000] Call Trace: [ 0.000000] [<ffffffe00017759c>] walk_stackframe+0x0/0xa0 [ 0.000000] [<ffffffe00017769c>] show_stack+0x2a/0x34 [ 0.000000] [<ffffffe00070c53e>] dump_stack+0x62/0x7c [ 0.000000] [<ffffffe0002330ae>] bad_page+0xca/0x120 [ 0.000000] [<ffffffe00023313c>] free_pages_check_bad+0x38/0x7a [ 0.000000] [<ffffffe00023368a>] __free_pages_ok+0x496/0x4ba [ 0.000000] [<ffffffe000234a82>] __free_pages.part.4+0xe/0x22 [ 0.000000] [<ffffffe000234c9e>] __free_pages_core+0x9a/0xa6 [ 0.000000] [<ffffffe000009b0a>] memblock_free_pages+0x12/0x1a [ 0.000000] [<ffffffe00000b496>] memblock_free_all+0x144/0x1a8 [ 0.000000] [<ffffffe00000274a>] mem_init+0x28/0x36 [ 0.000000] [<ffffffe0000008a0>] start_kernel+0x1bc/0x360 [ 0.000000] [<ffffffe000000074>] clear_bss_done+0x34/0x38 [ 0.000000] BUG: Bad page state in process swapper pfn:180003 [ 0.000000] page:ffffffcf054000a8 refcount:0 mapcount:94372049 mapping:00000000ffffffff index:0x4000000000000000 [ 0.000000] anon [ 0.000000] flags: 0x0() [ 0.000000] raw: 0000000000000000 0000000000000000 0000000000000000 00000000ffffffff [ 0.000000] raw: 4000000000000000 ffffffcf05a000d0 0000000005a000d0 [ 0.000000] page dumped because: non-NULL mapping I looked into this issue more closely. I found that the code always assigns each memblock region to node 0. Does this make sense? I am not sure whether I understand this correctly. Do you have any ideas about this? Thank you. 
:) for_each_memblock(memory, reg) { unsigned long start_pfn = memblock_region_memory_base_pfn(reg); unsigned long end_pfn = memblock_region_memory_end_pfn(reg); memblock_set_node(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), &memblock.memory, 0); ^^^ } [ 0.000000] Early memory node ranges [ 0.000000] node 0: [mem 0x0000000080200000-0x00000000bfffffff] [ 0.000000] node 0: [mem 0x0000000180000000-0x00000001bfffffff] [ 0.000000] Initmem setup node 0 [mem 0x0000000080200000-0x00000001bfffffff]