[PATCH v5 5/5] arch, mm: make kernel_page_present() always available
From: Mike Rapoport

For architectures that enable ARCH_HAS_SET_MEMORY, having the ability to verify that a page is mapped in the kernel direct map can be useful regardless of hibernation.

Add a RISC-V implementation of kernel_page_present(), update its forward declarations and stubs to be part of the set_memory API and remove the ugly ifdefery in include/linux/mm.h around the current declarations of kernel_page_present().

Signed-off-by: Mike Rapoport
Acked-by: Kirill A. Shutemov
---
arch/arm64/include/asm/cacheflush.h | 1 +
arch/arm64/mm/pageattr.c| 4 +---
arch/riscv/include/asm/set_memory.h | 1 +
arch/riscv/mm/pageattr.c| 29 +
arch/x86/include/asm/set_memory.h | 1 +
arch/x86/mm/pat/set_memory.c| 4 +---
include/linux/mm.h | 7 ---
include/linux/set_memory.h | 5 +
8 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 9384fd8fc13c..45217f21f1fe 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h
@@ -140,6 +140,7 @@ int set_memory_valid(unsigned long addr, int numpages, int enable); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +bool kernel_page_present(struct page *page); #include
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 439325532be1..92eccaf595c8 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c
@@ -186,8 +186,8 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) set_memory_valid((unsigned long)page_address(page), numpages, enable); } +#endif /* CONFIG_DEBUG_PAGEALLOC */ -#ifdef CONFIG_HIBERNATION /* * This function is used to determine if a linear map page has been marked as * not-valid. Walk the page table and check the PTE_VALID bit.
This is based @@ -234,5 +234,3 @@ bool kernel_page_present(struct page *page) ptep = pte_offset_kernel(pmdp, addr); return pte_valid(READ_ONCE(*ptep)); } -#endif /* CONFIG_HIBERNATION */ -#endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index 4c5bae7ca01c..d690b08dff2a 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -24,6 +24,7 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +bool kernel_page_present(struct page *page); #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c index 321b09d2e2ea..87ba5a68bbb8 100644 --- a/arch/riscv/mm/pageattr.c +++ b/arch/riscv/mm/pageattr.c @@ -198,3 +198,32 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) __pgprot(0), __pgprot(_PAGE_PRESENT)); } #endif + +bool kernel_page_present(struct page *page) +{ + unsigned long addr = (unsigned long)page_address(page); + pgd_t *pgd; + pud_t *pud; + p4d_t *p4d; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + return false; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return false; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return false; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return false; + + pte = pte_offset_kernel(pmd, addr); + return pte_present(*pte); +} diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 5948218f35c5..4352f08bfbb5 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -82,6 +82,7 @@ int set_pages_rw(struct page *page, int numpages); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +bool kernel_page_present(struct page *page); extern int kernel_set_to_readonly; diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index bc9be96b777f..16f878c26667 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2226,8 +2226,8 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) arch_flush_lazy_mmu_mode(); } +#endif /* CONFIG_DEBUG_PAGEALLOC */ -#ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) { unsigned int level; @@ -2239,8 +2239,6 @@ bool kernel_page_present(struct page *page) pte = lookup_address((unsigned long)page_address(page), ); return (pte_val(*pte) & _PAGE_PRESENT); } -#endif /* CONFIG_HIBERNATION */ -#endif /* CONFIG_DEBUG_PAGEALLOC */ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long
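To make the new availability concrete, a caller outside hibernation could now look roughly like the sketch below. Only kernel_page_present(), page_address() and set_direct_map_invalid_noflush() come from the kernel and this series; the wrapper function and its name are made up for illustration.

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/set_memory.h>

/*
 * Hypothetical helper: read one byte through the linear map, but only if
 * the page is actually mapped there (it may have been removed from the
 * direct map, e.g. by set_direct_map_invalid_noflush()).
 */
static bool example_peek_page(struct page *page, u8 *out)
{
	if (!kernel_page_present(page))
		return false;

	*out = *(u8 *)page_address(page);
	return true;
}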
[PATCH v5 4/5] arch, mm: restore dependency of __kernel_map_pages() on DEBUG_PAGEALLOC
From: Mike Rapoport

The design of DEBUG_PAGEALLOC presumes that __kernel_map_pages() must never fail. With this assumption it wouldn't be safe to allow general usage of this function.

Moreover, some architectures that implement __kernel_map_pages() have this function guarded by #ifdef DEBUG_PAGEALLOC and some refuse to map/unmap pages when page allocation debugging is disabled at runtime.

As all the users of __kernel_map_pages() were converted to use debug_pagealloc_map_pages(), it is safe to make it available only when DEBUG_PAGEALLOC is set.

Signed-off-by: Mike Rapoport
Acked-by: David Hildenbrand
Acked-by: Kirill A. Shutemov
---
arch/Kconfig | 3 +++
arch/arm64/Kconfig | 4 +---
arch/arm64/mm/pageattr.c | 8 ++--
arch/powerpc/Kconfig | 5 +
arch/riscv/Kconfig | 4 +---
arch/riscv/include/asm/pgtable.h | 2 --
arch/riscv/mm/pageattr.c | 2 ++
arch/s390/Kconfig| 4 +---
arch/sparc/Kconfig | 4 +---
arch/x86/Kconfig | 4 +---
arch/x86/mm/pat/set_memory.c | 2 ++
include/linux/mm.h | 10 +++---
12 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig index 56b6ccc0e32d..56d4752b6db6 100644 --- a/arch/Kconfig +++ b/arch/Kconfig
@@ -1028,6 +1028,9 @@ config HAVE_STATIC_CALL_INLINE bool depends on HAVE_STATIC_CALL +config ARCH_SUPPORTS_DEBUG_PAGEALLOC + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1d466addb078..a932810cfd90 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig
@@ -71,6 +71,7 @@ config ARM64 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_USE_SYM_ANNOTATIONS + select ARCH_SUPPORTS_DEBUG_PAGEALLOC select ARCH_SUPPORTS_MEMORY_FAILURE select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK select ARCH_SUPPORTS_ATOMIC_RMW
@@ -1025,9 +1026,6 @@ config HOLES_IN_ZONE source "kernel/Kconfig.hz" -config ARCH_SUPPORTS_DEBUG_PAGEALLOC - def_bool y - config ARCH_SPARSEMEM_ENABLE def_bool y select SPARSEMEM_VMEMMAP_ENABLE
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 1b94f5b82654..439325532be1 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c
@@ -155,7 +155,7 @@ int set_direct_map_invalid_noflush(struct page *page) .clear_mask = __pgprot(PTE_VALID), }; - if (!rodata_full) + if (!debug_pagealloc_enabled() && !rodata_full) return 0; return apply_to_page_range(_mm,
@@ -170,7 +170,7 @@ int set_direct_map_default_noflush(struct page *page) .clear_mask = __pgprot(PTE_RDONLY), }; - if (!rodata_full) + if (!debug_pagealloc_enabled() && !rodata_full) return 0; return apply_to_page_range(_mm,
@@ -178,6 +178,7 @@ int set_direct_map_default_noflush(struct page *page) PAGE_SIZE, change_page_range, ); } +#ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (!debug_pagealloc_enabled() && !rodata_full)
@@ -186,6 +187,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) set_memory_valid((unsigned long)page_address(page), numpages, enable); } +#ifdef CONFIG_HIBERNATION /* * This function is used to determine if a linear map page has been marked as * not-valid. Walk the page table and check the PTE_VALID bit.
This is based @@ -232,3 +234,5 @@ bool kernel_page_present(struct page *page) ptep = pte_offset_kernel(pmdp, addr); return pte_valid(READ_ONCE(*ptep)); } +#endif /* CONFIG_HIBERNATION */ +#endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e9f13fe08492..ad8a83f3ddca 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -146,6 +146,7 @@ config PPC select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOCif PPC32 || PPC_BOOK3S_64 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS @@ -355,10 +356,6 @@ config PPC_OF_PLATFORM_PCI depends on PCI depends on PPC64 # not supported on 32 bits yet -config ARCH_SUPPORTS_DEBUG_PAGEALLOC - depends on PPC32 || PPC_BOOK3S_64 - def_bool y - config ARCH_SUPPORTS_UPROBES def_bool y diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 44377fd7860e..9283c6f9ae2a 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -14,6 +14,7 @@ config RISCV def_bool y
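To make the end state of the series easier to see, after patches 1 and 4 the relevant guards in include/linux/mm.h look roughly like this (paraphrased from the series, not copied verbatim):

#ifdef CONFIG_DEBUG_PAGEALLOC
extern void __kernel_map_pages(struct page *page, int numpages, int enable);

static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
{
	if (debug_pagealloc_enabled_static())
		__kernel_map_pages(page, numpages, 1);
}

static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
{
	if (debug_pagealloc_enabled_static())
		__kernel_map_pages(page, numpages, 0);
}
#else	/* CONFIG_DEBUG_PAGEALLOC */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {}
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {}
#endif	/* CONFIG_DEBUG_PAGEALLOC */

In other words, __kernel_map_pages() is only reachable when the architecture selects ARCH_SUPPORTS_DEBUG_PAGEALLOC and CONFIG_DEBUG_PAGEALLOC is enabled, and generic code only reaches it through the debug_pagealloc_{map,unmap}_pages() wrappers.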
[PATCH v5 3/5] PM: hibernate: make direct map manipulations more explicit
From: Mike Rapoport

When DEBUG_PAGEALLOC or ARCH_HAS_SET_DIRECT_MAP is enabled, a page may not be present in the direct map and has to be explicitly mapped before it can be copied.

Introduce hibernate_map_page() and hibernate_unmap_page() that will explicitly use set_direct_map_{default,invalid}_noflush() for the ARCH_HAS_SET_DIRECT_MAP case and debug_pagealloc_{map,unmap}_pages() for the DEBUG_PAGEALLOC case.

The remapping of the pages in safe_copy_page() presumes that it only changes protection bits in an existing PTE and so it is safe to ignore the return value of set_direct_map_{default,invalid}_noflush(). Still, add a pr_warn() so that future changes in the set_memory APIs will not silently break hibernation.

Signed-off-by: Mike Rapoport
Acked-by: Rafael J. Wysocki
Reviewed-by: David Hildenbrand
Acked-by: Kirill A. Shutemov
Acked-by: Vlastimil Babka
---
include/linux/mm.h | 12
kernel/power/snapshot.c | 38 --
2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h index bb8c70178f4e..e198b938f5c5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h
@@ -2927,16 +2927,6 @@ static inline bool debug_pagealloc_enabled_static(void) #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) extern void __kernel_map_pages(struct page *page, int numpages, int enable); -/* - * When called in DEBUG_PAGEALLOC context, the call should most likely be - * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static() - */ -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) -{ - __kernel_map_pages(page, numpages, enable); -} - static inline void debug_pagealloc_map_pages(struct page *page, int numpages) { if (debug_pagealloc_enabled_static())
@@ -2953,8 +2943,6 @@ static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ #else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) {} static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} #ifdef CONFIG_HIBERNATION
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 46b1804c1ddf..d848377dd8dc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c
@@ -76,6 +76,40 @@ static inline void hibernate_restore_protect_page(void *page_address) {} static inline void hibernate_restore_unprotect_page(void *page_address) {} #endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */ + +/* + * The calls to set_direct_map_*() should not fail because remapping a page + * here means that we only update protection bits in an existing PTE. + * It is still worth to have a warning here if something changes and this + * will no longer be the case.
+ */ +static inline void hibernate_map_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + int ret = set_direct_map_default_noflush(page); + + if (ret) + pr_warn_once("Failed to remap page\n"); + } else { + debug_pagealloc_map_pages(page, 1); + } +} + +static inline void hibernate_unmap_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + unsigned long addr = (unsigned long)page_address(page); + int ret = set_direct_map_invalid_noflush(page); + + if (ret) + pr_warn_once("Failed to remap page\n"); + + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + } else { + debug_pagealloc_unmap_pages(page, 1); + } +} + static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); static void swsusp_unset_page_forbidden(struct page *); @@ -1355,9 +1389,9 @@ static void safe_copy_page(void *dst, struct page *s_page) if (kernel_page_present(s_page)) { do_copy_page(dst, page_address(s_page)); } else { - kernel_map_pages(s_page, 1, 1); + hibernate_map_page(s_page); do_copy_page(dst, page_address(s_page)); - kernel_map_pages(s_page, 1, 0); + hibernate_unmap_page(s_page); } } -- 2.28.0
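The net effect on safe_copy_page() can be read as the pattern below; hibernate_map_page() and hibernate_unmap_page() come from the patch, while copy_one_page() is only a placeholder name for the actual copy routine (do_copy_page() in snapshot.c):

	if (kernel_page_present(s_page)) {
		copy_one_page(dst, page_address(s_page));
	} else {
		/* transiently restore the page in the direct map ... */
		hibernate_map_page(s_page);
		copy_one_page(dst, page_address(s_page));
		/* ... and unmap it again as soon as the copy is done */
		hibernate_unmap_page(s_page);
	}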
[PATCH v5 2/5] slab: debug: split slab_kernel_map() to map and unmap variants
From: Mike Rapoport Instead of using slab_kernel_map() with 'map' parameter to remap pages when DEBUG_PAGEALLOC is enabled, use dedicated helpers slab_kernel_map() and slab_kernel_unmap(). Signed-off-by: Mike Rapoport --- mm/slab.c | 26 +++--- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 07317386e150..0719421d69f7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1428,17 +1428,21 @@ static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) return false; } -static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map) +static void slab_kernel_map(struct kmem_cache *cachep, void *objp) { if (!is_debug_pagealloc_cache(cachep)) return; - if (map) - debug_pagealloc_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE); - else - debug_pagealloc_unmap_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE); + debug_pagealloc_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE); +} + +static void slab_kernel_unmap(struct kmem_cache *cachep, void *objp) +{ + if (!is_debug_pagealloc_cache(cachep)) + return; + + debug_pagealloc_unmap_pages(virt_to_page(objp), + cachep->size / PAGE_SIZE); } static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) @@ -1585,7 +1589,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, if (cachep->flags & SLAB_POISON) { check_poison_obj(cachep, objp); - slab_kernel_map(cachep, objp, 1); + slab_kernel_map(cachep, objp); } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) @@ -2360,7 +2364,7 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) /* need to poison the objs? */ if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_FREE); - slab_kernel_map(cachep, objp, 0); + slab_kernel_unmap(cachep, objp); } } #endif @@ -2728,7 +2732,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_FREE); - slab_kernel_map(cachep, objp, 0); + slab_kernel_unmap(cachep, objp); } return objp; } @@ -2993,7 +2997,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, return objp; if (cachep->flags & SLAB_POISON) { check_poison_obj(cachep, objp); - slab_kernel_map(cachep, objp, 1); + slab_kernel_map(cachep, objp); poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) -- 2.28.0
[PATCH v5 1/5] mm: introduce debug_pagealloc_{map, unmap}_pages() helpers
From: Mike Rapoport

When CONFIG_DEBUG_PAGEALLOC is enabled, pages are unmapped from the kernel direct mapping after free_pages(). The pages then need to be mapped back before they can be used. These mapping operations use __kernel_map_pages() guarded with debug_pagealloc_enabled().

The only place that calls __kernel_map_pages() without checking whether DEBUG_PAGEALLOC is enabled is the hibernation code that presumes availability of this function when ARCH_HAS_SET_DIRECT_MAP is set. Still, on arm64, __kernel_map_pages() will bail out when DEBUG_PAGEALLOC is not enabled but set_direct_map_invalid_noflush() may render some pages not present in the direct map and the hibernation code won't be able to save such pages.

To make the interaction between page allocation debugging and hibernation more robust, the dependency on DEBUG_PAGEALLOC or ARCH_HAS_SET_DIRECT_MAP has to be made more explicit. Start with combining the guard condition and the call to __kernel_map_pages() into debug_pagealloc_map_pages() and debug_pagealloc_unmap_pages() functions to emphasize that __kernel_map_pages() should not be called without DEBUG_PAGEALLOC, and use these new functions to map/unmap pages when page allocation debugging is enabled.

Signed-off-by: Mike Rapoport
Reviewed-by: David Hildenbrand
Acked-by: Kirill A. Shutemov
Acked-by: Vlastimil Babka
---
include/linux/mm.h | 15 +++
mm/memory_hotplug.c | 3 +--
mm/page_alloc.c | 6 ++
mm/slab.c | 16 +++1
4 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h index ef360fe70aaf..bb8c70178f4e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h
@@ -2936,12 +2936,27 @@ kernel_map_pages(struct page *page, int numpages, int enable) { __kernel_map_pages(page, numpages, enable); } + +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) +{ + if (debug_pagealloc_enabled_static()) + __kernel_map_pages(page, numpages, 1); +} + +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) +{ + if (debug_pagealloc_enabled_static()) + __kernel_map_pages(page, numpages, 0); +} + #ifdef CONFIG_HIBERNATION extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ #else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ static inline void kernel_map_pages(struct page *page, int numpages, int enable) {} +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} #ifdef CONFIG_HIBERNATION static inline bool kernel_page_present(struct page *page) { return true; } #endif /* CONFIG_HIBERNATION */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b44d4c7ba73b..f18f86ba2a68 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c
@@ -614,8 +614,7 @@ void generic_online_page(struct page *page, unsigned int order) * so we should map it first. This is better than introducing a special * case in page freeing fast path.
*/ - if (debug_pagealloc_enabled_static()) - kernel_map_pages(page, 1 << order, 1); + debug_pagealloc_map_pages(page, 1 << order); __free_pages_core(page, order); totalram_pages_add(1UL << order); #ifdef CONFIG_HIGHMEM diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23f5066bd4a5..db1bf70458d0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1272,8 +1272,7 @@ static __always_inline bool free_pages_prepare(struct page *page, */ arch_free_page(page, order); - if (debug_pagealloc_enabled_static()) - kernel_map_pages(page, 1 << order, 0); + debug_pagealloc_unmap_pages(page, 1 << order); kasan_free_nondeferred_pages(page, order); @@ -2270,8 +2269,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_refcounted(page); arch_alloc_page(page, order); - if (debug_pagealloc_enabled_static()) - kernel_map_pages(page, 1 << order, 1); + debug_pagealloc_map_pages(page, 1 << order); kasan_alloc_pages(page, order); kernel_poison_pages(page, 1 << order, 1); set_page_owner(page, order, gfp_flags); diff --git a/mm/slab.c b/mm/slab.c index b1113561b98b..07317386e150 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1428,21 +1428,19 @@ static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) return false; } -#ifdef CONFIG_DEBUG_PAGEALLOC static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map) { if (!is_debug_pagealloc_cache(cachep)) return; - kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); + if (map) + debug_pagealloc_map_pages(virt_to_page(objp), + cachep->size / PAGE_SIZE); +
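The conversion at each call site is mechanical; roughly, for the allocator paths touched above (a condensed paraphrase of the hunks, not new behaviour):

	/* before */
	if (debug_pagealloc_enabled_static())
		kernel_map_pages(page, 1 << order, 1);

	/* after */
	debug_pagealloc_map_pages(page, 1 << order);

The guard on debug_pagealloc_enabled_static() moves into the helper, so call sites no longer need to repeat it.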
[PATCH v5 0/5] arch, mm: improve robustness of direct map manipulation
From: Mike Rapoport Hi, During recent discussion about KVM protected memory, David raised a concern about usage of __kernel_map_pages() outside of DEBUG_PAGEALLOC scope [1]. Indeed, for architectures that define CONFIG_ARCH_HAS_SET_DIRECT_MAP it is possible that __kernel_map_pages() would fail, but since this function is void, the failure will go unnoticed. Moreover, there's lack of consistency of __kernel_map_pages() semantics across architectures as some guard this function with #ifdef DEBUG_PAGEALLOC, some refuse to update the direct map if page allocation debugging is disabled at run time and some allow modifying the direct map regardless of DEBUG_PAGEALLOC settings. This set straightens this out by restoring dependency of __kernel_map_pages() on DEBUG_PAGEALLOC and updating the call sites accordingly. Since currently the only user of __kernel_map_pages() outside DEBUG_PAGEALLOC is hibernation, it is updated to make direct map accesses there more explicit. [1] https://lore.kernel.org/lkml/2759b4bf-e1e3-d006-7d86-78a403482...@redhat.com v5 changes: * use pairs of _map()/_unmap() functions instead of _map(..., int enable) as Vlastimil suggested v4 changes: * s/WARN_ON/pr_warn_once/ per David and Kirill * rebase on v5.10-rc2 * add Acked/Reviewed tags https://lore.kernel.org/lkml/20201103162057.22916-1-r...@kernel.org v3 changes: * update arm64 changes to avoid regression, per Rick's comments * fix bisectability https://lore.kernel.org/lkml/20201101170815.9795-1-r...@kernel.org v2 changes: * Rephrase patch 2 changelog to better describe the change intentions and implications * Move removal of kernel_map_pages() from patch 1 to patch 2, per David https://lore.kernel.org/lkml/20201029161902.19272-1-r...@kernel.org v1: https://lore.kernel.org/lkml/20201025101555.3057-1-r...@kernel.org Mike Rapoport (5): mm: introduce debug_pagealloc_{map,unmap}_pages() helpers slab: debug: split slab_kernel_map() to map and unmap variants PM: hibernate: make direct map manipulations more explicit arch, mm: restore dependency of __kernel_map_pages() on DEBUG_PAGEALLOC arch, mm: make kernel_page_present() always available arch/Kconfig| 3 +++ arch/arm64/Kconfig | 4 +-- arch/arm64/include/asm/cacheflush.h | 1 + arch/arm64/mm/pageattr.c| 6 +++-- arch/powerpc/Kconfig| 5 +--- arch/riscv/Kconfig | 4 +-- arch/riscv/include/asm/pgtable.h| 2 -- arch/riscv/include/asm/set_memory.h | 1 + arch/riscv/mm/pageattr.c| 31 ++ arch/s390/Kconfig | 4 +-- arch/sparc/Kconfig | 4 +-- arch/x86/Kconfig| 4 +-- arch/x86/include/asm/set_memory.h | 1 + arch/x86/mm/pat/set_memory.c| 4 +-- include/linux/mm.h | 40 ++--- include/linux/set_memory.h | 5 kernel/power/snapshot.c | 38 +-- mm/memory_hotplug.c | 3 +-- mm/page_alloc.c | 6 ++--- mm/slab.c | 26 ++- 20 files changed, 127 insertions(+), 65 deletions(-) -- 2.28.0
[Bug 209733] Starting new KVM virtual machines on PPC64 starts to hang after box is up for a while
https://bugzilla.kernel.org/show_bug.cgi?id=209733

--- Comment #2 from Cameron (c...@neo-zeon.de) ---
Verified this happens with 5.9.6 and the Debian vendor kernel linux-image-5.9.0-1-powerpc64le. Might also be worth mentioning that this is occurring with the qemu-system-ppc package, version 1:3.1+dfsg-8+deb10u8.

-- You are receiving this mail because: You are watching the assignee of the bug.
[PATCH] KVM: PPC: fix comparison to bool warning
From: Kaixu Xia Fix the following coccicheck warning: ./arch/powerpc/kvm/booke.c:503:6-16: WARNING: Comparison to bool ./arch/powerpc/kvm/booke.c:505:6-17: WARNING: Comparison to bool ./arch/powerpc/kvm/booke.c:507:6-16: WARNING: Comparison to bool Reported-by: Tosk Robot Signed-off-by: Kaixu Xia --- arch/powerpc/kvm/booke.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index b1abcb816439..288a9820ec01 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -500,11 +500,11 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, vcpu->arch.regs.nip = vcpu->arch.ivpr | vcpu->arch.ivor[priority]; - if (update_esr == true) + if (update_esr) kvmppc_set_esr(vcpu, vcpu->arch.queued_esr); - if (update_dear == true) + if (update_dear) kvmppc_set_dar(vcpu, vcpu->arch.queued_dear); - if (update_epr == true) { + if (update_epr) { if (vcpu->arch.epr_flags & KVMPPC_EPR_USER) kvm_make_request(KVM_REQ_EPR_EXIT, vcpu); else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL) { -- 2.20.0
Re: [PATCH] powerpc/64s: Remove RFI
Le 06/11/2020 à 12:36, Christophe Leroy a écrit : Last use of RFI on PPC64 was removed by commit b8e90cb7bc04 ("powerpc/64: Convert the syscall exit path to use RFI_TO_USER/KERNEL"). Remove the macro. Forget this crazy patch. I missed two RFI in head_64.S Christophe Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/ppc_asm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 511786f0e40d..bedf3eb52ebc 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -495,7 +495,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) #endif #ifdef CONFIG_PPC_BOOK3S_64 -#define RFIrfid #define MTMSRD(r) mtmsrd r #define MTMSR_EERI(reg) mtmsrd reg,1 #else
Re: [PATCH] powerpc/32s: Use relocation offset when setting early hash table
On Nov 07 2020, Serge Belyshev wrote: > Christophe Leroy writes: > >> When calling early_hash_table(), the kernel hasn't been yet >> relocated to its linking address, so data must be addressed >> with relocation offset. >> >> Add relocation offset to write into Hash in early_hash_table(). >> >> Reported-by: Erhard Furtner >> Reported-by: Andreas Schwab >> Fixes: 69a1593abdbc ("powerpc/32s: Setup the early hash table at all time.") >> Signed-off-by: Christophe Leroy > > Tested-by: Serge Belyshev Works here as well. Thanks, Andreas. -- Andreas Schwab, sch...@linux-m68k.org GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510 2552 DF73 E780 A9DA AEC1 "And now for something completely different."
Re: Kernel panic from malloc() on SUSE 15.1?
On Fri, Nov 6, 2020 at 4:25 AM Michael Ellerman wrote: > So something seems to have gone wrong linking this, I see eg: > > 10004a8c : > 10004a8c: 2b 10 40 3c lis r2,4139 > 10004a90: 88 f7 42 38 addir2,r2,-2168 > 10004a94: a6 02 08 7c mflrr0 > 10004a98: 10 00 01 f8 std r0,16(r1) > 10004a9c: f8 ff e1 fb std r31,-8(r1) > 10004aa0: 81 ff 21 f8 stdur1,-128(r1) > 10004aa4: 78 0b 3f 7c mr r31,r1 > 10004aa8: 60 00 7f f8 std r3,96(r31) > 10004aac: 68 00 9f f8 std r4,104(r31) > 10004ab0: 00 00 00 60 nop > 10004ab4: 30 80 22 e9 ld r9,-32720(r2) > 10004ab8: 00 00 a9 2f cmpdi cr7,r9,0 > 10004abc: 30 00 9e 41 beq cr7,10004aec > 10004ac0: 60 00 7f e8 ld r3,96(r31) > 10004ac4: 68 00 9f e8 ld r4,104(r31) > 10004ac8: 39 b5 ff 4b bl 1000 <_init-0x1f00> > > Notice that last bl (branch and link) to 0x1000. But there's no text > at 0x1000, that's the start of the page which happens to be the ELF > magic. > > I've seen something like this before, but I can't remember when/where so > I haven't been able to track down what the problem was. > > Anyway hopefully someone on the list will know. > > That still doesn't explain the kernel crash though. > Interesting. Sounds highly unlikely that the linker would have picked that address at random, but it makes no sense. And, agreed, jumping into junk should crash the program, not the kernel. > On my machine it doesn't crash the kernel, so I can catch it later. For > me it's here: > ie. in the syscall_random() that I mentioned above. > > You should be able to catch it there too if you do: > > (gdb) b *0x1000 > (gdb) r > > Hopefully it will stop without crashing the kernel, and then a `bt` will > show that you're in the same place as me. > > If you can get that to work, when you're stopped there, can you do an > `info registers` and send us the output. > Indeed, setting the breakpoint you suggested works, and the stack looks almost the same - only differences are a few bits off in main's argv pointer, rand_drbg_get_entropy's pout pointer, and the final address - you get 0x1004, I get 0x1000. Output, including "info registers", below. Hoping they provide some useful clues. Thanks again for looking into this. # gdb --args /tmp/ossl/rand_test ... (gdb) b *0x1000 Breakpoint 1 at 0x1000 (gdb) r Starting program: /tmp/ossl/rand_test Breakpoint 1, 0x1000 in ?? () (gdb) bt #0 0x1000 in ?? 
() #1 0x10004acc in syscall_random (buf=0x102b0730, buflen=32) at crypto/rand/rand_unix.c:371 #2 0x100053fc in rand_pool_acquire_entropy (pool=0x102b06e0) at crypto/rand/rand_unix.c:636 #3 0x10002b58 in rand_drbg_get_entropy (drbg=0x102b02e0, pout=0x7fffecf0, entropy=256, min_len=32, max_len=2147483647, prediction_resistance=0) at crypto/rand/rand_lib.c:198 #4 0x1001ed9c in RAND_DRBG_instantiate (drbg=0x102b02e0, pers=0x10248d00 "OpenSSL NIST SP 800-90A DRBG", perslen=28) at crypto/rand/drbg_lib.c:338 #5 0x10020300 in drbg_setup (parent=0x0) at crypto/rand/drbg_lib.c:895 #6 0x10020414 in do_rand_drbg_init () at crypto/rand/drbg_lib.c:924 #7 0x1002034c in do_rand_drbg_init_ossl_ () at crypto/rand/drbg_lib.c:909 #8 0x10005d1c in CRYPTO_THREAD_run_once (once=0x102ab4d8 , init=0x1002032c ) at crypto/threads_none.c:70 #9 0x100209c4 in RAND_DRBG_get0_master () at crypto/rand/drbg_lib.c:1102 #10 0x10020914 in drbg_status () at crypto/rand/drbg_lib.c:1084 #11 0x10004a58 in RAND_status () at crypto/rand/rand_lib.c:961 #12 0x10002890 in main (argc=1, argv=0x7368) at rand_test.c:6 (gdb) info registers r0 0x100053fc 268456956 r1 0x7fffeaf0 140737488349936 r2 0x102af788 271251336 r3 0x102b0730 271255344 r4 0x2032 r5 0x3048 r6 0x102b0760 271255392 r7 0x1 1 r8 0x0 0 r9 0x7fffb7dacc00 140736277957632 r100x102b0730 271255344 r110x1016 r120x7fffb7e19280 140736278401664 r130x7fffb7ffa100 140736280371456 r140x0 0 r150x0 0 r160x0 0 r170x0 0 r180x0 0 r190x0 0 r200x0 0 r210x0 0 r220x0 0 r230x0 0 r240x0 0 r250x0 0 r260x0 0 r27
[PATCH] KVM: PPC: Book3S: Assign boolean values to a bool variable
From: Kaixu Xia Fix the following coccinelle warnings: ./arch/powerpc/kvm/book3s_xics.c:476:3-15: WARNING: Assignment of 0/1 to bool variable ./arch/powerpc/kvm/book3s_xics.c:504:3-15: WARNING: Assignment of 0/1 to bool variable Reported-by: Tosk Robot Signed-off-by: Kaixu Xia --- arch/powerpc/kvm/book3s_xics.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 5fee5a11550d..303e3cb096db 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -473,7 +473,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, arch_spin_unlock(>lock); local_irq_restore(flags); new_irq = reject; - check_resend = 0; + check_resend = false; goto again; } } else { @@ -501,7 +501,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, state->resend = 0; arch_spin_unlock(>lock); local_irq_restore(flags); - check_resend = 0; + check_resend = false; goto again; } } -- 2.20.0
Re: [PATCH] powerpc: add compile-time support for lbarx, lwarx
On Sat, Nov 07, 2020 at 08:12:13AM +0100, Gabriel Paubert wrote: > On Sat, Nov 07, 2020 at 01:23:28PM +1000, Nicholas Piggin wrote: > > ISA v2.06 (POWER7 and up) as well as e6500 support lbarx and lwarx. > > Hmm, lwarx exists since original Power AFAIR, Almost: it was new on PowerPC. Segher
[PATCH] panic: don't dump stack twice on warn
Before commit 3f388f28639f ("panic: dump registers on panic_on_warn"), __warn() was calling show_regs() when regs was not NULL, and show_stack() otherwise. After that commit, show_stack() is called regardless of whether show_regs() has been called or not, leading to duplicated Call Trace: [7.112617] [ cut here ] [7.117041] WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/nohash/8xx.c:186 mmu_mark_initmem_nx+0x24/0x94 [7.126021] CPU: 0 PID: 1 Comm: swapper Not tainted 5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty #4092 [7.135202] NIP: c00128b4 LR: c0010228 CTR: [7.140205] REGS: c9023e40 TRAP: 0700 Not tainted (5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty) [7.149131] MSR: 00029032 CR: 24000424 XER: [7.155760] [7.155760] GPR00: c0010228 c9023ef8 c210 0074c000 c2151000 c07b3880 [7.155760] GPR08: ff000900 0074c000 c800 c33b53a8 24000822 c0003a20 [7.155760] GPR16: [7.155760] GPR24: 0080 [7.191092] NIP [c00128b4] mmu_mark_initmem_nx+0x24/0x94 [7.196333] LR [c0010228] free_initmem+0x20/0x58 [7.200855] Call Trace: [7.203319] [c9023f18] [c0010228] free_initmem+0x20/0x58 [7.208564] [c9023f28] [c0003a3c] kernel_init+0x1c/0x114 [7.213813] [c9023f38] [c000f184] ret_from_kernel_thread+0x14/0x1c [7.219869] Instruction dump: [7.222805] 7d291850 7d234b78 4e800020 9421ffe0 7c0802a6 bfc10018 3fe0c060 3bff [7.230462] 3fff4080 3bff 90010024 57ff0010 <0fe0> 392001cd 7c3e0b78 953e0008 [7.238327] CPU: 0 PID: 1 Comm: swapper Not tainted 5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty #4092 [7.247500] Call Trace: [7.249977] [c9023dc0] [c001e070] __warn+0x8c/0xd8 (unreliable) [7.255815] [c9023de0] [c05e0e5c] report_bug+0x11c/0x154 [7.261085] [c9023e10] [c0009ea4] program_check_exception+0x1dc/0x6e0 [7.267430] [c9023e30] [c000f43c] ret_from_except_full+0x0/0x4 [7.273238] --- interrupt: 700 at mmu_mark_initmem_nx+0x24/0x94 [7.273238] LR = free_initmem+0x20/0x58 [7.283155] [c9023ef8] [] 0x0 (unreliable) [7.287913] [c9023f18] [c0010228] free_initmem+0x20/0x58 [7.293160] [c9023f28] [c0003a3c] kernel_init+0x1c/0x114 [7.298410] [c9023f38] [c000f184] ret_from_kernel_thread+0x14/0x1c [7.304479] ---[ end trace 31702cd2a9570752 ]--- Only call show_stack() when regs is NULL. Fixes: 3f388f28639f ("panic: dump registers on panic_on_warn") Cc: Alexey Kardashevskiy Signed-off-by: Christophe Leroy --- kernel/panic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/panic.c b/kernel/panic.c index 396142ee43fd..332736a72a58 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -605,7 +605,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, panic("panic_on_warn set ...\n"); } - dump_stack(); + if (!regs) + dump_stack(); print_irqtrace_events(current); -- 2.25.0
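For context, the flow inside __warn() after commit 3f388f28639f is roughly the following (paraphrased, not the exact kernel source), which is why the trace is printed twice whenever regs is non-NULL:

	if (regs)
		show_regs(regs);	/* already prints a call trace */
	...
	dump_stack();			/* unconditional: second trace when regs != NULL */

The patch simply makes the final dump_stack() conditional on !regs.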
Re: [RFC PATCH] powerpc: show registers when unwinding interrupt frames
Le 07/11/2020 à 03:33, Nicholas Piggin a écrit : It's often useful to know the register state for interrupts in the stack frame. In the below example (with this patch applied), the important information is the state of the page fault. A blatant case like this probably rather should have the page fault regs passed down to the warning, but quite often there are less obvious cases where an interrupt shows up that might give some more clues. I like it. I was wondering about interrupts that do not save NV registers, but that seems to be handled: [0.455489] --- interrupt: 301 at cmpxchg_futex_value_locked+0x2c/0x58 [0.461886] NIP: c0089c08 LR: c0755df0 CTR: c02e59a4 [0.466889] REGS: c9023db0 TRAP: 0301 Not tainted (5.10.0-rc2-s3k-dev-01371-gfb45a2414e96-dirty) [0.475815] MSR: 9032 CR: 28000244 XER: [0.482450] DAR: DSISR: c000 [0.482450] GPR00: c0755dc8 c9023e68 c210 c9023e78 [0.482450] GPR08: 1032 8000 0003 42000242 [0.500988] NIP [c0089c08] cmpxchg_futex_value_locked+0x2c/0x58 [0.506842] LR [c0755df0] futex_init+0x74/0xd0 [0.511194] --- interrupt: 301 Christophe The downside is longer and more complex bug output. Bug: Write fault blocked by AMR! WARNING: CPU: 0 PID: 72 at arch/powerpc/include/asm/book3s/64/kup-radix.h:164 __do_page_fault+0x880/0xa90 Modules linked in: CPU: 0 PID: 72 Comm: systemd-gpt-aut Not tainted NIP: c006e2f0 LR: c006e2ec CTR: REGS: ca4f3420 TRAP: 0700 MSR: 80021033 CR: 28002840 XER: 2004 CFAR: c0128be0 IRQMASK: 3 GPR00: c006e2ec ca4f36c0 c14f0700 0020 GPR04: 0001 c1290f50 0001 c1290f80 GPR08: c1612b08 e0f7 GPR12: 48002840 c16e c00c00021c80 c0fd6f60 GPR16: ca104698 0003 c87f GPR20: 0100 c70330b8 0004 GPR24: 0200 0300 0200 ca5b0c00 GPR28: 0a00 7fffb2a90038 ca4f3820 NIP [c006e2f0] __do_page_fault+0x880/0xa90 LR [c006e2ec] __do_page_fault+0x87c/0xa90 Call Trace: [ca4f36c0] [c006e2ec] __do_page_fault+0x87c/0xa90 (unreliable) [ca4f3780] [c0e1c034] do_page_fault+0x34/0x90 [ca4f37b0] [c0008908] data_access_common_virt+0x158/0x1b0 --- interrupt: 300 at __copy_tofrom_user_base+0x9c/0x5a4 NIP: c009b028 LR: c0802978 CTR: 0800 REGS: ca4f3820 TRAP: 0300 MSR: 8280b033 CR: 24004840 XER: CFAR: c009aff4 DAR: 7fffb2a90038 DSISR: 0a00 IRQMASK: 0 GPR00: ca4f3ac0 c14f0700 7fffb2a90028 GPR04: c8720010 0001 GPR08: 0001 GPR12: 4000 c16e c00c00021c80 c0fd6f60 GPR16: ca104698 0003 c87f GPR20: 0100 c70330b8 0004 GPR24: ca4f3c80 c872 0001 GPR28: 0001 0872 0001 c1515b98 NIP [c009b028] __copy_tofrom_user_base+0x9c/0x5a4 LR [c0802978] copyout+0x68/0xc0 --- interrupt: 300 [ca4f3af0] [c08074b8] copy_page_to_iter+0x188/0x540 [ca4f3b50] [c035c678] generic_file_buffered_read+0x358/0xd80 [ca4f3c40] [c04c1e90] blkdev_read_iter+0x50/0x80 [ca4f3c60] [c045733c] new_sync_read+0x12c/0x1c0 [ca4f3d00] [c045a1f0] vfs_read+0x1d0/0x240 [ca4f3d50] [c045a7f4] ksys_read+0x84/0x140 [ca4f3da0] [c0033a60] system_call_exception+0x100/0x280 [ca4f3e10] [c000c508] system_call_common+0xf8/0x2f8 Instruction dump: eae10078 3beb 4bfff890 6042 792917e1 4182ff18 3c82ffab 3884a5e0 3c62ffab 3863a6e8 480ba891 6000 <0fe0> 3beb 4bfff860 e93c0938 Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/process.c | 20 ++-- 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index ea36a29c8b01..799f00b32f74 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1475,12 +1475,10 @@ static void print_msr_bits(unsigned long val) #define LAST_VOLATILE 12 #endif -void show_regs(struct pt_regs * regs) +static void __show_regs(struct pt_regs *regs) { int 
i,
Re: [RFC PATCH 0/9] powerpc/64s: fast interrupt exit
Le 06/11/2020 à 16:59, Nicholas Piggin a écrit : This series attempts to improve the speed of interrupts and system calls in two major ways. Firstly, the SRR/HSRR registers do not need to be reloaded if they were not used or clobbered fur the duration of the interrupt. Secondly, an alternate return location facility is added for soft-masked asynchronous interrupts and then that's used to set everything up for return without having to disable MSR RI or EE. After this series, the entire system call / interrupt handler fast path executes no mtsprs and one mtmsrd to enable interrupts initially, and the system call vectored path doesn't even need to do that. Interesting series. Unfortunately, can't be done on PPC32 (at least on non bookE), because it would mean mapping kernel at 0 instead of 0xC000. Not sure libc would like it, and anyway it would be an issue for catching NULL pointer dereferencing, unless we use page tables instead of BATs to map kernel mem, which would be serious performance cut. Christophe Thanks, Nick Nicholas Piggin (9): powerpc/64s: syscall real mode entry use mtmsrd rather than rfid powerpc/64s: system call avoid setting MSR[RI] until we set MSR[EE] powerpc/64s: introduce different functions to return from SRR vs HSRR interrupts powerpc/64s: avoid reloading (H)SRR registers if they are still valid powerpc/64: move interrupt return asm to interrupt_64.S powerpc/64s: save one more register in the masked interrupt handler powerpc/64s: allow alternate return locations for soft-masked interrupts powerpc/64s: interrupt soft-enable race fix powerpc/64s: use interrupt restart table to speed up return from interrupt arch/powerpc/Kconfig.debug | 5 + arch/powerpc/include/asm/asm-prototypes.h | 4 +- arch/powerpc/include/asm/head-64.h | 2 +- arch/powerpc/include/asm/interrupt.h | 18 + arch/powerpc/include/asm/paca.h| 3 + arch/powerpc/include/asm/ppc_asm.h | 8 + arch/powerpc/include/asm/ptrace.h | 28 +- arch/powerpc/kernel/asm-offsets.c | 5 + arch/powerpc/kernel/entry_64.S | 508 --- arch/powerpc/kernel/exceptions-64s.S | 180 -- arch/powerpc/kernel/fpu.S | 2 + arch/powerpc/kernel/head_64.S | 5 +- arch/powerpc/kernel/interrupt_64.S | 720 + arch/powerpc/kernel/irq.c | 79 ++- arch/powerpc/kernel/kgdb.c | 2 +- arch/powerpc/kernel/kprobes-ftrace.c | 2 +- arch/powerpc/kernel/kprobes.c | 10 +- arch/powerpc/kernel/process.c | 21 +- arch/powerpc/kernel/rtas.c | 13 +- arch/powerpc/kernel/signal.c | 2 +- arch/powerpc/kernel/signal_64.c| 14 + arch/powerpc/kernel/syscall_64.c | 242 --- arch/powerpc/kernel/syscalls.c | 2 + arch/powerpc/kernel/traps.c| 18 +- arch/powerpc/kernel/vector.S | 6 +- arch/powerpc/kernel/vmlinux.lds.S | 10 + arch/powerpc/lib/Makefile | 2 +- arch/powerpc/lib/restart_table.c | 26 + arch/powerpc/lib/sstep.c | 5 +- arch/powerpc/math-emu/math.c | 2 +- arch/powerpc/mm/fault.c| 2 +- arch/powerpc/perf/core-book3s.c| 19 +- arch/powerpc/platforms/powernv/opal-call.c | 3 + arch/powerpc/sysdev/fsl_pci.c | 2 +- 34 files changed, 1244 insertions(+), 726 deletions(-) create mode 100644 arch/powerpc/kernel/interrupt_64.S create mode 100644 arch/powerpc/lib/restart_table.c
Re: [PATCH] powerpc/32s: Use relocation offset when setting early hash table
Christophe Leroy writes: > When calling early_hash_table(), the kernel hasn't been yet > relocated to its linking address, so data must be addressed > with relocation offset. > > Add relocation offset to write into Hash in early_hash_table(). > > Reported-by: Erhard Furtner > Reported-by: Andreas Schwab > Fixes: 69a1593abdbc ("powerpc/32s: Setup the early hash table at all time.") > Signed-off-by: Christophe Leroy Tested-by: Serge Belyshev
Re: [PATCH 18/18] powerpc/64s: move power4 idle entirely to C
Le 05/11/2020 à 15:34, Nicholas Piggin a écrit : Christophe asked about doing this, most of the code is still in asm but maybe it's slightly nicer? I don't know if it's worthwhile. Heu... I don't think I was asking for that, but why not, see later comments. At first I was just asking to write the following in C: + + .globl power4_idle_nap_return +power4_idle_nap_return: + blr In extenso, instead of the above do somewhere something like: void power4_idle_nap_return(void) { } --- arch/powerpc/kernel/idle.c| 25 - arch/powerpc/kernel/idle_book3s.S | 22 -- 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index ae0e2632393d..849e77a45915 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -72,6 +72,9 @@ int powersave_nap; #ifdef CONFIG_PPC_970_NAP void power4_idle(void) { + unsigned long msr_idle = MSR_KERNEL|MSR_EE|MSR_POW; + unsigned long tmp1, tmp2; + if (!cpu_has_feature(CPU_FTR_CAN_NAP)) return; @@ -84,13 +87,25 @@ void power4_idle(void) if (cpu_has_feature(CPU_FTR_ALTIVEC)) asm volatile("DSSALL ; sync" ::: "memory"); - power4_idle_nap(); - + asm volatile( +" ld %0,PACA_THREAD_INFO(r13)\n" +" ld %1,TI_LOCAL_FLAGS(%0) \n" +" ori %1,%1,_TLF_NAPPING \n" +" std %1,TI_LOCAL_FLAGS(%0) \n" Can't this just be: current_thread_info()->local_flags |= _TLF_NAPPING; /* -* power4_idle_nap returns with interrupts enabled (soft and hard). -* to our caller with interrupts enabled (soft and hard). Our caller -* can cope with either interrupts disabled or enabled upon return. +* NAPPING bit is set, from this point onward nap_adjust_return() +* will cause interrupts to return to power4_idle_nap_return. */ +"1: sync\n" +" isync \n" +" mtmsrd %2 \n" +" isync \n" +" b 1b \n" And this: for (;;) { mb(); isync(); mtmsr(MSR_KERNEL|MSR_EE|MSR_POW); isync(); } +" .globl power4_idle_nap_return \n" +"power4_idle_nap_return: \n" + : "=r"(tmp1), "=r"(tmp2) + : "r"(msr_idle) + ); } #endif Christophe
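Putting the two review suggestions together, the proposed C body would look roughly like the sketch below (a sketch only: whether mtmsr() vs mtmsrd() and these barriers exactly match the hand-written asm is precisely what the thread is discussing):

	current_thread_info()->local_flags |= _TLF_NAPPING;

	/* Once _TLF_NAPPING is set, interrupts return to power4_idle_nap_return. */
	for (;;) {
		mb();
		isync();
		mtmsr(MSR_KERNEL | MSR_EE | MSR_POW);
		isync();
	}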
Re: [PATCH] powerpc/32s: Setup the early hash table at all time.
Le 29/10/2020 à 22:07, Andreas Schwab a écrit : On Okt 01 2020, Christophe Leroy wrote: At the time being, an early hash table is set up when CONFIG_KASAN is selected. There is nothing wrong with setting such an early hash table all the time, even if it is not used. This is a statically allocated 256 kB table which lies in the init data section. This makes the code simpler and may in the future allow to setup early IO mappings with fixmap instead of hard coding BATs. Put create_hpte() and flush_hash_pages() in the .ref.text section in order to avoid warning for the reference to early_hash[]. This reference is removed by MMU_init_hw_patch() before init memory is freed. This breaks booting on the iBook G4. Can you test patch https://patchwork.ozlabs.org/project/linuxppc-dev/patch/9e225a856a8b22e0e77587ee22ab7a2f5bca8753.1604740029.git.christophe.le...@csgroup.eu/ Thanks Christophe
[Bug 209869] Kernel 5.10-rc1 fails to boot on a PowerMac G4 3,6 at an early stage
https://bugzilla.kernel.org/show_bug.cgi?id=209869

--- Comment #11 from Christophe Leroy (christophe.le...@csgroup.eu) ---
(In reply to Erhard F. from comment #10) > (In reply to Christophe Leroy from comment #9) > > Ok, what about 5.10-rc1 + KASAN without reverting the patch ? > Nope, does not boot. Same 5.10-rc1 .config + KASAN but without reverting the > patch. Can you test patch at https://patchwork.ozlabs.org/project/linuxppc-dev/patch/9e225a856a8b22e0e77587ee22ab7a2f5bca8753.1604740029.git.christophe.le...@csgroup.eu/

-- You are receiving this mail because: You are watching the assignee of the bug.
[PATCH] powerpc/32s: Use relocation offset when setting early hash table
When calling early_hash_table(), the kernel hasn't been yet relocated to its linking address, so data must be addressed with relocation offset. Add relocation offset to write into Hash in early_hash_table(). Reported-by: Erhard Furtner Reported-by: Andreas Schwab Fixes: 69a1593abdbc ("powerpc/32s: Setup the early hash table at all time.") Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/head_book3s_32.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index 5eb9eedac920..8aa7eb11754e 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -156,6 +156,7 @@ __after_mmu_off: bl initial_bats bl load_segment_registers BEGIN_MMU_FTR_SECTION + bl reloc_offset bl early_hash_table END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) #if defined(CONFIG_BOOTX_TEXT) @@ -932,7 +933,7 @@ early_hash_table: ori r6, r6, 3 /* 256kB table */ mtspr SPRN_SDR1, r6 lis r6, early_hash@h - lis r3, Hash@ha + addis r3, r3, Hash@ha stw r6, Hash@l(r3) blr -- 2.25.0
Re: [PATCH] powerpc: add compile-time support for lbarx, lwarx
Le 07/11/2020 à 04:23, Nicholas Piggin a écrit : ISA v2.06 (POWER7 and up) as well as e6500 support lbarx and lwarx. Add a compile option that allows code to use it, and add support in cmpxchg and xchg 8 and 16 bit values. Do you mean lharx ? Because lwarx exists on all powerpcs I think. Signed-off-by: Nicholas Piggin --- arch/powerpc/Kconfig | 3 + arch/powerpc/include/asm/cmpxchg.h | 236 - arch/powerpc/platforms/Kconfig.cputype | 5 + 3 files changed, 243 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e9f13fe08492..d231af06f75a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -266,6 +266,9 @@ config PPC_BARRIER_NOSPEC default y depends on PPC_BOOK3S_64 || PPC_FSL_BOOK3E +config PPC_LBARX_LWARX + bool s/LWARX/LHARX/ ? And maybe better with PPC_HAS_LBARX_LWARX ? + config EARLY_PRINTK bool default y diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h index cf091c4c22e5..17fd996dc0d4 100644 --- a/arch/powerpc/include/asm/cmpxchg.h +++ b/arch/powerpc/include/asm/cmpxchg.h @@ -77,10 +77,76 @@ u32 __cmpxchg_##type##sfx(volatile void *p, u32 old, u32 new) \ * the previous value stored there. */ +#ifndef CONFIG_PPC_LBARX_LWARX XCHG_GEN(u8, _local, "memory"); XCHG_GEN(u8, _relaxed, "cc"); XCHG_GEN(u16, _local, "memory"); XCHG_GEN(u16, _relaxed, "cc"); +#else +static __always_inline unsigned long +__xchg_u8_local(volatile void *p, unsigned long val) +{ + unsigned long prev; + + __asm__ __volatile__( +"1: lbarx %0,0,%2 \n" +" stbcx. %3,0,%2 \n\ + bne-1b" + : "=" (prev), "+m" (*(volatile unsigned char *)p) + : "r" (p), "r" (val) + : "cc", "memory"); + + return prev; +} + +static __always_inline unsigned long +__xchg_u8_relaxed(u8 *p, unsigned long val) +{ + unsigned long prev; + + __asm__ __volatile__( +"1: lbarx %0,0,%2\n" +" stbcx. %3,0,%2\n" +" bne-1b" + : "=" (prev), "+m" (*p) + : "r" (p), "r" (val) + : "cc"); + + return prev; +} + +static __always_inline unsigned long +__xchg_u16_local(volatile void *p, unsigned long val) +{ + unsigned long prev; + + __asm__ __volatile__( +"1: lharx %0,0,%2 \n" +" sthcx. %3,0,%2 \n\ + bne-1b" + : "=" (prev), "+m" (*(volatile unsigned short *)p) + : "r" (p), "r" (val) + : "cc", "memory"); + + return prev; +} + +static __always_inline unsigned long +__xchg_u16_relaxed(u16 *p, unsigned long val) +{ + unsigned long prev; + + __asm__ __volatile__( +"1: lharx %0,0,%2\n" +" sthcx. %3,0,%2\n" +" bne-1b" + : "=" (prev), "+m" (*p) + : "r" (p), "r" (val) + : "cc"); + + return prev; +} +#endif That's a lot of code duplication. Could we use some macro, in the same spirit as what is done in arch/powerpc/include/asm/io.h for in_be16(), in_be32(), in_be64() and friends ? static __always_inline unsigned long __xchg_u32_local(volatile void *p, unsigned long val) @@ -198,11 +264,12 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size) (__typeof__(*(ptr))) __xchg_relaxed((ptr), \ (unsigned long)_x_, sizeof(*(ptr)));\ }) + /* * Compare and exchange - if *p == old, set it to new, * and return the old value of *p. 
*/ - +#ifndef CONFIG_PPC_LBARX_LWARX CMPXCHG_GEN(u8, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory"); CMPXCHG_GEN(u8, _local, , , "memory"); CMPXCHG_GEN(u8, _acquire, , PPC_ACQUIRE_BARRIER, "memory"); @@ -211,6 +278,173 @@ CMPXCHG_GEN(u16, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory"); CMPXCHG_GEN(u16, _local, , , "memory"); CMPXCHG_GEN(u16, _acquire, , PPC_ACQUIRE_BARRIER, "memory"); CMPXCHG_GEN(u16, _relaxed, , , "cc"); +#else +static __always_inline unsigned long +__cmpxchg_u8(volatile unsigned char *p, unsigned long old, unsigned long new) +{ + unsigned int prev; + + __asm__ __volatile__ ( + PPC_ATOMIC_ENTRY_BARRIER +"1: lbarx %0,0,%2 # __cmpxchg_u8\n\ + cmpw0,%0,%3\n\ + bne-2f\n" +" stbcx. %4,0,%2\n\ + bne-1b" + PPC_ATOMIC_EXIT_BARRIER + "\n\ +2:" + : "=" (prev), "+m" (*p) + : "r" (p), "r" (old), "r" (new) + : "cc", "memory"); + + return prev; +} + +static __always_inline unsigned long +__cmpxchg_u8_local(volatile unsigned char *p, unsigned long old, + unsigned long new) +{ + unsigned int prev; + + __asm__ __volatile__ ( +"1: lbarx %0,0,%2 # __cmpxchg_u8\n\ + cmpw0,%0,%3\n\ + bne-2f\n" +" stbcx. %4,0,%2\n\ + bne-1b" + "\n\ +2:" + : "=" (prev), "+m" (*p) + : "r" (p), "r" (old), "r" (new) + : "cc",
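As a concrete illustration of the macro direction suggested above (in the spirit of asm/io.h), something like the sketch below could stamp out the narrow xchg variants. XCHG_NARROW and its exact shape are hypothetical, not an existing kernel macro; the asm body is the same sequence used in the patch:

#define XCHG_NARROW(size, larx, stcx)					\
static __always_inline unsigned long					\
__xchg_u##size##_relaxed(u##size *p, unsigned long val)			\
{									\
	unsigned long prev;						\
									\
	__asm__ __volatile__(						\
"1:	" larx "	%0,0,%2\n"					\
"	" stcx "	%3,0,%2\n"					\
"	bne-	1b"							\
	: "=&r" (prev), "+m" (*p)					\
	: "r" (p), "r" (val)						\
	: "cc");							\
									\
	return prev;							\
}

XCHG_NARROW(8, "lbarx", "stbcx.")
XCHG_NARROW(16, "lharx", "sthcx.")

The local/acquire/relaxed and cmpxchg flavours could be generated the same way, which would remove most of the duplication the review points at.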