[PATCH v5 5/5] arch, mm: make kernel_page_present() always available

2020-11-07 Thread Mike Rapoport
From: Mike Rapoport 

For architectures that enable ARCH_HAS_SET_MEMORY, having the ability to
verify that a page is mapped in the kernel direct map can be useful
regardless of hibernation.

Add a RISC-V implementation of kernel_page_present(), update its forward
declarations and stubs to be a part of the set_memory API, and remove the
ugly ifdefery in include/linux/mm.h around the current declarations of
kernel_page_present().
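
As a rough illustration of that point (not part of the patch, and the helper
name here is made up), once kernel_page_present() is exposed through the
set_memory API a caller outside hibernation could sanity-check the direct
map like this:

#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/set_memory.h>

/* Hypothetical debugging helper: warn if a page that is about to be
 * accessed through the linear map has been removed from the kernel
 * direct map (e.g. by set_direct_map_invalid_noflush()). */
static void check_page_in_direct_map(struct page *page)
{
	if (!kernel_page_present(page))
		pr_warn("pfn %lx is not mapped in the direct map\n",
			page_to_pfn(page));
}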

Signed-off-by: Mike Rapoport 
Acked-by: Kirill A. Shutemov 
---
 arch/arm64/include/asm/cacheflush.h |  1 +
 arch/arm64/mm/pageattr.c|  4 +---
 arch/riscv/include/asm/set_memory.h |  1 +
 arch/riscv/mm/pageattr.c| 29 +
 arch/x86/include/asm/set_memory.h   |  1 +
 arch/x86/mm/pat/set_memory.c|  4 +---
 include/linux/mm.h  |  7 ---
 include/linux/set_memory.h  |  5 +
 8 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/include/asm/cacheflush.h 
b/arch/arm64/include/asm/cacheflush.h
index 9384fd8fc13c..45217f21f1fe 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -140,6 +140,7 @@ int set_memory_valid(unsigned long addr, int numpages, int 
enable);
 
 int set_direct_map_invalid_noflush(struct page *page);
 int set_direct_map_default_noflush(struct page *page);
+bool kernel_page_present(struct page *page);
 
 #include 
 
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 439325532be1..92eccaf595c8 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -186,8 +186,8 @@ void __kernel_map_pages(struct page *page, int numpages, 
int enable)
 
set_memory_valid((unsigned long)page_address(page), numpages, enable);
 }
+#endif /* CONFIG_DEBUG_PAGEALLOC */
 
-#ifdef CONFIG_HIBERNATION
 /*
  * This function is used to determine if a linear map page has been marked as
  * not-valid. Walk the page table and check the PTE_VALID bit. This is based
@@ -234,5 +234,3 @@ bool kernel_page_present(struct page *page)
ptep = pte_offset_kernel(pmdp, addr);
return pte_valid(READ_ONCE(*ptep));
 }
-#endif /* CONFIG_HIBERNATION */
-#endif /* CONFIG_DEBUG_PAGEALLOC */
diff --git a/arch/riscv/include/asm/set_memory.h 
b/arch/riscv/include/asm/set_memory.h
index 4c5bae7ca01c..d690b08dff2a 100644
--- a/arch/riscv/include/asm/set_memory.h
+++ b/arch/riscv/include/asm/set_memory.h
@@ -24,6 +24,7 @@ static inline int set_memory_nx(unsigned long addr, int 
numpages) { return 0; }
 
 int set_direct_map_invalid_noflush(struct page *page);
 int set_direct_map_default_noflush(struct page *page);
+bool kernel_page_present(struct page *page);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
index 321b09d2e2ea..87ba5a68bbb8 100644
--- a/arch/riscv/mm/pageattr.c
+++ b/arch/riscv/mm/pageattr.c
@@ -198,3 +198,32 @@ void __kernel_map_pages(struct page *page, int numpages, 
int enable)
 __pgprot(0), __pgprot(_PAGE_PRESENT));
 }
 #endif
+
+bool kernel_page_present(struct page *page)
+{
+   unsigned long addr = (unsigned long)page_address(page);
+   pgd_t *pgd;
+   pud_t *pud;
+   p4d_t *p4d;
+   pmd_t *pmd;
+   pte_t *pte;
+
+   pgd = pgd_offset_k(addr);
+   if (!pgd_present(*pgd))
+   return false;
+
+   p4d = p4d_offset(pgd, addr);
+   if (!p4d_present(*p4d))
+   return false;
+
+   pud = pud_offset(p4d, addr);
+   if (!pud_present(*pud))
+   return false;
+
+   pmd = pmd_offset(pud, addr);
+   if (!pmd_present(*pmd))
+   return false;
+
+   pte = pte_offset_kernel(pmd, addr);
+   return pte_present(*pte);
+}
diff --git a/arch/x86/include/asm/set_memory.h 
b/arch/x86/include/asm/set_memory.h
index 5948218f35c5..4352f08bfbb5 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -82,6 +82,7 @@ int set_pages_rw(struct page *page, int numpages);
 
 int set_direct_map_invalid_noflush(struct page *page);
 int set_direct_map_default_noflush(struct page *page);
+bool kernel_page_present(struct page *page);
 
 extern int kernel_set_to_readonly;
 
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index bc9be96b777f..16f878c26667 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2226,8 +2226,8 @@ void __kernel_map_pages(struct page *page, int numpages, 
int enable)
 
arch_flush_lazy_mmu_mode();
 }
+#endif /* CONFIG_DEBUG_PAGEALLOC */
 
-#ifdef CONFIG_HIBERNATION
 bool kernel_page_present(struct page *page)
 {
unsigned int level;
@@ -2239,8 +2239,6 @@ bool kernel_page_present(struct page *page)
pte = lookup_address((unsigned long)page_address(page), &level);
return (pte_val(*pte) & _PAGE_PRESENT);
 }
-#endif /* CONFIG_HIBERNATION */
-#endif /* CONFIG_DEBUG_PAGEALLOC */
 
 int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long 

[PATCH v5 4/5] arch, mm: restore dependency of __kernel_map_pages() on DEBUG_PAGEALLOC

2020-11-07 Thread Mike Rapoport
From: Mike Rapoport 

The design of DEBUG_PAGEALLOC presumes that __kernel_map_pages() must never
fail. With this assumption it wouldn't be safe to allow general usage of
this function.

Moreover, some architectures that implement __kernel_map_pages() have this
function guarded by #ifdef DEBUG_PAGEALLOC and some refuse to map/unmap
pages when page allocation debugging is disabled at runtime.

As all the users of __kernel_map_pages() were converted to use
debug_pagealloc_map_pages(), it is safe to make it available only when
DEBUG_PAGEALLOC is set.
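
The include/linux/mm.h hunk of this patch is not visible in this archive
excerpt; roughly, the changelog describes ending up with an arrangement like
the sketch below (a paraphrase for orientation, not the literal diff):

#ifdef CONFIG_DEBUG_PAGEALLOC
extern void __kernel_map_pages(struct page *page, int numpages, int enable);

static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
{
	if (debug_pagealloc_enabled_static())
		__kernel_map_pages(page, numpages, 1);
}

static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
{
	if (debug_pagealloc_enabled_static())
		__kernel_map_pages(page, numpages, 0);
}
#else  /* CONFIG_DEBUG_PAGEALLOC */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {}
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {}
#endif /* CONFIG_DEBUG_PAGEALLOC */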

Signed-off-by: Mike Rapoport 
Acked-by: David Hildenbrand 
Acked-by: Kirill A. Shutemov 
---
 arch/Kconfig |  3 +++
 arch/arm64/Kconfig   |  4 +---
 arch/arm64/mm/pageattr.c |  8 ++--
 arch/powerpc/Kconfig |  5 +
 arch/riscv/Kconfig   |  4 +---
 arch/riscv/include/asm/pgtable.h |  2 --
 arch/riscv/mm/pageattr.c |  2 ++
 arch/s390/Kconfig|  4 +---
 arch/sparc/Kconfig   |  4 +---
 arch/x86/Kconfig |  4 +---
 arch/x86/mm/pat/set_memory.c |  2 ++
 include/linux/mm.h   | 10 +++---
 12 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 56b6ccc0e32d..56d4752b6db6 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1028,6 +1028,9 @@ config HAVE_STATIC_CALL_INLINE
bool
depends on HAVE_STATIC_CALL
 
+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+   bool
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1d466addb078..a932810cfd90 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -71,6 +71,7 @@ config ARM64
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_USE_SYM_ANNOTATIONS
+   select ARCH_SUPPORTS_DEBUG_PAGEALLOC
select ARCH_SUPPORTS_MEMORY_FAILURE
select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK
select ARCH_SUPPORTS_ATOMIC_RMW
@@ -1025,9 +1026,6 @@ config HOLES_IN_ZONE
 
 source "kernel/Kconfig.hz"
 
-config ARCH_SUPPORTS_DEBUG_PAGEALLOC
-   def_bool y
-
 config ARCH_SPARSEMEM_ENABLE
def_bool y
select SPARSEMEM_VMEMMAP_ENABLE
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 1b94f5b82654..439325532be1 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -155,7 +155,7 @@ int set_direct_map_invalid_noflush(struct page *page)
.clear_mask = __pgprot(PTE_VALID),
};
 
-   if (!rodata_full)
+   if (!debug_pagealloc_enabled() && !rodata_full)
return 0;
 
return apply_to_page_range(&init_mm,
@@ -170,7 +170,7 @@ int set_direct_map_default_noflush(struct page *page)
.clear_mask = __pgprot(PTE_RDONLY),
};
 
-   if (!rodata_full)
+   if (!debug_pagealloc_enabled() && !rodata_full)
return 0;
 
return apply_to_page_range(&init_mm,
@@ -178,6 +178,7 @@ int set_direct_map_default_noflush(struct page *page)
   PAGE_SIZE, change_page_range, &data);
 }
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
 void __kernel_map_pages(struct page *page, int numpages, int enable)
 {
if (!debug_pagealloc_enabled() && !rodata_full)
@@ -186,6 +187,7 @@ void __kernel_map_pages(struct page *page, int numpages, 
int enable)
set_memory_valid((unsigned long)page_address(page), numpages, enable);
 }
 
+#ifdef CONFIG_HIBERNATION
 /*
  * This function is used to determine if a linear map page has been marked as
  * not-valid. Walk the page table and check the PTE_VALID bit. This is based
@@ -232,3 +234,5 @@ bool kernel_page_present(struct page *page)
ptep = pte_offset_kernel(pmdp, addr);
return pte_valid(READ_ONCE(*ptep));
 }
+#endif /* CONFIG_HIBERNATION */
+#endif /* CONFIG_DEBUG_PAGEALLOC */
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e9f13fe08492..ad8a83f3ddca 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -146,6 +146,7 @@ config PPC
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_SUPPORTS_ATOMIC_RMW
+   select ARCH_SUPPORTS_DEBUG_PAGEALLOCif PPC32 || PPC_BOOK3S_64
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
select ARCH_USE_QUEUED_RWLOCKS  if PPC_QUEUED_SPINLOCKS
@@ -355,10 +356,6 @@ config PPC_OF_PLATFORM_PCI
depends on PCI
depends on PPC64 # not supported on 32 bits yet
 
-config ARCH_SUPPORTS_DEBUG_PAGEALLOC
-   depends on PPC32 || PPC_BOOK3S_64
-   def_bool y
-
 config ARCH_SUPPORTS_UPROBES
def_bool y
 
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 44377fd7860e..9283c6f9ae2a 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -14,6 +14,7 @@ config RISCV
def_bool y

[PATCH v5 3/5] PM: hibernate: make direct map manipulations more explicit

2020-11-07 Thread Mike Rapoport
From: Mike Rapoport 

When DEBUG_PAGEALLOC or ARCH_HAS_SET_DIRECT_MAP is enabled, a page may not
be present in the direct map and has to be explicitly mapped before it
can be copied.

Introduce hibernate_map_page() and hibernate_unmap_page() that will
explicitly use set_direct_map_{default,invalid}_noflush() for the
ARCH_HAS_SET_DIRECT_MAP case and debug_pagealloc_{map,unmap}_pages() for
the DEBUG_PAGEALLOC case.

The remapping of the pages in safe_copy_page() presumes that it only
changes protection bits in an existing PTE, so it is safe to ignore the
return value of set_direct_map_{default,invalid}_noflush().

Still, add a pr_warn() so that future changes in set_memory APIs will not
silently break hibernation.

Signed-off-by: Mike Rapoport 
Acked-by: Rafael J. Wysocki 
Reviewed-by: David Hildenbrand 
Acked-by: Kirill A. Shutemov 
Acked-by: Vlastimil Babka 
---
 include/linux/mm.h  | 12 
 kernel/power/snapshot.c | 38 --
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bb8c70178f4e..e198b938f5c5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2927,16 +2927,6 @@ static inline bool debug_pagealloc_enabled_static(void)
 #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP)
 extern void __kernel_map_pages(struct page *page, int numpages, int enable);
 
-/*
- * When called in DEBUG_PAGEALLOC context, the call should most likely be
- * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static()
- */
-static inline void
-kernel_map_pages(struct page *page, int numpages, int enable)
-{
-   __kernel_map_pages(page, numpages, enable);
-}
-
 static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
 {
if (debug_pagealloc_enabled_static())
@@ -2953,8 +2943,6 @@ static inline void debug_pagealloc_unmap_pages(struct 
page *page, int numpages)
 extern bool kernel_page_present(struct page *page);
 #endif /* CONFIG_HIBERNATION */
 #else  /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */
-static inline void
-kernel_map_pages(struct page *page, int numpages, int enable) {}
 static inline void debug_pagealloc_map_pages(struct page *page, int numpages) 
{}
 static inline void debug_pagealloc_unmap_pages(struct page *page, int 
numpages) {}
 #ifdef CONFIG_HIBERNATION
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 46b1804c1ddf..d848377dd8dc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -76,6 +76,40 @@ static inline void hibernate_restore_protect_page(void 
*page_address) {}
 static inline void hibernate_restore_unprotect_page(void *page_address) {}
 #endif /* CONFIG_STRICT_KERNEL_RWX  && CONFIG_ARCH_HAS_SET_MEMORY */
 
+
+/*
+ * The calls to set_direct_map_*() should not fail because remapping a page
+ * here means that we only update protection bits in an existing PTE.
+ * It is still worth to have a warning here if something changes and this
+ * will no longer be the case.
+ */
+static inline void hibernate_map_page(struct page *page)
+{
+   if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
+   int ret = set_direct_map_default_noflush(page);
+
+   if (ret)
+   pr_warn_once("Failed to remap page\n");
+   } else {
+   debug_pagealloc_map_pages(page, 1);
+   }
+}
+
+static inline void hibernate_unmap_page(struct page *page)
+{
+   if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
+   unsigned long addr = (unsigned long)page_address(page);
+   int ret  = set_direct_map_invalid_noflush(page);
+
+   if (ret)
+   pr_warn_once("Failed to remap page\n");
+
+   flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+   } else {
+   debug_pagealloc_unmap_pages(page, 1);
+   }
+}
+
 static int swsusp_page_is_free(struct page *);
 static void swsusp_set_page_forbidden(struct page *);
 static void swsusp_unset_page_forbidden(struct page *);
@@ -1355,9 +1389,9 @@ static void safe_copy_page(void *dst, struct page *s_page)
if (kernel_page_present(s_page)) {
do_copy_page(dst, page_address(s_page));
} else {
-   kernel_map_pages(s_page, 1, 1);
+   hibernate_map_page(s_page);
do_copy_page(dst, page_address(s_page));
-   kernel_map_pages(s_page, 1, 0);
+   hibernate_unmap_page(s_page);
}
 }
 
-- 
2.28.0



[PATCH v5 2/5] slab: debug: split slab_kernel_map() to map and unmap variants

2020-11-07 Thread Mike Rapoport
From: Mike Rapoport 

Instead of using slab_kernel_map() with a 'map' parameter to remap pages when
DEBUG_PAGEALLOC is enabled, use the dedicated helpers slab_kernel_map() and
slab_kernel_unmap().

Signed-off-by: Mike Rapoport 
---
 mm/slab.c | 26 +++---
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 07317386e150..0719421d69f7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1428,17 +1428,21 @@ static bool is_debug_pagealloc_cache(struct kmem_cache 
*cachep)
return false;
 }
 
-static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map)
+static void slab_kernel_map(struct kmem_cache *cachep, void *objp)
 {
if (!is_debug_pagealloc_cache(cachep))
return;
 
-   if (map)
-   debug_pagealloc_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE);
-   else
-   debug_pagealloc_unmap_pages(virt_to_page(objp),
-   cachep->size / PAGE_SIZE);
+   debug_pagealloc_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE);
+}
+
+static void slab_kernel_unmap(struct kmem_cache *cachep, void *objp)
+{
+   if (!is_debug_pagealloc_cache(cachep))
+   return;
+
+   debug_pagealloc_unmap_pages(virt_to_page(objp),
+   cachep->size / PAGE_SIZE);
 }
 
 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char 
val)
@@ -1585,7 +1589,7 @@ static void slab_destroy_debugcheck(struct kmem_cache 
*cachep,
 
if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp);
-   slab_kernel_map(cachep, objp, 1);
+   slab_kernel_map(cachep, objp);
}
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
@@ -2360,7 +2364,7 @@ static void cache_init_objs_debug(struct kmem_cache 
*cachep, struct page *page)
/* need to poison the objs? */
if (cachep->flags & SLAB_POISON) {
poison_obj(cachep, objp, POISON_FREE);
-   slab_kernel_map(cachep, objp, 0);
+   slab_kernel_unmap(cachep, objp);
}
}
 #endif
@@ -2728,7 +2732,7 @@ static void *cache_free_debugcheck(struct kmem_cache 
*cachep, void *objp,
 
if (cachep->flags & SLAB_POISON) {
poison_obj(cachep, objp, POISON_FREE);
-   slab_kernel_map(cachep, objp, 0);
+   slab_kernel_unmap(cachep, objp);
}
return objp;
 }
@@ -2993,7 +2997,7 @@ static void *cache_alloc_debugcheck_after(struct 
kmem_cache *cachep,
return objp;
if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp);
-   slab_kernel_map(cachep, objp, 1);
+   slab_kernel_map(cachep, objp);
poison_obj(cachep, objp, POISON_INUSE);
}
if (cachep->flags & SLAB_STORE_USER)
-- 
2.28.0



[PATCH v5 1/5] mm: introduce debug_pagealloc_{map, unmap}_pages() helpers

2020-11-07 Thread Mike Rapoport
From: Mike Rapoport 

When CONFIG_DEBUG_PAGEALLOC is enabled, pages are unmapped from the kernel
direct mapping after free_pages(). The pages then need to be mapped back
before they can be used. These mapping operations use
__kernel_map_pages() guarded with debug_pagealloc_enabled().

The only place that calls __kernel_map_pages() without checking whether
DEBUG_PAGEALLOC is enabled is the hibernation code that presumes
availability of this function when ARCH_HAS_SET_DIRECT_MAP is set.
Still, on arm64, __kernel_map_pages() will bail out when DEBUG_PAGEALLOC is
not enabled but set_direct_map_invalid_noflush() may render some pages not
present in the direct map and hibernation code won't be able to save such
pages.

To make page allocation debugging and hibernation interaction more robust,
the dependency on DEBUG_PAGEALLOC or ARCH_HAS_SET_DIRECT_MAP has to be made
more explicit.

Start with combining the guard condition and the call to
__kernel_map_pages() into debug_pagealloc_map_pages() and
debug_pagealloc_unmap_pages() functions to emphasize that
__kernel_map_pages() should not be called without DEBUG_PAGEALLOC and use
these new functions to map/unmap pages when page allocation debugging is
enabled.

Signed-off-by: Mike Rapoport 
Reviewed-by: David Hildenbrand 
Acked-by: Kirill A. Shutemov 
Acked-by: Vlastimil Babka 
---
 include/linux/mm.h  | 15 +++
 mm/memory_hotplug.c |  3 +--
 mm/page_alloc.c |  6 ++
 mm/slab.c   | 16 +++-
 4 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ef360fe70aaf..bb8c70178f4e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2936,12 +2936,27 @@ kernel_map_pages(struct page *page, int numpages, int 
enable)
 {
__kernel_map_pages(page, numpages, enable);
 }
+
+static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
+{
+   if (debug_pagealloc_enabled_static())
+   __kernel_map_pages(page, numpages, 1);
+}
+
+static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
+{
+   if (debug_pagealloc_enabled_static())
+   __kernel_map_pages(page, numpages, 0);
+}
+
 #ifdef CONFIG_HIBERNATION
 extern bool kernel_page_present(struct page *page);
 #endif /* CONFIG_HIBERNATION */
 #else  /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable) {}
+static inline void debug_pagealloc_map_pages(struct page *page, int numpages) 
{}
+static inline void debug_pagealloc_unmap_pages(struct page *page, int 
numpages) {}
 #ifdef CONFIG_HIBERNATION
 static inline bool kernel_page_present(struct page *page) { return true; }
 #endif /* CONFIG_HIBERNATION */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b44d4c7ba73b..f18f86ba2a68 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -614,8 +614,7 @@ void generic_online_page(struct page *page, unsigned int 
order)
 * so we should map it first. This is better than introducing a special
 * case in page freeing fast path.
 */
-   if (debug_pagealloc_enabled_static())
-   kernel_map_pages(page, 1 << order, 1);
+   debug_pagealloc_map_pages(page, 1 << order);
__free_pages_core(page, order);
totalram_pages_add(1UL << order);
 #ifdef CONFIG_HIGHMEM
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23f5066bd4a5..db1bf70458d0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1272,8 +1272,7 @@ static __always_inline bool free_pages_prepare(struct 
page *page,
 */
arch_free_page(page, order);
 
-   if (debug_pagealloc_enabled_static())
-   kernel_map_pages(page, 1 << order, 0);
+   debug_pagealloc_unmap_pages(page, 1 << order);
 
kasan_free_nondeferred_pages(page, order);
 
@@ -2270,8 +2269,7 @@ inline void post_alloc_hook(struct page *page, unsigned 
int order,
set_page_refcounted(page);
 
arch_alloc_page(page, order);
-   if (debug_pagealloc_enabled_static())
-   kernel_map_pages(page, 1 << order, 1);
+   debug_pagealloc_map_pages(page, 1 << order);
kasan_alloc_pages(page, order);
kernel_poison_pages(page, 1 << order, 1);
set_page_owner(page, order, gfp_flags);
diff --git a/mm/slab.c b/mm/slab.c
index b1113561b98b..07317386e150 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1428,21 +1428,19 @@ static bool is_debug_pagealloc_cache(struct kmem_cache 
*cachep)
return false;
 }
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
 static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map)
 {
if (!is_debug_pagealloc_cache(cachep))
return;
 
-   kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
+   if (map)
+   debug_pagealloc_map_pages(virt_to_page(objp),
+ cachep->size / PAGE_SIZE);
+

[PATCH v5 0/5] arch, mm: improve robustness of direct map manipulation

2020-11-07 Thread Mike Rapoport
From: Mike Rapoport 

Hi,

During recent discussion about KVM protected memory, David raised a concern
about usage of __kernel_map_pages() outside of DEBUG_PAGEALLOC scope [1].

Indeed, for architectures that define CONFIG_ARCH_HAS_SET_DIRECT_MAP it is
possible that __kernel_map_pages() would fail, but since this function is
void, the failure will go unnoticed.

Moreover, there's a lack of consistency in __kernel_map_pages() semantics
across architectures: some guard this function with
#ifdef DEBUG_PAGEALLOC, some refuse to update the direct map if page
allocation debugging is disabled at run time, and some allow modifying the
direct map regardless of DEBUG_PAGEALLOC settings.

This set straightens this out by restoring the dependency of
__kernel_map_pages() on DEBUG_PAGEALLOC and updating the call sites
accordingly.

Since currently the only user of __kernel_map_pages() outside
DEBUG_PAGEALLOC is hibernation, it is updated to make direct map accesses
there more explicit.

[1] https://lore.kernel.org/lkml/2759b4bf-e1e3-d006-7d86-78a403482...@redhat.com

v5 changes:
* use pairs of _map()/_unmap() functions instead of _map(..., int enable) as
  Vlastimil suggested

v4 changes:
* s/WARN_ON/pr_warn_once/ per David and Kirill
* rebase on v5.10-rc2
* add Acked/Reviewed tags
https://lore.kernel.org/lkml/20201103162057.22916-1-r...@kernel.org

v3 changes:
* update arm64 changes to avoid regression, per Rick's comments
* fix bisectability
https://lore.kernel.org/lkml/20201101170815.9795-1-r...@kernel.org

v2 changes:
* Rephrase patch 2 changelog to better describe the change intentions and
implications
* Move removal of kernel_map_pages() from patch 1 to patch 2, per David
https://lore.kernel.org/lkml/20201029161902.19272-1-r...@kernel.org

v1:
https://lore.kernel.org/lkml/20201025101555.3057-1-r...@kernel.org

Mike Rapoport (5):
  mm: introduce debug_pagealloc_{map,unmap}_pages() helpers
  slab: debug: split slab_kernel_map() to map and unmap variants
  PM: hibernate: make direct map manipulations more explicit
  arch, mm: restore dependency of __kernel_map_pages() on DEBUG_PAGEALLOC
  arch, mm: make kernel_page_present() always available

 arch/Kconfig|  3 +++
 arch/arm64/Kconfig  |  4 +--
 arch/arm64/include/asm/cacheflush.h |  1 +
 arch/arm64/mm/pageattr.c|  6 +++--
 arch/powerpc/Kconfig|  5 +---
 arch/riscv/Kconfig  |  4 +--
 arch/riscv/include/asm/pgtable.h|  2 --
 arch/riscv/include/asm/set_memory.h |  1 +
 arch/riscv/mm/pageattr.c| 31 ++
 arch/s390/Kconfig   |  4 +--
 arch/sparc/Kconfig  |  4 +--
 arch/x86/Kconfig|  4 +--
 arch/x86/include/asm/set_memory.h   |  1 +
 arch/x86/mm/pat/set_memory.c|  4 +--
 include/linux/mm.h  | 40 ++---
 include/linux/set_memory.h  |  5 
 kernel/power/snapshot.c | 38 +--
 mm/memory_hotplug.c |  3 +--
 mm/page_alloc.c |  6 ++---
 mm/slab.c   | 26 ++-
 20 files changed, 127 insertions(+), 65 deletions(-)

-- 
2.28.0



[Bug 209733] Starting new KVM virtual machines on PPC64 starts to hang after box is up for a while

2020-11-07 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=209733

--- Comment #2 from Cameron (c...@neo-zeon.de) ---
Verified this happens with 5.9.6 and the Debian vendor kernel
linux-image-5.9.0-1-powerpc64le.

Might also be worth mentioning this is occurring with qemu-system-ppc package
version 1:3.1+dfsg-8+deb10u8.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

[PATCH] KVM: PPC: fix comparison to bool warning

2020-11-07 Thread xiakaixu1987
From: Kaixu Xia 

Fix the following coccicheck warning:

./arch/powerpc/kvm/booke.c:503:6-16: WARNING: Comparison to bool
./arch/powerpc/kvm/booke.c:505:6-17: WARNING: Comparison to bool
./arch/powerpc/kvm/booke.c:507:6-16: WARNING: Comparison to bool

Reported-by: Tosk Robot 
Signed-off-by: Kaixu Xia 
---
 arch/powerpc/kvm/booke.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index b1abcb816439..288a9820ec01 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -500,11 +500,11 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu 
*vcpu,
 
vcpu->arch.regs.nip = vcpu->arch.ivpr |
vcpu->arch.ivor[priority];
-   if (update_esr == true)
+   if (update_esr)
kvmppc_set_esr(vcpu, vcpu->arch.queued_esr);
-   if (update_dear == true)
+   if (update_dear)
kvmppc_set_dar(vcpu, vcpu->arch.queued_dear);
-   if (update_epr == true) {
+   if (update_epr) {
if (vcpu->arch.epr_flags & KVMPPC_EPR_USER)
kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL) {
-- 
2.20.0



Re: [PATCH] powerpc/64s: Remove RFI

2020-11-07 Thread Christophe Leroy




On 06/11/2020 at 12:36, Christophe Leroy wrote:

Last use of RFI on PPC64 was removed by
commit b8e90cb7bc04 ("powerpc/64: Convert the syscall exit path to
use RFI_TO_USER/KERNEL").

Remove the macro.


Forget this crazy patch. I missed two RFI in head_64.S 

Christophe



Signed-off-by: Christophe Leroy 
---
  arch/powerpc/include/asm/ppc_asm.h | 1 -
  1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/include/asm/ppc_asm.h 
b/arch/powerpc/include/asm/ppc_asm.h
index 511786f0e40d..bedf3eb52ebc 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -495,7 +495,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, 
CPU_FTR_CELL_TB_BUG, 96)
  #endif
  
  #ifdef CONFIG_PPC_BOOK3S_64

-#define RFIrfid
  #define MTMSRD(r) mtmsrd  r
  #define MTMSR_EERI(reg)   mtmsrd  reg,1
  #else



Re: [PATCH] powerpc/32s: Use relocation offset when setting early hash table

2020-11-07 Thread Andreas Schwab
On Nov 07 2020, Serge Belyshev wrote:

> Christophe Leroy  writes:
>
>> When calling early_hash_table(), the kernel hasn't been yet
>> relocated to its linking address, so data must be addressed
>> with relocation offset.
>>
>> Add relocation offset to write into Hash in early_hash_table().
>>
>> Reported-by: Erhard Furtner 
>> Reported-by: Andreas Schwab 
>> Fixes: 69a1593abdbc ("powerpc/32s: Setup the early hash table at all time.")
>> Signed-off-by: Christophe Leroy 
>
> Tested-by: Serge Belyshev 

Works here as well.

Thanks, Andreas.

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."


Re: Kernel panic from malloc() on SUSE 15.1?

2020-11-07 Thread Carl Jacobsen
On Fri, Nov 6, 2020 at 4:25 AM Michael Ellerman  wrote:

> So something seems to have gone wrong linking this, I see eg:
>
> 10004a8c :
> 10004a8c:   2b 10 40 3c lis r2,4139
> 10004a90:   88 f7 42 38 addir2,r2,-2168
> 10004a94:   a6 02 08 7c mflrr0
> 10004a98:   10 00 01 f8 std r0,16(r1)
> 10004a9c:   f8 ff e1 fb std r31,-8(r1)
> 10004aa0:   81 ff 21 f8 stdur1,-128(r1)
> 10004aa4:   78 0b 3f 7c mr  r31,r1
> 10004aa8:   60 00 7f f8 std r3,96(r31)
> 10004aac:   68 00 9f f8 std r4,104(r31)
> 10004ab0:   00 00 00 60 nop
> 10004ab4:   30 80 22 e9 ld  r9,-32720(r2)
> 10004ab8:   00 00 a9 2f cmpdi   cr7,r9,0
> 10004abc:   30 00 9e 41 beq cr7,10004aec 
> 10004ac0:   60 00 7f e8 ld  r3,96(r31)
> 10004ac4:   68 00 9f e8 ld  r4,104(r31)
> 10004ac8:   39 b5 ff 4b bl  1000 <_init-0x1f00>
>
> Notice that last bl (branch and link) to 0x1000. But there's no text
> at 0x1000, that's the start of the page which happens to be the ELF
> magic.
>
> I've seen something like this before, but I can't remember when/where so
> I haven't been able to track down what the problem was.
>
> Anyway hopefully someone on the list will know.
>
> That still doesn't explain the kernel crash though.
>

Interesting. Sounds highly unlikely that the linker would have picked
that address at random, but it makes no sense. And, agreed, jumping
into junk should crash the program, not the kernel.


> On my machine it doesn't crash the kernel, so I can catch it later. For
> me it's here:
> 

ie. in the syscall_random() that I mentioned above.
>
> You should be able to catch it there too if you do:
>
> (gdb) b *0x1000
> (gdb) r
>
> Hopefully it will stop without crashing the kernel, and then a `bt` will
> show that you're in the same place as me.
>
> If you can get that to work, when you're stopped there, can you do an
> `info registers` and send us the output.
>

Indeed, setting the breakpoint you suggested works, and the stack looks
almost the same - the only differences are a few bits off in main's argv
pointer, rand_drbg_get_entropy's pout pointer, and the final address - you
get 0x1004, I get 0x1000. Output, including "info
registers", is below. Hoping it provides some useful clues. Thanks again for
looking into this.

# gdb --args /tmp/ossl/rand_test
...
(gdb) b *0x1000
Breakpoint 1 at 0x1000
(gdb) r
Starting program: /tmp/ossl/rand_test

Breakpoint 1, 0x1000 in ?? ()
(gdb) bt
#0  0x1000 in ?? ()
#1  0x10004acc in syscall_random (buf=0x102b0730, buflen=32) at
crypto/rand/rand_unix.c:371
#2  0x100053fc in rand_pool_acquire_entropy (pool=0x102b06e0) at
crypto/rand/rand_unix.c:636
#3  0x10002b58 in rand_drbg_get_entropy (drbg=0x102b02e0,
pout=0x7fffecf0, entropy=256, min_len=32,
max_len=2147483647, prediction_resistance=0) at
crypto/rand/rand_lib.c:198
#4  0x1001ed9c in RAND_DRBG_instantiate (drbg=0x102b02e0,
pers=0x10248d00  "OpenSSL NIST SP 800-90A DRBG",
perslen=28) at crypto/rand/drbg_lib.c:338
#5  0x10020300 in drbg_setup (parent=0x0) at
crypto/rand/drbg_lib.c:895
#6  0x10020414 in do_rand_drbg_init () at crypto/rand/drbg_lib.c:924
#7  0x1002034c in do_rand_drbg_init_ossl_ () at
crypto/rand/drbg_lib.c:909
#8  0x10005d1c in CRYPTO_THREAD_run_once (once=0x102ab4d8
,
init=0x1002032c ) at crypto/threads_none.c:70
#9  0x100209c4 in RAND_DRBG_get0_master () at
crypto/rand/drbg_lib.c:1102
#10 0x10020914 in drbg_status () at crypto/rand/drbg_lib.c:1084
#11 0x10004a58 in RAND_status () at crypto/rand/rand_lib.c:961
#12 0x10002890 in main (argc=1, argv=0x7368) at
rand_test.c:6
(gdb) info registers
r0 0x100053fc  268456956
r1 0x7fffeaf0  140737488349936
r2 0x102af788  271251336
r3 0x102b0730  271255344
r4 0x2032
r5 0x3048
r6 0x102b0760  271255392
r7 0x1 1
r8 0x0 0
r9 0x7fffb7dacc00  140736277957632
r100x102b0730  271255344
r110x1016
r120x7fffb7e19280  140736278401664
r130x7fffb7ffa100  140736280371456
r140x0 0
r150x0 0
r160x0 0
r170x0 0
r180x0 0
r190x0 0
r200x0 0
r210x0 0
r220x0 0
r230x0 0
r240x0 0
r250x0 0
r260x0 0
r27

[PATCH] KVM: PPC: Book3S: Assign boolean values to a bool variable

2020-11-07 Thread xiakaixu1987
From: Kaixu Xia 

Fix the following coccinelle warnings:

./arch/powerpc/kvm/book3s_xics.c:476:3-15: WARNING: Assignment of 0/1 to bool 
variable
./arch/powerpc/kvm/book3s_xics.c:504:3-15: WARNING: Assignment of 0/1 to bool 
variable

Reported-by: Tosk Robot 
Signed-off-by: Kaixu Xia 
---
 arch/powerpc/kvm/book3s_xics.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 5fee5a11550d..303e3cb096db 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -473,7 +473,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, 
struct kvmppc_icp *icp,
arch_spin_unlock(&ics->lock);
local_irq_restore(flags);
new_irq = reject;
-   check_resend = 0;
+   check_resend = false;
goto again;
}
} else {
@@ -501,7 +501,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, 
struct kvmppc_icp *icp,
state->resend = 0;
arch_spin_unlock(&ics->lock);
local_irq_restore(flags);
-   check_resend = 0;
+   check_resend = false;
goto again;
}
}
-- 
2.20.0



Re: [PATCH] powerpc: add compile-time support for lbarx, lwarx

2020-11-07 Thread Segher Boessenkool
On Sat, Nov 07, 2020 at 08:12:13AM +0100, Gabriel Paubert wrote:
> On Sat, Nov 07, 2020 at 01:23:28PM +1000, Nicholas Piggin wrote:
> > ISA v2.06 (POWER7 and up) as well as e6500 support lbarx and lwarx.
> 
> Hmm, lwarx exists since original Power AFAIR,

Almost: it was new on PowerPC.


Segher


[PATCH] panic: don't dump stack twice on warn

2020-11-07 Thread Christophe Leroy
Before commit 3f388f28639f ("panic: dump registers on panic_on_warn"),
__warn() was calling show_regs() when regs was not NULL, and
show_stack() otherwise.

After that commit, show_stack() is called regardless of whether
show_regs() has been called or not, leading to duplicated Call Trace:

[7.112617] [ cut here ]
[7.117041] WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/nohash/8xx.c:186 
mmu_mark_initmem_nx+0x24/0x94
[7.126021] CPU: 0 PID: 1 Comm: swapper Not tainted 
5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty #4092
[7.135202] NIP:  c00128b4 LR: c0010228 CTR: 
[7.140205] REGS: c9023e40 TRAP: 0700   Not tainted  
(5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty)
[7.149131] MSR:  00029032   CR: 24000424  XER: 
[7.155760]
[7.155760] GPR00: c0010228 c9023ef8 c210 0074c000   
c2151000 c07b3880
[7.155760] GPR08: ff000900 0074c000 c800 c33b53a8 24000822  
c0003a20 
[7.155760] GPR16:       
 
[7.155760] GPR24:       
 0080
[7.191092] NIP [c00128b4] mmu_mark_initmem_nx+0x24/0x94
[7.196333] LR [c0010228] free_initmem+0x20/0x58
[7.200855] Call Trace:
[7.203319] [c9023f18] [c0010228] free_initmem+0x20/0x58
[7.208564] [c9023f28] [c0003a3c] kernel_init+0x1c/0x114
[7.213813] [c9023f38] [c000f184] ret_from_kernel_thread+0x14/0x1c
[7.219869] Instruction dump:
[7.222805] 7d291850 7d234b78 4e800020 9421ffe0 7c0802a6 bfc10018 3fe0c060 
3bff
[7.230462] 3fff4080 3bff 90010024 57ff0010 <0fe0> 392001cd 7c3e0b78 
953e0008
[7.238327] CPU: 0 PID: 1 Comm: swapper Not tainted 
5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty #4092
[7.247500] Call Trace:
[7.249977] [c9023dc0] [c001e070] __warn+0x8c/0xd8 (unreliable)
[7.255815] [c9023de0] [c05e0e5c] report_bug+0x11c/0x154
[7.261085] [c9023e10] [c0009ea4] program_check_exception+0x1dc/0x6e0
[7.267430] [c9023e30] [c000f43c] ret_from_except_full+0x0/0x4
[7.273238] --- interrupt: 700 at mmu_mark_initmem_nx+0x24/0x94
[7.273238] LR = free_initmem+0x20/0x58
[7.283155] [c9023ef8] [] 0x0 (unreliable)
[7.287913] [c9023f18] [c0010228] free_initmem+0x20/0x58
[7.293160] [c9023f28] [c0003a3c] kernel_init+0x1c/0x114
[7.298410] [c9023f38] [c000f184] ret_from_kernel_thread+0x14/0x1c
[7.304479] ---[ end trace 31702cd2a9570752 ]---

Only call show_stack() when regs is NULL.
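
For context, a condensed sketch of the resulting flow in __warn() after this
change (unrelated parts elided; not the full kernel/panic.c function):

void __warn(const char *file, int line, void *caller, unsigned taint,
	    struct pt_regs *regs, struct warn_args *args)
{
	/* ... banner, print_modules(), panic_on_warn handling elided ... */

	if (regs)
		show_regs(regs);	/* already prints a Call Trace */

	if (!regs)
		dump_stack();		/* only dump the stack once */

	print_irqtrace_events(current);

	/* ... taint accounting elided ... */
}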

Fixes: 3f388f28639f ("panic: dump registers on panic_on_warn")
Cc: Alexey Kardashevskiy 
Signed-off-by: Christophe Leroy 
---
 kernel/panic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/panic.c b/kernel/panic.c
index 396142ee43fd..332736a72a58 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -605,7 +605,8 @@ void __warn(const char *file, int line, void *caller, 
unsigned taint,
panic("panic_on_warn set ...\n");
}
 
-   dump_stack();
+   if (!regs)
+   dump_stack();
 
print_irqtrace_events(current);
 
-- 
2.25.0



Re: [RFC PATCH] powerpc: show registers when unwinding interrupt frames

2020-11-07 Thread Christophe Leroy




On 07/11/2020 at 03:33, Nicholas Piggin wrote:

It's often useful to know the register state for interrupts in
the stack frame. In the below example (with this patch applied),
the important information is the state of the page fault.

A blatant case like this probably rather should have the page
fault regs passed down to the warning, but quite often there are
less obvious cases where an interrupt shows up that might give
some more clues.


I like it.

I was wondering about interrupts that do not save NV registers, but that seems 
to be handled:

[0.455489] --- interrupt: 301 at cmpxchg_futex_value_locked+0x2c/0x58
[0.461886] NIP:  c0089c08 LR: c0755df0 CTR: c02e59a4
[0.466889] REGS: c9023db0 TRAP: 0301   Not tainted  
(5.10.0-rc2-s3k-dev-01371-gfb45a2414e96-dirty)
[0.475815] MSR:  9032   CR: 28000244  XER: 
[0.482450] DAR:  DSISR: c000
[0.482450] GPR00: c0755dc8 c9023e68 c210 c9023e78   
 
[0.482450] GPR08: 1032  8000 0003 42000242
[0.500988] NIP [c0089c08] cmpxchg_futex_value_locked+0x2c/0x58
[0.506842] LR [c0755df0] futex_init+0x74/0xd0
[0.511194] --- interrupt: 301

Christophe



The downside is longer and more complex bug output.

   Bug: Write fault blocked by AMR!
   WARNING: CPU: 0 PID: 72 at 
arch/powerpc/include/asm/book3s/64/kup-radix.h:164 __do_page_fault+0x880/0xa90
   Modules linked in:
   CPU: 0 PID: 72 Comm: systemd-gpt-aut Not tainted
   NIP:  c006e2f0 LR: c006e2ec CTR: 
   REGS: ca4f3420 TRAP: 0700
   MSR:  80021033   CR: 28002840  XER: 2004
   CFAR: c0128be0 IRQMASK: 3
   GPR00: c006e2ec ca4f36c0 c14f0700 0020
   GPR04: 0001 c1290f50 0001 c1290f80
   GPR08: c1612b08   e0f7
   GPR12: 48002840 c16e c00c00021c80 c0fd6f60
   GPR16:  ca104698 0003 c87f
   GPR20: 0100 c70330b8  0004
   GPR24: 0200 0300 0200 ca5b0c00
   GPR28:  0a00 7fffb2a90038 ca4f3820
   NIP [c006e2f0] __do_page_fault+0x880/0xa90
   LR [c006e2ec] __do_page_fault+0x87c/0xa90
   Call Trace:
   [ca4f36c0] [c006e2ec] __do_page_fault+0x87c/0xa90 
(unreliable)
   [ca4f3780] [c0e1c034] do_page_fault+0x34/0x90
   [ca4f37b0] [c0008908] data_access_common_virt+0x158/0x1b0
   --- interrupt: 300 at __copy_tofrom_user_base+0x9c/0x5a4
   NIP:  c009b028 LR: c0802978 CTR: 0800
   REGS: ca4f3820 TRAP: 0300
   MSR:  8280b033   CR: 24004840  XER: 

   CFAR: c009aff4 DAR: 7fffb2a90038 DSISR: 0a00 IRQMASK: 0
   GPR00:  ca4f3ac0 c14f0700 7fffb2a90028
   GPR04: c8720010 0001  
   GPR08:    0001
   GPR12: 4000 c16e c00c00021c80 c0fd6f60
   GPR16:  ca104698 0003 c87f
   GPR20: 0100 c70330b8  0004
   GPR24: ca4f3c80 c872 0001 
   GPR28: 0001 0872 0001 c1515b98
   NIP [c009b028] __copy_tofrom_user_base+0x9c/0x5a4
   LR [c0802978] copyout+0x68/0xc0
   --- interrupt: 300
   [ca4f3af0] [c08074b8] copy_page_to_iter+0x188/0x540
   [ca4f3b50] [c035c678] generic_file_buffered_read+0x358/0xd80
   [ca4f3c40] [c04c1e90] blkdev_read_iter+0x50/0x80
   [ca4f3c60] [c045733c] new_sync_read+0x12c/0x1c0
   [ca4f3d00] [c045a1f0] vfs_read+0x1d0/0x240
   [ca4f3d50] [c045a7f4] ksys_read+0x84/0x140
   [ca4f3da0] [c0033a60] system_call_exception+0x100/0x280
   [ca4f3e10] [c000c508] system_call_common+0xf8/0x2f8
   Instruction dump:
   eae10078 3beb 4bfff890 6042 792917e1 4182ff18 3c82ffab 3884a5e0
   3c62ffab 3863a6e8 480ba891 6000 <0fe0> 3beb 4bfff860 e93c0938

Signed-off-by: Nicholas Piggin 
---
  arch/powerpc/kernel/process.c | 20 ++--
  1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index ea36a29c8b01..799f00b32f74 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1475,12 +1475,10 @@ static void print_msr_bits(unsigned long val)
  #define LAST_VOLATILE 12
  #endif
  
-void show_regs(struct pt_regs * regs)

+static void __show_regs(struct pt_regs *regs)
  {
int i, 

Re: [RFC PATCH 0/9] powerpc/64s: fast interrupt exit

2020-11-07 Thread Christophe Leroy




On 06/11/2020 at 16:59, Nicholas Piggin wrote:

This series attempts to improve the speed of interrupts and system calls
in two major ways.

Firstly, the SRR/HSRR registers do not need to be reloaded if they were
not used or clobbered for the duration of the interrupt.

Secondly, an alternate return location facility is added for soft-masked
asynchronous interrupts and then that's used to set everything up for
return without having to disable MSR RI or EE.

After this series, the entire system call / interrupt handler fast path
executes no mtsprs and one mtmsrd to enable interrupts initially, and
the system call vectored path doesn't even need to do that.


Interesting series.

Unfortunately, this can't be done on PPC32 (at least on non-BookE), because it would mean mapping the kernel
at 0 instead of 0xC000. Not sure libc would like it, and anyway it would be an issue for
catching NULL pointer dereferences, unless we use page tables instead of BATs to map kernel memory,
which would be a serious performance cut.


Christophe



Thanks,
Nick

Nicholas Piggin (9):
   powerpc/64s: syscall real mode entry use mtmsrd rather than rfid
   powerpc/64s: system call avoid setting MSR[RI] until we set MSR[EE]
   powerpc/64s: introduce different functions to return from SRR vs HSRR
 interrupts
   powerpc/64s: avoid reloading (H)SRR registers if they are still valid
   powerpc/64: move interrupt return asm to interrupt_64.S
   powerpc/64s: save one more register in the masked interrupt handler
   powerpc/64s: allow alternate return locations for soft-masked
 interrupts
   powerpc/64s: interrupt soft-enable race fix
   powerpc/64s: use interrupt restart table to speed up return from
 interrupt

  arch/powerpc/Kconfig.debug |   5 +
  arch/powerpc/include/asm/asm-prototypes.h  |   4 +-
  arch/powerpc/include/asm/head-64.h |   2 +-
  arch/powerpc/include/asm/interrupt.h   |  18 +
  arch/powerpc/include/asm/paca.h|   3 +
  arch/powerpc/include/asm/ppc_asm.h |   8 +
  arch/powerpc/include/asm/ptrace.h  |  28 +-
  arch/powerpc/kernel/asm-offsets.c  |   5 +
  arch/powerpc/kernel/entry_64.S | 508 ---
  arch/powerpc/kernel/exceptions-64s.S   | 180 --
  arch/powerpc/kernel/fpu.S  |   2 +
  arch/powerpc/kernel/head_64.S  |   5 +-
  arch/powerpc/kernel/interrupt_64.S | 720 +
  arch/powerpc/kernel/irq.c  |  79 ++-
  arch/powerpc/kernel/kgdb.c |   2 +-
  arch/powerpc/kernel/kprobes-ftrace.c   |   2 +-
  arch/powerpc/kernel/kprobes.c  |  10 +-
  arch/powerpc/kernel/process.c  |  21 +-
  arch/powerpc/kernel/rtas.c |  13 +-
  arch/powerpc/kernel/signal.c   |   2 +-
  arch/powerpc/kernel/signal_64.c|  14 +
  arch/powerpc/kernel/syscall_64.c   | 242 ---
  arch/powerpc/kernel/syscalls.c |   2 +
  arch/powerpc/kernel/traps.c|  18 +-
  arch/powerpc/kernel/vector.S   |   6 +-
  arch/powerpc/kernel/vmlinux.lds.S  |  10 +
  arch/powerpc/lib/Makefile  |   2 +-
  arch/powerpc/lib/restart_table.c   |  26 +
  arch/powerpc/lib/sstep.c   |   5 +-
  arch/powerpc/math-emu/math.c   |   2 +-
  arch/powerpc/mm/fault.c|   2 +-
  arch/powerpc/perf/core-book3s.c|  19 +-
  arch/powerpc/platforms/powernv/opal-call.c |   3 +
  arch/powerpc/sysdev/fsl_pci.c  |   2 +-
  34 files changed, 1244 insertions(+), 726 deletions(-)
  create mode 100644 arch/powerpc/kernel/interrupt_64.S
  create mode 100644 arch/powerpc/lib/restart_table.c



Re: [PATCH] powerpc/32s: Use relocation offset when setting early hash table

2020-11-07 Thread Serge Belyshev
Christophe Leroy  writes:

> When calling early_hash_table(), the kernel hasn't been yet
> relocated to its linking address, so data must be addressed
> with relocation offset.
>
> Add relocation offset to write into Hash in early_hash_table().
>
> Reported-by: Erhard Furtner 
> Reported-by: Andreas Schwab 
> Fixes: 69a1593abdbc ("powerpc/32s: Setup the early hash table at all time.")
> Signed-off-by: Christophe Leroy 

Tested-by: Serge Belyshev 


Re: [PATCH 18/18] powerpc/64s: move power4 idle entirely to C

2020-11-07 Thread Christophe Leroy




On 05/11/2020 at 15:34, Nicholas Piggin wrote:

Christophe asked about doing this, most of the code is still in
asm but maybe it's slightly nicer? I don't know if it's worthwhile.


Er... I don't think I was asking for that, but why not, see the comments below.

At first I was just asking to write the following in C:

+
+   .globl power4_idle_nap_return
+power4_idle_nap_return:
+   blr


In extenso, instead of the above do somewhere something like:

void power4_idle_nap_return(void)
{
}



---
  arch/powerpc/kernel/idle.c| 25 -
  arch/powerpc/kernel/idle_book3s.S | 22 --
  2 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index ae0e2632393d..849e77a45915 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -72,6 +72,9 @@ int powersave_nap;
  #ifdef CONFIG_PPC_970_NAP
  void power4_idle(void)
  {
+   unsigned long msr_idle = MSR_KERNEL|MSR_EE|MSR_POW;
+   unsigned long tmp1, tmp2;
+
if (!cpu_has_feature(CPU_FTR_CAN_NAP))
return;
  
@@ -84,13 +87,25 @@ void power4_idle(void)

if (cpu_has_feature(CPU_FTR_ALTIVEC))
asm volatile("DSSALL ; sync" ::: "memory");
  
-	power4_idle_nap();

-
+   asm volatile(
+" ld  %0,PACA_THREAD_INFO(r13)\n"
+" ld  %1,TI_LOCAL_FLAGS(%0)   \n"
+" ori %1,%1,_TLF_NAPPING  \n"
+" std %1,TI_LOCAL_FLAGS(%0)   \n"


Can't this just be:

current_thread_info()->local_flags |= _TLF_NAPPING;


/*
-* power4_idle_nap returns with interrupts enabled (soft and hard).
-* to our caller with interrupts enabled (soft and hard). Our caller
-* can cope with either interrupts disabled or enabled upon return.
+* NAPPING bit is set, from this point onward nap_adjust_return()
+* will cause interrupts to return to power4_idle_nap_return.
 */
+"1:   sync\n"
+" isync   \n"
+" mtmsrd  %2  \n"
+" isync   \n"
+" b   1b  \n"


And this:

for (;;) {
mb();
isync();
mtmsr(MSR_KERNEL|MSR_EE|MSR_POW);
isync();
}



+" .globl power4_idle_nap_return   \n"
+"power4_idle_nap_return:  \n"
+   : "=r"(tmp1), "=r"(tmp2)
+   : "r"(msr_idle)
+   );
  }
  #endif
  


Christophe


Re: [PATCH] powerpc/32s: Setup the early hash table at all time.

2020-11-07 Thread Christophe Leroy




On 29/10/2020 at 22:07, Andreas Schwab wrote:

On Oct 01 2020, Christophe Leroy wrote:


At the time being, an early hash table is set up when
CONFIG_KASAN is selected.

There is nothing wrong with setting such an early hash table
all the time, even if it is not used. This is a statically
allocated 256 kB table which lies in the init data section.

This makes the code simpler and may in the future allow to
setup early IO mappings with fixmap instead of hard coding BATs.

Put create_hpte() and flush_hash_pages() in the .ref.text section
in order to avoid warning for the reference to early_hash[]. This
reference is removed by MMU_init_hw_patch() before init memory is
freed.


This breaks booting on the iBook G4.



Can you test patch 
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/9e225a856a8b22e0e77587ee22ab7a2f5bca8753.1604740029.git.christophe.le...@csgroup.eu/


Thanks
Christophe


[Bug 209869] Kernel 5.10-rc1 fails to boot on a PowerMac G4 3,6 at an early stage

2020-11-07 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=209869

--- Comment #11 from Christophe Leroy (christophe.le...@csgroup.eu) ---
Can (In reply to Erhard F. from comment #10)
> (In reply to Christophe Leroy from comment #9)
> > Ok, what about 5.10-rc1 + KASAN without reverting the patch ?
> Nope, does not boot. Same 5.10-rc1 .config + KASAN but without reverting the
> patch.

Can you test patch at
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/9e225a856a8b22e0e77587ee22ab7a2f5bca8753.1604740029.git.christophe.le...@csgroup.eu/

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

[PATCH] powerpc/32s: Use relocation offset when setting early hash table

2020-11-07 Thread Christophe Leroy
When calling early_hash_table(), the kernel hasn't been yet
relocated to its linking address, so data must be addressed
with relocation offset.

Add relocation offset to write into Hash in early_hash_table().
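
For readers less familiar with this early-boot constraint, a C-flavoured
illustration of what the relocation offset does (the actual fix is the
assembly below; the helper name here is invented for illustration only):

/* Before the kernel has been copied to its link address, the address of a
 * link-time symbol is off by the relocation offset (current load address
 * minus link address), so early stores must go through an adjusted pointer. */
static inline void *early_reloc_ptr(void *link_addr, unsigned long offset)
{
	return (void *)((unsigned long)link_addr + offset);
}

/* Conceptually, the store into Hash in early_hash_table() becomes:
 *	*(void **)early_reloc_ptr(&Hash, reloc_offset()) = early_hash;
 */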

Reported-by: Erhard Furtner 
Reported-by: Andreas Schwab 
Fixes: 69a1593abdbc ("powerpc/32s: Setup the early hash table at all time.")
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_book3s_32.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/head_book3s_32.S 
b/arch/powerpc/kernel/head_book3s_32.S
index 5eb9eedac920..8aa7eb11754e 100644
--- a/arch/powerpc/kernel/head_book3s_32.S
+++ b/arch/powerpc/kernel/head_book3s_32.S
@@ -156,6 +156,7 @@ __after_mmu_off:
bl  initial_bats
bl  load_segment_registers
 BEGIN_MMU_FTR_SECTION
+   bl  reloc_offset
bl  early_hash_table
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
 #if defined(CONFIG_BOOTX_TEXT)
@@ -932,7 +933,7 @@ early_hash_table:
ori r6, r6, 3   /* 256kB table */
mtspr   SPRN_SDR1, r6
lis r6, early_hash@h
-   lis r3, Hash@ha
+   addis   r3, r3, Hash@ha
stw r6, Hash@l(r3)
blr
 
-- 
2.25.0



Re: [PATCH] powerpc: add compile-time support for lbarx, lwarx

2020-11-07 Thread Christophe Leroy




On 07/11/2020 at 04:23, Nicholas Piggin wrote:

ISA v2.06 (POWER7 and up) as well as e6500 support lbarx and lwarx.
Add a compile option that allows code to use it, and add support in
cmpxchg and xchg 8 and 16 bit values.


Do you mean lharx ? Because lwarx exists on all powerpcs I think.



Signed-off-by: Nicholas Piggin 
---
  arch/powerpc/Kconfig   |   3 +
  arch/powerpc/include/asm/cmpxchg.h | 236 -
  arch/powerpc/platforms/Kconfig.cputype |   5 +
  3 files changed, 243 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e9f13fe08492..d231af06f75a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -266,6 +266,9 @@ config PPC_BARRIER_NOSPEC
default y
depends on PPC_BOOK3S_64 || PPC_FSL_BOOK3E
  
+config PPC_LBARX_LWARX

+   bool


s/LWARX/LHARX/ ?

And maybe better with PPC_HAS_LBARX_LWARX ?


+
  config EARLY_PRINTK
bool
default y
diff --git a/arch/powerpc/include/asm/cmpxchg.h 
b/arch/powerpc/include/asm/cmpxchg.h
index cf091c4c22e5..17fd996dc0d4 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -77,10 +77,76 @@ u32 __cmpxchg_##type##sfx(volatile void *p, u32 old, u32 
new)   \
   * the previous value stored there.
   */
  
+#ifndef CONFIG_PPC_LBARX_LWARX

  XCHG_GEN(u8, _local, "memory");
  XCHG_GEN(u8, _relaxed, "cc");
  XCHG_GEN(u16, _local, "memory");
  XCHG_GEN(u16, _relaxed, "cc");
+#else
+static __always_inline unsigned long
+__xchg_u8_local(volatile void *p, unsigned long val)
+{
+   unsigned long prev;
+
+   __asm__ __volatile__(
+"1:   lbarx   %0,0,%2 \n"
+" stbcx.  %3,0,%2 \n\
+   bne-1b"
+   : "=&r" (prev), "+m" (*(volatile unsigned char *)p)
+   : "r" (p), "r" (val)
+   : "cc", "memory");
+
+   return prev;
+}
+
+static __always_inline unsigned long
+__xchg_u8_relaxed(u8 *p, unsigned long val)
+{
+   unsigned long prev;
+
+   __asm__ __volatile__(
+"1:   lbarx   %0,0,%2\n"
+" stbcx.  %3,0,%2\n"
+" bne-1b"
+   : "=&r" (prev), "+m" (*p)
+   : "r" (p), "r" (val)
+   : "cc");
+
+   return prev;
+}
+
+static __always_inline unsigned long
+__xchg_u16_local(volatile void *p, unsigned long val)
+{
+   unsigned long prev;
+
+   __asm__ __volatile__(
+"1:   lharx   %0,0,%2 \n"
+" sthcx.  %3,0,%2 \n\
+   bne-1b"
+   : "=&r" (prev), "+m" (*(volatile unsigned short *)p)
+   : "r" (p), "r" (val)
+   : "cc", "memory");
+
+   return prev;
+}
+
+static __always_inline unsigned long
+__xchg_u16_relaxed(u16 *p, unsigned long val)
+{
+   unsigned long prev;
+
+   __asm__ __volatile__(
+"1:   lharx   %0,0,%2\n"
+" sthcx.  %3,0,%2\n"
+" bne-1b"
+   : "=&r" (prev), "+m" (*p)
+   : "r" (p), "r" (val)
+   : "cc");
+
+   return prev;
+}
+#endif


That's a lot of code duplication. Could we use some macro, in the same spirit as what is done in 
arch/powerpc/include/asm/io.h for in_be16(), in_be32(), in_be64() and friends ?
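
Something along these lines, perhaps - a rough sketch of such a generating
macro; the macro name and its parameters are invented here for illustration,
and the barrier handling for the full/acquire cmpxchg variants is left out:

#define XCHG_LARX_GEN(type, sfx, larx, stcx, clobbers...)	\
static __always_inline unsigned long				\
__xchg_##type##sfx(volatile void *p, unsigned long val)		\
{								\
	unsigned long prev;					\
								\
	__asm__ __volatile__(					\
"1:	" larx "	%0,0,%2\n"				\
"	" stcx ".	%3,0,%2\n"				\
"	bne-	1b"						\
	: "=&r" (prev), "+m" (*(volatile type *)p)		\
	: "r" (p), "r" (val)					\
	: clobbers);						\
								\
	return prev;						\
}

/* the instantiations would then shrink to: */
XCHG_LARX_GEN(u8, _local, "lbarx", "stbcx", "cc", "memory");
XCHG_LARX_GEN(u8, _relaxed, "lbarx", "stbcx", "cc");
XCHG_LARX_GEN(u16, _local, "lharx", "sthcx", "cc", "memory");
XCHG_LARX_GEN(u16, _relaxed, "lharx", "sthcx", "cc");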


  
  static __always_inline unsigned long

  __xchg_u32_local(volatile void *p, unsigned long val)
@@ -198,11 +264,12 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int 
size)
(__typeof__(*(ptr))) __xchg_relaxed((ptr),  \
(unsigned long)_x_, sizeof(*(ptr)));\
  })
+
  /*
   * Compare and exchange - if *p == old, set it to new,
   * and return the old value of *p.
   */
-
+#ifndef CONFIG_PPC_LBARX_LWARX
  CMPXCHG_GEN(u8, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, 
"memory");
  CMPXCHG_GEN(u8, _local, , , "memory");
  CMPXCHG_GEN(u8, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
@@ -211,6 +278,173 @@ CMPXCHG_GEN(u16, , PPC_ATOMIC_ENTRY_BARRIER, 
PPC_ATOMIC_EXIT_BARRIER, "memory");
  CMPXCHG_GEN(u16, _local, , , "memory");
  CMPXCHG_GEN(u16, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
  CMPXCHG_GEN(u16, _relaxed, , , "cc");
+#else
+static __always_inline unsigned long
+__cmpxchg_u8(volatile unsigned char *p, unsigned long old, unsigned long new)
+{
+   unsigned int prev;
+
+   __asm__ __volatile__ (
+   PPC_ATOMIC_ENTRY_BARRIER
+"1:   lbarx   %0,0,%2 # __cmpxchg_u8\n\
+   cmpw0,%0,%3\n\
+   bne-2f\n"
+" stbcx.  %4,0,%2\n\
+   bne-1b"
+   PPC_ATOMIC_EXIT_BARRIER
+   "\n\
+2:"
+   : "=&r" (prev), "+m" (*p)
+   : "r" (p), "r" (old), "r" (new)
+   : "cc", "memory");
+
+   return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u8_local(volatile unsigned char *p, unsigned long old,
+   unsigned long new)
+{
+   unsigned int prev;
+
+   __asm__ __volatile__ (
+"1:   lbarx   %0,0,%2 # __cmpxchg_u8\n\
+   cmpw0,%0,%3\n\
+   bne-2f\n"
+" stbcx.  %4,0,%2\n\
+   bne-1b"
+   "\n\
+2:"
+   : "=&r" (prev), "+m" (*p)
+   : "r" (p), "r" (old), "r" (new)
+   : "cc",