[PATCH v1 1/3] mm: turn USE_SPLIT_PTE_PTLOCKS / USE_SPLIT_PMD_PTLOCKS into Kconfig options

2024-07-26 Thread David Hildenbrand
Let's clean that up a bit and prepare for depending on
CONFIG_SPLIT_PMD_PTLOCKS in other Kconfig options.

More cleanups would be reasonable (like the arch-specific "depends on"
for CONFIG_SPLIT_PTE_PTLOCKS), but we'll leave that for another day.

Signed-off-by: David Hildenbrand 
---
 arch/arm/mm/fault-armv.c  |  6 +++---
 arch/x86/xen/mmu_pv.c |  7 ---
 include/linux/mm.h|  8 
 include/linux/mm_types.h  |  2 +-
 include/linux/mm_types_task.h |  3 ---
 kernel/fork.c |  4 ++--
 mm/Kconfig| 18 +++---
 mm/memory.c   |  2 +-
 8 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index 2286c2ea60ec4..831793cd6ff94 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -61,7 +61,7 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address,
return ret;
 }
 
-#if USE_SPLIT_PTE_PTLOCKS
+#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
 /*
  * If we are using split PTE locks, then we need to take the page
  * lock here.  Otherwise we are using shared mm->page_table_lock
@@ -80,10 +80,10 @@ static inline void do_pte_unlock(spinlock_t *ptl)
 {
spin_unlock(ptl);
 }
-#else /* !USE_SPLIT_PTE_PTLOCKS */
+#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */
 static inline void do_pte_lock(spinlock_t *ptl) {}
 static inline void do_pte_unlock(spinlock_t *ptl) {}
-#endif /* USE_SPLIT_PTE_PTLOCKS */
+#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
 
 static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
unsigned long pfn)
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index f1ce39d6d32cb..f4a316894bbb4 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -665,7 +665,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 {
spinlock_t *ptl = NULL;
 
-#if USE_SPLIT_PTE_PTLOCKS
+#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
ptl = ptlock_ptr(page_ptdesc(page));
spin_lock_nest_lock(ptl, &mm->page_table_lock);
 #endif
@@ -1553,7 +1553,8 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
 
__set_pfn_prot(pfn, PAGE_KERNEL_RO);
 
-   if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned)
+   if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS) &&
+   !pinned)
__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
 
xen_mc_issue(XEN_LAZY_MMU);
@@ -1581,7 +1582,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
if (pinned) {
xen_mc_batch();
 
-   if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
+   if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS))
__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
 
__set_pfn_prot(pfn, PAGE_KERNEL);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0472a5090b180..dff43101572ec 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2843,7 +2843,7 @@ static inline void pagetable_free(struct ptdesc *pt)
__free_pages(page, compound_order(page));
 }
 
-#if USE_SPLIT_PTE_PTLOCKS
+#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
 #if ALLOC_SPLIT_PTLOCKS
 void __init ptlock_cache_init(void);
 bool ptlock_alloc(struct ptdesc *ptdesc);
@@ -2895,7 +2895,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc)
return true;
 }
 
-#else  /* !USE_SPLIT_PTE_PTLOCKS */
+#else  /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */
 /*
  * We use mm->page_table_lock to guard all pagetable pages of the mm.
  */
@@ -2906,7 +2906,7 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte)
 static inline void ptlock_cache_init(void) {}
 static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
 static inline void ptlock_free(struct ptdesc *ptdesc) {}
-#endif /* USE_SPLIT_PTE_PTLOCKS */
+#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
 
 static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
 {
@@ -2966,7 +2966,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
NULL: pte_offset_kernel(pmd, address))
 
-#if USE_SPLIT_PMD_PTLOCKS
+#if defined(CONFIG_SPLIT_PMD_PTLOCKS)
 
 static inline struct page *pmd_pgtable_page(pmd_t *pmd)
 {
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 4854249792545..165c58b12ccc9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -947,7 +947,7 @@ struct mm_struct {
 #ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_subscriptions *notifier_subscriptions;
 #endif
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
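
For orientation, a rough sketch of how the two new options could be expressed in
mm/Kconfig, modeled on the old USE_SPLIT_PTE_PTLOCKS / USE_SPLIT_PMD_PTLOCKS
macros; this is an assumption for illustration, and the exact conditions in the
real hunk may differ (arch-specific restrictions are omitted here):

config SPLIT_PTE_PTLOCKS
	def_bool y
	depends on MMU && SMP
	depends on NR_CPUS >= 4
	# arch-specific restrictions (formerly encoded via SPLIT_PTLOCK_CPUS)
	# are omitted in this sketch

config SPLIT_PMD_PTLOCKS
	def_bool y
	depends on SPLIT_PTE_PTLOCKS && ARCH_ENABLE_SPLIT_PMD_PTLOCK

With something like that in place, C code can test defined(CONFIG_SPLIT_PTE_PTLOCKS)
or IS_ENABLED(CONFIG_SPLIT_PMD_PTLOCKS) instead of the old macros, as the hunks
above do.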

[PATCH v1 3/3] powerpc/8xx: document and enforce that split PT locks are not used

2024-07-26 Thread David Hildenbrand
Right now, we cannot have split PT locks because 8xx does not support
SMP.

But for the sake of documenting *why* 8xx is fine regarding what
we documented in huge_pte_lockptr(), let's just add code to enforce it
at the same time as documenting it.

This should also make everybody who wants to copy from the 8xx approach of
supporting such unusual ways of mapping hugetlb folios aware that it gets
tricky once multiple page tables are involved.
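
For background, a minimal sketch (a hypothetical helper for illustration, not
code from this patch) of what "no split PT locks" buys us here: every page table
of the MM falls back to the single per-MM lock, so the PMD table and the two PTE
tables backing an 8M mapping are all serialized by the same spinlock.

#include <linux/mm_types.h>
#include <linux/spinlock.h>

/* Hypothetical: the lock used for *any* page table when split locks are off. */
static inline spinlock_t *pt_lock_without_split_locks(struct mm_struct *mm)
{
	/* One lock covers the PMD table and both PTE tables of an 8M folio. */
	return &mm->page_table_lock;
}

The #error added below simply guarantees that this assumption can never silently
stop holding on 8xx.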

Signed-off-by: David Hildenbrand 
---
 arch/powerpc/mm/pgtable.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index ab0656115424f..7316396e452d8 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -297,6 +297,12 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 }
 
 #if defined(CONFIG_PPC_8xx)
+
+#if defined(CONFIG_SPLIT_PTE_PTLOCKS) || defined(CONFIG_SPLIT_PMD_PTLOCKS)
+/* We need the same lock to protect the PMD table and the two PTE tables. */
+#error "8M hugetlb folios are incompatible with split page table locks"
+#endif
+
 static void __set_huge_pte_at(pmd_t *pmd, pte_t *ptep, pte_basic_t val)
 {
pte_basic_t *entry = (pte_basic_t *)ptep;
-- 
2.45.2




[PATCH v1 2/3] mm/hugetlb: enforce that PMD PT sharing has split PMD PT locks

2024-07-26 Thread David Hildenbrand
Sharing page tables between processes but falling back to per-MM page
table locks cannot possibly work.

So, let's make sure that we do have split PMD locks by adding a new
Kconfig option and letting that depend on CONFIG_SPLIT_PMD_PTLOCKS.
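
To illustrate why the dependency matters (a simplified, hypothetical helper, not
code from this patch): with split PMD locks, the lock protecting a shared PMD
page table is derived from the page table itself, so every MM sharing the table
serializes on the same spinlock; the per-MM fallback would never be held by the
other sharers.

#include <linux/mm.h>
#include <linux/spinlock.h>

/* Hypothetical: which lock protects a PMD page table that may be shared? */
static spinlock_t *shared_pmd_table_lock(struct mm_struct *mm, pmd_t *pmd)
{
#ifdef CONFIG_SPLIT_PMD_PTLOCKS
	return pmd_lockptr(mm, pmd);	/* lives with the PMD table itself */
#else
	return &mm->page_table_lock;	/* per-MM: sharers of the table never hold it */
#endif
}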

Signed-off-by: David Hildenbrand 
---
 fs/Kconfig  | 4 
 include/linux/hugetlb.h | 5 ++---
 mm/hugetlb.c| 8 
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index a46b0cbc4d8f6..0e4efec1d92e6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -288,6 +288,10 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
depends on SPARSEMEM_VMEMMAP
 
+config HUGETLB_PMD_PAGE_TABLE_SHARING
+   def_bool HUGETLB_PAGE
+   depends on ARCH_WANT_HUGE_PMD_SHARE && SPLIT_PMD_PTLOCKS
+
 config ARCH_HAS_GIGANTIC_PAGE
bool
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index da800e56fe590..4d2f3224ff027 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1243,7 +1243,7 @@ static inline __init void hugetlb_cma_reserve(int order)
 }
 #endif
 
-#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
 static inline bool hugetlb_pmd_shared(pte_t *pte)
 {
return page_count(virt_to_page(pte)) > 1;
@@ -1279,8 +1279,7 @@ bool __vma_private_lock(struct vm_area_struct *vma);
 static inline pte_t *
 hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
 {
-#if defined(CONFIG_HUGETLB_PAGE) && \
-   defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP)
+#if defined(CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING) && defined(CONFIG_LOCKDEP)
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0858a18272073..c4d94e122c41f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7211,7 +7211,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
return 0;
 }
 
-#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
 static unsigned long page_table_shareable(struct vm_area_struct *svma,
struct vm_area_struct *vma,
unsigned long addr, pgoff_t idx)
@@ -7373,7 +7373,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
return 1;
 }
 
-#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
 
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
  unsigned long addr, pud_t *pud)
@@ -7396,7 +7396,7 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
 {
return false;
 }
-#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
 
 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -7494,7 +7494,7 @@ unsigned long hugetlb_mask_last_page(struct hstate *h)
 /* See description above.  Architectures can provide their own version. */
 __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
 {
-#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
if (huge_page_size(h) == PMD_SIZE)
return PUD_SIZE - PMD_SIZE;
 #endif
-- 
2.45.2




[PATCH v1 0/3] mm: split PTE/PMD PT table Kconfig cleanups+clarifications

2024-07-26 Thread David Hildenbrand
This series is a follow-up to the fixes:
"[PATCH v1 0/2] mm/hugetlb: fix hugetlb vs. core-mm PT locking" [1]

When working on the fixes, I wondered why 8xx is fine (-> never uses split
PT locks) and how PT locking even works properly with PMD page table
sharing (-> always requires split PMD PT locks).

Let's improve the split PT lock detection, make hugetlb properly depend
on it and make 8xx bail out if it would ever get enabled by accident.

As an alternative to patch #3 we could extend the Kconfig SPLIT_PTE_PTLOCKS
option from patch #1 -- but enforcing it closer to the code that actually
implements it feels a bit nicer for documentation purposes, and there is no
need to explicitly disable the option, because it is always disabled anyway
(8xx is !SMP).

Did a bunch of cross-compilations to make sure that split PTE/PMD
PT locks are still getting used where we would expect them.

[1] https://lkml.kernel.org/r/20240725183955.2268884-1-da...@redhat.com

Cc: Andrew Morton 
Cc: Oscar Salvador 
Cc: Peter Xu 
Cc: Muchun Song 
Cc: Russell King 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: "Naveen N. Rao" 
Cc: Juergen Gross 
Cc: Boris Ostrovsky 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: "H. Peter Anvin" 
Cc: Alexander Viro 
Cc: Christian Brauner 

David Hildenbrand (3):
  mm: turn USE_SPLIT_PTE_PTLOCKS / USE_SPLIT_PMD_PTLOCKS into Kconfig
options
  mm/hugetlb: enforce that PMD PT sharing has split PMD PT locks
  powerpc/8xx: document and enforce that split PT locks are not used

 arch/arm/mm/fault-armv.c  |  6 +++---
 arch/powerpc/mm/pgtable.c |  6 ++
 arch/x86/xen/mmu_pv.c |  7 ---
 fs/Kconfig|  4 
 include/linux/hugetlb.h   |  5 ++---
 include/linux/mm.h|  8 
 include/linux/mm_types.h  |  2 +-
 include/linux/mm_types_task.h |  3 ---
 kernel/fork.c |  4 ++--
 mm/Kconfig| 18 +++---
 mm/hugetlb.c  |  8 
 mm/memory.c   |  2 +-
 12 files changed, 42 insertions(+), 31 deletions(-)

-- 
2.45.2




Re: [PATCH v1 0/3] mm/memory_hotplug: use PageOffline() instead of PageReserved() for !ZONE_DEVICE

2024-06-25 Thread David Hildenbrand

On 26.06.24 00:43, Andrew Morton wrote:

afaict we're in decent state to move this series into mm-stable.  I've
tagged the following issues:

https://lkml.kernel.org/r/80532f73e52e2c21fdc9aac7bce24aefb76d11b0.ca...@linux.intel.com
https://lkml.kernel.org/r/30b5d493-b7c2-4e63-86c1-dcc73d21d...@redhat.com

Have these been addressed and are we ready to send this series into the world?


Yes, should all be addressed and this should be good to go.

--
Cheers,

David / dhildenb




Re: [PATCH v1 1/3] mm: pass meminit_context to __free_pages_core()

2024-06-12 Thread David Hildenbrand

On 11.06.24 21:19, Andrew Morton wrote:

On Tue, 11 Jun 2024 12:06:56 +0200 David Hildenbrand  wrote:


On 07.06.24 11:09, David Hildenbrand wrote:

In preparation for further changes, let's teach __free_pages_core()
about the differences of memory hotplug handling.

Move the memory hotplug specific handling from generic_online_page() to
__free_pages_core(), use adjust_managed_page_count() on the memory
hotplug path, and spell out why memory freed via memblock
cannot currently use adjust_managed_page_count().

Signed-off-by: David Hildenbrand 
---


@Andrew, can you squash the following?


Sure.

I queued it against "mm: pass meminit_context to __free_pages_core()",
not against


Subject: [PATCH] fixup: mm/highmem: make nr_free_highpages() return "unsigned
   long"




Can you squash the following as well? (hopefully the last fixup, otherwise I
might just resend a v2)


From 53c8c5834e638b2ae5e2a34fa7d49ce0dcf25192 Mon Sep 17 00:00:00 2001
From: David Hildenbrand 
Date: Wed, 12 Jun 2024 20:31:07 +0200
Subject: [PATCH] fixup: mm: pass meminit_context to __free_pages_core()

Let's add the parameter name also in the declaration.

Signed-off-by: David Hildenbrand 
---
 mm/internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/internal.h b/mm/internal.h
index 14bab8a41baf6..254dd907bf9a2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -605,7 +605,7 @@ extern void __putback_isolated_page(struct page *page, unsigned int order,
 extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
 extern void __free_pages_core(struct page *page, unsigned int order,
-   enum meminit_context);
+   enum meminit_context context);
 
 /*

  * This will have no effect, other than possibly generating a warning, if the
--
2.45.2


--
Cheers,

David / dhildenb




Re: [PATCH v1 1/3] mm: pass meminit_context to __free_pages_core()

2024-06-11 Thread David Hildenbrand

On 11.06.24 21:41, Tim Chen wrote:

On Fri, 2024-06-07 at 11:09 +0200, David Hildenbrand wrote:

In preparation for further changes, let's teach __free_pages_core()
about the differences of memory hotplug handling.

Move the memory hotplug specific handling from generic_online_page() to
__free_pages_core(), use adjust_managed_page_count() on the memory
hotplug path, and spell out why memory freed via memblock
cannot currently use adjust_managed_page_count().

Signed-off-by: David Hildenbrand 
---
  mm/internal.h   |  3 ++-
  mm/kmsan/init.c |  2 +-
  mm/memory_hotplug.c |  9 +
  mm/mm_init.c|  4 ++--
  mm/page_alloc.c | 17 +++--
  5 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 12e95fdf61e90..3fdee779205ab 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -604,7 +604,8 @@ extern void __putback_isolated_page(struct page *page, unsigned int order,
int mt);
  extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
-extern void __free_pages_core(struct page *page, unsigned int order);
+extern void __free_pages_core(struct page *page, unsigned int order,
+   enum meminit_context);


Shouldn't the above be
enum meminit_context context);


Although C allows parameters without names in declarations, this was 
unintended.
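
A quick illustration (hypothetical declarations, not taken from the patch): both
prototypes below are valid C and compatible with each other, but only the second
names the parameters, which is what the fixup adds for readability.

/* Parameter names in function declarations are optional in C ... */
extern void __free_pages_core(struct page *, unsigned int, enum meminit_context);

/* ... but naming them documents the interface. */
extern void __free_pages_core(struct page *page, unsigned int order,
			      enum meminit_context context);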


Thanks!

--
Cheers,

David / dhildenb




Re: [PATCH v1 1/3] mm: pass meminit_context to __free_pages_core()

2024-06-11 Thread David Hildenbrand

On 11.06.24 21:19, Andrew Morton wrote:

On Tue, 11 Jun 2024 12:06:56 +0200 David Hildenbrand  wrote:


On 07.06.24 11:09, David Hildenbrand wrote:

In preparation for further changes, let's teach __free_pages_core()
about the differences of memory hotplug handling.

Move the memory hotplug specific handling from generic_online_page() to
__free_pages_core(), use adjust_managed_page_count() on the memory
hotplug path, and spell out why memory freed via memblock
cannot currently use adjust_managed_page_count().

Signed-off-by: David Hildenbrand 
---


@Andrew, can you squash the following?


Sure.

I queued it against "mm: pass meminit_context to __free_pages_core()",
not against


Ah yes, sorry. Thanks!

--
Cheers,

David / dhildenb




Re: [PATCH v1 1/3] mm: pass meminit_context to __free_pages_core()

2024-06-11 Thread David Hildenbrand

On 07.06.24 11:09, David Hildenbrand wrote:

In preparation for further changes, let's teach __free_pages_core()
about the differences of memory hotplug handling.

Move the memory hotplug specific handling from generic_online_page() to
__free_pages_core(), use adjust_managed_page_count() on the memory
hotplug path, and spell out why memory freed via memblock
cannot currently use adjust_managed_page_count().

Signed-off-by: David Hildenbrand 
---


@Andrew, can you squash the following?

From 0a7921cf21cacf178ca7485da0138fc38a97a28e Mon Sep 17 00:00:00 2001
From: David Hildenbrand 
Date: Tue, 11 Jun 2024 12:05:09 +0200
Subject: [PATCH] fixup: mm/highmem: make nr_free_highpages() return "unsigned
 long"

Fixup the memblock comment.

Signed-off-by: David Hildenbrand 
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e0c8a8354be36..fc53f96db58a2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1245,7 +1245,7 @@ void __free_pages_core(struct page *page, unsigned int order,
debug_pagealloc_map_pages(page, nr_pages);
adjust_managed_page_count(page, nr_pages);
} else {
-   /* memblock adjusts totalram_pages() ahead of time. */
+   /* memblock adjusts totalram_pages() manually. */
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
}
 
--

2.45.2



--
Cheers,

David / dhildenb




Re: [PATCH v1 2/3] mm/memory_hotplug: initialize memmap of !ZONE_DEVICE with PageOffline() instead of PageReserved()

2024-06-11 Thread David Hildenbrand

On 07.06.24 11:09, David Hildenbrand wrote:

We currently initialize the memmap such that PG_reserved is set and the
refcount of the page is 1. In virtio-mem code, we have to manually clear
that PG_reserved flag to make memory offlining with partially hotplugged
memory blocks possible: has_unmovable_pages() would otherwise bail out on
such pages.

We want to avoid PG_reserved where possible and move to typed pages
instead. Further, we want to further enlighten memory offlining code about
PG_offline: offline pages in an online memory section. One example is
handling managed page count adjustments in a cleaner way during memory
offlining.

So let's initialize the pages with PG_offline instead of PG_reserved.
generic_online_page()->__free_pages_core() will now clear that flag before
handing that memory to the buddy.

Note that the page refcount is still 1 and would forbid offlining of such
memory except when special care is take during GOING_OFFLINE as
currently only implemented by virtio-mem.

With this change, we can now get non-PageReserved() pages in the XEN
balloon list. From what I can tell, that can already happen via
decrease_reservation(), so that should be fine.

HV-balloon should not really observe a change: partial online memory
blocks still cannot get surprise-offlined, because the refcount of these
PageOffline() pages is 1.

Update virtio-mem, HV-balloon and XEN-balloon code to be aware that
hotplugged pages are now PageOffline() instead of PageReserved() before
they are handed over to the buddy.

We'll leave the ZONE_DEVICE case alone for now.



@Andrew, can we add here:

"Note that self-hosted vmemmap pages will no longer be marked as 
reserved. This matches ordinary vmemmap pages allocated from the buddy 
during memory hotplug. Now, really only vmemmap pages allocated from 
memblock during early boot will be marked reserved. Existing 
PageReserved() checks seem to be handling all relevant cases correctly 
even after this change."


--
Cheers,

David / dhildenb




Re: [PATCH v1 2/3] mm/memory_hotplug: initialize memmap of !ZONE_DEVICE with PageOffline() instead of PageReserved()

2024-06-11 Thread David Hildenbrand

On 11.06.24 09:45, Oscar Salvador wrote:

On Mon, Jun 10, 2024 at 10:56:02AM +0200, David Hildenbrand wrote:

There are fortunately not that many left.

I'd even say marking them (vmemmap) reserved is more wrong than right: note
that ordinary vmemmap pages after memory hotplug are not reserved! Only
bootmem should be reserved.


Ok, that is a very good point that I missed.
I thought that hotplugged-vmemmap pages (not selfhosted) were marked as
Reserved, that is why I thought this would be inconsistent.
But then, if that is the case, I think we are safe as kernel can already
encounter vmemmap pages that are not reserved and it deals with them
somehow.


Let's take at the relevant core-mm ones (arch stuff is mostly just for MMIO
remapping)


...

Any PageReserved user that I am missing, or why we should handle these
vmemmap pages differently than the ones allocated during ordinary memory
hotplug?


No, I cannot think of a reason why normal vmemmap pages should behave
different than self-hosted.

I was also confused because I thought that after this change
pfn_to_online_page() would be different for self-hosted vmemmap pages,
because I thought that somehow we relied on PageOffline(), but it is not
the case.


Fortunately not :) PageFakeOffline() or PageLogicallyOffline()  might be 
clearer, but I don't quite like these names. If you have a good idea, 
please let me know.





In the future, we might want to consider using a dedicated page type for
them, so we can stop using a bit that doesn't allow to reliably identify
them. (we should mark all vmemmap with that type then)


Yes, a all-vmemmap pages type would be a good thing, so we do not have
to special case.

Just one last thing.
Now self-hosted vmemmap pages will have the PageOffline cleared, and that
will still remain after the memory-block they belong to has gone
offline, which is ok because those vmemmap pages lay around until the
chunk of memory gets removed.


Yes, and that memmap might even get poisoned in debug kernels to catch 
any wrong access.




Ok, just wanted to convince myself that there will no be surprises.

Thanks David for claryfing.


Thanks for the review and raising that. I'll add more details to the 
patch description!


--
Cheers,

David / dhildenb




Re: [PATCH v1 3/3] mm/memory_hotplug: skip adjust_managed_page_count() for PageOffline() pages when offlining

2024-06-10 Thread David Hildenbrand

On 10.06.24 06:29, Oscar Salvador wrote:

On Fri, Jun 07, 2024 at 11:09:38AM +0200, David Hildenbrand wrote:

We currently have a hack for virtio-mem in place to handle memory
offlining with PageOffline pages for which we already adjusted the
managed page count.

Let's enlighten memory offlining code so we can get rid of that hack,
and document the situation.

Signed-off-by: David Hildenbrand 


Acked-by: Oscar Salvador 



Thanks for the review!

--
Cheers,

David / dhildenb




Re: [PATCH v1 2/3] mm/memory_hotplug: initialize memmap of !ZONE_DEVICE with PageOffline() instead of PageReserved()

2024-06-10 Thread David Hildenbrand

On 10.06.24 06:23, Oscar Salvador wrote:

On Fri, Jun 07, 2024 at 11:09:37AM +0200, David Hildenbrand wrote:

We currently initialize the memmap such that PG_reserved is set and the
refcount of the page is 1. In virtio-mem code, we have to manually clear
that PG_reserved flag to make memory offlining with partially hotplugged
memory blocks possible: has_unmovable_pages() would otherwise bail out on
such pages.

We want to avoid PG_reserved where possible and move to typed pages
instead. Further, we want to further enlighten memory offlining code about
PG_offline: offline pages in an online memory section. One example is
handling managed page count adjustments in a cleaner way during memory
offlining.

So let's initialize the pages with PG_offline instead of PG_reserved.
generic_online_page()->__free_pages_core() will now clear that flag before
handing that memory to the buddy.

Note that the page refcount is still 1 and would forbid offlining of such
memory except when special care is take during GOING_OFFLINE as
currently only implemented by virtio-mem.

With this change, we can now get non-PageReserved() pages in the XEN
balloon list. From what I can tell, that can already happen via
decrease_reservation(), so that should be fine.

HV-balloon should not really observe a change: partial online memory
blocks still cannot get surprise-offlined, because the refcount of these
PageOffline() pages is 1.

Update virtio-mem, HV-balloon and XEN-balloon code to be aware that
hotplugged pages are now PageOffline() instead of PageReserved() before
they are handed over to the buddy.

We'll leave the ZONE_DEVICE case alone for now.

Signed-off-by: David Hildenbrand 



diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 27e3be75edcf7..0254059efcbe1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -734,7 +734,7 @@ static inline void section_taint_zone_device(unsigned long pfn)
  /*
   * Associate the pfn range with the given zone, initializing the memmaps
   * and resizing the pgdat/zone data to span the added pages. After this
- * call, all affected pages are PG_reserved.
+ * call, all affected pages are PageOffline().
   *
   * All aligned pageblocks are initialized to the specified migratetype
   * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
@@ -1100,8 +1100,12 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
  
  	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
  
-	for (i = 0; i < nr_pages; i++)

-   SetPageVmemmapSelfHosted(pfn_to_page(pfn + i));
+   for (i = 0; i < nr_pages; i++) {
+   struct page *page = pfn_to_page(pfn + i);
+
+   __ClearPageOffline(page);
+   SetPageVmemmapSelfHosted(page);


So, refresh my memory here please.
AFAIR, those VmemmapSelfHosted pages were marked Reserved before, but now,
memmap_init_range() will not mark them reserved anymore.


Correct.


I do not think that is ok? I am worried about walkers getting this wrong.

We usually skip PageReserved pages in walkers because are pages we cannot deal
with for those purposes, but with this change, we will leak
PageVmemmapSelfHosted, and I am not sure whether are ready for that.


There are fortunately not that many left.

I'd even say marking them (vmemmap) reserved is more wrong than right: 
note that ordinary vmemmap pages after memory hotplug are not reserved! 
Only bootmem should be reserved.


Let's take at the relevant core-mm ones (arch stuff is mostly just for 
MMIO remapping)


fs/proc/task_mmu.c: if (PageReserved(page))
fs/proc/task_mmu.c: if (PageReserved(page))

-> If we find vmemmap pages mapped into user space we already messed up
   seriously

kernel/power/snapshot.c:if (PageReserved(page) ||
kernel/power/snapshot.c:if (PageReserved(page)

-> There should be no change (saveable_page() would still allow saving
   them, highmem does not apply)

mm/hugetlb_vmemmap.c:   if (!PageReserved(head))
mm/hugetlb_vmemmap.c:   if (PageReserved(page))

-> Wants to identify bootmem, but we exclude these
   PageVmemmapSelfHosted() on the splitting part already properly


mm/page_alloc.c:VM_WARN_ON_ONCE(PageReserved(p));
mm/page_alloc.c:if (PageReserved(page))

-> pfn_range_valid_contig() would scan them, just like for ordinary
   vmemmap pages during hotplug. We'll simply fail isolating/migrating
   them similarly (like any unmovable allocations) later

mm/page_ext.c:  BUG_ON(PageReserved(page));

-> free_page_ext handling, does not apply

mm/page_isolation.c:if (PageReserved(page))

-> has_unmovable_pages() should still detect them as unmovable (e.g.,
   neither movable nor LRU).

mm/page_owner.c:if (PageReserved(page))
mm/page_owner.c:if (PageReserved(page))

-> Simply page_ext_get() will return NULL

Re: [PATCH v1 1/3] mm: pass meminit_context to __free_pages_core()

2024-06-10 Thread David Hildenbrand

On 10.06.24 06:03, Oscar Salvador wrote:

On Fri, Jun 07, 2024 at 11:09:36AM +0200, David Hildenbrand wrote:

In preparation for further changes, let's teach __free_pages_core()
about the differences of memory hotplug handling.

Move the memory hotplug specific handling from generic_online_page() to
__free_pages_core(), use adjust_managed_page_count() on the memory
hotplug path, and spell out why memory freed via memblock
cannot currently use adjust_managed_page_count().

Signed-off-by: David Hildenbrand 


All looks good but I am puzzled with something.


+   } else {
+   /* memblock adjusts totalram_pages() ahead of time. */
+   atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
+   }


You say that memblock adjusts totalram_pages ahead of time, and I guess
you mean in memblock_free_all()


And memblock_free_late(), which uses atomic_long_inc().



  pages = free_low_memory_core_early()
  totalram_pages_add(pages);

but that is not ahead, it looks like it is upading __after__ sending
them to buddy?


Right (it's suboptimal, but not really problematic so far. Hopefully Wei 
can clean it up and move it in here as well)
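
For reference, a rough sketch of the boot-time path being discussed, simplified
from mm/memblock.c (the real function does a bit more, e.g. resetting the
per-zone managed page counts first): the pages are handed to the buddy via
__free_pages_core() and totalram_pages is only bumped afterwards, for the whole
batch.

void __init memblock_free_all(void)
{
	unsigned long pages;

	/* Frees memory into the buddy, ending up in __free_pages_core(). */
	pages = free_low_memory_core_early();
	/* totalram_pages is adjusted afterwards, in one go. */
	totalram_pages_add(pages);
}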


For the time being

"/* memblock adjusts totalram_pages() manually. */"

?

Thanks!

--
Cheers,

David / dhildenb




Re: [PATCH v1 1/3] mm: pass meminit_context to __free_pages_core()

2024-06-07 Thread David Hildenbrand

On 07.06.24 11:09, David Hildenbrand wrote:

In preparation for further changes, let's teach __free_pages_core()
about the differences of memory hotplug handling.

Move the memory hotplug specific handling from generic_online_page() to
__free_pages_core(), use adjust_managed_page_count() on the memory
hotplug path, and spell out why memory freed via memblock
cannot currently use adjust_managed_page_count().

Signed-off-by: David Hildenbrand 
---
  mm/internal.h   |  3 ++-
  mm/kmsan/init.c |  2 +-
  mm/memory_hotplug.c |  9 +
  mm/mm_init.c|  4 ++--
  mm/page_alloc.c | 17 +++--
  5 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 12e95fdf61e90..3fdee779205ab 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -604,7 +604,8 @@ extern void __putback_isolated_page(struct page *page, unsigned int order,
int mt);
  extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
-extern void __free_pages_core(struct page *page, unsigned int order);
+extern void __free_pages_core(struct page *page, unsigned int order,
+   enum meminit_context);
  
  /*

   * This will have no effect, other than possibly generating a warning, if the
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
index 3ac3b8921d36f..ca79636f858e5 100644
--- a/mm/kmsan/init.c
+++ b/mm/kmsan/init.c
@@ -172,7 +172,7 @@ static void do_collection(void)
shadow = smallstack_pop(&collect);
origin = smallstack_pop(&collect);
kmsan_setup_meta(page, shadow, origin, collect.order);
-   __free_pages_core(page, collect.order);
+   __free_pages_core(page, collect.order, MEMINIT_EARLY);
}
  }
  
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c

index 171ad975c7cfd..27e3be75edcf7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -630,14 +630,7 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback);
  
  void generic_online_page(struct page *page, unsigned int order)

  {
-   /*
-* Freeing the page with debug_pagealloc enabled will try to unmap it,
-* so we should map it first. This is better than introducing a special
-* case in page freeing fast path.
-*/
-   debug_pagealloc_map_pages(page, 1 << order);
-   __free_pages_core(page, order);
-   totalram_pages_add(1UL << order);
+   __free_pages_core(page, order, MEMINIT_HOTPLUG);
  }
  EXPORT_SYMBOL_GPL(generic_online_page);
  
diff --git a/mm/mm_init.c b/mm/mm_init.c

index 019193b0d8703..feb5b6e8c8875 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1938,7 +1938,7 @@ static void __init deferred_free_range(unsigned long pfn,
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if (pageblock_aligned(pfn))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-   __free_pages_core(page, 0);
+   __free_pages_core(page, 0, MEMINIT_EARLY);
}
  }


The build bot just reminded me that I missed another case in this function:
(CONFIG_DEFERRED_STRUCT_PAGE_INIT)

diff --git a/mm/mm_init.c b/mm/mm_init.c
index feb5b6e8c8875..5a0752261a795 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1928,7 +1928,7 @@ static void __init deferred_free_range(unsigned long pfn,
if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
for (i = 0; i < nr_pages; i += pageblock_nr_pages)
set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
-   __free_pages_core(page, MAX_PAGE_ORDER);
+   __free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY);
return;
}
 


--
Cheers,

David / dhildenb




[PATCH v1 3/3] mm/memory_hotplug: skip adjust_managed_page_count() for PageOffline() pages when offlining

2024-06-07 Thread David Hildenbrand
We currently have a hack for virtio-mem in place to handle memory
offlining with PageOffline pages for which we already adjusted the
managed page count.

Let's enlighten memory offlining code so we can get rid of that hack,
and document the situation.

Signed-off-by: David Hildenbrand 
---
 drivers/virtio/virtio_mem.c| 11 ++-
 include/linux/memory_hotplug.h |  4 ++--
 include/linux/page-flags.h |  8 ++--
 mm/memory_hotplug.c|  6 +++---
 mm/page_alloc.c| 12 ++--
 5 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index b90df29621c81..b0b8714415783 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -1269,12 +1269,6 @@ static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
struct page *page;
unsigned long i;
 
-   /*
-* Drop our reference to the pages so the memory can get offlined
-* and add the unplugged pages to the managed page counters (so
-* offlining code can correctly subtract them again).
-*/
-   adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
/* Drop our reference to the pages so the memory can get offlined. */
for (i = 0; i < nr_pages; i++) {
page = pfn_to_page(pfn + i);
@@ -1293,10 +1287,9 @@ static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
unsigned long i;
 
/*
-* Get the reference we dropped when going offline and subtract the
-* unplugged pages from the managed page counters.
+* Get the reference again that we dropped via page_ref_dec_and_test()
+* when going offline.
 */
-   adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
for (i = 0; i < nr_pages; i++)
page_ref_inc(pfn_to_page(pfn + i));
 }
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 7a9ff464608d7..ebe876930e782 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -175,8 +175,8 @@ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
 extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
 extern int online_pages(unsigned long pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group);
-extern void __offline_isolated_pages(unsigned long start_pfn,
-unsigned long end_pfn);
+extern unsigned long __offline_isolated_pages(unsigned long start_pfn,
+   unsigned long end_pfn);
 
 typedef void (*online_page_callback_t)(struct page *page, unsigned int order);
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e0362ce7fc109..0876aca0833e7 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1024,11 +1024,15 @@ PAGE_TYPE_OPS(Buddy, buddy, buddy)
  * putting them back to the buddy, it can do so via the memory notifier by
  * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the
  * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline()
- * pages (now with a reference count of zero) are treated like free pages,
- * allowing the containing memory block to get offlined. A driver that
+ * pages (now with a reference count of zero) are treated like free (unmanaged)
+ * pages, allowing the containing memory block to get offlined. A driver that
  * relies on this feature is aware that re-onlining the memory block will
  * require not giving them to the buddy via generic_online_page().
  *
+ * Memory offlining code will not adjust the managed page count for any
+ * PageOffline() pages, treating them like they were never exposed to the
+ * buddy using generic_online_page().
+ *
  * There are drivers that mark a page PageOffline() and expect there won't be
  * any further access to page content. PFN walkers that read content of random
  * pages should check PageOffline() and synchronize with such drivers using
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0254059efcbe1..965707a02556f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1941,7 +1941,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
 {
const unsigned long end_pfn = start_pfn + nr_pages;
-   unsigned long pfn, system_ram_pages = 0;
+   unsigned long pfn, managed_pages, system_ram_pages = 0;
const int node = zone_to_nid(zone);
unsigned long flags;
struct memory_notify arg;
@@ -2062,7 +2062,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
} while (ret);
 
/* Mark all sections offline and remove free pages from the buddy. */
-   __offline_isolated_pages(start_pfn, end_pfn);
+   managed_pages = __offline_isolated_pages(start_pfn, end_pfn);

[PATCH v1 0/3] mm/memory_hotplug: use PageOffline() instead of PageReserved() for !ZONE_DEVICE

2024-06-07 Thread David Hildenbrand
This can be a considered a long-overdue follow-up to some parts of [1].
The patches are based on [2], but they are not strictly required -- just
makes it clearer why we can use adjust_managed_page_count() for memory
hotplug without going into details about highmem.

We stop initializing pages with PageReserved() in memory hotplug code --
except when dealing with ZONE_DEVICE for now. Instead, we use
PageOffline(): all pages are initialized to PageOffline() when onlining a
memory section, and only the ones actually getting exposed to the
system/page allocator will get PageOffline cleared.

This way, we enlighten memory hotplug more about PageOffline() pages and
can cleanup some hacks we have in virtio-mem code.

What about ZONE_DEVICE? PageOffline() is wrong, but we might just stop
using PageReserved() for them later by simply checking for
is_zone_device_page() at suitable places. That will be a separate patch
set / proposal.

This primarily affects virtio-mem, HV-balloon and XEN balloon. I only
briefly tested with virtio-mem, which benefits most from these cleanups.

[1] https://lore.kernel.org/all/20191024120938.11237-1-da...@redhat.com/
[2] https://lkml.kernel.org/r/20240607083711.62833-1-da...@redhat.com

Cc: Andrew Morton 
Cc: Mike Rapoport 
Cc: Oscar Salvador 
Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Wei Liu 
Cc: Dexuan Cui 
Cc: "Michael S. Tsirkin" 
Cc: Jason Wang 
Cc: Xuan Zhuo 
Cc: "Eugenio Pérez" 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
Cc: Oleksandr Tyshchenko 
Cc: Alexander Potapenko 
Cc: Marco Elver 
Cc: Dmitry Vyukov 

David Hildenbrand (3):
  mm: pass meminit_context to __free_pages_core()
  mm/memory_hotplug: initialize memmap of !ZONE_DEVICE with
PageOffline() instead of PageReserved()
  mm/memory_hotplug: skip adjust_managed_page_count() for PageOffline()
pages when offlining

 drivers/hv/hv_balloon.c|  5 ++--
 drivers/virtio/virtio_mem.c| 29 +-
 drivers/xen/balloon.c  |  9 --
 include/linux/memory_hotplug.h |  4 +--
 include/linux/page-flags.h | 20 +++--
 mm/internal.h  |  3 +-
 mm/kmsan/init.c|  2 +-
 mm/memory_hotplug.c| 31 +--
 mm/mm_init.c   | 14 ++---
 mm/page_alloc.c| 55 +++---
 10 files changed, 108 insertions(+), 64 deletions(-)


base-commit: 19b8422c5bd56fb5e7085995801c6543a98bda1f
prerequisite-patch-id: ca280eafd2732d7912e0c5249dc0df9ecbef19ca
prerequisite-patch-id: 8f43ebc81fdf7b9b665b57614e9e569535094758
-- 
2.45.1




[PATCH v1 2/3] mm/memory_hotplug: initialize memmap of !ZONE_DEVICE with PageOffline() instead of PageReserved()

2024-06-07 Thread David Hildenbrand
We currently initialize the memmap such that PG_reserved is set and the
refcount of the page is 1. In virtio-mem code, we have to manually clear
that PG_reserved flag to make memory offlining with partially hotplugged
memory blocks possible: has_unmovable_pages() would otherwise bail out on
such pages.

We want to avoid PG_reserved where possible and move to typed pages
instead. Further, we want to further enlighten memory offlining code about
PG_offline: offline pages in an online memory section. One example is
handling managed page count adjustments in a cleaner way during memory
offlining.

So let's initialize the pages with PG_offline instead of PG_reserved.
generic_online_page()->__free_pages_core() will now clear that flag before
handing that memory to the buddy.

Note that the page refcount is still 1 and would forbid offlining of such
memory except when special care is take during GOING_OFFLINE as
currently only implemented by virtio-mem.

With this change, we can now get non-PageReserved() pages in the XEN
balloon list. From what I can tell, that can already happen via
decrease_reservation(), so that should be fine.

HV-balloon should not really observe a change: partial online memory
blocks still cannot get surprise-offlined, because the refcount of these
PageOffline() pages is 1.

Update virtio-mem, HV-balloon and XEN-balloon code to be aware that
hotplugged pages are now PageOffline() instead of PageReserved() before
they are handed over to the buddy.

We'll leave the ZONE_DEVICE case alone for now.

Signed-off-by: David Hildenbrand 
---
 drivers/hv/hv_balloon.c |  5 ++---
 drivers/virtio/virtio_mem.c | 18 --
 drivers/xen/balloon.c   |  9 +++--
 include/linux/page-flags.h  | 12 +---
 mm/memory_hotplug.c | 16 ++--
 mm/mm_init.c| 10 --
 mm/page_alloc.c | 32 +++-
 7 files changed, 67 insertions(+), 35 deletions(-)

diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index e000fa3b9f978..c1be38edd8361 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -693,9 +693,8 @@ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
if (!PageOffline(pg))
__SetPageOffline(pg);
return;
-   }
-   if (PageOffline(pg))
-   __ClearPageOffline(pg);
+   } else if (!PageOffline(pg))
+   return;
 
/* This frame is currently backed; online the page. */
generic_online_page(pg, 0);
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index a3857bacc8446..b90df29621c81 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -1146,12 +1146,16 @@ static void virtio_mem_set_fake_offline(unsigned long pfn,
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
 
-   __SetPageOffline(page);
-   if (!onlined) {
+   if (!onlined)
+   /*
+* Pages that have not been onlined yet were initialized
+* to PageOffline(). Remember that we have to route them
+* through generic_online_page().
+*/
SetPageDirty(page);
-   /* FIXME: remove after cleanups */
-   ClearPageReserved(page);
-   }
+   else
+   __SetPageOffline(page);
+   VM_WARN_ON_ONCE(!PageOffline(page));
}
page_offline_end();
 }
@@ -1166,9 +1170,11 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn,
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
 
-   __ClearPageOffline(page);
if (!onlined)
+   /* generic_online_page() will clear PageOffline(). */
ClearPageDirty(page);
+   else
+   __ClearPageOffline(page);
}
 }
 
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index aaf2514fcfa46..528395133b4f8 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -146,7 +146,8 @@ static DECLARE_WAIT_QUEUE_HEAD(balloon_wq);
 /* balloon_append: add the given page to the balloon. */
 static void balloon_append(struct page *page)
 {
-   __SetPageOffline(page);
+   if (!PageOffline(page))
+   __SetPageOffline(page);
 
/* Lowmem is re-populated first, so highmem pages go at list tail. */
if (PageHighMem(page)) {
@@ -412,7 +413,11 @@ static enum bp_state increase_reservation(unsigned long nr_pages)
 
xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]);
 
-   /* Relinquish the page back to the allocator. */
+   /*
+* Relinquish the page back to the

[PATCH v1 1/3] mm: pass meminit_context to __free_pages_core()

2024-06-07 Thread David Hildenbrand
In preparation for further changes, let's teach __free_pages_core()
about the differences of memory hotplug handling.

Move the memory hotplug specific handling from generic_online_page() to
__free_pages_core(), use adjust_managed_page_count() on the memory
hotplug path, and spell out why memory freed via memblock
cannot currently use adjust_managed_page_count().

Signed-off-by: David Hildenbrand 
---
 mm/internal.h   |  3 ++-
 mm/kmsan/init.c |  2 +-
 mm/memory_hotplug.c |  9 +
 mm/mm_init.c|  4 ++--
 mm/page_alloc.c | 17 +++--
 5 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 12e95fdf61e90..3fdee779205ab 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -604,7 +604,8 @@ extern void __putback_isolated_page(struct page *page, unsigned int order,
int mt);
 extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
-extern void __free_pages_core(struct page *page, unsigned int order);
+extern void __free_pages_core(struct page *page, unsigned int order,
+   enum meminit_context);
 
 /*
  * This will have no effect, other than possibly generating a warning, if the
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
index 3ac3b8921d36f..ca79636f858e5 100644
--- a/mm/kmsan/init.c
+++ b/mm/kmsan/init.c
@@ -172,7 +172,7 @@ static void do_collection(void)
shadow = smallstack_pop(&collect);
origin = smallstack_pop(&collect);
kmsan_setup_meta(page, shadow, origin, collect.order);
-   __free_pages_core(page, collect.order);
+   __free_pages_core(page, collect.order, MEMINIT_EARLY);
}
 }
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 171ad975c7cfd..27e3be75edcf7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -630,14 +630,7 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback);
 
 void generic_online_page(struct page *page, unsigned int order)
 {
-   /*
-* Freeing the page with debug_pagealloc enabled will try to unmap it,
-* so we should map it first. This is better than introducing a special
-* case in page freeing fast path.
-*/
-   debug_pagealloc_map_pages(page, 1 << order);
-   __free_pages_core(page, order);
-   totalram_pages_add(1UL << order);
+   __free_pages_core(page, order, MEMINIT_HOTPLUG);
 }
 EXPORT_SYMBOL_GPL(generic_online_page);
 
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 019193b0d8703..feb5b6e8c8875 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1938,7 +1938,7 @@ static void __init deferred_free_range(unsigned long pfn,
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if (pageblock_aligned(pfn))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-   __free_pages_core(page, 0);
+   __free_pages_core(page, 0, MEMINIT_EARLY);
}
 }
 
@@ -2513,7 +2513,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
}
}
 
-   __free_pages_core(page, order);
+   __free_pages_core(page, order, MEMINIT_EARLY);
 }
 
 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2224965ada468..e0c8a8354be36 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1214,7 +1214,7 @@ static void __free_pages_ok(struct page *page, unsigned int order,
__count_vm_events(PGFREE, 1 << order);
 }
 
-void __free_pages_core(struct page *page, unsigned int order)
+void __free_pages_core(struct page *page, unsigned int order,
+   enum meminit_context context)
 {
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -1234,7 +1235,19 @@ void __free_pages_core(struct page *page, unsigned int order)
__ClearPageReserved(p);
set_page_count(p, 0);
 
-   atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
+   if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) &&
+   unlikely(context == MEMINIT_HOTPLUG)) {
+   /*
+* Freeing the page with debug_pagealloc enabled will try to
+* unmap it; some archs don't like double-unmappings, so
+* map it first.
+*/
+   debug_pagealloc_map_pages(page, nr_pages);
+   adjust_managed_page_count(page, nr_pages);
+   } else {
+   /* memblock adjusts totalram_pages() ahead of time. */
+   atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
+   }
 
if (page_contains_unaccepted(page, order)) {
if (order == MAX_PAGE_ORDER && __free_unaccepted(page))
-- 
2.45.1




Re: [PATCH v6 6/8] xen: mapcache: Pass the ram_addr offset to xen_map_cache()

2024-05-16 Thread David Hildenbrand

On 16.05.24 17:48, Edgar E. Iglesias wrote:

From: "Edgar E. Iglesias" 

Pass the ram_addr offset to xen_map_cache.
This is in preparation for adding grant mappings that need
to compute the address within the RAMBlock.

No functional changes.

Signed-off-by: Edgar E. Iglesias 
---


Reviewed-by: David Hildenbrand 

--
Cheers,

David / dhildenb




Re: [PATCH v4 14/17] xen: Add xen_mr_is_memory()

2024-05-02 Thread David Hildenbrand

On 30.04.24 18:49, Edgar E. Iglesias wrote:

From: "Edgar E. Iglesias" 

Add xen_mr_is_memory() to abstract away tests for the
xen_memory MR.

Signed-off-by: Edgar E. Iglesias 
---


[...]


  #endif
diff --git a/system/physmem.c b/system/physmem.c
index ad7a8c7d95..1a5ffcba2a 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -2227,7 +2227,7 @@ static void *qemu_ram_ptr_length(RAMBlock *block, ram_addr_t addr,
   * because we don't want to map the entire memory in QEMU.
   * In that case just map the requested area.
   */
-if (block->offset == 0) {
+if (xen_mr_is_memory(block->mr)) {
  return xen_map_cache(block->mr, addr, len, lock, lock,
   is_write);
  }


I'd have moved that into a separate patch, because this is not a simple 
abstraction here.


Acked-by: David Hildenbrand 

--
Cheers,

David / dhildenb




Re: [PATCH v4 13/17] softmmu: Pass RAM MemoryRegion and is_write xen_map_cache()

2024-05-02 Thread David Hildenbrand

On 30.04.24 18:49, Edgar E. Iglesias wrote:

From: "Edgar E. Iglesias" 

Propagate MR and is_write to xen_map_cache().


I'm pretty sure the patch subject is missing a "to" :)


This is in preparation for adding support for grant mappings.

No functional change.



Reviewed-by: David Hildenbrand 
--
Cheers,

David / dhildenb




Re: [PATCH-for-9.0 v2 04/19] system/physmem: Do not include 'hw/xen/xen.h' but 'sysemu/xen.h'

2023-11-14 Thread David Hildenbrand

On 14.11.23 15:38, Philippe Mathieu-Daudé wrote:

physmem.c doesn't use any declaration from "hw/xen/xen.h",
it only requires "sysemu/xen.h" and "system/xen-mapcache.h".

Suggested-by: David Woodhouse 
Signed-off-by: Philippe Mathieu-Daudé 
---


Reviewed-by: David Hildenbrand 

--
Cheers,

David / dhildenb




Re: [PATCH 3/3] sysemu/xen: Allow elision of xen_hvm_modified_memory()

2023-09-06 Thread David Hildenbrand

On 05.09.23 14:21, Philippe Mathieu-Daudé wrote:

Call xen_enabled() before xen_hvm_modified_memory() to let
the compiler elide its call.

Have xen_enabled() return a boolean to match its declaration
in the CONFIG_XEN_IS_POSSIBLE case.
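
A minimal sketch of the call-site pattern being described (illustrative only; the
exact QEMU call sites may differ):

/* With Xen support compiled out, xen_enabled() is constant-false, so the
 * compiler can elide the call below entirely. */
if (xen_enabled()) {
    xen_hvm_modified_memory(start, length);
}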

Suggested-by: Daniel Henrique Barboza 
Signed-off-by: Philippe Mathieu-Daudé 
---


Reviewed-by: David Hildenbrand 

--
Cheers,

David / dhildenb




Re: [PATCH v6 00/33] Split ptdesc from struct page

2023-06-28 Thread David Hildenbrand

On 27.06.23 22:13, Hugh Dickins wrote:

On Tue, 27 Jun 2023, David Hildenbrand wrote:

On 27.06.23 06:44, Hugh Dickins wrote:

On Mon, 26 Jun 2023, Vishal Moola (Oracle) wrote:


The MM subsystem is trying to shrink struct page. This patchset
introduces a memory descriptor for page table tracking - struct ptdesc.

...

   39 files changed, 686 insertions(+), 455 deletions(-)


I don't see the point of this patchset: to me it is just obfuscation of
the present-day tight relationship between page table and struct page.

Matthew already explained:


The intent is to get ptdescs to be dynamically allocated at some point
in the ~2-3 years out future when we have finished the folio project ...


So in a kindly mood, I'd say that this patchset is ahead of its time.
But I can certainly adapt to it, if everyone else sees some point to it.


I share your thoughts, that code churn which will help eventually in the far,
far future (not wanting to sound too pessimistic, but it's not going to be
there tomorrow ;) ).

However, if it's just the same as the other conversions we already did (e.g.,
struct slab), then I guess there is no reason to stop now -- the obfuscation
already happened.

... or is there a difference regarding this conversion and the previous ones?


I was aware of the struct slab thing, didn't see much point there myself
either; but it was welcomed by Vlastimil, and barely affected outside of
slab allocators, so I had no reason to object.

You think that if a little unnecessary churn (a *lot* of churn if you
include folios, which did save some repeated calls to compound_head())
has already occurred, that's a good precedent for allowing more and more?


Well, if you phrase it like that ... no, I'm not in favor of unnecessary 
churn at all. Yes, folios were/are a lot of churn (IMHO not unnecessary, 
though).


I'm not a friend of these "overlays"; it all only really makes sense to 
me once we actually allocate the descriptors dynamically. Maybe some of 
the existing/ongoing conversions were different (that's why I was asking 
for the difference, as you said the "struct slab" thing was well received).


If they are primarily only unnecessary churn for now (and unclear 
when/how it will become useful), I share your opinion.


--
Cheers,

David / dhildenb




Re: [PATCH v6 00/33] Split ptdesc from struct page

2023-06-27 Thread David Hildenbrand

On 27.06.23 06:44, Hugh Dickins wrote:

On Mon, 26 Jun 2023, Vishal Moola (Oracle) wrote:


The MM subsystem is trying to shrink struct page. This patchset
introduces a memory descriptor for page table tracking - struct ptdesc.

...

  39 files changed, 686 insertions(+), 455 deletions(-)


I don't see the point of this patchset: to me it is just obfuscation of
the present-day tight relationship between page table and struct page.

Matthew already explained:


The intent is to get ptdescs to be dynamically allocated at some point
in the ~2-3 years out future when we have finished the folio project ...


So in a kindly mood, I'd say that this patchset is ahead of its time.
But I can certainly adapt to it, if everyone else sees some point to it.


I share your thoughts, that code churn which will help eventually in the 
far, far future (not wanting to sound too pessimistic, but it's not 
going to be there tomorrow ;) ).


However, if it's just the same as the other conversions we already did 
(e.g., struct slab), then I guess there is no reason to stop now -- the 
obfuscation already happened.


... or is there a difference regarding this conversion and the previous 
ones?


--
Cheers,

David / dhildenb




Re: [PATCH v1 1/1] Q35 Support

2023-06-21 Thread David Hildenbrand

On 21.06.23 18:35, Joel Upham wrote:
Sorry, this was sent in error when I did the git send-email for the 
folder. This was before I broke each patch down (after looking at the 
Qemu submission guidance). This is my first time sending a patch in this 
way, so thanks for the understanding. This patch can be ignored, as they 
are all covered elsewhere.


We've all been there (messing with git send-email), no need to feel bad :)

--
Cheers,

David / dhildenb




Re: [PATCH v1 1/1] Q35 Support

2023-06-21 Thread David Hildenbrand

On 20.06.23 19:24, Joel Upham wrote:

Inexpressive patch subject and non-existent patch description. I have no
clue what this is supposed to do, except that it involves q35 and xen (I
guess?).



---
  hw/acpi/ich9.c|   22 +-
  hw/acpi/pcihp.c   |6 +-
  hw/core/machine.c |   19 +
  hw/i386/pc_piix.c |3 +-
  hw/i386/pc_q35.c  |   39 +-
  hw/i386/xen/xen-hvm.c |7 +-
  hw/i386/xen/xen_platform.c|   19 +-
  hw/isa/lpc_ich9.c |   53 +-
  hw/isa/piix3.c|2 +-
  hw/pci-host/q35.c |   28 +-
  hw/pci/pci.c  |   17 +
  hw/xen/xen-host-pci-device.c  |  106 +++-
  hw/xen/xen-host-pci-device.h  |6 +-
  hw/xen/xen_pt.c   |   49 +-
  hw/xen/xen_pt.h   |   19 +-
  hw/xen/xen_pt_config_init.c   | 1103 ++---
  include/hw/acpi/ich9.h|1 +
  include/hw/acpi/pcihp.h   |2 +
  include/hw/boards.h   |1 +
  include/hw/i386/pc.h  |3 +
  include/hw/pci-host/q35.h |4 +-
  include/hw/pci/pci.h  |3 +
  include/hw/southbridge/ich9.h |1 +
  include/hw/xen/xen.h  |4 +-
  qemu-options.hx   |1 +
  softmmu/datadir.c |1 -
  softmmu/qdev-monitor.c|3 +-
  stubs/xen-hw-stub.c   |4 +-
  28 files changed, 1395 insertions(+), 131 deletions(-)



Usually people refrain from reviewing such massive patches. Most 
probably this can be broken up into reviewable pieces.


Was this supposed to be an RFC?

--
Cheers,

David / dhildenb




Re: [PATCH v9 02/42] mm: Move pte/pmd_mkwrite() callers with no VMA to _novma()

2023-06-13 Thread David Hildenbrand

On 13.06.23 18:19, Edgecombe, Rick P wrote:

On Tue, 2023-06-13 at 10:44 +0300, Mike Rapoport wrote:

Previous patches have done the first step, so next move the callers
that
don't have a VMA to pte_mkwrite_novma(). Also do the same for


I hear x86 maintainers asking to drop "previous patches" ;-)

Maybe
This is the second step of the conversion that moves the callers ...


Really? I've not heard that. Just a strong aversion to "this patch".
I've got feedback to say "previous patches" and not "the last patch" so
it doesn't get stale. I guess it could be "previous changes".


Talking about patches makes sense when discussing literal patches sent to
the mailing list. In the git log, it's commits, and "future commits" or
"follow-up work".


Yes, we use "patches" all of the time in commit logs, especially when we 
 include the cover letter in the commit message (as done frequently in 
the -mm tree).


--
Cheers,

David / dhildenb




Re: [PATCH v9 02/42] mm: Move pte/pmd_mkwrite() callers with no VMA to _novma()

2023-06-13 Thread David Hildenbrand

On 13.06.23 02:10, Rick Edgecombe wrote:

The x86 Shadow stack feature includes a new type of memory called shadow
stack. This shadow stack memory has some unusual properties, which requires
some core mm changes to function properly.

One of these unusual properties is that shadow stack memory is writable,
but only in limited ways. These limits are applied via a specific PTE
bit combination. Nevertheless, the memory is writable, and core mm code
will need to apply the writable permissions in the typical paths that
call pte_mkwrite(). Future patches will make pte_mkwrite() take a VMA, so
that the x86 implementation of it can know whether to create regular
writable memory or shadow stack memory.

But there are a couple of challenges to this. Modifying the signatures of
each arch pte_mkwrite() implementation would be error prone because some
are generated with macros and would need to be re-implemented. Also, some
pte_mkwrite() callers operate on kernel memory without a VMA.

So this can be done in a three step process. First pte_mkwrite() can be
renamed to pte_mkwrite_novma() in each arch, with a generic pte_mkwrite()
added that just calls pte_mkwrite_novma(). Next callers without a VMA can
be moved to pte_mkwrite_novma(). And lastly, pte_mkwrite() and all callers
can be changed to take/pass a VMA.

Previous patches have done the first step, so next move the callers that
don't have a VMA to pte_mkwrite_novma(). Also do the same for
pmd_mkwrite(). This will be ok for the shadow stack feature, as these
callers are on kernel memory which will not need to be made shadow stack,
and the other architectures only currently support one type of memory
in pte_mkwrite()

Cc: linux-...@vger.kernel.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-s...@vger.kernel.org
Cc: xen-devel@lists.xenproject.org
Cc: linux-a...@vger.kernel.org
Cc: linux...@kvack.org
Signed-off-by: Rick Edgecombe 
---


Acked-by: David Hildenbrand 
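
For readers skimming the archive: the first two steps described above
boil down to roughly the following (a minimal sketch, not the literal
kernel code; the pte_set_flags()/_PAGE_RW line is an x86-flavoured
example):

/* Step 1: each arch renames its implementation to the _novma() variant ... */
static inline pte_t pte_mkwrite_novma(pte_t pte)
{
        return pte_set_flags(pte, _PAGE_RW);
}

/* ... and a generic wrapper keeps existing pte_mkwrite() callers working. */
static inline pte_t pte_mkwrite(pte_t pte)
{
        return pte_mkwrite_novma(pte);
}

/*
 * Step 2 (this patch): callers operating on kernel memory, i.e. without a
 * VMA, call pte_mkwrite_novma()/pmd_mkwrite_novma() directly, so that the
 * final step can turn pte_mkwrite() into pte_mkwrite(pte, vma) without
 * touching them.
 */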

--
Cheers,

David / dhildenb




Re: [PATCH 01/33] s390: Use _pt_s390_gaddr for gmap address tracking

2023-04-19 Thread David Hildenbrand

On 18.04.23 23:33, Vishal Moola wrote:

On Tue, Apr 18, 2023 at 8:45 AM David Hildenbrand  wrote:


On 17.04.23 22:50, Vishal Moola (Oracle) wrote:

s390 uses page->index to keep track of page tables for the guest address
space. In an attempt to consolidate the usage of page fields in s390,
replace _pt_pad_2 with _pt_s390_gaddr to replace page->index in gmap.

This will help with the splitting of struct ptdesc from struct page, as
well as allow s390 to use _pt_frag_refcount for fragmented page table
tracking.

Since page->_pt_s390_gaddr aliases with mapping, ensure its set to NULL
before freeing the pages as well.

Signed-off-by: Vishal Moola (Oracle) 
---


[...]


diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3fc9e680f174..2616d64c0e8c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -144,7 +144,7 @@ struct page {
   struct {/* Page table pages */
   unsigned long _pt_pad_1;/* compound_head */
   pgtable_t pmd_huge_pte; /* protected by page->ptl */
- unsigned long _pt_pad_2;/* mapping */
+ unsigned long _pt_s390_gaddr;   /* mapping */
   union {
   struct mm_struct *pt_mm; /* x86 pgds only */
   atomic_t pt_frag_refcount; /* powerpc */


The confusing part is that these gmap page tables are not ordinary
process page tables that we would ordinarily place into this section
here. That's why they are also not allocated/freed using the typical
page table constructor/destructor ...


I initially thought the same, so I was quite confused when I saw
__gmap_segment_gaddr was using pmd_pgtable_page().

Although they are not ordinary process page tables, since we
eventually want to move them out of struct page, I think shifting them
to be in ptdescs, being a memory descriptor for page tables, makes
the most sense.


Seeing utilities like tlb_remove_page_ptdesc() that don't really apply
to such page tables, I wonder if we should rather treat such
shadow/auxiliary/... page tables (which other architectures like x86,
arm, ... employ as well) as a distinct type.


And have ptdesc be the common type for all process page tables.
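
Just to illustrate what a distinct type could look like (a purely
hypothetical sketch; all names below are made up):

/* Process page tables keep using the common descriptor ... */
struct ptdesc {
        unsigned long __page_flags;
        /* ptl, pt_mm / pt_frag_refcount, pmd_huge_pte, ... */
};

/*
 * ... while gmap shadow tables get their own minimal descriptor instead
 * of overlaying struct page fields such as _pt_s390_gaddr.
 */
struct gmap_table_desc {
        unsigned long gaddr;            /* what page->index tracks today */
        struct list_head link;          /* gmap-internal bookkeeping */
};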



Another option is to leave pmd_pgtable_page() as is just for this case.
Or we can revert commit 7e25de77bc5ea which uses the function here
then figure out where these gmap page table pages will go later.


I'm always confused when reading gmap code, so let me have another look :)

The confusing part is that s390x shares the lowest level page tables 
(PTE tables) between the process and gmap ("guest mapping", similar to 
EPT on x86-64). It maps these process PTE tables (covering 1 MiB) into 
gmap-specific PMD tables.


pmd_pgtable_page() should indeed always give us a gmap-specific 
PMD-table. In fact, something allocated via gmap_alloc_table().


Decoupling both concepts sounds like a good idea.

--
Thanks,

David / dhildenb




Re: [PATCH 01/33] s390: Use _pt_s390_gaddr for gmap address tracking

2023-04-18 Thread David Hildenbrand

On 17.04.23 22:50, Vishal Moola (Oracle) wrote:

s390 uses page->index to keep track of page tables for the guest address
space. In an attempt to consolidate the usage of page fields in s390,
replace _pt_pad_2 with _pt_s390_gaddr to replace page->index in gmap.

This will help with the splitting of struct ptdesc from struct page, as
well as allow s390 to use _pt_frag_refcount for fragmented page table
tracking.

Since page->_pt_s390_gaddr aliases with mapping, ensure its set to NULL
before freeing the pages as well.

Signed-off-by: Vishal Moola (Oracle) 
---


[...]


diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3fc9e680f174..2616d64c0e8c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -144,7 +144,7 @@ struct page {
struct {/* Page table pages */
unsigned long _pt_pad_1;/* compound_head */
pgtable_t pmd_huge_pte; /* protected by page->ptl */
-   unsigned long _pt_pad_2;/* mapping */
+   unsigned long _pt_s390_gaddr;   /* mapping */
union {
struct mm_struct *pt_mm; /* x86 pgds only */
atomic_t pt_frag_refcount; /* powerpc */


The confusing part is that these gmap page tables are not ordinary
process page tables that we would ordinarily place into this section
here. That's why they are also not allocated/freed using the typical
page table constructor/destructor ...


--
Thanks,

David / dhildenb




Re: [PATCH v7 13/41] mm: Make pte_mkwrite() take a VMA

2023-03-01 Thread David Hildenbrand

On 01.03.23 08:03, Christophe Leroy wrote:



Le 27/02/2023 à 23:29, Rick Edgecombe a écrit :

The x86 Control-flow Enforcement Technology (CET) feature includes a new
type of memory called shadow stack. This shadow stack memory has some
unusual properties, which requires some core mm changes to function
properly.

One of these unusual properties is that shadow stack memory is writable,
but only in limited ways. These limits are applied via a specific PTE
bit combination. Nevertheless, the memory is writable, and core mm code
will need to apply the writable permissions in the typical paths that
call pte_mkwrite().

In addition to VM_WRITE, the shadow stack VMA's will have a flag denoting
that they are special shadow stack flavor of writable memory. So make
pte_mkwrite() take a VMA, so that the x86 implementation of it can know to
create regular writable memory or shadow stack memory.

Apply the same changes for pmd_mkwrite() and huge_pte_mkwrite().


I'm not sure it is a good idea to add a second argument to
pte_mkwrite(). All pte_mk() only take a pte and nothing else.


We touched on this in previous revisions and so far there was no strong 
push back. This turned out to be cleaner and easier than the 
alternatives we evaluated.


pte_modify(), for example, takes another argument. Sure, we could think
about passing something other than a VMA to identify the writability
type, but I am not convinced that would look particularly better.
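
To make that concrete: the point of the VMA argument is that the x86
implementation can pick the right PTE encoding itself. Roughly, as a
sketch of where the series is heading (VM_SHADOW_STACK and
pte_mkwrite_shstk() are the names used by the shadow stack series, the
body is illustrative only):

static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        if (vma->vm_flags & VM_SHADOW_STACK)
                return pte_mkwrite_shstk(pte);  /* Write=0, Dirty=1 encoding */

        return pte_set_flags(pte, _PAGE_RW);    /* ordinary writable memory */
}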




I think you should do the same as commit d9ed9faac283 ("mm: add new
arch_make_huge_pte() method for tile support")



We already have 3 architectures intending to support shadow stacks in 
one way or the other. Replacing all pte_mkwrite() with 
arch_pte_mkwrite() doesn't sound particularly appealing to me.



--
Thanks,

David / dhildenb




Re: [PATCH v6 13/41] mm: Make pte_mkwrite() take a VMA

2023-02-20 Thread David Hildenbrand

On 18.02.23 22:14, Rick Edgecombe wrote:

The x86 Control-flow Enforcement Technology (CET) feature includes a new
type of memory called shadow stack. This shadow stack memory has some
unusual properties, which requires some core mm changes to function
properly.

One of these unusual properties is that shadow stack memory is writable,
but only in limited ways. These limits are applied via a specific PTE
bit combination. Nevertheless, the memory is writable, and core mm code
will need to apply the writable permissions in the typical paths that
call pte_mkwrite().

In addition to VM_WRITE, the shadow stack VMA's will have a flag denoting
that they are special shadow stack flavor of writable memory. So make
pte_mkwrite() take a VMA, so that the x86 implementation of it can know to
create regular writable memory or shadow stack memory.

Apply the same changes for pmd_mkwrite() and huge_pte_mkwrite().

No functional change.

Cc: linux-...@vger.kernel.org
Cc: linux-ker...@vger.kernel.org
Cc: linux-al...@vger.kernel.org
Cc: linux-snps-...@lists.infradead.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-c...@vger.kernel.org
Cc: linux-hexa...@vger.kernel.org
Cc: linux-i...@vger.kernel.org
Cc: loonga...@lists.linux.dev
Cc: linux-m...@lists.linux-m68k.org
Cc: Michal Simek 
Cc: Dinh Nguyen 
Cc: linux-m...@vger.kernel.org
Cc: openr...@lists.librecores.org
Cc: linux-par...@vger.kernel.org
Cc: linuxppc-...@lists.ozlabs.org
Cc: linux-ri...@lists.infradead.org
Cc: linux-s...@vger.kernel.org
Cc: linux...@vger.kernel.org
Cc: sparcli...@vger.kernel.org
Cc: linux...@lists.infradead.org
Cc: xen-devel@lists.xenproject.org
Cc: linux-a...@vger.kernel.org
Cc: linux...@kvack.org
Tested-by: Pengfei Xu 
Suggested-by: David Hildenbrand 
Signed-off-by: Rick Edgecombe 

---
Hi Non-x86 Arch’s,

x86 has a feature that allows for the creation of a special type of
writable memory (shadow stack) that is only writable in limited specific
ways. Previously, changes were proposed to core MM code to teach it to
decide when to create normally writable memory or the special shadow stack
writable memory, but David Hildenbrand suggested[0] to change
pXX_mkwrite() to take a VMA, so awareness of shadow stack memory can be
moved into x86 code.

Since pXX_mkwrite() is defined in every arch, it requires some tree-wide
changes. So that is why you are seeing some patches out of a big x86
series pop up in your arch mailing list. There is no functional change.
After this refactor, the shadow stack series goes on to use the arch
helpers to push shadow stack memory details inside arch/x86.

Testing was just 0-day build testing.

Hopefully that is enough context. Thanks!

[0] 
https://lore.kernel.org/lkml/0e29a2d0-08d8-bcd6-ff26-4bea0e403...@redhat.com/#t

v6:
  - New patch
---
  Documentation/mm/arch_pgtable_helpers.rst|  9 ++---
  arch/alpha/include/asm/pgtable.h |  6 +-
  arch/arc/include/asm/hugepage.h  |  2 +-
  arch/arc/include/asm/pgtable-bits-arcv2.h|  7 ++-
  arch/arm/include/asm/pgtable-3level.h|  7 ++-
  arch/arm/include/asm/pgtable.h   |  2 +-
  arch/arm64/include/asm/pgtable.h |  4 ++--
  arch/csky/include/asm/pgtable.h  |  2 +-
  arch/hexagon/include/asm/pgtable.h   |  2 +-
  arch/ia64/include/asm/pgtable.h  |  2 +-
  arch/loongarch/include/asm/pgtable.h |  4 ++--
  arch/m68k/include/asm/mcf_pgtable.h  |  2 +-
  arch/m68k/include/asm/motorola_pgtable.h |  6 +-
  arch/m68k/include/asm/sun3_pgtable.h |  6 +-
  arch/microblaze/include/asm/pgtable.h|  2 +-
  arch/mips/include/asm/pgtable.h  |  6 +++---
  arch/nios2/include/asm/pgtable.h |  2 +-
  arch/openrisc/include/asm/pgtable.h  |  2 +-
  arch/parisc/include/asm/pgtable.h|  6 +-
  arch/powerpc/include/asm/book3s/32/pgtable.h |  2 +-
  arch/powerpc/include/asm/book3s/64/pgtable.h |  4 ++--
  arch/powerpc/include/asm/nohash/32/pgtable.h |  2 +-
  arch/powerpc/include/asm/nohash/32/pte-8xx.h |  2 +-
  arch/powerpc/include/asm/nohash/64/pgtable.h |  2 +-
  arch/riscv/include/asm/pgtable.h |  6 +++---
  arch/s390/include/asm/hugetlb.h  |  4 ++--
  arch/s390/include/asm/pgtable.h  |  4 ++--
  arch/sh/include/asm/pgtable_32.h | 10 --
  arch/sparc/include/asm/pgtable_32.h  |  2 +-
  arch/sparc/include/asm/pgtable_64.h  |  6 +++---
  arch/um/include/asm/pgtable.h|  2 +-
  arch/x86/include/asm/pgtable.h   |  6 --
  arch/xtensa/include/asm/pgtable.h|  2 +-
  include/asm-generic/hugetlb.h|  4 ++--
  include/linux/mm.h   |  2 +-
  mm/debug_vm_pgtable.c| 16 
  mm/huge_memory.c |  6 +++---
  mm/hugetlb.c |  4 ++--
  mm/mem

Re: [PATCH v6 11/41] mm: Introduce pte_mkwrite_kernel()

2023-02-20 Thread David Hildenbrand

On 18.02.23 22:14, Rick Edgecombe wrote:

The x86 Control-flow Enforcement Technology (CET) feature includes a new
type of memory called shadow stack. This shadow stack memory has some
unusual properties, which requires some core mm changes to function
properly.

One of these changes is to allow for pte_mkwrite() to create different
types of writable memory (the existing conventionally writable type and
also the new shadow stack type). Future patches will convert pte_mkwrite()
to take a VMA in order to facilitate this, however there are places in the
kernel where pte_mkwrite() is called outside of the context of a VMA.
These are for kernel memory. So create a new variant called
pte_mkwrite_kernel() and switch the kernel users over to it. Have
pte_mkwrite() and pte_mkwrite_kernel() be the same for now. Future patches
will introduce changes to make pte_mkwrite() take a VMA.

Only do this for architectures that need it because they call pte_mkwrite()
in arch code without an associated VMA. Since it will only currently be
used in arch code, so do not include it in arch_pgtable_helpers.rst.

Cc: linux-...@vger.kernel.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-s...@vger.kernel.org
Cc: xen-devel@lists.xenproject.org
Cc: linux-a...@vger.kernel.org
Cc: linux...@kvack.org
Tested-by: Pengfei Xu 
Suggested-by: David Hildenbrand 
Signed-off-by: Rick Edgecombe 



Acked-by: David Hildenbrand 

Do we also have to care about pmd_mkwrite() ?

--
Thanks,

David / dhildenb




Re: [PATCH v6 11/41] mm: Introduce pte_mkwrite_kernel()

2023-02-20 Thread David Hildenbrand

On 19.02.23 21:38, Kees Cook wrote:

On Sat, Feb 18, 2023 at 01:14:03PM -0800, Rick Edgecombe wrote:

The x86 Control-flow Enforcement Technology (CET) feature includes a new
type of memory called shadow stack. This shadow stack memory has some
unusual properties, which requires some core mm changes to function
properly.

One of these changes is to allow for pte_mkwrite() to create different
types of writable memory (the existing conventionally writable type and
also the new shadow stack type). Future patches will convert pte_mkwrite()
to take a VMA in order to facilitate this, however there are places in the
kernel where pte_mkwrite() is called outside of the context of a VMA.
These are for kernel memory. So create a new variant called
pte_mkwrite_kernel() and switch the kernel users over to it. Have
pte_mkwrite() and pte_mkwrite_kernel() be the same for now. Future patches
will introduce changes to make pte_mkwrite() take a VMA.

Only do this for architectures that need it because they call pte_mkwrite()
in arch code without an associated VMA. Since it will only currently be
used in arch code, so do not include it in arch_pgtable_helpers.rst.

Cc: linux-...@vger.kernel.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-s...@vger.kernel.org
Cc: xen-devel@lists.xenproject.org
Cc: linux-a...@vger.kernel.org
Cc: linux...@kvack.org
Tested-by: Pengfei Xu 
Suggested-by: David Hildenbrand 
Signed-off-by: Rick Edgecombe 


I think it's a little weird that it's the only PTE helper taking a vma,
but it does seem like the right approach.


Right. We could pass the vm flags instead, but not sure if that really 
improves the situation. So unless someone has a better idea, this LGTM.
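
For completeness, the alternative mentioned here would look roughly like
the following (purely hypothetical, not proposed code):

static inline pte_t pte_mkwrite(pte_t pte, unsigned long vm_flags)
{
        if (vm_flags & VM_SHADOW_STACK)
                return pte_mkwrite_shstk(pte);

        return pte_set_flags(pte, _PAGE_RW);
}

Callers would have to pass vma->vm_flags explicitly everywhere, which
doesn't save much over passing the VMA itself and gives up the option of
basing future decisions on other VMA state.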


--
Thanks,

David / dhildenb




Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-09-01 Thread David Hildenbrand
On 01.09.22 16:23, Kent Overstreet wrote:
> On Thu, Sep 01, 2022 at 10:05:03AM +0200, David Hildenbrand wrote:
>> On 31.08.22 21:01, Kent Overstreet wrote:
>>> On Wed, Aug 31, 2022 at 12:47:32PM +0200, Michal Hocko wrote:
>>>> On Wed 31-08-22 11:19:48, Mel Gorman wrote:
>>>>> Whatever asking for an explanation as to why equivalent functionality
>>>>> cannot not be created from ftrace/kprobe/eBPF/whatever is reasonable.
>>>>
>>>> Fully agreed and this is especially true for a change this size
>>>> 77 files changed, 3406 insertions(+), 703 deletions(-)
>>>
>>> In the case of memory allocation accounting, you flat cannot do this with 
>>> ftrace
>>> - you could maybe do a janky version that isn't fully accurate, much slower,
>>> more complicated for the developer to understand and debug and more 
>>> complicated
>>> for the end user.
>>>
>>> But please, I invite anyone who's actually been doing this with ftrace to
>>> demonstrate otherwise.
>>>
>>> Ftrace just isn't the right tool for the job here - we're talking about 
>>> adding
>>> per callsite accounting to some of the fastest fast paths in the kernel.
>>>
>>> And the size of the changes for memory allocation accounting are much more
>>> reasonable:
>>>  33 files changed, 623 insertions(+), 99 deletions(-)
>>>
>>> The code tagging library should exist anyways, it's been open coded half a 
>>> dozen
>>> times in the kernel already.
>>
>> Hi Kent,
>>
>> independent of the other discussions, if it's open coded already, does
>> it make sense to factor that already-open-coded part out independently
>> of the remainder of the full series here?
> 
> It's discussed in the cover letter, that is exactly how the patch series is
> structured.

Skimming over the patches (that I was CCed on) and skimming over the
cover letter, I got the impression that everything after patch 7 is
introducing something new instead of refactoring something out.

>  
>> [I didn't immediately spot if this series also attempts already to
>> replace that open-coded part]
> 
> Uh huh.
> 
> Honestly, some days it feels like lkml is just as bad as slashdot, with people
> wanting to get in their two cents without actually reading...

... and of course you had to reply like that. I should just have learned
from my last upstream experience with you and kept you on my spam list.

Thanks, bye

-- 
Thanks,

David / dhildenb




Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-09-01 Thread David Hildenbrand
On 31.08.22 21:01, Kent Overstreet wrote:
> On Wed, Aug 31, 2022 at 12:47:32PM +0200, Michal Hocko wrote:
>> On Wed 31-08-22 11:19:48, Mel Gorman wrote:
>>> Whatever asking for an explanation as to why equivalent functionality
>>> cannot not be created from ftrace/kprobe/eBPF/whatever is reasonable.
>>
>> Fully agreed and this is especially true for a change this size
>> 77 files changed, 3406 insertions(+), 703 deletions(-)
> 
> In the case of memory allocation accounting, you flat cannot do this with 
> ftrace
> - you could maybe do a janky version that isn't fully accurate, much slower,
> more complicated for the developer to understand and debug and more 
> complicated
> for the end user.
> 
> But please, I invite anyone who's actually been doing this with ftrace to
> demonstrate otherwise.
> 
> Ftrace just isn't the right tool for the job here - we're talking about adding
> per callsite accounting to some of the fastest fast paths in the kernel.
> 
> And the size of the changes for memory allocation accounting are much more
> reasonable:
>  33 files changed, 623 insertions(+), 99 deletions(-)
> 
> The code tagging library should exist anyways, it's been open coded half a 
> dozen
> times in the kernel already.

Hi Kent,

independent of the other discussions, if it's open coded already, does
it make sense to factor that already-open-coded part out independently
of the remainder of the full series here?

[I didn't immediately spot if this series also attempts already to
replace that open-coded part]

-- 
Thanks,

David / dhildenb




Re: [PATCH v2] mm, page_alloc: fix build_zonerefs_node()

2022-04-07 Thread David Hildenbrand
On 07.04.22 14:06, Juergen Gross wrote:
> Since commit 6aa303defb74 ("mm, vmscan: only allocate and reclaim from
> zones with pages managed by the buddy allocator") only zones with free
> memory are included in a built zonelist. This is problematic when e.g.
> all memory of a zone has been ballooned out when zonelists are being
> rebuilt.
> 
> The decision whether to rebuild the zonelists when onlining new memory
> is done based on populated_zone() returning 0 for the zone the memory
> will be added to. The new zone is added to the zonelists only, if it
> has free memory pages (managed_zone() returns a non-zero value) after
> the memory has been onlined. This implies, that onlining memory will
> always free the added pages to the allocator immediately, but this is
> not true in all cases: when e.g. running as a Xen guest the onlined
> new memory will be added only to the ballooned memory list, it will be
> freed only when the guest is being ballooned up afterwards.
> 
> Another problem with using managed_zone() for the decision whether a
> zone is being added to the zonelists is, that a zone with all memory
> used will in fact be removed from all zonelists in case the zonelists
> happen to be rebuilt.
> 
> Use populated_zone() when building a zonelist as it has been done
> before that commit.
> 
> Cc: sta...@vger.kernel.org
> Fixes: 6aa303defb74 ("mm, vmscan: only allocate and reclaim from zones with 
> pages managed by the buddy allocator")
> Reported-by: Marek Marczykowski-Górecki 
> Signed-off-by: Juergen Gross 
> Acked-by: Michal Hocko 
> ---
> V2:
> - updated commit message (Michal Hocko)
> ---
>  mm/page_alloc.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index bdc8f60ae462..3d0662af3289 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -6128,7 +6128,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct 
> zoneref *zonerefs)
>   do {
>   zone_type--;
>   zone = pgdat->node_zones + zone_type;
> - if (managed_zone(zone)) {
> + if (populated_zone(zone)) {
>   zoneref_set_zone(zone, &zonerefs[nr_zones++]);
>   check_highest_zone(zone_type);
>   }

Did you drop my Ack?

Also, I'd appreciate getting CCed on patches where I commented.

-- 
Thanks,

David / dhildenb




Re: [PATCH] mm, page_alloc: fix build_zonerefs_node()

2022-04-07 Thread David Hildenbrand
On 07.04.22 14:04, Michal Hocko wrote:
> On Thu 07-04-22 13:58:44, David Hildenbrand wrote:
> [...]
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index 3589febc6d31..130a2feceddc 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -6112,10 +6112,8 @@ static int build_zonerefs_node(pg_data_t *pgdat, 
>>> struct zoneref *zonerefs)
>>> do {
>>> zone_type--;
>>> zone = pgdat->node_zones + zone_type;
>>> -   if (managed_zone(zone)) {
>>> -   zoneref_set_zone(zone, &zonerefs[nr_zones++]);
>>> -   check_highest_zone(zone_type);
>>> -   }
>>> +   zoneref_set_zone(zone, &zonerefs[nr_zones++]);
>>> +   check_highest_zone(zone_type);
>>> } while (zone_type);
>>>  
>>> return nr_zones;
>>
>> I don't think having !populated zones in the zonelist is a particularly
>> good idea. Populated vs !populated changes only during page
>> onlining/offlining.
>>
>> If I'm not wrong, with your patch we'd even include ZONE_DEVICE here ...
> 
> What kind of problem that would cause? The allocator wouldn't see any
> pages at all so it would fallback to the next one. Maybe kswapd would
> need some tweak to have a bail out condition but as mentioned in the
> thread already. !populated or !managed for that matter are not all that
> much different from completely depleted zones. The fact that we are
> making that distinction has led to some bugs and I suspect it makes the
> code more complex without a very good reason.

I assume performance problems. Assume you have an ordinary system with
multiple NUMA nodes and no MOVABLE memory. Most nodes will only have
ZONE_NORMAL. Yet, you'd include ZONE_DMA* and ZONE_MOVABLE that will
always remain empty to be traversed on each and every allocation
fallback. Of course, we could measure, but IMHO at least *that* part of
memory onlining/offlining is not the complicated part :D

Populated vs. !populated is under pretty good control via page
onlining/offlining. We have to be careful with "managed pages", because
that's a moving target, especially with memory ballooning. And I assume
that's the bigger source of bugs.
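
To make the cost concrete: the allocator's fallback path walks the
zonelist on every allocation attempt, roughly like the following (heavily
simplified sketch of the loop in get_page_from_freelist()); zonerefs for
zones that can never hold pages are just dead entries in that walk:

        for_each_zone_zonelist_nodemask(zone, z, zonelist, highest_zoneidx,
                                        nodemask) {
                /* an always-empty zone still gets visited and rejected */
                if (!zone_watermark_fast(zone, order, mark, highest_zoneidx,
                                         alloc_flags, gfp_mask))
                        continue;
                /* ... otherwise try rmqueue() on this zone ... */
        }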

> 
>> I'd vote for going with the simple fix first, which should be good
>> enough AFAIKT.
> 
> yes, see the other reply
> 

I think we were composing almost simultaneously :)

-- 
Thanks,

David / dhildenb




Re: [PATCH] mm, page_alloc: fix build_zonerefs_node()

2022-04-07 Thread David Hildenbrand
On 07.04.22 13:40, Michal Hocko wrote:
> On Thu 07-04-22 13:17:19, Juergen Gross wrote:
>> On 07.04.22 13:07, Michal Hocko wrote:
>>> On Thu 07-04-22 12:45:41, Juergen Gross wrote:
 On 07.04.22 12:34, Michal Hocko wrote:
> Ccing Mel
>
> On Thu 07-04-22 11:32:21, Juergen Gross wrote:
>> Since commit 9d3be21bf9c0 ("mm, page_alloc: simplify zonelist
>> initialization") only zones with free memory are included in a built
>> zonelist. This is problematic when e.g. all memory of a zone has been
>> ballooned out.
>
> What is the actual problem there?

 When running as Xen guest new hotplugged memory will not be onlined
 automatically, but only on special request. This is done in order to
 support adding e.g. the possibility to use another GB of memory, while
 adding only a part of that memory initially.

 In case adding that memory is populating a new zone, the page allocator
 won't be able to use this memory when it is onlined, as the zone wasn't
 added to the zonelist, due to managed_zone() returning 0.
>>>
>>> How is that memory onlined? Because "regular" onlining (online_pages())
>>> does rebuild zonelists if their zone hasn't been populated before.
>>
>> The Xen balloon driver has an own callback for onlining pages. The pages
>> are just added to the ballooned-out page list without handing them to the
>> allocator. This is done only when the guest is ballooned up.
> 
> OK, I see. Let me just rephrase to see whether we are on the same page.
> Xen is overriding the online_page_callback to xen_online_page which
> doesn't free pages to the page allocator which means that a zone might
> remain unpopulated after onlining. This means that the default zone
> lists rebuild is not done and later on when those pages are finally
> released to the allocator there is no build_all_zonelists happening so
> those freed pages are not really visible to the allocator via zonelists
> fallback allocation.
> 
> Now to your patch. I suspect this is not sufficient for the full hotplug
> situation. Consider a new NUMA node to be hotadded. hotadd_new_pgdat
> will call build_all_zonelists but the zone is not populated yet at that
> moment unless I am missing something. We do rely on online_pages to
> rebuild once pages are onlined - which usually means they are freed to
> the page allocator.
> 
> The zonelists building is kinda messy TBH. I have to say that I am not
> really clear on Mel's 6aa303defb74 ("mm, vmscan: only allocate and
> reclaim from zones with pages managed by the buddy allocator") because
> as you have said unpoppulated zone is not (or shouldn't be) really all
> that different from a depleted zone.
> 
> I think a better and more complete fix would be the following. In other
> words the zonelists will be built for all present zones. Not sure
> whether that is going to break 6aa303defb74 though.
> 
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 2a9627dc784c..880c455e2557 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1062,7 +1062,6 @@ int __ref online_pages(unsigned long pfn, unsigned long 
> nr_pages,
>  struct zone *zone, struct memory_group *group)
>  {
>   unsigned long flags;
> - int need_zonelists_rebuild = 0;
>   const int nid = zone_to_nid(zone);
>   int ret;
>   struct memory_notify arg;
> @@ -1106,17 +1105,13 @@ int __ref online_pages(unsigned long pfn, unsigned 
> long nr_pages,
>* This means the page allocator ignores this zone.
>* So, zonelist must be updated after online.
>*/
> - if (!populated_zone(zone)) {
> - need_zonelists_rebuild = 1;
> + if (!populated_zone(zone))
>   setup_zone_pageset(zone);
> - }
>  
>   online_pages_range(pfn, nr_pages);
>   adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
>  
>   node_states_set_node(nid, &arg);
> - if (need_zonelists_rebuild)
> - build_all_zonelists(NULL);
>  
>   /* Basic onlining is complete, allow allocation of onlined pages. */
>   undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
> @@ -1985,10 +1980,8 @@ int __ref offline_pages(unsigned long start_pfn, 
> unsigned long nr_pages,
>   /* reinitialise watermarks and update pcp limits */
>   init_per_zone_wmark_min();
>  
> - if (!populated_zone(zone)) {
> + if (!populated_zone(zone))
>   zone_pcp_reset(zone);
> - build_all_zonelists(NULL);
> - }
>  
>   node_states_clear_node(node, &arg);
>   if (arg.status_change_nid >= 0) {
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3589febc6d31..130a2feceddc 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -6112,10 +6112,8 @@ static int build_zonerefs_node(pg_data_t *pgdat, 
> struct zoneref *zonerefs)
>   do {
>   zone_type--;
>   zone = pgdat->node_zones + zone_type;
> - if (managed_z

Re: [PATCH] mm, page_alloc: fix build_zonerefs_node()

2022-04-07 Thread David Hildenbrand
On 07.04.22 11:32, Juergen Gross wrote:
> Since commit 9d3be21bf9c0 ("mm, page_alloc: simplify zonelist
> initialization") only zones with free memory are included in a built
> zonelist. This is problematic when e.g. all memory of a zone has been
> ballooned out.
> 
> Use populated_zone() when building a zonelist as it has been done
> before that commit.
> 
> Cc: sta...@vger.kernel.org
> Fixes: 9d3be21bf9c0 ("mm, page_alloc: simplify zonelist initialization")
> Reported-by: Marek Marczykowski-Górecki 
> Signed-off-by: Juergen Gross 
> ---
>  mm/page_alloc.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index bdc8f60ae462..3d0662af3289 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -6128,7 +6128,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct 
> zoneref *zonerefs)
>   do {
>   zone_type--;
>   zone = pgdat->node_zones + zone_type;
> - if (managed_zone(zone)) {
> + if (populated_zone(zone)) {
>   zoneref_set_zone(zone, &zonerefs[nr_zones++]);
>   check_highest_zone(zone_type);
>   }

Let's see if we have to find another way to properly handle fadump.

Acked-by: David Hildenbrand 

-- 
Thanks,

David / dhildenb




Re: [PATCH] xen/balloon: fix page onlining when populating new zone

2022-04-07 Thread David Hildenbrand
On 07.04.22 10:50, Juergen Gross wrote:
> On 07.04.22 10:23, David Hildenbrand wrote:
>> On 06.04.22 15:32, Juergen Gross wrote:
>>> When onlining a new memory page in a guest the Xen balloon driver is
>>> adding it to the ballooned pages instead making it available to be
>>> used immediately. This is meant to enable to add a new upper memory
>>> limit to a guest via hotplugging memory, without having to assign the
>>> new memory in one go.
>>>
>>> In case the upper memory limit will be raised above 4G, the new memory
>>> will populate the ZONE_NORMAL memory zone, which wasn't populated
>>> before. The newly populated zone won't be added to the list of zones
>>> looked at by the page allocator though, as only zones with available
>>> memory are being added, and the memory isn't yet available as it is
>>> ballooned out.
>>
>> I think we just recently discussed these corner cases on the -mm list.
> 
> Indeed.
> 
>> The issue is having effectively populated zones without managed pages
>> because everything is inflated in a balloon.
> 
> Correct.
> 
>> That can theoretically also happen when managing to fully inflate the
>> balloon in one zone and then, somehow, the zones get rebuilt.
> 
> I think you are right. I didn't think of that scenario.
> 
>> build_zonerefs_node() documents "Add all populated zones of a node to
>> the zonelist" but checks for managed zones, which is wrong.
>>
>> See https://lkml.kernel.org/r/20220201070044.zbm3obsoimhz3xd3@master
> 
> I found commit 6aa303defb7454 which introduced this test. I thought
> it was needed due to the problem this commit tried to solve. Maybe I
> was wrong and that commit shouldn't have changed the condition when
> building the zonelist, but just the ones in the allocation paths.

In regard to kswapd, that is currently being worked on via

https://lkml.kernel.org/r/20220329010901.1654-2-richard.weiy...@gmail.com

-- 
Thanks,

David / dhildenb




Re: [PATCH] xen/balloon: fix page onlining when populating new zone

2022-04-07 Thread David Hildenbrand
On 06.04.22 15:32, Juergen Gross wrote:
> When onlining a new memory page in a guest the Xen balloon driver is
> adding it to the ballooned pages instead making it available to be
> used immediately. This is meant to enable to add a new upper memory
> limit to a guest via hotplugging memory, without having to assign the
> new memory in one go.
> 
> In case the upper memory limit will be raised above 4G, the new memory
> will populate the ZONE_NORMAL memory zone, which wasn't populated
> before. The newly populated zone won't be added to the list of zones
> looked at by the page allocator though, as only zones with available
> memory are being added, and the memory isn't yet available as it is
> ballooned out.

I think we just recently discussed these corner cases on the -mm list.
The issue is having effectively populated zones without managed pages
because everything is inflated in a balloon.

That can theoretically also happen when managing to fully inflate the
balloon in one zone and then, somehow, the zones get rebuilt.

build_zonerefs_node() documents "Add all populated zones of a node to
the zonelist" but checks for managed zones, which is wrong.

See https://lkml.kernel.org/r/20220201070044.zbm3obsoimhz3xd3@master

> 
> This will result in the new memory being assigned to the guest, but
> without the allocator being able to use it.
> 
> When running as a PV guest the situation is even worse: when having
> been started with less memory than allowed, and the upper limit being
> lower than 4G, ballooning up will have the same effect as hotplugging
> new memory. This is due to the usage of the zone device functionality
> since commit 9e2369c06c8a ("xen: add helpers to allocate unpopulated
> memory") for creating mappings of other guest's pages, which as a side
> effect is being used for PV guest ballooning, too.
> 
> Fix this by checking in xen_online_page() whether the new memory page
> will be the first in a new zone. If this is the case, add another page
> to the balloon and use the first memory page of the new chunk as a
> replacement for this now ballooned out page. This will result in the
> newly populated zone containing one page being available for the page
> allocator, which in turn will lead to the zone being added to the
> allocator.

This somehow feels like a hack for something that should be handled in
the core instead :/


-- 
Thanks,

David / dhildenb




[PATCH v2 9/9] virtio-mem: kdump mode to sanitize /proc/vmcore access

2021-10-05 Thread David Hildenbrand
Although virtio-mem currently supports reading unplugged memory in the
hypervisor, this will change in the future, indicated to the device via
a new feature flag. We similarly sanitized /proc/kcore access recently. [1]

Let's register a vmcore callback, to allow vmcore code to check if a PFN
belonging to a virtio-mem device is either currently plugged and should
be dumped or is currently unplugged and should not be accessed, instead
mapping the shared zeropage or returning zeroes when reading.

This is important when not capturing /proc/vmcore via tools like
"makedumpfile" that can identify logically unplugged virtio-mem memory via
PG_offline in the memmap, but simply by e.g., copying the file.

Distributions that support virtio-mem+kdump have to make sure that the
virtio_mem module will be part of the kdump kernel or the kdump initrd;
dracut was recently [2] extended to include virtio-mem in the generated
initrd. As long as no special kdump kernels are used, this will
automatically make sure that virtio-mem will be around in the kdump initrd
and sanitize /proc/vmcore access -- with dracut.

With this series, we'll send one virtio-mem state request for every
~2 MiB chunk of virtio-mem memory indicated in the vmcore that we intend
to read/map.

In the future, we might want to allow building virtio-mem for kdump
mode only, even without CONFIG_MEMORY_HOTPLUG and friends: this way,
we could support special stripped-down kdump kernels that have many
other config options disabled; we'll tackle that once required. Further,
we might want to try sensing bigger blocks (e.g., memory sections)
first before falling back to device blocks on demand.

Tested with Fedora rawhide, which contains a recent kexec-tools version
(considering "System RAM (virtio_mem)" when creating the vmcore header) and
a recent dracut version (including the virtio_mem module in the kdump
initrd).

[1] https://lkml.kernel.org/r/20210526093041.8800-1-da...@redhat.com
[2] https://github.com/dracutdevs/dracut/pull/1157
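
Condensed, the kdump-mode callback added by this patch looks roughly like
the following (simplified from the diff below; the real code additionally
caches the last answer per device block in
last_block_addr/last_block_plugged):

static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb,
                                         unsigned long pfn)
{
        struct virtio_mem *vm = container_of(cb, struct virtio_mem,
                                             vmcore_cb);
        uint64_t addr = PFN_PHYS(pfn);
        bool is_ram;
        int rc;

        /* PFNs not covered by this device are considered RAM. */
        if (addr < vm->addr || addr >= vm->addr + vm->region_size)
                return true;

        mutex_lock(&vm->hotplug_mutex);
        rc = virtio_mem_send_state_request(vm,
                        ALIGN_DOWN(addr, vm->device_block_size),
                        vm->device_block_size);
        /* Treat MIXED like PLUGGED; we cannot go into more detail. */
        is_ram = rc == VIRTIO_MEM_STATE_PLUGGED ||
                 rc == VIRTIO_MEM_STATE_MIXED;
        mutex_unlock(&vm->hotplug_mutex);

        return is_ram;
}

static int virtio_mem_init_kdump(struct virtio_mem *vm)
{
        vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram;
        register_vmcore_cb(&vm->vmcore_cb);
        return 0;
}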

Signed-off-by: David Hildenbrand 
---
 drivers/virtio/virtio_mem.c | 136 
 1 file changed, 124 insertions(+), 12 deletions(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 76d8aef3cfd2..3573b78f6b05 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -223,6 +223,9 @@ struct virtio_mem {
 * When this lock is held the pointers can't change, ONLINE and
 * OFFLINE blocks can't change the state and no subblocks will get
 * plugged/unplugged.
+*
+* In kdump mode, used to serialize requests, last_block_addr and
+* last_block_plugged.
 */
struct mutex hotplug_mutex;
bool hotplug_active;
@@ -230,6 +233,9 @@ struct virtio_mem {
/* An error occurred we cannot handle - stop processing requests. */
bool broken;
 
+   /* Cached valued of is_kdump_kernel() when the device was probed. */
+   bool in_kdump;
+
/* The driver is being removed. */
spinlock_t removal_lock;
bool removing;
@@ -243,6 +249,13 @@ struct virtio_mem {
/* Memory notifier (online/offline events). */
struct notifier_block memory_notifier;
 
+#ifdef CONFIG_PROC_VMCORE
+   /* vmcore callback for /proc/vmcore handling in kdump mode */
+   struct vmcore_cb vmcore_cb;
+   uint64_t last_block_addr;
+   bool last_block_plugged;
+#endif /* CONFIG_PROC_VMCORE */
+
/* Next device in the list of virtio-mem devices. */
struct list_head next;
 };
@@ -2293,6 +2306,12 @@ static void virtio_mem_run_wq(struct work_struct *work)
uint64_t diff;
int rc;
 
+   if (unlikely(vm->in_kdump)) {
+   dev_warn_once(&vm->vdev->dev,
+"unexpected workqueue run in kdump kernel\n");
+   return;
+   }
+
hrtimer_cancel(&vm->retry_timer);
 
if (vm->broken)
@@ -2521,6 +2540,86 @@ static int virtio_mem_init_hotplug(struct virtio_mem *vm)
return rc;
 }
 
+#ifdef CONFIG_PROC_VMCORE
+static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr,
+uint64_t size)
+{
+   const uint64_t nb_vm_blocks = size / vm->device_block_size;
+   const struct virtio_mem_req req = {
+   .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE),
+   .u.state.addr = cpu_to_virtio64(vm->vdev, addr),
+   .u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
+   };
+   int rc = -ENOMEM;
+
+   dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr,
+   addr + size - 1);
+
+   switch (virtio_mem_send_request(vm, &req)) {
+   case VIRTIO_MEM_RESP_ACK:
+   return virtio16_to_cpu(vm->vdev, vm->resp.

[PATCH v2 8/9] virtio-mem: factor out hotplug specifics from virtio_mem_remove() into virtio_mem_deinit_hotplug()

2021-10-05 Thread David Hildenbrand
Let's prepare for a new virtio-mem kdump mode in which we don't actually
hot(un)plug any memory but only observe the state of device blocks.

Signed-off-by: David Hildenbrand 
---
 drivers/virtio/virtio_mem.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 1be3ee7f684d..76d8aef3cfd2 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -2667,9 +2667,8 @@ static int virtio_mem_probe(struct virtio_device *vdev)
return rc;
 }
 
-static void virtio_mem_remove(struct virtio_device *vdev)
+static void virtio_mem_deinit_hotplug(struct virtio_mem *vm)
 {
-   struct virtio_mem *vm = vdev->priv;
unsigned long mb_id;
int rc;
 
@@ -2716,7 +2715,8 @@ static void virtio_mem_remove(struct virtio_device *vdev)
 * away. Warn at least.
 */
if (virtio_mem_has_memory_added(vm)) {
-   dev_warn(&vdev->dev, "device still has system memory added\n");
+   dev_warn(&vm->vdev->dev,
+"device still has system memory added\n");
} else {
virtio_mem_delete_resource(vm);
kfree_const(vm->resource_name);
@@ -2730,6 +2730,13 @@ static void virtio_mem_remove(struct virtio_device *vdev)
} else {
vfree(vm->bbm.bb_states);
}
+}
+
+static void virtio_mem_remove(struct virtio_device *vdev)
+{
+   struct virtio_mem *vm = vdev->priv;
+
+   virtio_mem_deinit_hotplug(vm);
 
/* reset the device and cleanup the queues */
vdev->config->reset(vdev);
-- 
2.31.1




[PATCH v2 7/9] virtio-mem: factor out hotplug specifics from virtio_mem_probe() into virtio_mem_init_hotplug()

2021-10-05 Thread David Hildenbrand
Let's prepare for a new virtio-mem kdump mode in which we don't actually
hot(un)plug any memory but only observe the state of device blocks.

Signed-off-by: David Hildenbrand 
---
 drivers/virtio/virtio_mem.c | 87 +++--
 1 file changed, 45 insertions(+), 42 deletions(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 2ba7e8d6ba8d..1be3ee7f684d 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -260,6 +260,8 @@ static void virtio_mem_fake_offline_going_offline(unsigned 
long pfn,
 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
   unsigned long nr_pages);
 static void virtio_mem_retry(struct virtio_mem *vm);
+static int virtio_mem_create_resource(struct virtio_mem *vm);
+static void virtio_mem_delete_resource(struct virtio_mem *vm);
 
 /*
  * Register a virtio-mem device so it will be considered for the online_page
@@ -2395,7 +2397,8 @@ static int virtio_mem_init_vq(struct virtio_mem *vm)
 static int virtio_mem_init_hotplug(struct virtio_mem *vm)
 {
const struct range pluggable_range = mhp_get_pluggable_range(true);
-   uint64_t sb_size, addr;
+   uint64_t unit_pages, sb_size, addr;
+   int rc;
 
/* bad device setup - warn only */
if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
@@ -2474,7 +2477,48 @@ static int virtio_mem_init_hotplug(struct virtio_mem *vm)
dev_info(&vm->vdev->dev, "big block size: 0x%llx",
 (unsigned long long)vm->bbm.bb_size);
 
+   /* create the parent resource for all memory */
+   rc = virtio_mem_create_resource(vm);
+   if (rc)
+   return rc;
+
+   /* use a single dynamic memory group to cover the whole memory device */
+   if (vm->in_sbm)
+   unit_pages = PHYS_PFN(memory_block_size_bytes());
+   else
+   unit_pages = PHYS_PFN(vm->bbm.bb_size);
+   rc = memory_group_register_dynamic(vm->nid, unit_pages);
+   if (rc < 0)
+   goto out_del_resource;
+   vm->mgid = rc;
+
+   /*
+* If we still have memory plugged, we have to unplug all memory first.
+* Registering our parent resource makes sure that this memory isn't
+* actually in use (e.g., trying to reload the driver).
+*/
+   if (vm->plugged_size) {
+   vm->unplug_all_required = true;
+   dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
+   }
+
+   /* register callbacks */
+   vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
+   rc = register_memory_notifier(&vm->memory_notifier);
+   if (rc)
+   goto out_unreg_group;
+   rc = register_virtio_mem_device(vm);
+   if (rc)
+   goto out_unreg_mem;
+
return 0;
+out_unreg_mem:
+   unregister_memory_notifier(&vm->memory_notifier);
+out_unreg_group:
+   memory_group_unregister(vm->mgid);
+out_del_resource:
+   virtio_mem_delete_resource(vm);
+   return rc;
 }
 
 static int virtio_mem_init(struct virtio_mem *vm)
@@ -2578,7 +2622,6 @@ static bool virtio_mem_has_memory_added(struct virtio_mem 
*vm)
 static int virtio_mem_probe(struct virtio_device *vdev)
 {
struct virtio_mem *vm;
-   uint64_t unit_pages;
int rc;
 
BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
@@ -2608,40 +2651,6 @@ static int virtio_mem_probe(struct virtio_device *vdev)
if (rc)
goto out_del_vq;
 
-   /* create the parent resource for all memory */
-   rc = virtio_mem_create_resource(vm);
-   if (rc)
-   goto out_del_vq;
-
-   /* use a single dynamic memory group to cover the whole memory device */
-   if (vm->in_sbm)
-   unit_pages = PHYS_PFN(memory_block_size_bytes());
-   else
-   unit_pages = PHYS_PFN(vm->bbm.bb_size);
-   rc = memory_group_register_dynamic(vm->nid, unit_pages);
-   if (rc < 0)
-   goto out_del_resource;
-   vm->mgid = rc;
-
-   /*
-* If we still have memory plugged, we have to unplug all memory first.
-* Registering our parent resource makes sure that this memory isn't
-* actually in use (e.g., trying to reload the driver).
-*/
-   if (vm->plugged_size) {
-   vm->unplug_all_required = true;
-   dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
-   }
-
-   /* register callbacks */
-   vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
-   rc = register_memory_notifier(&vm->memory_notifier);
-   if (rc)
-   goto out_unreg_group;
-   rc = register_virtio_mem_device(vm);
-   if (rc)
-

[PATCH v2 6/9] virtio-mem: factor out hotplug specifics from virtio_mem_init() into virtio_mem_init_hotplug()

2021-10-05 Thread David Hildenbrand
Let's prepare for a new virtio-mem kdump mode in which we don't actually
hot(un)plug any memory but only observe the state of device blocks.

Signed-off-by: David Hildenbrand 
---
 drivers/virtio/virtio_mem.c | 81 -
 1 file changed, 44 insertions(+), 37 deletions(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index bef8ad6bf466..2ba7e8d6ba8d 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -2392,41 +2392,10 @@ static int virtio_mem_init_vq(struct virtio_mem *vm)
return 0;
 }
 
-static int virtio_mem_init(struct virtio_mem *vm)
+static int virtio_mem_init_hotplug(struct virtio_mem *vm)
 {
const struct range pluggable_range = mhp_get_pluggable_range(true);
uint64_t sb_size, addr;
-   uint16_t node_id;
-
-   if (!vm->vdev->config->get) {
-   dev_err(&vm->vdev->dev, "config access disabled\n");
-   return -EINVAL;
-   }
-
-   /*
-* We don't want to (un)plug or reuse any memory when in kdump. The
-* memory is still accessible (but not mapped).
-*/
-   if (is_kdump_kernel()) {
-   dev_warn(&vm->vdev->dev, "disabled in kdump kernel\n");
-   return -EBUSY;
-   }
-
-   /* Fetch all properties that can't change. */
-   virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
-   &vm->plugged_size);
-   virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
-   &vm->device_block_size);
-   virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
-   &node_id);
-   vm->nid = virtio_mem_translate_node_id(vm, node_id);
-   virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
-   virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
-   &vm->region_size);
-
-   /* Determine the nid for the device based on the lowest address. */
-   if (vm->nid == NUMA_NO_NODE)
-   vm->nid = memory_add_physaddr_to_nid(vm->addr);
 
/* bad device setup - warn only */
if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
@@ -2496,10 +2465,6 @@ static int virtio_mem_init(struct virtio_mem *vm)
  vm->offline_threshold);
}
 
-   dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
-   dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
-   dev_info(&vm->vdev->dev, "device block size: 0x%llx",
-(unsigned long long)vm->device_block_size);
dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
 memory_block_size_bytes());
if (vm->in_sbm)
@@ -2508,10 +2473,52 @@ static int virtio_mem_init(struct virtio_mem *vm)
else
dev_info(&vm->vdev->dev, "big block size: 0x%llx",
 (unsigned long long)vm->bbm.bb_size);
+
+   return 0;
+}
+
+static int virtio_mem_init(struct virtio_mem *vm)
+{
+   uint16_t node_id;
+
+   if (!vm->vdev->config->get) {
+   dev_err(&vm->vdev->dev, "config access disabled\n");
+   return -EINVAL;
+   }
+
+   /*
+* We don't want to (un)plug or reuse any memory when in kdump. The
+* memory is still accessible (but not mapped).
+*/
+   if (is_kdump_kernel()) {
+   dev_warn(&vm->vdev->dev, "disabled in kdump kernel\n");
+   return -EBUSY;
+   }
+
+   /* Fetch all properties that can't change. */
+   virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
+   &vm->plugged_size);
+   virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
+   &vm->device_block_size);
+   virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
+   &node_id);
+   vm->nid = virtio_mem_translate_node_id(vm, node_id);
+   virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
+   virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
+   &vm->region_size);
+
+   /* Determine the nid for the device based on the lowest address. */
+   if (vm->nid == NUMA_NO_NODE)
+   vm->nid = memory_add_physaddr_to_nid(vm->addr);
+
+   dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
+   dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
+   dev_info(&vm->vdev->dev, "device block size: 0x%llx",
+(unsigned long long)vm->device_block_size);
if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
 
-   return 0;
+   return virtio_mem_init_hotplug(vm);
 }
 
 static int virtio_mem_create_resource(struct virtio_mem *vm)
-- 
2.31.1




[PATCH v2 5/9] proc/vmcore: convert oldmem_pfn_is_ram callback to more generic vmcore callbacks

2021-10-05 Thread David Hildenbrand
Let's support multiple registered callbacks, making sure that
registering vmcore callbacks cannot fail. Make the callback return a
bool instead of an int, handling how to deal with errors internally.
Drop unused HAVE_OLDMEM_PFN_IS_RAM.

We soon want to make use of this infrastructure from other drivers:
virtio-mem, registering one callback for each virtio-mem device, to
prevent reading unplugged virtio-mem memory.

Handle it via a generic vmcore_cb structure, prepared for future
extensions: for example, once we support virtio-mem on s390x where the
vmcore is completely constructed in the second kernel, we want to detect
and add plugged virtio-mem memory ranges to the vmcore in order for them
to get dumped properly.

Handle corner cases that are unexpected and shouldn't happen in sane
setups: registering a callback after the vmcore has already been opened
(warn only) and unregistering a callback after the vmcore has already been
opened (warn and essentially read only zeroes from that point on).
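
The resulting pfn_is_ram() then simply consults every registered callback
and errs on the side of "not RAM" after a surprise unregistration;
roughly:

static bool pfn_is_ram(unsigned long pfn)
{
        struct vmcore_cb *cb;
        bool ret = true;

        /* A callback went away while the vmcore was open: play it safe. */
        if (unlikely(vmcore_cb_unstable))
                return false;

        list_for_each_entry(cb, &vmcore_cb_list, next) {
                if (unlikely(!cb->pfn_is_ram))
                        continue;
                ret = cb->pfn_is_ram(cb, pfn);
                if (!ret)
                        break;
        }

        return ret;
}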

Signed-off-by: David Hildenbrand 
---
 arch/x86/kernel/aperture_64.c | 13 -
 arch/x86/xen/mmu_hvm.c| 11 ++--
 fs/proc/vmcore.c  | 99 ---
 include/linux/crash_dump.h| 26 +++--
 4 files changed, 111 insertions(+), 38 deletions(-)

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 10562885f5fc..af3ba08b684b 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -73,12 +73,23 @@ static int gart_mem_pfn_is_ram(unsigned long pfn)
  (pfn >= aperture_pfn_start + aperture_page_count));
 }
 
+#ifdef CONFIG_PROC_VMCORE
+static bool gart_oldmem_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
+{
+   return !!gart_mem_pfn_is_ram(pfn);
+}
+
+static struct vmcore_cb gart_vmcore_cb = {
+   .pfn_is_ram = gart_oldmem_pfn_is_ram,
+};
+#endif
+
 static void __init exclude_from_core(u64 aper_base, u32 aper_order)
 {
aperture_pfn_start = aper_base >> PAGE_SHIFT;
aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT;
 #ifdef CONFIG_PROC_VMCORE
-   WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram));
+   register_vmcore_cb(&gart_vmcore_cb);
 #endif
 #ifdef CONFIG_PROC_KCORE
WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram));
diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c
index 6ba8826dcdcc..509bdee3ab90 100644
--- a/arch/x86/xen/mmu_hvm.c
+++ b/arch/x86/xen/mmu_hvm.c
@@ -12,10 +12,10 @@
  * The kdump kernel has to check whether a pfn of the crashed kernel
  * was a ballooned page. vmcore is using this function to decide
  * whether to access a pfn of the crashed kernel.
- * Returns 0 if the pfn is not backed by a RAM page, the caller may
+ * Returns "false" if the pfn is not backed by a RAM page, the caller may
  * handle the pfn special in this case.
  */
-static int xen_oldmem_pfn_is_ram(unsigned long pfn)
+static bool xen_vmcore_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
 {
struct xen_hvm_get_mem_type a = {
.domid = DOMID_SELF,
@@ -24,10 +24,13 @@ static int xen_oldmem_pfn_is_ram(unsigned long pfn)
 
if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) {
pr_warn_once("Unexpected HVMOP_get_mem_type failure\n");
-   return -ENXIO;
+   return true;
}
return a.mem_type != HVMMEM_mmio_dm;
 }
+static struct vmcore_cb xen_vmcore_cb = {
+   .pfn_is_ram = xen_vmcore_pfn_is_ram,
+};
 #endif
 
 static void xen_hvm_exit_mmap(struct mm_struct *mm)
@@ -61,6 +64,6 @@ void __init xen_hvm_init_mmu_ops(void)
if (is_pagetable_dying_supported())
pv_ops.mmu.exit_mmap = xen_hvm_exit_mmap;
 #ifdef CONFIG_PROC_VMCORE
-   WARN_ON(register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram));
+   register_vmcore_cb(&xen_vmcore_cb);
 #endif
 }
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a9bd80ab670e..7a04b2eca287 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -62,46 +62,75 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0);
 /* Device Dump Size */
 static size_t vmcoredd_orig_sz;
 
-/*
- * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
- * The called function has to take care of module refcounting.
- */
-static int (*oldmem_pfn_is_ram)(unsigned long pfn);
-
-int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn))
+static DECLARE_RWSEM(vmcore_cb_rwsem);
+/* List of registered vmcore callbacks. */
+static LIST_HEAD(vmcore_cb_list);
+/* Whether we had a surprise unregistration of a callback. */
+static bool vmcore_cb_unstable;
+/* Whether the vmcore has been opened once. */
+static bool vmcore_opened;
+
+void register_vmcore_cb(struct vmcore_cb *cb)
 {
-   if (oldmem_pfn_is_ram)
-   return -EBUSY;
-   oldmem_pfn_is_ram = fn;
-   return 0;
+   down_write(&vmcore_cb_rwsem)

[PATCH v2 4/9] proc/vmcore: let pfn_is_ram() return a bool

2021-10-05 Thread David Hildenbrand
The callback should deal with errors internally; it doesn't make sense to
expose these via pfn_is_ram(). We'll rework the callbacks next. Right now
we consider errors as if "it's RAM"; no functional change.

Signed-off-by: David Hildenbrand 
---
 fs/proc/vmcore.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9a15334da208..a9bd80ab670e 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -84,11 +84,11 @@ void unregister_oldmem_pfn_is_ram(void)
 }
 EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram);
 
-static int pfn_is_ram(unsigned long pfn)
+static bool pfn_is_ram(unsigned long pfn)
 {
int (*fn)(unsigned long pfn);
/* pfn is ram unless fn() checks pagetype */
-   int ret = 1;
+   bool ret = true;
 
/*
 * Ask hypervisor if the pfn is really ram.
@@ -97,7 +97,7 @@ static int pfn_is_ram(unsigned long pfn)
 */
fn = oldmem_pfn_is_ram;
if (fn)
-   ret = fn(pfn);
+   ret = !!fn(pfn);
 
return ret;
 }
@@ -124,7 +124,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
nr_bytes = count;
 
/* If pfn is not ram, return zeros for sparse dump files */
-   if (pfn_is_ram(pfn) == 0)
+   if (!pfn_is_ram(pfn))
memset(buf, 0, nr_bytes);
else {
if (encrypted)
-- 
2.31.1




[PATCH v2 3/9] x86/xen: print a warning when HVMOP_get_mem_type fails

2021-10-05 Thread David Hildenbrand
HVMOP_get_mem_type is not expected to fail, "This call failing is
indication of something going quite wrong and it would be good to know
about this." [1]

Let's add a pr_warn_once().

[1] https://lkml.kernel.org/r/3b935aa0-6d85-0bcd-100e-15098add3...@oracle.com

Suggested-by: Boris Ostrovsky 
Signed-off-by: David Hildenbrand 
---
 arch/x86/xen/mmu_hvm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c
index d1b38c77352b..6ba8826dcdcc 100644
--- a/arch/x86/xen/mmu_hvm.c
+++ b/arch/x86/xen/mmu_hvm.c
@@ -22,8 +22,10 @@ static int xen_oldmem_pfn_is_ram(unsigned long pfn)
.pfn = pfn,
};
 
-   if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
+   if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) {
+   pr_warn_once("Unexpected HVMOP_get_mem_type failure\n");
return -ENXIO;
+   }
return a.mem_type != HVMMEM_mmio_dm;
 }
 #endif
-- 
2.31.1




[PATCH v2 2/9] x86/xen: simplify xen_oldmem_pfn_is_ram()

2021-10-05 Thread David Hildenbrand
Let's simplify return handling.

Signed-off-by: David Hildenbrand 
---
 arch/x86/xen/mmu_hvm.c | 15 +--
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c
index b242d1f4b426..d1b38c77352b 100644
--- a/arch/x86/xen/mmu_hvm.c
+++ b/arch/x86/xen/mmu_hvm.c
@@ -21,23 +21,10 @@ static int xen_oldmem_pfn_is_ram(unsigned long pfn)
.domid = DOMID_SELF,
.pfn = pfn,
};
-   int ram;
 
if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
return -ENXIO;
-
-   switch (a.mem_type) {
-   case HVMMEM_mmio_dm:
-   ram = 0;
-   break;
-   case HVMMEM_ram_rw:
-   case HVMMEM_ram_ro:
-   default:
-   ram = 1;
-   break;
-   }
-
-   return ram;
+   return a.mem_type != HVMMEM_mmio_dm;
 }
 #endif
 
-- 
2.31.1




[PATCH v2 1/9] x86/xen: update xen_oldmem_pfn_is_ram() documentation

2021-10-05 Thread David Hildenbrand
The callback is only used for the vmcore nowadays.

Reviewed-by: Boris Ostrovsky 
Signed-off-by: David Hildenbrand 
---
 arch/x86/xen/mmu_hvm.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c
index 57409373750f..b242d1f4b426 100644
--- a/arch/x86/xen/mmu_hvm.c
+++ b/arch/x86/xen/mmu_hvm.c
@@ -9,12 +9,9 @@
 
 #ifdef CONFIG_PROC_VMCORE
 /*
- * This function is used in two contexts:
- * - the kdump kernel has to check whether a pfn of the crashed kernel
- *   was a ballooned page. vmcore is using this function to decide
- *   whether to access a pfn of the crashed kernel.
- * - the kexec kernel has to check whether a pfn was ballooned by the
- *   previous kernel. If the pfn is ballooned, handle it properly.
+ * The kdump kernel has to check whether a pfn of the crashed kernel
+ * was a ballooned page. vmcore is using this function to decide
+ * whether to access a pfn of the crashed kernel.
  * Returns 0 if the pfn is not backed by a RAM page, the caller may
  * handle the pfn special in this case.
  */
-- 
2.31.1




[PATCH v2 0/9] proc/vmcore: sanitize access to virtio-mem memory

2021-10-05 Thread David Hildenbrand
As so often with virtio-mem changes that mess with common MM
infrastructure, this might be a good candidate to go via Andrew's tree.

--

After removing /dev/kmem, sanitizing /proc/kcore and handling /dev/mem,
this series tackles the last sane way a VM could accidentally access
logically unplugged memory managed by a virtio-mem device: /proc/vmcore

When dumping memory via "makedumpfile", PG_offline pages, used by
virtio-mem to flag logically unplugged memory, are already properly
excluded; however, especially when accessing/copying /proc/vmcore "the
usual way", we can still end up reading logically unplugged memory part of
a virtio-mem device.

Patch #1-#3 are cleanups. Patch #4 extends the existing oldmem_pfn_is_ram
mechanism. Patch #5-#7 are virtio-mem refactorings for patch #8, which
implements the virtio-mem logic to query the state of device blocks.

Patch #8:

"
Although virtio-mem currently supports reading unplugged memory in the
hypervisor, this will change in the future, indicated to the device via
a new feature flag. We similarly sanitized /proc/kcore access recently.
[...]
Distributions that support virtio-mem+kdump have to make sure that the
virtio_mem module will be part of the kdump kernel or the kdump initrd;
dracut was recently [2] extended to include virtio-mem in the generated
initrd. As long as no special kdump kernels are used, this will
automatically make sure that virtio-mem will be around in the kdump initrd
and sanitize /proc/vmcore access -- with dracut.
"

This is the last remaining bit to support
VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE [3] in the Linux implementation of
virtio-mem.

Note: this is best-effort. We'll never be able to control what runs inside
the second kernel, really, but we also don't have to care: we only care
about sane setups where we don't want our VM getting zapped once we
touch the wrong memory location while dumping. While we usually expect sane
setups to use "makedumfile", nothing really speaks against just copying
/proc/vmcore, especially in environments where HWpoisioning isn't typically
expected. Also, we really don't want to put all our trust completely on the
memmap, so sanitizing also makes sense when just using "makedumpfile".

[1] https://lkml.kernel.org/r/20210526093041.8800-1-da...@redhat.com
[2] https://github.com/dracutdevs/dracut/pull/1157
[3] https://lists.oasis-open.org/archives/virtio-comment/202109/msg00021.html

v1 -> v2:
- "x86/xen: simplify xen_oldmem_pfn_is_ram()"
-- Simplify even more
- "x86/xen: print a warning when HVMOP_get_mem_type fails"
-- Added
- "virtio-mem: kdump mode to sanitize /proc/vmcore access"
-- Fix wrong range check

Cc: Andrew Morton 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
Cc: "Michael S. Tsirkin" 
Cc: Jason Wang 
Cc: Dave Young 
Cc: Baoquan He 
Cc: Vivek Goyal 
Cc: Michal Hocko 
Cc: Oscar Salvador 
Cc: Mike Rapoport 
Cc: "Rafael J. Wysocki" 
Cc: x...@kernel.org
Cc: xen-devel@lists.xenproject.org
Cc: virtualizat...@lists.linux-foundation.org
Cc: ke...@lists.infradead.org
Cc: linux-fsde...@vger.kernel.org
Cc: linux...@kvack.org

David Hildenbrand (9):
  x86/xen: update xen_oldmem_pfn_is_ram() documentation
  x86/xen: simplify xen_oldmem_pfn_is_ram()
  x86/xen: print a warning when HVMOP_get_mem_type fails
  proc/vmcore: let pfn_is_ram() return a bool
  proc/vmcore: convert oldmem_pfn_is_ram callback to more generic vmcore
callbacks
  virtio-mem: factor out hotplug specifics from virtio_mem_init() into
virtio_mem_init_hotplug()
  virtio-mem: factor out hotplug specifics from virtio_mem_probe() into
virtio_mem_init_hotplug()
  virtio-mem: factor out hotplug specifics from virtio_mem_remove() into
virtio_mem_deinit_hotplug()
  virtio-mem: kdump mode to sanitize /proc/vmcore access

 arch/x86/kernel/aperture_64.c |  13 +-
 arch/x86/xen/mmu_hvm.c|  37 ++---
 drivers/virtio/virtio_mem.c   | 297 --
 fs/proc/vmcore.c  | 105 
 include/linux/crash_dump.h|  26 ++-
 5 files changed, 333 insertions(+), 145 deletions(-)


base-commit: 9e1ff307c779ce1f0f810c7ecce3d95bbae40896
-- 
2.31.1




Re: [PATCH v1 2/8] x86/xen: simplify xen_oldmem_pfn_is_ram()

2021-09-29 Thread David Hildenbrand

On 29.09.21 16:22, Boris Ostrovsky wrote:


On 9/29/21 5:03 AM, David Hildenbrand wrote:

On 29.09.21 10:45, David Hildenbrand wrote:



Can we go one step further and do


@@ -20,24 +20,11 @@ static int xen_oldmem_pfn_is_ram(unsigned long pfn)
   struct xen_hvm_get_mem_type a = {
   .domid = DOMID_SELF,
   .pfn = pfn,
+   .mem_type = HVMMEM_ram_rw,
   };
-   int ram;
    -   if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
-   return -ENXIO;
-
-   switch (a.mem_type) {
-   case HVMMEM_mmio_dm:
-   ram = 0;
-   break;
-   case HVMMEM_ram_rw:
-   case HVMMEM_ram_ro:
-   default:
-   ram = 1;
-   break;
-   }
-
-   return ram;
+   HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a);
+   return a.mem_type != HVMMEM_mmio_dm;



I was actually thinking of asking you to add another patch with pr_warn_once() 
here (and print error code as well). This call failing is indication of 
something going quite wrong and it would be good to know about this.


Will include a patch in v2, thanks!


--
Thanks,

David / dhildenb




Re: [PATCH v1 2/8] x86/xen: simplify xen_oldmem_pfn_is_ram()

2021-09-29 Thread David Hildenbrand

On 29.09.21 10:45, David Hildenbrand wrote:


How about

      return a.mem_type != HVMMEM_mmio_dm;



Ha, how could I have missed that :)



Result should be promoted to int and this has added benefit of not requiring 
changes in patch 4.



Can we go one step further and do


@@ -20,24 +20,11 @@ static int xen_oldmem_pfn_is_ram(unsigned long pfn)
  struct xen_hvm_get_mem_type a = {
  .domid = DOMID_SELF,
  .pfn = pfn,
+   .mem_type = HVMMEM_ram_rw,
  };
-   int ram;
   
-   if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))

-   return -ENXIO;
-
-   switch (a.mem_type) {
-   case HVMMEM_mmio_dm:
-   ram = 0;
-   break;
-   case HVMMEM_ram_rw:
-   case HVMMEM_ram_ro:
-   default:
-   ram = 1;
-   break;
-   }
-
-   return ram;
+   HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a);
+   return a.mem_type != HVMMEM_mmio_dm;
   }
   #endif


Assuming that if HYPERVISOR_hvm_op() fails,
.mem_type is not set to HVMMEM_mmio_dm.



Okay we can't, due to "__must_check" ...

--
Thanks,

David / dhildenb




Re: [PATCH v1 2/8] x86/xen: simplify xen_oldmem_pfn_is_ram()

2021-09-29 Thread David Hildenbrand


How about

     return a.mem_type != HVMMEM_mmio_dm;



Ha, how could I have missed that :)



Result should be promoted to int and this has added benefit of not requiring 
changes in patch 4.



Can we go one step further and do


@@ -20,24 +20,11 @@ static int xen_oldmem_pfn_is_ram(unsigned long pfn)
struct xen_hvm_get_mem_type a = {
.domid = DOMID_SELF,
.pfn = pfn,
+   .mem_type = HVMMEM_ram_rw,
};
-   int ram;
 
-   if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))

-   return -ENXIO;
-
-   switch (a.mem_type) {
-   case HVMMEM_mmio_dm:
-   ram = 0;
-   break;
-   case HVMMEM_ram_rw:
-   case HVMMEM_ram_ro:
-   default:
-   ram = 1;
-   break;
-   }
-
-   return ram;
+   HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a);
+   return a.mem_type != HVMMEM_mmio_dm;
 }
 #endif


Assuming that if HYPERVISOR_hvm_op() fails,
.mem_type is not set to HVMMEM_mmio_dm.

--
Thanks,

David / dhildenb




Re: [PATCH v1 8/8] virtio-mem: kdump mode to sanitize /proc/vmcore access

2021-09-29 Thread David Hildenbrand

[...]


+
+static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb,
+unsigned long pfn)
+{
+   struct virtio_mem *vm = container_of(cb, struct virtio_mem,
+vmcore_cb);
+   uint64_t addr = PFN_PHYS(pfn);
+   bool is_ram;
+   int rc;
+
+   if (!virtio_mem_contains_range(vm, addr, addr + PAGE_SIZE))


Some more testing revealed that this has to be

if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE))


--
Thanks,

David / dhildenb




[PATCH v1 7/8] virtio-mem: factor out hotplug specifics from virtio_mem_remove() into virtio_mem_deinit_hotplug()

2021-09-28 Thread David Hildenbrand
Let's prepare for a new virtio-mem kdump mode in which we don't actually
hot(un)plug any memory but only observe the state of device blocks.

Signed-off-by: David Hildenbrand 
---
 drivers/virtio/virtio_mem.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 1be3ee7f684d..76d8aef3cfd2 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -2667,9 +2667,8 @@ static int virtio_mem_probe(struct virtio_device *vdev)
return rc;
 }
 
-static void virtio_mem_remove(struct virtio_device *vdev)
+static void virtio_mem_deinit_hotplug(struct virtio_mem *vm)
 {
-   struct virtio_mem *vm = vdev->priv;
unsigned long mb_id;
int rc;
 
@@ -2716,7 +2715,8 @@ static void virtio_mem_remove(struct virtio_device *vdev)
 * away. Warn at least.
 */
if (virtio_mem_has_memory_added(vm)) {
-   dev_warn(&vdev->dev, "device still has system memory added\n");
+   dev_warn(&vm->vdev->dev,
+"device still has system memory added\n");
} else {
virtio_mem_delete_resource(vm);
kfree_const(vm->resource_name);
@@ -2730,6 +2730,13 @@ static void virtio_mem_remove(struct virtio_device *vdev)
} else {
vfree(vm->bbm.bb_states);
}
+}
+
+static void virtio_mem_remove(struct virtio_device *vdev)
+{
+   struct virtio_mem *vm = vdev->priv;
+
+   virtio_mem_deinit_hotplug(vm);
 
/* reset the device and cleanup the queues */
vdev->config->reset(vdev);
-- 
2.31.1




[PATCH v1 5/8] virtio-mem: factor out hotplug specifics from virtio_mem_init() into virtio_mem_init_hotplug()

2021-09-28 Thread David Hildenbrand
Let's prepare for a new virtio-mem kdump mode in which we don't actually
hot(un)plug any memory but only observe the state of device blocks.

Signed-off-by: David Hildenbrand 
---
 drivers/virtio/virtio_mem.c | 81 -
 1 file changed, 44 insertions(+), 37 deletions(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index bef8ad6bf466..2ba7e8d6ba8d 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -2392,41 +2392,10 @@ static int virtio_mem_init_vq(struct virtio_mem *vm)
return 0;
 }
 
-static int virtio_mem_init(struct virtio_mem *vm)
+static int virtio_mem_init_hotplug(struct virtio_mem *vm)
 {
const struct range pluggable_range = mhp_get_pluggable_range(true);
uint64_t sb_size, addr;
-   uint16_t node_id;
-
-   if (!vm->vdev->config->get) {
-   dev_err(&vm->vdev->dev, "config access disabled\n");
-   return -EINVAL;
-   }
-
-   /*
-* We don't want to (un)plug or reuse any memory when in kdump. The
-* memory is still accessible (but not mapped).
-*/
-   if (is_kdump_kernel()) {
-   dev_warn(&vm->vdev->dev, "disabled in kdump kernel\n");
-   return -EBUSY;
-   }
-
-   /* Fetch all properties that can't change. */
-   virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
-   &vm->plugged_size);
-   virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
-   &vm->device_block_size);
-   virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
-   &node_id);
-   vm->nid = virtio_mem_translate_node_id(vm, node_id);
-   virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
-   virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
-   &vm->region_size);
-
-   /* Determine the nid for the device based on the lowest address. */
-   if (vm->nid == NUMA_NO_NODE)
-   vm->nid = memory_add_physaddr_to_nid(vm->addr);
 
/* bad device setup - warn only */
if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
@@ -2496,10 +2465,6 @@ static int virtio_mem_init(struct virtio_mem *vm)
  vm->offline_threshold);
}
 
-   dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
-   dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
-   dev_info(&vm->vdev->dev, "device block size: 0x%llx",
-(unsigned long long)vm->device_block_size);
dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
 memory_block_size_bytes());
if (vm->in_sbm)
@@ -2508,10 +2473,52 @@ static int virtio_mem_init(struct virtio_mem *vm)
else
dev_info(&vm->vdev->dev, "big block size: 0x%llx",
 (unsigned long long)vm->bbm.bb_size);
+
+   return 0;
+}
+
+static int virtio_mem_init(struct virtio_mem *vm)
+{
+   uint16_t node_id;
+
+   if (!vm->vdev->config->get) {
+   dev_err(&vm->vdev->dev, "config access disabled\n");
+   return -EINVAL;
+   }
+
+   /*
+* We don't want to (un)plug or reuse any memory when in kdump. The
+* memory is still accessible (but not mapped).
+*/
+   if (is_kdump_kernel()) {
+   dev_warn(&vm->vdev->dev, "disabled in kdump kernel\n");
+   return -EBUSY;
+   }
+
+   /* Fetch all properties that can't change. */
+   virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
+   &vm->plugged_size);
+   virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
+   &vm->device_block_size);
+   virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
+   &node_id);
+   vm->nid = virtio_mem_translate_node_id(vm, node_id);
+   virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
+   virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
+   &vm->region_size);
+
+   /* Determine the nid for the device based on the lowest address. */
+   if (vm->nid == NUMA_NO_NODE)
+   vm->nid = memory_add_physaddr_to_nid(vm->addr);
+
+   dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
+   dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
+   dev_info(&vm->vdev->dev, "device block size: 0x%llx",
+(unsigned long long)vm->device_block_size);
if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
 
-   return 0;
+   return virtio_mem_init_hotplug(vm);
 }
 
 static int virtio_mem_create_resource(struct virtio_mem *vm)
-- 
2.31.1




[PATCH v1 4/8] proc/vmcore: convert oldmem_pfn_is_ram callback to more generic vmcore callbacks

2021-09-28 Thread David Hildenbrand
Let's support multiple registered callbacks, making sure that
registering vmcore callbacks cannot fail. Make the callback return a
bool instead of an int, handling how to deal with errors internally.
Drop unused HAVE_OLDMEM_PFN_IS_RAM.

We soon want to make use of this infrastructure from other drivers:
virtio-mem, registering one callback for each virtio-mem device, to
prevent reading unplugged virtio-mem memory.

Handle it via a generic vmcore_cb structure, prepared for future
extensions: for example, once we support virtio-mem on s390x where the
vmcore is completely constructed in the second kernel, we want to detect
and add plugged virtio-mem memory ranges to the vmcore in order for them
to get dumped properly.

Handle corner cases that are unexpected and shouldn't happen in sane
setups: registering a callback after the vmcore has already been opened
(warn only) and unregistering a callback after the vmcore has already been
opened (warn and essentially read only zeroes from that point on).
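
For illustration only (not part of this patch), a minimal sketch of what a
user of the new interface can look like. The example_* names and the excluded
pfn range are made up; struct vmcore_cb and register_vmcore_cb() are the ones
introduced by this patch, and real users guard this with CONFIG_PROC_VMCORE
just like the gart and Xen conversions in the diff:

  #include <linux/init.h>
  #include <linux/crash_dump.h>

  /* Hypothetical pfn range this driver wants to hide from /proc/vmcore. */
  #define EXAMPLE_EXCL_START_PFN  0x100000UL
  #define EXAMPLE_EXCL_END_PFN    0x180000UL

  static bool example_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
  {
          /* "true" means: vmcore may access this pfn of the old kernel. */
          return pfn < EXAMPLE_EXCL_START_PFN || pfn >= EXAMPLE_EXCL_END_PFN;
  }

  static struct vmcore_cb example_vmcore_cb = {
          .pfn_is_ram = example_pfn_is_ram,
  };

  static int __init example_init(void)
  {
          /* Registration cannot fail with the new interface. */
          register_vmcore_cb(&example_vmcore_cb);
          return 0;
  }
  device_initcall(example_init);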

Signed-off-by: David Hildenbrand 
---
 arch/x86/kernel/aperture_64.c | 13 -
 arch/x86/xen/mmu_hvm.c| 15 +++---
 fs/proc/vmcore.c  | 99 ---
 include/linux/crash_dump.h| 26 +++--
 4 files changed, 113 insertions(+), 40 deletions(-)

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 10562885f5fc..af3ba08b684b 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -73,12 +73,23 @@ static int gart_mem_pfn_is_ram(unsigned long pfn)
  (pfn >= aperture_pfn_start + aperture_page_count));
 }
 
+#ifdef CONFIG_PROC_VMCORE
+static bool gart_oldmem_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
+{
+   return !!gart_mem_pfn_is_ram(pfn);
+}
+
+static struct vmcore_cb gart_vmcore_cb = {
+   .pfn_is_ram = gart_oldmem_pfn_is_ram,
+};
+#endif
+
 static void __init exclude_from_core(u64 aper_base, u32 aper_order)
 {
aperture_pfn_start = aper_base >> PAGE_SHIFT;
aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT;
 #ifdef CONFIG_PROC_VMCORE
-   WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram));
+   register_vmcore_cb(&gart_vmcore_cb);
 #endif
 #ifdef CONFIG_PROC_KCORE
WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram));
diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c
index eb61622df75b..49bd4a6a5858 100644
--- a/arch/x86/xen/mmu_hvm.c
+++ b/arch/x86/xen/mmu_hvm.c
@@ -12,10 +12,10 @@
  * The kdump kernel has to check whether a pfn of the crashed kernel
  * was a ballooned page. vmcore is using this function to decide
  * whether to access a pfn of the crashed kernel.
- * Returns 0 if the pfn is not backed by a RAM page, the caller may
+ * Returns "false" if the pfn is not backed by a RAM page, the caller may
  * handle the pfn special in this case.
  */
-static int xen_oldmem_pfn_is_ram(unsigned long pfn)
+static bool xen_vmcore_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
 {
struct xen_hvm_get_mem_type a = {
.domid = DOMID_SELF,
@@ -23,15 +23,18 @@ static int xen_oldmem_pfn_is_ram(unsigned long pfn)
};
 
if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
-   return -ENXIO;
+   return true;
 
switch (a.mem_type) {
case HVMMEM_mmio_dm:
-   return 0;
+   return false;
default:
-   return 1;
+   return true;
}
 }
+static struct vmcore_cb xen_vmcore_cb = {
+   .pfn_is_ram = xen_vmcore_pfn_is_ram,
+};
 #endif
 
 static void xen_hvm_exit_mmap(struct mm_struct *mm)
@@ -65,6 +68,6 @@ void __init xen_hvm_init_mmu_ops(void)
if (is_pagetable_dying_supported())
pv_ops.mmu.exit_mmap = xen_hvm_exit_mmap;
 #ifdef CONFIG_PROC_VMCORE
-   WARN_ON(register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram));
+   register_vmcore_cb(&xen_vmcore_cb);
 #endif
 }
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a9bd80ab670e..7a04b2eca287 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -62,46 +62,75 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0);
 /* Device Dump Size */
 static size_t vmcoredd_orig_sz;
 
-/*
- * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
- * The called function has to take care of module refcounting.
- */
-static int (*oldmem_pfn_is_ram)(unsigned long pfn);
-
-int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn))
+static DECLARE_RWSEM(vmcore_cb_rwsem);
+/* List of registered vmcore callbacks. */
+static LIST_HEAD(vmcore_cb_list);
+/* Whether we had a surprise unregistration of a callback. */
+static bool vmcore_cb_unstable;
+/* Whether the vmcore has been opened once. */
+static bool vmcore_opened;
+
+void register_vmcore_cb(struct vmcore_cb *cb)
 {
-   if (oldmem_pfn_is_ram)
-   return -EBUSY;
-   oldmem

[PATCH v1 8/8] virtio-mem: kdump mode to sanitize /proc/vmcore access

2021-09-28 Thread David Hildenbrand
Although virtio-mem currently supports reading unplugged memory in the
hypervisor, this will change in the future, indicated to the device via
a new feature flag. We similarly sanitized /proc/kcore access recently. [1]

Let's register a vmcore callback, to allow vmcore code to check if a PFN
belonging to a virtio-mem device is either currently plugged and should
be dumped or is currently unplugged and should not be accessed, instead
mapping the shared zeropage or returning zeroes when reading.

This is important when not capturing /proc/vmcore via tools like
"makedumpfile" that can identify logically unplugged virtio-mem memory via
PG_offline in the memmap, but simply by, e.g., copying the file.

Distributions that support virtio-mem+kdump have to make sure that the
virtio_mem module will be part of the kdump kernel or the kdump initrd;
dracut was recently [2] extended to include virtio-mem in the generated
initrd. As long as no special kdump kernels are used, this will
automatically make sure that virtio-mem will be around in the kdump initrd
and sanitize /proc/vmcore access -- with dracut.

With this series, we'll send one virtio-mem state request for every
~2 MiB chunk of virtio-mem memory indicated in the vmcore that we intend
to read/map.
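
A condensed sketch of what such a per-device callback can look like (not the
exact patch code): it assumes virtio_mem_contains_range() and
virtio_mem_send_state_request() from this series, the VIRTIO_MEM_STATE_*
values from the virtio-mem UAPI, and treats any unexpected result as
"plugged" to stay on the safe side:

  static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb,
                                           unsigned long pfn)
  {
          struct virtio_mem *vm = container_of(cb, struct virtio_mem,
                                               vmcore_cb);
          uint64_t addr = PFN_PHYS(pfn);
          bool is_ram;
          int rc;

          /* pfns outside of this device are not our business: assume RAM */
          if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE))
                  return true;

          /* serialize device requests and the cached last_block_* state */
          mutex_lock(&vm->hotplug_mutex);
          addr = ALIGN_DOWN(addr, vm->device_block_size);
          if (addr == vm->last_block_addr) {
                  is_ram = vm->last_block_plugged;
          } else {
                  /* one state request per device block */
                  rc = virtio_mem_send_state_request(vm, addr,
                                                     vm->device_block_size);
                  /* treat anything unexpected (incl. errors) as plugged */
                  is_ram = rc != VIRTIO_MEM_STATE_UNPLUGGED;
                  vm->last_block_addr = addr;
                  vm->last_block_plugged = is_ram;
          }
          mutex_unlock(&vm->hotplug_mutex);
          return is_ram;
  }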

In the future, we might want to allow building virtio-mem for kdump
mode only, even without CONFIG_MEMORY_HOTPLUG and friends: this way,
we could support special stripped-down kdump kernels that have many
other config options disabled; we'll tackle that once required. Further,
we might want to try sensing bigger blocks (e.g., memory sections)
first before falling back to device blocks on demand.

Tested with Fedora rawhide, which contains a recent kexec-tools version
(considering "System RAM (virtio_mem)" when creating the vmcore header) and
a recent dracut version (including the virtio_mem module in the kdump
initrd).

[1] https://lkml.kernel.org/r/20210526093041.8800-1-da...@redhat.com
[2] https://github.com/dracutdevs/dracut/pull/1157

Signed-off-by: David Hildenbrand 
---
 drivers/virtio/virtio_mem.c | 136 
 1 file changed, 124 insertions(+), 12 deletions(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 76d8aef3cfd2..ec0b2ab37acb 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -223,6 +223,9 @@ struct virtio_mem {
 * When this lock is held the pointers can't change, ONLINE and
 * OFFLINE blocks can't change the state and no subblocks will get
 * plugged/unplugged.
+*
+* In kdump mode, used to serialize requests, last_block_addr and
+* last_block_plugged.
 */
struct mutex hotplug_mutex;
bool hotplug_active;
@@ -230,6 +233,9 @@ struct virtio_mem {
/* An error occurred we cannot handle - stop processing requests. */
bool broken;
 
+   /* Cached value of is_kdump_kernel() when the device was probed. */
+   bool in_kdump;
+
/* The driver is being removed. */
spinlock_t removal_lock;
bool removing;
@@ -243,6 +249,13 @@ struct virtio_mem {
/* Memory notifier (online/offline events). */
struct notifier_block memory_notifier;
 
+#ifdef CONFIG_PROC_VMCORE
+   /* vmcore callback for /proc/vmcore handling in kdump mode */
+   struct vmcore_cb vmcore_cb;
+   uint64_t last_block_addr;
+   bool last_block_plugged;
+#endif /* CONFIG_PROC_VMCORE */
+
/* Next device in the list of virtio-mem devices. */
struct list_head next;
 };
@@ -2293,6 +2306,12 @@ static void virtio_mem_run_wq(struct work_struct *work)
uint64_t diff;
int rc;
 
+   if (unlikely(vm->in_kdump)) {
+   dev_warn_once(&vm->vdev->dev,
+"unexpected workqueue run in kdump kernel\n");
+   return;
+   }
+
hrtimer_cancel(&vm->retry_timer);
 
if (vm->broken)
@@ -2521,6 +2540,86 @@ static int virtio_mem_init_hotplug(struct virtio_mem *vm)
return rc;
 }
 
+#ifdef CONFIG_PROC_VMCORE
+static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr,
+uint64_t size)
+{
+   const uint64_t nb_vm_blocks = size / vm->device_block_size;
+   const struct virtio_mem_req req = {
+   .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE),
+   .u.state.addr = cpu_to_virtio64(vm->vdev, addr),
+   .u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
+   };
+   int rc = -ENOMEM;
+
+   dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr,
+   addr + size - 1);
+
+   switch (virtio_mem_send_request(vm, &req)) {
+   case VIRTIO_MEM_RESP_ACK:
+   return virtio16_to_cpu(vm->vdev, vm->resp.

[PATCH v1 6/8] virtio-mem: factor out hotplug specifics from virtio_mem_probe() into virtio_mem_init_hotplug()

2021-09-28 Thread David Hildenbrand
Let's prepare for a new virtio-mem kdump mode in which we don't actually
hot(un)plug any memory but only observe the state of device blocks.

Signed-off-by: David Hildenbrand 
---
 drivers/virtio/virtio_mem.c | 87 +++--
 1 file changed, 45 insertions(+), 42 deletions(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 2ba7e8d6ba8d..1be3ee7f684d 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -260,6 +260,8 @@ static void virtio_mem_fake_offline_going_offline(unsigned 
long pfn,
 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
   unsigned long nr_pages);
 static void virtio_mem_retry(struct virtio_mem *vm);
+static int virtio_mem_create_resource(struct virtio_mem *vm);
+static void virtio_mem_delete_resource(struct virtio_mem *vm);
 
 /*
  * Register a virtio-mem device so it will be considered for the online_page
@@ -2395,7 +2397,8 @@ static int virtio_mem_init_vq(struct virtio_mem *vm)
 static int virtio_mem_init_hotplug(struct virtio_mem *vm)
 {
const struct range pluggable_range = mhp_get_pluggable_range(true);
-   uint64_t sb_size, addr;
+   uint64_t unit_pages, sb_size, addr;
+   int rc;
 
/* bad device setup - warn only */
if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
@@ -2474,7 +2477,48 @@ static int virtio_mem_init_hotplug(struct virtio_mem *vm)
dev_info(&vm->vdev->dev, "big block size: 0x%llx",
 (unsigned long long)vm->bbm.bb_size);
 
+   /* create the parent resource for all memory */
+   rc = virtio_mem_create_resource(vm);
+   if (rc)
+   return rc;
+
+   /* use a single dynamic memory group to cover the whole memory device */
+   if (vm->in_sbm)
+   unit_pages = PHYS_PFN(memory_block_size_bytes());
+   else
+   unit_pages = PHYS_PFN(vm->bbm.bb_size);
+   rc = memory_group_register_dynamic(vm->nid, unit_pages);
+   if (rc < 0)
+   goto out_del_resource;
+   vm->mgid = rc;
+
+   /*
+* If we still have memory plugged, we have to unplug all memory first.
+* Registering our parent resource makes sure that this memory isn't
+* actually in use (e.g., trying to reload the driver).
+*/
+   if (vm->plugged_size) {
+   vm->unplug_all_required = true;
+   dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
+   }
+
+   /* register callbacks */
+   vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
+   rc = register_memory_notifier(&vm->memory_notifier);
+   if (rc)
+   goto out_unreg_group;
+   rc = register_virtio_mem_device(vm);
+   if (rc)
+   goto out_unreg_mem;
+
return 0;
+out_unreg_mem:
+   unregister_memory_notifier(&vm->memory_notifier);
+out_unreg_group:
+   memory_group_unregister(vm->mgid);
+out_del_resource:
+   virtio_mem_delete_resource(vm);
+   return rc;
 }
 
 static int virtio_mem_init(struct virtio_mem *vm)
@@ -2578,7 +2622,6 @@ static bool virtio_mem_has_memory_added(struct virtio_mem 
*vm)
 static int virtio_mem_probe(struct virtio_device *vdev)
 {
struct virtio_mem *vm;
-   uint64_t unit_pages;
int rc;
 
BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
@@ -2608,40 +2651,6 @@ static int virtio_mem_probe(struct virtio_device *vdev)
if (rc)
goto out_del_vq;
 
-   /* create the parent resource for all memory */
-   rc = virtio_mem_create_resource(vm);
-   if (rc)
-   goto out_del_vq;
-
-   /* use a single dynamic memory group to cover the whole memory device */
-   if (vm->in_sbm)
-   unit_pages = PHYS_PFN(memory_block_size_bytes());
-   else
-   unit_pages = PHYS_PFN(vm->bbm.bb_size);
-   rc = memory_group_register_dynamic(vm->nid, unit_pages);
-   if (rc < 0)
-   goto out_del_resource;
-   vm->mgid = rc;
-
-   /*
-* If we still have memory plugged, we have to unplug all memory first.
-* Registering our parent resource makes sure that this memory isn't
-* actually in use (e.g., trying to reload the driver).
-*/
-   if (vm->plugged_size) {
-   vm->unplug_all_required = true;
-   dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
-   }
-
-   /* register callbacks */
-   vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
-   rc = register_memory_notifier(&vm->memory_notifier);
-   if (rc)
-   goto out_unreg_group;
-   rc = register_virtio_mem_device(vm);
-   if (rc)
-

[PATCH v1 3/8] proc/vmcore: let pfn_is_ram() return a bool

2021-09-28 Thread David Hildenbrand
The callback should deal with errors internally; it doesn't make sense to
expose these via pfn_is_ram(). We'll rework the callbacks next. Right now
we consider errors as if "it's RAM"; no functional change.

Signed-off-by: David Hildenbrand 
---
 fs/proc/vmcore.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9a15334da208..a9bd80ab670e 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -84,11 +84,11 @@ void unregister_oldmem_pfn_is_ram(void)
 }
 EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram);
 
-static int pfn_is_ram(unsigned long pfn)
+static bool pfn_is_ram(unsigned long pfn)
 {
int (*fn)(unsigned long pfn);
/* pfn is ram unless fn() checks pagetype */
-   int ret = 1;
+   bool ret = true;
 
/*
 * Ask hypervisor if the pfn is really ram.
@@ -97,7 +97,7 @@ static int pfn_is_ram(unsigned long pfn)
 */
fn = oldmem_pfn_is_ram;
if (fn)
-   ret = fn(pfn);
+   ret = !!fn(pfn);
 
return ret;
 }
@@ -124,7 +124,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
nr_bytes = count;
 
/* If pfn is not ram, return zeros for sparse dump files */
-   if (pfn_is_ram(pfn) == 0)
+   if (!pfn_is_ram(pfn))
memset(buf, 0, nr_bytes);
else {
if (encrypted)
-- 
2.31.1




[PATCH v1 2/8] x86/xen: simplify xen_oldmem_pfn_is_ram()

2021-09-28 Thread David Hildenbrand
Let's simplify return handling.

Signed-off-by: David Hildenbrand 
---
 arch/x86/xen/mmu_hvm.c | 11 ++-
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c
index b242d1f4b426..eb61622df75b 100644
--- a/arch/x86/xen/mmu_hvm.c
+++ b/arch/x86/xen/mmu_hvm.c
@@ -21,23 +21,16 @@ static int xen_oldmem_pfn_is_ram(unsigned long pfn)
.domid = DOMID_SELF,
.pfn = pfn,
};
-   int ram;
 
if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
return -ENXIO;
 
switch (a.mem_type) {
case HVMMEM_mmio_dm:
-   ram = 0;
-   break;
-   case HVMMEM_ram_rw:
-   case HVMMEM_ram_ro:
+   return 0;
default:
-   ram = 1;
-   break;
+   return 1;
}
-
-   return ram;
 }
 #endif
 
-- 
2.31.1




[PATCH v1 1/8] x86/xen: update xen_oldmem_pfn_is_ram() documentation

2021-09-28 Thread David Hildenbrand
The callback is only used for the vmcore nowadays.

Signed-off-by: David Hildenbrand 
---
 arch/x86/xen/mmu_hvm.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c
index 57409373750f..b242d1f4b426 100644
--- a/arch/x86/xen/mmu_hvm.c
+++ b/arch/x86/xen/mmu_hvm.c
@@ -9,12 +9,9 @@
 
 #ifdef CONFIG_PROC_VMCORE
 /*
- * This function is used in two contexts:
- * - the kdump kernel has to check whether a pfn of the crashed kernel
- *   was a ballooned page. vmcore is using this function to decide
- *   whether to access a pfn of the crashed kernel.
- * - the kexec kernel has to check whether a pfn was ballooned by the
- *   previous kernel. If the pfn is ballooned, handle it properly.
+ * The kdump kernel has to check whether a pfn of the crashed kernel
+ * was a ballooned page. vmcore is using this function to decide
+ * whether to access a pfn of the crashed kernel.
  * Returns 0 if the pfn is not backed by a RAM page, the caller may
  * handle the pfn special in this case.
  */
-- 
2.31.1




[PATCH v1 0/8] proc/vmcore: sanitize access to virtio-mem memory

2021-09-28 Thread David Hildenbrand
As so often with virtio-mem changes that mess with common MM
infrastructure, this might be a good candidate to go via Andrew's tree.

--

After removing /dev/kmem, sanitizing /proc/kcore and handling /dev/mem,
this series tackles the last sane way a VM could accidentally access
logically unplugged memory managed by a virtio-mem device: /proc/vmcore

When dumping memory via "makedumpfile", PG_offline pages, used by
virtio-mem to flag logically unplugged memory, are already properly
excluded; however, especially when accessing/copying /proc/vmcore "the
usual way", we can still end up reading logically unplugged memory part of
a virtio-mem device.

Patch #1-#3 are cleanups. Patch #4 extends the existing oldmem_pfn_is_ram
mechanism. Patch #5-#7 are virtio-mem refactorings for patch #8, which
implements the virtio-mem logic to query the state of device blocks.

Patch #8:

"
Although virtio-mem currently supports reading unplugged memory in the
hypervisor, this will change in the future, indicated to the device via
a new feature flag. We similarly sanitized /proc/kcore access recently.
[...]
Distributions that support virtio-mem+kdump have to make sure that the
virtio_mem module will be part of the kdump kernel or the kdump initrd;
dracut was recently [2] extended to include virtio-mem in the generated
initrd. As long as no special kdump kernels are used, this will
automatically make sure that virtio-mem will be around in the kdump initrd
and sanitize /proc/vmcore access -- with dracut.
"

This is the last remaining bit to support
VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE [3] in the Linux implementation of
virtio-mem.

Note: this is best-effort. We'll never be able to control what runs inside
the second kernel, really, but we also don't have to care: we only care
about sane setups where we don't want our VM getting zapped once we
touch the wrong memory location while dumping. While we usually expect sane
setups to use "makedumfile", nothing really speaks against just copying
/proc/vmcore, especially in environments where HWpoisioning isn't typically
expected. Also, we really don't want to put all our trust completely on the
memmap, so sanitizing also makes sense when just using "makedumpfile".

[1] https://lkml.kernel.org/r/20210526093041.8800-1-da...@redhat.com
[2] https://github.com/dracutdevs/dracut/pull/1157
[3] https://lists.oasis-open.org/archives/virtio-comment/202109/msg00021.html

Cc: Andrew Morton 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
Cc: "Michael S. Tsirkin" 
Cc: Jason Wang 
Cc: Dave Young 
Cc: Baoquan He 
Cc: Vivek Goyal 
Cc: Michal Hocko 
Cc: Oscar Salvador 
Cc: Mike Rapoport 
Cc: "Rafael J. Wysocki" 
Cc: x...@kernel.org
Cc: xen-devel@lists.xenproject.org
Cc: virtualizat...@lists.linux-foundation.org
Cc: ke...@lists.infradead.org
Cc: linux-fsde...@vger.kernel.org
Cc: linux...@kvack.org

David Hildenbrand (8):
  x86/xen: update xen_oldmem_pfn_is_ram() documentation
  x86/xen: simplify xen_oldmem_pfn_is_ram()
  proc/vmcore: let pfn_is_ram() return a bool
  proc/vmcore: convert oldmem_pfn_is_ram callback to more generic vmcore
callbacks
  virtio-mem: factor out hotplug specifics from virtio_mem_init() into
virtio_mem_init_hotplug()
  virtio-mem: factor out hotplug specifics from virtio_mem_probe() into
virtio_mem_init_hotplug()
  virtio-mem: factor out hotplug specifics from virtio_mem_remove() into
virtio_mem_deinit_hotplug()
  virtio-mem: kdump mode to sanitize /proc/vmcore access

 arch/x86/kernel/aperture_64.c |  13 +-
 arch/x86/xen/mmu_hvm.c|  31 ++--
 drivers/virtio/virtio_mem.c   | 297 --
 fs/proc/vmcore.c  | 105 
 include/linux/crash_dump.h|  26 ++-
 5 files changed, 332 insertions(+), 140 deletions(-)


base-commit: 5816b3e6577eaa676ceb00a848f0fd65fe2adc29
-- 
2.31.1




Re: [RFC PATCH v2 11/16] softmmu/memory: add memory_region_try_add_subregion function

2021-09-22 Thread David Hildenbrand

On 22.09.21 18:14, Damien Hedde wrote:

It allows trying to add a subregion to a memory region with error
handling. Like memory_region_add_subregion_overlap(), it handles
priority as well.
Apart from the error handling, the behavior is the same. It can be used
to do a simple memory_region_add_subregion() (with no overlap) by
setting the priority parameter to 0.

This commit is a preparation for further use of this function in the
context of a QMP command, which needs error handling support.

Signed-off-by: Damien Hedde 
---

Adding a new function is obviously not ideal. But there are ~900
occurrences of memory_region_add_subregion[_overlap] calls in the code
base. We do not really see an alternative here.
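
For example (illustrative only, the example_map_device()/container/device_mr
names are made up), a caller that wants to report the failure instead of
aborting could do:

  static void example_map_device(MemoryRegion *container, hwaddr offset,
                                 MemoryRegion *device_mr, Error **errp)
  {
      /* priority 0: same semantics as memory_region_add_subregion() */
      if (!memory_region_try_add_subregion(container, offset, device_mr,
                                           0, errp)) {
          /* the subregion is already mapped elsewhere; *errp is set */
          return;
      }
  }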
---
  include/exec/memory.h | 22 ++
  softmmu/memory.c  | 22 ++
  2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index c3d417d317..422e1eda67 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -2162,6 +2162,28 @@ void memory_region_add_subregion_overlap(MemoryRegion 
*mr,
   MemoryRegion *subregion,
   int priority);
  
+/**

+ * memory_region_try_add_subregion: Add a subregion to a container
+ *  with error handling.
+ *
+ * Behaves like memory_region_add_subregion_overlap(), but errors are
+ * reported if the subregion cannot be added.
+ *
+ * @mr: the region to contain the new subregion; must be a container
+ *  initialized with memory_region_init().
+ * @offset: the offset relative to @mr where @subregion is added.
+ * @subregion: the subregion to be added.
+ * @priority: used for resolving overlaps; highest priority wins.
+ * @errp: pointer to Error*, to store an error if it happens.
+ *
+ * Returns: True in case of success, false otherwise.
+ */
+bool memory_region_try_add_subregion(MemoryRegion *mr,
+ hwaddr offset,
+ MemoryRegion *subregion,
+ int priority,
+ Error **errp);
+
  /**
   * memory_region_get_ram_addr: Get the ram address associated with a memory
   * region
diff --git a/softmmu/memory.c b/softmmu/memory.c
index bfedaf9c4d..eac61f8236 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -2513,22 +2513,28 @@ done:
  memory_region_transaction_commit();
  }
  
-static void memory_region_add_subregion_common(MemoryRegion *mr,

-   hwaddr offset,
-   MemoryRegion *subregion)
+bool memory_region_try_add_subregion(MemoryRegion *mr,
+ hwaddr offset,
+ MemoryRegion *subregion,
+ int priority,
+ Error **errp)
  {
-assert(!subregion->container);
+if (subregion->container) {
+error_setg(errp, "The memory region is already in another region");
+return false;
+}
+subregion->priority = priority;
  subregion->container = mr;
  subregion->addr = offset;
  memory_region_update_container_subregions(subregion);
+return true;
  }
  
  void memory_region_add_subregion(MemoryRegion *mr,

   hwaddr offset,
   MemoryRegion *subregion)
  {
-subregion->priority = 0;
-memory_region_add_subregion_common(mr, offset, subregion);
+memory_region_try_add_subregion(mr, offset, subregion, 0, &error_abort);
  }
  
  void memory_region_add_subregion_overlap(MemoryRegion *mr,

@@ -2536,8 +2542,8 @@ void memory_region_add_subregion_overlap(MemoryRegion *mr,
   MemoryRegion *subregion,
   int priority)
  {
-subregion->priority = priority;
-memory_region_add_subregion_common(mr, offset, subregion);
+memory_region_try_add_subregion(mr, offset, subregion, priority,
+&error_abort);
  }
  
  void memory_region_del_subregion(MemoryRegion *mr,




Reviewed-by: David Hildenbrand 

--
Thanks,

David / dhildenb




Re: [PATCH v2 4/5] mm/page_alloc: place pages to tail in __free_pages_core()

2021-09-07 Thread David Hildenbrand

On 08.09.21 00:40, Sean Anderson wrote:

Hi David,

This patch breaks booting on my custom Xilinx ZynqMP board. Booting
fails just after/during GIC initialization:

[0.00] Booting Linux on physical CPU 0x00 [0x410fd034]
[0.00] Linux version 5.14.0 (sean@plantagenet) (aarch64-linux-gnu-gcc 
(Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0, GNU ld (GNU Binutils for Ubuntu) 2.34) 
#251 SMP Tue Sep 7 18:11:50 EDT 2021
[0.00] Machine model: xlnx,zynqmp
[0.00] earlycon: cdns0 at MMIO 0xff01 (options '115200n8')
[0.00] printk: bootconsole [cdns0] enabled
[0.00] efi: UEFI not found.
[0.00] Zone ranges:
[0.00]   DMA32[mem 0x-0x]
[0.00]   Normal   [mem 0x0001-0x00087fff]
[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x-0x7fef]
[0.00]   node   0: [mem 0x0008-0x00087fff]
[0.00] Initmem setup node 0 [mem 0x-0x00087fff]
[0.00] On node 0, zone Normal: 256 pages in unavailable ranges
[0.00] cma: Reserved 1000 MiB at 0x4140
[0.00] psci: probing for conduit method from DT.
[0.00] psci: PSCIv1.1 detected in firmware.
[0.00] psci: Using standard PSCI v0.2 function IDs
[0.00] psci: MIGRATE_INFO_TYPE not supported.
[0.00] psci: SMC Calling Convention v1.1
[0.00] percpu: Embedded 19 pages/cpu s46752 r0 d31072 u77824
[0.00] Detected VIPT I-cache on CPU0
[0.00] CPU features: detected: ARM erratum 845719
[0.00] Built 1 zonelists, mobility grouping on.  Total pages: 1033987
[0.00] Kernel command line: earlycon clk_ignore_unused 
root=/dev/mmcblk0p2 rootwait rw cma=1000M
[0.00] Dentry cache hash table entries: 524288 (order: 10, 4194304 
bytes, linear)
[0.00] Inode-cache hash table entries: 262144 (order: 9, 2097152 bytes, 
linear)
[0.00] mem auto-init: stack:off, heap alloc:off, heap free:off
[0.00] software IO TLB: mapped [mem 
0x3d40-0x4140] (64MB)
[0.00] Memory: 3023384K/4193280K available (4288K kernel code, 514K 
rwdata, 1200K rodata, 896K init, 187K bss, 145896K reserved, 1024000K 
cma-reserved)
[0.00] rcu: Hierarchical RCU implementation.
[0.00] rcu: RCU event tracing is enabled.
[0.00] rcu: RCU calculated value of scheduler-enlistment delay is 25 
jiffies.
[0.00] NR_IRQS: 64, nr_irqs: 64, preallocated irqs: 0
[0.00] GIC: Adjusting CPU interface base to 0xf902f000
[0.00] Root IRQ handler: gic_handle_irq
[0.00] GIC: Using split EOI/Deactivate mode

and I bisected it to this patch. Applying the following patch (for 5.14)
fixes booting again:


Hi Sean,

unfortunately that patch most likely (with 99.9% confidence) revealed 
another latent BUG in your setup.


Some memory that shouldn't be handed to the buddy as free memory is 
now getting allocated earlier rather than later, resulting in that issue.



I had all different kinds of reports, but they were mostly

a) Firmware bugs that result in uncached memory getting exposed to the 
buddy, resulting in severe performance degradation such that the system 
will no longer boot. [3]


I wrote kstream [1] to be run under the old kernel, to identify these.

b) BUGs where unsuitable memory gets exposed to either the buddy or 
devices, resulting in errors during device initialization. [6]


c) Use after free BUGs.

Exposing memory, such as used for ACPI tables, to the buddy as free 
memory although it's still in use. [4]


d) Hypervisor BUGs

The last report (heavy performance degradation) was due to a BUG in 
dpdk. [2]



What are the exact symptoms you're experiencing? Really slow boot/stall? 
Then it could be a) and kstream might help.



[1] https://github.com/davidhildenbrand/kstream
[2] 
https://lore.kernel.org/dpdk-dev/20210827161231.579968-1-epere...@redhat.com/T/#u
[3] 
https://lore.kernel.org/r/mw3pr12mb4537c3c6efd9ca3a4b32084df3...@mw3pr12mb4537.namprd12.prod.outlook.com

[4] https://lkml.kernel.org/r/4650320.31r3eYUQgx@kreacher
[5] https://lkml.kernel.org/r/87361onphy.fsf...@codeaurora.org
[6] 
https://lore.kernel.org/r/20201213225517.3838501-1-linus.wall...@linaro.org



--
Thanks,

David / dhildenb




[PATCH v5 02/10] numa: Teach ram block notifiers about resizeable ram blocks

2021-04-29 Thread David Hildenbrand
Ram block notifiers are currently not aware of resizes. To properly
handle resizes during migration, we want to teach ram block notifiers about
resizeable ram.

Introduce the basic infrastructure but keep using max_size in the
existing notifiers. Supply the max_size when adding and removing ram
blocks. Also, notify on resizes.
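
For illustration, a minimal notifier using the extended callbacks could look
like the sketch below (example_* names are made up). Note that
ram_block_added/ram_block_removed are invoked unconditionally, while
notifiers without ram_block_resized are simply skipped on resize:

  #include "qemu/osdep.h"
  #include "exec/ramlist.h"

  static void example_ram_block_added(RAMBlockNotifier *n, void *host,
                                      size_t size, size_t max_size)
  {
      /* e.g., prepare/register the whole [host, host + max_size) range */
  }

  static void example_ram_block_removed(RAMBlockNotifier *n, void *host,
                                        size_t size, size_t max_size)
  {
      /* undo whatever example_ram_block_added() did */
  }

  static void example_ram_block_resized(RAMBlockNotifier *n, void *host,
                                        size_t old_size, size_t new_size)
  {
      /* only the used part [host, host + new_size) changed */
  }

  static RAMBlockNotifier example_notifier = {
      .ram_block_added = example_ram_block_added,
      .ram_block_removed = example_ram_block_removed,
      .ram_block_resized = example_ram_block_resized,
  };

  /* e.g., during accelerator/device setup: */
  ram_block_notifier_add(&example_notifier);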

Acked-by: Paul Durrant 
Reviewed-by: Peter Xu 
Cc: xen-devel@lists.xenproject.org
Cc: haxm-t...@intel.com
Cc: Paul Durrant 
Cc: Stefano Stabellini 
Cc: Anthony Perard 
Cc: Wenchao Wang 
Cc: Colin Xu 
Signed-off-by: David Hildenbrand 
---
 hw/core/numa.c | 22 +-
 hw/i386/xen/xen-mapcache.c |  7 ---
 include/exec/ramlist.h | 13 +
 softmmu/physmem.c  | 12 ++--
 target/i386/hax/hax-mem.c  |  5 +++--
 target/i386/sev.c  | 18 ++
 util/vfio-helpers.c| 16 
 7 files changed, 61 insertions(+), 32 deletions(-)

diff --git a/hw/core/numa.c b/hw/core/numa.c
index 7f08c27a6d..921bf86ab4 100644
--- a/hw/core/numa.c
+++ b/hw/core/numa.c
@@ -806,11 +806,12 @@ void query_numa_node_mem(NumaNodeMem node_mem[], 
MachineState *ms)
 static int ram_block_notify_add_single(RAMBlock *rb, void *opaque)
 {
 const ram_addr_t max_size = qemu_ram_get_max_length(rb);
+const ram_addr_t size = qemu_ram_get_used_length(rb);
 void *host = qemu_ram_get_host_addr(rb);
 RAMBlockNotifier *notifier = opaque;
 
 if (host) {
-notifier->ram_block_added(notifier, host, max_size);
+notifier->ram_block_added(notifier, host, size, max_size);
 }
 return 0;
 }
@@ -827,20 +828,31 @@ void ram_block_notifier_remove(RAMBlockNotifier *n)
 QLIST_REMOVE(n, next);
 }
 
-void ram_block_notify_add(void *host, size_t size)
+void ram_block_notify_add(void *host, size_t size, size_t max_size)
 {
 RAMBlockNotifier *notifier;
 
 QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) {
-notifier->ram_block_added(notifier, host, size);
+notifier->ram_block_added(notifier, host, size, max_size);
 }
 }
 
-void ram_block_notify_remove(void *host, size_t size)
+void ram_block_notify_remove(void *host, size_t size, size_t max_size)
 {
 RAMBlockNotifier *notifier;
 
 QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) {
-notifier->ram_block_removed(notifier, host, size);
+notifier->ram_block_removed(notifier, host, size, max_size);
+}
+}
+
+void ram_block_notify_resize(void *host, size_t old_size, size_t new_size)
+{
+RAMBlockNotifier *notifier;
+
+QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) {
+if (notifier->ram_block_resized) {
+notifier->ram_block_resized(notifier, host, old_size, new_size);
+}
 }
 }
diff --git a/hw/i386/xen/xen-mapcache.c b/hw/i386/xen/xen-mapcache.c
index 5b120ed44b..d6dcea65d1 100644
--- a/hw/i386/xen/xen-mapcache.c
+++ b/hw/i386/xen/xen-mapcache.c
@@ -169,7 +169,8 @@ static void xen_remap_bucket(MapCacheEntry *entry,
 
 if (entry->vaddr_base != NULL) {
 if (!(entry->flags & XEN_MAPCACHE_ENTRY_DUMMY)) {
-ram_block_notify_remove(entry->vaddr_base, entry->size);
+ram_block_notify_remove(entry->vaddr_base, entry->size,
+entry->size);
 }
 if (munmap(entry->vaddr_base, entry->size) != 0) {
 perror("unmap fails");
@@ -211,7 +212,7 @@ static void xen_remap_bucket(MapCacheEntry *entry,
 }
 
 if (!(entry->flags & XEN_MAPCACHE_ENTRY_DUMMY)) {
-ram_block_notify_add(vaddr_base, size);
+ram_block_notify_add(vaddr_base, size, size);
 }
 
 entry->vaddr_base = vaddr_base;
@@ -452,7 +453,7 @@ static void xen_invalidate_map_cache_entry_unlocked(uint8_t 
*buffer)
 }
 
 pentry->next = entry->next;
-ram_block_notify_remove(entry->vaddr_base, entry->size);
+ram_block_notify_remove(entry->vaddr_base, entry->size, entry->size);
 if (munmap(entry->vaddr_base, entry->size) != 0) {
 perror("unmap fails");
 exit(-1);
diff --git a/include/exec/ramlist.h b/include/exec/ramlist.h
index 26704aa3b0..ece6497ee2 100644
--- a/include/exec/ramlist.h
+++ b/include/exec/ramlist.h
@@ -65,15 +65,20 @@ void qemu_mutex_lock_ramlist(void);
 void qemu_mutex_unlock_ramlist(void);
 
 struct RAMBlockNotifier {
-void (*ram_block_added)(RAMBlockNotifier *n, void *host, size_t size);
-void (*ram_block_removed)(RAMBlockNotifier *n, void *host, size_t size);
+void (*ram_block_added)(RAMBlockNotifier *n, void *host, size_t size,
+size_t max_size);
+void (*ram_block_removed)(RAMBlockNotifier *n, void *host, size_t size,
+  size_t max_size);
+void (*ram_block_resized)(RAMBlockNotifier *n, void *host, size_t old_size,
+   

[PATCH v1] mm/memory_hotplug: MEMHP_MERGE_RESOURCE -> MHP_MERGE_RESOURCE

2021-01-26 Thread David Hildenbrand
Let's make "MEMHP_MERGE_RESOURCE" consistent with "MHP_NONE", "mhp_t" and
"mhp_flags". As discussed recently [1], "mhp" is our internal
acronym for memory hotplug now.

[1] 
https://lore.kernel.org/linux-mm/c37de2d0-28a1-4f7d-f944-cfd7d81c3...@redhat.com/

Cc: Andrew Morton 
Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Stephen Hemminger 
Cc: Wei Liu 
Cc: "Michael S. Tsirkin" 
Cc: Jason Wang 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
Cc: Pankaj Gupta 
Cc: Michal Hocko 
Cc: Oscar Salvador 
Cc: Anshuman Khandual 
Cc: Wei Yang 
Cc: linux-hyp...@vger.kernel.org
Cc: virtualizat...@lists.linux-foundation.org
Cc: xen-devel@lists.xenproject.org
Signed-off-by: David Hildenbrand 
---
 drivers/hv/hv_balloon.c| 2 +-
 drivers/virtio/virtio_mem.c| 2 +-
 drivers/xen/balloon.c  | 2 +-
 include/linux/memory_hotplug.h | 2 +-
 mm/memory_hotplug.c| 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 8c471823a5af..2f776d78e3c1 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -726,7 +726,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned 
long size,
 
nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
ret = add_memory(nid, PFN_PHYS((start_pfn)),
-   (HA_CHUNK << PAGE_SHIFT), MEMHP_MERGE_RESOURCE);
+   (HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE);
 
if (ret) {
pr_err("hot_add memory failed error is %d\n", ret);
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 85a272c9978e..148bea39b09a 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -623,7 +623,7 @@ static int virtio_mem_add_memory(struct virtio_mem *vm, 
uint64_t addr,
/* Memory might get onlined immediately. */
atomic64_add(size, &vm->offline_size);
rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name,
-  MEMHP_MERGE_RESOURCE);
+  MHP_MERGE_RESOURCE);
if (rc) {
atomic64_sub(size, &vm->offline_size);
dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index b57b2067ecbf..671c71245a7b 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -331,7 +331,7 @@ static enum bp_state reserve_additional_memory(void)
mutex_unlock(&balloon_mutex);
/* add_memory_resource() requires the device_hotplug lock */
lock_device_hotplug();
-   rc = add_memory_resource(nid, resource, MEMHP_MERGE_RESOURCE);
+   rc = add_memory_resource(nid, resource, MHP_MERGE_RESOURCE);
unlock_device_hotplug();
mutex_lock(&balloon_mutex);
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 3d99de0db2dd..4b834f5d032e 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -53,7 +53,7 @@ typedef int __bitwise mhp_t;
  * with this flag set, the resource pointer must no longer be used as it
  * might be stale, or the resource might have changed.
  */
-#define MEMHP_MERGE_RESOURCE   ((__force mhp_t)BIT(0))
+#define MHP_MERGE_RESOURCE ((__force mhp_t)BIT(0))
 
 /*
  * Extended parameters for memory hotplug:
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 710e469fb3a1..ae497e3ff77c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1153,7 +1153,7 @@ int __ref add_memory_resource(int nid, struct resource 
*res, mhp_t mhp_flags)
 * In case we're allowed to merge the resource, flag it and trigger
 * merging now that adding succeeded.
 */
-   if (mhp_flags & MEMHP_MERGE_RESOURCE)
+   if (mhp_flags & MHP_MERGE_RESOURCE)
merge_system_ram_resource(res);
 
/* online pages if requested */
-- 
2.29.2




Re: [PATCH 1/2] sysemu/runstate: Let runstate_is_running() return bool

2021-01-11 Thread David Hildenbrand
On 11.01.21 16:20, Philippe Mathieu-Daudé wrote:
> runstate_check() returns a boolean. runstate_is_running()
> returns what runstate_check() returns, also a boolean.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  include/sysemu/runstate.h | 2 +-
>  softmmu/runstate.c| 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h
> index e557f470d42..3ab35a039a0 100644
> --- a/include/sysemu/runstate.h
> +++ b/include/sysemu/runstate.h
> @@ -6,7 +6,7 @@
>  
>  bool runstate_check(RunState state);
>  void runstate_set(RunState new_state);
> -int runstate_is_running(void);
> +bool runstate_is_running(void);
>  bool runstate_needs_reset(void);
>  bool runstate_store(char *str, size_t size);
>  
> diff --git a/softmmu/runstate.c b/softmmu/runstate.c
> index 636aab0addb..c7a67147d17 100644
> --- a/softmmu/runstate.c
> +++ b/softmmu/runstate.c
> @@ -217,7 +217,7 @@ void runstate_set(RunState new_state)
>  current_run_state = new_state;
>  }
>  
> -int runstate_is_running(void)
> +bool runstate_is_running(void)
>  {
>  return runstate_check(RUN_STATE_RUNNING);
>  }
> 

Reviewed-by: David Hildenbrand 

-- 
Thanks,

David / dhildenb




[PATCH v2 3/5] mm/page_alloc: move pages to tail in move_to_free_list()

2020-10-05 Thread David Hildenbrand
Whenever we move pages between freelists via move_to_free_list()/
move_freepages_block(), we don't actually touch the pages:
1. Page isolation doesn't actually touch the pages, it simply isolates
   pageblocks and moves all free pages to the MIGRATE_ISOLATE freelist.
   When undoing isolation, we move the pages back to the target list.
2. Page stealing (steal_suitable_fallback()) moves free pages directly
   between lists without touching them.
3. reserve_highatomic_pageblock()/unreserve_highatomic_pageblock() moves
   free pages directly between freelists without touching them.

We already place pages to the tail of the freelists when undoing isolation
via __putback_isolated_page(), let's do it in any case (e.g., if order <=
pageblock_order) and document the behavior. To simplify, let's move the
pages to the tail for all move_to_free_list()/move_freepages_block() users.

In 2., the target list is empty, so there should be no change. In 3.,
we might observe a change; however, highatomic is more concerned about
allocations succeeding than about cache hotness. If we ever realize this
change degrades a workload, we can special-case this instance and add a
proper comment.

This change results in all pages getting onlined via online_pages() to
be placed to the tail of the freelist.
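
To illustrate why tail placement delays reuse, here is a minimal sketch of
the allocation side (essentially what get_page_from_free_area() does; the
_sketch name is made up): the allocator takes the first entry of a
freelist, so pages moved to the tail via list_move_tail() are only
considered once everything queued in front of them has been handed out.

static inline struct page *first_free_page_sketch(struct free_area *area,
                                                  int migratetype)
{
        /* allocation always pops the first entry of the per-type freelist */
        return list_first_entry_or_null(&area->free_list[migratetype],
                                        struct page, lru);
}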

Reviewed-by: Oscar Salvador 
Acked-by: Pankaj Gupta 
Reviewed-by: Wei Yang 
Cc: Andrew Morton 
Cc: Alexander Duyck 
Cc: Mel Gorman 
Cc: Michal Hocko 
Cc: Dave Hansen 
Cc: Vlastimil Babka 
Cc: Wei Yang 
Cc: Oscar Salvador 
Cc: Mike Rapoport 
Cc: Scott Cheloha 
Cc: Michael Ellerman 
Signed-off-by: David Hildenbrand 
---
 mm/page_alloc.c | 10 +++---
 mm/page_isolation.c |  5 +
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df5ff0cd6df1..b187e46cf640 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -901,13 +901,17 @@ static inline void add_to_free_list_tail(struct page 
*page, struct zone *zone,
area->nr_free++;
 }
 
-/* Used for pages which are on another list */
+/*
+ * Used for pages which are on another list. Move the pages to the tail
+ * of the list - so the moved pages won't immediately be considered for
+ * allocation again (e.g., optimization for memory onlining).
+ */
 static inline void move_to_free_list(struct page *page, struct zone *zone,
 unsigned int order, int migratetype)
 {
struct free_area *area = &zone->free_area[order];
 
-   list_move(&page->lru, &area->free_list[migratetype]);
+   list_move_tail(&page->lru, &area->free_list[migratetype]);
 }
 
 static inline void del_page_from_free_list(struct page *page, struct zone 
*zone,
@@ -2340,7 +2344,7 @@ static inline struct page *__rmqueue_cma_fallback(struct 
zone *zone,
 #endif
 
 /*
- * Move the free pages in a range to the free lists of the requested type.
+ * Move the free pages in a range to the freelist tail of the requested type.
  * Note that start_page and end_pages are not aligned on a pageblock
  * boundary. If alignment is required, use move_freepages_block()
  */
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index abfe26ad59fd..83692b937784 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -106,6 +106,11 @@ static void unset_migratetype_isolate(struct page *page, 
unsigned migratetype)
 * If we isolate freepage with more than pageblock_order, there
 * should be no freepage in the range, so we could avoid costly
 * pageblock scanning for freepage moving.
+*
+* We didn't actually touch any of the isolated pages, so place them
+* to the tail of the freelist. This is an optimization for memory
+* onlining - just onlined memory won't immediately be considered for
+* allocation.
 */
if (!isolated_page) {
nr_pages = move_freepages_block(zone, page, migratetype, NULL);
-- 
2.26.2




[PATCH v2 2/5] mm/page_alloc: place pages to tail in __putback_isolated_page()

2020-10-05 Thread David Hildenbrand
__putback_isolated_page() already documents that pages will be placed to
the tail of the freelist - this is, however, not the case for
"order >= MAX_ORDER - 2" (see buddy_merge_likely()), which should be
the order used by all existing users.
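
For reference, a heavily simplified sketch of the buddy_merge_likely()
check mentioned above (the parameter standing in for the
higher-order-buddy test is made up):

/*
 * Heavily simplified sketch, not the real function: buddy_merge_likely()
 * never reports such large orders as "likely to merge soon", so
 * __free_one_page() places them at the head of the freelist. The real
 * function instead checks whether the next-higher-order buddy is free;
 * the second parameter stands in for that check.
 */
static inline bool buddy_merge_likely_sketch(unsigned int order,
                                             bool higher_order_buddy_free)
{
        if (order >= MAX_ORDER - 2)
                return false;
        return higher_order_buddy_free;
}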

This change affects two users:
- free page reporting
- page isolation, when undoing the isolation (including memory onlining).

This behavior is desirable for pages that haven't really been touched
lately - which is exactly what the two users above handle: they don't
actually read/write page content, but rather move untouched pages.

The new behavior is especially desirable for memory onlining, where we
allow allocation of newly onlined pages via undo_isolate_page_range()
in online_pages(). Right now, we always place them to the head of the
freelist, resulting in undesirable behavior: Assume we add
individual memory chunks via add_memory() and online them right away to
the NORMAL zone. We create a dependency chain of unmovable allocations
e.g., via the memmap. The memmap of the next chunk will be placed onto
previous chunks - if the last block cannot get offlined+removed, all
dependent ones cannot get offlined+removed. While this can already be
observed with individual DIMMs, it's more of an issue for virtio-mem
(and I suspect also ppc DLPAR).

Document that this should only be used for optimizations, and no code
should rely on this behavior for correctness (if the order of the
freelists ever changes).

We won't care about page shuffling: memory onlining already properly
shuffles after onlining. free page reporting doesn't care about
physically contiguous ranges, and there are already cases where page
isolation will simply move (physically close) free pages to (currently)
the head of the freelists via move_freepages_block() instead of
shuffling. If this becomes ever relevant, we should shuffle the whole
zone when undoing isolation of larger ranges, and after
free_contig_range().

Reviewed-by: Alexander Duyck 
Reviewed-by: Oscar Salvador 
Reviewed-by: Wei Yang 
Reviewed-by: Pankaj Gupta 
Acked-by: Michal Hocko 
Cc: Andrew Morton 
Cc: Alexander Duyck 
Cc: Mel Gorman 
Cc: Michal Hocko 
Cc: Dave Hansen 
Cc: Vlastimil Babka 
Cc: Wei Yang 
Cc: Oscar Salvador 
Cc: Mike Rapoport 
Cc: Scott Cheloha 
Cc: Michael Ellerman 
Signed-off-by: David Hildenbrand 
---
 mm/page_alloc.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2bf235b1953f..df5ff0cd6df1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -94,6 +94,18 @@ typedef int __bitwise fpi_t;
  */
 #define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
 
+/*
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
+ * shuffle the whole zone).
+ *
+ * Note: No code should rely on this flag for correctness - it's purely
+ *   to allow for optimizations when handing back either fresh pages
+ *   (memory onlining) or untouched pages (page isolation, free page
+ *   reporting).
+ */
+#define FPI_TO_TAIL		((__force fpi_t)BIT(1))
+
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION   (8)
@@ -1044,7 +1056,9 @@ static inline void __free_one_page(struct page *page,
 done_merging:
set_page_order(page, order);
 
-   if (is_shuffle_order(order))
+   if (fpi_flags & FPI_TO_TAIL)
+   to_tail = true;
+   else if (is_shuffle_order(order))
to_tail = shuffle_pick_tail();
else
to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
@@ -3306,7 +3320,7 @@ void __putback_isolated_page(struct page *page, unsigned 
int order, int mt)
 
/* Return isolated page to tail of freelist. */
__free_one_page(page, page_to_pfn(page), zone, order, mt,
-   FPI_SKIP_REPORT_NOTIFY);
+   FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
 }
 
 /*
-- 
2.26.2




[PATCH v2 4/5] mm/page_alloc: place pages to tail in __free_pages_core()

2020-10-05 Thread David Hildenbrand
__free_pages_core() is used when exposing fresh memory to the buddy
during system boot and when onlining memory in generic_online_page().

generic_online_page() is used in two cases:

1. Direct memory onlining in online_pages().
2. Deferred memory onlining in memory-ballooning-like mechanisms (HyperV
   balloon and virtio-mem), when parts of a section are kept
   fake-offline to be fake-onlined later on.

In 1, we already place pages to the tail of the freelist. Pages will be
freed to MIGRATE_ISOLATE lists first and moved to the tail of the freelists
via undo_isolate_page_range().

In 2, we currently don't implement a proper rule. In case of virtio-mem,
where we currently always online MAX_ORDER - 1 pages, the pages will be
placed to the HEAD of the freelist - undesirable. While the hyper-v
balloon calls generic_online_page() with single pages, usually it will
call it on successive single pages in a larger block.

The pages are fresh, so place them to the tail of the freelist and avoid
the PCP. In __free_pages_core(), remove the now superfluous call to
set_page_refcounted() and add a comment regarding page initialization and
the refcount.

Note: In 2. we currently don't shuffle. If ever relevant (page shuffling
is usually of limited use in virtualized environments), we might want to
shuffle after a sequence of generic_online_page() calls in the
relevant callers.
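
As a hedged illustration of case 2 (the function below is made up; it is
not the actual virtio-mem or Hyper-V code): a deferred-onlining driver
hands a whole high-order chunk to generic_online_page(), which ends up in
__free_pages_core() and - with this patch - at the tail of the freelist,
bypassing the PCP.

static void fake_online_chunk_sketch(unsigned long pfn)
{
        /* fake-online a MAX_ORDER - 1 chunk that was kept fake-offline */
        generic_online_page(pfn_to_page(pfn), MAX_ORDER - 1);
}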

Reviewed-by: Vlastimil Babka 
Reviewed-by: Oscar Salvador 
Acked-by: Pankaj Gupta 
Reviewed-by: Wei Yang 
Acked-by: Michal Hocko 
Cc: Andrew Morton 
Cc: Alexander Duyck 
Cc: Mel Gorman 
Cc: Michal Hocko 
Cc: Dave Hansen 
Cc: Vlastimil Babka 
Cc: Wei Yang 
Cc: Oscar Salvador 
Cc: Mike Rapoport 
Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Stephen Hemminger 
Cc: Wei Liu 
Signed-off-by: David Hildenbrand 
---
 mm/page_alloc.c | 33 +++--
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b187e46cf640..3dadcc6d4009 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -275,7 +275,8 @@ bool pm_suspended_storage(void)
 unsigned int pageblock_order __read_mostly;
 #endif
 
-static void __free_pages_ok(struct page *page, unsigned int order);
+static void __free_pages_ok(struct page *page, unsigned int order,
+   fpi_t fpi_flags);
 
 /*
  * results with 256, 32 in the lowmem_reserve sysctl:
@@ -687,7 +688,7 @@ static void bad_page(struct page *page, const char *reason)
 void free_compound_page(struct page *page)
 {
mem_cgroup_uncharge(page);
-   __free_pages_ok(page, compound_order(page));
+   __free_pages_ok(page, compound_order(page), FPI_NONE);
 }
 
 void prep_compound_page(struct page *page, unsigned int order)
@@ -1423,14 +1424,14 @@ static void free_pcppages_bulk(struct zone *zone, int 
count,
 static void free_one_page(struct zone *zone,
struct page *page, unsigned long pfn,
unsigned int order,
-   int migratetype)
+   int migratetype, fpi_t fpi_flags)
 {
spin_lock(&zone->lock);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
-   __free_one_page(page, pfn, zone, order, migratetype, FPI_NONE);
+   __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock(&zone->lock);
 }
 
@@ -1508,7 +1509,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start, 
phys_addr_t end)
}
 }
 
-static void __free_pages_ok(struct page *page, unsigned int order)
+static void __free_pages_ok(struct page *page, unsigned int order,
+   fpi_t fpi_flags)
 {
unsigned long flags;
int migratetype;
@@ -1520,7 +1522,8 @@ static void __free_pages_ok(struct page *page, unsigned 
int order)
migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
-   free_one_page(page_zone(page), page, pfn, order, migratetype);
+   free_one_page(page_zone(page), page, pfn, order, migratetype,
+ fpi_flags);
local_irq_restore(flags);
 }
 
@@ -1530,6 +1533,11 @@ void __free_pages_core(struct page *page, unsigned int 
order)
struct page *p = page;
unsigned int loop;
 
+   /*
+* When initializing the memmap, __init_single_page() sets the refcount
+* of all pages to 1 ("allocated"/"not free"). We have to set the
+* refcount of all involved pages to 0.
+*/
prefetchw(p);
for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
prefetchw(p + 1);
@@ -1540,8 +1548,12 @@ void __free_pages_core(struct page *page, unsigned int 
order)
set_page_count(p, 0);
 
atomic_long_add(nr_pages

[PATCH v2 5/5] mm/memory_hotplug: update comment regarding zone shuffling

2020-10-05 Thread David Hildenbrand
As we no longer shuffle via generic_online_page() and when undoing
isolation, we can simplify the comment.

We now effectively shuffle only once (properly) when onlining new
memory.

Reviewed-by: Wei Yang 
Acked-by: Michal Hocko 
Cc: Andrew Morton 
Cc: Alexander Duyck 
Cc: Mel Gorman 
Cc: Michal Hocko 
Cc: Dave Hansen 
Cc: Vlastimil Babka 
Cc: Wei Yang 
Cc: Oscar Salvador 
Cc: Mike Rapoport 
Cc: Pankaj Gupta 
Signed-off-by: David Hildenbrand 
---
 mm/memory_hotplug.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 03a00cb68bf7..b44d4c7ba73b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -858,13 +858,10 @@ int __ref online_pages(unsigned long pfn, unsigned long 
nr_pages,
undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
 
/*
-* When exposing larger, physically contiguous memory areas to the
-* buddy, shuffling in the buddy (when freeing onlined pages, putting
-* them either to the head or the tail of the freelist) is only helpful
-* for maintaining the shuffle, but not for creating the initial
-* shuffle. Shuffle the whole zone to make sure the just onlined pages
-* are properly distributed across the whole freelist. Make sure to
-* shuffle once pageblocks are no longer isolated.
+* Freshly onlined pages aren't shuffled (e.g., all pages are placed to
+* the tail of the freelist when undoing isolation). Shuffle the whole
+* zone to make sure the just onlined pages are properly distributed
+* across the whole freelist - to create an initial shuffle.
 */
shuffle_zone(zone);
 
-- 
2.26.2




[PATCH v2 1/5] mm/page_alloc: convert "report" flag of __free_one_page() to a proper flag

2020-10-05 Thread David Hildenbrand
Let's prepare for additional flags and avoid long parameter lists of bools.
Follow-up patches will also make use of the flags in __free_pages_ok().

Reviewed-by: Alexander Duyck 
Reviewed-by: Vlastimil Babka 
Reviewed-by: Oscar Salvador 
Reviewed-by: Wei Yang 
Reviewed-by: Pankaj Gupta 
Acked-by: Michal Hocko 
Cc: Andrew Morton 
Cc: Alexander Duyck 
Cc: Mel Gorman 
Cc: Michal Hocko 
Cc: Dave Hansen 
Cc: Vlastimil Babka 
Cc: Wei Yang 
Cc: Oscar Salvador 
Cc: Mike Rapoport 
Signed-off-by: David Hildenbrand 
---
 mm/page_alloc.c | 27 ++-
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7012d67a302d..2bf235b1953f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -78,6 +78,22 @@
 #include "shuffle.h"
 #include "page_reporting.h"
 
+/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
+typedef int __bitwise fpi_t;
+
+/* No special request */
+#define FPI_NONE   ((__force fpi_t)0)
+
+/*
+ * Skip free page reporting notification for the (possibly merged) page.
+ * This does not hinder free page reporting from grabbing the page,
+ * reporting it and marking it "reported" -  it only skips notifying
+ * the free page reporting infrastructure about a newly freed page. For
+ * example, used when temporarily pulling a page from a freelist and
+ * putting it back unmodified.
+ */
+#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
+
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION   (8)
@@ -952,7 +968,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long 
buddy_pfn,
 static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
-   int migratetype, bool report)
+   int migratetype, fpi_t fpi_flags)
 {
struct capture_control *capc = task_capc(zone);
unsigned long buddy_pfn;
@@ -1039,7 +1055,7 @@ static inline void __free_one_page(struct page *page,
add_to_free_list(page, zone, order, migratetype);
 
/* Notify page reporting subsystem of freed page */
-   if (report)
+   if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
page_reporting_notify_free(order);
 }
 
@@ -1380,7 +1396,7 @@ static void free_pcppages_bulk(struct zone *zone, int 
count,
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
 
-   __free_one_page(page, page_to_pfn(page), zone, 0, mt, true);
+   __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
@@ -1396,7 +1412,7 @@ static void free_one_page(struct zone *zone,
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
-   __free_one_page(page, pfn, zone, order, migratetype, true);
+   __free_one_page(page, pfn, zone, order, migratetype, FPI_NONE);
spin_unlock(&zone->lock);
 }
 
@@ -3289,7 +3305,8 @@ void __putback_isolated_page(struct page *page, unsigned 
int order, int mt)
lockdep_assert_held(&zone->lock);
 
/* Return isolated page to tail of freelist. */
-   __free_one_page(page, page_to_pfn(page), zone, order, mt, false);
+   __free_one_page(page, page_to_pfn(page), zone, order, mt,
+   FPI_SKIP_REPORT_NOTIFY);
 }
 
 /*
-- 
2.26.2




[PATCH v2 0/5] mm: place pages to the freelist tail when onlining and undoing isolation

2020-10-05 Thread David Hildenbrand
r the 2MB memmap is needed, a just-onlined 4MB page will
be split. The remaining 2MB page will be used for the memmap of the next
memory block. So one memory block will hold the memmap of the two following
memory blocks. Finally the pages of the last-onlined memory block will get
used for the next bigger allocations - if any allocation is unmovable,
all dependent memory blocks cannot get unplugged and removed until that
allocation is gone.

Note that with bigger memory blocks (e.g., 256MB), *all* memory
blocks are dependent and none can get unplugged again!

b) Experiment with memory intensive workload

I performed an experiment with an older version of this patch set
(before we used undo_isolate_page_range() in online_pages()):
Hotplug 56GB to a VM with an initial 4GB, onlining all memory to
ZONE_NORMAL right from the kernel when adding it. I then ran various
memory intensive workloads that consume most system memory for a total of
45 minutes. Once finished, I tried to unplug as much memory as possible.

With this change, I am able to remove via virtio-mem (adding individual
128MB memory blocks) 413 out of 448 added memory blocks. Via individual
(256MB) DIMMs 380 out of 448 added memory blocks. (I don't have any numbers
without this patchset, but looking at the above example, it's at most half
of the 448 memory blocks for virtio-mem, and most probably none for DIMMs).

Again, there are workloads that might behave very differently due to the
nature of ZONE_NORMAL.

This change also affects (besides memory onlining):
- Other users of undo_isolate_page_range(): Pages are always placed to the
  tail.
-- When memory offlining fails
-- When memory isolation fails after having isolated some pageblocks
-- When alloc_contig_range() either succeeds or fails
- Other users of __putback_isolated_page(): Pages are always placed to the
  tail.
-- Free page reporting
- Other users of __free_pages_core()
-- AFAIK, any memory that is getting exposed to the buddy during boot.
   IIUC we will now usually allocate memory from lower addresses within
   a zone first (especially during boot).
- Other users of generic_online_page()
-- Hyper-V balloon

v1 -> v2:
- Avoid changing indentation/alignment of function parameters
- Minor spelling fixes
- "mm/page_alloc: convert "report" flag of __free_one_page() to a proper
   flag"
-- fop_t -> fpi_t
-- Clarify/extend documentation of FPI_SKIP_REPORT_NOTIFY
- "mm/page_alloc: move pages to tail in move_to_free_list()"
-- Perform change for all move_to_free_list()/move_freepages_block() users
   to simplify.
-- Adjust subject/description accordingly.
- "mm/page_alloc: place pages to tail in __free_pages_core()"
-- s/init_single_page/__init_single_page/

RFC -> v1:
- Tweak some patch descriptions
- "mm/page_alloc: place pages to tail in __putback_isolated_page()"
-- FOP_TO_TAIL now has higher precedence than page shuffling
-- Add a note that nothing should rely on FOP_TO_TAIL for correctness
- "mm/page_alloc: always move pages to the tail of the freelist in
   unset_migratetype_isolate()"
-- Use "bool" parameter for move_freepages_block() as requested
- "mm/page_alloc: place pages to tail in __free_pages_core()"
-- Eliminate set_page_refcounted() + page_ref_dec() and add a comment
- "mm/memory_hotplug: update comment regarding zone shuffling"
-- Added

David Hildenbrand (5):
  mm/page_alloc: convert "report" flag of __free_one_page() to a proper
flag
  mm/page_alloc: place pages to tail in __putback_isolated_page()
  mm/page_alloc: move pages to tail in move_to_free_list()
  mm/page_alloc: place pages to tail in __free_pages_core()
  mm/memory_hotplug: update comment regarding zone shuffling

 mm/memory_hotplug.c | 11 +++---
 mm/page_alloc.c | 84 +++--
 mm/page_isolation.c |  5 +++
 3 files changed, 75 insertions(+), 25 deletions(-)

-- 
2.26.2




Re: [PATCH v1 3/5] mm/page_alloc: always move pages to the tail of the freelist in unset_migratetype_isolate()

2020-10-05 Thread David Hildenbrand
On 05.10.20 10:20, Mel Gorman wrote:
> On Mon, Oct 05, 2020 at 08:56:48AM +0200, Michal Hocko wrote:
>> On Fri 02-10-20 17:20:09, David Hildenbrand wrote:
>>> On 02.10.20 15:24, Michal Hocko wrote:
>>>> On Mon 28-09-20 20:21:08, David Hildenbrand wrote:
>>>>> Page isolation doesn't actually touch the pages, it simply isolates
>>>>> pageblocks and moves all free pages to the MIGRATE_ISOLATE freelist.
>>>>>
>>>>> We already place pages to the tail of the freelists when undoing
>>>>> isolation via __putback_isolated_page(), let's do it in any case
>>>>> (e.g., if order <= pageblock_order) and document the behavior.
>>>>>
>>>>> Add a "to_tail" parameter to move_freepages_block() but introduce a
>>>>> a new move_to_free_list_tail() - similar to add_to_free_list_tail().
>>>>>
>>>>> This change results in all pages getting onlined via online_pages() to
>>>>> be placed to the tail of the freelist.
>>>>
>>>> Is there anything preventing to do this unconditionally? Or in other
>>>> words is any of the existing callers of move_freepages_block benefiting
>>>> from adding to the head?
>>>
>>> 1. mm/page_isolation.c:set_migratetype_isolate()
>>>
>>> We move stuff to the MIGRATE_ISOLATE list, we don't care about the order
>>> there.
>>>
>>> 2. steal_suitable_fallback():
>>>
>>> I don't think we care too much about the order when already stealing
>>> pageblocks ... and the freelist is empty I guess?
>>>
>>> 3. reserve_highatomic_pageblock()/unreserve_highatomic_pageblock()
>>>
>>> Not sure if we really care.
>>
>> Honestly, I have no idea. I can imagine that some atomic high order
>> workloads (e.g. in net) might benefit from cache line hot pages but I am
>> not sure this is really observable.
> 
> The highatomic reserve is more concerned about the allocation
> succeeding than it is about cache hotness.
> 

Thanks Mel and Michal. I'll simplify this patch then - and if it turns
out to be an actual problem, we can change that one instance, adding a
proper comment.

Thanks!

-- 
Thanks,

David / dhildenb




Re: [PATCH v1 3/5] mm/page_alloc: always move pages to the tail of the freelist in unset_migratetype_isolate()

2020-10-02 Thread David Hildenbrand
On 02.10.20 15:24, Michal Hocko wrote:
> On Mon 28-09-20 20:21:08, David Hildenbrand wrote:
>> Page isolation doesn't actually touch the pages, it simply isolates
>> pageblocks and moves all free pages to the MIGRATE_ISOLATE freelist.
>>
>> We already place pages to the tail of the freelists when undoing
>> isolation via __putback_isolated_page(), let's do it in any case
>> (e.g., if order <= pageblock_order) and document the behavior.
>>
>> Add a "to_tail" parameter to move_freepages_block() but introduce a
>> a new move_to_free_list_tail() - similar to add_to_free_list_tail().
>>
>> This change results in all pages getting onlined via online_pages() to
>> be placed to the tail of the freelist.
> 
> Is there anything preventing to do this unconditionally? Or in other
> words is any of the existing callers of move_freepages_block benefiting
> from adding to the head?

1. mm/page_isolation.c:set_migratetype_isolate()

We move stuff to the MIGRATE_ISOLATE list, we don't care about the order
there.

2. steal_suitable_fallback():

I don't think we care too much about the order when already stealing
pageblocks ... and the freelist is empty I guess?

3. reserve_highatomic_pageblock()/unreserve_highatomic_pageblock()

Not sure if we really care.

Good question, I tried to be careful of what I touch. Thoughts?

-- 
Thanks,

David / dhildenb




Re: [PATCH v1 4/5] mm/page_alloc: place pages to tail in __free_pages_core()

2020-10-02 Thread David Hildenbrand
On 02.10.20 15:41, Michal Hocko wrote:
> On Mon 28-09-20 20:21:09, David Hildenbrand wrote:
>> __free_pages_core() is used when exposing fresh memory to the buddy
>> during system boot and when onlining memory in generic_online_page().
>>
>> generic_online_page() is used in two cases:
>>
>> 1. Direct memory onlining in online_pages().
>> 2. Deferred memory onlining in memory-ballooning-like mechanisms (HyperV
>>balloon and virtio-mem), when parts of a section are kept
>>fake-offline to be fake-onlined later on.
>>
>> In 1, we already place pages to the tail of the freelist. Pages will be
>> freed to MIGRATE_ISOLATE lists first and moved to the tail of the freelists
>> via undo_isolate_page_range().
>>
>> In 2, we currently don't implement a proper rule. In case of virtio-mem,
>> where we currently always online MAX_ORDER - 1 pages, the pages will be
>> placed to the HEAD of the freelist - undesirable. While the hyper-v
>> balloon calls generic_online_page() with single pages, usually it will
>> call it on successive single pages in a larger block.
>>
>> The pages are fresh, so place them to the tail of the freelists and avoid
>> the PCP. In __free_pages_core(), remove the now superfluous call to
>> set_page_refcounted() and add a comment regarding page initialization and
>> the refcount.
>>
>> Note: In 2. we currently don't shuffle. If ever relevant (page shuffling
>> is usually of limited use in virtualized environments), we might want to
>> shuffle after a sequence of generic_online_page() calls in the
>> relevant callers.
> 
> It took some time to get through all the freeing paths with subtle
> differences but this looks reasonable. You are mentioning that this
> influences a boot time free memory ordering as well but only very
> briefly. I do not expect this to make a huge difference but who knows.
> It makes some sense to add pages in the order they show up in the
> physical address ordering.

I think boot memory is mostly exposed in physical address order.
In that case, higher addresses are now less likely to be used immediately
after this patch. I also don't think it's an issue - and if it ever turns
out to be one, it's fairly easy to change again.

Thanks!

-- 
Thanks,

David / dhildenb




Re: [PATCH v1 1/5] mm/page_alloc: convert "report" flag of __free_one_page() to a proper flag

2020-10-02 Thread David Hildenbrand
On 02.10.20 16:48, David Hildenbrand wrote:
> On 02.10.20 15:41, Matthew Wilcox wrote:
>> On Mon, Sep 28, 2020 at 08:21:06PM +0200, David Hildenbrand wrote:
>>> Let's prepare for additional flags and avoid long parameter lists of bools.
>>> Follow-up patches will also make use of the flags in __free_pages_ok(),
>>> however, I wasn't able to come up with a better name for the type - should
>>> be good enough for internal purposes.
>>
>>> +/* Free One Page flags: for internal, non-pcp variants of free_pages(). */
>>> +typedef int __bitwise fop_t;
>>
>> That invites confusion with f_op.  There's no reason to use _t as a suffix
>> here ... why not free_f?
> 
> git grep "bitwise" | grep typedef | grep include/linux
> 
> indicates that "_t" is the right thing to do.
> 
> I want a name that highlights that it is for the internal variants of
> free_page(); free_f / free_t is too generic.
> 
> fpi_t (Free Page Internal) ?
> 
>>
>>> +/*
>>> + * Skip free page reporting notification for the (possibly merged) page. 
>>> (will
>>> + * *not* mark the page reported, only skip the notification).
>>
>> ... Don't you mean "will not skip marking the page as reported, only
>> skip the notification"?
> 
> Yeah, I can use that.

Reading again, it doesn't quite fit. Marking pages as reported is
handled by mm/page_reporting.c

/*
 * Skip free page reporting notification for the (possibly merged) page.
 * This does not hinder free page reporting from grabbing the page,
 * reporting it and marking it "reported" -  it only skips notifying
 * the free page reporting infrastructure about a newly freed page. For
 * example, used when temporarily pulling a page from the freelist and
 * putting it back unmodified.
 */

Is that clearer?

-- 
Thanks,

David / dhildenb




Re: [PATCH v1 1/5] mm/page_alloc: convert "report" flag of __free_one_page() to a proper flag

2020-10-02 Thread David Hildenbrand
On 02.10.20 15:41, Matthew Wilcox wrote:
> On Mon, Sep 28, 2020 at 08:21:06PM +0200, David Hildenbrand wrote:
>> Let's prepare for additional flags and avoid long parameter lists of bools.
>> Follow-up patches will also make use of the flags in __free_pages_ok(),
>> however, I wasn't able to come up with a better name for the type - should
>> be good enough for internal purposes.
> 
>> +/* Free One Page flags: for internal, non-pcp variants of free_pages(). */
>> +typedef int __bitwise fop_t;
> 
> That invites confusion with f_op.  There's no reason to use _t as a suffix
> here ... why not free_f?

git grep "bitwise" | grep typedef | grep include/linux

indicates that "_t" is the right thing to do.

I want a name that highlights that it is for the internal variants of
free_page(); free_f / free_t is too generic.

fpi_t (Free Page Internal) ?

> 
>> +/*
>> + * Skip free page reporting notification for the (possibly merged) page. 
>> (will
>> + * *not* mark the page reported, only skip the notification).
> 
> ... Don't you mean "will not skip marking the page as reported, only
> skip the notification"?

Yeah, I can use that.

The way free page reporting works is that

1. Free page reporting infrastructure will get notified after buddy
merging about a newly freed page.

2. Once a certain threshold of free pages is reached, it will pull pages
from the freelist, report them, and mark them as reported. (see
mm/page_reporting.c)

During 2., we didn't actually free a "new page"; we only temporarily
removed it from the list - that's why we have to skip the notification.

What we do here is skip 1., not 2.
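
A rough sketch of step 2 (not the actual mm/page_reporting.c code; locking
and error handling omitted, the function name is made up):

static void report_one_page_sketch(struct page *page, unsigned int order,
                                   int mt)
{
        /* step 2: temporarily pull the free page off the freelist ... */
        if (!__isolate_free_page(page, order))
                return;

        /* ... report it, e.g., to the hypervisor (elided) ... */

        /*
         * ... and hand it back unmodified. __putback_isolated_page() passes
         * FOP_SKIP_REPORT_NOTIFY internally, so step 1 is not triggered
         * again for a page that was never newly freed.
         */
        __putback_isolated_page(page, order, mt);
}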

> 
> *reads code*
> 
> No, I'm still confused.  What does this sentence mean?
> 
> Would it help to have a FOP_DEFAULT that has FOP_REPORT_NOTIFY set and
> then a FOP_SKIP_REPORT_NOTIFY define that is 0?

Hmm, I'm not entirely sure if that improves the situation. Then, I need
3 defines instead of two, and an "inverse" documentation for
FOP_REPORT_NOTIFY.

> 
>> -static inline void __free_one_page(struct page *page,
>> -unsigned long pfn,
>> -struct zone *zone, unsigned int order,
>> -int migratetype, bool report)
>> +static inline void __free_one_page(struct page *page, unsigned long pfn,
>> +   struct zone *zone, unsigned int order,
>> +   int migratetype, fop_t fop_flags)
> 
> Please don't over-indent like this.
> 
> static inline void __free_one_page(struct page *page, unsigned long pfn,
>   struct zone *zone, unsigned int order, int migratetype,
>   fop_t fop_flags)
> 
> reads just as well and then if someone needs to delete the 'static'
> later, they don't need to fiddle around with subsequent lines getting
> the whitespace to line up again.
> 

I don't care too much about this specific instance and can fix it up.
(this is clearly a matter of personal taste)

Thanks!

-- 
Thanks,

David / dhildenb




Re: [PATCH v1 4/5] mm/page_alloc: place pages to tail in __free_pages_core()

2020-09-29 Thread David Hildenbrand
On 29.09.20 11:36, Wei Yang wrote:
> On Mon, Sep 28, 2020 at 08:21:09PM +0200, David Hildenbrand wrote:
>> __free_pages_core() is used when exposing fresh memory to the buddy
>> during system boot and when onlining memory in generic_online_page().
>>
>> generic_online_page() is used in two cases:
>>
>> 1. Direct memory onlining in online_pages().
>> 2. Deferred memory onlining in memory-ballooning-like mechanisms (HyperV
>>   balloon and virtio-mem), when parts of a section are kept
>>   fake-offline to be fake-onlined later on.
>>
>> In 1, we already place pages to the tail of the freelist. Pages will be
>> freed to MIGRATE_ISOLATE lists first and moved to the tail of the freelists
>> via undo_isolate_page_range().
>>
>> In 2, we currently don't implement a proper rule. In case of virtio-mem,
>> where we currently always online MAX_ORDER - 1 pages, the pages will be
>> placed to the HEAD of the freelist - undesirable. While the hyper-v
>> balloon calls generic_online_page() with single pages, usually it will
>> call it on successive single pages in a larger block.
>>
>> The pages are fresh, so place them to the tail of the freelists and avoid
>> the PCP. In __free_pages_core(), remove the now superfluous call to
>> set_page_refcounted() and add a comment regarding page initialization and
>> the refcount.
>>
>> Note: In 2. we currently don't shuffle. If ever relevant (page shuffling
>> is usually of limited use in virtualized environments), we might want to
>> shuffle after a sequence of generic_online_page() calls in the
>> relevant callers.
>>
>> Reviewed-by: Vlastimil Babka 
>> Reviewed-by: Oscar Salvador 
>> Cc: Andrew Morton 
>> Cc: Alexander Duyck 
>> Cc: Mel Gorman 
>> Cc: Michal Hocko 
>> Cc: Dave Hansen 
>> Cc: Vlastimil Babka 
>> Cc: Wei Yang 
>> Cc: Oscar Salvador 
>> Cc: Mike Rapoport 
>> Cc: "K. Y. Srinivasan" 
>> Cc: Haiyang Zhang 
>> Cc: Stephen Hemminger 
>> Cc: Wei Liu 
>> Signed-off-by: David Hildenbrand 
>> ---
>> mm/page_alloc.c | 37 -
>> 1 file changed, 24 insertions(+), 13 deletions(-)
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index d5a5f528b8ca..8a2134fe9947 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -270,7 +270,8 @@ bool pm_suspended_storage(void)
>> unsigned int pageblock_order __read_mostly;
>> #endif
>>
>> -static void __free_pages_ok(struct page *page, unsigned int order);
>> +static void __free_pages_ok(struct page *page, unsigned int order,
>> +fop_t fop_flags);
>>
>> /*
>>  * results with 256, 32 in the lowmem_reserve sysctl:
>> @@ -682,7 +683,7 @@ static void bad_page(struct page *page, const char 
>> *reason)
>> void free_compound_page(struct page *page)
>> {
>>  mem_cgroup_uncharge(page);
>> -__free_pages_ok(page, compound_order(page));
>> +__free_pages_ok(page, compound_order(page), FOP_NONE);
>> }
>>
>> void prep_compound_page(struct page *page, unsigned int order)
>> @@ -1419,17 +1420,15 @@ static void free_pcppages_bulk(struct zone *zone, 
>> int count,
>>  spin_unlock(&zone->lock);
>> }
>>
>> -static void free_one_page(struct zone *zone,
>> -struct page *page, unsigned long pfn,
>> -unsigned int order,
>> -int migratetype)
>> +static void free_one_page(struct zone *zone, struct page *page, unsigned 
>> long pfn,
>> +  unsigned int order, int migratetype, fop_t fop_flags)
>> {
>>  spin_lock(&zone->lock);
>>  if (unlikely(has_isolate_pageblock(zone) ||
>>  is_migrate_isolate(migratetype))) {
>>  migratetype = get_pfnblock_migratetype(page, pfn);
>>  }
>> -__free_one_page(page, pfn, zone, order, migratetype, FOP_NONE);
>> +__free_one_page(page, pfn, zone, order, migratetype, fop_flags);
>>  spin_unlock(&zone->lock);
>> }
>>
>> @@ -1507,7 +1506,8 @@ void __meminit reserve_bootmem_region(phys_addr_t 
>> start, phys_addr_t end)
>>  }
>> }
>>
>> -static void __free_pages_ok(struct page *page, unsigned int order)
>> +static void __free_pages_ok(struct page *page, unsigned int order,
>> +fop_t fop_flags)
>> {
>>  unsigned long flags;
>>  int migratetype;
>> @@ -1519,7 +1519,8 @@ static void __

Re: [PATCH v1 3/5] mm/page_alloc: always move pages to the tail of the freelist in unset_migratetype_isolate()

2020-09-29 Thread David Hildenbrand
On 29.09.20 11:18, Wei Yang wrote:
> On Mon, Sep 28, 2020 at 08:21:08PM +0200, David Hildenbrand wrote:
>> Page isolation doesn't actually touch the pages, it simply isolates
>> pageblocks and moves all free pages to the MIGRATE_ISOLATE freelist.
>>
>> We already place pages to the tail of the freelists when undoing
>> isolation via __putback_isolated_page(), let's do it in any case
>> (e.g., if order <= pageblock_order) and document the behavior.
>>
>> Add a "to_tail" parameter to move_freepages_block() but introduce a
>> a new move_to_free_list_tail() - similar to add_to_free_list_tail().

s/a a/a/

>>
>> This change results in all pages getting onlined via online_pages() to
>> be placed to the tail of the freelist.
>>
>> Reviewed-by: Oscar Salvador 
>> Cc: Andrew Morton 
>> Cc: Alexander Duyck 
>> Cc: Mel Gorman 
>> Cc: Michal Hocko 
>> Cc: Dave Hansen 
>> Cc: Vlastimil Babka 
>> Cc: Wei Yang 
>> Cc: Oscar Salvador 
>> Cc: Mike Rapoport 
>> Cc: Scott Cheloha 
>> Cc: Michael Ellerman 
>> Signed-off-by: David Hildenbrand 
>> ---
>> include/linux/page-isolation.h |  4 ++--
>> mm/page_alloc.c| 35 +++---
>> mm/page_isolation.c| 12 +---
>> 3 files changed, 35 insertions(+), 16 deletions(-)
>>
>> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
>> index 572458016331..3eca9b3c5305 100644
>> --- a/include/linux/page-isolation.h
>> +++ b/include/linux/page-isolation.h
>> @@ -36,8 +36,8 @@ static inline bool is_migrate_isolate(int migratetype)
>> struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>   int migratetype, int flags);
>> void set_pageblock_migratetype(struct page *page, int migratetype);
>> -int move_freepages_block(struct zone *zone, struct page *page,
>> -int migratetype, int *num_movable);
>> +int move_freepages_block(struct zone *zone, struct page *page, int 
>> migratetype,
>> + bool to_tail, int *num_movable);
>>
>> /*
>>  * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 9e3ed4a6f69a..d5a5f528b8ca 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -905,6 +905,15 @@ static inline void move_to_free_list(struct page *page, 
>> struct zone *zone,
>>  list_move(&page->lru, &area->free_list[migratetype]);
>> }
>>
>> +/* Used for pages which are on another list */
>> +static inline void move_to_free_list_tail(struct page *page, struct zone 
>> *zone,
>> +  unsigned int order, int migratetype)
>> +{
>> +struct free_area *area = &zone->free_area[order];
>> +
>> +list_move_tail(&page->lru, &area->free_list[migratetype]);
>> +}
>> +
> 
> Would it be better to pass the *to_tail* to move_to_free_list(), so we won't
> have a new function?

Hi,

thanks for the review!

See discussion in RFC + cover letter:

"Add a "to_tail" parameter to move_freepages_block() but introduce a new
move_to_free_list_tail() - similar to add_to_free_list_tail()."
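
For comparison, a sketch of the suggested alternative (not part of the
series): folding the tail variant into move_to_free_list() via a flag,
instead of adding the separate move_to_free_list_tail() that mirrors
add_to_free_list_tail().

static inline void move_to_free_list_sketch(struct page *page,
                                            struct zone *zone,
                                            unsigned int order,
                                            int migratetype, bool to_tail)
{
        struct free_area *area = &zone->free_area[order];

        if (to_tail)
                list_move_tail(&page->lru, &area->free_list[migratetype]);
        else
                list_move(&page->lru, &area->free_list[migratetype]);
}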




[PATCH v1 2/5] mm/page_alloc: place pages to tail in __putback_isolated_page()

2020-09-28 Thread David Hildenbrand
__putback_isolated_page() already documents that pages will be placed to
the tail of the freelist - this is, however, not the case for
"order >= MAX_ORDER - 2" (see buddy_merge_likely()) - which should be
the case for all existing users.

This change affects two users:
- free page reporting
- page isolation, when undoing the isolation (including memory onlining).

This behavior is desirable for pages that haven't really been touched
lately, so exactly the two users that don't actually read/write page
content, but rather move untouched pages.

The new behavior is especially desirable for memory onlining, where we
allow allocation of newly onlined pages via undo_isolate_page_range()
in online_pages(). Right now, we always place them to the head of the
free list, resulting in undesirable behavior: Assume we add
individual memory chunks via add_memory() and online them right away to
the NORMAL zone. We create a dependency chain of unmovable allocations
e.g., via the memmap. The memmap of the next chunk will be placed onto
previous chunks - if the last block cannot get offlined+removed, all
dependent ones cannot get offlined+removed. While this can already be
observed with individual DIMMs, it's more of an issue for virtio-mem
(and I suspect also ppc DLPAR).

Document that this should only be used for optimizations, and no code
should rely on this for correctness (if the order of freepage lists
ever changes).

We won't care about page shuffling: memory onlining already properly
shuffles after onlining. free page reporting doesn't care about
physically contiguous ranges, and there are already cases where page
isolation will simply move (physically close) free pages to (currently)
the head of the freelists via move_freepages_block() instead of
shuffling. If this becomes ever relevant, we should shuffle the whole
zone when undoing isolation of larger ranges, and after
free_contig_range().

Reviewed-by: Alexander Duyck 
Reviewed-by: Oscar Salvador 
Cc: Andrew Morton 
Cc: Alexander Duyck 
Cc: Mel Gorman 
Cc: Michal Hocko 
Cc: Dave Hansen 
Cc: Vlastimil Babka 
Cc: Wei Yang 
Cc: Oscar Salvador 
Cc: Mike Rapoport 
Cc: Scott Cheloha 
Cc: Michael Ellerman 
Signed-off-by: David Hildenbrand 
---
 mm/page_alloc.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index daab90e960fe..9e3ed4a6f69a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -89,6 +89,18 @@ typedef int __bitwise fop_t;
  */
 #define FOP_SKIP_REPORT_NOTIFY ((__force fop_t)BIT(0))
 
+/*
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
+ * shuffle the whole zone).
+ *
+ * Note: No code should rely onto this flag for correctness - it's purely
+ *   to allow for optimizations when handing back either fresh pages
+ *   (memory onlining) or untouched pages (page isolation, free page
+ *   reporting).
+ */
+#define FOP_TO_TAIL		((__force fop_t)BIT(1))
+
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION   (8)
@@ -1038,7 +1050,9 @@ static inline void __free_one_page(struct page *page, 
unsigned long pfn,
 done_merging:
set_page_order(page, order);
 
-   if (is_shuffle_order(order))
+   if (fop_flags & FOP_TO_TAIL)
+   to_tail = true;
+   else if (is_shuffle_order(order))
to_tail = shuffle_pick_tail();
else
to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
@@ -3300,7 +3314,7 @@ void __putback_isolated_page(struct page *page, unsigned 
int order, int mt)
 
/* Return isolated page to tail of freelist. */
__free_one_page(page, page_to_pfn(page), zone, order, mt,
-   FOP_SKIP_REPORT_NOTIFY);
+   FOP_SKIP_REPORT_NOTIFY | FOP_TO_TAIL);
 }
 
 /*
-- 
2.26.2




[PATCH v1 3/5] mm/page_alloc: always move pages to the tail of the freelist in unset_migratetype_isolate()

2020-09-28 Thread David Hildenbrand
Page isolation doesn't actually touch the pages, it simply isolates
pageblocks and moves all free pages to the MIGRATE_ISOLATE freelist.

We already place pages to the tail of the freelists when undoing
isolation via __putback_isolated_page(), let's do it in any case
(e.g., if order <= pageblock_order) and document the behavior.

Add a "to_tail" parameter to move_freepages_block() but introduce a
a new move_to_free_list_tail() - similar to add_to_free_list_tail().

This change results in all pages getting onlined via online_pages() to
be placed to the tail of the freelist.

Reviewed-by: Oscar Salvador 
Cc: Andrew Morton 
Cc: Alexander Duyck 
Cc: Mel Gorman 
Cc: Michal Hocko 
Cc: Dave Hansen 
Cc: Vlastimil Babka 
Cc: Wei Yang 
Cc: Oscar Salvador 
Cc: Mike Rapoport 
Cc: Scott Cheloha 
Cc: Michael Ellerman 
Signed-off-by: David Hildenbrand 
---
 include/linux/page-isolation.h |  4 ++--
 mm/page_alloc.c| 35 +++---
 mm/page_isolation.c| 12 +---
 3 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 572458016331..3eca9b3c5305 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -36,8 +36,8 @@ static inline bool is_migrate_isolate(int migratetype)
 struct page *has_unmovable_pages(struct zone *zone, struct page *page,
 int migratetype, int flags);
 void set_pageblock_migratetype(struct page *page, int migratetype);
-int move_freepages_block(struct zone *zone, struct page *page,
-   int migratetype, int *num_movable);
+int move_freepages_block(struct zone *zone, struct page *page, int migratetype,
+bool to_tail, int *num_movable);
 
 /*
  * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9e3ed4a6f69a..d5a5f528b8ca 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -905,6 +905,15 @@ static inline void move_to_free_list(struct page *page, 
struct zone *zone,
list_move(&page->lru, &area->free_list[migratetype]);
 }
 
+/* Used for pages which are on another list */
+static inline void move_to_free_list_tail(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype)
+{
+   struct free_area *area = &zone->free_area[order];
+
+   list_move_tail(&page->lru, &area->free_list[migratetype]);
+}
+
 static inline void del_page_from_free_list(struct page *page, struct zone 
*zone,
   unsigned int order)
 {
@@ -2338,9 +2347,9 @@ static inline struct page *__rmqueue_cma_fallback(struct 
zone *zone,
  * Note that start_page and end_pages are not aligned on a pageblock
  * boundary. If alignment is required, use move_freepages_block()
  */
-static int move_freepages(struct zone *zone,
- struct page *start_page, struct page *end_page,
- int migratetype, int *num_movable)
+static int move_freepages(struct zone *zone, struct page *start_page,
+ struct page *end_page, int migratetype,
+ bool to_tail, int *num_movable)
 {
struct page *page;
unsigned int order;
@@ -2371,7 +2380,10 @@ static int move_freepages(struct zone *zone,
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
 
order = page_order(page);
-   move_to_free_list(page, zone, order, migratetype);
+   if (to_tail)
+   move_to_free_list_tail(page, zone, order, migratetype);
+   else
+   move_to_free_list(page, zone, order, migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -2379,8 +2391,8 @@ static int move_freepages(struct zone *zone,
return pages_moved;
 }
 
-int move_freepages_block(struct zone *zone, struct page *page,
-   int migratetype, int *num_movable)
+int move_freepages_block(struct zone *zone, struct page *page, int migratetype,
+bool to_tail, int *num_movable)
 {
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;
@@ -2401,7 +2413,7 @@ int move_freepages_block(struct zone *zone, struct page 
*page,
return 0;
 
return move_freepages(zone, start_page, end_page, migratetype,
-   num_movable);
+ to_tail, num_movable);
 }
 
 static void change_pageblock_range(struct page *pageblock_page,
@@ -2526,8 +2538,8 @@ static void steal_suitable_fallback(struct zone *zone, 
struct page *page,
if (!whole_block)
goto single_pa

[PATCH v1 5/5] mm/memory_hotplug: update comment regarding zone shuffling

2020-09-28 Thread David Hildenbrand
As we no longer shuffle via generic_online_page() and when undoing
isolation, we can simplify the comment.

We now effectively shuffle only once (properly) when onlining new
memory.

Cc: Andrew Morton 
Cc: Alexander Duyck 
Cc: Mel Gorman 
Cc: Michal Hocko 
Cc: Dave Hansen 
Cc: Vlastimil Babka 
Cc: Wei Yang 
Cc: Oscar Salvador 
Cc: Mike Rapoport 
Signed-off-by: David Hildenbrand 
---
 mm/memory_hotplug.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9db80ee29caa..c589bd8801bb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -859,13 +859,10 @@ int __ref online_pages(unsigned long pfn, unsigned long 
nr_pages,
undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
 
/*
-* When exposing larger, physically contiguous memory areas to the
-* buddy, shuffling in the buddy (when freeing onlined pages, putting
-* them either to the head or the tail of the freelist) is only helpful
-* for maintaining the shuffle, but not for creating the initial
-* shuffle. Shuffle the whole zone to make sure the just onlined pages
-* are properly distributed across the whole freelist. Make sure to
-* shuffle once pageblocks are no longer isolated.
+* Freshly onlined pages aren't shuffled (e.g., all pages are placed to
+* the tail of the freelist when undoing isolation). Shuffle the whole
+* zone to make sure the just onlined pages are properly distributed
+* across the whole freelist - to create an initial shuffle.
 */
shuffle_zone(zone);
 
-- 
2.26.2




[PATCH v1 1/5] mm/page_alloc: convert "report" flag of __free_one_page() to a proper flag

2020-09-28 Thread David Hildenbrand
Let's prepare for additional flags and avoid long parameter lists of bools.
Follow-up patches will also make use of the flags in __free_pages_ok(),
however, I wasn't able to come up with a better name for the type - should
be good enough for internal purposes.

Reviewed-by: Alexander Duyck 
Reviewed-by: Vlastimil Babka 
Reviewed-by: Oscar Salvador 
Cc: Andrew Morton 
Cc: Alexander Duyck 
Cc: Mel Gorman 
Cc: Michal Hocko 
Cc: Dave Hansen 
Cc: Vlastimil Babka 
Cc: Wei Yang 
Cc: Oscar Salvador 
Cc: Mike Rapoport 
Signed-off-by: David Hildenbrand 
---
 mm/page_alloc.c | 28 
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df90e3654f97..daab90e960fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -77,6 +77,18 @@
 #include "shuffle.h"
 #include "page_reporting.h"
 
+/* Free One Page flags: for internal, non-pcp variants of free_pages(). */
+typedef int __bitwise fop_t;
+
+/* No special request */
+#define FOP_NONE   ((__force fop_t)0)
+
+/*
+ * Skip free page reporting notification for the (possibly merged) page. (will
+ * *not* mark the page reported, only skip the notification).
+ */
+#define FOP_SKIP_REPORT_NOTIFY ((__force fop_t)BIT(0))
+
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION   (8)
@@ -948,10 +960,9 @@ buddy_merge_likely(unsigned long pfn, unsigned long 
buddy_pfn,
  * -- nyc
  */
 
-static inline void __free_one_page(struct page *page,
-   unsigned long pfn,
-   struct zone *zone, unsigned int order,
-   int migratetype, bool report)
+static inline void __free_one_page(struct page *page, unsigned long pfn,
+  struct zone *zone, unsigned int order,
+  int migratetype, fop_t fop_flags)
 {
struct capture_control *capc = task_capc(zone);
unsigned long buddy_pfn;
@@ -1038,7 +1049,7 @@ static inline void __free_one_page(struct page *page,
add_to_free_list(page, zone, order, migratetype);
 
/* Notify page reporting subsystem of freed page */
-   if (report)
+   if (!(fop_flags & FOP_SKIP_REPORT_NOTIFY))
page_reporting_notify_free(order);
 }
 
@@ -1379,7 +1390,7 @@ static void free_pcppages_bulk(struct zone *zone, int 
count,
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
 
-   __free_one_page(page, page_to_pfn(page), zone, 0, mt, true);
+   __free_one_page(page, page_to_pfn(page), zone, 0, mt, FOP_NONE);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
@@ -1395,7 +1406,7 @@ static void free_one_page(struct zone *zone,
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
-   __free_one_page(page, pfn, zone, order, migratetype, true);
+   __free_one_page(page, pfn, zone, order, migratetype, FOP_NONE);
spin_unlock(&zone->lock);
 }
 
@@ -3288,7 +3299,8 @@ void __putback_isolated_page(struct page *page, unsigned 
int order, int mt)
lockdep_assert_held(&zone->lock);
 
/* Return isolated page to tail of freelist. */
-   __free_one_page(page, page_to_pfn(page), zone, order, mt, false);
+   __free_one_page(page, page_to_pfn(page), zone, order, mt,
+   FOP_SKIP_REPORT_NOTIFY);
 }
 
 /*
-- 
2.26.2




[PATCH v1 0/5] mm: place pages to the freelist tail when onlining and undoing isolation

2020-09-28 Thread David Hildenbrand
r the 2MB memmap is needed, a just-onlined 4MB page will
be split. The remaining 2MB page will be used for the memmap of the next
memory block. So one memory block will hold the memmap of the two following
memory blocks. Finally the pages of the last-onlined memory block will get
used for the next bigger allocations - if any allocation is unmovable,
all dependent memory blocks cannot get unplugged and removed until that
allocation is gone.

Note that with bigger memory blocks (e.g., 256MB), *all* memory
blocks are dependent and none can get unplugged again!

b) Experiment with memory intensive workload

I performed an experiment with an older version of this patch set
(before we used undo_isolate_page_range() in online_pages()):
Hotplug 56GB to a VM with an initial 4GB, onlining all memory to
ZONE_NORMAL right from the kernel when adding it. I then ran various
memory intensive workloads that consume most system memory for a total of
45 minutes. Once finished, I tried to unplug as much memory as possible.

With this change, I am able to remove via virtio-mem (adding individual
128MB memory blocks) 413 out of 448 added memory blocks. Via individual
(256MB) DIMMs 380 out of 448 added memory blocks. (I don't have any numbers
without this patchset, but looking at the above example, it's at most half
of the 448 memory blocks for virtio-mem, and most probably none for DIMMs).

Again, there are workloads that might behave very differently due to the
nature of ZONE_NORMAL.

This change also affects (besides memory onlining):
- Other users of undo_isolate_page_range(): Pages are always placed to the
  tail.
-- When memory offlining fails
-- When memory isolation fails after having isolated some pageblocks
-- When alloc_contig_range() either succeeds or fails
- Other users of __putback_isolated_page(): Pages are always placed to the
  tail.
-- Free page reporting
- Other users of __free_pages_core()
-- AFAIK, any memory that is getting exposed to the buddy during boot.
   IIUC we will now usually allocate memory from lower addresses within
   a zone first (especially during boot).
- Other users of generic_online_page()
-- Hyper-V balloon

RFC -> v1:
- Tweak some patch descriptions
- "mm/page_alloc: place pages to tail in __putback_isolated_page()"
-- FOP_TO_TAIL now has higher precedence than page shuffling
-- Add a note that nothing should rely on FOP_TO_TAIL for correctness
- "mm/page_alloc: always move pages to the tail of the freelist in
   unset_migratetype_isolate()"
-- Use "bool" parameter for move_freepages_block() as requested
- "mm/page_alloc: place pages to tail in __free_pages_core()"
-- Eliminate set_page_refcounted() + page_ref_dec() and add a comment
- "mm/memory_hotplug: update comment regarding zone shuffling"
-- Added

David Hildenbrand (5):
  mm/page_alloc: convert "report" flag of __free_one_page() to a proper
flag
  mm/page_alloc: place pages to tail in __putback_isolated_page()
  mm/page_alloc: always move pages to the tail of the freelist in
unset_migratetype_isolate()
  mm/page_alloc: place pages to tail in __free_pages_core()
  mm/memory_hotplug: update comment regarding zone shuffling

 include/linux/page-isolation.h |   4 +-
 mm/memory_hotplug.c|  11 ++--
 mm/page_alloc.c| 114 -
 mm/page_isolation.c|  12 +++-
 4 files changed, 97 insertions(+), 44 deletions(-)

-- 
2.26.2




[PATCH v1 4/5] mm/page_alloc: place pages to tail in __free_pages_core()

2020-09-28 Thread David Hildenbrand
__free_pages_core() is used when exposing fresh memory to the buddy
during system boot and when onlining memory in generic_online_page().

generic_online_page() is used in two cases:

1. Direct memory onlining in online_pages().
2. Deferred memory onlining in memory-ballooning-like mechanisms (HyperV
   balloon and virtio-mem), when parts of a section are kept
   fake-offline to be fake-onlined later on.

In 1, we already place pages to the tail of the freelist. Pages will be
freed to MIGRATE_ISOLATE lists first and moved to the tail of the freelists
via undo_isolate_page_range().

In 2, we currently don't implement a proper rule. In case of virtio-mem,
where we currently always online MAX_ORDER - 1 pages, the pages will be
placed to the HEAD of the freelist - undesirable. While the hyper-v
balloon calls generic_online_page() with single pages, usually it will
call it on successive single pages in a larger block.

The pages are fresh, so place them to the tail of the freelists and avoid
the PCP. In __free_pages_core(), remove the now superfluous call to
set_page_refcounted() and add a comment regarding page initialization and
the refcount.

Note: In 2, we currently don't shuffle. If ever relevant (page shuffling
is usually of limited use in virtualized environments), we might want to
shuffle after a sequence of generic_online_page() calls in the
relevant callers.

Reviewed-by: Vlastimil Babka 
Reviewed-by: Oscar Salvador 
Cc: Andrew Morton 
Cc: Alexander Duyck 
Cc: Mel Gorman 
Cc: Michal Hocko 
Cc: Dave Hansen 
Cc: Vlastimil Babka 
Cc: Wei Yang 
Cc: Oscar Salvador 
Cc: Mike Rapoport 
Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Stephen Hemminger 
Cc: Wei Liu 
Signed-off-by: David Hildenbrand 
---
 mm/page_alloc.c | 37 -
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d5a5f528b8ca..8a2134fe9947 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -270,7 +270,8 @@ bool pm_suspended_storage(void)
 unsigned int pageblock_order __read_mostly;
 #endif
 
-static void __free_pages_ok(struct page *page, unsigned int order);
+static void __free_pages_ok(struct page *page, unsigned int order,
+   fop_t fop_flags);
 
 /*
  * results with 256, 32 in the lowmem_reserve sysctl:
@@ -682,7 +683,7 @@ static void bad_page(struct page *page, const char *reason)
 void free_compound_page(struct page *page)
 {
mem_cgroup_uncharge(page);
-   __free_pages_ok(page, compound_order(page));
+   __free_pages_ok(page, compound_order(page), FOP_NONE);
 }
 
 void prep_compound_page(struct page *page, unsigned int order)
@@ -1419,17 +1420,15 @@ static void free_pcppages_bulk(struct zone *zone, int count,
spin_unlock(&zone->lock);
 }
 
-static void free_one_page(struct zone *zone,
-   struct page *page, unsigned long pfn,
-   unsigned int order,
-   int migratetype)
+static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn,
+ unsigned int order, int migratetype, fop_t fop_flags)
 {
spin_lock(&zone->lock);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
-   __free_one_page(page, pfn, zone, order, migratetype, FOP_NONE);
+   __free_one_page(page, pfn, zone, order, migratetype, fop_flags);
spin_unlock(&zone->lock);
 }
 
@@ -1507,7 +1506,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
}
 }
 
-static void __free_pages_ok(struct page *page, unsigned int order)
+static void __free_pages_ok(struct page *page, unsigned int order,
+   fop_t fop_flags)
 {
unsigned long flags;
int migratetype;
@@ -1519,7 +1519,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
-   free_one_page(page_zone(page), page, pfn, order, migratetype);
+   free_one_page(page_zone(page), page, pfn, order, migratetype,
+ fop_flags);
local_irq_restore(flags);
 }
 
@@ -1529,6 +1530,11 @@ void __free_pages_core(struct page *page, unsigned int order)
struct page *p = page;
unsigned int loop;
 
+   /*
+* When initializing the memmap, init_single_page() sets the refcount
+* of all pages to 1 ("allocated"/"not free"). We have to set the
+* refcount of all involved pages to 0.
+*/
prefetchw(p);
for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
prefetchw(p + 1);
@@ -1539,8 +1545,12 @@ void __free_pages_core(struct page *page, unsign

Re: [PATCH RFC 4/4] mm/page_alloc: place pages to tail in __free_pages_core()

2020-09-28 Thread David Hildenbrand
On 28.09.20 09:58, Oscar Salvador wrote:
> On Wed, Sep 16, 2020 at 08:34:11PM +0200, David Hildenbrand wrote:
>> @@ -1523,7 +1524,13 @@ void __free_pages_core(struct page *page, unsigned 
>> int order)
>>  
>>  atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
>>  set_page_refcounted(page);
>> -__free_pages(page, order);
>> +
>> +/*
>> + * Bypass PCP and place fresh pages right to the tail, primarily
>> + * relevant for memory onlining.
>> + */
>> +page_ref_dec(page);
>> +__free_pages_ok(page, order, FOP_TO_TAIL);
> 
> Sorry, I must be missing something obvious here, but I am a bit confused here.
> I get the part of placing them at the tail so rmqueue_bulk() won't
> find them, but I do not get why we decrement page's refcount.
> IIUC, its refcount will be 0, but why do we want to do that?
> 
> Another thing a bit unrelated... we mess three times with page's refcount
> (two before this patch).
> Why do we have this dance in place?

Hi Oscar!

Old code:

set_page_refcounted(): sets the refcount to 1.
__free_pages()
  -> put_page_testzero(): sets it to 0
  -> free_the_page()->__free_pages_ok()

New code:

set_page_refcounted(): sets the refcount to 1.
page_ref_dec(page): sets it to 0
__free_pages_ok(): frees the page (the refcount is already 0)


We could skip the set_page_refcounted() + page_ref_dec(page) dance and
lose a couple of sanity checks; instead, we could simply use a
VM_BUG_ON_PAGE(page_ref_count(page), page), which is what we really care
about when onlining memory.
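
(Just to illustrate - roughly what the end of __free_pages_core() could
look like once the refcount dance is gone; a sketch only, not necessarily
the exact final code, and the VM_BUG_ON_PAGE() placement is only a
suggestion:)

	/*
	 * The loop above already set the refcount of all involved pages
	 * to 0, so just assert that and free directly, bypassing the PCP
	 * and placing the fresh pages to the tail of the freelist.
	 */
	VM_BUG_ON_PAGE(page_ref_count(page), page);
	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
	__free_pages_ok(page, order, FOP_TO_TAIL);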

-- 
Thanks,

David / dhildenb




Re: [PATCH RFC 2/4] mm/page_alloc: place pages to tail in __putback_isolated_page()

2020-09-25 Thread David Hildenbrand
On 25.09.20 15:19, Oscar Salvador wrote:
> On Wed, Sep 16, 2020 at 08:34:09PM +0200, David Hildenbrand wrote:
>> __putback_isolated_page() already documents that pages will be placed to
>> the tail of the freelist - this is, however, not the case for
>> "order >= MAX_ORDER - 2" (see buddy_merge_likely()) - which should be
>> the case for all existing users.
>>
>> This change affects two users:
>> - free page reporting
>> - page isolation, when undoing the isolation.
>>
>> This behavior is desireable for pages that haven't really been touched
>> lately, so exactly the two users that don't actually read/write page
>> content, but rather move untouched pages.
>>
>> The new behavior is especially desirable for memory onlining, where we
>> allow allocation of newly onlined pages via undo_isolate_page_range()
>> in online_pages(). Right now, we always place them to the head of the
>> free list, resulting in undesireable behavior: Assume we add
>> individual memory chunks via add_memory() and online them right away to
>> the NORMAL zone. We create a dependency chain of unmovable allocations
>> e.g., via the memmap. The memmap of the next chunk will be placed onto
>> previous chunks - if the last block cannot get offlined+removed, all
>> dependent ones cannot get offlined+removed. While this can already be
>> observed with individual DIMMs, it's more of an issue for virtio-mem
>> (and I suspect also ppc DLPAR).
>>
>> Note: If we observe a degradation due to the changed page isolation
>> behavior (which I doubt), we can always make this configurable by the
>> instance triggering undo of isolation (e.g., alloc_contig_range(),
>> memory onlining, memory offlining).
>>
>> Cc: Andrew Morton 
>> Cc: Alexander Duyck 
>> Cc: Mel Gorman 
>> Cc: Michal Hocko 
>> Cc: Dave Hansen 
>> Cc: Vlastimil Babka 
>> Cc: Wei Yang 
>> Cc: Oscar Salvador 
>> Cc: Mike Rapoport 
>> Cc: Scott Cheloha 
>> Cc: Michael Ellerman 
>> Signed-off-by: David Hildenbrand 
> 
> LGTM, the only thing is the shuffe_zone topic that Wei and Vlastimil rose.
> Feels a bit odd that takes precedence over something we explicitily demanded.
>

Thanks, yeah I'll be changing that.

> With the comment Vlastimil suggested:
> 
> Reviewed-by: Oscar Salvador 
> 

-- 
Thanks,

David / dhildenb




Re: [PATCH RFC 2/4] mm/page_alloc: place pages to tail in __putback_isolated_page()

2020-09-25 Thread David Hildenbrand
On 24.09.20 12:37, Vlastimil Babka wrote:
> On 9/16/20 8:34 PM, David Hildenbrand wrote:
>> __putback_isolated_page() already documents that pages will be placed to
>> the tail of the freelist - this is, however, not the case for
>> "order >= MAX_ORDER - 2" (see buddy_merge_likely()) - which should be
>> the case for all existing users.
> 
> I think here should be a sentence saying something along "Thus this patch
> introduces a FOP_TO_TAIL flag to really ensure moving pages to tail."

Agreed, thanks!

> 
>> This change affects two users:
>> - free page reporting
>> - page isolation, when undoing the isolation.
>>
>> This behavior is desireable for pages that haven't really been touched
>> lately, so exactly the two users that don't actually read/write page
>> content, but rather move untouched pages.
>>
>> The new behavior is especially desirable for memory onlining, where we
>> allow allocation of newly onlined pages via undo_isolate_page_range()
>> in online_pages(). Right now, we always place them to the head of the
>> free list, resulting in undesireable behavior: Assume we add
>> individual memory chunks via add_memory() and online them right away to
>> the NORMAL zone. We create a dependency chain of unmovable allocations
>> e.g., via the memmap. The memmap of the next chunk will be placed onto
>> previous chunks - if the last block cannot get offlined+removed, all
>> dependent ones cannot get offlined+removed. While this can already be
>> observed with individual DIMMs, it's more of an issue for virtio-mem
>> (and I suspect also ppc DLPAR).
>>
>> Note: If we observe a degradation due to the changed page isolation
>> behavior (which I doubt), we can always make this configurable by the
>> instance triggering undo of isolation (e.g., alloc_contig_range(),
>> memory onlining, memory offlining).
>>
>> Cc: Andrew Morton 
>> Cc: Alexander Duyck 
>> Cc: Mel Gorman 
>> Cc: Michal Hocko 
>> Cc: Dave Hansen 
>> Cc: Vlastimil Babka 
>> Cc: Wei Yang 
>> Cc: Oscar Salvador 
>> Cc: Mike Rapoport 
>> Cc: Scott Cheloha 
>> Cc: Michael Ellerman 
>> Signed-off-by: David Hildenbrand 
>> ---
>>  mm/page_alloc.c | 10 +-
>>  1 file changed, 9 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 91cefb8157dd..bba9a0f60c70 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -89,6 +89,12 @@ typedef int __bitwise fop_t;
>>   */
>>  #define FOP_SKIP_REPORT_NOTIFY  ((__force fop_t)BIT(0))
>>  
>> +/*
>> + * Place the freed page to the tail of the freelist after buddy merging. 
>> Will
>> + * get ignored with page shuffling enabled.
>> + */
>> +#define FOP_TO_TAIL ((__force fop_t)BIT(1))
>> +
>>  /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
>>  static DEFINE_MUTEX(pcp_batch_high_lock);
>>  #define MIN_PERCPU_PAGELIST_FRACTION(8)
>> @@ -1040,6 +1046,8 @@ static inline void __free_one_page(struct page *page, 
>> unsigned long pfn,
>>  
>>  if (is_shuffle_order(order))
>>  to_tail = shuffle_pick_tail();
>> +else if (fop_flags & FOP_TO_TAIL)
>> +to_tail = true;
> 
> Should we really let random shuffling decision have a larger priority than
> explicit FOP_TO_TAIL request? Wei Yang mentioned that there's a call to
> shuffle_zone() anyway to process a freshly added memory, so we don't need to 
> do
> that also during the process of addition itself? Might help with your goal of
> reducing dependencies even on systems that do have shuffling enabled?

So, we do have cases where generic_online_page() -> __free_pages_core()
is called without a subsequent shuffle_zone() (see patch #4):

generic_online_page() is used in two cases:

1. Direct memory onlining in online_pages(). Here, we call
   shuffle_zone().
2. Deferred memory onlining in memory-ballooning-like mechanisms (HyperV
   balloon and virtio-mem), when parts of a section are kept
   fake-offline to be fake-onlined later on.

While we shuffle the whole zone in the first case, we wouldn't
shuffle in the second case.

But maybe this should be tackled (just like when a large contiguous range
is freed again after alloc_contig_range(), when memory offlining fails, or
when alloc_contig_range() fails) by manually shuffling the zone again. That
would be cleaner, and the right thing to do when exposing large, contiguous
ranges again to the buddy.
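
(FWIW, giving the explicit FOP_TO_TAIL request precedence over shuffling
would roughly turn the check in __free_one_page() into the following - a
sketch only, the last branch is copied from the existing code:)

	if (fop_flags & FOP_TO_TAIL)
		to_tail = true;
	else if (is_shuffle_order(order))
		to_tail = shuffle_pick_tail();
	else
		to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);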

Thanks!


-- 
Thanks,

David / dhildenb




Re: [PATCH RFC 3/4] mm/page_alloc: always move pages to the tail of the freelist in unset_migratetype_isolate()

2020-09-25 Thread David Hildenbrand
On 25.09.20 04:45, Wei Yang wrote:
> On Thu, Sep 24, 2020 at 01:13:29PM +0200, Vlastimil Babka wrote:
>> On 9/16/20 8:34 PM, David Hildenbrand wrote:
>>> Page isolation doesn't actually touch the pages, it simply isolates
>>> pageblocks and moves all free pages to the MIGRATE_ISOLATE freelist.
>>>
>>> We already place pages to the tail of the freelists when undoing
>>> isolation via __putback_isolated_page(), let's do it in any case
>>> (e.g., if order == pageblock_order) and document the behavior.
>>>
>>> This change results in all pages getting onlined via online_pages() to
>>> be placed to the tail of the freelist.
>>>
>>> Cc: Andrew Morton 
>>> Cc: Alexander Duyck 
>>> Cc: Mel Gorman 
>>> Cc: Michal Hocko 
>>> Cc: Dave Hansen 
>>> Cc: Vlastimil Babka 
>>> Cc: Wei Yang 
>>> Cc: Oscar Salvador 
>>> Cc: Mike Rapoport 
>>> Cc: Scott Cheloha 
>>> Cc: Michael Ellerman 
>>> Signed-off-by: David Hildenbrand 
>>> ---
>>>  include/linux/page-isolation.h |  2 ++
>>>  mm/page_alloc.c| 36 +-
>>>  mm/page_isolation.c|  8 ++--
>>>  3 files changed, 39 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
>>> index 572458016331..a36be2cf4dbb 100644
>>> --- a/include/linux/page-isolation.h
>>> +++ b/include/linux/page-isolation.h
>>> @@ -38,6 +38,8 @@ struct page *has_unmovable_pages(struct zone *zone, 
>>> struct page *page,
>>>  void set_pageblock_migratetype(struct page *page, int migratetype);
>>>  int move_freepages_block(struct zone *zone, struct page *page,
>>> int migratetype, int *num_movable);
>>> +int move_freepages_block_tail(struct zone *zone, struct page *page,
>>> + int migratetype);
>>>  
>>>  /*
>>>   * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index bba9a0f60c70..75b0f49b4022 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -899,6 +899,15 @@ static inline void move_to_free_list(struct page 
>>> *page, struct zone *zone,
>>> list_move(&page->lru, &area->free_list[migratetype]);
>>>  }
>>>  
>>> +/* Used for pages which are on another list */
>>> +static inline void move_to_free_list_tail(struct page *page, struct zone 
>>> *zone,
>>> + unsigned int order, int migratetype)
>>> +{
>>> +   struct free_area *area = &zone->free_area[order];
>>> +
>>> +   list_move_tail(&page->lru, &area->free_list[migratetype]);
>>> +}
>>
>> There are just 3 callers of move_to_free_list() before this patch, I would 
>> just
>> add the to_tail parameter there instead of new wrapper. For callers with
>> constant parameter, the inline will eliminate it anyway.
> 
> Got the same feeling :-)

I once was told boolean parameters are the root of all evil, so I tried
to keep them file-local :)

One thing to be aware of is that inline optimizations won't help as
long as this function is in mm/page_alloc.c, see below.
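
(For illustration, the single-function variant with a "to_tail" parameter
would look roughly like this - a sketch only:)

static inline void move_to_free_list(struct page *page, struct zone *zone,
				     unsigned int order, int migratetype,
				     bool to_tail)
{
	struct free_area *area = &zone->free_area[order];

	/*
	 * With a constant to_tail, an inlined call can drop the branch;
	 * that won't help across translation units, though.
	 */
	if (to_tail)
		list_move_tail(&page->lru, &area->free_list[migratetype]);
	else
		list_move(&page->lru, &area->free_list[migratetype]);
}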

> 
>>
>>>  static inline void del_page_from_free_list(struct page *page, struct zone 
>>> *zone,
>>>unsigned int order)
>>>  {
>>> @@ -2323,7 +2332,7 @@ static inline struct page 
>>> *__rmqueue_cma_fallback(struct zone *zone,
>>>   */
>>>  static int move_freepages(struct zone *zone,
>>>   struct page *start_page, struct page *end_page,
>>> - int migratetype, int *num_movable)
>>> + int migratetype, int *num_movable, bool to_tail)
>>>  {
>>> struct page *page;
>>> unsigned int order;
>>> @@ -2354,7 +2363,10 @@ static int move_freepages(struct zone *zone,
>>> VM_BUG_ON_PAGE(page_zone(page) != zone, page);
>>>  
>>> order = page_order(page);
>>> -   move_to_free_list(page, zone, order, migratetype);
>>> +   if (to_tail)
>>> +   move_to_free_list_tail(page, zone, order, migratetype);
>>> +   else
>>> +   move_to_free_list(p

Re: [PATCH RFC 0/4] mm: place pages to the freelist tail when onling and undoing isolation

2020-09-24 Thread David Hildenbrand
>> If that would ever change, the optimization here would be lost and we
>> would have to think of something else. Nothing would actually break -
>> and it's all kept directly in page_alloc.c
> 
> Sure, but then it can become a pointless code churn.

Indeed, and if there are valid concerns that this will happen in the
near future (e.g., < 1 year), I agree that we should look into
alternatives right from the start. Otherwise it's good enough until some
of the other things I mentioned below become real (which could also take
a while ...).

> 
>> I'd like to stress that what I propose here is both simple and powerful.
>>
>>> possible I think, such as preparing a larger MIGRATE_UNMOVABLE area in the
>>> existing memory before we allocate those long-term management structures. Or
>>> onlining a bunch of blocks as zone_movable first and only later convert to
>>> zone_normal in a controlled way when existing normal zone becomes depeted?
>>
>> I see the following (more or less complicated) alternatives
>>
>> 1) Having a larger MIGRATE_UNMOVABLE area
>>
>> a) Sizing it is difficult. I mean you would have to plan ahead for all
>> memory you might eventually hotplug later - and that could even be
> 
> Yeah, hence my worry about existing interfaces that work on 128MB blocks
> individually without a larger strategy.

Yes, in the works :)

> 
>> impossible if you hotplug quite a lot of memory to a smaller machine.
>> (I've seen people in the vm/container world trying to hotplug 128GB
>> DIMMs to 2GB VMs ... and failing for obvious reasons)
> 
> Some planning should still be possible to maximize the contiguous area without
> unmovable allocations.

Indeed, optimizing that is very high on my list of things to look into ...

>>
>> we would, once again, never be able to allocate a gigantic page because
>> all [N] would contain a memmap.
> 
> The second approach should work, if you know how much you are going to online,
> and plan the size the N group accordingly, and if the onlined amount is 
> several
> gigabytes, then only the first one (or first X) will be unusable for a 
> gigantic
> page, but the rest would be? Can't get much better than that.

Indeed, it's the optimal case (assuming one can come up with a safe zone
balance - which is usually possible, but unfortunately, there are
exceptions one at least has to identify).

[...]

> 
> I've reviewed the series and I won't block it - yes it's an optimistic 
> approach
> that can break and leave us with code churn. But at least it's not that much

Thanks.

I'll try to document somewhere that the behavior of FOP_TO_TAIL is a
pure optimization and might change in the future - along with the case
it tried to optimize (so people know what the use case was).
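
(Roughly something like this on top of the flag's existing comment - exact
wording to be refined:)

/*
 * Place the freed page to the tail of the freelist after buddy merging.
 * Note that this is a pure optimization: nothing must rely on it for
 * correctness, and the exact behavior might change in the future.
 */
#define FOP_TO_TAIL	((__force fop_t)BIT(1))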

> code and the extra test in  __free_one_page() shouldn't make this hotpath too

I assume the compiler is able to completely propagate constants and
optimize that out - I haven't checked, though.

> worse. But I still hope we can achieve a more robust solution one day.

I definitely agree. I'd also prefer some kind of guarantees, but I
learned that things always sound easier than they actually are when it
comes to memory management in Linux ... and they take a lot of time (for
example, Michal's/Oscar's attempts to implement vmemmap on hotadded memory).

-- 
Thanks,

David / dhildenb




Re: [PATCH RFC 0/4] mm: place pages to the freelist tail when onling and undoing isolation

2020-09-24 Thread David Hildenbrand
On 24.09.20 11:40, Mel Gorman wrote:
> On Wed, Sep 23, 2020 at 05:26:06PM +0200, David Hildenbrand wrote:
>>>>> On 2020-09-16 20:34, David Hildenbrand wrote:
>>>>>> When adding separate memory blocks via add_memory*() and onlining them
>>>>>> immediately, the metadata (especially the memmap) of the next block will 
>>>>>> be
>>>>>> placed onto one of the just added+onlined block. This creates a chain
>>>>>> of unmovable allocations: If the last memory block cannot get
>>>>>> offlined+removed() so will all dependant ones. We directly have unmovable
>>>>>> allocations all over the place.
>>>>>> This can be observed quite easily using virtio-mem, however, it can also
>>>>>> be observed when using DIMMs. The freshly onlined pages will usually be
>>>>>> placed to the head of the freelists, meaning they will be allocated next,
>>>>>> turning the just-added memory usually immediately un-removable. The
>>>>>> fresh pages are cold, prefering to allocate others (that might be hot)
>>>>>> also feels to be the natural thing to do.
>>>>>> It also applies to the hyper-v balloon xen-balloon, and ppc64 dlpar: when
>>>>>> adding separate, successive memory blocks, each memory block will have
>>>>>> unmovable allocations on them - for example gigantic pages will fail to
>>>>>> allocate.
>>>>>> While the ZONE_NORMAL doesn't provide any guarantees that memory can get
>>>>>> offlined+removed again (any kind of fragmentation with unmovable
>>>>>> allocations is possible), there are many scenarios (hotplugging a lot of
>>>>>> memory, running workload, hotunplug some memory/as much as possible) 
>>>>>> where
>>>>>> we can offline+remove quite a lot with this patchset.
>>>>>
>>>>> Hi David,
>>>>>
>>>>
>>>> Hi Oscar.
>>>>
>>>>> I did not read through the patchset yet, so sorry if the question is 
>>>>> nonsense, but is this not trying to fix the same issue the vmemmap 
>>>>> patches did? [1]
>>>>
>>>> Not nonesense at all. It only helps to some degree, though. It solves the 
>>>> dependencies due to the memmap. However, it's not completely ideal, 
>>>> especially for single memory blocks.
>>>>
>>>> With single memory blocks (virtio-mem, xen-balloon, hv balloon, ppc dlpar) 
>>>> you still have unmovable (vmemmap chunks) all over the physical address 
>>>> space. Consider the gigantic page example after hotplug. You directly 
>>>> fragmented all hotplugged memory.
>>>>
>>>> Of course, there might be (less extreme) dependencies due page tables for 
>>>> the identity mapping, extended struct pages and similar.
>>>>
>>>> Having that said, there are other benefits when preferring other memory 
>>>> over just hotplugged memory. Think about adding+onlining memory during 
>>>> boot (dimms under QEMU, virtio-mem), once the system is up you will have 
>>>> most (all) of that memory completely untouched.
>>>>
>>>> So while vmemmap on hotplugged memory would tackle some part of the issue, 
>>>> there are cases where this approach is better, and there are even benefits 
>>>> when combining both.
>>>
>>
>> Hi Vlastimil,
>>
>>> I see the point, but I don't think the head/tail mechanism is great for 
>>> this. It
>>> might sort of work, but with other interfering activity there are no 
>>> guarantees
>>> and it relies on a subtle implementation detail. There are better mechanisms
>>
>> For the specified use case of adding+onlining a whole bunch of memory
>> this works just fine. We don't care too much about "other interfering
>> activity" as you mention here, or about guarantees - this is a pure
>> optimization that seems to work just fine in practice.
>>
>> I'm not sure about the "subtle implementation detail" - buddy merging,
>> and head/tail of buddy lists are a basic concept of our page allocator.
>> If that would ever change, the optimization here would be lost and we
>> would have to think of something else. Nothing would actually break -
>> and it's all kept directly in page_alloc.c
>>

Hi Mel,

thanks for your reply.

> 
> It's somewhat subtle

Re: [PATCH RFC 0/4] mm: place pages to the freelist tail when onling and undoing isolation

2020-09-23 Thread David Hildenbrand
On 23.09.20 16:31, Vlastimil Babka wrote:
> On 9/16/20 9:31 PM, David Hildenbrand wrote:
>>
>>
>>> Am 16.09.2020 um 20:50 schrieb osalva...@suse.de:
>>>
>>> On 2020-09-16 20:34, David Hildenbrand wrote:
>>>> When adding separate memory blocks via add_memory*() and onlining them
>>>> immediately, the metadata (especially the memmap) of the next block will be
>>>> placed onto one of the just added+onlined block. This creates a chain
>>>> of unmovable allocations: If the last memory block cannot get
>>>> offlined+removed() so will all dependant ones. We directly have unmovable
>>>> allocations all over the place.
>>>> This can be observed quite easily using virtio-mem, however, it can also
>>>> be observed when using DIMMs. The freshly onlined pages will usually be
>>>> placed to the head of the freelists, meaning they will be allocated next,
>>>> turning the just-added memory usually immediately un-removable. The
>>>> fresh pages are cold, prefering to allocate others (that might be hot)
>>>> also feels to be the natural thing to do.
>>>> It also applies to the hyper-v balloon xen-balloon, and ppc64 dlpar: when
>>>> adding separate, successive memory blocks, each memory block will have
>>>> unmovable allocations on them - for example gigantic pages will fail to
>>>> allocate.
>>>> While the ZONE_NORMAL doesn't provide any guarantees that memory can get
>>>> offlined+removed again (any kind of fragmentation with unmovable
>>>> allocations is possible), there are many scenarios (hotplugging a lot of
>>>> memory, running workload, hotunplug some memory/as much as possible) where
>>>> we can offline+remove quite a lot with this patchset.
>>>
>>> Hi David,
>>>
>>
>> Hi Oscar.
>>
>>> I did not read through the patchset yet, so sorry if the question is 
>>> nonsense, but is this not trying to fix the same issue the vmemmap patches 
>>> did? [1]
>>
>> Not nonesense at all. It only helps to some degree, though. It solves the 
>> dependencies due to the memmap. However, it‘s not completely ideal, 
>> especially for single memory blocks.
>>
>> With single memory blocks (virtio-mem, xen-balloon, hv balloon, ppc dlpar) 
>> you still have unmovable (vmemmap chunks) all over the physical address 
>> space. Consider the gigantic page example after hotplug. You directly 
>> fragmented all hotplugged memory.
>>
>> Of course, there might be (less extreme) dependencies due page tables for 
>> the identity mapping, extended struct pages and similar.
>>
>> Having that said, there are other benefits when preferring other memory over 
>> just hotplugged memory. Think about adding+onlining memory during boot 
>> (dimms under QEMU, virtio-mem), once the system is up you will have most 
>> (all) of that memory completely untouched.
>>
>> So while vmemmap on hotplugged memory would tackle some part of the issue, 
>> there are cases where this approach is better, and there are even benefits 
>> when combining both.
> 

Hi Vlastimil,

> I see the point, but I don't think the head/tail mechanism is great for this. 
> It
> might sort of work, but with other interfering activity there are no 
> guarantees
> and it relies on a subtle implementation detail. There are better mechanisms

For the specified use case of adding+onlining a whole bunch of memory
this works just fine. We don't care too much about "other interfering
activity" as you mention here, or about guarantees - this is a pure
optimization that seems to work just fine in practice.

I'm not sure about the "subtle implementation detail" - buddy merging,
and head/tail of buddy lists are a basic concept of our page allocator.
If that would ever change, the optimization here would be lost and we
would have to think of something else. Nothing would actually break -
and it's all kept directly in page_alloc.c

I'd like to stress that what I propose here is both simple and powerful.

> possible I think, such as preparing a larger MIGRATE_UNMOVABLE area in the
> existing memory before we allocate those long-term management structures. Or
> onlining a bunch of blocks as zone_movable first and only later convert to
> zone_normal in a controlled way when existing normal zone becomes depeted?

I see the following (more or less complicated) alternatives

1) Having a larger MIGRATE_UNMOVABLE area

a) Sizing it is difficult. I mean you would have to plan ahead for all
memory you might eventually hotplug later - and that could eve

Re: [PATCH] qemu/atomic.h: prefix qemu_ to solve collisions

2020-09-22 Thread David Hildenbrand
On 22.09.20 08:56, Paolo Bonzini wrote:
> On 22/09/20 08:45, David Hildenbrand wrote:
>>> It's certainly a good idea but it's quite verbose.
>>>
>>> What about using atomic__* as the prefix?  It is not very common in QEMU
>>> but there are some cases (and I cannot think of anything better).
>>
>> aqomic_*, lol :)
> 
> Actually qatomic_ would be a good one, wouldn't it?

agreed!

-- 
Thanks,

David / dhildenb




Re: [PATCH] qemu/atomic.h: prefix qemu_ to solve collisions

2020-09-21 Thread David Hildenbrand
On 22.09.20 08:27, Paolo Bonzini wrote:
> On 21/09/20 18:23, Stefan Hajnoczi wrote:
>> clang's C11 atomic_fetch_*() functions only take a C11 atomic type
>> pointer argument. QEMU uses direct types (int, etc) and this causes a
>> compiler error when a QEMU code calls these functions in a source file
>> that also included  via a system header file:
>>
>>   $ CC=clang CXX=clang++ ./configure ... && make
>>   ../util/async.c:79:17: error: address argument to atomic operation must be 
>> a pointer to _Atomic type ('unsigned int *' invalid)
>>
>> Avoid using atomic_*() names in QEMU's atomic.h since that namespace is
>> used by . Prefix QEMU's APIs with qemu_ so that atomic.h
>> and  can co-exist.
>>
>> This patch was generated using:
>>
>>   $ git diff | grep -o '\<atomic_[a-z0-9_]*\>' >/tmp/changed_identifiers
>>   $ for identifier in $(</tmp/changed_identifiers); do \
>> sed -i "s%\<$identifier\>%qemu_$identifier%" $(git grep -l \
>> "\<$identifier\>") \
>> done
> 
> It's certainly a good idea but it's quite verbose.
> 
> What about using atomic__* as the prefix?  It is not very common in QEMU
> but there are some cases (and I cannot think of anything better).
> 

aqomic_*, lol :)

> Paolo
> 


-- 
Thanks,

David / dhildenb



