[PATCH v4 06/11] riscv: mm: Add memory hotplugging support

2024-06-05 Thread Björn Töpel
From: Björn Töpel 

For an architecture to support memory hotplugging, a couple of
callbacks need to be implemented:

 arch_add_memory()
  This callback is responsible for adding the physical memory into the
  direct map, and calls into the generic memory hotplug code via
  __add_pages(), which adds the corresponding struct page entries and
  updates the vmemmap mapping.

 arch_remove_memory()
  This is the inverse of the callback above.

 vmemmap_free()
  This function tears down the vmemmap mappings (if
  CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
  backing vmemmap pages. Note that for persistent memory, an
  alternative allocator for the backing pages can be used: the
  vmem_altmap. This means that when the backing pages are freed,
  extra care is needed so that the correct deallocation method is
  used.

 arch_get_mappable_range()
  This function returns the PA range that the direct map can cover.
  Used by the MHP (memory hotplug) internals for sanity checks; a
  sketch of these hooks' signatures follows below.
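
For reference, this is roughly the shape of the hooks involved. The
declarations below follow <linux/memory_hotplug.h> around v6.10; the
arch_get_mappable_range() body is a simplified illustration with
placeholder bounds, not the code from this patch:

  int arch_add_memory(int nid, u64 start, u64 size,
                      struct mhp_params *params);
  void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap);
  void vmemmap_free(unsigned long start, unsigned long end,
                    struct vmem_altmap *altmap);
  struct range arch_get_mappable_range(void);

  /* Illustrative body only -- the real bound comes from the arch's
   * linear-map window, not constants like these. */
  struct range arch_get_mappable_range(void)
  {
          struct range mhp_range;

          mhp_range.start = 0;        /* placeholder lower PA bound */
          mhp_range.end   = -1ULL;    /* placeholder upper PA bound */
          return mhp_range;
  }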

The page table unmap/teardown functions are heavily based on code from
the x86 tree. The same remove_pgd_mapping() function is used in both
vmemmap_free() and arch_remove_memory(), but in the latter function
the backing pages are not removed.
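
To make that sharing concrete, here is a hedged sketch of how the two
paths can drive the common walker; the wrappers and exact signatures
below are illustrative rather than a quote of this patch:

  /* One walker for both paths; only vmemmap teardown frees backing pages. */
  static void __meminit remove_pgd_mapping(unsigned long va, unsigned long end,
                                           bool is_vmemmap,
                                           struct vmem_altmap *altmap);

  void __ref vmemmap_free(unsigned long start, unsigned long end,
                          struct vmem_altmap *altmap)
  {
          remove_pgd_mapping(start, end, true, altmap); /* free backing pages */
  }

  /* Used on the arch_remove_memory() path: unmap only, keep the pages. */
  static void __meminit remove_linear_mapping(phys_addr_t start, u64 size)
  {
          unsigned long va = (unsigned long)__va(start);

          remove_pgd_mapping(va, va + size, false, NULL);
  }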

Signed-off-by: Björn Töpel 
---
 arch/riscv/mm/init.c | 267 +++
 1 file changed, 267 insertions(+)

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 1f7e7c223bec..bfa2dea95354 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1534,3 +1534,270 @@ struct execmem_info __init *execmem_arch_setup(void)
 }
 #endif /* CONFIG_MMU */
 #endif /* CONFIG_EXECMEM */
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	struct page *page = pmd_page(*pmd);
+	struct ptdesc *ptdesc = page_ptdesc(page);
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (!pte_none(*pte))
+			return;
+	}
+
+	pagetable_pte_dtor(ptdesc);
+	if (PageReserved(page))
+		free_reserved_page(page);
+	else
+		pagetable_free(ptdesc);
+	pmd_clear(pmd);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	struct page *page = pud_page(*pud);
+	struct ptdesc *ptdesc = page_ptdesc(page);
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (!pmd_none(*pmd))
+			return;
+	}
+
+	pagetable_pmd_dtor(ptdesc);
+	if (PageReserved(page))
+		free_reserved_page(page);
+	else
+		pagetable_free(ptdesc);
+	pud_clear(pud);
+}
+
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+	struct page *page = p4d_page(*p4d);
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (!pud_none(*pud))
+			return;
+	}
+
+	if (PageReserved(page))
+		free_reserved_page(page);
+	else
+		free_pages((unsigned long)page_address(page), 0);
+	p4d_clear(p4d);
+}
+
+static void __meminit free_vmemmap_storage(struct page *page, size_t size,
+					   struct vmem_altmap *altmap)
+{
+	int order = get_order(size);
+
+	if (altmap) {
+		vmem_altmap_free(altmap, size >> PAGE_SHIFT);
+		return;
+	}
+
+	if (PageReserved(page)) {
+		unsigned int nr_pages = 1 << order;
+
+		while (nr_pages--)
+			free_reserved_page(page++);
+		return;
+	}
+
+	free_pages((unsigned long)page_address(page), order);
+}
+
+static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pte_t *ptep, pte;
+
+	for (; addr < end; addr = next) {
+		next = (addr + PAGE_SIZE) & PAGE_MASK;
+		if (next > end)
+			next = end;
+
+		ptep = pte_base + pte_index(addr);
+		pte = ptep_get(ptep);
+		if (!pte_present(pte))
+			continue;
+
+		pte_clear(&init_mm, addr, ptep);
+		if (is_vmemmap)
+			free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
+	}
+}
+
+static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pte_t *pte_base;
+	pmd_t *pmdp, pmd;
+
+	for (; addr < end; addr = next) {
+		next = pmd_addr_end(addr, end);
+