Re: [PATCH v4 3/4] powerpc/mm: add radix__remove_section_mapping()

2017-01-04 Thread Reza Arbab

On Wed, Jan 04, 2017 at 10:37:58AM +0530, Aneesh Kumar K.V wrote:

Reza Arbab  writes:

+static void remove_pagetable(unsigned long start, unsigned long end)
+{
+   unsigned long addr, next;
+   pud_t *pud_base;
+   pgd_t *pgd;
+
+   spin_lock(&init_mm.page_table_lock);
+
+   for (addr = start; addr < end; addr = next) {
+   next = pgd_addr_end(addr, end);
+
+   pgd = pgd_offset_k(addr);
+   if (!pgd_present(*pgd))
+   continue;
+
+   if (pgd_huge(*pgd)) {
+   pte_clear(&init_mm, addr, (pte_t *)pgd);
+   continue;
+   }
+
+   pud_base = (pud_t *)pgd_page_vaddr(*pgd);
+   remove_pud_table(pud_base, addr, next);
+   free_pud_table(pud_base, pgd);
+   }
+
+   spin_unlock(&init_mm.page_table_lock);


What is this lock protecting ?


The more I look into it, I'm not sure. This is still an artifact from 
the x86 functions, where they lock/unlock aggressively, as you and Ben 
noted. I can take it out. 


+   flush_tlb_kernel_range(start, end);


We can use radix__flush_tlb_kernel_range avoiding an if
(radix_enabled()) conditional ?


Yes, good idea.


(radix_enabled()) conditional ? Also if needed we could make all the
above take a radix__ prefix ?


You mean rename all these new functions? We could, but I don't really 
see why. These functions are static to pgtable-radix.c, there aren't 
hash__ versions to differentiate from, and it seemed helpful to mirror 
the x86 names.


--
Reza Arbab



Re: [PATCH v4 3/4] powerpc/mm: add radix__remove_section_mapping()

2017-01-03 Thread Aneesh Kumar K.V
Reza Arbab  writes:

> Tear down and free the four-level page tables of physical mappings
> during memory hotremove.
>
> Borrow the basic structure of remove_pagetable() and friends from the
> identically-named x86 functions. Simplify things a bit so locking and
> tlb flushing are only done in the outermost function.
>
> Memory must be offline to be removed, thus not in use. So there
> shouldn't be the sort of concurrent page walking activity here that
> might prompt us to use RCU.
>
> Signed-off-by: Reza Arbab 
> ---
>  arch/powerpc/include/asm/book3s/64/radix.h |   1 +
>  arch/powerpc/mm/pgtable-book3s64.c |   2 +-
>  arch/powerpc/mm/pgtable-radix.c| 149 
> +
>  3 files changed, 151 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
> b/arch/powerpc/include/asm/book3s/64/radix.h
> index 43c2571..0032b66 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -294,6 +294,7 @@ static inline unsigned long radix__get_tree_size(void)
>
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  int radix__create_section_mapping(unsigned long start, unsigned long end);
> +int radix__remove_section_mapping(unsigned long start, unsigned long end);
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  #endif /* __ASSEMBLY__ */
>  #endif
> diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
> b/arch/powerpc/mm/pgtable-book3s64.c
> index 2b13f6b..b798ff6 100644
> --- a/arch/powerpc/mm/pgtable-book3s64.c
> +++ b/arch/powerpc/mm/pgtable-book3s64.c
> @@ -139,7 +139,7 @@ int create_section_mapping(unsigned long start, unsigned 
> long end)
>  int remove_section_mapping(unsigned long start, unsigned long end)
>  {
>   if (radix_enabled())
> - return -ENODEV;
> + return radix__remove_section_mapping(start, end);
>
>   return hash__remove_section_mapping(start, end);
>  }
> diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
> index 3588895..f7a8e625 100644
> --- a/arch/powerpc/mm/pgtable-radix.c
> +++ b/arch/powerpc/mm/pgtable-radix.c
> @@ -457,10 +457,159 @@ void radix__setup_initial_memory_limit(phys_addr_t 
> first_memblock_base,
>  }
>
>  #ifdef CONFIG_MEMORY_HOTPLUG
> +static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
> +{
> + pte_t *pte;
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PTE; i++) {
> + pte = pte_start + i;
> + if (!pte_none(*pte))
> + return;
> + }
> +
> + pte_free_kernel(&init_mm, pte_start);
> + pmd_clear(pmd);
> +}
> +
> +static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> +{
> + pmd_t *pmd;
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PMD; i++) {
> + pmd = pmd_start + i;
> + if (!pmd_none(*pmd))
> + return;
> + }
> +
> + pmd_free(&init_mm, pmd_start);
> + pud_clear(pud);
> +}
> +
> +static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
> +{
> + pud_t *pud;
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PUD; i++) {
> + pud = pud_start + i;
> + if (!pud_none(*pud))
> + return;
> + }
> +
> + pud_free(&init_mm, pud_start);
> + pgd_clear(pgd);
> +}
> +
> +static void remove_pte_table(pte_t *pte_start, unsigned long addr,
> +  unsigned long end)
> +{
> + unsigned long next;
> + pte_t *pte;
> +
> + pte = pte_start + pte_index(addr);
> + for (; addr < end; addr = next, pte++) {
> + next = (addr + PAGE_SIZE) & PAGE_MASK;
> + if (next > end)
> + next = end;
> +
> + if (!pte_present(*pte))
> + continue;
> +
> + pte_clear(&init_mm, addr, pte);
> + }
> +}
> +
> +static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
> +  unsigned long end)
> +{
> + unsigned long next;
> + pte_t *pte_base;
> + pmd_t *pmd;
> +
> + pmd = pmd_start + pmd_index(addr);
> + for (; addr < end; addr = next, pmd++) {
> + next = pmd_addr_end(addr, end);
> +
> + if (!pmd_present(*pmd))
> + continue;
> +
> + if (pmd_huge(*pmd)) {
> + pte_clear(&init_mm, addr, (pte_t *)pmd);
> + continue;
> + }
> +
> + pte_base = (pte_t *)pmd_page_vaddr(*pmd);
> + remove_pte_table(pte_base, addr, next);
> + free_pte_table(pte_base, pmd);
> + }
> +}
> +
> +static void remove_pud_table(pud_t *pud_start, unsigned long addr,
> +  unsigned long end)
> +{
> + unsigned long next;
> + pmd_t *pmd_base;
> + pud_t *pud;
> +
> + pud = pud_start + pud_index(addr);
> + for (; addr < end; addr = next, pud++) {
> + next = pud_addr_end(addr, end);
> +
> + if (!pud_present(*pud))
> 

[PATCH v4 3/4] powerpc/mm: add radix__remove_section_mapping()

2017-01-03 Thread Reza Arbab
Tear down and free the four-level page tables of physical mappings
during memory hotremove.

Borrow the basic structure of remove_pagetable() and friends from the
identically-named x86 functions. Simplify things a bit so locking and
tlb flushing are only done in the outermost function.

Memory must be offline to be removed, thus not in use. So there
shouldn't be the sort of concurrent page walking activity here that
might prompt us to use RCU.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/include/asm/book3s/64/radix.h |   1 +
 arch/powerpc/mm/pgtable-book3s64.c |   2 +-
 arch/powerpc/mm/pgtable-radix.c| 149 +
 3 files changed, 151 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index 43c2571..0032b66 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -294,6 +294,7 @@ static inline unsigned long radix__get_tree_size(void)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int radix__create_section_mapping(unsigned long start, unsigned long end);
+int radix__remove_section_mapping(unsigned long start, unsigned long end);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index 2b13f6b..b798ff6 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -139,7 +139,7 @@ int create_section_mapping(unsigned long start, unsigned 
long end)
 int remove_section_mapping(unsigned long start, unsigned long end)
 {
if (radix_enabled())
-   return -ENODEV;
+   return radix__remove_section_mapping(start, end);
 
return hash__remove_section_mapping(start, end);
 }
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 3588895..f7a8e625 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -457,10 +457,159 @@ void radix__setup_initial_memory_limit(phys_addr_t 
first_memblock_base,
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+   pte_t *pte;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PTE; i++) {
+   pte = pte_start + i;
+   if (!pte_none(*pte))
+   return;
+   }
+
+   pte_free_kernel(&init_mm, pte_start);
+   pmd_clear(pmd);
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+   pmd_t *pmd;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PMD; i++) {
+   pmd = pmd_start + i;
+   if (!pmd_none(*pmd))
+   return;
+   }
+
+   pmd_free(&init_mm, pmd_start);
+   pud_clear(pud);
+}
+
+static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+   pud_t *pud;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PUD; i++) {
+   pud = pud_start + i;
+   if (!pud_none(*pud))
+   return;
+   }
+
+   pud_free(&init_mm, pud_start);
+   pgd_clear(pgd);
+}
+
+static void remove_pte_table(pte_t *pte_start, unsigned long addr,
+unsigned long end)
+{
+   unsigned long next;
+   pte_t *pte;
+
+   pte = pte_start + pte_index(addr);
+   for (; addr < end; addr = next, pte++) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   if (next > end)
+   next = end;
+
+   if (!pte_present(*pte))
+   continue;
+
+   pte_clear(&init_mm, addr, pte);
+   }
+}
+
+static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+unsigned long end)
+{
+   unsigned long next;
+   pte_t *pte_base;
+   pmd_t *pmd;
+
+   pmd = pmd_start + pmd_index(addr);
+   for (; addr < end; addr = next, pmd++) {
+   next = pmd_addr_end(addr, end);
+
+   if (!pmd_present(*pmd))
+   continue;
+
+   if (pmd_huge(*pmd)) {
+   pte_clear(&init_mm, addr, (pte_t *)pmd);
+   continue;
+   }
+
+   pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+   remove_pte_table(pte_base, addr, next);
+   free_pte_table(pte_base, pmd);
+   }
+}
+
+static void remove_pud_table(pud_t *pud_start, unsigned long addr,
+unsigned long end)
+{
+   unsigned long next;
+   pmd_t *pmd_base;
+   pud_t *pud;
+
+   pud = pud_start + pud_index(addr);
+   for (; addr < end; addr = next, pud++) {
+   next = pud_addr_end(addr, end);
+
+   if (!pud_present(*pud))
+   continue;
+
+   if (pud_huge(*pud)) {
+   pte_clear(&init_mm, addr, (pte_t *)pud);
+   continue;
+   }
+
+   pmd_base =