Re: [PATCH V8 2/2] arm64/mm: Enable memory hot remove

2019-10-08 Thread Anshuman Khandual



On 10/08/2019 04:25 PM, Catalin Marinas wrote:
> On Tue, Oct 08, 2019 at 10:06:26AM +0530, Anshuman Khandual wrote:
>> On 10/07/2019 07:47 PM, Catalin Marinas wrote:
>>> On Mon, Sep 23, 2019 at 11:13:45AM +0530, Anshuman Khandual wrote:
 The arch code for hot-remove must tear down portions of the linear map and
 vmemmap corresponding to memory being removed. In both cases the page
 tables mapping these regions must be freed, and when sparse vmemmap is in
 use the memory backing the vmemmap must also be freed.

 This patch adds unmap_hotplug_range() and free_empty_tables() helpers which
 can be used to tear down either region and calls it from vmemmap_free() and
 ___remove_pgd_mapping(). The sparse_vmap argument determines whether the
 backing memory will be freed.
>>>
>>> Can you change the 'sparse_vmap' name to something more meaningful which
>>> would suggest freeing of the backing memory?
>>
>> free_mapped_mem or free_backed_mem ? Even shorter forms like free_mapped or
>> free_backed might do as well. Do you have a particular preference here ? But
>> yes, sparse_vmap has been very much specific to vmemmap for these functions
>> which are now very generic in nature.
> 
> free_mapped would do.

Sure.

> 
 +static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
 +  unsigned long end, bool sparse_vmap)
 +{
 +  struct page *page;
 +  pte_t *ptep, pte;
 +
 +  do {
 +  ptep = pte_offset_kernel(pmdp, addr);
 +  pte = READ_ONCE(*ptep);
 +  if (pte_none(pte))
 +  continue;
 +
 +  WARN_ON(!pte_present(pte));
 +  page = sparse_vmap ? pte_page(pte) : NULL;
 +  pte_clear(_mm, addr, ptep);
 +  flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 +  if (sparse_vmap)
 +  free_hotplug_page_range(page, PAGE_SIZE);
>>>
>>> You could only set 'page' if sparse_vmap (or even drop 'page' entirely).
>>
>> I am afraid 'page' is being used to hold pte_page(pte) extraction which
>> needs to be freed (sparse_vmap) as we are going to clear the ptep entry
>> in the next statement and lose access to it for good.
> 
> You clear *ptep, not pte.

Ahh, missed that. We have already captured the contents with READ_ONCE().

> 
>> We will need some
>> where to hold onto pte_page(pte) across pte_clear() as we cannot free it
>> before clearing it's entry and flushing the TLB. Hence wondering how the
>> 'page' can be completely dropped.
>>
>>> The compiler is probably smart enough to optimise it but using a
>>> pointless ternary operator just makes the code harder to follow.
>>
>> Not sure I got this but are you suggesting for an 'if' statement here
>>
>> if (sparse_vmap)
>>  page = pte_page(pte);
>>
>> instead of the current assignment ?
>>
>> page = sparse_vmap ? pte_page(pte) : NULL;
> 
> I suggest:
> 
>   if (sparse_vmap)
>   free_hotplug_pgtable_page(pte_page(pte), PAGE_SIZE);

Sure, will do.

> 
 +  } while (addr += PAGE_SIZE, addr < end);
 +}
>>> [...]
 +static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
 +   unsigned long end)
 +{
 +  pte_t *ptep, pte;
 +
 +  do {
 +  ptep = pte_offset_kernel(pmdp, addr);
 +  pte = READ_ONCE(*ptep);
 +  WARN_ON(!pte_none(pte));
 +  } while (addr += PAGE_SIZE, addr < end);
 +}
 +
 +static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
 +   unsigned long end, unsigned long floor,
 +   unsigned long ceiling)
 +{
 +  unsigned long next;
 +  pmd_t *pmdp, pmd;
 +
 +  do {
 +  next = pmd_addr_end(addr, end);
 +  pmdp = pmd_offset(pudp, addr);
 +  pmd = READ_ONCE(*pmdp);
 +  if (pmd_none(pmd))
 +  continue;
 +
 +  WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
 +  free_empty_pte_table(pmdp, addr, next);
 +  free_pte_table(pmdp, addr, next, floor, ceiling);
>>>
>>> Do we need two closely named functions here? Can you not collapse
>>> free_empty_pud_table() and free_pte_table() into a single one? The same
>>> comment for the pmd/pud variants. I just find this confusing.
>>
>> The two functions could be collapsed into a single one. But just wanted to
>> keep free_pxx_table() part which checks floor/ceiling alignment, non-zero
>> entries clear off the actual page table walking.
> 
> With the pmd variant, they both take the floor/ceiling argument while
> the free_empty_pte_table() doesn't even free anything. So not entirely
> consistent.> 
> Can you not just copy the free_pgd_range() functions but instead of
> p*d_free_tlb() just do the TLB invalidation followed by page freeing?
> That seems to be an easier pattern to follow.

Re: [PATCH V8 2/2] arm64/mm: Enable memory hot remove

2019-10-08 Thread Catalin Marinas
On Tue, Oct 08, 2019 at 10:06:26AM +0530, Anshuman Khandual wrote:
> On 10/07/2019 07:47 PM, Catalin Marinas wrote:
> > On Mon, Sep 23, 2019 at 11:13:45AM +0530, Anshuman Khandual wrote:
> >> The arch code for hot-remove must tear down portions of the linear map and
> >> vmemmap corresponding to memory being removed. In both cases the page
> >> tables mapping these regions must be freed, and when sparse vmemmap is in
> >> use the memory backing the vmemmap must also be freed.
> >>
> >> This patch adds unmap_hotplug_range() and free_empty_tables() helpers which
> >> can be used to tear down either region and calls it from vmemmap_free() and
> >> ___remove_pgd_mapping(). The sparse_vmap argument determines whether the
> >> backing memory will be freed.
> > 
> > Can you change the 'sparse_vmap' name to something more meaningful which
> > would suggest freeing of the backing memory?
> 
> free_mapped_mem or free_backed_mem ? Even shorter forms like free_mapped or
> free_backed might do as well. Do you have a particular preference here ? But
> yes, sparse_vmap has been very much specific to vmemmap for these functions
> which are now very generic in nature.

free_mapped would do.

> >> +static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
> >> +  unsigned long end, bool sparse_vmap)
> >> +{
> >> +  struct page *page;
> >> +  pte_t *ptep, pte;
> >> +
> >> +  do {
> >> +  ptep = pte_offset_kernel(pmdp, addr);
> >> +  pte = READ_ONCE(*ptep);
> >> +  if (pte_none(pte))
> >> +  continue;
> >> +
> >> +  WARN_ON(!pte_present(pte));
> >> +  page = sparse_vmap ? pte_page(pte) : NULL;
> >> +  pte_clear(_mm, addr, ptep);
> >> +  flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> >> +  if (sparse_vmap)
> >> +  free_hotplug_page_range(page, PAGE_SIZE);
> > 
> > You could only set 'page' if sparse_vmap (or even drop 'page' entirely).
> 
> I am afraid 'page' is being used to hold pte_page(pte) extraction which
> needs to be freed (sparse_vmap) as we are going to clear the ptep entry
> in the next statement and lose access to it for good.

You clear *ptep, not pte.

> We will need some
> where to hold onto pte_page(pte) across pte_clear() as we cannot free it
> before clearing it's entry and flushing the TLB. Hence wondering how the
> 'page' can be completely dropped.
> 
> > The compiler is probably smart enough to optimise it but using a
> > pointless ternary operator just makes the code harder to follow.
> 
> Not sure I got this but are you suggesting for an 'if' statement here
> 
> if (sparse_vmap)
>   page = pte_page(pte);
> 
> instead of the current assignment ?
> 
> page = sparse_vmap ? pte_page(pte) : NULL;

I suggest:

if (sparse_vmap)
free_hotplug_pgtable_page(pte_page(pte), PAGE_SIZE);

> >> +  } while (addr += PAGE_SIZE, addr < end);
> >> +}
> > [...]
> >> +static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
> >> +   unsigned long end)
> >> +{
> >> +  pte_t *ptep, pte;
> >> +
> >> +  do {
> >> +  ptep = pte_offset_kernel(pmdp, addr);
> >> +  pte = READ_ONCE(*ptep);
> >> +  WARN_ON(!pte_none(pte));
> >> +  } while (addr += PAGE_SIZE, addr < end);
> >> +}
> >> +
> >> +static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
> >> +   unsigned long end, unsigned long floor,
> >> +   unsigned long ceiling)
> >> +{
> >> +  unsigned long next;
> >> +  pmd_t *pmdp, pmd;
> >> +
> >> +  do {
> >> +  next = pmd_addr_end(addr, end);
> >> +  pmdp = pmd_offset(pudp, addr);
> >> +  pmd = READ_ONCE(*pmdp);
> >> +  if (pmd_none(pmd))
> >> +  continue;
> >> +
> >> +  WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
> >> +  free_empty_pte_table(pmdp, addr, next);
> >> +  free_pte_table(pmdp, addr, next, floor, ceiling);
> > 
> > Do we need two closely named functions here? Can you not collapse
> > free_empty_pud_table() and free_pte_table() into a single one? The same
> > comment for the pmd/pud variants. I just find this confusing.
> 
> The two functions could be collapsed into a single one. But just wanted to
> keep free_pxx_table() part which checks floor/ceiling alignment, non-zero
> entries clear off the actual page table walking.

With the pmd variant, they both take the floor/ceiling argument while
the free_empty_pte_table() doesn't even free anything. So not entirely
consistent.

Can you not just copy the free_pgd_range() functions but instead of
p*d_free_tlb() just do the TLB invalidation followed by page freeing?
That seems to be an easier pattern to follow.

-- 
Catalin


Re: [PATCH V8 2/2] arm64/mm: Enable memory hot remove

2019-10-07 Thread Anshuman Khandual



On 10/07/2019 07:47 PM, Catalin Marinas wrote:
> On Mon, Sep 23, 2019 at 11:13:45AM +0530, Anshuman Khandual wrote:
>> The arch code for hot-remove must tear down portions of the linear map and
>> vmemmap corresponding to memory being removed. In both cases the page
>> tables mapping these regions must be freed, and when sparse vmemmap is in
>> use the memory backing the vmemmap must also be freed.
>>
>> This patch adds unmap_hotplug_range() and free_empty_tables() helpers which
>> can be used to tear down either region and calls it from vmemmap_free() and
>> ___remove_pgd_mapping(). The sparse_vmap argument determines whether the
>> backing memory will be freed.
> 
> Can you change the 'sparse_vmap' name to something more meaningful which
> would suggest freeing of the backing memory?

free_mapped_mem or free_backed_mem ? Even shorter forms like free_mapped or
free_backed might do as well. Do you have a particular preference here ? But
yes, sparse_vmap has been very much specific to vmemmap for these functions
which are now very generic in nature.

> 
>> It makes two distinct passes over the kernel page table. In the first pass
>> with unmap_hotplug_range() it unmaps, invalidates applicable TLB cache and
>> frees backing memory if required (vmemmap) for each mapped leaf entry. In
>> the second pass with free_empty_tables() it looks for empty page table
>> sections whose page table page can be unmapped, TLB invalidated and freed.
>>
>> While freeing intermediate level page table pages bail out if any of its
>> entries are still valid. This can happen for partially filled kernel page
>> table either from a previously attempted failed memory hot add or while
>> removing an address range which does not span the entire page table page
>> range.
>>
>> The vmemmap region may share levels of table with the vmalloc region.
>> There can be conflicts between hot remove freeing page table pages with
>> a concurrent vmalloc() walking the kernel page table. This conflict can
>> not just be solved by taking the init_mm ptl because of existing locking
>> scheme in vmalloc(). So free_empty_tables() implements a floor and ceiling
>> method which is borrowed from user page table tear with free_pgd_range()
>> which skips freeing page table pages if intermediate address range is not
>> aligned or maximum floor-ceiling might not own the entire page table page.
>>
>> While here update arch_add_memory() to handle __add_pages() failures by
>> just unmapping recently added kernel linear mapping. Now enable memory hot
>> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
>>
>> This implementation is overall inspired from kernel page table tear down
>> procedure on X86 architecture and user page table tear down method.
>>
>> Acked-by: Steve Capper 
>> Acked-by: David Hildenbrand 
>> Signed-off-by: Anshuman Khandual 
> 
> Given the amount of changes since version 7, do the acks still stand?

I had taken the liberty to carry them till V7 where the implementation has
been sort of structurally similar but as you mention, there have been some
basic changes in the approach since V7. Will drop these tags in next version
and request their fresh ACKs once again.

> 
> [...]
>> +static void free_pte_table(pmd_t *pmdp, unsigned long addr, unsigned long 
>> end,
>> +   unsigned long floor, unsigned long ceiling)
>> +{
>> +struct page *page;
>> +pte_t *ptep;
>> +int i;
>> +
>> +if (!pgtable_range_aligned(addr, end, floor, ceiling, PMD_MASK))
>> +return;
>> +
>> +ptep = pte_offset_kernel(pmdp, 0UL);
>> +for (i = 0; i < PTRS_PER_PTE; i++) {
>> +if (!pte_none(READ_ONCE(ptep[i])))
>> +return;
>> +}
>> +
>> +page = pmd_page(READ_ONCE(*pmdp));
> 
> Arguably, that's not the pmd page we are freeing here. Even if you get
> the same result, pmd_page() is normally used for huge pages pointed at
> by the pmd entry. Since you have the ptep already, why not use
> virt_to_page(ptep)?

Makes sense, will do.

> 
>> +pmd_clear(pmdp);
>> +__flush_tlb_kernel_pgtable(addr);
>> +free_hotplug_pgtable_page(page);
>> +}
>> +
>> +static void free_pmd_table(pud_t *pudp, unsigned long addr, unsigned long 
>> end,
>> +   unsigned long floor, unsigned long ceiling)
>> +{
>> +struct page *page;
>> +pmd_t *pmdp;
>> +int i;
>> +
>> +if (CONFIG_PGTABLE_LEVELS <= 2)
>> +return;
>> +
>> +if (!pgtable_range_aligned(addr, end, floor, ceiling, PUD_MASK))
>> +return;
>> +
>> +pmdp = pmd_offset(pudp, 0UL);
>> +for (i = 0; i < PTRS_PER_PMD; i++) {
>> +if (!pmd_none(READ_ONCE(pmdp[i])))
>> +return;
>> +}
>> +
>> +page = pud_page(READ_ONCE(*pudp));
> 
> Same here, virt_to_page(pmdp).

Will do.

> 
>> +pud_clear(pudp);
>> +__flush_tlb_kernel_pgtable(addr);
>> +free_hotplug_pgtable_page(page);
>> +}
>> +
>> +static void 

Re: [PATCH V8 2/2] arm64/mm: Enable memory hot remove

2019-10-07 Thread Catalin Marinas
On Mon, Sep 23, 2019 at 11:13:45AM +0530, Anshuman Khandual wrote:
> The arch code for hot-remove must tear down portions of the linear map and
> vmemmap corresponding to memory being removed. In both cases the page
> tables mapping these regions must be freed, and when sparse vmemmap is in
> use the memory backing the vmemmap must also be freed.
> 
> This patch adds unmap_hotplug_range() and free_empty_tables() helpers which
> can be used to tear down either region and calls it from vmemmap_free() and
> ___remove_pgd_mapping(). The sparse_vmap argument determines whether the
> backing memory will be freed.

Can you change the 'sparse_vmap' name to something more meaningful which
would suggest freeing of the backing memory?

> It makes two distinct passes over the kernel page table. In the first pass
> with unmap_hotplug_range() it unmaps, invalidates applicable TLB cache and
> frees backing memory if required (vmemmap) for each mapped leaf entry. In
> the second pass with free_empty_tables() it looks for empty page table
> sections whose page table page can be unmapped, TLB invalidated and freed.
> 
> While freeing intermediate level page table pages bail out if any of its
> entries are still valid. This can happen for partially filled kernel page
> table either from a previously attempted failed memory hot add or while
> removing an address range which does not span the entire page table page
> range.
> 
> The vmemmap region may share levels of table with the vmalloc region.
> There can be conflicts between hot remove freeing page table pages with
> a concurrent vmalloc() walking the kernel page table. This conflict can
> not just be solved by taking the init_mm ptl because of existing locking
> scheme in vmalloc(). So free_empty_tables() implements a floor and ceiling
> method which is borrowed from user page table tear with free_pgd_range()
> which skips freeing page table pages if intermediate address range is not
> aligned or maximum floor-ceiling might not own the entire page table page.
> 
> While here update arch_add_memory() to handle __add_pages() failures by
> just unmapping recently added kernel linear mapping. Now enable memory hot
> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
> 
> This implementation is overall inspired from kernel page table tear down
> procedure on X86 architecture and user page table tear down method.
> 
> Acked-by: Steve Capper 
> Acked-by: David Hildenbrand 
> Signed-off-by: Anshuman Khandual 

Given the amount of changes since version 7, do the acks still stand?

[...]
> +static void free_pte_table(pmd_t *pmdp, unsigned long addr, unsigned long 
> end,
> +unsigned long floor, unsigned long ceiling)
> +{
> + struct page *page;
> + pte_t *ptep;
> + int i;
> +
> + if (!pgtable_range_aligned(addr, end, floor, ceiling, PMD_MASK))
> + return;
> +
> + ptep = pte_offset_kernel(pmdp, 0UL);
> + for (i = 0; i < PTRS_PER_PTE; i++) {
> + if (!pte_none(READ_ONCE(ptep[i])))
> + return;
> + }
> +
> + page = pmd_page(READ_ONCE(*pmdp));

Arguably, that's not the pmd page we are freeing here. Even if you get
the same result, pmd_page() is normally used for huge pages pointed at
by the pmd entry. Since you have the ptep already, why not use
virt_to_page(ptep)?

> + pmd_clear(pmdp);
> + __flush_tlb_kernel_pgtable(addr);
> + free_hotplug_pgtable_page(page);
> +}
> +
> +static void free_pmd_table(pud_t *pudp, unsigned long addr, unsigned long 
> end,
> +unsigned long floor, unsigned long ceiling)
> +{
> + struct page *page;
> + pmd_t *pmdp;
> + int i;
> +
> + if (CONFIG_PGTABLE_LEVELS <= 2)
> + return;
> +
> + if (!pgtable_range_aligned(addr, end, floor, ceiling, PUD_MASK))
> + return;
> +
> + pmdp = pmd_offset(pudp, 0UL);
> + for (i = 0; i < PTRS_PER_PMD; i++) {
> + if (!pmd_none(READ_ONCE(pmdp[i])))
> + return;
> + }
> +
> + page = pud_page(READ_ONCE(*pudp));

Same here, virt_to_page(pmdp).

> + pud_clear(pudp);
> + __flush_tlb_kernel_pgtable(addr);
> + free_hotplug_pgtable_page(page);
> +}
> +
> +static void free_pud_table(pgd_t *pgdp, unsigned long addr, unsigned  long 
> end,
> +unsigned long floor, unsigned long ceiling)
> +{
> + struct page *page;
> + pud_t *pudp;
> + int i;
> +
> + if (CONFIG_PGTABLE_LEVELS <= 3)
> + return;
> +
> + if (!pgtable_range_aligned(addr, end, floor, ceiling, PGDIR_MASK))
> + return;
> +
> + pudp = pud_offset(pgdp, 0UL);
> + for (i = 0; i < PTRS_PER_PUD; i++) {
> + if (!pud_none(READ_ONCE(pudp[i])))
> + return;
> + }
> +
> + page = pgd_page(READ_ONCE(*pgdp));

As above.

> + pgd_clear(pgdp);
> + __flush_tlb_kernel_pgtable(addr);
> + free_hotplug_pgtable_page(page);
> 

Re: [PATCH V8 2/2] arm64/mm: Enable memory hot remove

2019-09-24 Thread Anshuman Khandual



On 09/23/2019 04:47 PM, Matthew Wilcox wrote:
> On Mon, Sep 23, 2019 at 11:13:45AM +0530, Anshuman Khandual wrote:
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static void free_hotplug_page_range(struct page *page, size_t size)
>> +{
>> +WARN_ON(!page || PageReserved(page));
> 
> WARN_ON(!page) isn't terribly useful.  You're going to crash on the very
> next line when you call page_address() anyway.  If this line were
> 
>   if (WARN_ON(!page || PageReserved(page)))
>   return;
> 
> it would make sense, or if it were just
> 
>   WARN_ON(PageReserved(page))
> 
> it would also make sense.

I guess WARN_ON(PageReserved(page)) should be good enough to make sure
that page being freed here was originally allocated at runtime for a
previous memory hot add operation and did not some how come from the
memblock reserved area. That was the original objective for this check.
Will change it.

> 
>> +free_pages((unsigned long)page_address(page), get_order(size));
>> +}
> 


Re: [PATCH V8 2/2] arm64/mm: Enable memory hot remove

2019-09-23 Thread kbuild test robot
Hi Anshuman,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[cannot apply to v5.3 next-20190920]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:
https://github.com/0day-ci/linux/commits/Anshuman-Khandual/arm64-mm-Hold-memory-hotplug-lock-while-walking-for-kernel-page-table-dump/20190923-134733
config: arm64-allmodconfig (attached as .config)
compiler: aarch64-linux-gcc (GCC) 7.4.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=7.4.0 make.cross ARCH=arm64 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot 

All warnings (new ones prefixed by >>):

>> mm/memremap.c:16:0: warning: "SECTION_MASK" redefined
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)

   In file included from arch/arm64/include/asm/processor.h:34:0,
from include/linux/mutex.h:19,
from include/linux/kernfs.h:12,
from include/linux/sysfs.h:16,
from include/linux/kobject.h:20,
from include/linux/device.h:16,
from mm/memremap.c:3:
   arch/arm64/include/asm/pgtable-hwdef.h:79:0: note: this is the location of 
the previous definition
#define SECTION_MASK  (~(SECTION_SIZE-1))

>> mm/memremap.c:17:0: warning: "SECTION_SIZE" redefined
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)

   In file included from arch/arm64/include/asm/processor.h:34:0,
from include/linux/mutex.h:19,
from include/linux/kernfs.h:12,
from include/linux/sysfs.h:16,
from include/linux/kobject.h:20,
from include/linux/device.h:16,
from mm/memremap.c:3:
   arch/arm64/include/asm/pgtable-hwdef.h:78:0: note: this is the location of 
the previous definition
#define SECTION_SIZE  (_AC(1, UL) << SECTION_SHIFT)


vim +/SECTION_MASK +16 mm/memremap.c

7d3dcf26a6559f kernel/memremap.c Christoph Hellwig 2015-08-10  @3  #include 

92281dee825f6d kernel/memremap.c Dan Williams  2015-08-10   4  #include 

0207df4fa1a869 kernel/memremap.c Andrey Ryabinin   2018-08-17   5  #include 

41e94a851304f7 kernel/memremap.c Christoph Hellwig 2015-08-17   6  #include 

bcfa4b72111c9a kernel/memremap.c Matthew Wilcox2018-08-15   7  #include 

bcfa4b72111c9a kernel/memremap.c Matthew Wilcox2018-08-15   8  #include 

5042db43cc26f5 kernel/memremap.c Jérôme Glisse 2017-09-08   9  #include 

5042db43cc26f5 kernel/memremap.c Jérôme Glisse 2017-09-08  10  #include 

bcfa4b72111c9a kernel/memremap.c Matthew Wilcox2018-08-15  11  #include 

e7638488434415 kernel/memremap.c Dan Williams  2018-05-16  12  #include 

bcfa4b72111c9a kernel/memremap.c Matthew Wilcox2018-08-15  13  #include 

92281dee825f6d kernel/memremap.c Dan Williams  2015-08-10  14  
bcfa4b72111c9a kernel/memremap.c Matthew Wilcox2018-08-15  15  static 
DEFINE_XARRAY(pgmap_array);
9476df7d80dfc4 kernel/memremap.c Dan Williams  2016-01-15 @16  #define 
SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
9476df7d80dfc4 kernel/memremap.c Dan Williams  2016-01-15 @17  #define 
SECTION_SIZE (1UL << PA_SECTION_SHIFT)
9476df7d80dfc4 kernel/memremap.c Dan Williams  2016-01-15  18  

:: The code at line 16 was first introduced by commit
:: 9476df7d80dfc425b37bfecf1d89edf8ec81fcb6 mm: introduce find_dev_pagemap()

:: TO: Dan Williams 
:: CC: Linus Torvalds 

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH V8 2/2] arm64/mm: Enable memory hot remove

2019-09-23 Thread Matthew Wilcox
On Mon, Sep 23, 2019 at 11:13:45AM +0530, Anshuman Khandual wrote:
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static void free_hotplug_page_range(struct page *page, size_t size)
> +{
> + WARN_ON(!page || PageReserved(page));

WARN_ON(!page) isn't terribly useful.  You're going to crash on the very
next line when you call page_address() anyway.  If this line were

if (WARN_ON(!page || PageReserved(page)))
return;

it would make sense, or if it were just

WARN_ON(PageReserved(page))

it would also make sense.

> + free_pages((unsigned long)page_address(page), get_order(size));
> +}


[PATCH V8 2/2] arm64/mm: Enable memory hot remove

2019-09-22 Thread Anshuman Khandual
The arch code for hot-remove must tear down portions of the linear map and
vmemmap corresponding to memory being removed. In both cases the page
tables mapping these regions must be freed, and when sparse vmemmap is in
use the memory backing the vmemmap must also be freed.

This patch adds unmap_hotplug_range() and free_empty_tables() helpers which
can be used to tear down either region and calls it from vmemmap_free() and
___remove_pgd_mapping(). The sparse_vmap argument determines whether the
backing memory will be freed.

It makes two distinct passes over the kernel page table. In the first pass
with unmap_hotplug_range() it unmaps, invalidates applicable TLB cache and
frees backing memory if required (vmemmap) for each mapped leaf entry. In
the second pass with free_empty_tables() it looks for empty page table
sections whose page table page can be unmapped, TLB invalidated and freed.

While freeing intermediate level page table pages bail out if any of its
entries are still valid. This can happen for partially filled kernel page
table either from a previously attempted failed memory hot add or while
removing an address range which does not span the entire page table page
range.

The vmemmap region may share levels of table with the vmalloc region.
There can be conflicts between hot remove freeing page table pages with
a concurrent vmalloc() walking the kernel page table. This conflict can
not just be solved by taking the init_mm ptl because of existing locking
scheme in vmalloc(). So free_empty_tables() implements a floor and ceiling
method which is borrowed from user page table tear with free_pgd_range()
which skips freeing page table pages if intermediate address range is not
aligned or maximum floor-ceiling might not own the entire page table page.

While here update arch_add_memory() to handle __add_pages() failures by
just unmapping recently added kernel linear mapping. Now enable memory hot
remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.

This implementation is overall inspired from kernel page table tear down
procedure on X86 architecture and user page table tear down method.

Acked-by: Steve Capper 
Acked-by: David Hildenbrand 
Signed-off-by: Anshuman Khandual 
---
 arch/arm64/Kconfig  |   3 +
 arch/arm64/include/asm/memory.h |   1 +
 arch/arm64/mm/mmu.c | 305 ++--
 3 files changed, 300 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 41a9b42..511249f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -274,6 +274,9 @@ config ZONE_DMA32
 config ARCH_ENABLE_MEMORY_HOTPLUG
def_bool y
 
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+   def_bool y
+
 config SMP
def_bool y
 
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index b61b50b..615dcd0 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -54,6 +54,7 @@
 #define MODULES_VADDR  (BPF_JIT_REGION_END)
 #define MODULES_VSIZE  (SZ_128M)
 #define VMEMMAP_START  (-VMEMMAP_SIZE - SZ_2M)
+#define VMEMMAP_END(VMEMMAP_START + VMEMMAP_SIZE)
 #define PCI_IO_END (VMEMMAP_START - SZ_2M)
 #define PCI_IO_START   (PCI_IO_END - PCI_IO_SIZE)
 #define FIXADDR_TOP(PCI_IO_START - SZ_2M)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 60c929f..6178837 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -725,6 +725,277 @@ int kern_addr_valid(unsigned long addr)
 
return pfn_valid(pte_pfn(pte));
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_hotplug_page_range(struct page *page, size_t size)
+{
+   WARN_ON(!page || PageReserved(page));
+   free_pages((unsigned long)page_address(page), get_order(size));
+}
+
+static void free_hotplug_pgtable_page(struct page *page)
+{
+   free_hotplug_page_range(page, PAGE_SIZE);
+}
+
+static bool pgtable_range_aligned(unsigned long start, unsigned long end,
+ unsigned long floor, unsigned long ceiling,
+ unsigned long mask)
+{
+   start &= mask;
+   if (start < floor)
+   return false;
+
+   if (ceiling) {
+   ceiling &= mask;
+   if (!ceiling)
+   return false;
+   }
+
+   if (end - 1 > ceiling - 1)
+   return false;
+   return true;
+}
+
+static void free_pte_table(pmd_t *pmdp, unsigned long addr, unsigned long end,
+  unsigned long floor, unsigned long ceiling)
+{
+   struct page *page;
+   pte_t *ptep;
+   int i;
+
+   if (!pgtable_range_aligned(addr, end, floor, ceiling, PMD_MASK))
+   return;
+
+   ptep = pte_offset_kernel(pmdp, 0UL);
+   for (i = 0; i < PTRS_PER_PTE; i++) {
+   if (!pte_none(READ_ONCE(ptep[i])))
+   return;
+   }
+
+   page =