Re: [Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap

2012-12-06 Thread Jianguo Wu
Hi Tang,

On 2012/12/7 9:42, Tang Chen wrote:

> Hi Wu,
> 
> I met some problems when I was digging into the code. It's very
> kind of you if you could help me with that. :)
> 
> If I misunderstood your code, please tell me.
> Please see below. :)
> 
> On 12/03/2012 10:23 AM, Jianguo Wu wrote:
>> Signed-off-by: Jianguo Wu
>> Signed-off-by: Jiang Liu
>> ---
>>   include/linux/mm.h  |1 +
>>   mm/sparse-vmemmap.c |  231 
>> +++
>>   mm/sparse.c |3 +-
>>   3 files changed, 234 insertions(+), 1 deletions(-)
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 5657670..1f26af5 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned 
>> long pages, int node);
>>   void vmemmap_populate_print_last(void);
>>   void register_page_bootmem_memmap(unsigned long section_nr, struct page 
>> *map,
>> unsigned long size);
>> +void vmemmap_free(struct page *memmap, unsigned long nr_pages);
>>
>>   enum mf_flags {
>>   MF_COUNT_INCREASED = 1<<  0,
>> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
>> index 1b7e22a..748732d 100644
>> --- a/mm/sparse-vmemmap.c
>> +++ b/mm/sparse-vmemmap.c
>> @@ -29,6 +29,10 @@
>>   #include
>>   #include
>>
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +#include
>> +#endif
>> +
>>   /*
>>* Allocate a block of memory to be used to back the virtual memory map
>>* or to back the page tables that are used to create the mapping.
>> @@ -224,3 +228,230 @@ void __init sparse_mem_maps_populate_node(struct page 
>> **map_map,
>>   vmemmap_buf_end = NULL;
>>   }
>>   }
>> +
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +
>> +#define PAGE_INUSE 0xFD
>> +
>> +static void vmemmap_free_pages(struct page *page, int order)
>> +{
>> +struct zone *zone;
>> +unsigned long magic;
>> +
>> +magic = (unsigned long) page->lru.next;
>> +if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
>> +put_page_bootmem(page);
>> +
>> +zone = page_zone(page);
>> +zone_span_writelock(zone);
>> +zone->present_pages++;
>> +zone_span_writeunlock(zone);
>> +totalram_pages++;
>> +} else
>> +free_pages((unsigned long)page_address(page), order);
> 
> Here, I think SECTION_INFO and MIX_SECTION_INFO pages are all allocated
> by bootmem, so I put this function this way.
> 
> I'm not sure if parameter order is necessary here. It will always be 0
> in your code. Is this OK to you ?
> 

parameter order is necessary in cpu_has_pse case:
vmemmap_pmd_remove
free_pagetable(pmd_page(*pmd), get_order(PMD_SIZE))

> static void free_pagetable(struct page *page)
> {
> struct zone *zone;
> bool bootmem = false;
> unsigned long magic;
> 
> /* bootmem page has reserved flag */
> if (PageReserved(page)) {
> __ClearPageReserved(page);
> bootmem = true;
> }
> 
> magic = (unsigned long) page->lru.next;
> if (magic == SECTION_INFO || magic == MIX_SECTION_INFO)
> put_page_bootmem(page);
> else
> __free_page(page);
> 
> /*
>  * SECTION_INFO pages and MIX_SECTION_INFO pages
>  * are all allocated by bootmem.
>  */
> if (bootmem) {
> zone = page_zone(page);
> zone_span_writelock(zone);
> zone->present_pages++;
> zone_span_writeunlock(zone);
> totalram_pages++;
> }
> }
> 
> (snip)
> 
>> +
>> +static void vmemmap_pte_remove(pmd_t *pmd, unsigned long addr, unsigned 
>> long end)
>> +{
>> +pte_t *pte;
>> +unsigned long next;
>> +void *page_addr;
>> +
>> +pte = pte_offset_kernel(pmd, addr);
>> +for (; addr<  end; pte++, addr += PAGE_SIZE) {
>> +next = (addr + PAGE_SIZE)&  PAGE_MASK;
>> +if (next>  end)
>> +next = end;
>> +
>> +if (pte_none(*pte))
> 
> Here, you checked xxx_none() in your vmemmap_xxx_remove(), but you used
> !xxx_present() in your x86_64 patches. Is it OK if I only check
> !xxx_present() ?

It is Ok.

> 
>> +continue;
>> +if (IS_ALIGNED(addr, PAGE_SIZE)&&
>> +IS_ALIGNED(next, PAGE_SIZE)) {
>> +vmemmap_free_pages(pte_page(*pte), 0);
>> +spin_lock(&init_mm.page_table_lock);
>> +pte_clear(&init_mm, addr, pte);
>> +spin_unlock(&init_mm.page_table_lock);
>> +} else {
>> +/*
>> + * Removed page structs are filled with 0xFD.
>> + */
>> +memset((void *)addr, PAGE_INUSE, next - addr);
>> +page_addr = page_address(pte_page(*pte));
>> +
>> +if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
>> +spin_lock(&init_mm.page_table_lock);
>> +pte_clear(&init_mm, addr,

Re: [Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap

2012-12-06 Thread Tang Chen

Hi Wu,

I met some problems when I was digging into the code. It's very
kind of you if you could help me with that. :)

If I misunderstood your code, please tell me.
Please see below. :)

On 12/03/2012 10:23 AM, Jianguo Wu wrote:

Signed-off-by: Jianguo Wu
Signed-off-by: Jiang Liu
---
  include/linux/mm.h  |1 +
  mm/sparse-vmemmap.c |  231 +++
  mm/sparse.c |3 +-
  3 files changed, 234 insertions(+), 1 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5657670..1f26af5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned 
long pages, int node);
  void vmemmap_populate_print_last(void);
  void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
  unsigned long size);
+void vmemmap_free(struct page *memmap, unsigned long nr_pages);

  enum mf_flags {
MF_COUNT_INCREASED = 1<<  0,
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 1b7e22a..748732d 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -29,6 +29,10 @@
  #include
  #include

+#ifdef CONFIG_MEMORY_HOTREMOVE
+#include
+#endif
+
  /*
   * Allocate a block of memory to be used to back the virtual memory map
   * or to back the page tables that are used to create the mapping.
@@ -224,3 +228,230 @@ void __init sparse_mem_maps_populate_node(struct page 
**map_map,
vmemmap_buf_end = NULL;
}
  }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+
+#define PAGE_INUSE 0xFD
+
+static void vmemmap_free_pages(struct page *page, int order)
+{
+   struct zone *zone;
+   unsigned long magic;
+
+   magic = (unsigned long) page->lru.next;
+   if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+   put_page_bootmem(page);
+
+   zone = page_zone(page);
+   zone_span_writelock(zone);
+   zone->present_pages++;
+   zone_span_writeunlock(zone);
+   totalram_pages++;
+   } else
+   free_pages((unsigned long)page_address(page), order);


Here, I think SECTION_INFO and MIX_SECTION_INFO pages are all allocated
by bootmem, so I put this function this way.

I'm not sure if parameter order is necessary here. It will always be 0
in your code. Is this OK to you ?

static void free_pagetable(struct page *page)
{
struct zone *zone;
bool bootmem = false;
unsigned long magic;

/* bootmem page has reserved flag */
if (PageReserved(page)) {
__ClearPageReserved(page);
bootmem = true;
}

magic = (unsigned long) page->lru.next;
if (magic == SECTION_INFO || magic == MIX_SECTION_INFO)
put_page_bootmem(page);
else
__free_page(page);

/*
 * SECTION_INFO pages and MIX_SECTION_INFO pages
 * are all allocated by bootmem.
 */
if (bootmem) {
zone = page_zone(page);
zone_span_writelock(zone);
zone->present_pages++;
zone_span_writeunlock(zone);
totalram_pages++;
}
}

(snip)


+
+static void vmemmap_pte_remove(pmd_t *pmd, unsigned long addr, unsigned long 
end)
+{
+   pte_t *pte;
+   unsigned long next;
+   void *page_addr;
+
+   pte = pte_offset_kernel(pmd, addr);
+   for (; addr<  end; pte++, addr += PAGE_SIZE) {
+   next = (addr + PAGE_SIZE)&  PAGE_MASK;
+   if (next>  end)
+   next = end;
+
+   if (pte_none(*pte))


Here, you checked xxx_none() in your vmemmap_xxx_remove(), but you used
!xxx_present() in your x86_64 patches. Is it OK if I only check
!xxx_present() ?


+   continue;
+   if (IS_ALIGNED(addr, PAGE_SIZE)&&
+   IS_ALIGNED(next, PAGE_SIZE)) {
+   vmemmap_free_pages(pte_page(*pte), 0);
+   spin_lock(&init_mm.page_table_lock);
+   pte_clear(&init_mm, addr, pte);
+   spin_unlock(&init_mm.page_table_lock);
+   } else {
+   /*
+* Removed page structs are filled with 0xFD.
+*/
+   memset((void *)addr, PAGE_INUSE, next - addr);
+   page_addr = page_address(pte_page(*pte));
+
+   if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+   spin_lock(&init_mm.page_table_lock);
+   pte_clear(&init_mm, addr, pte);
+   spin_unlock(&init_mm.page_table_lock);


Here, since we clear pte, we should also free the page, right ?


+   }
+   }
+   }
+
+   free_pte_table(pmd);
+   __flush_tlb_all();
+}
+
+static void vmemmap_pmd_

Re: [Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap

2012-12-04 Thread Jianguo Wu
Hi Tang,

On 2012/12/5 10:07, Tang Chen wrote:

> Hi Wu,
> 
> On 12/04/2012 08:20 PM, Jianguo Wu wrote:
> (snip)
>>>
>>> Seems that we have different ways to handle pages allocated by bootmem
>>> or by regular allocator. Is the checking way in [PATCH 09/12] available
>>> here ?
>>>
>>> +/* bootmem page has reserved flag */
>>> +if (PageReserved(page)) {
>>> ..
>>> +}
>>>
>>> If so, I think we can just merge these two functions.
>>
>> Hmm, direct mapping table isn't allocated by bootmem allocator such as 
>> memblock, can't be freed by put_page_bootmem().
>> But I will try to merge these two functions.
>>
> 
> Oh, I didn't notice this, thanks. :)
> 
> (snip)
> 
 +
 +__split_large_page(kpte, address, pbase);
>>>
>>> Is this patch going to replace [PATCH 08/12] ?
>>>
>>
>> I wish to replace [PATCH 08/12], but need Congyang and Yasuaki to confirm 
>> first:)
>>
>>> If so, __split_large_page() was added and exported in [PATCH 09/12],
>>> then we should move it here, right ?
>>
>> yes.
>>
>> and what do you think about moving vmemmap_pud[pmd/pte]_remove() to 
>> arch/x86/mm/init_64.c,
>> to be consistent with vmemmap_populate() ?
> 
> It is a good idea since pud/pmd/pte related code could be platform
> dependent. And I'm also trying to move vmemmap_free() to
> arch/x86/mm/init_64.c too. I want to have a common interface just
> like vmemmap_populate(). :)
> 

Great.

>>
>> I will rework [PATCH 08/12] and [PATCH 09/12] soon.
> 
> I am rebasing the whole patch set now. And I think I could finish part
> of your work too. A new patch-set is coming soon, and your rework is
> also welcome. :)
>

Since you are rebasing now, I will wait for your new patch set :).

Thanks.
Jianguo Wu

> Thanks. :)
> 
> 
> 
> .
> 



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap

2012-12-04 Thread Tang Chen

Hi Wu,

On 12/04/2012 08:20 PM, Jianguo Wu wrote:
(snip)


Seems that we have different ways to handle pages allocated by bootmem
or by regular allocator. Is the checking way in [PATCH 09/12] available
here ?

+/* bootmem page has reserved flag */
+if (PageReserved(page)) {
..
+}

If so, I think we can just merge these two functions.


Hmm, direct mapping table isn't allocated by bootmem allocator such as 
memblock, can't be freed by put_page_bootmem().
But I will try to merge these two functions.



Oh, I didn't notice this, thanks. :)

(snip)


+
+__split_large_page(kpte, address, pbase);


Is this patch going to replace [PATCH 08/12] ?



I wish to replace [PATCH 08/12], but need Congyang and Yasuaki to confirm 
first:)


If so, __split_large_page() was added and exported in [PATCH 09/12],
then we should move it here, right ?


yes.

and what do you think about moving vmemmap_pud[pmd/pte]_remove() to 
arch/x86/mm/init_64.c,
to be consistent with vmemmap_populate() ?


It is a good idea since pud/pmd/pte related code could be platform
dependent. And I'm also trying to move vmemmap_free() to
arch/x86/mm/init_64.c too. I want to have a common interface just
like vmemmap_populate(). :)



I will rework [PATCH 08/12] and [PATCH 09/12] soon.


I am rebasing the whole patch set now. And I think I could finish part
of your work too. A new patch-set is coming soon, and your rework is
also welcome. :)

Thanks. :)


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap

2012-12-04 Thread Jianguo Wu
Hi Tang,

Thanks for your review and comments, Please see below for my reply.

On 2012/12/4 17:13, Tang Chen wrote:

> Hi Wu,
> 
> Sorry to make noise here. Please see below. :)
> 
> On 12/03/2012 10:23 AM, Jianguo Wu wrote:
>> Signed-off-by: Jianguo Wu
>> Signed-off-by: Jiang Liu
>> ---
>>   include/linux/mm.h  |1 +
>>   mm/sparse-vmemmap.c |  231 
>> +++
>>   mm/sparse.c |3 +-
>>   3 files changed, 234 insertions(+), 1 deletions(-)
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 5657670..1f26af5 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned 
>> long pages, int node);
>>   void vmemmap_populate_print_last(void);
>>   void register_page_bootmem_memmap(unsigned long section_nr, struct page 
>> *map,
>> unsigned long size);
>> +void vmemmap_free(struct page *memmap, unsigned long nr_pages);
>>
>>   enum mf_flags {
>>   MF_COUNT_INCREASED = 1<<  0,
>> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
>> index 1b7e22a..748732d 100644
>> --- a/mm/sparse-vmemmap.c
>> +++ b/mm/sparse-vmemmap.c
>> @@ -29,6 +29,10 @@
>>   #include
>>   #include
>>
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +#include
>> +#endif
>> +
>>   /*
>>* Allocate a block of memory to be used to back the virtual memory map
>>* or to back the page tables that are used to create the mapping.
>> @@ -224,3 +228,230 @@ void __init sparse_mem_maps_populate_node(struct page 
>> **map_map,
>>   vmemmap_buf_end = NULL;
>>   }
>>   }
>> +
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +
>> +#define PAGE_INUSE 0xFD
>> +
>> +static void vmemmap_free_pages(struct page *page, int order)
>> +{
>> +struct zone *zone;
>> +unsigned long magic;
>> +
>> +magic = (unsigned long) page->lru.next;
>> +if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
>> +put_page_bootmem(page);
>> +
>> +zone = page_zone(page);
>> +zone_span_writelock(zone);
>> +zone->present_pages++;
>> +zone_span_writeunlock(zone);
>> +totalram_pages++;
> 
> Seems that we have different ways to handle pages allocated by bootmem
> or by regular allocator. Is the checking way in [PATCH 09/12] available
> here ?
> 
> +/* bootmem page has reserved flag */
> +if (PageReserved(page)) {
> ..
> +}
> 
> If so, I think we can just merge these two functions.

Hmm, direct mapping table isn't allocated by bootmem allocator such as 
memblock, can't be freed by put_page_bootmem().
But I will try to merge these two functions.

> 
>> +} else
>> +free_pages((unsigned long)page_address(page), order);
>> +}
>> +
>> +static void free_pte_table(pmd_t *pmd)
>> +{
>> +pte_t *pte, *pte_start;
>> +int i;
>> +
>> +pte_start = (pte_t *)pmd_page_vaddr(*pmd);
>> +for (i = 0; i<  PTRS_PER_PTE; i++) {
>> +pte = pte_start + i;
>> +if (pte_val(*pte))
>> +return;
>> +}
>> +
>> +/* free a pte talbe */
>> +vmemmap_free_pages(pmd_page(*pmd), 0);
>> +spin_lock(&init_mm.page_table_lock);
>> +pmd_clear(pmd);
>> +spin_unlock(&init_mm.page_table_lock);
>> +}
>> +
>> +static void free_pmd_table(pud_t *pud)
>> +{
>> +pmd_t *pmd, *pmd_start;
>> +int i;
>> +
>> +pmd_start = (pmd_t *)pud_page_vaddr(*pud);
>> +for (i = 0; i<  PTRS_PER_PMD; i++) {
>> +pmd = pmd_start + i;
>> +if (pmd_val(*pmd))
>> +return;
>> +}
>> +
>> +/* free a pmd talbe */
>> +vmemmap_free_pages(pud_page(*pud), 0);
>> +spin_lock(&init_mm.page_table_lock);
>> +pud_clear(pud);
>> +spin_unlock(&init_mm.page_table_lock);
>> +}
>> +
>> +static void free_pud_table(pgd_t *pgd)
>> +{
>> +pud_t *pud, *pud_start;
>> +int i;
>> +
>> +pud_start = (pud_t *)pgd_page_vaddr(*pgd);
>> +for (i = 0; i<  PTRS_PER_PUD; i++) {
>> +pud = pud_start + i;
>> +if (pud_val(*pud))
>> +return;
>> +}
>> +
>> +/* free a pud table */
>> +vmemmap_free_pages(pgd_page(*pgd), 0);
>> +spin_lock(&init_mm.page_table_lock);
>> +pgd_clear(pgd);
>> +spin_unlock(&init_mm.page_table_lock);
>> +}
> 
> All the free_xxx_table() are very similar to the functions in
> [PATCH 09/12]. Could we reuse them anyway ?

yes, we can reuse them.

> 
>> +
>> +static int split_large_page(pte_t *kpte, unsigned long address, pte_t 
>> *pbase)
>> +{
>> +struct page *page = pmd_page(*(pmd_t *)kpte);
>> +int i = 0;
>> +unsigned long magic;
>> +unsigned long section_nr;
>> +
>> +__split_large_page(kpte, address, pbase);
> 
> Is this patch going to replace [PATCH 08/12] ?
> 

I wish to replace [PATCH 08/12], but need Congyang and Yasuaki to confirm 
first:)

> If so, __split_large_page() was added and exported in [PATCH 09/12],
> then we should move it here, right ?

yes.

and what do you think about moving vmemmap_p

Re: [Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap

2012-12-04 Thread Tang Chen

On 11/27/2012 06:00 PM, Wen Congyang wrote:

  static int __remove_section(struct zone *zone, struct mem_section *ms)
  {
unsigned long flags;
@@ -330,9 +317,9 @@ static int __remove_section(struct zone *zone, struct 
mem_section *ms)
pgdat_resize_lock(pgdat,&flags);
sparse_remove_one_section(zone, ms);
pgdat_resize_unlock(pgdat,&flags);
-   return 0;
+
+   return ret;


I think we don't need to change this line. :)

Reviewed-by: Tang Chen 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap

2012-12-04 Thread Tang Chen

Hi Wu,

Sorry to make noise here. Please see below. :)

On 12/03/2012 10:23 AM, Jianguo Wu wrote:

Signed-off-by: Jianguo Wu
Signed-off-by: Jiang Liu
---
  include/linux/mm.h  |1 +
  mm/sparse-vmemmap.c |  231 +++
  mm/sparse.c |3 +-
  3 files changed, 234 insertions(+), 1 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5657670..1f26af5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned 
long pages, int node);
  void vmemmap_populate_print_last(void);
  void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
  unsigned long size);
+void vmemmap_free(struct page *memmap, unsigned long nr_pages);

  enum mf_flags {
MF_COUNT_INCREASED = 1<<  0,
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 1b7e22a..748732d 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -29,6 +29,10 @@
  #include
  #include

+#ifdef CONFIG_MEMORY_HOTREMOVE
+#include
+#endif
+
  /*
   * Allocate a block of memory to be used to back the virtual memory map
   * or to back the page tables that are used to create the mapping.
@@ -224,3 +228,230 @@ void __init sparse_mem_maps_populate_node(struct page 
**map_map,
vmemmap_buf_end = NULL;
}
  }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+
+#define PAGE_INUSE 0xFD
+
+static void vmemmap_free_pages(struct page *page, int order)
+{
+   struct zone *zone;
+   unsigned long magic;
+
+   magic = (unsigned long) page->lru.next;
+   if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+   put_page_bootmem(page);
+
+   zone = page_zone(page);
+   zone_span_writelock(zone);
+   zone->present_pages++;
+   zone_span_writeunlock(zone);
+   totalram_pages++;


Seems that we have different ways to handle pages allocated by bootmem
or by regular allocator. Is the checking way in [PATCH 09/12] available
here ?

+   /* bootmem page has reserved flag */
+   if (PageReserved(page)) {
..
+   }

If so, I think we can just merge these two functions.


+   } else
+   free_pages((unsigned long)page_address(page), order);
+}
+
+static void free_pte_table(pmd_t *pmd)
+{
+   pte_t *pte, *pte_start;
+   int i;
+
+   pte_start = (pte_t *)pmd_page_vaddr(*pmd);
+   for (i = 0; i<  PTRS_PER_PTE; i++) {
+   pte = pte_start + i;
+   if (pte_val(*pte))
+   return;
+   }
+
+   /* free a pte talbe */
+   vmemmap_free_pages(pmd_page(*pmd), 0);
+   spin_lock(&init_mm.page_table_lock);
+   pmd_clear(pmd);
+   spin_unlock(&init_mm.page_table_lock);
+}
+
+static void free_pmd_table(pud_t *pud)
+{
+   pmd_t *pmd, *pmd_start;
+   int i;
+
+   pmd_start = (pmd_t *)pud_page_vaddr(*pud);
+   for (i = 0; i<  PTRS_PER_PMD; i++) {
+   pmd = pmd_start + i;
+   if (pmd_val(*pmd))
+   return;
+   }
+
+   /* free a pmd talbe */
+   vmemmap_free_pages(pud_page(*pud), 0);
+   spin_lock(&init_mm.page_table_lock);
+   pud_clear(pud);
+   spin_unlock(&init_mm.page_table_lock);
+}
+
+static void free_pud_table(pgd_t *pgd)
+{
+   pud_t *pud, *pud_start;
+   int i;
+
+   pud_start = (pud_t *)pgd_page_vaddr(*pgd);
+   for (i = 0; i<  PTRS_PER_PUD; i++) {
+   pud = pud_start + i;
+   if (pud_val(*pud))
+   return;
+   }
+
+   /* free a pud table */
+   vmemmap_free_pages(pgd_page(*pgd), 0);
+   spin_lock(&init_mm.page_table_lock);
+   pgd_clear(pgd);
+   spin_unlock(&init_mm.page_table_lock);
+}


All the free_xxx_table() are very similar to the functions in
[PATCH 09/12]. Could we reuse them anyway ?


+
+static int split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
+{
+   struct page *page = pmd_page(*(pmd_t *)kpte);
+   int i = 0;
+   unsigned long magic;
+   unsigned long section_nr;
+
+   __split_large_page(kpte, address, pbase);


Is this patch going to replace [PATCH 08/12] ?

If so, __split_large_page() was added and exported in [PATCH 09/12],
then we should move it here, right ?

If not, free_map_bootmem() and __kfree_section_memmap() were changed in
[PATCH 08/12], and we need to handle this.


+   __flush_tlb_all();
+
+   magic = (unsigned long) page->lru.next;
+   if (magic == SECTION_INFO) {
+   section_nr = pfn_to_section_nr(page_to_pfn(page));
+   while (i<  PTRS_PER_PMD) {
+   page++;
+   i++;
+   get_page_bootmem(section_nr, page, SECTION_INFO);
+   }
+   }
+
+   return 0;
+}
+
+static void vmemmap_pte_remove(pmd_t *pmd, unsigned long addr, unsigned long 

Re: [Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap

2012-12-02 Thread Jianguo Wu
Hi Congyang,

This is the new version.

Thanks,
Jianguo Wu.


Signed-off-by: Jianguo Wu 
Signed-off-by: Jiang Liu 
---
 include/linux/mm.h  |1 +
 mm/sparse-vmemmap.c |  231 +++
 mm/sparse.c |3 +-
 3 files changed, 234 insertions(+), 1 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5657670..1f26af5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned 
long pages, int node);
 void vmemmap_populate_print_last(void);
 void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
  unsigned long size);
+void vmemmap_free(struct page *memmap, unsigned long nr_pages);
 
 enum mf_flags {
MF_COUNT_INCREASED = 1 << 0,
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 1b7e22a..748732d 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -29,6 +29,10 @@
 #include 
 #include 
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
+#include 
+#endif
+
 /*
  * Allocate a block of memory to be used to back the virtual memory map
  * or to back the page tables that are used to create the mapping.
@@ -224,3 +228,230 @@ void __init sparse_mem_maps_populate_node(struct page 
**map_map,
vmemmap_buf_end = NULL;
}
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+
+#define PAGE_INUSE 0xFD
+
+static void vmemmap_free_pages(struct page *page, int order)
+{
+   struct zone *zone;
+   unsigned long magic;
+
+   magic = (unsigned long) page->lru.next;
+   if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+   put_page_bootmem(page);
+
+   zone = page_zone(page);
+   zone_span_writelock(zone);
+   zone->present_pages++;
+   zone_span_writeunlock(zone);
+   totalram_pages++;
+   } else
+   free_pages((unsigned long)page_address(page), order);
+}
+
+static void free_pte_table(pmd_t *pmd)
+{
+   pte_t *pte, *pte_start;
+   int i;
+
+   pte_start = (pte_t *)pmd_page_vaddr(*pmd);
+   for (i = 0; i < PTRS_PER_PTE; i++) {
+   pte = pte_start + i;
+   if (pte_val(*pte))
+   return;
+   }
+
+   /* free a pte talbe */
+   vmemmap_free_pages(pmd_page(*pmd), 0);
+   spin_lock(&init_mm.page_table_lock);
+   pmd_clear(pmd);
+   spin_unlock(&init_mm.page_table_lock);
+}
+
+static void free_pmd_table(pud_t *pud)
+{
+   pmd_t *pmd, *pmd_start;
+   int i;
+
+   pmd_start = (pmd_t *)pud_page_vaddr(*pud);
+   for (i = 0; i < PTRS_PER_PMD; i++) {
+   pmd = pmd_start + i;
+   if (pmd_val(*pmd))
+   return;
+   }
+
+   /* free a pmd talbe */
+   vmemmap_free_pages(pud_page(*pud), 0);
+   spin_lock(&init_mm.page_table_lock);
+   pud_clear(pud);
+   spin_unlock(&init_mm.page_table_lock);
+}
+
+static void free_pud_table(pgd_t *pgd)
+{
+   pud_t *pud, *pud_start;
+   int i;
+
+   pud_start = (pud_t *)pgd_page_vaddr(*pgd);
+   for (i = 0; i < PTRS_PER_PUD; i++) {
+   pud = pud_start + i;
+   if (pud_val(*pud))
+   return;
+   }
+
+   /* free a pud table */
+   vmemmap_free_pages(pgd_page(*pgd), 0);
+   spin_lock(&init_mm.page_table_lock);
+   pgd_clear(pgd);
+   spin_unlock(&init_mm.page_table_lock);
+}
+
+static int split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
+{
+   struct page *page = pmd_page(*(pmd_t *)kpte);
+   int i = 0;
+   unsigned long magic;
+   unsigned long section_nr;
+
+   __split_large_page(kpte, address, pbase);
+   __flush_tlb_all();
+
+   magic = (unsigned long) page->lru.next;
+   if (magic == SECTION_INFO) {
+   section_nr = pfn_to_section_nr(page_to_pfn(page));
+   while (i < PTRS_PER_PMD) {
+   page++;
+   i++;
+   get_page_bootmem(section_nr, page, SECTION_INFO);
+   }
+   }
+
+   return 0;
+}
+
+static void vmemmap_pte_remove(pmd_t *pmd, unsigned long addr, unsigned long 
end)
+{
+   pte_t *pte;
+   unsigned long next;
+   void *page_addr;
+
+   pte = pte_offset_kernel(pmd, addr);
+   for (; addr < end; pte++, addr += PAGE_SIZE) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   if (next > end)
+   next = end;
+
+   if (pte_none(*pte))
+   continue;
+   if (IS_ALIGNED(addr, PAGE_SIZE) &&
+   IS_ALIGNED(next, PAGE_SIZE)) {
+   vmemmap_free_pages(pte_page(*pte), 0);
+   spin_lock(&init_mm.page_table_lock);
+   pte_clear(&init_mm, addr, pte);
+   spin_unlock(&init_mm.page_table_lock);
+   

Re: [Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap

2012-11-29 Thread Yasuaki Ishimatsu

Hi Jianguo,

2012/11/30 11:47, Jianguo Wu wrote:

Hi Congyang,

Thanks for your review and comments.

On 2012/11/30 9:45, Wen Congyang wrote:


At 11/28/2012 05:40 PM, Jianguo Wu Wrote:

Hi Congyang,

I think vmemmap's pgtable pages should be freed after all entries are cleared, 
I have a patch to do this.
The code logic is the same as [Patch v4 09/12] memory-hotplug: remove page 
table of x86_64 architecture.

How do you think about this?

Signed-off-by: Jianguo Wu 
Signed-off-by: Jiang Liu 
---
  include/linux/mm.h  |1 +
  mm/sparse-vmemmap.c |  214 +++
  mm/sparse.c |5 +-
  3 files changed, 218 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5657670..1f26af5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned 
long pages, int node);
  void vmemmap_populate_print_last(void);
  void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
  unsigned long size);
+void vmemmap_free(struct page *memmap, unsigned long nr_pages);

  enum mf_flags {
MF_COUNT_INCREASED = 1 << 0,
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 1b7e22a..242cb28 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -29,6 +29,10 @@
  #include 
  #include 

+#ifdef CONFIG_MEMORY_HOTREMOVE
+#include 
+#endif
+
  /*
   * Allocate a block of memory to be used to back the virtual memory map
   * or to back the page tables that are used to create the mapping.
@@ -224,3 +228,213 @@ void __init sparse_mem_maps_populate_node(struct page 
**map_map,
vmemmap_buf_end = NULL;
}
  }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static void vmemmap_free_pages(struct page *page, int order)
+{
+   struct zone *zone;
+   unsigned long magic;
+
+   magic = (unsigned long) page->lru.next;
+   if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+   put_page_bootmem(page);
+
+   zone = page_zone(page);
+   zone_span_writelock(zone);
+   zone->present_pages++;
+   zone_span_writeunlock(zone);
+   totalram_pages++;
+   } else {
+   if (is_vmalloc_addr(page_address(page)))
+   vfree(page_address(page));


Hmm, vmemmap doesn't use vmalloc() to allocate memory.



yes, this can be removed.


+   else
+   free_pages((unsigned long)page_address(page), order);
+   }
+}
+
+static void free_pte_table(pmd_t *pmd)
+{
+   pte_t *pte, *pte_start;
+   int i;
+
+   pte_start = (pte_t *)pmd_page_vaddr(*pmd);
+   for (i = 0; i < PTRS_PER_PTE; i++) {
+   pte = pte_start + i;
+   if (pte_val(*pte))
+   return;
+   }
+
+   /* free a pte table */
+   vmemmap_free_pages(pmd_page(*pmd), 0);
+   spin_lock(&init_mm.page_table_lock);
+   pmd_clear(pmd);
+   spin_unlock(&init_mm.page_table_lock);
+}
+
+static void free_pmd_table(pud_t *pud)
+{
+   pmd_t *pmd, *pmd_start;
+   int i;
+
+   pmd_start = (pmd_t *)pud_page_vaddr(*pud);
+   for (i = 0; i < PTRS_PER_PMD; i++) {
+   pmd = pmd_start + i;
+   if (pmd_val(*pmd))
+   return;
+   }
+
+   /* free a pmd table */
+   vmemmap_free_pages(pud_page(*pud), 0);
+   spin_lock(&init_mm.page_table_lock);
+   pud_clear(pud);
+   spin_unlock(&init_mm.page_table_lock);
+}
+
+static void free_pud_table(pgd_t *pgd)
+{
+   pud_t *pud, *pud_start;
+   int i;
+
+   pud_start = (pud_t *)pgd_page_vaddr(*pgd);
+   for (i = 0; i < PTRS_PER_PUD; i++) {
+   pud = pud_start + i;
+   if (pud_val(*pud))
+   return;
+   }
+
+   /* free a pud table */
+   vmemmap_free_pages(pgd_page(*pgd), 0);
+   spin_lock(&init_mm.page_table_lock);
+   pgd_clear(pgd);
+   spin_unlock(&init_mm.page_table_lock);
+}
+
+static int split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
+{
+   struct page *page = pmd_page(*(pmd_t *)kpte);
+   int i = 0;
+   unsigned long magic;
+   unsigned long section_nr;
+
+   __split_large_page(kpte, address, pbase);
+   __flush_tlb_all();
+
+   magic = (unsigned long) page->lru.next;
+   if (magic == SECTION_INFO) {
+   section_nr = pfn_to_section_nr(page_to_pfn(page));
+   while (i < PTRS_PER_PMD) {
+   page++;
+   i++;
+   get_page_bootmem(section_nr, page, SECTION_INFO);
+   }
+   }
+
+   return 0;
+}
+
+static void vmemmap_pte_remove(pmd_t *pmd, unsigned long addr, unsigned long 
end)
+{
+   pte_t *pte;
+   unsigned long next;
+
+   pte = pte_offset_kernel(pmd, addr);
+   for (; addr < end; pt

Re: [Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap

2012-11-29 Thread Jianguo Wu
Hi Congyang,

Thanks for your review and comments.

On 2012/11/30 9:45, Wen Congyang wrote:

> At 11/28/2012 05:40 PM, Jianguo Wu Wrote:
>> Hi Congyang,
>>
>> I think vmemmap's pgtable pages should be freed after all entries are 
>> cleared, I have a patch to do this.
>> The code logic is the same as [Patch v4 09/12] memory-hotplug: remove page 
>> table of x86_64 architecture.
>>
>> How do you think about this?
>>
>> Signed-off-by: Jianguo Wu 
>> Signed-off-by: Jiang Liu 
>> ---
>>  include/linux/mm.h  |1 +
>>  mm/sparse-vmemmap.c |  214 
>> +++
>>  mm/sparse.c |5 +-
>>  3 files changed, 218 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 5657670..1f26af5 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned 
>> long pages, int node);
>>  void vmemmap_populate_print_last(void);
>>  void register_page_bootmem_memmap(unsigned long section_nr, struct page 
>> *map,
>>unsigned long size);
>> +void vmemmap_free(struct page *memmap, unsigned long nr_pages);
>>  
>>  enum mf_flags {
>>  MF_COUNT_INCREASED = 1 << 0,
>> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
>> index 1b7e22a..242cb28 100644
>> --- a/mm/sparse-vmemmap.c
>> +++ b/mm/sparse-vmemmap.c
>> @@ -29,6 +29,10 @@
>>  #include 
>>  #include 
>>  
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +#include 
>> +#endif
>> +
>>  /*
>>   * Allocate a block of memory to be used to back the virtual memory map
>>   * or to back the page tables that are used to create the mapping.
>> @@ -224,3 +228,213 @@ void __init sparse_mem_maps_populate_node(struct page 
>> **map_map,
>>  vmemmap_buf_end = NULL;
>>  }
>>  }
>> +
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +static void vmemmap_free_pages(struct page *page, int order)
>> +{
>> +struct zone *zone;
>> +unsigned long magic;
>> +
>> +magic = (unsigned long) page->lru.next;
>> +if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
>> +put_page_bootmem(page);
>> +
>> +zone = page_zone(page);
>> +zone_span_writelock(zone);
>> +zone->present_pages++;
>> +zone_span_writeunlock(zone);
>> +totalram_pages++;
>> +} else {
>> +if (is_vmalloc_addr(page_address(page)))
>> +vfree(page_address(page));
> 
> Hmm, vmemmap doesn't use vmalloc() to allocate memory.
> 

yes, this can be removed.

>> +else
>> +free_pages((unsigned long)page_address(page), order);
>> +}
>> +}
>> +
>> +static void free_pte_table(pmd_t *pmd)
>> +{
>> +pte_t *pte, *pte_start;
>> +int i;
>> +
>> +pte_start = (pte_t *)pmd_page_vaddr(*pmd);
>> +for (i = 0; i < PTRS_PER_PTE; i++) {
>> +pte = pte_start + i;
>> +if (pte_val(*pte))
>> +return;
>> +}
>> +
>> +/* free a pte table */
>> +vmemmap_free_pages(pmd_page(*pmd), 0);
>> +spin_lock(&init_mm.page_table_lock);
>> +pmd_clear(pmd);
>> +spin_unlock(&init_mm.page_table_lock);
>> +}
>> +
>> +static void free_pmd_table(pud_t *pud)
>> +{
>> +pmd_t *pmd, *pmd_start;
>> +int i;
>> +
>> +pmd_start = (pmd_t *)pud_page_vaddr(*pud);
>> +for (i = 0; i < PTRS_PER_PMD; i++) {
>> +pmd = pmd_start + i;
>> +if (pmd_val(*pmd))
>> +return;
>> +}
>> +
>> +/* free a pmd table */
>> +vmemmap_free_pages(pud_page(*pud), 0);
>> +spin_lock(&init_mm.page_table_lock);
>> +pud_clear(pud);
>> +spin_unlock(&init_mm.page_table_lock);
>> +}
>> +
>> +static void free_pud_table(pgd_t *pgd)
>> +{
>> +pud_t *pud, *pud_start;
>> +int i;
>> +
>> +pud_start = (pud_t *)pgd_page_vaddr(*pgd);
>> +for (i = 0; i < PTRS_PER_PUD; i++) {
>> +pud = pud_start + i;
>> +if (pud_val(*pud))
>> +return;
>> +}
>> +
>> +/* free a pud table */
>> +vmemmap_free_pages(pgd_page(*pgd), 0);
>> +spin_lock(&init_mm.page_table_lock);
>> +pgd_clear(pgd);
>> +spin_unlock(&init_mm.page_table_lock);
>> +}
>> +
>> +static int split_large_page(pte_t *kpte, unsigned long address, pte_t 
>> *pbase)
>> +{
>> +struct page *page = pmd_page(*(pmd_t *)kpte);
>> +int i = 0;
>> +unsigned long magic;
>> +unsigned long section_nr;
>> +
>> +__split_large_page(kpte, address, pbase);
>> +__flush_tlb_all();
>> +
>> +magic = (unsigned long) page->lru.next;
>> +if (magic == SECTION_INFO) {
>> +section_nr = pfn_to_section_nr(page_to_pfn(page));
>> +while (i < PTRS_PER_PMD) {
>> +page++;
>> +i++;
>> +get_page_bootmem(section_nr, page, SECTION_INFO);
>> +}
>> +}
>> +
>> +return 0;
>> +}
>> +
>> +static void vmemmap_

Re: [Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap

2012-11-29 Thread Wen Congyang
At 11/28/2012 05:40 PM, Jianguo Wu Wrote:
> Hi Congyang,
> 
> I think vmemmap's pgtable pages should be freed after all entries are 
> cleared, I have a patch to do this.
> The code logic is the same as [Patch v4 09/12] memory-hotplug: remove page 
> table of x86_64 architecture.
> 
> How do you think about this?
> 
> Signed-off-by: Jianguo Wu 
> Signed-off-by: Jiang Liu 
> ---
>  include/linux/mm.h  |1 +
>  mm/sparse-vmemmap.c |  214 
> +++
>  mm/sparse.c |5 +-
>  3 files changed, 218 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 5657670..1f26af5 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned 
> long pages, int node);
>  void vmemmap_populate_print_last(void);
>  void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
> unsigned long size);
> +void vmemmap_free(struct page *memmap, unsigned long nr_pages);
>  
>  enum mf_flags {
>   MF_COUNT_INCREASED = 1 << 0,
> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
> index 1b7e22a..242cb28 100644
> --- a/mm/sparse-vmemmap.c
> +++ b/mm/sparse-vmemmap.c
> @@ -29,6 +29,10 @@
>  #include 
>  #include 
>  
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +#include 
> +#endif
> +
>  /*
>   * Allocate a block of memory to be used to back the virtual memory map
>   * or to back the page tables that are used to create the mapping.
> @@ -224,3 +228,213 @@ void __init sparse_mem_maps_populate_node(struct page 
> **map_map,
>   vmemmap_buf_end = NULL;
>   }
>  }
> +
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +static void vmemmap_free_pages(struct page *page, int order)
> +{
> + struct zone *zone;
> + unsigned long magic;
> +
> + magic = (unsigned long) page->lru.next;
> + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
> + put_page_bootmem(page);
> +
> + zone = page_zone(page);
> + zone_span_writelock(zone);
> + zone->present_pages++;
> + zone_span_writeunlock(zone);
> + totalram_pages++;
> + } else {
> + if (is_vmalloc_addr(page_address(page)))
> + vfree(page_address(page));

Hmm, vmemmap doesn't use vmalloc() to allocate memory.

> + else
> + free_pages((unsigned long)page_address(page), order);
> + }
> +}
> +
> +static void free_pte_table(pmd_t *pmd)
> +{
> + pte_t *pte, *pte_start;
> + int i;
> +
> + pte_start = (pte_t *)pmd_page_vaddr(*pmd);
> + for (i = 0; i < PTRS_PER_PTE; i++) {
> + pte = pte_start + i;
> + if (pte_val(*pte))
> + return;
> + }
> +
> + /* free a pte table */
> + vmemmap_free_pages(pmd_page(*pmd), 0);
> + spin_lock(&init_mm.page_table_lock);
> + pmd_clear(pmd);
> + spin_unlock(&init_mm.page_table_lock);
> +}
> +
> +static void free_pmd_table(pud_t *pud)
> +{
> + pmd_t *pmd, *pmd_start;
> + int i;
> +
> + pmd_start = (pmd_t *)pud_page_vaddr(*pud);
> + for (i = 0; i < PTRS_PER_PMD; i++) {
> + pmd = pmd_start + i;
> + if (pmd_val(*pmd))
> + return;
> + }
> +
> + /* free a pmd table */
> + vmemmap_free_pages(pud_page(*pud), 0);
> + spin_lock(&init_mm.page_table_lock);
> + pud_clear(pud);
> + spin_unlock(&init_mm.page_table_lock);
> +}
> +
> +static void free_pud_table(pgd_t *pgd)
> +{
> + pud_t *pud, *pud_start;
> + int i;
> +
> + pud_start = (pud_t *)pgd_page_vaddr(*pgd);
> + for (i = 0; i < PTRS_PER_PUD; i++) {
> + pud = pud_start + i;
> + if (pud_val(*pud))
> + return;
> + }
> +
> + /* free a pud table */
> + vmemmap_free_pages(pgd_page(*pgd), 0);
> + spin_lock(&init_mm.page_table_lock);
> + pgd_clear(pgd);
> + spin_unlock(&init_mm.page_table_lock);
> +}
> +
> +static int split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
> +{
> + struct page *page = pmd_page(*(pmd_t *)kpte);
> + int i = 0;
> + unsigned long magic;
> + unsigned long section_nr;
> +
> + __split_large_page(kpte, address, pbase);
> + __flush_tlb_all();
> +
> + magic = (unsigned long) page->lru.next;
> + if (magic == SECTION_INFO) {
> + section_nr = pfn_to_section_nr(page_to_pfn(page));
> + while (i < PTRS_PER_PMD) {
> + page++;
> + i++;
> + get_page_bootmem(section_nr, page, SECTION_INFO);
> + }
> + }
> +
> + return 0;
> +}
> +
> +static void vmemmap_pte_remove(pmd_t *pmd, unsigned long addr, unsigned long 
> end)
> +{
> + pte_t *pte;
> + unsigned long next;
> +
> + pte = pte_offset_kernel(pmd, addr);
> + for (; addr < end; pte++, addr += PAGE_S

Re: [Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap

2012-11-28 Thread Jianguo Wu
Hi Congyang,

I think vmemmap's pgtable pages should be freed after all entries are cleared, 
I have a patch to do this.
The code logic is the same as [Patch v4 09/12] memory-hotplug: remove page 
table of x86_64 architecture.

How do you think about this?

Signed-off-by: Jianguo Wu 
Signed-off-by: Jiang Liu 
---
 include/linux/mm.h  |1 +
 mm/sparse-vmemmap.c |  214 +++
 mm/sparse.c |5 +-
 3 files changed, 218 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5657670..1f26af5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1642,6 +1642,7 @@ int vmemmap_populate(struct page *start_page, unsigned 
long pages, int node);
 void vmemmap_populate_print_last(void);
 void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
  unsigned long size);
+void vmemmap_free(struct page *memmap, unsigned long nr_pages);
 
 enum mf_flags {
MF_COUNT_INCREASED = 1 << 0,
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 1b7e22a..242cb28 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -29,6 +29,10 @@
 #include 
 #include 
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
+#include 
+#endif
+
 /*
  * Allocate a block of memory to be used to back the virtual memory map
  * or to back the page tables that are used to create the mapping.
@@ -224,3 +228,213 @@ void __init sparse_mem_maps_populate_node(struct page 
**map_map,
vmemmap_buf_end = NULL;
}
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static void vmemmap_free_pages(struct page *page, int order)
+{
+   struct zone *zone;
+   unsigned long magic;
+
+   magic = (unsigned long) page->lru.next;
+   if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+   put_page_bootmem(page);
+
+   zone = page_zone(page);
+   zone_span_writelock(zone);
+   zone->present_pages++;
+   zone_span_writeunlock(zone);
+   totalram_pages++;
+   } else {
+   if (is_vmalloc_addr(page_address(page)))
+   vfree(page_address(page));
+   else
+   free_pages((unsigned long)page_address(page), order);
+   }
+}
+
+static void free_pte_table(pmd_t *pmd)
+{
+   pte_t *pte, *pte_start;
+   int i;
+
+   pte_start = (pte_t *)pmd_page_vaddr(*pmd);
+   for (i = 0; i < PTRS_PER_PTE; i++) {
+   pte = pte_start + i;
+   if (pte_val(*pte))
+   return;
+   }
+
+   /* free a pte table */
+   vmemmap_free_pages(pmd_page(*pmd), 0);
+   spin_lock(&init_mm.page_table_lock);
+   pmd_clear(pmd);
+   spin_unlock(&init_mm.page_table_lock);
+}
+
+static void free_pmd_table(pud_t *pud)
+{
+   pmd_t *pmd, *pmd_start;
+   int i;
+
+   pmd_start = (pmd_t *)pud_page_vaddr(*pud);
+   for (i = 0; i < PTRS_PER_PMD; i++) {
+   pmd = pmd_start + i;
+   if (pmd_val(*pmd))
+   return;
+   }
+
+   /* free a pmd table */
+   vmemmap_free_pages(pud_page(*pud), 0);
+   spin_lock(&init_mm.page_table_lock);
+   pud_clear(pud);
+   spin_unlock(&init_mm.page_table_lock);
+}
+
+static void free_pud_table(pgd_t *pgd)
+{
+   pud_t *pud, *pud_start;
+   int i;
+
+   pud_start = (pud_t *)pgd_page_vaddr(*pgd);
+   for (i = 0; i < PTRS_PER_PUD; i++) {
+   pud = pud_start + i;
+   if (pud_val(*pud))
+   return;
+   }
+
+   /* free a pud table */
+   vmemmap_free_pages(pgd_page(*pgd), 0);
+   spin_lock(&init_mm.page_table_lock);
+   pgd_clear(pgd);
+   spin_unlock(&init_mm.page_table_lock);
+}
+
+static int split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
+{
+   struct page *page = pmd_page(*(pmd_t *)kpte);
+   int i = 0;
+   unsigned long magic;
+   unsigned long section_nr;
+
+   __split_large_page(kpte, address, pbase);
+   __flush_tlb_all();
+
+   magic = (unsigned long) page->lru.next;
+   if (magic == SECTION_INFO) {
+   section_nr = pfn_to_section_nr(page_to_pfn(page));
+   while (i < PTRS_PER_PMD) {
+   page++;
+   i++;
+   get_page_bootmem(section_nr, page, SECTION_INFO);
+   }
+   }
+
+   return 0;
+}
+
+static void vmemmap_pte_remove(pmd_t *pmd, unsigned long addr, unsigned long 
end)
+{
+   pte_t *pte;
+   unsigned long next;
+
+   pte = pte_offset_kernel(pmd, addr);
+   for (; addr < end; pte++, addr += PAGE_SIZE) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   if (next > end)
+   next = end;
+
+   if (pte_none(*pte))
+   continue;
+   if (IS_ALIGNED(addr, PAGE_SIZE) &&
+  

[Patch v4 08/12] memory-hotplug: remove memmap of sparse-vmemmap

2012-11-27 Thread Wen Congyang
From: Yasuaki Ishimatsu 

All pages of virtual mapping in removed memory cannot be freed, since some pages
used as PGD/PUD includes not only removed memory but also other memory. So the
patch checks whether page can be freed or not.

How to check whether page can be freed or not?
 1. When removing memory, the page structs of the removed memory are filled
with 0xFD.
 2. All page structs are filled with 0xFD on PT/PMD, PT/PMD can be cleared.
In this case, the page used as PT/PMD can be freed.

Applying patch, __remove_section() of CONFIG_SPARSEMEM_VMEMMAP is integrated
into one. So __remove_section() of CONFIG_SPARSEMEM_VMEMMAP is deleted.

Note:  vmemmap_kfree() and vmemmap_free_bootmem() are not implemented for ia64,
ppc, s390, and sparc.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Jianguo Wu 
Signed-off-by: Wen Congyang 
---
 arch/ia64/mm/discontig.c  |   8 
 arch/powerpc/mm/init_64.c |   8 
 arch/s390/mm/vmem.c   |   8 
 arch/sparc/mm/init_64.c   |   8 
 arch/x86/mm/init_64.c | 119 ++
 include/linux/mm.h|   2 +
 mm/memory_hotplug.c   |  17 +--
 mm/sparse.c   |  19 
 8 files changed, 165 insertions(+), 24 deletions(-)

diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 33943db..0d23b69 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -823,6 +823,14 @@ int __meminit vmemmap_populate(struct page *start_page,
return vmemmap_populate_basepages(start_page, size, node);
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 6466440..df7d155 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -298,6 +298,14 @@ int __meminit vmemmap_populate(struct page *start_page,
return 0;
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 4f4803a..ab69c34 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -236,6 +236,14 @@ out:
return ret;
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 75a984b..546855d 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2232,6 +2232,14 @@ void __meminit vmemmap_populate_print_last(void)
}
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 795dae3..e85626d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -998,6 +998,125 @@ vmemmap_populate(struct page *start_page, unsigned long 
size, int node)
return 0;
 }
 
+#define PAGE_INUSE 0xFD
+
+unsigned long find_and_clear_pte_page(unsigned long addr, unsigned long end,
+   struct page **pp, int *page_size)
+{
+   pgd_t *pgd;
+   pud_t *pud;
+   pmd_t *pmd;
+   pte_t *pte = NULL;
+   void *page_addr;
+   unsigned long next;
+
+   *pp = NULL;
+
+   pgd = pgd_offset_k(addr);
+   if (pgd_none(*pgd))
+   return pgd_addr_end(addr, end);
+
+   pud = pud_offset(pgd, addr);
+   if (pud_none(*pud))
+   return pud_addr_end(addr, end);
+
+   if (!cpu_has_pse) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   pmd = pmd_offset(pud, addr);
+   if (pmd_none(*pmd))
+   return next;
+
+   pte = pte_offset_kernel(pmd, addr);
+   if (pte_none(*pte))
+   return next;
+
+   *page_size = PAGE_SIZE;
+   *pp = pte_page(*pte);
+   } else {
+   next = pmd_addr_end(addr, end);
+
+   pmd = pmd_offset(pud, addr);
+   if (pmd_none(*pmd))
+   return next;
+
+