[PATCH v3 12/13] x86, numa, acpi, memory-hotplug: Make movablecore=acpi have higher priority.

2013-05-24 Thread Tang Chen
Arranging hotpluggable memory as ZONE_MOVABLE will degrade NUMA performance,
because the kernel cannot use movable memory.

Users who don't use memory hotplug and who don't want to lose NUMA
performance need a way to disable this functionality.

So, if users specify "movablecore=acpi" on the kernel command line, the
kernel will use the SRAT to arrange ZONE_MOVABLE, and it has higher priority
than the original movablecore and kernelcore boot options.

For those who don't want this, just specify nothing.
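
For example (illustrative command lines; the behavior is as described above):

    movablecore=acpi                   # SRAT decides ZONE_MOVABLE; kernelcore= is ignored
    movablecore=acpi kernelcore=512M   # "acpi" wins, kernelcore is still ignored
    movablecore=256M                   # original behavior: fixed movable size, SRAT not used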

Signed-off-by: Tang Chen 
---
 include/linux/memblock.h |    1 +
 mm/memblock.c            |    5 +
 mm/page_alloc.c          |   31 +--
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 08c761d..5528e8f 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -69,6 +69,7 @@ int memblock_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
 int memblock_reserve_local_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_reserve_hotpluggable(phys_addr_t base, phys_addr_t size, int nid);
+bool memblock_is_hotpluggable(struct memblock_region *region);
 void memblock_free_hotpluggable(void);
 void memblock_trim_memory(phys_addr_t align);
 void memblock_mark_kernel_nodes(void);
diff --git a/mm/memblock.c b/mm/memblock.c
index 54de398..8b9a13c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -623,6 +623,11 @@ int __init_memblock memblock_reserve_hotpluggable(phys_addr_t base,
return memblock_reserve_region(base, size, nid, flags);
 }
 
+bool __init_memblock memblock_is_hotpluggable(struct memblock_region *region)
+{
+   return region->flags & (1 << MEMBLK_HOTPLUGGABLE);
+}
+
 /**
  * __next_free_mem_range - next function for for_each_free_mem_range()
  * @idx: pointer to u64 loop variable
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b9ea143..557b21b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4793,9 +4793,37 @@ static void __init find_zone_movable_pfns_for_nodes(void)
nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+   struct memblock_type *reserved = &memblock.reserved;
 
/*
-* If movablecore was specified, calculate what size of
+* Need to find movable_zone earlier in case movablecore=acpi is
+* specified.
+*/
+   find_usable_zone_for_movable();
+
+   /*
+* If movablecore=acpi was specified, then zone_movable_pfn[] has been
+* initialized, and no more work needs to be done.
+* NOTE: In this case, we ignore kernelcore option.
+*/
+   if (movablecore_enable_srat) {
+   for (i = 0; i < reserved->cnt; i++) {
+   if (!memblock_is_hotpluggable(&reserved->regions[i]))
+   continue;
+
+   nid = reserved->regions[i].nid;
+
+   usable_startpfn = PFN_DOWN(reserved->regions[i].base);
+   zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+   min(usable_startpfn, zone_movable_pfn[nid]) :
+   usable_startpfn;
+   }
+
+   goto out;
+   }
+
+   /*
+* If movablecore=nn[KMG] was specified, calculate what size of
 * kernelcore that corresponds so that memory usable for
 * any allocation type is evenly spread. If both kernelcore
 * and movablecore are specified, then the value of kernelcore
@@ -4821,7 +4849,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)
goto out;
 
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
-   find_usable_zone_for_movable();
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
 
 restart:
-- 
1.7.1



[PATCH v3 03/13] page_alloc, mem-hotplug: Improve movablecore to {en|dis}able using SRAT.

2013-05-24 Thread Tang Chen
The Hot-Pluggable field in the SRAT specifies which memory ranges are
hotpluggable. We will arrange hotpluggable memory as ZONE_MOVABLE for users
who want to use memory hotplug functionality. But this will degrade NUMA
performance, because the kernel cannot use ZONE_MOVABLE.

So we improve the movablecore boot option to allow those who want to use
memory hotplug functionality to enable using SRAT info to arrange movable
memory.

Users can specify "movablecore=acpi" in kernel commandline to enable this
functionality.

For those who don't use memory hotplug or who don't want to lose their NUMA
performance, just don't specify anything. The kernel will work as before.

Suggested-by: Kamezawa Hiroyuki 
Signed-off-by: Tang Chen 
---
 include/linux/memory_hotplug.h |3 +++
 mm/page_alloc.c|   13 +
 2 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index b6a3be7..18fe2a3 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -33,6 +33,9 @@ enum {
ONLINE_MOVABLE,
 };
 
+/* Enable/disable SRAT in movablecore boot option */
+extern bool movablecore_enable_srat;
+
 /*
  * pgdat resizing functions
  */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f368db4..b9ea143 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -208,6 +208,8 @@ static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
 
+bool __initdata movablecore_enable_srat = false;
+
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
 EXPORT_SYMBOL(movable_zone);
@@ -5025,6 +5027,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
}
 }
 
+static void __init cmdline_movablecore_srat(char *p)
+{
+   if (p && !strcmp(p, "acpi"))
+   movablecore_enable_srat = true;
+}
+
 static int __init cmdline_parse_core(char *p, unsigned long *core)
 {
unsigned long long coremem;
@@ -5055,6 +5063,11 @@ static int __init cmdline_parse_kernelcore(char *p)
  */
 static int __init cmdline_parse_movablecore(char *p)
 {
+   cmdline_movablecore_srat(p);
+
+   if (movablecore_enable_srat)
+   return 0;
+
return cmdline_parse_core(p, &required_movablecore);
 }
 
-- 
1.7.1



[PATCH v3 11/13] x86, memblock, mem-hotplug: Free hotpluggable memory reserved by memblock.

2013-05-24 Thread Tang Chen
We reserve hotpluggable memory in memblock, and when memory initialization
is done, we have to free it to the buddy system.

This patch frees memory reserved by memblock with the flag MEMBLK_HOTPLUGGABLE.

Signed-off-by: Tang Chen 
---
 include/linux/memblock.h |    1 +
 mm/memblock.c            |   20 
 mm/nobootmem.c           |    3 +++
 3 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 0f01930..08c761d 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -69,6 +69,7 @@ int memblock_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
 int memblock_reserve_local_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_reserve_hotpluggable(phys_addr_t base, phys_addr_t size, int nid);
+void memblock_free_hotpluggable(void);
 void memblock_trim_memory(phys_addr_t align);
 void memblock_mark_kernel_nodes(void);
 bool memblock_is_kernel_node(int nid);
diff --git a/mm/memblock.c b/mm/memblock.c
index 0c55588..54de398 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -568,6 +568,26 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
return __memblock_remove(&memblock.reserved, base, size);
 }
 
+static void __init_memblock memblock_free_flags(unsigned long flags)
+{
+   int i;
+   struct memblock_type *reserved = &memblock.reserved;
+
+   for (i = 0; i < reserved->cnt; i++) {
+   if (reserved->regions[i].flags == flags)
+   memblock_remove_region(reserved, i);
+   }
+}
+
+void __init_memblock memblock_free_hotpluggable()
+{
+   unsigned long flags = 1 << MEMBLK_HOTPLUGGABLE;
+
+   memblock_dbg("memblock: free all hotpluggable memory");
+
+   memblock_free_flags(flags);
+}
+
 static int __init_memblock memblock_reserve_region(phys_addr_t base,
   phys_addr_t size,
   int nid,
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 5e07d36..cd85604 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -165,6 +165,9 @@ unsigned long __init free_all_bootmem(void)
for_each_online_pgdat(pgdat)
reset_node_lowmem_managed_pages(pgdat);
 
+   /* Hotpluggable memory reserved by memblock should also be freed. */
+   memblock_free_hotpluggable();
+
/*
 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
 *  because in some case like Node0 doesn't have RAM installed
-- 
1.7.1



[PATCH v3 06/13] memblock, numa: Introduce flag into memblock.

2013-05-24 Thread Tang Chen
There is no flag in memblock to describe what type the memory is.
Sometimes, we may use memblock to reserve some memory for special usage.
For example, as Yinghai did in his patch, allocate pagetables on local
node before all the memory on the node is mapped.
Please refer to Yinghai's patch:
v1: https://lkml.org/lkml/2013/3/7/642
v2: https://lkml.org/lkml/2013/3/10/47
v3: https://lkml.org/lkml/2013/4/4/639
v4: https://lkml.org/lkml/2013/4/11/829

In a hotplug environment, there could be problems when we hot-remove memory
if we do so. Pagetable pages are kernel memory, which we cannot migrate. But
we can put them on the local node, because their life cycle is the same as
the node's. So we need to free them all before hot-removing memory.

Actually, data whose life cycle is the same as a node's, such as pagetable
pages, vmemmap pages, and page_cgroup pages, could all be put on the local
node. They can be freed when we hot-remove a whole node.

In order to do so, we need to mark out these special pages in memblock.
In this patch, we introduce a new "flags" member into memblock_region:
   struct memblock_region {
   phys_addr_t base;
   phys_addr_t size;
   unsigned long flags;
   #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
   int nid;
   #endif
   };

This patch does the following things:
1) Add "flags" member to memblock_region, and MEMBLK_ANY flag for common usage.
2) Modify the following APIs' prototype:
memblock_add_region()
memblock_insert_region()
3) Add memblock_reserve_region() to support reserving memory with flags, and
   keep memblock_reserve()'s prototype unmodified.
4) Modify other APIs to support flags, but keep their prototype unmodified.
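
For illustration: with this layout, each flag is a bit index into ->flags, so
a region's flag can be tested as in the sketch below (the helper name is mine,
not part of the patch; a later patch in this series, memblock_is_hotpluggable(),
open-codes the same test):

static bool memblock_region_has_flag(struct memblock_region *r, int flag)
{
	/* each enum memblock_flags value names one bit in ->flags */
	return r->flags & (1UL << flag);
}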

The idea is from Wen Congyang and Liu Jiang.

Suggested-by: Wen Congyang 
Suggested-by: Liu Jiang 
Signed-off-by: Tang Chen 
---
 include/linux/memblock.h |    8 ++
 mm/memblock.c            |   56 +
 2 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f388203..c63a66e 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -19,9 +19,17 @@
 
 #define INIT_MEMBLOCK_REGIONS  128
 
+#define MEMBLK_FLAGS_DEFAULT   0
+
+/* Definition of memblock flags. */
+enum memblock_flags {
+   __NR_MEMBLK_FLAGS,  /* number of flags */
+};
+
 struct memblock_region {
phys_addr_t base;
phys_addr_t size;
+   unsigned long flags;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int nid;
 #endif
diff --git a/mm/memblock.c b/mm/memblock.c
index 16eda3d..63924ae 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -157,6 +157,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
type->cnt = 1;
type->regions[0].base = 0;
type->regions[0].size = 0;
+   type->regions[0].flags = 0;
memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
}
 }
@@ -307,7 +308,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
 
if (this->base + this->size != next->base ||
memblock_get_region_node(this) !=
-   memblock_get_region_node(next)) {
+   memblock_get_region_node(next) ||
+   this->flags != next->flags) {
BUG_ON(this->base + this->size > next->base);
i++;
continue;
@@ -327,13 +329,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
  * @base:  base address of the new region
  * @size:  size of the new region
  * @nid:   node id of the new region
+ * @flags: flags of the new region
  *
  * Insert new memblock region [@base,@base+@size) into @type at @idx.
  * @type must already have extra room to accomodate the new region.
  */
 static void __init_memblock memblock_insert_region(struct memblock_type *type,
   int idx, phys_addr_t base,
-  phys_addr_t size, int nid)
+  phys_addr_t size,
+  int nid, unsigned long flags)
 {
struct memblock_region *rgn = &type->regions[idx];
 
@@ -341,6 +345,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
rgn->base = base;
rgn->size = size;
+   rgn->flags = flags;
memblock_set_region_node(rgn, nid);
type->cnt++;
type->total_size += size;
@@ -352,6 +357,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
  * @base: base address of the new region
 * @size: size of the new region

[PATCH v3 10/13] x86, acpi, numa, mem-hotplug: Introduce MEMBLK_HOTPLUGGABLE to mark and reserve hotpluggable memory.

2013-05-24 Thread Tang Chen
We mark out movable memory ranges and reserve them with the MEMBLK_HOTPLUGGABLE
flag in memblock.reserved. This should be done after the memory mapping is
initialized, because the kernel now supports allocating pagetable pages on the
local node, and those are kernel pages.

The reserved hotpluggable memory will be freed to the buddy system when
memory initialization is done.

Also, ensure that all the nodes which the kernel resides in are
un-hotpluggable.

This idea is from Wen Congyang and Jiang Liu.

Suggested-by: Jiang Liu 
Suggested-by: Wen Congyang 
Signed-off-by: Tang Chen 
Reviewed-by: Vasilis Liaskovitis 
---
 arch/x86/mm/numa.c       |   29 +
 include/linux/memblock.h |    3 +++
 mm/memblock.c            |   19 +++
 3 files changed, 51 insertions(+), 0 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index b28baf3..73f9ade 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -727,6 +727,33 @@ static void __init early_x86_numa_init_mapping(void)
 }
 #endif
 
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static void __init early_mem_hotplug_init()
+{
+   int i, nid;
+   phys_addr_t start, end;
+
+   if (!movablecore_enable_srat)
+   return;
+
+   for (i = 0; i < numa_meminfo.nr_blks; i++) {
+   nid = numa_meminfo.blk[i].nid;
+   start = numa_meminfo.blk[i].start;
+   end = numa_meminfo.blk[i].end;
+
+   if (!numa_meminfo.blk[i].hotpluggable ||
+   memblock_is_kernel_node(nid))
+   continue;
+
+   memblock_reserve_hotpluggable(start, end - start, nid);
+   }
+}
+#else  /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+static inline void early_mem_hotplug_init()
+{
+}
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
 void __init early_initmem_init(void)
 {
early_x86_numa_init();
@@ -736,6 +763,8 @@ void __init early_initmem_init(void)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
 
+   early_mem_hotplug_init();
+
early_memtest(0, max_pfn_mapped<<PAGE_SHIFT);


[PATCH v3 09/13] x86, numa, memblock: Introduce MEMBLK_LOCAL_NODE to mark and reserve node-life-cycle data.

2013-05-24 Thread Tang Chen
Node-life-cycle data (data whose life cycle is the same as a node's)
allocated by memblock should be marked, so that when we free usable memory
to the buddy system, we can skip it.

This patch introduces a flag MEMBLK_LOCAL_NODE for memblock to reserve
node-life-cycle data. For now, it is only kernel direct mapping pagetable
pages, based on Yinghai's patch.

Signed-off-by: Tang Chen 
---
 arch/x86/mm/init.c       |   16 
 include/linux/memblock.h |    2 ++
 mm/memblock.c            |    7 +++
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8d0007a..002d487 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -62,14 +62,22 @@ __ref void *alloc_low_pages(unsigned int num)
low_min_pfn_mapped << PAGE_SHIFT,
low_max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
-   } else
+   if (!ret)
+   panic("alloc_low_page: can not alloc memory");
+
+   memblock_reserve(ret, PAGE_SIZE * num);
+   } else {
ret = memblock_find_in_range(
local_min_pfn_mapped << PAGE_SHIFT,
local_max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
-   if (!ret)
-   panic("alloc_low_page: can not alloc memory");
-   memblock_reserve(ret, PAGE_SIZE * num);
+   if (!ret)
+   panic("alloc_low_page: can not alloc memory");
+
+   memblock_reserve_local_node(ret, PAGE_SIZE * num,
+   memory_add_physaddr_to_nid(ret));
+   }
+
pfn = ret >> PAGE_SHIFT;
} else {
pfn = pgt_buf_end;
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 5064eed..3b2d1c4 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -23,6 +23,7 @@
 
 /* Definition of memblock flags. */
 enum memblock_flags {
+   MEMBLK_LOCAL_NODE,  /* node-life-cycle data */
__NR_MEMBLK_FLAGS,  /* number of flags */
 };
 
@@ -65,6 +66,7 @@ int memblock_add(phys_addr_t base, phys_addr_t size);
 int memblock_remove(phys_addr_t base, phys_addr_t size);
 int memblock_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
+int memblock_reserve_local_node(phys_addr_t base, phys_addr_t size, int nid);
 void memblock_trim_memory(phys_addr_t align);
 void memblock_mark_kernel_nodes(void);
 
diff --git a/mm/memblock.c b/mm/memblock.c
index 1b93a5d..edde4c2 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -589,6 +589,13 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
   MEMBLK_FLAGS_DEFAULT);
 }
 
+int __init_memblock memblock_reserve_local_node(phys_addr_t base,
+   phys_addr_t size, int nid)
+{
+   unsigned long flags = 1 << MEMBLK_LOCAL_NODE;
+   return memblock_reserve_region(base, size, nid, flags);
+}
+
 /**
  * __next_free_mem_range - next function for for_each_free_mem_range()
  * @idx: pointer to u64 loop variable
-- 
1.7.1



Re: [PATCH] Driver core / memory: Simplify __memory_block_change_state()

2013-05-22 Thread Tang Chen

Reviewed-by: Tang Chen 

Thanks. :)

On 05/23/2013 06:06 AM, Rafael J. Wysocki wrote:

From: Rafael J. Wysocki

As noted by Tang Chen, the last_online field in struct memory_block
introduced by commit 4960e05 (Driver core: Introduce offline/online
callbacks for memory blocks) is not really necessary, because
online_pages() restores the previous state if passed ONLINE_KEEP as
the last argument.  Therefore, remove that field along with the code
referring to it.
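
For reference, the restore works because online_pages() derives the target
zone from the page itself rather than from a remembered type, roughly:

	/* ONLINE_KEEP: the zone is decoded from page->flags */
	zone = page_zone(pfn_to_page(pfn));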

References: http://marc.info/?l=linux-kernel&m=136919777305599&w=2
Signed-off-by: Rafael J. Wysocki
---

Hi,

The patch is on top of (and the commit mentioned in the changelog is present
in) the acpi-hotplug branch of the linux-pm.git tree.

Thanks,
Rafael

---
  drivers/base/memory.c  |   11 ++-
  include/linux/memory.h |    1 -
  2 files changed, 2 insertions(+), 10 deletions(-)

Index: linux-pm/drivers/base/memory.c
===
--- linux-pm.orig/drivers/base/memory.c
+++ linux-pm/drivers/base/memory.c
@@ -291,13 +291,7 @@ static int __memory_block_change_state(s
mem->state = MEM_GOING_OFFLINE;

ret = memory_block_action(mem->start_section_nr, to_state, online_type);
-   if (ret) {
-   mem->state = from_state_req;
-   } else {
-   mem->state = to_state;
-   if (to_state == MEM_ONLINE)
-   mem->last_online = online_type;
-   }
+   mem->state = ret ? from_state_req : to_state;
return ret;
  }

@@ -310,7 +304,7 @@ static int memory_subsys_online(struct d

ret = mem->state == MEM_ONLINE ? 0 :
__memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE,
-   mem->last_online);
+   ONLINE_KEEP);

mutex_unlock(&mem->state_mutex);
return ret;
@@ -618,7 +612,6 @@ static int init_memory_block(struct memo
base_memory_block_id(scn_nr) * sections_per_block;
mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
mem->state = state;
-   mem->last_online = ONLINE_KEEP;
mem->section_count++;
mutex_init(&mem->state_mutex);
start_pfn = section_nr_to_pfn(mem->start_section_nr);
Index: linux-pm/include/linux/memory.h
===
--- linux-pm.orig/include/linux/memory.h
+++ linux-pm/include/linux/memory.h
@@ -26,7 +26,6 @@ struct memory_block {
unsigned long start_section_nr;
unsigned long end_section_nr;
unsigned long state;
-   int last_online;
int section_count;

/*





Re: [PATCH v2 01/13] x86: get pg_data_t's memory from other node

2013-05-22 Thread Tang Chen

On 05/22/2013 04:55 PM, Chen Gong wrote:
..

-   nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+   nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);


Going through the implementation of memblock_alloc_try_nid: it will call
panic() when the allocation fails (i.e. alloc == 0), so the information below
will never be printed. Do we really need this?


Oh, yes.

We don't need this. Will remove the following in the next version.

Thanks. :)
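
For context, a sketch of the v3.10-era call chain (from memory, worth
double-checking) showing why the failure branch quoted below is dead code:

	memblock_alloc_try_nid(size, align, nid)
	    -> memblock_alloc_nid(size, align, nid)   /* may return 0 */
	    -> falls back to memblock_alloc_base(size, align,
	                     MEMBLOCK_ALLOC_ACCESSIBLE),
	       which calls panic() instead of returning 0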




if (!nd_pa) {
-   pr_err("Cannot find %zu bytes in node %d\n",
-  nd_size, nid);
+   pr_err("Cannot find %zu bytes in any node\n", nd_size);
return;
}
nd = __va(nd_pa);
--
1.7.1





Re: [PATCH v4 00/22] x86, ACPI, numa: Parse numa info early

2013-05-21 Thread Tang Chen

On 05/10/2013 02:24 AM, Yinghai Lu wrote:
..


If you have any thinking of this patch-set, please let me know.


Talked to HPA, and he will put my patchset into tip/x86/mm after v3.10-rc1.

after that we can work on put pagetable on local node for hotadd path.



Hi,

It is Linux v3.10-rc2 now, but I didn't find this patch-set merged into
tip/x86/mm. Was it merged somewhere else, or do we have any other plan to
push it?

By the way, I have done some tests for this patch-set, and the test results
have been sent. Please refer to:
https://lkml.org/lkml/2013/4/30/45

Reviewed-by: Tang Chen 
Tested-by: Tang Chen 

Thanks. :)



Re: [PATCH 2/2 v2, RFC] Driver core: Introduce offline/online callbacks for memory blocks

2013-05-21 Thread Tang Chen

Hi Rafael,

On 05/21/2013 07:15 PM, Rafael J. Wysocki wrote:
..

+   mem->state = to_state;
+   if (to_state == MEM_ONLINE)
+   mem->last_online = online_type;


Why do we need to remember the last online type?

And as far as I know, we can obtain which zone a page was in the last time it
was onlined by checking page->flags, just like online_pages() does. If we
use online_kernel or online_movable, the zone boundary will be recalculated.
So we don't need to remember the last online type.

Seeing from your patch, I guess memory_subsys_online() can only handle
online and offline. So mem->last_online is used to remember what the user has
done through the original way to trigger memory hot-remove, right? And when
the user does it in this new way, it just does the same thing as the user did
last time.

But I still think we don't need to remember it, because if you finally call
online_pages(), it just does the same thing as last time by default.

online_pages()
{
	...
	if (online_type == ONLINE_KERNEL) ...
	if (online_type == ONLINE_MOVABLE) ...

	zone = page_zone(pfn_to_page(pfn));
	/* Here, the page will be put into the zone it belonged to last time. */
	...
}


To be honest, it wasn't entirely clear to me that online_pages() would do the
same thing as last time by default.  Suppose, for example, that the previous
online_type was ONLINE_MOVABLE.  How is online_pages() supposed to know that
it should do the move_pfn_zone_right() if we don't tell it to do that?  Or
is that unnecessary, because it's already been done previously?


Yes, it is unnecessary. move_pfn_zone_right/left() will modify the zone
related bits in page->flags. But when the page is offline, the zone related
bits in page->flags will not change. So when it is online again, by default,
it will be in the zone which it was in last time.

..



I just thought of it. Maybe I missed something in your design. Please tell
me if I'm wrong.


Well, so what should be passed to __memory_block_change_state() in
memory_subsys_online()?  -1?


If you want to keep the last time status, you can pass ONLINE_KEEP.
Or -1 is all right.

Thanks. :)




Reviewed-by: Tang Chen

Thanks. :)


Thanks for your comments,
Rafael






Re: [PATCH v2 12/13] x86, numa, acpi, memory-hotplug: Make movablecore=acpi have higher priority.

2013-05-21 Thread Tang Chen

Hi Vasilis,

Maybe the following two problems are the cause of the reboot panic
problem in qemu you mentioned.

On 04/30/2013 05:21 PM, Tang Chen wrote:
..

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b9ea143..2fe9ebf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4793,9 +4793,31 @@ static void __init find_zone_movable_pfns_for_nodes(void)
nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+   struct memblock_type *reserved = &memblock.reserved;



Need to call find_usable_zone_for_movable() here before goto out.


/*
-* If movablecore was specified, calculate what size of
+* If movablecore=acpi was specified, then zone_movable_pfn[] has been
+* initialized, and no more work needs to do.
+* NOTE: In this case, we ignore kernelcore option.
+*/
+   if (movablecore_enable_srat) {
+   for (i = 0; i < reserved->cnt; i++) {
+   if (!memblock_is_hotpluggable(&reserved->regions[i]))
+   continue;
+
+   nid = reserved->regions[i].nid;
+
+   usable_startpfn = reserved->regions[i].base;


Here, it should be PFN_DOWN(reserved->regions[i].base).

Thanks. :)


+   zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+   min(usable_startpfn, zone_movable_pfn[nid]) :
+   usable_startpfn;
+   }
+
+   goto out;
+   }
+
+   /*
+* If movablecore=nn[KMG] was specified, calculate what size of
 * kernelcore that corresponds so that memory usable for
 * any allocation type is evenly spread. If both kernelcore
 * and movablecore are specified, then the value of kernelcore



Re: [PATCH 1/1] numa, mm, memory-hotplug: Do not allocate pagetable to local node with MEMORY_HOTREMOVE enabled.

2013-05-21 Thread Tang Chen

Hi

On 05/21/2013 03:02 PM, Pekka Enberg wrote:
..


Ugh. Special-casing for CONFIG_MEMORY_HOTPLUG is just begging for
trouble. Were you able to determine which commit broke memory
hot-remove?


Please refer to the following patch-set.
https://lkml.org/lkml/2013/4/11/829

Patches 21 and 22 will allocate pagetables on the local node, which may
cause memory hot-remove to fail.

But this patch-set is not in the mainline now.

Thanks. :)




Re: [PATCH 1/2 v2, RFC] ACPI / memhotplug: Bind removable memory blocks to ACPI device nodes

2013-05-20 Thread Tang Chen

Hi Rafael,

Seems OK to me.

Reviewed-by: Tang Chen 

Thanks. :)

On 05/04/2013 07:12 PM, Rafael J. Wysocki wrote:

From: Rafael J. Wysocki

During ACPI memory hotplug configuration bind memory blocks residing
in modules removable through the standard ACPI mechanism to struct
acpi_device objects associated with ACPI namespace objects
representing those modules.  Accordingly, unbind those memory blocks
from the struct acpi_device objects when the memory modules in
question are being removed.

When "offline" operation for devices representing memory blocks is
introduced, this will allow the ACPI core's device hot-remove code to
use it to carry out remove_memory() for those memory blocks and check
the results of that before it actually removes the modules holding
them from the system.

Since walk_memory_range() is used for accessing all memory blocks
corresponding to a given ACPI namespace object, it is exported from
memory_hotplug.c so that the code in acpi_memhotplug.c can use it.

Signed-off-by: Rafael J. Wysocki
---
  drivers/acpi/acpi_memhotplug.c |   53 ++---
  include/linux/memory_hotplug.h |    2 +
  mm/memory_hotplug.c            |    4 ++-
  3 files changed, 55 insertions(+), 4 deletions(-)

Index: linux-pm/mm/memory_hotplug.c
===
--- linux-pm.orig/mm/memory_hotplug.c
+++ linux-pm/mm/memory_hotplug.c
@@ -1618,6 +1618,7 @@ int offline_pages(unsigned long start_pf
  {
return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
  }
+#endif /* CONFIG_MEMORY_HOTREMOVE */

  /**
   * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
@@ -1631,7 +1632,7 @@ int offline_pages(unsigned long start_pf
   *
   * Returns the return value of func.
   */
-static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
void *arg, int (*func)(struct memory_block *, void *))
  {
struct memory_block *mem = NULL;
@@ -1668,6 +1669,7 @@ static int walk_memory_range(unsigned lo
return 0;
  }

+#ifdef CONFIG_MEMORY_HOTREMOVE
  /**
   * offline_memory_block_cb - callback function for offlining memory block
   * @mem: the memory block to be offlined
Index: linux-pm/include/linux/memory_hotplug.h
===
--- linux-pm.orig/include/linux/memory_hotplug.h
+++ linux-pm/include/linux/memory_hotplug.h
@@ -245,6 +245,8 @@ static inline int is_mem_section_removab
  static inline void try_offline_node(int nid) {}
  #endif /* CONFIG_MEMORY_HOTREMOVE */

+extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+   void *arg, int (*func)(struct memory_block *, void *));
  extern int mem_online_node(int nid);
  extern int add_memory(int nid, u64 start, u64 size);
  extern int arch_add_memory(int nid, u64 start, u64 size);
Index: linux-pm/drivers/acpi/acpi_memhotplug.c
===
--- linux-pm.orig/drivers/acpi/acpi_memhotplug.c
+++ linux-pm/drivers/acpi/acpi_memhotplug.c
@@ -28,6 +28,7 @@
   */

  #include
+#include
  #include

  #include "internal.h"
@@ -166,13 +167,50 @@ static int acpi_memory_check_device(stru
return 0;
  }

+static unsigned long acpi_meminfo_start_pfn(struct acpi_memory_info *info)
+{
+   return PFN_DOWN(info->start_addr);
+}
+
+static unsigned long acpi_meminfo_end_pfn(struct acpi_memory_info *info)
+{
+   return PFN_UP(info->start_addr + info->length-1);
+}
+
+static int acpi_bind_memblk(struct memory_block *mem, void *arg)
+{
+   return acpi_bind_one(&mem->dev, (acpi_handle)arg);
+}
+
+static int acpi_bind_memory_blocks(struct acpi_memory_info *info,
+  acpi_handle handle)
+{
+   return walk_memory_range(acpi_meminfo_start_pfn(info),
+acpi_meminfo_end_pfn(info), (void *)handle,
+acpi_bind_memblk);
+}
+
+static int acpi_unbind_memblk(struct memory_block *mem, void *arg)
+{
+   acpi_unbind_one(&mem->dev);
+   return 0;
+}
+
+static void acpi_unbind_memory_blocks(struct acpi_memory_info *info,
+ acpi_handle handle)
+{
+   walk_memory_range(acpi_meminfo_start_pfn(info),
+ acpi_meminfo_end_pfn(info), NULL, acpi_unbind_memblk);
+}
+
  static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
  {
+   acpi_handle handle = mem_device->device->handle;
int result, num_enabled = 0;
struct acpi_memory_info *info;
int node;

-   node = acpi_get_node(mem_device->device->handle);
+   node = acpi_get_node(handle);
/*
 * Tell the VM there is more memory here...
 * Note: Assume that this func

Re: [PATCH 2/2 v2, RFC] Driver core: Introduce offline/online callbacks for memory blocks

2013-05-20 Thread Tang Chen

Hi Rafael,

Please see below.

On 05/04/2013 07:21 PM, Rafael J. Wysocki wrote:
..

  static BLOCKING_NOTIFIER_HEAD(memory_chain);
@@ -278,33 +283,64 @@ static int __memory_block_change_state(s
  {
int ret = 0;

-   if (mem->state != from_state_req) {
-   ret = -EINVAL;
-   goto out;
-   }
+   if (mem->state != from_state_req)
+   return -EINVAL;

if (to_state == MEM_OFFLINE)
mem->state = MEM_GOING_OFFLINE;

ret = memory_block_action(mem->start_section_nr, to_state, online_type);
-
if (ret) {
mem->state = from_state_req;
-   goto out;
+   } else {
+   mem->state = to_state;
+   if (to_state == MEM_ONLINE)
+   mem->last_online = online_type;


Why do we need to remember the last online type?

And as far as I know, we can obtain which zone a page was in the last time it
was onlined by checking page->flags, just like online_pages() does. If we
use online_kernel or online_movable, the zone boundary will be recalculated.
So we don't need to remember the last online type.

Seeing from your patch, I guess memory_subsys_online() can only handle
online and offline. So mem->last_online is used to remember what the user has
done through the original way to trigger memory hot-remove, right? And when
the user does it in this new way, it just does the same thing as the user did
last time.

But I still think we don't need to remember it, because if you finally call
online_pages(), it just does the same thing as last time by default.

online_pages()
{
	...
	if (online_type == ONLINE_KERNEL) ...
	if (online_type == ONLINE_MOVABLE) ...

	zone = page_zone(pfn_to_page(pfn));
	/* Here, the page will be put into the zone it belonged to last time. */
	...
}

I just thought of it. Maybe I missed something in your design. Please tell
me if I'm wrong.

Reviewed-by: Tang Chen 

Thanks. :)




Re: [WiP]: aio support for migrating pages (Re: [PATCH V2 1/2] mm: hotplug: implement non-movable version of get_user_pages() called get_user_pages_non_movable())

2013-05-20 Thread Tang Chen

Hi Benjamin,

Sorry for the late reply. Please see below.

On 05/17/2013 10:37 PM, Benjamin LaHaise wrote:

On Fri, May 17, 2013 at 11:28:52AM +0800, Tang Chen wrote:

Hi Benjamin,

Thank you very much for your idea. :)

I have no objection to your idea, but seeing from your patch, this only
works for the aio subsystem, because you changed the way the aio ring pages
are allocated, using a file mapping.


That is correct.  There is no way you're going to be able to solve this
problem without dealing with the issue on a subsystem by subsystem basis.



Yes, I understand that. We need subsystem work anyway.



I'm working in the way Mel has said, migrate_unpin() and migrate_pin()
callbacks. But as you saw, I met some problems, like not knowing where to
put these two callbacks. Having discussed with you guys, I want to try this:

1. Add a new member to struct page, used to remember the pin holders of
this page, including the pin and unpin callbacks and the necessary data.
This is more like a callback chain.
(I'm worried about this step; I'm not sure if it is good enough. After
all, we need a good place to put the callbacks.)


Putting function pointers into struct page is not going to happen.  You'd
be adding a significant amount of memory overhead for something that is
never going to be used on the vast majority of systems (2 function pointers
would be 16 bytes per page on a 64 bit system).  Keep in mind that distro
kernels tend to enable almost all config options on their kernels, so the
overhead of any approach has to make sense for the users of the kernel that
will never make use of this kind of migration.


True. But I just cannot find a place to hold the callbacks.




3. Call these callbacks before and after migration.


How is that better than using the existing hook in address_space_operations?


I'm not saying using two callbacks before and after migration is better.
I don't want to use address_space_operations because there is no such member
for anonymous pages.

In your idea, using a file mapping will create an address_space_operations.
But I really don't think we can modify the way of memory allocation for all
the subsystems that have this problem -- maybe not just aio and CMA. That
means if you want to pin pages in memory, you have to use a file mapping.
This makes memory allocation more complicated. And the idea would have to be
known by all the subsystem developers. Is that going to happen?


I also thought about reusing one field of struct page. But as you said, there
may not be many users of this functionality. Reusing a field of struct page
would make things more complicated and lead to high coupling.


So, how about the other idea that Mel mentioned?

We create a 1:1 mapping of pinned page ranges to the pinner (subsystem
callbacks and data), maybe a global list or a hash table. Then we can
find the callbacks.
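
A minimal sketch of that idea, using the kernel's generic hashtable; every
name here (pin_holder, pin_holder_add, ...) is hypothetical, and only the
hashtable/spinlock APIs are the real kernel ones:

#include <linux/hashtable.h>
#include <linux/spinlock.h>

struct pin_holder {
	struct page *page;			/* the pinned page (the key) */
	void (*migrate_unpin)(struct page *page, void *data);
	void (*migrate_pin)(struct page *newpage, void *data);
	void *data;				/* pinning subsystem's context */
	struct hlist_node node;
};

static DEFINE_HASHTABLE(pin_holders, 8);
static DEFINE_SPINLOCK(pin_holders_lock);

/* called by a subsystem when it is about to pin h->page */
static void pin_holder_add(struct pin_holder *h)
{
	spin_lock(&pin_holders_lock);
	hash_add(pin_holders, &h->node, (unsigned long)h->page);
	spin_unlock(&pin_holders_lock);
}

/* called from the migrate path to find the pinner's callbacks */
static struct pin_holder *pin_holder_find(struct page *page)
{
	struct pin_holder *h;

	hash_for_each_possible(pin_holders, h, node, (unsigned long)page)
		if (h->page == page)
			return h;
	return NULL;
}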


Thanks. :)


Re: [WiP]: aio support for migrating pages (Re: [PATCH V2 1/2] mm: hotplug: implement non-movable version of get_user_pages() called get_user_pages_non_movable())

2013-05-16 Thread Tang Chen

Hi Benjamin,

Thank you very much for your idea. :)

I have no objection to your idea, but seeing from your patch, this only
works for the aio subsystem, because you changed the way the aio ring pages
are allocated, using a file mapping.

So far as I know, not only aio but also other subsystems, such as CMA, will
have problems like this: the page cannot be migrated because it is pinned in
memory. So I think we should work out a common way to migrate pinned pages.

I'm working in the way Mel has said, migrate_unpin() and migrate_pin()
callbacks. But as you saw, I met some problems, like not knowing where to
put these two callbacks. Having discussed with you guys, I want to try this:

1. Add a new member to struct page, used to remember the pin holders of
   this page, including the pin and unpin callbacks and the necessary data.
   This is more like a callback chain (see the sketch after this list).
   (I'm worried about this step; I'm not sure if it is good enough. After
   all, we need a good place to put the callbacks.)

And then, like Mel said,

2. Implement the callbacks in the subsystems, and register them to the
   new member in struct page.

3. Call these callbacks before and after migration.
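
To make step 1 concrete, a rough sketch of the callback chain. Both the
pin_holder type and the pin_chain member of struct page are hypothetical --
this new member is exactly the part I'm not sure is acceptable:

struct pin_holder {
	void (*migrate_unpin)(struct page *page, void *data);	/* step 3, before */
	void (*migrate_pin)(struct page *newpage, void *data);	/* step 3, after */
	void *data;			/* the pin holder's own context */
	struct pin_holder *next;	/* chain: one entry per pin holder */
};

/* step 2: a subsystem registers itself on the (proposed) per-page chain */
static void page_add_pin_holder(struct page *page, struct pin_holder *h)
{
	h->next = page->pin_chain;	/* pin_chain: the proposed new member */
	page->pin_chain = h;
}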


I think I'll send an RFC patch next week when I finish the outline. I'm
just trying to find a common way to solve this problem, so that all the
other subsystems will benefit from it.

Thanks. :)


On 05/17/2013 08:23 AM, Benjamin LaHaise wrote:

On Thu, May 16, 2013 at 01:54:18PM +0800, Tang Chen wrote:
...

OK, I'll try to figure out a proper place to put the callbacks.
But I think we need to add something new to struct page. I'm just
not sure if it is OK. Maybe we can discuss more about it when I send
a RFC patch.

...

I ended up working on this a bit today, and managed to cobble together
something that somewhat works -- please see the patch below.  It still is
not completely tested, and it has a rather nasty bug owing to the fact
that the file descriptors returned by anon_inode_getfile() all share the
same inode (read: more than one instance of aio does not work), but it
shows the basic idea.  Also, bad things probably happen if someone does
an mremap() on the aio ring buffer.  I'll polish this off sometime next
week after the long weekend if noone beats me to it.

-ben



[PATCH 1/1] numa, mm, memory-hotplug: Do not allocate pagetable to local node with MEMORY_HOTREMOVE enabled.

2013-05-16 Thread Tang Chen
The following patch-set allocated pagetables to local node.
https://lkml.org/lkml/2013/4/11/829

Doing this will break memory hot-remove.

Before removing memory, the kernel offlines it. If offlining memory fails,
the memory cannot be removed. The pagetables are used by the kernel, so they
cannot be offlined; furthermore, they cannot be removed.

Of course, we can free pagetable pages, because the pagetables of the memory
to be removed are useless. But offlining memory doesn't mean removing memory.
If users only want to offline memory, the pagetables should not be freed.

The minimum unit of memory online/offline is a block. And by default, one
block contains one section, which by default is 128MB. There is a possibility
that half of a block is pagetable, and the other half is movable memory.

When we offline this kind of block, the status of the block is uncertain. We
cannot simply free the pagetables in this block, because they may be used by
other online blocks. But when doing memory hot-remove, the failure to offline
blocks will break the memory hot-remove logic.


In order to fix it, we have three solutions:

1. Reserve the whole block (128MB), so that no one can use the rest of the
   block, and skip it when offlining memory.
   When all the other blocks are offlined, free the pagetables and remove
   all the memory.

   But we may lose some memory for this purpose. 128MB is a little big
   to waste.


2. Keep this block online. Although the offline operation fails, it is
   OK to remove the memory.

   But the offline operation will always fail. And generally speaking,
   there are a lot of reasons for offlining to fail; it is difficult to
   detect whether it is OK to remove memory. So we don't suggest this way.


3. Migrate user pages and make this block offline. Offlining memory won't
   stop the kernel from using the pagetables stored in it, so it will be OK.

   But this will change the semantics of "offline". I'm not sure if we
   can do it this way.


So before we fix this problem, I think we should not allocate pagetables on
the local node when CONFIG_MEMORY_HOTREMOVE is enabled, and restore that
behavior when we confirm the direction and fix the problem.

This patch is based on
git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git 
for-x86-mm

Any other solution for this problem is welcome.


Signed-off-by: Tang Chen 
---
 arch/x86/mm/init.c |   27 ---
 1 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8d0007a..8cd8a2d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -55,18 +55,23 @@ __ref void *alloc_low_pages(unsigned int num)
 
if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
unsigned long ret;
-   if (local_min_pfn_mapped >= local_max_pfn_mapped) {
+#ifndef CONFIG_MEMORY_HOTPLUG
+   if (local_max_pfn_mapped > local_min_pfn_mapped) {
+   ret = memblock_find_in_range(
+   local_min_pfn_mapped << PAGE_SHIFT,
+   local_max_pfn_mapped << PAGE_SHIFT,
+   PAGE_SIZE * num , PAGE_SIZE);
+   } else
+#endif
+   {
if (low_min_pfn_mapped >= low_max_pfn_mapped)
panic("alloc_low_page: ran out of memory");
ret = memblock_find_in_range(
low_min_pfn_mapped << PAGE_SHIFT,
low_max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
-   } else
-   ret = memblock_find_in_range(
-   local_min_pfn_mapped << PAGE_SHIFT,
-   local_max_pfn_mapped << PAGE_SHIFT,
-   PAGE_SIZE * num , PAGE_SIZE);
+   }
+
if (!ret)
panic("alloc_low_page: can not alloc memory");
memblock_reserve(ret, PAGE_SIZE * num);
@@ -443,6 +448,11 @@ void __init init_mem_mapping(unsigned long begin, unsigned long end)
if (new_mapped_ram_size > mapped_ram_size)
step_size <<= STEP_SIZE_SHIFT;
mapped_ram_size += new_mapped_ram_size;
+
+   if (is_low) {
+   low_min_pfn_mapped = local_min_pfn_mapped;
+   low_max_pfn_mapped = local_max_pfn_mapped;
+   }
}
 
if (real_end < end) {
@@ -450,11 +460,6 @@ void __init init_mem_mapping(unsigned long begin, unsigned long end)
if ((end >> PAGE_SHIFT) > local_max_pfn_mapped)
local_max_pfn_

Re: [PATCH V2 1/2] mm: hotplug: implement non-movable version of get_user_pages() called get_user_pages_non_movable()

2013-05-15 Thread Tang Chen

Hi Mel,

On 05/15/2013 09:24 PM, Mel Gorman wrote:

If it is to be an address space operations structure then you'll need a
pseudo mapping structure for anonymous pages that are pinned by aio --
similar in principle to how swapper_space is used for managing PageSwapCache
or how anon_vma structures can be associated with a page.

However, I warn you that you may find that the address_space is the
wrong level to register such callbacks, it just seemed like the obvious
first choice. A potential alternative implementation is to create a 1:1
association between pages and a long-lived holder that is stored on a hash
table (similar style of arrangement as page_waitqueue).  A page is looked up
in the hash table and if an entry exists, it points to an callback structure
to the subsystem holding the pin. It's up to the subsystem to register the
callbacks when it is about to pin a page (get_user_pages_longlived(,
&release_ops) and figure out how to release the pin safely.



OK, I'll try to figure out a proper place to put the callbacks.
But I think we need to add something new to struct page. I'm just
not sure if it is OK. Maybe we can discuss more about it when I send
a RFC patch.

Thanks for the advices, and I'll try them.

Thanks. :)


Re: [PATCH V2 1/2] mm: hotplug: implement non-movable version of get_user_pages() called get_user_pages_non_movable()

2013-05-15 Thread Tang Chen

Hi Benjamin, Mel,

On 05/15/2013 10:09 AM, Tang Chen wrote:

Hi Benjamin, Mel,

Please see below.

On 05/14/2013 09:58 PM, Benjamin LaHaise wrote:

On Tue, May 14, 2013 at 09:24:58AM +0800, Tang Chen wrote:

Hi Mel, Benjamin, Jeff,

On 05/13/2013 11:01 PM, Benjamin LaHaise wrote:

On Mon, May 13, 2013 at 10:54:03AM -0400, Jeff Moyer wrote:

How do you propose to move the ring pages?


It's the same problem as doing a TLB shootdown: flush the old pages
from
userspace's mapping, copy any existing data to the new pages, then
repopulate the page tables. It will likely require the addition of
address_space_operations for the mapping, but that's not too hard to
do.



I think we add a migrate_unpin() callback to decrease page->count if
necessary, and migrate the page to a new page, and add a migrate_pin()
callback to pin the new page again.


You can't just decrease the page count for this to work. The pages are
pinned because aio_complete() can occur at any time and needs to have a
place to write the completion events. When changing pages, aio has to
take the appropriate lock when changing one page for another.


In aio_complete(),

aio_complete() {
..
spin_lock_irqsave(&ctx->completion_lock, flags);
//write the completion event.
spin_unlock_irqrestore(&ctx->completion_lock, flags);
..
}

So for this problem, I think we can hold kioctx->completion_lock in the aio
callbacks to prevent the aio subsystem from accessing pages that are being
migrated.
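
A rough sketch of that suggestion (aio_migrate_unpin/aio_migrate_pin are
hypothetical callbacks, not existing fs/aio.c functions; the lock is the one
aio_complete() takes above):

static void aio_migrate_unpin(struct page *page, void *data)
{
	struct kioctx *ctx = data;

	/* keep aio_complete() from writing events while the page moves */
	spin_lock_irq(&ctx->completion_lock);
}

static void aio_migrate_pin(struct page *newpage, void *data)
{
	struct kioctx *ctx = data;

	/* hypothetical: ctx->ring_pages would be updated to newpage here */
	spin_unlock_irq(&ctx->completion_lock);
}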



Another problem here is:

We intend to call these callbacks in the page migration path, and we need to
know which lock to hold. But there is no way for the migration path to know
this info.

The migration path is common to all kinds of pages, so we cannot pass any
specific parameter to the callbacks in the migration path.

When we get a page, we cannot get any kioctx info from the page. So how can
the callback know which lock to take without any parameter? Or do we have
any other way to do so?

Would you please give some more advice about this ?

BTW, we also need to update kioctx->ring_pages.

Thanks. :)




The migration procedure will work just as before. We use callbacks to
decrease the page->count before migration starts, and increase it when the
migration is done.

And migrate_pin() and migrate_unpin() callbacks will be added to
struct address_space_operations.


I think the existing migratepage operation in address_space_operations can
be used. Does it get called when hot unplug occurs? That is: is testing
with the migrate_pages syscall similar enough to the memory removal case?



But as I said, anonymous pages such as the aio ring buffer don't have
address_space_operations. So where should we put the callback pointers?

Add something like address_space_operations to struct anon_vma?

Thanks. :)










Re: [PATCH V2 1/2] mm: hotplug: implement non-movable version of get_user_pages() called get_user_pages_non_movable()

2013-05-14 Thread Tang Chen

Hi Benjamin, Mel,

Please see below.

On 05/14/2013 09:58 PM, Benjamin LaHaise wrote:

On Tue, May 14, 2013 at 09:24:58AM +0800, Tang Chen wrote:

Hi Mel, Benjamin, Jeff,

On 05/13/2013 11:01 PM, Benjamin LaHaise wrote:

On Mon, May 13, 2013 at 10:54:03AM -0400, Jeff Moyer wrote:

How do you propose to move the ring pages?


It's the same problem as doing a TLB shootdown: flush the old pages from
userspace's mapping, copy any existing data to the new pages, then
repopulate the page tables.  It will likely require the addition of
address_space_operations for the mapping, but that's not too hard to do.



I think we add a migrate_unpin() callback to decrease page->count if
necessary, and migrate the page to a new page, and add a migrate_pin()
callback to pin the new page again.


You can't just decrease the page count for this to work.  The pages are
pinned because aio_complete() can occur at any time and needs to have a
place to write the completion events.  When changing pages, aio has to
take the appropriate lock when changing one page for another.


In aio_complete(),

aio_complete() {
..
spin_lock_irqsave(&ctx->completion_lock, flags);
//write the completion event.
spin_unlock_irqrestore(&ctx->completion_lock, flags);
..
}

So for this problem, I think we can hold ctx->completion_lock in the aio
callbacks to prevent the aio subsystem from accessing pages that are being
migrated.




The migration procedure will work just as before. We use callbacks to
decrease the page->count before migration starts, and increase it when the
migration is done.

And migrate_pin() and migrate_unpin() callbacks will be added to
struct address_space_operations.


I think the existing migratepage operation in address_space_operations can
be used.  Does it get called when hot unplug occurs?  That is: is testing
with the migrate_pages syscall similar enough to the memory removal case?



But as I said, anonymous pages such as the aio ring buffer don't have
address_space_operations. So where should we put the callback pointers?

Add something like address_space_operations to struct anon_vma?

Thanks. :)








Re: [PATCH v4 00/22] x86, ACPI, numa: Parse numa info early

2013-05-14 Thread Tang Chen

Hi Yinghai,

What do you think of the following problem and solutions?

And can we avoid allocating pagetables on the local node while
MEMORY_HOTREMOVE is enabled for now, and do it again when the problem in the
hot-remove path is fixed?

Thanks. :)

On 05/13/2013 10:59 AM, Tang Chen wrote:

Hi Yinghai,

On 05/10/2013 02:24 AM, Yinghai Lu wrote:

So I suggest separating the job into 2 parts:
1. Push Yinghai's patch1 ~ patch20, without putting pagetables on the local
node. And push my work to use SRAT to arrange ZONE_MOVABLE.
In this case, we can enable memory hotplug in the kernel first.
2. Merge patch21 and patch22 into the fixing work I am doing now, and push
them together when finished.



no, no, no, please do not half-done work.

Do it right, and Do it clean.



I'm not saying I want to do it half-way. Putting pagetables on the local node
will make memory hot-remove unable to work.

Before removing pages, the kernel first offlines them. If the offline logic
fails, hot-remove cannot work. Since your patches put node pagetables on the
local node at boot time, this memory cannot be offlined and, furthermore,
cannot be hot-removed.

The minimum unit of memory online/offline is a block. And by default, one
block contains one section, which by default is 128MB. So if part of a block
is pagetable, and the rest is movable memory, the block cannot be offlined,
and as a result, it cannot be removed.

In order to fix it, we have three solutions:

1. Reserve the whole block (128MB), so that no one can use the rest of the
block, and skip it when offlining memory.
When all the other blocks are offlined, free the pagetables and remove
all the memory.

But we may lose some memory for this purpose. 128MB is a little big
to waste.


2. Migrate movable pages and keep this block online. Although the offline
operation fails, it is OK to remove memory.

But the offline operation will always fail. And generally speaking,
there are a lot of reasons for offlining to fail; it is difficult to
detect whether it is OK to remove memory.


3. Migrate user pages and make this block offline, but the kernel can
still use the pagetable in it.

But this will change the semantics of "offline". I'm not sure if we
can do it in this way.


4. Do not allocate pagetables on the local node when CONFIG_MEMORY_HOTREMOVE
is enabled. (I do suggest not putting pagetables on the local node in the
memory hot-remove situation.)


What do you think about these 4 solutions?

I think I need some advice on this problem from the community. Do you have
any idea how to fix it if we put pagetables on the local node?

The memory hotplug guys do want to use memory hot-remove. And I think for
now, we should use solution 4 above: when CONFIG_MEMORY_HOTREMOVE is enabled,
do not allocate pagetables on the local node.

I'm not trying to do it half-way. When we fix this problem, we can allocate
pagetables on the local node again with CONFIG_MEMORY_HOTREMOVE enabled.

Please do give some advices or feedback.




If you have any thinking of this patch-set, please let me know.


Talked to HPA, and he will put my patchset into tip/x86/mm after
v3.10-rc1.

after that we can work on put pagetable on local node for hotadd path.



The hot-add path is another problem. But I think the hot-remove path is more
urgent now.


Thanks. :)




Re: [PATCH V2 1/2] mm: hotplug: implement non-movable version of get_user_pages() called get_user_pages_non_movable()

2013-05-13 Thread Tang Chen

Hi Mel,

On 05/13/2013 05:19 PM, Mel Gorman wrote:

For the memory hot-remove case, the aio pages are pinned in memory, which
prevents the pages from being offlined and, furthermore, from being removed.

IIUC, you mean we should implement migrate_unpin() and migrate_pin()
callbacks in the aio subsystem, and call them when the hot-remove code
tries to offline pages, right?

If so, I'm wondering where we should put these callback pointers?
In struct page?



No, I would expect the callbacks to be part of the address space operations
which can be found via page->mapping.



Two more problems I don't quite understand:

1. An anonymous page has no address_space, and no address space
   operations. But the aio ring problem happens exactly when dealing with
   anonymous pages. Please refer to:
   (https://lkml.org/lkml/2012/11/29/69)

   If we put the callbacks in page->mapping->a_ops, anonymous pages
   won't be able to use them.

   And we cannot provide a default callback, because the situation we are
   dealing with is a special one.

   So where should the callback for anonymous pages go?


2. How do we find out the reason why page->count != 1 in
   migrate_page_move_mapping()?

   In the problem we are dealing with, get_user_pages() is called to pin
   the pages in memory, and the pages are migratable. So we want to
   decrease page->count.

   But get_user_pages() is not the only reason page->count gets increased.
   How can I know when I should decrease page->count and when I should not?

   The only way I can figure out is to assign the callback pointer in
   get_user_pages(), because it is get_user_pages() that pins the pages.
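
For reference, the check that fails is the refcount comparison in
migrate_page_move_mapping(); roughly paraphrased (not verbatim), it behaves
like this, so any extra pin taken by get_user_pages() makes migration bail
out:

	/* Rough paraphrase of the refcount check in mm/migrate.c:
	 * an anonymous page with no mapping is expected to hold exactly
	 * one reference; an extra get_user_pages() pin breaks this. */
	if (!mapping) {
		if (page_count(page) != 1)
			return -EAGAIN;		/* pinned, cannot migrate */
	}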


Thanks. :)










Re: [PATCH V2 1/2] mm: hotplug: implement non-movable version of get_user_pages() called get_user_pages_non_movable()

2013-05-13 Thread Tang Chen

Hi Mel, Benjamin, Jeff,

On 05/13/2013 11:01 PM, Benjamin LaHaise wrote:

On Mon, May 13, 2013 at 10:54:03AM -0400, Jeff Moyer wrote:

How do you propose to move the ring pages?


It's the same problem as doing a TLB shootdown: flush the old pages from
userspace's mapping, copy any existing data to the new pages, then
repopulate the page tables.  It will likely require the addition of
address_space_operations for the mapping, but that's not too hard to do.



I think we should add a migrate_unpin() callback to decrease page->count if
necessary and migrate the page to a new page, and add a migrate_pin()
callback to pin the new page again.

The migration procedure will work just as before. We use the callbacks to
decrease page->count before the migration starts, and to increase it when
the migration is done.

And migrate_pin() and migrate_unpin() callbacks will be added to
struct address_space_operations.

Is that right?

If so, I'll be working on it.
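
A rough sketch of the proposed interface, purely illustrative (these hooks
do not exist in mainline; the names and signatures are assumptions):

	/* Hypothetical new hooks in struct address_space_operations:
	 *
	 *	int (*migrate_unpin)(struct page *page);
	 *	int (*migrate_pin)(struct page *newpage);
	 *
	 * The migration path would then bracket the existing logic: */
	static int migrate_pinned_page(struct address_space *mapping,
				       struct page *newpage, struct page *page)
	{
		int ret;

		if (mapping->a_ops->migrate_unpin) {
			ret = mapping->a_ops->migrate_unpin(page);
			if (ret)
				return ret;	/* subsystem refused to drop its pin */
		}

		ret = migrate_page(mapping, newpage, page, MIGRATE_SYNC);

		if (!ret && mapping->a_ops->migrate_pin)
			ret = mapping->a_ops->migrate_pin(newpage);

		return ret;
	}

As Mel noted, the unpin hook would also have to act as a barrier against
new pins until the repin is done.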

Thanks. :)


Re: [PATCH V2 1/2] mm: hotplug: implement non-movable version of get_user_pages() called get_user_pages_non_movable()

2013-05-13 Thread Tang Chen

Hi Mel,

On 02/06/2013 05:56 PM, Mel Gorman wrote:


There is the possibility that callbacks could be introduced for
migrate_unpin() and migrate_pin() that take a list of PFN pairs
(old, new). The unpin callback should release the old PFNs and barrier
against any operations until the migrate_pin() callback is called with
the updated pfns to be repinned. Again it would fully depend on subsystems
implementing it properly.

The callback interface would be more robust but puts a lot more work on
the driver side, where your mileage will vary.



I'm very interested in the "callback" way you described.

For the memory hot-remove case, the aio pages are pinned in memory, which
prevents the pages from being offlined and, furthermore, from being removed.

IIUC, you mean we should implement migrate_unpin() and migrate_pin()
callbacks in the aio subsystem, and call them when the hot-remove code tries
to offline pages, right?

If so, I'm wondering where we should put these callback pointers?
In struct page?

It has been a long time since this topic was discussed. But to solve this
problem cleanly for the hotplug guys and the CMA guys, please give some
more comments.


Thanks. :)



To guarantee CMA can migrate pages pinned by drivers I think you need
migrate-related callbacks to unpin, barrier the driver until migration
completes, and repin.

I do not know, or at least have not heard, of anyone working on such a
scheme.




Re: [PATCH v4 00/22] x86, ACPI, numa: Parse numa info early

2013-05-12 Thread Tang Chen

Hi Yinghai,

On 05/10/2013 02:24 AM, Yinghai Lu wrote:

So I suggest to separate the job into 2 parts:
1. Push Yinghai's patch1 ~ patch20, without putting pagetable in local node.
   And push my work to use SRAT to arrange ZONE_MOVABLE.
   In this case, we can enable memory hotplug in the kernel first.
2. Merge patch21 and patch22 into the fixing work I am doing now, and push
   them together when finished.



no, no, no, please do not do half-done work.

Do it right, and do it clean.



I'm not saying I want to do it half-way. Putting pagetables in the local node
will make the memory hot-remove patches unable to work.

Before removing pages, the kernel first offlines them. If the offline logic
fails, hot-remove cannot work. Since your patches put the node's pagetables
in the local node at boot time, this memory cannot be offlined, and
furthermore it cannot be hot-removed.

The minimum unit of memory online/offline is a block. By default, one block
contains one section, which by default is 128MB. So if part of a block holds
pagetables and the rest is movable memory, the block cannot be offlined, and
as a result it cannot be removed.

In order to fix it, we have four solutions:

1. Reserve the whole block (128MB) so that no one can use the rest of the
   block, and skip it when offlining memory.
   When all the other blocks are offlined, free the pagetables and remove
   all the memory.

   But we may lose some memory for this purpose; 128MB is a little too big
   to waste.


2. Migrate the movable pages and keep the block online. Although the offline
   operation fails, it is OK to remove the memory.

   But then the offline operation will always fail. And generally speaking,
   there are many reasons why offlining can fail, so it is difficult to
   detect whether it is actually OK to remove the memory.


3. Migrate the user pages and mark the block offline, but let the kernel
   still use the pagetables in it.

   But this would change the semantics of "offline". I'm not sure we can
   do it this way.


4. Do not allocate pagetables in the local node when CONFIG_MEMORY_HOTREMOVE
   is enabled. (I do suggest not putting pagetables in the local node in the
   memory hot-remove situation.)


What do you think about these four solutions?

I think I need some advice on this problem from the community. Do you have
any idea how to fix this problem if we put pagetables in the local node?

The memory hot-plug guys do want to use memory hot-remove. And I think for
now we should use solution 4 above: when CONFIG_MEMORY_HOTREMOVE is enabled,
do not allocate pagetables in the local node.

I'm not trying to do it half-way. When we fix this problem, we can allocate
pagetables in the local node again with CONFIG_MEMORY_HOTREMOVE enabled.

Please do give some advice or feedback.




If you have any thoughts on this patch-set, please let me know.


Talked to HPA, and he will put my patchset into tip/x86/mm after v3.10-rc1.

After that we can work on putting the pagetable on the local node for the
hot-add path.



The hot-add path is another problem. But I think the hot-remove path is more
urgent now.


Thanks. :)


Re: [PATCH v4 00/22] x86, ACPI, numa: Parse numa info early

2013-05-09 Thread Tang Chen

Hi Yinghai,

On 04/30/2013 03:21 PM, Tang Chen wrote:

So I suggest to separate the job into 2 parts:
1. Push Yinghai's patch1 ~ patch20, without putting pagetable in local node.
And push my work to use SRAT to arrange ZONE_MOVABLE.
In this case, we can enable memory hotplug in the kernel first.
2. Merge patch21 and patch22 into the fixing work I am doing now, and push them
together when finished.



It has been a long time since this mail was sent, and there was no response.
I do think I should move on and push this patch-set. So if you don't mind,
I'll rebase and push the "parse SRAT earlier" part of this patch-set first.

Since putting pagetables in the local node breaks the memory hot-remove
logic for now, I will drop the "put pagetable in local node" parts, and
merge them into the hot-add and hot-remove fix work.

If you have any thoughts on this patch-set, please let me know.

Thanks. :)


Re: [RFC/PATCH 3/5] mm: get_user_pages: use NON-MOVABLE pages when FOLL_DURABLE flag is set

2013-05-07 Thread Tang Chen

Hi Marek,

On 05/07/2013 06:47 PM, Marek Szyprowski wrote:


I don't think that there was any conclusion after my patch, so I really see
no point in submitting it again now. If you need it for your patchset, you
can include it directly. Just please keep my Signed-off-by tag.



That's very kind of you. I'll keep you as the author and keep your
Signed-off-by tag if I use your patches, and I will cc you.

Thanks. :)


Re: [PATCH v2 10/13] x86, acpi, numa, mem-hotplug: Introduce MEMBLK_HOTPLUGGABLE to mark and reserve hotpluggable memory.

2013-05-06 Thread Tang Chen

Hi Vasilis,

On 05/06/2013 06:37 PM, Vasilis Liaskovitis wrote:


you can use qemu-kvm and seabios from these branches:
https://github.com/vliaskov/qemu-kvm/commits/memhp-v4
https://github.com/vliaskov/seabios/commits/memhp-v4

Instructions on how to use the DIMM/memory hotplug are here:

http://lists.gnu.org/archive/html/qemu-devel/2012-12/msg02693.html
(these patchsets are not in mainline qemu/qemu-kvm and seabios)

e.g. the following creates a VM with 2G initial memory on 2 nodes (1GB on each).
There is also an extra 1GB DIMM on each node (the last 3 lines below describe
this):

/opt/qemu/bin/qemu-system-x86_64 -bios /opt/devel/seabios-upstream/out/bios.bin \
-enable-kvm -M pc -smp 4,maxcpus=8 -cpu host -m 2G \
-drive file=/opt/images/debian.img,if=none,id=drive-virtio-disk0,format=raw,cache=none \
-device virtio-blk-pci,bus=pci.0,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1 \
-netdev type=tap,id=guest0,vhost=on -device virtio-net-pci,netdev=guest0 \
-vga std -monitor stdio \
-numa node,mem=1G,cpus=2,nodeid=0 -numa node,mem=0,cpus=2,nodeid=1 \
-device dimm,id=dimm0,size=1G,node=0,bus=membus.0,populated=off \
-device dimm,id=dimm1,size=1G,node=1,bus=membus.0,populated=off

After startup I hotplug dimm0 on node0 (or dimm1 on node1, same result):
(qemu) device_add dimm,id=dimm0,size=1G,node=0,bus=membus.0

Then I reboot the VM. The kernel works without "movablecore=acpi" but panics
with this option.

Note this qemu/seabios does not model initial memory (-m 2G) as memory
devices. Only extra dimms ("-device dimm") are modeled as separate memory
devices.



OK, I'll try it. Thank you for telling me this. :)



Now in the kernel, we can recognize a node (by PXM in SRAT), but we cannot
recognize a memory device. Are you saying that with this entry granularity
we could hotplug a single memory device in a node? (Perhaps there is more
than one memory device in a node.)


yes, this is what I mean. Multiple memory devices on one node are possible
in both a real machine and a VM.
In the VM case, seabios can present different DIMM devices for any number of
nodes. Each DIMM is also given a separate SRAT entry by seabios. So when the
kernel initially parses the entries, it sees multiple ones for the same node.
(these are merged together in numa_cleanup_meminfo though)



If so, it makes sense. But I don't think the kernel is able to recognize
which device a memory range belongs to now. And I'm not sure we can do this.


The kernel knows which memory ranges belong to each DIMM (with ACPI enabled,
each DIMM is represented by an ACPI memory device, see
drivers/acpi/acpi_memhotplug.c).



Oh, I'll check acpi_memhotplug.c and see what we can do.

And BTW, as Yinghai suggested, we'd better put pagetables in the local node.
But the best way is to put the pagetables in the local memory device, I
think. Otherwise, we are not able to hot-remove a memory device.

Thanks. :)






Re: [PATCH v4 00/22] x86, ACPI, numa: Parse numa info early

2013-05-06 Thread Tang Chen

Hi all,

Could anyone give some suggestions on this patch-set?

Thanks.

On 04/30/2013 03:21 PM, Tang Chen wrote:

Hi Yinghai, all,

I've tested this patch-set with my following patch-set:
[PATCH v1 00/12] Arrange hotpluggable memory in SRAT as ZONE_MOVABLE.
https://lkml.org/lkml/2013/4/19/94

Using ACPI table override, I overrode SRAT on my box like this:

[ 0.00] SRAT: Node 0 PXM 0 [mem 0x-0x7fff]
[ 0.00] SRAT: Node 0 PXM 0 [mem 0x1-0x307ff]
[ 0.00] SRAT: Node 1 PXM 2 [mem 0x30800-0x583ff] Hot Pluggable
[ 0.00] SRAT: Node 2 PXM 3 [mem 0x58400-0x7] Hot Pluggable

We had 3 nodes, node0 was not hotpluggable, and node1 and node2 were
hotpluggable.


And memblock reserved pagetable pages (with flag 0x1) in local nodes.
..
[ 0.00] reserved[0xb] [0x0307ff-0x0307ff1fff], 0x2000
bytes flags: 0x0
[ 0.00] reserved[0xc] [0x0307ff2000-0x0307ff], 0xe000
bytes on node 0 flags: 0x1
[ 0.00] reserved[0xd] [0x0583ff7000-0x0583ff], 0x9000
bytes on node 1 flags: 0x1
[ 0.00] reserved[0xe] [0x079000-0x07], 0x7000
bytes on node 2 flags: 0x1

And after some bug fixes, memblock can also reserve hotpluggable memory
with flag 0x2.
..
[ 0.00] reserved[0xb] [0x0307ff-0x0307ff1fff], 0x2000
bytes flags: 0x0
[ 0.00] reserved[0xc] [0x0307ff2000-0x0307ff], 0xe000
bytes on node 0 flags: 0x1
[ 0.00] reserved[0xd] [0x030800-0x0583ff6fff],
0x27bff7000 bytes on node 1 flags: 0x2
[ 0.00] reserved[0xe] [0x0583ff7000-0x0583ff], 0x9000
bytes on node 1 flags: 0x1
[ 0.00] reserved[0xf] [0x058400-0x077fff],
0x27bff8000 bytes on node 2 flags: 0x2
[ 0.00] reserved[0x10] [0x078000-0x07], 0x8000
bytes on node 2 flags: 0x1

And it is freed to the buddy system when memory initialization is finished.


So the results:
1. We can parse SRAT earlier correctly.
2. We can override tables correctly.
3. We can put pagetable pages in local node.
4. We can prevent memblock from allocating hotpluggable memory.
5. We can arrange ZONE_MOVABLE using SRAT info.


Known problems:

When we put pagetable pages in the local node, the memory hot-remove logic
won't work. I'm fixing it now. We need to fix the following:
1. Improve hot-remove to support freeing local node pagetable pages.
2. Improve hot-add to support putting hot-added pagetable pages in the
   local node.
3. Do the same for vmemmap and page_cgroup pages.

So I suggest to separate the job into 2 parts:
1. Push Yinghai's patch1 ~ patch20, without putting pagetable in local node.
   And push my work to use SRAT to arrange ZONE_MOVABLE.
   In this case, we can enable memory hotplug in the kernel first.
2. Merge patch21 and patch22 into the fixing work I am doing now, and push
   them together when finished.

What do you think?

Reviewed-by: Tang Chen 
Tested-by: Tang Chen 

Thanks. :)








Re: [RFC/PATCH 3/5] mm: get_user_pages: use NON-MOVABLE pages when FOLL_DURABLE flag is set

2013-05-06 Thread Tang Chen

Hi Marek,

It has been a long time since this patch-set was sent.
And I'm pushing the memory hot-remove work now. I think I need your
[patch 3/5] to fix a problem I met.

We sent a similar patch before, but I think yours may be better. :)
https://lkml.org/lkml/2013/2/21/126

So would you please update and resend your patch?
Or do you have your own plan for pushing it?

Thanks. :)

On 03/05/2013 02:57 PM, Marek Szyprowski wrote:

Ensure that newly allocated pages, which are faulted in in FOLL_DURABLE
mode, come from non-movable pageblocks, to work around migration failures
with the Contiguous Memory Allocator.

Signed-off-by: Marek Szyprowski
Signed-off-by: Kyungmin Park
---
  include/linux/highmem.h |   12 ++--
  include/linux/mm.h  |2 ++
  mm/memory.c |   24 ++--
  3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 7fb31da..cf0b9d8 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -168,7 +168,8 @@ __alloc_zeroed_user_highpage(gfp_t movableflags,
  #endif

  /**
- * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move
+ * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for
+ * a VMA that the caller knows can move
   * @vma: The VMA the page is to be allocated for
   * @vaddr: The virtual address the page will be inserted into
   *
@@ -177,11 +178,18 @@ __alloc_zeroed_user_highpage(gfp_t movableflags,
   */
  static inline struct page *
  alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
-   unsigned long vaddr)
+  unsigned long vaddr)
  {
return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr);
  }

+static inline struct page *
+alloc_zeroed_user_highpage(gfp_t gfp, struct vm_area_struct *vma,
+  unsigned long vaddr)
+{
+   return __alloc_zeroed_user_highpage(gfp, vma, vaddr);
+}
+
  static inline void clear_highpage(struct page *page)
  {
void *kaddr = kmap_atomic(page);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9806e54..c11f58f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -165,6 +165,7 @@ extern pgprot_t protection_map[16];
  #define FAULT_FLAG_RETRY_NOWAIT   0x10    /* Don't drop mmap_sem and wait when retrying */
  #define FAULT_FLAG_KILLABLE   0x20    /* The fault task is in SIGKILL killable region */
  #define FAULT_FLAG_TRIED  0x40/* second try */
+#define FAULT_FLAG_NO_CMA  0x80/* don't use CMA pages */

  /*
   * vm_fault is filled by the the pagefault handler and passed to the vma's
@@ -1633,6 +1634,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
  #define FOLL_HWPOISON 0x100   /* check page is hwpoisoned */
  #define FOLL_NUMA 0x200   /* force NUMA hinting page fault */
  #define FOLL_MIGRATION    0x400   /* wait for page to replace migration entry */
+#define FOLL_DURABLE   0x800   /* get the page reference for a long time */

  typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
void *data);
diff --git a/mm/memory.c b/mm/memory.c
index 42dfd8e..2b9c2dd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1816,6 +1816,9 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
int ret;
unsigned int fault_flags = 0;

+   if (gup_flags & FOLL_DURABLE)
+   fault_flags = FAULT_FLAG_NO_CMA;
+
/* For mlock, just skip the stack guard page. */
	if (foll_flags & FOLL_MLOCK) {
if (stack_guard_page(vma, start))
@@ -2495,7 +2498,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
   */
  static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
-   spinlock_t *ptl, pte_t orig_pte)
+   spinlock_t *ptl, pte_t orig_pte, unsigned int flags)
__releases(ptl)
  {
struct page *old_page, *new_page = NULL;
@@ -2505,6 +2508,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *dirty_page = NULL;
unsigned long mmun_start = 0;   /* For mmu_notifiers */
unsigned long mmun_end = 0; /* For mmu_notifiers */
+   gfp_t gfp = GFP_HIGHUSER_MOVABLE;
+
+   if (IS_ENABLED(CONFIG_CMA) && (flags & FAULT_FLAG_NO_CMA))
+   gfp &= ~__GFP_MOVABLE;

old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page) {
@@ -2668,11 +2675,11 @@ gotten:
goto oom;

if (is_zero_pfn(pte_pfn(orig_pte))) {
-   new_page 

Re: [PATCH v2 10/13] x86, acpi, numa, mem-hotplug: Introduce MEMBLK_HOTPLUGGABLE to mark and reserve hotpluggable memory.

2013-05-05 Thread Tang Chen

Hi Vasilis,

Sorry for the delay and thank you for reviewing and testing. :)

On 05/03/2013 06:50 PM, Vasilis Liaskovitis wrote:


Should we skip ranges on nodes that the kernel uses? e.g. with

 if (memblock_is_kernel_node(nid))
 continue;


Yes. I think I forgot to call it in this patch.
Will update in the next version.




- I am getting a "PANIC: early exception" when rebooting with movablecore=acpi
after hotplugging memory on node0 or node1 of a 2-node VM. The guest kernel is
based on
git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git
for-x86-mm (e9058baf) + these v2 patches.

This happens with or without the above memblock_is_kernel_node(nid) check.
Perhaps I am missing something or I need a newer "ACPI, numa: Parse numa info
early" patch-set?


I didn't test it on a VM. But on my real box, I haven't got a panic when
rebooting. I think I can help to test it in a VM, but would you please
tell me how to set up an environment like yours?



A general question: Disabling hot-pluggability/zone-movable eligibility for a
whole node sounds a bit inflexible, if the machine only has one node to begin
with.  Would it be possible to keep movable information per SRAT entry? I.e.
if the BIOS presents multiple SRAT entries for one node/PXM (say node 0), and
there is no memblock/kernel allocation on one of these SRAT entries, could
we still mark this SRAT entry's range as hot-pluggable/movable?  Not sure if
many real machine BIOSes would do this, but seabios could.  This implies that
SRAT entries are processed for movable-zone eligibility at entry granularity,
before they are merged on a per-node/PXM basis (I think numa_cleanup_meminfo
currently does this merge).


Yes, this can be done. But in real usage, making only part of the memory in
a node hot-removable makes no sense, I think. We could not remove the whole
node, so we could not remove the real hardware device.

But for virtualization, would you please give a reason why we need this
entry granularity?


Another thought, in case I didn't understand your question correctly. :)

Now in the kernel, we can recognize a node (by PXM in SRAT), but we cannot
recognize a memory device. Are you saying that with this entry granularity
we could hotplug a single memory device in a node? (Perhaps there is more
than one memory device in a node.)

If so, it makes sense. But I don't think the kernel is able to recognize
which device a memory range belongs to now. And I'm not sure we can do this.



Of course the kernel should still have enough memory(i.e. non movable zone) to
boot. Can we ensure that at least certain amount of memory is non-movable, and
then, given more separate SRAT entries for node0 not used by kernel, treat
these rest entries as movable?


I tried this idea before. But as HPA said, it seems there is no way to
calculate how much memory the kernel needs.
https://lkml.org/lkml/2012/11/27/29


Thanks. :)



[PATCH v2 01/13] x86: get pg_data_t's memory from other node

2013-04-30 Thread Tang Chen
From: Yasuaki Ishimatsu 

If the system can create a movable node, in which all memory of the node is
allocated as ZONE_MOVABLE, setup_node_data() cannot allocate memory for the
node's pg_data_t from that node.
So use memblock_alloc_try_nid() instead of memblock_alloc_nid(), which
falls back to other nodes when the node-local allocation fails.
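
In effect (a sketch of the semantics relied on here, not the actual
implementation), the call behaves like:

	/* Try the requested node first, then fall back to any node. */
	nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa)
		nd_pa = memblock_alloc(nd_size, SMP_CACHE_BYTES);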

Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Lai Jiangshan 
Signed-off-by: Tang Chen 
Signed-off-by: Jiang Liu 
---
 arch/x86/mm/numa.c |5 ++---
 1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 11acdf6..4f754e6 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -214,10 +214,9 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 * Allocate node data.  Try node-local memory and then any node.
 * Never allocate in DMA zone.
 */
-   nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+   nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
if (!nd_pa) {
-   pr_err("Cannot find %zu bytes in node %d\n",
-  nd_size, nid);
+   pr_err("Cannot find %zu bytes in any node\n", nd_size);
return;
}
nd = __va(nd_pa);
-- 
1.7.1



[PATCH v2 02/13] acpi: Print Hot-Pluggable Field in SRAT.

2013-04-30 Thread Tang Chen
The Hot-Pluggable field in SRAT indicates whether the memory could be
hotplugged while the system is running. Printing it when parsing SRAT
will help users know which memory is hotpluggable.

Signed-off-by: Tang Chen 
---
 arch/x86/mm/srat.c |9 ++---
 1 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 443f9ef..5055fa7 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -146,6 +146,7 @@ int __init
 acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 {
u64 start, end;
+   u32 hotpluggable;
int node, pxm;
 
if (srat_disabled())
@@ -154,7 +155,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
goto out_err_bad_srat;
if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
goto out_err;
-   if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
+   hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
+   if (hotpluggable && !save_add_info())
goto out_err;
 
start = ma->base_address;
@@ -174,9 +176,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 
node_set(node, numa_nodes_parsed);
 
-   printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
+   printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n",
   node, pxm,
-  (unsigned long long) start, (unsigned long long) end - 1);
+  (unsigned long long) start, (unsigned long long) end - 1,
+  hotpluggable ? "Hot Pluggable" : "");
 
return 0;
 out_err_bad_srat:
-- 
1.7.1



[PATCH v2 03/13] page_alloc, mem-hotplug: Improve movablecore to {en|dis}able using SRAT.

2013-04-30 Thread Tang Chen
The Hot-Pluggable Field in SRAT specifies which memory ranges are
hotpluggable. We will arrange hotpluggable memory as ZONE_MOVABLE for users
who want to use the memory hotplug functionality. But this will decrease
NUMA performance because the kernel cannot use ZONE_MOVABLE.

So we improve the movablecore boot option to allow those who want to use the
memory hotplug functionality to enable using SRAT info to arrange movable
memory.

Users can specify "movablecore=acpi" on the kernel command line to enable
this functionality.

For those who don't use memory hotplug or who don't want to lose their NUMA
performance, just don't specify anything. The kernel will work as before.
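
For example, booting with this option looks like the following (an
illustrative boot line; the image path and other parameters are
placeholders):

	linux /boot/vmlinuz-3.10 root=/dev/sda1 ro movablecore=acpi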

Suggested-by: Kamezawa Hiroyuki 
Signed-off-by: Tang Chen 
---
 include/linux/memory_hotplug.h |3 +++
 mm/page_alloc.c|   13 +
 2 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index b6a3be7..18fe2a3 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -33,6 +33,9 @@ enum {
ONLINE_MOVABLE,
 };
 
+/* Enable/disable SRAT in movablecore boot option */
+extern bool movablecore_enable_srat;
+
 /*
  * pgdat resizing functions
  */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f368db4..b9ea143 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -208,6 +208,8 @@ static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
 
+bool __initdata movablecore_enable_srat = false;
+
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
 EXPORT_SYMBOL(movable_zone);
@@ -5025,6 +5027,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
}
 }
 
+static void __init cmdline_movablecore_srat(char *p)
+{
+   if (p && !strcmp(p, "acpi"))
+   movablecore_enable_srat = true;
+}
+
 static int __init cmdline_parse_core(char *p, unsigned long *core)
 {
unsigned long long coremem;
@@ -5055,6 +5063,11 @@ static int __init cmdline_parse_kernelcore(char *p)
  */
 static int __init cmdline_parse_movablecore(char *p)
 {
+   cmdline_movablecore_srat(p);
+
+   if (movablecore_enable_srat)
+   return 0;
+
return cmdline_parse_core(p, &required_movablecore);
 }
 
-- 
1.7.1



[PATCH v2 04/13] x86, numa, acpi, memory-hotplug: Introduce hotplug info into struct numa_meminfo.

2013-04-30 Thread Tang Chen
Since Yinghai has implemented "Allocate pagetable pages in local node", for a
node with hotpluggable memory we have to allocate pagetable pages first, and
then reserve the rest as hotpluggable memory in memblock.

But the kernel parses SRAT first, and then initializes the memory mapping.
So we have to remember which memory ranges are hotpluggable for future
usage.

When parsing SRAT, we add each memory range to numa_meminfo. So we can store
the hotplug info in numa_meminfo.

This patch introduces a "bool hotpluggable" member into struct
numa_meminfo.

And modifies the following APIs' prototypes to support it:
   - numa_add_memblk()
   - numa_add_memblk_to()

And the following callers:
   - numaq_register_node()
   - dummy_numa_init()
   - amd_numa_init()
   - acpi_numa_memory_affinity_init() in x86

Signed-off-by: Tang Chen 
---
 arch/x86/include/asm/numa.h |3 ++-
 arch/x86/kernel/apic/numaq_32.c |2 +-
 arch/x86/mm/amdtopology.c   |3 ++-
 arch/x86/mm/numa.c  |   10 +++---
 arch/x86/mm/numa_internal.h |1 +
 arch/x86/mm/srat.c  |2 +-
 6 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 1b99ee5..73096b2 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -31,7 +31,8 @@ extern int numa_off;
 extern s16 __apicid_to_node[MAX_LOCAL_APIC];
 extern nodemask_t numa_nodes_parsed __initdata;
 
-extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
+extern int __init numa_add_memblk(int nodeid, u64 start, u64 end,
+ bool hotpluggable);
 extern void __init numa_set_distance(int from, int to, int distance);
 
 static inline void set_apicid_to_node(int apicid, s16 node)
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index d661ee9..7a9c542 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -82,7 +82,7 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
int ret;
 
node_set(node, numa_nodes_parsed);
-   ret = numa_add_memblk(node, start, end);
+   ret = numa_add_memblk(node, start, end, false);
BUG_ON(ret < 0);
 }
 
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 5247d01..d521471 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -167,7 +167,8 @@ int __init amd_numa_init(void)
nodeid, base, limit);
 
prevbase = base;
-   numa_add_memblk(nodeid, base, limit);
+   /* Do not support memory hotplug for AMD cpu. */
+   numa_add_memblk(nodeid, base, limit, false);
node_set(nodeid, numa_nodes_parsed);
}
 
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4f754e6..ecf37fd 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -134,6 +134,7 @@ void __init setup_node_to_cpumask_map(void)
 }
 
 static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+bool hotpluggable,
 struct numa_meminfo *mi)
 {
/* ignore zero length blks */
@@ -155,6 +156,7 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
mi->blk[mi->nr_blks].start = start;
mi->blk[mi->nr_blks].end = end;
mi->blk[mi->nr_blks].nid = nid;
+   mi->blk[mi->nr_blks].hotpluggable = hotpluggable;
mi->nr_blks++;
return 0;
 }
@@ -179,15 +181,17 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
  * @nid: NUMA node ID of the new memblk
  * @start: Start address of the new memblk
  * @end: End address of the new memblk
+ * @hotpluggable: True if memblk is hotpluggable
  *
  * Add a new memblk to the default numa_meminfo.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int __init numa_add_memblk(int nid, u64 start, u64 end)
+int __init numa_add_memblk(int nid, u64 start, u64 end,
+  bool hotpluggable)
 {
-   return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+   return numa_add_memblk_to(nid, start, end, hotpluggable, &numa_meminfo);
 }
 
 /* Initialize NODE_DATA for a node on the local memory */
@@ -631,7 +635,7 @@ static int __init dummy_numa_init(void)
   0LLU, PFN_PHYS(max_pfn) - 1);
 
node_set(0, numa_nodes_parsed);
-   numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
+   numa_add_memblk(0, 0, PFN_PHYS(max_pfn), false);
 
return 0;
 }
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index bb2fbcc..1ce4e6b 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -8,6 +8,7 @@ struct numa_memblk {
u64 start;
u64 end;
int nid;
+   bool    hotpluggable;

[PATCH v2 06/13] memblock, numa: Introduce flag into memblock.

2013-04-30 Thread Tang Chen
There is no flag in memblock to describe what type the memory is.
Sometimes, we may use memblock to reserve some memory for special usage.
For example, as Yinghai did in his patch, allocate pagetables on the local
node before all the memory on the node is mapped.
Please refer to Yinghai's patch:
v1: https://lkml.org/lkml/2013/3/7/642
v2: https://lkml.org/lkml/2013/3/10/47
v3: https://lkml.org/lkml/2013/4/4/639
v4: https://lkml.org/lkml/2013/4/11/829

In a hotplug environment, there could be some problems when we hot-remove
memory if we do so. Pagetable pages are kernel memory, which we cannot
migrate. But we can put them in the local node because their life cycle is
the same as the node's. So we need to free them all before hot-removing
the memory.

Actually, any data whose life cycle is the same as a node's, such as
pagetable pages, vmemmap pages, and page_cgroup pages, could be put in the
local node. They can all be freed when we hot-remove a whole node.

In order to do so, we need to mark out these special pages in memblock.
In this patch, we introduce a new "flags" member into memblock_region:
   struct memblock_region {
   phys_addr_t base;
   phys_addr_t size;
   unsigned long flags;
   #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
   int nid;
   #endif
   };

This patch does the following things:
1) Add "flags" member to memblock_region, and MEMBLK_ANY flag for common usage.
2) Modify the following APIs' prototype:
memblock_add_region()
memblock_insert_region()
3) Add memblock_reserve_region() to support reserve memory with flags, and keep
   memblock_reserve()'s prototype unmodified.
4) Modify other APIs to support flags, but keep their prototype unmodified.
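
The flags are used as a bitmask of the enum values. As a small illustration
of the convention the later patches in this series follow (a hypothetical
helper, not part of the patch):

	/* Each memblock_flags value is a bit position in region->flags. */
	static inline bool memblk_test_flag(struct memblock_region *r,
					    enum memblock_flags f)
	{
		return r->flags & (1UL << f);
	}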

The idea is from Wen Congyang  and Liu Jiang 
.

Suggested-by: Wen Congyang 
Suggested-by: Liu Jiang 
Signed-off-by: Tang Chen 
---
 include/linux/memblock.h |8 ++
 mm/memblock.c|   56 +
 2 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f388203..c63a66e 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -19,9 +19,17 @@
 
 #define INIT_MEMBLOCK_REGIONS  128
 
+#define MEMBLK_FLAGS_DEFAULT   0
+
+/* Definition of memblock flags. */
+enum memblock_flags {
+   __NR_MEMBLK_FLAGS,  /* number of flags */
+};
+
 struct memblock_region {
phys_addr_t base;
phys_addr_t size;
+   unsigned long flags;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int nid;
 #endif
diff --git a/mm/memblock.c b/mm/memblock.c
index 16eda3d..63924ae 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -157,6 +157,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
type->cnt = 1;
type->regions[0].base = 0;
type->regions[0].size = 0;
+   type->regions[0].flags = 0;
memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
}
 }
@@ -307,7 +308,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
 
if (this->base + this->size != next->base ||
memblock_get_region_node(this) !=
-   memblock_get_region_node(next)) {
+   memblock_get_region_node(next) ||
+   this->flags != next->flags) {
BUG_ON(this->base + this->size > next->base);
i++;
continue;
@@ -327,13 +329,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
  * @base:  base address of the new region
  * @size:  size of the new region
  * @nid:   node id of the new region
+ * @flags: flags of the new region
  *
  * Insert new memblock region [@base,@base+@size) into @type at @idx.
  * @type must already have extra room to accomodate the new region.
  */
 static void __init_memblock memblock_insert_region(struct memblock_type *type,
   int idx, phys_addr_t base,
-  phys_addr_t size, int nid)
+  phys_addr_t size,
+  int nid, unsigned long flags)
 {
struct memblock_region *rgn = &type->regions[idx];
 
@@ -341,6 +345,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
rgn->base = base;
rgn->size = size;
+   rgn->flags = flags;
memblock_set_region_node(rgn, nid);
type->cnt++;
type->total_size += size;
@@ -352,6 +357,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
  * @base: base address of the new region
  * @size: 

[PATCH v2 13/13] doc, page_alloc, acpi, mem-hotplug: Add doc for movablecore=acpi boot option.

2013-04-30 Thread Tang Chen
Since we modified the movablecore boot option to support "movablecore=acpi",
this patch adds documentation for it.

Signed-off-by: Tang Chen 
---
 Documentation/kernel-parameters.txt |8 
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4609e81..a1c515b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1649,6 +1649,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
that the amount of memory usable for all allocations
is not too small.
 
+   movablecore=acpi[KNL,X86] This parameter will enable the
+   kernel to arrange ZONE_MOVABLE with the help of
+   Hot-Pluggable Field in SRAT. All the hotpluggable
+   memory will be arranged in ZONE_MOVABLE.
+   NOTE: Any node which the kernel resides in will
+ always be un-hotpluggable so that the kernel
+ will always have enough memory to boot.
+
MTD_Partition=  [MTD]
Format: ,,,
 
-- 
1.7.1



[PATCH v2 07/13] x86, numa, mem-hotplug: Mark nodes which the kernel resides in.

2013-04-30 Thread Tang Chen
If all the memory ranges in SRAT are hotpluggable, we should not arrange
them all in ZONE_MOVABLE. Otherwise the kernel won't have enough memory
to boot.

This patch introduces a global variable, memblock_kernel_nodemask, to mark
all the nodes the kernel resides in. And no matter whether they are
hotpluggable, we arrange them as un-hotpluggable.

Signed-off-by: Tang Chen 
---
 arch/x86/mm/numa.c   |6 ++
 include/linux/memblock.h |1 +
 mm/memblock.c|   20 
 3 files changed, 27 insertions(+), 0 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 26d1800..105b092 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -658,6 +658,12 @@ static bool srat_used __initdata;
  */
 static void __init early_x86_numa_init(void)
 {
+   /*
+* Need to find out which nodes the kernel resides in, and arrange
+* them as un-hotpluggable when parsing SRAT.
+*/
+   memblock_mark_kernel_nodes();
+
if (!numa_off) {
 #ifdef CONFIG_X86_NUMAQ
if (!numa_init(numaq_numa_init))
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index c63a66e..5064eed 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -66,6 +66,7 @@ int memblock_remove(phys_addr_t base, phys_addr_t size);
 int memblock_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
 void memblock_trim_memory(phys_addr_t align);
+void memblock_mark_kernel_nodes(void);
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
diff --git a/mm/memblock.c b/mm/memblock.c
index 63924ae..1b93a5d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -35,6 +35,9 @@ struct memblock memblock __initdata_memblock = {
.current_limit  = MEMBLOCK_ALLOC_ANYWHERE,
 };
 
+/* Mark which nodes the kernel resides in. */
+static nodemask_t memblock_kernel_nodemask __initdata_memblock;
+
 int memblock_debug __initdata_memblock;
 static int memblock_can_resize __initdata_memblock;
 static int memblock_memory_in_slab __initdata_memblock = 0;
@@ -787,6 +790,23 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
memblock_merge_regions(type);
return 0;
 }
+
+void __init_memblock memblock_mark_kernel_nodes()
+{
+   int i, nid;
+   struct memblock_type *reserved = &memblock.reserved;
+
+   for (i = 0; i < reserved->cnt; i++)
+   if (reserved->regions[i].flags == MEMBLK_FLAGS_DEFAULT) {
+   nid = memblock_get_region_node(&reserved->regions[i]);
+   node_set(nid, memblock_kernel_nodemask);
+   }
+}
+#else
+void __init_memblock memblock_mark_kernel_nodes()
+{
+   node_set(0, memblock_kernel_nodemask);
+}
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
-- 
1.7.1



[PATCH v2 00/13] Arrange hotpluggable memory in SRAT as ZONE_MOVABLE.

2013-04-30 Thread Tang Chen
In a memory hotplug situation, the hotpluggable memory should be
arranged in ZONE_MOVABLE, because memory in ZONE_NORMAL may be
used by the kernel, and Linux cannot migrate pages used by the kernel.

So we need a way to specify hotpluggable memory as movable. It
should be as easy as possible.

According to the ACPI spec 5.0, the SRAT table has a memory affinity
structure, and the structure has a Hot-Pluggable Field.
See "5.2.16.2 Memory Affinity Structure".

If we use this information, we can let the firmware specify hotpluggable
memory. For example, if the Hot-Pluggable Field is enabled, the kernel
sets the memory as movable memory.

To achieve this goal, we need to do the following:
1. Prevent memblock from allocating hotpluggable memory for the kernel.
   This is done by reserving hotpluggable memory in memblock in the
   following steps:
   1) Parse SRAT early enough so that memblock knows which memory
      is hotpluggable.
   2) Add a "flags" member to memblock so that it is able to tell
      which memory is hotpluggable when freeing it to the buddy system.

2. Free hotpluggable memory to the buddy system when memory initialization
   is done.

3. Arrange hotpluggable memory in ZONE_MOVABLE.
   (This will decrease NUMA performance.)

4. Provide a user interface to enable/disable this functionality.
   (This is useful for those who don't use memory hotplug and who don't
   want to lose their NUMA performance.)


This patch-set does the following:
patch1:        Fix a little problem.
patch2:        Have the Hot-Pluggable Field in SRAT printed when parsing SRAT.
patch4,5:      Introduce hotpluggable field to numa_meminfo.
patch6,7:      Introduce flags to memblock, and keep the public APIs' prototypes
               unmodified.
patch8,9:      Reserve node-life-cycle memory as MEMBLK_LOCAL_NODE with memblock.
patch10,11:    Reserve hotpluggable memory as MEMBLK_HOTPLUGGABLE with memblock,
               and free it to buddy when memory initialization is done.
patch3,12,13:  Improve "movablecore" boot option to support "movablecore=acpi".


Change log:
1. Fix a bug in patch10: forgot to update start and end value.
2. Add new patch8: make alloc_low_pages be able to call
   memory_add_physaddr_to_nid().


This patch-set is based on Yinghai's
"x86, ACPI, numa: Parse numa info early" patch-set.
Please refer to:
v1: https://lkml.org/lkml/2013/3/7/642
v2: https://lkml.org/lkml/2013/3/10/47
v3: https://lkml.org/lkml/2013/4/4/639
v4: https://lkml.org/lkml/2013/4/11/829

And Yinghai's patch did the following things:
1) Parse SRAT early enough.
2) Allocate pagetable pages in the local node.


Tang Chen (12):
  acpi: Print Hot-Pluggable Field in SRAT.
  page_alloc, mem-hotplug: Improve movablecore to {en|dis}able using
SRAT.
  x86, numa, acpi, memory-hotplug: Introduce hotplug info into struct
numa_meminfo.
  x86, numa, acpi, memory-hotplug: Consider hotplug info when cleanup
numa_meminfo.
  memblock, numa: Introduce flag into memblock.
  x86, numa, mem-hotplug: Mark nodes which the kernel resides in.
  x86, numa: Move memory_add_physaddr_to_nid() to CONFIG_NUMA.
  x86, numa, memblock: Introduce MEMBLK_LOCAL_NODE to mark and reserve
node-life-cycle data.
  x86, acpi, numa, mem-hotplug: Introduce MEMBLK_HOTPLUGGABLE to mark
and reserve hotpluggable memory.
  x86, memblock, mem-hotplug: Free hotpluggable memory reserved by
memblock.
  x86, numa, acpi, memory-hotplug: Make movablecore=acpi have higher
priority.
  doc, page_alloc, acpi, mem-hotplug: Add doc for movablecore=acpi boot
option.

Yasuaki Ishimatsu (1):
  x86: get pg_data_t's memory from other node

 Documentation/kernel-parameters.txt |8 ++
 arch/x86/include/asm/numa.h |3 +-
 arch/x86/kernel/apic/numaq_32.c |2 +-
 arch/x86/mm/amdtopology.c   |3 +-
 arch/x86/mm/init.c  |   16 +++-
 arch/x86/mm/numa.c  |   64 +++---
 arch/x86/mm/numa_internal.h |1 +
 arch/x86/mm/srat.c  |   11 ++-
 include/linux/memblock.h|   16 +
 include/linux/memory_hotplug.h  |3 +
 mm/memblock.c   |  127 ++
 mm/nobootmem.c  |3 +
 mm/page_alloc.c |   37 ++-
 13 files changed, 256 insertions(+), 38 deletions(-)



[PATCH v2 11/13] x86, memblock, mem-hotplug: Free hotpluggable memory reserved by memblock.

2013-04-30 Thread Tang Chen
We reserved hotpluggable memory in memblock, and when memory initialization
is done, we have to free it to the buddy system.

This patch frees the memory reserved by memblock with the flag
MEMBLK_HOTPLUGGABLE.

Signed-off-by: Tang Chen 
---
 include/linux/memblock.h |1 +
 mm/memblock.c|   20 
 mm/nobootmem.c   |3 +++
 3 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 0f01930..08c761d 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -69,6 +69,7 @@ int memblock_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
 int memblock_reserve_local_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_reserve_hotpluggable(phys_addr_t base, phys_addr_t size, int nid);
+void memblock_free_hotpluggable(void);
 void memblock_trim_memory(phys_addr_t align);
 void memblock_mark_kernel_nodes(void);
 bool memblock_is_kernel_node(int nid);
diff --git a/mm/memblock.c b/mm/memblock.c
index 0c55588..54de398 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -568,6 +568,26 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
return __memblock_remove(&memblock.reserved, base, size);
 }
 
+static void __init_memblock memblock_free_flags(unsigned long flags)
+{
+   int i;
+   struct memblock_type *reserved = &memblock.reserved;
+
+   for (i = reserved->cnt - 1; i >= 0; i--) {
+   if (reserved->regions[i].flags == flags)
+   memblock_remove_region(reserved, i);
+   }
+}
+
+void __init_memblock memblock_free_hotpluggable()
+{
+   unsigned long flags = 1 << MEMBLK_HOTPLUGGABLE;
+
+   memblock_dbg("memblock: free all hotpluggable memory");
+
+   memblock_free_flags(flags);
+}
+
 static int __init_memblock memblock_reserve_region(phys_addr_t base,
   phys_addr_t size,
   int nid,
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 5e07d36..cd85604 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -165,6 +165,9 @@ unsigned long __init free_all_bootmem(void)
for_each_online_pgdat(pgdat)
reset_node_lowmem_managed_pages(pgdat);
 
+   /* Hotpluggable memory reserved by memblock should also be freed. */
+   memblock_free_hotpluggable();
+
/*
 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
 *  because in some case like Node0 doesn't have RAM installed
-- 
1.7.1



[PATCH v2 09/13] x86, numa, memblock: Introduce MEMBLK_LOCAL_NODE to mark and reserve node-life-cycle data.

2013-04-30 Thread Tang Chen
Node-life-cycle data (data whose life cycle is the same as the node's)
allocated by memblock should be marked, so that when we free usable
memory to the buddy system we can skip it.

This patch introduces a flag MEMBLK_LOCAL_NODE for memblock to reserve
node-life-cycle data. For now, it is only kernel direct mapping pagetable
pages, based on Yinghai's patch.

Signed-off-by: Tang Chen 
---
 arch/x86/mm/init.c   |   16 
 include/linux/memblock.h |2 ++
 mm/memblock.c|7 +++
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8d0007a..002d487 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -62,14 +62,22 @@ __ref void *alloc_low_pages(unsigned int num)
low_min_pfn_mapped << PAGE_SHIFT,
low_max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
-   } else
+   if (!ret)
+   panic("alloc_low_page: can not alloc memory");
+
+   memblock_reserve(ret, PAGE_SIZE * num);
+   } else {
ret = memblock_find_in_range(
local_min_pfn_mapped << PAGE_SHIFT,
local_max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
-   if (!ret)
-   panic("alloc_low_page: can not alloc memory");
-   memblock_reserve(ret, PAGE_SIZE * num);
+   if (!ret)
+   panic("alloc_low_page: can not alloc memory");
+
+   memblock_reserve_local_node(ret, PAGE_SIZE * num,
+   memory_add_physaddr_to_nid(ret));
+   }
+
pfn = ret >> PAGE_SHIFT;
} else {
pfn = pgt_buf_end;
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 5064eed..3b2d1c4 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -23,6 +23,7 @@
 
 /* Definition of memblock flags. */
 enum memblock_flags {
+   MEMBLK_LOCAL_NODE,  /* node-life-cycle data */
__NR_MEMBLK_FLAGS,  /* number of flags */
 };
 
@@ -65,6 +66,7 @@ int memblock_add(phys_addr_t base, phys_addr_t size);
 int memblock_remove(phys_addr_t base, phys_addr_t size);
 int memblock_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
+int memblock_reserve_local_node(phys_addr_t base, phys_addr_t size, int nid);
 void memblock_trim_memory(phys_addr_t align);
 void memblock_mark_kernel_nodes(void);
 
diff --git a/mm/memblock.c b/mm/memblock.c
index 1b93a5d..edde4c2 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -589,6 +589,13 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
   MEMBLK_FLAGS_DEFAULT);
 }
 
+int __init_memblock memblock_reserve_local_node(phys_addr_t base,
+   phys_addr_t size, int nid)
+{
+   unsigned long flags = 1 << MEMBLK_LOCAL_NODE;
+   return memblock_reserve_region(base, size, nid, flags);
+}
+
 /**
  * __next_free_mem_range - next function for for_each_free_mem_range()
  * @idx: pointer to u64 loop variable
-- 
1.7.1



[PATCH v2 05/13] x86, numa, acpi, memory-hotplug: Consider hotplug info when cleanup numa_meminfo.

2013-04-30 Thread Tang Chen
Since we have introduced hotplug info into struct numa_meminfo, we need
to consider it when cleaning up numa_meminfo.

The original logic in numa_cleanup_meminfo() is:
Merge blocks on the same node, holes between which don't overlap with
memory on other nodes.

This patch modifies the numa_cleanup_meminfo() logic like this:
Merge blocks with the same hotpluggable type on the same node, holes
between which don't overlap with memory on other nodes. For example, a
hotpluggable block and a non-hotpluggable block on the same node are no
longer merged, so the hotpluggable range can later be reserved on its own.

Signed-off-by: Tang Chen 
---
 arch/x86/mm/numa.c |   13 +
 1 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index ecf37fd..26d1800 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -296,18 +296,22 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
}
 
/*
-* Join together blocks on the same node, holes
-* between which don't overlap with memory on other
-* nodes.
+* Join together blocks on the same node, with the same
+* hotpluggable flags, holes between which don't overlap
+* with memory on other nodes.
 */
if (bi->nid != bj->nid)
continue;
+   if (bi->hotpluggable != bj->hotpluggable)
+   continue;
+
start = min(bi->start, bj->start);
end = max(bi->end, bj->end);
for (k = 0; k < mi->nr_blks; k++) {
struct numa_memblk *bk = &mi->blk[k];
 
-   if (bi->nid == bk->nid)
+   if (bi->nid == bk->nid &&
+   bi->hotpluggable == bk->hotpluggable)
continue;
if (start < bk->end && end > bk->start)
break;
@@ -327,6 +331,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
mi->blk[i].start = mi->blk[i].end = 0;
mi->blk[i].nid = NUMA_NO_NODE;
+   mi->blk[i].hotpluggable = false;
}
 
return 0;
-- 
1.7.1



[PATCH v2 12/13] x86, numa, acpi, memory-hotplug: Make movablecore=acpi have higher priority.

2013-04-30 Thread Tang Chen
Arranging hotpluggable memory as ZONE_MOVABLE causes NUMA performance to
decrease, because the kernel cannot use movable memory.

Users who don't use memory hotplug and who don't want to lose their NUMA
performance need a way to disable this functionality.

So, if users specify "movablecore=acpi" on the kernel command line, the
kernel will use SRAT to arrange ZONE_MOVABLE, and this takes priority over
the original movablecore and kernelcore boot options.

For those who don't want this, just specify nothing.
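
For illustration, the forms of the option would look like this on the kernel
command line (the sizes are example values, not recommendations):

    movablecore=acpi    # arrange ZONE_MOVABLE from SRAT hotplug info
    movablecore=256M    # original behavior: request 256M of movable memory
    kernelcore=1G       # original behavior: reserve 1G for kernel allocations

When movablecore=acpi is given, the size-based kernelcore/movablecore values
are ignored.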

Signed-off-by: Tang Chen 
---
 include/linux/memblock.h |1 +
 mm/memblock.c|5 +
 mm/page_alloc.c  |   24 +++-
 3 files changed, 29 insertions(+), 1 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 08c761d..5528e8f 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -69,6 +69,7 @@ int memblock_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
 int memblock_reserve_local_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_reserve_hotpluggable(phys_addr_t base, phys_addr_t size, int nid);
+bool memblock_is_hotpluggable(struct memblock_region *region);
 void memblock_free_hotpluggable(void);
 void memblock_trim_memory(phys_addr_t align);
 void memblock_mark_kernel_nodes(void);
diff --git a/mm/memblock.c b/mm/memblock.c
index 54de398..8b9a13c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -623,6 +623,11 @@ int __init_memblock 
memblock_reserve_hotpluggable(phys_addr_t base,
return memblock_reserve_region(base, size, nid, flags);
 }
 
+bool __init_memblock memblock_is_hotpluggable(struct memblock_region *region)
+{
+   return region->flags & (1 << MEMBLK_HOTPLUGGABLE);
+}
+
 /**
  * __next_free_mem_range - next function for for_each_free_mem_range()
  * @idx: pointer to u64 loop variable
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b9ea143..2fe9ebf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4793,9 +4793,31 @@ static void __init find_zone_movable_pfns_for_nodes(void)
nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+   struct memblock_type *reserved = &memblock.reserved;
 
/*
-* If movablecore was specified, calculate what size of
+* If movablecore=acpi was specified, then zone_movable_pfn[] has been
+* initialized, and no more work needs to be done.
+* NOTE: In this case, we ignore the kernelcore option.
+*/
+   if (movablecore_enable_srat) {
+   for (i = 0; i < reserved->cnt; i++) {
+   if (!memblock_is_hotpluggable(&reserved->regions[i]))
+   continue;
+
+   nid = reserved->regions[i].nid;
+
+   usable_startpfn = PFN_DOWN(reserved->regions[i].base);
+   zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+   min(usable_startpfn, zone_movable_pfn[nid]) :
+   usable_startpfn;
+   }
+
+   goto out;
+   }
+
+   /*
+* If movablecore=nn[KMG] was specified, calculate what size of
 * kernelcore that corresponds so that memory usable for
 * any allocation type is evenly spread. If both kernelcore
 * and movablecore are specified, then the value of kernelcore
-- 
1.7.1



[PATCH v2 10/13] x86, acpi, numa, mem-hotplug: Introduce MEMBLK_HOTPLUGGABLE to mark and reserve hotpluggable memory.

2013-04-30 Thread Tang Chen
We mark out movable memory ranges and reserve them in memblock.reserved
with the MEMBLK_HOTPLUGGABLE flag. This should be done after the memory
mapping is initialized, because the kernel now supports allocating
pagetable pages on the local node, and those are kernel pages.

The reserved hotpluggable memory will be freed to the buddy system when
memory initialization is done.

This idea is from Wen Congyang  and Jiang Liu 
.

Suggested-by: Jiang Liu 
Suggested-by: Wen Congyang 
Signed-off-by: Tang Chen 
---
 arch/x86/mm/numa.c   |   28 
 include/linux/memblock.h |3 +++
 mm/memblock.c|   19 +++
 3 files changed, 50 insertions(+), 0 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1367fe4..a1f1f90 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -731,6 +731,32 @@ static void __init early_x86_numa_init_mapping(void)
 }
 #endif
 
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static void __init early_mem_hotplug_init()
+{
+   int i, nid;
+   phys_addr_t start, end;
+
+   if (!movablecore_enable_srat)
+   return;
+
+   for (i = 0; i < numa_meminfo.nr_blks; i++) {
+   if (!numa_meminfo.blk[i].hotpluggable)
+   continue;
+
+   nid = numa_meminfo.blk[i].nid;
+   start = numa_meminfo.blk[i].start;
+   end = numa_meminfo.blk[i].end;
+
+   memblock_reserve_hotpluggable(start, end - start, nid);
+   }
+}
+#else  /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+static inline void early_mem_hotplug_init()
+{
+}
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
 void __init early_initmem_init(void)
 {
early_x86_numa_init();
@@ -740,6 +766,8 @@ void __init early_initmem_init(void)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
 
+   early_mem_hotplug_init();
+
early_memtest(0, max_pfn_mapped<<PAGE_SHIFT);


[PATCH v2 08/13] x86, numa: Move memory_add_physaddr_to_nid() to CONFIG_NUMA.

2013-04-30 Thread Tang Chen
memory_add_physaddr_to_nid() is declared in include/linux/memory_hotplug.h,
protected by CONFIG_NUMA. But in x86, its definition is protected by
CONFIG_MEMORY_HOTPLUG.

memory_add_physaddr_to_nid() uses numa_meminfo to find the nid of a given
physical address. It has nothing to do with memory hotplug, and it can also
be used by alloc_low_pages() to obtain the nid of the allocated memory.

So in x86, use CONFIG_NUMA to protect it as well.
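
For reference, the lookup amounts to scanning numa_meminfo for the block
that contains the address. A simplified sketch (not the verbatim kernel
code):

    /* Sketch: map a physical address to its node id via numa_meminfo. */
    int memory_add_physaddr_to_nid(u64 start)
    {
            struct numa_meminfo *mi = &numa_meminfo;
            int i, nid = mi->blk[0].nid;    /* fall back to the first block */

            for (i = 0; i < mi->nr_blks; i++)
                    if (mi->blk[i].start <= start && start < mi->blk[i].end)
                            nid = mi->blk[i].nid;
            return nid;
    }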

Signed-off-by: Tang Chen 
---
 arch/x86/mm/numa.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 105b092..1367fe4 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -959,7 +959,7 @@ EXPORT_SYMBOL(cpumask_of_node);
 
 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
 
-#ifdef CONFIG_MEMORY_HOTPLUG
+#ifdef CONFIG_NUMA
 int memory_add_physaddr_to_nid(u64 start)
 {
struct numa_meminfo *mi = &numa_meminfo;
-- 
1.7.1



Re: [PATCH v4 00/22] x86, ACPI, numa: Parse numa info early

2013-04-30 Thread Tang Chen

Hi Yinghai, all,

I've tested this patch-set with my following patch-set:
[PATCH v1 00/12] Arrange hotpluggable memory in SRAT as ZONE_MOVABLE.
https://lkml.org/lkml/2013/4/19/94

Using ACPI table override, I overrode SRAT on my box like this:

[0.00] SRAT: Node 0 PXM 0 [mem 0x-0x7fff]
[0.00] SRAT: Node 0 PXM 0 [mem 0x1-0x307ff]
[0.00] SRAT: Node 1 PXM 2 [mem 0x30800-0x583ff] Hot Pluggable
[0.00] SRAT: Node 2 PXM 3 [mem 0x58400-0x7] Hot Pluggable


We had 3 nodes: node0 was not hotpluggable, and node1 and node2 were hotpluggable.



And memblock reserved pagetable pages (with flag 0x1) in local nodes.
..
[0.00]  reserved[0xb]   [0x0307ff-0x0307ff1fff], 0x2000 bytes flags: 0x0
[0.00]  reserved[0xc]   [0x0307ff2000-0x0307ff], 0xe000 bytes on node 0 flags: 0x1
[0.00]  reserved[0xd]   [0x0583ff7000-0x0583ff], 0x9000 bytes on node 1 flags: 0x1
[0.00]  reserved[0xe]   [0x079000-0x07], 0x7000 bytes on node 2 flags: 0x1


And after some bug fixes, memblock can also reserve hotpluggable memory
with flag 0x2.

..
[0.00]  reserved[0xb]   [0x0307ff-0x0307ff1fff], 0x2000 bytes flags: 0x0
[0.00]  reserved[0xc]   [0x0307ff2000-0x0307ff], 0xe000 bytes on node 0 flags: 0x1
[0.00]  reserved[0xd]   [0x030800-0x0583ff6fff], 0x27bff7000 bytes on node 1 flags: 0x2
[0.00]  reserved[0xe]   [0x0583ff7000-0x0583ff], 0x9000 bytes on node 1 flags: 0x1
[0.00]  reserved[0xf]   [0x058400-0x077fff], 0x27bff8000 bytes on node 2 flags: 0x2
[0.00]  reserved[0x10]  [0x078000-0x07], 0x8000 bytes on node 2 flags: 0x1


And it is freed to the buddy system when memory initialization finishes.


So the results:
1. We can parse SRAT earlier correctly.
2. We can override tables correctly.
3. We can put pagetable pages in local node.
4. We can prevent memblock from allocating hotpluggable memory.
5. We can arrange ZONE_MOVABLE using SRAT info.


Known problems:

When we put pagetable pages on the local node, the memory hot-remove logic
won't work.

I'm fixing it now. We need to fix the following:
1. Improve hot-remove to support freeing local node pagetable pages.
2. Improve hot-add to support putting hot-added pagetable pages on the
   local node.
3. Do the same for vmemmap and page_cgroup pages.

So I suggest separating the job into 2 parts:
1. Push Yinghai's patch1 ~ patch20, without putting pagetables on the local
   node, and push my work to use SRAT to arrange ZONE_MOVABLE.
   In this case, we can enable memory hotplug in the kernel first.
2. Merge patch21 and patch22 into the fixing work I am doing now, and push
   them together when finished.

What do you think?

Reviewed-by: Tang Chen 
Tested-by: Tang Chen 

Thanks. :)






Re: [PATCH v4 00/22] x86, ACPI, numa: Parse numa info early

2013-04-26 Thread Tang Chen

Hi Yinghai,

It has been a long time since this patch-set was sent. I think we need to
do something to push it.

In my understanding, this patch-set does 2 things:
1. Parse numa info earlier, with some improvements for
   ACPI_INITRD_TABLE_OVERRIDE. (patch1 ~ patch20)
2. Allocate pagetables on the local node at boot time. (patch21 ~ patch22)


As you know, the current implementation of memory hot-remove is not based on
putting pagetables on the local node. If we put pagetables on the local node
at boot time, memory hot-remove won't be able to work as before.

I agree that this should be fixed. But we have the following two reasons to
push the "Parse numa info earlier" part first, and improve the performance
later.


1. patch21 and patch22 only affect the performance, not the functionality.
   I think we can make memory hot-remove work in the kernel first, and then
   improve the performance.

2. Besides putting pagetables on the local node at boot time, there are many
   other things that need to be done. I'm working on improving the hot-add
   code to allocate pagetable and vmemmap pages on the local node, and
   improving the hot-remove code to support freeing this kind of memory.

So in order to push this patch-set and the memory hot-remove functionality,
shall we divide it into 2 steps:

1. Push patch1 ~ patch20, and I'll push the remaining memory hot-remove work
   together with it.

2. Merge your "putting pagetable in local node" work with the performance
   improvement work I'm doing, and improve the performance.

What do you think?

BTW, I'm testing your patch-set, and will give a result next week.
I can also help to rebase it if you like.

Thanks. :)



[PATCH v1 11/12] x86, numa, acpi, memory-hotplug: Make movablecore=acpi have higher priority.

2013-04-19 Thread Tang Chen
Arranging hotpluggable memory as ZONE_MOVABLE causes NUMA performance to
decrease, because the kernel cannot use movable memory.

Users who don't use memory hotplug and who don't want to lose their NUMA
performance need a way to disable this functionality.

So, if users specify "movablecore=acpi" on the kernel command line, the
kernel will use SRAT to arrange ZONE_MOVABLE, and this takes priority over
the original movablecore and kernelcore boot options.

For those who don't want this, just specify nothing.

Signed-off-by: Tang Chen 
---
 include/linux/memblock.h |1 +
 mm/memblock.c|5 +
 mm/page_alloc.c  |   24 +++-
 3 files changed, 29 insertions(+), 1 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 08c761d..5528e8f 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -69,6 +69,7 @@ int memblock_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
 int memblock_reserve_local_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_reserve_hotpluggable(phys_addr_t base, phys_addr_t size, int nid);
+bool memblock_is_hotpluggable(struct memblock_region *region);
 void memblock_free_hotpluggable(void);
 void memblock_trim_memory(phys_addr_t align);
 void memblock_mark_kernel_nodes(void);
diff --git a/mm/memblock.c b/mm/memblock.c
index 54de398..8b9a13c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -623,6 +623,11 @@ int __init_memblock 
memblock_reserve_hotpluggable(phys_addr_t base,
return memblock_reserve_region(base, size, nid, flags);
 }
 
+bool __init_memblock memblock_is_hotpluggable(struct memblock_region *region)
+{
+   return region->flags & (1 << MEMBLK_HOTPLUGGABLE);
+}
+
 /**
  * __next_free_mem_range - next function for for_each_free_mem_range()
  * @idx: pointer to u64 loop variable
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b9ea143..2fe9ebf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4793,9 +4793,31 @@ static void __init find_zone_movable_pfns_for_nodes(void)
nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+   struct memblock_type *reserved = &memblock.reserved;
 
/*
-* If movablecore was specified, calculate what size of
+* If movablecore=acpi was specified, then zone_movable_pfn[] has been
+* initialized, and no more work needs to be done.
+* NOTE: In this case, we ignore the kernelcore option.
+*/
+   if (movablecore_enable_srat) {
+   for (i = 0; i < reserved->cnt; i++) {
+   if (!memblock_is_hotpluggable(&reserved->regions[i]))
+   continue;
+
+   nid = reserved->regions[i].nid;
+
+   usable_startpfn = PFN_DOWN(reserved->regions[i].base);
+   zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+   min(usable_startpfn, zone_movable_pfn[nid]) :
+   usable_startpfn;
+   }
+
+   goto out;
+   }
+
+   /*
+* If movablecore=nn[KMG] was specified, calculate what size of
 * kernelcore that corresponds so that memory usable for
 * any allocation type is evenly spread. If both kernelcore
 * and movablecore are specified, then the value of kernelcore
-- 
1.7.1



[PATCH v1 05/12] x86, numa, acpi, memory-hotplug: Consider hotplug info when cleanup numa_meminfo.

2013-04-19 Thread Tang Chen
Since we have introduced hotplug info into struct numa_meminfo, we need
to consider it when cleaning up numa_meminfo.

The original logic in numa_cleanup_meminfo() is:
Merge blocks on the same node, holes between which don't overlap with
memory on other nodes.

This patch changes the numa_cleanup_meminfo() logic as follows:
Merge blocks with the same hotpluggable type on the same node, holes
between which don't overlap with memory on other nodes.

Signed-off-by: Tang Chen 
---
 arch/x86/mm/numa.c |   13 +
 1 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index ecf37fd..26d1800 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -296,18 +296,22 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
}
 
/*
-* Join together blocks on the same node, holes
-* between which don't overlap with memory on other
-* nodes.
+* Join together blocks on the same node, with the same
+* hotpluggable flags, holes between which don't overlap
+* with memory on other nodes.
 */
if (bi->nid != bj->nid)
continue;
+   if (bi->hotpluggable != bj->hotpluggable)
+   continue;
+
start = min(bi->start, bj->start);
end = max(bi->end, bj->end);
for (k = 0; k < mi->nr_blks; k++) {
struct numa_memblk *bk = &mi->blk[k];
 
-   if (bi->nid == bk->nid)
+   if (bi->nid == bk->nid &&
+   bi->hotpluggable == bk->hotpluggable)
continue;
if (start < bk->end && end > bk->start)
break;
@@ -327,6 +331,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
mi->blk[i].start = mi->blk[i].end = 0;
mi->blk[i].nid = NUMA_NO_NODE;
+   mi->blk[i].hotpluggable = false;
}
 
return 0;
-- 
1.7.1



[PATCH v1 03/12] page_alloc, mem-hotplug: Improve movablecore to {en|dis}able using SRAT.

2013-04-19 Thread Tang Chen
The Hot-Pluggable Field in SRAT specifies which memory ranges are
hotpluggable. We will arrange hotpluggable memory as ZONE_MOVABLE for users
who want to use the memory hotplug functionality. But this will cause NUMA
performance to decrease, because the kernel cannot use ZONE_MOVABLE.

So we improve the movablecore boot option to allow those who want to use
memory hotplug to enable using SRAT info to arrange movable memory.

Users can specify "movablecore=acpi" in kernel commandline to enable this
functionality.

For those who don't use memory hotplug or who don't want to lose their NUMA
performance, just don't specify anything. The kernel will work as before.

Suggested-by: Kamezawa Hiroyuki 
Signed-off-by: Tang Chen 
---
 include/linux/memory_hotplug.h |3 +++
 mm/page_alloc.c|   13 +
 2 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index b6a3be7..18fe2a3 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -33,6 +33,9 @@ enum {
ONLINE_MOVABLE,
 };
 
+/* Enable/disable SRAT in movablecore boot option */
+extern bool movablecore_enable_srat;
+
 /*
  * pgdat resizing functions
  */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f368db4..b9ea143 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -208,6 +208,8 @@ static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
 
+bool __initdata movablecore_enable_srat = false;
+
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
 EXPORT_SYMBOL(movable_zone);
@@ -5025,6 +5027,12 @@ void __init free_area_init_nodes(unsigned long 
*max_zone_pfn)
}
 }
 
+static void __init cmdline_movablecore_srat(char *p)
+{
+   if (p && !strcmp(p, "acpi"))
+   movablecore_enable_srat = true;
+}
+
 static int __init cmdline_parse_core(char *p, unsigned long *core)
 {
unsigned long long coremem;
@@ -5055,6 +5063,11 @@ static int __init cmdline_parse_kernelcore(char *p)
  */
 static int __init cmdline_parse_movablecore(char *p)
 {
+   cmdline_movablecore_srat(p);
+
+   if (movablecore_enable_srat)
+   return 0;
+
return cmdline_parse_core(p, &required_movablecore);
 }
 
-- 
1.7.1



[PATCH v1 01/12] x86: get pg_data_t's memory from other node

2013-04-19 Thread Tang Chen
From: Yasuaki Ishimatsu 

If the system can create a movable node, in which all of the node's memory
is allocated as ZONE_MOVABLE, setup_node_data() cannot allocate memory for
the node's pg_data_t from that node.
So, use memblock_alloc_try_nid() instead of memblock_alloc_nid()
to retry on other nodes when the node-local allocation fails.
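
Conceptually, the retry behaves like the following fallback (a sketch; the
helper name alloc_pgdat_mem is made up, and the exact memblock calls are
assumed from the memblock API of this era):

    /* Sketch: prefer node-local memory, then fall back to any node. */
    static phys_addr_t __init alloc_pgdat_mem(phys_addr_t size,
                                              phys_addr_t align, int nid)
    {
            phys_addr_t pa = memblock_alloc_nid(size, align, nid);

            if (!pa)        /* node-local allocation failed */
                    pa = memblock_alloc(size, align);   /* any node */
            return pa;
    }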

Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Lai Jiangshan 
Signed-off-by: Tang Chen 
Signed-off-by: Jiang Liu 
---
 arch/x86/mm/numa.c |5 ++---
 1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 11acdf6..4f754e6 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -214,10 +214,9 @@ static void __init setup_node_data(int nid, u64 start, u64 
end)
 * Allocate node data.  Try node-local memory and then any node.
 * Never allocate in DMA zone.
 */
-   nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+   nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
if (!nd_pa) {
-   pr_err("Cannot find %zu bytes in node %d\n",
-  nd_size, nid);
+   pr_err("Cannot find %zu bytes in any node\n", nd_size);
return;
}
nd = __va(nd_pa);
-- 
1.7.1



[PATCH v1 06/12] memblock, numa: Introduce flag into memblock.

2013-04-19 Thread Tang Chen
There is no flag in memblock to describe what type the memory is.
Sometimes, we may use memblock to reserve some memory for special usage.
For example, as Yinghai did in his patch, allocating pagetables on the local
node before all the memory on the node is mapped.
Please refer to Yinghai's patch:
v1: https://lkml.org/lkml/2013/3/7/642
v2: https://lkml.org/lkml/2013/3/10/47
v3: https://lkml.org/lkml/2013/4/4/639
v4: https://lkml.org/lkml/2013/4/11/829

In a hotplug environment, doing so could cause some problems when we
hot-remove memory. Pagetable pages are kernel memory, which we cannot
migrate. But we can put them on the local node, because their life cycle is
the same as the node's. So we need to free them all before hot-removing the
memory.

Actually, any data whose life cycle is the same as a node's, such as
pagetable pages, vmemmap pages, and page_cgroup pages, could be put on the
local node. They can all be freed when we hot-remove a whole node.

In order to do so, we need to mark out these special pages in memblock.
In this patch, we introduce a new "flags" member into memblock_region:
   struct memblock_region {
   phys_addr_t base;
   phys_addr_t size;
   unsigned long flags;
   #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
   int nid;
   #endif
   };

This patch does the following things:
1) Add a "flags" member to memblock_region, and a MEMBLK_ANY flag for common
   usage (the flag mechanics are sketched after this list).
2) Modify the following APIs' prototypes:
memblock_add_region()
memblock_insert_region()
3) Add memblock_reserve_region() to support reserving memory with flags, and
   keep memblock_reserve()'s prototype unmodified.
4) Modify other APIs to support flags, but keep their prototypes unmodified.
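
As a rough illustration of the flag mechanics (the helper names below are
made up for the sketch; only struct memblock_region and its flags member
come from this patch):

    /* Sketch: flags is a bitmask indexed by enum memblock_flags. */
    static inline void memblk_set_flag(struct memblock_region *r, int flag)
    {
            r->flags |= 1UL << flag;
    }

    static inline bool memblk_test_flag(const struct memblock_region *r,
                                        int flag)
    {
            return r->flags & (1UL << flag);
    }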

The idea is from Wen Congyang  and Liu Jiang 
.

Suggested-by: Wen Congyang 
Suggested-by: Liu Jiang 
Signed-off-by: Tang Chen 
---
 include/linux/memblock.h |8 ++
 mm/memblock.c|   56 +
 2 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f388203..c63a66e 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -19,9 +19,17 @@
 
 #define INIT_MEMBLOCK_REGIONS  128
 
+#define MEMBLK_FLAGS_DEFAULT   0
+
+/* Definition of memblock flags. */
+enum memblock_flags {
+   __NR_MEMBLK_FLAGS,  /* number of flags */
+};
+
 struct memblock_region {
phys_addr_t base;
phys_addr_t size;
+   unsigned long flags;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int nid;
 #endif
diff --git a/mm/memblock.c b/mm/memblock.c
index 16eda3d..63924ae 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -157,6 +157,7 @@ static void __init_memblock memblock_remove_region(struct 
memblock_type *type, u
type->cnt = 1;
type->regions[0].base = 0;
type->regions[0].size = 0;
+   type->regions[0].flags = 0;
memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
}
 }
@@ -307,7 +308,8 @@ static void __init_memblock memblock_merge_regions(struct 
memblock_type *type)
 
if (this->base + this->size != next->base ||
memblock_get_region_node(this) !=
-   memblock_get_region_node(next)) {
+   memblock_get_region_node(next) ||
+   this->flags != next->flags) {
BUG_ON(this->base + this->size > next->base);
i++;
continue;
@@ -327,13 +329,15 @@ static void __init_memblock memblock_merge_regions(struct 
memblock_type *type)
  * @base:  base address of the new region
  * @size:  size of the new region
  * @nid:   node id of the new region
+ * @flags: flags of the new region
  *
  * Insert new memblock region [@base,@base+@size) into @type at @idx.
  * @type must already have extra room to accomodate the new region.
  */
 static void __init_memblock memblock_insert_region(struct memblock_type *type,
   int idx, phys_addr_t base,
-  phys_addr_t size, int nid)
+  phys_addr_t size,
+  int nid, unsigned long flags)
 {
struct memblock_region *rgn = &type->regions[idx];
 
@@ -341,6 +345,7 @@ static void __init_memblock memblock_insert_region(struct 
memblock_type *type,
memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
rgn->base = base;
rgn->size = size;
+   rgn->flags = flags;
memblock_set_region_node(rgn, nid);
type->cnt++;
type->total_size += size;
@@ -352,6 +357,7 @@ static void __init_memblock memblock_insert_region(struct 
memblock_type *type,
  * @base: base address of the new region
 * @size:  size of the new region

[PATCH v1 04/12] x86, numa, acpi, memory-hotplug: Introduce hotplug info into struct numa_meminfo.

2013-04-19 Thread Tang Chen
Since Yinghai has implemented "Allocate pagetable pages in local node", for a
node with hotpluggable memory, we have to allocate pagetable pages first, and
then reserve the rest as hotpluggable memory in memblock.

But the kernel parses SRAT first, and then initializes the memory mapping. So
we have to remember which memory ranges are hotpluggable for future use.

When parsing SRAT, we add each memory range to numa_meminfo. So we can store
the hotpluggable info in numa_meminfo.

This patch introduces a "bool hotpluggable" member into struct
numa_memblk, the element type of numa_meminfo.

And modifies the following APIs' prototypes to support it:
   - numa_add_memblk()
   - numa_add_memblk_to()

And the following callers:
   - numaq_register_node()
   - dummy_numa_init()
   - amd_numa_init()
   - acpi_numa_memory_affinity_init() in x86

Signed-off-by: Tang Chen 
---
 arch/x86/include/asm/numa.h |3 ++-
 arch/x86/kernel/apic/numaq_32.c |2 +-
 arch/x86/mm/amdtopology.c   |3 ++-
 arch/x86/mm/numa.c  |   10 +++---
 arch/x86/mm/numa_internal.h |1 +
 arch/x86/mm/srat.c  |2 +-
 6 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 1b99ee5..73096b2 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -31,7 +31,8 @@ extern int numa_off;
 extern s16 __apicid_to_node[MAX_LOCAL_APIC];
 extern nodemask_t numa_nodes_parsed __initdata;
 
-extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
+extern int __init numa_add_memblk(int nodeid, u64 start, u64 end,
+ bool hotpluggable);
 extern void __init numa_set_distance(int from, int to, int distance);
 
 static inline void set_apicid_to_node(int apicid, s16 node)
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index d661ee9..7a9c542 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -82,7 +82,7 @@ static inline void numaq_register_node(int node, struct 
sys_cfg_data *scd)
int ret;
 
node_set(node, numa_nodes_parsed);
-   ret = numa_add_memblk(node, start, end);
+   ret = numa_add_memblk(node, start, end, false);
BUG_ON(ret < 0);
 }
 
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 5247d01..d521471 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -167,7 +167,8 @@ int __init amd_numa_init(void)
nodeid, base, limit);
 
prevbase = base;
-   numa_add_memblk(nodeid, base, limit);
+   /* Do not support memory hotplug for AMD cpu. */
+   numa_add_memblk(nodeid, base, limit, false);
node_set(nodeid, numa_nodes_parsed);
}
 
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4f754e6..ecf37fd 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -134,6 +134,7 @@ void __init setup_node_to_cpumask_map(void)
 }
 
 static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+bool hotpluggable,
 struct numa_meminfo *mi)
 {
/* ignore zero length blks */
@@ -155,6 +156,7 @@ static int __init numa_add_memblk_to(int nid, u64 start, 
u64 end,
mi->blk[mi->nr_blks].start = start;
mi->blk[mi->nr_blks].end = end;
mi->blk[mi->nr_blks].nid = nid;
+   mi->blk[mi->nr_blks].hotpluggable = hotpluggable;
mi->nr_blks++;
return 0;
 }
@@ -179,15 +181,17 @@ void __init numa_remove_memblk_from(int idx, struct 
numa_meminfo *mi)
  * @nid: NUMA node ID of the new memblk
  * @start: Start address of the new memblk
  * @end: End address of the new memblk
+ * @hotpluggable: True if memblk is hotpluggable
  *
  * Add a new memblk to the default numa_meminfo.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int __init numa_add_memblk(int nid, u64 start, u64 end)
+int __init numa_add_memblk(int nid, u64 start, u64 end,
+  bool hotpluggable)
 {
-   return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+   return numa_add_memblk_to(nid, start, end, hotpluggable, &numa_meminfo);
 }
 
 /* Initialize NODE_DATA for a node on the local memory */
@@ -631,7 +635,7 @@ static int __init dummy_numa_init(void)
   0LLU, PFN_PHYS(max_pfn) - 1);
 
node_set(0, numa_nodes_parsed);
-   numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
+   numa_add_memblk(0, 0, PFN_PHYS(max_pfn), false);
 
return 0;
 }
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index bb2fbcc..1ce4e6b 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -8,6 +8,7 @@ struct numa_memblk {
u64 start;
u64 end;
int nid;
+   bool hotpluggable;

[PATCH v1 07/12] x86, numa, mem-hotplug: Mark nodes which the kernel resides in.

2013-04-19 Thread Tang Chen
If all the memory ranges in SRAT are hotpluggable, we should not
arrange them all in ZONE_MOVABLE. Otherwise the kernel won't have
enough memory to boot.

This patch introduces a global nodemask, memblock_kernel_nodemask, to mark
all the nodes the kernel resides in. No matter whether they are
hotpluggable, we arrange them as un-hotpluggable.
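
A minimal sketch of the mark/test pair (memblock_is_kernel_node() is
declared by a later patch in this series; its body here is a guess at the
obvious implementation):

    /* Sketch: record and query the nodes the kernel resides in. */
    static nodemask_t memblock_kernel_nodemask __initdata_memblock;

    static void __init_memblock mark_kernel_node(int nid)
    {
            node_set(nid, memblock_kernel_nodemask);
    }

    bool __init_memblock memblock_is_kernel_node(int nid)
    {
            return node_isset(nid, memblock_kernel_nodemask);
    }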

Signed-off-by: Tang Chen 
---
 arch/x86/mm/numa.c   |6 ++
 include/linux/memblock.h |1 +
 mm/memblock.c|   20 
 3 files changed, 27 insertions(+), 0 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 26d1800..105b092 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -658,6 +658,12 @@ static bool srat_used __initdata;
  */
 static void __init early_x86_numa_init(void)
 {
+   /*
+* Need to find out which nodes the kernel resides in, and arrange
+* them as un-hotpluggable when parsing SRAT.
+*/
+   memblock_mark_kernel_nodes();
+
if (!numa_off) {
 #ifdef CONFIG_X86_NUMAQ
if (!numa_init(numaq_numa_init))
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index c63a66e..5064eed 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -66,6 +66,7 @@ int memblock_remove(phys_addr_t base, phys_addr_t size);
 int memblock_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
 void memblock_trim_memory(phys_addr_t align);
+void memblock_mark_kernel_nodes(void);
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
diff --git a/mm/memblock.c b/mm/memblock.c
index 63924ae..1b93a5d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -35,6 +35,9 @@ struct memblock memblock __initdata_memblock = {
.current_limit  = MEMBLOCK_ALLOC_ANYWHERE,
 };
 
+/* Mark which nodes the kernel resides in. */
+static nodemask_t memblock_kernel_nodemask __initdata_memblock;
+
 int memblock_debug __initdata_memblock;
 static int memblock_can_resize __initdata_memblock;
 static int memblock_memory_in_slab __initdata_memblock = 0;
@@ -787,6 +790,23 @@ int __init_memblock memblock_set_node(phys_addr_t base, 
phys_addr_t size,
memblock_merge_regions(type);
return 0;
 }
+
+void __init_memblock memblock_mark_kernel_nodes()
+{
+   int i, nid;
+   struct memblock_type *reserved = &memblock.reserved;
+
+   for (i = 0; i < reserved->cnt; i++)
+   if (reserved->regions[i].flags == MEMBLK_FLAGS_DEFAULT) {
+   nid = memblock_get_region_node(&reserved->regions[i]);
+   node_set(nid, memblock_kernel_nodemask);
+   }
+}
+#else
+void __init_memblock memblock_mark_kernel_nodes()
+{
+   node_set(0, memblock_kernel_nodemask);
+}
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
-- 
1.7.1



[PATCH v1 02/12] acpi: Print Hot-Pluggable Field in SRAT.

2013-04-19 Thread Tang Chen
The Hot-Pluggable field in SRAT indicates whether the memory could be
hotplugged while the system is running. Printing it as well when parsing
SRAT will help users know which memory is hotpluggable.
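
With this change, a hotpluggable range shows up in the boot log like this
(example values):

    SRAT: Node 1 PXM 2 [mem 0x100000000-0x1ffffffff] Hot Pluggable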

Signed-off-by: Tang Chen 
---
 arch/x86/mm/srat.c |9 ++---
 1 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 443f9ef..5055fa7 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -146,6 +146,7 @@ int __init
 acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 {
u64 start, end;
+   u32 hotpluggable;
int node, pxm;
 
if (srat_disabled())
@@ -154,7 +155,8 @@ acpi_numa_memory_affinity_init(struct 
acpi_srat_mem_affinity *ma)
goto out_err_bad_srat;
if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
goto out_err;
-   if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
+   hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
+   if (hotpluggable && !save_add_info())
goto out_err;
 
start = ma->base_address;
@@ -174,9 +176,10 @@ acpi_numa_memory_affinity_init(struct 
acpi_srat_mem_affinity *ma)
 
node_set(node, numa_nodes_parsed);
 
-   printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
+   printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n",
   node, pxm,
-  (unsigned long long) start, (unsigned long long) end - 1);
+  (unsigned long long) start, (unsigned long long) end - 1,
+  hotpluggable ? "Hot Pluggable" : "");
 
return 0;
 out_err_bad_srat:
-- 
1.7.1



[PATCH v1 10/12] x86, memblock, mem-hotplug: Free hotpluggable memory reserved by memblock.

2013-04-19 Thread Tang Chen
We reserved hotpluggable memory in memblock. When memory initialization
is done, we have to free it to the buddy system.

This patch frees the memory reserved by memblock with the MEMBLK_HOTPLUGGABLE flag.

Signed-off-by: Tang Chen 
---
 include/linux/memblock.h |1 +
 mm/memblock.c|   20 
 mm/nobootmem.c   |3 +++
 3 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 0f01930..08c761d 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -69,6 +69,7 @@ int memblock_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
 int memblock_reserve_local_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_reserve_hotpluggable(phys_addr_t base, phys_addr_t size, int nid);
+void memblock_free_hotpluggable(void);
 void memblock_trim_memory(phys_addr_t align);
 void memblock_mark_kernel_nodes(void);
 bool memblock_is_kernel_node(int nid);
diff --git a/mm/memblock.c b/mm/memblock.c
index 0c55588..54de398 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -568,6 +568,26 @@ int __init_memblock memblock_free(phys_addr_t base, 
phys_addr_t size)
return __memblock_remove(&memblock.reserved, base, size);
 }
 
+static void __init_memblock memblock_free_flags(unsigned long flags)
+{
+   int i;
+   struct memblock_type *reserved = &memblock.reserved;
+
+   /* Iterate backwards: removing entry i shifts later entries down. */
+   for (i = reserved->cnt - 1; i >= 0; i--) {
+   if (reserved->regions[i].flags == flags)
+   memblock_remove_region(reserved, i);
+   }
+}
+
+void __init_memblock memblock_free_hotpluggable()
+{
+   unsigned long flags = 1 << MEMBLK_HOTPLUGGABLE;
+
+   memblock_dbg("memblock: free all hotpluggable memory");
+
+   memblock_free_flags(flags);
+}
+
 static int __init_memblock memblock_reserve_region(phys_addr_t base,
   phys_addr_t size,
   int nid,
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 5e07d36..cd85604 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -165,6 +165,9 @@ unsigned long __init free_all_bootmem(void)
for_each_online_pgdat(pgdat)
reset_node_lowmem_managed_pages(pgdat);
 
+   /* Hotpluggable memory reserved by memblock should also be freed. */
+   memblock_free_hotpluggable();
+
/*
 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
 *  because in some case like Node0 doesn't have RAM installed
-- 
1.7.1



[PATCH v1 08/12] x86, numa, memblock: Introduce MEMBLK_LOCAL_NODE to mark and reserve node-life-cycle data.

2013-04-19 Thread Tang Chen
Node-life-cycle data (data whose life cycle is the same as its node's)
allocated by memblock should be marked, so that when we free usable
memory to the buddy system we can skip it.

This patch introduces a MEMBLK_LOCAL_NODE flag for memblock to reserve
node-life-cycle data. For now, this is only the kernel direct mapping
pagetable pages, based on Yinghai's patch.
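
The intended effect on the free path can be sketched as follows
(illustrative; the actual skip logic lands in the free-to-buddy patch later
in this series):

    /* Sketch: when returning memblock.reserved to the buddy allocator,
     * skip regions whose life cycle is tied to their node. */
    for (i = 0; i < reserved->cnt; i++) {
            struct memblock_region *r = &reserved->regions[i];

            if (r->flags & (1UL << MEMBLK_LOCAL_NODE))
                    continue;       /* freed only at node hot-remove */
            /* ... hand [r->base, r->base + r->size) back to buddy ... */
    }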

Signed-off-by: Tang Chen 
---
 arch/x86/mm/init.c   |   16 
 include/linux/memblock.h |2 ++
 mm/memblock.c|7 +++
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8d0007a..1261e2e 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -62,14 +62,22 @@ __ref void *alloc_low_pages(unsigned int num)
low_min_pfn_mapped << PAGE_SHIFT,
low_max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
-   } else
+   if (!ret)
+   panic("alloc_low_page: can not alloc memory");
+
+   memblock_reserve(ret, PAGE_SIZE * num);
+   } else {
ret = memblock_find_in_range(
local_min_pfn_mapped << PAGE_SHIFT,
local_max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
-   if (!ret)
-   panic("alloc_low_page: can not alloc memory");
-   memblock_reserve(ret, PAGE_SIZE * num);
+   if (!ret)
+   panic("alloc_low_page: can not alloc memory");
+
+   memblock_reserve_local_node(ret, PAGE_SIZE * num,
+   MAX_NUMNODES);
+   }
+
pfn = ret >> PAGE_SHIFT;
} else {
pfn = pgt_buf_end;
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 5064eed..3b2d1c4 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -23,6 +23,7 @@
 
 /* Definition of memblock flags. */
 enum memblock_flags {
+   MEMBLK_LOCAL_NODE,  /* node-life-cycle data */
__NR_MEMBLK_FLAGS,  /* number of flags */
 };
 
@@ -65,6 +66,7 @@ int memblock_add(phys_addr_t base, phys_addr_t size);
 int memblock_remove(phys_addr_t base, phys_addr_t size);
 int memblock_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
+int memblock_reserve_local_node(phys_addr_t base, phys_addr_t size, int nid);
 void memblock_trim_memory(phys_addr_t align);
 void memblock_mark_kernel_nodes(void);
 
diff --git a/mm/memblock.c b/mm/memblock.c
index 1b93a5d..edde4c2 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -589,6 +589,13 @@ int __init_memblock memblock_reserve(phys_addr_t base, 
phys_addr_t size)
   MEMBLK_FLAGS_DEFAULT);
 }
 
+int __init_memblock memblock_reserve_local_node(phys_addr_t base,
+   phys_addr_t size, int nid)
+{
+   unsigned long flags = 1 << MEMBLK_LOCAL_NODE;
+   return memblock_reserve_region(base, size, nid, flags);
+}
+
 /**
  * __next_free_mem_range - next function for for_each_free_mem_range()
  * @idx: pointer to u64 loop variable
-- 
1.7.1



[PATCH v1 00/12] Arrange hotpluggable memory in SRAT as ZONE_MOVABLE.

2013-04-19 Thread Tang Chen
In memory hotplug situation, the hotpluggable memory should be
arranged in ZONE_MOVABLE because memory in ZONE_NORMAL may be
used by kernel, and Linux cannot migrate pages used by kernel.

So we need a way to specify hotpluggable memory as movable. It
should be as easy as possible.

According to ACPI spec 5.0, the SRAT table has a memory affinity
structure, and the structure has a Hot Pluggable Field.
See "5.2.16.2 Memory Affinity Structure".

If we use this information, we might be able to have hotpluggable memory
specified by firmware. For example, if the Hot Pluggable Field is enabled,
the kernel sets the memory as movable memory.

To achieve this goal, we need to do the following:
1. Prevent memblock from allocating hotpluggable memory for the kernel.
   This is done by reserving hotpluggable memory in memblock with the
   following steps:
   1) Parse SRAT early enough so that memblock knows which memory
  is hotpluggable.
   2) Add a "flags" member to memblock so that it is able to tell
  which memory is hotpluggable when freeing it to buddy.

2. Free hotpluggable memory to buddy system when memory initialization
   is done.

3. Arrange hotpluggable memory in ZONE_MOVABLE.
   (This will cause NUMA performance decreased)

4. Provide a user interface to enable/disable this functionality.
   (This is useful for those who don't use memory hotplug and who don't
want to lose their NUMA performance.)


This patch-set does the following:
patch1:Fix a little problem.
patch2:Have Hot-Pluggable Field in SRAT printed when parsing SRAT.
patch4,5:  Introduce hotpluggable field to numa_meminfo.
patch6,7:  Introduce flags to memblock, and keep the public APIs prototype
   unmodified.
patch8:Reserve node-life-cycle memory as MEMBLK_LOCAL_NODE with 
memblock.
patch9,10: Reserve hotpluggable memory as MEMBLK_HOTPLUGGABLE with memblock,
   and free it to buddy when memory initialization is done.
patch3,11,12:  Improve "movablecore" boot option to support "movablecore=acpi".


This patch-set is based on Yinghai's
"x86, ACPI, numa: Parse numa info early" patch-set.
Please refer to:
v1: https://lkml.org/lkml/2013/3/7/642
v2: https://lkml.org/lkml/2013/3/10/47
v3: https://lkml.org/lkml/2013/4/4/639
v4: https://lkml.org/lkml/2013/4/11/829

And Yinghai's patch did the following things:
1) Parse SRAT early enough.
2) Allocate pagetable pages in local node.


Tang Chen (11):
  acpi: Print Hot-Pluggable Field in SRAT.
  page_alloc, mem-hotplug: Improve movablecore to {en|dis}able using
SRAT.
  x86, numa, acpi, memory-hotplug: Introduce hotplug info into struct
numa_meminfo.
  x86, numa, acpi, memory-hotplug: Consider hotplug info when cleanup
numa_meminfo.
  memblock, numa: Introduce flag into memblock.
  x86, numa, mem-hotplug: Mark nodes which the kernel resides in.
  x86, numa, memblock: Introduce MEMBLK_LOCAL_NODE to mark and reserve
node-life-cycle data.
  x86, acpi, numa, mem-hotplug: Introduce MEMBLK_HOTPLUGGABLE to mark
and reserve hotpluggable memory.
  x86, memblock, mem-hotplug: Free hotpluggable memory reserved by
memblock.
  x86, numa, acpi, memory-hotplug: Make movablecore=acpi have higher
priority.
  doc, page_alloc, acpi, mem-hotplug: Add doc for movablecore=acpi boot
option.

Yasuaki Ishimatsu (1):
  x86: get pg_data_t's memory from other node

 Documentation/kernel-parameters.txt |8 ++
 arch/x86/include/asm/numa.h |3 +-
 arch/x86/kernel/apic/numaq_32.c |2 +-
 arch/x86/mm/amdtopology.c   |3 +-
 arch/x86/mm/init.c  |   16 +++-
 arch/x86/mm/numa.c  |   60 ++---
 arch/x86/mm/numa_internal.h |1 +
 arch/x86/mm/srat.c  |   11 ++-
 include/linux/memblock.h|   16 +
 include/linux/memory_hotplug.h  |3 +
 mm/memblock.c   |  127 ++
 mm/nobootmem.c  |3 +
 mm/page_alloc.c |   37 ++-
 13 files changed, 253 insertions(+), 37 deletions(-)



[PATCH v1 12/12] doc, page_alloc, acpi, mem-hotplug: Add doc for movablecore=acpi boot option.

2013-04-19 Thread Tang Chen
Since we modified the movablecore boot option to support
"movablecore=acpi", this patch adds documentation for it.

Signed-off-by: Tang Chen 
---
 Documentation/kernel-parameters.txt |8 
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 4609e81..a1c515b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1649,6 +1649,14 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
that the amount of memory usable for all allocations
is not too small.
 
+   movablecore=acpi[KNL,X86] This parameter will enable the
+   kernel to arrange ZONE_MOVABLE with the help of
+   Hot-Pluggable Field in SRAT. All the hotpluggable
+   memory will be arranged in ZONE_MOVABLE.
+   NOTE: Any node which the kernel resides in will
+ always be un-hotpluggable so that the kernel
+ will always have enough memory to boot.
+
MTD_Partition=  [MTD]
Format: ,,,
 
-- 
1.7.1



[PATCH v1 09/12] x86, acpi, numa, mem-hotplug: Introduce MEMBLK_HOTPLUGGABLE to mark and reserve hotpluggable memory.

2013-04-19 Thread Tang Chen
We mark out movable memory ranges and reserve them in memblock.reserved
with the MEMBLK_HOTPLUGGABLE flag. This should be done after the memory
mapping is initialized, because the kernel now supports allocating
pagetable pages on the local node, and those are kernel pages.

The reserved hotpluggable memory will be freed to the buddy system when
memory initialization is done.

This idea is from Wen Congyang  and Jiang Liu 
.

Suggested-by: Jiang Liu 
Suggested-by: Wen Congyang 
Signed-off-by: Tang Chen 
---
 arch/x86/mm/numa.c   |   26 ++
 include/linux/memblock.h |3 +++
 mm/memblock.c|   19 +++
 3 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 105b092..6f61691 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -731,6 +731,30 @@ static void __init early_x86_numa_init_mapping(void)
 }
 #endif
 
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static void __init early_mem_hotplug_init()
+{
+   int i, nid;
+   phys_addr_t start, end;
+
+   if (!movablecore_enable_srat)
+   return;
+
+   for (i = 0; i < numa_meminfo.nr_blks; i++) {
+   if (!numa_meminfo.blk[i].hotpluggable)
+   continue;
+
+   nid = numa_meminfo.blk[i].nid;
+   start = numa_meminfo.blk[i].start;
+   end = numa_meminfo.blk[i].end;
+
+   memblock_reserve_hotpluggable(start, end - start, nid);
+   }
+}
+#else  /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+static inline void early_mem_hotplug_init()
+{
+}
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
 void __init early_initmem_init(void)
 {
early_x86_numa_init();
@@ -740,6 +764,8 @@ void __init early_initmem_init(void)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
 
+   early_mem_hotplug_init();
+
early_memtest(0, max_pfn_mapped<<PAGE_SHIFT);


[tip:x86/mm] x86/mm/hotplug: Put kernel_physical_mapping_remove() declaration in CONFIG_MEMORY_HOTREMOVE

2013-04-15 Thread tip-bot for Tang Chen
Commit-ID:  587ff8c4eab1587044e69156f997e9d1d3b07709
Gitweb: http://git.kernel.org/tip/587ff8c4eab1587044e69156f997e9d1d3b07709
Author: Tang Chen 
AuthorDate: Mon, 15 Apr 2013 17:46:46 +0800
Committer:  Ingo Molnar 
CommitDate: Mon, 15 Apr 2013 12:03:24 +0200

x86/mm/hotplug: Put kernel_physical_mapping_remove() declaration in 
CONFIG_MEMORY_HOTREMOVE

kernel_physical_mapping_remove() is only called by
arch_remove_memory() in init_64.c, which is enclosed in
CONFIG_MEMORY_HOTREMOVE. So when we don't configure
CONFIG_MEMORY_HOTREMOVE, the compiler will give a warning:

warning: ‘kernel_physical_mapping_remove’ defined but not used

So put kernel_physical_mapping_remove() in
CONFIG_MEMORY_HOTREMOVE.

Signed-off-by: Tang Chen 
Cc: linux...@kvack.org
Cc: gre...@linuxfoundation.org
Cc: ying...@kernel.org
Cc: we...@cn.fujitsu.com
Cc: mgor...@suse.de
Cc: t...@kernel.org
Cc: liw...@linux.vnet.ibm.com
Link: 
http://lkml.kernel.org/r/1366019207-27818-3-git-send-email-tangc...@cn.fujitsu.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/init_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 474e28f..dafdeb2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1019,6 +1019,7 @@ void __ref vmemmap_free(struct page *memmap, unsigned 
long nr_pages)
remove_pagetable(start, end, false);
 }
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
 static void __meminit
 kernel_physical_mapping_remove(unsigned long start, unsigned long end)
 {
@@ -1028,7 +1029,6 @@ kernel_physical_mapping_remove(unsigned long start, 
unsigned long end)
remove_pagetable(start, end, true);
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 int __ref arch_remove_memory(u64 start, u64 size)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;


[PATCH 2/3] mem-hotplug: Put kernel_physical_mapping_remove() declaration in CONFIG_MEMORY_HOTREMOVE.

2013-04-15 Thread Tang Chen
kernel_physical_mapping_remove() is only called by arch_remove_memory() in
init_64.c, which is enclosed in CONFIG_MEMORY_HOTREMOVE. So when we don't
configure CONFIG_MEMORY_HOTREMOVE, the compiler will give a warning:

warning: ‘kernel_physical_mapping_remove’ defined but not used

So put kernel_physical_mapping_remove() in CONFIG_MEMORY_HOTREMOVE.

Signed-off-by: Tang Chen 
---
 arch/x86/mm/init_64.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 474e28f..dafdeb2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1019,6 +1019,7 @@ void __ref vmemmap_free(struct page *memmap, unsigned 
long nr_pages)
remove_pagetable(start, end, false);
 }
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
 static void __meminit
 kernel_physical_mapping_remove(unsigned long start, unsigned long end)
 {
@@ -1028,7 +1029,6 @@ kernel_physical_mapping_remove(unsigned long start, 
unsigned long end)
remove_pagetable(start, end, true);
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 int __ref arch_remove_memory(u64 start, u64 size)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
-- 
1.7.1



[PATCH 3/3] memblock: Fix missing comment of memblock_insert_region().

2013-04-15 Thread Tang Chen
There is no comment for the nid parameter of memblock_insert_region().
This patch adds a comment for it.

Signed-off-by: Tang Chen 
---
 mm/memblock.c |9 +
 1 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index b8d9147..16eda3d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -322,10 +322,11 @@ static void __init_memblock memblock_merge_regions(struct 
memblock_type *type)
 
 /**
  * memblock_insert_region - insert new memblock region
- * @type: memblock type to insert into
- * @idx: index for the insertion point
- * @base: base address of the new region
- * @size: size of the new region
+ * @type:  memblock type to insert into
+ * @idx:   index for the insertion point
+ * @base:  base address of the new region
+ * @size:  size of the new region
+ * @nid:   node id of the new region
  *
  * Insert new memblock region [@base,@base+@size) into @type at @idx.
  * @type must already have extra room to accomodate the new region.
-- 
1.7.1



[PATCH 1/3] mm: Remove unused parameter of pages_correctly_reserved()

2013-04-15 Thread Tang Chen
nr_pages is not used in pages_correctly_reserved().
So remove it.

Signed-off-by: Tang Chen 
Reviewed-by: Wang Shilong 
Reviewed-by: Wen Congyang 
---
 drivers/base/memory.c |5 ++---
 1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index a51007b..f926b9c 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -217,8 +217,7 @@ int memory_isolate_notify(unsigned long val, void *v)
  * The probe routines leave the pages reserved, just as the bootmem code does.
  * Make sure they're still that way.
  */
-static bool pages_correctly_reserved(unsigned long start_pfn,
-   unsigned long nr_pages)
+static bool pages_correctly_reserved(unsigned long start_pfn)
 {
int i, j;
struct page *page;
@@ -266,7 +265,7 @@ memory_block_action(unsigned long phys_index, unsigned long 
action, int online_t
 
switch (action) {
case MEM_ONLINE:
-   if (!pages_correctly_reserved(start_pfn, nr_pages))
+   if (!pages_correctly_reserved(start_pfn))
return -EBUSY;
 
ret = online_pages(start_pfn, nr_pages, online_type);
-- 
1.7.1



[PATCH 0/3] Little error fix and cleanup.

2013-04-15 Thread Tang Chen
This patch-set does the following things:

patch1: Remove unused parameter "nr_pages" of pages_correctly_reserved().
patch2: Use CONFIG_MEMORY_HOTREMOVE to protect kernel_physical_mapping_remove().
patch3: Add comments for parameter "nid" for memblock_insert_region().

Tang Chen (3):
  mm: Remove unused parameter of pages_correctly_reserved().
  mem-hotplug: Put kernel_physical_mapping_remove() declaration in
CONFIG_MEMORY_HOTREMOVE.
  memblock: Fix missing comment of memblock_insert_region().

 arch/x86/mm/init_64.c |2 +-
 drivers/base/memory.c |5 ++---
 mm/memblock.c |9 +
 3 files changed, 8 insertions(+), 8 deletions(-)



Re: [PATCH] firmware, memmap: fix firmware_map_entry leak

2013-04-15 Thread Tang Chen

Reviewed-by: Tang Chen 

Thanks. :)

On 04/15/2013 01:48 PM, Yasuaki Ishimatsu wrote:
> When hot removing memory, the firmware_map_entry which covers the memory
> range is released by release_firmware_map_entry(). If the entry was
> allocated by bootmem, release_firmware_map_entry() adds the entry to the
> map_entries_bootmem list when firmware_map_find_entry() finds the entry
> in the map_entries list. But firmware_map_find_entry() never finds the
> entry, since the map_entries list no longer has it. So the entry just leaks.
> 
> Here are steps of leaking firmware_map_entry:
> firmware_map_remove()
> ->  firmware_map_find_entry()
> Find released entry from map_entries list
> ->  firmware_map_remove_entry()
> Delete the entry from map_entries list
> ->  remove_sysfs_fw_map_entry()
> ...
> ->  release_firmware_map_entry()
>->  firmware_map_find_entry()
>   Find the entry from map_entries list but the entry has been
>   deleted from map_entries list. So the entry is not added
>   to map_entries_bootmem. Thus the entry leaks
> 
> release_firmware_map_entry() should not call firmware_map_find_entry(),
> since the released entry has already been deleted from the map_entries list.
> So this patch deletes the firmware_map_find_entry() call from
> release_firmware_map_entry().
> 
> Signed-off-by: Yasuaki Ishimatsu
> ---
>   drivers/firmware/memmap.c |9 +++--
>   1 files changed, 3 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
> index 0b5b5f6..e2e04b0 100644
> --- a/drivers/firmware/memmap.c
> +++ b/drivers/firmware/memmap.c
> @@ -114,12 +114,9 @@ static void __meminit release_firmware_map_entry(struct kobject *kobj)
>* map_entries_bootmem here, and deleted from &map_entries in
>* firmware_map_remove_entry().
>*/
> - if (firmware_map_find_entry(entry->start, entry->end,
> - entry->type)) {
> - spin_lock(&map_entries_bootmem_lock);
> - list_add(&entry->list, &map_entries_bootmem);
> - spin_unlock(&map_entries_bootmem_lock);
> - }
> + spin_lock(&map_entries_bootmem_lock);
> + list_add(&entry->list, &map_entries_bootmem);
> + spin_unlock(&map_entries_bootmem_lock);
> 
>   return;
>   }
> 
> 
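
To see the failure mode concretely, here is a minimal userspace model of
the two lists (simplified, hypothetical names, not the kernel code). The
guarded re-lookup can never succeed, because the entry has already been
unlinked from the very list being searched:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry { struct entry *next; const char *name; };

static struct entry *entries;         /* models map_entries */
static struct entry *entries_bootmem; /* models map_entries_bootmem */

static struct entry *find(struct entry *head, const char *name)
{
	for (; head; head = head->next)
		if (!strcmp(head->name, name))
			return head;
	return NULL;
}

static void unlink_entry(struct entry **head, struct entry *e)
{
	for (; *head; head = &(*head)->next)
		if (*head == e) {
			*head = e->next;
			return;
		}
}

int main(void)
{
	struct entry *e = calloc(1, sizeof(*e));

	e->name = "mem0";
	e->next = entries;
	entries = e;

	/* firmware_map_remove_entry(): unlink from the primary list */
	unlink_entry(&entries, e);

	/* buggy release path: the guard re-looks the entry up in the
	 * list it was just removed from, so it never re-homes it */
	if (find(entries, "mem0")) {
		e->next = entries_bootmem;
		entries_bootmem = e;
	}

	printf("bootmem list %s the entry\n",
	       entries_bootmem ? "holds" : "leaked");
	return 0;
}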


Re: [PATCH 0/2] mm: vmemmap: add vmemmap_verify check for hot-add node/memory case

2013-04-11 Thread Tang Chen

On 04/11/2013 11:10 PM, Yinghai Lu wrote:

On Thu, Apr 11, 2013 at 12:41 AM, Tang Chen  wrote:


3. If we add a flag to memblock, we can mark different memory. And I remember
you mentioned before that we can use memblock to reserve local node data
for node-life-cycle data, like vmemmap, pagetable.

So are you doing similar work now?


No, I did not start it yet.



If not, I think I can merge it into mine, and push a new patch-set with
hot-add, hot-remove code modified to support putting vmemmap, pagetable,
pgdat, page_cgroup, ..., on the local node.


Need to have it separated from moving_zone.

1. rework memblock to keep alive all the way for hotplug usage.
2. put pagetable and vmemmap on the local node range with the help of memblock.



OK, thanks for the comments. I'll merge it into my work and post an RFC
patch-set soon.


Thanks. :)


Re: [PATCH 0/2] mm: vmemmap: add vmemmap_verify check for hot-add node/memory case

2013-04-11 Thread Tang Chen

Hi Yinghai,

(Add cc Liu Jiang.)

On 04/09/2013 02:40 AM, Yinghai Lu wrote:

On Mon, Apr 8, 2013 at 2:56 AM, Lin Feng  wrote:

In the hot-add node (memory) case, vmemmap pages are always allocated from
other nodes,


that is broken, and should be fixed.
vmemmap should be on local node even for hot add node.



I'd like to share some info. :)

Here is the work I'm trying to do.

1. As most people don't like the movablemem_map idea, we decided to
   drop the "specifying physical address" approach, and restart with a new
   solution that supports using SRAT info only.

   We want to modify movablecore to support "movablecore=acpi" to
   enable/disable limiting hotpluggable memory in ZONE_MOVABLE.
   And we dropped all the old design and data structures.

2. As Liu Jiang mentioned before, we can add a flag to memblock to mark
   special memory. Since we are dropping all the old data structures,
   I think I want to reuse his idea to reserve movable memory with memblock
   when booting.

3. If we add a flag to memblock, we can mark different memory. And I remember
   you mentioned before that we can use memblock to reserve local node data
   for node-life-cycle data, like vmemmap, pagetable.

   So are you doing similar work now?

   If not, I think I can merge it into mine, and push a new patch-set with
   hot-add, hot-remove code modified to support putting vmemmap, pagetable,
   pgdat, page_cgroup, ..., on the local node.

   If you are doing similar work, I will just finish my part and wait
   for your patch.

Thanks. :)


Re: [PATCH v3 02/22] x86, microcode: Use common get_ramdisk_image()

2013-04-09 Thread Tang Chen

On 04/05/2013 07:46 AM, Yinghai Lu wrote:

Use common get_ramdisk_image() to get ramdisk start phys address.

We need this to get the correct ramdisk address for a 64-bit bzImage whose
initrd can be loaded above 4G by kexec-tools.

Signed-off-by: Yinghai Lu
Cc: Fenghua Yu
Acked-by: Tejun Heo
---
  arch/x86/kernel/microcode_intel_early.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c
index d893e8e..ea57bd8 100644
--- a/arch/x86/kernel/microcode_intel_early.c
+++ b/arch/x86/kernel/microcode_intel_early.c
@@ -742,8 +742,8 @@ load_ucode_intel_bsp(void)
struct boot_params *boot_params_p;

boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params);
-   ramdisk_image = boot_params_p->hdr.ramdisk_image;
-   ramdisk_size  = boot_params_p->hdr.ramdisk_size;
+   ramdisk_image = get_ramdisk_image(boot_params_p);
+   ramdisk_size  = get_ramdisk_image(boot_params_p);


Should be get_ramdisk_size(boot_params_p)?


initrd_start_early = ramdisk_image;
initrd_end_early = initrd_start_early + ramdisk_size;

@@ -752,8 +752,8 @@ load_ucode_intel_bsp(void)
(unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
initrd_start_early, initrd_end_early,&uci);
  #else
-   ramdisk_image = boot_params.hdr.ramdisk_image;
-   ramdisk_size  = boot_params.hdr.ramdisk_size;
+   ramdisk_image = get_ramdisk_image(&boot_params);
+   ramdisk_size  = get_ramdisk_size(&boot_params);
initrd_start_early = ramdisk_image + PAGE_OFFSET;
initrd_end_early = initrd_start_early + ramdisk_size;




Re: [PATCH 00/11] Introduce movablemem_map=acpi boot option.

2013-04-09 Thread Tang Chen
On 04/09/2013 01:14 PM, Yasuaki Ishimatsu wrote:
> Hi Tang,
> 
> The patch works well on my x86_64 box.
> I confirmed that hotpluggable node is allocated as Movable Zone.
> So feel free to add:
> 
> Tested-by: Yasuaki Ishimatsu
> 
> Nitpick below.

Thanks for testing. Will fix the whitespace error and resend the
patch-set soon. :)



[PATCH 03/11] numa, acpi, memory-hotplug: Add movablemem_map=acpi boot option.

2013-04-05 Thread Tang Chen
Since kernel pages cannot be migrated, if we want a memory device to be
hotpluggable, we have to set all the memory on it as ZONE_MOVABLE.

This patch adds a boot option, movablemem_map=acpi, to inform the kernel
to use the Hot Pluggable bit in the SRAT to determine which memory
devices are hotpluggable.
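
For illustration, enabling the option is just a matter of appending it to
the kernel commandline, e.g. in a bootloader entry (kernel version and
paths here are hypothetical):

    kernel /vmlinuz-3.9.0 ro root=/dev/sda1 movablemem_map=acpi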

Signed-off-by: Tang Chen 
Signed-off-by: Lai Jiangshan 
Reviewed-by: Wen Congyang 
Tested-by: Lin Feng 
---
 Documentation/kernel-parameters.txt |   11 +++
 include/linux/mm.h  |   12 
 mm/page_alloc.c |   35 +++
 3 files changed, 58 insertions(+), 0 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4609e81..e039888 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1649,6 +1649,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
that the amount of memory usable for all allocations
is not too small.
 
+   movablemem_map=acpi
+   [KNL,X86,IA-64,PPC] This parameter is similar to
+   memmap except it specifies the memory map of
+   ZONE_MOVABLE.
+   This option informs the kernel to use the Hot Pluggable bit
+   in the SRAT flags from the ACPI BIOS to determine which
+   memory devices could be hotplugged. The corresponding
+   memory ranges will be set as ZONE_MOVABLE.
+   NOTE: Whatever node the kernel resides in will always
+ be un-hotpluggable.
+
MTD_Partition=  [MTD]
Format: ,,,
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1c79b10..52c3558 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1332,6 +1332,18 @@ extern void free_bootmem_with_active_regions(int nid,
unsigned long max_low_pfn);
 extern void sparse_memory_present_with_active_regions(int nid);
 
+#define MOVABLEMEM_MAP_MAX MAX_NUMNODES
+struct movablemem_entry {
+   unsigned long start_pfn;/* start pfn of memory segment */
+   unsigned long end_pfn;  /* end pfn of memory segment (exclusive) */
+};
+
+struct movablemem_map {
+   bool acpi;
+   int nr_map;
+   struct movablemem_entry map[MOVABLEMEM_MAP_MAX];
+};
+
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f368db4..475fd8b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -202,6 +202,12 @@ static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+/* Movable memory ranges, will also be used by memblock subsystem. */
+struct movablemem_map movablemem_map = {
+   .acpi = false,
+   .nr_map = 0,
+};
+
 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
@@ -5061,6 +5067,35 @@ static int __init cmdline_parse_movablecore(char *p)
 early_param("kernelcore", cmdline_parse_kernelcore);
 early_param("movablecore", cmdline_parse_movablecore);
 
+/**
+ * cmdline_parse_movablemem_map - Parse boot option movablemem_map.
+ * @p: The boot option of the following format:
+ * movablemem_map=acpi
+ *
+ * This option informs the kernel to use the Hot Pluggable bit in SRAT to determine
+ * which memory device is hotpluggable, and set the memory on it as movable.
+ *
+ * Return: 0 on success or -EINVAL on failure.
+ */
+static int __init cmdline_parse_movablemem_map(char *p)
+{
+   if (!p || strcmp(p, "acpi"))
+   goto err;
+
+   movablemem_map.acpi = true;
+
+   if (movablemem_map.nr_map) {
+   memset(movablemem_map.map, 0,
+  sizeof(struct movablemem_entry) * movablemem_map.nr_map);
+   }
+
+   return 0;
+
+err:
+   return -EINVAL;
+}
+early_param("movablemem_map", cmdline_parse_movablemem_map);
+
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 /**
-- 
1.7.1



[PATCH 06/11] X86, numa, acpi, memory-hotplug: Add hotpluggable ranges to movablemem_map.

2013-04-05 Thread Tang Chen
When parsing SRAT, we are able to know which memory ranges are hotpluggable,
and we add them to movablemem_map. So movablemem_map can be used to prevent
memblock from allocating memory in areas which will be set as ZONE_MOVABLE
later.
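
The insert_movablemem_map() helper that movablemem_map_add_region() calls
is truncated from the diff below in this archive. As a rough standalone
sketch of what such a sorted insert-with-merge has to do (a simplified
userspace model, not the posted code):

#include <stdio.h>

#define MAX_MAP 8
struct entry { unsigned long start_pfn, end_pfn; };
static struct entry map[MAX_MAP];
static int nr_map;

/* keep map[] sorted by start_pfn and coalesce overlapping ranges */
static void insert_range(unsigned long start_pfn, unsigned long end_pfn)
{
	int i = 0, j, k;

	/* find the first entry that could touch the new range */
	while (i < nr_map && map[i].end_pfn < start_pfn)
		i++;

	/* grow the new range over every entry it overlaps */
	for (j = i; j < nr_map && map[j].start_pfn <= end_pfn; j++) {
		if (map[j].start_pfn < start_pfn)
			start_pfn = map[j].start_pfn;
		if (map[j].end_pfn > end_pfn)
			end_pfn = map[j].end_pfn;
	}

	if (j == i) {
		/* no overlap: shift the tail right to open a slot */
		if (nr_map >= MAX_MAP)
			return; /* table full; the real code logs an error */
		for (k = nr_map; k > i; k--)
			map[k] = map[k - 1];
		nr_map++;
	} else if (j > i + 1) {
		/* several entries absorbed: shift the tail left */
		for (k = j; k < nr_map; k++)
			map[i + 1 + k - j] = map[k];
		nr_map -= j - i - 1;
	}
	map[i].start_pfn = start_pfn;
	map[i].end_pfn = end_pfn;
}

int main(void)
{
	insert_range(0x100, 0x200);
	insert_range(0x400, 0x500);
	insert_range(0x180, 0x450); /* bridges both existing entries */
	for (int n = 0; n < nr_map; n++)
		printf("[%#lx, %#lx)\n", map[n].start_pfn, map[n].end_pfn);
	return 0; /* prints one merged range: [0x100, 0x500) */
}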

Signed-off-by: Tang Chen 
---
 arch/x86/mm/numa.c |   39 ++
 include/linux/mm.h |4 ++
 mm/page_alloc.c|   92 
 3 files changed, 135 insertions(+), 0 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 26d1800..73e7934 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -725,6 +725,43 @@ static void __init early_x86_numa_init_mapping(void)
 }
 #endif
 
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+/**
+ * early_mem_hotplug_init - Add hotpluggable memory ranges to movablemem_map.
+ *
+ * This function scans numa_meminfo.blk[] and adds all the hotpluggable memory
+ * ranges to movablemem_map. movablemem_map can be used to prevent memblock
+ * from allocating memory in area which will be set as ZONE_MOVABLE later, so
+ * this function should be called after memory mapping is initialized because
+ * we will put pagetable pages in local node even if the memory of that node is
+ * hotpluggable.
+ *
+ * If users specify movablemem_map=acpi, then:
+ *
+ * SRAT:|_| |_| |_| |_| ..
+ * node id:0   1 1   2
+ * hotpluggable:   n   y y   n
+ * movablemem_map:  |_| |_|
+ */
+static void __init early_mem_hotplug_init()
+{
+   int i;
+
+   if (!movablemem_map.acpi)
+   return;
+
+   for (i = 0; i < numa_meminfo.nr_blks; i++) {
+   if (numa_meminfo.blk[i].hotpluggable)
+   movablemem_map_add_region(numa_meminfo.blk[i].start,
+ numa_meminfo.blk[i].end);
+   }
+}
+#else  /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+static inline void early_mem_hotplug_init()
+{
+}
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
 void __init early_initmem_init(void)
 {
early_x86_numa_init();
@@ -734,6 +771,8 @@ void __init early_initmem_init(void)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
 
+   early_mem_hotplug_init();
+
   early_memtest(0, max_pfn_mapped<<PAGE_SHIFT);

[...]

+   if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) {
+   pr_err("movablemem_map: too many entries; "
+  "ignoring [mem %#010llx-%#010llx]\n",
+  (unsigned long long) start,
+  (unsigned long long) (start + size - 1));
+   return;
+   }
+
+   start_pfn = PFN_DOWN(start);
+   end_pfn = PFN_UP(start + size);
+   insert_movablemem_map(start_pfn, end_pfn);
+}
+
+/**
  * cmdline_parse_movablemem_map - Parse boot option movablemem_map.
  * @p: The boot option of the following format:
  * movablemem_map=acpi
-- 
1.7.1



[PATCH 05/11] x86, numa, acpi, memory-hotplug: Consider hotplug info when cleanup numa_meminfo.

2013-04-05 Thread Tang Chen
Since we have introduced hotplug info into struct numa_meminfo, we need
to consider it when cleaning up numa_meminfo.

The original logic in numa_cleanup_meminfo() is:
Merge blocks on the same node, holes between which don't overlap with
memory on other nodes.

This patch modifies numa_cleanup_meminfo() logic like this:
Merge blocks with the same hotpluggable type on the same node, holes
between which don't overlap with memory on other nodes.
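
As a standalone illustration of the new merge rule (a toy model, not the
kernel code): two blocks are merge candidates only when both the node id
and the hotpluggable type match:

#include <stdbool.h>
#include <stdio.h>

struct blk { int nid; bool hotpluggable; unsigned long start, end; };

static bool can_merge(const struct blk *a, const struct blk *b)
{
	/* same node AND same hotplug type, mirroring the patch */
	return a->nid == b->nid && a->hotpluggable == b->hotpluggable;
}

int main(void)
{
	struct blk a = { 1, true,  0x1000, 0x2000 };
	struct blk b = { 1, false, 0x2000, 0x3000 };

	printf("merge: %d\n", can_merge(&a, &b)); /* 0: types differ */
	return 0;
}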

Signed-off-by: Tang Chen 
---
 arch/x86/mm/numa.c |   13 +
 1 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index ecf37fd..26d1800 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -296,18 +296,22 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
}
 
/*
-* Join together blocks on the same node, holes
-* between which don't overlap with memory on other
-* nodes.
+* Join together blocks on the same node, with the same
+* hotpluggable flags, holes between which don't overlap
+* with memory on other nodes.
 */
if (bi->nid != bj->nid)
continue;
+   if (bi->hotpluggable != bj->hotpluggable)
+   continue;
+
start = min(bi->start, bj->start);
end = max(bi->end, bj->end);
for (k = 0; k < mi->nr_blks; k++) {
struct numa_memblk *bk = &mi->blk[k];
 
-   if (bi->nid == bk->nid)
+   if (bi->nid == bk->nid &&
+   bi->hotpluggable == bk->hotpluggable)
continue;
if (start < bk->end && end > bk->start)
break;
@@ -327,6 +331,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
mi->blk[i].start = mi->blk[i].end = 0;
mi->blk[i].nid = NUMA_NO_NODE;
+   mi->blk[i].hotpluggable = false;
}
 
return 0;
-- 
1.7.1



[PATCH 04/11] x86, numa, acpi, memory-hotplug: Introduce hotplug info into struct numa_meminfo.

2013-04-05 Thread Tang Chen
Since we are using struct numa_meminfo to store SRAT info and to sanitize
movablemem_map.map[], we need hotplug info in struct numa_meminfo.

This patch introduces a "bool hotpluggable" member into struct
numa_meminfo.

And modifies the following APIs' prototypes to support it:
   - numa_add_memblk()
   - numa_add_memblk_to()

And the following callers:
   - numaq_register_node()
   - dummy_numa_init()
   - amd_numa_init()
   - acpi_numa_memory_affinity_init() in x86

Signed-off-by: Tang Chen 
---
 arch/x86/include/asm/numa.h |3 ++-
 arch/x86/kernel/apic/numaq_32.c |2 +-
 arch/x86/mm/amdtopology.c   |3 ++-
 arch/x86/mm/numa.c  |   10 +++---
 arch/x86/mm/numa_internal.h |1 +
 arch/x86/mm/srat.c  |2 +-
 6 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 1b99ee5..73096b2 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -31,7 +31,8 @@ extern int numa_off;
 extern s16 __apicid_to_node[MAX_LOCAL_APIC];
 extern nodemask_t numa_nodes_parsed __initdata;
 
-extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
+extern int __init numa_add_memblk(int nodeid, u64 start, u64 end,
+ bool hotpluggable);
 extern void __init numa_set_distance(int from, int to, int distance);
 
 static inline void set_apicid_to_node(int apicid, s16 node)
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index d661ee9..7a9c542 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -82,7 +82,7 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
int ret;
 
node_set(node, numa_nodes_parsed);
-   ret = numa_add_memblk(node, start, end);
+   ret = numa_add_memblk(node, start, end, false);
BUG_ON(ret < 0);
 }
 
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 5247d01..d521471 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -167,7 +167,8 @@ int __init amd_numa_init(void)
nodeid, base, limit);
 
prevbase = base;
-   numa_add_memblk(nodeid, base, limit);
+   /* Do not support memory hotplug for AMD cpu. */
+   numa_add_memblk(nodeid, base, limit, false);
node_set(nodeid, numa_nodes_parsed);
}
 
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4f754e6..ecf37fd 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -134,6 +134,7 @@ void __init setup_node_to_cpumask_map(void)
 }
 
 static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+bool hotpluggable,
 struct numa_meminfo *mi)
 {
/* ignore zero length blks */
@@ -155,6 +156,7 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
mi->blk[mi->nr_blks].start = start;
mi->blk[mi->nr_blks].end = end;
mi->blk[mi->nr_blks].nid = nid;
+   mi->blk[mi->nr_blks].hotpluggable = hotpluggable;
mi->nr_blks++;
return 0;
 }
@@ -179,15 +181,17 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
  * @nid: NUMA node ID of the new memblk
  * @start: Start address of the new memblk
  * @end: End address of the new memblk
+ * @hotpluggable: True if memblk is hotpluggable
  *
  * Add a new memblk to the default numa_meminfo.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int __init numa_add_memblk(int nid, u64 start, u64 end)
+int __init numa_add_memblk(int nid, u64 start, u64 end,
+  bool hotpluggable)
 {
-   return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+   return numa_add_memblk_to(nid, start, end, hotpluggable, &numa_meminfo);
 }
 
 /* Initialize NODE_DATA for a node on the local memory */
@@ -631,7 +635,7 @@ static int __init dummy_numa_init(void)
   0LLU, PFN_PHYS(max_pfn) - 1);
 
node_set(0, numa_nodes_parsed);
-   numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
+   numa_add_memblk(0, 0, PFN_PHYS(max_pfn), false);
 
return 0;
 }
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index bb2fbcc..1ce4e6b 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -8,6 +8,7 @@ struct numa_memblk {
u64 start;
u64 end;
int nid;
+   bool    hotpluggable;
 };
 
 struct numa_meminfo {
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 5055fa7..f7f6fd4 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -171,7 +171,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
goto out_err_bad_srat;
}
 
-   if (numa_add_memblk(node, start, end) < 0)
+   if (numa_add_memblk(node, start, end, hotpluggable) < 0)
    goto out_err_bad_srat;
[...]

[PATCH 11/11] x86, numa, acpi, memory-hotplug: Memblock limit with movablemem_map

2013-04-05 Thread Tang Chen
Ensure memblock will not allocate memory from areas that may be
ZONE_MOVABLE. The map info is from movablemem_map boot option.

The following problem was reported by Stephen Rothwell:
The definition of struct movablecore_map is protected by
CONFIG_HAVE_MEMBLOCK_NODE_MAP but its use in memblock_overlaps_region()
is not. So add CONFIG_HAVE_MEMBLOCK_NODE_MAP to protect the use of
movablecore_map in memblock_overlaps_region().
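
The search strategy can be illustrated with a standalone model: take the
highest candidate in a free range, and whenever the candidate block would
overlap a movable range, pull the search window down below that range and
retry (one free range and one movable range here; the real code iterates
memblock ranges):

#include <stdio.h>

struct range { unsigned long start, end; }; /* [start, end) */

/* find the highest size-byte block in free that avoids movable;
 * returns 0 on failure, like memblock_find_in_range_node() */
static unsigned long find_top_down(struct range free, struct range movable,
				   unsigned long size)
{
	unsigned long this_end = free.end;

	while (this_end > free.start && this_end - free.start >= size) {
		unsigned long cand = this_end - size;

		/* candidate block [cand, this_end) overlaps the movable
		 * range: clamp the window below it and retry */
		if (cand < movable.end && this_end > movable.start) {
			this_end = movable.start;
			continue;
		}
		return cand;
	}
	return 0;
}

int main(void)
{
	struct range free = { 0x1000, 0x9000 };
	struct range movable = { 0x6000, 0x9000 };

	/* the top of the free range is movable, so we land below it */
	printf("%#lx\n", find_top_down(free, movable, 0x2000)); /* 0x4000 */
	return 0;
}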

Signed-off-by: Tang Chen 
Reviewed-by: Wen Congyang 
Reviewed-by: Lai Jiangshan 
Tested-by: Lin Feng 
Reported-by: Stephen Rothwell 
---
 include/linux/memblock.h |2 +
 mm/memblock.c|   50 ++
 2 files changed, 52 insertions(+), 0 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f388203..3e5ecb2 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -42,6 +42,7 @@ struct memblock {
 
 extern struct memblock memblock;
 extern int memblock_debug;
+extern struct movablemem_map movablemem_map;
 
 #define memblock_dbg(fmt, ...) \
if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
@@ -60,6 +61,7 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
 void memblock_trim_memory(phys_addr_t align);
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
  unsigned long *out_end_pfn, int *out_nid);
 
diff --git a/mm/memblock.c b/mm/memblock.c
index b8d9147..1bcd9b9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -92,9 +92,58 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
  *
  * Find @size free area aligned to @align in the specified range and node.
  *
+ * If we have CONFIG_HAVE_MEMBLOCK_NODE_MAP defined, we need to check that the
+ * memory we found is not in hotpluggable ranges.
+ *
  * RETURNS:
  * Found address on success, %0 on failure.
  */
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
+   phys_addr_t end, phys_addr_t size,
+   phys_addr_t align, int nid)
+{
+   phys_addr_t this_start, this_end, cand;
+   u64 i;
+   int curr = movablemem_map.nr_map - 1;
+
+   /* pump up @end */
+   if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
+   end = memblock.current_limit;
+
+   /* avoid allocating the first page */
+   start = max_t(phys_addr_t, start, PAGE_SIZE);
+   end = max(start, end);
+
+   for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+   this_start = clamp(this_start, start, end);
+   this_end = clamp(this_end, start, end);
+
+restart:
+   if (this_end <= this_start || this_end < size)
+   continue;
+
+   for (; curr >= 0; curr--) {
+   if ((movablemem_map.map[curr].start_pfn << PAGE_SHIFT)
+   < this_end)
+   break;
+   }
+
+   cand = round_down(this_end - size, align);
+   if (curr >= 0 &&
+   cand < movablemem_map.map[curr].end_pfn << PAGE_SHIFT) {
+   this_end = movablemem_map.map[curr].start_pfn
+  << PAGE_SHIFT;
+   goto restart;
+   }
+
+   if (cand >= this_start)
+   return cand;
+   }
+
+   return 0;
+}
+#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align, int nid)
@@ -123,6 +172,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
}
return 0;
 }
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 /**
  * memblock_find_in_range - find free area in given range
-- 
1.7.1



[PATCH 07/11] x86, numa, acpi, memory-hotplug: Make any node which the kernel resides in un-hotpluggable.

2013-04-05 Thread Tang Chen
Before parsing SRAT, memblock has already reserved some memory ranges
for other purposes, such as for the kernel image. We cannot prevent the
kernel from using this memory. Furthermore, if all the memory is
hotpluggable, the system won't have enough memory to boot if we set all
of it as movable. So we always set the nodes which the kernel resides
in as non-movable.
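
A standalone sketch of that check (made-up addresses, not the kernel
code): a node is treated as a kernel node if any of its memory intersects
an already-reserved region such as the kernel image:

#include <stdbool.h>
#include <stdio.h>

struct range { unsigned long base, end; }; /* [base, end) */

static bool overlaps(struct range a, struct range b)
{
	return a.base < b.end && b.base < a.end;
}

int main(void)
{
	struct range kernel_image = { 0x1000000, 0x2000000 };
	struct range node_mem[] = {
		{ 0x0000000, 0x4000000 }, /* node 0: holds the kernel */
		{ 0x4000000, 0x8000000 }, /* node 1: clean */
	};

	for (int nid = 0; nid < 2; nid++)
		printf("node %d: %s\n", nid,
		       overlaps(node_mem[nid], kernel_image) ?
		       "un-hotpluggable" : "may be movable");
	return 0;
}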

Signed-off-by: Tang Chen 
---
 arch/x86/mm/numa.c |   25 +++--
 arch/x86/mm/srat.c |   17 -
 include/linux/mm.h |1 +
 3 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 73e7934..dcaf248 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -736,24 +736,37 @@ static void __init early_x86_numa_init_mapping(void)
  * we will put pagetable pages in local node even if the memory of that node is
  * hotpluggable.
  *
- * If users specify movablemem_map=acpi, then:
+ * And, when the kernel is booting, memblock has reserved some memory for other
+ * purposes, such as storing the kernel image. We cannot prevent the kernel from
+ * using this kind of memory. So whatever node the kernel resides in should be
+ * un-hotpluggable, because if all the memory is hotpluggable, and is set as
+ * movable, the kernel won't have enough memory to boot.
+ *
+ * It works like this:
+ * If users specify movablemem_map=acpi, then
  *
  * SRAT:|_| |_| |_| |_| ..
  * node id:0   1 1   2
- * hotpluggable:   n   y y   n
+ * hotpluggable:   y   y y   n
+ * kernel resides in:  y   n n   n
  * movablemem_map:  |_| |_|
  */
 static void __init early_mem_hotplug_init()
 {
-   int i;
+   int i, nid;
 
if (!movablemem_map.acpi)
return;
 
for (i = 0; i < numa_meminfo.nr_blks; i++) {
-   if (numa_meminfo.blk[i].hotpluggable)
-   movablemem_map_add_region(numa_meminfo.blk[i].start,
- numa_meminfo.blk[i].end);
+   nid = numa_meminfo_all.blk[i].nid;
+
+   if (node_isset(nid, movablemem_map.numa_nodes_kernel) ||
+   !numa_meminfo.blk[i].hotpluggable)
+   continue;
+
+   movablemem_map_add_region(numa_meminfo.blk[i].start,
+ numa_meminfo.blk[i].end);
}
 }
 #else  /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index f7f6fd4..0b5904e 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -147,7 +147,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 {
u64 start, end;
u32 hotpluggable;
-   int node, pxm;
+   int node, pxm, i;
+   struct memblock_type *rgn = &memblock.reserved;
 
if (srat_disabled())
goto out_err;
@@ -176,6 +177,20 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 
node_set(node, numa_nodes_parsed);
 
+   /*
+* Before parsing SRAT, memblock has reserved some memory for other
+* purposes, such as storing the kernel image. We cannot prevent the kernel
+* from using this kind of memory. So just mark which nodes the kernel
+* resides in, and set these nodes un-hotpluggable later.
+*/
+   for (i = 0; i < rgn->cnt; i++) {
+   if (end <= rgn->regions[i].base ||
+   start >= rgn->regions[i].base + rgn->regions[i].size)
+   continue;
+
+   node_set(node, movablemem_map.numa_nodes_kernel);
+   }
+
printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n",
   node, pxm,
   (unsigned long long) start, (unsigned long long) end - 1,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7468221..2835c91 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1342,6 +1342,7 @@ struct movablemem_map {
bool acpi;
int nr_map;
struct movablemem_entry map[MOVABLEMEM_MAP_MAX];
+   nodemask_t numa_nodes_kernel;   /* on which nodes kernel resides in */
 };
 
 extern struct movablemem_map movablemem_map;
-- 
1.7.1



[PATCH 08/11] x86, numa, acpi, memory-hotplug: Introduce zone_movable_limit[] to store start pfn of ZONE_MOVABLE.

2013-04-05 Thread Tang Chen
Since node info in SRAT may not be in increasing order, we may meet
a lower range after we have handled a higher range. So we need to keep
the lowest movable pfn each time we parse a SRAT memory entry, and
update it when we get a lower one.

This patch introduces a new array zone_movable_limit[], which is used
to store the start pfn of each node's ZONE_MOVABLE.

We update it each time we parse a SRAT memory entry, if necessary.
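
The update rule is simply "keep the lowest non-zero start pfn seen so far
for each node". A standalone sketch (made-up pfns, not the kernel code):

#include <stdio.h>

#define MAX_NODES 4
static unsigned long zone_movable_limit[MAX_NODES];

/* record the lowest hotpluggable start pfn seen for nid so far;
 * 0 means "no limit recorded yet", as in the patch */
static void record_limit(int nid, unsigned long start_pfn)
{
	if (!zone_movable_limit[nid] || start_pfn < zone_movable_limit[nid])
		zone_movable_limit[nid] = start_pfn;
}

int main(void)
{
	/* SRAT entries for node 1 arrive out of order */
	record_limit(1, 0x80000);
	record_limit(1, 0x40000);

	printf("node 1 limit: %#lx\n", zone_movable_limit[1]); /* 0x40000 */
	return 0;
}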

Signed-off-by: Tang Chen 
---
 arch/x86/mm/numa.c |   16 ++--
 include/linux/mm.h |2 ++
 mm/page_alloc.c|1 +
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index dcaf248..8cbe8a0 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -727,7 +727,8 @@ static void __init early_x86_numa_init_mapping(void)
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 /**
- * early_mem_hotplug_init - Add hotpluggable memory ranges to movablemem_map.
+ * early_mem_hotplug_init - Add hotpluggable memory ranges to movablemem_map,
+ *  and initialize zone_movable_limit.
  *
 * This function scans numa_meminfo.blk[] and adds all the hotpluggable memory
  * ranges to movablemem_map. movablemem_map can be used to prevent memblock
@@ -750,6 +751,10 @@ static void __init early_x86_numa_init_mapping(void)
  * hotpluggable:   y   y y   n
  * kernel resides in:  y   n n   n
  * movablemem_map:  |_| |_|
+ *
+ * This function will also initialize zone_movable_limit[].
+ * ZONE_MOVABLE of node i should start at least from zone_movable_limit[i].
+ * zone_movable_limit[i] == 0 means there is no limitation for node i.
  */
 static void __init early_mem_hotplug_init()
 {
@@ -759,7 +764,7 @@ static void __init early_mem_hotplug_init()
return;
 
for (i = 0; i < numa_meminfo.nr_blks; i++) {
-   nid = numa_meminfo_all.blk[i].nid;
+   nid = numa_meminfo.blk[i].nid;
 
if (node_isset(nid, movablemem_map.numa_nodes_kernel) ||
!numa_meminfo.blk[i].hotpluggable)
@@ -767,6 +772,13 @@ static void __init early_mem_hotplug_init()
 
movablemem_map_add_region(numa_meminfo.blk[i].start,
  numa_meminfo.blk[i].end);
+
+   if (zone_movable_limit[nid])
+   zone_movable_limit[nid] = min(zone_movable_limit[nid],
+   PFN_DOWN(numa_meminfo.blk[i].start));
+   else
+   zone_movable_limit[nid] = 
+   PFN_DOWN(numa_meminfo.blk[i].start);
}
 }
 #else  /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2835c91..b313d83 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1349,6 +1349,8 @@ extern struct movablemem_map movablemem_map;
 
 extern void __init movablemem_map_add_region(u64 start, u64 size);
 
+extern unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES];
+
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2a7904f..b97bdb5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -213,6 +213,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES];
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
-- 
1.7.1



[PATCH 00/11] Introduce movablemem_map=acpi boot option.

2013-04-05 Thread Tang Chen
Before this patch-set, we introduced the movablemem_map boot option, which
allowed users to specify physical address ranges to set memory as movable.
This was not user-friendly enough for normal users.

So now we introduce just movablemem_map=acpi, which allows users to
enable/disable the kernel's use of the Hot Pluggable bit in the SRAT to
determine which memory ranges are hotpluggable and set them as ZONE_MOVABLE.

This patch-set is based on Yinghai's patch-set:
v1: https://lkml.org/lkml/2013/3/7/642
v2: https://lkml.org/lkml/2013/3/10/47

So it supports allocating pagetable pages on local nodes.

We also split the large patch-set into smaller ones, which seems easier to
review.



[What we are doing]
This patchset introduces a boot option for users to specify the ZONE_MOVABLE
memory map for each node in the system. It is used as follows:

1. movablemem_map=acpi
   In this way, the kernel will use the Hot Pluggable bit in the SRAT to
   determine ZONE_MOVABLE for each node. All the ranges the user has
   specified will be ignored.


[Why we do this]
If we hot-remove a memory device, it cannot have kernel memory,
because Linux cannot migrate kernel memory currently. Therefore,
we have to guarantee that the hot-removed memory has only movable
memory.
(Here is an exception: when we implement the node hotplug functionality,
kernel memory whose life cycle is the same as the node's, such as
pagetables, vmemmap and so on, can still be put on the local node even
though the kernel cannot migrate it, because we can free it before we
hot-remove the node. This is not completely implemented yet.)

Linux has two boot options, kernelcore= and movablecore=, for
creating movable memory. These boot options can specify the amount
of memory to use as kernel or movable memory. Using them, we can
create ZONE_MOVABLE which has only movable memory.
(NOTE: doing this will hurt NUMA performance because the kernel won't
 be able to distribute kernel memory evenly to each node.)

But it does not fulfill a requirement of memory hot-remove, because
even if we specify the boot options, movable memory is distributed
evenly across the nodes. So when we want to hot-remove memory whose
range is 0x8000-0c000, we have no way to specify
that memory as movable memory.

Furthermore, even if we can use SRAT, users still need an interface
to enable/disable this functionality if they don't want to lose their
NUMA performance. So I think a user interface is always needed.

So we proposed this new feature, which enables/disables the kernel setting
hotpluggable memory as ZONE_MOVABLE.


[Ways to do this]
There may be 2 ways to specify movable memory.
1. use firmware information
2. use boot option

1. use firmware information
  According to ACPI spec 5.0, SRAT table has memory affinity structure
  and the structure has a Hot Pluggable Field. See "5.2.16.2 Memory
  Affinity Structure". If we use the information, we might be able to
  specify movable memory by firmware. For example, if the Hot Pluggable
  Field is enabled, Linux sets the memory as movable memory.

2. use boot option
  This is our proposal. New boot option can specify memory range to use
  as movable memory.


[How we do this]
We now propose a boot option, but support the first way above. A boot option
is always needed because setting memory as movable degrades NUMA performance.
So at the least, we need an interface to enable/disable it, so that users
who don't want to use memory hotplug functionality will also be happy.


[How to use]
Specify movablemem_map=acpi in kernel commandline:
 *
 * SRAT:|_| |_| |_| |_| ..
 * node id:0   1 1   2
 * hotpluggable:   n   y y   n
 * ZONE_MOVABLE:|_| |_|
 *
   NOTE: 1) Before parsing SRAT, memblock has already reserved some memory ranges
for other purposes, such as for the kernel image. We cannot prevent the
kernel from using this memory, so we need to exclude it
even if it is hotpluggable.
Furthermore, to ensure the kernel has enough memory to boot, we treat
all the memory on the node which the kernel resides in as
un-hotpluggable.
 2) In this case, all the user-specified memory ranges will be ignored
(see the sketch below).
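
As a toy model of the diagram above (made-up data, not the kernel code):
a range becomes ZONE_MOVABLE only if it is hotpluggable and the kernel
does not already reside on its node:

#include <stdbool.h>
#include <stdio.h>

struct srat_entry { int nid; bool hotpluggable; bool kernel_resident; };

int main(void)
{
	struct srat_entry e[] = {
		{ 0, false, true  },  /* not hotpluggable, kernel node */
		{ 1, true,  false },  /* movable */
		{ 1, true,  false },  /* movable */
		{ 2, false, false },  /* not hotpluggable */
	};

	for (unsigned int i = 0; i < sizeof(e) / sizeof(e[0]); i++)
		printf("range %u (node %d): %s\n", i, e[i].nid,
		       e[i].hotpluggable && !e[i].kernel_resident ?
		       "ZONE_MOVABLE" : "kernel usable");
	return 0;
}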

We also need to consider the following points:
1) Using this boot option can degrade NUMA performance because the kernel
   memory will not be distributed on each node evenly. So for users who don't
   want to lose their NUMA performance, just don't use it.
2) If kernelcore or movablecore is also specified, movablemem_map will have
   higher priority to be satisfied.
3) This option does not conflict with the memmap option.

Tang Chen (10):
  acpi: Print hotplug info in SRAT.
  numa, acpi, memory-hotplug: Add movablemem_map=acpi boot option.
[...]

[PATCH 10/11] x86, numa, acpi, memory-hotplug: make movablemem_map have higher priority

2013-04-05 Thread Tang Chen
If kernelcore or movablecore is specified at the same time with
movablemem_map, movablemem_map will have higher priority to be
satisfied.  This patch will make find_zone_movable_pfns_for_nodes()
calculate zone_movable_pfn[] with the limit from zone_movable_limit[].
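
In other words, on a node with a movable limit, kernelcore may only be
carved out of the pfns below that limit. A minimal sketch of the clamping
(illustrative numbers only, not the kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long start_pfn = 0x10000, end_pfn = 0x80000;
	unsigned long movable_limit = 0x40000; /* zone_movable_limit[nid] */

	/* don't let kernelcore eat into the future ZONE_MOVABLE */
	if (movable_limit && end_pfn > movable_limit)
		end_pfn = movable_limit;

	printf("kernelcore range: [%#lx, %#lx)\n", start_pfn, end_pfn);
	return 0;
}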

Signed-off-by: Tang Chen 
Reviewed-by: Wen Congyang 
Reviewed-by: Lai Jiangshan 
Tested-by: Lin Feng 
---
 mm/page_alloc.c |   28 +---
 1 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f800aec..5db286f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4872,9 +4872,17 @@ static void __init find_zone_movable_pfns_for_nodes(void)
required_kernelcore = max(required_kernelcore, corepages);
}
 
-   /* If kernelcore was not specified, there is no ZONE_MOVABLE */
-   if (!required_kernelcore)
+   /*
+* If neither kernelcore/movablecore nor movablemem_map is specified,
+* there is no ZONE_MOVABLE. But if movablemem_map is specified, the
+* start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[].
+*/
+   if (!required_kernelcore) {
+   if (movablemem_map.nr_map)
+   memcpy(zone_movable_pfn, zone_movable_limit,
+   sizeof(zone_movable_pfn));
goto out;
+   }
 
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
@@ -4904,10 +4912,24 @@ restart:
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
unsigned long size_pages;
 
+   /*
+* Find more memory for kernelcore in
+* [zone_movable_pfn[nid], zone_movable_limit[nid]).
+*/
start_pfn = max(start_pfn, zone_movable_pfn[nid]);
if (start_pfn >= end_pfn)
continue;
 
+   if (zone_movable_limit[nid]) {
+   end_pfn = min(end_pfn, zone_movable_limit[nid]);
+   /* No range left for kernelcore in this node */
+   if (start_pfn >= end_pfn) {
+   zone_movable_pfn[nid] =
+   zone_movable_limit[nid];
+   break;
+   }
+   }
+
/* Account for what is only usable for kernelcore */
if (start_pfn < usable_startpfn) {
unsigned long kernel_pages;
@@ -4967,12 +4989,12 @@ restart:
if (usable_nodes && required_kernelcore > usable_nodes)
goto restart;
 
+out:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
for (nid = 0; nid < MAX_NUMNODES; nid++)
zone_movable_pfn[nid] =
roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
 
-out:
/* restore the node_state */
node_states[N_MEMORY] = saved_node_state;
 }
-- 
1.7.1



[PATCH 09/11] x86, numa, acpi, memory-hotplug: Sanitize zone_movable_limit[].

2013-04-05 Thread Tang Chen
As mentioned by Liu Jiang and Wu Jianguo, users could specify DMA,
DMA32, and HIGHMEM as movable. In order to ensure the kernel will
work correctly, we should exclude these memory ranges from
zone_movable_limit[].

NOTE: Call find_usable_zone_for_movable() first to initialize movable_zone
  so that sanitize_zone_movable_limit() can use it. This was
  pointed out by Wu Jianguo .
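
A standalone sketch of the sanitizing step (one made-up boundary; the
patch checks DMA, DMA32 and HIGHMEM as configured): each node's limit is
raised above zones the kernel must keep for itself:

#include <stdio.h>

#define MAX_NODES 2
static unsigned long zone_movable_limit[MAX_NODES] = { 0x800, 0x200000 };

int main(void)
{
	unsigned long dma32_end_pfn = 0x100000; /* 4G with 4K pages */

	for (int nid = 0; nid < MAX_NODES; nid++) {
		if (!zone_movable_limit[nid])
			continue; /* 0 means: no limit for this node */
		if (zone_movable_limit[nid] < dma32_end_pfn)
			zone_movable_limit[nid] = dma32_end_pfn;
		printf("node %d limit: %#lx\n", nid,
		       zone_movable_limit[nid]);
	}
	return 0;
}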

Reported-by: Wu Jianguo 
Signed-off-by: Tang Chen 
Signed-off-by: Liu Jiang 
Reviewed-by: Wen Congyang 
Reviewed-by: Lai Jiangshan 
Tested-by: Lin Feng 
---
 mm/page_alloc.c |   54 +-
 1 files changed, 53 insertions(+), 1 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b97bdb5..f800aec 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4412,6 +4412,57 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
 }
 
+/**
+ * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array.
+ *
+ * zone_movable_limit[] have been initialized when parsing SRAT or
+ * movablemem_map. This function will try to exclude ZONE_DMA, ZONE_DMA32,
+ * and HIGHMEM from zone_movable_limit[].
+ *
+ * zone_movable_limit[nid] == 0 means no limit for the node.
+ *
+ * Note: Need to be called with movable_zone initialized.
+ */
+static void __meminit sanitize_zone_movable_limit(void)
+{
+   int nid;
+
+   if (!movablemem_map.nr_map)
+   return;
+
+   /* Iterate each node id. */
+   for_each_node(nid) {
+   /* If we have no limit for this node, just skip it. */
+   if (!zone_movable_limit[nid])
+   continue;
+
+#ifdef CONFIG_ZONE_DMA
+   /* Skip DMA memory. */
+   if (zone_movable_limit[nid] <
+   arch_zone_highest_possible_pfn[ZONE_DMA])
+   zone_movable_limit[nid] =
+   arch_zone_highest_possible_pfn[ZONE_DMA];
+#endif
+
+#ifdef CONFIG_ZONE_DMA32
+   /* Skip DMA32 memory. */
+   if (zone_movable_limit[nid] <
+   arch_zone_highest_possible_pfn[ZONE_DMA32])
+   zone_movable_limit[nid] =
+   arch_zone_highest_possible_pfn[ZONE_DMA32];
+#endif
+
+#ifdef CONFIG_HIGHMEM
+   /* Skip lowmem if ZONE_MOVABLE is highmem. */
+   if (zone_movable_is_highmem() &&
+   zone_movable_limit[nid] <
+   arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])
+   zone_movable_limit[nid] =
+   arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
+#endif
+   }
+}
+
 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
@@ -4826,7 +4877,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)
goto out;
 
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
-   find_usable_zone_for_movable();
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
 
 restart:
@@ -4985,6 +5035,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+   find_usable_zone_for_movable();
+   sanitize_zone_movable_limit();
find_zone_movable_pfns_for_nodes();
 
/* Print out the zone ranges */
-- 
1.7.1



[PATCH 02/11] acpi: Print hotplug info in SRAT.

2013-04-05 Thread Tang Chen
The Hot Pluggable field in SRAT points out if the memory could be
hotplugged while the system is running. It is useful to print out
this info when parsing SRAT.

Signed-off-by: Tang Chen 
---
 arch/x86/mm/srat.c |9 ++---
 1 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 443f9ef..5055fa7 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -146,6 +146,7 @@ int __init
 acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 {
u64 start, end;
+   u32 hotpluggable;
int node, pxm;
 
if (srat_disabled())
@@ -154,7 +155,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
goto out_err_bad_srat;
if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
goto out_err;
-   if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
+   hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
+   if (hotpluggable && !save_add_info())
goto out_err;
 
start = ma->base_address;
@@ -174,9 +176,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 
node_set(node, numa_nodes_parsed);
 
-   printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
+   printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n",
   node, pxm,
-  (unsigned long long) start, (unsigned long long) end - 1);
+  (unsigned long long) start, (unsigned long long) end - 1,
+  hotpluggable ? "Hot Pluggable" : "");
 
return 0;
 out_err_bad_srat:
-- 
1.7.1



[PATCH 01/11] x86: get pg_data_t's memory from other node

2013-04-05 Thread Tang Chen
From: Yasuaki Ishimatsu 

If the system can create a movable node, in which all of the node's
memory is allocated as ZONE_MOVABLE, setup_node_data() cannot
allocate memory for the node's pg_data_t.
So, use memblock_alloc_try_nid() instead of memblock_alloc_nid()
to retry on other nodes when the first allocation fails.

Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Lai Jiangshan 
Signed-off-by: Tang Chen 
Signed-off-by: Jiang Liu 
---
 arch/x86/mm/numa.c |5 ++---
 1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 11acdf6..4f754e6 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -214,10 +214,9 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 * Allocate node data.  Try node-local memory and then any node.
 * Never allocate in DMA zone.
 */
-   nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+   nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
if (!nd_pa) {
-   pr_err("Cannot find %zu bytes in node %d\n",
-  nd_size, nid);
+   pr_err("Cannot find %zu bytes in any node\n", nd_size);
return;
}
nd = __va(nd_pa);
-- 
1.7.1



Re: [RFC PATCH part2 0/4] Allow allocating pagetable on local node in movablemem_map.

2013-03-26 Thread Tang Chen

Hi Yinghai,

Would you please help review this patch-set?

And what do you think of the memblock flag idea?

FYI, Liu Jiang has proposed a similar idea before.
https://lkml.org/lkml/2012/12/6/422

But we may have the following difference:
1) It is a flag, not a tag, which means a range may have several
   different attributes.
2) Mark node-life-cycle data, put it on the local node, and free
   it when hot-removing.
3) Mark and reserve movable memory, as you did.

Thanks. :)

On 03/21/2013 05:21 PM, Tang Chen wrote:

Hi Yinghai, all,

This patch-set is based on Yinghai's tree:
git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git 
for-x86-mm

For main line, we need to apply Yinghai's
"x86, ACPI, numa: Parse numa info early" patch-set first.
Please refer to:
v1: https://lkml.org/lkml/2013/3/7/642
v2: https://lkml.org/lkml/2013/3/10/47


In this part2 patch-set, we did the following things:
1) Introduce a "bool hotpluggable" member into struct numa_memblk so that we are
able to know which memory ranges in numa_meminfo are hotpluggable.
All the related APIs have been changed.
2) Introduce a new global variable "numa_meminfo_all" to store all the memory
ranges recorded in SRAT, because numa_cleanup_meminfo() will remove ranges
higher than max_pfn.
We need full numa memory info to limit zone_movable_pfn[].
3) Move movablemem_map sanitization after memory mapping is initialized so that
pagetable allocation will not be limited by movablemem_map.


On the other hand, we may have another way to solve this problem:

Not only pagetable and vmemmap pages, but also all the data whose life
cycle is the same as a node's, could be put on the local node.

1) Introduce a flag into memblock, such as "LOCAL_NODE_DATA", to mark out which
ranges have the same life cycle as the node.
2) Only keep existing memory ranges in movablemem_map (no need to introduce
numa_meminfo_all), and exclude these LOCAL_NODE_DATA ranges.
3) When hot-removing, we are able to find out these ranges, and free them first.
This is very important.

Also, the hot-add logic needs to be modified, too. As Yinghai mentioned before,
I think we can make memblock alive when memory is hot-added, and go with the
same logic as it is when booting.

What do you think?


Tang Chen (4):
   x86, mm, numa, acpi: Introduce numa_meminfo_all to store all the numa
 meminfo.
   x86, mm, numa, acpi: Introduce hotplug info into struct numa_meminfo.
   x86, mm, numa, acpi: Consider hotplug info when cleanup numa_meminfo.
   x86, mm, numa, acpi: Sanitize movablemem_map after memory mapping
 initialized.

  arch/x86/include/asm/numa.h |3 +-
  arch/x86/kernel/apic/numaq_32.c |2 +-
  arch/x86/mm/amdtopology.c   |3 +-
  arch/x86/mm/numa.c  |  161 +--
  arch/x86/mm/numa_internal.h |1 +
  arch/x86/mm/srat.c  |  141 +-
  6 files changed, 178 insertions(+), 133 deletions(-)



[RESEND PATCH part1 0/9] Introduce movablemem_map boot option.

2013-03-21 Thread Tang Chen
[...] We cannot prevent the kernel from using this memory, so we need to
exclude it even if it is hotpluggable.
Furthermore, to ensure the kernel has enough memory to boot, we treat
all the memory on the node which the kernel resides in as
un-hotpluggable.
 2) In this case, all the user-specified memory ranges will be ignored.

We also need to consider the following points:
1) Using this boot option can degrade NUMA performance because the kernel
   memory will not be distributed on each node evenly. So for users who don't
   want to lose their NUMA performance, just don't use it.
2) If kernelcore or movablecore is also specified, movablemem_map will have
   higher priority to be satisfied.
3) This option does not conflict with the memmap option.


Tang Chen (8):
  acpi: Print hotplug info in SRAT.
  x86, mm, numa, acpi: Add movable_memmap boot option.
  x86, mm, numa, acpi: Introduce zone_movable_limit[] to store start
pfn of ZONE_MOVABLE.
  x86, mm, numa, acpi: Extend movablemem_map to the end of each node.
  x86, mm, numa, acpi: Support getting hotplug info from SRAT.
  x86, mm, numa, acpi: Sanitize zone_movable_limit[].
  x86, mm, numa, acpi: make movablemem_map have higher priority
  x86, mm, numa, acpi: Memblock limit with movablemem_map

Yasuaki Ishimatsu (1):
  x86: get pg_data_t's memory from other node

 Documentation/kernel-parameters.txt |   36 +
 arch/x86/mm/numa.c  |5 +-
 arch/x86/mm/srat.c  |  130 +-
 include/linux/memblock.h|2 +
 include/linux/mm.h  |   22 +++
 mm/memblock.c   |   50 +++
 mm/page_alloc.c |  265 ++-
 7 files changed, 500 insertions(+), 10 deletions(-)



[RESEND PATCH part1 2/9] acpi: Print hotplug info in SRAT.

2013-03-21 Thread Tang Chen
The Hot Pluggable field in SRAT points out if the memory could be
hotplugged while the system is running. It is useful to print out
this info when parsing SRAT.

Signed-off-by: Tang Chen 
---
 arch/x86/mm/srat.c |9 ++---
 1 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 443f9ef..5055fa7 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -146,6 +146,7 @@ int __init
 acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 {
u64 start, end;
+   u32 hotpluggable;
int node, pxm;
 
if (srat_disabled())
@@ -154,7 +155,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
goto out_err_bad_srat;
if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
goto out_err;
-   if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
+   hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
+   if (hotpluggable && !save_add_info())
goto out_err;
 
start = ma->base_address;
@@ -174,9 +176,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 
node_set(node, numa_nodes_parsed);
 
-   printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
+   printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n",
   node, pxm,
-  (unsigned long long) start, (unsigned long long) end - 1);
+  (unsigned long long) start, (unsigned long long) end - 1,
+  hotpluggable ? "Hot Pluggable" : "");
 
return 0;
 out_err_bad_srat:
-- 
1.7.1



[RESEND PATCH part1 4/9] x86, mm, numa, acpi: Introduce zone_movable_limit[] to store start pfn of ZONE_MOVABLE.

2013-03-21 Thread Tang Chen
Since node info in SRAT may not be in increasing order, we may meet
a lower range after we have handled a higher range. So we need to keep
the lowest movable pfn each time we parse a SRAT memory entry, and
update it when we get a lower one.

This patch introduces a new array zone_movable_limit[], which is used
to store the start pfn of each node's ZONE_MOVABLE.

We update it each time we parse a SRAT memory entry, if necessary.

Signed-off-by: Tang Chen 
---
 arch/x86/mm/srat.c |   29 +
 include/linux/mm.h |9 +
 mm/page_alloc.c|   35 +--
 3 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 5055fa7..6cd4d33 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -141,6 +141,33 @@ static inline int save_add_info(void) {return 1;}
 static inline int save_add_info(void) {return 0;}
 #endif
 
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static void __init sanitize_movablemem_map(int nid, u64 start, u64 end)
+{
+   int overlap;
+   unsigned long start_pfn, end_pfn;
+
+   start_pfn = PFN_DOWN(start);
+   end_pfn = PFN_UP(end);
+
+   overlap = movablemem_map_overlap(start_pfn, end_pfn);
+   if (overlap >= 0) {
+   start_pfn = max(start_pfn,
+   movablemem_map.map[overlap].start_pfn);
+
+   if (zone_movable_limit[nid])
+   zone_movable_limit[nid] = min(zone_movable_limit[nid],
+ start_pfn);
+   else
+   zone_movable_limit[nid] = start_pfn;
+   }
+}
+#else  /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+static inline void sanitize_movablemem_map(int nid, u64 start, u64 end)
+{
+}
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
 int __init
 acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
@@ -181,6 +208,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
   (unsigned long long) start, (unsigned long long) end - 1,
   hotpluggable ? "Hot Pluggable" : "");
 
+   sanitize_movablemem_map(node, start, end);
+
return 0;
 out_err_bad_srat:
bad_srat();
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9c068d5..d2c5fec 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1343,6 +1343,15 @@ struct movablemem_map {
struct movablemem_entry map[MOVABLEMEM_MAP_MAX];
 };
 
+extern struct movablemem_map movablemem_map;
+
+extern void __init insert_movablemem_map(unsigned long start_pfn,
+unsigned long end_pfn);
+extern int __init movablemem_map_overlap(unsigned long start_pfn,
+unsigned long end_pfn);
+
+extern unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES];
+
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 27fcd29..f451ded 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -210,6 +210,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES];
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -5065,6 +5066,36 @@ early_param("kernelcore", cmdline_parse_kernelcore);
 early_param("movablecore", cmdline_parse_movablecore);
 
 /**
+ * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[].
+ * @start_pfn: start pfn of the range to be checked
+ * @end_pfn:   end pfn of the range to be checked (exclusive)
+ *
+ * This function checks if a given memory range [start_pfn, end_pfn) overlaps
+ * the movablemem_map.map[] array.
+ *
+ * Return: index of the first overlapped element in movablemem_map.map[]
+ * or -1 if they don't overlap each other.
+ */
+int __init movablemem_map_overlap(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+   int overlap;
+
+   if (!movablemem_map.nr_map)
+   return -1;
+
+   for (overlap = 0; overlap < movablemem_map.nr_map; overlap++)
+   if (start_pfn < movablemem_map.map[overlap].end_pfn)
+   break;
+
+   if (overlap == movablemem_map.nr_map ||
+   end_pfn <= movablemem_map.map[overlap].start_pfn)
+   return -1;
+
+   return overlap;
+}
+
+/**
  * insert_movablemem_map - Insert a memory range into movablemem_map.map.
  * @start_pfn: start pfn of the range
  * @end_pfn:   end pfn of the range

[RESEND PATCH part1 9/9] x86, mm, numa, acpi: Memblock limit with movablemem_map

2013-03-21 Thread Tang Chen
Ensure memblock will not allocate memory from areas that may be
ZONE_MOVABLE. The map info is from movablemem_map boot option.

The following problem was reported by Stephen Rothwell:
The definition of struct movablecore_map is protected by
CONFIG_HAVE_MEMBLOCK_NODE_MAP but its use in memblock_overlaps_region()
is not. So add CONFIG_HAVE_MEMBLOCK_NODE_MAP to protect the use of
movablecore_map in memblock_overlaps_region().
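
The check works by taking a candidate at the top of a free range and, if it
falls inside a movable entry, retrying below that entry's start. A compressed
userspace sketch of that retry (one free range, one movable entry, addresses
invented; alignment and the memblock iteration are omitted):

#include <stdio.h>

struct range { unsigned long start, end; };

/* One movable entry, in bytes for simplicity. */
static struct range movable = { 0x4000, 0x8000 };

static unsigned long find_top_down(unsigned long start, unsigned long end,
                                   unsigned long size)
{
        unsigned long cand;

        while (end > start && end >= size) {
                cand = end - size;
                if (cand >= movable.start && cand < movable.end) {
                        end = movable.start;    /* retry below the entry */
                        continue;
                }
                return cand >= start ? cand : 0;
        }
        return 0;                               /* nothing found */
}

int main(void)
{
        /* A 0x1000-byte allocation from [0x0, 0x6000) would first try
         * 0x5000, which is movable, so it lands at 0x3000 instead. */
        printf("allocated at 0x%lx\n", find_top_down(0x0, 0x6000, 0x1000));
        return 0;
}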

Signed-off-by: Tang Chen 
Reviewed-by: Wen Congyang 
Reviewed-by: Lai Jiangshan 
Tested-by: Lin Feng 
Reported-by: Stephen Rothwell 
---
 include/linux/memblock.h |2 +
 mm/memblock.c|   50 ++
 2 files changed, 52 insertions(+), 0 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f388203..3e5ecb2 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -42,6 +42,7 @@ struct memblock {
 
 extern struct memblock memblock;
 extern int memblock_debug;
+extern struct movablemem_map movablemem_map;
 
 #define memblock_dbg(fmt, ...) \
if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
@@ -60,6 +61,7 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
 void memblock_trim_memory(phys_addr_t align);
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
  unsigned long *out_end_pfn, int *out_nid);
 
diff --git a/mm/memblock.c b/mm/memblock.c
index b8d9147..1bcd9b9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -92,9 +92,58 @@ static long __init_memblock memblock_overlaps_region(struct 
memblock_type *type,
  *
  * Find @size free area aligned to @align in the specified range and node.
  *
+ * If we have CONFIG_HAVE_MEMBLOCK_NODE_MAP defined, we need to check that
+ * the memory we found is not in hotpluggable ranges.
+ *
  * RETURNS:
  * Found address on success, %0 on failure.
  */
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
+   phys_addr_t end, phys_addr_t size,
+   phys_addr_t align, int nid)
+{
+   phys_addr_t this_start, this_end, cand;
+   u64 i;
+   int curr = movablemem_map.nr_map - 1;
+
+   /* pump up @end */
+   if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
+   end = memblock.current_limit;
+
+   /* avoid allocating the first page */
+   start = max_t(phys_addr_t, start, PAGE_SIZE);
+   end = max(start, end);
+
+   for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+   this_start = clamp(this_start, start, end);
+   this_end = clamp(this_end, start, end);
+
+restart:
+   if (this_end <= this_start || this_end < size)
+   continue;
+
+   for (; curr >= 0; curr--) {
+   if ((movablemem_map.map[curr].start_pfn << PAGE_SHIFT)
+   < this_end)
+   break;
+   }
+
+   cand = round_down(this_end - size, align);
+   if (curr >= 0 &&
+   cand < movablemem_map.map[curr].end_pfn << PAGE_SHIFT) {
+   this_end = movablemem_map.map[curr].start_pfn
+  << PAGE_SHIFT;
+   goto restart;
+   }
+
+   if (cand >= this_start)
+   return cand;
+   }
+
+   return 0;
+}
+#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align, int nid)
@@ -123,6 +172,7 @@ phys_addr_t __init_memblock 
memblock_find_in_range_node(phys_addr_t start,
}
return 0;
 }
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 /**
  * memblock_find_in_range - find free area in given range
-- 
1.7.1



[RESEND PATCH part1 3/9] x86, mm, numa, acpi: Add movable_memmap boot option.

2013-03-21 Thread Tang Chen
Add functions to parse the movablemem_map boot option. Since the option
could be specified more than once, all the maps will be stored in the
global array movablemem_map.map[].

We also keep the array in monotonically increasing order by start_pfn,
and merge all overlapped ranges.
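
For reference, nn[KMG]@ss[KMG] is a size followed by a start address. A
minimal userspace parse of one such token (parse_size() below is a
hypothetical stand-in for the kernel's memparse(); error handling trimmed):

#include <stdio.h>
#include <stdlib.h>

/* Parse "nn[KMG]" into bytes, advancing *retptr past the suffix. */
static unsigned long long parse_size(const char *s, char **retptr)
{
        unsigned long long v = strtoull(s, retptr, 0);

        switch (**retptr) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10; (*retptr)++; break;
        }
        return v;
}

int main(void)
{
        char arg[] = "512M@4G", *p;
        unsigned long long size, start;

        size = parse_size(arg, &p);
        if (*p != '@')
                return 1;               /* malformed option */
        start = parse_size(p + 1, &p);

        printf("movable range: start=0x%llx size=0x%llx\n", start, size);
        return 0;
}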

Signed-off-by: Tang Chen 
Signed-off-by: Lai Jiangshan 
Reviewed-by: Wen Congyang 
Tested-by: Lin Feng 
---
 Documentation/kernel-parameters.txt |   21 ++
 include/linux/mm.h  |   11 +++
 mm/page_alloc.c |  131 +++
 3 files changed, 163 insertions(+), 0 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 4609e81..dd3a36a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1649,6 +1649,27 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
that the amount of memory usable for all allocations
is not too small.
 
+   movablemem_map=nn[KMG]@ss[KMG]
+   [KNL,X86,IA-64,PPC] This parameter is similar to
+   memmap except it specifies the memory map of
+   ZONE_MOVABLE.
+   If the user specifies memory ranges, the info in SRAT
+   will be ignored. It works as follows:
+   - If multiple ranges are all within one node, then from
+ the lowest ss to the end of the node will be ZONE_MOVABLE.
+   - If a range is within a node, then from ss to the end
+ of the node will be ZONE_MOVABLE.
+   - If a range covers two or more nodes, then from ss to
+ the end of the 1st node will be ZONE_MOVABLE, and all
+ the rest of the nodes will only have ZONE_MOVABLE.
+   If memmap is specified at the same time, the
+   movablemem_map will be limited within the memmap
+   areas. If kernelcore or movablecore is also specified,
+   movablemem_map will have higher priority to be
+   satisfied. So the administrator should be careful that
+   the amount of movablemem_map areas is not too large.
+   Otherwise the kernel won't have enough memory to start.
+
MTD_Partition=  [MTD]
Format: ,,,
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1c79b10..9c068d5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1332,6 +1332,17 @@ extern void free_bootmem_with_active_regions(int nid,
unsigned long max_low_pfn);
 extern void sparse_memory_present_with_active_regions(int nid);
 
+#define MOVABLEMEM_MAP_MAX MAX_NUMNODES
+struct movablemem_entry {
+   unsigned long start_pfn;/* start pfn of memory segment */
+   unsigned long end_pfn;  /* end pfn of memory segment (exclusive) */
+};
+
+struct movablemem_map {
+   int nr_map;
+   struct movablemem_entry map[MOVABLEMEM_MAP_MAX];
+};
+
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f368db4..27fcd29 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -202,6 +202,9 @@ static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+/* Movable memory ranges, will also be used by memblock subsystem. */
+struct movablemem_map movablemem_map;
+
 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __meminitdata 
arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
@@ -5061,6 +5064,134 @@ static int __init cmdline_parse_movablecore(char *p)
 early_param("kernelcore", cmdline_parse_kernelcore);
 early_param("movablecore", cmdline_parse_movablecore);
 
+/**
+ * insert_movablemem_map - Insert a memory range into movablemem_map.map.
+ * @start_pfn: start pfn of the range
+ * @end_pfn:   end pfn of the range
+ *
+ * This function will also merge the overlapped ranges, and sort the array
+ * by start_pfn in monotonically increasing order.
+ */
+static void __init insert_movablemem_map(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+   int pos, overlap;
+
+   /*
+* pos will be at the 1st overlapped range, or the position
+* where the element should be inserted.
+*/
+   for (pos = 0; pos < movablemem_map.nr_map; pos++)
+   if (start_pfn <= movablemem_map.map[pos].end_pfn)
+   break;
+
+   /* If there is no overlapped range, just insert the element. */
+   if (pos == movab
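
(The message is cut off above. Under the semantics stated in the log, keep
map[] sorted by start_pfn and merge overlapped ranges, the insert step could
look like the following userspace sketch. This is a reconstruction with
simplified bounds handling, not the original function.)

#include <stdio.h>
#include <string.h>

#define MAP_MAX 8

struct entry { unsigned long start_pfn, end_pfn; };
static struct entry map[MAP_MAX];
static int nr_map;

/* Insert [start_pfn, end_pfn), merging overlaps, keeping map[] sorted. */
static void insert(unsigned long start_pfn, unsigned long end_pfn)
{
        int pos, overlap;

        /* pos: first entry that ends at or after start_pfn. */
        for (pos = 0; pos < nr_map; pos++)
                if (start_pfn <= map[pos].end_pfn)
                        break;

        /* No overlap: shift the tail up and insert (MAP_MAX unchecked). */
        if (pos == nr_map || end_pfn < map[pos].start_pfn) {
                memmove(&map[pos + 1], &map[pos],
                        (nr_map - pos) * sizeof(map[0]));
                map[pos].start_pfn = start_pfn;
                map[pos].end_pfn = end_pfn;
                nr_map++;
                return;
        }

        /* Overlap: swallow every entry the new range touches. */
        for (overlap = pos + 1; overlap < nr_map; overlap++)
                if (end_pfn < map[overlap].start_pfn)
                        break;
        if (start_pfn < map[pos].start_pfn)
                map[pos].start_pfn = start_pfn;
        if (end_pfn > map[overlap - 1].end_pfn)
                map[pos].end_pfn = end_pfn;
        else
                map[pos].end_pfn = map[overlap - 1].end_pfn;
        memmove(&map[pos + 1], &map[overlap],
                (nr_map - overlap) * sizeof(map[0]));
        nr_map -= overlap - pos - 1;
}

int main(void)
{
        insert(0x100, 0x200);
        insert(0x300, 0x400);
        insert(0x180, 0x350);   /* merges everything into [0x100, 0x400) */
        for (int i = 0; i < nr_map; i++)
                printf("[0x%lx, 0x%lx)\n", map[i].start_pfn, map[i].end_pfn);
        return 0;
}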

[RESEND PATCH part1 5/9] x86, mm, numa, acpi: Extend movablemem_map to the end of each node.

2013-03-21 Thread Tang Chen
When implementing the movablemem_map boot option, we introduced an array,
movablemem_map.map[], to store the memory ranges to be set as ZONE_MOVABLE.

Since ZONE_MOVABLE is the last zone of a node, if the user didn't specify
the whole node's memory range, we need to extend the range to the node end
so that we can use it to prevent memblock from allocating memory in the
ranges the user didn't specify.

We now implement the movablemem_map boot option like this:
/*
 * For movablemem_map=nn[KMG]@ss[KMG]:
 *
 * SRAT:|_| |_| |_| |_| ..
 * node id:0   1 1   2
 * user specified:|__| |___|
 * movablemem_map:|___| |_||__| ..
 *
 * Using movablemem_map, we can prevent memblock from allocating memory
 * on ZONE_MOVABLE at boot time.
 *
 * NOTE: In this case, SRAT info will be ignored.
 */
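
A toy rendering of the extension rule in the diagram (node and user ranges
invented): any user range overlapping a node marks everything from the
overlap start to the node end as movable.

#include <stdio.h>

int main(void)
{
        /* One node and one user-specified range that overlaps it. */
        unsigned long node_start = 0x10000, node_end = 0x40000;
        unsigned long user_start = 0x20000, user_end = 0x30000;

        if (user_start < node_end && user_end > node_start) {
                unsigned long movable_start =
                        user_start > node_start ? user_start : node_start;

                /* Extend to the node end, as the diagram shows. */
                printf("movable: [0x%lx, 0x%lx)\n", movable_start, node_end);
        }
        return 0;
}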

Signed-off-by: Tang Chen 
---
 arch/x86/mm/srat.c |   34 ++
 1 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 6cd4d33..44a9b9b 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -150,16 +150,42 @@ static void __init sanitize_movablemem_map(int nid, u64 
start, u64 end)
start_pfn = PFN_DOWN(start);
end_pfn = PFN_UP(end);
 
+   /*
+* For movablemem_map=nn[KMG]@ss[KMG]:
+*
+* SRAT:|_| |_| |_| |_| ..
+* node id:0   1 1   2
+* user specified:|__| |___|
+* movablemem_map:|___| |_||__| ..
+*
+* Using movablemem_map, we can prevent memblock from allocating memory
+* on ZONE_MOVABLE at boot time.
+*/
overlap = movablemem_map_overlap(start_pfn, end_pfn);
if (overlap >= 0) {
+   /*
+* If this range overlaps with movablemem_map, then update
+* zone_movable_limit[nid] if it has a lower start pfn.
+*/
start_pfn = max(start_pfn,
movablemem_map.map[overlap].start_pfn);
 
-   if (zone_movable_limit[nid])
-   zone_movable_limit[nid] = min(zone_movable_limit[nid],
- start_pfn);
-   else
+   if (!zone_movable_limit[nid] ||
+   zone_movable_limit[nid] > start_pfn)
zone_movable_limit[nid] = start_pfn;
+
+   /* Insert the higher part of the overlapped range. */
+   if (movablemem_map.map[overlap].end_pfn < end_pfn)
+   insert_movablemem_map(start_pfn, end_pfn);
+   } else {
+   /*
+* If this is a range higher than zone_movable_limit[nid],
+* insert it into movablemem_map because all ranges higher than
+* zone_movable_limit[nid] on this node will be ZONE_MOVABLE.
+*/
+   if (zone_movable_limit[nid] &&
+   start_pfn > zone_movable_limit[nid])
+   insert_movablemem_map(start_pfn, end_pfn);
}
 }
 #else  /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-- 
1.7.1



[RESEND PATCH part1 8/9] x86, mm, numa, acpi: make movablemem_map have higher priority

2013-03-21 Thread Tang Chen
If kernelcore or movablecore is specified at the same time with
movablemem_map, movablemem_map will have higher priority to be
satisfied.  This patch will make find_zone_movable_pfns_for_nodes()
calculate zone_movable_pfn[] with the limit from zone_movable_limit[].
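
The priority rule reduces to a per-node clamp on the kernelcore walk: pages
above zone_movable_limit[nid] must not be counted for kernelcore. A toy
version of the clamp (one node, invented pfns; not the kernel loop itself):

#include <stdio.h>

int main(void)
{
        unsigned long start_pfn = 0x1000, end_pfn = 0x9000;
        unsigned long limit = 0x5000;   /* zone_movable_limit[nid] */

        /* Only [start_pfn, limit) may be used for kernelcore. */
        if (limit && end_pfn > limit)
                end_pfn = limit;

        if (start_pfn >= end_pfn)
                printf("no room for kernelcore on this node\n");
        else
                printf("kernelcore candidate: [0x%lx, 0x%lx)\n",
                       start_pfn, end_pfn);
        return 0;
}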

Signed-off-by: Tang Chen 
Reviewed-by: Wen Congyang 
Reviewed-by: Lai Jiangshan 
Tested-by: Lin Feng 
---
 mm/page_alloc.c |   28 +---
 1 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 70ed381..bdde30d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4873,9 +4873,17 @@ static void __init find_zone_movable_pfns_for_nodes(void)
required_kernelcore = max(required_kernelcore, corepages);
}
 
-   /* If kernelcore was not specified, there is no ZONE_MOVABLE */
-   if (!required_kernelcore)
+   /*
+* If neither kernelcore/movablecore nor movablemem_map is specified,
+* there is no ZONE_MOVABLE. But if movablemem_map is specified, the
+* start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[].
+*/
+   if (!required_kernelcore) {
+   if (movablemem_map.nr_map)
+   memcpy(zone_movable_pfn, zone_movable_limit,
+   sizeof(zone_movable_pfn));
goto out;
+   }
 
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
@@ -4905,10 +4913,24 @@ restart:
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
unsigned long size_pages;
 
+   /*
+* Find more memory for kernelcore in
+* [zone_movable_pfn[nid], zone_movable_limit[nid]).
+*/
start_pfn = max(start_pfn, zone_movable_pfn[nid]);
if (start_pfn >= end_pfn)
continue;
 
+   if (zone_movable_limit[nid]) {
+   end_pfn = min(end_pfn, zone_movable_limit[nid]);
+   /* No range left for kernelcore in this node */
+   if (start_pfn >= end_pfn) {
+   zone_movable_pfn[nid] =
+   zone_movable_limit[nid];
+   break;
+   }
+   }
+
/* Account for what is only usable for kernelcore */
if (start_pfn < usable_startpfn) {
unsigned long kernel_pages;
@@ -4968,12 +4990,12 @@ restart:
if (usable_nodes && required_kernelcore > usable_nodes)
goto restart;
 
+out:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
for (nid = 0; nid < MAX_NUMNODES; nid++)
zone_movable_pfn[nid] =
roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
 
-out:
/* restore the node_state */
node_states[N_MEMORY] = saved_node_state;
 }
-- 
1.7.1



[PATCH part2 2/4] x86, mm, numa, acpi: Introduce hotplug info into struct numa_meminfo.

2013-03-21 Thread Tang Chen
Since we are using struct numa_meminfo to store SRAT info and to sanitize
movablemem_map.map[], we need hotplug info in struct numa_meminfo.

This patch introduces a "bool hotpluggable" member into struct
numa_meminfo.

And modifies the following APIs' prototypes to support it:
   - numa_add_memblk()
   - numa_add_memblk_to()

And the following callers:
   - numaq_register_node()
   - dummy_numa_init()
   - amd_numa_init()
   - acpi_numa_memory_affinity_init() in x86
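
To show the shape of the change, here is a toy userspace mirror of the
extended structure and call signature (names, sizes and addresses invented;
not the kernel implementation):

#include <stdio.h>
#include <stdbool.h>

/* Toy mirror of struct numa_memblk with the new flag. */
struct memblk {
        unsigned long long start, end;
        int nid;
        bool hotpluggable;
};

static struct memblk blk[4];
static int nr_blks;

static int add_memblk(int nid, unsigned long long start,
                      unsigned long long end, bool hotpluggable)
{
        if (start >= end || nr_blks >= 4)
                return -1;
        blk[nr_blks++] = (struct memblk){ start, end, nid, hotpluggable };
        return 0;
}

int main(void)
{
        add_memblk(0, 0x0, 0x80000000ULL, false);       /* boot node */
        add_memblk(1, 0x80000000ULL, 0x100000000ULL, true);

        for (int i = 0; i < nr_blks; i++)
                printf("node %d: [0x%llx, 0x%llx)%s\n", blk[i].nid,
                       blk[i].start, blk[i].end,
                       blk[i].hotpluggable ? " hotpluggable" : "");
        return 0;
}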

Signed-off-by: Tang Chen 
---
 arch/x86/include/asm/numa.h |3 ++-
 arch/x86/kernel/apic/numaq_32.c |2 +-
 arch/x86/mm/amdtopology.c   |3 ++-
 arch/x86/mm/numa.c  |   10 +++---
 arch/x86/mm/numa_internal.h |1 +
 arch/x86/mm/srat.c  |2 +-
 6 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 1b99ee5..73096b2 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -31,7 +31,8 @@ extern int numa_off;
 extern s16 __apicid_to_node[MAX_LOCAL_APIC];
 extern nodemask_t numa_nodes_parsed __initdata;
 
-extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
+extern int __init numa_add_memblk(int nodeid, u64 start, u64 end,
+ bool hotpluggable);
 extern void __init numa_set_distance(int from, int to, int distance);
 
 static inline void set_apicid_to_node(int apicid, s16 node)
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index d661ee9..7a9c542 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -82,7 +82,7 @@ static inline void numaq_register_node(int node, struct 
sys_cfg_data *scd)
int ret;
 
node_set(node, numa_nodes_parsed);
-   ret = numa_add_memblk(node, start, end);
+   ret = numa_add_memblk(node, start, end, false);
BUG_ON(ret < 0);
 }
 
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 5247d01..d521471 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -167,7 +167,8 @@ int __init amd_numa_init(void)
nodeid, base, limit);
 
prevbase = base;
-   numa_add_memblk(nodeid, base, limit);
+   /* Memory hotplug is not supported on AMD CPUs. */
+   numa_add_memblk(nodeid, base, limit, false);
node_set(nodeid, numa_nodes_parsed);
}
 
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4cf3b49..5f98bb5 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -142,6 +142,7 @@ void __init setup_node_to_cpumask_map(void)
 }
 
 static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+bool hotpluggable,
 struct numa_meminfo *mi)
 {
/* ignore zero length blks */
@@ -163,6 +164,7 @@ static int __init numa_add_memblk_to(int nid, u64 start, 
u64 end,
mi->blk[mi->nr_blks].start = start;
mi->blk[mi->nr_blks].end = end;
mi->blk[mi->nr_blks].nid = nid;
+   mi->blk[mi->nr_blks].hotpluggable = hotpluggable;
mi->nr_blks++;
return 0;
 }
@@ -187,15 +189,17 @@ void __init numa_remove_memblk_from(int idx, struct 
numa_meminfo *mi)
  * @nid: NUMA node ID of the new memblk
  * @start: Start address of the new memblk
  * @end: End address of the new memblk
+ * @hotpluggable: True if memblk is hotpluggable
  *
  * Add a new memblk to the default numa_meminfo.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int __init numa_add_memblk(int nid, u64 start, u64 end)
+int __init numa_add_memblk(int nid, u64 start, u64 end,
+  bool hotpluggable)
 {
-   return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+   return numa_add_memblk_to(nid, start, end, hotpluggable, &numa_meminfo);
 }
 
 /* Initialize NODE_DATA for a node on the local memory */
@@ -644,7 +648,7 @@ static int __init dummy_numa_init(void)
   0LLU, PFN_PHYS(max_pfn) - 1);
 
node_set(0, numa_nodes_parsed);
-   numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
+   numa_add_memblk(0, 0, PFN_PHYS(max_pfn), false);
 
return 0;
 }
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index bb2fbcc..1ce4e6b 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -8,6 +8,7 @@ struct numa_memblk {
u64 start;
u64 end;
int nid;
+   boolhotpluggable;
 };
 
 struct numa_meminfo {
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 4f443de..76c2eb4 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -290,7 +290,7 @@ acpi_numa_memory_affinity_init(struct 
acpi_srat_mem_affinity *ma)
goto out_err_bad_srat;
}
 
-   if (numa_add_memblk(node, start, end) < 0)
+   if (numa_add_memblk(node, start, end, hotpluggable) < 0)

[RESEND PATCH part1 6/9] x86, mm, numa, acpi: Support getting hotplug info from SRAT.

2013-03-21 Thread Tang Chen
We now provide an option for users who don't want to specify physical
memory addresses on the kernel command line.

/*
 * For movablemem_map=acpi:
 *
 * SRAT:|_| |_| |_| |_| ..
 * node id:0   1 1   2
 * hotpluggable:   n   y y   n
 * movablemem_map:  |_| |_|
 *
 * Using movablemem_map, we can prevent memblock from allocating memory
 * on ZONE_MOVABLE at boot time.
 */

So the user just specifies movablemem_map=acpi, and the kernel will use the
hotplug info in SRAT to determine which memory ranges should be set as
ZONE_MOVABLE.

NOTE: Using this option will degrade NUMA performance because the whole node
      will be set as ZONE_MOVABLE, and the kernel cannot use its memory.
      If users don't want to lose NUMA performance, just don't use it.
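
A toy walk over invented SRAT entries showing the selection rule
(hotpluggable entries become movable; the real code additionally keeps the
kernel's own node un-hotpluggable, as noted above):

#include <stdio.h>
#include <stdbool.h>

struct srat_entry {
        int nid;
        unsigned long start_pfn, end_pfn;
        bool hotpluggable;
};

int main(void)
{
        /* Mirrors the diagram: nodes 0 and 2 fixed, node 1 hotpluggable. */
        struct srat_entry srat[] = {
                { 0, 0x0000, 0x8000,  false },
                { 1, 0x8000, 0xc000,  true  },
                { 2, 0xc000, 0x10000, false },
        };

        for (int i = 0; i < 3; i++)
                if (srat[i].hotpluggable)
                        printf("node %d: movable [0x%lx, 0x%lx)\n",
                               srat[i].nid, srat[i].start_pfn,
                               srat[i].end_pfn);
        return 0;
}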

Signed-off-by: Tang Chen 
---
 Documentation/kernel-parameters.txt |   15 +++
 arch/x86/mm/srat.c  |   74 +--
 include/linux/mm.h  |2 +
 mm/page_alloc.c |   22 ++-
 4 files changed, 108 insertions(+), 5 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index dd3a36a..40387a2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1649,6 +1649,17 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
that the amount of memory usable for all allocations
is not too small.
 
+   movablemem_map=acpi
+   [KNL,X86,IA-64,PPC] This parameter is similar to
+   memmap except it specifies the memory map of
+   ZONE_MOVABLE.
+   This option informs the kernel to use the Hot Pluggable
+   bit in the flags of SRAT from the ACPI BIOS to determine
+   which memory devices could be hotplugged. The corresponding
+   memory ranges will be set as ZONE_MOVABLE.
+   NOTE: Whatever node the kernel resides in will always
+ be un-hotpluggable.
+
movablemem_map=nn[KMG]@ss[KMG]
[KNL,X86,IA-64,PPC] This parameter is similar to
memmap except it specifies the memory map of
@@ -1669,6 +1680,10 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
satisfied. So the administrator should be careful that
the amount of movablemem_map areas are not too large.
Otherwise kernel won't have enough memory to start.
+   NOTE: We don't stop users from specifying the node the
+ kernel resides in as hotpluggable, so that this
+ option can be used as a workaround for firmware
+ bugs.
 
MTD_Partition=  [MTD]
Format: ,,,
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 44a9b9b..4f443de 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -142,15 +142,78 @@ static inline int save_add_info(void) {return 0;}
 #endif
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-static void __init sanitize_movablemem_map(int nid, u64 start, u64 end)
+static void __init sanitize_movablemem_map(int nid, u64 start, u64 end,
+  bool hotpluggable)
 {
-   int overlap;
+   int overlap, i;
unsigned long start_pfn, end_pfn;
 
start_pfn = PFN_DOWN(start);
end_pfn = PFN_UP(end);
 
/*
+* For movablemem_map=acpi:
+*
+* SRAT:|_| |_| |_| |_| ..
+* node id:0   1 1   2
+* hotpluggable:   n   y y   n
+* movablemem_map:  |_| |_|
+*
+* Using movablemem_map, we can prevent memblock from allocating memory
+* on ZONE_MOVABLE at boot time.
+*
+* Before parsing SRAT, memblock has already reserved some memory ranges
+* for other purposes, such as the kernel image. We cannot prevent the
+* kernel from using this memory, so we need to exclude it even if it
+* is hotpluggable.
+* Furthermore, to ensure the kernel has enough memory to boot, we treat
+* all the memory on the node which the kernel resides in as
+* un-hotpluggable.
+*/
+   if (hotpluggable && movablemem_map.acpi) {
+   /* Exclude ranges reserved by memblock. */
+   struct memblock_type *rgn = &memblock.reserved;
+
+   for (i = 0; i < rgn->
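
(The message is cut off above. The stated intent is to walk memblock.reserved
and exclude already-reserved ranges from the hotpluggable range. A userspace
sketch of one such exclusion, with invented values, simplified to a single
reservation clipped off the front of the range:)

#include <stdio.h>

struct range { unsigned long start, end; };

int main(void)
{
        /* One hotpluggable SRAT range and one memblock reservation at
         * its front (the kernel image, say); values invented. */
        struct range hot  = { 0x8000, 0x10000 };
        struct range resv = { 0x8000, 0x9000 };

        /* Exclude the reserved part: only the remainder may be movable. */
        if (resv.start <= hot.start && resv.end > hot.start)
                hot.start = resv.end < hot.end ? resv.end : hot.end;

        printf("movable after exclusion: [0x%lx, 0x%lx)\n",
               hot.start, hot.end);
        return 0;
}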

[PATCH part2 4/4] x86, mm, numa, acpi: Sanitize movablemem_map after memory mapping initialized.

2013-03-21 Thread Tang Chen
In order to support allocating pagetable and vmemmap pages on the local node,
we should first initialize the memory mapping without any limitation on
memblock, use memblock to reserve pagetable and vmemmap pages on the local
node, and then sanitize movablemem_map.map[] to limit memblock.

In this way, we can prevent allocations in the movable area while still
keeping pagetable and vmemmap pages (used by the kernel) on the local node.
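
A compressed sketch of the intended ordering, with stub functions standing in
for the real steps (the names follow the description above, not exact kernel
symbols):

#include <stdio.h>

static void parse_srat(void)
{
        puts("1) parse SRAT, store ranges in numa_meminfo");
}

static void init_mem_mapping(void)
{
        puts("2) init memory mapping; pagetable/vmemmap reserved on local node");
}

static void sanitize_movablemem_map(void)
{
        puts("3) sanitize movablemem_map.map[]; memblock now avoids movable areas");
}

int main(void)
{
        /* The order is the whole point: step 3 must come after step 2. */
        parse_srat();
        init_mem_mapping();
        sanitize_movablemem_map();
        return 0;
}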

Signed-off-by: Tang Chen 
---
 arch/x86/mm/numa.c |  125 ++
 arch/x86/mm/srat.c |  139 ++-
 2 files changed, 142 insertions(+), 122 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 0c3a278..d0b9c5a 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -738,6 +738,129 @@ static void __init early_x86_numa_init_mapping(void)
 }
 #endif
 
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static void __init movablemem_map_handle_srat(struct numa_memblk mb)
+{
+   unsigned long start_pfn = PFN_DOWN(mb.start);
+   unsigned long end_pfn = PFN_UP(mb.end);
+   int nid = mb.nid;
+   bool hotpluggable = mb.hotpluggable;
+
+   /*
+* For movablemem_map=acpi:
+*
+* SRAT:|_| |_| |_| |_| ..
+* node id:0   1 1   2
+* hotpluggable:   n   y y   n
+* movablemem_map:  |_| |_|
+*
+* Using movablemem_map, we can prevent memblock from allocating memory
+* on ZONE_MOVABLE at boot time.
+*
+* Before parsing SRAT, memblock has already reserved some memory ranges
+* for other purposes, such as the kernel image. We cannot prevent the
+* kernel from using this memory. Furthermore, if all the memory is
+* hotpluggable, then the system won't have enough memory to boot. So
+* we always set the nodes which the kernel resides in as non-movable
+* by not calling this function in sanitize_movablemem_map().
+*
+* Known problem: We now allocate pagetable and vmemmap pages on the local
+* node, and reserve them in memblock. But we cannot tell these pages
+* from other reserved memory, such as the kernel image. Fortunately, the
+* reserved memory will not be released into the buddy system, so it won't
+* impact the ZONE_MOVABLE limitation.
+*/
+   if (!hotpluggable)
+   return;
+
+   /* If the range is hotpluggable, insert it into movablemem_map. */
+   insert_movablemem_map(start_pfn, end_pfn);
+
+   if (zone_movable_limit[nid])
+   zone_movable_limit[nid] = min(zone_movable_limit[nid],
+ start_pfn);
+   else
+   zone_movable_limit[nid] = start_pfn;
+}
+
+static void __init movablemem_map_handle_user(struct numa_memblk mb)
+{
+   int overlap;
+   unsigned long start_pfn = PFN_DOWN(mb.start);
+   unsigned long end_pfn = PFN_UP(mb.end);
+   int nid = mb.nid;
+
+   /*
+* For movablemem_map=nn[KMG]@ss[KMG]:
+*
+* SRAT:|_| |_| |_| |_| ..
+* node id:0   1 1   2
+* user specified:|__| |___|
+* movablemem_map:|___| |_||__| ..
+*
+* Using movablemem_map, we can prevent memblock from allocating memory
+* on ZONE_MOVABLE at boot time.
+*
+* NOTE: In this case, SRAT info will be ignored. Even if the memory
+* range is not hotpluggable in SRAT, it will be inserted into
+* movablemem_map. This is useful if the firmware is buggy.
+*/
+   overlap = movablemem_map_overlap(start_pfn, end_pfn);
+   if (overlap >= 0) {
+   /*
+* If this range overlaps with movablemem_map, then update
+* zone_movable_limit[nid] if it has a lower start pfn.
+*/
+   start_pfn = max(start_pfn,
+   movablemem_map.map[overlap].start_pfn);
+
+   if (!zone_movable_limit[nid] ||
+   zone_movable_limit[nid] > start_pfn)
+   zone_movable_limit[nid] = start_pfn;
+
+   /* Insert the higher part of the overlapped range. */
+   if (movablemem_map.map[overlap].end_pfn < end_pfn)
+   insert_movablemem_map(start_pfn, end_pfn);
+   } else {
+   /*
+* If this is a range higher than zone_movable_limit[nid],
+* insert it into movablemem_map because all ranges higher than
+* zone_movable_limit[nid] on this node will be ZONE_MOVABLE.
+*/
+   if (zone_movable_limit[nid] &&
+   start_pfn > zone_movable_limit[nid])
+   insert_movablemem_map(start_pfn, end_pfn);

[PATCH part2 1/4] x86, mm, numa, acpi: Introduce numa_meminfo_all to store all the numa meminfo.

2013-03-21 Thread Tang Chen
Now, Yinghai has been trying to allocate pagetables and vmemmap pages on the
local node. If we limit memblock allocation with movablemem_map.map[], we
have to exclude the pagetables and vmemmap pages.

So we need the following sequence:
1) Parse SRAT, store numa_meminfo.
2) Initialize memory mapping, allocate pagetables and vmemmap pages in local
   node. And reserve these memory with memblock.
3) Sanitize movablemem_map.map[], exclude the pagetables and vmemmap pages.

When parsing SRAT, we add memory ranges into numa_meminfo. But
numa_cleanup_meminfo() removes all the unused memory from numa_meminfo.

 const u64 low = 0;
 const u64 high = PFN_PHYS(max_pfn);

 /* first, trim all entries */
 for (i = 0; i < mi->nr_blks; i++) {
 struct numa_memblk *bi = &mi->blk[i];

 /* make sure all blocks are inside the limits */
 bi->start = max(bi->start, low);
 bi->end = min(bi->end, high);

 /* and there's no empty block */
 if (bi->start >= bi->end)
 numa_remove_memblk_from(i--, mi);
 }

So numa_meminfo doesn't have the whole memory info.

In order to sanitize movablemem_map.map[] after memory mapping
initialization, we need the whole SRAT info.

So this patch introduces a global variable, numa_meminfo_all, to store the
whole NUMA memory info.
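
A toy demonstration of the snapshot-before-cleanup idea (structure and values
invented; the patch does the equivalent with a memcpy of numa_meminfo into
numa_meminfo_all):

#include <stdio.h>
#include <string.h>

struct meminfo {
        int nr;
        unsigned long end[4];   /* end pfn of each block */
};

int main(void)
{
        struct meminfo meminfo = { 2, { 0x8000, 0x20000 } };
        struct meminfo meminfo_all;
        unsigned long max_pfn = 0x10000;

        /* Snapshot the full table before trimming the working copy. */
        memcpy(&meminfo_all, &meminfo, sizeof(meminfo));

        for (int i = 0; i < meminfo.nr; i++)
                if (meminfo.end[i] > max_pfn)
                        meminfo.end[i] = max_pfn;       /* cleanup trims */

        printf("working end[1]=0x%lx, snapshot end[1]=0x%lx\n",
               meminfo.end[1], meminfo_all.end[1]);
        return 0;
}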

Signed-off-by: Tang Chen 
---
 arch/x86/mm/numa.c |   13 +
 1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4f754e6..4cf3b49 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -28,12 +28,20 @@ nodemask_t numa_nodes_parsed __initdata;
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
+/* e820 mapped memory info */
 static struct numa_meminfo numa_meminfo
 #ifndef CONFIG_MEMORY_HOTPLUG
 __initdata
 #endif
 ;
 
+/* All memory info */
+static struct numa_meminfo numa_meminfo_all
+#ifndef CONFIG_MEMORY_HOTPLUG
+__initdata
+#endif
+;
+
 static int numa_distance_cnt;
 static u8 *numa_distance;
 
@@ -599,10 +607,15 @@ static int __init numa_init(int (*init_func)(void))
 
nodes_clear(numa_nodes_parsed);
memset(&numa_meminfo, 0, sizeof(numa_meminfo));
+   memset(&numa_meminfo_all, 0, sizeof(numa_meminfo_all));
 
ret = init_func();
if (ret < 0)
return ret;
+
+   /* Store the whole memory info before cleaning up numa_meminfo. */
+   memcpy(&numa_meminfo_all, &numa_meminfo, sizeof(numa_meminfo));
+
ret = numa_cleanup_meminfo(&numa_meminfo);
if (ret < 0)
return ret;
-- 
1.7.1



[PATCH part2 3/4] x86, mm, numa, acpi: Consider hotplug info when cleanup numa_meminfo.

2013-03-21 Thread Tang Chen
Since we have introduced hotplug info into struct numa_meminfo, we need
to consider it when cleaning up numa_meminfo.

The original logic in numa_cleanup_meminfo() is:
Merge blocks on the same node, holes between which don't overlap with
memory on other nodes.

This patch modifies the numa_cleanup_meminfo() logic as follows:
Merge blocks with the same hotpluggable type on the same node, holes
between which don't overlap with memory on other nodes.
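
The new rule adds one equality test to the merge condition; a minimal
userspace sketch (the struct trimmed to the two fields that matter here):

#include <stdio.h>
#include <stdbool.h>

struct blk { int nid; bool hotpluggable; };

/* Blocks merge only on the same node AND with the same hotplug type. */
static bool can_merge(const struct blk *a, const struct blk *b)
{
        return a->nid == b->nid && a->hotpluggable == b->hotpluggable;
}

int main(void)
{
        struct blk a = { 1, false }, b = { 1, true }, c = { 1, false };

        printf("a+b: %s\n", can_merge(&a, &b) ? "merge" : "keep apart");
        printf("a+c: %s\n", can_merge(&a, &c) ? "merge" : "keep apart");
        return 0;
}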

Signed-off-by: Tang Chen 
---
 arch/x86/mm/numa.c |   13 +
 1 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 5f98bb5..0c3a278 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -304,18 +304,22 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
}
 
/*
-* Join together blocks on the same node, holes
-* between which don't overlap with memory on other
-* nodes.
+* Join together blocks on the same node, with the same
+* hotpluggable flags, holes between which don't overlap
+* with memory on other nodes.
 */
if (bi->nid != bj->nid)
continue;
+   if (bi->hotpluggable != bj->hotpluggable)
+   continue;
+
start = min(bi->start, bj->start);
end = max(bi->end, bj->end);
for (k = 0; k < mi->nr_blks; k++) {
struct numa_memblk *bk = &mi->blk[k];
 
-   if (bi->nid == bk->nid)
+   if (bi->nid == bk->nid &&
+   bi->hotpluggable == bk->hotpluggable)
continue;
if (start < bk->end && end > bk->start)
break;
@@ -335,6 +339,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
mi->blk[i].start = mi->blk[i].end = 0;
mi->blk[i].nid = NUMA_NO_NODE;
+   mi->blk[i].hotpluggable = false;
}
 
return 0;
-- 
1.7.1



[RFC PATCH part2 0/4] Allow allocating pagetable on local node in movablemem_map.

2013-03-21 Thread Tang Chen
Hi Yinghai, all,

This patch-set is based on Yinghai's tree:
git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git 
for-x86-mm

For mainline, we need to apply Yinghai's
"x86, ACPI, numa: Parse numa info early" patch-set first.
Please refer to:
v1: https://lkml.org/lkml/2013/3/7/642
v2: https://lkml.org/lkml/2013/3/10/47


In this part2 patch-set, we did the following:
1) Introduce a "bool hotpluggable" member into struct numa_memblk so that we
   are able to know which memory ranges in numa_meminfo are hotpluggable.
   All the related APIs have been changed.
2) Introduce a new global variable "numa_meminfo_all" to store all the memory
   ranges recorded in SRAT, because numa_cleanup_meminfo() will remove ranges
   higher than max_pfn.
   We need the full NUMA memory info to limit zone_movable_pfn[].
3) Move movablemem_map sanitization to after memory mapping is initialized so
   that pagetable allocation will not be limited by movablemem_map.


On the other hand, we may have another way to solve this problem:

Not only pagetable and vmemmap pages, but all data whose life cycle is the
same as the node's, could be put on the local node.

1) Introduce a flag into memblock, such as "LOCAL_NODE_DATA", to mark out
   which ranges have the same life cycle as the node.
2) Only keep existing memory ranges in movablemem_map (no need to introduce
   numa_meminfo_all), and exclude these LOCAL_NODE_DATA ranges.
3) When hot-removing, we are able to find out these ranges, and free them first.
   This is very important.

Also, the hot-add logic needs to be modified. As Yinghai mentioned before,
I think we can make memblock alive when memory is hot-added, and go with
the same logic as at boot time.

What do you think?


Tang Chen (4):
  x86, mm, numa, acpi: Introduce numa_meminfo_all to store all the numa
meminfo.
  x86, mm, numa, acpi: Introduce hotplug info into struct numa_meminfo.
  x86, mm, numa, acpi: Consider hotplug info when cleanup numa_meminfo.
  x86, mm, numa, acpi: Sanitize movablemem_map after memory mapping
initialized.

 arch/x86/include/asm/numa.h |3 +-
 arch/x86/kernel/apic/numaq_32.c |2 +-
 arch/x86/mm/amdtopology.c   |3 +-
 arch/x86/mm/numa.c  |  161 +--
 arch/x86/mm/numa_internal.h |1 +
 arch/x86/mm/srat.c  |  141 +-
 6 files changed, 178 insertions(+), 133 deletions(-)



[RESEND PATCH part1 1/9] x86: get pg_data_t's memory from other node

2013-03-21 Thread Tang Chen
From: Yasuaki Ishimatsu 

If the system can create a movable node, in which all the memory of
the node is allocated as ZONE_MOVABLE, setup_node_data() cannot
allocate memory for the node's pg_data_t.
So, use memblock_alloc_try_nid() instead of memblock_alloc_nid()
to retry when the first allocation fails.
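
A toy model of the try-then-fallback pattern the patch switches to (the
allocator here is faked; in the kernel, memblock_alloc_try_nid() performs
the node-local-then-any-node retry):

#include <stdio.h>

#define NR_NODES 4

/* Fake per-node allocator: pretend node 1 is entirely ZONE_MOVABLE. */
static unsigned long alloc_on_node(int nid)
{
        return nid == 1 ? 0 : 0x1000;
}

static unsigned long alloc_try_nid(int nid)
{
        unsigned long pa = alloc_on_node(nid);

        /* Node-local allocation failed: fall back to any node. */
        for (int n = 0; n < NR_NODES && !pa; n++)
                pa = alloc_on_node(n);
        return pa;
}

int main(void)
{
        printf("pg_data_t for node 1 at 0x%lx\n", alloc_try_nid(1));
        return 0;
}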

Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Lai Jiangshan 
Signed-off-by: Tang Chen 
Signed-off-by: Jiang Liu 
---
 arch/x86/mm/numa.c |5 ++---
 1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 11acdf6..4f754e6 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -214,10 +214,9 @@ static void __init setup_node_data(int nid, u64 start, u64 
end)
 * Allocate node data.  Try node-local memory and then any node.
 * Never allocate in DMA zone.
 */
-   nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+   nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
if (!nd_pa) {
-   pr_err("Cannot find %zu bytes in node %d\n",
-  nd_size, nid);
+   pr_err("Cannot find %zu bytes in any node\n", nd_size);
return;
}
nd = __va(nd_pa);
-- 
1.7.1



[RESEND PATCH part1 7/9] x86, mm, numa, acpi: Sanitize zone_movable_limit[].

2013-03-21 Thread Tang Chen
As mentioned by Liu Jiang and Wu Jianguo, users could specify DMA,
DMA32, and HIGHMEM as movable. In order to ensure the kernel will
work correctly, we should exclude these memory ranges from
zone_movable_limit[].

NOTE: Call find_usable_zone_for_movable() to initialize movable_zone
      so that sanitize_zone_movable_limit() can use it. This was
      pointed out by Wu Jianguo .
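
The sanitization is a sequence of lower-bound clamps; a toy version with
invented zone boundaries (the real code uses arch_zone_*_possible_pfn[] and
is conditional on CONFIG_ZONE_DMA, CONFIG_ZONE_DMA32 and CONFIG_HIGHMEM):

#include <stdio.h>

int main(void)
{
        /* Invented boundaries: DMA ends at pfn 0x1000, DMA32 at 0x100000. */
        unsigned long dma_end = 0x1000, dma32_end = 0x100000;
        unsigned long limit = 0x8000;   /* a zone_movable_limit[nid] value */

        /* Push the movable limit above the unmovable low zones. */
        if (limit < dma_end)
                limit = dma_end;
        if (limit < dma32_end)
                limit = dma32_end;

        printf("sanitized zone_movable_limit = 0x%lx\n", limit);
        return 0;
}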

Reported-by: Wu Jianguo 
Signed-off-by: Tang Chen 
Signed-off-by: Liu Jiang 
Reviewed-by: Wen Congyang 
Reviewed-by: Lai Jiangshan 
Tested-by: Lin Feng 
---
 mm/page_alloc.c |   55 ++-
 1 files changed, 54 insertions(+), 1 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 31d27af..70ed381 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4412,6 +4412,58 @@ static unsigned long __meminit 
zone_absent_pages_in_node(int nid,
return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
 }
 
+/**
+ * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array.
+ *
+ * zone_movable_limit[] has been initialized when parsing SRAT or
+ * movablemem_map. This function will try to exclude ZONE_DMA, ZONE_DMA32,
+ * and HIGHMEM from zone_movable_limit[].
+ *
+ * zone_movable_limit[nid] == 0 means no limit for the node.
+ *
+ * Note: Need to be called with movable_zone initialized.
+ */
+static void __meminit sanitize_zone_movable_limit(void)
+{
+   int i, nid;
+   unsigned long start_pfn, end_pfn;
+
+   if (!movablemem_map.nr_map)
+   return;
+
+   /* Iterate each node id. */
+   for_each_node(nid) {
+   /* If we have no limit for this node, just skip it. */
+   if (!zone_movable_limit[nid])
+   continue;
+
+#ifdef CONFIG_ZONE_DMA
+   /* Skip DMA memory. */
+   if (zone_movable_limit[nid] <
+   arch_zone_highest_possible_pfn[ZONE_DMA])
+   zone_movable_limit[nid] =
+   arch_zone_highest_possible_pfn[ZONE_DMA];
+#endif
+
+#ifdef CONFIG_ZONE_DMA32
+   /* Skip DMA32 memory. */
+   if (zone_movable_limit[nid] <
+   arch_zone_highest_possible_pfn[ZONE_DMA32])
+   zone_movable_limit[nid] =
+   arch_zone_highest_possible_pfn[ZONE_DMA32];
+#endif
+
+#ifdef CONFIG_HIGHMEM
+   /* Skip lowmem if ZONE_MOVABLE is highmem. */
+   if (zone_movable_is_highmem() &&
+   zone_movable_limit[nid] <
+   arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])
+   zone_movable_limit[nid] =
+   arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
+#endif
+   }
+}
+
 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
@@ -4826,7 +4878,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)
goto out;
 
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
-   find_usable_zone_for_movable();
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
 
 restart:
@@ -4985,6 +5036,8 @@ void __init free_area_init_nodes(unsigned long 
*max_zone_pfn)
 
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+   find_usable_zone_for_movable();
+   sanitize_zone_movable_limit();
find_zone_movable_pfns_for_nodes();
 
/* Print out the zone ranges */
-- 
1.7.1



Re: [PATCH v1 part1 0/9] Introduce movablemem_map boot option.

2013-03-18 Thread Tang Chen

Hi Will,

On 03/17/2013 08:25 AM, Will Huck wrote:

> http://marc.info/?l=linux-mm&m=136014458829566&w=2
>
> It seems that Mel doesn't like this idea.

Thank you for reminding me of this.

And yes, I have read that email. :)

And about this boot option, we have had a long discussion before.
Please refer to: https://lkml.org/lkml/2012/11/29/190

The situation is:

For now, the Linux kernel cannot migrate kernel direct-mapped memory, and
there is no way to ensure that ZONE_NORMAL has no kernel memory. So we
can only use ZONE_MOVABLE to ensure that the memory device can be removed.

For now, I have the following reasons why the movablemem_map boot option
is necessary. Some may have been mentioned before, but here, I think I
need to say them again:

1) If we want to hot-remove a memory device, the device should only have
   memory of two types:
   - kernel memory whose life cycle is the same as the memory device's,
     such as pagetables and vmemmap.
   - user memory that could be migrated.

   For type 1: we can allocate it on the local node, just like Yinghai's
  work, and free it when hot-removing.
   For type 2: we can migrate it at run time. But it must be in ZONE_MOVABLE
  because we cannot ensure ZONE_NORMAL has no kernel memory.

   So we need a way to limit hotpluggable memory in ZONE_MOVABLE.

2) We have the following ways to do it:
   a) use SRAT, which I have already implemented
   b) specify physical address ranges, which I have implemented too, but
  obviously very few guys like it.
   c) specify node id. But nid could be changed on some platforms by
firmware.


   Because of c), we chose to use physical address ranges. To satisfy all
   users, I also implemented a).

3) Even if we don't specify physical addresses on the command line and use
   SRAT instead, we still need the logic in this patch-set to achieve the
   same goal.

4) Since setting a whole node as movable will degrade NUMA performance,
   no matter which way we use, we always need an interface to enable or
   disable this functionality.
   The boot option itself is an interface. If users don't specify it on the
   command line, the kernel will work as before.

So I do want to try again to push this boot option.  :)

With this boot option, memory hotplug will work now.


It's true that we could reimplement the whole mm in Linux to make kernel
memory migratable, but we would need to handle a lot of problems. I agree
with Mel.

But it is a long way off in the future.

And the work in the near future:
1) Allocate pagetables and vmemmap on the local node, as Yinghai said.
2) Make the proper modifications for hot-add and hot-remove.
   - Reserve memory for pagetables and vmemmap at hot-add time, maybe
     using memblock.
   - Free all pagetables and vmemmap before hot-remove.
3) As for Mel's advice, modifying memory management in Linux to migrate
   kernel pages is a long way off. I think we can discuss it more.

Thanks. :)



