[PATCH v3 3/3] acpi,memory-hotplug : add memory offline code to acpi_memory_device_remove()

2012-10-26 Thread wency
From: Yasuaki Ishimatsu 

The memory device can be removed by 2 ways:
1. send eject request by SCI
2. echo 1 >/sys/bus/acpi/devices/PNP0C80:XX/eject

In the 1st case, acpi_memory_disable_device() will be called.
In the 2nd case, acpi_memory_device_remove() will be called.
acpi_memory_device_remove() will also be called when we unbind the
memory device from the driver acpi_memhotplug or a driver initialization
fails.

acpi_memory_disable_device() already implements the code which
offlines memory and releases the acpi_memory_info struct. But
acpi_memory_device_remove() does not implement it yet.

So this patch moves the code that offlines memory and releases the
acpi_memory_info struct into a new function, acpi_memory_remove_memory(),
which is used by both acpi_memory_device_remove() and
acpi_memory_disable_device().

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/acpi_memhotplug.c | 31 ---
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 666dac6..92c973a 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -316,16 +316,11 @@ static int acpi_memory_powerdown_device(struct 
acpi_memory_device *mem_device)
return 0;
 }
 
-static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
+static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 {
int result;
struct acpi_memory_info *info, *n;
 
-
-   /*
-* Ask the VM to offline this memory range.
-* Note: Assume that this function returns zero on success
-*/
mutex_lock(&mem_device->list_lock);
list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
if (info->enabled) {
@@ -333,10 +328,27 @@ static int acpi_memory_disable_device(struct 
acpi_memory_device *mem_device)
if (result)
return result;
}
+
+   list_del(&info->list);
kfree(info);
}
mutex_unlock(&mem_device->list_lock);
 
+   return 0;
+}
+
+static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
+{
+   int result;
+
+   /*
+* Ask the VM to offline this memory range.
+* Note: Assume that this function returns zero on success
+*/
+   result = acpi_memory_remove_memory(mem_device);
+   if (result)
+   return result;
+
/* Power-off and eject the device */
result = acpi_memory_powerdown_device(mem_device);
if (result) {
@@ -487,12 +499,17 @@ static int acpi_memory_device_add(struct acpi_device 
*device)
 static int acpi_memory_device_remove(struct acpi_device *device, int type)
 {
struct acpi_memory_device *mem_device = NULL;
-
+   int result;
 
if (!device || !acpi_driver_data(device))
return -EINVAL;
 
mem_device = acpi_driver_data(device);
+
+   result = acpi_memory_remove_memory(mem_device);
+   if (result)
+   return result;
+
kfree(mem_device);
 
return 0;
-- 
1.8.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 1/3] acpi,memory-hotplug: call acpi_bus_trim() to remove memory device

2012-10-26 Thread wency
From: Wen Congyang 

The memory device has been ejected and powered off, so we can call
acpi_bus_trim() to remove the memory device from the acpi bus.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/acpi_memhotplug.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 24c807f..1e90e8f 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -401,8 +401,9 @@ static void acpi_memory_device_notify(acpi_handle handle, 
u32 event, void *data)
}
 
/*
-* TBD: Invoke acpi_bus_remove to cleanup data structures
+* Invoke acpi_bus_trim() to remove memory device
 */
+   acpi_bus_trim(device, 1);
 
/* _EJ0 succeeded; _OST is not necessary */
return;
-- 
1.8.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 2/3] acpi,memory-hotplug: introduce a mutex lock to protect the list in acpi_memory_device

2012-10-26 Thread wency
From: Wen Congyang 

The memory device can be removed by 2 ways:
1. send eject request by SCI
2. echo 1 >/sys/bus/acpi/devices/PNP0C80:XX/eject

These 2 events may happen at the same time, so we may touch
acpi_memory_device.res_list concurrently. This patch
introduces a mutex to protect this list.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/acpi_memhotplug.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 1e90e8f..666dac6 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -83,7 +83,8 @@ struct acpi_memory_info {
 struct acpi_memory_device {
struct acpi_device * device;
unsigned int state; /* State of the memory device */
-   struct list_head res_list;
+   struct mutex list_lock;
+   struct list_head res_list;  /* protected by list_lock */
 };
 
 static int acpi_hotmem_initialized;
@@ -101,19 +102,23 @@ acpi_memory_get_resource(struct acpi_resource *resource, 
void *context)
(address64.resource_type != ACPI_MEMORY_RANGE))
return AE_OK;
 
+   mutex_lock(&mem_device->list_lock);
list_for_each_entry(info, &mem_device->res_list, list) {
/* Can we combine the resource range information? */
if ((info->caching == address64.info.mem.caching) &&
(info->write_protect == address64.info.mem.write_protect) &&
(info->start_addr + info->length == address64.minimum)) {
info->length += address64.address_length;
+   mutex_unlock(&mem_device->list_lock);
return AE_OK;
}
}
 
new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL);
-   if (!new)
+   if (!new) {
+   mutex_unlock(&mem_device->list_lock);
return AE_ERROR;
+   }
 
INIT_LIST_HEAD(&new->list);
new->caching = address64.info.mem.caching;
@@ -121,6 +126,7 @@ acpi_memory_get_resource(struct acpi_resource *resource, 
void *context)
new->start_addr = address64.minimum;
new->length = address64.address_length;
list_add_tail(&new->list, &mem_device->res_list);
+   mutex_unlock(&mem_device->list_lock);
 
return AE_OK;
 }
@@ -138,9 +144,11 @@ acpi_memory_get_device_resources(struct acpi_memory_device 
*mem_device)
status = acpi_walk_resources(mem_device->device->handle, 
METHOD_NAME__CRS,
 acpi_memory_get_resource, mem_device);
if (ACPI_FAILURE(status)) {
+   mutex_lock(&mem_device->list_lock);
list_for_each_entry_safe(info, n, &mem_device->res_list, list)
kfree(info);
INIT_LIST_HEAD(&mem_device->res_list);
+   mutex_unlock(&mem_device->list_lock);
return -EINVAL;
}
 
@@ -236,6 +244,7 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
 * We don't have memory-hot-add rollback function,now.
 * (i.e. memory-hot-remove function)
 */
+   mutex_lock(&mem_device->list_lock);
list_for_each_entry(info, &mem_device->res_list, list) {
if (info->enabled) { /* just sanity check...*/
num_enabled++;
@@ -256,6 +265,7 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
info->enabled = 1;
num_enabled++;
}
+   mutex_unlock(&mem_device->list_lock);
if (!num_enabled) {
printk(KERN_ERR PREFIX "add_memory failed\n");
mem_device->state = MEMORY_INVALID_STATE;
@@ -316,6 +326,7 @@ static int acpi_memory_disable_device(struct 
acpi_memory_device *mem_device)
 * Ask the VM to offline this memory range.
 * Note: Assume that this function returns zero on success
 */
+   mutex_lock(&mem_device->list_lock);
list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
if (info->enabled) {
result = remove_memory(info->start_addr, info->length);
@@ -324,6 +335,7 @@ static int acpi_memory_disable_device(struct 
acpi_memory_device *mem_device)
}
kfree(info);
}
+   mutex_unlock(&mem_device->list_lock);
 
/* Power-off and eject the device */
result = acpi_memory_powerdown_device(mem_device);
@@ -438,6 +450,7 @@ static int acpi_memory_device_add(struct acpi_device 
*device)
mem_device->device = device;
sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME);
sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS);
+   mutex_init(&mem_dev

[PATCH v3 0/3] acpi,memory-hotplug : implement framework for hot removing memory

2012-10-26 Thread wency
From: Wen Congyang 

The patch-set implements a framework for hot removing memory.

The memory device can be removed by 2 ways:
1. send eject request by SCI
2. echo 1 >/sys/bus/acpi/devices/PNP0C80:XX/eject

In the 1st case, acpi_memory_disable_device() will be called.
In the 2nd case, acpi_memory_device_remove() will be called.
acpi_memory_device_remove() will also be called when we unbind the
memory device from the driver acpi_memhotplug or a driver initialization
fails.

acpi_memory_disable_device() already implements the code which
offlines memory and releases the acpi_memory_info struct. But
acpi_memory_device_remove() does not implement it yet.

So the patch prepares the framework for hot removing memory and
adds the framework into acpi_memory_device_remove().

The last version of this patchset is here:
https://lkml.org/lkml/2012/10/19/156

Changelogs from v2 to v3:
  Patch2: rename lock to list_lock

Changelogs from v1 to v2:
  Patch1: use acpi_bus_trim() instead of acpi_bus_remove()
  Patch2: new patch, introduce a lock to protect the list
  Patch3: remove memory too when type is ACPI_BUS_REMOVAL_NORMAL
  Note: I don't send [Patch2-4 v1] in this series because there
  are no logical changes in those 3 patches.

Wen Congyang (2):
  acpi,memory-hotplug: call acpi_bus_trim() to remove memory device
  acpi,memory-hotplug: introduce a mutex lock to protect the list in
acpi_memory_device

Yasuaki Ishimatsu (1):
  acpi,memory-hotplug : add memory offline code to
acpi_memory_device_remove()

 drivers/acpi/acpi_memhotplug.c | 51 +-
 1 file changed, 41 insertions(+), 10 deletions(-)

-- 
1.8.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 01/12] memory-hotplug: try to offline the memory twice to avoid dependence

2012-10-23 Thread wency
From: Wen Congyang 

memory can't be offlined when CONFIG_MEMCG is selected.
For example: there is a memory device on node 1. The address range
is [1G, 1.5G). You will find 4 new directories memory8, memory9, memory10,
and memory11 under the directory /sys/devices/system/memory/.

If CONFIG_MEMCG is selected, we will allocate memory to store the page cgroup
when we online pages. When we online memory8, the memory that stores its page
cgroup is not provided by this memory device. But when we online memory9, the
memory that stores its page cgroup may be provided by memory8. So we can't
offline memory8 at that point. We should offline the memory in the reverse
order.

When the memory device is hot-removed, we will automatically offline the memory
provided by this memory device. But we don't know which memory was onlined
first, so offlining memory may fail. In that case, iterate twice to offline the
memory.
1st iteration: offline every non-primary memory block.
2nd iteration: offline the primary (i.e. first added) memory block.

This idea is suggested by KOSAKI Motohiro.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/memory_hotplug.c |   16 ++--
 1 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 56b758a..600e200 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1019,10 +1019,13 @@ int remove_memory(u64 start, u64 size)
unsigned long start_pfn, end_pfn;
unsigned long pfn, section_nr;
int ret;
+   int return_on_error = 0;
+   int retry = 0;
 
start_pfn = PFN_DOWN(start);
end_pfn = start_pfn + PFN_DOWN(size);
 
+repeat:
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
section_nr = pfn_to_section_nr(pfn);
if (!present_section_nr(section_nr))
@@ -1041,14 +1044,23 @@ int remove_memory(u64 start, u64 size)
 
ret = offline_memory_block(mem);
if (ret) {
-   kobject_put(&mem->dev.kobj);
-   return ret;
+   if (return_on_error) {
+   kobject_put(&mem->dev.kobj);
+   return ret;
+   } else {
+   retry = 1;
+   }
}
}
 
if (mem)
kobject_put(&mem->dev.kobj);
 
+   if (retry) {
+   return_on_error = 1;
+   goto repeat;
+   }
+
return 0;
 }
 #else
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 03/12] memory-hotplug: remove redundant codes

2012-10-23 Thread wency
From: Wen Congyang 

Offlining memory blocks and checking whether memory blocks are offlined
are very similar. This patch introduces a new function to remove the
redundant code.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/memory_hotplug.c |  101 ---
 1 files changed, 55 insertions(+), 46 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f4fdedd..80fc70c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1012,20 +1012,14 @@ int offline_pages(unsigned long start_pfn, unsigned 
long nr_pages)
return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
 }
 
-int remove_memory(u64 start, u64 size)
+static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+   void *arg, int (*func)(struct memory_block *, void *))
 {
struct memory_block *mem = NULL;
struct mem_section *section;
-   unsigned long start_pfn, end_pfn;
unsigned long pfn, section_nr;
int ret;
-   int return_on_error = 0;
-   int retry = 0;
-
-   start_pfn = PFN_DOWN(start);
-   end_pfn = start_pfn + PFN_DOWN(size);
 
-repeat:
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
section_nr = pfn_to_section_nr(pfn);
if (!present_section_nr(section_nr))
@@ -1042,22 +1036,61 @@ repeat:
if (!mem)
continue;
 
-   ret = offline_memory_block(mem);
+   ret = func(mem, arg);
if (ret) {
-   if (return_on_error) {
-   kobject_put(&mem->dev.kobj);
-   return ret;
-   } else {
-   retry = 1;
-   }
+   kobject_put(&mem->dev.kobj);
+   return ret;
}
}
 
if (mem)
kobject_put(&mem->dev.kobj);
 
-   if (retry) {
-   return_on_error = 1;
+   return 0;
+}
+
+static int offline_memory_block_cb(struct memory_block *mem, void *arg)
+{
+   int *ret = arg;
+   int error = offline_memory_block(mem);
+
+   if (error != 0 && *ret == 0)
+   *ret = error;
+
+   return 0;
+}
+
+static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
+{
+   int ret = !is_memblock_offlined(mem);
+
+   if (unlikely(ret))
+   pr_warn("removing memory fails, because memory "
+   "[%#010llx-%#010llx] is onlined\n",
+   PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
+   PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1);
+
+   return ret;
+}
+
+int remove_memory(u64 start, u64 size)
+{
+   unsigned long start_pfn, end_pfn;
+   int ret = 0;
+   int retry = 1;
+
+   start_pfn = PFN_DOWN(start);
+   end_pfn = start_pfn + PFN_DOWN(size);
+
+repeat:
+   walk_memory_range(start_pfn, end_pfn, &ret,
+ offline_memory_block_cb);
+   if (ret) {
+   if (!retry)
+   return ret;
+
+   retry = 0;
+   ret = 0;
goto repeat;
}
 
@@ -1075,37 +1108,13 @@ repeat:
 * memory blocks are offlined.
 */
 
-   for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
-   section_nr = pfn_to_section_nr(pfn);
-   if (!present_section_nr(section_nr))
-   continue;
-
-   section = __nr_to_section(section_nr);
-   /* same memblock? */
-   if (mem)
-   if ((section_nr >= mem->start_section_nr) &&
-   (section_nr <= mem->end_section_nr))
-   continue;
-
-   mem = find_memory_block_hinted(section, mem);
-   if (!mem)
-   continue;
-
-   ret = is_memblock_offlined(mem);
-   if (!ret) {
-   pr_warn("removing memory fails, because memory "
-   "[%#010llx-%#010llx] is onlined\n",
-   
PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
-   PFN_PHYS(section_nr_to_pfn(mem->end_section_nr 
+ 1)) - 1);
-
-   kobject_put(&mem->dev.kobj);
-   unlock_memory_hotplug();
-   return ret;
-   }
+   ret = walk_memory_range(start_pfn, end_pfn, NULL,
+   is_memblock_offlined_cb);
+   if (ret) {
+   unlock_memory_hotplug();
+   return ret;
}
 
-   if (mem)
-   kobject_put(&mem->dev.kobj);
unlo

[PATCH v2 06/12] memory-hotplug: unregister memory section on SPARSEMEM_VMEMMAP

2012-10-23 Thread wency
From: Yasuaki Ishimatsu 

Currently __remove_section() for SPARSEMEM_VMEMMAP does nothing. But even if
we use SPARSEMEM_VMEMMAP, we can unregister the memory_section.

So the patch adds unregister_memory_section() to __remove_section().

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 mm/memory_hotplug.c |   13 -
 1 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ca07433..66a79a7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -286,11 +286,14 @@ static int __meminit __add_section(int nid, struct zone 
*zone,
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 static int __remove_section(struct zone *zone, struct mem_section *ms)
 {
-   /*
-* XXX: Freeing memmap with vmemmap is not implement yet.
-*  This should be removed later.
-*/
-   return -EBUSY;
+   int ret = -EINVAL;
+
+   if (!valid_section(ms))
+   return ret;
+
+   ret = unregister_memory_section(ms);
+
+   return ret;
 }
 #else
 static int __remove_section(struct zone *zone, struct mem_section *ms)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 05/12] memory-hotplug: introduce new function arch_remove_memory() for removing page table depends on architecture

2012-10-23 Thread wency
From: Wen Congyang 

To remove memory, we need to remove its page table. But how to do that depends
on the architecture. So the patch introduces arch_remove_memory() for
removing the page table. For now it only calls __remove_pages().

Note: __remove_pages() is not implemented for some architectures
  (I don't know how to implement it for s390).

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 arch/ia64/mm/init.c|   18 ++
 arch/powerpc/mm/mem.c  |   12 
 arch/s390/mm/init.c|   12 
 arch/sh/mm/init.c  |   17 +
 arch/tile/mm/init.c|8 
 arch/x86/mm/init_32.c  |   12 
 arch/x86/mm/init_64.c  |   15 +++
 include/linux/memory_hotplug.h |1 +
 mm/memory_hotplug.c|2 ++
 9 files changed, 97 insertions(+), 0 deletions(-)

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index acd5b68..1d36ba2 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -690,6 +690,24 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
return ret;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   unsigned long start_pfn = start >> PAGE_SHIFT;
+   unsigned long nr_pages = size >> PAGE_SHIFT;
+   struct zone *zone;
+   int ret;
+
+   zone = page_zone(pfn_to_page(start_pfn));
+   ret = __remove_pages(zone, start_pfn, nr_pages);
+   if (ret)
+   pr_warn("%s: Problem encountered in __remove_pages() as"
+   " ret=%d\n", __func__,  ret);
+
+   return ret;
+}
+#endif
 #endif
 
 /*
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 0dba506..09c6451 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -133,6 +133,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
return __add_pages(nid, zone, start_pfn, nr_pages);
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   unsigned long start_pfn = start >> PAGE_SHIFT;
+   unsigned long nr_pages = size >> PAGE_SHIFT;
+   struct zone *zone;
+
+   zone = page_zone(pfn_to_page(start_pfn));
+   return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 /*
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 81e596c..b565190 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -257,4 +257,16 @@ int arch_add_memory(int nid, u64 start, u64 size)
vmem_remove_mapping(start, size);
return rc;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   /*
+* There is no hardware or firmware interface which could trigger a
+* hot memory remove on s390. So there is nothing that needs to be
+* implemented.
+*/
+   return -EBUSY;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 82cc576..1057940 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -558,4 +558,21 @@ int memory_add_physaddr_to_nid(u64 addr)
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   unsigned long start_pfn = start >> PAGE_SHIFT;
+   unsigned long nr_pages = size >> PAGE_SHIFT;
+   struct zone *zone;
+   int ret;
+
+   zone = page_zone(pfn_to_page(start_pfn));
+   ret = __remove_pages(zone, start_pfn, nr_pages);
+   if (unlikely(ret))
+   pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
+   ret);
+
+   return ret;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index ef29d6c..2749515 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -935,6 +935,14 @@ int remove_memory(u64 start, u64 size)
 {
return -EINVAL;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   /* TODO */
+   return -EBUSY;
+}
+#endif
 #endif
 
 struct kmem_cache *pgd_cache;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 11a5800..b19eba4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -839,6 +839,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
return __add_pages(nid, zone, start_pfn, nr_pages);
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   unsigned long start_pfn = start >> PAGE_SHIFT;
+   unsigned long nr_pages = size >> PAGE_SHIFT;
+   struct zone *zone;
+
+   zone = page_zone(pfn_to_page(start_pfn));
+   return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
 #endif
 
 /*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/

[PATCH v2 07/12] memory-hotplug: implement register_page_bootmem_info_section of sparse-vmemmap

2012-10-23 Thread wency
From: Yasuaki Ishimatsu 

To remove a memmap region of sparse-vmemmap which was allocated from bootmem,
the memmap region of sparse-vmemmap needs to be registered by
get_page_bootmem(). So the patch searches the pages of the virtual mapping and
registers the pages by get_page_bootmem().

Note: register_page_bootmem_memmap() is not implemented for ia64, ppc, s390,
and sparc.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 arch/ia64/mm/discontig.c   |6 
 arch/powerpc/mm/init_64.c  |6 
 arch/s390/mm/vmem.c|6 
 arch/sparc/mm/init_64.c|6 
 arch/x86/mm/init_64.c  |   52 
 include/linux/memory_hotplug.h |   11 +---
 include/linux/mm.h |3 +-
 mm/memory_hotplug.c|   37 +---
 8 files changed, 113 insertions(+), 14 deletions(-)

diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index c641333..33943db 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -822,4 +822,10 @@ int __meminit vmemmap_populate(struct page *start_page,
 {
return vmemmap_populate_basepages(start_page, size, node);
 }
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   /* TODO */
+}
 #endif
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 95a4529..6466440 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -297,5 +297,11 @@ int __meminit vmemmap_populate(struct page *start_page,
 
return 0;
 }
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   /* TODO */
+}
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 387c7c6..4f4803a 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -236,6 +236,12 @@ out:
return ret;
 }
 
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   /* TODO */
+}
+
 /*
  * Add memory segment to the segment list if it doesn't overlap with
  * an already present segment.
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 9e28a11..75a984b 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2231,6 +2231,12 @@ void __meminit vmemmap_populate_print_last(void)
node_start = 0;
}
 }
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   /* TODO */
+}
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 static void prot_init_common(unsigned long page_none,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 2309cf0..f1e2c21 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -993,6 +993,58 @@ vmemmap_populate(struct page *start_page, unsigned long 
size, int node)
return 0;
 }
 
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   unsigned long addr = (unsigned long)start_page;
+   unsigned long end = (unsigned long)(start_page + size);
+   unsigned long next;
+   pgd_t *pgd;
+   pud_t *pud;
+   pmd_t *pmd;
+
+   for (; addr < end; addr = next) {
+   pte_t *pte = NULL;
+
+   pgd = pgd_offset_k(addr);
+   if (pgd_none(*pgd)) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   continue;
+   }
+   get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
+
+   pud = pud_offset(pgd, addr);
+   if (pud_none(*pud)) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   continue;
+   }
+   get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
+
+   if (!cpu_has_pse) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   pmd = pmd_offset(pud, addr);
+   if (pmd_none(*pmd))
+   continue;
+   get_page_bootmem(section_nr, pmd_page(*pmd),
+MIX_SECTION_INFO);
+
+   pte = pte_offset_kernel(pmd, addr);
+   if (pte_none(*pte))
+   continue;
+   get_page_bootmem(section_nr, pte_page(*pte),
+SECTION_INFO);
+   } else {
+   next = pmd_addr_end(addr, end);
+
+   pmd = pmd_offset(pud, addr);
+

[PATCH v2 08/12] memory-hotplug: remove memmap of sparse-vmemmap

2012-10-23 Thread wency
From: Yasuaki Ishimatsu 

Not all pages of the virtual mapping for the removed memory can be freed, since
some pages used as PGD/PUD cover not only the removed memory but also other
memory. So the patch checks whether a page can be freed or not.

How to check whether a page can be freed or not?
 1. When removing memory, the page structs of the removed memory are filled
with 0xFD.
 2. When all page structs on a PT/PMD are filled with 0xFD, the PT/PMD can be
cleared. In this case, the page used as PT/PMD can be freed.

With this patch applied, __remove_section() is unified into a single
implementation, so the CONFIG_SPARSEMEM_VMEMMAP version of __remove_section()
is deleted.

Note:  vmemmap_kfree() and vmemmap_free_bootmem() are not implemented for ia64,
ppc, s390, and sparc.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 arch/ia64/mm/discontig.c  |8 +++
 arch/powerpc/mm/init_64.c |8 +++
 arch/s390/mm/vmem.c   |8 +++
 arch/sparc/mm/init_64.c   |8 +++
 arch/x86/mm/init_64.c |  119 +
 include/linux/mm.h|2 +
 mm/memory_hotplug.c   |   17 +--
 mm/sparse.c   |5 +-
 8 files changed, 158 insertions(+), 17 deletions(-)

diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 33943db..0d23b69 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -823,6 +823,14 @@ int __meminit vmemmap_populate(struct page *start_page,
return vmemmap_populate_basepages(start_page, size, node);
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 6466440..df7d155 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -298,6 +298,14 @@ int __meminit vmemmap_populate(struct page *start_page,
return 0;
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 4f4803a..ab69c34 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -236,6 +236,14 @@ out:
return ret;
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 75a984b..546855d 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2232,6 +2232,14 @@ void __meminit vmemmap_populate_print_last(void)
}
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f1e2c21..73b1b6a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -993,6 +993,125 @@ vmemmap_populate(struct page *start_page, unsigned long 
size, int node)
return 0;
 }
 
+#define PAGE_INUSE 0xFD
+
+unsigned long find_and_clear_pte_page(unsigned long addr, unsigned long end,
+   struct page **pp, int *page_size)
+{
+   pgd_t *pgd;
+   pud_t *pud;
+   pmd_t *pmd;
+   pte_t *pte = NULL;
+   void *page_addr;
+   unsigned long next;
+
+   *pp = NULL;
+
+   pgd = pgd_offset_k(addr);
+   if (pgd_none(*pgd))
+   return pgd_addr_end(addr, end);
+
+   pud = pud_offset(pgd, addr);
+   if (pud_none(*pud))
+   return pud_addr_end(addr, end);
+
+   if (!cpu_has_pse) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   pmd = pmd_offset(pud, addr);
+   if (pmd_none(*pmd))
+   return next;
+
+   pte = pte_offset_kernel(pmd, addr);
+   if (pte_none(*pte))
+   return next;
+
+   *page_size = PAGE_SIZE;
+   *pp = pte_page(*pte);
+   } else {
+   next = pmd_addr_end(addr, end);
+
+   pmd = pmd_offset(pud, addr);
+   if (pmd_none(*pmd))
+   return next;
+
+   *page_size = PMD_SIZE;
+   *

[PATCH v2 02/12] memory-hotplug: check whether all memory blocks are offlined or not when removing memory

2012-10-23 Thread wency
From: Yasuaki Ishimatsu 

We remove the memory like this:
1. lock memory hotplug
2. offline a memory block
3. unlock memory hotplug
4. repeat 1-3 to offline all memory blocks
5. lock memory hotplug
6. remove memory(TODO)
7. unlock memory hotplug

All memory blocks must be offlined before removing memory. But we don't hold
the lock during the whole operation. So we should check whether all memory
blocks are offlined before step 6. Otherwise, the kernel may panic.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 drivers/base/memory.c  |6 +
 include/linux/memory_hotplug.h |1 +
 mm/memory_hotplug.c|   47 
 3 files changed, 54 insertions(+), 0 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 86c8821..badb025 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -675,6 +675,12 @@ int offline_memory_block(struct memory_block *mem)
return ret;
 }
 
+/* return true if the memory block is offlined, otherwise, return false */
+bool is_memblock_offlined(struct memory_block *mem)
+{
+   return mem->state == MEM_OFFLINE;
+}
+
 /*
  * Initialize the sysfs support for memory devices...
  */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 95573ec..38675e9 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -236,6 +236,7 @@ extern int add_memory(int nid, u64 start, u64 size);
 extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern int offline_memory_block(struct memory_block *mem);
+extern bool is_memblock_offlined(struct memory_block *mem);
 extern int remove_memory(u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
int nr_pages);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 600e200..f4fdedd 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1061,6 +1061,53 @@ repeat:
goto repeat;
}
 
+   lock_memory_hotplug();
+
+   /*
+* we have offlined all memory blocks like this:
+*   1. lock memory hotplug
+*   2. offline a memory block
+*   3. unlock memory hotplug
+*
+* repeat step1-3 to offline the memory block. All memory blocks
+* must be offlined before removing memory. But we don't hold the
+* lock in the whole operation. So we should check whether all
+* memory blocks are offlined.
+*/
+
+   for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+   section_nr = pfn_to_section_nr(pfn);
+   if (!present_section_nr(section_nr))
+   continue;
+
+   section = __nr_to_section(section_nr);
+   /* same memblock? */
+   if (mem)
+   if ((section_nr >= mem->start_section_nr) &&
+   (section_nr <= mem->end_section_nr))
+   continue;
+
+   mem = find_memory_block_hinted(section, mem);
+   if (!mem)
+   continue;
+
+   ret = is_memblock_offlined(mem);
+   if (!ret) {
+   pr_warn("removing memory fails, because memory "
+   "[%#010llx-%#010llx] is onlined\n",
+   
PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
+   PFN_PHYS(section_nr_to_pfn(mem->end_section_nr 
+ 1)) - 1);
+
+   kobject_put(&mem->dev.kobj);
+   unlock_memory_hotplug();
+   return ret;
+   }
+   }
+
+   if (mem)
+   kobject_put(&mem->dev.kobj);
+   unlock_memory_hotplug();
+
return 0;
 }
 #else
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 11/12] memory-hotplug: remove sysfs file of node

2012-10-23 Thread wency
From: Wen Congyang 

This patch introduces a new function, try_offline_node(), which
removes the sysfs files of a node once all memory sections of that
node have been removed. If some memory sections of the node have
not been removed, the function does nothing.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/acpi_memhotplug.c |8 +-
 include/linux/memory_hotplug.h |2 +-
 mm/memory_hotplug.c|   58 ++-
 3 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 24c807f..0780f99 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -310,7 +310,9 @@ static int acpi_memory_disable_device(struct 
acpi_memory_device *mem_device)
 {
int result;
struct acpi_memory_info *info, *n;
+   int node;
 
+   node = acpi_get_node(mem_device->device->handle);
 
/*
 * Ask the VM to offline this memory range.
@@ -318,7 +320,11 @@ static int acpi_memory_disable_device(struct 
acpi_memory_device *mem_device)
 */
list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
if (info->enabled) {
-   result = remove_memory(info->start_addr, info->length);
+   if (node < 0)
+   node = memory_add_physaddr_to_nid(
+   info->start_addr);
+   result = remove_memory(node, info->start_addr,
+   info->length);
if (result)
return result;
}
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index d4c4402..7b4cfe6 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -231,7 +231,7 @@ extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern int offline_memory_block(struct memory_block *mem);
 extern bool is_memblock_offlined(struct memory_block *mem);
-extern int remove_memory(u64 start, u64 size);
+extern int remove_memory(int node, u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
int nr_pages);
 extern void sparse_remove_one_section(struct zone *zone, struct mem_section 
*ms);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 55a228d..b1fe41d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -1299,7 +1300,58 @@ static int is_memblock_offlined_cb(struct memory_block 
*mem, void *arg)
return ret;
 }
 
-int __ref remove_memory(u64 start, u64 size)
+static int check_cpu_on_node(void *data)
+{
+   struct pglist_data *pgdat = data;
+   int cpu;
+
+   for_each_present_cpu(cpu) {
+   if (cpu_to_node(cpu) == pgdat->node_id)
+   /*
+* the cpu on this node isn't removed, and we can't
+* offline this node.
+*/
+   return -EBUSY;
+   }
+
+   return 0;
+}
+
+/* offline the node if all memory sections of this node are removed */
+static void try_offline_node(int nid)
+{
+   unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn;
+   unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_spanned_pages;
+   unsigned long pfn;
+
+   for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+   unsigned long section_nr = pfn_to_section_nr(pfn);
+
+   if (!present_section_nr(section_nr))
+   continue;
+
+   if (pfn_to_nid(pfn) != nid)
+   continue;
+
+   /*
+* some memory sections of this node are not removed, and we
+* can't offline node now.
+*/
+   return;
+   }
+
+   if (stop_machine(check_cpu_on_node, NODE_DATA(nid), NULL))
+   return;
+
+   /*
+* all memory/cpu of this node are removed, we can offline this
+* node now.
+*/
+   node_set_offline(nid);
+   unregister_one_node(nid);
+}
+
+int __ref remove_memory(int nid, u64 start, u64 size)
 {
unsigned long start_pfn, end_pfn;
int ret = 0;
@@ -1346,6 +1398,8 @@ repeat:
 
arch_remove_memory(start, size);
 
+   try_offline_node(nid);
+
unlock_memory_hotplug();
 
return 0;
@@ -1355,7 +1409,7 @@ int offline_pages(unsigned long start_pfn, unsigned long 
nr_pages)
 {
return -EINVAL;
 }
-int remove_memory(u64 start, u64 size)
+int remove_memory(int nid, u64 

[PATCH v2 09/12] memory-hotplug: remove page table of x86_64 architecture

2012-10-23 Thread wency
From: Wen Congyang 

When hot-removing memory, we should also remove the page tables that map
that memory. So the patch searches for the page tables covering the removed
memory and clears them.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
Signed-off-by: Jianguo Wu 
Signed-off-by: Jiang Liu 
---
 arch/x86/include/asm/pgtable_types.h |1 +
 arch/x86/mm/init_64.c|  223 ++
 arch/x86/mm/pageattr.c   |   47 
 3 files changed, 249 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h
index ec8a1fc..fb0c24d 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -332,6 +332,7 @@ static inline void update_page_count(int level, unsigned 
long pages) { }
  * as a pte too.
  */
 extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t 
*pbase);
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 73b1b6a..4809a9f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -675,6 +675,227 @@ int arch_add_memory(int nid, u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);
 
+static inline void free_pagetable(struct page *page)
+{
+   struct zone *zone;
+   bool bootmem = false;
+
+   /* bootmem page has reserved flag */
+   if (PageReserved(page)) {
+   __ClearPageReserved(page);
+   bootmem = true;
+   }
+
+   __free_page(page);
+
+   if (bootmem) {
+   zone = page_zone(page);
+   zone_span_writelock(zone);
+   zone->present_pages++;
+   zone_span_writeunlock(zone);
+   totalram_pages++;
+   }
+}
+
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+   pte_t *pte;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PTE; i++) {
+   pte = pte_start + i;
+   if (pte_val(*pte))
+   return;
+   }
+
+   /* free a pte talbe */
+   free_pagetable(pmd_page(*pmd));
+   pmd_clear(pmd);
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+   pmd_t *pmd;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PMD; i++) {
+   pmd = pmd_start + i;
+   if (pmd_val(*pmd))
+   return;
+   }
+
+   /* free a pmd talbe */
+   free_pagetable(pud_page(*pud));
+   pud_clear(pud);
+}
+
+static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+   pud_t *pud;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PUD; i++) {
+   pud = pud_start + i;
+   if (pud_val(*pud))
+   return;
+   }
+
+   /* free a pud table */
+   free_pagetable(pgd_page(*pgd));
+   pgd_clear(pgd);
+}
+
+static void __meminit
+phys_pte_remove(pte_t *pte_page, unsigned long addr, unsigned long end)
+{
+   unsigned pages = 0;
+   int i = pte_index(addr);
+
+   pte_t *pte = pte_page + pte_index(addr);
+
+   for (; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
+
+   if (addr >= end)
+   break;
+
+   if (!pte_present(*pte))
+   continue;
+
+   pages++;
+   set_pte(pte, __pte(0));
+   }
+
+   update_page_count(PG_LEVEL_4K, -pages);
+}
+
+static void __meminit
+phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
+{
+   unsigned long pages = 0, next;
+   int i = pmd_index(addr);
+
+   for (; i < PTRS_PER_PMD && addr < end; i++, addr = next) {
+   unsigned long pte_phys;
+   pmd_t *pmd = pmd_page + pmd_index(addr);
+   pte_t *pte;
+
+   next = pmd_addr_end(addr, end);
+
+   if (!pmd_present(*pmd))
+   continue;
+
+   if (pmd_large(*pmd)) {
+   if (IS_ALIGNED(addr, PMD_SIZE) &&
+   IS_ALIGNED(next, PMD_SIZE)) {
+   set_pmd(pmd, __pmd(0));
+   pages++;
+   continue;
+   }
+
+   /*
+* We use 2M page, but we need to remove part of them,
+* so split 2M page to 4K page.
+*/
+   pte = alloc_low_page(&pte_phys);
+   BUG_ON(!pte);
+   __split_large_page((pte_t *)pmd,
+  (unsigned long)__va(addr), pte);
+
+   spin_lock(&init_mm.page_table_lock);
+   pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+   s

[PATCH v2 12/12] memory-hotplug: free node_data when a node is offlined

2012-10-23 Thread wency
From: Wen Congyang 

We call hotadd_new_pgdat() to allocate memory to store node_data. So we
should free it when removing a node.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/memory_hotplug.c |   20 +++-
 1 files changed, 19 insertions(+), 1 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b1fe41d..6b4cd53 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1320,9 +1320,12 @@ static int check_cpu_on_node(void *data)
 /* offline the node if all memory sections of this node are removed */
 static void try_offline_node(int nid)
 {
+   pg_data_t *pgdat = NODE_DATA(nid);
unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn;
-   unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_spanned_pages;
+   unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
unsigned long pfn;
+   struct page *pgdat_page = virt_to_page(pgdat);
+   int i;
 
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
unsigned long section_nr = pfn_to_section_nr(pfn);
@@ -1349,6 +1352,21 @@ static void try_offline_node(int nid)
 */
node_set_offline(nid);
unregister_one_node(nid);
+
+   if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
+   /* node data is allocated from boot memory */
+   return;
+
+   /* free waittable in each zone */
+   for (i = 0; i < MAX_NR_ZONES; i++) {
+   struct zone *zone = pgdat->node_zones + i;
+
+   if (zone->wait_table)
+   vfree(zone->wait_table);
+   }
+
+   arch_refresh_nodedata(nid, NULL);
+   arch_free_nodedata(pgdat);
 }
 
 int __ref remove_memory(int nid, u64 start, u64 size)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 04/12] memory-hotplug: remove /sys/firmware/memmap/X sysfs

2012-10-23 Thread wency
From: Yasuaki Ishimatsu 

When (hot)adding memory into system, /sys/firmware/memmap/X/{end, start, type}
sysfs files are created. But there is no code to remove these files. The patch
implements the function to remove them.

Note: The code does not free a firmware_map_entry that was allocated from
  bootmem, so the patch introduces a memory leak. But I think the amount of
  leaked memory is very small, and it does not affect the system.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 drivers/firmware/memmap.c|   98 +-
 include/linux/firmware-map.h |6 +++
 mm/memory_hotplug.c  |5 ++-
 3 files changed, 106 insertions(+), 3 deletions(-)

diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index 90723e6..49be12a 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Data types 
--
@@ -41,6 +42,7 @@ struct firmware_map_entry {
const char  *type;  /* type of the memory range */
struct list_headlist;   /* entry for the linked list */
struct kobject  kobj;   /* kobject for each entry */
+   unsigned intbootmem:1; /* allocated from bootmem */
 };
 
 /*
@@ -79,7 +81,26 @@ static const struct sysfs_ops memmap_attr_ops = {
.show = memmap_attr_show,
 };
 
+
+static inline struct firmware_map_entry *
+to_memmap_entry(struct kobject *kobj)
+{
+   return container_of(kobj, struct firmware_map_entry, kobj);
+}
+
+static void release_firmware_map_entry(struct kobject *kobj)
+{
+   struct firmware_map_entry *entry = to_memmap_entry(kobj);
+
+   if (entry->bootmem)
+   /* There is no way to free memory allocated from bootmem */
+   return;
+
+   kfree(entry);
+}
+
 static struct kobj_type memmap_ktype = {
+   .release= release_firmware_map_entry,
.sysfs_ops  = &memmap_attr_ops,
.default_attrs  = def_attrs,
 };
@@ -94,6 +115,7 @@ static struct kobj_type memmap_ktype = {
  * in firmware initialisation code in one single thread of execution.
  */
 static LIST_HEAD(map_entries);
+static DEFINE_SPINLOCK(map_entries_lock);
 
 /**
  * firmware_map_add_entry() - Does the real work to add a firmware memmap 
entry.
@@ -118,11 +140,25 @@ static int firmware_map_add_entry(u64 start, u64 end,
INIT_LIST_HEAD(&entry->list);
kobject_init(&entry->kobj, &memmap_ktype);
 
+   spin_lock(&map_entries_lock);
list_add_tail(&entry->list, &map_entries);
+   spin_unlock(&map_entries_lock);
 
return 0;
 }
 
+/**
+ * firmware_map_remove_entry() - Does the real work to remove a firmware
+ * memmap entry.
+ * @entry: removed entry.
+ **/
+static inline void firmware_map_remove_entry(struct firmware_map_entry *entry)
+{
+   spin_lock(&map_entries_lock);
+   list_del(&entry->list);
+   spin_unlock(&map_entries_lock);
+}
+
 /*
  * Add memmap entry on sysfs
  */
@@ -144,6 +180,35 @@ static int add_sysfs_fw_map_entry(struct 
firmware_map_entry *entry)
return 0;
 }
 
+/*
+ * Remove memmap entry on sysfs
+ */
+static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry)
+{
+   kobject_put(&entry->kobj);
+}
+
+/*
+ * Search memmap entry
+ */
+
+static struct firmware_map_entry * __meminit
+firmware_map_find_entry(u64 start, u64 end, const char *type)
+{
+   struct firmware_map_entry *entry;
+
+   spin_lock(&map_entries_lock);
+   list_for_each_entry(entry, &map_entries, list)
+   if ((entry->start == start) && (entry->end == end) &&
+   (!strcmp(entry->type, type))) {
+   spin_unlock(&map_entries_lock);
+   return entry;
+   }
+
+   spin_unlock(&map_entries_lock);
+   return NULL;
+}
+
 /**
  * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do
  * memory hotplug.
@@ -193,9 +258,36 @@ int __init firmware_map_add_early(u64 start, u64 end, 
const char *type)
if (WARN_ON(!entry))
return -ENOMEM;
 
+   entry->bootmem = 1;
return firmware_map_add_entry(start, end, type, entry);
 }
 
+/**
+ * firmware_map_remove() - remove a firmware mapping entry
+ * @start: Start of the memory range.
+ * @end:   End of the memory range.
+ * @type:  Type of the memory range.
+ *
+ * removes a firmware mapping entry.
+ *
+ * Returns 0 on success, or -EINVAL if no entry.
+ **/
+int __meminit firmware_map_remove(u64 start, u64 end, const char *type)
+{
+   struct firmware_map_entry *entry;
+
+   entry = firmware_map_find_entry(start, end - 1, type);
+   if (!entry)
+   return -EINVAL;
+
+   firmware_map_remove_entry(entry

[PATCH v2 10/12] memory-hotplug: memory_hotplug: clear zone when removing the memory

2012-10-23 Thread wency
From: Yasuaki Ishimatsu 

When memory is added, we update the zone's and pgdat's start_pfn and
spanned_pages in the function __add_zone(). So we should revert them
when the memory is removed.

The patch adds a new function __remove_zone() to do this.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/memory_hotplug.c |  207 +++
 1 files changed, 207 insertions(+), 0 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 03153cf..55a228d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -312,10 +312,213 @@ static int __meminit __add_section(int nid, struct zone 
*zone,
return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
 }
 
+/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
+static int find_smallest_section_pfn(int nid, struct zone *zone,
+unsigned long start_pfn,
+unsigned long end_pfn)
+{
+   struct mem_section *ms;
+
+   for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
+   ms = __pfn_to_section(start_pfn);
+
+   if (unlikely(!valid_section(ms)))
+   continue;
+
+   if (unlikely(pfn_to_nid(start_pfn)) != nid)
+   continue;
+
+   if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+   continue;
+
+   return start_pfn;
+   }
+
+   return 0;
+}
+
+/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
+static int find_biggest_section_pfn(int nid, struct zone *zone,
+   unsigned long start_pfn,
+   unsigned long end_pfn)
+{
+   struct mem_section *ms;
+   unsigned long pfn;
+
+   /* pfn is the end pfn of a memory section. */
+   pfn = end_pfn - 1;
+   for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
+   ms = __pfn_to_section(pfn);
+
+   if (unlikely(!valid_section(ms)))
+   continue;
+
+   if (unlikely(pfn_to_nid(pfn)) != nid)
+   continue;
+
+   if (zone && zone != page_zone(pfn_to_page(pfn)))
+   continue;
+
+   return pfn;
+   }
+
+   return 0;
+}
+
+static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
+unsigned long end_pfn)
+{
+   unsigned long zone_start_pfn =  zone->zone_start_pfn;
+   unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+   unsigned long pfn;
+   struct mem_section *ms;
+   int nid = zone_to_nid(zone);
+
+   zone_span_writelock(zone);
+   if (zone_start_pfn == start_pfn) {
+   /*
+* If the section is smallest section in the zone, it need
+* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
+* In this case, we find second smallest valid mem_section
+* for shrinking zone.
+*/
+   pfn = find_smallest_section_pfn(nid, zone, end_pfn,
+   zone_end_pfn);
+   if (pfn) {
+   zone->zone_start_pfn = pfn;
+   zone->spanned_pages = zone_end_pfn - pfn;
+   }
+   } else if (zone_end_pfn == end_pfn) {
+   /*
+* If the section is biggest section in the zone, it need
+* shrink zone->spanned_pages.
+* In this case, we find second biggest valid mem_section for
+* shrinking zone.
+*/
+   pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+  start_pfn);
+   if (pfn)
+   zone->spanned_pages = pfn - zone_start_pfn + 1;
+   }
+
+   /*
+* The section is not biggest or smallest mem_section in the zone, it
+* only creates a hole in the zone. So in this case, we need not
+* change the zone. But perhaps, the zone has only hole data. Thus
+* it check the zone has only hole or not.
+*/
+   pfn = zone_start_pfn;
+   for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
+   ms = __pfn_to_section(pfn);
+
+   if (unlikely(!valid_section(ms)))
+   continue;
+
+   if (page_zone(pfn_to_page(pfn)) != zone)
+   continue;
+
+/* If the section is current section, it continues the loop */
+   if (start_pfn == pfn)
+   continue;
+
+   /* If we find valid section, we have nothing to do */
+   zone_span_writeunlock(zone);

[PATCH v2 00/12] memory-hotplug: hot-remove physical memory

2012-10-23 Thread wency
From: Wen Congyang 

The patch-set was divided from following thread's patch-set.

https://lkml.org/lkml/2012/9/5/201

The last version of this patchset:
https://lkml.org/lkml/2012/10/5/469

If you want to know the reason, please read following thread.

https://lkml.org/lkml/2012/10/2/83

The patch-set has only the function of kernel core side for physical
memory hot remove. So if you use the patch, please apply following
patches.

- bug fix for memory hot remove
  https://lkml.org/lkml/2012/10/19/56
  
- acpi framework
  https://lkml.org/lkml/2012/10/19/156

The patches can free/remove the following things:

  - /sys/firmware/memmap/X/{end, start, type} : [PATCH 2/10]
  - mem_section and related sysfs files   : [PATCH 3-4/10]
  - memmap of sparse-vmemmap  : [PATCH 5-7/10]
  - page table of removed memory  : [RFC PATCH 8/10]
  - node and related sysfs files  : [RFC PATCH 9-10/10]

* [PATCH 2/10] checks whether the memory can be removed or not.

If you find lack of function for physical memory hot-remove, please let me
know.

How to test this patchset?
1. apply this patchset and build the kernel. MEMORY_HOTPLUG, MEMORY_HOTREMOVE,
   ACPI_HOTPLUG_MEMORY must be selected.
2. load the module acpi_memhotplug
3. hotplug the memory device(it depends on your hardware)
   You will see the memory device under the directory /sys/bus/acpi/devices/.
   Its name is PNP0C80:XX.
4. online/offline pages provided by this memory device
   You can write online/offline to /sys/devices/system/memory/memoryX/state to
   online/offline pages provided by this memory device
5. hotremove the memory device
   You can hotremove the memory device by the hardware, or writing 1 to
   /sys/bus/acpi/devices/PNP0C80:XX/eject.

Note: if the memory provided by the memory device is used by the kernel, it
can't be offlined. It is not a bug.

Known problems:
1. Hot-removing a memory device may cause a kernel panic.
   This bug will be fixed by Liu Jiang's patch:
   https://lkml.org/lkml/2012/7/3/1


Changelogs from v1 to v2:
 Patch1: new patch, offline memory twice. 1st iteration: offline every
 non-primary memory block. 2nd iteration: offline the primary (i.e. first
 added) memory block.

 Patch3: new patch, no logical change, just removes redundant code.

 Patch9: merge the patch from wujianguo into this patch. flush tlb on all cpu
 after the pagetable is changed.

 Patch12: new patch, free node_data when a node is offlined

Wen Congyang (6):
  memory-hotplug: try to offline the memory twice to avoid dependence
  memory-hotplug: remove redundant codes
  memory-hotplug: introduce new function arch_remove_memory() for
removing page table depends on architecture
  memory-hotplug: remove page table of x86_64 architecture
  memory-hotplug: remove sysfs file of node
  memory-hotplug: free node_data when a node is offlined

Yasuaki Ishimatsu (6):
  memory-hotplug: check whether all memory blocks are offlined or not
when removing memory
  memory-hotplug: remove /sys/firmware/memmap/X sysfs
  memory-hotplug: unregister memory section on SPARSEMEM_VMEMMAP
  memory-hotplug: implement register_page_bootmem_info_section of
sparse-vmemmap
  memory-hotplug: remove memmap of sparse-vmemmap
  memory-hotplug: memory_hotplug: clear zone when removing the memory

 arch/ia64/mm/discontig.c |   14 ++
 arch/ia64/mm/init.c  |   18 ++
 arch/powerpc/mm/init_64.c|   14 ++
 arch/powerpc/mm/mem.c|   12 +
 arch/s390/mm/init.c  |   12 +
 arch/s390/mm/vmem.c  |   14 ++
 arch/sh/mm/init.c|   17 ++
 arch/sparc/mm/init_64.c  |   14 ++
 arch/tile/mm/init.c  |8 +
 arch/x86/include/asm/pgtable_types.h |1 +
 arch/x86/mm/init_32.c|   12 +
 arch/x86/mm/init_64.c|  409 ++
 arch/x86/mm/pageattr.c   |   47 ++--
 drivers/acpi/acpi_memhotplug.c   |8 +-
 drivers/base/memory.c|6 +
 drivers/firmware/memmap.c|   98 -
 include/linux/firmware-map.h |6 +
 include/linux/memory_hotplug.h   |   15 +-
 include/linux/mm.h   |5 +-
 mm/memory_hotplug.c  |  409 --
 mm/sparse.c  |5 +-
 21 files changed, 1087 insertions(+), 57 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RESEND PATCH v2 2/2] x86: make 'mem=' option to work for efi platform

2012-10-19 Thread wency
From: Wen Congyang 

The current mem= boot option only works in non-EFI environments. If the user
specifies add_efi_memmap, it does not work in an EFI environment. In
an EFI environment, we call e820_add_region() to add the memory map. So
by modifying __e820_add_region(), we can make the mem= boot option work in
EFI environments as well.

Signed-off-by: Wen Congyang 
---
 arch/x86/kernel/e820.c |   29 +
 1 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index ed858e9..e28982a 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -47,6 +47,7 @@ unsigned long pci_mem_start = 0xaeedbabe;
 #ifdef CONFIG_PCI
 EXPORT_SYMBOL(pci_mem_start);
 #endif
+static u64 mem_limit = ~0ULL;
 
 /*
  * This function checks if any part of the range  is mapped
@@ -119,6 +120,20 @@ static void __init __e820_add_region(struct e820map 
*e820x, u64 start, u64 size,
return;
}
 
+   if (start >= mem_limit) {
+   printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
+  (unsigned long long)start,
+  (unsigned long long)(start + size - 1));
+   return;
+   }
+
+   if (mem_limit - start < size) {
+   printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
+  (unsigned long long)mem_limit,
+  (unsigned long long)(start + size - 1));
+   size = mem_limit - start;
+   }
+
e820x->map[x].addr = start;
e820x->map[x].size = size;
e820x->map[x].type = type;
@@ -809,7 +824,7 @@ static int userdef __initdata;
 /* "mem=nopentium" disables the 4MB page tables. */
 static int __init parse_memopt(char *p)
 {
-   u64 mem_size;
+   char *oldp;
 
if (!p)
return -EINVAL;
@@ -825,11 +840,11 @@ static int __init parse_memopt(char *p)
}
 
userdef = 1;
-   mem_size = memparse(p, &p);
+   oldp = p;
+   mem_limit = memparse(p, &p);
/* don't remove all of memory when handling "mem={invalid}" param */
-   if (mem_size == 0)
+   if (mem_limit == 0 || p == oldp)
return -EINVAL;
-   e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
 
return 0;
 }
@@ -881,6 +896,12 @@ early_param("memmap", parse_memmap_opt);
 
 void __init finish_e820_parsing(void)
 {
+   if (mem_limit != ~0ULL) {
+   userdef = 1;
+   e820_remove_range(mem_limit, ULLONG_MAX - mem_limit,
+ E820_RAM, 1);
+   }
+
if (userdef) {
u32 nr = e820.nr_map;
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RESEND PATCH v2 1/2] update mem= option's spec according to its implementation

2012-10-19 Thread wency
From: Wen Congyang 

The current mem= implementation seems buggy because the specification and
the implementation don't match. However, mem= has been working this way
for many years; it is not buggy and works as expected. So
we should update the specification instead.

Signed-off-by: Wen Congyang 
Sort-of-tentatively-acked-by: Rob Landley 
---
 Documentation/kernel-parameters.txt |7 ---
 1 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 9776f06..85b911a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1481,9 +1481,10 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
Amount of memory to be used when the kernel is not able
to see the whole system memory or for test.
-   [X86-32] Use together with memmap= to avoid physical
-   address space collisions. Without memmap= PCI devices
-   could be placed at addresses belonging to unused RAM.
+   [X86-32] Work as limiting max address. Use together
+   with memmap= to avoid physical address space collisions.
+   Without memmap= PCI devices could be placed at addresses
+   belonging to unused RAM.
 
mem=nopentium   [BUGS=X86-32] Disable usage of 4MB pages for kernel
memory.
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RESEND PATCH v2 0/2] fixes for mem= option

2012-10-19 Thread wency
From: Wen Congyang 

The documentation and implementation of the 'mem=' option don't match, and the
option doesn't work on EFI platforms. This patchset updates the documentation
and makes the option work on EFI platforms.

I am resending it because HPA asked me to resend it some days after the merge
window.

Changes from v1 to v2
Patch1: Just fix a typo error(ingoring -> ignoring).

Wen Congyang (2):
  update mem= option's spec according to its implementation
  x86: make 'mem=' option to work for efi platform

 Documentation/kernel-parameters.txt |7 ---
 arch/x86/kernel/e820.c  |   29 +
 2 files changed, 29 insertions(+), 7 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 2/3] acpi,memory-hotplug: introduce a mutex lock to protect the list in acpi_memory_device

2012-10-19 Thread wency
From: Wen Congyang 

The memory device can be removed by 2 ways:
1. send eject request by SCI
2. echo 1 >/sys/bus/pci/devices/PNP0C80:XX/eject

These 2 events may happen at the same time, so we may touch
acpi_memory_device.res_list at the same time. This patch
introduces a lock to protect this list.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/acpi_memhotplug.c |   17 +++--
 1 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 1e90e8f..8ff2976 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -83,7 +83,8 @@ struct acpi_memory_info {
 struct acpi_memory_device {
struct acpi_device * device;
unsigned int state; /* State of the memory device */
-   struct list_head res_list;
+   struct mutex lock;
+   struct list_head res_list;  /* protected by lock */
 };
 
 static int acpi_hotmem_initialized;
@@ -101,19 +102,23 @@ acpi_memory_get_resource(struct acpi_resource *resource, 
void *context)
(address64.resource_type != ACPI_MEMORY_RANGE))
return AE_OK;
 
+   mutex_lock(&mem_device->lock);
list_for_each_entry(info, &mem_device->res_list, list) {
/* Can we combine the resource range information? */
if ((info->caching == address64.info.mem.caching) &&
(info->write_protect == address64.info.mem.write_protect) &&
(info->start_addr + info->length == address64.minimum)) {
info->length += address64.address_length;
+   mutex_unlock(&mem_device->lock);
return AE_OK;
}
}
 
new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL);
-   if (!new)
+   if (!new) {
+   mutex_unlock(&mem_device->lock);
return AE_ERROR;
+   }
 
INIT_LIST_HEAD(&new->list);
new->caching = address64.info.mem.caching;
@@ -121,6 +126,7 @@ acpi_memory_get_resource(struct acpi_resource *resource, 
void *context)
new->start_addr = address64.minimum;
new->length = address64.address_length;
list_add_tail(&new->list, &mem_device->res_list);
+   mutex_unlock(&mem_device->lock);
 
return AE_OK;
 }
@@ -138,9 +144,11 @@ acpi_memory_get_device_resources(struct acpi_memory_device 
*mem_device)
status = acpi_walk_resources(mem_device->device->handle, 
METHOD_NAME__CRS,
 acpi_memory_get_resource, mem_device);
if (ACPI_FAILURE(status)) {
+   mutex_lock(&mem_device->lock);
list_for_each_entry_safe(info, n, &mem_device->res_list, list)
kfree(info);
INIT_LIST_HEAD(&mem_device->res_list);
+   mutex_unlock(&mem_device->lock);
return -EINVAL;
}
 
@@ -236,6 +244,7 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
 * We don't have memory-hot-add rollback function,now.
 * (i.e. memory-hot-remove function)
 */
+   mutex_lock(&mem_device->lock);
list_for_each_entry(info, &mem_device->res_list, list) {
if (info->enabled) { /* just sanity check...*/
num_enabled++;
@@ -256,6 +265,7 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
info->enabled = 1;
num_enabled++;
}
+   mutex_unlock(&mem_device->lock);
if (!num_enabled) {
printk(KERN_ERR PREFIX "add_memory failed\n");
mem_device->state = MEMORY_INVALID_STATE;
@@ -316,6 +326,7 @@ static int acpi_memory_disable_device(struct 
acpi_memory_device *mem_device)
 * Ask the VM to offline this memory range.
 * Note: Assume that this function returns zero on success
 */
+   mutex_lock(&mem_device->lock);
list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
if (info->enabled) {
result = remove_memory(info->start_addr, info->length);
@@ -324,6 +335,7 @@ static int acpi_memory_disable_device(struct 
acpi_memory_device *mem_device)
}
kfree(info);
}
+   mutex_unlock(&mem_device->lock);
 
/* Power-off and eject the device */
result = acpi_memory_powerdown_device(mem_device);
@@ -438,6 +450,7 @@ static int acpi_memory_device_add(struct acpi_device 
*device)
mem_device->device = device;
sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME);
sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS);
+   mutex_init(&mem_device->lock);
device->driver_data = mem_device;
 
 

[PATCH v2 1/3] acpi,memory-hotplug: call acpi_bus_trim() to remove memory device

2012-10-19 Thread wency
From: Wen Congyang 

The memory device has been ejected and powered off, so we can call
acpi_bus_trim() to remove the memory device from the acpi bus.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/acpi_memhotplug.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 24c807f..1e90e8f 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -401,8 +401,9 @@ static void acpi_memory_device_notify(acpi_handle handle, 
u32 event, void *data)
}
 
/*
-* TBD: Invoke acpi_bus_remove to cleanup data structures
+* Invoke acpi_bus_trim() to remove memory device
 */
+   acpi_bus_trim(device, 1);
 
/* _EJ0 succeeded; _OST is not necessary */
return;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 3/3] acpi,memory-hotplug : add memory offline code to acpi_memory_device_remove()

2012-10-19 Thread wency
From: Yasuaki Ishimatsu 

The memory device can be removed by 2 ways:
1. send eject request by SCI
2. echo 1 >/sys/bus/pci/devices/PNP0C80:XX/eject

In the 1st case, acpi_memory_disable_device() will be called.
In the 2nd case, acpi_memory_device_remove() will be called.
acpi_memory_device_remove() will also be called when we unbind the
memory device from the driver acpi_memhotplug or a driver initialization
fails.

acpi_memory_disable_device() already implements the code which
offlines memory and releases the acpi_memory_info struct. But
acpi_memory_device_remove() does not implement it yet.

So the patch moves the code for offlining memory and releasing the
acpi_memory_info struct to a new function, acpi_memory_remove_memory(), which
is used by both acpi_memory_device_remove() and acpi_memory_disable_device().

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/acpi_memhotplug.c |   31 ---
 1 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 8ff2976..5b28aa9 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -316,16 +316,11 @@ static int acpi_memory_powerdown_device(struct 
acpi_memory_device *mem_device)
return 0;
 }
 
-static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
+static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 {
int result;
struct acpi_memory_info *info, *n;
 
-
-   /*
-* Ask the VM to offline this memory range.
-* Note: Assume that this function returns zero on success
-*/
mutex_lock(&mem_device->lock);
list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
if (info->enabled) {
@@ -333,10 +328,27 @@ static int acpi_memory_disable_device(struct 
acpi_memory_device *mem_device)
if (result)
return result;
}
+
+   list_del(&info->list);
kfree(info);
}
mutex_unlock(&mem_device->lock);
 
+   return 0;
+}
+
+static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
+{
+   int result;
+
+   /*
+* Ask the VM to offline this memory range.
+* Note: Assume that this function returns zero on success
+*/
+   result = acpi_memory_remove_memory(mem_device);
+   if (result)
+   return result;
+
/* Power-off and eject the device */
result = acpi_memory_powerdown_device(mem_device);
if (result) {
@@ -487,12 +499,17 @@ static int acpi_memory_device_add(struct acpi_device 
*device)
 static int acpi_memory_device_remove(struct acpi_device *device, int type)
 {
struct acpi_memory_device *mem_device = NULL;
-
+   int result;
 
if (!device || !acpi_driver_data(device))
return -EINVAL;
 
mem_device = acpi_driver_data(device);
+
+   result = acpi_memory_remove_memory(mem_device);
+   if (result)
+   return result;
+
kfree(mem_device);
 
return 0;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 0/3] acpi,memory-hotplug : implement framework for hot removing memory

2012-10-19 Thread wency
From: Wen Congyang 

The patch-set implements a framework for hot removing memory.

The memory device can be removed by 2 ways:
1. send eject request by SCI
2. echo 1 >/sys/bus/pci/devices/PNP0C80:XX/eject

In the 1st case, acpi_memory_disable_device() will be called.
In the 2nd case, acpi_memory_device_remove() will be called.
acpi_memory_device_remove() will also be called when we unbind the
memory device from the driver acpi_memhotplug or a driver initialization
fails.

acpi_memory_disable_device() already implements the code which
offlines memory and releases the acpi_memory_info struct. But
acpi_memory_device_remove() does not implement it yet.

So the patch prepares the framework for hot removing memory and
adds the framework into acpi_memory_device_remove().

The last version of this patchset is here:
https://lkml.org/lkml/2012/10/3/126

Changelog from v1 to v2:
  Patch1: use acpi_bus_trim() instead of acpi_bus_remove()
  Patch2: new patch, introduce a lock to protect the list
  Patch3: remove memory too when type is ACPI_BUS_REMOVAL_NORMAL
  Note: I don't send [Patch2-4 v1] in this series because there
  are no logical changes in these 3 patches.

Wen Congyang (2):
  acpi,memory-hotplug: call acpi_bus_trim() to remove memory device
  acpi,memory-hotplug: introduce a mutex lock to protect the list in
acpi_memory_device

Yasuaki Ishimatsu (1):
  acpi,memory-hotplug : add memory offline code to
acpi_memory_device_remove()

 drivers/acpi/acpi_memhotplug.c |   51 
 1 files changed, 41 insertions(+), 10 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 4/9] clear the memory to store struct page

2012-10-18 Thread wency
From: Wen Congyang 

If sparse memory vmemmap is enabled, we can't free the memory used to store
struct page when a memory device is hot-removed, because that memory may
also hold struct pages that manage memory which doesn't belong
to this memory device. When we hot-add this memory device again, we
will reuse this memory to store struct page, and struct page may
contain some obsolete information, and we will get a bad-page state:

[   59.611278] init_memory_mapping: [mem 0x8000-0x9fff]
[   59.637836] Built 2 zonelists in Node order, mobility grouping on.  Total 
pages: 547617
[   59.638739] Policy zone: Normal
[   59.650840] BUG: Bad page state in process bash  pfn:9b6dc
[   59.651124] page:ea0002200020 count:0 mapcount:0 mapping:  
(null) index:0xfdfdfdfdfdfdfdfd
[   59.651494] page flags: 
0x2fdfdfdfd5df9fd(locked|referenced|uptodate|dirty|lru|active|slab|owner_priv_1|private|private_2|writeback|head|tail|swapcache|reclaim|swapbacked|unevictable|uncached|compound_lock)
[   59.653604] Modules linked in: netconsole acpiphp pci_hotplug 
acpi_memhotplug loop kvm_amd kvm microcode tpm_tis tpm tpm_bios evdev psmouse 
serio_raw i2c_piix4 i2c_core parport_pc parport processor button thermal_sys 
ext3 jbd mbcache sg sr_mod cdrom ata_generic virtio_net ata_piix virtio_blk 
libata virtio_pci virtio_ring virtio scsi_mod
[   59.656998] Pid: 988, comm: bash Not tainted 3.6.0-rc7-guest #12
[   59.657172] Call Trace:
[   59.657275]  [] ? bad_page+0xb0/0x100
[   59.657434]  [] ? free_pages_prepare+0xb3/0x100
[   59.657610]  [] ? free_hot_cold_page+0x48/0x1a0
[   59.657787]  [] ? online_pages_range+0x68/0xa0
[   59.657961]  [] ? 
__online_page_increment_counters+0x10/0x10
[   59.658162]  [] ? walk_system_ram_range+0x101/0x110
[   59.658346]  [] ? online_pages+0x1a5/0x2b0
[   59.658515]  [] ? __memory_block_change_state+0x20d/0x270
[   59.658710]  [] ? store_mem_state+0xb6/0xf0
[   59.658878]  [] ? sysfs_write_file+0xd2/0x160
[   59.659052]  [] ? vfs_write+0xaa/0x160
[   59.659212]  [] ? sys_write+0x47/0x90
[   59.659371]  [] ? async_page_fault+0x25/0x30
[   59.659543]  [] ? system_call_fastpath+0x16/0x1b
[   59.659720] Disabling lock debugging due to kernel taint

This patch clears the memory used to store struct page to avoid unexpected errors.

CC: David Rientjes 
CC: Jiang Liu 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Reported-by: Vasilis Liaskovitis 
Signed-off-by: Wen Congyang 
---
 mm/sparse.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/mm/sparse.c b/mm/sparse.c
index fac95f2..0021265 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long 
nr_pages)
 got_map_page:
ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
 got_map_ptr:
-   memset(ret, 0, memmap_size);
 
return ret;
 }
@@ -760,6 +759,8 @@ int __meminit sparse_add_one_section(struct zone *zone, 
unsigned long start_pfn,
goto out;
}
 
+   memset(memmap, 0, sizeof(struct page) * nr_pages);
+
ms->section_mem_map |= SECTION_MARKED_PRESENT;
 
ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 6/9] memory-hotplug: update mce_bad_pages when removing the memory

2012-10-18 Thread wency
From: Wen Congyang 

When we hot-remove a memory device, we will free the memory used to store
struct page. If a page is a hwpoisoned page, we should decrease
mce_bad_pages.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/sparse.c |   19 +++
 1 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/mm/sparse.c b/mm/sparse.c
index 0021265..77d6a93 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -774,6 +774,24 @@ out:
return ret;
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+{
+   int i;
+
+   for (i = 0; i < PAGES_PER_SECTION; i++) {
+   if (PageHWPoison(&memmap[i])) {
+   atomic_long_sub(1, &mce_bad_pages);
+   ClearPageHWPoison(&memmap[i]);
+   }
+   }
+}
+#else
+static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+{
+}
+#endif
+
 void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
 {
struct page *memmap = NULL;
@@ -785,6 +803,7 @@ void sparse_remove_one_section(struct zone *zone, struct 
mem_section *ms)
__section_nr(ms));
ms->section_mem_map = 0;
ms->pageblock_flags = NULL;
+   clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
}
 
free_section_usemap(memmap, usemap);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 5/9] memory-hotplug: skip HWPoisoned page when offlining pages

2012-10-18 Thread wency
From: Wen Congyang 

The hwpoisoned flag may be set when we offline a page through the sysfs interface
/sys/devices/system/memory/soft_offline_page or
/sys/devices/system/memory/hard_offline_page. We use __free_page() to put
a page into the buddy system when onlining pages. If the page is a hwpoisoned
page, we can't put it into the buddy system, and the page is not in the free
list. Such a page can't be isolated and offlined. So we should skip such pages
when offlining pages.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 include/linux/page-isolation.h |   10 ++
 mm/memory-failure.c|2 +-
 mm/memory_hotplug.c|4 ++--
 mm/page_alloc.c|   27 +++
 mm/page_isolation.c|   27 ---
 5 files changed, 52 insertions(+), 18 deletions(-)

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 76a9539..a92061e 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -2,7 +2,8 @@
 #define __LINUX_PAGEISOLATION_H
 
 
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count);
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
+bool skip_hwpoisoned_pages);
 void set_pageblock_migratetype(struct page *page, int migratetype);
 int move_freepages_block(struct zone *zone, struct page *page,
int migratetype);
@@ -21,7 +22,7 @@ int move_freepages(struct zone *zone,
  */
 int
 start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-unsigned migratetype);
+unsigned migratetype, bool skip_hwpoisoned_pages);
 
 /*
  * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
@@ -34,12 +35,13 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned 
long end_pfn,
 /*
  * Test all pages in [start_pfn, end_pfn) are isolated or not.
  */
-int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn);
+int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
+   bool skip_hwpoisoned_pages);
 
 /*
  * Internal functions. Changes pageblock's migrate type.
  */
-int set_migratetype_isolate(struct page *page);
+int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages);
 void unset_migratetype_isolate(struct page *page, unsigned migratetype);
 struct page *alloc_migrate_target(struct page *page, unsigned long private,
int **resultp);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6c5899b..1abffee 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long 
pfn, int flags)
 * Isolate the page, so that it doesn't get reallocated if it
 * was free.
 */
-   set_migratetype_isolate(p);
+   set_migratetype_isolate(p, true);
/*
 * When the target page is a free hugepage, just remove it
 * from free hugepage list.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 56b758a..ec899a2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -854,7 +854,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned 
long nr_pages,
 {
int ret;
long offlined = *(long *)data;
-   ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
+   ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
offlined = nr_pages;
if (!ret)
*(long *)data += offlined;
@@ -901,7 +901,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
nr_pages = end_pfn - start_pfn;
 
/* set above range as isolated */
-   ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+   ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, 
true);
if (ret)
goto out;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bb90971..e33d0fb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5575,7 +5575,8 @@ void set_pageblock_flags_group(struct page *page, 
unsigned long flags,
  * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
  * expect this function should be exact.
  */
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
+bool skip_hwpoisoned_pages)
 {
unsigned long pfn, iter, found;
int mt;
@@ -5610,6 +5611,13 @@ bool has_unmovable_pages(struct zone *zone, struct page 
*page, int count)
continue;
}
 
+   /*
+* The HWPoisoned page may be not in buddy system, and
+* page_count() is no

[PATCH v3 7/9] memory-hotplug: auto offline page_cgroup when onlining memory block failed

2012-10-18 Thread wency
From: Wen Congyang 

When a memory block is onlined, we will try to allocate memory on that node
to store page_cgroup. If onlining the memory block fails, we don't
offline the page cgroup, and we have no chance to offline this page cgroup
unless the memory block is onlined successfully again. As a result,
we can't hot-remove the memory device on that node, because some
memory is used to store page cgroup. If onlining the memory block
fails, there is no need to store page cgroup for this memory. So
auto offline page_cgroup when onlining the memory block fails.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
Acked-by: KOSAKI Motohiro 
---
 mm/page_cgroup.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5ddad0c..44db00e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct 
notifier_block *self,
mn->nr_pages, mn->status_change_nid);
break;
case MEM_CANCEL_ONLINE:
+   offline_page_cgroup(mn->start_pfn,
+   mn->nr_pages, mn->status_change_nid);
+   break;
case MEM_GOING_OFFLINE:
break;
case MEM_ONLINE:
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 8/9] memory-hotplug: fix NR_FREE_PAGES mismatch

2012-10-18 Thread wency
From: Wen Congyang 

NR_FREE_PAGES will be wrong after offlining pages. We add/dec NR_FREE_PAGES
like this now:
1. move all pages in buddy system to MIGRATE_ISOLATE, and dec NR_FREE_PAGES
2. don't add NR_FREE_PAGES when it is freed and the migratetype is 
MIGRATE_ISOLATE
3. dec NR_FREE_PAGES when offlining isolated pages.
4. add NR_FREE_PAGES when undoing isolate pages.

When we come to step 3, all pages are in the MIGRATE_ISOLATE list, and
NR_FREE_PAGES is right. When we come to step 4, none of the pages are in the
buddy system, so we don't change NR_FREE_PAGES in this step, but we have already
changed NR_FREE_PAGES in step 3. So NR_FREE_PAGES is wrong after offlining pages.
Therefore there is no need to change NR_FREE_PAGES in step 3.

This patch also fixes a problem in step 2: if the migratetype is MIGRATE_ISOLATE,
we should not add NR_FREE_PAGES when we remove pages from pcppages.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/page_alloc.c |   10 +-
 1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e33d0fb..9aa9490 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -667,11 +667,13 @@ static void free_pcppages_bulk(struct zone *zone, int 
count,
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
-   if (is_migrate_cma(mt))
-   __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 
1);
+   if (likely(mt != MIGRATE_ISOLATE)) {
+   __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
+   if (is_migrate_cma(mt))
+   __mod_zone_page_state(zone, 
NR_FREE_CMA_PAGES, 1);
+   }
} while (--to_free && --batch_free && !list_empty(list));
}
-   __mod_zone_page_state(zone, NR_FREE_PAGES, count);
spin_unlock(&zone->lock);
 }
 
@@ -6006,8 +6008,6 @@ __offline_isolated_pages(unsigned long start_pfn, 
unsigned long end_pfn)
list_del(&page->lru);
rmv_page_order(page);
zone->free_area[order].nr_free--;
-   __mod_zone_page_state(zone, NR_FREE_PAGES,
- - (1UL << order));
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 9/9] memory-hotplug: allocate zone's pcp before onlining pages

2012-10-18 Thread wency
From: Wen Congyang 

We use __free_page() to put a page to buddy system when onlining pages.
__free_page() will store NR_FREE_PAGES in zone's pcp.vm_stat_diff, so we
should allocate zone's pcp before onlining pages, otherwise we will lose
some free pages.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/memory_hotplug.c |   10 ++
 1 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ec899a2..eb4c132 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -505,12 +505,16 @@ int __ref online_pages(unsigned long pfn, unsigned long 
nr_pages)
 * So, zonelist must be updated after online.
 */
mutex_lock(&zonelists_mutex);
-   if (!populated_zone(zone))
+   if (!populated_zone(zone)) {
need_zonelists_rebuild = 1;
+   build_all_zonelists(NULL, zone);
+   }
 
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
online_pages_range);
if (ret) {
+   if (need_zonelists_rebuild)
+   zone_pcp_reset(zone);
mutex_unlock(&zonelists_mutex);
printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] 
failed\n",
   (unsigned long long) pfn << PAGE_SHIFT,
@@ -525,9 +529,7 @@ int __ref online_pages(unsigned long pfn, unsigned long 
nr_pages)
zone->zone_pgdat->node_present_pages += onlined_pages;
if (onlined_pages) {
node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
-   if (need_zonelists_rebuild)
-   build_all_zonelists(NULL, zone);
-   else
+   if (!need_zonelists_rebuild)
zone_pcp_update(zone);
}
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 2/9] suppress "Device nodeX does not have a release() function" warning

2012-10-18 Thread wency
From: Yasuaki Ishimatsu 

When calling unregister_node(), the function shows following message at
device_release().

"Device 'node2' does not have a release() function, it is broken and must
be fixed."

The reason is node's device struct does not have a release() function.

So the patch registers node_device_release() as the device's release()
function to suppress the warning message. Additionally, the patch adds a
memset() to register_node() to initialize the node struct, because the node
struct is part of the node_devices[] array and cannot be freed by
node_device_release(). So if the system reuses the node struct, it contains garbage.

CC: David Rientjes 
CC: Jiang Liu 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/base/node.c |6 ++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index af1a177..2baa73a 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -252,6 +252,9 @@ static inline void hugetlb_register_node(struct node *node) 
{}
 static inline void hugetlb_unregister_node(struct node *node) {}
 #endif
 
+static void node_device_release(struct device *dev)
+{
+}
 
 /*
  * register_node - Setup a sysfs device for a node.
@@ -263,8 +266,11 @@ int register_node(struct node *node, int num, struct node 
*parent)
 {
int error;
 
+   memset(node, 0, sizeof(*node));
+
node->dev.id = num;
node->dev.bus = &node_subsys;
+   node->dev.release = node_device_release;
error = device_register(&node->dev);
 
if (!error){
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 0/9] bugfix for memory hotplug

2012-10-18 Thread wency
From: Wen Congyang 

Changes from v2 to v3:
  Merge the bug fix from ishimatsu to this patchset(Patch 1-3)
  Patch 3: split it from patch as it fixes another bug.
  Patch 4: new patch, and fix bad-page state when hotadding a memory
   device after hotremoving it. I forgot to post this patch in v2.
  Patch 6: update it according to Dave Hansen's comment.

Changes from v1 to v2:
  Patch 1: updated according to kosaki's suggestion

  Patch 2: new patch, and update mce_bad_pages when removing memory.

  Patch 4: new patch, and fix a NR_FREE_PAGES mismatch, and this bug
   cause oom in my test.

  Patch 5: new patch, and fix a new bug. When repeating to online/offline
   pages, the free pages will continue to decrease. 

Wen Congyang (6):
  clear the memory to store struct page
  memory-hotplug: skip HWPoisoned page when offlining pages
  memory-hotplug: update mce_bad_pages when removing the memory
  memory-hotplug: auto offline page_cgroup when onlining memory block
failed
  memory-hotplug: fix NR_FREE_PAGES mismatch
  memory-hotplug: allocate zone's pcp before onlining pages

Yasuaki Ishimatsu (3):
  suppress "Device memoryX does not have a release() function" warning
  suppress "Device nodeX does not have a release() function" warning
  memory-hotplug: flush the work for the node when the node is offlined

 drivers/base/memory.c  |9 -
 drivers/base/node.c|   11 +++
 include/linux/page-isolation.h |   10 ++
 mm/memory-failure.c|2 +-
 mm/memory_hotplug.c|   14 --
 mm/page_alloc.c|   37 -
 mm/page_cgroup.c   |3 +++
 mm/page_isolation.c|   27 ---
 mm/sparse.c|   22 +-
 9 files changed, 106 insertions(+), 29 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 3/9] memory-hotplug: flush the work for the node when the node is offlined

2012-10-18 Thread wency
From: Yasuaki Ishimatsu 

If the node is onlined after it is offlined, we will clear the memory
to store the node's information. This structure contains struct work,
so we should flush work before the work's information is cleared.

CC: David Rientjes 
CC: Jiang Liu 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/base/node.c |5 +
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 2baa73a..13c0ddf 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -254,6 +254,11 @@ static inline void hugetlb_unregister_node(struct node 
*node) {}
 
 static void node_device_release(struct device *dev)
 {
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
+   struct node *node_dev = to_node(dev);
+
+   flush_work(&node_dev->node_work);
+#endif
 }
 
 /*
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 1/9] suppress "Device memoryX does not have a release() function" warning

2012-10-18 Thread wency
From: Yasuaki Ishimatsu 

When calling remove_memory_block(), the function shows following message at
device_release().

"Device 'memory528' does not have a release() function, it is broken and must
be fixed."

The reason is memory_block's device struct does not have a release() function.

So the patch registers memory_block_release() to the device's release() function
for suppressing the warning message. Additionally, the patch moves kfree(mem)
into the release function since the release function is prepared as a means
to free a memory_block struct.

CC: Jiang Liu 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
Acked-by: David Rientjes 
Acked-by: KOSAKI Motohiro 
---
 drivers/base/memory.c |9 -
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 86c8821..7eb1211 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -70,6 +70,13 @@ void unregister_memory_isolate_notifier(struct 
notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_memory_isolate_notifier);
 
+static void memory_block_release(struct device *dev)
+{
+   struct memory_block *mem = container_of(dev, struct memory_block, dev);
+
+   kfree(mem);
+}
+
 /*
  * register_memory - Setup a sysfs device for a memory block
  */
@@ -80,6 +87,7 @@ int register_memory(struct memory_block *memory)
 
memory->dev.bus = &memory_subsys;
memory->dev.id = memory->start_section_nr / sections_per_block;
+   memory->dev.release = memory_block_release;
 
error = device_register(&memory->dev);
return error;
@@ -635,7 +643,6 @@ int remove_memory_block(unsigned long node_id, struct 
mem_section *section,
mem_remove_simple_file(mem, phys_device);
mem_remove_simple_file(mem, removable);
unregister_memory(mem);
-   kfree(mem);
} else
kobject_put(&mem->dev.kobj);
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 1/5] memory-hotplug: skip HWPoisoned page when offlining pages

2012-10-17 Thread wency
From: Wen Congyang 

hwpoisoned may be set when we offline a page by the sysfs interface
/sys/devices/system/memory/soft_offline_page or
/sys/devices/system/memory/hard_offline_page. If we don't clear
this flag when onlining pages, this page can't be freed, and will
not be in the free list. So we can't offline these pages again. So we
should skip such pages when offlining pages.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 include/linux/page-isolation.h |   10 ++
 mm/memory-failure.c|2 +-
 mm/memory_hotplug.c|4 ++--
 mm/page_alloc.c|   27 +++
 mm/page_isolation.c|   27 ---
 5 files changed, 52 insertions(+), 18 deletions(-)

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 76a9539..a92061e 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -2,7 +2,8 @@
 #define __LINUX_PAGEISOLATION_H
 
 
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count);
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
+bool skip_hwpoisoned_pages);
 void set_pageblock_migratetype(struct page *page, int migratetype);
 int move_freepages_block(struct zone *zone, struct page *page,
int migratetype);
@@ -21,7 +22,7 @@ int move_freepages(struct zone *zone,
  */
 int
 start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-unsigned migratetype);
+unsigned migratetype, bool skip_hwpoisoned_pages);
 
 /*
  * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
@@ -34,12 +35,13 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned 
long end_pfn,
 /*
  * Test all pages in [start_pfn, end_pfn) are isolated or not.
  */
-int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn);
+int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
+   bool skip_hwpoisoned_pages);
 
 /*
  * Internal functions. Changes pageblock's migrate type.
  */
-int set_migratetype_isolate(struct page *page);
+int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages);
 void unset_migratetype_isolate(struct page *page, unsigned migratetype);
 struct page *alloc_migrate_target(struct page *page, unsigned long private,
int **resultp);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6c5899b..1abffee 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long 
pfn, int flags)
 * Isolate the page, so that it doesn't get reallocated if it
 * was free.
 */
-   set_migratetype_isolate(p);
+   set_migratetype_isolate(p, true);
/*
 * When the target page is a free hugepage, just remove it
 * from free hugepage list.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 56b758a..ec899a2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -854,7 +854,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned 
long nr_pages,
 {
int ret;
long offlined = *(long *)data;
-   ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
+   ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
offlined = nr_pages;
if (!ret)
*(long *)data += offlined;
@@ -901,7 +901,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
nr_pages = end_pfn - start_pfn;
 
/* set above range as isolated */
-   ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+   ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, 
true);
if (ret)
goto out;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bb90971..e33d0fb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5575,7 +5575,8 @@ void set_pageblock_flags_group(struct page *page, 
unsigned long flags,
  * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
  * expect this function should be exact.
  */
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
+bool skip_hwpoisoned_pages)
 {
unsigned long pfn, iter, found;
int mt;
@@ -5610,6 +5611,13 @@ bool has_unmovable_pages(struct zone *zone, struct page 
*page, int count)
continue;
}
 
+   /*
+* The HWPoisoned page may be not in buddy system, and
+* page_count() is not 0.
+*/
+   if (skip_hwpoisoned_pages && Pag

[PATCH v2 2/5] memory-hotplug: update mce_bad_pages when removing the memory

2012-10-17 Thread wency
From: Wen Congyang 

When we hot-remove a memory device, we will free the memory used to store
struct page. If a page is a hwpoisoned page, we should decrease
mce_bad_pages.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/sparse.c |   21 +
 1 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/mm/sparse.c b/mm/sparse.c
index fac95f2..24072e4 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -773,6 +773,23 @@ out:
return ret;
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+{
+   int i;
+
+   if (!memmap)
+   return;
+
+   for (i = 0; i < PAGES_PER_SECTION; i++) {
+   if (PageHWPoison(&memmap[i])) {
+   atomic_long_sub(1, &mce_bad_pages);
+   ClearPageHWPoison(&memmap[i]);
+   }
+   }
+}
+#endif
+
 void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
 {
struct page *memmap = NULL;
@@ -786,6 +803,10 @@ void sparse_remove_one_section(struct zone *zone, struct 
mem_section *ms)
ms->pageblock_flags = NULL;
}
 
+#ifdef CONFIG_MEMORY_FAILURE
+   clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
+#endif
+
free_section_usemap(memmap, usemap);
 }
 #endif
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 5/5] memory-hotplug: allocate zone's pcp before onlining pages

2012-10-17 Thread wency
From: Wen Congyang 

We use __free_page() to put a page to buddy system when onlining pages.
__free_page() will store NR_FREE_PAGES in zone's pcp.vm_stat_diff, so we
should allocate zone's pcp before onlining pages, otherwise we will lose
some free pages.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/memory_hotplug.c |   10 ++
 1 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ec899a2..eb4c132 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -505,12 +505,16 @@ int __ref online_pages(unsigned long pfn, unsigned long 
nr_pages)
 * So, zonelist must be updated after online.
 */
mutex_lock(&zonelists_mutex);
-   if (!populated_zone(zone))
+   if (!populated_zone(zone)) {
need_zonelists_rebuild = 1;
+   build_all_zonelists(NULL, zone);
+   }
 
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
online_pages_range);
if (ret) {
+   if (need_zonelists_rebuild)
+   zone_pcp_reset(zone);
mutex_unlock(&zonelists_mutex);
printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] 
failed\n",
   (unsigned long long) pfn << PAGE_SHIFT,
@@ -525,9 +529,7 @@ int __ref online_pages(unsigned long pfn, unsigned long 
nr_pages)
zone->zone_pgdat->node_present_pages += onlined_pages;
if (onlined_pages) {
node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
-   if (need_zonelists_rebuild)
-   build_all_zonelists(NULL, zone);
-   else
+   if (!need_zonelists_rebuild)
zone_pcp_update(zone);
}
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 3/5] memory-hotplug: auto offline page_cgroup when onlining memory block failed

2012-10-17 Thread wency
From: Wen Congyang 

When a memory block is onlined, we will try allocate memory on that node
to store page_cgroup. If onlining the memory block failed, we don't
offline the page cgroup, and we have no chance to offline this page cgroup
unless the memory block is onlined successfully again. It will cause
that we can't hot-remove the memory device on that node, because some
memory is used to store page cgroup. If onlining the memory block
fails, there is no need to store page cgroup for this memory. So
auto offline page_cgroup when onlining the memory block fails.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/page_cgroup.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5ddad0c..44db00e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct 
notifier_block *self,
mn->nr_pages, mn->status_change_nid);
break;
case MEM_CANCEL_ONLINE:
+   offline_page_cgroup(mn->start_pfn,
+   mn->nr_pages, mn->status_change_nid);
+   break;
case MEM_GOING_OFFLINE:
break;
case MEM_ONLINE:
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 4/5] memory-hotplug: fix NR_FREE_PAGES mismatch

2012-10-17 Thread wency
From: Wen Congyang 

NR_FREE_PAGES will be wrong after offlining pages. We add/dec NR_FREE_PAGES
like this now:
1. move all pages in buddy system to MIGRATE_ISOLATE, and dec NR_FREE_PAGES
2. don't add NR_FREE_PAGES when it is freed and the migratetype is 
MIGRATE_ISOLATE
3. dec NR_FREE_PAGES when offlining isolated pages.
4. add NR_FREE_PAGES when undoing isolate pages.

When we come to step 3, all pages are in MIGRATE_ISOLATE list, and NR_FREE_PAGES
are right. When we come to step4, all pages are not in buddy system, so we don't
change NR_FREE_PAGES in this step, but we change NR_FREE_PAGES in step3. So
NR_FREE_PAGES is wrong after offlining pages. So there is no need to change
NR_FREE_PAGES in step3.

This patch also fixes a problem in step2: if the migratetype is MIGRATE_ISOLATE,
we should not add NR_FREE_PAGES when we remove pages from pcppages.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/page_alloc.c |   10 +-
 1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e33d0fb..9aa9490 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -667,11 +667,13 @@ static void free_pcppages_bulk(struct zone *zone, int 
count,
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
-   if (is_migrate_cma(mt))
-   __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 
1);
+   if (likely(mt != MIGRATE_ISOLATE)) {
+   __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
+   if (is_migrate_cma(mt))
+   __mod_zone_page_state(zone, 
NR_FREE_CMA_PAGES, 1);
+   }
} while (--to_free && --batch_free && !list_empty(list));
}
-   __mod_zone_page_state(zone, NR_FREE_PAGES, count);
spin_unlock(&zone->lock);
 }
 
@@ -6006,8 +6008,6 @@ __offline_isolated_pages(unsigned long start_pfn, 
unsigned long end_pfn)
list_del(&page->lru);
rmv_page_order(page);
zone->free_area[order].nr_free--;
-   __mod_zone_page_state(zone, NR_FREE_PAGES,
- - (1UL << order));
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 0/5] bugfix for memory hotplug

2012-10-17 Thread wency
From: Wen Congyang 

Wen Congyang (5):
  memory-hotplug: skip HWPoisoned page when offlining pages
  memory-hotplug: update mce_bad_pages when removing the memory
  memory-hotplug: auto offline page_cgroup when onlining memory block
failed
  memory-hotplug: fix NR_FREE_PAGES mismatch
  memory-hotplug: allocate zone's pcp before onlining pages

 include/linux/page-isolation.h |   10 ++
 mm/memory-failure.c|2 +-
 mm/memory_hotplug.c|   14 --
 mm/page_alloc.c|   37 -
 mm/page_cgroup.c   |3 +++
 mm/page_isolation.c|   27 ---
 mm/sparse.c|   21 +
 7 files changed, 87 insertions(+), 27 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RESEND PATCH v2 2/2] x86: reimplement mem boot option

2012-10-11 Thread wency
From: Wen Congyang 

Current mem boot option only can work for non efi environment. If the user
specifies add_efi_memmap, it cannot work for efi environment. In
the efi environment, we call e820_add_region() to add the memory map. So
we can modify __e820_add_region() and the mem boot option can work for
efi environment.

Signed-off-by: Wen Congyang 
---
 arch/x86/kernel/e820.c |   29 +
 1 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 4185797..20bc467 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -47,6 +47,7 @@ unsigned long pci_mem_start = 0xaeedbabe;
 #ifdef CONFIG_PCI
 EXPORT_SYMBOL(pci_mem_start);
 #endif
+static u64 mem_limit = ~0ULL;
 
 /*
  * This function checks if any part of the range  is mapped
@@ -119,6 +120,20 @@ static void __init __e820_add_region(struct e820map 
*e820x, u64 start, u64 size,
return;
}
 
+   if (start >= mem_limit) {
+   printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
+  (unsigned long long)start,
+  (unsigned long long)(start + size - 1));
+   return;
+   }
+
+   if (mem_limit - start < size) {
+   printk(KERN_ERR "e820: ignoring [mem %#010llx-%#010llx]\n",
+  (unsigned long long)mem_limit,
+  (unsigned long long)(start + size - 1));
+   size = mem_limit - start;
+   }
+
e820x->map[x].addr = start;
e820x->map[x].size = size;
e820x->map[x].type = type;
@@ -809,7 +824,7 @@ static int userdef __initdata;
 /* "mem=nopentium" disables the 4MB page tables. */
 static int __init parse_memopt(char *p)
 {
-   u64 mem_size;
+   char *oldp;
 
if (!p)
return -EINVAL;
@@ -825,11 +840,11 @@ static int __init parse_memopt(char *p)
}
 
userdef = 1;
-   mem_size = memparse(p, &p);
+   oldp = p;
+   mem_limit = memparse(p, &p);
/* don't remove all of memory when handling "mem={invalid}" param */
-   if (mem_size == 0)
+   if (mem_limit == 0 || p == oldp)
return -EINVAL;
-   e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
 
return 0;
 }
@@ -881,6 +896,12 @@ early_param("memmap", parse_memmap_opt);
 
 void __init finish_e820_parsing(void)
 {
+   if (mem_limit != ~0ULL) {
+   userdef = 1;
+   e820_remove_range(mem_limit, ULLONG_MAX - mem_limit,
+ E820_RAM, 1);
+   }
+
if (userdef) {
u32 nr = e820.nr_map;
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RESEND PATCH v2 1/2] update mem= option's spec

2012-10-11 Thread wency
From: Wen Congyang 

Current mem= implementation seems buggy because specification and
implementation don't match. Current mem= has been working
for many years and it's not buggy, it works as expected. So
we should update the specification.

Signed-off-by: Wen Congyang 
Sort-of-tentatively-acked-by: Rob Landley 

---
 Documentation/kernel-parameters.txt |7 ---
 1 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index a92c5eb..924b1a4 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1471,9 +1471,10 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
Amount of memory to be used when the kernel is not able
to see the whole system memory or for test.
-   [X86-32] Use together with memmap= to avoid physical
-   address space collisions. Without memmap= PCI devices
-   could be placed at addresses belonging to unused RAM.
+   [X86-32] Work as limiting max address. Use together
+   with memmap= to avoid physical address space collisions.
+   Without memmap= PCI devices could be placed at addresses
+   belonging to unused RAM.
 
mem=nopentium   [BUGS=X86-32] Disable usage of 4MB pages for kernel
memory.
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/4] bugfix for memory hotplug

2012-09-26 Thread wency
From: Wen Congyang 

Wen Congyang (2):
  memory-hotplug: clear hwpoisoned flag when onlining pages
  memory-hotplug: auto offline page_cgroup when onlining memory block
failed

Yasuaki Ishimatsu (2):
  memory-hotplug: add memory_block_release
  memory-hotplug: add node_device_release

 drivers/base/memory.c |9 -
 drivers/base/node.c   |   11 +++
 mm/memory_hotplug.c   |8 
 mm/page_cgroup.c  |3 +++
 4 files changed, 30 insertions(+), 1 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/4] memory-hotplug: add memory_block_release

2012-09-26 Thread wency
From: Yasuaki Ishimatsu 

When calling remove_memory_block(), the function shows following message at
device_release().

Device 'memory528' does not have a release() function, it is broken and must
be fixed.

remove_memory_block() calls kfree(mem). I think it should be called from
device_release(). So the patch implements memory_block_release()

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 drivers/base/memory.c |9 -
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 7dda4f7..da457e5 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -70,6 +70,13 @@ void unregister_memory_isolate_notifier(struct 
notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_memory_isolate_notifier);
 
+static void release_memory_block(struct device *dev)
+{
+   struct memory_block *mem = container_of(dev, struct memory_block, dev);
+
+   kfree(mem);
+}
+
 /*
  * register_memory - Setup a sysfs device for a memory block
  */
@@ -80,6 +87,7 @@ int register_memory(struct memory_block *memory)
 
memory->dev.bus = &memory_subsys;
memory->dev.id = memory->start_section_nr / sections_per_block;
+   memory->dev.release = release_memory_block;
 
error = device_register(&memory->dev);
return error;
@@ -630,7 +638,6 @@ int remove_memory_block(unsigned long node_id, struct 
mem_section *section,
mem_remove_simple_file(mem, phys_device);
mem_remove_simple_file(mem, removable);
unregister_memory(mem);
-   kfree(mem);
} else
kobject_put(&mem->dev.kobj);
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/4] memory-hotplug: clear hwpoisoned flag when onlining pages

2012-09-26 Thread wency
From: Wen Congyang 

hwpoisoned may be set when we offline a page by the sysfs interface
/sys/devices/system/memory/soft_offline_page or
/sys/devices/system/memory/hard_offline_page. If we don't clear
this flag when onlining pages, this page can't be freed, and will
not be in the free list. So we can't offline these pages again. So we
should clear this flag when onlining pages.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/memory_hotplug.c |8 
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6a5b90d..9a5b10f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -431,6 +431,14 @@ EXPORT_SYMBOL_GPL(__online_page_increment_counters);
 
 void __online_page_free(struct page *page)
 {
+#ifdef CONFIG_MEMORY_FAILURE
+   /* The page may be marked HWPoisoned by soft/hard offline page */
+   if (PageHWPoison(page)) {
+   atomic_long_sub(1, &mce_bad_pages);
+   ClearPageHWPoison(page);
+   }
+#endif
+
ClearPageReserved(page);
init_page_count(page);
__free_page(page);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/4] memory-hotplug: auto offline page_cgroup when onlining memory block failed

2012-09-26 Thread wency
From: Wen Congyang 

When a memory block is onlined, we will try allocate memory on that node
to store page_cgroup. If onlining the memory block failed, we don't
offline the page cgroup, and we have no chance to offline this page cgroup
unless the memory block is onlined successfully again. It will cause
that we can't hot-remove the memory device on that node, because some
memory is used to store page cgroup. If onlining the memory block
fails, there is no need to store page cgroup for this memory. So
auto offline page_cgroup when onlining the memory block fails.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/page_cgroup.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5ddad0c..44db00e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct 
notifier_block *self,
mn->nr_pages, mn->status_change_nid);
break;
case MEM_CANCEL_ONLINE:
+   offline_page_cgroup(mn->start_pfn,
+   mn->nr_pages, mn->status_change_nid);
+   break;
case MEM_GOING_OFFLINE:
break;
case MEM_ONLINE:
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/4] memory-hotplug: add node_device_release

2012-09-26 Thread wency
From: Yasuaki Ishimatsu 

When calling unregister_node(), the function shows following message at
device_release().

Device 'node2' does not have a release() function, it is broken and must be
fixed.

So the patch implements node_device_release()

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/base/node.c |   11 +++
 1 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index af1a177..07523fb 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -252,6 +252,16 @@ static inline void hugetlb_register_node(struct node 
*node) {}
 static inline void hugetlb_unregister_node(struct node *node) {}
 #endif
 
+static void node_device_release(struct device *dev)
+{
+   struct node *node_dev = to_node(dev);
+
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
+   flush_work(&node_dev->node_work);
+#endif
+
+   memset(node_dev, 0, sizeof(struct node));
+}
 
 /*
  * register_node - Setup a sysfs device for a node.
@@ -265,6 +275,7 @@ int register_node(struct node *node, int num, struct node 
*parent)
 
node->dev.id = num;
node->dev.bus = &node_subsys;
+   node->dev.release = node_device_release;
error = device_register(&node->dev);
 
if (!error){
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v9 PATCH 18/21] memory-hotplug: add node_device_release

2012-09-05 Thread wency
From: Yasuaki Ishimatsu 

When calling unregister_node(), the function shows following message at
device_release().

Device 'node2' does not have a release() function, it is broken and must be
fixed.

So the patch implements node_device_release()

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/base/node.c |   11 +++
 1 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index af1a177..07523fb 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -252,6 +252,16 @@ static inline void hugetlb_register_node(struct node 
*node) {}
 static inline void hugetlb_unregister_node(struct node *node) {}
 #endif
 
+static void node_device_release(struct device *dev)
+{
+   struct node *node_dev = to_node(dev);
+
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
+   flush_work(&node_dev->node_work);
+#endif
+
+   memset(node_dev, 0, sizeof(struct node));
+}
 
 /*
  * register_node - Setup a sysfs device for a node.
@@ -265,6 +275,7 @@ int register_node(struct node *node, int num, struct node 
*parent)
 
node->dev.id = num;
node->dev.bus = &node_subsys;
+   node->dev.release = node_device_release;
error = device_register(&node->dev);
 
if (!error){
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v9 PATCH 17/21] memory_hotplug: clear zone when the memory is removed

2012-09-05 Thread wency
From: Yasuaki Ishimatsu 

When a memory is added, we update zone's and pgdat's start_pfn and spanned_pages
in the function __add_zone(). So we should revert these when the memory is
removed. Add a new function __remove_zone() to do this.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/memory_hotplug.c |  207 +++
 1 files changed, 207 insertions(+), 0 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c54922c..afda7e9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -308,10 +308,213 @@ static int __meminit __add_section(int nid, struct zone 
*zone,
return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
 }
 
+/*
+ * find the smallest valid pfn in the range [start_pfn, end_pfn)
+ *
+ * Walks the range one section at a time and returns the first pfn whose
+ * section is valid, whose node is @nid and, when @zone is non-NULL, whose
+ * page belongs to @zone.  Returns 0 when no section matches.
+ *
+ * The result is a pfn, so it is returned as unsigned long rather than int
+ * to avoid truncating it.
+ */
+static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
+					       unsigned long start_pfn,
+					       unsigned long end_pfn)
+{
+	struct mem_section *ms;
+
+	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
+		ms = __pfn_to_section(start_pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		/*
+		 * The branch hint has to wrap the whole comparison:
+		 * unlikely(x) != nid would compare a 0/1 value with nid.
+		 */
+		if (unlikely(pfn_to_nid(start_pfn) != nid))
+			continue;
+
+		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+			continue;
+
+		return start_pfn;
+	}
+
+	return 0;
+}
+
+/*
+ * find the biggest valid pfn in the range [start_pfn, end_pfn).
+ *
+ * Walks the range downwards one section at a time, starting from
+ * end_pfn - 1, and returns the first pfn whose section is valid, whose
+ * node is @nid and, when @zone is non-NULL, whose page belongs to @zone.
+ * Returns 0 when no section matches.
+ *
+ * The result is a pfn, so it is returned as unsigned long rather than int
+ * to avoid truncating it.
+ */
+static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
+					      unsigned long start_pfn,
+					      unsigned long end_pfn)
+{
+	struct mem_section *ms;
+	unsigned long pfn;
+
+	/* pfn is the end pfn of a memory section. */
+	pfn = end_pfn - 1;
+	/*
+	 * pfn is unsigned: once it steps below zero it wraps to a huge value
+	 * and "pfn >= start_pfn" alone would never become false.  The extra
+	 * "pfn < end_pfn" test stops the walk on such a wrap (and also when
+	 * end_pfn is 0).
+	 */
+	for (; pfn >= start_pfn && pfn < end_pfn; pfn -= PAGES_PER_SECTION) {
+		ms = __pfn_to_section(pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		/*
+		 * The branch hint has to wrap the whole comparison:
+		 * unlikely(x) != nid would compare a 0/1 value with nid.
+		 */
+		if (unlikely(pfn_to_nid(pfn) != nid))
+			continue;
+
+		if (zone && zone != page_zone(pfn_to_page(pfn)))
+			continue;
+
+		return pfn;
+	}
+
+	return 0;
+}
+
+static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
+unsigned long end_pfn)
+{
+   unsigned long zone_start_pfn =  zone->zone_start_pfn;
+   unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+   unsigned long pfn;
+   struct mem_section *ms;
+   int nid = zone_to_nid(zone);
+
+   zone_span_writelock(zone);
+   if (zone_start_pfn == start_pfn) {
+   /*
+* If the section is smallest section in the zone, it need
+* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
+* In this case, we find second smallest valid mem_section
+* for shrinking zone.
+*/
+   pfn = find_smallest_section_pfn(nid, zone, end_pfn,
+   zone_end_pfn);
+   if (pfn) {
+   zone->zone_start_pfn = pfn;
+   zone->spanned_pages = zone_end_pfn - pfn;
+   }
+   } else if (zone_end_pfn == end_pfn) {
+   /*
+* If the section is biggest section in the zone, it need
+* shrink zone->spanned_pages.
+* In this case, we find second biggest valid mem_section for
+* shrinking zone.
+*/
+   pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+  start_pfn);
+   if (pfn)
+   zone->spanned_pages = pfn - zone_start_pfn + 1;
+   }
+
+   /*
+* The section is not biggest or smallest mem_section in the zone, it
+* only creates a hole in the zone. So in this case, we need not
+* change the zone. But perhaps, the zone has only hole data. Thus
+* it check the zone has only hole or not.
+*/
+   pfn = zone_start_pfn;
+   for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
+   ms = __pfn_to_section(pfn);
+
+   if (unlikely(!valid_section(ms)))
+   continue;
+
+   if (page_zone(pfn_to_page(pfn)) != zone)
+   continue;
+
+/* If the section is current section, it continues the loop */
+   if (start_pfn == pfn)
+   continue;
+
+   /* If we find valid section, we have nothing to do */
+   

[RFC v9 PATCH 12/21] memory-hotplug: introduce new function arch_remove_memory()

2012-09-05 Thread wency
From: Wen Congyang 

We don't call __add_pages() directly in the function add_memory()
because some other architecture related things need to be done
before or after calling __add_pages(). So we should introduce
a new function arch_remove_memory() to revert the things
done in arch_add_memory().

Note: the function for s390 is not implemented(I don't know how to
implement it for s390).

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 arch/ia64/mm/init.c  |   16 
 arch/powerpc/mm/mem.c|   14 +++
 arch/s390/mm/init.c  |   12 +++
 arch/sh/mm/init.c|   15 +++
 arch/tile/mm/init.c  |8 ++
 arch/x86/include/asm/pgtable_types.h |1 +
 arch/x86/mm/init_32.c|   10 ++
 arch/x86/mm/init_64.c|  160 ++
 arch/x86/mm/pageattr.c   |   47 +-
 include/linux/memory_hotplug.h   |1 +
 mm/memory_hotplug.c  |1 +
 11 files changed, 263 insertions(+), 22 deletions(-)

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 0eab454..1e345ed 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -688,6 +688,22 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
return ret;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * Hot-remove the physical range [start, start + size): the byte range is
+ * converted to a pfn range and handed to __remove_pages().  Returns what
+ * __remove_pages() returned, logging a warning when that is non-zero.
+ */
+int arch_remove_memory(u64 start, u64 size)
+{
+	int ret;
+
+	ret = __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT);
+	if (!ret)
+		return 0;
+
+	pr_warn("%s: Problem encountered in __remove_pages() as"
+		" ret=%d\n", __func__,  ret);
+	return ret;
+}
+#endif
 #endif
 
 /*
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index fbdad0e..011170b 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -133,6 +133,20 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
return __add_pages(nid, zone, start_pfn, nr_pages);
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * Hot-remove the physical range [start, start + size): first
+ * remove_section_mapping() is called on the matching kernel virtual
+ * range, then the pfn range is handed to __remove_pages().  Returns
+ * -EINVAL when remove_section_mapping() fails, otherwise the result of
+ * __remove_pages().
+ */
+int arch_remove_memory(u64 start, u64 size)
+{
+	/* the pfn range is derived from the physical address */
+	unsigned long first_pfn = start >> PAGE_SHIFT;
+	unsigned long npages = size >> PAGE_SHIFT;
+	/* the mapping is removed by kernel virtual address */
+	unsigned long vaddr = (unsigned long)__va(start);
+
+	if (remove_section_mapping(vaddr, vaddr + size))
+		return -EINVAL;
+
+	return __remove_pages(first_pfn, npages);
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 /*
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 6adbc08..501b20e 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -257,4 +257,16 @@ int arch_add_memory(int nid, u64 start, u64 size)
vmem_remove_mapping(start, size);
return rc;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * s390 stub for memory hot-remove: @start and @size are ignored and the
+ * call always fails with -EBUSY.
+ */
+int arch_remove_memory(u64 start, u64 size)
+{
+   /*
+    * There is no hardware or firmware interface which could trigger a
+    * hot memory remove on s390. So there is nothing that needs to be
+    * implemented.
+    */
+   return -EBUSY;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 82cc576..fc84491 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -558,4 +558,19 @@ int memory_add_physaddr_to_nid(u64 addr)
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * Hot-remove the physical range [start, start + size) by handing the
+ * matching pfn range to __remove_pages().  Its result is returned as is,
+ * and a warning is logged when it is non-zero.
+ */
+int arch_remove_memory(u64 start, u64 size)
+{
+	int ret = __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT);
+
+	if (likely(!ret))
+		return 0;
+
+	pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
+		ret);
+	return ret;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index ef29d6c..2749515 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -935,6 +935,14 @@ int remove_memory(u64 start, u64 size)
 {
return -EINVAL;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * tile stub for memory hot-remove: not implemented, @start and @size are
+ * ignored and the call always fails with -EBUSY.
+ */
+int arch_remove_memory(u64 start, u64 size)
+{
+   /* TODO */
+   return -EBUSY;
+}
+#endif
 #endif
 
 struct kmem_cache *pgd_cache;
diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h
index 013286a..b725af2 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -334,6 +334,7 @@ static inline void update_page_count(int level, unsigned 
long pages) { }
  * as a pte too.
  */
 extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t 
*pbase);
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 575d86f..41eefe8 100644
--- a/arch/x

[RFC v9 PATCH 16/21] memory-hotplug: free memmap of sparse-vmemmap

2012-09-05 Thread wency
From: Yasuaki Ishimatsu 

Not all pages of the virtual mapping for removed memory can be freed, since some
pages used as PGD/PUD cover not only the removed memory but also other memory.
So the patch checks whether a page can be freed or not.

How to check whether page can be freed or not?
 1. When removing memory, the page structs of the removed memory are filled
    with 0xFD.
 2. If all page structs on a PT/PMD are filled with 0xFD, the PT/PMD can be
    cleared. In this case, the page used as PT/PMD can be freed.

Applying patch, __remove_section() of CONFIG_SPARSEMEM_VMEMMAP is integrated
into one. So __remove_section() of CONFIG_SPARSEMEM_VMEMMAP is deleted.

Note:  vmemmap_kfree() and vmemmap_free_bootmem() are not implemented for ia64,
ppc, s390, and sparc.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 arch/ia64/mm/discontig.c  |8 +++
 arch/powerpc/mm/init_64.c |8 +++
 arch/s390/mm/vmem.c   |8 +++
 arch/sparc/mm/init_64.c   |8 +++
 arch/x86/mm/init_64.c |  119 +
 include/linux/mm.h|2 +
 mm/memory_hotplug.c   |   17 +--
 mm/sparse.c   |5 +-
 8 files changed, 158 insertions(+), 17 deletions(-)

diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 33943db..0d23b69 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -823,6 +823,14 @@ int __meminit vmemmap_populate(struct page *start_page,
return vmemmap_populate_basepages(start_page, size, node);
 }
 
+/* ia64: not implemented; intentionally empty, @memmap is left untouched. */
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+/* ia64: not implemented; intentionally empty, @memmap is left untouched. */
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 3690c44..835a2b3 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -299,6 +299,14 @@ int __meminit vmemmap_populate(struct page *start_page,
return 0;
 }
 
+/* powerpc: not implemented; intentionally empty, @memmap is left untouched. */
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+/* powerpc: not implemented; intentionally empty, @memmap is left untouched. */
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index eda55cd..4b42b0b 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -227,6 +227,14 @@ out:
return ret;
 }
 
+/* s390: not implemented; intentionally empty, @memmap is left untouched. */
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+/* s390: not implemented; intentionally empty, @memmap is left untouched. */
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index add1cc7..1384826 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2078,6 +2078,14 @@ void __meminit vmemmap_populate_print_last(void)
}
 }
 
+/* sparc: not implemented; intentionally empty, @memmap is left untouched. */
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+/* sparc: not implemented; intentionally empty, @memmap is left untouched. */
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0075592..4e8f8a4 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1138,6 +1138,125 @@ vmemmap_populate(struct page *start_page, unsigned long 
size, int node)
return 0;
 }
 
+#define PAGE_INUSE 0xFD
+
+unsigned long find_and_clear_pte_page(unsigned long addr, unsigned long end,
+   struct page **pp, int *page_size)
+{
+   pgd_t *pgd;
+   pud_t *pud;
+   pmd_t *pmd;
+   pte_t *pte;
+   void *page_addr;
+   unsigned long next;
+
+   *pp = NULL;
+
+   pgd = pgd_offset_k(addr);
+   if (pgd_none(*pgd))
+   return pgd_addr_end(addr, end);
+
+   pud = pud_offset(pgd, addr);
+   if (pud_none(*pud))
+   return pud_addr_end(addr, end);
+
+   if (!cpu_has_pse) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   pmd = pmd_offset(pud, addr);
+   if (pmd_none(*pmd))
+   return next;
+
+   pte = pte_offset_kernel(pmd, addr);
+   if (pte_none(*pte))
+   return next;
+
+   *page_size = PAGE_SIZE;
+   *pp = pte_page(*pte);
+   } else {
+   next = pmd_addr_end(addr, end);
+
+   pmd = pmd_offset(pud, addr);
+   if (pmd_none(*pmd))
+   return next;
+
+

[RFC v9 PATCH 14/21] memory-hotplug: move register_page_bootmem_info_node and put_page_bootmem for sparse-vmemmap

2012-09-05 Thread wency
From: Yasuaki Ishimatsu 

For implementing register_page_bootmem_info_node of sparse-vmemmap,
register_page_bootmem_info_node and put_page_bootmem are moved to
memory_hotplug.c

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 include/linux/memory_hotplug.h |9 -
 mm/memory_hotplug.c|8 ++--
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index cdbbd79..1133e63 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -162,17 +162,8 @@ static inline void arch_refresh_nodedata(int nid, 
pg_data_t *pgdat)
 #endif /* CONFIG_NUMA */
 #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
-}
-static inline void put_page_bootmem(struct page *page)
-{
-}
-#else
 extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
 extern void put_page_bootmem(struct page *page);
-#endif
 
 /*
  * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 26a5012..df6857b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -91,7 +91,6 @@ static void release_memory_resource(struct resource *res)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void get_page_bootmem(unsigned long info,  struct page *page,
 unsigned long type)
 {
@@ -127,6 +126,7 @@ void __ref put_page_bootmem(struct page *page)
 
 }
 
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
unsigned long *usemap, mapsize, section_nr, i;
@@ -163,6 +163,11 @@ static void register_page_bootmem_info_section(unsigned 
long start_pfn)
get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
 
 }
+#else
+/* CONFIG_SPARSEMEM_VMEMMAP variant: empty stub, @start_pfn is ignored. */
+static inline void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+}
+#endif
 
 void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
@@ -198,7 +203,6 @@ void register_page_bootmem_info_node(struct pglist_data 
*pgdat)
register_page_bootmem_info_section(pfn);
 
 }
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
 static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
   unsigned long end_pfn)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v9 PATCH 19/21] memory-hotplug: remove sysfs file of node

2012-09-05 Thread wency
From: Wen Congyang 

This patch introduces a new function try_offline_node() to
remove sysfs file of node when all memory sections of this
node are removed. If some memory sections of this node are
not removed, this function does nothing.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/memory_hotplug.c |   54 +++
 1 files changed, 54 insertions(+), 0 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index afda7e9..270c249 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -1285,6 +1286,57 @@ int offline_memory(u64 start, u64 size)
return 0;
 }
 
+/*
+ * Check whether any online cpu sits on the node described by @data
+ * (a struct pglist_data *).  Returns -EBUSY if one does, 0 otherwise.
+ */
+static int check_cpu_on_node(void *data)
+{
+	struct pglist_data *pgdat = data;
+	int nid = pgdat->node_id;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		/*
+		 * the cpu on this node is onlined, and we can't
+		 * offline this node.
+		 */
+		if (cpu_to_node(cpu) == nid)
+			return -EBUSY;
+	}
+
+	return 0;
+}
+
+/*
+ * offline the node if all memory sections of this node are removed
+ *
+ * Does nothing when a present section in the node's span still belongs to
+ * @nid, or when check_cpu_on_node() reports an online cpu on the node.
+ */
+static void try_offline_node(int nid)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+	unsigned long pfn = pgdat->node_start_pfn;
+	unsigned long end_pfn = pfn + pgdat->node_spanned_pages;
+
+	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		/*
+		 * some memory sections of this node are not removed, and we
+		 * can't offline node now.
+		 */
+		if (present_section_nr(pfn_to_section_nr(pfn)) &&
+		    pfn_to_nid(pfn) == nid)
+			return;
+	}
+
+	/* a non-zero result means a cpu on this node is still online */
+	if (stop_machine(check_cpu_on_node, pgdat, NULL) != 0)
+		return;
+
+	/*
+	 * all memory sections of this node are removed, we can offline this
+	 * node now.
+	 */
+	node_set_offline(nid);
+	unregister_one_node(nid);
+}
+
 int __ref remove_memory(int nid, u64 start, u64 size)
 {
int ret = 0;
@@ -1305,6 +1357,8 @@ int __ref remove_memory(int nid, u64 start, u64 size)
firmware_map_remove(start, start + size, "System RAM");
 
arch_remove_memory(start, size);
+
+   try_offline_node(nid);
 out:
unlock_memory_hotplug();
return ret;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v9 PATCH 10/21] memory-hotplug: add memory_block_release

2012-09-05 Thread wency
From: Yasuaki Ishimatsu 

When calling remove_memory_block(), the function shows following message at
device_release().

Device 'memory528' does not have a release() function, it is broken and must
be fixed.

remove_memory_block() calls kfree(mem). I think it should be called from
device_release(). So the patch implements memory_block_release().

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 drivers/base/memory.c |9 -
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 038be73..f44d624 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -109,6 +109,13 @@ bool is_memblk_offline(unsigned long start, unsigned long 
size)
 }
 EXPORT_SYMBOL(is_memblk_offline);
 
+/*
+ * Device release callback for a memory block: frees the struct
+ * memory_block that embeds @dev.
+ */
+static void release_memory_block(struct device *dev)
+{
+	kfree(container_of(dev, struct memory_block, dev));
+}
+
 /*
  * register_memory - Setup a sysfs device for a memory block
  */
@@ -119,6 +126,7 @@ int register_memory(struct memory_block *memory)
 
memory->dev.bus = &memory_subsys;
memory->dev.id = memory->start_section_nr / sections_per_block;
+   memory->dev.release = release_memory_block;
 
error = device_register(&memory->dev);
return error;
@@ -674,7 +682,6 @@ int remove_memory_block(unsigned long node_id, struct 
mem_section *section,
mem_remove_simple_file(mem, phys_device);
mem_remove_simple_file(mem, removable);
unregister_memory(mem);
-   kfree(mem);
} else
kobject_put(&mem->dev.kobj);
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v9 PATCH 13/21] memory-hotplug: check page type in get_page_bootmem

2012-09-05 Thread wency
From: Yasuaki Ishimatsu 

The function get_page_bootmem() may be called more than one time to the same
page. There is no need to set page's type, private if the function is not
the first time called to the page.

Note: the patch is just optimization and does not fix any problem.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 mm/memory_hotplug.c |   15 +++
 1 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d736df3..26a5012 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -95,10 +95,17 @@ static void release_memory_resource(struct resource *res)
 static void get_page_bootmem(unsigned long info,  struct page *page,
 unsigned long type)
 {
-   page->lru.next = (struct list_head *) type;
-   SetPagePrivate(page);
-   set_page_private(page, info);
-   atomic_inc(&page->_count);
+	unsigned long page_type = (unsigned long)page->lru.next;
+
+	/*
+	 * Only stamp the type and private info when the page does not carry
+	 * a bootmem type yet; a repeated call just takes another reference.
+	 */
+	if (page_type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+	    page_type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE) {
+		page->lru.next = (struct list_head *)type;
+		SetPagePrivate(page);
+		set_page_private(page, info);
+	}
+
+	/* every call takes one reference on the page */
+	atomic_inc(&page->_count);
 }
 
 /* reference to __meminit __free_pages_bootmem is valid
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v9 PATCH 11/21] memory-hotplug: remove_memory calls __remove_pages

2012-09-05 Thread wency
From: Yasuaki Ishimatsu 

The patch adds __remove_pages() to remove_memory(). Then the range given by the
phys_start_pfn and nr_pages arguments of __remove_pages() may span
different zones. So the zone argument is removed from __remove_pages()
and __remove_pages() calculates the zone for each section.

When CONFIG_SPARSEMEM_VMEMMAP is defined, there is no way to remove a memmap.
So __remove_section only calls unregister_memory_section().

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 arch/powerpc/platforms/pseries/hotplug-memory.c |5 +
 include/linux/memory_hotplug.h  |3 +--
 mm/memory_hotplug.c |   17 ++---
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index dc0a035..cc14da4 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -76,7 +76,6 @@ unsigned long memory_block_size_bytes(void)
 static int pseries_remove_memblock(unsigned long base, unsigned int 
memblock_size)
 {
unsigned long start, start_pfn;
-   struct zone *zone;
int i, ret;
int sections_to_remove;
 
@@ -87,8 +86,6 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned int memblock_siz
return 0;
}
 
-   zone = page_zone(pfn_to_page(start_pfn));
-
/*
 * Remove section mappings and sysfs entries for the
 * section of the memory we are removing.
@@ -101,7 +98,7 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned int memblock_siz
sections_to_remove = (memblock_size >> PAGE_SHIFT) / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = start_pfn + i * PAGES_PER_SECTION;
-   ret = __remove_pages(zone, start_pfn,  PAGES_PER_SECTION);
+   ret = __remove_pages(start_pfn,  PAGES_PER_SECTION);
if (ret)
return ret;
}
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index fd84ea9..8bf820d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -90,8 +90,7 @@ extern bool is_pageblock_removable_nolock(struct page *page);
 /* reasonably generic interface to expand the physical pages in a zone  */
 extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages);
-extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
-   unsigned long nr_pages);
+extern int __remove_pages(unsigned long start_pfn, unsigned long nr_pages);
 
 #ifdef CONFIG_NUMA
 extern int memory_add_physaddr_to_nid(u64 start);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2353887..7fbfc9f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -275,11 +275,14 @@ static int __meminit __add_section(int nid, struct zone 
*zone,
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 static int __remove_section(struct zone *zone, struct mem_section *ms)
 {
-   /*
-* XXX: Freeing memmap with vmemmap is not implement yet.
-*  This should be removed later.
-*/
-   return -EBUSY;
+	/* a section that is not valid cannot be unregistered */
+	if (!valid_section(ms))
+		return -EINVAL;
+
+	/* with vmemmap only unregister_memory_section() is called here */
+	return unregister_memory_section(ms);
 }
 #else
 static int __remove_section(struct zone *zone, struct mem_section *ms)
@@ -346,8 +349,7 @@ EXPORT_SYMBOL_GPL(__add_pages);
  * sure that pages are marked reserved and zones are adjust properly by
  * calling offline_pages().
  */
-int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
-unsigned long nr_pages)
+int __remove_pages(unsigned long phys_start_pfn, unsigned long nr_pages)
 {
unsigned long i, ret = 0;
int sections_to_remove;
@@ -363,6 +365,7 @@ int __remove_pages(struct zone *zone, unsigned long 
phys_start_pfn,
sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
+   struct zone *zone = page_zone(pfn_to_page(pfn));
ret = __remove_section(zone, __pfn_to_section(pfn));
if (ret)
break;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v9 PATCH 09/21] memory-hotplug: does not release memory region in PAGES_PER_SECTION chunks

2012-09-05 Thread wency
From: Yasuaki Ishimatsu 

Since commit de7f0cba96786c was applied, release_mem_region() has been called
in PAGES_PER_SECTION chunks, because register_memory_resource() is called in
PAGES_PER_SECTION chunks by add_memory(). But this depends on the firmware.
If the CRS entries are written in PAGES_PER_SECTION chunks in the ACPI DSDT
table, register_memory_resource() is called in PAGES_PER_SECTION chunks.
But if the CRS entries are written per DIMM in the ACPI DSDT table,
register_memory_resource() is called per DIMM. So release_mem_region()
should not be called in PAGES_PER_SECTION chunks. The patch fixes it.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 arch/powerpc/platforms/pseries/hotplug-memory.c |   13 +
 mm/memory_hotplug.c |4 ++--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 11d8e05..dc0a035 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -77,7 +77,8 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned int memblock_siz
 {
unsigned long start, start_pfn;
struct zone *zone;
-   int ret;
+   int i, ret;
+   int sections_to_remove;
 
start_pfn = base >> PAGE_SHIFT;
 
@@ -97,9 +98,13 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned int memblock_siz
 * to sysfs "state" file and we can't remove sysfs entries
 * while writing to it. So we have to defer it to here.
 */
-   ret = __remove_pages(zone, start_pfn, memblock_size >> PAGE_SHIFT);
-   if (ret)
-   return ret;
+   sections_to_remove = (memblock_size >> PAGE_SHIFT) / PAGES_PER_SECTION;
+   for (i = 0; i < sections_to_remove; i++) {
+   unsigned long pfn = start_pfn + i * PAGES_PER_SECTION;
+   ret = __remove_pages(zone, start_pfn,  PAGES_PER_SECTION);
+   if (ret)
+   return ret;
+   }
 
/*
 * Update memory regions for memory remove
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e74a01d..2353887 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -358,11 +358,11 @@ int __remove_pages(struct zone *zone, unsigned long 
phys_start_pfn,
BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
BUG_ON(nr_pages % PAGES_PER_SECTION);
 
+   release_mem_region(phys_start_pfn << PAGE_SHIFT,  nr_pages * PAGE_SIZE);
+
sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
-   release_mem_region(pfn << PAGE_SHIFT,
-  PAGES_PER_SECTION << PAGE_SHIFT);
ret = __remove_section(zone, __pfn_to_section(pfn));
if (ret)
break;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v9 PATCH 01/21] memory-hotplug: rename remove_memory() to offline_memory()/offline_pages()

2012-09-05 Thread wency
From: Yasuaki Ishimatsu 

remove_memory() only tries to offline pages. It is called in two cases:
1. hot remove a memory device
2. echo offline >/sys/devices/system/memory/memoryXX/state

In the 1st case, we should also change memory block's state, and notify
the userspace that the memory block's state is changed after offlining
pages.

So rename remove_memory() to offline_memory()/offline_pages(). And in
the 1st case, offline_memory() will be used. The function offline_memory()
is not implemented. In the 2nd case, offline_pages() will be used.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/acpi_memhotplug.c |2 +-
 drivers/base/memory.c  |9 +++--
 include/linux/memory_hotplug.h |3 ++-
 mm/memory_hotplug.c|   22 ++
 4 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 24c807f..2a7beac 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -318,7 +318,7 @@ static int acpi_memory_disable_device(struct 
acpi_memory_device *mem_device)
 */
list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
if (info->enabled) {
-   result = remove_memory(info->start_addr, info->length);
+   result = offline_memory(info->start_addr, info->length);
if (result)
return result;
}
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 7dda4f7..44e7de6 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -248,26 +248,23 @@ static bool pages_correctly_reserved(unsigned long 
start_pfn,
 static int
 memory_block_action(unsigned long phys_index, unsigned long action)
 {
-   unsigned long start_pfn, start_paddr;
+   unsigned long start_pfn;
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
struct page *first_page;
int ret;
 
first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
+   start_pfn = page_to_pfn(first_page);
 
switch (action) {
case MEM_ONLINE:
-   start_pfn = page_to_pfn(first_page);
-
if (!pages_correctly_reserved(start_pfn, nr_pages))
return -EBUSY;
 
ret = online_pages(start_pfn, nr_pages);
break;
case MEM_OFFLINE:
-   start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
-   ret = remove_memory(start_paddr,
-   nr_pages << PAGE_SHIFT);
+   ret = offline_pages(start_pfn, nr_pages);
break;
default:
WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 910550f..c183f39 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -233,7 +233,8 @@ static inline int is_mem_section_removable(unsigned long 
pfn,
 extern int mem_online_node(int nid);
 extern int add_memory(int nid, u64 start, u64 size);
 extern int arch_add_memory(int nid, u64 start, u64 size);
-extern int remove_memory(u64 start, u64 size);
+extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
+extern int offline_memory(u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
int nr_pages);
 extern void sparse_remove_one_section(struct zone *zone, struct mem_section 
*ms);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3ad25f9..bb42316 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -866,7 +866,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long 
end_pfn)
return offlined;
 }
 
-static int __ref offline_pages(unsigned long start_pfn,
+static int __ref __offline_pages(unsigned long start_pfn,
  unsigned long end_pfn, unsigned long timeout)
 {
unsigned long pfn, nr_pages, expire;
@@ -994,18 +994,24 @@ out:
return ret;
 }
 
-int remove_memory(u64 start, u64 size)
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
-   unsigned long start_pfn, end_pfn;
+   return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
+}
 
-   start_pfn = PFN_DOWN(start);
-   end_pfn = start_pfn + PFN_DOWN(size);
-   return offline_pages(start_pfn, end_pfn, 120 * HZ);
+int offline_memory(u64 start, u64 size)
+{
+   return -EINVAL;
 }
 #else
-int remove_memory(u64 start, u64 size)
+int offline_pages(u

[RFC v9 PATCH 00/21] memory-hotplug: hot-remove physical memory

2012-09-05 Thread wency
From: Wen Congyang 

This patch series aims to support physical memory hot-remove.

The patches can free/remove the following things:

  - acpi_memory_info  : [RFC PATCH 4/19]
  - /sys/firmware/memmap/X/{end, start, type} : [RFC PATCH 8/19]
  - iomem_resource: [RFC PATCH 9/19]
  - mem_section and related sysfs files   : [RFC PATCH 10-11, 13-16/19]
  - page table of removed memory  : [RFC PATCH 12/19]
  - node and related sysfs files  : [RFC PATCH 18-19/19]

If you find any functionality missing for physical memory hot-remove, please
let me know.

How to test this patchset?
1. apply this patchset and build the kernel. MEMORY_HOTPLUG, MEMORY_HOTREMOVE,
   ACPI_HOTPLUG_MEMORY must be selected.
2. load the module acpi_memhotplug
3. hotplug the memory device(it depends on your hardware)
   You will see the memory device under the directory /sys/bus/acpi/devices/.
   Its name is PNP0C80:XX.
4. online/offline pages provided by this memory device
   You can write online/offline to /sys/devices/system/memory/memoryX/state to
   online/offline pages provided by this memory device
5. hotremove the memory device
   You can hotremove the memory device by the hardware, or writing 1 to
   /sys/bus/acpi/devices/PNP0C80:XX/eject.

Note: if the memory provided by the memory device is used by the kernel, it
can't be offlined. It is not a bug.

Known problems:
1. memory can't be offlined when CONFIG_MEMCG is selected.
   For example: there is a memory device on node 1. The address range
   is [1G, 1.5G). You will find 4 new directories memory8, memory9, memory10,
   and memory11 under the directory /sys/devices/system/memory/.
   If CONFIG_MEMCG is selected, we will allocate memory to store page cgroup
   when we online pages. When we online memory8, the memory that stores its
   page cgroup is not provided by this memory device. But when we online
   memory9, the memory that stores its page cgroup may be provided by memory8.
   So we can't offline memory8 now. We should offline the memory in the
   reverse order.
   When the memory device is hotremoved, we will auto offline memory provided
   by this memory device. But we don't know which memory is onlined first, so
   offlining memory may fail. In such case, you should offline the memory by
   hand before hotremoving the memory device.
2. hotremoving a memory device may cause a kernel panic
   This bug will be fixed by Liu Jiang's patch:
   https://lkml.org/lkml/2012/7/3/1

change log of v9:
 [RFC PATCH v9 8/21]
   * add a lock to protect the list map_entries
   * add an indicator to firmware_map_entry to remember whether the memory
 is allocated from bootmem
 [RFC PATCH v9 10/21]
   * change the macro to inline function
 [RFC PATCH v9 19/21]
   * don't offline the node if the cpu on the node is onlined
 [RFC PATCH v9 21/21]
   * create new patch: auto offline page_cgroup when onlining memory block
 failed

change log of v8:
 [RFC PATCH v8 17/20]
   * Fix problems when one node's range include the other nodes
 [RFC PATCH v8 18/20]
   * fix building error when CONFIG_MEMORY_HOTPLUG_SPARSE or CONFIG_HUGETLBFS
 is not defined.
 [RFC PATCH v8 19/20]
   * don't offline node when some memory sections are not removed
 [RFC PATCH v8 20/20]
   * create new patch: clear hwpoisoned flag when onlining pages

change log of v7:
 [RFC PATCH v7 4/19]
   * do not continue if acpi_memory_device_remove_memory() fails.
 [RFC PATCH v7 15/19]
   * handle usemap in register_page_bootmem_info_section() too.

change log of v6:
 [RFC PATCH v6 12/19]
   * fix build errors on architectures other than x86

 [RFC PATCH v6 15-16/19]
   * fix build errors on architectures other than x86

change log of v5:
 * merge the patchset to clear page table and the patchset to hot remove
   memory(from ishimatsu) to one big patchset.

 [RFC PATCH v5 1/19]
   * rename remove_memory() to offline_memory()/offline_pages()

 [RFC PATCH v5 2/19]
   * new patch: implement offline_memory(). This function offlines pages,
 update memory block's state, and notify the userspace that the memory
 block's state is changed.

 [RFC PATCH v5 4/19]
   * offline and remove memory in acpi_memory_disable_device() too.

 [RFC PATCH v5 17/19]
   * new patch: add a new function __remove_zone() to revert the things done
 in the function __add_zone().

 [RFC PATCH v5 18/19]
   * flush work before resetting the node device.

change log of v4:
 * remove "memory-hotplug : unify argument of firmware_map_add_early/hotplug"
   from the patch series, since the patch is a bugfix. It is being discussed
   on another thread. But for testing the patch series, the patch is needed.
   So I added the patch as [PATCH 0/13].

 [RFC PATCH v4 2/13]
   * check memory is online or not at remove_memory()
   * add memory_add_physaddr_to_nid() to acpi_memory_device_remove() for
 getting node id
 
 [RFC PATCH v4 3/13]
   * create new patch : check memory is online or not at online_pages()


[RFC v9 PATCH 04/21] memory-hotplug: offline and remove memory when removing the memory device

2012-09-05 Thread wency
From: Yasuaki Ishimatsu 

We should offline and remove memory when removing the memory device.
The memory device can be removed by 2 ways:
1. send eject request by SCI
2. echo 1 >/sys/bus/acpi/devices/PNP0C80:XX/eject

In the 1st case, acpi_memory_disable_device() will be called. In the 2nd
case, acpi_memory_device_remove() will be called. acpi_memory_device_remove()
will also be called when we unbind the memory device from the driver
acpi_memhotplug. If the type is ACPI_BUS_REMOVAL_EJECT, it means
that the user wants to eject the memory device, and we should offline
and remove memory in acpi_memory_device_remove().

The function remove_memory() is not implemented yet. For now it only checks
whether all memory has been offlined.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/acpi_memhotplug.c |   45 +--
 drivers/base/memory.c  |   39 ++
 include/linux/memory.h |5 
 include/linux/memory_hotplug.h |5 
 mm/memory_hotplug.c|   22 +++
 5 files changed, 109 insertions(+), 7 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 7873832..9d47458 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -310,25 +311,44 @@ static int acpi_memory_powerdown_device(struct 
acpi_memory_device *mem_device)
return 0;
 }
 
-static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
+static int
+acpi_memory_device_remove_memory(struct acpi_memory_device *mem_device)
 {
int result;
struct acpi_memory_info *info, *n;
+   int node = mem_device->nid;
 
-
-   /*
-* Ask the VM to offline this memory range.
-* Note: Assume that this function returns zero on success
-*/
list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
if (info->enabled) {
result = offline_memory(info->start_addr, info->length);
if (result)
return result;
+
+   result = remove_memory(node, info->start_addr,
+  info->length);
+   if (result)
+   return result;
}
+
+   list_del(&info->list);
kfree(info);
}
 
+   return 0;
+}
+
+static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
+{
+   int result;
+
+   /*
+* Ask the VM to offline this memory range.
+* Note: Assume that this function returns zero on success
+*/
+   result = acpi_memory_device_remove_memory(mem_device);
+   if (result)
+   return result;
+
/* Power-off and eject the device */
result = acpi_memory_powerdown_device(mem_device);
if (result) {
@@ -477,12 +497,23 @@ static int acpi_memory_device_add(struct acpi_device 
*device)
 static int acpi_memory_device_remove(struct acpi_device *device, int type)
 {
struct acpi_memory_device *mem_device = NULL;
-
+   int result;
 
if (!device || !acpi_driver_data(device))
return -EINVAL;
 
mem_device = acpi_driver_data(device);
+
+   if (type == ACPI_BUS_REMOVAL_EJECT) {
+   /*
+* offline and remove memory only when the memory device is
+* ejected.
+*/
+   result = acpi_memory_device_remove_memory(mem_device);
+   if (result)
+   return result;
+   }
+
kfree(mem_device);
 
return 0;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 86c8821..038be73 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -70,6 +70,45 @@ void unregister_memory_isolate_notifier(struct 
notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_memory_isolate_notifier);
 
+bool is_memblk_offline(unsigned long start, unsigned long size)
+{
+   struct memory_block *mem = NULL;
+   struct mem_section *section;
+   unsigned long start_pfn, end_pfn;
+   unsigned long pfn, section_nr;
+
+   start_pfn = PFN_DOWN(start);
+   end_pfn = PFN_UP(start + size);
+
+   for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+   section_nr = pfn_to_section_nr(pfn);
+   if (!present_section_nr(section_nr))
+   continue;
+
+   section = __nr_to_section(section_nr);
+   /* same memblock? */
+   if (mem)
+   if ((section_nr >= mem->start_section_n

[RFC v9 PATCH 05/21] memory-hotplug: check whether memory is present or not

2012-09-05 Thread wency
From: Yasuaki Ishimatsu 

If the system supports memory hot-remove, online_pages() may online removed
pages. So online_pages() needs to check whether the pages being onlined are
present or not.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 include/linux/mmzone.h |   19 +++
 mm/memory_hotplug.c|   13 +
 2 files changed, 32 insertions(+), 0 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2daa54f..ac3ae30 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1180,6 +1180,25 @@ void sparse_init(void);
 #define sparse_index_init(_sec, _nid)  do {} while (0)
 #endif /* CONFIG_SPARSEMEM */
 
+#ifdef CONFIG_SPARSEMEM
+static inline int pfns_present(unsigned long pfn, unsigned long nr_pages)
+{
+   int i;
+   for (i = 0; i < nr_pages; i++) {
+   if (pfn_present(pfn + i))
+   continue;
+   else
+   return -EINVAL;
+   }
+   return 0;
+}
+#else
+static inline int pfns_present(unsigned long pfn, unsigned long nr_pages)
+{
+   return 0;
+}
+#endif /* CONFIG_SPARSEMEM*/
+
 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
 bool early_pfn_in_nid(unsigned long pfn, int nid);
 #else
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 49f7747..299747d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -467,6 +467,19 @@ int __ref online_pages(unsigned long pfn, unsigned long 
nr_pages)
struct memory_notify arg;
 
lock_memory_hotplug();
+   /*
+* If system supports memory hot-remove, the memory may have been
+* removed. So we check whether the memory has been removed or not.
+*
+* Note: When CONFIG_SPARSEMEM is defined, pfns_present() become
+*   effective. If CONFIG_SPARSEMEM is not defined, pfns_present()
+*   always returns 0.
+*/
+   ret = pfns_present(pfn, nr_pages);
+   if (ret) {
+   unlock_memory_hotplug();
+   return ret;
+   }
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
arg.status_change_nid = -1;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v9 PATCH 02/21] memory-hotplug: implement offline_memory()

2012-09-05 Thread wency
From: Wen Congyang 

The function offline_memory() will be called when hot removing a
memory device. The memory device may contain more than one memory
block. If the memory block has been offlined, __offline_pages()
will fail. So we should try to offline one memory block at a
time.

If the memory block is offlined in offline_memory(), we also
update its state, and notify the userspace that its state is
changed.

The function offline_memory() also checks each memory block's
state. So there is no need to check the memory block's state
before calling offline_memory().

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
CC: Vasilis Liaskovitis 
Signed-off-by: Wen Congyang 
---
 drivers/base/memory.c  |   31 +++
 include/linux/memory_hotplug.h |2 ++
 mm/memory_hotplug.c|   37 -
 3 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 44e7de6..86c8821 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -275,13 +275,11 @@ memory_block_action(unsigned long phys_index, unsigned 
long action)
return ret;
 }
 
-static int memory_block_change_state(struct memory_block *mem,
+static int __memory_block_change_state(struct memory_block *mem,
unsigned long to_state, unsigned long from_state_req)
 {
int ret = 0;
 
-   mutex_lock(&mem->state_mutex);
-
if (mem->state != from_state_req) {
ret = -EINVAL;
goto out;
@@ -309,10 +307,20 @@ static int memory_block_change_state(struct memory_block 
*mem,
break;
}
 out:
-   mutex_unlock(&mem->state_mutex);
return ret;
 }
 
+static int memory_block_change_state(struct memory_block *mem,
+   unsigned long to_state, unsigned long from_state_req)
+{
+   int ret;
+
+   mutex_lock(&mem->state_mutex);
+   ret = __memory_block_change_state(mem, to_state, from_state_req);
+   mutex_unlock(&mem->state_mutex);
+
+   return ret;
+}
 static ssize_t
 store_mem_state(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
@@ -653,6 +661,21 @@ int unregister_memory_section(struct mem_section *section)
 }
 
 /*
+ * offline one memory block. If the memory block has been offlined, do nothing.
+ */
+int offline_memory_block(struct memory_block *mem)
+{
+   int ret = 0;
+
+   mutex_lock(&mem->state_mutex);
+   if (mem->state != MEM_OFFLINE)
+   ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
+   mutex_unlock(&mem->state_mutex);
+
+   return ret;
+}
+
+/*
  * Initialize the sysfs support for memory devices...
  */
 int __init memory_dev_init(void)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index c183f39..0b040bb 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -10,6 +10,7 @@ struct page;
 struct zone;
 struct pglist_data;
 struct mem_section;
+struct memory_block;
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 
@@ -234,6 +235,7 @@ extern int mem_online_node(int nid);
 extern int add_memory(int nid, u64 start, u64 size);
 extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
+extern int offline_memory_block(struct memory_block *mem);
 extern int offline_memory(u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
int nr_pages);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index bb42316..6fc1908 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1001,7 +1001,42 @@ int offline_pages(unsigned long start_pfn, unsigned long 
nr_pages)
 
 int offline_memory(u64 start, u64 size)
 {
-   return -EINVAL;
+   struct memory_block *mem = NULL;
+   struct mem_section *section;
+   unsigned long start_pfn, end_pfn;
+   unsigned long pfn, section_nr;
+   int ret;
+
+   start_pfn = PFN_DOWN(start);
+   end_pfn = start_pfn + PFN_DOWN(size);
+
+   for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+   section_nr = pfn_to_section_nr(pfn);
+   if (!present_section_nr(section_nr))
+   continue;
+
+   section = __nr_to_section(section_nr);
+   /* same memblock? */
+   if (mem)
+   if ((section_nr >= mem->start_section_nr) &&
+   (section_nr <= mem->end_section_nr))
+   continue;
+
+   mem = find_memory_block_hinted(section, mem);
+   if (!mem)
+   continue;
+
+   ret = offline

[RFC v9 PATCH 15/21] memory-hotplug: implement register_page_bootmem_info_section of sparse-vmemmap

2012-09-05 Thread wency
From: Yasuaki Ishimatsu 

To remove a memmap region of sparse-vmemmap which is allocated from bootmem,
the memmap region of sparse-vmemmap needs to be registered by get_page_bootmem().
So the patch searches pages of virtual mapping and registers the pages by
get_page_bootmem().

Note: register_page_bootmem_memmap() is not implemented for ia64, ppc, s390,
and sparc.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 arch/ia64/mm/discontig.c   |6 
 arch/powerpc/mm/init_64.c  |6 
 arch/s390/mm/vmem.c|6 
 arch/sparc/mm/init_64.c|6 
 arch/x86/mm/init_64.c  |   52 
 include/linux/memory_hotplug.h |2 +
 include/linux/mm.h |3 +-
 mm/memory_hotplug.c|   31 +--
 8 files changed, 108 insertions(+), 4 deletions(-)

diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index c641333..33943db 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -822,4 +822,10 @@ int __meminit vmemmap_populate(struct page *start_page,
 {
return vmemmap_populate_basepages(start_page, size, node);
 }
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   /* TODO */
+}
 #endif
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 620b7ac..3690c44 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -298,5 +298,11 @@ int __meminit vmemmap_populate(struct page *start_page,
 
return 0;
 }
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   /* TODO */
+}
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 6f896e7..eda55cd 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -227,6 +227,12 @@ out:
return ret;
 }
 
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   /* TODO */
+}
+
 /*
  * Add memory segment to the segment list if it doesn't overlap with
  * an already present segment.
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index d58edf5..add1cc7 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2077,6 +2077,12 @@ void __meminit vmemmap_populate_print_last(void)
node_start = 0;
}
 }
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   /* TODO */
+}
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 static void prot_init_common(unsigned long page_none,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index e0d88ba..0075592 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1138,6 +1138,58 @@ vmemmap_populate(struct page *start_page, unsigned long 
size, int node)
return 0;
 }
 
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   unsigned long addr = (unsigned long)start_page;
+   unsigned long end = (unsigned long)(start_page + size);
+   unsigned long next;
+   pgd_t *pgd;
+   pud_t *pud;
+   pmd_t *pmd;
+
+   for (; addr < end; addr = next) {
+   pte_t *pte = NULL;
+
+   pgd = pgd_offset_k(addr);
+   if (pgd_none(*pgd)) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   continue;
+   }
+   get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
+
+   pud = pud_offset(pgd, addr);
+   if (pud_none(*pud)) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   continue;
+   }
+   get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
+
+   if (!cpu_has_pse) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   pmd = pmd_offset(pud, addr);
+   if (pmd_none(*pmd))
+   continue;
+   get_page_bootmem(section_nr, pmd_page(*pmd),
+MIX_SECTION_INFO);
+
+   pte = pte_offset_kernel(pmd, addr);
+   if (pte_none(*pte))
+   continue;
+   get_page_bootmem(section_nr, pte_page(*pte),
+SECTION_INFO);
+   } else {
+   next = pmd_addr_end(addr, end);
+
+

[RFC v9 PATCH 06/21] memory-hotplug: export the function acpi_bus_remove()

2012-09-05 Thread wency
From: Wen Congyang 

The function acpi_bus_remove() can remove an acpi device from the acpi bus.
When an acpi device is removed, we need to call this function to remove
the acpi device from the acpi bus. So export this function.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/scan.c |3 ++-
 include/acpi/acpi_bus.h |1 +
 2 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index d1ecca2..1cefc34 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1224,7 +1224,7 @@ static int acpi_device_set_context(struct acpi_device 
*device)
return -ENODEV;
 }
 
-static int acpi_bus_remove(struct acpi_device *dev, int rmdevice)
+int acpi_bus_remove(struct acpi_device *dev, int rmdevice)
 {
if (!dev)
return -EINVAL;
@@ -1246,6 +1246,7 @@ static int acpi_bus_remove(struct acpi_device *dev, int 
rmdevice)
 
return 0;
 }
+EXPORT_SYMBOL(acpi_bus_remove);
 
 static int acpi_add_single_object(struct acpi_device **child,
  acpi_handle handle, int type,
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index bde976e..2ccf109 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -360,6 +360,7 @@ bool acpi_bus_power_manageable(acpi_handle handle);
 bool acpi_bus_can_wakeup(acpi_handle handle);
 int acpi_power_resource_register_device(struct device *dev, acpi_handle 
handle);
 void acpi_power_resource_unregister_device(struct device *dev, acpi_handle 
handle);
+int acpi_bus_remove(struct acpi_device *dev, int rmdevice);
 #ifdef CONFIG_ACPI_PROC_EVENT
 int acpi_bus_generate_proc_event(struct acpi_device *device, u8 type, int 
data);
 int acpi_bus_generate_proc_event4(const char *class, const char *bid, u8 type, 
int data);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v9 PATCH 07/21] memory-hotplug: call acpi_bus_remove() to remove memory device

2012-09-05 Thread wency
From: Wen Congyang 

The memory device has been ejected and powered off, so we can call
acpi_bus_remove() to remove the memory device from acpi bus.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/acpi_memhotplug.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 9d47458..b152767 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -425,8 +425,9 @@ static void acpi_memory_device_notify(acpi_handle handle, 
u32 event, void *data)
}
 
/*
-* TBD: Invoke acpi_bus_remove to cleanup data structures
+* Invoke acpi_bus_remove() to remove memory device
 */
+   acpi_bus_remove(device, 1);
 
/* _EJ0 succeeded; _OST is not necessary */
return;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v9 PATCH 20/21] memory-hotplug: clear hwpoisoned flag when onlining pages

2012-09-05 Thread wency
From: Wen Congyang 

The hwpoisoned flag may be set when we offline a page by the sysfs interface
/sys/devices/system/memory/soft_offline_page or
/sys/devices/system/memory/hard_offline_page. If we don't clear
this flag when onlining pages, this page can't be freed, and will
not be in the free list. So we can't offline these pages again. So we
should clear this flag when onlining pages.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/memory_hotplug.c |5 +
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 270c249..140c080 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -661,6 +661,11 @@ EXPORT_SYMBOL_GPL(__online_page_increment_counters);
 
 void __online_page_free(struct page *page)
 {
+#ifdef CONFIG_MEMORY_FAILURE
+   /* The page may be marked HWPoisoned by soft/hard offline page */
+   ClearPageHWPoison(page);
+#endif
+
ClearPageReserved(page);
init_page_count(page);
__free_page(page);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v9 PATCH 21/21] memory-hotplug: auto offline page_cgroup when onlining memory block failed

2012-09-05 Thread wency
From: Wen Congyang 

When a memory block is onlined, we will try to allocate memory on that node
to store page_cgroup. If onlining the memory block fails, we don't
offline the page cgroup, and we have no chance to offline this page cgroup
unless the memory block is onlined successfully again. As a result,
we can't hot-remove the memory device on that node, because some
memory is used to store page cgroup. If onlining the memory block
fails, there is no need to store page cgroup for this memory. So
auto offline page_cgroup when onlining the memory block fails.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/page_cgroup.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5ddad0c..44db00e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct 
notifier_block *self,
mn->nr_pages, mn->status_change_nid);
break;
case MEM_CANCEL_ONLINE:
+   offline_page_cgroup(mn->start_pfn,
+   mn->nr_pages, mn->status_change_nid);
+   break;
case MEM_GOING_OFFLINE:
break;
case MEM_ONLINE:
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v9 PATCH 08/21] memory-hotplug: remove /sys/firmware/memmap/X sysfs

2012-09-05 Thread wency
From: Yasuaki Ishimatsu 

When (hot)adding memory into system, /sys/firmware/memmap/X/{end, start, type}
sysfs files are created. But there is no code to remove these files. The patch
implements the function to remove them.

Note : The code does not free firmware_map_entry since there is no way to free
   memory which is allocated by bootmem.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/firmware/memmap.c|   98 +-
 include/linux/firmware-map.h |6 +++
 mm/memory_hotplug.c  |9 +++-
 3 files changed, 109 insertions(+), 4 deletions(-)

diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index c1cdc92..6740d26 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Data types 
--
@@ -41,6 +42,7 @@ struct firmware_map_entry {
const char  *type;  /* type of the memory range */
struct list_headlist;   /* entry for the linked list */
struct kobject  kobj;   /* kobject for each entry */
+   unsigned intbootmem:1; /* allocated from bootmem */
 };
 
 /*
@@ -79,7 +81,26 @@ static const struct sysfs_ops memmap_attr_ops = {
.show = memmap_attr_show,
 };
 
+
+static inline struct firmware_map_entry *
+to_memmap_entry(struct kobject *kobj)
+{
+   return container_of(kobj, struct firmware_map_entry, kobj);
+}
+
+static void release_firmware_map_entry(struct kobject *kobj)
+{
+   struct firmware_map_entry *entry = to_memmap_entry(kobj);
+
+   if (entry->bootmem)
+   /* There is no way to free memory allocated from bootmem */
+   return;
+
+   kfree(entry);
+}
+
 static struct kobj_type memmap_ktype = {
+   .release= release_firmware_map_entry,
.sysfs_ops  = &memmap_attr_ops,
.default_attrs  = def_attrs,
 };
@@ -94,6 +115,7 @@ static struct kobj_type memmap_ktype = {
  * in firmware initialisation code in one single thread of execution.
  */
 static LIST_HEAD(map_entries);
+static DEFINE_SPINLOCK(map_entries_lock);
 
 /**
  * firmware_map_add_entry() - Does the real work to add a firmware memmap 
entry.
@@ -118,11 +140,25 @@ static int firmware_map_add_entry(u64 start, u64 end,
INIT_LIST_HEAD(&entry->list);
kobject_init(&entry->kobj, &memmap_ktype);
 
+   spin_lock(&map_entries_lock);
list_add_tail(&entry->list, &map_entries);
+   spin_unlock(&map_entries_lock);
 
return 0;
 }
 
+/**
+ * firmware_map_remove_entry() - Does the real work to remove a firmware
+ * memmap entry.
+ * @entry: removed entry.
+ **/
+static inline void firmware_map_remove_entry(struct firmware_map_entry *entry)
+{
+   spin_lock(&map_entries_lock);
+   list_del(&entry->list);
+   spin_unlock(&map_entries_lock);
+}
+
 /*
  * Add memmap entry on sysfs
  */
@@ -144,6 +180,35 @@ static int add_sysfs_fw_map_entry(struct 
firmware_map_entry *entry)
return 0;
 }
 
+/*
+ * Remove memmap entry on sysfs
+ */
+static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry)
+{
+   kobject_put(&entry->kobj);
+}
+
+/*
+ * Search memmap entry
+ */
+
+static struct firmware_map_entry * __meminit
+firmware_map_find_entry(u64 start, u64 end, const char *type)
+{
+   struct firmware_map_entry *entry;
+
+   spin_lock(&map_entries_lock);
+   list_for_each_entry(entry, &map_entries, list)
+   if ((entry->start == start) && (entry->end == end) &&
+   (!strcmp(entry->type, type))) {
+   spin_unlock(&map_entries_lock);
+   return entry;
+   }
+
+   spin_unlock(&map_entries_lock);
+   return NULL;
+}
+
 /**
  * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do
  * memory hotplug.
@@ -193,9 +258,36 @@ int __init firmware_map_add_early(u64 start, u64 end, 
const char *type)
if (WARN_ON(!entry))
return -ENOMEM;
 
+   entry->bootmem = 1;
return firmware_map_add_entry(start, end, type, entry);
 }
 
+/**
+ * firmware_map_remove() - remove a firmware mapping entry
+ * @start: Start of the memory range.
+ * @end:   End of the memory range.
+ * @type:  Type of the memory range.
+ *
+ * removes a firmware mapping entry.
+ *
+ * Returns 0 on success, or -EINVAL if no entry.
+ **/
+int __meminit firmware_map_remove(u64 start, u64 end, const char *type)
+{
+   struct firmware_map_entry *entry;
+
+   entry = firmware_map_find_entry(start, end - 1, type);
+   if (!entry)
+   return -EINVAL;
+
+   firmware_map_remove_entry(entry);
+
+   /* remove the memmap

[RFC v9 PATCH 03/21] memory-hotplug: store the node id in acpi_memory_device

2012-09-05 Thread wency
From: Wen Congyang 

The memory device has only one node id. Store the node id when
enabling the memory device, so that we can reuse it when removing the
memory device.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
Reviewed-by: Yasuaki Ishimatsu 
---
 drivers/acpi/acpi_memhotplug.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 2a7beac..7873832 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -83,6 +83,7 @@ struct acpi_memory_info {
 struct acpi_memory_device {
struct acpi_device * device;
unsigned int state; /* State of the memory device */
+   int nid;
struct list_head res_list;
 };
 
@@ -256,6 +257,9 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
info->enabled = 1;
num_enabled++;
}
+
+   mem_device->nid = node;
+
if (!num_enabled) {
printk(KERN_ERR PREFIX "add_memory failed\n");
mem_device->state = MEMORY_INVALID_STATE;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v8 PATCH 03/20] memory-hotplug: store the node id in acpi_memory_device

2012-08-28 Thread wency
From: Wen Congyang 

The memory device has only one node id. Store the node id when
enabling the memory device, so that we can reuse it when removing the
memory device.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
Reviewed-by: Yasuaki Ishimatsu 
---
 drivers/acpi/acpi_memhotplug.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 2a7beac..7873832 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -83,6 +83,7 @@ struct acpi_memory_info {
 struct acpi_memory_device {
struct acpi_device * device;
unsigned int state; /* State of the memory device */
+   int nid;
struct list_head res_list;
 };
 
@@ -256,6 +257,9 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
info->enabled = 1;
num_enabled++;
}
+
+   mem_device->nid = node;
+
if (!num_enabled) {
printk(KERN_ERR PREFIX "add_memory failed\n");
mem_device->state = MEMORY_INVALID_STATE;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v8 PATCH 13/20] memory-hotplug: check page type in get_page_bootmem

2012-08-28 Thread wency
From: Yasuaki Ishimatsu 

There is a possibility that get_page_bootmem() is called on the same page many
times. So when get_page_bootmem() is called on a page that is already marked
with a bootmem type, the function only increments page->_count.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 mm/memory_hotplug.c |   15 +++
 1 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 5f9f8c7..d85af6d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -95,10 +95,17 @@ static void release_memory_resource(struct resource *res)
 static void get_page_bootmem(unsigned long info,  struct page *page,
 unsigned long type)
 {
-   page->lru.next = (struct list_head *) type;
-   SetPagePrivate(page);
-   set_page_private(page, info);
-   atomic_inc(&page->_count);
+   unsigned long page_type;
+
+   page_type = (unsigned long) page->lru.next;
+   if (page_type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+   page_type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE){
+   page->lru.next = (struct list_head *) type;
+   SetPagePrivate(page);
+   set_page_private(page, info);
+   atomic_inc(&page->_count);
+   } else
+   atomic_inc(&page->_count);
 }
 
 /* reference to __meminit __free_pages_bootmem is valid
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v8 PATCH 18/20] memory-hotplug: add node_device_release

2012-08-28 Thread wency
From: Yasuaki Ishimatsu 

When calling unregister_node(), the function shows the following message at
device_release().

Device 'node2' does not have a release() function, it is broken and must be
fixed.

So the patch implements node_device_release()

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/base/node.c |   11 +++
 1 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index af1a177..07523fb 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -252,6 +252,16 @@ static inline void hugetlb_register_node(struct node 
*node) {}
 static inline void hugetlb_unregister_node(struct node *node) {}
 #endif
 
+static void node_device_release(struct device *dev)
+{
+   struct node *node_dev = to_node(dev);
+
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
+   flush_work(&node_dev->node_work);
+#endif
+
+   memset(node_dev, 0, sizeof(struct node));
+}
 
 /*
  * register_node - Setup a sysfs device for a node.
@@ -265,6 +275,7 @@ int register_node(struct node *node, int num, struct node 
*parent)
 
node->dev.id = num;
node->dev.bus = &node_subsys;
+   node->dev.release = node_device_release;
error = device_register(&node->dev);
 
if (!error){
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v8 PATCH 10/20] memory-hotplug: add memory_block_release

2012-08-28 Thread wency
From: Yasuaki Ishimatsu 

When calling remove_memory_block(), the function shows the following message at
device_release().

Device 'memory528' does not have a release() function, it is broken and must
be fixed.

remove_memory_block() calls kfree(mem). I think it should be called from
device_release(). So the patch implements memory_block_release()

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 drivers/base/memory.c |   11 ++-
 1 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 038be73..1cd3ef3 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -109,6 +109,15 @@ bool is_memblk_offline(unsigned long start, unsigned long 
size)
 }
 EXPORT_SYMBOL(is_memblk_offline);
 
+#define to_memory_block(device) container_of(device, struct memory_block, dev)
+
+static void release_memory_block(struct device *dev)
+{
+   struct memory_block *mem = to_memory_block(dev);
+
+   kfree(mem);
+}
+
 /*
  * register_memory - Setup a sysfs device for a memory block
  */
@@ -119,6 +128,7 @@ int register_memory(struct memory_block *memory)
 
memory->dev.bus = &memory_subsys;
memory->dev.id = memory->start_section_nr / sections_per_block;
+   memory->dev.release = release_memory_block;
 
error = device_register(&memory->dev);
return error;
@@ -674,7 +684,6 @@ int remove_memory_block(unsigned long node_id, struct 
mem_section *section,
mem_remove_simple_file(mem, phys_device);
mem_remove_simple_file(mem, removable);
unregister_memory(mem);
-   kfree(mem);
} else
kobject_put(&mem->dev.kobj);
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v8 PATCH 19/20] memory-hotplug: remove sysfs file of node

2012-08-28 Thread wency
From: Wen Congyang 

This patch introduces a new function try_offline_node() to
remove sysfs file of node when all memory sections of this
node are removed. If some memory sections of this node are
not removed, this function does nothing.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/memory_hotplug.c |   33 +
 1 files changed, 33 insertions(+), 0 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 493298f..fb8af64 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1286,6 +1286,37 @@ int offline_memory(u64 start, u64 size)
return 0;
 }
 
+/* offline the node if all memory sections of this node are removed */
+static void try_offline_node(int nid)
+{
+   unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn;
+   unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_spanned_pages;
+   unsigned long pfn;
+
+   for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+   unsigned long section_nr = pfn_to_section_nr(pfn);
+
+   if (!present_section_nr(section_nr))
+   continue;
+
+   if (pfn_to_nid(pfn) != nid)
+   continue;
+
+   /*
+* some memory sections of this node are not removed, and we
+* can't offline node now.
+*/
+   return;
+   }
+
+   /*
+* all memory sections of this node are removed, we can offline this
+* node now.
+*/
+   node_set_offline(nid);
+   unregister_one_node(nid);
+}
+
 int __ref remove_memory(int nid, u64 start, u64 size)
 {
int ret = 0;
@@ -1306,6 +1337,8 @@ int __ref remove_memory(int nid, u64 start, u64 size)
firmware_map_remove(start, start + size, "System RAM");
 
arch_remove_memory(start, size);
+
+   try_offline_node(nid);
 out:
unlock_memory_hotplug();
return ret;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v8 PATCH 06/20] memory-hotplug: export the function acpi_bus_remove()

2012-08-28 Thread wency
From: Wen Congyang 

The function acpi_bus_remove() can remove an acpi device from the acpi bus.
When an acpi device is removed, we need to call this function to remove
the acpi device from the acpi bus. So export this function.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/scan.c |3 ++-
 include/acpi/acpi_bus.h |1 +
 2 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index d1ecca2..1cefc34 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1224,7 +1224,7 @@ static int acpi_device_set_context(struct acpi_device 
*device)
return -ENODEV;
 }
 
-static int acpi_bus_remove(struct acpi_device *dev, int rmdevice)
+int acpi_bus_remove(struct acpi_device *dev, int rmdevice)
 {
if (!dev)
return -EINVAL;
@@ -1246,6 +1246,7 @@ static int acpi_bus_remove(struct acpi_device *dev, int 
rmdevice)
 
return 0;
 }
+EXPORT_SYMBOL(acpi_bus_remove);
 
 static int acpi_add_single_object(struct acpi_device **child,
  acpi_handle handle, int type,
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index bde976e..2ccf109 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -360,6 +360,7 @@ bool acpi_bus_power_manageable(acpi_handle handle);
 bool acpi_bus_can_wakeup(acpi_handle handle);
 int acpi_power_resource_register_device(struct device *dev, acpi_handle 
handle);
 void acpi_power_resource_unregister_device(struct device *dev, acpi_handle 
handle);
+int acpi_bus_remove(struct acpi_device *dev, int rmdevice);
 #ifdef CONFIG_ACPI_PROC_EVENT
 int acpi_bus_generate_proc_event(struct acpi_device *device, u8 type, int 
data);
 int acpi_bus_generate_proc_event4(const char *class, const char *bid, u8 type, 
int data);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v8 PATCH 01/20] memory-hotplug: rename remove_memory() to offline_memory()/offline_pages()

2012-08-28 Thread wency
From: Yasuaki Ishimatsu 

remove_memory() only tries to offline pages. It is called in two cases:
1. hot remove a memory device
2. echo offline >/sys/devices/system/memory/memoryXX/state

In the 1st case, we should also change memory block's state, and notify
the userspace that the memory block's state is changed after offlining
pages.

So rename remove_memory() to offline_memory()/offline_pages(). And in
the 1st case, offline_memory() will be used. The function offline_memory()
is not implemented. In the 2nd case, offline_pages() will be used.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/acpi_memhotplug.c |2 +-
 drivers/base/memory.c  |9 +++--
 include/linux/memory_hotplug.h |3 ++-
 mm/memory_hotplug.c|   22 ++
 4 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 24c807f..2a7beac 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -318,7 +318,7 @@ static int acpi_memory_disable_device(struct 
acpi_memory_device *mem_device)
 */
list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
if (info->enabled) {
-   result = remove_memory(info->start_addr, info->length);
+   result = offline_memory(info->start_addr, info->length);
if (result)
return result;
}
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 7dda4f7..44e7de6 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -248,26 +248,23 @@ static bool pages_correctly_reserved(unsigned long 
start_pfn,
 static int
 memory_block_action(unsigned long phys_index, unsigned long action)
 {
-   unsigned long start_pfn, start_paddr;
+   unsigned long start_pfn;
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
struct page *first_page;
int ret;
 
first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
+   start_pfn = page_to_pfn(first_page);
 
switch (action) {
case MEM_ONLINE:
-   start_pfn = page_to_pfn(first_page);
-
if (!pages_correctly_reserved(start_pfn, nr_pages))
return -EBUSY;
 
ret = online_pages(start_pfn, nr_pages);
break;
case MEM_OFFLINE:
-   start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
-   ret = remove_memory(start_paddr,
-   nr_pages << PAGE_SHIFT);
+   ret = offline_pages(start_pfn, nr_pages);
break;
default:
WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 910550f..c183f39 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -233,7 +233,8 @@ static inline int is_mem_section_removable(unsigned long 
pfn,
 extern int mem_online_node(int nid);
 extern int add_memory(int nid, u64 start, u64 size);
 extern int arch_add_memory(int nid, u64 start, u64 size);
-extern int remove_memory(u64 start, u64 size);
+extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
+extern int offline_memory(u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
int nr_pages);
 extern void sparse_remove_one_section(struct zone *zone, struct mem_section 
*ms);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3ad25f9..c182c76 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -866,7 +866,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long 
end_pfn)
return offlined;
 }
 
-static int __ref offline_pages(unsigned long start_pfn,
+static int __ref __offline_pages(unsigned long start_pfn,
  unsigned long end_pfn, unsigned long timeout)
 {
unsigned long pfn, nr_pages, expire;
@@ -994,18 +994,24 @@ out:
return ret;
 }
 
-int remove_memory(u64 start, u64 size)
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
-   unsigned long start_pfn, end_pfn;
+   return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
+}
 
-   start_pfn = PFN_DOWN(start);
-   end_pfn = start_pfn + PFN_DOWN(size);
-   return offline_pages(start_pfn, end_pfn, 120 * HZ);
+int offline_memory(u64 start, u64 size)
+{
+   return -EINVAL;
 }
 #else
-int remove_memory(u64 start, u64 size)
+int offline_pages(u

[RFC v8 PATCH 20/20] memory-hotplug: clear hwpoisoned flag when onlining pages

2012-08-28 Thread wency
From: Wen Congyang 

The hwpoisoned flag may be set when we offline a page by the sysfs interface
/sys/devices/system/memory/soft_offline_page or
/sys/devices/system/memory/hard_offline_page. If we don't clear
this flag when onlining pages, this page can't be freed, and will
not be in the free list. So we can't offline these pages again. So we
should clear this flag when onlining pages.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/memory_hotplug.c |5 +
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fb8af64..85603c4 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -661,6 +661,11 @@ EXPORT_SYMBOL_GPL(__online_page_increment_counters);
 
 void __online_page_free(struct page *page)
 {
+#ifdef CONFIG_MEMORY_FAILURE
+   /* The page may be marked HWPoisoned by soft/hard offline page */
+   ClearPageHWPoison(page);
+#endif
+
ClearPageReserved(page);
init_page_count(page);
__free_page(page);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v8 PATCH 07/20] memory-hotplug: call acpi_bus_remove() to remove memory device

2012-08-28 Thread wency
From: Wen Congyang 

The memory device has been ejected and powered off, so we can call
acpi_bus_remove() to remove the memory device from acpi bus.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/acpi_memhotplug.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 9d47458..b152767 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -425,8 +425,9 @@ static void acpi_memory_device_notify(acpi_handle handle, 
u32 event, void *data)
}
 
/*
-* TBD: Invoke acpi_bus_remove to cleanup data structures
+* Invoke acpi_bus_remove() to remove memory device
 */
+   acpi_bus_remove(device, 1);
 
/* _EJ0 succeeded; _OST is not necessary */
return;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v8 PATCH 11/20] memory-hotplug: remove_memory calls __remove_pages

2012-08-28 Thread wency
From: Yasuaki Ishimatsu 

The patch adds __remove_pages() to remove_memory(). Then the range given by
the phys_start_pfn and nr_pages arguments of __remove_pages() may span
different zones. So the zone argument is removed from __remove_pages()
and __remove_pages() calculates the zone for each section.

When CONFIG_SPARSEMEM_VMEMMAP is defined, there is no way to remove a memmap.
So __remove_section only calls unregister_memory_section().

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 arch/powerpc/platforms/pseries/hotplug-memory.c |5 +
 include/linux/memory_hotplug.h  |3 +--
 mm/memory_hotplug.c |   18 +++---
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index dc0a035..cc14da4 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -76,7 +76,6 @@ unsigned long memory_block_size_bytes(void)
 static int pseries_remove_memblock(unsigned long base, unsigned int 
memblock_size)
 {
unsigned long start, start_pfn;
-   struct zone *zone;
int i, ret;
int sections_to_remove;
 
@@ -87,8 +86,6 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned int memblock_siz
return 0;
}
 
-   zone = page_zone(pfn_to_page(start_pfn));
-
/*
 * Remove section mappings and sysfs entries for the
 * section of the memory we are removing.
@@ -101,7 +98,7 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned int memblock_siz
sections_to_remove = (memblock_size >> PAGE_SHIFT) / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = start_pfn + i * PAGES_PER_SECTION;
-   ret = __remove_pages(zone, start_pfn,  PAGES_PER_SECTION);
+   ret = __remove_pages(start_pfn,  PAGES_PER_SECTION);
if (ret)
return ret;
}
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index fd84ea9..8bf820d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -90,8 +90,7 @@ extern bool is_pageblock_removable_nolock(struct page *page);
 /* reasonably generic interface to expand the physical pages in a zone  */
 extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages);
-extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
-   unsigned long nr_pages);
+extern int __remove_pages(unsigned long start_pfn, unsigned long nr_pages);
 
 #ifdef CONFIG_NUMA
 extern int memory_add_physaddr_to_nid(u64 start);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 29aff4d..713f1b9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -275,11 +275,14 @@ static int __meminit __add_section(int nid, struct zone 
*zone,
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 static int __remove_section(struct zone *zone, struct mem_section *ms)
 {
-   /*
-* XXX: Freeing memmap with vmemmap is not implement yet.
-*  This should be removed later.
-*/
-   return -EBUSY;
+   int ret = -EINVAL;
+
+   if (!valid_section(ms))
+   return ret;
+
+   ret = unregister_memory_section(ms);
+
+   return ret;
 }
 #else
 static int __remove_section(struct zone *zone, struct mem_section *ms)
@@ -346,11 +349,11 @@ EXPORT_SYMBOL_GPL(__add_pages);
  * sure that pages are marked reserved and zones are adjust properly by
  * calling offline_pages().
  */
-int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
-unsigned long nr_pages)
+int __remove_pages(unsigned long phys_start_pfn, unsigned long nr_pages)
 {
unsigned long i, ret = 0;
int sections_to_remove;
+   struct zone *zone;
 
/*
 * We can only remove entire sections
@@ -363,6 +366,7 @@ int __remove_pages(struct zone *zone, unsigned long 
phys_start_pfn,
sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
+   zone = page_zone(pfn_to_page(pfn));
ret = __remove_section(zone, __pfn_to_section(pfn));
if (ret)
break;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v8 PATCH 00/20] memory-hotplug: hot-remove physical memory

2012-08-28 Thread wency
From: Wen Congyang 

This patch series aims to support physical memory hot-remove.

The patches can free/remove the following things:

  - acpi_memory_info  : [RFC PATCH 4/19]
  - /sys/firmware/memmap/X/{end, start, type} : [RFC PATCH 8/19]
  - iomem_resource: [RFC PATCH 9/19]
  - mem_section and related sysfs files   : [RFC PATCH 10-11, 13-16/19]
  - page table of removed memory  : [RFC PATCH 12/19]
  - node and related sysfs files  : [RFC PATCH 18-19/19]

If you find lack of function for physical memory hot-remove, please let me
know.

Known problems:
1. memory can't be offlined when CONFIG_MEMCG is selected.

change log of v8:
 [RFC PATCH v8 17/20]
   * Fix problems when one node's range include the other nodes
 [RFC PATCH v8 18/20]
   * fix building error when CONFIG_MEMORY_HOTPLUG_SPARSE or CONFIG_HUGETLBFS
 is not defined.
 [RFC PATCH v8 19/20]
   * don't offline node when some memory sections are not removed
 [RFC PATCH v8 20/20]
   * create new patch: clear hwpoisoned flag when onlining pages

change log of v7:
 [RFC PATCH v7 4/19]
   * do not continue if acpi_memory_device_remove_memory() fails.
 [RFC PATCH v7 15/19]
   * handle usemap in register_page_bootmem_info_section() too.

change log of v6:
 [RFC PATCH v6 12/19]
   * fix building error on other architectures than x86

 [RFC PATCH v6 15-16/19]
   * fix building error on other architectures than x86

change log of v5:
 * merge the patchset to clear page table and the patchset to hot remove
   memory(from ishimatsu) to one big patchset.

 [RFC PATCH v5 1/19]
   * rename remove_memory() to offline_memory()/offline_pages()

 [RFC PATCH v5 2/19]
   * new patch: implement offline_memory(). This function offlines pages,
 update memory block's state, and notify the userspace that the memory
 block's state is changed.

 [RFC PATCH v5 4/19]
   * offline and remove memory in acpi_memory_disable_device() too.

 [RFC PATCH v5 17/19]
   * new patch: add a new function __remove_zone() to revert the things done
 in the function __add_zone().

 [RFC PATCH v5 18/19]
   * flush work before resetting node device.

change log of v4:
 * remove "memory-hotplug : unify argument of firmware_map_add_early/hotplug"
   from the patch series, since the patch is a bugfix. It is being discussed
   on other thread. But for testing the patch series, the patch is needed.
   So I added the patch as [PATCH 0/13].

 [RFC PATCH v4 2/13]
   * check memory is online or not at remove_memory()
   * add memory_add_physaddr_to_nid() to acpi_memory_device_remove() for
 getting node id
 
 [RFC PATCH v4 3/13]
   * create new patch : check memory is online or not at online_pages()

 [RFC PATCH v4 4/13]
   * add __ref section to remove_memory()
   * call firmware_map_remove_entry() before remove_sysfs_fw_map_entry()

 [RFC PATCH v4 11/13]
   * rewrite register_page_bootmem_memmap() for removing page used as PT/PMD

change log of v3:
 * rebase to 3.5.0-rc6

 [RFC PATCH v2 2/13]
   * remove extra kobject_put()

   * The patch was commented by Wen. Wen's comment is
 "acpi_memory_device_remove() should ignore a return value of
 remove_memory() since caller does not care the return value".
 But I did not change it since I think caller should care the
 return value. And I am trying to fix it as follow:

 https://lkml.org/lkml/2012/7/5/624

 [RFC PATCH v2 4/13]
   * remove a firmware_memmap_entry allocated by kzmalloc()

change log of v2:
 [RFC PATCH v2 2/13]
   * check whether memory block is offline or not before calling 
offline_memory()
   * check whether section is valid or not in is_memblk_offline()
   * call kobject_put() for each memory_block in is_memblk_offline()

 [RFC PATCH v2 3/13]
   * unify the end argument of firmware_map_add_early/hotplug

 [RFC PATCH v2 4/13]
   * add release_firmware_map_entry() for freeing firmware_map_entry

 [RFC PATCH v2 6/13]
  * add release_memory_block() for freeing memory_block

 [RFC PATCH v2 11/13]
  * fix wrong arguments of free_pages()

Wen Congyang (7):
  memory-hotplug: implement offline_memory()
  memory-hotplug: store the node id in acpi_memory_device
  memory-hotplug: export the function acpi_bus_remove()
  memory-hotplug: call acpi_bus_remove() to remove memory device
  memory-hotplug: introduce new function arch_remove_memory()
  memory-hotplug: remove sysfs file of node
  memory-hotplug: clear hwpoisoned flag when onlining pages

Yasuaki Ishimatsu (13):
  memory-hotplug: rename remove_memory() to
offline_memory()/offline_pages()
  memory-hotplug: offline and remove memory when removing the memory
device
  memory-hotplug: check whether memory is present or not
  memory-hotplug: remove /sys/firmware/memmap/X sysfs
  memory-hotplug: does not release memory region in PAGES_PER_SECTION
chunks
  memory-hotplug: add memory_block_release
  memory-hotplug: remove_memory calls __remove_pages
  memory-hotplug: check page

[RFC v8 PATCH 17/20] memory_hotplug: clear zone when the memory is removed

2012-08-28 Thread wency
From: Yasuaki Ishimatsu 

When a memory is added, we update zone's and pgdat's start_pfn and spanned_pages
in the function __add_zone(). So we should revert these when the memory is
removed. Add a new function __remove_zone() to do this.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 mm/memory_hotplug.c |  207 +++
 1 files changed, 207 insertions(+), 0 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3aa0766..493298f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -308,10 +308,213 @@ static int __meminit __add_section(int nid, struct zone 
*zone,
return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
 }
 
+/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
+static int find_smallest_section_pfn(int nid, struct zone *zone,
+unsigned long start_pfn,
+unsigned long end_pfn)
+{
+   struct mem_section *ms;
+
+   for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
+   ms = __pfn_to_section(start_pfn);
+
+   if (unlikely(!valid_section(ms)))
+   continue;
+
+   if (unlikely(pfn_to_nid(start_pfn)) != nid)
+   continue;
+
+   if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+   continue;
+
+   return start_pfn;
+   }
+
+   return 0;
+}
+
+/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
+static int find_biggest_section_pfn(int nid, struct zone *zone,
+   unsigned long start_pfn,
+   unsigned long end_pfn)
+{
+   struct mem_section *ms;
+   unsigned long pfn;
+
+   /* pfn is the end pfn of a memory section. */
+   pfn = end_pfn - 1;
+   for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
+   ms = __pfn_to_section(pfn);
+
+   if (unlikely(!valid_section(ms)))
+   continue;
+
+   if (unlikely(pfn_to_nid(pfn)) != nid)
+   continue;
+
+   if (zone && zone != page_zone(pfn_to_page(pfn)))
+   continue;
+
+   return pfn;
+   }
+
+   return 0;
+}
+
+static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
+unsigned long end_pfn)
+{
+   unsigned long zone_start_pfn =  zone->zone_start_pfn;
+   unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+   unsigned long pfn;
+   struct mem_section *ms;
+   int nid = zone_to_nid(zone);
+
+   zone_span_writelock(zone);
+   if (zone_start_pfn == start_pfn) {
+   /*
+* If the section is smallest section in the zone, it need
+* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
+* In this case, we find second smallest valid mem_section
+* for shrinking zone.
+*/
+   pfn = find_smallest_section_pfn(nid, zone, end_pfn,
+   zone_end_pfn);
+   if (pfn) {
+   zone->zone_start_pfn = pfn;
+   zone->spanned_pages = zone_end_pfn - pfn;
+   }
+   } else if (zone_end_pfn == end_pfn) {
+   /*
+* If the section is biggest section in the zone, it need
+* shrink zone->spanned_pages.
+* In this case, we find second biggest valid mem_section for
+* shrinking zone.
+*/
+   pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+  start_pfn);
+   if (pfn)
+   zone->spanned_pages = pfn - zone_start_pfn + 1;
+   }
+
+   /*
+* The section is not biggest or smallest mem_section in the zone, it
+* only creates a hole in the zone. So in this case, we need not
+* change the zone. But perhaps, the zone has only hole data. Thus
+* it check the zone has only hole or not.
+*/
+   pfn = zone_start_pfn;
+   for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
+   ms = __pfn_to_section(pfn);
+
+   if (unlikely(!valid_section(ms)))
+   continue;
+
+   if (page_zone(pfn_to_page(pfn)) != zone)
+   continue;
+
+/* If the section is current section, it continues the loop */
+   if (start_pfn == pfn)
+   continue;
+
+   /* If we find valid section, we have nothing to do */
+   

[RFC v8 PATCH 15/20] memory-hotplug: implement register_page_bootmem_info_section of sparse-vmemmap

2012-08-28 Thread wency
From: Yasuaki Ishimatsu 

To remove a memmap region of sparse-vmemmap that was allocated from bootmem,
the memmap region of sparse-vmemmap needs to be registered by get_page_bootmem().
So the patch searches the pages of the virtual mapping and registers them with
get_page_bootmem().

Note: register_page_bootmem_memmap() is not implemented for ia64, ppc, s390,
and sparc.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 arch/ia64/mm/discontig.c   |6 
 arch/powerpc/mm/init_64.c  |6 
 arch/s390/mm/vmem.c|6 
 arch/sparc/mm/init_64.c|6 
 arch/x86/mm/init_64.c  |   52 
 include/linux/memory_hotplug.h |2 +
 include/linux/mm.h |3 +-
 mm/memory_hotplug.c|   31 +--
 8 files changed, 108 insertions(+), 4 deletions(-)

diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index c641333..33943db 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -822,4 +822,10 @@ int __meminit vmemmap_populate(struct page *start_page,
 {
return vmemmap_populate_basepages(start_page, size, node);
 }
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   /* TODO */
+}
 #endif
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 620b7ac..3690c44 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -298,5 +298,11 @@ int __meminit vmemmap_populate(struct page *start_page,
 
return 0;
 }
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   /* TODO */
+}
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 6f896e7..eda55cd 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -227,6 +227,12 @@ out:
return ret;
 }
 
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   /* TODO */
+}
+
 /*
  * Add memory segment to the segment list if it doesn't overlap with
  * an already present segment.
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index d58edf5..add1cc7 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2077,6 +2077,12 @@ void __meminit vmemmap_populate_print_last(void)
node_start = 0;
}
 }
+
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   /* TODO */
+}
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 static void prot_init_common(unsigned long page_none,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index e0d88ba..0075592 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1138,6 +1138,58 @@ vmemmap_populate(struct page *start_page, unsigned long 
size, int node)
return 0;
 }
 
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+   unsigned long addr = (unsigned long)start_page;
+   unsigned long end = (unsigned long)(start_page + size);
+   unsigned long next;
+   pgd_t *pgd;
+   pud_t *pud;
+   pmd_t *pmd;
+
+   for (; addr < end; addr = next) {
+   pte_t *pte = NULL;
+
+   pgd = pgd_offset_k(addr);
+   if (pgd_none(*pgd)) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   continue;
+   }
+   get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
+
+   pud = pud_offset(pgd, addr);
+   if (pud_none(*pud)) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   continue;
+   }
+   get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
+
+   if (!cpu_has_pse) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   pmd = pmd_offset(pud, addr);
+   if (pmd_none(*pmd))
+   continue;
+   get_page_bootmem(section_nr, pmd_page(*pmd),
+MIX_SECTION_INFO);
+
+   pte = pte_offset_kernel(pmd, addr);
+   if (pte_none(*pte))
+   continue;
+   get_page_bootmem(section_nr, pte_page(*pte),
+SECTION_INFO);
+   } else {
+   next = pmd_addr_end(addr, end);
+
+

[RFC v8 PATCH 05/20] memory-hotplug: check whether memory is present or not

2012-08-28 Thread wency
From: Yasuaki Ishimatsu 

If the system supports memory hot-remove, online_pages() may online removed
pages. So online_pages() needs to check whether the pages being onlined are
present or not.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 include/linux/mmzone.h |   19 +++
 mm/memory_hotplug.c|   13 +
 2 files changed, 32 insertions(+), 0 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2daa54f..ac3ae30 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1180,6 +1180,25 @@ void sparse_init(void);
 #define sparse_index_init(_sec, _nid)  do {} while (0)
 #endif /* CONFIG_SPARSEMEM */
 
+#ifdef CONFIG_SPARSEMEM
+static inline int pfns_present(unsigned long pfn, unsigned long nr_pages)
+{
+   int i;
+   for (i = 0; i < nr_pages; i++) {
+   if (pfn_present(pfn + i))
+   continue;
+   else
+   return -EINVAL;
+   }
+   return 0;
+}
+#else
+static inline int pfns_present(unsigned long pfn, unsigned long nr_pages)
+{
+   return 0;
+}
+#endif /* CONFIG_SPARSEMEM*/
+
 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
 bool early_pfn_in_nid(unsigned long pfn, int nid);
 #else
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 80cded7..3f1d7c5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -467,6 +467,19 @@ int __ref online_pages(unsigned long pfn, unsigned long 
nr_pages)
struct memory_notify arg;
 
lock_memory_hotplug();
+   /*
+* If system supports memory hot-remove, the memory may have been
+* removed. So we check whether the memory has been removed or not.
+*
+* Note: When CONFIG_SPARSEMEM is defined, pfns_present() become
+*   effective. If CONFIG_SPARSEMEM is not defined, pfns_present()
+*   always returns 0.
+*/
+   ret = pfns_present(pfn, nr_pages);
+   if (ret) {
+   unlock_memory_hotplug();
+   return ret;
+   }
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
arg.status_change_nid = -1;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v8 PATCH 16/20] memory-hotplug: free memmap of sparse-vmemmap

2012-08-28 Thread wency
From: Yasuaki Ishimatsu 

Not all pages of the virtual mapping in removed memory can be freed, since some
pages used as PGD/PUD cover not only removed memory but also other memory. So
the patch checks whether a page can be freed or not.

How to check whether page can be freed or not?
 1. When removing memory, the page structs of the removed memory are filled
with 0xFD.
 2. If all page structs on a PT/PMD are filled with 0xFD, the PT/PMD can be
cleared. In this case, the page used as PT/PMD can be freed.

Applying patch, __remove_section() of CONFIG_SPARSEMEM_VMEMMAP is integrated
into one. So __remove_section() of CONFIG_SPARSEMEM_VMEMMAP is deleted.

Note:  vmemmap_kfree() and vmemmap_free_bootmem() are not implemented for ia64,
ppc, s390, and sparc.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 arch/ia64/mm/discontig.c  |8 +++
 arch/powerpc/mm/init_64.c |8 +++
 arch/s390/mm/vmem.c   |8 +++
 arch/sparc/mm/init_64.c   |8 +++
 arch/x86/mm/init_64.c |  119 +
 include/linux/mm.h|2 +
 mm/memory_hotplug.c   |   17 +--
 mm/sparse.c   |5 +-
 8 files changed, 158 insertions(+), 17 deletions(-)

diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 33943db..0d23b69 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -823,6 +823,14 @@ int __meminit vmemmap_populate(struct page *start_page,
return vmemmap_populate_basepages(start_page, size, node);
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 3690c44..835a2b3 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -299,6 +299,14 @@ int __meminit vmemmap_populate(struct page *start_page,
return 0;
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index eda55cd..4b42b0b 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -227,6 +227,14 @@ out:
return ret;
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index add1cc7..1384826 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2078,6 +2078,14 @@ void __meminit vmemmap_populate_print_last(void)
}
 }
 
+void vmemmap_kfree(struct page *memmap, unsigned long nr_pages)
+{
+}
+
+void vmemmap_free_bootmem(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long size)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0075592..4e8f8a4 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1138,6 +1138,125 @@ vmemmap_populate(struct page *start_page, unsigned long 
size, int node)
return 0;
 }
 
+#define PAGE_INUSE 0xFD
+
+unsigned long find_and_clear_pte_page(unsigned long addr, unsigned long end,
+   struct page **pp, int *page_size)
+{
+   pgd_t *pgd;
+   pud_t *pud;
+   pmd_t *pmd;
+   pte_t *pte;
+   void *page_addr;
+   unsigned long next;
+
+   *pp = NULL;
+
+   pgd = pgd_offset_k(addr);
+   if (pgd_none(*pgd))
+   return pgd_addr_end(addr, end);
+
+   pud = pud_offset(pgd, addr);
+   if (pud_none(*pud))
+   return pud_addr_end(addr, end);
+
+   if (!cpu_has_pse) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   pmd = pmd_offset(pud, addr);
+   if (pmd_none(*pmd))
+   return next;
+
+   pte = pte_offset_kernel(pmd, addr);
+   if (pte_none(*pte))
+   return next;
+
+   *page_size = PAGE_SIZE;
+   *pp = pte_page(*pte);
+   } else {
+   next = pmd_addr_end(addr, end);
+
+   pmd = pmd_offset(pud, addr);
+   if (pmd_none(*pmd))
+   return next;
+
+

[RFC v8 PATCH 02/20] memory-hotplug: implement offline_memory()

2012-08-28 Thread wency
From: Wen Congyang 

The function offline_memory() will be called when hot removing a
memory device. The memory device may contain more than one memory
block. If the memory block has been offlined, __offline_pages()
will fail. So we should try to offline one memory block at a
time.

If the memory block is offlined in offline_memory(), we also
update its state, and notify the userspace that its state is
changed.

The function offline_memory() also checks each memory block's
state. So there is no need to check the memory block's state
before calling offline_memory().

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
CC: Vasilis Liaskovitis 
Signed-off-by: Wen Congyang 
---
 drivers/base/memory.c  |   31 +++
 include/linux/memory_hotplug.h |2 ++
 mm/memory_hotplug.c|   37 -
 3 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 44e7de6..86c8821 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -275,13 +275,11 @@ memory_block_action(unsigned long phys_index, unsigned 
long action)
return ret;
 }
 
-static int memory_block_change_state(struct memory_block *mem,
+static int __memory_block_change_state(struct memory_block *mem,
unsigned long to_state, unsigned long from_state_req)
 {
int ret = 0;
 
-   mutex_lock(&mem->state_mutex);
-
if (mem->state != from_state_req) {
ret = -EINVAL;
goto out;
@@ -309,10 +307,20 @@ static int memory_block_change_state(struct memory_block 
*mem,
break;
}
 out:
-   mutex_unlock(&mem->state_mutex);
return ret;
 }
 
+static int memory_block_change_state(struct memory_block *mem,
+   unsigned long to_state, unsigned long from_state_req)
+{
+   int ret;
+
+   mutex_lock(&mem->state_mutex);
+   ret = __memory_block_change_state(mem, to_state, from_state_req);
+   mutex_unlock(&mem->state_mutex);
+
+   return ret;
+}
 static ssize_t
 store_mem_state(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
@@ -653,6 +661,21 @@ int unregister_memory_section(struct mem_section *section)
 }
 
 /*
+ * offline one memory block. If the memory block has been offlined, do nothing.
+ */
+int offline_memory_block(struct memory_block *mem)
+{
+   int ret = 0;
+
+   mutex_lock(&mem->state_mutex);
+   if (mem->state != MEM_OFFLINE)
+   ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
+   mutex_unlock(&mem->state_mutex);
+
+   return ret;
+}
+
+/*
  * Initialize the sysfs support for memory devices...
  */
 int __init memory_dev_init(void)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index c183f39..0b040bb 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -10,6 +10,7 @@ struct page;
 struct zone;
 struct pglist_data;
 struct mem_section;
+struct memory_block;
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 
@@ -234,6 +235,7 @@ extern int mem_online_node(int nid);
 extern int add_memory(int nid, u64 start, u64 size);
 extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
+extern int offline_memory_block(struct memory_block *mem);
 extern int offline_memory(u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
int nr_pages);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c182c76..3113cd4 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1001,7 +1001,42 @@ int offline_pages(unsigned long start_pfn, unsigned long 
nr_pages)
 
 int offline_memory(u64 start, u64 size)
 {
-   return -EINVAL;
+   struct memory_block *mem = NULL;
+   struct mem_section *section;
+   unsigned long start_pfn, end_pfn;
+   unsigned long pfn, section_nr;
+   int ret;
+
+   start_pfn = PFN_DOWN(start);
+   end_pfn = start_pfn + PFN_DOWN(size);
+
+   for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+   section_nr = pfn_to_section_nr(pfn);
+   if (!present_section_nr(section_nr))
+   continue;
+
+   section = __nr_to_section(section_nr);
+   /* same memblock? */
+   if (mem)
+   if ((section_nr >= mem->start_section_nr) &&
+   (section_nr <= mem->end_section_nr))
+   continue;
+
+   mem = find_memory_block_hinted(section, mem);
+   if (!mem)
+   continue;
+
+   ret = offline

[RFC v8 PATCH 14/20] memory-hotplug: move register_page_bootmem_info_node and put_page_bootmem for sparse-vmemmap

2012-08-28 Thread wency
From: Yasuaki Ishimatsu 

For implementing register_page_bootmem_info_node of sparse-vmemmap,
register_page_bootmem_info_node and put_page_bootmem are moved to
memory_hotplug.c

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 include/linux/memory_hotplug.h |9 -
 mm/memory_hotplug.c|8 ++--
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index cdbbd79..1133e63 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -162,17 +162,8 @@ static inline void arch_refresh_nodedata(int nid, 
pg_data_t *pgdat)
 #endif /* CONFIG_NUMA */
 #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
-}
-static inline void put_page_bootmem(struct page *page)
-{
-}
-#else
 extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
 extern void put_page_bootmem(struct page *page);
-#endif
 
 /*
  * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d85af6d..3ca66bc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -91,7 +91,6 @@ static void release_memory_resource(struct resource *res)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void get_page_bootmem(unsigned long info,  struct page *page,
 unsigned long type)
 {
@@ -127,6 +126,7 @@ void __ref put_page_bootmem(struct page *page)
 
 }
 
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
unsigned long *usemap, mapsize, section_nr, i;
@@ -163,6 +163,11 @@ static void register_page_bootmem_info_section(unsigned 
long start_pfn)
get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
 
 }
+#else
+static inline void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+}
+#endif
 
 void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
@@ -198,7 +203,6 @@ void register_page_bootmem_info_node(struct pglist_data 
*pgdat)
register_page_bootmem_info_section(pfn);
 
 }
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
 static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
   unsigned long end_pfn)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v8 PATCH 12/20] memory-hotplug: introduce new function arch_remove_memory()

2012-08-28 Thread wency
From: Wen Congyang 

We don't call __add_pages() directly in the function add_memory()
because some other architecture related things need to be done
before or after calling __add_pages(). So we should introduce
a new function arch_remove_memory() to revert the things
done in arch_add_memory().

Note: the function for s390 is not implemented (I don't know how to
implement it for s390).

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 arch/ia64/mm/init.c  |   16 
 arch/powerpc/mm/mem.c|   14 +++
 arch/s390/mm/init.c  |   12 +++
 arch/sh/mm/init.c|   15 +++
 arch/tile/mm/init.c  |8 ++
 arch/x86/include/asm/pgtable_types.h |1 +
 arch/x86/mm/init_32.c|   10 ++
 arch/x86/mm/init_64.c|  160 ++
 arch/x86/mm/pageattr.c   |   47 +-
 include/linux/memory_hotplug.h   |1 +
 mm/memory_hotplug.c  |1 +
 11 files changed, 263 insertions(+), 22 deletions(-)

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 0eab454..1e345ed 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -688,6 +688,22 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
return ret;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   unsigned long start_pfn = start >> PAGE_SHIFT;
+   unsigned long nr_pages = size >> PAGE_SHIFT;
+   int ret;
+
+   ret = __remove_pages(start_pfn, nr_pages);
+   if (ret)
+   pr_warn("%s: Problem encountered in __remove_pages() as"
+   " ret=%d\n", __func__,  ret);
+
+   return ret;
+}
+#endif
 #endif
 
 /*
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index fbdad0e..011170b 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -133,6 +133,20 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
return __add_pages(nid, zone, start_pfn, nr_pages);
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   unsigned long start_pfn = start >> PAGE_SHIFT;
+   unsigned long nr_pages = size >> PAGE_SHIFT;
+
+   start = (unsigned long)__va(start);
+   if (remove_section_mapping(start, start + size))
+   return -EINVAL;
+
+   return __remove_pages(start_pfn, nr_pages);
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 /*
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 6adbc08..501b20e 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -257,4 +257,16 @@ int arch_add_memory(int nid, u64 start, u64 size)
vmem_remove_mapping(start, size);
return rc;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   /*
+* There is no hardware or firmware interface which could trigger a
+* hot memory remove on s390. So there is nothing that needs to be
+* implemented.
+*/
+   return -EBUSY;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 82cc576..fc84491 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -558,4 +558,19 @@ int memory_add_physaddr_to_nid(u64 addr)
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   unsigned long start_pfn = start >> PAGE_SHIFT;
+   unsigned long nr_pages = size >> PAGE_SHIFT;
+   int ret;
+
+   ret = __remove_pages(start_pfn, nr_pages);
+   if (unlikely(ret))
+   pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
+   ret);
+
+   return ret;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index ef29d6c..2749515 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -935,6 +935,14 @@ int remove_memory(u64 start, u64 size)
 {
return -EINVAL;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   /* TODO */
+   return -EBUSY;
+}
+#endif
 #endif
 
 struct kmem_cache *pgd_cache;
diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h
index 013286a..b725af2 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -334,6 +334,7 @@ static inline void update_page_count(int level, unsigned 
long pages) { }
  * as a pte too.
  */
 extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t 
*pbase);
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 575d86f..41eefe8 100644
--- a/arch/x

[RFC v8 PATCH 09/20] memory-hotplug: does not release memory region in PAGES_PER_SECTION chunks

2012-08-28 Thread wency
From: Yasuaki Ishimatsu 

Since commit de7f0cba96786c was applied, release_mem_region() has been called
in PAGES_PER_SECTION chunks because register_memory_resource() is called in
PAGES_PER_SECTION chunks by add_memory(). But this appears to depend on the
firmware. If the CRS entries are written in PAGES_PER_SECTION chunks in the
ACPI DSDT table, register_memory_resource() is called in PAGES_PER_SECTION
chunks. But if the CRS entries are written in DIMM units in the ACPI DSDT
table, register_memory_resource() is called in DIMM units. So
release_mem_region() should not be called in PAGES_PER_SECTION chunks. The
patch fixes it.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 arch/powerpc/platforms/pseries/hotplug-memory.c |   13 +
 mm/memory_hotplug.c |4 ++--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 11d8e05..dc0a035 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -77,7 +77,8 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned int memblock_siz
 {
unsigned long start, start_pfn;
struct zone *zone;
-   int ret;
+   int i, ret;
+   int sections_to_remove;
 
start_pfn = base >> PAGE_SHIFT;
 
@@ -97,9 +98,13 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned int memblock_siz
 * to sysfs "state" file and we can't remove sysfs entries
 * while writing to it. So we have to defer it to here.
 */
-   ret = __remove_pages(zone, start_pfn, memblock_size >> PAGE_SHIFT);
-   if (ret)
-   return ret;
+   sections_to_remove = (memblock_size >> PAGE_SHIFT) / PAGES_PER_SECTION;
+   for (i = 0; i < sections_to_remove; i++) {
+   unsigned long pfn = start_pfn + i * PAGES_PER_SECTION;
+   ret = __remove_pages(zone, start_pfn,  PAGES_PER_SECTION);
+   if (ret)
+   return ret;
+   }
 
/*
 * Update memory regions for memory remove
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 45b03b3..29aff4d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -358,11 +358,11 @@ int __remove_pages(struct zone *zone, unsigned long 
phys_start_pfn,
BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
BUG_ON(nr_pages % PAGES_PER_SECTION);
 
+   release_mem_region(phys_start_pfn << PAGE_SHIFT,  nr_pages * PAGE_SIZE);
+
sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
-   release_mem_region(pfn << PAGE_SHIFT,
-  PAGES_PER_SECTION << PAGE_SHIFT);
ret = __remove_section(zone, __pfn_to_section(pfn));
if (ret)
break;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC v8 PATCH 04/20] memory-hotplug: offline and remove memory when removing the memory device

2012-08-28 Thread wency
From: Yasuaki Ishimatsu 

We should offline and remove memory when removing the memory device.
The memory device can be removed by 2 ways:
1. send eject request by SCI
2. echo 1 >/sys/bus/pci/devices/PNP0C80:XX/eject

In the 1st case, acpi_memory_disable_device() will be called. In the 2nd
case, acpi_memory_device_remove() will be called. acpi_memory_device_remove()
will also be called when we unbind the memory device from the driver
acpi_memhotplug. If the type is ACPI_BUS_REMOVAL_EJECT, it means
that the user wants to eject the memory device, and we should offline
and remove memory in acpi_memory_device_remove().

The function remove_memory() is not implemented yet. It only checks whether
all memory has been offlined.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
Signed-off-by: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/acpi_memhotplug.c |   45 +--
 drivers/base/memory.c  |   39 ++
 include/linux/memory.h |5 
 include/linux/memory_hotplug.h |5 
 mm/memory_hotplug.c|   22 +++
 5 files changed, 109 insertions(+), 7 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 7873832..9d47458 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -310,25 +311,44 @@ static int acpi_memory_powerdown_device(struct 
acpi_memory_device *mem_device)
return 0;
 }
 
-static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
+static int
+acpi_memory_device_remove_memory(struct acpi_memory_device *mem_device)
 {
int result;
struct acpi_memory_info *info, *n;
+   int node = mem_device->nid;
 
-
-   /*
-* Ask the VM to offline this memory range.
-* Note: Assume that this function returns zero on success
-*/
list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
if (info->enabled) {
result = offline_memory(info->start_addr, info->length);
if (result)
return result;
+
+   result = remove_memory(node, info->start_addr,
+  info->length);
+   if (result)
+   return result;
}
+
+   list_del(&info->list);
kfree(info);
}
 
+   return 0;
+}
+
+static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
+{
+   int result;
+
+   /*
+* Ask the VM to offline this memory range.
+* Note: Assume that this function returns zero on success
+*/
+   result = acpi_memory_device_remove_memory(mem_device);
+   if (result)
+   return result;
+
/* Power-off and eject the device */
result = acpi_memory_powerdown_device(mem_device);
if (result) {
@@ -477,12 +497,23 @@ static int acpi_memory_device_add(struct acpi_device 
*device)
 static int acpi_memory_device_remove(struct acpi_device *device, int type)
 {
struct acpi_memory_device *mem_device = NULL;
-
+   int result;
 
if (!device || !acpi_driver_data(device))
return -EINVAL;
 
mem_device = acpi_driver_data(device);
+
+   if (type == ACPI_BUS_REMOVAL_EJECT) {
+   /*
+* offline and remove memory only when the memory device is
+* ejected.
+*/
+   result = acpi_memory_device_remove_memory(mem_device);
+   if (result)
+   return result;
+   }
+
kfree(mem_device);
 
return 0;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 86c8821..038be73 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -70,6 +70,45 @@ void unregister_memory_isolate_notifier(struct 
notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_memory_isolate_notifier);
 
+bool is_memblk_offline(unsigned long start, unsigned long size)
+{
+   struct memory_block *mem = NULL;
+   struct mem_section *section;
+   unsigned long start_pfn, end_pfn;
+   unsigned long pfn, section_nr;
+
+   start_pfn = PFN_DOWN(start);
+   end_pfn = PFN_UP(start + size);
+
+   for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+   section_nr = pfn_to_section_nr(pfn);
+   if (!present_section_nr(section_nr))
+   continue;
+
+   section = __nr_to_section(section_nr);
+   /* same memblock? */
+   if (mem)
+   if ((section_nr >= mem->start_section_n

[RFC v8 PATCH 08/20] memory-hotplug: remove /sys/firmware/memmap/X sysfs

2012-08-28 Thread wency
From: Yasuaki Ishimatsu 

When (hot)adding memory into system, /sys/firmware/memmap/X/{end, start, type}
sysfs files are created. But there is no code to remove these files. The patch
implements the function to remove them.

Note : The code does not free firmware_map_entry since there is no way to free
   memory which is allocated by bootmem.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 drivers/firmware/memmap.c|   78 +-
 include/linux/firmware-map.h |6 +++
 mm/memory_hotplug.c  |9 -
 3 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index c1cdc92..b2e7e5e 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Data types 
--
@@ -79,7 +80,22 @@ static const struct sysfs_ops memmap_attr_ops = {
.show = memmap_attr_show,
 };
 
+#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj)
+
+static void release_firmware_map_entry(struct kobject *kobj)
+{
+   struct firmware_map_entry *entry = to_memmap_entry(kobj);
+   struct page *page;
+
+   page = virt_to_page(entry);
+   if (PageSlab(page) || PageCompound(page))
+   kfree(entry);
+
+   /* There is no way to free memory allocated from bootmem*/
+}
+
 static struct kobj_type memmap_ktype = {
+   .release= release_firmware_map_entry,
.sysfs_ops  = &memmap_attr_ops,
.default_attrs  = def_attrs,
 };
@@ -123,6 +139,16 @@ static int firmware_map_add_entry(u64 start, u64 end,
return 0;
 }
 
+/**
+ * firmware_map_remove_entry() - Does the real work to remove a firmware
+ * memmap entry.
+ * @entry: removed entry.
+ **/
+static inline void firmware_map_remove_entry(struct firmware_map_entry *entry)
+{
+   list_del(&entry->list);
+}
+
 /*
  * Add memmap entry on sysfs
  */
@@ -144,6 +170,31 @@ static int add_sysfs_fw_map_entry(struct 
firmware_map_entry *entry)
return 0;
 }
 
+/*
+ * Remove memmap entry on sysfs
+ */
+static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry)
+{
+   kobject_put(&entry->kobj);
+}
+
+/*
+ * Search memmap entry
+ */
+
+struct firmware_map_entry * __meminit
+find_firmware_map_entry(u64 start, u64 end, const char *type)
+{
+   struct firmware_map_entry *entry;
+
+   list_for_each_entry(entry, &map_entries, list)
+   if ((entry->start == start) && (entry->end == end) &&
+   (!strcmp(entry->type, type)))
+   return entry;
+
+   return NULL;
+}
+
 /**
  * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do
  * memory hotplug.
@@ -196,6 +247,32 @@ int __init firmware_map_add_early(u64 start, u64 end, 
const char *type)
return firmware_map_add_entry(start, end, type, entry);
 }
 
+/**
+ * firmware_map_remove() - remove a firmware mapping entry
+ * @start: Start of the memory range.
+ * @end:   End of the memory range.
+ * @type:  Type of the memory range.
+ *
+ * removes a firmware mapping entry.
+ *
+ * Returns 0 on success, or -EINVAL if no entry.
+ **/
+int __meminit firmware_map_remove(u64 start, u64 end, const char *type)
+{
+   struct firmware_map_entry *entry;
+
+   entry = find_firmware_map_entry(start, end - 1, type);
+   if (!entry)
+   return -EINVAL;
+
+   firmware_map_remove_entry(entry);
+
+   /* remove the memmap entry */
+   remove_sysfs_fw_map_entry(entry);
+
+   return 0;
+}
+
 /*
  * Sysfs functions 
-
  */
@@ -218,7 +295,6 @@ static ssize_t type_show(struct firmware_map_entry *entry, 
char *buf)
 }
 
 #define to_memmap_attr(_attr) container_of(_attr, struct memmap_attribute, 
attr)
-#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj)
 
 static ssize_t memmap_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h
index 43fe52f..71d4fa7 100644
--- a/include/linux/firmware-map.h
+++ b/include/linux/firmware-map.h
@@ -25,6 +25,7 @@
 
 int firmware_map_add_early(u64 start, u64 end, const char *type);
 int firmware_map_add_hotplug(u64 start, u64 end, const char *type);
+int firmware_map_remove(u64 start, u64 end, const char *type);
 
 #else /* CONFIG_FIRMWARE_MEMMAP */
 
@@ -38,6 +39,11 @@ static inline int firmware_map_add_hotplug(u64 start, u64 
end, const char *type)
return 0;
 }
 
+static inline int firmware_map_remove(u64 start, u64 end, const char *type)
+{
+   return 0;
+}
+
 #endif /

[RFC V7 PATCH 06/19] memory-hotplug: export the function acpi_bus_remove()

2012-08-20 Thread wency
From: Wen Congyang 

The function acpi_bus_remove() can remove an acpi device from the acpi bus.
When an acpi device is removed, we need to call this function to remove
the acpi device from the acpi bus. So export this function.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/scan.c |3 ++-
 include/acpi/acpi_bus.h |1 +
 2 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index d1ecca2..1cefc34 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1224,7 +1224,7 @@ static int acpi_device_set_context(struct acpi_device 
*device)
return -ENODEV;
 }
 
-static int acpi_bus_remove(struct acpi_device *dev, int rmdevice)
+int acpi_bus_remove(struct acpi_device *dev, int rmdevice)
 {
if (!dev)
return -EINVAL;
@@ -1246,6 +1246,7 @@ static int acpi_bus_remove(struct acpi_device *dev, int 
rmdevice)
 
return 0;
 }
+EXPORT_SYMBOL(acpi_bus_remove);
 
 static int acpi_add_single_object(struct acpi_device **child,
  acpi_handle handle, int type,
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index bde976e..2ccf109 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -360,6 +360,7 @@ bool acpi_bus_power_manageable(acpi_handle handle);
 bool acpi_bus_can_wakeup(acpi_handle handle);
 int acpi_power_resource_register_device(struct device *dev, acpi_handle 
handle);
 void acpi_power_resource_unregister_device(struct device *dev, acpi_handle 
handle);
+int acpi_bus_remove(struct acpi_device *dev, int rmdevice);
 #ifdef CONFIG_ACPI_PROC_EVENT
 int acpi_bus_generate_proc_event(struct acpi_device *device, u8 type, int 
data);
 int acpi_bus_generate_proc_event4(const char *class, const char *bid, u8 type, 
int data);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC V7 PATCH 00/19] memory-hotplug: hot-remove physical memory

2012-08-20 Thread wency
From: Wen Congyang 

This patch series aims to support physical memory hot-remove.

The patches can free/remove the following things:

  - acpi_memory_info  : [RFC PATCH 4/19]
  - /sys/firmware/memmap/X/{end, start, type} : [RFC PATCH 8/19]
  - iomem_resource: [RFC PATCH 9/19]
  - mem_section and related sysfs files   : [RFC PATCH 10-11, 13-16/19]
  - page table of removed memory  : [RFC PATCH 12/19]
  - node and related sysfs files  : [RFC PATCH 18-19/19]

If you find any functionality missing for physical memory hot-remove, please
let me know.

Known problems:
1. memory can't be offlined when CONFIG_MEMCG is selected.

change log of v7:
 [RFC PATCH v7 4/19]
   * do not continue if acpi_memory_device_remove_memory() fails.
 [RFC PATCH v7 15/19]
   * handle usemap in register_page_bootmem_info_section() too.

change log of v6:
 [RFC PATCH v6 12/19]
   * fix building error on architectures other than x86

 [RFC PATCH v6 15-16/19]
   * fix building error on architectures other than x86

change log of v5:
 * merge the patchset to clear page table and the patchset to hot remove
   memory(from ishimatsu) to one big patchset.

 [RFC PATCH v5 1/19]
   * rename remove_memory() to offline_memory()/offline_pages()

 [RFC PATCH v5 2/19]
   * new patch: implement offline_memory(). This function offlines pages,
 update memory block's state, and notify the userspace that the memory
 block's state is changed.

 [RFC PATCH v5 4/19]
   * offline and remove memory in acpi_memory_disable_device() too.

 [RFC PATCH v5 17/19]
   * new patch: add a new function __remove_zone() to revert the things done
 in the function __add_zone().

 [RFC PATCH v5 18/19]
   * flush work before resetting node device.

change log of v4:
 * remove "memory-hotplug : unify argument of firmware_map_add_early/hotplug"
   from the patch series, since the patch is a bugfix. It is being discussed
   on other thread. But for testing the patch series, the patch is needed.
   So I added the patch as [PATCH 0/13].

 [RFC PATCH v4 2/13]
   * check memory is online or not at remove_memory()
   * add memory_add_physaddr_to_nid() to acpi_memory_device_remove() for
 getting node id
 
 [RFC PATCH v4 3/13]
   * create new patch : check memory is online or not at online_pages()

 [RFC PATCH v4 4/13]
   * add __ref section to remove_memory()
   * call firmware_map_remove_entry() before remove_sysfs_fw_map_entry()

 [RFC PATCH v4 11/13]
   * rewrite register_page_bootmem_memmap() for removing page used as PT/PMD

change log of v3:
 * rebase to 3.5.0-rc6

 [RFC PATCH v2 2/13]
   * remove extra kobject_put()

   * The patch was commented by Wen. Wen's comment is
 "acpi_memory_device_remove() should ignore a return value of
 remove_memory() since caller does not care the return value".
 But I did not change it since I think caller should care the
 return value. And I am trying to fix it as follow:

 https://lkml.org/lkml/2012/7/5/624

 [RFC PATCH v2 4/13]
   * remove a firmware_memmap_entry allocated by kzalloc()

change log of v2:
 [RFC PATCH v2 2/13]
   * check whether memory block is offline or not before calling 
offline_memory()
   * check whether section is valid or not in is_memblk_offline()
   * call kobject_put() for each memory_block in is_memblk_offline()

 [RFC PATCH v2 3/13]
   * unify the end argument of firmware_map_add_early/hotplug

 [RFC PATCH v2 4/13]
   * add release_firmware_map_entry() for freeing firmware_map_entry

 [RFC PATCH v2 6/13]
  * add release_memory_block() for freeing memory_block

 [RFC PATCH v2 11/13]
  * fix wrong arguments of free_pages()

Wen Congyang (6):
  memory-hotplug: implement offline_memory()
  memory-hotplug: store the node id in acpi_memory_device
  memory-hotplug: offline and remove memory when removing the memory
device
  memory-hotplug: export the function acpi_bus_remove()
  memory-hotplug: call acpi_bus_remove() to remove memory device
  memory-hotplug: introduce new function arch_remove_memory()

Yasuaki Ishimatsu (13):
  memory-hotplug: rename remove_memory() to
offline_memory()/offline_pages()
  memory-hotplug: check whether memory is present or not
  memory-hotplug: remove /sys/firmware/memmap/X sysfs
  memory-hotplug: does not release memory region in PAGES_PER_SECTION
chunks
  memory-hotplug: add memory_block_release
  memory-hotplug: remove_memory calls __remove_pages
  memory-hotplug: check page type in get_page_bootmem
  memory-hotplug: move register_page_bootmem_info_node and
put_page_bootmem for sparse-vmemmap
  memory-hotplug: implement register_page_bootmem_info_section of
sparse-vmemmap
  memory-hotplug: free memmap of sparse-vmemmap
  memory_hotplug: clear zone when the memory is removed
  memory-hotplug: add node_device_release
  memory-hotplug: remove sysfs file of node

 arch/ia64/mm/discontig.c|   14 +
 arch/ia64/mm/init.c  

[RFC V7 PATCH 03/19] memory-hotplug: store the node id in acpi_memory_device

2012-08-20 Thread wency
From: Wen Congyang 

The memory device has only one node id. Store the node id when
enabling the memory device, so that we can reuse it when removing the
memory device.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
Reviewed-by: Yasuaki Ishimatsu 
---
 drivers/acpi/acpi_memhotplug.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 2a7beac..7873832 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -83,6 +83,7 @@ struct acpi_memory_info {
 struct acpi_memory_device {
struct acpi_device * device;
unsigned int state; /* State of the memory device */
+   int nid;
struct list_head res_list;
 };
 
@@ -256,6 +257,9 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
info->enabled = 1;
num_enabled++;
}
+
+   mem_device->nid = node;
+
if (!num_enabled) {
printk(KERN_ERR PREFIX "add_memory failed\n");
mem_device->state = MEMORY_INVALID_STATE;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC V7 PATCH 07/19] memory-hotplug: call acpi_bus_remove() to remove memory device

2012-08-20 Thread wency
From: Wen Congyang 

The memory device has been ejected and powered off, so we can call
acpi_bus_remove() to remove the memory device from acpi bus.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 drivers/acpi/acpi_memhotplug.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 9d47458..b152767 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -425,8 +425,9 @@ static void acpi_memory_device_notify(acpi_handle handle, 
u32 event, void *data)
}
 
/*
-* TBD: Invoke acpi_bus_remove to cleanup data structures
+* Invoke acpi_bus_remove() to remove memory device
 */
+   acpi_bus_remove(device, 1);
 
/* _EJ0 succeeded; _OST is not necessary */
return;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC V7 PATCH 09/19] memory-hotplug: does not release memory region in PAGES_PER_SECTION chunks

2012-08-20 Thread wency
From: Yasuaki Ishimatsu 

Since commit de7f0cba96786c, release_mem_region() has been called in
PAGES_PER_SECTION chunks, because register_memory_resource() is called in
PAGES_PER_SECTION chunks by add_memory(). But this depends on the firmware.
If CRS are written in PAGES_PER_SECTION chunks in the ACPI DSDT table,
register_memory_resource() is called in PAGES_PER_SECTION chunks.
But if CRS are written per DIMM in the ACPI DSDT table,
register_memory_resource() is called per DIMM. So release_mem_region()
should not be called in PAGES_PER_SECTION chunks. The patch fixes it.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 arch/powerpc/platforms/pseries/hotplug-memory.c |   13 +
 mm/memory_hotplug.c |4 ++--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 11d8e05..dc0a035 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -77,7 +77,8 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned int memblock_siz
 {
unsigned long start, start_pfn;
struct zone *zone;
-   int ret;
+   int i, ret;
+   int sections_to_remove;
 
start_pfn = base >> PAGE_SHIFT;
 
@@ -97,9 +98,13 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned int memblock_siz
 * to sysfs "state" file and we can't remove sysfs entries
 * while writing to it. So we have to defer it to here.
 */
-   ret = __remove_pages(zone, start_pfn, memblock_size >> PAGE_SHIFT);
-   if (ret)
-   return ret;
+   sections_to_remove = (memblock_size >> PAGE_SHIFT) / PAGES_PER_SECTION;
+   for (i = 0; i < sections_to_remove; i++) {
+   unsigned long pfn = start_pfn + i * PAGES_PER_SECTION;
+   ret = __remove_pages(zone, start_pfn,  PAGES_PER_SECTION);
+   if (ret)
+   return ret;
+   }
 
/*
 * Update memory regions for memory remove
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 45b03b3..29aff4d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -358,11 +358,11 @@ int __remove_pages(struct zone *zone, unsigned long 
phys_start_pfn,
BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
BUG_ON(nr_pages % PAGES_PER_SECTION);
 
+   release_mem_region(phys_start_pfn << PAGE_SHIFT,  nr_pages * PAGE_SIZE);
+
sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
-   release_mem_region(pfn << PAGE_SHIFT,
-  PAGES_PER_SECTION << PAGE_SHIFT);
ret = __remove_section(zone, __pfn_to_section(pfn));
if (ret)
break;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC V7 PATCH 10/19] memory-hotplug: add memory_block_release

2012-08-20 Thread wency
From: Yasuaki Ishimatsu 

When remove_memory_block() is called, the following message is shown at
device_release():

Device 'memory528' does not have a release() function, it is broken and must
be fixed.

remove_memory_block() calls kfree(mem). I think it should be called from
device_release(). So the patch implements memory_block_release().

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 drivers/base/memory.c |   11 ++-
 1 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 038be73..1cd3ef3 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -109,6 +109,15 @@ bool is_memblk_offline(unsigned long start, unsigned long 
size)
 }
 EXPORT_SYMBOL(is_memblk_offline);
 
+#define to_memory_block(device) container_of(device, struct memory_block, dev)
+
+static void release_memory_block(struct device *dev)
+{
+   struct memory_block *mem = to_memory_block(dev);
+
+   kfree(mem);
+}
+
 /*
  * register_memory - Setup a sysfs device for a memory block
  */
@@ -119,6 +128,7 @@ int register_memory(struct memory_block *memory)
 
memory->dev.bus = &memory_subsys;
memory->dev.id = memory->start_section_nr / sections_per_block;
+   memory->dev.release = release_memory_block;
 
error = device_register(&memory->dev);
return error;
@@ -674,7 +684,6 @@ int remove_memory_block(unsigned long node_id, struct 
mem_section *section,
mem_remove_simple_file(mem, phys_device);
mem_remove_simple_file(mem, removable);
unregister_memory(mem);
-   kfree(mem);
} else
kobject_put(&mem->dev.kobj);
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC V7 PATCH 11/19] memory-hotplug: remove_memory calls __remove_pages

2012-08-20 Thread wency
From: Yasuaki Ishimatsu 

The patch adds __remove_pages() to remove_memory(). Then the range given by
the phys_start_pfn and nr_pages arguments of __remove_pages() may
span different zones. So the zone argument is removed from __remove_pages(),
and __remove_pages() calculates the zone for each section.

When CONFIG_SPARSEMEM_VMEMMAP is defined, there is no way to remove a memmap.
So __remove_section only calls unregister_memory_section().

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 arch/powerpc/platforms/pseries/hotplug-memory.c |5 +
 include/linux/memory_hotplug.h  |3 +--
 mm/memory_hotplug.c |   18 +++---
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index dc0a035..cc14da4 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -76,7 +76,6 @@ unsigned long memory_block_size_bytes(void)
 static int pseries_remove_memblock(unsigned long base, unsigned int 
memblock_size)
 {
unsigned long start, start_pfn;
-   struct zone *zone;
int i, ret;
int sections_to_remove;
 
@@ -87,8 +86,6 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned int memblock_siz
return 0;
}
 
-   zone = page_zone(pfn_to_page(start_pfn));
-
/*
 * Remove section mappings and sysfs entries for the
 * section of the memory we are removing.
@@ -101,7 +98,7 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned int memblock_siz
sections_to_remove = (memblock_size >> PAGE_SHIFT) / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = start_pfn + i * PAGES_PER_SECTION;
-   ret = __remove_pages(zone, start_pfn,  PAGES_PER_SECTION);
+   ret = __remove_pages(start_pfn,  PAGES_PER_SECTION);
if (ret)
return ret;
}
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index fd84ea9..8bf820d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -90,8 +90,7 @@ extern bool is_pageblock_removable_nolock(struct page *page);
 /* reasonably generic interface to expand the physical pages in a zone  */
 extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages);
-extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
-   unsigned long nr_pages);
+extern int __remove_pages(unsigned long start_pfn, unsigned long nr_pages);
 
 #ifdef CONFIG_NUMA
 extern int memory_add_physaddr_to_nid(u64 start);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 29aff4d..713f1b9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -275,11 +275,14 @@ static int __meminit __add_section(int nid, struct zone 
*zone,
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 static int __remove_section(struct zone *zone, struct mem_section *ms)
 {
-   /*
-* XXX: Freeing memmap with vmemmap is not implement yet.
-*  This should be removed later.
-*/
-   return -EBUSY;
+   int ret = -EINVAL;
+
+   if (!valid_section(ms))
+   return ret;
+
+   ret = unregister_memory_section(ms);
+
+   return ret;
 }
 #else
 static int __remove_section(struct zone *zone, struct mem_section *ms)
@@ -346,11 +349,11 @@ EXPORT_SYMBOL_GPL(__add_pages);
  * sure that pages are marked reserved and zones are adjust properly by
  * calling offline_pages().
  */
-int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
-unsigned long nr_pages)
+int __remove_pages(unsigned long phys_start_pfn, unsigned long nr_pages)
 {
unsigned long i, ret = 0;
int sections_to_remove;
+   struct zone *zone;
 
/*
 * We can only remove entire sections
@@ -363,6 +366,7 @@ int __remove_pages(struct zone *zone, unsigned long 
phys_start_pfn,
sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
+   zone = page_zone(pfn_to_page(pfn));
ret = __remove_section(zone, __pfn_to_section(pfn));
if (ret)
break;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC V7 PATCH 08/19] memory-hotplug: remove /sys/firmware/memmap/X sysfs

2012-08-20 Thread wency
From: Yasuaki Ishimatsu 

When (hot)adding memory into system, /sys/firmware/memmap/X/{end, start, type}
sysfs files are created. But there is no code to remove these files. The patch
implements the function to remove them.

Note : The code does not free firmware_map_entry since there is no way to free
   memory which is allocated by bootmem.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 drivers/firmware/memmap.c|   78 +-
 include/linux/firmware-map.h |6 +++
 mm/memory_hotplug.c  |9 -
 3 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index c1cdc92..b2e7e5e 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Data types 
--
@@ -79,7 +80,22 @@ static const struct sysfs_ops memmap_attr_ops = {
.show = memmap_attr_show,
 };
 
+#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj)
+
+static void release_firmware_map_entry(struct kobject *kobj)
+{
+   struct firmware_map_entry *entry = to_memmap_entry(kobj);
+   struct page *page;
+
+   page = virt_to_page(entry);
+   if (PageSlab(page) || PageCompound(page))
+   kfree(entry);
+
+   /* There is no way to free memory allocated from bootmem*/
+}
+
 static struct kobj_type memmap_ktype = {
+   .release= release_firmware_map_entry,
.sysfs_ops  = &memmap_attr_ops,
.default_attrs  = def_attrs,
 };
@@ -123,6 +139,16 @@ static int firmware_map_add_entry(u64 start, u64 end,
return 0;
 }
 
+/**
+ * firmware_map_remove_entry() - Does the real work to remove a firmware
+ * memmap entry.
+ * @entry: removed entry.
+ **/
+static inline void firmware_map_remove_entry(struct firmware_map_entry *entry)
+{
+   list_del(&entry->list);
+}
+
 /*
  * Add memmap entry on sysfs
  */
@@ -144,6 +170,31 @@ static int add_sysfs_fw_map_entry(struct 
firmware_map_entry *entry)
return 0;
 }
 
+/*
+ * Remove memmap entry on sysfs
+ */
+static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry)
+{
+   kobject_put(&entry->kobj);
+}
+
+/*
+ * Search memmap entry
+ */
+
+struct firmware_map_entry * __meminit
+find_firmware_map_entry(u64 start, u64 end, const char *type)
+{
+   struct firmware_map_entry *entry;
+
+   list_for_each_entry(entry, &map_entries, list)
+   if ((entry->start == start) && (entry->end == end) &&
+   (!strcmp(entry->type, type)))
+   return entry;
+
+   return NULL;
+}
+
 /**
  * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do
  * memory hotplug.
@@ -196,6 +247,32 @@ int __init firmware_map_add_early(u64 start, u64 end, 
const char *type)
return firmware_map_add_entry(start, end, type, entry);
 }
 
+/**
+ * firmware_map_remove() - remove a firmware mapping entry
+ * @start: Start of the memory range.
+ * @end:   End of the memory range.
+ * @type:  Type of the memory range.
+ *
+ * removes a firmware mapping entry.
+ *
+ * Returns 0 on success, or -EINVAL if no entry.
+ **/
+int __meminit firmware_map_remove(u64 start, u64 end, const char *type)
+{
+   struct firmware_map_entry *entry;
+
+   entry = find_firmware_map_entry(start, end - 1, type);
+   if (!entry)
+   return -EINVAL;
+
+   firmware_map_remove_entry(entry);
+
+   /* remove the memmap entry */
+   remove_sysfs_fw_map_entry(entry);
+
+   return 0;
+}
+
 /*
  * Sysfs functions 
-
  */
@@ -218,7 +295,6 @@ static ssize_t type_show(struct firmware_map_entry *entry, 
char *buf)
 }
 
 #define to_memmap_attr(_attr) container_of(_attr, struct memmap_attribute, 
attr)
-#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj)
 
 static ssize_t memmap_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h
index 43fe52f..71d4fa7 100644
--- a/include/linux/firmware-map.h
+++ b/include/linux/firmware-map.h
@@ -25,6 +25,7 @@
 
 int firmware_map_add_early(u64 start, u64 end, const char *type);
 int firmware_map_add_hotplug(u64 start, u64 end, const char *type);
+int firmware_map_remove(u64 start, u64 end, const char *type);
 
 #else /* CONFIG_FIRMWARE_MEMMAP */
 
@@ -38,6 +39,11 @@ static inline int firmware_map_add_hotplug(u64 start, u64 
end, const char *type)
return 0;
 }
 
+static inline int firmware_map_remove(u64 start, u64 end, const char *type)
+{
+   return 0;
+}
+
 #endif /

[RFC V7 PATCH 12/19] memory-hotplug: introduce new function arch_remove_memory()

2012-08-20 Thread wency
From: Wen Congyang 

We don't call __add_pages() directly in the function add_memory()
because some other architecture related things need to be done
before or after calling __add_pages(). So we should introduce
a new function arch_remove_memory() to revert the things
done in arch_add_memory().

Note: the function for s390 is not implemented(I don't know how to
implement it for s390).

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Yasuaki Ishimatsu 
Signed-off-by: Wen Congyang 
---
 arch/ia64/mm/init.c  |   16 
 arch/powerpc/mm/mem.c|   14 +++
 arch/s390/mm/init.c  |   12 +++
 arch/sh/mm/init.c|   15 +++
 arch/tile/mm/init.c  |8 ++
 arch/x86/include/asm/pgtable_types.h |1 +
 arch/x86/mm/init_32.c|   10 ++
 arch/x86/mm/init_64.c|  160 ++
 arch/x86/mm/pageattr.c   |   47 +-
 include/linux/memory_hotplug.h   |1 +
 mm/memory_hotplug.c  |1 +
 11 files changed, 263 insertions(+), 22 deletions(-)

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 0eab454..1e345ed 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -688,6 +688,22 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
return ret;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   unsigned long start_pfn = start >> PAGE_SHIFT;
+   unsigned long nr_pages = size >> PAGE_SHIFT;
+   int ret;
+
+   ret = __remove_pages(start_pfn, nr_pages);
+   if (ret)
+   pr_warn("%s: Problem encountered in __remove_pages() as"
+   " ret=%d\n", __func__,  ret);
+
+   return ret;
+}
+#endif
 #endif
 
 /*
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index baaafde..249cef4 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -133,6 +133,20 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
return __add_pages(nid, zone, start_pfn, nr_pages);
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   unsigned long start_pfn = start >> PAGE_SHIFT;
+   unsigned long nr_pages = size >> PAGE_SHIFT;
+
+   start = (unsigned long)__va(start);
+   if (remove_section_mapping(start, start + size))
+   return -EINVAL;
+
+   return __remove_pages(start_pfn, nr_pages);
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 /*
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 6adbc08..501b20e 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -257,4 +257,16 @@ int arch_add_memory(int nid, u64 start, u64 size)
vmem_remove_mapping(start, size);
return rc;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   /*
+* There is no hardware or firmware interface which could trigger a
+* hot memory remove on s390. So there is nothing that needs to be
+* implemented.
+*/
+   return -EBUSY;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 82cc576..fc84491 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -558,4 +558,19 @@ int memory_add_physaddr_to_nid(u64 addr)
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   unsigned long start_pfn = start >> PAGE_SHIFT;
+   unsigned long nr_pages = size >> PAGE_SHIFT;
+   int ret;
+
+   ret = __remove_pages(start_pfn, nr_pages);
+   if (unlikely(ret))
+   pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
+   ret);
+
+   return ret;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index ef29d6c..2749515 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -935,6 +935,14 @@ int remove_memory(u64 start, u64 size)
 {
return -EINVAL;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+   /* TODO */
+   return -EBUSY;
+}
+#endif
 #endif
 
 struct kmem_cache *pgd_cache;
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 013286a..b725af2 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -334,6 +334,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
  * as a pte too.
  */
 extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 575d86f..41eefe8 100644
--- a/arch/x

[RFC V7 PATCH 14/19] memory-hotplug: move register_page_bootmem_info_node and put_page_bootmem for sparse-vmemmap

2012-08-20 Thread wency
From: Yasuaki Ishimatsu 

To implement register_page_bootmem_info_node() for sparse-vmemmap,
move register_page_bootmem_info_node() and put_page_bootmem() to
memory_hotplug.c.

CC: David Rientjes 
CC: Jiang Liu 
CC: Len Brown 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Christoph Lameter 
Cc: Minchan Kim 
CC: Andrew Morton 
CC: KOSAKI Motohiro 
CC: Wen Congyang 
Signed-off-by: Yasuaki Ishimatsu 
---
 include/linux/memory_hotplug.h |9 -
 mm/memory_hotplug.c|8 ++--
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index cdbbd79..1133e63 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -162,17 +162,8 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
 #endif /* CONFIG_NUMA */
 #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
-}
-static inline void put_page_bootmem(struct page *page)
-{
-}
-#else
 extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
 extern void put_page_bootmem(struct page *page);
-#endif
 
 /*
  * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d85af6d..3ca66bc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -91,7 +91,6 @@ static void release_memory_resource(struct resource *res)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void get_page_bootmem(unsigned long info,  struct page *page,
 unsigned long type)
 {
@@ -127,6 +126,7 @@ void __ref put_page_bootmem(struct page *page)
 
 }
 
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
unsigned long *usemap, mapsize, section_nr, i;
@@ -163,6 +163,11 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
 
 }
+#else
+static inline void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+}
+#endif
 
 void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
@@ -198,7 +203,6 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
register_page_bootmem_info_section(pfn);
 
 }
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
 static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
   unsigned long end_pfn)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


  1   2   >