[RESEND] x86: numa: setup_node_data(): drop dead code and rename function

2014-09-15 Thread Luiz Capitulino
 for memmap
[0.00]   DMA32 zone: 491008 pages, LIFO batch:31
[0.00]   Normal zone: 520 pages used for memmap
[0.00]   Normal zone: 33280 pages, LIFO batch:7

This commit was tested on a two node bare-metal NUMA machine and Linux as
a numa guest on hyperv and qemu/kvm.

PS: The wrong memory range reported by setup_node_data() seems to be
harmless in the current kernel because it's just not used.  However,
that bad range is used in kernel 2.6.32 to initialize the old boot
memory allocator, which causes a crash during boot.

Signed-off-by: Luiz Capitulino lcapitul...@redhat.com
Cc: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
Cc: Yinghai Lu ying...@kernel.org
Acked-by: Rik van Riel r...@redhat.com
Cc: Andi Kleen a...@firstfloor.org
Cc: David Rientjes rient...@google.com
Cc: Ingo Molnar mi...@elte.hu
Cc: H. Peter Anvin h...@zytor.com
Cc: Thomas Gleixner t...@linutronix.de
Signed-off-by: Andrew Morton a...@linux-foundation.org
---

I posted this patch more than two months ago. Andrew picked it up and it
rested in the -mm tree for a couple of weeks. Andrew dropped it from -mm
to move it forward, but looks like it hasn't been picked by anyone else
since then. Resending...

 arch/x86/include/asm/numa.h |  1 -
 arch/x86/mm/numa.c  | 34 ++
 mm/page_alloc.c |  2 ++
 3 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 4064aca..01b493e 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -9,7 +9,6 @@
 #ifdef CONFIG_NUMA
 
 #define NR_NODE_MEMBLKS(MAX_NUMNODES*2)
-#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 
 /*
  * Too small node sizes may confuse the VM badly. Usually they
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index a32b706..d221374 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -185,8 +185,8 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
return numa_add_memblk_to(nid, start, end, &numa_meminfo);
 }
 
-/* Initialize NODE_DATA for a node on the local memory */
-static void __init setup_node_data(int nid, u64 start, u64 end)
+/* Allocate NODE_DATA for a node on the local memory */
+static void __init alloc_node_data(int nid)
 {
const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
u64 nd_pa;
@@ -194,18 +194,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
int tnid;
 
/*
-* Don't confuse VM with a node that doesn't have the
-* minimum amount of memory:
-*/
-   if (end && (end - start) < NODE_MIN_SIZE)
-   return;
-
-   start = roundup(start, ZONE_ALIGN);
-
-   printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
-  nid, start, end - 1);
-
-   /*
 * Allocate node data.  Try node-local memory and then any node.
 * Never allocate in DMA zone.
 */
@@ -222,7 +210,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
nd = __va(nd_pa);

/* report and initialize */
-   printk(KERN_INFO "  NODE_DATA [mem %#010Lx-%#010Lx]\n",
+   printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
   nd_pa, nd_pa + nd_size - 1);
tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
if (tnid != nid)
@@ -230,9 +218,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 
node_data[nid] = nd;
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-   NODE_DATA(nid)->node_id = nid;
-   NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
-   NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
 
node_set_online(nid);
 }
@@ -523,8 +508,17 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
end = max(mi->blk[i].end, end);
}
 
-   if (start < end)
-   setup_node_data(nid, start, end);
+   if (start >= end)
+   continue;
+
+   /*
+* Don't confuse VM with a node that doesn't have the
+* minimum amount of memory:
+*/
+   if (end && (end - start) < NODE_MIN_SIZE)
+   continue;
+
+   alloc_node_data(nid);
}
 
/* Dump memblock with node info and return. */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 18cee0d..d0e3d2f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4976,6 +4976,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_start_pfn = node_start_pfn;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+   printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
+   (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1);
 #endif

Re: [RESEND] x86: numa: setup_node_data(): drop dead code and rename function

2014-09-15 Thread Luiz Capitulino
On Mon, 15 Sep 2014 17:13:39 -0700 (PDT)
David Rientjes rient...@google.com wrote:

 On Mon, 15 Sep 2014, Luiz Capitulino wrote:
 
  The setup_node_data() function allocates a pg_data_t object, inserts it
  into the node_data[] array and initializes the following fields: node_id,
  node_start_pfn and node_spanned_pages.
  
  However, a few function calls later during the kernel boot,
  free_area_init_node() re-initializes those fields, possibly with
  different values. This means that the initialization done by
  setup_node_data() is not used.
  
  This causes a small glitch when running Linux as a hyperv numa guest:
  
  [0.00] SRAT: PXM 0 -> APIC 0x00 -> Node 0
  [0.00] SRAT: PXM 0 -> APIC 0x01 -> Node 0
  [0.00] SRAT: PXM 1 -> APIC 0x02 -> Node 1
  [0.00] SRAT: PXM 1 -> APIC 0x03 -> Node 1
  [0.00] SRAT: Node 0 PXM 0 [mem 0x-0x7fff]
  [0.00] SRAT: Node 1 PXM 1 [mem 0x8020-0xf7ff]
  [0.00] SRAT: Node 1 PXM 1 [mem 0x1-0x1081f]
  [0.00] NUMA: Node 1 [mem 0x8020-0xf7ff] + [mem 
  0x1-0x1081f] -> [mem 0x8020-0x1081f]
  [0.00] Initmem setup node 0 [mem 0x-0x7fff]
  [0.00]   NODE_DATA [mem 0x7ffdc000-0x7ffe]
  [0.00] Initmem setup node 1 [mem 0x8080-0x1081f]
  [0.00]   NODE_DATA [mem 0x1081ea000-0x1081fdfff]
  [0.00] crashkernel: memory value expected
  [0.00]  [ea00-ea0001ff] PMD -> 
  [88007de0-88007fdf] on node 0
  [0.00]  [ea000200-ea00043f] PMD -> 
  [88010560-8801077f] on node 1
  [0.00] Zone ranges:
  [0.00]   DMA  [mem 0x1000-0x00ff]
  [0.00]   DMA32[mem 0x0100-0x]
  [0.00]   Normal   [mem 0x1-0x1081f]
  [0.00] Movable zone start for each node
  [0.00] Early memory node ranges
  [0.00]   node   0: [mem 0x1000-0x0009efff]
  [0.00]   node   0: [mem 0x0010-0x7ffe]
  [0.00]   node   1: [mem 0x8020-0xf7ff]
  [0.00]   node   1: [mem 0x1-0x1081f]
  [0.00] On node 0 totalpages: 524174
  [0.00]   DMA zone: 64 pages used for memmap
  [0.00]   DMA zone: 21 pages reserved
  [0.00]   DMA zone: 3998 pages, LIFO batch:0
  [0.00]   DMA32 zone: 8128 pages used for memmap
  [0.00]   DMA32 zone: 520176 pages, LIFO batch:31
  [0.00] On node 1 totalpages: 524288
  [0.00]   DMA32 zone: 7672 pages used for memmap
  [0.00]   DMA32 zone: 491008 pages, LIFO batch:31
  [0.00]   Normal zone: 520 pages used for memmap
  [0.00]   Normal zone: 33280 pages, LIFO batch:7
  
  In this dmesg, the SRAT table reports that the memory range for node 1
  starts at 0x8020.  However, the line starting with Initmem reports
  that node 1 memory range starts at 0x8080.  The Initmem line is
  reported by setup_node_data() and is wrong, because the kernel ends up
  using the range as reported in the SRAT table.
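
For illustration only (this sketch is not from the patch or the thread): the wrong start comes from the roundup to ZONE_ALIGN that the patch removes. A minimal stand-alone C example of that arithmetic, assuming the usual x86-64 defaults PAGE_SHIFT=12 and MAX_ORDER=11, and a hypothetical full start address of 0x80200000 for node 1 consistent with the truncated range above:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12                                /* assumed x86-64 default */
#define MAX_ORDER  11                                /* assumed default */
#define ZONE_ALIGN (1UL << (MAX_ORDER + PAGE_SHIFT)) /* 8 MiB, as in the removed macro */

/* round x up to the next multiple of align, like the kernel's roundup() */
static uint64_t roundup_u64(uint64_t x, uint64_t align)
{
        return ((x + align - 1) / align) * align;
}

int main(void)
{
        uint64_t srat_start = 0x80200000ULL;   /* hypothetical node 1 start from SRAT */

        printf("SRAT start:       %#010llx\n", (unsigned long long)srat_start);
        printf("after ZONE_ALIGN: %#010llx\n",
               (unsigned long long)roundup_u64(srat_start, ZONE_ALIGN));
        return 0;
}

Under those assumptions this prints 0x80800000, i.e. the same 8 MiB roundup that shifted the start reported by the old "Initmem" line.
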
  
  This commit drops all that dead code from setup_node_data(), renames it to
  alloc_node_data() and adds a printk() to free_area_init_node() so that we
  report a node's memory range accurately.
  
  Here's the same dmesg section with this patch applied:
  
  [0.00] SRAT: PXM 0 -> APIC 0x00 -> Node 0
  [0.00] SRAT: PXM 0 -> APIC 0x01 -> Node 0
  [0.00] SRAT: PXM 1 -> APIC 0x02 -> Node 1
  [0.00] SRAT: PXM 1 -> APIC 0x03 -> Node 1
  [0.00] SRAT: Node 0 PXM 0 [mem 0x-0x7fff]
  [0.00] SRAT: Node 1 PXM 1 [mem 0x8020-0xf7ff]
  [0.00] SRAT: Node 1 PXM 1 [mem 0x1-0x1081f]
  [0.00] NUMA: Node 1 [mem 0x8020-0xf7ff] + [mem 
  0x1-0x1081f] -> [mem 0x8020-0x1081f]
  [0.00] NODE_DATA(0) allocated [mem 0x7ffdc000-0x7ffe]
  [0.00] NODE_DATA(1) allocated [mem 0x1081ea000-0x1081fdfff]
  [0.00] crashkernel: memory value expected
  [0.00]  [ea00-ea0001ff] PMD -> 
  [88007de0-88007fdf] on node 0
  [0.00]  [ea000200-ea00043f] PMD -> 
  [88010560-8801077f] on node 1
  [0.00] Zone ranges:
  [0.00]   DMA  [mem 0x1000-0x00ff]
  [0.00]   DMA32[mem 0x0100-0x]
  [0.00]   Normal   [mem 0x1-0x1081f]
  [0.00] Movable zone start for each node
  [0.00] Early memory node ranges
  [0.00]   node   0: [mem 0x1000-0x0009efff]
  [0.00]   node   0: [mem 0x0010-0x7ffe]
  [0.00]   node   1: [mem 0x8020-0xf7ff]
  [0.00]   node   1: [mem 0x1-0x1081f]
  [0.00] Initmem setup node 0 [mem 0x1000-0x7ffe]
  [0.00] On node 0 totalpages: 524174
  [0.00]   DMA zone: 64 pages used for memmap
  [0.00]   DMA zone: 21 pages reserved

Re: [patch] mm, hugetlb: generalize writes to nr_hugepages

2014-07-09 Thread Luiz Capitulino
On Tue, 8 Jul 2014 15:11:13 -0700
Andrew Morton  wrote:

> On Wed, 2 Jul 2014 17:44:46 -0700 (PDT) David Rientjes  
> wrote:
> 
> > > > @@ -2248,36 +2257,18 @@ static int hugetlb_sysctl_handler_common(bool 
> > > > obey_mempolicy,
> > > >  void __user *buffer, size_t *length, loff_t 
> > > > *ppos)
> > > >  {
> > > > struct hstate *h = &default_hstate;
> > > > -   unsigned long tmp;
> > > > +   unsigned long tmp = h->max_huge_pages;
> > > > int ret;
> > > >  
> > > > -   if (!hugepages_supported())
> > > > -   return -ENOTSUPP;
> > > 
> > > Shouldn't you add this check to __nr_hugepages_store_common()? Otherwise
> > > looks good to me.
> > > 
> > 
> > Hmm, I think you're right but I don't think __nr_hugepages_store_common() 
> > is the right place: if we have a legitimate hstate for the sysfs tunables 
> > then we should support hugepages.  I think this should be kept in 
> > hugetlb_sysctl_handler_common().
> 
> This?

Yes.

> 
> --- a/mm/hugetlb.c~mm-hugetlb-generalize-writes-to-nr_hugepages-fix
> +++ a/mm/hugetlb.c
> @@ -2260,6 +2260,9 @@ static int hugetlb_sysctl_handler_common
>   unsigned long tmp = h->max_huge_pages;
>   int ret;
>  
> + if (!hugepages_supported())
> + return -ENOTSUPP;
> +
>   table->data = &tmp;
>   table->maxlen = sizeof(unsigned long);
>   ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
> _
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] x86: numa: setup_node_data(): drop dead code and rename function

2014-07-03 Thread Luiz Capitulino
atch:31
[0.00] Initmem setup node 1 [mem 0x8020-0x1081f]
[0.00] On node 1 totalpages: 524288
[0.00]   DMA32 zone: 7672 pages used for memmap
[0.00]   DMA32 zone: 491008 pages, LIFO batch:31
[0.00]   Normal zone: 520 pages used for memmap
[0.00]   Normal zone: 33280 pages, LIFO batch:7

This commit was tested on a two node bare-metal NUMA machine and Linux
as a numa guest on hyperv and qemu/kvm.

PS: The wrong memory range reported by setup_node_data() seems to be
harmless in the current kernel because it's just not used. However,
that bad range is used in kernel 2.6.32 to initialize the old boot
    memory allocator, which causes a crash during boot.

Signed-off-by: Luiz Capitulino 
---

v2

 - Restore Initmem text when reporting the memory range in dmesg [David]

o David, Andrew: I'm sending v2 instead of sending an incremental patch
  because I had to update the commit log.

 arch/x86/include/asm/numa.h |  1 -
 arch/x86/mm/numa.c  | 34 ++
 mm/page_alloc.c |  2 ++
 3 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 4064aca..01b493e 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -9,7 +9,6 @@
 #ifdef CONFIG_NUMA
 
 #define NR_NODE_MEMBLKS(MAX_NUMNODES*2)
-#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 
 /*
  * Too small node sizes may confuse the VM badly. Usually they
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index a32b706..d221374 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -185,8 +185,8 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
return numa_add_memblk_to(nid, start, end, &numa_meminfo);
 }
 
-/* Initialize NODE_DATA for a node on the local memory */
-static void __init setup_node_data(int nid, u64 start, u64 end)
+/* Allocate NODE_DATA for a node on the local memory */
+static void __init alloc_node_data(int nid)
 {
const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
u64 nd_pa;
@@ -194,18 +194,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
int tnid;
 
/*
-* Don't confuse VM with a node that doesn't have the
-* minimum amount of memory:
-*/
-   if (end && (end - start) < NODE_MIN_SIZE)
-   return;
-
-   start = roundup(start, ZONE_ALIGN);
-
-   printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
-  nid, start, end - 1);
-
-   /*
 * Allocate node data.  Try node-local memory and then any node.
 * Never allocate in DMA zone.
 */
@@ -222,7 +210,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
nd = __va(nd_pa);
 
/* report and initialize */
-   printk(KERN_INFO "  NODE_DATA [mem %#010Lx-%#010Lx]\n",
+   printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
   nd_pa, nd_pa + nd_size - 1);
tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
if (tnid != nid)
@@ -230,9 +218,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 
node_data[nid] = nd;
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-   NODE_DATA(nid)->node_id = nid;
-   NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
-   NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
 
node_set_online(nid);
 }
@@ -523,8 +508,17 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
end = max(mi->blk[i].end, end);
}
 
-   if (start < end)
-   setup_node_data(nid, start, end);
+   if (start >= end)
+   continue;
+
+   /*
+* Don't confuse VM with a node that doesn't have the
+* minimum amount of memory:
+*/
+   if (end && (end - start) < NODE_MIN_SIZE)
+   continue;
+
+   alloc_node_data(nid);
}
 
/* Dump memblock with node info and return. */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 20d17f8..9c699e7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4957,6 +4957,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_start_pfn = node_start_pfn;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+   printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
+   (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1);
 #endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
  zones_size, zholes_size);
-- 
1.9.3

Re: [PATCH] x86: numa: setup_node_data(): drop dead code and rename function

2014-07-02 Thread Luiz Capitulino
On Wed, 2 Jul 2014 16:20:47 -0700 (PDT)
David Rientjes  wrote:

> On Wed, 2 Jul 2014, Luiz Capitulino wrote:
> 
> > > With this patch, the dmesg changes break one of my scripts that we use to 
> > > determine the start and end address of a node (doubly bad because there's 
> > > no sysfs interface to determine this otherwise and we have to do this at 
> > > boot to acquire the system topology).
> > > 
> > > Specifically, the removal of the
> > > 
> > >   "Initmem setup node X [mem 0xstart-0xend]"
> > > 
> > > lines that are replaced when each node is onlined to
> > > 
> > >   "Node 0 memory range 0xstart-0xend"
> > > 
> > > And if I just noticed this breakage when booting the latest -mm kernel, 
> > > I'm assuming I'm not the only person who is going to run into it.  Is it 
> > > possible to not change the dmesg output?
> > 
> > Sure. I can add back the original text. The only detail is that with this
> > patch that line is now printed a little bit later during boot and the
> > NODE_DATA lines also changed. Are you OK with that?
> > 
> 
> Yes, please.  I think it should be incremental on your patch since it's 
> already in -mm with " fix" appended so the title of the patch would be 
> "x86: numa: setup_node_data(): drop dead code and rename function fix" and 
> then Andrew can fold it into the original when sending it to the x86 
> maintainers.
> 
> > What's the guidelines on changing what's printed in dmesg?
> > 
> 
> That's the scary part, there doesn't seem to be any.  It's especially 
> crucial for things that only get printed once and aren't available 
> anywhere else at runtime; there was talk of adding a sysfs interface that 
> defines the start and end addresses of nodes but it's complicated because 
> nodes can overlap each other.  If that had been available years ago then I 
> don't think anybody would raise their hand about this issue.
> 
> These lines went under a smaller change a few years ago for 
> s/Bootmem/Initmem/.  I don't even have to look at the git history to know 
> that because it broke our scripts back then as well.  You just happened to 
> touch lines that I really care about and breaks my topology information :)  
> I wouldn't complain if it was just my userspace, but I have no doubt 
> others have parsed their dmesg in a similar way because people have 
> provided me with data that they retrieved by scraping the kernel log.

No problem. I'll send a patch shortly as you suggested.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] mm, hugetlb: generalize writes to nr_hugepages

2014-07-02 Thread Luiz Capitulino
On Wed, 2 Jul 2014 17:44:46 -0700 (PDT)
David Rientjes  wrote:

> On Wed, 2 Jul 2014, Luiz Capitulino wrote:
> 
> > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > > --- a/mm/hugetlb.c
> > > +++ b/mm/hugetlb.c
> > > @@ -1734,21 +1734,13 @@ static ssize_t nr_hugepages_show_common(struct 
> > > kobject *kobj,
> > >   return sprintf(buf, "%lu\n", nr_huge_pages);
> > >  }
> > >  
> > > -static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
> > > - struct kobject *kobj, struct kobj_attribute *attr,
> > > - const char *buf, size_t len)
> > > +static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
> > > +struct hstate *h, int nid,
> > > +unsigned long count, size_t len)
> > >  {
> > >   int err;
> > > - int nid;
> > > - unsigned long count;
> > > - struct hstate *h;
> > >   NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
> > >  
> > > > - err = kstrtoul(buf, 10, &count);
> > > > - if (err)
> > > > - goto out;
> > > > -
> > > > - h = kobj_to_hstate(kobj, &nid);
> > >   if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
> > >   err = -EINVAL;
> > >   goto out;
> > > @@ -1784,6 +1776,23 @@ out:
> > >   return err;
> > >  }
> > >  
> > > +static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
> > > +  struct kobject *kobj, const char *buf,
> > > +  size_t len)
> > > +{
> > > + struct hstate *h;
> > > + unsigned long count;
> > > + int nid;
> > > + int err;
> > > +
> > > > + err = kstrtoul(buf, 10, &count);
> > > > + if (err)
> > > > + return err;
> > > > +
> > > > + h = kobj_to_hstate(kobj, &nid);
> > > + return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
> > > +}
> > > +
> > >  static ssize_t nr_hugepages_show(struct kobject *kobj,
> > >  struct kobj_attribute *attr, char *buf)
> > >  {
> > > @@ -1793,7 +1802,7 @@ static ssize_t nr_hugepages_show(struct kobject 
> > > *kobj,
> > >  static ssize_t nr_hugepages_store(struct kobject *kobj,
> > >  struct kobj_attribute *attr, const char *buf, size_t len)
> > >  {
> > > - return nr_hugepages_store_common(false, kobj, attr, buf, len);
> > > + return nr_hugepages_store_common(false, kobj, buf, len);
> > >  }
> > >  HSTATE_ATTR(nr_hugepages);
> > >  
> > > @@ -1812,7 +1821,7 @@ static ssize_t nr_hugepages_mempolicy_show(struct 
> > > kobject *kobj,
> > >  static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
> > >  struct kobj_attribute *attr, const char *buf, size_t len)
> > >  {
> > > - return nr_hugepages_store_common(true, kobj, attr, buf, len);
> > > + return nr_hugepages_store_common(true, kobj, buf, len);
> > >  }
> > >  HSTATE_ATTR(nr_hugepages_mempolicy);
> > >  #endif
> > > @@ -2248,36 +2257,18 @@ static int hugetlb_sysctl_handler_common(bool 
> > > obey_mempolicy,
> > >void __user *buffer, size_t *length, loff_t *ppos)
> > >  {
> > >   struct hstate *h = _hstate;
> > > - unsigned long tmp;
> > > + unsigned long tmp = h->max_huge_pages;
> > >   int ret;
> > >  
> > > - if (!hugepages_supported())
> > > - return -ENOTSUPP;
> > 
> > Shouldn't you add this check to __nr_hugepages_store_common()? Otherwise
> > looks good to me.
> > 
> 
> Hmm, I think you're right but I don't think __nr_hugepages_store_common() 
> is the right place: if we have a legitimate hstate for the sysfs tunables 
> then we should support hugepages.  I think this should be kept in 
> hugetlb_sysctl_handler_common().

You seem to be right. Feel free to add if you respin:

Reviewed-by: Luiz Capitulino 

> 
> > > -
> > > - tmp = h->max_huge_pages;
> > > -
> > > - if (write && hstate_is_gigantic(h) && !gigantic_page_supported())
> > > - return -EINVAL;
> > > -
> > >   table->data = 
> > >   table->maxlen = sizeof(unsigned long);
> > >   ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
> > >   if (ret)
> > >   g

Re: [patch] mm, hugetlb: remove hugetlb_zero and hugetlb_infinity

2014-07-02 Thread Luiz Capitulino
On Mon, 30 Jun 2014 17:46:35 -0700 (PDT)
David Rientjes  wrote:

> They are unnecessary: "zero" can be used in place of "hugetlb_zero" and 
> passing extra2 == NULL is equivalent to infinity.
> 
> Signed-off-by: David Rientjes 

Reviewed-by: Luiz Capitulino 

> ---
>  include/linux/hugetlb.h | 1 -
>  kernel/sysctl.c | 9 +++--
>  mm/hugetlb.c| 1 -
>  3 files changed, 3 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -86,7 +86,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long 
> addr, pud_t *pud);
>  #endif
>  
>  extern unsigned long hugepages_treat_as_movable;
> -extern const unsigned long hugetlb_zero, hugetlb_infinity;
>  extern int sysctl_hugetlb_shm_group;
>  extern struct list_head huge_boot_pages;
>  
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = {
>   .maxlen = sizeof(unsigned long),
>   .mode   = 0644,
>   .proc_handler   = hugetlb_sysctl_handler,
> - .extra1 = (void *)&hugetlb_zero,
> - .extra2 = (void *)&hugetlb_infinity,
> + .extra1 = &zero,
>   },
>  #ifdef CONFIG_NUMA
>   {
> @@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = {
>   .maxlen = sizeof(unsigned long),
>   .mode   = 0644,
>   .proc_handler   = &hugetlb_mempolicy_sysctl_handler,
> - .extra1 = (void *)&hugetlb_zero,
> - .extra2 = (void *)&hugetlb_infinity,
> + .extra1 = &zero,
>   },
>  #endif
>{
> @@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = {
>   .maxlen = sizeof(unsigned long),
>   .mode   = 0644,
>   .proc_handler   = hugetlb_overcommit_handler,
> - .extra1 = (void *)&hugetlb_zero,
> - .extra2 = (void *)&hugetlb_infinity,
> + .extra1 = &zero,
>   },
>  #endif
>   {
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -35,7 +35,6 @@
> #include <linux/node.h>
>  #include "internal.h"
>  
> -const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
>  unsigned long hugepages_treat_as_movable;
>  
>  int hugetlb_max_hstate __read_mostly;
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] mm, hugetlb: generalize writes to nr_hugepages

2014-07-02 Thread Luiz Capitulino
On Mon, 30 Jun 2014 16:57:06 -0700 (PDT)
David Rientjes  wrote:

> Three different interfaces alter the maximum number of hugepages for an
> hstate:
> 
>  - /proc/sys/vm/nr_hugepages for global number of hugepages of the default
>hstate,
> 
>  - /sys/kernel/mm/hugepages/hugepages-X/nr_hugepages for global number of
>hugepages for a specific hstate, and
> 
>  - /sys/kernel/mm/hugepages/hugepages-X/nr_hugepages/mempolicy for number of
>hugepages for a specific hstate over the set of allowed nodes.
> 
> Generalize the code so that a single function handles all of these writes 
> instead of duplicating the code in two different functions.
> 
> This decreases the number of lines of code, but also reduces the size of
> .text by about half a percent since set_max_huge_pages() can be inlined.
> 
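
As a user-space aside (not part of this patch): a minimal C sketch of writing a count to the three tunables listed above. The hugepages-2048kB directory name and the count of 8 are assumptions for a 2 MB hstate; the mempolicy variant is the nr_hugepages_mempolicy attribute defined further down in the quoted patch, and the writes only succeed with sufficient privileges.

#include <stdio.h>

/* write a decimal hugepage count to one tunable */
static int write_count(const char *path, unsigned long count)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return -1;
        }
        fprintf(f, "%lu\n", count);
        return fclose(f);
}

int main(void)
{
        unsigned long count = 8;        /* assumed example value */

        /* global count for the default hstate */
        write_count("/proc/sys/vm/nr_hugepages", count);
        /* per-hstate count; hugepages-2048kB assumes a 2 MB hstate */
        write_count("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", count);
        /* per-hstate count over the caller's mempolicy nodes
           (the nr_hugepages_mempolicy attribute in the patch) */
        write_count("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages_mempolicy",
                    count);
        return 0;
}
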
> Signed-off-by: David Rientjes 
> ---
>  mm/hugetlb.c | 61 
> ++--
>  1 file changed, 26 insertions(+), 35 deletions(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1734,21 +1734,13 @@ static ssize_t nr_hugepages_show_common(struct 
> kobject *kobj,
>   return sprintf(buf, "%lu\n", nr_huge_pages);
>  }
>  
> -static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
> - struct kobject *kobj, struct kobj_attribute *attr,
> - const char *buf, size_t len)
> +static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
> +struct hstate *h, int nid,
> +unsigned long count, size_t len)
>  {
>   int err;
> - int nid;
> - unsigned long count;
> - struct hstate *h;
>   NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
>  
> > - err = kstrtoul(buf, 10, &count);
> > - if (err)
> > - goto out;
> > -
> > - h = kobj_to_hstate(kobj, &nid);
>   if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
>   err = -EINVAL;
>   goto out;
> @@ -1784,6 +1776,23 @@ out:
>   return err;
>  }
>  
> +static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
> +  struct kobject *kobj, const char *buf,
> +  size_t len)
> +{
> + struct hstate *h;
> + unsigned long count;
> + int nid;
> + int err;
> +
> > + err = kstrtoul(buf, 10, &count);
> > + if (err)
> > + return err;
> > +
> > + h = kobj_to_hstate(kobj, &nid);
> + return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
> +}
> +
>  static ssize_t nr_hugepages_show(struct kobject *kobj,
>  struct kobj_attribute *attr, char *buf)
>  {
> @@ -1793,7 +1802,7 @@ static ssize_t nr_hugepages_show(struct kobject *kobj,
>  static ssize_t nr_hugepages_store(struct kobject *kobj,
>  struct kobj_attribute *attr, const char *buf, size_t len)
>  {
> - return nr_hugepages_store_common(false, kobj, attr, buf, len);
> + return nr_hugepages_store_common(false, kobj, buf, len);
>  }
>  HSTATE_ATTR(nr_hugepages);
>  
> @@ -1812,7 +1821,7 @@ static ssize_t nr_hugepages_mempolicy_show(struct 
> kobject *kobj,
>  static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
>  struct kobj_attribute *attr, const char *buf, size_t len)
>  {
> - return nr_hugepages_store_common(true, kobj, attr, buf, len);
> + return nr_hugepages_store_common(true, kobj, buf, len);
>  }
>  HSTATE_ATTR(nr_hugepages_mempolicy);
>  #endif
> @@ -2248,36 +2257,18 @@ static int hugetlb_sysctl_handler_common(bool 
> obey_mempolicy,
>void __user *buffer, size_t *length, loff_t *ppos)
>  {
>   struct hstate *h = _hstate;
> - unsigned long tmp;
> + unsigned long tmp = h->max_huge_pages;
>   int ret;
>  
> - if (!hugepages_supported())
> - return -ENOTSUPP;

Shouldn't you add this check to __nr_hugepages_store_common()? Otherwise
looks good to me.

> -
> - tmp = h->max_huge_pages;
> -
> - if (write && hstate_is_gigantic(h) && !gigantic_page_supported())
> - return -EINVAL;
> -
>   table->data = 
>   table->maxlen = sizeof(unsigned long);
>   ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
>   if (ret)
>   goto out;
>  
> - if (write) {
> - NODEMASK_ALLOC(nodemask_t, nodes_allowed,
> - GFP_KERNEL | __GFP_NORETRY);
> - if (!(obey_mempolicy &&
> -init_nodemask_of_mempolicy(nodes_allowed))) {
> - NODEMASK_FREE(nodes_allowed);
> > - nodes_allowed = &node_states[N_MEMORY];
> > - }
> > - h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
> > -
> > - if (nodes_allowed != &node_states[N_MEMORY])
> - NODEMASK_FREE(nodes_allowed);
> - }
> +  

Re: [PATCH] x86: numa: setup_node_data(): drop dead code and rename function

2014-07-02 Thread Luiz Capitulino
On Mon, 30 Jun 2014 16:42:39 -0700 (PDT)
David Rientjes  wrote:

> On Thu, 19 Jun 2014, Luiz Capitulino wrote:
> 
> > The setup_node_data() function allocates a pg_data_t object, inserts it
> > into the node_data[] array and initializes the following fields:
> > node_id, node_start_pfn and node_spanned_pages.
> > 
> > However, a few function calls later during the kernel boot,
> > free_area_init_node() re-initializes those fields, possibly with
> > different values. This means that the initialization done by
> > setup_node_data() is not used.
> > 
> > This causes a small glitch when running Linux as a hyperv numa guest:
> > 
> > [0.00] SRAT: PXM 0 -> APIC 0x00 -> Node 0
> > [0.00] SRAT: PXM 0 -> APIC 0x01 -> Node 0
> > [0.00] SRAT: PXM 1 -> APIC 0x02 -> Node 1
> > [0.00] SRAT: PXM 1 -> APIC 0x03 -> Node 1
> > [0.00] SRAT: Node 0 PXM 0 [mem 0x-0x7fff]
> > [0.00] SRAT: Node 1 PXM 1 [mem 0x8020-0xf7ff]
> > [0.00] SRAT: Node 1 PXM 1 [mem 0x1-0x1081f]
> > [0.00] NUMA: Node 1 [mem 0x8020-0xf7ff] + [mem 
> > 0x1-0x1081f] -> [mem 0x8020-0x1081f]
> > [0.00] Initmem setup node 0 [mem 0x-0x7fff]
> > [0.00]   NODE_DATA [mem 0x7ffec000-0x7ffe]
> > [0.00] Initmem setup node 1 [mem 0x8080-0x1081f]
> > [0.00]   NODE_DATA [mem 0x1081fa000-0x1081fdfff]
> > [0.00] crashkernel: memory value expected
> > [0.00]  [ea00-ea0001ff] PMD -> 
> > [88007de0-88007fdf] on node 0
> > [0.00]  [ea000200-ea00043f] PMD -> 
> > [88010560-8801077f] on node 1
> > [0.00] Zone ranges:
> > [0.00]   DMA  [mem 0x1000-0x00ff]
> > [0.00]   DMA32[mem 0x0100-0x]
> > [0.00]   Normal   [mem 0x1-0x1081f]
> > [0.00] Movable zone start for each node
> > [0.00] Early memory node ranges
> > [0.00]   node   0: [mem 0x1000-0x0009efff]
> > [0.00]   node   0: [mem 0x0010-0x7ffe]
> > [0.00]   node   1: [mem 0x8020-0xf7ff]
> > [0.00]   node   1: [mem 0x1-0x1081f]
> > [0.00] On node 0 totalpages: 524174
> > [0.00]   DMA zone: 64 pages used for memmap
> > [0.00]   DMA zone: 21 pages reserved
> > [0.00]   DMA zone: 3998 pages, LIFO batch:0
> > [0.00]   DMA32 zone: 8128 pages used for memmap
> > [0.00]   DMA32 zone: 520176 pages, LIFO batch:31
> > [0.00] On node 1 totalpages: 524288
> > [0.00]   DMA32 zone: 7672 pages used for memmap
> > [0.00]   DMA32 zone: 491008 pages, LIFO batch:31
> > [0.00]   Normal zone: 520 pages used for memmap
> > [0.00]   Normal zone: 33280 pages, LIFO batch:7
> > 
> > In this dmesg, the SRAT table reports that the memory range for node 1
> > starts at 0x8020. However, the line starting with "Initmem" reports
> > that node 1 memory range starts at 0x8080. The "Initmem" line is
> > reported by setup_node_data() and is wrong, because the kernel ends up
> > using the range as reported in the SRAT table.
> > 
> > This commit drops all that dead code from setup_node_data(), renames it
> > to alloc_node_data() and adds a printk() to free_area_init_node() so
> > that we report a node's memory range accurately.
> > 
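
Side note on the added printk() (illustration only, not kernel code): it derives the byte range from the pfn range that get_pfn_range_for_nid() fills in. A tiny stand-alone sketch of that arithmetic with made-up pfn values, assuming PAGE_SHIFT=12:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12   /* assumed x86-64 default */

int main(void)
{
        /* made-up node pfn range; end_pfn is exclusive, as in the kernel */
        uint64_t start_pfn = 0x100;
        uint64_t end_pfn   = 0x7fff0;

        /* same expression as the printk added to free_area_init_node() */
        printf("Initmem setup node %d [mem %#010llx-%#010llx]\n", 0,
               (unsigned long long)(start_pfn << PAGE_SHIFT),
               (unsigned long long)((end_pfn << PAGE_SHIFT) - 1));
        return 0;
}
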
> > Here's the same dmesg section with this patch applied:
> > 
> > [0.00] SRAT: PXM 0 -> APIC 0x00 -> Node 0
> > [0.00] SRAT: PXM 0 -> APIC 0x01 -> Node 0
> > [0.00] SRAT: PXM 1 -> APIC 0x02 -> Node 1
> > [0.00] SRAT: PXM 1 -> APIC 0x03 -> Node 1
> > [0.00] SRAT: Node 0 PXM 0 [mem 0x-0x7fff]
> > [0.00] SRAT: Node 1 PXM 1 [mem 0x8020-0xf7ff]
> > [0.00] SRAT: Node 1 PXM 1 [mem 0x1-0x1081f]
> > [0.00] NUMA: Node 1 [mem 0x8020-0xf7ff] + [mem 
> > 0x1-0x1081f] -> [mem 0x8020-0x1081f]
> > [0.00] NODE_DATA(0) allocated [mem 0x7ffec000-0x7ffe]
> > [0.00] NODE_DATA(1) allocated [mem 0x1081fa000-0x1081fdfff]
> > [0.00] crashkernel: memory value expected
> > [0.00]  [ea00-ea0001ff] PMD -> 
> > [88007de0-88007fdf] on node 0
> > [0.00]  [ea000200-ea00043f] PMD -> 
> &

Re: [PATCH] x86: numa: setup_node_data(): drop dead code and rename function

2014-06-26 Thread Luiz Capitulino
On Thu, 26 Jun 2014 10:51:11 -0400
Rik van Riel  wrote:

> -BEGIN PGP SIGNED MESSAGE-
> Hash: SHA1
> 
> On 06/19/2014 10:20 PM, Luiz Capitulino wrote:
> 
> > @@ -523,8 +508,17 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> > end = max(mi->blk[i].end, end);
> > }
> > 
> > -   if (start < end)
> > -   setup_node_data(nid, start, end);
> > +   if (start >= end)
> > +   continue;
> > +
> > +   /*
> > +* Don't confuse VM with a node that doesn't have the
> > +* minimum amount of memory:
> > +*/
> > +   if (end && (end - start) < NODE_MIN_SIZE)
> > +   continue;
> > +
> > +   alloc_node_data(nid);
> > }
> 
> Minor nit.  If we skip a too-small node, should we remember that we
> did so, and add its memory to another node, assuming it is physically
> contiguous memory?

Interesting point. Honest question, please disregard if this doesn't
make sense: but won't this affect automatic NUMA balancing performance?
The kernel won't know that the extra memory actually pertains to another
node, and hence that extra memory will be at a different distance from
the node that ends up using it.

If my thinking is wrong, or if you believe this is a good feature even so,
I can work on it in a separate patch, as this check is not being introduced
by this patch. Although I also wonder how many NUMA machines have such small
nodes...

> Other than that...
> 
> Acked-by: Rik van Riel 

Thanks!

> 
> - -- 
> All rights reversed
> -BEGIN PGP SIGNATURE-
> Version: GnuPG v1
> Comment: Using GnuPG with Thunderbird - http://www.enigmail.net/
> 
> iQEcBAEBAgAGBQJTrDNfAAoJEM553pKExN6DrNgH/j160OIey5moCEFMH51a1e3+
> D6iOIXxsVii5/wqabYuA1DCQ8Asgd/UK2BWdxxRZVZuTHXXn97iifq1IkIPEQxXc
> pjz25/ZFSpa3fgZk8iyUzOQjLukFfkiaO1mSopO7IWwUZoEa9fJ7bOBvwcnFU4oQ
> uZAV375RpxiPEXNh2qQZXX0kNrycZd8S81jUSuQv3OLPRI1EQo+txOg/u7ir0pOJ
> z1fkBK0hiSHziAzB/nyjR/RgSb23vpMlUlPoGMhwCMp08aJkL147bHZvsCtlg/w4
> kBqq/zy9te4ecSicUsX/l16o0SJ9a1JtvFAlqz0iqlGcKQGCEw2P+y0ZyrhfvaE=
> =NOgK
> -END PGP SIGNATURE-
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] x86: numa: setup_node_data(): drop dead code and rename function

2014-06-19 Thread Luiz Capitulino
[0.00] Node 1 memory range 0x8020-0x1081f
[0.00] On node 1 totalpages: 524288
[0.00]   DMA32 zone: 7672 pages used for memmap
[0.00]   DMA32 zone: 491008 pages, LIFO batch:31
[0.00]   Normal zone: 520 pages used for memmap
[0.00]   Normal zone: 33280 pages, LIFO batch:7

This commit was tested on a two node bare-metal NUMA machine and Linux
as a numa guest on hyperv and qemu/kvm.

PS: The wrong memory range reported by setup_node_data() seems to be
harmless in the current kernel because it's just not used. However,
that bad range is used in kernel 2.6.32 to initialize the old boot
    memory allocator, which causes a crash during boot.

Signed-off-by: Luiz Capitulino 
---
 arch/x86/include/asm/numa.h |  1 -
 arch/x86/mm/numa.c  | 34 ++
 mm/page_alloc.c |  2 ++
 3 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 4064aca..01b493e 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -9,7 +9,6 @@
 #ifdef CONFIG_NUMA
 
 #define NR_NODE_MEMBLKS(MAX_NUMNODES*2)
-#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 
 /*
  * Too small node sizes may confuse the VM badly. Usually they
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index a32b706..d221374 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -185,8 +185,8 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
 }
 
-/* Initialize NODE_DATA for a node on the local memory */
-static void __init setup_node_data(int nid, u64 start, u64 end)
+/* Allocate NODE_DATA for a node on the local memory */
+static void __init alloc_node_data(int nid)
 {
const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
u64 nd_pa;
@@ -194,18 +194,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
int tnid;
 
/*
-* Don't confuse VM with a node that doesn't have the
-* minimum amount of memory:
-*/
-   if (end && (end - start) < NODE_MIN_SIZE)
-   return;
-
-   start = roundup(start, ZONE_ALIGN);
-
-   printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
-  nid, start, end - 1);
-
-   /*
 * Allocate node data.  Try node-local memory and then any node.
 * Never allocate in DMA zone.
 */
@@ -222,7 +210,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
nd = __va(nd_pa);
 
/* report and initialize */
-   printk(KERN_INFO "  NODE_DATA [mem %#010Lx-%#010Lx]\n",
+   printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
   nd_pa, nd_pa + nd_size - 1);
tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
if (tnid != nid)
@@ -230,9 +218,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 
node_data[nid] = nd;
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-   NODE_DATA(nid)->node_id = nid;
-   NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
-   NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
 
node_set_online(nid);
 }
@@ -523,8 +508,17 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
end = max(mi->blk[i].end, end);
}
 
-   if (start < end)
-   setup_node_data(nid, start, end);
+   if (start >= end)
+   continue;
+
+   /*
+* Don't confuse VM with a node that doesn't have the
+* minimum amount of memory:
+*/
+   if (end && (end - start) < NODE_MIN_SIZE)
+   continue;
+
+   alloc_node_data(nid);
}
 
/* Dump memblock with node info and return. */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4f59fa2..e57b7d3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4956,6 +4956,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 	pgdat->node_start_pfn = node_start_pfn;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+	printk(KERN_INFO "Node %d memory range %#010Lx-%#010Lx\n", nid,
+		(u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1);
 #endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
  zones_size, zholes_size);
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] x86: numa: drop ZONE_ALIGN

2014-06-11 Thread Luiz Capitulino

Yinghai, sorry for my late reply.

On Mon, 9 Jun 2014 15:13:41 -0700
Yinghai Lu  wrote:

> On Mon, Jun 9, 2014 at 12:03 PM, Luiz Capitulino  
> wrote:
> > On Sun, 8 Jun 2014 18:29:11 -0700
> > Yinghai Lu  wrote:
> >
> >> On Sun, Jun 8, 2014 at 3:14 PM, Luiz Capitulino  
> >> wrote:
> > [0.00] e820: BIOS-provided physical RAM map:
> > [0.00] BIOS-e820: [mem 0x-0x0009fbff] usable
> > [0.00] BIOS-e820: [mem 0x0009fc00-0x0009] 
> > reserved
> > [0.00] BIOS-e820: [mem 0x000e-0x000f] 
> > reserved
> > [0.00] BIOS-e820: [mem 0x0010-0x3ffe] usable
> > [0.00] BIOS-e820: [mem 0x3fff-0x3fffefff] ACPI 
> > data
> > [0.00] BIOS-e820: [mem 0x3000-0x3fff] ACPI 
> > NVS
> > [0.00] BIOS-e820: [mem 0x4020-0x801f] usable
> ...
> > [0.00] SRAT: PXM 0 -> APIC 0x00 -> Node 0
> > [0.00] SRAT: PXM 0 -> APIC 0x01 -> Node 0
> > [0.00] SRAT: PXM 1 -> APIC 0x02 -> Node 1
> > [0.00] SRAT: PXM 1 -> APIC 0x03 -> Node 1
> > [0.00] SRAT: Node 0 PXM 0 [mem 0x-0x3fff]
> > [0.00] SRAT: Node 1 PXM 1 [mem 0x4020-0x801f]
> > [0.00] Initmem setup node 0 [mem 0x-0x3fff]
> > [0.00]   NODE_DATA [mem 0x3ffec000-0x3ffe]
> > [0.00] Initmem setup node 1 [mem 0x4080-0x801f]
> > [0.00]   NODE_DATA [mem 0x801fb000-0x801fefff]
> 
> so node1 start is aligned to 8M from 2M
> 
> node0: [0, 1G)
> node1: [1G+2M, 2G+2M)
> 
> The zone should not cross the 8M boundary?

Yes, but the question is: why?

> In the case should we trim the memblock for numa to be 8M alignment ?

My current thinking, after discussing this with David, is to just page
align the memory range. This should fix the hyperv-triggered bug in 2.6.32
and seems to be the right thing for upstream too.
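
To make the alignment above concrete, here is a small stand-alone sketch of the
arithmetic. MAX_ORDER = 11 and PAGE_SHIFT = 12 are assumed x86_64 defaults (giving
ZONE_ALIGN = 8M), roundup() is a local copy for illustration, and the start address
is node 1's SRAT start from the log above (1G+2M):

#include <stdio.h>

/* Assumed x86_64 defaults at the time of this thread. */
#define PAGE_SHIFT	12
#define MAX_ORDER	11
#define ZONE_ALIGN	(1UL << (MAX_ORDER + PAGE_SHIFT))	/* 8 MB */

/* Local copy of the kernel's roundup() for illustration. */
#define roundup(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	/* Node 1 start as reported by the SRAT in the dmesg above: 1G + 2M. */
	unsigned long long srat_start = 0x40200000ULL;
	unsigned long long aligned    = roundup(srat_start, ZONE_ALIGN);

	/* Prints 0x40200000 -> 0x40800000, i.e. 1G+2M rounded up to 1G+8M,
	 * matching the "Initmem setup node 1 [mem 0x4080...]" line above. */
	printf("%#llx -> %#llx (ZONE_ALIGN = %lu bytes)\n",
	       srat_start, aligned, ZONE_ALIGN);
	return 0;
}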
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] x86: numa: drop ZONE_ALIGN

2014-06-10 Thread Luiz Capitulino
On Tue, 10 Jun 2014 15:10:01 -0700 (PDT)
David Rientjes  wrote:

> On Mon, 9 Jun 2014, Luiz Capitulino wrote:
> 
> > > > > > diff --git a/arch/x86/include/asm/numa.h 
> > > > > > b/arch/x86/include/asm/numa.h
> > > > > > index 4064aca..01b493e 100644
> > > > > > --- a/arch/x86/include/asm/numa.h
> > > > > > +++ b/arch/x86/include/asm/numa.h
> > > > > > @@ -9,7 +9,6 @@
> > > > > >  #ifdef CONFIG_NUMA
> > > > > >  
> > > > > >  #define NR_NODE_MEMBLKS(MAX_NUMNODES*2)
> > > > > > -#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
> > > > > >  
> > > > > >  /*
> > > > > >   * Too small node sizes may confuse the VM badly. Usually they
> > > > > > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > > > > > index 1d045f9..69f6362 100644
> > > > > > --- a/arch/x86/mm/numa.c
> > > > > > +++ b/arch/x86/mm/numa.c
> > > > > > @@ -200,8 +200,6 @@ static void __init setup_node_data(int nid, u64 
> > > > > > start, u64 end)
> > > > > > if (end && (end - start) < NODE_MIN_SIZE)
> > > > > > return;
> > > > > >  
> > > > > > -   start = roundup(start, ZONE_ALIGN);
> > > > > > -
> > > > > > printk(KERN_INFO "Initmem setup node %d [mem 
> > > > > > %#010Lx-%#010Lx]\n",
> > > > > >nid, start, end - 1);
> > > > > >  
> > > > > 
> > > > > What ensures this start address is page aligned from the BIOS?
> > > > 
> > > > To which start address do you refer to?
> > > 
> > > The start address displayed in the dmesg is not page aligned anymore with 
> > > your change, correct?  
> > 
> > I have to check that but I don't expect this to happen because my
> > understanding of the code is that what's rounded up here is just discarded
> > in free_area_init_node(). Am I wrong?
> > 
> 
> NODE_DATA(nid)->node_start_pfn needs to be accurate if 
> node_set_online(nid).  Since there is no guarantee about page alignment 
> from the ACPI spec, removing the roundup() entirely could cause the 
> address shift >> PAGE_SIZE to be off by one.  I, like you, do not see the 
> need for the ZONE_ALIGN above, but I think we agree that it should be 
> replaced with PAGE_SIZE instead.

Agreed. I'm just not completely sure setup_node_data() is the best place
for it; shouldn't we do it in acpi_numa_memory_affinity_init(), which is
where the ranges are read off the SRAT table?
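
For illustration, a sketch of the off-by-one David describes, using a made-up,
non-page-aligned SRAT start (the addresses in the log earlier in this thread are
in fact aligned): without rounding the start up to PAGE_SIZE before shifting, the
resulting pfn points at a page that begins before the node's first byte.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* Local copy of the kernel's roundup() for illustration. */
#define roundup(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	/* Hypothetical, non-page-aligned start handed to us by the SRAT. */
	unsigned long long start = 0x40200800ULL;

	unsigned long long pfn_trunc   = start >> PAGE_SHIFT;			   /* 0x40200 */
	unsigned long long pfn_aligned = roundup(start, PAGE_SIZE) >> PAGE_SHIFT; /* 0x40201 */

	/* pfn_trunc points at a page that begins 0x800 bytes before the node's
	 * first byte; pfn_aligned is the first page fully inside the node. */
	printf("start=%#llx  pfn (no align)=%#llx  pfn (PAGE_SIZE align)=%#llx\n",
	       start, pfn_trunc, pfn_aligned);
	return 0;
}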
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] x86: numa: drop ZONE_ALIGN

2014-06-09 Thread Luiz Capitulino
On Mon, 9 Jun 2014 14:57:16 -0700 (PDT)
David Rientjes  wrote:

> On Mon, 9 Jun 2014, Luiz Capitulino wrote:
> 
> > > > diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
> > > > index 4064aca..01b493e 100644
> > > > --- a/arch/x86/include/asm/numa.h
> > > > +++ b/arch/x86/include/asm/numa.h
> > > > @@ -9,7 +9,6 @@
> > > >  #ifdef CONFIG_NUMA
> > > >  
> > > >  #define NR_NODE_MEMBLKS(MAX_NUMNODES*2)
> > > > -#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
> > > >  
> > > >  /*
> > > >   * Too small node sizes may confuse the VM badly. Usually they
> > > > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > > > index 1d045f9..69f6362 100644
> > > > --- a/arch/x86/mm/numa.c
> > > > +++ b/arch/x86/mm/numa.c
> > > > @@ -200,8 +200,6 @@ static void __init setup_node_data(int nid, u64 
> > > > start, u64 end)
> > > > if (end && (end - start) < NODE_MIN_SIZE)
> > > > return;
> > > >  
> > > > -   start = roundup(start, ZONE_ALIGN);
> > > > -
> > > > printk(KERN_INFO "Initmem setup node %d [mem 
> > > > %#010Lx-%#010Lx]\n",
> > > >nid, start, end - 1);
> > > >  
> > > 
> > > What ensures this start address is page aligned from the BIOS?
> > 
> > To which start address do you refer to?
> 
> The start address displayed in the dmesg is not page aligned anymore with 
> your change, correct?  

I have to check that but I don't expect this to happen because my
understanding of the code is that what's rounded up here is just discarded
in free_area_init_node(). Am I wrong?

> acpi_parse_memory_affinity() does no 
> transformations on the table, the base address is coming strictly from the 
> SRAT and there is no page alignment requirement in the ACPI specification.  
> NODE_DATA(nid)->node_start_pfn will be correct because it does the shift 
> for you, but it still seems you want to at least align to PAGE_SIZE here. 

I do agree we need to align to PAGE_SIZE, but I'm not sure where we should
do it.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] x86: numa: drop ZONE_ALIGN

2014-06-09 Thread Luiz Capitulino
On Sun, 8 Jun 2014 18:29:11 -0700
Yinghai Lu  wrote:

> On Sun, Jun 8, 2014 at 3:14 PM, Luiz Capitulino  
> wrote:
> > In short, I believe this is just dead code for the upstream kernel but this
> > causes a bug for 2.6.32 based kernels.
> >
> > The setup_node_data() function is used to initialize NODE_DATA() for a node.
> > It gets a node id and a memory range. The start address for the memory range
> > is rounded up to ZONE_ALIGN and then it's used to initialize
> > NODE_DATA(nid)->node_start_pfn.
> > The 2.6.32 kernel did use the rounded up range start to register a node's
> > memory range with the bootmem interface by calling init_bootmem_node().
> > A few steps later during bootmem initialization, the 2.6.32 kernel calls
> > free_bootmem_with_active_regions() to initialize the bootmem bitmap. This
> > function goes through all memory ranges read from the SRAT table and try
> > to mark them as usable for bootmem usage. However, before marking a range
> > as usable, mark_bootmem_node() asserts if the memory range start address
> > (as read from the SRAT table) is less than the value registered with
> > init_bootmem_node(). The assertion will trigger whenever the memory range
> > start address is rounded up, as it will always be greater than what is
> > reported in the SRAT table. This is true when the 2.6.32 kernel runs as a
> > HyperV guest on Windows Server 2012. Dropping ZONE_ALIGN solves the
> > problem there.
> 
> What is e820 memmap and srat from HyperV guest?

I think the dmesg below provides this? Let me know otherwise.

> Can you post bootlog first 200 lines?

[0.00] Initializing cgroup subsys cpuset
[0.00] Initializing cgroup subsys cpu
[0.00] Initializing cgroup subsys cpuacct
[0.00] Linux version 3.15.0-rc6+ 
(r...@amd-6168-8-1.englab.nay.redhat.com) (gcc version 4.4.7 20120313 (Red Hat 
4.4.7-3) (GCC) ) #113 SMP Thu May 29 16:28:41 CST 2014
[0.00] Command line: ro root=/dev/mapper/vg_dhcp66106105-lv_root 
rd_NO_LUKS  KEYBOARDTYPE=pc KEYTABLE=us LANG=en_US.UTF-8 rd_NO_MD 
rd_LVM_LV=vg_dhcp66106105/lv_swap SYSFONT=latarcyrheb-sun16 crashkernel=auto 
rd_LVM_LV=vg_dhcp66106105/lv_root rd_NO_DM rhgb quiet KEYBOARDTYPE=pc 
KEYTABLE=us rd_NO_DM console=ttyS0,115200
[0.00] e820: BIOS-provided physical RAM map:
[0.00] BIOS-e820: [mem 0x-0x0009fbff] usable
[0.00] BIOS-e820: [mem 0x0009fc00-0x0009] reserved
[0.00] BIOS-e820: [mem 0x000e-0x000f] reserved
[0.00] BIOS-e820: [mem 0x0010-0x3ffe] usable
[0.00] BIOS-e820: [mem 0x3fff-0x3fffefff] ACPI data
[0.00] BIOS-e820: [mem 0x3000-0x3fff] ACPI NVS
[0.00] BIOS-e820: [mem 0x4020-0x801f] usable
[0.00] NX (Execute Disable) protection: active
[0.00] SMBIOS 2.3 present.
[0.00] DMI: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 
090006  05/23/2012
[0.00] Hypervisor detected: Microsoft HyperV
[0.00] HyperV: features 0xe7f, hints 0x2c
[0.00] HyperV: LAPIC Timer Frequency: 0x30d40
[0.00] e820: update [mem 0x-0x0fff] usable ==> reserved
[0.00] e820: remove [mem 0x000a-0x000f] usable
[0.00] No AGP bridge found
[0.00] e820: last_pfn = 0x80200 max_arch_pfn = 0x4
[0.00] MTRR default type: uncachable
[0.00] MTRR fixed ranges enabled:
[0.00]   0-9 write-back
[0.00]   A-D uncachable
[0.00]   E-F write-back
[0.00] MTRR variable ranges enabled:
[0.00]   0 base 000 mask 3FF write-back
[0.00]   1 disabled
[0.00]   2 disabled
[0.00]   3 disabled
[0.00]   4 disabled
[0.00]   5 disabled
[0.00]   6 disabled
[0.00]   7 disabled
[0.00] x86 PAT enabled: cpu 0, old 0x7040600070406, new 0x7010600070106
[0.00] found SMP MP-table at [mem 0x000ff780-0x000ff78f] mapped at 
[880ff780]
[0.00] Scanning 1 areas for low memory corruption
[0.00] Base memory trampoline at [88099000] 99000 size 24576
[0.00] init_memory_mapping: [mem 0x-0x000f]
[0.00]  [mem 0x-0x000f] page 4k
[0.00] BRK [0x020eb000, 0x020ebfff] PGTABLE
[0.00] BRK [0x020ec000, 0x020ecfff] PGTABLE
[0.00] BRK [0x020ed000, 0x020edfff] PGTABLE
[0.00] init_memory_mapping: [mem 0x8000-0x801f]
[0.00]  [mem 0x8000-0x801f] page 2M
[0.00] BRK [0x020ee000, 0x020eefff] PGTABLE
[0.00] init_memory_mapping: [mem 0x7c00-0x7fff]
[0.00]  [mem 0x7c00-0x7fff] page 2M
[0.00] BRK [0x020e

Re: [PATCH] x86: numa: drop ZONE_ALIGN

2014-06-09 Thread Luiz Capitulino
On Sun, 8 Jun 2014 15:25:50 -0700 (PDT)
David Rientjes  wrote:

> On Sun, 8 Jun 2014, Luiz Capitulino wrote:
> 
> > diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
> > index 4064aca..01b493e 100644
> > --- a/arch/x86/include/asm/numa.h
> > +++ b/arch/x86/include/asm/numa.h
> > @@ -9,7 +9,6 @@
> >  #ifdef CONFIG_NUMA
> >  
> >  #define NR_NODE_MEMBLKS(MAX_NUMNODES*2)
> > -#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
> >  
> >  /*
> >   * Too small node sizes may confuse the VM badly. Usually they
> > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > index 1d045f9..69f6362 100644
> > --- a/arch/x86/mm/numa.c
> > +++ b/arch/x86/mm/numa.c
> > @@ -200,8 +200,6 @@ static void __init setup_node_data(int nid, u64 start, 
> > u64 end)
> > if (end && (end - start) < NODE_MIN_SIZE)
> > return;
> >  
> > -   start = roundup(start, ZONE_ALIGN);
> > -
> > printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
> >nid, start, end - 1);
> >  
> 
> What ensures this start address is page aligned from the BIOS?

Which start address do you refer to? The start address passed to
setup_node_data() comes from memblks registered when the SRAT table is parsed.
Those memblks get some transformations between the parsing of the SRAT table
and this point. I haven't checked them in detail to see if they are aligned
at some point. But no alignment is enforced in the code that adds the memblks
read from the SRAT table, which is acpi_numa_memory_affinity_init().
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] x86: numa: drop ZONE_ALIGN

2014-06-08 Thread Luiz Capitulino
In short, I believe this is just dead code for the upstream kernel but this
causes a bug for 2.6.32 based kernels.

The setup_node_data() function is used to initialize NODE_DATA() for a node.
It gets a node id and a memory range. The start address for the memory range
is rounded up to ZONE_ALIGN and then it's used to initialize
NODE_DATA(nid)->node_start_pfn.

However, a few function calls later free_area_init_node() is called and it
overwrites NODE_DATA()->node_start_pfn with the lowest PFN for the node.
Here's the callchain:

setup_arch()
  initmem_init()
x86_numa_init()
  numa_init()
numa_register_memblks()
  setup_node_data()<-- initializes NODE_DATA()->node_start_pfn
  ...
  x86_init.paging.pagetable_init()
paging_init()
  zone_sizes_init()
free_area_init_nodes()
  free_area_init_node()<-- overwrites NODE_DATA()->node_start_pfn

This doesn't seem to cause any problems for the current kernel because the
rounded up start address is not really used. However, I came across this
dead assignment while debugging a real issue on a 2.6.32 based kernel.

The 2.6.32 kernel did use the rounded up range start to register a node's
memory range with the bootmem interface by calling init_bootmem_node().
A few steps later during bootmem initialization, the 2.6.32 kernel calls
free_bootmem_with_active_regions() to initialize the bootmem bitmap. This
function goes through all memory ranges read from the SRAT table and tries
to mark them as usable for bootmem usage. However, before marking a range
as usable, mark_bootmem_node() asserts if the memory range start address
(as read from the SRAT table) is less than the value registered with
init_bootmem_node(). The assertion will trigger whenever the memory range
start address is rounded up, as it will always be greater than what is
reported in the SRAT table. This is true when the 2.6.32 kernel runs as a
HyperV guest on Windows Server 2012. Dropping ZONE_ALIGN solves the
problem there.
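
For illustration, a user-space model of that failure mode: the *_model() helpers
and the assert() below merely stand in for init_bootmem_node() and the range check
in mark_bootmem_node(), and the addresses are node 1's values from the HyperV guest
log quoted earlier in the thread, with the same assumed MAX_ORDER = 11 and
PAGE_SHIFT = 12.

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define MAX_ORDER	11
#define ZONE_ALIGN	(1UL << (MAX_ORDER + PAGE_SHIFT))	/* 8 MB */
#define roundup(x, y)	((((x) + (y) - 1) / (y)) * (y))

/* Model of what 2.6.32 registered via init_bootmem_node(). */
static unsigned long node_min_pfn;

static void init_bootmem_node_model(unsigned long start_pfn)
{
	node_min_pfn = start_pfn;
}

/* Model of the check mark_bootmem_node() applies to each SRAT range. */
static void mark_bootmem_node_model(unsigned long start_pfn)
{
	assert(start_pfn >= node_min_pfn);	/* fires when start was rounded up */
}

int main(void)
{
	unsigned long long srat_start = 0x40200000ULL;	/* node 1 start from the SRAT */

	/* setup_node_data() rounded the start up to ZONE_ALIGN (1G+2M -> 1G+8M)
	 * and that value was fed to init_bootmem_node()... */
	init_bootmem_node_model(roundup(srat_start, ZONE_ALIGN) >> PAGE_SHIFT);

	/* ...but free_bootmem_with_active_regions() later walks the original
	 * SRAT ranges, whose start pfn is below the registered one: crash. */
	printf("registered min pfn %#lx, SRAT start pfn %#llx\n",
	       node_min_pfn, srat_start >> PAGE_SHIFT);
	mark_bootmem_node_model(srat_start >> PAGE_SHIFT);	/* assert fails here */

	return 0;
}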

Signed-off-by: Luiz Capitulino 
---
 arch/x86/include/asm/numa.h | 1 -
 arch/x86/mm/numa.c  | 2 --
 2 files changed, 3 deletions(-)

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 4064aca..01b493e 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -9,7 +9,6 @@
 #ifdef CONFIG_NUMA
 
 #define NR_NODE_MEMBLKS(MAX_NUMNODES*2)
-#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 
 /*
  * Too small node sizes may confuse the VM badly. Usually they
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1d045f9..69f6362 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -200,8 +200,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
if (end && (end - start) < NODE_MIN_SIZE)
return;
 
-   start = roundup(start, ZONE_ALIGN);
-
printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
   nid, start, end - 1);
 
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 0/5] hugetlb: add support gigantic page allocation at runtime

2014-04-25 Thread Luiz Capitulino
On Tue, 22 Apr 2014 14:55:46 -0700
Andrew Morton  wrote:

> On Tue, 22 Apr 2014 17:37:26 -0400 Luiz Capitulino  
> wrote:
> 
> > On Thu, 17 Apr 2014 16:01:10 -0700
> > Andrew Morton  wrote:
> > 
> > > On Thu, 10 Apr 2014 13:58:40 -0400 Luiz Capitulino 
> > >  wrote:
> > > 
> > > > The HugeTLB subsystem uses the buddy allocator to allocate hugepages 
> > > > during
> > > > runtime. This means that hugepages allocation during runtime is limited 
> > > > to
> > > > MAX_ORDER order. For archs supporting gigantic pages (that is, page 
> > > > sizes
> > > > greater than MAX_ORDER), this in turn means that those pages can't be
> > > > allocated at runtime.
> > > 
> > > Dumb question: what's wrong with just increasing MAX_ORDER?
> > 
> > To be honest I'm not a buddy allocator expert and I'm not familiar with
> > what is involved in increasing MAX_ORDER. What I do know though is that it's
> > not just a matter of increasing a macro's value. For example, for sparsemem
> > support we have this check (include/linux/mmzone.h:1084):
> > 
> > #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
> > #error Allocator MAX_ORDER exceeds SECTION_SIZE
> > #endif
> > 
> > I _guess_ it's because we can't allocate more pages than what's within a
> > section on sparsemem. Can sparsemem and the other stuff be changed to
> > accommodate a bigger MAX_ORDER? I don't know. Is it worth it to increase
> > MAX_ORDER and do all the required changes, given that a bigger MAX_ORDER is
> > only useful for HugeTLB and the archs supporting gigantic pages? I'd guess 
> > not.
> 
> afacit we'd need to increase SECTION_SIZE_BITS to 29 or more to
> accommodate 1G MAX_ORDER.  I assume this means that some machines with
> sparse physical memory layout may not be able to use all (or as much)
> of the physical memory.  Perhaps Yinghai can advise?

Yinghai?

> I do think we should fully explore this option before giving up and
> adding new special-case code. 

I'll look into that, but it may take a bit.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: mmotm 2014-04-22-15-20 uploaded (uml 32- and 64-bit defconfigs)

2014-04-23 Thread Luiz Capitulino
On Wed, 23 Apr 2014 10:48:27 -0700
Randy Dunlap  wrote:

> On 04/23/14 10:41, Luiz Capitulino wrote:
> > On Wed, 23 Apr 2014 10:10:29 -0700
> > Randy Dunlap  wrote:
> > 
> >> On 04/22/14 15:21, a...@linux-foundation.org wrote:
> >>> The mm-of-the-moment snapshot 2014-04-22-15-20 has been uploaded to
> >>>
> >>>http://www.ozlabs.org/~akpm/mmotm/
> >>>
> >>> mmotm-readme.txt says
> >>>
> >>> README for mm-of-the-moment:
> >>>
> >>> http://www.ozlabs.org/~akpm/mmotm/
> >>>
> >>> This is a snapshot of my -mm patch queue.  Uploaded at random hopefully
> >>> more than once a week.
> >>>
> >>> You will need quilt to apply these patches to the latest Linus release 
> >>> (3.x
> >>> or 3.x-rcY).  The series file is in broken-out.tar.gz and is duplicated in
> >>> http://ozlabs.org/~akpm/mmotm/series
> >>>
> >>
> >> include/linux/hugetlb.h:468:9: error: 'HPAGE_SHIFT' undeclared (first use 
> >> in this function)
> > 
> > The patch adding HPAGE_SHIFT usage to hugetlb.h in current mmotm is this:
> > 
> > http://www.ozlabs.org/~akpm/mmotm/broken-out/hugetlb-ensure-hugepage-access-is-denied-if-hugepages-are-not-supported.patch
> > 
> > But I can't reproduce the issue to be sure what the problem is. Are you
> > building the kernel on 32bits? Can you provide the output of
> > "grep -i huge .config" or send your .config in private?
> > 
> 
> [adding Richard to cc:]
> 
> 
> As in $subject, if I build uml x86 32-bit or 64-bit defconfig, the build 
> fails with
> this error.

Oh, I missed the subject info completely. Sorry about that.

So, the issue really seems to be introduced by patch:

 hugetlb-ensure-hugepage-access-is-denied-if-hugepages-are-not-supported.patch

And the problem is that UML doesn't define HPAGE_SHIFT. The following patch
fixes it, but I'll let Nishanth decide what to do here.

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 4eace5e..3aab7df 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -458,6 +458,10 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h,
return ptl;
 }
 
+#ifndef HPAGE_SHIFT
+#define HPAGE_SHIFT 0
+#endif
+
 static inline bool hugepages_supported(void)
 {
/*
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: mmotm 2014-04-22-15-20 uploaded (uml 32- and 64-bit defconfigs)

2014-04-23 Thread Luiz Capitulino
On Wed, 23 Apr 2014 10:10:29 -0700
Randy Dunlap  wrote:

> On 04/22/14 15:21, a...@linux-foundation.org wrote:
> > The mm-of-the-moment snapshot 2014-04-22-15-20 has been uploaded to
> > 
> >http://www.ozlabs.org/~akpm/mmotm/
> > 
> > mmotm-readme.txt says
> > 
> > README for mm-of-the-moment:
> > 
> > http://www.ozlabs.org/~akpm/mmotm/
> > 
> > This is a snapshot of my -mm patch queue.  Uploaded at random hopefully
> > more than once a week.
> > 
> > You will need quilt to apply these patches to the latest Linus release (3.x
> > or 3.x-rcY).  The series file is in broken-out.tar.gz and is duplicated in
> > http://ozlabs.org/~akpm/mmotm/series
> > 
> 
> include/linux/hugetlb.h:468:9: error: 'HPAGE_SHIFT' undeclared (first use in 
> this function)

The patch adding HPAGE_SHIFT usage to hugetlb.h in current mmotm is this:

http://www.ozlabs.org/~akpm/mmotm/broken-out/hugetlb-ensure-hugepage-access-is-denied-if-hugepages-are-not-supported.patch

But I can't reproduce the issue to be sure what the problem is. Are you
building the kernel on 32bits? Can you provide the output of
"grep -i huge .config" or send your .config in private?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 0/5] hugetlb: add support gigantic page allocation at runtime

2014-04-22 Thread Luiz Capitulino
On Thu, 17 Apr 2014 16:01:10 -0700
Andrew Morton  wrote:

> On Thu, 10 Apr 2014 13:58:40 -0400 Luiz Capitulino  
> wrote:
> 
> > The HugeTLB subsystem uses the buddy allocator to allocate hugepages during
> > runtime. This means that hugepages allocation during runtime is limited to
> > MAX_ORDER order. For archs supporting gigantic pages (that is, page sizes
> > greater than MAX_ORDER), this in turn means that those pages can't be
> > allocated at runtime.
> 
> Dumb question: what's wrong with just increasing MAX_ORDER?

To be honest I'm not a buddy allocator expert and I'm not familiar with
what is involved in increasing MAX_ORDER. What I do know though is that it's
not just a matter of increasing a macro's value. For example, for sparsemem
support we have this check (include/linux/mmzone.h:1084):

#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_ORDER exceeds SECTION_SIZE
#endif

I _guess_ it's because we can't allocate more pages than what's within a
section on sparsemem. Can sparsemem and the other stuff be changed to
accommodate a bigger MAX_ORDER? I don't know. Is it worth it to increase
MAX_ORDER and do all the required changes, given that a bigger MAX_ORDER is
only useful for HugeTLB and the archs supporting gigantic pages? I'd guess not.
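
Just to put rough numbers on it (an illustrative calculation only, assuming
the usual x86_64 values of PAGE_SHIFT = 12, MAX_ORDER = 11 and
SECTION_SIZE_BITS = 27; none of this is code from the series):

#include <stdio.h>

int main(void)
{
	const unsigned page_shift = 12, max_order = 11, section_bits = 27;

	/* Largest buddy allocation: 2^(MAX_ORDER - 1) pages. */
	unsigned long max_buddy = 1UL << (max_order - 1 + page_shift);
	/* Order needed for a 1GB gigantic page: 2^18 pages of 4KB. */
	unsigned order_1g = 30 - page_shift;

	printf("largest buddy block: %lu MB\n", max_buddy >> 20);             /* 4 MB */
	printf("sparsemem section:   %lu MB\n", (1UL << section_bits) >> 20); /* 128 MB */
	printf("1GB page order: %u vs MAX_ORDER - 1 = %u\n", order_1g, max_order - 1);

	/*
	 * Covering order 18 in the buddy allocator would need MAX_ORDER >= 19,
	 * which trips the check quoted above (19 - 1 + 12 > 27) unless
	 * SECTION_SIZE_BITS grows as well.
	 */
	return 0;
}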
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 5/5] hugetlb: add support for gigantic page allocation at runtime

2014-04-22 Thread Luiz Capitulino
On Thu, 17 Apr 2014 16:00:39 -0700
Andrew Morton  wrote:

> On Thu, 10 Apr 2014 13:58:45 -0400 Luiz Capitulino  
> wrote:
> 
> > HugeTLB is limited to allocating hugepages whose size are less than
> > MAX_ORDER order. This is so because HugeTLB allocates hugepages via
> > the buddy allocator. Gigantic pages (that is, pages whose size is
> > greater than MAX_ORDER order) have to be allocated at boottime.
> > 
> > However, boottime allocation has at least two serious problems. First,
> > it doesn't support NUMA and second, gigantic pages allocated at
> > boottime can't be freed.
> > 
> > This commit solves both issues by adding support for allocating gigantic
> > pages during runtime. It works just like regular sized hugepages,
> > meaning that the interface in sysfs is the same, it supports NUMA,
> > and gigantic pages can be freed.
> > 
> > For example, on x86_64 gigantic pages are 1GB big. To allocate two 1G
> > gigantic pages on node 1, one can do:
> > 
> >  # echo 2 > \
> >/sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages
> > 
> > And to free them all:
> > 
> >  # echo 0 > \
> >/sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages
> > 
> > The one problem with gigantic page allocation at runtime is that it
> > can't be serviced by the buddy allocator. To overcome that problem, this
> > commit scans all zones from a node looking for a large enough contiguous
> > region. When one is found, it's allocated by using CMA, that is, we call
> > alloc_contig_range() to do the actual allocation. For example, on x86_64
> > we scan all zones looking for a 1GB contiguous region. When one is found,
> > it's allocated by alloc_contig_range().
> > 
> > One expected issue with that approach is that such gigantic contiguous
> > regions tend to vanish as runtime goes by. The best way to avoid this for
> > now is to make gigantic page allocations very early during system boot, say
> > from a init script. Other possible optimization include using compaction,
> > which is supported by CMA but is not explicitly used by this commit.
> 
> Why aren't we using compaction?

The main reason is that I'm not sure what the best way to use it is in the
context of a 1GB allocation. The most obvious way (which seems to be what
the DMA subsystem does) is trial and error: just pass a gigantic PFN range
to alloc_contig_range() and, if it fails, move on to the next range (or
retry in certain cases). This might work, but to be honest I'm not sure
what the implications of doing that for a 1GB range are, especially
because compaction (as implemented by CMA) is synchronous.

As I see compaction usage as an optimization, I've opted for submitting the
simplest implementation that works. I've tested this series on two NUMA
machines and it worked just fine. Future improvements can be done on top.

Also note that this is about HugeTLB making use of compaction automatically.
There's nothing in this series that prevents the user from manually compacting
memory by writing to /sys/devices/system/node/nodeN/compact. As HugeTLB
page reservation is a manual procedure anyway, I don't think that manually
starting compaction is that bad.
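
For the record, the trial-and-error idea above would look roughly like the
sketch below. Note this is only a sketch, not code from this series, and
for_each_candidate_range() is a made-up placeholder for whatever PFN-range
iterator one would write, not an existing kernel API:

static struct page *try_alloc_gigantic(int nid, unsigned int order)
{
	unsigned long nr_pages = 1UL << order;
	unsigned long start_pfn, end_pfn;

	/* Walk candidate PFN ranges on this node and let CMA try each one. */
	for_each_candidate_range(nid, nr_pages, &start_pfn) {	/* hypothetical */
		end_pfn = start_pfn + nr_pages;
		if (alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE) == 0)
			return pfn_to_page(start_pfn);	/* success */
		/* failed: fall through and try the next candidate range */
	}

	return NULL;
}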
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 0/5] hugetlb: add support gigantic page allocation at runtime

2014-04-17 Thread Luiz Capitulino
On Thu, 17 Apr 2014 11:52:42 -0700
Andrew Morton  wrote:

> On Thu, 17 Apr 2014 11:13:05 -0400 Luiz Capitulino  
> wrote:
> 
> > On Thu, 10 Apr 2014 13:58:40 -0400
> > Luiz Capitulino  wrote:
> > 
> > > [Full introduction right after the changelog]
> > > 
> > > Changelog
> > > -
> > > 
> > > v3
> > > 
> > > - Dropped unnecessary WARN_ON() call [Kirill]
> > > - Always check if the pfn range lies within a zone [Yasuaki]
> > > - Renamed some function arguments for consistency
> > 
> > Andrew, this series got four ACKs but it seems that you haven't picked
> > it yet. Is there anything missing to be addressed?
> 
> I don't look at new features until after -rc1.  Then it takes a week or
> more to work through the backlog.  We'll get there.

I see, just wanted to make sure it was on your radar. Thanks a lot.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 0/5] hugetlb: add support gigantic page allocation at runtime

2014-04-17 Thread Luiz Capitulino
On Thu, 10 Apr 2014 13:58:40 -0400
Luiz Capitulino  wrote:

> [Full introduction right after the changelog]
> 
> Changelog
> -
> 
> v3
> 
> - Dropped unnecessary WARN_ON() call [Kirill]
> - Always check if the pfn range lies within a zone [Yasuaki]
> - Renamed some function arguments for consistency

Andrew, this series got four ACKs but it seems that you haven't picked
it yet. Is there anything missing to be addressed?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/5] hugetlb: prep_compound_gigantic_page(): drop __init marker

2014-04-10 Thread Luiz Capitulino
This function is going to be used by non-init code in a future
commit.

Signed-off-by: Luiz Capitulino 
---
 mm/hugetlb.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dd30f22..957231b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -690,8 +690,7 @@ static void prep_new_huge_page(struct hstate *h, struct 
page *page, int nid)
put_page(page); /* free it into the hugepage allocator */
 }
 
-static void __init prep_compound_gigantic_page(struct page *page,
-  unsigned long order)
+static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 {
int i;
int nr_pages = 1 << order;
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/5] hugetlb: move helpers up in the file

2014-04-10 Thread Luiz Capitulino
The next commit will add new code that calls the
for_each_node_mask_to_alloc() macro. Move it, its buddy
for_each_node_mask_to_free() and their dependencies up in the file so
the new code can use them. This is just code movement, no logic change.
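
For reference, a caller iterates the allowed nodes roughly like this (a
simplified sketch only, not the exact code added in patch 5/5;
alloc_on_any_node() is just an illustrative name):

static struct page *alloc_on_any_node(struct hstate *h, nodemask_t *nodes_allowed)
{
	struct page *page = NULL;
	int nr_nodes, node;

	/* Round-robin over the allowed nodes, stopping at the first success. */
	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
		page = alloc_fresh_huge_page_node(h, node);
		if (page)
			break;
	}

	return page;
}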

Signed-off-by: Luiz Capitulino 
Reviewed-by: Andrea Arcangeli 
Reviewed-by: Naoya Horiguchi 
Reviewed-by: Yasuaki Ishimatsu 
---
 mm/hugetlb.c | 146 +--
 1 file changed, 73 insertions(+), 73 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8cbaa97..6f1ca74 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -607,6 +607,79 @@ err:
return NULL;
 }
 
+/*
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed.  Ensure that we use an allowed
+ * node for alloc or free.
+ */
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+   nid = next_node(nid, *nodes_allowed);
+   if (nid == MAX_NUMNODES)
+   nid = first_node(*nodes_allowed);
+   VM_BUG_ON(nid >= MAX_NUMNODES);
+
+   return nid;
+}
+
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+   if (!node_isset(nid, *nodes_allowed))
+   nid = next_node_allowed(nid, nodes_allowed);
+   return nid;
+}
+
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static int hstate_next_node_to_alloc(struct hstate *h,
+   nodemask_t *nodes_allowed)
+{
+   int nid;
+
+   VM_BUG_ON(!nodes_allowed);
+
+   nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+   h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+
+   return nid;
+}
+
+/*
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page.  Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
+ */
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t 
*nodes_allowed)
+{
+   int nid;
+
+   VM_BUG_ON(!nodes_allowed);
+
+   nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+   h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+   return nid;
+}
+
+#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)  \
+   for (nr_nodes = nodes_weight(*mask);\
+   nr_nodes > 0 && \
+   ((node = hstate_next_node_to_alloc(hs, mask)) || 1);\
+   nr_nodes--)
+
+#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)   \
+   for (nr_nodes = nodes_weight(*mask);\
+   nr_nodes > 0 && \
+   ((node = hstate_next_node_to_free(hs, mask)) || 1); \
+   nr_nodes--)
+
 static void update_and_free_page(struct hstate *h, struct page *page)
 {
int i;
@@ -786,79 +859,6 @@ static struct page *alloc_fresh_huge_page_node(struct 
hstate *h, int nid)
return page;
 }
 
-/*
- * common helper functions for hstate_next_node_to_{alloc|free}.
- * We may have allocated or freed a huge page based on a different
- * nodes_allowed previously, so h->next_node_to_{alloc|free} might
- * be outside of *nodes_allowed.  Ensure that we use an allowed
- * node for alloc or free.
- */
-static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
-   nid = next_node(nid, *nodes_allowed);
-   if (nid == MAX_NUMNODES)
-   nid = first_node(*nodes_allowed);
-   VM_BUG_ON(nid >= MAX_NUMNODES);
-
-   return nid;
-}
-
-static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
-   if (!node_isset(nid, *nodes_allowed))
-   nid = next_node_allowed(nid, nodes_allowed);
-   return nid;
-}
-
-/*
- * returns the previously saved node ["this node"] from which to
- * allocate a persistent huge page for the pool and advance the
- * next node from which to allocate, handling wrap at end of node
- * mask.
- */
-static int hstate_next_node_to_alloc(struct hstate *h,
-   nodemask_t *nodes_allowed)
-{
-   int nid;
-
-   VM_BUG_ON(!nodes_allowed);
-
-   nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
-   h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
-
-   return nid;
-}
-
-/*
- * helper for free_pool_huge_page() - return the previously saved
- * node ["this node"]

[PATCH v3 0/5] hugetlb: add support gigantic page allocation at runtime

2014-04-10 Thread Luiz Capitulino
[Full introduction right after the changelog]

Changelog
-

v3

- Dropped unnecessary WARN_ON() call [Kirill]
- Always check if the pfn range lies within a zone [Yasuaki]
- Renamed some function arguments for consistency

v2

- Rewrote allocation loop to avoid scanning useless PFNs [Yasuaki]
- Dropped incomplete multi-arch support [Naoya]
- Added patch to drop __init from prep_compound_gigantic_page()
- Restricted the feature to x86_64 (more details in patch 5/5)
- Added review-bys plus minor changelog changes

Introduction


The HugeTLB subsystem uses the buddy allocator to allocate hugepages during
runtime. This means that hugepages allocation during runtime is limited to
MAX_ORDER order. For archs supporting gigantic pages (that is, page sizes
greater than MAX_ORDER), this in turn means that those pages can't be
allocated at runtime.

HugeTLB supports gigantic page allocation during boottime, via the boot
allocator. To this end the kernel provides the command-line options
hugepagesz= and hugepages=, which can be used to instruct the kernel to
allocate N gigantic pages during boot.

For example, x86_64 supports 2M and 1G hugepages, but only 2M hugepages can
be allocated and freed at runtime. If one wants to allocate 1G gigantic pages,
this has to be done at boot via the hugepagesz= and hugepages= command-line
options.
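
For example, booting with:

 hugepagesz=1G hugepages=4

tells the kernel to reserve four 1G gigantic pages during early boot.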

Now, gigantic page allocation at boottime has two serious problems:

 1. Boottime allocation is not NUMA aware. On a NUMA machine the kernel
evenly distributes boottime allocated hugepages among nodes.

For example, suppose you have a four-node NUMA machine and want
to allocate four 1G gigantic pages at boottime. The kernel will
allocate one gigantic page per node.

On the other hand, we do have users who want to be able to specify
which NUMA node gigantic pages should be allocated from, so that they
can place virtual machines on a specific NUMA node.

 2. Gigantic pages allocated at boottime can't be freed

At this point it's important to observe that regular hugepages allocated
at runtime don't have those problems: the HugeTLB interface for runtime
allocation in sysfs supports NUMA, and runtime allocated pages can be
freed just fine via the buddy allocator.

This series adds support for allocating gigantic pages at runtime. It does
so by allocating gigantic pages via CMA instead of the buddy allocator.
Releasing gigantic pages is also supported via CMA. As this series builds
on top of the existing HugeTLB interface, it makes gigantic page allocation
and releasing work just like regular sized hugepages. This also means that
NUMA support just works.

For example, to allocate two 1G gigantic pages on node 1, one can do:

 # echo 2 > \
   /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages

And, to release all gigantic pages on the same node:

 # echo 0 > \
   /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages

Please, refer to patch 5/5 for full technical details.

Finally, please note that this series is a follow-up to a previous series
that tried to extend the command-line options set to be NUMA aware:

 http://marc.info/?l=linux-mm&m=139593335312191&w=2

During the discussion of that series it was agreed that having runtime
allocation support for gigantic pages was a better solution.

Luiz Capitulino (5):
  hugetlb: prep_compound_gigantic_page(): drop __init marker
  hugetlb: add hstate_is_gigantic()
  hugetlb: update_and_free_page(): don't clear PG_reserved bit
  hugetlb: move helpers up in the file
  hugetlb: add support for gigantic page allocation at runtime

 include/linux/hugetlb.h |   5 +
 mm/hugetlb.c| 336 ++--
 2 files changed, 245 insertions(+), 96 deletions(-)

-- 
1.8.1.4
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 5/5] hugetlb: add support for gigantic page allocation at runtime

2014-04-10 Thread Luiz Capitulino
HugeTLB is limited to allocating hugepages whose size is less than
MAX_ORDER order. This is so because HugeTLB allocates hugepages via
the buddy allocator. Gigantic pages (that is, pages whose size is
greater than MAX_ORDER order) have to be allocated at boottime.

However, boottime allocation has at least two serious problems. First,
it doesn't support NUMA and second, gigantic pages allocated at
boottime can't be freed.

This commit solves both issues by adding support for allocating gigantic
pages during runtime. It works just like regular sized hugepages,
meaning that the interface in sysfs is the same, it supports NUMA,
and gigantic pages can be freed.

For example, on x86_64 gigantic pages are 1GB big. To allocate two 1G
gigantic pages on node 1, one can do:

 # echo 2 > \
   /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages

And to free them all:

 # echo 0 > \
   /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages

The one problem with gigantic page allocation at runtime is that it
can't be serviced by the buddy allocator. To overcome that problem, this
commit scans all zones from a node looking for a large enough contiguous
region. When one is found, it's allocated by using CMA, that is, we call
alloc_contig_range() to do the actual allocation. For example, on x86_64
we scan all zones looking for a 1GB contiguous region. When one is found,
it's allocated by alloc_contig_range().

One expected issue with that approach is that such gigantic contiguous
regions tend to vanish as runtime goes by. The best way to avoid this for
now is to make gigantic page allocations very early during system boot, say
from an init script. Other possible optimizations include using compaction,
which is supported by CMA but is not explicitly used by this commit.

It's also important to note the following:

 1. Gigantic pages allocated at boottime by the hugepages= command-line
option can be freed at runtime just fine

 2. This commit adds support for gigantic pages only to x86_64. The
reason is that I don't have access to nor experience with other archs.
The code is arch independent though, so it should be simple to add
support to different archs

 3. I didn't add support for hugepage overcommit, that is allocating
a gigantic page on demand when
   /proc/sys/vm/nr_overcommit_hugepages > 0. The reason is that I don't
   think it's reasonable to do the hard and long work required for
   allocating a gigantic page at fault time. But it should be simple
   to add this if wanted

Signed-off-by: Luiz Capitulino 
---
 mm/hugetlb.c | 167 +++
 1 file changed, 156 insertions(+), 11 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6f1ca74..161dc39 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -680,11 +680,150 @@ static int hstate_next_node_to_free(struct hstate *h, 
nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
 
+#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+static void destroy_compound_gigantic_page(struct page *page,
+   unsigned long order)
+{
+   int i;
+   int nr_pages = 1 << order;
+   struct page *p = page + 1;
+
+   for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+   __ClearPageTail(p);
+   set_page_refcounted(p);
+   p->first_page = NULL;
+   }
+
+   set_compound_order(page, 0);
+   __ClearPageHead(page);
+}
+
+static void free_gigantic_page(struct page *page, unsigned order)
+{
+   free_contig_range(page_to_pfn(page), 1 << order);
+}
+
+static int __alloc_gigantic_page(unsigned long start_pfn,
+   unsigned long nr_pages)
+{
+   unsigned long end_pfn = start_pfn + nr_pages;
+   return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+}
+
+static bool pfn_range_valid_gigantic(unsigned long start_pfn,
+   unsigned long nr_pages)
+{
+   unsigned long i, end_pfn = start_pfn + nr_pages;
+   struct page *page;
+
+   for (i = start_pfn; i < end_pfn; i++) {
+   if (!pfn_valid(i))
+   return false;
+
+   page = pfn_to_page(i);
+
+   if (PageReserved(page))
+   return false;
+
+   if (page_count(page) > 0)
+   return false;
+
+   if (PageHuge(page))
+   return false;
+   }
+
+   return true;
+}
+
+static bool zone_spans_last_pfn(const struct zone *zone,
+   unsigned long start_pfn, unsigned long nr_pages)
+{
+   unsigned long last_pfn = start_pfn + nr_pages - 1;
+   return zone_spans_pfn(zone, last_pfn);
+}
+
+static struct page *alloc_gigantic_page(int nid, unsigned order)
+{
+   unsigned lon

[PATCH 3/5] hugetlb: update_and_free_page(): don't clear PG_reserved bit

2014-04-10 Thread Luiz Capitulino
Hugepages never get the PG_reserved bit set, so don't clear it.

However, note that if the bit gets mistakenly set free_pages_check() will
catch it.

Signed-off-by: Luiz Capitulino 
---
 mm/hugetlb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a5e679b..8cbaa97 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -618,8 +618,8 @@ static void update_and_free_page(struct hstate *h, struct 
page *page)
for (i = 0; i < pages_per_huge_page(h); i++) {
page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
-   1 << PG_active | 1 << PG_reserved |
-   1 << PG_private | 1 << PG_writeback);
+   1 << PG_active | 1 << PG_private |
+   1 << PG_writeback);
}
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
set_compound_page_dtor(page, NULL);
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/5] hugetlb: add hstate_is_gigantic()

2014-04-10 Thread Luiz Capitulino
Signed-off-by: Luiz Capitulino 
Reviewed-by: Andrea Arcangeli 
Reviewed-by: Naoya Horiguchi 
Reviewed-by: Yasuaki Ishimatsu 
---
 include/linux/hugetlb.h |  5 +
 mm/hugetlb.c| 28 ++--
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5b337cf..62a8b88 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -343,6 +343,11 @@ static inline unsigned huge_page_shift(struct hstate *h)
return h->order + PAGE_SHIFT;
 }
 
+static inline bool hstate_is_gigantic(struct hstate *h)
+{
+   return huge_page_order(h) >= MAX_ORDER;
+}
+
 static inline unsigned int pages_per_huge_page(struct hstate *h)
 {
return 1 << h->order;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 957231b..a5e679b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -611,7 +611,7 @@ static void update_and_free_page(struct hstate *h, struct 
page *page)
 {
int i;
 
-   VM_BUG_ON(h->order >= MAX_ORDER);
+   VM_BUG_ON(hstate_is_gigantic(h));
 
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
@@ -664,7 +664,7 @@ static void free_huge_page(struct page *page)
if (restore_reserve)
h->resv_huge_pages++;
 
-   if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+   if (h->surplus_huge_pages_node[nid] && !hstate_is_gigantic(h)) {
/* remove the page from active list */
list_del(>lru);
update_and_free_page(h, page);
@@ -768,7 +768,7 @@ static struct page *alloc_fresh_huge_page_node(struct 
hstate *h, int nid)
 {
struct page *page;
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return NULL;
 
page = alloc_pages_exact_node(nid,
@@ -962,7 +962,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, 
int nid)
struct page *page;
unsigned int r_nid;
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return NULL;
 
/*
@@ -1155,7 +1155,7 @@ static void return_unused_surplus_pages(struct hstate *h,
h->resv_huge_pages -= unused_resv_pages;
 
/* Cannot return gigantic pages currently */
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return;
 
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
@@ -1354,7 +1354,7 @@ static void __init gather_bootmem_prealloc(void)
 * fix confusing memory reports from free(1) and another
 * side-effects, like CommitLimit going negative.
 */
-   if (h->order > (MAX_ORDER - 1))
+   if (hstate_is_gigantic(h))
adjust_managed_page_count(page, 1 << h->order);
}
 }
@@ -1364,7 +1364,7 @@ static void __init hugetlb_hstate_alloc_pages(struct 
hstate *h)
unsigned long i;
 
for (i = 0; i < h->max_huge_pages; ++i) {
-   if (h->order >= MAX_ORDER) {
+   if (hstate_is_gigantic(h)) {
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_fresh_huge_page(h,
@@ -1380,7 +1380,7 @@ static void __init hugetlb_init_hstates(void)
 
for_each_hstate(h) {
/* oversize hugepages were init'ed in early boot */
-   if (h->order < MAX_ORDER)
+   if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
}
 }
@@ -1414,7 +1414,7 @@ static void try_to_free_low(struct hstate *h, unsigned 
long count,
 {
int i;
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return;
 
for_each_node_mask(i, *nodes_allowed) {
@@ -1477,7 +1477,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, 
unsigned long count,
 {
unsigned long min_count, ret;
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return h->max_huge_pages;
 
/*
@@ -1604,7 +1604,7 @@ static ssize_t nr_hugepages_store_common(bool 
obey_mempolicy,
goto out;
 
h = kobj_to_hstate(kobj, );
-   if (h->order >= MAX_ORDER) {
+   if (hstate_is_gigantic(h)) {
err = -EINVAL;
goto out;
}
@@ -1687,7 +1687,7 @@ static ssize_t nr_overcommit_hugepages_store(struct 
kobject *kobj,
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return -EINVAL;
 
err = kstrtoul(buf, 10, );
@@ -2112,7 +2112,7 @@ static int hugetlb_sysctl_handler_common(bool 
obey_mempolicy,
 
tmp = h->max_huge_pages;
 
-   if (write 

Re: [PATCH 5/5] hugetlb: add support for gigantic page allocation at runtime

2014-04-09 Thread Luiz Capitulino
On Wed, 9 Apr 2014 09:42:01 +0900
Yasuaki Ishimatsu  wrote:

> (2014/04/09 4:02), Luiz Capitulino wrote:
> > HugeTLB is limited to allocating hugepages whose size are less than
> > MAX_ORDER order. This is so because HugeTLB allocates hugepages via
> > the buddy allocator. Gigantic pages (that is, pages whose size is
> > greater than MAX_ORDER order) have to be allocated at boottime.
> > 
> > However, boottime allocation has at least two serious problems. First,
> > it doesn't support NUMA and second, gigantic pages allocated at
> > boottime can't be freed.
> > 
> > This commit solves both issues by adding support for allocating gigantic
> > pages during runtime. It works just like regular sized hugepages,
> > meaning that the interface in sysfs is the same, it supports NUMA,
> > and gigantic pages can be freed.
> > 
> > For example, on x86_64 gigantic pages are 1GB big. To allocate two 1G
> > gigantic pages on node 1, one can do:
> > 
> >   # echo 2 > \
> > 
> > /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages
> > 
> > And to free them all:
> > 
> >   # echo 0 > \
> > 
> > /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages
> > 
> > The one problem with gigantic page allocation at runtime is that it
> > can't be serviced by the buddy allocator. To overcome that problem, this
> > commit scans all zones from a node looking for a large enough contiguous
> > region. When one is found, it's allocated by using CMA, that is, we call
> > alloc_contig_range() to do the actual allocation. For example, on x86_64
> > we scan all zones looking for a 1GB contiguous region. When one is found,
> > it's allocated by alloc_contig_range().
> > 
> > One expected issue with that approach is that such gigantic contiguous
> > regions tend to vanish as runtime goes by. The best way to avoid this for
> > now is to make gigantic page allocations very early during system boot, say
> > from a init script. Other possible optimization include using compaction,
> > which is supported by CMA but is not explicitly used by this commit.
> > 
> > It's also important to note the following:
> > 
> >   1. Gigantic pages allocated at boottime by the hugepages= command-line
> >  option can be freed at runtime just fine
> > 
> >   2. This commit adds support for gigantic pages only to x86_64. The
> >  reason is that I don't have access to nor experience with other archs.
> >  The code is arch indepedent though, so it should be simple to add
> >  support to different archs
> > 
> >   3. I didn't add support for hugepage overcommit, that is allocating
> >  a gigantic page on demand when
> > /proc/sys/vm/nr_overcommit_hugepages > 0. The reason is that I don't
> > think it's reasonable to do the hard and long work required for
> > allocating a gigantic page at fault time. But it should be simple
> > to add this if wanted
> > 
> > Signed-off-by: Luiz Capitulino 
> > ---
> >   mm/hugetlb.c | 158 
> > ++-
> >   1 file changed, 147 insertions(+), 11 deletions(-)
> > 
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 9dded98..2258045 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -679,11 +679,141 @@ static int hstate_next_node_to_free(struct hstate 
> > *h, nodemask_t *nodes_allowed)
> > ((node = hstate_next_node_to_free(hs, mask)) || 1); \
> > nr_nodes--)
> >   
> > +#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
> > +static void destroy_compound_gigantic_page(struct page *page,
> > +   unsigned long order)
> > +{
> > +   int i;
> > +   int nr_pages = 1 << order;
> > +   struct page *p = page + 1;
> > +
> > +   for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
> > +   __ClearPageTail(p);
> > +   set_page_refcounted(p);
> > +   p->first_page = NULL;
> > +   }
> > +
> > +   set_compound_order(page, 0);
> > +   __ClearPageHead(page);
> > +}
> > +
> > +static void free_gigantic_page(struct page *page, unsigned order)
> > +{
> > +   free_contig_range(page_to_pfn(page), 1 << order);
> > +}
> > +
> > +static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long 
> > count)
> > +{
> > +   unsigned long end_pfn = start_pfn + count;
&

Re: [PATCH 4/4] hugetlb: add support for gigantic page allocation at runtime

2014-04-08 Thread Luiz Capitulino
On Tue, 8 Apr 2014 15:51:02 -0700
Andrew Morton  wrote:

> On Mon, 7 Apr 2014 14:49:35 -0400 Luiz Capitulino  
> wrote:
> 
> > > > ---
> > > >  arch/x86/include/asm/hugetlb.h |  10 +++
> > > >  mm/hugetlb.c   | 177 
> > > > ++---
> > > >  2 files changed, 176 insertions(+), 11 deletions(-)
> > > > 
> > > > diff --git a/arch/x86/include/asm/hugetlb.h 
> > > > b/arch/x86/include/asm/hugetlb.h
> > > > index a809121..2b262f7 100644
> > > > --- a/arch/x86/include/asm/hugetlb.h
> > > > +++ b/arch/x86/include/asm/hugetlb.h
> > > > @@ -91,6 +91,16 @@ static inline void arch_release_hugepage(struct page 
> > > > *page)
> > > >  {
> > > >  }
> > > >  
> > > > +static inline int arch_prepare_gigantic_page(struct page *page)
> > > > +{
> > > > +   return 0;
> > > > +}
> > > > +
> > > > +static inline void arch_release_gigantic_page(struct page *page)
> > > > +{
> > > > +}
> > > > +
> > > > +
> > > >  static inline void arch_clear_hugepage_flags(struct page *page)
> > > >  {
> > > >  }
> > > 
> > > These are defined only on arch/x86, but called in generic code.
> > > Does it cause build failure on other archs?
> > 
> > Hmm, probably. The problem here is that I'm unable to test this
> > code in other archs. So I think the best solution for the first
> > merge is to make the build of this feature conditional to x86_64?
> > Then the first person interested in making this work on other
> > archs adds the generic code. Sounds reasonable?
> 
> These functions don't actually do anything so if and when other
> architectures come along to implement this feature, their developers
> won't know what you were thinking when you added them.  So how about
> some code comments to explain their roles and responsibilities?
> 
> Or just delete them altogether and let people add them (or something
> similar) if and when the need arises.  It's hard to tell when one lacks
> telepathic powers, sigh.

That's exactly what I did for v2 (already posted).
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/5] hugetlb: update_and_free_page(): don't clear PG_reserved bit

2014-04-08 Thread Luiz Capitulino
On Tue, 8 Apr 2014 23:51:26 +0300
"Kirill A. Shutemov"  wrote:

> On Tue, Apr 08, 2014 at 03:02:18PM -0400, Luiz Capitulino wrote:
> > Hugepages pages never get the PG_reserved bit set, so don't clear it. But
> > add a warning just in case.
> 
> I don't think WARN_ON() is needed. PG_reserved will be caught by
> free_pages_check().

Correct. I'll drop it.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/5] hugetlb: prep_compound_gigantic_page(): drop __init marker

2014-04-08 Thread Luiz Capitulino
On Tue, 08 Apr 2014 15:51:51 -0400
Naoya Horiguchi  wrote:

> Hi Luiz,
> 
> On Tue, Apr 08, 2014 at 03:02:16PM -0400, Luiz Capitulino wrote:
> > This function is going to be used by non-init code in a future
> > commit.
> > 
> > Signed-off-by: Luiz Capitulino 
> > ---
> >  mm/hugetlb.c | 3 +--
> >  1 file changed, 1 insertion(+), 2 deletions(-)
> > 
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 7c02b9d..319db28 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -689,8 +689,7 @@ static void prep_new_huge_page(struct hstate *h, struct 
> > page *page, int nid)
> > put_page(page); /* free it into the hugepage allocator */
> >  }
> >  
> > -static void __init prep_compound_gigantic_page(struct page *page,
> > -  unsigned long order)
> > +static void prep_compound_gigantic_page(struct page *page, unsigned long 
> > order)
> >  {
> > int i;
> > int nr_pages = 1 << order;
> 
> Is __ClearPageReserved() in this function relevant only in boot time
> allocation? 

Yes.

> If yes, it might be good to avoid calling it in runtime
> allocation.

The problem is that prep_compound_gigantic_page() is used by
both boottime and runtime allocations. Having two functions to do the
same thing seems like overkill, especially because the runtime allocation
code skips reserved pages. So the reserved bit should always be cleared
for runtime allocated gigantic pages.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 0/5] hugetlb: add support for gigantic page allocation at runtime

2014-04-08 Thread Luiz Capitulino
[Changelog at the bottom]

Introduction


The HugeTLB subsystem uses the buddy allocator to allocate hugepages during
runtime. This means that hugepages allocation during runtime is limited to
MAX_ORDER order. For archs supporting gigantic pages (that is, page sizes
greater than MAX_ORDER), this in turn means that those pages can't be
allocated at runtime.
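
(For a sense of scale: with the usual x86_64 defaults of 4KB base pages and
MAX_ORDER = 11, the largest buddy allocation is 2^(MAX_ORDER-1) = 1024
contiguous pages, that is 4MB, nowhere near the 1GB a gigantic page needs.)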

HugeTLB supports gigantic page allocation during boottime, via the boot
allocator. To this end the kernel provides the command-line options
hugepagesz= and hugepages=, which can be used to instruct the kernel to
allocate N gigantic pages during boot.

For example, x86_64 supports 2M and 1G hugepages, but only 2M hugepages can
be allocated and freed at runtime. If one wants to allocate 1G gigantic pages,
this has to be done at boot via the hugepagesz= and hugepages= command-line
options.
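
For instance, a command line like

  hugepagesz=1G hugepages=4

asks for four 1G pages to be reserved at boot; where they end up is entirely
up to the kernel, which is exactly problem 1 below.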

Now, gigantic page allocation at boottime has two serious problems:

 1. Boottime allocation is not NUMA aware. On a NUMA machine the kernel
evenly distributes boottime allocated hugepages among nodes.

For example, suppose you have a four-node NUMA machine and want
to allocate four 1G gigantic pages at boottime. The kernel will
allocate one gigantic page per node.

On the other hand, we do have users who want to be able to specify
which NUMA node gigantic pages should be allocated from, so that they
can place virtual machines on a specific NUMA node.

 2. Gigantic pages allocated at boottime can't be freed

At this point it's important to observe that regular hugepages allocated
at runtime don't have those problems. This is so because HugeTLB interface
for runtime allocation in sysfs supports NUMA and runtime allocated pages
can be freed just fine via the buddy allocator.

This series adds support for allocating gigantic pages at runtime. It does
so by allocating gigantic pages via CMA instead of the buddy allocator.
Releasing gigantic pages is also supported via CMA. As this series builds
on top of the existing HugeTLB interface, gigantic page allocation and
releasing work just like for regular sized hugepages. This also means that NUMA
support just works.

For example, to allocate two 1G gigantic pages on node 1, one can do:

 # echo 2 > \
   /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages

And, to release all gigantic pages on the same node:

 # echo 0 > \
   /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages

Please, refer to patch 5/5 for full technical details.
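
As a quick check the same sysfs file can simply be read back, and the pages
are consumed through a hugetlbfs mount with a matching page size, for example
(the /mnt/huge-1g mount point is just an illustration, any directory will do):

 # cat /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages
 # mount -t hugetlbfs -o pagesize=1G none /mnt/huge-1g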

Finally, please note that this series is a follow up for a previous series
that tried to extend the command-line options set to be NUMA aware:

 http://marc.info/?l=linux-mm&m=139593335312191&w=2

During the discussion of that series it was agreed that having runtime
allocation support for gigantic pages was a better solution.

Changelog
---------

v2

- Rewrote allocation loop to avoid scanning useless PFNs [Yasuaki]
- Dropped incomplete multi-arch support [Naoya]
- Added patch to drop __init from prep_compound_gigantic_page()
- Restricted the feature to x86_64 (more details in patch 5/5)
- Added reviewed-bys plus minor changelog changes

Luiz Capitulino (5):
  hugetlb: prep_compound_gigantic_page(): drop __init marker
  hugetlb: add hstate_is_gigantic()
  hugetlb: update_and_free_page(): don't clear PG_reserved bit
  hugetlb: move helpers up in the file
  hugetlb: add support for gigantic page allocation at runtime

 include/linux/hugetlb.h |   5 +
 mm/hugetlb.c| 328 ++--
 2 files changed, 237 insertions(+), 96 deletions(-)

-- 
1.8.1.4
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/5] hugetlb: add hstate_is_gigantic()

2014-04-08 Thread Luiz Capitulino
Signed-off-by: Luiz Capitulino 
Reviewed-by: Andrea Arcangeli 
Reviewed-by: Naoya Horiguchi 
Reviewed-by: Yasuaki Ishimatsu 
---
 include/linux/hugetlb.h |  5 +
 mm/hugetlb.c| 28 ++--
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5b337cf..62a8b88 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -343,6 +343,11 @@ static inline unsigned huge_page_shift(struct hstate *h)
return h->order + PAGE_SHIFT;
 }
 
+static inline bool hstate_is_gigantic(struct hstate *h)
+{
+   return huge_page_order(h) >= MAX_ORDER;
+}
+
 static inline unsigned int pages_per_huge_page(struct hstate *h)
 {
return 1 << h->order;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 319db28..8674eda 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -610,7 +610,7 @@ static void update_and_free_page(struct hstate *h, struct 
page *page)
 {
int i;
 
-   VM_BUG_ON(h->order >= MAX_ORDER);
+   VM_BUG_ON(hstate_is_gigantic(h));
 
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
@@ -663,7 +663,7 @@ static void free_huge_page(struct page *page)
if (restore_reserve)
h->resv_huge_pages++;
 
-   if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+   if (h->surplus_huge_pages_node[nid] && !hstate_is_gigantic(h)) {
/* remove the page from active list */
list_del(&page->lru);
update_and_free_page(h, page);
@@ -767,7 +767,7 @@ static struct page *alloc_fresh_huge_page_node(struct 
hstate *h, int nid)
 {
struct page *page;
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return NULL;
 
page = alloc_pages_exact_node(nid,
@@ -961,7 +961,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, 
int nid)
struct page *page;
unsigned int r_nid;
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return NULL;
 
/*
@@ -1154,7 +1154,7 @@ static void return_unused_surplus_pages(struct hstate *h,
h->resv_huge_pages -= unused_resv_pages;
 
/* Cannot return gigantic pages currently */
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return;
 
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
@@ -1353,7 +1353,7 @@ static void __init gather_bootmem_prealloc(void)
 * fix confusing memory reports from free(1) and another
 * side-effects, like CommitLimit going negative.
 */
-   if (h->order > (MAX_ORDER - 1))
+   if (hstate_is_gigantic(h))
adjust_managed_page_count(page, 1 << h->order);
}
 }
@@ -1363,7 +1363,7 @@ static void __init hugetlb_hstate_alloc_pages(struct 
hstate *h)
unsigned long i;
 
for (i = 0; i < h->max_huge_pages; ++i) {
-   if (h->order >= MAX_ORDER) {
+   if (hstate_is_gigantic(h)) {
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_fresh_huge_page(h,
@@ -1379,7 +1379,7 @@ static void __init hugetlb_init_hstates(void)
 
for_each_hstate(h) {
/* oversize hugepages were init'ed in early boot */
-   if (h->order < MAX_ORDER)
+   if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
}
 }
@@ -1413,7 +1413,7 @@ static void try_to_free_low(struct hstate *h, unsigned 
long count,
 {
int i;
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return;
 
for_each_node_mask(i, *nodes_allowed) {
@@ -1476,7 +1476,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, 
unsigned long count,
 {
unsigned long min_count, ret;
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return h->max_huge_pages;
 
/*
@@ -1602,7 +1602,7 @@ static ssize_t nr_hugepages_store_common(bool 
obey_mempolicy,
goto out;
 
h = kobj_to_hstate(kobj, &nid);
-   if (h->order >= MAX_ORDER) {
+   if (hstate_is_gigantic(h)) {
err = -EINVAL;
goto out;
}
@@ -1685,7 +1685,7 @@ static ssize_t nr_overcommit_hugepages_store(struct 
kobject *kobj,
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return -EINVAL;
 
err = kstrtoul(buf, 10, &input);
@@ -2110,7 +2110,7 @@ static int hugetlb_sysctl_handler_common(bool 
obey_mempolicy,
 
tmp = h->max_huge_pages;
 
-   if (write && h->order >= MAX_ORDER)
+   if (write && hstate_is_gigantic(h))
return

[PATCH 4/5] hugetlb: move helpers up in the file

2014-04-08 Thread Luiz Capitulino
Next commit will add new code which will want to call
for_each_node_mask_to_alloc() macro. Move it, its buddy
for_each_node_mask_to_free() and their dependencies up in the file so
the new code can use them. This is just code movement, no logic change.
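
For readers unfamiliar with these macros, a caller typically looks roughly
like the sketch below; try_alloc_on_node() is a made-up placeholder for
whatever per-node work the real caller does:

	static int alloc_on_some_node(struct hstate *h, nodemask_t *nodes_allowed)
	{
		int nr_nodes, node;

		/* round-robin over the allowed nodes, remembering where we stopped */
		for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
			if (try_alloc_on_node(h, node))	/* placeholder helper */
				return 1;	/* stop at the first node that succeeds */
		}

		return 0;	/* every allowed node was tried and failed */
	}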

Signed-off-by: Luiz Capitulino 
Reviewed-by: Andrea Arcangeli 
Reviewed-by: Naoya Horiguchi 
Reviewed-by: Yasuaki Ishimatsu 
---
 mm/hugetlb.c | 146 +--
 1 file changed, 73 insertions(+), 73 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c295bba..9dded98 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -606,6 +606,79 @@ err:
return NULL;
 }
 
+/*
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed.  Ensure that we use an allowed
+ * node for alloc or free.
+ */
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+   nid = next_node(nid, *nodes_allowed);
+   if (nid == MAX_NUMNODES)
+   nid = first_node(*nodes_allowed);
+   VM_BUG_ON(nid >= MAX_NUMNODES);
+
+   return nid;
+}
+
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+   if (!node_isset(nid, *nodes_allowed))
+   nid = next_node_allowed(nid, nodes_allowed);
+   return nid;
+}
+
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static int hstate_next_node_to_alloc(struct hstate *h,
+   nodemask_t *nodes_allowed)
+{
+   int nid;
+
+   VM_BUG_ON(!nodes_allowed);
+
+   nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+   h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+
+   return nid;
+}
+
+/*
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page.  Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
+ */
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t 
*nodes_allowed)
+{
+   int nid;
+
+   VM_BUG_ON(!nodes_allowed);
+
+   nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+   h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+   return nid;
+}
+
+#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)  \
+   for (nr_nodes = nodes_weight(*mask);\
+   nr_nodes > 0 && \
+   ((node = hstate_next_node_to_alloc(hs, mask)) || 1);\
+   nr_nodes--)
+
+#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)   \
+   for (nr_nodes = nodes_weight(*mask);\
+   nr_nodes > 0 && \
+   ((node = hstate_next_node_to_free(hs, mask)) || 1); \
+   nr_nodes--)
+
 static void update_and_free_page(struct hstate *h, struct page *page)
 {
int i;
@@ -786,79 +859,6 @@ static struct page *alloc_fresh_huge_page_node(struct 
hstate *h, int nid)
return page;
 }
 
-/*
- * common helper functions for hstate_next_node_to_{alloc|free}.
- * We may have allocated or freed a huge page based on a different
- * nodes_allowed previously, so h->next_node_to_{alloc|free} might
- * be outside of *nodes_allowed.  Ensure that we use an allowed
- * node for alloc or free.
- */
-static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
-   nid = next_node(nid, *nodes_allowed);
-   if (nid == MAX_NUMNODES)
-   nid = first_node(*nodes_allowed);
-   VM_BUG_ON(nid >= MAX_NUMNODES);
-
-   return nid;
-}
-
-static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
-   if (!node_isset(nid, *nodes_allowed))
-   nid = next_node_allowed(nid, nodes_allowed);
-   return nid;
-}
-
-/*
- * returns the previously saved node ["this node"] from which to
- * allocate a persistent huge page for the pool and advance the
- * next node from which to allocate, handling wrap at end of node
- * mask.
- */
-static int hstate_next_node_to_alloc(struct hstate *h,
-   nodemask_t *nodes_allowed)
-{
-   int nid;
-
-   VM_BUG_ON(!nodes_allowed);
-
-   nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
-   h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
-
-   return nid;
-}
-
-/*
- * helper for free_pool_huge_page() - return the previously saved
- * node ["this node"]

[PATCH 3/5] hugetlb: update_and_free_page(): don't clear PG_reserved bit

2014-04-08 Thread Luiz Capitulino
Hugepages pages never get the PG_reserved bit set, so don't clear it. But
add a warning just in case.

Signed-off-by: Luiz Capitulino 
Reviewed-by: Andrea Arcangeli 
Reviewed-by: Naoya Horiguchi 
Reviewed-by: Yasuaki Ishimatsu 
---
 mm/hugetlb.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8674eda..c295bba 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -617,8 +617,9 @@ static void update_and_free_page(struct hstate *h, struct 
page *page)
for (i = 0; i < pages_per_huge_page(h); i++) {
page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
-   1 << PG_active | 1 << PG_reserved |
-   1 << PG_private | 1 << PG_writeback);
+   1 << PG_active | 1 << PG_private |
+   1 << PG_writeback);
+   WARN_ON(PageReserved(&page[i]));
}
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
set_compound_page_dtor(page, NULL);
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 5/5] hugetlb: add support for gigantic page allocation at runtime

2014-04-08 Thread Luiz Capitulino
HugeTLB is limited to allocating hugepages whose size is less than
MAX_ORDER order. This is so because HugeTLB allocates hugepages via
the buddy allocator. Gigantic pages (that is, pages whose size is
greater than MAX_ORDER order) have to be allocated at boottime.

However, boottime allocation has at least two serious problems. First,
it doesn't support NUMA and second, gigantic pages allocated at
boottime can't be freed.

This commit solves both issues by adding support for allocating gigantic
pages during runtime. It works just like regular sized hugepages,
meaning that the interface in sysfs is the same, it supports NUMA,
and gigantic pages can be freed.

For example, on x86_64 gigantic pages are 1GB big. To allocate two 1G
gigantic pages on node 1, one can do:

 # echo 2 > \
   /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages

And to free them all:

 # echo 0 > \
   /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages

The one problem with gigantic page allocation at runtime is that it
can't be serviced by the buddy allocator. To overcome that problem, this
commit scans all zones from a node looking for a large enough contiguous
region. When one is found, it's allocated by using CMA, that is, we call
alloc_contig_range() to do the actual allocation. For example, on x86_64
we scan all zones looking for a 1GB contiguous region. When one is found,
it's allocated by alloc_contig_range().

One expected issue with that approach is that such gigantic contiguous
regions tend to vanish as runtime goes by. The best way to avoid this for
now is to make gigantic page allocations very early during system boot, say
from an init script. Other possible optimizations include using compaction,
which is supported by CMA but is not explicitly used by this commit.
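
For reference, the scan described above boils down to something like the
sketch below. This is simplified from the diff further down (the real code
also takes the zone lock while checking a candidate range and undoes partial
work on failure), but the shape is the same:

	static struct page *alloc_gigantic_page_sketch(int nid, unsigned order)
	{
		unsigned long nr_pages = 1UL << order;
		unsigned long pfn;
		struct zone *z;

		for (z = NODE_DATA(nid)->node_zones;
		     z < NODE_DATA(nid)->node_zones + MAX_NR_ZONES; z++) {
			/* gigantic pages must be naturally aligned */
			for (pfn = ALIGN(z->zone_start_pfn, nr_pages);
			     pfn + nr_pages <= zone_end_pfn(z);
			     pfn += nr_pages) {
				if (!pfn_range_valid_gigantic(pfn, nr_pages))
					continue;
				/* alloc_contig_range() returns 0 on success */
				if (!alloc_contig_range(pfn, pfn + nr_pages,
							MIGRATE_MOVABLE))
					return pfn_to_page(pfn);
			}
		}

		return NULL;	/* no suitable range on this node */
	}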

It's also important to note the following:

 1. Gigantic pages allocated at boottime by the hugepages= command-line
option can be freed at runtime just fine

 2. This commit adds support for gigantic pages only to x86_64. The
reason is that I don't have access to nor experience with other archs.
The code is arch independent though, so it should be simple to add
support to different archs

 3. I didn't add support for hugepage overcommit, that is allocating
a gigantic page on demand when
   /proc/sys/vm/nr_overcommit_hugepages > 0. The reason is that I don't
   think it's reasonable to do the hard and long work required for
   allocating a gigantic page at fault time. But it should be simple
   to add this if wanted

Signed-off-by: Luiz Capitulino 
---
 mm/hugetlb.c | 158 ++-
 1 file changed, 147 insertions(+), 11 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9dded98..2258045 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -679,11 +679,141 @@ static int hstate_next_node_to_free(struct hstate *h, 
nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
 
+#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+static void destroy_compound_gigantic_page(struct page *page,
+   unsigned long order)
+{
+   int i;
+   int nr_pages = 1 << order;
+   struct page *p = page + 1;
+
+   for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+   __ClearPageTail(p);
+   set_page_refcounted(p);
+   p->first_page = NULL;
+   }
+
+   set_compound_order(page, 0);
+   __ClearPageHead(page);
+}
+
+static void free_gigantic_page(struct page *page, unsigned order)
+{
+   free_contig_range(page_to_pfn(page), 1 << order);
+}
+
+static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long count)
+{
+   unsigned long end_pfn = start_pfn + count;
+   return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+}
+
+static bool pfn_range_valid_gigantic(unsigned long start_pfn,
+   unsigned long nr_pages)
+{
+   unsigned long i, end_pfn = start_pfn + nr_pages;
+   struct page *page;
+
+   for (i = start_pfn; i < end_pfn; i++) {
+   if (!pfn_valid(i))
+   return false;
+
+   page = pfn_to_page(i);
+
+   if (PageReserved(page))
+   return false;
+
+   if (page_count(page) > 0)
+   return false;
+
+   if (PageHuge(page))
+   return false;
+   }
+
+   return true;
+}
+
+static struct page *alloc_gigantic_page(int nid, unsigned order)
+{
+   unsigned long nr_pages = 1 << order;
+   unsigned long ret, pfn, flags;
+   struct zone *z;
+
+   z = NODE_DATA(nid)->node_zones;
+   for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
+   spin_lock_irqsave(&z->lock, flags);
+
+   pfn = ALIGN(z->zone_start_pfn, nr_pages

[PATCH 1/5] hugetlb: prep_compound_gigantic_page(): drop __init marker

2014-04-08 Thread Luiz Capitulino
This function is going to be used by non-init code in a future
commit.

Signed-off-by: Luiz Capitulino 
---
 mm/hugetlb.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c02b9d..319db28 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -689,8 +689,7 @@ static void prep_new_huge_page(struct hstate *h, struct 
page *page, int nid)
put_page(page); /* free it into the hugepage allocator */
 }
 
-static void __init prep_compound_gigantic_page(struct page *page,
-  unsigned long order)
+static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 {
int i;
int nr_pages = 1 << order;
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 4/4] hugetlb: add support for gigantic page allocation at runtime

2014-04-04 Thread Luiz Capitulino
On Fri, 4 Apr 2014 12:05:17 +0900
Yasuaki Ishimatsu  wrote:

> (2014/04/03 3:08), Luiz Capitulino wrote:
> > HugeTLB is limited to allocating hugepages whose size is less than
> > MAX_ORDER order. This is so because HugeTLB allocates hugepages via
> > the buddy allocator. Gigantic pages (that is, pages whose size is
> > greater than MAX_ORDER order) have to be allocated at boottime.
> > 
> > However, boottime allocation has at least two serious problems. First,
> > it doesn't support NUMA and second, gigantic pages allocated at
> > boottime can't be freed.
> > 
> > This commit solves both issues by adding support for allocating gigantic
> > pages during runtime. It works just like regular sized hugepages,
> > meaning that the interface in sysfs is the same, it supports NUMA,
> > and gigantic pages can be freed.
> > 
> > For example, on x86_64 gigantic pages are 1GB big. To allocate two 1G
> > gigantic pages on node 1, one can do:
> > 
> >   # echo 2 > \
> > 
> > /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages
> > 
> > And to free them later:
> > 
> >   # echo 0 > \
> > 
> > /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages
> > 
> > The one problem with gigantic page allocation at runtime is that it
> > can't be serviced by the buddy allocator. To overcome that problem, this
> > series scans all zones from a node looking for a large enough contiguous
> > region. When one is found, it's allocated by using CMA, that is, we call
> > alloc_contig_range() to do the actual allocation. For example, on x86_64
> > we scan all zones looking for a 1GB contiguous region. When one is found
> > it's allocated by alloc_contig_range().
> > 
> > One expected issue with that approach is that such gigantic contiguous
> > regions tend to vanish as time goes by. The best way to avoid this for
> > now is to make gigantic page allocations very early during boot, say
> > from an init script. Other possible optimizations include using compaction,
> > which is supported by CMA but is not explicitly used by this commit.
> > 
> > It's also important to note the following:
> > 
> >   1. My target systems are x86_64 machines, so I have only tested 1GB
> >  pages allocation/release. I did try to make this arch independent
> >  and expect it to work on other archs but didn't try it myself
> > 
> >   2. I didn't add support for hugepage overcommit, that is allocating
> >  a gigantic page on demand when
> > /proc/sys/vm/nr_overcommit_hugepages > 0. The reason is that I don't
> > think it's reasonable to do the hard and long work required for
> > allocating a gigantic page at fault time. But it should be simple
> > to add this if wanted
> > 
> > Signed-off-by: Luiz Capitulino 
> > ---
> >   arch/x86/include/asm/hugetlb.h |  10 +++
> >   mm/hugetlb.c   | 177 
> > ++---
> >   2 files changed, 176 insertions(+), 11 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
> > index a809121..2b262f7 100644
> > --- a/arch/x86/include/asm/hugetlb.h
> > +++ b/arch/x86/include/asm/hugetlb.h
> > @@ -91,6 +91,16 @@ static inline void arch_release_hugepage(struct page 
> > *page)
> >   {
> >   }
> >   
> > +static inline int arch_prepare_gigantic_page(struct page *page)
> > +{
> > +   return 0;
> > +}
> > +
> > +static inline void arch_release_gigantic_page(struct page *page)
> > +{
> > +}
> > +
> > +
> >   static inline void arch_clear_hugepage_flags(struct page *page)
> >   {
> >   }
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 2c7a44a..c68515e 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -643,11 +643,159 @@ static int hstate_next_node_to_free(struct hstate 
> > *h, nodemask_t *nodes_allowed)
> > ((node = hstate_next_node_to_free(hs, mask)) || 1); \
> > nr_nodes--)
> >   
> > +#ifdef CONFIG_CMA
> > +static void destroy_compound_gigantic_page(struct page *page,
> > +   unsigned long order)
> > +{
> > +   int i;
> > +   int nr_pages = 1 << order;
> > +   struct page *p = page + 1;
> > +
> > +   for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
> > +   __ClearPageTail(p);
> > +   set_page_refcounted(p);
> > +   p->f

[PATCH 2/4] hugetlb: update_and_free_page(): don't clear PG_reserved bit

2014-04-02 Thread Luiz Capitulino
Huge pages never get the PG_reserved bit set, so don't clear it. But
add a warning just in case.

Signed-off-by: Luiz Capitulino lcapitul...@redhat.com
---
 mm/hugetlb.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8c50547..7e07e47 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -581,8 +581,9 @@ static void update_and_free_page(struct hstate *h, struct 
page *page)
for (i = 0; i < pages_per_huge_page(h); i++) {
page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
-   1 << PG_active | 1 << PG_reserved |
-   1 << PG_private | 1 << PG_writeback);
+   1 << PG_active | 1 << PG_private |
+   1 << PG_writeback);
+   WARN_ON(PageReserved(&page[i]));
}
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
set_compound_page_dtor(page, NULL);
-- 
1.8.1.4



[PATCH 0/4] hugetlb: add support for gigantic page allocation at runtime

2014-04-02 Thread Luiz Capitulino
The HugeTLB subsystem uses the buddy allocator to allocate hugepages during
runtime. This means that hugepages allocation during runtime is limited to
MAX_ORDER order. For archs supporting gigantic pages (that is, page sizes
greater than MAX_ORDER), this in turn means that those pages can't be
allocated at runtime.

HugeTLB supports gigantic page allocation during boottime, via the boot
allocator. To this end the kernel provides the command-line options
hugepagesz= and hugepages=, which can be used to instruct the kernel to
allocate N gigantic pages during boot.

For example, x86_64 supports 2M and 1G hugepages, but only 2M hugepages can
be allocated and freed at runtime. If one wants to allocate 1G gigantic pages,
this has to be done at boot via the hugepagesz= and hugepages= command-line
options.

Now, gigantic page allocation at boottime has two serious problems:

 1. Boottime allocation is not NUMA aware. On a NUMA machine the kernel
evenly distributes boottime allocated hugepages among nodes.

For example, suppose you have a four-node NUMA machine and want
to allocate four 1G gigantic pages at boottime. The kernel will
allocate one gigantic page per node.

On the other hand, we do have users who want to be able to specify
which NUMA node gigantic pages should be allocated from, so that they
can place virtual machines on a specific NUMA node.

 2. Gigantic pages allocated at boottime can't be freed

At this point it's important to observe that regular hugepages allocated
at runtime don't have those problems. This is so because the HugeTLB interface
for runtime allocation in sysfs supports NUMA and runtime allocated pages
can be freed just fine via the buddy allocator.

This series adds support for allocating gigantic pages at runtime. It does
so by allocating gigantic pages via CMA instead of the buddy allocator.
Releasing gigantic pages is also supported via CMA. As this series builds
on top of the existing HugeTLB interface, gigantic page allocation and
releasing work just like for regular sized hugepages. This also means NUMA
support just works.

For example, to allocate two 1G gigantic pages on node 1, one can do:

 # echo 2 > \
   /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages

And, to release all gigantic pages on the same node:

 # echo 0 > \
   /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages

Please, refer to patch 4/4 for full technical details.

Finally, please note that this series is a follow-up to a previous series
that tried to extend the command-line option set to be NUMA aware:

 http://marc.info/?l=linux-mm&m=139593335312191&w=2

During the discussion of that series it was agreed that having runtime
allocation support for gigantic pages was a better solution.

Luiz Capitulino (4):
  hugetlb: add hstate_is_gigantic()
  hugetlb: update_and_free_page(): don't clear PG_reserved bit
  hugetlb: move helpers up in the file
  hugetlb: add support for gigantic page allocation at runtime

 arch/x86/include/asm/hugetlb.h |  10 ++
 include/linux/hugetlb.h|   5 +
 mm/hugetlb.c   | 344 ++---
 3 files changed, 265 insertions(+), 94 deletions(-)

-- 
1.8.1.4


[PATCH 3/4] hugetlb: move helpers up in the file

2014-04-02 Thread Luiz Capitulino
The next commit will add new code that needs to call the
for_each_node_mask_to_alloc() macro. Move it, its buddy
for_each_node_mask_to_free(), and their dependencies up in the file so
the new code can use them. This is just code movement; there is no logic change.

Signed-off-by: Luiz Capitulino lcapitul...@redhat.com
---
 mm/hugetlb.c | 146 +--
 1 file changed, 73 insertions(+), 73 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7e07e47..2c7a44a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -570,6 +570,79 @@ err:
return NULL;
 }
 
+/*
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed.  Ensure that we use an allowed
+ * node for alloc or free.
+ */
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+   nid = next_node(nid, *nodes_allowed);
+   if (nid == MAX_NUMNODES)
+   nid = first_node(*nodes_allowed);
+   VM_BUG_ON(nid >= MAX_NUMNODES);
+
+   return nid;
+}
+
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+   if (!node_isset(nid, *nodes_allowed))
+   nid = next_node_allowed(nid, nodes_allowed);
+   return nid;
+}
+
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static int hstate_next_node_to_alloc(struct hstate *h,
+   nodemask_t *nodes_allowed)
+{
+   int nid;
+
+   VM_BUG_ON(!nodes_allowed);
+
+   nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+   h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+
+   return nid;
+}
+
+/*
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page.  Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
+ */
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t 
*nodes_allowed)
+{
+   int nid;
+
+   VM_BUG_ON(!nodes_allowed);
+
+   nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+   h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+   return nid;
+}
+
+#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)  \
+   for (nr_nodes = nodes_weight(*mask);\
+   nr_nodes > 0 && \
+   ((node = hstate_next_node_to_alloc(hs, mask)) || 1);\
+   nr_nodes--)
+
+#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)   \
+   for (nr_nodes = nodes_weight(*mask);\
+   nr_nodes > 0 && \
+   ((node = hstate_next_node_to_free(hs, mask)) || 1); \
+   nr_nodes--)
+
 static void update_and_free_page(struct hstate *h, struct page *page)
 {
int i;
@@ -750,79 +823,6 @@ static struct page *alloc_fresh_huge_page_node(struct 
hstate *h, int nid)
return page;
 }
 
-/*
- * common helper functions for hstate_next_node_to_{alloc|free}.
- * We may have allocated or freed a huge page based on a different
- * nodes_allowed previously, so h->next_node_to_{alloc|free} might
- * be outside of *nodes_allowed.  Ensure that we use an allowed
- * node for alloc or free.
- */
-static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
-   nid = next_node(nid, *nodes_allowed);
-   if (nid == MAX_NUMNODES)
-   nid = first_node(*nodes_allowed);
-   VM_BUG_ON(nid >= MAX_NUMNODES);
-
-   return nid;
-}
-
-static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
-   if (!node_isset(nid, *nodes_allowed))
-   nid = next_node_allowed(nid, nodes_allowed);
-   return nid;
-}
-
-/*
- * returns the previously saved node ["this node"] from which to
- * allocate a persistent huge page for the pool and advance the
- * next node from which to allocate, handling wrap at end of node
- * mask.
- */
-static int hstate_next_node_to_alloc(struct hstate *h,
-   nodemask_t *nodes_allowed)
-{
-   int nid;
-
-   VM_BUG_ON(!nodes_allowed);
-
-   nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
-   h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
-
-   return nid;
-}
-
-/*
- * helper for free_pool_huge_page() - return the previously saved
- * node ["this node"] from which to free a huge page.  Advance the
- * next node id whether or not w
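
To illustrate how the macros above are meant to be used, here is a
hypothetical caller (the function name is made up; the macro and
alloc_fresh_huge_page_node() are from mm/hugetlb.c) that round-robins
over the allowed nodes. This is essentially the pattern that
alloc_fresh_huge_page() already follows:

	/*
	 * Sketch only: try to allocate one fresh huge page, rotating
	 * through the nodes permitted by nodes_allowed.
	 */
	static int try_alloc_one_huge_page(struct hstate *h,
					   nodemask_t *nodes_allowed)
	{
		struct page *page;
		int nr_nodes, node;

		for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
			page = alloc_fresh_huge_page_node(h, node);
			if (page)
				return 1;	/* success on this node */
		}

		return 0;	/* no allowed node could satisfy the request */
	}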

[PATCH 1/4] hugetlb: add hstate_is_gigantic()

2014-04-02 Thread Luiz Capitulino
Signed-off-by: Luiz Capitulino lcapitul...@redhat.com
---
 include/linux/hugetlb.h |  5 +
 mm/hugetlb.c| 28 ++--
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8c43cc4..8590134 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -333,6 +333,11 @@ static inline unsigned huge_page_shift(struct hstate *h)
return h->order + PAGE_SHIFT;
 }
 
+static inline bool hstate_is_gigantic(struct hstate *h)
+{
+   return huge_page_order(h) >= MAX_ORDER;
+}
+
 static inline unsigned int pages_per_huge_page(struct hstate *h)
 {
return 1 << h->order;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c01cb9f..8c50547 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -574,7 +574,7 @@ static void update_and_free_page(struct hstate *h, struct 
page *page)
 {
int i;
 
-   VM_BUG_ON(h->order >= MAX_ORDER);
+   VM_BUG_ON(hstate_is_gigantic(h));
 
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
@@ -627,7 +627,7 @@ static void free_huge_page(struct page *page)
if (restore_reserve)
h->resv_huge_pages++;
 
-   if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+   if (h->surplus_huge_pages_node[nid] && !hstate_is_gigantic(h)) {
/* remove the page from active list */
list_del(&page->lru);
update_and_free_page(h, page);
@@ -731,7 +731,7 @@ static struct page *alloc_fresh_huge_page_node(struct 
hstate *h, int nid)
 {
struct page *page;
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return NULL;
 
page = alloc_pages_exact_node(nid,
@@ -925,7 +925,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, 
int nid)
struct page *page;
unsigned int r_nid;
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return NULL;
 
/*
@@ -1118,7 +1118,7 @@ static void return_unused_surplus_pages(struct hstate *h,
h->resv_huge_pages -= unused_resv_pages;
 
/* Cannot return gigantic pages currently */
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return;
 
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
@@ -1328,7 +1328,7 @@ static void __init gather_bootmem_prealloc(void)
 * fix confusing memory reports from free(1) and another
 * side-effects, like CommitLimit going negative.
 */
-   if (h->order > (MAX_ORDER - 1))
+   if (hstate_is_gigantic(h))
adjust_managed_page_count(page, 1 << h->order);
}
 }
@@ -1338,7 +1338,7 @@ static void __init hugetlb_hstate_alloc_pages(struct 
hstate *h)
unsigned long i;
 
for (i = 0; i < h->max_huge_pages; ++i) {
-   if (h->order >= MAX_ORDER) {
+   if (hstate_is_gigantic(h)) {
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_fresh_huge_page(h,
@@ -1354,7 +1354,7 @@ static void __init hugetlb_init_hstates(void)
 
for_each_hstate(h) {
/* oversize hugepages were init'ed in early boot */
-   if (h->order < MAX_ORDER)
+   if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
}
 }
@@ -1388,7 +1388,7 @@ static void try_to_free_low(struct hstate *h, unsigned 
long count,
 {
int i;
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return;
 
for_each_node_mask(i, *nodes_allowed) {
@@ -1451,7 +1451,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, 
unsigned long count,
 {
unsigned long min_count, ret;
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return h->max_huge_pages;
 
/*
@@ -1577,7 +1577,7 @@ static ssize_t nr_hugepages_store_common(bool 
obey_mempolicy,
goto out;
 
h = kobj_to_hstate(kobj, &nid);
-   if (h->order >= MAX_ORDER) {
+   if (hstate_is_gigantic(h)) {
err = -EINVAL;
goto out;
}
@@ -1660,7 +1660,7 @@ static ssize_t nr_overcommit_hugepages_store(struct 
kobject *kobj,
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
 
-   if (h->order >= MAX_ORDER)
+   if (hstate_is_gigantic(h))
return -EINVAL;
 
err = kstrtoul(buf, 10, &input);
@@ -2071,7 +2071,7 @@ static int hugetlb_sysctl_handler_common(bool 
obey_mempolicy,
 
tmp = h->max_huge_pages;
 
-   if (write && h->order >= MAX_ORDER)
+   if (write && hstate_is_gigantic(h

[PATCH 4/4] hugetlb: add support for gigantic page allocation at runtime

2014-04-02 Thread Luiz Capitulino
HugeTLB is limited to allocating hugepages whose size is less than
MAX_ORDER order. This is so because HugeTLB allocates hugepages via
the buddy allocator. Gigantic pages (that is, pages whose size is
greater than MAX_ORDER order) have to be allocated at boottime.

However, boottime allocation has at least two serious problems. First,
it doesn't support NUMA and second, gigantic pages allocated at
boottime can't be freed.

This commit solves both issues by adding support for allocating gigantic
pages during runtime. It works just like regular sized hugepages,
meaning that the interface in sysfs is the same, it supports NUMA,
and gigantic pages can be freed.

For example, on x86_64 gigantic pages are 1GB big. To allocate two 1G
gigantic pages on node 1, one can do:

 # echo 2 > \
   /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages

And to free them later:

 # echo 0 > \
   /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages

The one problem with gigantic page allocation at runtime is that it
can't be serviced by the buddy allocator. To overcome that problem, this
series scans all zones from a node looking for a large enough contiguous
region. When one is found, it's allocated by using CMA, that is, we call
alloc_contig_range() to do the actual allocation. For example, on x86_64
we scan all zones looking for a 1GB contiguous region. When one is found
it's allocated by alloc_contig_range().

One expected issue with that approach is that such gigantic contiguous
regions tend to vanish as time goes by. The best way to avoid this for
now is to make gigantic page allocations very early during boot, say
from an init script. Other possible optimizations include using compaction,
which is supported by CMA but is not explicitly used by this commit.

It's also important to note the following:

 1. My target systems are x86_64 machines, so I have only tested 1GB
page allocation/release. I did try to make this arch independent
and expect it to work on other archs but didn't try it myself

 2. I didn't add support for hugepage overcommit, that is, allocating
a gigantic page on demand when
   /proc/sys/vm/nr_overcommit_hugepages > 0. The reason is that I don't
   think it's reasonable to do the hard and long work required for
   allocating a gigantic page at fault time. But it should be simple
   to add this if wanted

Signed-off-by: Luiz Capitulino lcapitul...@redhat.com
---
 arch/x86/include/asm/hugetlb.h |  10 +++
 mm/hugetlb.c   | 177 ++---
 2 files changed, 176 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index a809121..2b262f7 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -91,6 +91,16 @@ static inline void arch_release_hugepage(struct page *page)
 {
 }
 
+static inline int arch_prepare_gigantic_page(struct page *page)
+{
+   return 0;
+}
+
+static inline void arch_release_gigantic_page(struct page *page)
+{
+}
+
+
 static inline void arch_clear_hugepage_flags(struct page *page)
 {
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2c7a44a..c68515e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -643,11 +643,159 @@ static int hstate_next_node_to_free(struct hstate *h, 
nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
 
+#ifdef CONFIG_CMA
+static void destroy_compound_gigantic_page(struct page *page,
+   unsigned long order)
+{
+   int i;
+   int nr_pages = 1 << order;
+   struct page *p = page + 1;
+
+   for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+   __ClearPageTail(p);
+   set_page_refcounted(p);
+   p->first_page = NULL;
+   }
+
+   set_compound_order(page, 0);
+   __ClearPageHead(page);
+}
+
+static void free_gigantic_page(struct page *page, unsigned order)
+{
+   free_contig_range(page_to_pfn(page), 1 << order);
+}
+
+static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long count)
+{
+   unsigned long end_pfn = start_pfn + count;
+   return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+}
+
+static bool pfn_valid_gigantic(unsigned long pfn)
+{
+   struct page *page;
+
+   if (!pfn_valid(pfn))
+   return false;
+
+   page = pfn_to_page(pfn);
+
+   if (PageReserved(page))
+   return false;
+
+   if (page_count(page) > 0)
+   return false;
+
+   return true;
+}
+
+static inline bool pfn_aligned_gigantic(unsigned long pfn, unsigned order)
+{
+   return IS_ALIGNED((phys_addr_t) pfn << PAGE_SHIFT, PAGE_SIZE << order);
+}
+
+static struct page *alloc_gigantic_page(int nid, unsigned order)
+{
+   unsigned long ret, i, count, start_pfn, flags;
+   unsigned long nr_pages = 1 <
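
Putting the helpers above together, the zone scan described in the
commit message can be sketched roughly as follows. This is a simplified
illustration, not the posted patch: the function name is invented, zone
locking is omitted, and the real iteration details may differ;
pfn_valid_gigantic() and __alloc_gigantic_page() are the helpers shown
in the diff above:

	/*
	 * Sketch: walk the node's zones, probe naturally aligned pfn
	 * ranges, and hand a fully usable range to alloc_contig_range()
	 * via __alloc_gigantic_page().
	 */
	static struct page *alloc_gigantic_page_sketch(int nid, unsigned order)
	{
		unsigned long nr_pages = 1UL << order;
		unsigned long pfn, i;
		struct zone *z;

		for (z = NODE_DATA(nid)->node_zones;
		     z < NODE_DATA(nid)->node_zones + MAX_NR_ZONES; z++) {
			if (!populated_zone(z))
				continue;

			for (pfn = ALIGN(z->zone_start_pfn, nr_pages);
			     pfn + nr_pages <= zone_end_pfn(z); pfn += nr_pages) {
				/* every pfn in the candidate range must be usable */
				for (i = 0; i < nr_pages; i++)
					if (!pfn_valid_gigantic(pfn + i))
						break;
				if (i < nr_pages)
					continue;

				if (!__alloc_gigantic_page(pfn, nr_pages))
					return pfn_to_page(pfn);
			}
		}

		return NULL;	/* no suitable contiguous range on this node */
	}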

[PATCH RFC] hugetlb: add support for 1GB huge page allocation at runtime

2014-03-27 Thread Luiz Capitulino
Currently, 1GB huge page allocation is only possible at boottime, by means
of the hugepages= command-line option. This has a number of drawbacks, the
most important of them being:

 1. On a NUMA machine, the kernel will evenly distribute page allocation
among nodes. For example, if you have a NUMA machine with 4 nodes and
want to allocate four 1GB huge pages, the kernel will try to allocate
one page per node.

On the other hand, we have users who want to be able to specify
from which node an allocation should be made. For example, they
want to be able to allocate two 1GB huge pages from node 1 only.
Supporting this use-case is the main motivation for this feature.

 2. Once allocated, boottime huge pages can't be freed

This commit solves both issues by adding support for allocating 1GB huge
pages during runtime, just like 2MB huge pages, which supports NUMA and
has a standard use interface in sysfs.

For example, to allocate two 1GB huge pages from node 1, one can do:

 # echo 2 > \
   /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages

  (You need hugeTLB properly configured to have that sysfs entry)

The one problem with 1GB huge page runtime allocation is that such gigantic
allocation can't be serviced by the buddy allocator, which is limited to
allocating 2048 pages on most archs. To overcome that problem, we scan all
zones from a node looking for a 1GB contiguous region. When one is found,
it's allocated by using CMA, that is, we call alloc_contig_range().

One expected issue with 1GB huge page support is that free 1GB contiguous
regions tend to vanish as time goes by. The best way to avoid this for now
is to make 1GB huge page allocations very early during boot, say from an
init script. Other possible optimizations include using compaction, which
is already supported by CMA but is not explicitly used by this commit.

This patch is quite complete and works; I'm labelling it RFC because of
the following:

1. I haven't tested surplus pages, cgroup support, allocating through
   hugetlbfs and a few other things

2. I haven't looked at adding 1GB huge page support to alloc_huge_page_node(),
   which seems to allocate huge pages on demand. Do we need this for the
   first merge?

3. Should 1GB huge page allocation code update HTLB_BUDDY_PGALLOC and
   HTLB_BUDDY_PGALLOC_FAIL? I think it shouldn't, as we don't allocate from
   the buddy

Signed-off-by: Luiz Capitulino lcapitul...@redhat.com
---

This patch is a follow up for this series:

 http://marc.info/?l=linux-mm&m=139234006724423&w=2

That series introduced a command-line option to allow the user to specify from
which NUMA node a 1GB hugepage allocation should be made. In that discussion
it was suggested that having support for runtime allocation was a better 
solution.

 arch/x86/include/asm/hugetlb.h |  10 +++
 include/linux/hugetlb.h|   5 ++
 mm/hugetlb.c   | 176 ++---
 3 files changed, 181 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index a809121..2b262f7 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -91,6 +91,16 @@ static inline void arch_release_hugepage(struct page *page)
 {
 }
 
+static inline int arch_prepare_gigantic_page(struct page *page)
+{
+   return 0;
+}
+
+static inline void arch_release_gigantic_page(struct page *page)
+{
+}
+
+
 static inline void arch_clear_hugepage_flags(struct page *page)
 {
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8c43cc4..8590134 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -333,6 +333,11 @@ static inline unsigned huge_page_shift(struct hstate *h)
return h->order + PAGE_SHIFT;
 }
 
+static inline bool hstate_is_gigantic(struct hstate *h)
+{
+   return huge_page_order(h) >= MAX_ORDER;
+}
+
 static inline unsigned int pages_per_huge_page(struct hstate *h)
 {
return 1 << h->order;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c01cb9f..53b5ddc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -570,11 +570,146 @@ err:
return NULL;
 }
 
+#ifdef CONFIG_CMA
+static void destroy_compound_gigantic_page(struct page *page,
+   unsigned long order)
+{
+   int i;
+   int nr_pages = 1 << order;
+   struct page *p = page + 1;
+
+   for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+   __ClearPageTail(p);
+   set_page_refcounted(p);
+   p->first_page = NULL;
+   }
+
+   set_compound_order(page, 0);
+   __ClearPageHead(page);
+}
+
+static void free_gigantic_page(struct page *page, unsigned order)
+{
+   free_contig_range(page_to_pfn(page), 1 << order);
+}
+
+static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long count)
+{
+   unsigned long end_pfn = start_pfn + count;
+   return a

Re: [PATCH 4/4] hugetlb: add hugepages_node= command-line option

2014-02-20 Thread Luiz Capitulino
On Thu, 20 Feb 2014 15:15:46 -0800 (PST)
David Rientjes rient...@google.com wrote:

> Do I really need to do your work for you and work on 1GB hugepages at 
> runtime, which many more people would be interested in?  Or are we just 
> seeking the easiest way out here with something that shuts the customer up 
> and leaves a kernel command line option that we'll need to maintain to 
> avoid breaking backwards compatibility in the future?

We're seeking a pragmatic solution.

I've said many times in this thread that we're also interested in being
able to allocate 1GB pages at runtime and would work on it on top of the
command-line option, which is ready, works, and solves a real-world problem.


Re: [PATCH 4/4] hugetlb: add hugepages_node= command-line option

2014-02-20 Thread Luiz Capitulino
On Wed, 19 Feb 2014 20:51:55 -0800 (PST)
David Rientjes rient...@google.com wrote:

> On Wed, 19 Feb 2014, Luiz Capitulino wrote:
> 
> > > Yes, my concrete objection is that the command line interface is 
> > > unnecessary if you can dynamically allocate and free 1GB pages at runtime 
> > > unless memory will be so fragmented that it cannot be done when userspace 
> > > is brought up.  That is not your use case, thus this support is not 
> > 
> > Yes it is. The early boot is the most reliable moment to allocate huge pages
> > and we want to take advantage of that.
> > 
> 
> Your use case is 8GB of hugepages on a 32GB machine.  It shouldn't be 
> necessary to do that at boot.

That's shortsighted because it's tied to a particular machine. The same
customer asked for more flexibility, too.

Look, we're also looking forward to allocating 1G huge pages from user-space.
We actually agree here. What we're suggesting is having _both_: the
command-line option (which offers higher reliability and is low-hanging
fruit right now) _and_, later, support to allocate 1G huge pages from
user-space. No loss here; that's the maximum benefit for all users.


Re: [PATCH 4/4] hugetlb: add hugepages_node= command-line option

2014-02-19 Thread Luiz Capitulino
On Wed, 19 Feb 2014 19:46:41 -0800 (PST)
David Rientjes rient...@google.com wrote:

> On Wed, 19 Feb 2014, Marcelo Tosatti wrote:
> 
> > We agree that, in the future, we'd like to provide the ability to
> > dynamically allocate and free 1GB pages at runtime.
> > 
> > Extending the kernel command line interface is a first step.
> > 
> > Do you have a concrete objection to that first step ?
> > 
> 
> Yes, my concrete objection is that the command line interface is 
> unnecessary if you can dynamically allocate and free 1GB pages at runtime 
> unless memory will be so fragmented that it cannot be done when userspace 
> is brought up.  That is not your use case, thus this support is not 

Yes it is. Early boot is the most reliable moment to allocate huge pages,
and we want to take advantage of that.

I would understand your position against this series if it were intrusive
or changed the code in an undesirable way, but the code it adds is
completely self-contained and runs only once at boot. It simply gives us
one more choice for huge-page allocation at boot. I don't see the huge
problem here.

> needed.  I think Mel also brought up this point.
>
> There's no "first step" about it, this is unnecessary for your use case if 
> you can do it at runtime.  I'm not sure what's so surprising about this.
> 
> > > You can't specify an interleave behavior with Luiz's command line 
> > > interface so now we'd have two different interfaces for allocating 
> > > hugepage sizes depending on whether you're specifying a node or not.  
> > > It's "hugepagesz=1G hugepages=16" vs "hugepage_node=1:16:1G" (and I'd 
> > > have 
> > > to look at previous messages in this thread to see if that means 16 1GB 
> > > pages on node 1 or 1 1GB pages on node 16.)
> > 
> > What syntax do you prefer and why ?
> > 
> 
> I'm not sure it's interesting to talk about since this patchset is 
> unnecessary if you can do it at runtime, but since "hugepagesz=" and 
> "hugepages=" have existed for many kernel releases, we must maintain 
> backwards compatibility.  Thus, it seems, the easiest addition would have 
> been "hugepagesnode=" which I've mentioned several times, there's no 
> reason to implement yet another command line option purely as a shorthand 
> which hugepage_node=1:2:1G is and in a very cryptic way.
> 
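To make the syntax debate above concrete, here is a minimal, illustrative
sketch of an early __setup() handler that would parse a node:count:size
triplet such as "1:16:1G". This is not the parser from this patch series,
and the node-first ordering is an assumption -- which is exactly the
ambiguity being pointed out:

#include <linux/init.h>
#include <linux/kernel.h>

/*
 * Illustrative sketch only -- not the code from this patch series.
 * Accepts "node:count:size", e.g. "1:16:1G", assuming node comes first.
 */
static int __init hugepages_node_setup(char *s)
{
	char *p = s;
	int nid;
	unsigned long count;
	unsigned long long size;

	nid = simple_strtoul(p, &p, 10);
	if (*p++ != ':')
		return 0;
	count = simple_strtoul(p, &p, 10);
	if (*p++ != ':')
		return 0;
	size = memparse(p, &p);		/* understands K/M/G suffixes */

	pr_info("hugepages_node: %lu pages of %lluMB on node %d\n",
		count, size >> 20, nid);
	/* ...the per-node reservation would happen here... */
	return 1;
}
__setup("hugepages_node=", hugepages_node_setup);

By contrast, the existing "hugepagesz=1G hugepages=16" pair spells out the
size and the count in two separate, already-documented parameters, which is
the backwards-compatibility argument being made above.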


[PATCH] fs/proc/meminfo: meminfo_proc_show(): fix typo in comment

2014-02-18 Thread Luiz Capitulino
It should read "reclaimable slab" and not "reclaimable swap".

Signed-off-by: Luiz Capitulino 
---
 fs/proc/meminfo.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 136e548..7445af0 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -73,7 +73,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
available += pagecache;
 
/*
-* Part of the reclaimable swap consists of items that are in use,
+* Part of the reclaimable slab consists of items that are in use,
 * and cannot be freed. Cap this estimate at the low watermark.
 */
available += global_page_state(NR_SLAB_RECLAIMABLE) -
-- 
1.8.1.4
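For context on why the comment mentions slab at all: it sits inside the
MemAvailable estimate computed by meminfo_proc_show(). Roughly, the estimate
assumes about half of the page cache and half of the reclaimable slab can
actually be freed, each capped at the low watermark. A condensed, user-space
restatement of that arithmetic (illustrative, not the kernel code itself):

/*
 * Condensed restatement of the MemAvailable estimate the fixed comment
 * belongs to. Values are in pages; wmark_low stands for the summed low
 * watermarks of all zones.
 */
static long mem_available_estimate(long freeram, long pagecache,
				   long slab_reclaimable, long wmark_low)
{
	long available = freeram - wmark_low;

	/* Assume at least half the page cache must stay resident. */
	available += pagecache -
		     (pagecache / 2 < wmark_low ? pagecache / 2 : wmark_low);

	/* Part of the reclaimable slab is in use and cannot be freed. */
	available += slab_reclaimable -
		     (slab_reclaimable / 2 < wmark_low ? slab_reclaimable / 2
						       : wmark_low);

	return available > 0 ? available : 0;
}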
