[PATCH v6 1/1] xen: vnuma for pv guests

2014-07-18 Thread Elena Ufimtseva
Issues the Xen hypercall subop XENMEM_get_vnuma_info and sets the
NUMA topology; otherwise sets a dummy NUMA node and prevents
numa_init from calling other NUMA initializers, as they don't
work with PV guests.

Signed-off-by: Elena Ufimtseva 
---
 arch/x86/include/asm/xen/vnuma.h |   10 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|1 +
 arch/x86/xen/setup.c |6 +-
 arch/x86/xen/vnuma.c |  120 ++
 include/xen/interface/memory.h   |   50 
 6 files changed, 189 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

diff --git a/arch/x86/include/asm/xen/vnuma.h b/arch/x86/include/asm/xen/vnuma.h
new file mode 100644
index 000..8c8b098
--- /dev/null
+++ b/arch/x86/include/asm/xen/vnuma.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_VNUMA_H
+#define _ASM_X86_VNUMA_H
+
+#ifdef CONFIG_XEN
+int xen_numa_init(void);
+#else
+static inline int xen_numa_init(void) { return -1; };
+#endif
+
+#endif /* _ASM_X86_VNUMA_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index a32b706..045a8b3 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 
+#include "asm/xen/vnuma.h"
 #include "numa_internal.h"
 
 int __initdata numa_off;
@@ -687,6 +688,8 @@ static int __init dummy_numa_init(void)
 void __init x86_numa_init(void)
 {
if (!numa_off) {
+   if (!numa_init(xen_numa_init))
+   return;
 #ifdef CONFIG_ACPI_NUMA
if (!numa_init(x86_acpi_numa_init))
return;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c0..185ec9b 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -22,3 +22,4 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
 obj-$(CONFIG_XEN_DOM0) += apic.o vga.o
 obj-$(CONFIG_SWIOTLB_XEN)  += pci-swiotlb-xen.o
+obj-$(CONFIG_NUMA) += vnuma.o
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 2e555163..9dc0d3b 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -642,6 +643,9 @@ void __init xen_arch_setup(void)
WARN_ON(xen_set_default_idle());
fiddle_vdso();
 #ifdef CONFIG_NUMA
-   numa_off = 1;
+   if (xen_initial_domain())
+   numa_off = 1;
+   else
+   numa_off = 0;
 #endif
 }
diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
new file mode 100644
index 000..73f052f
--- /dev/null
+++ b/arch/x86/xen/vnuma.c
@@ -0,0 +1,120 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * Called from numa_init if numa_off = 0.
+ */
+int __init xen_numa_init(void)
+{
+   unsigned int i, j, idx;
+   unsigned int cpu, pcpus, nr_nodes, nr_cpus;
+   unsigned int *vdistance, *cpu_to_node;
+   unsigned long mem_size, dist_size, cpu_to_node_size;
+   struct vmemrange *vmem;
+   u64 physm, physd, physc;
+   int rc;
+
+   struct vnuma_topology_info numa_topo = {
+   .domid = DOMID_SELF
+   };
+
+   rc = -EINVAL;
+   physm = physd = physc = 0;
+
+   if (!xen_pv_domain())
+   return rc;
+
+   /* get the number of nodes for allocation of memblocks. */
+   pcpus = num_possible_cpus();
+   nr_cpus = setup_max_cpus < pcpus ? setup_max_cpus : pcpus;
+
+   /* support for nodes with at least one cpu per node. */
+   nr_nodes = nr_cpus;
+
+   /*
+* Allocate arrays for nr_cpus/nr_nodes sizes and let
+* hypervisor know that these are the boundaries. Partial
+* copy is not allowed and hypercall will fail.
+*/
+
+   mem_size =  nr_nodes * sizeof(struct vmemrange);
+   dist_size = nr_nodes * nr_nodes * sizeof(*numa_topo.distance.h);
+   cpu_to_node_size = nr_cpus * sizeof(*numa_topo.cpu_to_node.h);
+
+   physm = memblock_alloc(mem_size, PAGE_SIZE);
+   physd = memblock_alloc(dist_size, PAGE_SIZE);
+   physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
+
+   if (!physm || !physd || !physc)
+   goto out;
+
+   vmem = __va(physm);
+   vdistance  = __va(physd);
+   cpu_to_node  = __va(physc);
+
+   numa_topo.nr_nodes = nr_nodes;
+   numa_topo.nr_cpus = nr_cpus;
+
+   set_xen_guest_handle(numa_topo.memrange.h, vmem);
+   set_xen_guest_handle(numa_topo.distance.h, vdistance);
+   set_xen_guest_handle(numa_topo.cpu_to_node.h, cpu_to_node);
+
+   if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0)
+   goto out;
+
+   /*
+* NUMA nodes memory ranges are in pfns, constructed and
+* aligned based on e820 ram domain map.
+*/
+   for (i = 0; i < nr_nodes; i++) {
+   if (num

[PATCH v6 0/1] introduce vnuma for Xen guests

2014-07-18 Thread Elena Ufimtseva
xen: vnuma for PV guests

This patch is an addition to Xen vNUMA implementation posted
to xen-devel mailing list.

The patchset introduces vNUMA for paravirtualized Xen guests.
Xen subop hypercall is used to retrieve vnuma topology information.
Based on the retrieved topology from Xen, the NUMA number of nodes,
memory ranges, distance table and cpumask are set.
If initialization is incorrect, sets 'dummy' node and unsets
nodemask.

vNUMA topology that exposed to guest is a structure with following
fields:
number of vNUMA nodes;
number of vcpus;
distance table;
vnodes memory sizes;
vcpus to vnodes mapping;

Subop hypercall XENMEM_get_vnuma_info is called to get the information about
vNUMA topology. Before calling it, the guest should provide nr_nodes and
nr_cpus to the hypervisor to prevent overflows. Hypervisor also
copies back number of vnodes and vcpus that were copied to the guest.

This patch is available here:
https://git.gitorious.org/vnuma/linux_vnuma.git
git://gitorious.org/vnuma/linux_vnuma.git

Patchset for Xen is available here:
https://git.gitorious.org/vnuma/xen_vnuma.git
git://gitorious.org/vnuma/xen_vnuma.git

Example of vnuma enabled pv domain with 4 nodes and 4 cpus boot:

[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x1000-0x0009]
[0.00]   node   0: [mem 0x0010-0x]
[0.00]   node   1: [mem 0x1-0x1]
[0.00]   node   2: [mem 0x2-0x2]
[0.00]   node   3: [mem 0x3-0x3]
[0.00] On node 0 totalpages: 1048479
[0.00]   DMA zone: 56 pages used for memmap
[0.00]   DMA zone: 21 pages reserved
[0.00]   DMA zone: 3999 pages, LIFO batch:0
[0.00]   DMA32 zone: 14280 pages used for memmap
[0.00]   DMA32 zone: 1044480 pages, LIFO batch:31
[0.00] On node 1 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 2 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 3 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] SFI: Simple Firmware Interface v0.81 http://simplefirmware.org
[0.00] smpboot: Allowing 4 CPUs, 0 hotplug CPUs
[0.00] No local APIC present
[0.00] APIC: disable apic facility
[0.00] APIC: switched to apic NOOP
[0.00] nr_irqs_gsi: 16
[0.00] PM: Registered nosave memory: [mem 0x000a-0x000f]
[0.00] e820: cannot find a gap in the 32bit address range
[0.00] e820: PCI devices with unassigned 32bit BARs may break!
[0.00] e820: [mem 0x40010-0x4004f] available for PCI devices
[0.00] Booting paravirtualized kernel on Xen
[0.00] Xen version: 4.4-unstable (preserve-AD)
[0.00] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:4 
nr_node_ids:4
[0.00] PERCPU: Embedded 28 pages/cpu @8800ffc0 s85376 r8192 
d21120 u2097152
[0.00] pcpu-alloc: s85376 r8192 d21120 u2097152 alloc=1*2097152


numactl output:

root@heatpipe:~# numactl --hardware
available: 4 nodes (0-3)
node 0 cpus: 0
node 0 size: 4031 MB
node 0 free: 3997 MB
node 1 cpus: 1
node 1 size: 4039 MB
node 1 free: 4022 MB
node 2 cpus: 2
node 2 size: 4039 MB
node 2 free: 4023 MB
node 3 cpus: 3
node 3 size: 3975 MB
node 3 free: 3963 MB
node distances:
node   0   1   2   3
  0:  10  20  20  20
  1:  20  10  20  20
  2:  20  20  10  20
  3:  20  20  20  10

Elena Ufimtseva (1):
  xen: vnuma for pv guests

 arch/x86/include/asm/xen/vnuma.h |   10 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|1 +
 arch/x86/xen/setup.c |6 +-
 arch/x86/xen/vnuma.c |  120 ++
 include/xen/interface/memory.h   |   50 
 6 files changed, 189 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v6 1/1] xen: vnuma for pv guests

2014-07-18 Thread Elena Ufimtseva
Issues the Xen hypercall subop XENMEM_get_vnuma_info and sets the
NUMA topology; otherwise sets a dummy NUMA node and prevents
numa_init from calling other NUMA initializers, as they don't
work with PV guests.

Signed-off-by: Elena Ufimtseva ufimts...@gmail.com
---
 arch/x86/include/asm/xen/vnuma.h |   10 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|1 +
 arch/x86/xen/setup.c |6 +-
 arch/x86/xen/vnuma.c |  120 ++
 include/xen/interface/memory.h   |   50 
 6 files changed, 189 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

diff --git a/arch/x86/include/asm/xen/vnuma.h b/arch/x86/include/asm/xen/vnuma.h
new file mode 100644
index 000..8c8b098
--- /dev/null
+++ b/arch/x86/include/asm/xen/vnuma.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_VNUMA_H
+#define _ASM_X86_VNUMA_H
+
+#ifdef CONFIG_XEN
+int xen_numa_init(void);
+#else
+static inline int xen_numa_init(void) { return -1; };
+#endif
+
+#endif /* _ASM_X86_VNUMA_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index a32b706..045a8b3 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -18,6 +18,7 @@
 #include <asm/acpi.h>
 #include <asm/amd_nb.h>
 
+#include "asm/xen/vnuma.h"
 #include "numa_internal.h"
 
 int __initdata numa_off;
@@ -687,6 +688,8 @@ static int __init dummy_numa_init(void)
 void __init x86_numa_init(void)
 {
if (!numa_off) {
+   if (!numa_init(xen_numa_init))
+   return;
 #ifdef CONFIG_ACPI_NUMA
if (!numa_init(x86_acpi_numa_init))
return;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c0..185ec9b 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -22,3 +22,4 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
 obj-$(CONFIG_XEN_DOM0) += apic.o vga.o
 obj-$(CONFIG_SWIOTLB_XEN)  += pci-swiotlb-xen.o
+obj-$(CONFIG_NUMA) += vnuma.o
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 2e555163..9dc0d3b 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
 #include <asm/numa.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
+#include <asm/xen/vnuma.h>
 
 #include <xen/xen.h>
 #include <xen/page.h>
@@ -642,6 +643,9 @@ void __init xen_arch_setup(void)
WARN_ON(xen_set_default_idle());
fiddle_vdso();
 #ifdef CONFIG_NUMA
-   numa_off = 1;
+   if (xen_initial_domain())
+   numa_off = 1;
+   else
+   numa_off = 0;
 #endif
 }
diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
new file mode 100644
index 000..73f052f
--- /dev/null
+++ b/arch/x86/xen/vnuma.c
@@ -0,0 +1,120 @@
+#include <linux/err.h>
+#include <linux/memblock.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/vnuma.h>
+
+/*
+ * Called from numa_init if numa_off = 0.
+ */
+int __init xen_numa_init(void)
+{
+   unsigned int i, j, idx;
+   unsigned int cpu, pcpus, nr_nodes, nr_cpus;
+   unsigned int *vdistance, *cpu_to_node;
+   unsigned long mem_size, dist_size, cpu_to_node_size;
+   struct vmemrange *vmem;
+   u64 physm, physd, physc;
+   int rc;
+
+   struct vnuma_topology_info numa_topo = {
+   .domid = DOMID_SELF
+   };
+
+   rc = -EINVAL;
+   physm = physd = physc = 0;
+
+   if (!xen_pv_domain())
+   return rc;
+
+   /* get the number of nodes for allocation of memblocks. */
+   pcpus = num_possible_cpus();
+   nr_cpus = setup_max_cpus < pcpus ? setup_max_cpus : pcpus;
+
+   /* support for nodes with at least one cpu per node. */
+   nr_nodes = nr_cpus;
+
+   /*
+* Allocate arrays for nr_cpus/nr_nodes sizes and let
+* hypervisor know that these are the boundaries. Partial
+* copy is not allowed and hypercall will fail.
+*/
+
+   mem_size =  nr_nodes * sizeof(struct vmemrange);
+   dist_size = nr_nodes * nr_nodes * sizeof(*numa_topo.distance.h);
+   cpu_to_node_size = nr_cpus * sizeof(*numa_topo.cpu_to_node.h);
+
+   physm = memblock_alloc(mem_size, PAGE_SIZE);
+   physd = memblock_alloc(dist_size, PAGE_SIZE);
+   physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
+
+   if (!physm || !physd || !physc)
+   goto out;
+
+   vmem = __va(physm);
+   vdistance  = __va(physd);
+   cpu_to_node  = __va(physc);
+
+   numa_topo.nr_nodes = nr_nodes;
+   numa_topo.nr_cpus = nr_cpus;
+
+   set_xen_guest_handle(numa_topo.memrange.h, vmem);
+   set_xen_guest_handle(numa_topo.distance.h, vdistance);
+   set_xen_guest_handle(numa_topo.cpu_to_node.h, cpu_to_node);
+
+   if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0

[PATCH v6 0/1] introduce vnuma for Xen guests

2014-07-18 Thread Elena Ufimtseva
xen: vnuma for PV guests

This patch is an addition to Xen vNUMA implementation posted
to xen-devel mailing list.

The patchset introduces vNUMA for paravirtualized Xen guests.
Xen subop hypercall is used to retrieve vnuma topology information.
Based on the retrieved topology from Xen, the NUMA number of nodes,
memory ranges, distance table and cpumask are set.
If initialization is incorrect, sets 'dummy' node and unsets
nodemask.

vNUMA topology that exposed to guest is a structure with following
fields:
number of vNUMA nodes;
number of vcpus;
distance table;
vnodes memory sizes;
vcpus to vnodes mapping;

Subop hypercall XENMEM_get_vnuma_info is called to get the information about
vNUMA topology. Before calling it, the guest should provide nr_nodes and
nr_cpus to the hypervisor to prevent overflows. Hypervisor also
copies back number of vnodes and vcpus that were copied to the guest.

This patch is available here:
https://git.gitorious.org/vnuma/linux_vnuma.git
git://gitorious.org/vnuma/linux_vnuma.git

Patchset for Xen is available here:
https://git.gitorious.org/vnuma/xen_vnuma.git
git://gitorious.org/vnuma/xen_vnuma.git

Example of vnuma enabled pv domain with 4 nodes and 4 cpus boot:

[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x1000-0x0009]
[0.00]   node   0: [mem 0x0010-0x]
[0.00]   node   1: [mem 0x1-0x1]
[0.00]   node   2: [mem 0x2-0x2]
[0.00]   node   3: [mem 0x3-0x3]
[0.00] On node 0 totalpages: 1048479
[0.00]   DMA zone: 56 pages used for memmap
[0.00]   DMA zone: 21 pages reserved
[0.00]   DMA zone: 3999 pages, LIFO batch:0
[0.00]   DMA32 zone: 14280 pages used for memmap
[0.00]   DMA32 zone: 1044480 pages, LIFO batch:31
[0.00] On node 1 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 2 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 3 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] SFI: Simple Firmware Interface v0.81 http://simplefirmware.org
[0.00] smpboot: Allowing 4 CPUs, 0 hotplug CPUs
[0.00] No local APIC present
[0.00] APIC: disable apic facility
[0.00] APIC: switched to apic NOOP
[0.00] nr_irqs_gsi: 16
[0.00] PM: Registered nosave memory: [mem 0x000a-0x000f]
[0.00] e820: cannot find a gap in the 32bit address range
[0.00] e820: PCI devices with unassigned 32bit BARs may break!
[0.00] e820: [mem 0x40010-0x4004f] available for PCI devices
[0.00] Booting paravirtualized kernel on Xen
[0.00] Xen version: 4.4-unstable (preserve-AD)
[0.00] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:4 
nr_node_ids:4
[0.00] PERCPU: Embedded 28 pages/cpu @8800ffc0 s85376 r8192 
d21120 u2097152
[0.00] pcpu-alloc: s85376 r8192 d21120 u2097152 alloc=1*2097152


numactl output:

root@heatpipe:~# numactl --hardware
available: 4 nodes (0-3)
node 0 cpus: 0
node 0 size: 4031 MB
node 0 free: 3997 MB
node 1 cpus: 1
node 1 size: 4039 MB
node 1 free: 4022 MB
node 2 cpus: 2
node 2 size: 4039 MB
node 2 free: 4023 MB
node 3 cpus: 3
node 3 size: 3975 MB
node 3 free: 3963 MB
node distances:
node   0   1   2   3
  0:  10  20  20  20
  1:  20  10  20  20
  2:  20  20  10  20
  3:  20  20  20  10

Elena Ufimtseva (1):
  xen: vnuma for pv guests

 arch/x86/include/asm/xen/vnuma.h |   10 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|1 +
 arch/x86/xen/setup.c |6 +-
 arch/x86/xen/vnuma.c |  120 ++
 include/xen/interface/memory.h   |   50 
 6 files changed, 189 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 1/2] xen: vnuma for pv guests

2014-06-02 Thread Elena Ufimtseva
Issues the Xen hypercall subop XENMEM_get_vnuma_info and sets the
NUMA topology; otherwise sets a dummy NUMA node and prevents
numa_init from calling other NUMA initializers, as they don't
work with PV guests.

Signed-off-by: Elena Ufimtseva 
---
 arch/x86/include/asm/xen/vnuma.h |   10 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|1 +
 arch/x86/xen/setup.c |6 +-
 arch/x86/xen/vnuma.c |  121 ++
 include/xen/interface/memory.h   |   50 
 6 files changed, 190 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

diff --git a/arch/x86/include/asm/xen/vnuma.h b/arch/x86/include/asm/xen/vnuma.h
new file mode 100644
index 000..8c8b098
--- /dev/null
+++ b/arch/x86/include/asm/xen/vnuma.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_VNUMA_H
+#define _ASM_X86_VNUMA_H
+
+#ifdef CONFIG_XEN
+int xen_numa_init(void);
+#else
+static inline int xen_numa_init(void) { return -1; };
+#endif
+
+#endif /* _ASM_X86_VNUMA_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1d045f9..37a9c84 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 
+#include "asm/xen/vnuma.h"
 #include "numa_internal.h"
 
 int __initdata numa_off;
@@ -687,6 +688,8 @@ static int __init dummy_numa_init(void)
 void __init x86_numa_init(void)
 {
if (!numa_off) {
+   if (!numa_init(xen_numa_init))
+   return;
 #ifdef CONFIG_ACPI_NUMA
if (!numa_init(x86_acpi_numa_init))
return;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c0..185ec9b 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -22,3 +22,4 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
 obj-$(CONFIG_XEN_DOM0) += apic.o vga.o
 obj-$(CONFIG_SWIOTLB_XEN)  += pci-swiotlb-xen.o
+obj-$(CONFIG_NUMA) += vnuma.o
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 0982233..0235f19 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -622,6 +623,9 @@ void __init xen_arch_setup(void)
WARN_ON(xen_set_default_idle());
fiddle_vdso();
 #ifdef CONFIG_NUMA
-   numa_off = 1;
+   if (xen_initial_domain())
+   numa_off = 1;
+   else
+   numa_off = 0;
 #endif
 }
diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
new file mode 100644
index 000..a02f9c6
--- /dev/null
+++ b/arch/x86/xen/vnuma.c
@@ -0,0 +1,121 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * Called from numa_init if numa_off = 0;
+ */
+int __init xen_numa_init(void)
+{
+   unsigned int i, j, idx;
+   unsigned int cpu, pcpus, nr_nodes, nr_cpus;
+   unsigned int *vdistance, *cpu_to_node;
+   unsigned long mem_size, dist_size, cpu_to_node_size;
+   struct vmemrange *vmem;
+   u64 physm, physd, physc;
+   int rc;
+
+   struct vnuma_topology_info numa_topo = {
+   .domid = DOMID_SELF
+   };
+
+   rc = -EINVAL;
+   physm = physd = physc = 0;
+
+   /* For now only PV guests are supported */
+   if (!xen_pv_domain())
+   return rc;
+
+   /* get the number of nodes for allocation of memblocks */
+   pcpus = num_possible_cpus();
+   nr_cpus = setup_max_cpus < pcpus ? setup_max_cpus : pcpus;
+
+   /* support for nodes with at least one cpu */
+   nr_nodes = nr_cpus;
+
+   /*
+* Allocate arrays for nr_cpus/nr_nodes sizes and let
+* hypervisor know that these are the boundaries. Partial
+* copy is not allowed and hypercall will fail.
+*/
+
+   mem_size =  nr_nodes * sizeof(struct vmemrange);
+   dist_size = nr_nodes * nr_nodes * sizeof(*numa_topo.distance.h);
+   cpu_to_node_size = nr_cpus * sizeof(*numa_topo.cpu_to_node.h);
+
+   physm = memblock_alloc(mem_size, PAGE_SIZE);
+   physd = memblock_alloc(dist_size, PAGE_SIZE);
+   physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
+
+   if (!physm || !physd || !physc)
+   goto out;
+
+   vmem = __va(physm);
+   vdistance  = __va(physd);
+   cpu_to_node  = __va(physc);
+
+   numa_topo.nr_nodes = nr_nodes;
+   numa_topo.nr_cpus = nr_cpus;
+
+   set_xen_guest_handle(numa_topo.memrange.h, vmem);
+   set_xen_guest_handle(numa_topo.distance.h, vdistance);
+   set_xen_guest_handle(numa_topo.cpu_to_node.h, cpu_to_node);
+
+   if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0)
+   goto out;
+
+   /*
+* NUMA nodes memory ranges are in pfns, constructed and
+* aligned based on e820 ram domain map.
+*/
+   f

[PATCH v3 0/2] xen: vnuma for PV guests

2014-06-02 Thread Elena Ufimtseva
The patchset introduces vnuma to paravirtualized Xen guests
runnning as domU.
Xen subop hypercall is used to retrieve vnuma topology information.
Based on the retrieved topology from Xen, the NUMA number of nodes,
memory ranges, distance table and cpumask are set.
If initialization is incorrect, sets 'dummy' node and unsets
nodemask.

Patchsets for Xen and linux:
git://gitorious.org/xenvnuma_v5/linuxvnuma_v5.git
https://git.gitorious.org/xenvnuma_v5/linuxvnuma_v5.git

Xen patchset is available at:
git://gitorious.org/xenvnuma_v5/xenvnuma_v5.git
https://git.gitorious.org/xenvnuma_v5/xenvnuma_v5.git


Example of vnuma enabled pv domain dmesg:

[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x1000-0x0009]
[0.00]   node   0: [mem 0x0010-0x]
[0.00]   node   1: [mem 0x1-0x1]
[0.00]   node   2: [mem 0x2-0x2]
[0.00]   node   3: [mem 0x3-0x3]
[0.00] On node 0 totalpages: 1048479
[0.00]   DMA zone: 56 pages used for memmap
[0.00]   DMA zone: 21 pages reserved
[0.00]   DMA zone: 3999 pages, LIFO batch:0
[0.00]   DMA32 zone: 14280 pages used for memmap
[0.00]   DMA32 zone: 1044480 pages, LIFO batch:31
[0.00] On node 1 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 2 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 3 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] SFI: Simple Firmware Interface v0.81 http://simplefirmware.org
[0.00] smpboot: Allowing 4 CPUs, 0 hotplug CPUs
[0.00] No local APIC present
[0.00] APIC: disable apic facility
[0.00] APIC: switched to apic NOOP
[0.00] nr_irqs_gsi: 16
[0.00] PM: Registered nosave memory: [mem 0x000a-0x000f]
[0.00] e820: cannot find a gap in the 32bit address range
[0.00] e820: PCI devices with unassigned 32bit BARs may break!
[0.00] e820: [mem 0x40010-0x4004f] available for PCI devices
[0.00] Booting paravirtualized kernel on Xen
[0.00] Xen version: 4.4-unstable (preserve-AD)
[0.00] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:4 
nr_node_ids:4
[0.00] PERCPU: Embedded 28 pages/cpu @8800ffc0 s85376 r8192 
d21120 u2097152
[0.00] pcpu-alloc: s85376 r8192 d21120 u2097152 alloc=1*2097152


numactl output:
root@heatpipe:~# numactl --hardware
available: 4 nodes (0-3)
node 0 cpus: 0
node 0 size: 4031 MB
node 0 free: 3997 MB
node 1 cpus: 1
node 1 size: 4039 MB
node 1 free: 4022 MB
node 2 cpus: 2
node 2 size: 4039 MB
node 2 free: 4023 MB
node 3 cpus: 3
node 3 size: 3975 MB
node 3 free: 3963 MB
node distances:
node   0   1   2   3
  0:  10  20  20  20
  1:  20  10  20  20
  2:  20  20  10  20
  3:  20  20  20  10

Elena Ufimtseva (1):
  Xen vnuma introduction.

 arch/x86/include/asm/xen/vnuma.h |   10 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|1 +
 arch/x86/xen/setup.c |6 +-
 arch/x86/xen/vnuma.c |  121 ++
 include/xen/interface/memory.h   |   50 
 6 files changed, 190 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 1/2] xen: vnuma for pv guests

2014-06-02 Thread Elena Ufimtseva
Issues the Xen hypercall subop XENMEM_get_vnuma_info and sets the
NUMA topology; otherwise sets a dummy NUMA node and prevents
numa_init from calling other NUMA initializers, as they don't
work with PV guests.

Signed-off-by: Elena Ufimtseva ufimts...@gmail.com
---
 arch/x86/include/asm/xen/vnuma.h |   10 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|1 +
 arch/x86/xen/setup.c |6 +-
 arch/x86/xen/vnuma.c |  121 ++
 include/xen/interface/memory.h   |   50 
 6 files changed, 190 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

diff --git a/arch/x86/include/asm/xen/vnuma.h b/arch/x86/include/asm/xen/vnuma.h
new file mode 100644
index 000..8c8b098
--- /dev/null
+++ b/arch/x86/include/asm/xen/vnuma.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_VNUMA_H
+#define _ASM_X86_VNUMA_H
+
+#ifdef CONFIG_XEN
+int xen_numa_init(void);
+#else
+static inline int xen_numa_init(void) { return -1; };
+#endif
+
+#endif /* _ASM_X86_VNUMA_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1d045f9..37a9c84 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -18,6 +18,7 @@
 #include <asm/acpi.h>
 #include <asm/amd_nb.h>
 
+#include "asm/xen/vnuma.h"
 #include "numa_internal.h"
 
 int __initdata numa_off;
@@ -687,6 +688,8 @@ static int __init dummy_numa_init(void)
 void __init x86_numa_init(void)
 {
if (!numa_off) {
+   if (!numa_init(xen_numa_init))
+   return;
 #ifdef CONFIG_ACPI_NUMA
if (!numa_init(x86_acpi_numa_init))
return;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c0..185ec9b 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -22,3 +22,4 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
 obj-$(CONFIG_XEN_DOM0) += apic.o vga.o
 obj-$(CONFIG_SWIOTLB_XEN)  += pci-swiotlb-xen.o
+obj-$(CONFIG_NUMA) += vnuma.o
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 0982233..0235f19 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
 #include <asm/numa.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
+#include <asm/xen/vnuma.h>
 
 #include <xen/xen.h>
 #include <xen/page.h>
@@ -622,6 +623,9 @@ void __init xen_arch_setup(void)
WARN_ON(xen_set_default_idle());
fiddle_vdso();
 #ifdef CONFIG_NUMA
-   numa_off = 1;
+   if (xen_initial_domain())
+   numa_off = 1;
+   else
+   numa_off = 0;
 #endif
 }
diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
new file mode 100644
index 000..a02f9c6
--- /dev/null
+++ b/arch/x86/xen/vnuma.c
@@ -0,0 +1,121 @@
+#include <linux/err.h>
+#include <linux/memblock.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/vnuma.h>
+
+/*
+ * Called from numa_init if numa_off = 0;
+ */
+int __init xen_numa_init(void)
+{
+   unsigned int i, j, idx;
+   unsigned int cpu, pcpus, nr_nodes, nr_cpus;
+   unsigned int *vdistance, *cpu_to_node;
+   unsigned long mem_size, dist_size, cpu_to_node_size;
+   struct vmemrange *vmem;
+   u64 physm, physd, physc;
+   int rc;
+
+   struct vnuma_topology_info numa_topo = {
+   .domid = DOMID_SELF
+   };
+
+   rc = -EINVAL;
+   physm = physd = physc = 0;
+
+   /* For now only PV guests are supported */
+   if (!xen_pv_domain())
+   return rc;
+
+   /* get the number of nodes for allocation of memblocks */
+   pcpus = num_possible_cpus();
+   nr_cpus = setup_max_cpus < pcpus ? setup_max_cpus : pcpus;
+
+   /* support for nodes with at least one cpu */
+   nr_nodes = nr_cpus;
+
+   /*
+* Allocate arrays for nr_cpus/nr_nodes sizes and let
+* hypervisor know that these are the boundaries. Partial
+* copy is not allowed and hypercall will fail.
+*/
+
+   mem_size =  nr_nodes * sizeof(struct vmemrange);
+   dist_size = nr_nodes * nr_nodes * sizeof(*numa_topo.distance.h);
+   cpu_to_node_size = nr_cpus * sizeof(*numa_topo.cpu_to_node.h);
+
+   physm = memblock_alloc(mem_size, PAGE_SIZE);
+   physd = memblock_alloc(dist_size, PAGE_SIZE);
+   physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
+
+   if (!physm || !physd || !physc)
+   goto out;
+
+   vmem = __va(physm);
+   vdistance  = __va(physd);
+   cpu_to_node  = __va(physc);
+
+   numa_topo.nr_nodes = nr_nodes;
+   numa_topo.nr_cpus = nr_cpus;
+
+   set_xen_guest_handle(numa_topo.memrange.h, vmem);
+   set_xen_guest_handle(numa_topo.distance.h, vdistance);
+   set_xen_guest_handle(numa_topo.cpu_to_node.h, cpu_to_node);
+
+   if (HYPERVISOR_memory_op

[PATCH v3 0/2] xen: vnuma for PV guests

2014-06-02 Thread Elena Ufimtseva
The patchset introduces vnuma to paravirtualized Xen guests
running as domU.
Xen subop hypercall is used to retrieve vnuma topology information.
Based on the retrieved topology from Xen, NUMA number of nodes,
memory ranges, distance table and cpumask is being set.
If initialization is incorrect, sets 'dummy' node and unsets
nodemask.

Patchsets for Xen and linux:
git://gitorious.org/xenvnuma_v5/linuxvnuma_v5.git
https://git.gitorious.org/xenvnuma_v5/linuxvnuma_v5.git

Xen patchset is available at:
git://gitorious.org/xenvnuma_v5/xenvnuma_v5.git
https://git.gitorious.org/xenvnuma_v5/xenvnuma_v5.git


Example of vnuma enabled pv domain dmesg:

[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x1000-0x0009]
[0.00]   node   0: [mem 0x0010-0x]
[0.00]   node   1: [mem 0x1-0x1]
[0.00]   node   2: [mem 0x2-0x2]
[0.00]   node   3: [mem 0x3-0x3]
[0.00] On node 0 totalpages: 1048479
[0.00]   DMA zone: 56 pages used for memmap
[0.00]   DMA zone: 21 pages reserved
[0.00]   DMA zone: 3999 pages, LIFO batch:0
[0.00]   DMA32 zone: 14280 pages used for memmap
[0.00]   DMA32 zone: 1044480 pages, LIFO batch:31
[0.00] On node 1 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 2 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 3 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] SFI: Simple Firmware Interface v0.81 http://simplefirmware.org
[0.00] smpboot: Allowing 4 CPUs, 0 hotplug CPUs
[0.00] No local APIC present
[0.00] APIC: disable apic facility
[0.00] APIC: switched to apic NOOP
[0.00] nr_irqs_gsi: 16
[0.00] PM: Registered nosave memory: [mem 0x000a-0x000f]
[0.00] e820: cannot find a gap in the 32bit address range
[0.00] e820: PCI devices with unassigned 32bit BARs may break!
[0.00] e820: [mem 0x40010-0x4004f] available for PCI devices
[0.00] Booting paravirtualized kernel on Xen
[0.00] Xen version: 4.4-unstable (preserve-AD)
[0.00] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:4 
nr_node_ids:4
[0.00] PERCPU: Embedded 28 pages/cpu @8800ffc0 s85376 r8192 
d21120 u2097152
[0.00] pcpu-alloc: s85376 r8192 d21120 u2097152 alloc=1*2097152


numactl output:
root@heatpipe:~# numactl --hardware
available: 4 nodes (0-3)
node 0 cpus: 0
node 0 size: 4031 MB
node 0 free: 3997 MB
node 1 cpus: 1
node 1 size: 4039 MB
node 1 free: 4022 MB
node 2 cpus: 2
node 2 size: 4039 MB
node 2 free: 4023 MB
node 3 cpus: 3
node 3 size: 3975 MB
node 3 free: 3963 MB
node distances:
node   0   1   2   3
  0:  10  20  20  20
  1:  20  10  20  20
  2:  20  20  10  20
  3:  20  20  20  10

Elena Ufimtseva (1):
  Xen vnuma introduction.

 arch/x86/include/asm/xen/vnuma.h |   10 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|1 +
 arch/x86/xen/setup.c |6 +-
 arch/x86/xen/vnuma.c |  121 ++
 include/xen/interface/memory.h   |   50 
 6 files changed, 190 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [BISECTED] Linux 3.12.7 introduces page map handling regression

2014-02-03 Thread Elena Ufimtseva
On Sun, Jan 26, 2014 at 1:02 PM, Elena Ufimtseva  wrote:
> On Fri, Jan 24, 2014 at 8:38 AM, Mel Gorman  wrote:
>> On Thu, Jan 23, 2014 at 11:23:37AM -0500, Elena Ufimtseva wrote:
>>> >> >> 
>>> >> >>
>>> >> >> This dump doesn't look dramatically different, either.
>>> >> >>
>>> >> >>>
>>> >> >>> The other question is - how is AutoNUMA running when it is not 
>>> >> >>> enabled?
>>> >> >>> Shouldn't those _PAGE_NUMA ops be nops when AutoNUMA hasn't even been
>>> >> >>> turned on?
>>> >> >>
>>> >> >>
>>> >> >> Well, NUMA_BALANCING is enabled in the kernel config[1], but I 
>>> >> >> presume you
>>> >> >> mean not enabled at runtime?
>>> >> >>
>>> >> >> [1]
>>> >> >> http://git.uplinklabs.net/snoonan/projects/archlinux/ec2/ec2-packages.git/tree/linux-ec2/config.x86_64
>>> >>
>>> >>
>>> >>
>>> >> --
>>> >> Elena
>>>
>>> I was able to reproduce this consistently, also with the latest mm
>>> patches from yesterday.
>>> Can you please try this:
>>>
>>
>> Thanks Elena,
>>
>>> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
>>> index ce563be..76dcf96 100644
>>> --- a/arch/x86/xen/mmu.c
>>> +++ b/arch/x86/xen/mmu.c
>>> @@ -365,7 +365,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct
>>> *mm, unsigned long addr,
>>>  /* Assume pteval_t is equivalent to all the other *val_t types. */
>>>  static pteval_t pte_mfn_to_pfn(pteval_t val)
>>>  {
>>> -   if (val & _PAGE_PRESENT) {
>>> +   if ((val & _PAGE_PRESENT) || ((val &
>>> (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)) {
>>> unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
>>> unsigned long pfn = mfn_to_pfn(mfn);
>>>
>>> @@ -381,7 +381,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
>>>
>>>  static pteval_t pte_pfn_to_mfn(pteval_t val)
>>>  {
>>> -   if (val & _PAGE_PRESENT) {
>>> +   if ((val & _PAGE_PRESENT) || ((val &
>>> (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)) {
>>> unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
>>> pteval_t flags = val & PTE_FLAGS_MASK;
>>> unsigned long mfn;
>>
>> Would reusing pte_present be an option? Ordinarily I expect that
>> PAGE_NUMA/PAGE_PROTNONE is only set if PAGE_PRESENT is not set and 
>> pte_present
>> is defined as
>>
>> static inline int pte_present(pte_t a)
>> {
>> return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
>>_PAGE_NUMA);
>> }
>>
>> So it looks like it work work. Of course it would need to be split to
>> reuse it within xen if pte_present was split to have a pteval_present
>> helper like so
>>
>> static inline int pteval_present(pteval_t val)
>> {
>> /*
>>  * Yes Linus, _PAGE_PROTNONE == _PAGE_NUMA. Expressing it this
>>  * way clearly states that the intent is that a protnone and numa
>>  * hinting ptes are considered present for the purposes of
>>  * pagetable operations like zapping, protection changes, gup etc.
>>  */
>> return val & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_NUMA);
>> }
>>
>> static inline int pte_present(pte_t pte)
>> {
>> return pteval_present(pte_flags(pte))
>> }
>>
>> If Xen is doing some other tricks with _PAGE_PRESENT then it might be
>> ruled out as an option. If so, then maybe it could still be made a
>> little clearer for future reference?
>
> Yes, sure, it should work, I tried it.
> Thank you Mel.
>
>>
>>
>> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
>> index c1d406f..ff621de 100644
>> --- a/arch/x86/xen/mmu.c
>> +++ b/arch/x86/xen/mmu.c
>> @@ -365,7 +365,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, 
>> unsigned long addr,
>>  /* Assume pteval_t is equivalent to all the other *val_t types. */
>>  static pteval_t pte_mfn_to_pfn(pteval_t val)
>>  {
>> -   if (val & _PAGE_PRESENT) {
>> +   if ((val & _PAGE_PRESENT) || pteval_numa(val)) {
>> 

Re: [BISECTED] Linux 3.12.7 introduces page map handling regression

2014-02-03 Thread Elena Ufimtseva
On Sun, Jan 26, 2014 at 1:02 PM, Elena Ufimtseva ufimts...@gmail.com wrote:
 On Fri, Jan 24, 2014 at 8:38 AM, Mel Gorman mgor...@suse.de wrote:
 On Thu, Jan 23, 2014 at 11:23:37AM -0500, Elena Ufimtseva wrote:
   SNIP
  
   This dump doesn't look dramatically different, either.
  
  
   The other question is - how is AutoNUMA running when it is not 
   enabled?
   Shouldn't those _PAGE_NUMA ops be nops when AutoNUMA hasn't even been
   turned on?
  
  
   Well, NUMA_BALANCING is enabled in the kernel config[1], but I 
   presume you
   mean not enabled at runtime?
  
   [1]
   http://git.uplinklabs.net/snoonan/projects/archlinux/ec2/ec2-packages.git/tree/linux-ec2/config.x86_64
 
 
 
  --
  Elena

 I was able to reproduce this consistently, also with the latest mm
 patches from yesterday.
 Can you please try this:


 Thanks Elena,

 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
 index ce563be..76dcf96 100644
 --- a/arch/x86/xen/mmu.c
 +++ b/arch/x86/xen/mmu.c
 @@ -365,7 +365,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct
 *mm, unsigned long addr,
  /* Assume pteval_t is equivalent to all the other *val_t types. */
  static pteval_t pte_mfn_to_pfn(pteval_t val)
  {
 -   if (val  _PAGE_PRESENT) {
 +   if ((val  _PAGE_PRESENT) || ((val 
 (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)) {
 unsigned long mfn = (val  PTE_PFN_MASK)  PAGE_SHIFT;
 unsigned long pfn = mfn_to_pfn(mfn);

 @@ -381,7 +381,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)

  static pteval_t pte_pfn_to_mfn(pteval_t val)
  {
 -   if (val  _PAGE_PRESENT) {
 +   if ((val  _PAGE_PRESENT) || ((val 
 (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)) {
 unsigned long pfn = (val  PTE_PFN_MASK)  PAGE_SHIFT;
 pteval_t flags = val  PTE_FLAGS_MASK;
 unsigned long mfn;

 Would reusing pte_present be an option? Ordinarily I expect that
 PAGE_NUMA/PAGE_PROTNONE is only set if PAGE_PRESENT is not set and 
 pte_present
 is defined as

 static inline int pte_present(pte_t a)
 {
 return pte_flags(a)  (_PAGE_PRESENT | _PAGE_PROTNONE |
_PAGE_NUMA);
 }

 So it looks like it work work. Of course it would need to be split to
 reuse it within xen if pte_present was split to have a pteval_present
 helper like so

 static inline int pteval_present(pteval_t val)
 {
 /*
  * Yes Linus, _PAGE_PROTNONE == _PAGE_NUMA. Expressing it this
  * way clearly states that the intent is that a protnone and numa
  * hinting ptes are considered present for the purposes of
  * pagetable operations like zapping, protection changes, gup etc.
  */
 return val  (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_NUMA);
 }

 static inline int pte_present(pte_t pte)
 {
 return pteval_present(pte_flags(pte))
 }

 If Xen is doing some other tricks with _PAGE_PRESENT then it might be
 ruled out as an option. If so, then maybe it could still be made a
 little clearer for future reference?

 Yes, sure, it should work, I tried it.
 Thank you Mel.



 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
 index c1d406f..ff621de 100644
 --- a/arch/x86/xen/mmu.c
 +++ b/arch/x86/xen/mmu.c
 @@ -365,7 +365,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, 
 unsigned long addr,
  /* Assume pteval_t is equivalent to all the other *val_t types. */
  static pteval_t pte_mfn_to_pfn(pteval_t val)
  {
 -   if (val  _PAGE_PRESENT) {
 +   if ((val  _PAGE_PRESENT) || pteval_numa(val)) {
 unsigned long mfn = (val  PTE_PFN_MASK)  PAGE_SHIFT;
 unsigned long pfn = mfn_to_pfn(mfn);

 @@ -381,7 +381,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)

  static pteval_t pte_pfn_to_mfn(pteval_t val)
  {
 -   if (val  _PAGE_PRESENT) {
 +   if ((val  _PAGE_PRESENT) || pteval_numa(val)) {
 unsigned long pfn = (val  PTE_PFN_MASK)  PAGE_SHIFT;
 pteval_t flags = val  PTE_FLAGS_MASK;
 unsigned long mfn;
 diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
 index 8e4f41d..693fe00 100644
 --- a/include/asm-generic/pgtable.h
 +++ b/include/asm-generic/pgtable.h
 @@ -654,10 +654,14 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
   * (because _PAGE_PRESENT is not set).
   */
  #ifndef pte_numa
 +static inline int pteval_numa(pteval_t pteval)
 +{
 +   return (pteval  (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
 +}
 +
  static inline int pte_numa(pte_t pte)
  {
 -   return (pte_flags(pte) 
 -   (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
 +   return pteval_numa(pte_flags(pte));
  }
  #endif




 --
 Elena

Here are two variants of this change . First one adds check for
_PAGE_NUMA flag in xen pte translations.
Second adds proposed by Mel pteval_present (comments are left
untouched :) and respective patch for xen pte translation that
uses pteval_present.
Mel, you can pick any of these two

Re: [BISECTED] Linux 3.12.7 introduces page map handling regression

2014-01-26 Thread Elena Ufimtseva
On Fri, Jan 24, 2014 at 8:38 AM, Mel Gorman  wrote:
> On Thu, Jan 23, 2014 at 11:23:37AM -0500, Elena Ufimtseva wrote:
>> >> >> 
>> >> >>
>> >> >> This dump doesn't look dramatically different, either.
>> >> >>
>> >> >>>
>> >> >>> The other question is - how is AutoNUMA running when it is not 
>> >> >>> enabled?
>> >> >>> Shouldn't those _PAGE_NUMA ops be nops when AutoNUMA hasn't even been
>> >> >>> turned on?
>> >> >>
>> >> >>
>> >> >> Well, NUMA_BALANCING is enabled in the kernel config[1], but I presume 
>> >> >> you
>> >> >> mean not enabled at runtime?
>> >> >>
>> >> >> [1]
>> >> >> http://git.uplinklabs.net/snoonan/projects/archlinux/ec2/ec2-packages.git/tree/linux-ec2/config.x86_64
>> >>
>> >>
>> >>
>> >> --
>> >> Elena
>>
>> I was able to reproduce this consistently, also with the latest mm
>> patches from yesterday.
>> Can you please try this:
>>
>
> Thanks Elena,
>
>> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
>> index ce563be..76dcf96 100644
>> --- a/arch/x86/xen/mmu.c
>> +++ b/arch/x86/xen/mmu.c
>> @@ -365,7 +365,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct
>> *mm, unsigned long addr,
>>  /* Assume pteval_t is equivalent to all the other *val_t types. */
>>  static pteval_t pte_mfn_to_pfn(pteval_t val)
>>  {
>> -   if (val & _PAGE_PRESENT) {
>> +   if ((val & _PAGE_PRESENT) || ((val &
>> (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)) {
>> unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
>> unsigned long pfn = mfn_to_pfn(mfn);
>>
>> @@ -381,7 +381,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
>>
>>  static pteval_t pte_pfn_to_mfn(pteval_t val)
>>  {
>> -   if (val & _PAGE_PRESENT) {
>> +   if ((val & _PAGE_PRESENT) || ((val &
>> (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)) {
>> unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
>> pteval_t flags = val & PTE_FLAGS_MASK;
>> unsigned long mfn;
>
> Would reusing pte_present be an option? Ordinarily I expect that
> PAGE_NUMA/PAGE_PROTNONE is only set if PAGE_PRESENT is not set and pte_present
> is defined as
>
> static inline int pte_present(pte_t a)
> {
> return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
>_PAGE_NUMA);
> }
>
> So it looks like it work work. Of course it would need to be split to
> reuse it within xen if pte_present was split to have a pteval_present
> helper like so
>
> static inline int pteval_present(pteval_t val)
> {
> /*
>  * Yes Linus, _PAGE_PROTNONE == _PAGE_NUMA. Expressing it this
>  * way clearly states that the intent is that a protnone and numa
>  * hinting ptes are considered present for the purposes of
>  * pagetable operations like zapping, protection changes, gup etc.
>  */
> return val & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_NUMA);
> }
>
> static inline int pte_present(pte_t pte)
> {
> return pteval_present(pte_flags(pte))
> }
>
> If Xen is doing some other tricks with _PAGE_PRESENT then it might be
> ruled out as an option. If so, then maybe it could still be made a
> little clearer for future reference?

Yes, sure, it should work, I tried it.
Thank you Mel.

>
>
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index c1d406f..ff621de 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -365,7 +365,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, 
> unsigned long addr,
>  /* Assume pteval_t is equivalent to all the other *val_t types. */
>  static pteval_t pte_mfn_to_pfn(pteval_t val)
>  {
> -   if (val & _PAGE_PRESENT) {
> +   if ((val & _PAGE_PRESENT) || pteval_numa(val)) {
> unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
> unsigned long pfn = mfn_to_pfn(mfn);
>
> @@ -381,7 +381,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
>
>  static pteval_t pte_pfn_to_mfn(pteval_t val)
>  {
> -   if (val & _PAGE_PRESENT) {
> +   if ((val & _PAGE_PRESENT) || pteval_numa(val)) {
> unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
> pt

Re: [BISECTED] Linux 3.12.7 introduces page map handling regression

2014-01-26 Thread Elena Ufimtseva
On Fri, Jan 24, 2014 at 8:38 AM, Mel Gorman mgor...@suse.de wrote:
 On Thu, Jan 23, 2014 at 11:23:37AM -0500, Elena Ufimtseva wrote:
   SNIP
  
   This dump doesn't look dramatically different, either.
  
  
   The other question is - how is AutoNUMA running when it is not 
   enabled?
   Shouldn't those _PAGE_NUMA ops be nops when AutoNUMA hasn't even been
   turned on?
  
  
   Well, NUMA_BALANCING is enabled in the kernel config[1], but I presume 
   you
   mean not enabled at runtime?
  
   [1]
   http://git.uplinklabs.net/snoonan/projects/archlinux/ec2/ec2-packages.git/tree/linux-ec2/config.x86_64
 
 
 
  --
  Elena

 I was able to reproduce this consistently, also with the latest mm
 patches from yesterday.
 Can you please try this:


 Thanks Elena,

 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
 index ce563be..76dcf96 100644
 --- a/arch/x86/xen/mmu.c
 +++ b/arch/x86/xen/mmu.c
 @@ -365,7 +365,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct
 *mm, unsigned long addr,
  /* Assume pteval_t is equivalent to all the other *val_t types. */
  static pteval_t pte_mfn_to_pfn(pteval_t val)
  {
 -   if (val  _PAGE_PRESENT) {
 +   if ((val  _PAGE_PRESENT) || ((val 
 (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)) {
 unsigned long mfn = (val  PTE_PFN_MASK)  PAGE_SHIFT;
 unsigned long pfn = mfn_to_pfn(mfn);

 @@ -381,7 +381,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)

  static pteval_t pte_pfn_to_mfn(pteval_t val)
  {
 -   if (val  _PAGE_PRESENT) {
 +   if ((val  _PAGE_PRESENT) || ((val 
 (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)) {
 unsigned long pfn = (val  PTE_PFN_MASK)  PAGE_SHIFT;
 pteval_t flags = val  PTE_FLAGS_MASK;
 unsigned long mfn;

 Would reusing pte_present be an option? Ordinarily I expect that
 PAGE_NUMA/PAGE_PROTNONE is only set if PAGE_PRESENT is not set and pte_present
 is defined as

 static inline int pte_present(pte_t a)
 {
 return pte_flags(a)  (_PAGE_PRESENT | _PAGE_PROTNONE |
_PAGE_NUMA);
 }

 So it looks like it work work. Of course it would need to be split to
 reuse it within xen if pte_present was split to have a pteval_present
 helper like so

 static inline int pteval_present(pteval_t val)
 {
 /*
  * Yes Linus, _PAGE_PROTNONE == _PAGE_NUMA. Expressing it this
  * way clearly states that the intent is that a protnone and numa
  * hinting ptes are considered present for the purposes of
  * pagetable operations like zapping, protection changes, gup etc.
  */
 return val  (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_NUMA);
 }

 static inline int pte_present(pte_t pte)
 {
 return pteval_present(pte_flags(pte))
 }

 If Xen is doing some other tricks with _PAGE_PRESENT then it might be
 ruled out as an option. If so, then maybe it could still be made a
 little clearer for future reference?

Yes, sure, it should work, I tried it.
Thank you Mel.



 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
 index c1d406f..ff621de 100644
 --- a/arch/x86/xen/mmu.c
 +++ b/arch/x86/xen/mmu.c
 @@ -365,7 +365,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, 
 unsigned long addr,
  /* Assume pteval_t is equivalent to all the other *val_t types. */
  static pteval_t pte_mfn_to_pfn(pteval_t val)
  {
 -   if (val  _PAGE_PRESENT) {
 +   if ((val  _PAGE_PRESENT) || pteval_numa(val)) {
 unsigned long mfn = (val  PTE_PFN_MASK)  PAGE_SHIFT;
 unsigned long pfn = mfn_to_pfn(mfn);

 @@ -381,7 +381,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)

  static pteval_t pte_pfn_to_mfn(pteval_t val)
  {
 -   if (val  _PAGE_PRESENT) {
 +   if ((val  _PAGE_PRESENT) || pteval_numa(val)) {
 unsigned long pfn = (val  PTE_PFN_MASK)  PAGE_SHIFT;
 pteval_t flags = val  PTE_FLAGS_MASK;
 unsigned long mfn;
 diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
 index 8e4f41d..693fe00 100644
 --- a/include/asm-generic/pgtable.h
 +++ b/include/asm-generic/pgtable.h
 @@ -654,10 +654,14 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
   * (because _PAGE_PRESENT is not set).
   */
  #ifndef pte_numa
 +static inline int pteval_numa(pteval_t pteval)
 +{
 +   return (pteval  (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
 +}
 +
  static inline int pte_numa(pte_t pte)
  {
 -   return (pte_flags(pte) 
 -   (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
 +   return pteval_numa(pte_flags(pte));
  }
  #endif




-- 
Elena
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [BISECTED] Linux 3.12.7 introduces page map handling regression

2014-01-23 Thread Elena Ufimtseva
On Thu, Jan 23, 2014 at 6:20 PM, Steven Noonan  wrote:
> On Thu, Jan 23, 2014 at 11:23:37AM -0500, Elena Ufimtseva wrote:
>> On Wed, Jan 22, 2014 at 3:33 PM, Steven Noonan  wrote:
>> > On Wed, Jan 22, 2014 at 03:18:50PM -0500, Elena Ufimtseva wrote:
>> >> On Wed, Jan 22, 2014 at 9:29 AM, Daniel Borkmann  
>> >> wrote:
>> >> > On 01/22/2014 08:29 AM, Steven Noonan wrote:
>> >> >>
>> >> >> On Wed, Jan 22, 2014 at 12:02:15AM -0500, Konrad Rzeszutek Wilk wrote:
>> >> >>>
>> >> >>> On Tue, Jan 21, 2014 at 07:20:45PM -0800, Steven Noonan wrote:
>> >> >>>>
>> >> >>>> On Tue, Jan 21, 2014 at 06:47:07PM -0800, Linus Torvalds wrote:
>> >> >>>>>
>> >> >>>>> On Tue, Jan 21, 2014 at 5:49 PM, Greg Kroah-Hartman
>> >> >>>>>  wrote:
>> >> >>>
>> >> >>>
>> >> >>> Adding extra folks to the party.
>> >> >>>>>>
>> >> >>>>>>
>> >> >>>>>> Odds are this also shows up in 3.13, right?
>> >> >>>>
>> >> >>>>
>> >> >>>> Reproduced using 3.13 on the PV guest:
>> >> >>>>
>> >> >>>> [  368.756763] BUG: Bad page map in process mp
>> >> >>>> pte:8004a67c6165 pmd:e9b706067
>> >> >>>> [  368.756777] page:ea001299f180 count:0 mapcount:-1
>> >> >>>> mapping:  (null) index:0x0
>> >> >>>> [  368.756781] page flags: 0x2f8014(referenced|dirty)
>> >> >>>> [  368.756786] addr:7fd1388b7000 vm_flags:00100071
>> >> >>>> anon_vma:880e9ba15f80 mapping:  (null) index:7fd1388b7
>> >> >>>> [  368.756792] CPU: 29 PID: 618 Comm: mp Not tainted 
>> >> >>>> 3.13.0-ec2
>> >> >>>> #1
>> >> >>>> [  368.756795]  880e9b718958 880e9eaf3cc0
>> >> >>>> 814d8748 7fd1388b7000
>> >> >>>> [  368.756803]  880e9eaf3d08 8116d289
>> >> >>>>  
>> >> >>>> [  368.756809]  880e9b7065b8 ea001299f180
>> >> >>>> 7fd1388b8000 880e9eaf3e30
>> >> >>>> [  368.756815] Call Trace:
>> >> >>>> [  368.756825]  [] dump_stack+0x45/0x56
>> >> >>>> [  368.756833]  [] 
>> >> >>>> print_bad_pte+0x229/0x250
>> >> >>>> [  368.756837]  []
>> >> >>>> unmap_single_vma+0x583/0x890
>> >> >>>> [  368.756842]  [] unmap_vmas+0x65/0x90
>> >> >>>> [  368.756847]  [] unmap_region+0xac/0x120
>> >> >>>> [  368.756852]  [] ? 
>> >> >>>> vma_rb_erase+0x1c9/0x210
>> >> >>>> [  368.756856]  [] do_munmap+0x280/0x370
>> >> >>>> [  368.756860]  [] vm_munmap+0x41/0x60
>> >> >>>> [  368.756864]  [] SyS_munmap+0x22/0x30
>> >> >>>> [  368.756869]  []
>> >> >>>> system_call_fastpath+0x1a/0x1f
>> >> >>>> [  368.756872] Disabling lock debugging due to kernel taint
>> >> >>>> [  368.760084] BUG: Bad rss-counter state mm:880e9d079680
>> >> >>>> idx:0 val:-1
>> >> >>>> [  368.760091] BUG: Bad rss-counter state mm:880e9d079680
>> >> >>>> idx:1 val:1
>> >> >>>>
>> >> >>>>>
>> >> >>>>> Probably. I don't have a Xen PV setup to test with (and very little
>> >> >>>>> interest in setting one up).. And I have a suspicion that it might 
>> >> >>>>> not
>> >> >>>>> be so much about Xen PV, as perhaps about the kind of hardware.
>> >> >>>>>
>> >> >>>>> I suspect the issue has something to do with the magic _PAGE_NUMA
>> >> >>>>> tie-in with _PAGE_PRESENT. And then mprotect(PROT_NONE) ends up
>> >> >>>>> removing the _PAGE_PRESEN

Re: [BISECTED] Linux 3.12.7 introduces page map handling regression

2014-01-23 Thread Elena Ufimtseva
On Wed, Jan 22, 2014 at 3:33 PM, Steven Noonan  wrote:
> On Wed, Jan 22, 2014 at 03:18:50PM -0500, Elena Ufimtseva wrote:
>> On Wed, Jan 22, 2014 at 9:29 AM, Daniel Borkmann  
>> wrote:
>> > On 01/22/2014 08:29 AM, Steven Noonan wrote:
>> >>
>> >> On Wed, Jan 22, 2014 at 12:02:15AM -0500, Konrad Rzeszutek Wilk wrote:
>> >>>
>> >>> On Tue, Jan 21, 2014 at 07:20:45PM -0800, Steven Noonan wrote:
>> >>>>
>> >>>> On Tue, Jan 21, 2014 at 06:47:07PM -0800, Linus Torvalds wrote:
>> >>>>>
>> >>>>> On Tue, Jan 21, 2014 at 5:49 PM, Greg Kroah-Hartman
>> >>>>>  wrote:
>> >>>
>> >>>
>> >>> Adding extra folks to the party.
>> >>>>>>
>> >>>>>>
>> >>>>>> Odds are this also shows up in 3.13, right?
>> >>>>
>> >>>>
>> >>>> Reproduced using 3.13 on the PV guest:
>> >>>>
>> >>>> [  368.756763] BUG: Bad page map in process mp
>> >>>> pte:8004a67c6165 pmd:e9b706067
>> >>>> [  368.756777] page:ea001299f180 count:0 mapcount:-1
>> >>>> mapping:  (null) index:0x0
>> >>>> [  368.756781] page flags: 0x2f8014(referenced|dirty)
>> >>>> [  368.756786] addr:7fd1388b7000 vm_flags:00100071
>> >>>> anon_vma:880e9ba15f80 mapping:  (null) index:7fd1388b7
>> >>>> [  368.756792] CPU: 29 PID: 618 Comm: mp Not tainted 3.13.0-ec2
>> >>>> #1
>> >>>> [  368.756795]  880e9b718958 880e9eaf3cc0
>> >>>> 814d8748 7fd1388b7000
>> >>>> [  368.756803]  880e9eaf3d08 8116d289
>> >>>>  
>> >>>> [  368.756809]  880e9b7065b8 ea001299f180
>> >>>> 7fd1388b8000 880e9eaf3e30
>> >>>> [  368.756815] Call Trace:
>> >>>> [  368.756825]  [] dump_stack+0x45/0x56
>> >>>> [  368.756833]  [] print_bad_pte+0x229/0x250
>> >>>> [  368.756837]  []
>> >>>> unmap_single_vma+0x583/0x890
>> >>>> [  368.756842]  [] unmap_vmas+0x65/0x90
>> >>>> [  368.756847]  [] unmap_region+0xac/0x120
>> >>>> [  368.756852]  [] ? vma_rb_erase+0x1c9/0x210
>> >>>> [  368.756856]  [] do_munmap+0x280/0x370
>> >>>> [  368.756860]  [] vm_munmap+0x41/0x60
>> >>>> [  368.756864]  [] SyS_munmap+0x22/0x30
>> >>>> [  368.756869]  []
>> >>>> system_call_fastpath+0x1a/0x1f
>> >>>> [  368.756872] Disabling lock debugging due to kernel taint
>> >>>> [  368.760084] BUG: Bad rss-counter state mm:880e9d079680
>> >>>> idx:0 val:-1
>> >>>> [  368.760091] BUG: Bad rss-counter state mm:880e9d079680
>> >>>> idx:1 val:1
>> >>>>
>> >>>>>
>> >>>>> Probably. I don't have a Xen PV setup to test with (and very little
>> >>>>> interest in setting one up).. And I have a suspicion that it might not
>> >>>>> be so much about Xen PV, as perhaps about the kind of hardware.
>> >>>>>
>> >>>>> I suspect the issue has something to do with the magic _PAGE_NUMA
>> >>>>> tie-in with _PAGE_PRESENT. And then mprotect(PROT_NONE) ends up
>> >>>>> removing the _PAGE_PRESENT bit, and now the crazy numa code is
>> >>>>> confused.
>> >>>>>
>> >>>>> The whole _PAGE_NUMA thing is a f*cking horrible hack, and shares the
>> >>>>> bit with _PAGE_PROTNONE, which is why it then has that tie-in to
>> >>>>> _PAGE_PRESENT.
>> >>>>>
>> >>>>> Adding Andrea to the Cc, because he's the author of that horridness.
>> >>>>> Putting Steven's test-case here as an attachement for Andrea, maybe
>> >>>>> that makes him go "Ahh, yes, silly case".
>> >>>>>
>> >>>>> Also added Kirill, because he was involved the last _PAGE_NUMA debacle.
>> >>>>>
>> >>>>> Andrea, you ca

Re: [BISECTED] Linux 3.12.7 introduces page map handling regression

2014-01-23 Thread Elena Ufimtseva
On Wed, Jan 22, 2014 at 3:33 PM, Steven Noonan ste...@uplinklabs.net wrote:
 On Wed, Jan 22, 2014 at 03:18:50PM -0500, Elena Ufimtseva wrote:
 On Wed, Jan 22, 2014 at 9:29 AM, Daniel Borkmann borkm...@iogearbox.net 
 wrote:
  On 01/22/2014 08:29 AM, Steven Noonan wrote:
 
  On Wed, Jan 22, 2014 at 12:02:15AM -0500, Konrad Rzeszutek Wilk wrote:
 
  On Tue, Jan 21, 2014 at 07:20:45PM -0800, Steven Noonan wrote:
 
  On Tue, Jan 21, 2014 at 06:47:07PM -0800, Linus Torvalds wrote:
 
  On Tue, Jan 21, 2014 at 5:49 PM, Greg Kroah-Hartman
  gre...@linuxfoundation.org wrote:
 
 
  Adding extra folks to the party.
 
 
  Odds are this also shows up in 3.13, right?
 
 
  Reproduced using 3.13 on the PV guest:
 
  [  368.756763] BUG: Bad page map in process mp
  pte:8004a67c6165 pmd:e9b706067
  [  368.756777] page:ea001299f180 count:0 mapcount:-1
  mapping:  (null) index:0x0
  [  368.756781] page flags: 0x2f8014(referenced|dirty)
  [  368.756786] addr:7fd1388b7000 vm_flags:00100071
  anon_vma:880e9ba15f80 mapping:  (null) index:7fd1388b7
  [  368.756792] CPU: 29 PID: 618 Comm: mp Not tainted 3.13.0-ec2
  #1
  [  368.756795]  880e9b718958 880e9eaf3cc0
  814d8748 7fd1388b7000
  [  368.756803]  880e9eaf3d08 8116d289
   
  [  368.756809]  880e9b7065b8 ea001299f180
  7fd1388b8000 880e9eaf3e30
  [  368.756815] Call Trace:
  [  368.756825]  [814d8748] dump_stack+0x45/0x56
  [  368.756833]  [8116d289] print_bad_pte+0x229/0x250
  [  368.756837]  [8116eae3]
  unmap_single_vma+0x583/0x890
  [  368.756842]  [8116feb5] unmap_vmas+0x65/0x90
  [  368.756847]  [81175dac] unmap_region+0xac/0x120
  [  368.756852]  [81176379] ? vma_rb_erase+0x1c9/0x210
  [  368.756856]  [81177f10] do_munmap+0x280/0x370
  [  368.756860]  [81178041] vm_munmap+0x41/0x60
  [  368.756864]  [81178f32] SyS_munmap+0x22/0x30
  [  368.756869]  [814e70ed]
  system_call_fastpath+0x1a/0x1f
  [  368.756872] Disabling lock debugging due to kernel taint
  [  368.760084] BUG: Bad rss-counter state mm:880e9d079680
  idx:0 val:-1
  [  368.760091] BUG: Bad rss-counter state mm:880e9d079680
  idx:1 val:1
 
 
  Probably. I don't have a Xen PV setup to test with (and very little
  interest in setting one up).. And I have a suspicion that it might not
  be so much about Xen PV, as perhaps about the kind of hardware.
 
  I suspect the issue has something to do with the magic _PAGE_NUMA
  tie-in with _PAGE_PRESENT. And then mprotect(PROT_NONE) ends up
  removing the _PAGE_PRESENT bit, and now the crazy numa code is
  confused.
 
  The whole _PAGE_NUMA thing is a f*cking horrible hack, and shares the
  bit with _PAGE_PROTNONE, which is why it then has that tie-in to
  _PAGE_PRESENT.
 
  Adding Andrea to the Cc, because he's the author of that horridness.
   Putting Steven's test-case here as an attachement for Andrea, maybe
  that makes him go Ahh, yes, silly case.
 
  Also added Kirill, because he was involved the last _PAGE_NUMA debacle.
 
  Andrea, you can find the thread on lkml, but it boils down to commit
  1667918b6483 (backported to 3.12.7 as 3d792d616ba4) breaking the
  attached test-case (but apparently only under Xen PV). There it
  apparently causes a BUG: Bad page map .. error.
 
 
  I *think* it is due to the fact that pmd_numa and pte_numa is getting the
  _raw_
  value of PMDs and PTEs. That is - it does not use the pvops interface
  and instead reads the values directly from the page-table. Since the
  page-table is also manipulated by the hypervisor - there are certain
  flags it also sets to do its business. It might be that it uses
  _PAGE_GLOBAL as well - and Linux picks up on that. If it was using
  pte_flags that would invoke the pvops interface.
 
  Elena, Dariof and George, you guys had been looking at this a bit deeper
  than I have. Does the Xen hypervisor use the _PAGE_GLOBAL for PV guests?

It does use _PAGE_GLOBAL for guest user pages

 
  This not-compiled-totally-bad-patch might shed some light on what I was
  thinking _could_ fix this issue - and IS NOT A FIX - JUST A HACK.
  It does not fix it for PMDs naturally (as there are no PMD paravirt ops
  for that).
 
 
  Unfortunately the Totally Bad Patch seems to make no difference. I am
  still able to repro the issue:

 Steven, do you use numa=fake on boot cmd line for pv guest?

 I had similar issue on pv guest. Let me check if the fix that resolved
 this for me will help with 3.13.

 Nope:

 # cat /proc/cmdline
 root=/dev/xvda1 ro rootwait rootfstype=ext4 nomodeset console=hvc0 
 earlyprintk=xen,verbose loglevel=7



 
 
  Maybe this one is also related to this BUG here (cc'ed people investigating
  this one

Re: [BISECTED] Linux 3.12.7 introduces page map handling regression

2014-01-23 Thread Elena Ufimtseva
On Thu, Jan 23, 2014 at 6:20 PM, Steven Noonan ste...@uplinklabs.net wrote:
 On Thu, Jan 23, 2014 at 11:23:37AM -0500, Elena Ufimtseva wrote:
 On Wed, Jan 22, 2014 at 3:33 PM, Steven Noonan ste...@uplinklabs.net wrote:
  On Wed, Jan 22, 2014 at 03:18:50PM -0500, Elena Ufimtseva wrote:
  On Wed, Jan 22, 2014 at 9:29 AM, Daniel Borkmann borkm...@iogearbox.net 
  wrote:
   On 01/22/2014 08:29 AM, Steven Noonan wrote:
  
   On Wed, Jan 22, 2014 at 12:02:15AM -0500, Konrad Rzeszutek Wilk wrote:
  
   On Tue, Jan 21, 2014 at 07:20:45PM -0800, Steven Noonan wrote:
  
   On Tue, Jan 21, 2014 at 06:47:07PM -0800, Linus Torvalds wrote:
  
   On Tue, Jan 21, 2014 at 5:49 PM, Greg Kroah-Hartman
   gre...@linuxfoundation.org wrote:
  
  
   Adding extra folks to the party.
  
  
   Odds are this also shows up in 3.13, right?
  
  
   Reproduced using 3.13 on the PV guest:
  
   [  368.756763] BUG: Bad page map in process mp
   pte:8004a67c6165 pmd:e9b706067
   [  368.756777] page:ea001299f180 count:0 mapcount:-1
   mapping:  (null) index:0x0
   [  368.756781] page flags: 0x2f8014(referenced|dirty)
   [  368.756786] addr:7fd1388b7000 vm_flags:00100071
   anon_vma:880e9ba15f80 mapping:  (null) index:7fd1388b7
   [  368.756792] CPU: 29 PID: 618 Comm: mp Not tainted 
   3.13.0-ec2
   #1
   [  368.756795]  880e9b718958 880e9eaf3cc0
   814d8748 7fd1388b7000
   [  368.756803]  880e9eaf3d08 8116d289
    
   [  368.756809]  880e9b7065b8 ea001299f180
   7fd1388b8000 880e9eaf3e30
   [  368.756815] Call Trace:
   [  368.756825]  [814d8748] dump_stack+0x45/0x56
   [  368.756833]  [8116d289] 
   print_bad_pte+0x229/0x250
   [  368.756837]  [8116eae3]
   unmap_single_vma+0x583/0x890
   [  368.756842]  [8116feb5] unmap_vmas+0x65/0x90
   [  368.756847]  [81175dac] unmap_region+0xac/0x120
   [  368.756852]  [81176379] ? 
   vma_rb_erase+0x1c9/0x210
   [  368.756856]  [81177f10] do_munmap+0x280/0x370
   [  368.756860]  [81178041] vm_munmap+0x41/0x60
   [  368.756864]  [81178f32] SyS_munmap+0x22/0x30
   [  368.756869]  [814e70ed]
   system_call_fastpath+0x1a/0x1f
   [  368.756872] Disabling lock debugging due to kernel taint
   [  368.760084] BUG: Bad rss-counter state mm:880e9d079680
   idx:0 val:-1
   [  368.760091] BUG: Bad rss-counter state mm:880e9d079680
   idx:1 val:1
  
  
   Probably. I don't have a Xen PV setup to test with (and very little
   interest in setting one up).. And I have a suspicion that it might 
   not
   be so much about Xen PV, as perhaps about the kind of hardware.
  
   I suspect the issue has something to do with the magic _PAGE_NUMA
   tie-in with _PAGE_PRESENT. And then mprotect(PROT_NONE) ends up
   removing the _PAGE_PRESENT bit, and now the crazy numa code is
   confused.
  
   The whole _PAGE_NUMA thing is a f*cking horrible hack, and shares 
   the
   bit with _PAGE_PROTNONE, which is why it then has that tie-in to
   _PAGE_PRESENT.
  
   Adding Andrea to the Cc, because he's the author of that horridness.
    Putting Steven's test-case here as an attachment for Andrea, maybe
   that makes him go Ahh, yes, silly case.
  
   Also added Kirill, because he was involved the last _PAGE_NUMA 
   debacle.
  
   Andrea, you can find the thread on lkml, but it boils down to commit
   1667918b6483 (backported to 3.12.7 as 3d792d616ba4) breaking the
   attached test-case (but apparently only under Xen PV). There it
   apparently causes a BUG: Bad page map .. error.
  
  
   I *think* it is due to the fact that pmd_numa and pte_numa is getting 
   the
   _raw_
   value of PMDs and PTEs. That is - it does not use the pvops interface
   and instead reads the values directly from the page-table. Since the
   page-table is also manipulated by the hypervisor - there are certain
   flags it also sets to do its business. It might be that it uses
   _PAGE_GLOBAL as well - and Linux picks up on that. If it was using
   pte_flags that would invoke the pvops interface.
  
   Elena, Dariof and George, you guys had been looking at this a bit 
   deeper
   than I have. Does the Xen hypervisor use the _PAGE_GLOBAL for PV 
   guests?

 It does use _PAGE_GLOBAL for guest user pages

  
   This not-compiled-totally-bad-patch might shed some light on what I 
   was
   thinking _could_ fix this issue - and IS NOT A FIX - JUST A HACK.
   It does not fix it for PMDs naturally (as there are no PMD paravirt 
   ops
   for that).
  
  
   Unfortunately the Totally Bad Patch seems to make no difference. I am
   still able to repro the issue:
 
  Steven, do you use numa=fake on boot cmd line for pv guest?
 
  I had similar issue on pv guest. Let me check

Re: [BISECTED] Linux 3.12.7 introduces page map handling regression

2014-01-22 Thread Elena Ufimtseva
On Wed, Jan 22, 2014 at 9:29 AM, Daniel Borkmann  wrote:
> On 01/22/2014 08:29 AM, Steven Noonan wrote:
>>
>> On Wed, Jan 22, 2014 at 12:02:15AM -0500, Konrad Rzeszutek Wilk wrote:
>>>
>>> On Tue, Jan 21, 2014 at 07:20:45PM -0800, Steven Noonan wrote:

 On Tue, Jan 21, 2014 at 06:47:07PM -0800, Linus Torvalds wrote:
>
> On Tue, Jan 21, 2014 at 5:49 PM, Greg Kroah-Hartman
>  wrote:
>>>
>>>
>>> Adding extra folks to the party.
>>
>>
>> Odds are this also shows up in 3.13, right?


 Reproduced using 3.13 on the PV guest:

 [  368.756763] BUG: Bad page map in process mp
 pte:8004a67c6165 pmd:e9b706067
 [  368.756777] page:ea001299f180 count:0 mapcount:-1
 mapping:  (null) index:0x0
 [  368.756781] page flags: 0x2f8014(referenced|dirty)
 [  368.756786] addr:7fd1388b7000 vm_flags:00100071
 anon_vma:880e9ba15f80 mapping:  (null) index:7fd1388b7
 [  368.756792] CPU: 29 PID: 618 Comm: mp Not tainted 3.13.0-ec2
 #1
 [  368.756795]  880e9b718958 880e9eaf3cc0
 814d8748 7fd1388b7000
 [  368.756803]  880e9eaf3d08 8116d289
  
 [  368.756809]  880e9b7065b8 ea001299f180
 7fd1388b8000 880e9eaf3e30
 [  368.756815] Call Trace:
 [  368.756825]  [] dump_stack+0x45/0x56
 [  368.756833]  [] print_bad_pte+0x229/0x250
 [  368.756837]  []
 unmap_single_vma+0x583/0x890
 [  368.756842]  [] unmap_vmas+0x65/0x90
 [  368.756847]  [] unmap_region+0xac/0x120
 [  368.756852]  [] ? vma_rb_erase+0x1c9/0x210
 [  368.756856]  [] do_munmap+0x280/0x370
 [  368.756860]  [] vm_munmap+0x41/0x60
 [  368.756864]  [] SyS_munmap+0x22/0x30
 [  368.756869]  []
 system_call_fastpath+0x1a/0x1f
 [  368.756872] Disabling lock debugging due to kernel taint
 [  368.760084] BUG: Bad rss-counter state mm:880e9d079680
 idx:0 val:-1
 [  368.760091] BUG: Bad rss-counter state mm:880e9d079680
 idx:1 val:1

>
> Probably. I don't have a Xen PV setup to test with (and very little
> interest in setting one up).. And I have a suspicion that it might not
> be so much about Xen PV, as perhaps about the kind of hardware.
>
> I suspect the issue has something to do with the magic _PAGE_NUMA
> tie-in with _PAGE_PRESENT. And then mprotect(PROT_NONE) ends up
> removing the _PAGE_PRESENT bit, and now the crazy numa code is
> confused.
>
> The whole _PAGE_NUMA thing is a f*cking horrible hack, and shares the
> bit with _PAGE_PROTNONE, which is why it then has that tie-in to
> _PAGE_PRESENT.
>
> Adding Andrea to the Cc, because he's the author of that horridness.
> Putting Steven's test-case here as an attachment for Andrea, maybe
> that makes him go "Ahh, yes, silly case".
>
> Also added Kirill, because he was involved the last _PAGE_NUMA debacle.
>
> Andrea, you can find the thread on lkml, but it boils down to commit
> 1667918b6483 (backported to 3.12.7 as 3d792d616ba4) breaking the
> attached test-case (but apparently only under Xen PV). There it
> apparently causes a "BUG: Bad page map .." error.
>>>
>>>
>>> I *think* it is due to the fact that pmd_numa and pte_numa is getting the
>>> _raw_
>>> value of PMDs and PTEs. That is - it does not use the pvops interface
>>> and instead reads the values directly from the page-table. Since the
>>> page-table is also manipulated by the hypervisor - there are certain
>>> flags it also sets to do its business. It might be that it uses
>>> _PAGE_GLOBAL as well - and Linux picks up on that. If it was using
>>> pte_flags that would invoke the pvops interface.
>>>
>>> Elena, Dariof and George, you guys had been looking at this a bit deeper
>>> than I have. Does the Xen hypervisor use the _PAGE_GLOBAL for PV guests?
>>>
>>> This not-compiled-totally-bad-patch might shed some light on what I was
>>> thinking _could_ fix this issue - and IS NOT A FIX - JUST A HACK.
>>> It does not fix it for PMDs naturally (as there are no PMD paravirt ops
>>> for that).
>>
>>
>> Unfortunately the Totally Bad Patch seems to make no difference. I am
>> still able to repro the issue:

Steven, do you use numa=fake on boot cmd line for pv guest?

I had similar issue on pv guest. Let me check if the fix that resolved
this for me will help with 3.13.


>
>
> Maybe this one is also related to this BUG here (cc'ed people investigating
> this one) ...
>
>   https://lkml.org/lkml/2014/1/10/427
>
> ... not sure, though.
>
>
>> [  346.374929] BUG: Bad page map in process mp
>> pte:8004ae928065 pmd:e993f9067
>> [  346.374942] page:ea0012ba4a00 count:0 mapcount:-1 mapping:
>> 

Re: [BISECTED] Linux 3.12.7 introduces page map handling regression

2014-01-22 Thread Elena Ufimtseva
On Wed, Jan 22, 2014 at 9:29 AM, Daniel Borkmann borkm...@iogearbox.net wrote:
 On 01/22/2014 08:29 AM, Steven Noonan wrote:

 On Wed, Jan 22, 2014 at 12:02:15AM -0500, Konrad Rzeszutek Wilk wrote:

 On Tue, Jan 21, 2014 at 07:20:45PM -0800, Steven Noonan wrote:

 On Tue, Jan 21, 2014 at 06:47:07PM -0800, Linus Torvalds wrote:

 On Tue, Jan 21, 2014 at 5:49 PM, Greg Kroah-Hartman
 gre...@linuxfoundation.org wrote:


 Adding extra folks to the party.


 Odds are this also shows up in 3.13, right?


 Reproduced using 3.13 on the PV guest:

 [  368.756763] BUG: Bad page map in process mp
 pte:8004a67c6165 pmd:e9b706067
 [  368.756777] page:ea001299f180 count:0 mapcount:-1
 mapping:  (null) index:0x0
 [  368.756781] page flags: 0x2f8014(referenced|dirty)
 [  368.756786] addr:7fd1388b7000 vm_flags:00100071
 anon_vma:880e9ba15f80 mapping:  (null) index:7fd1388b7
 [  368.756792] CPU: 29 PID: 618 Comm: mp Not tainted 3.13.0-ec2
 #1
 [  368.756795]  880e9b718958 880e9eaf3cc0
 814d8748 7fd1388b7000
 [  368.756803]  880e9eaf3d08 8116d289
  
 [  368.756809]  880e9b7065b8 ea001299f180
 7fd1388b8000 880e9eaf3e30
 [  368.756815] Call Trace:
 [  368.756825]  [814d8748] dump_stack+0x45/0x56
 [  368.756833]  [8116d289] print_bad_pte+0x229/0x250
 [  368.756837]  [8116eae3]
 unmap_single_vma+0x583/0x890
 [  368.756842]  [8116feb5] unmap_vmas+0x65/0x90
 [  368.756847]  [81175dac] unmap_region+0xac/0x120
 [  368.756852]  [81176379] ? vma_rb_erase+0x1c9/0x210
 [  368.756856]  [81177f10] do_munmap+0x280/0x370
 [  368.756860]  [81178041] vm_munmap+0x41/0x60
 [  368.756864]  [81178f32] SyS_munmap+0x22/0x30
 [  368.756869]  [814e70ed]
 system_call_fastpath+0x1a/0x1f
 [  368.756872] Disabling lock debugging due to kernel taint
 [  368.760084] BUG: Bad rss-counter state mm:880e9d079680
 idx:0 val:-1
 [  368.760091] BUG: Bad rss-counter state mm:880e9d079680
 idx:1 val:1


 Probably. I don't have a Xen PV setup to test with (and very little
 interest in setting one up).. And I have a suspicion that it might not
 be so much about Xen PV, as perhaps about the kind of hardware.

 I suspect the issue has something to do with the magic _PAGE_NUMA
 tie-in with _PAGE_PRESENT. And then mprotect(PROT_NONE) ends up
 removing the _PAGE_PRESENT bit, and now the crazy numa code is
 confused.

 The whole _PAGE_NUMA thing is a f*cking horrible hack, and shares the
 bit with _PAGE_PROTNONE, which is why it then has that tie-in to
 _PAGE_PRESENT.

 Adding Andrea to the Cc, because he's the author of that horridness.
 Putting Steven's test-case here as an attachment for Andrea, maybe
 that makes him go Ahh, yes, silly case.

 Also added Kirill, because he was involved the last _PAGE_NUMA debacle.

 Andrea, you can find the thread on lkml, but it boils down to commit
 1667918b6483 (backported to 3.12.7 as 3d792d616ba4) breaking the
 attached test-case (but apparently only under Xen PV). There it
 apparently causes a BUG: Bad page map .. error.


 I *think* it is due to the fact that pmd_numa and pte_numa is getting the
 _raw_
 value of PMDs and PTEs. That is - it does not use the pvops interface
 and instead reads the values directly from the page-table. Since the
 page-table is also manipulated by the hypervisor - there are certain
 flags it also sets to do its business. It might be that it uses
 _PAGE_GLOBAL as well - and Linux picks up on that. If it was using
 pte_flags that would invoke the pvops interface.

 Elena, Dariof and George, you guys had been looking at this a bit deeper
 than I have. Does the Xen hypervisor use the _PAGE_GLOBAL for PV guests?

 This not-compiled-totally-bad-patch might shed some light on what I was
 thinking _could_ fix this issue - and IS NOT A FIX - JUST A HACK.
 It does not fix it for PMDs naturally (as there are no PMD paravirt ops
 for that).


 Unfortunately the Totally Bad Patch seems to make no difference. I am
 still able to repro the issue:

Steven, do you use numa=fake on boot cmd line for pv guest?

I had similar issue on pv guest. Let me check if the fix that resolved
this for me will help with 3.13.




 Maybe this one is also related to this BUG here (cc'ed people investigating
 this one) ...

   https://lkml.org/lkml/2014/1/10/427

 ... not sure, though.


 [  346.374929] BUG: Bad page map in process mp
 pte:8004ae928065 pmd:e993f9067
 [  346.374942] page:ea0012ba4a00 count:0 mapcount:-1 mapping:
 (null) index:0x0
 [  346.374946] page flags: 0x2f8014(referenced|dirty)
 [  346.374951] addr:7f06a9bbb000 vm_flags:00100071
 anon_vma:880e9939fe00 mapping:  (null) 

Re: [Xen-devel] [PATCH v2 0/2] xen: vnuma introduction for pv guest

2013-12-19 Thread Elena Ufimtseva
On Fri, Dec 20, 2013 at 2:39 AM, Elena Ufimtseva  wrote:
> On Wed, Dec 4, 2013 at 8:13 PM, Dario Faggioli
>  wrote:
>> On mer, 2013-12-04 at 01:20 -0500, Elena Ufimtseva wrote:
>>> On Tue, Dec 3, 2013 at 7:35 PM, Elena Ufimtseva  wrote:
>>> > Oh guys, I feel really bad about not replying to these emails... Somehow 
>>> > these
>>> > replies all got deleted.. weird.
>>> >
>> No worries... You should see *my* backlog. :-P
>>
>>> > Ok, about that automatic balancing. At the moment of the last patch
>>> > automatic numa balancing seem to
>>> > work, but after rebasing on the top of 3.12-rc2 I see similar issues.
>>> > I will try to figure out what commits broke and will contact Ingo
>>> > Molnar and Mel Gorman.
>>> >
>>> As of now I have patch v4 for reviewing. Not sure if it will be
>>> beneficial to post it for review
>>> or look closer at the current problem.
>>>
>> You mean the Linux side? Perhaps stick somewhere a reference to the git
>> tree/branch where it lives, but, before re-sending, let's wait for it to
>> be as issue free as we can tell?
>>
>>> The issue I am seeing right now is different from what was happening before.
>>> The corruption happens when on change_prot_numa way :
>>>
>> Ok, so, I think I need to step back a bit from the actual stack trace
>> and look at the big picture. Please, Elena or anyone, correct me if I'm
>> saying something wrong about how Linux's autonuma works and interacts
>> with Xen.
>>
>> The way it worked when I last looked at it was sort of like this:
>>  - there was a kthread scanning all the pages, removing the PAGE_PRESENT
>>bit from actually present pages, and adding a new special one
>>(PAGE_NUMA or something like that);
>>  - when a page fault is triggered and the PAGE_NUMA flag is found, it
>>figures out the page is actually there, so no swap or anything.
>>However, it tracks from what node the access to that page came from,
>>matches it with the node where the page actually is and collect some
>>statistics about that;
>>  - at some point (and here I don't remember the exact logic, since it
>>changed quite a few times) pages ranking badly in the stats above are
>>moved from one node to another.
>
> Hello Dario, Konrad.
>
> - Yes, there is a kernel worker that runs on each node and scans some
> pages stats and
> marks them as _PROT_NONE and resets _PAGE_PRESENT.
> The page fault at this moment is triggered and control is being
> returned back to the linux pv kernel
> to process with handle_mm_fault and page numa fault handler if
> discovered if that was a numa pmd/pte with
> present flag cleared.
> About the stats, I will have to collect some sensible information.
>
>>
>> Is this description still accurate? If yes, here's what I would (double)
>> check, when running this in a PV guest on top of Xen:
>>
>>  1. the NUMA hinting page fault, are we getting and handling them
>> correctly in the PV guest? Are the stats in the guest kernel being
>> updated in a sensible way, i.e., do they make sense and properly
>> relate to the virtual topology of the guest?
>> At some point we thought it would have been necessary to intercept
>> these faults and make sure the above is true with some help from the
>> hypervisor... Is this the case? Why? Why not?
>
> The real help needed from hypervisor is to allow _PAGE_NUMA flags on
> pte/pmd entries.
> I have done so in hypervisor by utilizing same _PAGE_NUMA bit and
> including into the allowed bit mask.
> As this bit is the same as PAGE_GLOBAL in hypervisor, that may induce
> some other errors. So far I have not seen any
> and I will double check on this.
>
>>
>>  2. what happens when autonuma tries to move pages from one node to
>> another? For us, that would mean in moving from one virtual node
>> to another... Is there a need to do anything at all? I mean, is
>> this, from our perspective, just copying the content of an MFN from
>> node X into another MFN on node Y, or do we need to update some of
>> our vnuma tracking data structures in Xen?
>>
>> If we have this figured out already, then I think we just chase bugs and
>> repost the series. If not, well, I think we should. :-D
>>
> here is the best part :)
>
> After a fresh look at the numa autobalancing, applying recent patches,
> talking some to riel who works now on mm numa autobalancing and
> running some tests including dd, ltp, kernel co

Re: [Xen-devel] [PATCH v2 0/2] xen: vnuma introduction for pv guest

2013-12-19 Thread Elena Ufimtseva
On Wed, Dec 4, 2013 at 8:13 PM, Dario Faggioli
 wrote:
> On mer, 2013-12-04 at 01:20 -0500, Elena Ufimtseva wrote:
>> On Tue, Dec 3, 2013 at 7:35 PM, Elena Ufimtseva  wrote:
>> > Oh guys, I feel really bad about not replying to these emails... Somehow 
>> > these
>> > replies all got deleted.. weird.
>> >
> No worries... You should see *my* backlog. :-P
>
>> > Ok, about that automatic balancing. At the moment of the last patch
>> > automatic numa balancing seem to
>> > work, but after rebasing on the top of 3.12-rc2 I see similar issues.
>> > I will try to figure out what commits broke and will contact Ingo
>> > Molnar and Mel Gorman.
>> >
>> As of now I have patch v4 for reviewing. Not sure if it will be
>> beneficial to post it for review
>> or look closer at the current problem.
>>
> You mean the Linux side? Perhaps stick somewhere a reference to the git
> tree/branch where it lives, but, before re-sending, let's wait for it to
> be as issue free as we can tell?
>
>> The issue I am seeing right now is different from what was happening before.
>> The corruption happens when on change_prot_numa way :
>>
> Ok, so, I think I need to step back a bit from the actual stack trace
> and look at the big picture. Please, Elena or anyone, correct me if I'm
> saying something wrong about how Linux's autonuma works and interacts
> with Xen.
>
> The way it worked when I last looked at it was sort of like this:
>  - there was a kthread scanning all the pages, removing the PAGE_PRESENT
>bit from actually present pages, and adding a new special one
>(PAGE_NUMA or something like that);
>  - when a page fault is triggered and the PAGE_NUMA flag is found, it
>figures out the page is actually there, so no swap or anything.
>However, it tracks from what node the access to that page came from,
>matches it with the node where the page actually is and collect some
>statistics about that;
>  - at some point (and here I don't remember the exact logic, since it
>changed quite a few times) pages ranking badly in the stats above are
>moved from one node to another.

Hello Dario, Konrad.

- Yes, there is a kernel worker that runs on each node and scans some
pages stats and
marks them as _PROT_NONE and resets _PAGE_PRESENT.
The page fault at this moment is triggered and control is being
returned back to the linux pv kernel
to process with handle_mm_fault and page numa fault handler if
discovered if that was a numa pmd/pte with
present flag cleared.
About the stats, I will have to collect some sensible information.

>
> Is this description still accurate? If yes, here's what I would (double)
> check, when running this in a PV guest on top of Xen:
>
>  1. the NUMA hinting page fault, are we getting and handling them
> correctly in the PV guest? Are the stats in the guest kernel being
> updated in a sensible way, i.e., do they make sense and properly
> relate to the virtual topology of the guest?
> At some point we thought it would have been necessary to intercept
> these faults and make sure the above is true with some help from the
> hypervisor... Is this the case? Why? Why not?

The real help needed from hypervisor is to allow _PAGE_NUMA flags on
pte/pmd entries.
I have done so in hypervisor by utilizing same _PAGE_NUMA bit and
including into the allowed bit mask.
As this bit is the same as PAGE_GLOBAL in hypervisor, that may induce
some other errors. So far I have not seen any
and I will double check on this.

>
>  2. what happens when autonuma tries to move pages from one node to
> another? For us, that would mean in moving from one virtual node
> to another... Is there a need to do anything at all? I mean, is
> this, from our perspective, just copying the content of an MFN from
> node X into another MFN on node Y, or do we need to update some of
> our vnuma tracking data structures in Xen?
>
> If we have this figured out already, then I think we just chase bugs and
> repost the series. If not, well, I think we should. :-D
>
here is the best part :)

After a fresh look at the numa autobalancing, applying recent patches,
talking some to riel who works now on mm numa autobalancing and
running some tests including dd, ltp, kernel compiling and my own
tests, autobalancing now is working
correctly with vnuma. Now I can see successfully migrated pages in /proc/vmstat:

numa_pte_updates 39
numa_huge_pte_updates 0
numa_hint_faults 36
numa_hint_faults_local 23
numa_pages_migrated 4
pgmigrate_success 4
pgmigrate_fail 0

I will be running some tests with transparent huge pages as the
migration of such will be failing.
Probably it is possible to find all the patches related to numa
autobalan

Re: [Xen-devel] [PATCH v2 0/2] xen: vnuma introduction for pv guest

2013-12-19 Thread Elena Ufimtseva
On Wed, Dec 4, 2013 at 8:13 PM, Dario Faggioli
dario.faggi...@citrix.com wrote:
 On mer, 2013-12-04 at 01:20 -0500, Elena Ufimtseva wrote:
 On Tue, Dec 3, 2013 at 7:35 PM, Elena Ufimtseva ufimts...@gmail.com wrote:
  Oh guys, I feel really bad about not replying to these emails... Somehow 
  these
  replies all got deleted.. weird.
 
 No worries... You should see *my* backlog. :-P

  Ok, about that automatic balancing. At the moment of the last patch
  automatic numa balancing seem to
  work, but after rebasing on the top of 3.12-rc2 I see similar issues.
  I will try to figure out what commits broke and will contact Ingo
  Molnar and Mel Gorman.
 
 As of now I have patch v4 for reviewing. Not sure if it will be
 beneficial to post it for review
 or look closer at the current problem.

 You mean the Linux side? Perhaps stick somewhere a reference to the git
 tree/branch where it lives, but, before re-sending, let's wait for it to
 be as issue free as we can tell?

 The issue I am seeing right now is different from what was happening before.
 The corruption happens when on change_prot_numa way :

 Ok, so, I think I need to step back a bit from the actual stack trace
 and look at the big picture. Please, Elena or anyone, correct me if I'm
 saying something wrong about how Linux's autonuma works and interacts
 with Xen.

 The way it worked when I last looked at it was sort of like this:
  - there was a kthread scanning all the pages, removing the PAGE_PRESENT
bit from actually present pages, and adding a new special one
(PAGE_NUMA or something like that);
  - when a page fault is triggered and the PAGE_NUMA flag is found, it
figures out the page is actually there, so no swap or anything.
However, it tracks from what node the access to that page came from,
matches it with the node where the page actually is and collect some
statistics about that;
  - at some point (and here I don't remember the exact logic, since it
changed quite a few times) pages ranking badly in the stats above are
moved from one node to another.

Hello Dario, Konrad.

- Yes, there is a kernel worker that runs on each node and scans some
pages stats and
marks them as _PROT_NONE and resets _PAGE_PRESENT.
The page fault at this moment is triggered and control is being
returned back to the linux pv kernel
to process with handle_mm_fault and page numa fault handler if
discovered if that was a numa pmd/pte with
present flag cleared.
About the stats, I will have to collect some sensible information.


 Is this description still accurate? If yes, here's what I would (double)
 check, when running this in a PV guest on top of Xen:

  1. the NUMA hinting page fault, are we getting and handling them
 correctly in the PV guest? Are the stats in the guest kernel being
 updated in a sensible way, i.e., do they make sense and properly
 relate to the virtual topology of the guest?
 At some point we thought it would have been necessary to intercept
 these faults and make sure the above is true with some help from the
 hypervisor... Is this the case? Why? Why not?

The real help needed from hypervisor is to allow _PAGE_NUMA flags on
pte/pmd entries.
I have done so in hypervisor by utilizing same _PAGE_NUMA bit and
including into the allowed bit mask.
As this bit is the same as PAGE_GLOBAL in hypervisor, that may induce
some other errors. So far I have not seen any
and I will double check on this.


  2. what happens when autonuma tries to move pages from one node to
 another? For us, that would mean in moving from one virtual node
 to another... Is there a need to do anything at all? I mean, is
 this, from our perspective, just copying the content of an MFN from
 node X into another MFN on node Y, or do we need to update some of
 our vnuma tracking data structures in Xen?

 If we have this figured out already, then I think we just chase bugs and
 repost the series. If not, well, I think we should. :-D

here is the best part :)

After a fresh look at the numa autobalancing, applying recent patches,
talking some to riel who works now on mm numa autobalancing and
running some tests including dd, ltp, kernel compiling and my own
tests, autobalancing now is working
correctly with vnuma. Now I can see successfully migrated pages in /proc/vmstat:

numa_pte_updates 39
numa_huge_pte_updates 0
numa_hint_faults 36
numa_hint_faults_local 23
numa_pages_migrated 4
pgmigrate_success 4
pgmigrate_fail 0

I will be running some tests with transparent huge pages as the
migration of such will be failing.
Probably it is possible to find all the patches related to numa
autobalancing and figure out possible reasons
of why previously balancing was not working. Given the amount of work
kernel folks spent recently to fix
issues with numa and the significance of the changes itself, I might
need few more attempts to understand it.

I am going to test THP and if that works will follow up with patches.

Dario

Re: [Xen-devel] [PATCH v2 0/2] xen: vnuma introduction for pv guest

2013-12-19 Thread Elena Ufimtseva
On Fri, Dec 20, 2013 at 2:39 AM, Elena Ufimtseva ufimts...@gmail.com wrote:
 On Wed, Dec 4, 2013 at 8:13 PM, Dario Faggioli
 dario.faggi...@citrix.com wrote:
 On mer, 2013-12-04 at 01:20 -0500, Elena Ufimtseva wrote:
 On Tue, Dec 3, 2013 at 7:35 PM, Elena Ufimtseva ufimts...@gmail.com wrote:
  Oh guys, I feel really bad about not replying to these emails... Somehow 
  these
  replies all got deleted.. weird.
 
 No worries... You should see *my* backlog. :-P

  Ok, about that automatic balancing. At the moment of the last patch
  automatic numa balancing seem to
  work, but after rebasing on the top of 3.12-rc2 I see similar issues.
  I will try to figure out what commits broke and will contact Ingo
  Molnar and Mel Gorman.
 
 As of now I have patch v4 for reviewing. Not sure if it will be
 beneficial to post it for review
 or look closer at the current problem.

 You mean the Linux side? Perhaps stick somewhere a reference to the git
 tree/branch where it lives, but, before re-sending, let's wait for it to
 be as issue free as we can tell?

 The issue I am seeing right now is different from what was happening before.
 The corruption happens when on change_prot_numa way :

 Ok, so, I think I need to step back a bit from the actual stack trace
 and look at the big picture. Please, Elena or anyone, correct me if I'm
 saying something wrong about how Linux's autonuma works and interacts
 with Xen.

 The way it worked when I last looked at it was sort of like this:
  - there was a kthread scanning all the pages, removing the PAGE_PRESENT
bit from actually present pages, and adding a new special one
(PAGE_NUMA or something like that);
  - when a page fault is triggered and the PAGE_NUMA flag is found, it
figures out the page is actually there, so no swap or anything.
However, it tracks from what node the access to that page came from,
matches it with the node where the page actually is and collect some
statistics about that;
  - at some point (and here I don't remember the exact logic, since it
changed quite a few times) pages ranking badly in the stats above are
moved from one node to another.

 Hello Dario, Konrad.

 - Yes, there is a kernel worker that runs on each node and scans some
 pages stats and
 marks them as _PROT_NONE and resets _PAGE_PRESENT.
 The page fault at this moment is triggered and control is being
 returned back to the linux pv kernel
 to process with handle_mm_fault and page numa fault handler if
 discovered if that was a numa pmd/pte with
 present flag cleared.
 About the stats, I will have to collect some sensible information.


 Is this description still accurate? If yes, here's what I would (double)
 check, when running this in a PV guest on top of Xen:

  1. the NUMA hinting page fault, are we getting and handling them
 correctly in the PV guest? Are the stats in the guest kernel being
 updated in a sensible way, i.e., do they make sense and properly
 relate to the virtual topology of the guest?
 At some point we thought it would have been necessary to intercept
 these faults and make sure the above is true with some help from the
 hypervisor... Is this the case? Why? Why not?

 The real help needed from hypervisor is to allow _PAGE_NUMA flags on
 pte/pmd entries.
 I have done so in hypervisor by utilizing same _PAGE_NUMA bit and
 including into the allowed bit mask.
 As this bit is the same as PAGE_GLOBAL in hypervisor, that may induce
 some other errors. So far I have not seen any
 and I will double check on this.


  2. what happens when autonuma tries to move pages from one node to
 another? For us, that would mean in moving from one virtual node
 to another... Is there a need to do anything at all? I mean, is
 this, from our perspective, just copying the content of an MFN from
 node X into another MFN on node Y, or do we need to update some of
 our vnuma tracking data structures in Xen?

 If we have this figured out already, then I think we just chase bugs and
 repost the series. If not, well, I think we should. :-D

 here is the best part :)

 After a fresh look at the numa autobalancing, applying recent patches,
 talking some to riel who works now on mm numa autobalancing and
 running some tests including dd, ltp, kernel compiling and my own
 tests, autobalancing now is working
 correctly with vnuma. Now I can see successfully migrated pages in 
 /proc/vmstat:

 numa_pte_updates 39
 numa_huge_pte_updates 0
 numa_hint_faults 36
 numa_hint_faults_local 23
 numa_pages_migrated 4
 pgmigrate_success 4
 pgmigrate_fail 0

 I will be running some tests with transparent huge pages as the
 migration of such will be failing.
 Probably it is possible to find all the patches related to numa
 autobalancing and figure out possible reasons
 of why previously balancing was not working. Giving the amount of work
 kernel folks spent recently to fix
 issues with numa and the significance of the changes itself, I might

Re: [Xen-devel] [PATCH v2 0/2] xen: vnuma introduction for pv guest

2013-12-03 Thread Elena Ufimtseva
On Tue, Dec 3, 2013 at 7:35 PM, Elena Ufimtseva  wrote:
> On Tue, Nov 19, 2013 at 1:29 PM, Dario Faggioli
>  wrote:
>> On mar, 2013-11-19 at 10:38 -0500, Konrad Rzeszutek Wilk wrote:
>>> On Mon, Nov 18, 2013 at 03:25:48PM -0500, Elena Ufimtseva wrote:
>>> > The patchset introduces vnuma to paravirtualized Xen guests
>>> > running as domU.
>>> > Xen subop hypercall is used to retrieve vnuma topology information.
>>> > Based on the retrieved topology from Xen, NUMA number of nodes,
>>> > memory ranges, distance table and cpumask is being set.
>>> > If initialization is incorrect, sets 'dummy' node and unsets
>>> > nodemask.
>>> > vNUMA topology is constructed by Xen toolstack. Xen patchset is
>>> > available at https://git.gitorious.org/xenvnuma/xenvnuma.git:v3.
>>>
>>> Yeey!
>>>
>> :-)
>>
>>> One question - I know you had questions about the
>>> PROT_GLOBAL | ~PAGE_PRESENT being set on PTEs that are going to
>>> be harvested for AutoNUMA balancing.
>>>
>>> And that the hypercall to set such PTE entry disallows the
>>> PROT_GLOBAL (it stripts it off)? That means that when the
>>> Linux page system kicks in (as it has ~PAGE_PRESENT) the
>>> Linux pagehandler won't see the PROT_GLOBAL (as it has
>>> been filtered out). Which means that the AutoNUMA code won't
>>> kick in.
>>>
>>> (see http://article.gmane.org/gmane.comp.emulators.xen.devel/174317)
>>>
>>> Was that problem ever answered?
>>>
>> I think the issue is a twofold one.
>>
>> If I remember correctly (Elena, please, correct me if I'm wrong) Elena
>> was seeing _crashes_ with both vNUMA and AutoNUMA enabled for the guest.
>> That's what pushed her to investigate the issue, and led to what you're
>> summing up above.
>>
>> However, it appears the crash was due to something completely unrelated
>> to Xen and vNUMA, was affecting baremetal too, and got fixed, which
>> means the crash is now gone.
>>
>> It remains to be seen (I think) whether that also means that AutoNUMA
>> works. In fact, chatting about this in Edinburgh, Elena managed to
>> convince me pretty badly that we should --as part of the vNUMA support--
>> do something about this, in order to make it work. At that time I
>> thought we should be doing something to avoid the system to go ka-boom,
>> but as I said, even now that it does not crash anymore, she was so
>> persuasive that I now find it quite hard to believe that we really don't
>> need to do anything. :-P
>
> Yes, you were right Dario :) See at the end. pv guests do not crash,
> but they have user space memory corruption.
> Ok, so I will try to understand what again had happened during this
> weekend.
> Meanwhile posting patches for Xen.
>
>>
>> I guess, as soon as we get the chance, we should see if this actually
>> works, i.e., in addition to seeing the proper topology and not crashing,
>> verify that AutoNUMA in the guest is actually doing its job.
>>
>> What do you think? Again, Elena, please chime in and explain how things
>> are, if I got something wrong. :-)
>>
>
> Oh guys, I feel really bad about not replying to these emails... Somehow these
> replies all got deleted.. weird.
>
> Ok, about that automatic balancing. At the moment of the last patch
> automatic numa balancing seem to
> work, but after rebasing on the top of 3.12-rc2 I see similar issues.
> I will try to figure out what commits broke and will contact Ingo
> Molnar and Mel Gorman.
>
> Konrad,
> as of PROT_GLOBAL flag, I will double check once more to exclude
> errors from my side.
> Last time I was able to have numa_balancing working without any
> modifications from hypervisor side.
> But again, I want to double check this, some experiments might have
> appear being good :)
>
>
>> Regards,
>> Dario
>>
>> --
>> <<This happens because I choose it to happen!>> (Raistlin Majere)
>> -
>> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
>> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
>>
>

As of now I have patch v4 for reviewing. Not sure if it will be
beneficial to post it for review
or look closer at the current problem.
The issue I am seeing right now is different from what was happening before.
The corruption happens when on change_prot_numa way :

[ 6638.021439]  pfn 45e602, highest_memmap_pfn - 14ddd7
[ 6638.021444] BUG: Bad page map in process dd  pte:80045e602166
pmd:abf1a067
[ 6638.021449] addr:7f4fda2d8000 vm_flags:00

Re: [Xen-devel] [PATCH v2 0/2] xen: vnuma introduction for pv guest

2013-12-03 Thread Elena Ufimtseva
On Tue, Nov 19, 2013 at 1:29 PM, Dario Faggioli
 wrote:
> On mar, 2013-11-19 at 10:38 -0500, Konrad Rzeszutek Wilk wrote:
>> On Mon, Nov 18, 2013 at 03:25:48PM -0500, Elena Ufimtseva wrote:
>> > The patchset introduces vnuma to paravirtualized Xen guests
>> > running as domU.
>> > Xen subop hypercall is used to retrieve vnuma topology information.
>> > Based on the retrieved topology from Xen, NUMA number of nodes,
>> > memory ranges, distance table and cpumask is being set.
>> > If initialization is incorrect, sets 'dummy' node and unsets
>> > nodemask.
>> > vNUMA topology is constructed by Xen toolstack. Xen patchset is
>> > available at https://git.gitorious.org/xenvnuma/xenvnuma.git:v3.
>>
>> Yeey!
>>
> :-)
>
>> One question - I know you had questions about the
>> PROT_GLOBAL | ~PAGE_PRESENT being set on PTEs that are going to
>> be harvested for AutoNUMA balancing.
>>
>> And that the hypercall to set such PTE entry disallows the
>> PROT_GLOBAL (it stripts it off)? That means that when the
>> Linux page system kicks in (as it has ~PAGE_PRESENT) the
>> Linux pagehandler won't see the PROT_GLOBAL (as it has
>> been filtered out). Which means that the AutoNUMA code won't
>> kick in.
>>
>> (see http://article.gmane.org/gmane.comp.emulators.xen.devel/174317)
>>
>> Was that problem ever answered?
>>
> I think the issue is a twofold one.
>
> If I remember correctly (Elena, please, correct me if I'm wrong) Elena
> was seeing _crashes_ with both vNUMA and AutoNUMA enabled for the guest.
> That's what pushed her to investigate the issue, and led to what you're
> summing up above.
>
> However, it appears the crash was due to something completely unrelated
> to Xen and vNUMA, was affecting baremetal too, and got fixed, which
> means the crash is now gone.
>
> It remains to be seen (I think) whether that also means that AutoNUMA
> works. In fact, chatting about this in Edinburgh, Elena managed to
> convince me pretty badly that we should --as part of the vNUMA support--
> do something about this, in order to make it work. At that time I
> thought we should be doing something to avoid the system to go ka-boom,
> but as I said, even now that it does not crash anymore, she was so
> persuasive that I now find it quite hard to believe that we really don't
> need to do anything. :-P

Yes, you were right Dario :) See at the end. pv guests do not crash,
but they have user space memory corruption.
Ok, so I will try to understand what again had happened during this
weekend.
Meanwhile posting patches for Xen.

>
> I guess, as soon as we get the chance, we should see if this actually
> works, i.e., in addition to seeing the proper topology and not crashing,
> verify that AutoNUMA in the guest is actually doing its job.
>
> What do you think? Again, Elena, please chime in and explain how things
> are, if I got something wrong. :-)
>

Oh guys, I feel really bad about not replying to these emails... Somehow these
replies all got deleted.. weird.

Ok, about that automatic balancing. At the moment of the last patch
automatic numa balancing seem to
work, but after rebasing on the top of 3.12-rc2 I see similar issues.
I will try to figure out what commits broke and will contact Ingo
Molnar and Mel Gorman.

Konrad,
as of PROT_GLOBAL flag, I will double check once more to exclude
errors from my side.
Last time I was able to have numa_balancing working without any
modifications from hypervisor side.
But again, I want to double check this, some experiments might have
appear being good :)


> Regards,
> Dario
>
> --
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
>



-- 
Elena
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Xen-devel] [PATCH v2 0/2] xen: vnuma introduction for pv guest

2013-12-03 Thread Elena Ufimtseva
On Tue, Nov 19, 2013 at 1:29 PM, Dario Faggioli
dario.faggi...@citrix.com wrote:
 On mar, 2013-11-19 at 10:38 -0500, Konrad Rzeszutek Wilk wrote:
 On Mon, Nov 18, 2013 at 03:25:48PM -0500, Elena Ufimtseva wrote:
  The patchset introduces vnuma to paravirtualized Xen guests
  running as domU.
  Xen subop hypercall is used to retrieve vnuma topology information.
  Based on the retrieved topology from Xen, NUMA number of nodes,
  memory ranges, distance table and cpumask is being set.
  If initialization is incorrect, sets 'dummy' node and unsets
  nodemask.
  vNUMA topology is constructed by Xen toolstack. Xen patchset is
  available at https://git.gitorious.org/xenvnuma/xenvnuma.git:v3.

 Yeey!

 :-)

 One question - I know you had questions about the
 PROT_GLOBAL | ~PAGE_PRESENT being set on PTEs that are going to
 be harvested for AutoNUMA balancing.

 And that the hypercall to set such PTE entry disallows the
 PROT_GLOBAL (it stripts it off)? That means that when the
 Linux page system kicks in (as it has ~PAGE_PRESENT) the
 Linux pagehandler won't see the PROT_GLOBAL (as it has
 been filtered out). Which means that the AutoNUMA code won't
 kick in.

 (see http://article.gmane.org/gmane.comp.emulators.xen.devel/174317)

 Was that problem ever answered?

 I think the issue is a twofold one.

 If I remember correctly (Elena, please, correct me if I'm wrong) Elena
 was seeing _crashes_ with both vNUMA and AutoNUMA enabled for the guest.
 That's what pushed her to investigate the issue, and led to what you're
 summing up above.

 However, it appears the crash was due to something completely unrelated
 to Xen and vNUMA, was affecting baremetal too, and got fixed, which
 means the crash is now gone.

 It remains to be seen (I think) whether that also means that AutoNUMA
 works. In fact, chatting about this in Edinburgh, Elena managed to
 convince me pretty badly that we should --as part of the vNUMA support--
 do something about this, in order to make it work. At that time I
 thought we should be doing something to avoid the system to go ka-boom,
 but as I said, even now that it does not crash anymore, she was so
 persuasive that I now find it quite hard to believe that we really don't
 need to do anything. :-P

Yes, you were right Dario :) See at the end. pv guests do not crash,
but they have user space memory corruption.
Ok, so I will try to understand what again had happened during this
weekend.
Meanwhile posting patches for Xen.


 I guess, as soon as we get the chance, we should see if this actually
 works, i.e., in addition to seeing the proper topology and not crashing,
 verify that AutoNUMA in the guest is actually doing its job.

 What do you think? Again, Elena, please chime in and explain how things
 are, if I got something wrong. :-)


Oh guys, I feel really bad about not replying to these emails... Somehow these
replies all got deleted.. weird.

Ok, about that automatic balancing. At the moment of the last patch
automatic numa balancing seem to
work, but after rebasing on the top of 3.12-rc2 I see similar issues.
I will try to figure out what commits broke and will contact Ingo
Molnar and Mel Gorman.

Konrad,
as of PROT_GLOBAL flag, I will double check once more to exclude
errors from my side.
Last time I was able to have numa_balancing working without any
modifications from hypervisor side.
But again, I want to double check this, some experiments might have
appear being good :)


 Regards,
 Dario

 --
 This happens because I choose it to happen! (Raistlin Majere)
 -
 Dario Faggioli, Ph.D, http://about.me/dario.faggioli
 Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)




-- 
Elena
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Xen-devel] [PATCH v2 0/2] xen: vnuma introduction for pv guest

2013-12-03 Thread Elena Ufimtseva
On Tue, Dec 3, 2013 at 7:35 PM, Elena Ufimtseva ufimts...@gmail.com wrote:
 On Tue, Nov 19, 2013 at 1:29 PM, Dario Faggioli
 dario.faggi...@citrix.com wrote:
 On mar, 2013-11-19 at 10:38 -0500, Konrad Rzeszutek Wilk wrote:
 On Mon, Nov 18, 2013 at 03:25:48PM -0500, Elena Ufimtseva wrote:
  The patchset introduces vnuma to paravirtualized Xen guests
  running as domU.
  Xen subop hypercall is used to retrieve vnuma topology information.
  Based on the retrieved topology from Xen, NUMA number of nodes,
  memory ranges, distance table and cpumask is being set.
  If initialization is incorrect, sets 'dummy' node and unsets
  nodemask.
  vNUMA topology is constructed by Xen toolstack. Xen patchset is
  available at https://git.gitorious.org/xenvnuma/xenvnuma.git:v3.

 Yeey!

 :-)

 One question - I know you had questions about the
 PROT_GLOBAL | ~PAGE_PRESENT being set on PTEs that are going to
 be harvested for AutoNUMA balancing.

 And that the hypercall to set such PTE entry disallows the
 PROT_GLOBAL (it stripts it off)? That means that when the
 Linux page system kicks in (as it has ~PAGE_PRESENT) the
 Linux pagehandler won't see the PROT_GLOBAL (as it has
 been filtered out). Which means that the AutoNUMA code won't
 kick in.

 (see http://article.gmane.org/gmane.comp.emulators.xen.devel/174317)

 Was that problem ever answered?

 I think the issue is a twofold one.

 If I remember correctly (Elena, please, correct me if I'm wrong) Elena
 was seeing _crashes_ with both vNUMA and AutoNUMA enabled for the guest.
 That's what pushed her to investigate the issue, and led to what you're
 summing up above.

 However, it appears the crash was due to something completely unrelated
 to Xen and vNUMA, was affecting baremetal too, and got fixed, which
 means the crash is now gone.

 It remains to be seen (I think) whether that also means that AutoNUMA
 works. In fact, chatting about this in Edinburgh, Elena managed to
 convince me pretty badly that we should --as part of the vNUMA support--
 do something about this, in order to make it work. At that time I
 thought we should be doing something to avoid the system to go ka-boom,
 but as I said, even now that it does not crash anymore, she was so
 persuasive that I now find it quite hard to believe that we really don't
 need to do anything. :-P

 Yes, you were right Dario :) See at the end. pv guests do not crash,
 but they have user space memory corruption.
 Ok, so I will try to understand what again had happened during this
 weekend.
 Meanwhile posting patches for Xen.


 I guess, as soon as we get the chance, we should see if this actually
 works, i.e., in addition to seeing the proper topology and not crashing,
 verify that AutoNUMA in the guest is actually doing its job.

 What do you think? Again, Elena, please chime in and explain how things
 are, if I got something wrong. :-)


 Oh guys, I feel really bad about not replying to these emails... Somehow these
 replies all got deleted.. weird.

 Ok, about that automatic balancing. At the moment of the last patch
 automatic numa balancing seem to
 work, but after rebasing on the top of 3.12-rc2 I see similar issues.
 I will try to figure out what commits broke and will contact Ingo
 Molnar and Mel Gorman.

 Konrad,
 as of PROT_GLOBAL flag, I will double check once more to exclude
 errors from my side.
 Last time I was able to have numa_balancing working without any
 modifications from hypervisor side.
 But again, I want to double check this, some experiments might have
 appear being good :)


 Regards,
 Dario

 --
 This happens because I choose it to happen! (Raistlin Majere)
 -
 Dario Faggioli, Ph.D, http://about.me/dario.faggioli
 Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)



As of now I have patch v4 for reviewing. Not sure if it will be
beneficial to post it for review
or look closer at the current problem.
The issue I am seeing right now is different from what was happening before.
The corruption happens when on change_prot_numa way :

[ 6638.021439]  pfn 45e602, highest_memmap_pfn - 14ddd7
[ 6638.021444] BUG: Bad page map in process dd  pte:80045e602166
pmd:abf1a067
[ 6638.021449] addr:7f4fda2d8000 vm_flags:00100073
anon_vma:8800abf77b90 mapping:  (null) index:7f4fda2d8
[ 6638.021457] CPU: 1 PID: 1033 Comm: dd Tainted: GB   W3.13.0-rc2+ #10
[ 6638.021462]   7f4fda2d8000 813ca5b1
88010d68deb8
[ 6638.021471]  810f2c88 abf1a067 80045e602166

[ 6638.021482]  0045e602 88010d68deb8 7f4fda2d8000
80045e602166
[ 6638.021492] Call Trace:
[ 6638.021497]  [813ca5b1] ? dump_stack+0x41/0x51
[ 6638.021503]  [810f2c88] ? print_bad_pte+0x19d/0x1c9
[ 6638.021509]  [810f3aef] ? vm_normal_page+0x94/0xb3
[ 6638.021519]  [810fb788] ? change_protection+0x35c/0x5a8
[ 6638.021527]  [81107965

Re: [PATCH v2] checkpatch.pl: Check for functions without a real prototype

2013-11-22 Thread Elena Ufimtseva
On Fri, Nov 22, 2013 at 5:45 PM, Joe Perches  wrote:
> On Fri, 2013-11-22 at 17:17 -0500, Elena Ufimtseva wrote:
>> Based on Richard Weinberger patch https://lkml.org/lkml/2012/3/16/510
>> Functions like this one are evil:
>>
>> void foo()
>> {
>>   ...
>> }
> []
>> - added white space in regular expression between (), based
>> on comments from Joe Perches.
>
> Thanks Elena, but this is already applied in -next.
>
>
No problem, thanks Joe )

-- 
Elena
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] checkpatch.pl: Check for functions without a real prototype

2013-11-22 Thread Elena Ufimtseva
Based on Richard Weinberger patch https://lkml.org/lkml/2012/3/16/510
Functions like this one are evil:

void foo()
{
...
}

Signed-off-by: Elena Ufimtseva 
---
Changes since v1:
- added white space in regular expression between (), based
on comments from Joe Perches.

 scripts/checkpatch.pl |   13 +
 1 file changed, 13 insertions(+)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 61090e0..08c95c0 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2785,6 +2785,19 @@ sub process {
  "open brace '{' following function declarations 
go on the next line\n" . $herecurr);
}
 
+   if ($line =~ /(\b$Type\s+$Ident)\s*\(\s*\)/) {
+   ERROR("FUNCTION_NO_PROTOTYPE",
+"Bad function definition - $1() should probably be $1(void)\n" . $herecurr .
+"\nThou shalt not, in the language of C, under any circumstances, on the
+pain of death, declare or define a function with an empty set of
+parentheses, for though in the language of C++ it meaneth the same as
+(void), in C it meaneth (...) which is of meaningless as there be no
+anchor argument by which the types of the varadic arguments can be
+expressed, and which misleadeth the compiler into allowing unsavory code
+and in some cases generate really ugly stuff for varadic handling.
+   -hpa\n");
+   }
+
 # open braces for enum, union and struct go on the same line.
if ($line =~ /^.\s*{/ &&
$prevline =~ 
/^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?\s*$/) {
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] checkpatch.pl: Check for functions without a real prototype

2013-11-22 Thread Elena Ufimtseva
Based on Richard Weinberger patch https://lkml.org/lkml/2012/3/16/510
Functions like this one are evil:

void foo()
{
...
}

Signed-off-by: Elena Ufimtseva ufimts...@gmail.com
---
Changes since v1:
- added white space in regular expression between (), based
on comments from Joe Perches.

 scripts/checkpatch.pl |   13 +
 1 file changed, 13 insertions(+)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 61090e0..08c95c0 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2785,6 +2785,19 @@ sub process {
  open brace '{' following function declarations 
go on the next line\n . $herecurr);
}
 
+   if ($line =~ /(\b$Type\s+$Ident)\s*\(\s*\)/) {
+   ERROR(FUNCTION_NO_PROTOTYPE,
+Bad function definition - $1() should probably be $1(void)\n . $herecurr .
+\nThou shalt not, in the language of C, under any circumstances, on the
+pain of death, declare or define a function with an empty set of
+parentheses, for though in the language of C++ it meaneth the same as
+(void), in C it meaneth (...) which is of meaningless as there be no
+anchor argument by which the types of the varadic arguments can be
+expressed, and which misleadeth the compiler into allowing unsavory code
+and in some cases generate really ugly stuff for varadic handling.
+   -hpa\n);
+   }
+
 # open braces for enum, union and struct go on the same line.
if ($line =~ /^.\s*{/ 
$prevline =~ 
/^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?\s*$/) {
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] checkpatch.pl: Check for functions without a real prototype

2013-11-22 Thread Elena Ufimtseva
On Fri, Nov 22, 2013 at 5:45 PM, Joe Perches j...@perches.com wrote:
 On Fri, 2013-11-22 at 17:17 -0500, Elena Ufimtseva wrote:
 Based on Richard Weinberger patch https://lkml.org/lkml/2012/3/16/510
 Functions like this one are evil:

 void foo()
 {
   ...
 }
 []
 - added white space in regular expression between (), based
 on comments from Joe Perches.

 Thanks Elena, but this is already applied in -next.


No problem, thanks Joe )

-- 
Elena
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH RESEND v2 1/2] xen: vnuma support for PV guests running as domU

2013-11-18 Thread Elena Ufimtseva
Issues Xen hypercall subop XENMEM_get_vnumainfo and sets the
NUMA topology, otherwise sets dummy NUMA node and prevents
numa_init from calling other numa initializers as they don't
work with pv guests.

Signed-off-by: Elena Ufimtseva 
---
 arch/x86/include/asm/xen/vnuma.h |   12 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|2 +-
 arch/x86/xen/vnuma.c |  127 ++
 include/xen/interface/memory.h   |   43 +
 5 files changed, 186 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

diff --git a/arch/x86/include/asm/xen/vnuma.h b/arch/x86/include/asm/xen/vnuma.h
new file mode 100644
index 000..aee4e92
--- /dev/null
+++ b/arch/x86/include/asm/xen/vnuma.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_VNUMA_H
+#define _ASM_X86_VNUMA_H
+
+#ifdef CONFIG_XEN
+bool xen_vnuma_supported(void);
+int xen_numa_init(void);
+#else
+static inline bool xen_vnuma_supported(void) { return false; };
+static inline int xen_numa_init(void) { return -1; };
+#endif
+
+#endif /* _ASM_X86_VNUMA_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 24aec58..99efa1b 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include "asm/xen/vnuma.h"
 
 #include "numa_internal.h"
 
@@ -632,6 +633,8 @@ static int __init dummy_numa_init(void)
 void __init x86_numa_init(void)
 {
if (!numa_off) {
+   if (!numa_init(xen_numa_init))
+   return;
 #ifdef CONFIG_X86_NUMAQ
if (!numa_init(numaq_numa_init))
return;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c0..de9deab 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -13,7 +13,7 @@ CFLAGS_mmu.o  := $(nostackp)
 obj-y  := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \
-   p2m.o
+   p2m.o vnuma.o
 
 obj-$(CONFIG_EVENT_TRACING) += trace.o
 
diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
new file mode 100644
index 000..caa2178
--- /dev/null
+++ b/arch/x86/xen/vnuma.c
@@ -0,0 +1,127 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_NUMA
+
+/* Checks if hypercall is supported */
+bool xen_vnuma_supported(void)
+{
+   return HYPERVISOR_memory_op(XENMEM_get_vnuma_info, NULL)
+   == -ENOSYS ? false : true;
+}
+
+/*
+ * Called from numa_init if numa_off = 0;
+ * we set numa_off = 0 if xen_vnuma_supported()
+ * returns true and its a domU;
+ */
+int __init xen_numa_init(void)
+{
+   int rc;
+   unsigned int i, j, nr_nodes, cpu, idx, pcpus;
+   u64 physm, physd, physc;
+   unsigned int *vdistance, *cpu_to_node;
+   unsigned long mem_size, dist_size, cpu_to_node_size;
+   struct vmemrange *vblock;
+
+   struct vnuma_topology_info numa_topo = {
+   .domid = DOMID_SELF,
+   .__pad = 0
+   };
+   rc = -EINVAL;
+   physm = physd = physc = 0;
+
+   /* For now only PV guests are supported */
+   if (!xen_pv_domain())
+   return rc;
+
+   pcpus = num_possible_cpus();
+
+   mem_size =  pcpus * sizeof(struct vmemrange);
+   dist_size = pcpus * pcpus * sizeof(*numa_topo.distance);
+   cpu_to_node_size = pcpus * sizeof(*numa_topo.cpu_to_node);
+
+   physm = memblock_alloc(mem_size, PAGE_SIZE);
+   vblock = __va(physm);
+
+   physd = memblock_alloc(dist_size, PAGE_SIZE);
+   vdistance  = __va(physd);
+
+   physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
+   cpu_to_node  = __va(physc);
+
+   if (!physm || !physc || !physd)
+   goto out;
+
+   set_xen_guest_handle(numa_topo.nr_nodes, _nodes);
+   set_xen_guest_handle(numa_topo.memrange, vblock);
+   set_xen_guest_handle(numa_topo.distance, vdistance);
+   set_xen_guest_handle(numa_topo.cpu_to_node, cpu_to_node);
+
+   rc = HYPERVISOR_memory_op(XENMEM_get_vnuma_info, _topo);
+
+   if (rc < 0)
+   goto out;
+   nr_nodes = *numa_topo.nr_nodes;
+   if (nr_nodes == 0)
+   goto out;
+   if (nr_nodes > num_possible_cpus()) {
+   pr_debug("vNUMA: Node without cpu is not supported in this 
version.\n");
+   goto out;
+   }
+
+   /*
+* NUMA nodes memory ranges are in pfns, constructed and
+* aligned based on e820 ram domain map.
+*/
+   for (i = 0; i < nr_nodes; i++) {
+   if (numa_add_memblk(i, vblock[i].start, vblock[i].end))
+   goto out;
+   node_set(i, numa_nodes_parsed);
+   }
+
+   setup_nr

[PATCH RESEND v2 0/2] xen: vnuma introduction for pv guest

2013-11-18 Thread Elena Ufimtseva
Xen vnuma introduction.

The patchset introduces vnuma to paravirtualized Xen guests
running as domU.
Xen subop hypercall is used to retrieve vnuma topology information.
Based on the retrieved topology from Xen, NUMA number of nodes,
memory ranges, distance table and cpumask is being set.
If initialization is incorrect, sets 'dummy' node and unsets
nodemask. vNUMA topology is constructed by Xen toolstack.

Example of vnuma enabled pv domain dmesg:

[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x1000-0x0009]
[0.00]   node   0: [mem 0x0010-0x]
[0.00]   node   1: [mem 0x1-0x1]
[0.00]   node   2: [mem 0x2-0x2]
[0.00]   node   3: [mem 0x3-0x3]
[0.00] On node 0 totalpages: 1048479
[0.00]   DMA zone: 56 pages used for memmap
[0.00]   DMA zone: 21 pages reserved
[0.00]   DMA zone: 3999 pages, LIFO batch:0
[0.00]   DMA32 zone: 14280 pages used for memmap
[0.00]   DMA32 zone: 1044480 pages, LIFO batch:31
[0.00] On node 1 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 2 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 3 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] SFI: Simple Firmware Interface v0.81 http://simplefirmware.org
[0.00] smpboot: Allowing 4 CPUs, 0 hotplug CPUs
[0.00] No local APIC present
[0.00] APIC: disable apic facility
[0.00] APIC: switched to apic NOOP
[0.00] nr_irqs_gsi: 16
[0.00] PM: Registered nosave memory: [mem 0x000a-0x000f]
[0.00] e820: cannot find a gap in the 32bit address range
[0.00] e820: PCI devices with unassigned 32bit BARs may break!
[0.00] e820: [mem 0x40010-0x4004f] available for PCI devices
[0.00] Booting paravirtualized kernel on Xen
[0.00] Xen version: 4.4-unstable (preserve-AD)
[0.00] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:4 
nr_node_ids:4
[0.00] PERCPU: Embedded 28 pages/cpu @8800ffc0 s85376 r8192 
d21120 u2097152
[0.00] pcpu-alloc: s85376 r8192 d21120 u2097152 alloc=1*2097152


numactl output:
root@heatpipe:~# numactl --hardware
available: 4 nodes (0-3)
node 0 cpus: 0
node 0 size: 4031 MB
node 0 free: 3997 MB
node 1 cpus: 1
node 1 size: 4039 MB
node 1 free: 4022 MB
node 2 cpus: 2
node 2 size: 4039 MB
node 2 free: 4023 MB
node 3 cpus: 3
node 3 size: 3975 MB
node 3 free: 3963 MB
node distances:
node   0   1   2   3
  0:  10  20  20  20
  1:  20  10  20  20
  2:  20  20  10  20
  3:  20  20  20  10

Current patchset is available at 
https://git.gitorious.org/xenvnuma/linuxvnuma.git:v4
 g...@gitorious.org:xenvnuma/linuxvnuma.git:v4
Xen patchset is available at: https://git.gitorious.org/xenvnuma/xenvnuma.git:v3

TODO
*   dom0, pvh and hvm vnuma support;
*   multiple memory ranges per node support;
*   benchmarking;

Elena Ufimtseva (2):
  xen: vnuma support for PV guests running as domU
  Subject: [PATCH RESEND v2 2/2] xen: enable vnuma for PV guest

 arch/x86/include/asm/xen/vnuma.h |   12 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|2 +-
 arch/x86/xen/setup.c |6 +-
 arch/x86/xen/vnuma.c |  127 ++
 include/xen/interface/memory.h   |   43 +
 6 files changed, 191 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH RESEND v2 2/2] xen: enable vnuma for PV guest

2013-11-18 Thread Elena Ufimtseva
Enables numa if vnuma topology hypercall is supported and it is domU.

Signed-off-by: Elena Ufimtseva 
---
 arch/x86/xen/setup.c |6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 68c054f..0aab799 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
 #include <asm/numa.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
+#include <asm/xen/vnuma.h>
 
 #include <xen/xen.h>
 #include <xen/page.h>
@@ -598,6 +599,9 @@ void __init xen_arch_setup(void)
WARN_ON(xen_set_default_idle());
fiddle_vdso();
 #ifdef CONFIG_NUMA
-   numa_off = 1;
+   if (!xen_initial_domain() && xen_vnuma_supported())
+   numa_off = 0;
+   else
+   numa_off = 1;
 #endif
 }
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 1/2] xen: vnuma support for PV guests running as domU

2013-11-18 Thread Elena Ufimtseva
On Mon, Nov 18, 2013 at 4:14 PM, H. Peter Anvin  wrote:
> On 11/18/2013 12:25 PM, Elena Ufimtseva wrote:
>> +/* Checks if hypercall is supported */
>> +bool xen_vnuma_supported()
>
> This isn't C++...
>
> http://lwn.net/Articles/487493/
>
> There are several more things in this patchset that get flagged by
> checkpatch, but apparently this rather common (and rather serious)
> problem is still not being detected, even through a patch was submitted
> almost two years ago:
>
> https://lkml.org/lkml/2012/3/16/510

Thank you Peter, good to know.  Will resend these.
>
> -hpa
>
>



-- 
Elena
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 0/2] xen: vnuma introduction for pv guest

2013-11-18 Thread Elena Ufimtseva
Xen vnuma introduction.

The patchset introduces vnuma to paravirtualized Xen guests
running as domU.
A Xen subop hypercall is used to retrieve vnuma topology information.
Based on the retrieved topology from Xen, the number of NUMA nodes,
memory ranges, distance table and cpumask are set.
If initialization is incorrect, sets 'dummy' node and unsets
nodemask.
vNUMA topology is constructed by Xen toolstack. Xen patchset is
available at https://git.gitorious.org/xenvnuma/xenvnuma.git:v3.

Example of vnuma enabled pv domain dmesg:

[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x1000-0x0009]
[0.00]   node   0: [mem 0x0010-0x]
[0.00]   node   1: [mem 0x1-0x1]
[0.00]   node   2: [mem 0x2-0x2]
[0.00]   node   3: [mem 0x3-0x3]
[0.00] On node 0 totalpages: 1048479
[0.00]   DMA zone: 56 pages used for memmap
[0.00]   DMA zone: 21 pages reserved
[0.00]   DMA zone: 3999 pages, LIFO batch:0
[0.00]   DMA32 zone: 14280 pages used for memmap
[0.00]   DMA32 zone: 1044480 pages, LIFO batch:31
[0.00] On node 1 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 2 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 3 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] SFI: Simple Firmware Interface v0.81 http://simplefirmware.org
[0.00] smpboot: Allowing 4 CPUs, 0 hotplug CPUs
[0.00] No local APIC present
[0.00] APIC: disable apic facility
[0.00] APIC: switched to apic NOOP
[0.00] nr_irqs_gsi: 16
[0.00] PM: Registered nosave memory: [mem 0x000a-0x000f]
[0.00] e820: cannot find a gap in the 32bit address range
[0.00] e820: PCI devices with unassigned 32bit BARs may break!
[0.00] e820: [mem 0x40010-0x4004f] available for PCI devices
[0.00] Booting paravirtualized kernel on Xen
[0.00] Xen version: 4.4-unstable (preserve-AD)
[0.00] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:4 
nr_node_ids:4
[0.00] PERCPU: Embedded 28 pages/cpu @8800ffc0 s85376 r8192 
d21120 u2097152
[0.00] pcpu-alloc: s85376 r8192 d21120 u2097152 alloc=1*2097152


numactl output:
root@heatpipe:~# numactl --hardware
available: 4 nodes (0-3)
node 0 cpus: 0
node 0 size: 4031 MB
node 0 free: 3997 MB
node 1 cpus: 1
node 1 size: 4039 MB
node 1 free: 4022 MB
node 2 cpus: 2
node 2 size: 4039 MB
node 2 free: 4023 MB
node 3 cpus: 3
node 3 size: 3975 MB
node 3 free: 3963 MB
node distances:
node   0   1   2   3
  0:  10  20  20  20
  1:  20  10  20  20
  2:  20  20  10  20
  3:  20  20  20  10

Current patchset is available at 
https://git.gitorious.org/xenvnuma/linuxvnuma.git:v3
Xen patchset is available at: https://git.gitorious.org/xenvnuma/xenvnuma.git:v3

TODO
*   dom0, pvh and hvm vnuma support;
*   multiple memory ranges per node support;
*   benchmarking;


Elena Ufimtseva (2):
  xen: vnuma support for PV guests running as domU
  xen: enable vnuma for PV guest

 arch/x86/include/asm/xen/vnuma.h |   12 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|2 +-
 arch/x86/xen/setup.c |6 +-
 arch/x86/xen/vnuma.c |  127 ++
 include/xen/interface/memory.h   |   44 +
 6 files changed, 192 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 2/2] xen: enable vnuma for PV guest

2013-11-18 Thread Elena Ufimtseva
Enables numa if vnuma topology hypercall is supported and it is domU.

Signed-off-by: Elena Ufimtseva 
---
 arch/x86/xen/setup.c |6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 68c054f..0aab799 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
 #include <asm/numa.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
+#include <asm/xen/vnuma.h>
 
 #include <xen/xen.h>
 #include <xen/page.h>
@@ -598,6 +599,9 @@ void __init xen_arch_setup(void)
WARN_ON(xen_set_default_idle());
fiddle_vdso();
 #ifdef CONFIG_NUMA
-   numa_off = 1;
+   if (!xen_initial_domain() && xen_vnuma_supported())
+   numa_off = 0;
+   else
+   numa_off = 1;
 #endif
 }
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 1/2] xen: vnuma support for PV guests running as domU

2013-11-18 Thread Elena Ufimtseva
Issues Xen hypercall subop XENMEM_get_vnumainfo and sets the
NUMA topology, otherwise sets dummy NUMA node and prevents
numa_init from calling other numa initializers as they don't
work with pv guests.

Signed-off-by: Elena Ufimtseva 
---
 arch/x86/include/asm/xen/vnuma.h |   12 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|2 +-
 arch/x86/xen/vnuma.c |  127 ++
 include/xen/interface/memory.h   |   44 +
 5 files changed, 187 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

diff --git a/arch/x86/include/asm/xen/vnuma.h b/arch/x86/include/asm/xen/vnuma.h
new file mode 100644
index 000..aee4e92
--- /dev/null
+++ b/arch/x86/include/asm/xen/vnuma.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_VNUMA_H
+#define _ASM_X86_VNUMA_H
+
+#ifdef CONFIG_XEN
+bool xen_vnuma_supported(void);
+int xen_numa_init(void);
+#else
+static inline bool xen_vnuma_supported(void) { return false; };
+static inline int xen_numa_init(void) { return -1; };
+#endif
+
+#endif /* _ASM_X86_VNUMA_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 24aec58..99efa1b 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -17,6 +17,7 @@
 #include <asm/dma.h>
 #include <asm/acpi.h>
 #include <asm/amd_nb.h>
+#include "asm/xen/vnuma.h"
 
 #include "numa_internal.h"
 
@@ -632,6 +633,8 @@ static int __init dummy_numa_init(void)
 void __init x86_numa_init(void)
 {
if (!numa_off) {
+   if (!numa_init(xen_numa_init))
+   return;
 #ifdef CONFIG_X86_NUMAQ
if (!numa_init(numaq_numa_init))
return;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c0..de9deab 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -13,7 +13,7 @@ CFLAGS_mmu.o  := $(nostackp)
 obj-y  := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \
-   p2m.o
+   p2m.o vnuma.o
 
 obj-$(CONFIG_EVENT_TRACING) += trace.o
 
diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
new file mode 100644
index 000..bce4523
--- /dev/null
+++ b/arch/x86/xen/vnuma.c
@@ -0,0 +1,127 @@
+#include <linux/err.h>
+#include <linux/memblock.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/vnuma.h>
+
+#ifdef CONFIG_NUMA
+
+/* Checks if hypercall is supported */
+bool xen_vnuma_supported()
+{
+   return HYPERVISOR_memory_op(XENMEM_get_vnuma_info, NULL) == -ENOSYS ? 
false : true;
+}
+
+/* 
+ * Called from numa_init if numa_off = 0;
+ * we set numa_off = 0 if xen_vnuma_supported()
+ * returns true and its a domU;
+ */
+int __init xen_numa_init(void)
+{
+   int rc;
+   unsigned int i, j, nr_nodes, cpu, idx, pcpus;
+   u64 physm, physd, physc;
+   unsigned int *vdistance, *cpu_to_node;
+   unsigned long mem_size, dist_size, cpu_to_node_size;
+   struct vmemrange *vblock;
+
+   struct vnuma_topology_info numa_topo = {
+   .domid = DOMID_SELF,
+   .__pad = 0
+   };
+   rc = -EINVAL;
+   physm = physd = physc = 0;
+
+   /* For now only PV guests are supported */
+   if (!xen_pv_domain())
+   return rc;
+
+   pcpus = num_possible_cpus();
+
+   mem_size =  pcpus * sizeof(struct vmemrange);
+   dist_size = pcpus * pcpus * sizeof(*numa_topo.distance);
+   cpu_to_node_size = pcpus * sizeof(*numa_topo.cpu_to_node);
+
+   physm = memblock_alloc(mem_size, PAGE_SIZE);
+   vblock = __va(physm);
+
+   physd = memblock_alloc(dist_size, PAGE_SIZE);
+   vdistance  = __va(physd);
+
+   physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
+   cpu_to_node  = __va(physc);
+
+   if (!physm || !physc || !physd)
+   goto out;
+
+   set_xen_guest_handle(numa_topo.nr_nodes, &nr_nodes);
+   set_xen_guest_handle(numa_topo.memrange, vblock);
+   set_xen_guest_handle(numa_topo.distance, vdistance);
+   set_xen_guest_handle(numa_topo.cpu_to_node, cpu_to_node);
+
+   rc = HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo);
+
+   if (rc < 0)
+   goto out;
+   nr_nodes = *numa_topo.nr_nodes; 
+   if (nr_nodes == 0) {
+   goto out;
+   }
+   if (nr_nodes > num_possible_cpus()) {
+   pr_debug("vNUMA: Node without cpu is not supported in this 
version.\n");
+   goto out;
+   }
+
+   /*
+* NUMA nodes memory ranges are in pfns, constructed and
+* aligned based on e820 ram domain map.
+*/
+   for (i = 0; i < nr_nodes; i++) {
+   if (numa_add_memblk(i, vblock[i].start, vblock[i].end))
+   goto out;
+   node_set(i, numa_nodes_parsed);
+   }
+
+   setup_nr_node_ids();
+  

[PATCH v2 1/2] xen: vnuma support for PV guests running as domU

2013-11-18 Thread Elena Ufimtseva
Issues Xen hypercall subop XENMEM_get_vnumainfo and sets the
NUMA topology, otherwise sets dummy NUMA node and prevents
numa_init from calling other numa initializers as they don't
work with pv guests.

Signed-off-by: Elena Ufimtseva ufimts...@gmail.com
---
 arch/x86/include/asm/xen/vnuma.h |   12 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|2 +-
 arch/x86/xen/vnuma.c |  127 ++
 include/xen/interface/memory.h   |   44 +
 5 files changed, 187 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

diff --git a/arch/x86/include/asm/xen/vnuma.h b/arch/x86/include/asm/xen/vnuma.h
new file mode 100644
index 000..aee4e92
--- /dev/null
+++ b/arch/x86/include/asm/xen/vnuma.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_VNUMA_H
+#define _ASM_X86_VNUMA_H
+
+#ifdef CONFIG_XEN
+bool xen_vnuma_supported(void);
+int xen_numa_init(void);
+#else
+static inline bool xen_vnuma_supported(void) { return false; };
+static inline int xen_numa_init(void) { return -1; };
+#endif
+
+#endif /* _ASM_X86_VNUMA_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 24aec58..99efa1b 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -17,6 +17,7 @@
 #include <asm/dma.h>
 #include <asm/acpi.h>
 #include <asm/amd_nb.h>
+#include "asm/xen/vnuma.h"
 
 #include "numa_internal.h"
 
@@ -632,6 +633,8 @@ static int __init dummy_numa_init(void)
 void __init x86_numa_init(void)
 {
if (!numa_off) {
+   if (!numa_init(xen_numa_init))
+   return;
 #ifdef CONFIG_X86_NUMAQ
if (!numa_init(numaq_numa_init))
return;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c0..de9deab 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -13,7 +13,7 @@ CFLAGS_mmu.o  := $(nostackp)
 obj-y  := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \
-   p2m.o
+   p2m.o vnuma.o
 
 obj-$(CONFIG_EVENT_TRACING) += trace.o
 
diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
new file mode 100644
index 000..bce4523
--- /dev/null
+++ b/arch/x86/xen/vnuma.c
@@ -0,0 +1,127 @@
+#include <linux/err.h>
+#include <linux/memblock.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/vnuma.h>
+
+#ifdef CONFIG_NUMA
+
+/* Checks if hypercall is supported */
+bool xen_vnuma_supported()
+{
+   return HYPERVISOR_memory_op(XENMEM_get_vnuma_info, NULL) == -ENOSYS ? 
false : true;
+}
+
+/* 
+ * Called from numa_init if numa_off = 0;
+ * we set numa_off = 0 if xen_vnuma_supported()
+ * returns true and its a domU;
+ */
+int __init xen_numa_init(void)
+{
+   int rc;
+   unsigned int i, j, nr_nodes, cpu, idx, pcpus;
+   u64 physm, physd, physc;
+   unsigned int *vdistance, *cpu_to_node;
+   unsigned long mem_size, dist_size, cpu_to_node_size;
+   struct vmemrange *vblock;
+
+   struct vnuma_topology_info numa_topo = {
+   .domid = DOMID_SELF,
+   .__pad = 0
+   };
+   rc = -EINVAL;
+   physm = physd = physc = 0;
+
+   /* For now only PV guests are supported */
+   if (!xen_pv_domain())
+   return rc;
+
+   pcpus = num_possible_cpus();
+
+   mem_size =  pcpus * sizeof(struct vmemrange);
+   dist_size = pcpus * pcpus * sizeof(*numa_topo.distance);
+   cpu_to_node_size = pcpus * sizeof(*numa_topo.cpu_to_node);
+
+   physm = memblock_alloc(mem_size, PAGE_SIZE);
+   vblock = __va(physm);
+
+   physd = memblock_alloc(dist_size, PAGE_SIZE);
+   vdistance  = __va(physd);
+
+   physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
+   cpu_to_node  = __va(physc);
+
+   if (!physm || !physc || !physd)
+   goto out;
+
+   set_xen_guest_handle(numa_topo.nr_nodes, &nr_nodes);
+   set_xen_guest_handle(numa_topo.memrange, vblock);
+   set_xen_guest_handle(numa_topo.distance, vdistance);
+   set_xen_guest_handle(numa_topo.cpu_to_node, cpu_to_node);
+
+   rc = HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo);
+
+   if (rc < 0)
+   goto out;
+   nr_nodes = *numa_topo.nr_nodes; 
+   if (nr_nodes == 0) {
+   goto out;
+   }
+   if (nr_nodes > num_possible_cpus()) {
+   pr_debug("vNUMA: Node without cpu is not supported in this 
version.\n");
+   goto out;
+   }
+
+   /*
+* NUMA nodes memory ranges are in pfns, constructed and
+* aligned based on e820 ram domain map.
+*/
+   for (i = 0; i < nr_nodes; i++) {
+   if (numa_add_memblk(i, vblock[i].start, vblock[i].end

[PATCH v2 2/2] xen: enable vnuma for PV guest

2013-11-18 Thread Elena Ufimtseva
Enables numa if vnuma topology hypercall is supported and it is domU.

Signed-off-by: Elena Ufimtseva ufimts...@gmail.com
---
 arch/x86/xen/setup.c |6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 68c054f..0aab799 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
 #include <asm/numa.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
+#include <asm/xen/vnuma.h>
 
 #include <xen/xen.h>
 #include <xen/page.h>
@@ -598,6 +599,9 @@ void __init xen_arch_setup(void)
WARN_ON(xen_set_default_idle());
fiddle_vdso();
 #ifdef CONFIG_NUMA
-   numa_off = 1;
+   if (!xen_initial_domain() && xen_vnuma_supported())
+   numa_off = 0;
+   else
+   numa_off = 1;
 #endif
 }
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 0/2] xen: vnuma introduction for pv guest

2013-11-18 Thread Elena Ufimtseva
Xen vnuma introduction.

The patchset introduces vnuma to paravirtualized Xen guests
running as domU.
A Xen subop hypercall is used to retrieve vnuma topology information.
Based on the retrieved topology from Xen, the number of NUMA nodes,
memory ranges, distance table and cpumask are set.
If initialization is incorrect, sets 'dummy' node and unsets
nodemask.
vNUMA topology is constructed by Xen toolstack. Xen patchset is
available at https://git.gitorious.org/xenvnuma/xenvnuma.git:v3.

Example of vnuma enabled pv domain dmesg:

[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x1000-0x0009]
[0.00]   node   0: [mem 0x0010-0x]
[0.00]   node   1: [mem 0x1-0x1]
[0.00]   node   2: [mem 0x2-0x2]
[0.00]   node   3: [mem 0x3-0x3]
[0.00] On node 0 totalpages: 1048479
[0.00]   DMA zone: 56 pages used for memmap
[0.00]   DMA zone: 21 pages reserved
[0.00]   DMA zone: 3999 pages, LIFO batch:0
[0.00]   DMA32 zone: 14280 pages used for memmap
[0.00]   DMA32 zone: 1044480 pages, LIFO batch:31
[0.00] On node 1 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 2 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 3 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] SFI: Simple Firmware Interface v0.81 http://simplefirmware.org
[0.00] smpboot: Allowing 4 CPUs, 0 hotplug CPUs
[0.00] No local APIC present
[0.00] APIC: disable apic facility
[0.00] APIC: switched to apic NOOP
[0.00] nr_irqs_gsi: 16
[0.00] PM: Registered nosave memory: [mem 0x000a-0x000f]
[0.00] e820: cannot find a gap in the 32bit address range
[0.00] e820: PCI devices with unassigned 32bit BARs may break!
[0.00] e820: [mem 0x40010-0x4004f] available for PCI devices
[0.00] Booting paravirtualized kernel on Xen
[0.00] Xen version: 4.4-unstable (preserve-AD)
[0.00] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:4 
nr_node_ids:4
[0.00] PERCPU: Embedded 28 pages/cpu @8800ffc0 s85376 r8192 
d21120 u2097152
[0.00] pcpu-alloc: s85376 r8192 d21120 u2097152 alloc=1*2097152


numactl output:
root@heatpipe:~# numactl --hardware
available: 4 nodes (0-3)
node 0 cpus: 0
node 0 size: 4031 MB
node 0 free: 3997 MB
node 1 cpus: 1
node 1 size: 4039 MB
node 1 free: 4022 MB
node 2 cpus: 2
node 2 size: 4039 MB
node 2 free: 4023 MB
node 3 cpus: 3
node 3 size: 3975 MB
node 3 free: 3963 MB
node distances:
node   0   1   2   3
  0:  10  20  20  20
  1:  20  10  20  20
  2:  20  20  10  20
  3:  20  20  20  10

Current patchset is available at 
https://git.gitorious.org/xenvnuma/linuxvnuma.git:v3
Xen patchset is available at: https://git.gitorious.org/xenvnuma/xenvnuma.git:v3

TODO
*   dom0, pvh and hvm vnuma support;
*   multiple memory ranges per node support;
*   benchmarking;


Elena Ufimtseva (2):
  xen: vnuma support for PV guests running as domU
  xen: enable vnuma for PV guest

 arch/x86/include/asm/xen/vnuma.h |   12 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|2 +-
 arch/x86/xen/setup.c |6 +-
 arch/x86/xen/vnuma.c |  127 ++
 include/xen/interface/memory.h   |   44 +
 6 files changed, 192 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 1/2] xen: vnuma support for PV guests running as domU

2013-11-18 Thread Elena Ufimtseva
On Mon, Nov 18, 2013 at 4:14 PM, H. Peter Anvin h...@zytor.com wrote:
 On 11/18/2013 12:25 PM, Elena Ufimtseva wrote:
 +/* Checks if hypercall is supported */
 +bool xen_vnuma_supported()

 This isn't C++...

 http://lwn.net/Articles/487493/

 There are several more things in this patchset that get flagged by
 checkpatch, but apparently this rather common (and rather serious)
 problem is still not being detected, even through a patch was submitted
 almost two years ago:

 https://lkml.org/lkml/2012/3/16/510

Thank you Peter, good to know.  Will resend these.

 -hpa





-- 
Elena
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH RESEND v2 2/2] xen: enable vnuma for PV guest

2013-11-18 Thread Elena Ufimtseva
Enables numa if vnuma topology hypercall is supported and it is domU.

Signed-off-by: Elena Ufimtseva ufimts...@gmail.com
---
 arch/x86/xen/setup.c |6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 68c054f..0aab799 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
 #include <asm/numa.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
+#include <asm/xen/vnuma.h>
 
 #include <xen/xen.h>
 #include <xen/page.h>
@@ -598,6 +599,9 @@ void __init xen_arch_setup(void)
WARN_ON(xen_set_default_idle());
fiddle_vdso();
 #ifdef CONFIG_NUMA
-   numa_off = 1;
+   if (!xen_initial_domain() && xen_vnuma_supported())
+   numa_off = 0;
+   else
+   numa_off = 1;
 #endif
 }
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH RESEND v2 0/2] xen: vnuma introduction for pv guest

2013-11-18 Thread Elena Ufimtseva
Xen vnuma introduction.

The patchset introduces vnuma to paravirtualized Xen guests
running as domU.
A Xen subop hypercall is used to retrieve vnuma topology information.
Based on the retrieved topology from Xen, the number of NUMA nodes,
memory ranges, distance table and cpumask are set.
If initialization is incorrect, sets 'dummy' node and unsets
nodemask. vNUMA topology is constructed by Xen toolstack.

Example of vnuma enabled pv domain dmesg:

[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x1000-0x0009]
[0.00]   node   0: [mem 0x0010-0x]
[0.00]   node   1: [mem 0x1-0x1]
[0.00]   node   2: [mem 0x2-0x2]
[0.00]   node   3: [mem 0x3-0x3]
[0.00] On node 0 totalpages: 1048479
[0.00]   DMA zone: 56 pages used for memmap
[0.00]   DMA zone: 21 pages reserved
[0.00]   DMA zone: 3999 pages, LIFO batch:0
[0.00]   DMA32 zone: 14280 pages used for memmap
[0.00]   DMA32 zone: 1044480 pages, LIFO batch:31
[0.00] On node 1 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 2 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 3 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] SFI: Simple Firmware Interface v0.81 http://simplefirmware.org
[0.00] smpboot: Allowing 4 CPUs, 0 hotplug CPUs
[0.00] No local APIC present
[0.00] APIC: disable apic facility
[0.00] APIC: switched to apic NOOP
[0.00] nr_irqs_gsi: 16
[0.00] PM: Registered nosave memory: [mem 0x000a-0x000f]
[0.00] e820: cannot find a gap in the 32bit address range
[0.00] e820: PCI devices with unassigned 32bit BARs may break!
[0.00] e820: [mem 0x40010-0x4004f] available for PCI devices
[0.00] Booting paravirtualized kernel on Xen
[0.00] Xen version: 4.4-unstable (preserve-AD)
[0.00] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:4 
nr_node_ids:4
[0.00] PERCPU: Embedded 28 pages/cpu @8800ffc0 s85376 r8192 
d21120 u2097152
[0.00] pcpu-alloc: s85376 r8192 d21120 u2097152 alloc=1*2097152


numactl output:
root@heatpipe:~# numactl --hardware
available: 4 nodes (0-3)
node 0 cpus: 0
node 0 size: 4031 MB
node 0 free: 3997 MB
node 1 cpus: 1
node 1 size: 4039 MB
node 1 free: 4022 MB
node 2 cpus: 2
node 2 size: 4039 MB
node 2 free: 4023 MB
node 3 cpus: 3
node 3 size: 3975 MB
node 3 free: 3963 MB
node distances:
node   0   1   2   3
  0:  10  20  20  20
  1:  20  10  20  20
  2:  20  20  10  20
  3:  20  20  20  10

Current patchset is available at 
https://git.gitorious.org/xenvnuma/linuxvnuma.git:v4
 g...@gitorious.org:xenvnuma/linuxvnuma.git:v4
Xen patchset is available at: https://git.gitorious.org/xenvnuma/xenvnuma.git:v3

TODO
*   dom0, pvh and hvm vnuma support;
*   multiple memory ranges per node support;
*   benchmarking;

Elena Ufimtseva (2):
  xen: vnuma support for PV guests running as domU
  Subject: [PATCH RESEND v2 2/2] xen: enable vnuma for PV guest

 arch/x86/include/asm/xen/vnuma.h |   12 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|2 +-
 arch/x86/xen/setup.c |6 +-
 arch/x86/xen/vnuma.c |  127 ++
 include/xen/interface/memory.h   |   43 +
 6 files changed, 191 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH RESEND v2 1/2] xen: vnuma support for PV guests running as domU

2013-11-18 Thread Elena Ufimtseva
Issues Xen hypercall subop XENMEM_get_vnumainfo and sets the
NUMA topology, otherwise sets dummy NUMA node and prevents
numa_init from calling other numa initializers as they don't
work with pv guests.

Signed-off-by: Elena Ufimtseva ufimts...@gmail.com
---
 arch/x86/include/asm/xen/vnuma.h |   12 
 arch/x86/mm/numa.c   |3 +
 arch/x86/xen/Makefile|2 +-
 arch/x86/xen/vnuma.c |  127 ++
 include/xen/interface/memory.h   |   43 +
 5 files changed, 186 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

diff --git a/arch/x86/include/asm/xen/vnuma.h b/arch/x86/include/asm/xen/vnuma.h
new file mode 100644
index 000..aee4e92
--- /dev/null
+++ b/arch/x86/include/asm/xen/vnuma.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_VNUMA_H
+#define _ASM_X86_VNUMA_H
+
+#ifdef CONFIG_XEN
+bool xen_vnuma_supported(void);
+int xen_numa_init(void);
+#else
+static inline bool xen_vnuma_supported(void) { return false; };
+static inline int xen_numa_init(void) { return -1; };
+#endif
+
+#endif /* _ASM_X86_VNUMA_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 24aec58..99efa1b 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -17,6 +17,7 @@
 #include <asm/dma.h>
 #include <asm/acpi.h>
 #include <asm/amd_nb.h>
+#include "asm/xen/vnuma.h"
 
 #include "numa_internal.h"
 
@@ -632,6 +633,8 @@ static int __init dummy_numa_init(void)
 void __init x86_numa_init(void)
 {
if (!numa_off) {
+   if (!numa_init(xen_numa_init))
+   return;
 #ifdef CONFIG_X86_NUMAQ
if (!numa_init(numaq_numa_init))
return;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c0..de9deab 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -13,7 +13,7 @@ CFLAGS_mmu.o  := $(nostackp)
 obj-y  := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \
-   p2m.o
+   p2m.o vnuma.o
 
 obj-$(CONFIG_EVENT_TRACING) += trace.o
 
diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
new file mode 100644
index 000..caa2178
--- /dev/null
+++ b/arch/x86/xen/vnuma.c
@@ -0,0 +1,127 @@
+#include <linux/err.h>
+#include <linux/memblock.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/vnuma.h>
+
+#ifdef CONFIG_NUMA
+
+/* Checks if hypercall is supported */
+bool xen_vnuma_supported(void)
+{
+   return HYPERVISOR_memory_op(XENMEM_get_vnuma_info, NULL)
+   == -ENOSYS ? false : true;
+}
+
+/*
+ * Called from numa_init if numa_off = 0;
+ * we set numa_off = 0 if xen_vnuma_supported()
+ * returns true and its a domU;
+ */
+int __init xen_numa_init(void)
+{
+   int rc;
+   unsigned int i, j, nr_nodes, cpu, idx, pcpus;
+   u64 physm, physd, physc;
+   unsigned int *vdistance, *cpu_to_node;
+   unsigned long mem_size, dist_size, cpu_to_node_size;
+   struct vmemrange *vblock;
+
+   struct vnuma_topology_info numa_topo = {
+   .domid = DOMID_SELF,
+   .__pad = 0
+   };
+   rc = -EINVAL;
+   physm = physd = physc = 0;
+
+   /* For now only PV guests are supported */
+   if (!xen_pv_domain())
+   return rc;
+
+   pcpus = num_possible_cpus();
+
+   mem_size =  pcpus * sizeof(struct vmemrange);
+   dist_size = pcpus * pcpus * sizeof(*numa_topo.distance);
+   cpu_to_node_size = pcpus * sizeof(*numa_topo.cpu_to_node);
+
+   physm = memblock_alloc(mem_size, PAGE_SIZE);
+   vblock = __va(physm);
+
+   physd = memblock_alloc(dist_size, PAGE_SIZE);
+   vdistance  = __va(physd);
+
+   physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
+   cpu_to_node  = __va(physc);
+
+   if (!physm || !physc || !physd)
+   goto out;
+
+   set_xen_guest_handle(numa_topo.nr_nodes, &nr_nodes);
+   set_xen_guest_handle(numa_topo.memrange, vblock);
+   set_xen_guest_handle(numa_topo.distance, vdistance);
+   set_xen_guest_handle(numa_topo.cpu_to_node, cpu_to_node);
+
+   rc = HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo);
+
+   if (rc < 0)
+   goto out;
+   nr_nodes = *numa_topo.nr_nodes;
+   if (nr_nodes == 0)
+   goto out;
+   if (nr_nodes > num_possible_cpus()) {
+   pr_debug("vNUMA: Node without cpu is not supported in this version.\n");
+   goto out;
+   }
+
+   /*
+* NUMA nodes memory ranges are in pfns, constructed and
+* aligned based on e820 ram domain map.
+*/
+   for (i = 0; i < nr_nodes; i++) {
+   if (numa_add_memblk(i, vblock[i].start

Re: [Xen-devel] [PATCH 1/2] xen: vnuma support for PV guests running as domU.

2013-11-14 Thread Elena Ufimtseva
On Thu, Nov 14, 2013 at 6:48 AM, Dario Faggioli
 wrote:
> On gio, 2013-11-14 at 11:21 +, David Vrabel wrote:
>> On 14/11/13 07:26, Dario Faggioli wrote:
>> > IIRC, it's more something that was already happening (the breakage, I
>> > mean), than a "safety net" for the unforeseeable future. Might be worth
>> > giving some context about it, perhaps referencing the email thread or
>> > the git commit hash in the comment.
>>
>> Yes, a comment like:
>>
>> /*
>>  * Set a dummy node and return success.  This prevents calling any
>>  * hardware-specific initializers which do not work in a PV guest.
>>  */
>>
>> is better.  No need to refer to any specific threads.  It's pretty clear
>> that any hardware-specific init isn't appropriate for a PV guest.
>>
> Ok.
>
>> >> +  if (rc != 0) {
>> >> +  for (i = 0; i < MAX_LOCAL_APIC; i++)
>> >> +  set_apicid_to_node(i, NUMA_NO_NODE);
>> >> +  nodes_clear(numa_nodes_parsed);
>> >> +  nodes_clear(node_possible_map);
>> >> +  nodes_clear(node_online_map);
>> >> +  node_set(0, numa_nodes_parsed);
>> >> +  numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
>> >> +  }
>> >> +  return 0;
>> >>
>> > Ok, so, we always return 'success', as we were saying during last round.
>> > However, we do not call dummy_numa_init() directly, and instead we do
>> > all these stuff, with the last two statements being exactly what
>> > dummy_numa_init() does. Reason is linking, i.e., the fact that
>> > dummy_numa_init() is not exported and you can't reach it from here,
>> > right?

Ah, my bad, I left these comments and they dont make sense :)
>>
>> I think this bit is fine as-is.
>>
> Ok, cool. :-) Shouldn't we then kill or reformulate the comments where
> dummy_numa_init is explicitly referenced then?
>
> E.g., this one: /* will pass to dummy_numa_init */
>
> It might be me, but I find it rather confusing. After seeing that, I'd
> expect to see that, at some point, either the function returns failure
> (which of course we don't want), or a direct call dummy_numa_init().
>
> Dario
>
> --
> This happens because I choose it to happen! (Raistlin Majere)
> -
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
>



-- 
Elena
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Xen-devel] [PATCH 1/2] xen: vnuma support for PV guests running as domU.

2013-11-14 Thread Elena Ufimtseva
On Thu, Nov 14, 2013 at 6:48 AM, Dario Faggioli
dario.faggi...@citrix.com wrote:
 On gio, 2013-11-14 at 11:21 +, David Vrabel wrote:
 On 14/11/13 07:26, Dario Faggioli wrote:
  IIRC, it's more something that was already happening (the breakage, I
  mean), than a safety net for the unforeseeable future. Might be worth
  giving some context about it, perhaps referencing the email thread or
  the git commit hash in the comment.

 Yes, a comment like:

 /*
  * Set a dummy node and return success.  This prevents calling any
  * hardware-specific initializers which do not work in a PV guest.
  */

 is better.  No need to refer to any specific threads.  It's pretty clear
 that any hardware-specific init isn't appropriate for a PV guest.

 Ok.

  +  if (rc != 0) {
  +  for (i = 0; i < MAX_LOCAL_APIC; i++)
  +  set_apicid_to_node(i, NUMA_NO_NODE);
  +  nodes_clear(numa_nodes_parsed);
  +  nodes_clear(node_possible_map);
  +  nodes_clear(node_online_map);
  +  node_set(0, numa_nodes_parsed);
  +  numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
  +  }
  +  return 0;
 
  Ok, so, we always return 'success', as we were saying during last round.
  However, we do not call dummy_numa_init() directly, and instead we do
  all these stuff, with the last two statements being exactly what
  dummy_numa_init() does. Reason is linking, i.e., the fact that
  dummy_numa_init() is not exported and you can't reach it from here,
  right?

Ah, my bad, I left these comments and they dont make sense :)

 I think this bit is fine as-is.

 Ok, cool. :-) Shouldn't we then kill or reformulate the comments where
 dummy_numa_init is explicitly referenced then?

 E.g., this one: /* will pass to dummy_numa_init */

 It might be me, but I find it rather confusing. After seeing that, I'd
 expect to see that, at some point, either the function returns failure
 (which of course we don't want), or a direct call dummy_numa_init().

 Dario

 --
 This happens because I choose it to happen! (Raistlin Majere)
 -
 Dario Faggioli, Ph.D, http://about.me/dario.faggioli
 Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)




-- 
Elena
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] xen: vnuma support for PV guests running as domU.

2013-11-13 Thread Elena Ufimtseva
Issues Xen hypercall subop XENMEM_get_vnumainfo and sets the
NUMA topology, otherwise sets dummy NUMA node and prevents
numa_init from calling other numa initializators as they may
break other guests.

Signed-off-by: Elena Ufimtseva 
---
 arch/x86/include/asm/xen/vnuma.h |   12 
 arch/x86/mm/numa.c   |5 ++
 arch/x86/xen/Makefile|2 +-
 arch/x86/xen/vnuma.c |  119 ++
 include/xen/interface/memory.h   |   28 +
 5 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

diff --git a/arch/x86/include/asm/xen/vnuma.h b/arch/x86/include/asm/xen/vnuma.h
new file mode 100644
index 000..1ba1e06
--- /dev/null
+++ b/arch/x86/include/asm/xen/vnuma.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_VNUMA_H
+#define _ASM_X86_VNUMA_H
+
+#ifdef CONFIG_XEN
+int xen_vnuma_supported(void);
+int xen_numa_init(void);
+#else
+int xen_vnuma_supported(void) { return 0; };
+int xen_numa_init(void) { return -1; };
+#endif
+
+#endif /* _ASM_X86_VNUMA_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 8bf93ba..c8a61dc 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -19,6 +19,7 @@
 #include 
 
 #include "numa_internal.h"
+#include "asm/xen/vnuma.h"
 
 int __initdata numa_off;
 nodemask_t numa_nodes_parsed __initdata;
@@ -621,6 +622,10 @@ static int __init dummy_numa_init(void)
 void __init x86_numa_init(void)
 {
if (!numa_off) {
+#ifdef CONFIG_XEN
+   if (xen_vnuma_supported() && !numa_init(xen_numa_init))
+   return;
+#endif
 #ifdef CONFIG_X86_NUMAQ
if (!numa_init(numaq_numa_init))
return;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c0..de9deab 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -13,7 +13,7 @@ CFLAGS_mmu.o  := $(nostackp)
 obj-y  := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \
-   p2m.o
+   p2m.o vnuma.o
 
 obj-$(CONFIG_EVENT_TRACING) += trace.o
 
diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
new file mode 100644
index 000..b4fc667
--- /dev/null
+++ b/arch/x86/xen/vnuma.c
@@ -0,0 +1,119 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_NUMA
+
+/* Checks if hypercall is suported */
+int xen_vnuma_supported()
+{
+   return HYPERVISOR_memory_op(XENMEM_get_vnuma_info, NULL) == -ENOSYS ? 0 
: 1;
+}
+
+int __init xen_numa_init(void)
+{
+   int rc;
+   unsigned int i, j, nr_nodes, cpu, idx, pcpus;
+   u64 physm, physd, physc;
+   unsigned int *vdistance, *cpu_to_node;
+   unsigned long mem_size, dist_size, cpu_to_node_size;
+   struct vmemrange *vblock;
+
+   struct vnuma_topology_info numa_topo = {
+   .domid = DOMID_SELF,
+   .__pad = 0
+   };
+   rc = -EINVAL;
+
+   /* For now only PV guests are supported */
+   if (!xen_pv_domain())
+   return rc;
+
+   pcpus = num_possible_cpus();
+
+   mem_size =  pcpus * sizeof(struct vmemrange);
+   dist_size = pcpus * pcpus * sizeof(*numa_topo.vdistance);
+   cpu_to_node_size = pcpus * sizeof(*numa_topo.cpu_to_node);
+
+   physm = memblock_alloc(mem_size, PAGE_SIZE);
+   vblock = __va(physm);
+
+   physd = memblock_alloc(dist_size, PAGE_SIZE);
+   vdistance  = __va(physd);
+
+   physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
+   cpu_to_node  = __va(physc);
+
+   if (!physm || !physc || !physd)
+   goto vnumaout;
+
+   set_xen_guest_handle(numa_topo.nr_nodes, &nr_nodes);
+   set_xen_guest_handle(numa_topo.vmemblks, vblock);
+   set_xen_guest_handle(numa_topo.vdistance, vdistance);
+   set_xen_guest_handle(numa_topo.cpu_to_node, cpu_to_node);
+
+   rc = HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo);
+
+   if (rc < 0)
+   goto vnumaout;
+   if (*numa_topo.nr_nodes == 0) {
+   /* will pass to dummy_numa_init */
+   goto vnumaout;
+   }
+   if (*numa_topo.nr_nodes > num_possible_cpus()) {
+   pr_debug("vNUMA: Node without cpu is not supported in this 
version.\n");
+   goto vnumaout;
+   }
+   /*
+* NUMA nodes memory ranges are in pfns, constructed and
+* aligned based on e820 ram domain map.
+*/
+   for (i = 0; i < *numa_topo.nr_nodes; i++) {
+   if (numa_add_memblk(i, vblock[i].start, vblock[i].end))
+   /* pass to numa_dummy_init */
+   goto vnumaout;
+   node_set(i, numa_nodes_parsed);
+   }
+   setup_nr_node_id

[PATCH 0/2] xen: vnuma introduction for pv guest

2013-11-13 Thread Elena Ufimtseva
Xen vnuma introduction.

The patchset introduces vnuma to paravirtualized Xen guests
runnning as domU.
Xen subop hypercall is used to retreive vnuma topology information.
Bases on the retreived topology from Xen, NUMA number of nodes,
memory ranges, distance table and cpumask is being set.
If initialization is incorrect, sets 'dummy' node and unsets
nodemask.
vNUMA topology is constructed by Xen toolstack. Xen patchset is 
available at https://git.gitorious.org/xenvnuma/xenvnuma.git.

Example of vnuma enabled pv domain dmesg:

[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x1000-0x0009]
[0.00]   node   0: [mem 0x0010-0x]
[0.00]   node   1: [mem 0x1-0x1]
[0.00]   node   2: [mem 0x2-0x2]
[0.00]   node   3: [mem 0x3-0x3]
[0.00] On node 0 totalpages: 1048479
[0.00]   DMA zone: 56 pages used for memmap
[0.00]   DMA zone: 21 pages reserved
[0.00]   DMA zone: 3999 pages, LIFO batch:0
[0.00]   DMA32 zone: 14280 pages used for memmap
[0.00]   DMA32 zone: 1044480 pages, LIFO batch:31
[0.00] On node 1 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 2 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 3 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] SFI: Simple Firmware Interface v0.81 http://simplefirmware.org
[0.00] smpboot: Allowing 4 CPUs, 0 hotplug CPUs
[0.00] No local APIC present
[0.00] APIC: disable apic facility
[0.00] APIC: switched to apic NOOP
[0.00] nr_irqs_gsi: 16
[0.00] PM: Registered nosave memory: [mem 0x000a-0x000f]
[0.00] e820: cannot find a gap in the 32bit address range
[0.00] e820: PCI devices with unassigned 32bit BARs may break!
[0.00] e820: [mem 0x40010-0x4004f] available for PCI devices
[0.00] Booting paravirtualized kernel on Xen
[0.00] Xen version: 4.4-unstable (preserve-AD)
[0.00] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:4 
nr_node_ids:4
[0.00] PERCPU: Embedded 28 pages/cpu @8800ffc0 s85376 r8192 
d21120 u2097152
[0.00] pcpu-alloc: s85376 r8192 d21120 u2097152 alloc=1*2097152


numactl output:
root@heatpipe:~# numactl --hardware
available: 4 nodes (0-3)
node 0 cpus: 0
node 0 size: 4031 MB
node 0 free: 3997 MB
node 1 cpus: 1
node 1 size: 4039 MB
node 1 free: 4022 MB
node 2 cpus: 2
node 2 size: 4039 MB
node 2 free: 4023 MB
node 3 cpus: 3
node 3 size: 3975 MB
node 3 free: 3963 MB
node distances:
node   0   1   2   3
  0:  10  20  20  20
  1:  20  10  20  20
  2:  20  20  10  20
  3:  20  20  20  10

Current patchset is available at 
https://git.gitorious.org/xenvnuma/linuxvnuma.git

TODO
*   dom0 vnuma support;
*   multiple memory ranges per node support;

Elena Ufimtseva (2):
  xen: vnuma support for PV guests running as domU.
  xen: enable Xen vnuma for PV guest.

 arch/x86/include/asm/xen/vnuma.h |   12 
 arch/x86/mm/numa.c   |5 ++
 arch/x86/xen/Makefile|2 +-
 arch/x86/xen/setup.c |6 +-
 arch/x86/xen/vnuma.c |  119 ++
 include/xen/interface/memory.h   |   28 +
 6 files changed, 170 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] xen: enable vnuma for PV guest.

2013-11-13 Thread Elena Ufimtseva
Enables numa if vnuma topology hypercall is supported and it is domU.

Signed-off-by: Elena Ufimtseva 
---
 arch/x86/xen/setup.c |6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 09f3059..fe23ec2 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -598,6 +599,9 @@ void __init xen_arch_setup(void)
WARN_ON(xen_set_default_idle());
fiddle_vdso();
 #ifdef CONFIG_NUMA
-   numa_off = 1;
+   if (!xen_initial_domain() && xen_vnuma_supported())
+   numa_off = 0;
+   else
+   numa_off = 1;
 #endif
 }
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/2] xen: vnuma introduction for pv guest

2013-11-13 Thread Elena Ufimtseva
Xen vnuma introduction.

The patchset introduces vnuma to paravirtualized Xen guests
runnning as domU.
Xen subop hypercall is used to retreive vnuma topology information.
Bases on the retreived topology from Xen, NUMA number of nodes,
memory ranges, distance table and cpumask is being set.
If initialization is incorrect, sets 'dummy' node and unsets
nodemask.
vNUMA topology is constructed by Xen toolstack. Xen patchset is 
available at https://git.gitorious.org/xenvnuma/xenvnuma.git.

Example of vnuma enabled pv domain dmesg:

[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x1000-0x0009]
[0.00]   node   0: [mem 0x0010-0x]
[0.00]   node   1: [mem 0x1-0x1]
[0.00]   node   2: [mem 0x2-0x2]
[0.00]   node   3: [mem 0x3-0x3]
[0.00] On node 0 totalpages: 1048479
[0.00]   DMA zone: 56 pages used for memmap
[0.00]   DMA zone: 21 pages reserved
[0.00]   DMA zone: 3999 pages, LIFO batch:0
[0.00]   DMA32 zone: 14280 pages used for memmap
[0.00]   DMA32 zone: 1044480 pages, LIFO batch:31
[0.00] On node 1 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 2 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] On node 3 totalpages: 1048576
[0.00]   Normal zone: 14336 pages used for memmap
[0.00]   Normal zone: 1048576 pages, LIFO batch:31
[0.00] SFI: Simple Firmware Interface v0.81 http://simplefirmware.org
[0.00] smpboot: Allowing 4 CPUs, 0 hotplug CPUs
[0.00] No local APIC present
[0.00] APIC: disable apic facility
[0.00] APIC: switched to apic NOOP
[0.00] nr_irqs_gsi: 16
[0.00] PM: Registered nosave memory: [mem 0x000a-0x000f]
[0.00] e820: cannot find a gap in the 32bit address range
[0.00] e820: PCI devices with unassigned 32bit BARs may break!
[0.00] e820: [mem 0x40010-0x4004f] available for PCI devices
[0.00] Booting paravirtualized kernel on Xen
[0.00] Xen version: 4.4-unstable (preserve-AD)
[0.00] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:4 
nr_node_ids:4
[0.00] PERCPU: Embedded 28 pages/cpu @8800ffc0 s85376 r8192 
d21120 u2097152
[0.00] pcpu-alloc: s85376 r8192 d21120 u2097152 alloc=1*2097152


numactl output:
root@heatpipe:~# numactl --hardware
available: 4 nodes (0-3)
node 0 cpus: 0
node 0 size: 4031 MB
node 0 free: 3997 MB
node 1 cpus: 1
node 1 size: 4039 MB
node 1 free: 4022 MB
node 2 cpus: 2
node 2 size: 4039 MB
node 2 free: 4023 MB
node 3 cpus: 3
node 3 size: 3975 MB
node 3 free: 3963 MB
node distances:
node   0   1   2   3
  0:  10  20  20  20
  1:  20  10  20  20
  2:  20  20  10  20
  3:  20  20  20  10

Current patchset is available at 
https://git.gitorious.org/xenvnuma/linuxvnuma.git

TODO
*   dom0 vnuma support;
*   multiple memory ranges per node support;

Elena Ufimtseva (2):
  xen: vnuma support for PV guests running as domU.
  xen: enable Xen vnuma for PV guest.

 arch/x86/include/asm/xen/vnuma.h |   12 
 arch/x86/mm/numa.c   |5 ++
 arch/x86/xen/Makefile|2 +-
 arch/x86/xen/setup.c |6 +-
 arch/x86/xen/vnuma.c |  119 ++
 include/xen/interface/memory.h   |   28 +
 6 files changed, 170 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] xen: enable vnuma for PV guest.

2013-11-13 Thread Elena Ufimtseva
Enables numa if vnuma topology hypercall is supported and it is domU.

Signed-off-by: Elena Ufimtseva ufimts...@gmail.com
---
 arch/x86/xen/setup.c |6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 09f3059..fe23ec2 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
 #include <asm/numa.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
+#include <asm/xen/vnuma.h>
 
 #include <xen/xen.h>
 #include <xen/page.h>
@@ -598,6 +599,9 @@ void __init xen_arch_setup(void)
WARN_ON(xen_set_default_idle());
fiddle_vdso();
 #ifdef CONFIG_NUMA
-   numa_off = 1;
+   if (!xen_initial_domain() && xen_vnuma_supported())
+   numa_off = 0;
+   else
+   numa_off = 1;
 #endif
 }
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] xen: vnuma support for PV guests running as domU.

2013-11-13 Thread Elena Ufimtseva
Issues Xen hypercall subop XENMEM_get_vnumainfo and sets the
NUMA topology, otherwise sets dummy NUMA node and prevents
numa_init from calling other numa initializators as they may
break other guests.

Signed-off-by: Elena Ufimtseva ufimts...@gmail.com
---
 arch/x86/include/asm/xen/vnuma.h |   12 
 arch/x86/mm/numa.c   |5 ++
 arch/x86/xen/Makefile|2 +-
 arch/x86/xen/vnuma.c |  119 ++
 include/xen/interface/memory.h   |   28 +
 5 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

diff --git a/arch/x86/include/asm/xen/vnuma.h b/arch/x86/include/asm/xen/vnuma.h
new file mode 100644
index 000..1ba1e06
--- /dev/null
+++ b/arch/x86/include/asm/xen/vnuma.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_VNUMA_H
+#define _ASM_X86_VNUMA_H
+
+#ifdef CONFIG_XEN
+int xen_vnuma_supported(void);
+int xen_numa_init(void);
+#else
+int xen_vnuma_supported(void) { return 0; };
+int xen_numa_init(void) { return -1; };
+#endif
+
+#endif /* _ASM_X86_VNUMA_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 8bf93ba..c8a61dc 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -19,6 +19,7 @@
 #include <asm/amd_nb.h>
 
 #include "numa_internal.h"
+#include "asm/xen/vnuma.h"
 
 int __initdata numa_off;
 nodemask_t numa_nodes_parsed __initdata;
@@ -621,6 +622,10 @@ static int __init dummy_numa_init(void)
 void __init x86_numa_init(void)
 {
if (!numa_off) {
+#ifdef CONFIG_XEN
+   if (xen_vnuma_supported() && !numa_init(xen_numa_init))
+   return;
+#endif
 #ifdef CONFIG_X86_NUMAQ
if (!numa_init(numaq_numa_init))
return;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c0..de9deab 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -13,7 +13,7 @@ CFLAGS_mmu.o  := $(nostackp)
 obj-y  := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \
-   p2m.o
+   p2m.o vnuma.o
 
 obj-$(CONFIG_EVENT_TRACING) += trace.o
 
diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
new file mode 100644
index 000..b4fc667
--- /dev/null
+++ b/arch/x86/xen/vnuma.c
@@ -0,0 +1,119 @@
+#include <linux/err.h>
+#include <linux/memblock.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/vnuma.h>
+
+#ifdef CONFIG_NUMA
+
+/* Checks if hypercall is suported */
+int xen_vnuma_supported()
+{
+   return HYPERVISOR_memory_op(XENMEM_get_vnuma_info, NULL) == -ENOSYS ? 0 
: 1;
+}
+
+int __init xen_numa_init(void)
+{
+   int rc;
+   unsigned int i, j, nr_nodes, cpu, idx, pcpus;
+   u64 physm, physd, physc;
+   unsigned int *vdistance, *cpu_to_node;
+   unsigned long mem_size, dist_size, cpu_to_node_size;
+   struct vmemrange *vblock;
+
+   struct vnuma_topology_info numa_topo = {
+   .domid = DOMID_SELF,
+   .__pad = 0
+   };
+   rc = -EINVAL;
+
+   /* For now only PV guests are supported */
+   if (!xen_pv_domain())
+   return rc;
+
+   pcpus = num_possible_cpus();
+
+   mem_size =  pcpus * sizeof(struct vmemrange);
+   dist_size = pcpus * pcpus * sizeof(*numa_topo.vdistance);
+   cpu_to_node_size = pcpus * sizeof(*numa_topo.cpu_to_node);
+
+   physm = memblock_alloc(mem_size, PAGE_SIZE);
+   vblock = __va(physm);
+
+   physd = memblock_alloc(dist_size, PAGE_SIZE);
+   vdistance  = __va(physd);
+
+   physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
+   cpu_to_node  = __va(physc);
+
+   if (!physm || !physc || !physd)
+   goto vnumaout;
+
+   set_xen_guest_handle(numa_topo.nr_nodes, &nr_nodes);
+   set_xen_guest_handle(numa_topo.vmemblks, vblock);
+   set_xen_guest_handle(numa_topo.vdistance, vdistance);
+   set_xen_guest_handle(numa_topo.cpu_to_node, cpu_to_node);
+
+   rc = HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo);
+
+   if (rc < 0)
+   goto vnumaout;
+   if (*numa_topo.nr_nodes == 0) {
+   /* will pass to dummy_numa_init */
+   goto vnumaout;
+   }
+   if (*numa_topo.nr_nodes > num_possible_cpus()) {
+   pr_debug("vNUMA: Node without cpu is not supported in this version.\n");
+   goto vnumaout;
+   }
+   /*
+* NUMA nodes memory ranges are in pfns, constructed and
+* aligned based on e820 ram domain map.
+*/
+   for (i = 0; i < *numa_topo.nr_nodes; i++) {
+   if (numa_add_memblk(i, vblock[i].start, vblock[i].end))
+   /* pass to numa_dummy_init */
+   goto vnumaout