[PATCH v3 2/2] mm: Introduce kernelcore=mirror option

2015-12-08 Thread Taku Izumi
This patch extends existing "kernelcore" option and
introduces kernelcore=mirror option. By specifying
"mirror" instead of specifying the amount of memory,
non-mirrored (non-reliable) region will be arranged
into ZONE_MOVABLE.

v1 -> v2:
 - Refine so that the following case also can be
   handled properly:

 Node X:  |MM--MM|
   (legend) M: mirrored  -: not mirrored

 In this case, ZONE_NORMAL and ZONE_MOVABLE are
 arranged like below:

 Node X:  |MM--MM|
  |ooxxoo| ZONE_NORMAL
|ooxx| ZONE_MOVABLE
   (legend) o: present  x: absent

v2 -> v3:
 - change the option name from kernelcore=reliable
   into kernelcore=mirror
 - documentation fix so that users can understand
   nn[KMS] and mirror are exclusive

Signed-off-by: Taku Izumi 
---
 Documentation/kernel-parameters.txt |  11 +++-
 mm/page_alloc.c | 110 ++--
 2 files changed, 114 insertions(+), 7 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index f8aae63..b0ffc76 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1695,7 +1695,8 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
 
keepinitrd  [HW,ARM]
 
-   kernelcore=nn[KMG]  [KNL,X86,IA-64,PPC] This parameter
+   kernelcore= Format: nn[KMG] | "mirror"
+   [KNL,X86,IA-64,PPC] This parameter
specifies the amount of memory usable by the kernel
for non-movable allocations.  The requested amount is
spread evenly throughout all nodes in the system. The
@@ -1711,6 +1712,14 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
use the HighMem zone if it exists, and the Normal
zone if it does not.
 
+   Instead of specifying the amount of memory (nn[KMG]),
+   you can specify "mirror" option. In case "mirror"
+   option is specified, mirrored (reliable) memory is used
+   for non-movable allocations and remaining memory is used
+   for Movable pages. nn[KMG] and "mirror" are exclusive,
+   so you can NOT specify nn[KMG] and "mirror" at the same
+   time.
+
kgdbdbgp=   [KGDB,HW] kgdb over EHCI usb debug port.
Format: <Controller#>[,poll interval]
The controller # is the number of the ehci usb debug
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index acb0b4e..4157476 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -251,6 +251,7 @@ static unsigned long __meminitdata 
arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool mirrored_kernelcore;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -4472,6 +4473,7 @@ void __meminit memmap_init_zone(unsigned long size, int 
nid, unsigned long zone,
unsigned long pfn;
struct zone *z;
unsigned long nr_initialised = 0;
+   struct memblock_region *r = NULL, *tmp;
 
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
@@ -4491,6 +4493,38 @@ void __meminit memmap_init_zone(unsigned long size, int 
nid, unsigned long zone,
if (!update_defer_init(pgdat, pfn, end_pfn,
&nr_initialised))
break;
+
+   /*
+* if not mirrored_kernelcore and ZONE_MOVABLE exists,
+* range from zone_movable_pfn[nid] to end of each node
+* should be ZONE_MOVABLE not ZONE_NORMAL. skip it.
+*/
+   if (!mirrored_kernelcore && zone_movable_pfn[nid])
+   if (zone == ZONE_NORMAL &&
+   pfn >= zone_movable_pfn[nid])
+   continue;
+
+   /*
+* check given memblock attribute by firmware which
+* can affect kernel memory layout.
+* if zone==ZONE_MOVABLE but memory is mirrored,
+* it's an overlapped memmap init. skip it.
+*/
+   if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+   if (!r ||
+

[PATCH v3 1/2] mm: Calculate zone_start_pfn at zone_spanned_pages_in_node()

2015-12-08 Thread Taku Izumi
Currently each zone's zone_start_pfn is calculated at
free_area_init_core(). However zone's range is fixed at
the time when invoking zone_spanned_pages_in_node().

This patch changes each zone->zone_start_pfn is
calculated at zone_spanned_pages_in_node().

Signed-off-by: Taku Izumi 
---
 mm/page_alloc.c | 30 +++---
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17a3c66..acb0b4e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4928,31 +4928,31 @@ static unsigned long __meminit 
zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+   unsigned long *zone_start_pfn,
+   unsigned long *zone_end_pfn,
unsigned long *ignored)
 {
-   unsigned long zone_start_pfn, zone_end_pfn;
-
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
 
/* Get the start and end of the zone */
-   zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
-   zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+   *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+   *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
-   &zone_start_pfn, &zone_end_pfn);
+   zone_start_pfn, zone_end_pfn);
 
/* Check that this node has pages within the zone's required range */
-   if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+   if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
return 0;
 
/* Move the zone boundaries inside the node if necessary */
-   zone_end_pfn = min(zone_end_pfn, node_end_pfn);
-   zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+   *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+   *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
 
/* Return the spanned pages */
-   return zone_end_pfn - zone_start_pfn;
+   return *zone_end_pfn - *zone_start_pfn;
 }
 
 /*
@@ -5017,6 +5017,8 @@ static inline unsigned long __meminit 
zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+   unsigned long *zone_start_pfn,
+   unsigned long *zone_end_pfn,
unsigned long *zones_size)
 {
return zones_size[zone_type];
@@ -5047,15 +5049,22 @@ static void __meminit calculate_node_totalpages(struct 
pglist_data *pgdat,
 
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
+   unsigned long zone_start_pfn, zone_end_pfn;
unsigned long size, real_size;
 
size = zone_spanned_pages_in_node(pgdat->node_id, i,
  node_start_pfn,
  node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn,
  zones_size);
real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
  node_start_pfn, node_end_pfn,
  zholes_size);
+   if (size)
+   zone->zone_start_pfn = zone_start_pfn;
+   else
+   zone->zone_start_pfn = 0;
zone->spanned_pages = size;
zone->present_pages = real_size;
 
@@ -5176,7 +5185,6 @@ static void __paginginit free_area_init_core(struct 
pglist_data *pgdat)
 {
enum zone_type j;
int nid = pgdat->node_id;
-   unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
 
pgdat_resize_init(pgdat);
@@ -5192,6 +5200,7 @@ static void __paginginit free_area_init_core(struct 
pglist_data *pgdat)
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
+   unsigned long zone_start_pfn = zone->zone_start_pfn;
 
size = zone->spanned_pages;
realsize = freesize = zone->present_pages

[PATCH v3 0/2] mm: Introduce kernelcore=mirror option

2015-12-08 Thread Taku Izumi
Xeon E7 v3 based systems support Address Range Mirroring,
and a UEFI BIOS compliant with UEFI spec 2.5 can report which
ranges are mirrored (reliable) via the EFI memory map.
The Linux kernel now utilizes this information and allocates
boot time memory from the reliable region.

My requirement is:
  - allocate kernel memory from mirrored region 
  - allocate user memory from non-mirrored region

In order to meet my requirement, ZONE_MOVABLE is useful.
By arranging non-mirrored range into ZONE_MOVABLE, 
mirrored memory is used for kernel allocations.

My idea is to extend the existing "kernelcore" option and
introduce a kernelcore=mirror option. By specifying
"mirror" instead of specifying the amount of memory,
non-mirrored region will be arranged into ZONE_MOVABLE.

Earlier discussions are at: 
 https://lkml.org/lkml/2015/10/9/24
 https://lkml.org/lkml/2015/10/15/9
 https://lkml.org/lkml/2015/11/27/18

For example, suppose 2-nodes system with the following memory
 range: 
  node 0 [mem 0x1000-0x00109fff] 
  node 1 [mem 0x0010a000-0x00209fff]

and the following ranges are marked as reliable (mirrored):
  [0x-0x0001] 
  [0x0001-0x00018000] 
  [0x0008-0x00088000] 
  [0x0010a000-0x00112000]
  [0x0017a000-0x00182000] 

If you specify kernelcore=mirror, ZONE_NORMAL and ZONE_MOVABLE
are arranged like below:

 - node 0:
  ZONE_NORMAL : [0x0001-0x0010a000]
  ZONE_MOVABLE: [0x00018000-0x0010a000]
 - node 1: 
  ZONE_NORMAL : [0x0010a000-0x0020a000]
  ZONE_MOVABLE: [0x00112000-0x0020a000] 

In overlapped range, pages to be ZONE_MOVABLE in ZONE_NORMAL
are treated as absent pages, and vice versa.

v1 -> v2:
 Refine so that the above example case also can be
 handled properly:
v2 -> v3:
 Change the option name from kernelcore=reliable
 into kernelcore=mirror and some documentation fix
 according to Andrew Morton's point

 
Taku Izumi (2):
  mm: Calculate zone_start_pfn at zone_spanned_pages_in_node()
  mm: Introduce kernelcore=mirror option

 Documentation/kernel-parameters.txt |  11 ++-
 mm/page_alloc.c | 140 +++-
 2 files changed, 133 insertions(+), 18 deletions(-)

-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 2/2] mm: Introduce kernelcore=mirror option

2015-12-08 Thread Taku Izumi
This patch extends existing "kernelcore" option and
introduces kernelcore=mirror option. By specifying
"mirror" instead of specifying the amount of memory,
non-mirrored (non-reliable) region will be arranged
into ZONE_MOVABLE.

v1 -> v2:
 - Refine so that the following case also can be
   handled properly:

 Node X:  |MM--MM|
   (legend) M: mirrored  -: not mirrored

 In this case, ZONE_NORMAL and ZONE_MOVABLE are
 arranged like below:

 Node X:  |MM--MM|
  |ooxxoo| ZONE_NORMAL
|ooxx| ZONE_MOVABLE
   (legend) o: present  x: absent

v2 -> v3:
 - change the option name from kernelcore=reliable
   into kernelcore=mirror
 - documentation fix so that users can understand
   nn[KMS] and mirror are exclusive

Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
---
 Documentation/kernel-parameters.txt |  11 +++-
 mm/page_alloc.c | 110 ++--
 2 files changed, 114 insertions(+), 7 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index f8aae63..b0ffc76 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1695,7 +1695,8 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
 
keepinitrd  [HW,ARM]
 
-   kernelcore=nn[KMG]  [KNL,X86,IA-64,PPC] This parameter
+   kernelcore= Format: nn[KMG] | "mirror"
+   [KNL,X86,IA-64,PPC] This parameter
specifies the amount of memory usable by the kernel
for non-movable allocations.  The requested amount is
spread evenly throughout all nodes in the system. The
@@ -1711,6 +1712,14 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
use the HighMem zone if it exists, and the Normal
zone if it does not.
 
+   Instead of specifying the amount of memory (nn[KMG]),
+   you can specify "mirror" option. In case "mirror"
+   option is specified, mirrored (reliable) memory is used
+   for non-movable allocations and remaining memory is used
+   for Movable pages. nn[KMG] and "mirror" are exclusive,
+   so you can NOT specify nn[KMG] and "mirror" at the same
+   time.
+
kgdbdbgp=   [KGDB,HW] kgdb over EHCI usb debug port.
Format: <Controller#>[,poll interval]
The controller # is the number of the ehci usb debug
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index acb0b4e..4157476 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -251,6 +251,7 @@ static unsigned long __meminitdata 
arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool mirrored_kernelcore;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -4472,6 +4473,7 @@ void __meminit memmap_init_zone(unsigned long size, int 
nid, unsigned long zone,
unsigned long pfn;
struct zone *z;
unsigned long nr_initialised = 0;
+   struct memblock_region *r = NULL, *tmp;
 
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
@@ -4491,6 +4493,38 @@ void __meminit memmap_init_zone(unsigned long size, int 
nid, unsigned long zone,
if (!update_defer_init(pgdat, pfn, end_pfn,
&nr_initialised))
break;
+
+   /*
+* if not mirrored_kernelcore and ZONE_MOVABLE exists,
+* range from zone_movable_pfn[nid] to end of each node
+* should be ZONE_MOVABLE not ZONE_NORMAL. skip it.
+*/
+   if (!mirrored_kernelcore && zone_movable_pfn[nid])
+   if (zone == ZONE_NORMAL &&
+   pfn >= zone_movable_pfn[nid])
+   continue;
+
+   /*
+* check given memblock attribute by firmware which
+* can affect kernel memory layout.
+* if zone==ZONE_MOVABLE but memory is mirrored,
+* it's an overlapped memmap init. skip it.
+*/
+   if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+   if (!r ||
+

[PATCH v3 0/2] mm: Introduce kernelcore=mirror option

2015-12-08 Thread Taku Izumi
Xeon E7 v3 based systems support Address Range Mirroring,
and a UEFI BIOS compliant with UEFI spec 2.5 can report which
ranges are mirrored (reliable) via the EFI memory map.
The Linux kernel now utilizes this information and allocates
boot time memory from the reliable region.

My requirement is:
  - allocate kernel memory from mirrored region 
  - allocate user memory from non-mirrored region

In order to meet my requirement, ZONE_MOVABLE is useful.
By arranging non-mirrored range into ZONE_MOVABLE, 
mirrored memory is used for kernel allocations.

My idea is to extend the existing "kernelcore" option and
introduce a kernelcore=mirror option. By specifying
"mirror" instead of specifying the amount of memory,
non-mirrored region will be arranged into ZONE_MOVABLE.

Earlier discussions are at: 
 https://lkml.org/lkml/2015/10/9/24
 https://lkml.org/lkml/2015/10/15/9
 https://lkml.org/lkml/2015/11/27/18

For example, suppose 2-nodes system with the following memory
 range: 
  node 0 [mem 0x1000-0x00109fff] 
  node 1 [mem 0x0010a000-0x00209fff]

and the following ranges are marked as reliable (mirrored):
  [0x-0x0001] 
  [0x0001-0x00018000] 
  [0x0008-0x00088000] 
  [0x0010a000-0x00112000]
  [0x0017a000-0x00182000] 

If you specify kernelcore=mirror, ZONE_NORMAL and ZONE_MOVABLE
are arranged like below:

 - node 0:
  ZONE_NORMAL : [0x0001-0x0010a000]
  ZONE_MOVABLE: [0x00018000-0x0010a000]
 - node 1: 
  ZONE_NORMAL : [0x0010a000-0x0020a000]
  ZONE_MOVABLE: [0x00112000-0x0020a000] 

In overlapped range, pages to be ZONE_MOVABLE in ZONE_NORMAL
are treated as absent pages, and vice versa.

v1 -> v2:
 Refine so that the above example case also can be
 handled properly:
v2 -> v3:
 Change the option name from kernelcore=reliable
 into kernelcore=mirror and some documentation fix
 according to Andrew Morton's point

 
Taku Izumi (2):
  mm: Calculate zone_start_pfn at zone_spanned_pages_in_node()
  mm: Introduce kernelcore=mirror option

 Documentation/kernel-parameters.txt |  11 ++-
 mm/page_alloc.c | 140 +++-
 2 files changed, 133 insertions(+), 18 deletions(-)

-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 1/2] mm: Calculate zone_start_pfn at zone_spanned_pages_in_node()

2015-12-08 Thread Taku Izumi
Currently each zone's zone_start_pfn is calculated at
free_area_init_core(). However zone's range is fixed at
the time when invoking zone_spanned_pages_in_node().

This patch changes each zone->zone_start_pfn is
calculated at zone_spanned_pages_in_node().

Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
---
 mm/page_alloc.c | 30 +++---
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17a3c66..acb0b4e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4928,31 +4928,31 @@ static unsigned long __meminit 
zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+   unsigned long *zone_start_pfn,
+   unsigned long *zone_end_pfn,
unsigned long *ignored)
 {
-   unsigned long zone_start_pfn, zone_end_pfn;
-
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
 
/* Get the start and end of the zone */
-   zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
-   zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+   *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+   *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
-   &zone_start_pfn, &zone_end_pfn);
+   zone_start_pfn, zone_end_pfn);
 
/* Check that this node has pages within the zone's required range */
-   if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+   if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
return 0;
 
/* Move the zone boundaries inside the node if necessary */
-   zone_end_pfn = min(zone_end_pfn, node_end_pfn);
-   zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+   *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+   *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
 
/* Return the spanned pages */
-   return zone_end_pfn - zone_start_pfn;
+   return *zone_end_pfn - *zone_start_pfn;
 }
 
 /*
@@ -5017,6 +5017,8 @@ static inline unsigned long __meminit 
zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+   unsigned long *zone_start_pfn,
+   unsigned long *zone_end_pfn,
unsigned long *zones_size)
 {
return zones_size[zone_type];
@@ -5047,15 +5049,22 @@ static void __meminit calculate_node_totalpages(struct 
pglist_data *pgdat,
 
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
+   unsigned long zone_start_pfn, zone_end_pfn;
unsigned long size, real_size;
 
size = zone_spanned_pages_in_node(pgdat->node_id, i,
  node_start_pfn,
  node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn,
  zones_size);
real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
  node_start_pfn, node_end_pfn,
  zholes_size);
+   if (size)
+   zone->zone_start_pfn = zone_start_pfn;
+   else
+   zone->zone_start_pfn = 0;
zone->spanned_pages = size;
zone->present_pages = real_size;
 
@@ -5176,7 +5185,6 @@ static void __paginginit free_area_init_core(struct 
pglist_data *pgdat)
 {
enum zone_type j;
int nid = pgdat->node_id;
-   unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
 
pgdat_resize_init(pgdat);
@@ -5192,6 +5200,7 @@ static void __paginginit free_area_init_core(struct 
pglist_data *pgdat)
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
+   unsigned long zone_start_pfn = zone->zone_start_pfn;
 
size = zone->spanned_pages;
reals

[PATCH v2 0/2] mm: Introduce kernelcore=reliable option

2015-11-26 Thread Taku Izumi
Xeon E7 v3 based systems support Address Range Mirroring,
and a UEFI BIOS compliant with UEFI spec 2.5 can report which
ranges are reliable (mirrored) via the EFI memory map.
The Linux kernel now utilizes this information and allocates
boot time memory from the reliable region.

My requirement is:
  - allocate kernel memory from reliable region
  - allocate user memory from non-reliable region

In order to meet my requirement, ZONE_MOVABLE is useful.
By arranging non-reliable range into ZONE_MOVABLE,
reliable memory is only used for kernel allocations.

My idea is to extend the existing "kernelcore" option and
introduce a kernelcore=reliable option. By specifying
"reliable" instead of specifying the amount of memory,
non-reliable region will be arranged into ZONE_MOVABLE.

Earlier discussions are at:
 https://lkml.org/lkml/2015/10/9/24
 https://lkml.org/lkml/2015/10/15/9

For example, suppose 2-nodes system with the following memory
 range:
  node 0 [mem 0x1000-0x00109fff]
  node 1 [mem 0x0010a000-0x00209fff]

and the following ranges are marked as reliable:
  [0x-0x0001]
  [0x0001-0x00018000]
  [0x0008-0x00088000]
  [0x0010a000-0x00112000]
  [0x0017a000-0x00182000]

If you specify kernelcore=reliable, ZONE_NORMAL and ZONE_MOVABLE
are arranged like below:

 - node 0:
  ZONE_NORMAL : [0x0001-0x0010a000]
  ZONE_MOVABLE: [0x00018000-0x0010a000]
 - node 1:
  ZONE_NORMAL : [0x0010a000-0x0020a000]
  ZONE_MOVABLE: [0x00112000-0x0020a000]

In overlapped range, pages to be ZONE_MOVABLE in ZONE_NORMAL
are treated as absent pages, and vice versa.

v1 -> v2:
 Refine so that the above example case also can be
 handled properly:


Taku Izumi (2):
  mm: Calculate zone_start_pfn at zone_spanned_pages_in_node()
  mm: Introduce kernelcore=reliable option

 Documentation/kernel-parameters.txt |   9 ++-
 mm/page_alloc.c | 140 +++-
 2 files changed, 131 insertions(+), 18 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 1/2] mm: Calculate zone_start_pfn at zone_spanned_pages_in_node()

2015-11-26 Thread Taku Izumi
Currently each zone's zone_start_pfn is calculated at
free_area_init_core(). However zone's range is fixed at
the time when invoking zone_spanned_pages_in_node().

This patch changes each zone->zone_start_pfn is
calculated at zone_spanned_pages_in_node().

Signed-off-by: Taku Izumi 
---
 mm/page_alloc.c | 30 +++---
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17a3c66..acb0b4e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4928,31 +4928,31 @@ static unsigned long __meminit 
zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+   unsigned long *zone_start_pfn,
+   unsigned long *zone_end_pfn,
unsigned long *ignored)
 {
-   unsigned long zone_start_pfn, zone_end_pfn;
-
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
 
/* Get the start and end of the zone */
-   zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
-   zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+   *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+   *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
-   &zone_start_pfn, &zone_end_pfn);
+   zone_start_pfn, zone_end_pfn);
 
/* Check that this node has pages within the zone's required range */
-   if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+   if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
return 0;
 
/* Move the zone boundaries inside the node if necessary */
-   zone_end_pfn = min(zone_end_pfn, node_end_pfn);
-   zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+   *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+   *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
 
/* Return the spanned pages */
-   return zone_end_pfn - zone_start_pfn;
+   return *zone_end_pfn - *zone_start_pfn;
 }
 
 /*
@@ -5017,6 +5017,8 @@ static inline unsigned long __meminit 
zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+   unsigned long *zone_start_pfn,
+   unsigned long *zone_end_pfn,
unsigned long *zones_size)
 {
return zones_size[zone_type];
@@ -5047,15 +5049,22 @@ static void __meminit calculate_node_totalpages(struct 
pglist_data *pgdat,
 
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
+   unsigned long zone_start_pfn, zone_end_pfn;
unsigned long size, real_size;
 
size = zone_spanned_pages_in_node(pgdat->node_id, i,
  node_start_pfn,
  node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn,
  zones_size);
real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
  node_start_pfn, node_end_pfn,
  zholes_size);
+   if (size)
+   zone->zone_start_pfn = zone_start_pfn;
+   else
+   zone->zone_start_pfn = 0;
zone->spanned_pages = size;
zone->present_pages = real_size;
 
@@ -5176,7 +5185,6 @@ static void __paginginit free_area_init_core(struct 
pglist_data *pgdat)
 {
enum zone_type j;
int nid = pgdat->node_id;
-   unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
 
pgdat_resize_init(pgdat);
@@ -5192,6 +5200,7 @@ static void __paginginit free_area_init_core(struct 
pglist_data *pgdat)
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
+   unsigned long zone_start_pfn = zone->zone_start_pfn;
 
size = zone->spanned_pages;
realsize = freesize = zone->present_pages

[PATCH v2 2/2] mm: Introduce kernelcore=reliable option

2015-11-26 Thread Taku Izumi
This patch extends existing "kernelcore" option and
introduces kernelcore=reliable option. By specifying
"reliable" instead of specifying the amount of memory,
non-reliable region will be arranged into ZONE_MOVABLE.

v1 -> v2:
 - Refine so that the following case also can be
   handled properly:

 Node X:  |MM--MM|
   (legend) M: mirrored  -: not mirrored

 In this case, ZONE_NORMAL and ZONE_MOVABLE are
 arranged like below:

 Node X:  |--|
  |ooxxoo| ZONE_NORMAL
|ooxx| ZONE_MOVABLE
   (legend) o: present  x: absent

Signed-off-by: Taku Izumi 
---
 Documentation/kernel-parameters.txt |   9 ++-
 mm/page_alloc.c | 110 ++--
 2 files changed, 112 insertions(+), 7 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index f8aae63..ed44c2c8 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1695,7 +1695,8 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
 
keepinitrd  [HW,ARM]
 
-   kernelcore=nn[KMG]  [KNL,X86,IA-64,PPC] This parameter
+   kernelcore= Format: nn[KMG] | "reliable"
+   [KNL,X86,IA-64,PPC] This parameter
specifies the amount of memory usable by the kernel
for non-movable allocations.  The requested amount is
spread evenly throughout all nodes in the system. The
@@ -1711,6 +1712,12 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
use the HighMem zone if it exists, and the Normal
zone if it does not.
 
+   Instead of specifying the amount of memory (nn[KMG]),
+   you can specify "reliable" option. In case "reliable"
+   option is specified, reliable memory is used for
+   non-movable allocations and remaining memory is used
+   for Movable pages.
+
kgdbdbgp=   [KGDB,HW] kgdb over EHCI usb debug port.
Format: <Controller#>[,poll interval]
The controller # is the number of the ehci usb debug
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index acb0b4e..006a3d8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -251,6 +251,7 @@ static unsigned long __meminitdata 
arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool reliable_kernelcore;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -4472,6 +4473,7 @@ void __meminit memmap_init_zone(unsigned long size, int 
nid, unsigned long zone,
unsigned long pfn;
struct zone *z;
unsigned long nr_initialised = 0;
+   struct memblock_region *r = NULL, *tmp;
 
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
@@ -4491,6 +4493,38 @@ void __meminit memmap_init_zone(unsigned long size, int 
nid, unsigned long zone,
if (!update_defer_init(pgdat, pfn, end_pfn,
&nr_initialised))
break;
+
+   /*
+* if not reliable_kernelcore and ZONE_MOVABLE exists,
+* range from zone_movable_pfn[nid] to end of each node
+* should be ZONE_MOVABLE not ZONE_NORMAL. skip it.
+*/
+   if (!reliable_kernelcore && zone_movable_pfn[nid])
+   if (zone == ZONE_NORMAL &&
+   pfn >= zone_movable_pfn[nid])
+   continue;
+
+   /*
+* check given memblock attribute by firmware which
+* can affect kernel memory layout.
+* if zone==ZONE_MOVABLE but memory is mirrored,
+* it's an overlapped memmap init. skip it.
+*/
+   if (reliable_kernelcore && zone == ZONE_MOVABLE) {
+   if (!r ||
+   pfn >= memblock_region_memory_end_pfn(r)) {
+   for_each_memblock(memory, tmp)
+   if (pfn < 
memblock_region_memory_end_pfn(tmp))
+   break;
+   r = tmp;
+  

[PATCH v2 0/2] mm: Introduce kernelcore=reliable option

2015-11-26 Thread Taku Izumi
Xeon E7 v3 based systems support Address Range Mirroring,
and a UEFI BIOS compliant with UEFI spec 2.5 can report which
ranges are reliable (mirrored) via the EFI memory map.
The Linux kernel now utilizes this information and allocates
boot time memory from the reliable region.

My requirement is:
  - allocate kernel memory from reliable region
  - allocate user memory from non-reliable region

In order to meet my requirement, ZONE_MOVABLE is useful.
By arranging non-reliable range into ZONE_MOVABLE,
reliable memory is only used for kernel allocations.

My idea is to extend the existing "kernelcore" option and
introduce a kernelcore=reliable option. By specifying
"reliable" instead of specifying the amount of memory,
non-reliable region will be arranged into ZONE_MOVABLE.

Earlier discussions are at:
 https://lkml.org/lkml/2015/10/9/24
 https://lkml.org/lkml/2015/10/15/9

For example, suppose 2-nodes system with the following memory
 range:
  node 0 [mem 0x1000-0x00109fff]
  node 1 [mem 0x0010a000-0x00209fff]

and the following ranges are marked as reliable:
  [0x-0x0001]
  [0x0001-0x00018000]
  [0x0008-0x00088000]
  [0x0010a000-0x00112000]
  [0x0017a000-0x00182000]

If you specify kernelcore=reliable, ZONE_NORMAL and ZONE_MOVABLE
are arranged like below:

 - node 0:
  ZONE_NORMAL : [0x0001-0x0010a000]
  ZONE_MOVABLE: [0x00018000-0x0010a000]
 - node 1:
  ZONE_NORMAL : [0x0010a000-0x0020a000]
  ZONE_MOVABLE: [0x00112000-0x0020a000]

In overlapped range, pages to be ZONE_MOVABLE in ZONE_NORMAL
are treated as absent pages, and vice versa.

v1 -> v2:
 Refine so that the above example case also can be
 handled properly:


Taku Izumi (2):
  mm: Calculate zone_start_pfn at zone_spanned_pages_in_node()
  mm: Introduce kernelcore=reliable option

 Documentation/kernel-parameters.txt |   9 ++-
 mm/page_alloc.c | 140 +++-
 2 files changed, 131 insertions(+), 18 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 1/2] mm: Calculate zone_start_pfn at zone_spanned_pages_in_node()

2015-11-26 Thread Taku Izumi
Currently each zone's zone_start_pfn is calculated at
free_area_init_core(). However zone's range is fixed at
the time when invoking zone_spanned_pages_in_node().

This patch changes each zone->zone_start_pfn is
calculated at zone_spanned_pages_in_node().

Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
---
 mm/page_alloc.c | 30 +++---
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17a3c66..acb0b4e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4928,31 +4928,31 @@ static unsigned long __meminit 
zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+   unsigned long *zone_start_pfn,
+   unsigned long *zone_end_pfn,
unsigned long *ignored)
 {
-   unsigned long zone_start_pfn, zone_end_pfn;
-
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
 
/* Get the start and end of the zone */
-   zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
-   zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+   *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+   *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
-   &zone_start_pfn, &zone_end_pfn);
+   zone_start_pfn, zone_end_pfn);
 
/* Check that this node has pages within the zone's required range */
-   if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+   if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
return 0;
 
/* Move the zone boundaries inside the node if necessary */
-   zone_end_pfn = min(zone_end_pfn, node_end_pfn);
-   zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+   *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+   *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
 
/* Return the spanned pages */
-   return zone_end_pfn - zone_start_pfn;
+   return *zone_end_pfn - *zone_start_pfn;
 }
 
 /*
@@ -5017,6 +5017,8 @@ static inline unsigned long __meminit 
zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+   unsigned long *zone_start_pfn,
+   unsigned long *zone_end_pfn,
unsigned long *zones_size)
 {
return zones_size[zone_type];
@@ -5047,15 +5049,22 @@ static void __meminit calculate_node_totalpages(struct 
pglist_data *pgdat,
 
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
+   unsigned long zone_start_pfn, zone_end_pfn;
unsigned long size, real_size;
 
size = zone_spanned_pages_in_node(pgdat->node_id, i,
  node_start_pfn,
  node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn,
  zones_size);
real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
  node_start_pfn, node_end_pfn,
  zholes_size);
+   if (size)
+   zone->zone_start_pfn = zone_start_pfn;
+   else
+   zone->zone_start_pfn = 0;
zone->spanned_pages = size;
zone->present_pages = real_size;
 
@@ -5176,7 +5185,6 @@ static void __paginginit free_area_init_core(struct 
pglist_data *pgdat)
 {
enum zone_type j;
int nid = pgdat->node_id;
-   unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
 
pgdat_resize_init(pgdat);
@@ -5192,6 +5200,7 @@ static void __paginginit free_area_init_core(struct 
pglist_data *pgdat)
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
+   unsigned long zone_start_pfn = zone->zone_start_pfn;
 
size = zone->spanned_pages;
reals

[PATCH v2 2/2] mm: Introduce kernelcore=reliable option

2015-11-26 Thread Taku Izumi
This patch extends existing "kernelcore" option and
introduces kernelcore=reliable option. By specifying
"reliable" instead of specifying the amount of memory,
non-reliable region will be arranged into ZONE_MOVABLE.

v1 -> v2:
 - Refine so that the following case also can be
   handled properly:

 Node X:  |MM--MM|
   (legend) M: mirrored  -: not mirrored

 In this case, ZONE_NORMAL and ZONE_MOVABLE are
 arranged like below:

 Node X:  |MM--MM|
  |ooxxoo| ZONE_NORMAL
|ooxx| ZONE_MOVABLE
   (legend) o: present  x: absent

Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
---
 Documentation/kernel-parameters.txt |   9 ++-
 mm/page_alloc.c | 110 ++--
 2 files changed, 112 insertions(+), 7 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index f8aae63..ed44c2c8 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1695,7 +1695,8 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
 
keepinitrd  [HW,ARM]
 
-   kernelcore=nn[KMG]  [KNL,X86,IA-64,PPC] This parameter
+   kernelcore= Format: nn[KMG] | "reliable"
+   [KNL,X86,IA-64,PPC] This parameter
specifies the amount of memory usable by the kernel
for non-movable allocations.  The requested amount is
spread evenly throughout all nodes in the system. The
@@ -1711,6 +1712,12 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
use the HighMem zone if it exists, and the Normal
zone if it does not.
 
+   Instead of specifying the amount of memory (nn[KMG]),
+   you can specify "reliable" option. In case "reliable"
+   option is specified, reliable memory is used for
+   non-movable allocations and remaining memory is used
+   for Movable pages.
+
kgdbdbgp=   [KGDB,HW] kgdb over EHCI usb debug port.
Format: <Controller#>[,poll interval]
The controller # is the number of the ehci usb debug
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index acb0b4e..006a3d8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -251,6 +251,7 @@ static unsigned long __meminitdata 
arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool reliable_kernelcore;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -4472,6 +4473,7 @@ void __meminit memmap_init_zone(unsigned long size, int 
nid, unsigned long zone,
unsigned long pfn;
struct zone *z;
unsigned long nr_initialised = 0;
+   struct memblock_region *r = NULL, *tmp;
 
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
@@ -4491,6 +4493,38 @@ void __meminit memmap_init_zone(unsigned long size, int 
nid, unsigned long zone,
if (!update_defer_init(pgdat, pfn, end_pfn,
&nr_initialised))
break;
+
+   /*
+* if not reliable_kernelcore and ZONE_MOVABLE exists,
+* range from zone_movable_pfn[nid] to end of each node
+* should be ZONE_MOVABLE not ZONE_NORMAL. skip it.
+*/
+   if (!reliable_kernelcore && zone_movable_pfn[nid])
+   if (zone == ZONE_NORMAL &&
+   pfn >= zone_movable_pfn[nid])
+   continue;
+
+   /*
+* check given memblock attribute by firmware which
+* can affect kernel memory layout.
+* if zone==ZONE_MOVABLE but memory is mirrored,
+* it's an overlapped memmap init. skip it.
+*/
+   if (reliable_kernelcore && zone == ZONE_MOVABLE) {
+   if (!r ||
+   pfn >= memblock_region_memory_end_pfn(r)) {
+   for_each_memblock(memory, tmp)
+   if (pfn < 
memblock_region_memory_end_pfn(tmp))
+   break;
+   r = tmp;

[tip:core/efi] efi: Fix warning of int-to-pointer-cast on x86 32-bit builds

2015-10-28 Thread tip-bot for Taku Izumi
Commit-ID:  78b9bc947b18ed16b6c2c573d774e6d54ad9452d
Gitweb: http://git.kernel.org/tip/78b9bc947b18ed16b6c2c573d774e6d54ad9452d
Author: Taku Izumi 
AuthorDate: Fri, 23 Oct 2015 11:48:17 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 28 Oct 2015 12:28:06 +0100

efi: Fix warning of int-to-pointer-cast on x86 32-bit builds

Commit:

  0f96a99dab36 ("efi: Add "efi_fake_mem" boot option")

introduced the following warning message:

  drivers/firmware/efi/fake_mem.c:186:20: warning: cast to pointer from integer 
of different size [-Wint-to-pointer-cast]

new_memmap_phy was defined as a u64 value and cast to void*,
causing an int-to-pointer-cast warning on x86 32-bit builds.
However, since the void* type is inappropriate for a physical
address, the definition of struct efi_memory_map::phys_map has
been changed to phys_addr_t in the previous patch, and so the
cast can be dropped entirely.

This patch also changes the type of the "new_memmap_phy"
variable from "u64" to "phys_addr_t" to align with the types of
memblock_alloc() and struct efi_memory_map::phys_map.

Reported-by: Ingo Molnar 
Signed-off-by: Taku Izumi 
[ Removed void* cast, updated commit log]
Signed-off-by: Ard Biesheuvel 
Reviewed-by: Matt Fleming 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: kamezawa.hir...@jp.fujitsu.com
Cc: linux-...@vger.kernel.org
Cc: matt.flem...@intel.com
Link: 
http://lkml.kernel.org/r/1445593697-1342-2-git-send-email-ard.biesheu...@linaro.org
Signed-off-by: Ingo Molnar 
---
 drivers/firmware/efi/fake_mem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c
index 32bcb14..ed3a854 100644
--- a/drivers/firmware/efi/fake_mem.c
+++ b/drivers/firmware/efi/fake_mem.c
@@ -59,7 +59,7 @@ void __init efi_fake_memmap(void)
u64 start, end, m_start, m_end, m_attr;
int new_nr_map = memmap.nr_map;
efi_memory_desc_t *md;
-   u64 new_memmap_phy;
+   phys_addr_t new_memmap_phy;
void *new_memmap;
void *old, *new;
int i;
@@ -183,7 +183,7 @@ void __init efi_fake_memmap(void)
/* swap into new EFI memmap */
efi_unmap_memmap();
memmap.map = new_memmap;
-   memmap.phys_map = (void *)new_memmap_phy;
+   memmap.phys_map = new_memmap_phy;
memmap.nr_map = new_nr_map;
memmap.map_end = memmap.map + memmap.nr_map * memmap.desc_size;
set_bit(EFI_MEMMAP, &efi.flags);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[tip:core/efi] efi: Fix warning of int-to-pointer-cast on x86 32-bit builds

2015-10-28 Thread tip-bot for Taku Izumi
Commit-ID:  78b9bc947b18ed16b6c2c573d774e6d54ad9452d
Gitweb: http://git.kernel.org/tip/78b9bc947b18ed16b6c2c573d774e6d54ad9452d
Author: Taku Izumi <izumi.t...@jp.fujitsu.com>
AuthorDate: Fri, 23 Oct 2015 11:48:17 +0200
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Wed, 28 Oct 2015 12:28:06 +0100

efi: Fix warning of int-to-pointer-cast on x86 32-bit builds

Commit:

  0f96a99dab36 ("efi: Add "efi_fake_mem" boot option")

introduced the following warning message:

  drivers/firmware/efi/fake_mem.c:186:20: warning: cast to pointer from integer 
of different size [-Wint-to-pointer-cast]

new_memmap_phy was defined as a u64 value and cast to void*,
causing an int-to-pointer-cast warning on x86 32-bit builds.
However, since the void* type is inappropriate for a physical
address, the definition of struct efi_memory_map::phys_map has
been changed to phys_addr_t in the previous patch, and so the
cast can be dropped entirely.

This patch also changes the type of the "new_memmap_phy"
variable from "u64" to "phys_addr_t" to align with the types of
memblock_alloc() and struct efi_memory_map::phys_map.

Reported-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
[ Removed void* cast, updated commit log]
Signed-off-by: Ard Biesheuvel <ard.biesheu...@linaro.org>
Reviewed-by: Matt Fleming <m...@codeblueprint.co.uk>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: kamezawa.hir...@jp.fujitsu.com
Cc: linux-...@vger.kernel.org
Cc: matt.flem...@intel.com
Link: 
http://lkml.kernel.org/r/1445593697-1342-2-git-send-email-ard.biesheu...@linaro.org
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 drivers/firmware/efi/fake_mem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c
index 32bcb14..ed3a854 100644
--- a/drivers/firmware/efi/fake_mem.c
+++ b/drivers/firmware/efi/fake_mem.c
@@ -59,7 +59,7 @@ void __init efi_fake_memmap(void)
u64 start, end, m_start, m_end, m_attr;
int new_nr_map = memmap.nr_map;
efi_memory_desc_t *md;
-   u64 new_memmap_phy;
+   phys_addr_t new_memmap_phy;
void *new_memmap;
void *old, *new;
int i;
@@ -183,7 +183,7 @@ void __init efi_fake_memmap(void)
/* swap into new EFI memmap */
efi_unmap_memmap();
memmap.map = new_memmap;
-   memmap.phys_map = (void *)new_memmap_phy;
+   memmap.phys_map = new_memmap_phy;
memmap.nr_map = new_nr_map;
memmap.map_end = memmap.map + memmap.nr_map * memmap.desc_size;
set_bit(EFI_MEMMAP, &efi.flags);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] efi: Fix warning of int-to-pointer-cast on x86 32-bit builds

2015-10-22 Thread Taku Izumi
commit-0f96a99 introduces the following warning message:

  drivers/firmware/efi/fake_mem.c:186:20: warning: cast to pointer
  from integer of different size [-Wint-to-pointer-cast]

new_memmap_phy was defined as a u64 value and cast to void*.
This causes an int-to-pointer-cast warning in x86 32-bit
environments.

This patch changes the type of "new_memmap_phy" variable
from "u64" into "ulong" to avoid it.

v1 -> v2:
 - change the type of "new_memmap_phy" from phys_addr_t
   into ulong according to Ard's comment

Reported-by: Ingo Molnar 
Signed-off-by: Taku Izumi 
---
 drivers/firmware/efi/fake_mem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c
index 32bcb14..1f483b4 100644
--- a/drivers/firmware/efi/fake_mem.c
+++ b/drivers/firmware/efi/fake_mem.c
@@ -59,7 +59,7 @@ void __init efi_fake_memmap(void)
u64 start, end, m_start, m_end, m_attr;
int new_nr_map = memmap.nr_map;
efi_memory_desc_t *md;
-   u64 new_memmap_phy;
+   ulong new_memmap_phy;
void *new_memmap;
void *old, *new;
int i;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] efi: Fix warning of int-to-pointer-cast on x86 32-bit builds

2015-10-22 Thread Taku Izumi
commit-0f96a99 introduces the following warning message:

  drivers/firmware/efi/fake_mem.c:186:20: warning: cast to pointer
  from integer of different size [-Wint-to-pointer-cast]

new_memmap_phy was defined as a u64 value and cast to void*.
This causes an int-to-pointer-cast warning in x86 32-bit
environments.

This patch changes the type of "new_memmap_phy" variable
from "u64" into "phys_addr_t" to avoid it.

Reported-by: Ingo Molnar 
Signed-off-by: Taku Izumi 
---
 drivers/firmware/efi/fake_mem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c
index 32bcb14..b65bc07 100644
--- a/drivers/firmware/efi/fake_mem.c
+++ b/drivers/firmware/efi/fake_mem.c
@@ -59,7 +59,7 @@ void __init efi_fake_memmap(void)
u64 start, end, m_start, m_end, m_attr;
int new_nr_map = memmap.nr_map;
efi_memory_desc_t *md;
-   u64 new_memmap_phy;
+   phys_addr_t new_memmap_phy;
void *new_memmap;
void *old, *new;
int i;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] efi: Fix warning of int-to-pointer-cast on x86 32-bit builds

2015-10-22 Thread Taku Izumi
commit-0f96a99 introduces the following warning message:

  drivers/firmware/efi/fake_mem.c:186:20: warning: cast to pointer
  from integer of different size [-Wint-to-pointer-cast]

new_memmap_phy was defined as a u64 value and cast to void*.
This causes an int-to-pointer-cast warning in x86 32-bit
environments.

This patch changes the type of "new_memmap_phy" variable
from "u64" into "phys_addr_t" to avoid it.

Reported-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
---
 drivers/firmware/efi/fake_mem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c
index 32bcb14..b65bc07 100644
--- a/drivers/firmware/efi/fake_mem.c
+++ b/drivers/firmware/efi/fake_mem.c
@@ -59,7 +59,7 @@ void __init efi_fake_memmap(void)
u64 start, end, m_start, m_end, m_attr;
int new_nr_map = memmap.nr_map;
efi_memory_desc_t *md;
-   u64 new_memmap_phy;
+   phys_addr_t new_memmap_phy;
void *new_memmap;
void *old, *new;
int i;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] efi: Fix warning of int-to-pointer-cast on x86 32-bit builds

2015-10-22 Thread Taku Izumi
commit-0f96a99 introduces the following warning message:

  drivers/firmware/efi/fake_mem.c:186:20: warning: cast to pointer
  from integer of different size [-Wint-to-pointer-cast]

new_memmap_phy was defined as a u64 value and cast to void*.
This causes an int-to-pointer-cast warning in x86 32-bit
environments.

This patch changes the type of "new_memmap_phy" variable
from "u64" into "ulong" to avoid it.

v1 -> v2:
 - change the type of "new_memmap_phy" from phys_addr_t
   into ulong according to Ard's comment

Reported-by: Ingo Molnar <mi...@kernel.org>
Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
---
 drivers/firmware/efi/fake_mem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c
index 32bcb14..1f483b4 100644
--- a/drivers/firmware/efi/fake_mem.c
+++ b/drivers/firmware/efi/fake_mem.c
@@ -59,7 +59,7 @@ void __init efi_fake_memmap(void)
u64 start, end, m_start, m_end, m_attr;
int new_nr_map = memmap.nr_map;
efi_memory_desc_t *md;
-   u64 new_memmap_phy;
+   ulong new_memmap_phy;
void *new_memmap;
void *old, *new;
int i;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] mm: Introduce kernelcore=reliable option

2015-10-14 Thread Taku Izumi
Xeon E7 v3 based systems support Address Range Mirroring,
and a UEFI BIOS compliant with UEFI spec 2.5 can report which
ranges are reliable (mirrored) via the EFI memory map.
The Linux kernel now uses this information and allocates
boot-time memory from the reliable region.

My requirement is:
  - allocate kernel memory from reliable region
  - allocate user memory from non-reliable region

In order to meet my requirement, ZONE_MOVABLE is useful.
By arranging non-reliable range into ZONE_MOVABLE,
reliable memory is only used for kernel allocations.

This patch extends existing "kernelcore" option and
introduces kernelcore=reliable option. By specifying
"reliable" instead of specifying the amount of memory,
non-reliable region will be arranged into ZONE_MOVABLE.

Earlier discussion is at:
 https://lkml.org/lkml/2015/10/9/24

For example, suppose 2-nodes system with the following
 memory range:
  node 0 [mem 0x1000-0x00109fff]
  node 1 [mem 0x0010a000-0x00209fff]

and the following ranges are marked as reliable (*):
  [0x-0x0001]
  [0x0001-0x00018000]
  [0x0010a000-0x00112000]

If you specify kernelcore=reliable, Movable zones are
arranged like the following:
  Movable zone start for each node
Node 0: 0x00018000
Node 1: 0x00112000

(*) I specified the following instead of using UEFI BIOS
complied with UEFI spec 2.5,
efi_fake_mem=4G@0:0x1,2G@0x10a000:0x1,2G@4G:0x1
efi_fake_mem is found at:
 git://git.kernel.org/pub/scm/linux/kernel/git/mfleming/efi.git
 tags/efi-next

Signed-off-by: Taku Izumi 
---
 Documentation/kernel-parameters.txt |  9 -
 mm/page_alloc.c | 26 ++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index cd5312f..b2c8c13 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1663,7 +1663,8 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
 
keepinitrd  [HW,ARM]
 
-   kernelcore=nn[KMG]  [KNL,X86,IA-64,PPC] This parameter
+   kernelcore= Format: nn[KMG] | "reliable"
+   [KNL,X86,IA-64,PPC] This parameter
specifies the amount of memory usable by the kernel
for non-movable allocations.  The requested amount is
spread evenly throughout all nodes in the system. The
@@ -1679,6 +1680,12 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
use the HighMem zone if it exists, and the Normal
zone if it does not.
 
+   Instead of specifying the amount of memory (nn[KMG]),
+   you can specify "reliable" option. In case "reliable"
+   option is specified, reliable memory is used for
+   non-movable allocations and remaining memory is used
+   for Movable pages.
+
kgdbdbgp=   [KGDB,HW] kgdb over EHCI usb debug port.
Format: <Controller#>[,poll interval]
The controller # is the number of the ehci usb debug
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index beda417..d0b3ac9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -221,6 +221,7 @@ static unsigned long __meminitdata 
arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool reliable_kernelcore __initdata;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -5618,6 +5619,25 @@ static void __init find_zone_movable_pfns_for_nodes(void)
}
 
/*
+* If kernelcore=reliable is specified, ignore movablecore option
+*/
+   if (reliable_kernelcore) {
+   for_each_memblock(memory, r) {
+   if (memblock_is_mirror(r))
+   continue;
+
+   nid = r->nid;
+
+   usable_startpfn = PFN_DOWN(r->base);
+   zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+   min(usable_startpfn, zone_movable_pfn[nid]) :
+   usable_startpfn;
+   }
+
+   goto out2;
+   }
+
+   /*
 * If movablecore=nn[KMG] was specified, calculate what size of
 * kernelcore that corresponds so that memory usable for
 * any allocation type is evenly spread. If both kernelcore
@@ -5873,6 +5893,12 @@ static int __init cmdline_parse_core(char *p, unsigned 
long *core)
  */
 stati

[PATCH] mm: Introduce kernelcore=reliable option

2015-10-14 Thread Taku Izumi
Xeon E7 v3 based systems support Address Range Mirroring,
and a UEFI BIOS compliant with UEFI spec 2.5 can report which
ranges are reliable (mirrored) via the EFI memory map.
The Linux kernel now uses this information and allocates
boot-time memory from the reliable region.

My requirement is:
  - allocate kernel memory from reliable region
  - allocate user memory from non-reliable region

In order to meet my requirement, ZONE_MOVABLE is useful.
By arranging non-reliable range into ZONE_MOVABLE,
reliable memory is only used for kernel allocations.

This patch extends existing "kernelcore" option and
introduces kernelcore=reliable option. By specifying
"reliable" instead of specifying the amount of memory,
non-reliable region will be arranged into ZONE_MOVABLE.

Earlier discussion is at:
 https://lkml.org/lkml/2015/10/9/24

For example, suppose 2-nodes system with the following
 memory range:
  node 0 [mem 0x1000-0x00109fff]
  node 1 [mem 0x0010a000-0x00209fff]

and the following ranges are marked as reliable (*):
  [0x-0x0001]
  [0x0001-0x00018000]
  [0x0010a000-0x00112000]

If you specify kernelcore=reliable, Movable zones are
arranged like the following:
  Movable zone start for each node
Node 0: 0x00018000
Node 1: 0x00112000

(*) I specified the following instead of using UEFI BIOS
complied with UEFI spec 2.5,
efi_fake_mem=4G@0:0x1,2G@0x10a000:0x1,2G@4G:0x1
efi_fake_mem is found at:
 git://git.kernel.org/pub/scm/linux/kernel/git/mfleming/efi.git
 tags/efi-next

Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
---
 Documentation/kernel-parameters.txt |  9 -
 mm/page_alloc.c | 26 ++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index cd5312f..b2c8c13 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1663,7 +1663,8 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
 
keepinitrd  [HW,ARM]
 
-   kernelcore=nn[KMG]  [KNL,X86,IA-64,PPC] This parameter
+   kernelcore= Format: nn[KMG] | "reliable"
+   [KNL,X86,IA-64,PPC] This parameter
specifies the amount of memory usable by the kernel
for non-movable allocations.  The requested amount is
spread evenly throughout all nodes in the system. The
@@ -1679,6 +1680,12 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
use the HighMem zone if it exists, and the Normal
zone if it does not.
 
+   Instead of specifying the amount of memory (nn[KMG]),
+   you can specify "reliable" option. In case "reliable"
+   option is specified, reliable memory is used for
+   non-movable allocations and remaining memory is used
+   for Movable pages.
+
kgdbdbgp=   [KGDB,HW] kgdb over EHCI usb debug port.
Format: <Controller#>[,poll interval]
The controller # is the number of the ehci usb debug
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index beda417..d0b3ac9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -221,6 +221,7 @@ static unsigned long __meminitdata 
arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool reliable_kernelcore __initdata;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -5618,6 +5619,25 @@ static void __init find_zone_movable_pfns_for_nodes(void)
}
 
/*
+* If kernelcore=reliable is specified, ignore movablecore option
+*/
+   if (reliable_kernelcore) {
+   for_each_memblock(memory, r) {
+   if (memblock_is_mirror(r))
+   continue;
+
+   nid = r->nid;
+
+   usable_startpfn = PFN_DOWN(r->base);
+   zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+   min(usable_startpfn, zone_movable_pfn[nid]) :
+   usable_startpfn;
+   }
+
+   goto out2;
+   }
+
+   /*
 * If movablecore=nn[KMG] was specified, calculate what size of
 * kernelcore that corresponds so that memory usable for
 * any allocation type is evenly spread. If both kernelcore
@@ -5873,6 +5893,12 @@ static int __

[PATCH][RFC] mm: Introduce kernelcore=reliable option

2015-10-08 Thread Taku Izumi
Xeon E7 v3 based systems support Address Range Mirroring,
and a UEFI BIOS compliant with UEFI spec 2.5 can report which
ranges are reliable (mirrored) via the EFI memory map.
The Linux kernel now uses this information and allocates
boot-time memory from the reliable region.

My requirement is:
  - allocate kernel memory from reliable region
  - allocate user memory from non-reliable region

In order to meet my requirement, ZONE_MOVABLE is useful.
By arranging non-reliable range into ZONE_MOVABLE,
reliable memory is only used for kernel allocations.

This patch extends existing "kernelcore" option and
introduces kernelcore=reliable option. By specifying
"reliable" instead of specifying the amount of memory,
non-reliable region will be arranged into ZONE_MOVABLE.

Signed-off-by: Taku Izumi 
---
 Documentation/kernel-parameters.txt |  9 -
 mm/page_alloc.c | 26 ++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 50fc09b..6791cbb 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1669,7 +1669,8 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
 
keepinitrd  [HW,ARM]
 
-   kernelcore=nn[KMG]  [KNL,X86,IA-64,PPC] This parameter
+   kernelcore= Format: nn[KMG] | "reliable"
+   [KNL,X86,IA-64,PPC] This parameter
specifies the amount of memory usable by the kernel
for non-movable allocations.  The requested amount is
spread evenly throughout all nodes in the system. The
@@ -1685,6 +1686,12 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
use the HighMem zone if it exists, and the Normal
zone if it does not.
 
+   Instead of specifying the amount of memory (nn[KMG]),
+   you can specify "reliable" option. In case "reliable"
+   option is specified, reliable memory is used for
+   non-movable allocations and remaining memory is used
+   for Movable pages.
+
kgdbdbgp=   [KGDB,HW] kgdb over EHCI usb debug port.
Format: <Controller#>[,poll interval]
The controller # is the number of the ehci usb debug
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 48aaf7b..91d7556 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -242,6 +242,7 @@ static unsigned long __meminitdata 
arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool reliable_kernelcore __initdata;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -5652,6 +5653,25 @@ static void __init find_zone_movable_pfns_for_nodes(void)
}
 
/*
+* If kernelcore=reliable is specified, ignore movablecore option
+*/
+   if (reliable_kernelcore) {
+   for_each_memblock(memory, r) {
+   if (memblock_is_mirror(r))
+   continue;
+
+   nid = r->nid;
+
+   usable_startpfn = PFN_DOWN(r->base);
+   zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+   min(usable_startpfn, zone_movable_pfn[nid]) :
+   usable_startpfn;
+   }
+
+   goto out2;
+   }
+
+   /*
 * If movablecore=nn[KMG] was specified, calculate what size of
 * kernelcore that corresponds so that memory usable for
 * any allocation type is evenly spread. If both kernelcore
@@ -5907,6 +5927,12 @@ static int __init cmdline_parse_core(char *p, unsigned 
long *core)
  */
 static int __init cmdline_parse_kernelcore(char *p)
 {
+   /* parse kernelcore=reliable */
+   if (parse_option_str(p, "reliable")) {
+   reliable_kernelcore = true;
+   return 0;
+   }
+
return cmdline_parse_core(p, &required_kernelcore);
 }
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH][RFC] mm: Introduce kernelcore=reliable option

2015-10-08 Thread Taku Izumi
Xeon E7 v3 based systems support Address Range Mirroring,
and a UEFI BIOS compliant with UEFI spec 2.5 can report which
ranges are reliable (mirrored) via the EFI memory map.
The Linux kernel now uses this information and allocates
boot-time memory from the reliable region.

My requirement is:
  - allocate kernel memory from reliable region
  - allocate user memory from non-reliable region

In order to meet my requirement, ZONE_MOVABLE is useful.
By arranging non-reliable range into ZONE_MOVABLE,
reliable memory is only used for kernel allocations.

This patch extends existing "kernelcore" option and
introduces kernelcore=reliable option. By specifying
"reliable" instead of specifying the amount of memory,
non-reliable region will be arranged into ZONE_MOVABLE.

Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
---
 Documentation/kernel-parameters.txt |  9 -
 mm/page_alloc.c | 26 ++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 50fc09b..6791cbb 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1669,7 +1669,8 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
 
keepinitrd  [HW,ARM]
 
-   kernelcore=nn[KMG]  [KNL,X86,IA-64,PPC] This parameter
+   kernelcore= Format: nn[KMG] | "reliable"
+   [KNL,X86,IA-64,PPC] This parameter
specifies the amount of memory usable by the kernel
for non-movable allocations.  The requested amount is
spread evenly throughout all nodes in the system. The
@@ -1685,6 +1686,12 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
use the HighMem zone if it exists, and the Normal
zone if it does not.
 
+   Instead of specifying the amount of memory (nn[KMG]),
+   you can specify "reliable" option. In case "reliable"
+   option is specified, reliable memory is used for
+   non-movable allocations and remaining memory is used
+   for Movable pages.
+
kgdbdbgp=   [KGDB,HW] kgdb over EHCI usb debug port.
Format: <Controller#>[,poll interval]
The controller # is the number of the ehci usb debug
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 48aaf7b..91d7556 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -242,6 +242,7 @@ static unsigned long __meminitdata 
arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool reliable_kernelcore __initdata;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -5652,6 +5653,25 @@ static void __init find_zone_movable_pfns_for_nodes(void)
}
 
/*
+* If kernelcore=reliable is specified, ignore movablecore option
+*/
+   if (reliable_kernelcore) {
+   for_each_memblock(memory, r) {
+   if (memblock_is_mirror(r))
+   continue;
+
+   nid = r->nid;
+
+   usable_startpfn = PFN_DOWN(r->base);
+   zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+   min(usable_startpfn, zone_movable_pfn[nid]) :
+   usable_startpfn;
+   }
+
+   goto out2;
+   }
+
+   /*
 * If movablecore=nn[KMG] was specified, calculate what size of
 * kernelcore that corresponds so that memory usable for
 * any allocation type is evenly spread. If both kernelcore
@@ -5907,6 +5927,12 @@ static int __init cmdline_parse_core(char *p, unsigned 
long *core)
  */
 static int __init cmdline_parse_kernelcore(char *p)
 {
+   /* parse kernelcore=reliable */
+   if (parse_option_str(p, "reliable")) {
+   reliable_kernelcore = true;
+   return 0;
+   }
+
return cmdline_parse_core(p, _kernelcore);
 }
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[tip:perf/core] perf/x86/intel/uncore: Fix multi-segment problem of perf_event_intel_uncore

2015-10-06 Thread tip-bot for Taku Izumi
Commit-ID:  712df65ccb63da08a484bf57c40b250dfd4103a7
Gitweb: http://git.kernel.org/tip/712df65ccb63da08a484bf57c40b250dfd4103a7
Author: Taku Izumi 
AuthorDate: Thu, 24 Sep 2015 21:10:21 +0900
Committer:  Ingo Molnar 
CommitDate: Tue, 6 Oct 2015 17:31:51 +0200

perf/x86/intel/uncore: Fix multi-segment problem of perf_event_intel_uncore

In a multi-segment system, uncore devices may belong to buses whose segment
number is other than 0:

  
  0000:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...

In that case, the relation between bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take the PCI segment into account.
For example, buses 0000:ff and 0001:ff use the same entry of the
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing the segment-aware pci2phy_map 
instead.

Signed-off-by: Taku Izumi 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Arnaldo Carvalho de Melo 
Cc: Jiri Olsa 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: a...@kernel.org
Cc: h...@zytor.com
Link: 
http://lkml.kernel.org/r/1443096621-4119-1-git-send-email-izumi.t...@jp.fujitsu.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 61 --
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  | 12 -
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 16 --
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 32 +---
 4 files changed, 106 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 560e525..61215a6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,59 @@ static struct event_constraint uncore_constraint_fixed =
 struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
 
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+   struct pci2phy_map *map;
+   int phys_id = -1;
+
+   raw_spin_lock(_map_lock);
+   list_for_each_entry(map, _map_head, list) {
+   if (map->segment == pci_domain_nr(bus)) {
+   phys_id = map->pbus_to_physid[bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(_map_lock);
+
+   return phys_id;
+}
+
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+   struct pci2phy_map *map, *alloc = NULL;
+   int i;
+
+   lockdep_assert_held(_map_lock);
+
+lookup:
+   list_for_each_entry(map, _map_head, list) {
+   if (map->segment == segment)
+   goto end;
+   }
+
+   if (!alloc) {
+   raw_spin_unlock(_map_lock);
+   alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+   raw_spin_lock(_map_lock);
+
+   if (!alloc)
+   return NULL;
+
+   goto lookup;
+   }
+
+   map = alloc;
+   alloc = NULL;
+   map->segment = segment;
+   for (i = 0; i < 256; i++)
+   map->pbus_to_physid[i] = -1;
+   list_add_tail(>list, _map_head);
+
+end:
+   kfree(alloc);
+   return map;
+}
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf)
 {
@@ -809,7 +863,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
int phys_id;
bool first_box = false;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,9 +910,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
-   int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   int i, cpu, phys_id;
bool last_box = false;
 
+   phys_id = uncore_pcib

[tip:perf/core] perf/x86/intel/uncore: Fix multi-segment problem of perf_event_intel_uncore

2015-10-06 Thread tip-bot for Taku Izumi
Commit-ID:  712df65ccb63da08a484bf57c40b250dfd4103a7
Gitweb: http://git.kernel.org/tip/712df65ccb63da08a484bf57c40b250dfd4103a7
Author: Taku Izumi <izumi.t...@jp.fujitsu.com>
AuthorDate: Thu, 24 Sep 2015 21:10:21 +0900
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Tue, 6 Oct 2015 17:31:51 +0200

perf/x86/intel/uncore: Fix multi-segment problem of perf_event_intel_uncore

In a multi-segment system, uncore devices may belong to buses whose segment
number is other than 0:

  
  0000:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...

In that case, the relation between bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take the PCI segment into account.
For example, buses 0000:ff and 0001:ff use the same entry of the
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing the segment-aware pci2phy_map 
instead.

Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Cc: Arnaldo Carvalho de Melo <a...@redhat.com>
Cc: Jiri Olsa <jo...@redhat.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: a...@kernel.org
Cc: h...@zytor.com
Link: 
http://lkml.kernel.org/r/1443096621-4119-1-git-send-email-izumi.t...@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 61 --
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  | 12 -
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 16 --
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 32 +---
 4 files changed, 106 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 560e525..61215a6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,59 @@ static struct event_constraint uncore_constraint_fixed =
 struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
 
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+   struct pci2phy_map *map;
+   int phys_id = -1;
+
+   raw_spin_lock(_map_lock);
+   list_for_each_entry(map, _map_head, list) {
+   if (map->segment == pci_domain_nr(bus)) {
+   phys_id = map->pbus_to_physid[bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(_map_lock);
+
+   return phys_id;
+}
+
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+   struct pci2phy_map *map, *alloc = NULL;
+   int i;
+
+   lockdep_assert_held(_map_lock);
+
+lookup:
+   list_for_each_entry(map, _map_head, list) {
+   if (map->segment == segment)
+   goto end;
+   }
+
+   if (!alloc) {
+   raw_spin_unlock(_map_lock);
+   alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+   raw_spin_lock(_map_lock);
+
+   if (!alloc)
+   return NULL;
+
+   goto lookup;
+   }
+
+   map = alloc;
+   alloc = NULL;
+   map->segment = segment;
+   for (i = 0; i < 256; i++)
+   map->pbus_to_physid[i] = -1;
+   list_add_tail(>list, _map_head);
+
+end:
+   kfree(alloc);
+   return map;
+}
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf)
 {
@@ -809,7 +863,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
int phys_id;
bool first_box = false;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,9 +910,10 @@ static void uncore_pci_remove(struct pci_dev *pdev

[PATCH 2/2] x86, efi: Add "efi_fake_mem" boot option

2015-09-29 Thread Taku Izumi
This patch introduces new boot option named "efi_fake_mem".
By specifying this parameter, you can add arbitrary attribute
to specific memory range.
This is useful for debugging of Address Range Mirroring feature.

For example, if "efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000"
is specified, the original (firmware provided) EFI memmap will be
updated so that the specified memory regions have
EFI_MEMORY_MORE_RELIABLE attribute (0x10000):

 <original>
   efi: mem36: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000100000000-0x00000020a0000000) (129536MB)

 <updated>
   efi: mem36: [Conventional Memory|  |MR|  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000100000000-0x0000000180000000) (2048MB)
   efi: mem37: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000180000000-0x00000010a0000000) (61952MB)
   efi: mem38: [Conventional Memory|  |MR|  |  |  |   |WB|WT|WC|UC] 
range=[0x00000010a0000000-0x0000001120000000) (2048MB)
   efi: mem39: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000001120000000-0x00000020a0000000) (63488MB)

And you will find that the following message is output:

   efi: Memory: 4096M/131455M mirrored memory

Signed-off-by: Taku Izumi 
---
 Documentation/kernel-parameters.txt |  15 +++
 arch/x86/kernel/setup.c |   4 +-
 drivers/firmware/efi/Kconfig|  22 
 drivers/firmware/efi/Makefile   |   1 +
 drivers/firmware/efi/fake_mem.c | 238 
 include/linux/efi.h |   6 +
 6 files changed, 285 insertions(+), 1 deletion(-)
 create mode 100644 drivers/firmware/efi/fake_mem.c

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 22a4b68..50fc09b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1094,6 +1094,21 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
you are really sure that your UEFI does sane gc and
fulfills the spec otherwise your board may brick.
 
+   efi_fake_mem=   nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86]
+   Add arbitrary attribute to specific memory range by
+   updating original EFI memory map.
+   Region of memory which aa attribute is added to is
+   from ss to ss+nn.
+   If efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000
+   is specified, EFI_MEMORY_MORE_RELIABLE(0x10000)
+   attribute is added to range 0x100000000-0x180000000 and
+   0x10a0000000-0x1120000000.
+
+   Using this parameter you can do debugging of EFI memmap
+   related feature. For example, you can do debugging of
+   Address Range Mirroring feature even if your box
+   doesn't support it.
+
eisa_irq_edge=  [PARISC,HW]
See header of drivers/parisc/eisa.c.
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fdb7f2a..30b4c44 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1079,8 +1079,10 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
 
-   if (efi_enabled(EFI_BOOT))
+   if (efi_enabled(EFI_BOOT)) {
+   efi_fake_memmap();
efi_find_mirror();
+   }
 
/*
 * The EFI specification says that boot service code won't be called
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 84533e0..ac47cc4d 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -52,6 +52,28 @@ config EFI_RUNTIME_MAP
 
  See also Documentation/ABI/testing/sysfs-firmware-efi-runtime-map.
 
+config EFI_FAKE_MEMMAP
+   bool "Enable EFI fake memory map"
+   depends on EFI && X86
+   default n
+   help
+ Saying Y here will enable "efi_fake_mem" boot option.
+ By specifying this parameter, you can add arbitrary attribute
+ to specific memory range by updating original (firmware provided)
+ EFI memmap.
+ This is useful for debugging of EFI memmap related feature.
+ e.g. Address Range Mirroring feature.
+
+config EFI_MAX_FAKE_MEM
+   int "maximum allowable number of ranges in efi_fake_mem boot option"
+   depends on EFI && X86 && EFI_FAKE_MEMMAP
+   range 1 128
+   default 8
+   help
+ Maximum allowable number of ranges in efi_fake_mem boot option.
+ Ranges can be set up to this value using comma-separated list.
+ The default value is 8.
+
 config EFI_PARAMS_FROM_FDT
bool
help
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index 6fd3da9..c24

[PATCH 2/2] x86, efi: Add "efi_fake_mem" boot option

2015-09-29 Thread Taku Izumi
This patch introduces new boot option named "efi_fake_mem".
By specifying this parameter, you can add arbitrary attribute
to specific memory range.
This is useful for debugging of Address Range Mirroring feature.

For example, if "efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000"
is specified, the original (firmware provided) EFI memmap will be
updated so that the specified memory regions have
EFI_MEMORY_MORE_RELIABLE attribute (0x10000):

 <original>
   efi: mem36: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000100000000-0x00000020a0000000) (129536MB)

 <updated>
   efi: mem36: [Conventional Memory|  |MR|  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000100000000-0x0000000180000000) (2048MB)
   efi: mem37: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000180000000-0x00000010a0000000) (61952MB)
   efi: mem38: [Conventional Memory|  |MR|  |  |  |   |WB|WT|WC|UC] 
range=[0x00000010a0000000-0x0000001120000000) (2048MB)
   efi: mem39: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000001120000000-0x00000020a0000000) (63488MB)

And you will find that the following message is output:

   efi: Memory: 4096M/131455M mirrored memory

Signed-off-by: Taku Izumi 
---
 Documentation/kernel-parameters.txt |  15 +++
 arch/x86/kernel/setup.c |   4 +-
 drivers/firmware/efi/Kconfig|  22 
 drivers/firmware/efi/Makefile   |   1 +
 drivers/firmware/efi/fake_mem.c | 238 
 include/linux/efi.h |   6 +
 6 files changed, 285 insertions(+), 1 deletion(-)
 create mode 100644 drivers/firmware/efi/fake_mem.c

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 22a4b68..50fc09b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1094,6 +1094,21 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
you are really sure that your UEFI does sane gc and
fulfills the spec otherwise your board may brick.
 
+   efi_fake_mem=   nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86]
+   Add arbitrary attribute to specific memory range by
+   updating original EFI memory map.
+   Region of memory which aa attribute is added to is
+   from ss to ss+nn.
+   If efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000
+   is specified, EFI_MEMORY_MORE_RELIABLE(0x10000)
+   attribute is added to range 0x100000000-0x180000000 and
+   0x10a0000000-0x1120000000.
+
+   Using this parameter you can do debugging of EFI memmap
+   related feature. For example, you can do debugging of
+   Address Range Mirroring feature even if your box
+   doesn't support it.
+
eisa_irq_edge=  [PARISC,HW]
See header of drivers/parisc/eisa.c.
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fdb7f2a..30b4c44 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1079,8 +1079,10 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
 
-   if (efi_enabled(EFI_BOOT))
+   if (efi_enabled(EFI_BOOT)) {
+   efi_fake_memmap();
efi_find_mirror();
+   }
 
/*
 * The EFI specification says that boot service code won't be called
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 84533e0..ac47cc4d 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -52,6 +52,28 @@ config EFI_RUNTIME_MAP
 
  See also Documentation/ABI/testing/sysfs-firmware-efi-runtime-map.
 
+config EFI_FAKE_MEMMAP
+   bool "Enable EFI fake memory map"
+   depends on EFI && X86
+   default n
+   help
+ Saying Y here will enable "efi_fake_mem" boot option.
+ By specifying this parameter, you can add arbitrary attribute
+ to specific memory range by updating original (firmware provided)
+ EFI memmap.
+ This is useful for debugging of EFI memmap related feature.
+ e.g. Address Range Mirroring feature.
+
+config EFI_MAX_FAKE_MEM
+   int "maximum allowable number of ranges in efi_fake_mem boot option"
+   depends on EFI && X86 && EFI_FAKE_MEMMAP
+   range 1 128
+   default 8
+   help
+ Maximum allowable number of ranges in efi_fake_mem boot option.
+ Ranges can be set up to this value using comma-separated list.
+ The default value is 8.
+
 config EFI_PARAMS_FROM_FDT
bool
help
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index 6fd3da9..c24

[PATCH 0/2] Introduce "efi_fake_mem" boot option

2015-09-29 Thread Taku Izumi
UEFI spec 2.5 introduces a new Memory Attribute Definition named
EFI_MEMORY_MORE_RELIABLE which indicates which memory ranges are
mirrored. The Linux kernel can now recognize which memory ranges are mirrored
by handling EFI_MEMORY_MORE_RELIABLE attributes.
However, testing this feature necessitates boxes with UEFI spec 2.5 compliant
firmware.

This patchset introduces new boot option named "efi_fake_mem".
By specifying this parameter, you can add arbitrary attribute to
specific memory range. This is useful for debugging of Memory 
Address Range Mirroring feature.

This is an updated version of the former patch posted at
 http://www.mail-archive.com/linux-efi@vger.kernel.org/msg05936.html

changelog:
 - change boot option name and spec
   efi_fake_mem_mirror=nn@ss -> efi_fake_mem=nn@ss:aa
 - rename print_efi_memmap() to efi_print_memmap()
 - introduce new config named CONFIG_EFI_MAX_FAKE_MEM
 - and some fixes pointed out by Matt Fleming

Taku Izumi (2):
  x86, efi: rename print_efi_memmap() to efi_print_memmap()
  x86, efi: Add "efi_fake_mem" boot option

 Documentation/kernel-parameters.txt |  15 +++
 arch/x86/include/asm/efi.h  |   1 +
 arch/x86/kernel/setup.c |   4 +-
 arch/x86/platform/efi/efi.c |   4 +-
 drivers/firmware/efi/Kconfig|  22 
 drivers/firmware/efi/Makefile   |   1 +
 drivers/firmware/efi/fake_mem.c | 238 
 include/linux/efi.h |   6 +
 8 files changed, 288 insertions(+), 3 deletions(-)
 create mode 100644 drivers/firmware/efi/fake_mem.c

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] x86, efi: rename print_efi_memmap() to efi_print_memmap()

2015-09-29 Thread Taku Izumi
This patch renames print_efi_memmap() to efi_print_memmap() and
makes it a global function so that we can invoke it outside of
arch/x86/platform/efi/efi.c

Signed-off-by: Taku Izumi 
---
 arch/x86/include/asm/efi.h  | 1 +
 arch/x86/platform/efi/efi.c | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index ab5f1d4..f8b93d6 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -103,6 +103,7 @@ extern void __init efi_set_executable(efi_memory_desc_t 
*md, bool executable);
 extern int __init efi_memblock_x86_reserve_range(void);
 extern pgd_t * __init efi_call_phys_prolog(void);
 extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
+extern void __init efi_print_memmap(void);
 extern void __init efi_unmap_memmap(void);
 extern void __init efi_memory_uc(u64 addr, unsigned long size);
 extern void __init efi_map_region(efi_memory_desc_t *md);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 1db84c0..1f95caf 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -222,7 +222,7 @@ int __init efi_memblock_x86_reserve_range(void)
return 0;
 }
 
-static void __init print_efi_memmap(void)
+void __init efi_print_memmap(void)
 {
 #ifdef EFI_DEBUG
efi_memory_desc_t *md;
@@ -524,7 +524,7 @@ void __init efi_init(void)
return;
 
if (efi_enabled(EFI_DBG))
-   print_efi_memmap();
+   efi_print_memmap();
 
efi_esrt_init();
 }
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] x86, efi: Add "efi_fake_mem" boot option

2015-09-29 Thread Taku Izumi
This patch introduces new boot option named "efi_fake_mem".
By specifying this parameter, you can add arbitrary attribute
to specific memory range.
This is useful for debugging of Address Range Mirroring feature.

For example, if "efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000"
is specified, the original (firmware provided) EFI memmap will be
updated so that the specified memory regions have
EFI_MEMORY_MORE_RELIABLE attribute (0x10000):

 <original>
   efi: mem36: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000100000000-0x00000020a0000000) (129536MB)

 <updated>
   efi: mem36: [Conventional Memory|  |MR|  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000100000000-0x0000000180000000) (2048MB)
   efi: mem37: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000180000000-0x00000010a0000000) (61952MB)
   efi: mem38: [Conventional Memory|  |MR|  |  |  |   |WB|WT|WC|UC] 
range=[0x00000010a0000000-0x0000001120000000) (2048MB)
   efi: mem39: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000001120000000-0x00000020a0000000) (63488MB)

And you will find that the following message is output:

   efi: Memory: 4096M/131455M mirrored memory

Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
---
 Documentation/kernel-parameters.txt |  15 +++
 arch/x86/kernel/setup.c |   4 +-
 drivers/firmware/efi/Kconfig|  22 
 drivers/firmware/efi/Makefile   |   1 +
 drivers/firmware/efi/fake_mem.c | 238 
 include/linux/efi.h |   6 +
 6 files changed, 285 insertions(+), 1 deletion(-)
 create mode 100644 drivers/firmware/efi/fake_mem.c

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 22a4b68..50fc09b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1094,6 +1094,21 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
you are really sure that your UEFI does sane gc and
fulfills the spec otherwise your board may brick.
 
+   efi_fake_mem=   nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86]
+   Add arbitrary attribute to specific memory range by
+   updating original EFI memory map.
+   Region of memory which aa attribute is added to is
+   from ss to ss+nn.
+   If efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000
+   is specified, EFI_MEMORY_MORE_RELIABLE(0x10000)
+   attribute is added to range 0x100000000-0x180000000 and
+   0x10a0000000-0x1120000000.
+
+   Using this parameter you can do debugging of EFI memmap
+   related feature. For example, you can do debugging of
+   Address Range Mirroring feature even if your box
+   doesn't support it.
+
eisa_irq_edge=  [PARISC,HW]
See header of drivers/parisc/eisa.c.
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fdb7f2a..30b4c44 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1079,8 +1079,10 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
 
-   if (efi_enabled(EFI_BOOT))
+   if (efi_enabled(EFI_BOOT)) {
+   efi_fake_memmap();
efi_find_mirror();
+   }
 
/*
 * The EFI specification says that boot service code won't be called
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 84533e0..ac47cc4d 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -52,6 +52,28 @@ config EFI_RUNTIME_MAP
 
  See also Documentation/ABI/testing/sysfs-firmware-efi-runtime-map.
 
+config EFI_FAKE_MEMMAP
+   bool "Enable EFI fake memory map"
+   depends on EFI && X86
+   default n
+   help
+ Saying Y here will enable "efi_fake_mem" boot option.
+ By specifying this parameter, you can add arbitrary attribute
+ to specific memory range by updating original (firmware provided)
+ EFI memmap.
+ This is useful for debugging of EFI memmap related feature.
+ e.g. Address Range Mirroring feature.
+
+config EFI_MAX_FAKE_MEM
+   int "maximum allowable number of ranges in efi_fake_mem boot option"
+   depends on EFI && X86 && EFI_FAKE_MEMMAP
+   range 1 128
+   default 8
+   help
+ Maximum allowable number of ranges in efi_fake_mem boot option.
+ Ranges can be set up to this value using comma-separated list.
+ The default value is 8.
+
 config EFI_PARAMS_FROM_FDT
bool
help
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/

[PATCH 0/2] Introduce "efi_fake_mem" boot option

2015-09-29 Thread Taku Izumi
UEFI spec 2.5 introduces a new Memory Attribute Definition named
EFI_MEMORY_MORE_RELIABLE which indicates which memory ranges are
mirrored. The Linux kernel can now recognize which memory ranges are mirrored
by handling EFI_MEMORY_MORE_RELIABLE attributes.
However, testing this feature necessitates boxes with UEFI spec 2.5 compliant
firmware.

This patchset introduces new boot option named "efi_fake_mem".
By specifying this parameter, you can add arbitrary attribute to
specific memory range. This is useful for debugging of Memory 
Address Range Mirroring feature.

This is an updated version of the former patch posted at
 http://www.mail-archive.com/linux-efi@vger.kernel.org/msg05936.html

changelog:
 - change boot option name and spec
   efi_fake_mem_mirror=nn@ss -> efi_fake_mem=nn@ss:aa
 - rename print_efi_memmap() to efi_print_memmap()
 - introduce new config named CONFIG_EFI_MAX_FAKE_MEM
 - and some fixes pointed out by Matt Fleming

Taku Izumi (2):
  x86, efi: rename print_efi_memmap() to efi_print_memmap()
  x86, efi: Add "efi_fake_mem" boot option

 Documentation/kernel-parameters.txt |  15 +++
 arch/x86/include/asm/efi.h  |   1 +
 arch/x86/kernel/setup.c |   4 +-
 arch/x86/platform/efi/efi.c |   4 +-
 drivers/firmware/efi/Kconfig|  22 
 drivers/firmware/efi/Makefile   |   1 +
 drivers/firmware/efi/fake_mem.c | 238 
 include/linux/efi.h |   6 +
 8 files changed, 288 insertions(+), 3 deletions(-)
 create mode 100644 drivers/firmware/efi/fake_mem.c

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] x86, efi: Add "efi_fake_mem" boot option

2015-09-29 Thread Taku Izumi
This patch introduces new boot option named "efi_fake_mem".
By specifying this parameter, you can add arbitrary attribute
to specific memory range.
This is useful for debugging of Address Range Mirroring feature.

For example, if "efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000"
is specified, the original (firmware provided) EFI memmap will be
updated so that the specified memory regions have
EFI_MEMORY_MORE_RELIABLE attribute (0x10000):

 <original>
   efi: mem36: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000100000000-0x00000020a0000000) (129536MB)

 <updated>
   efi: mem36: [Conventional Memory|  |MR|  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000100000000-0x0000000180000000) (2048MB)
   efi: mem37: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000180000000-0x00000010a0000000) (61952MB)
   efi: mem38: [Conventional Memory|  |MR|  |  |  |   |WB|WT|WC|UC] 
range=[0x00000010a0000000-0x0000001120000000) (2048MB)
   efi: mem39: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000001120000000-0x00000020a0000000) (63488MB)

And you will find that the following message is output:

   efi: Memory: 4096M/131455M mirrored memory

Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
---
 Documentation/kernel-parameters.txt |  15 +++
 arch/x86/kernel/setup.c |   4 +-
 drivers/firmware/efi/Kconfig|  22 
 drivers/firmware/efi/Makefile   |   1 +
 drivers/firmware/efi/fake_mem.c | 238 
 include/linux/efi.h |   6 +
 6 files changed, 285 insertions(+), 1 deletion(-)
 create mode 100644 drivers/firmware/efi/fake_mem.c

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 22a4b68..50fc09b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1094,6 +1094,21 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
you are really sure that your UEFI does sane gc and
fulfills the spec otherwise your board may brick.
 
+   efi_fake_mem=   nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86]
+   Add arbitrary attribute to specific memory range by
+   updating original EFI memory map.
+   Region of memory which aa attribute is added to is
+   from ss to ss+nn.
+   If efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000
+   is specified, EFI_MEMORY_MORE_RELIABLE(0x10000)
+   attribute is added to range 0x100000000-0x180000000 and
+   0x10a0000000-0x1120000000.
+
+   Using this parameter you can do debugging of EFI memmap
+   related feature. For example, you can do debugging of
+   Address Range Mirroring feature even if your box
+   doesn't support it.
+
eisa_irq_edge=  [PARISC,HW]
See header of drivers/parisc/eisa.c.
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fdb7f2a..30b4c44 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1079,8 +1079,10 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
 
-   if (efi_enabled(EFI_BOOT))
+   if (efi_enabled(EFI_BOOT)) {
+   efi_fake_memmap();
efi_find_mirror();
+   }
 
/*
 * The EFI specification says that boot service code won't be called
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 84533e0..ac47cc4d 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -52,6 +52,28 @@ config EFI_RUNTIME_MAP
 
  See also Documentation/ABI/testing/sysfs-firmware-efi-runtime-map.
 
+config EFI_FAKE_MEMMAP
+   bool "Enable EFI fake memory map"
+   depends on EFI && X86
+   default n
+   help
+ Saying Y here will enable "efi_fake_mem" boot option.
+ By specifying this parameter, you can add arbitrary attribute
+ to specific memory range by updating original (firmware provided)
+ EFI memmap.
+ This is useful for debugging of EFI memmap related feature.
+ e.g. Address Range Mirroring feature.
+
+config EFI_MAX_FAKE_MEM
+   int "maximum allowable number of ranges in efi_fake_mem boot option"
+   depends on EFI && X86 && EFI_FAKE_MEMMAP
+   range 1 128
+   default 8
+   help
+ Maximum allowable number of ranges in efi_fake_mem boot option.
+ Ranges can be set up to this value using comma-separated list.
+ The default value is 8.
+
 config EFI_PARAMS_FROM_FDT
bool
help
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/

[PATCH 1/2] x86, efi: rename print_efi_memmap() to efi_print_memmap()

2015-09-29 Thread Taku Izumi
This patch renames print_efi_memmap() to efi_print_memmap() and
make it global function so that we can invoke it outside of
arch/x86/platform/efi/efi.c

Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
---
 arch/x86/include/asm/efi.h  | 1 +
 arch/x86/platform/efi/efi.c | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index ab5f1d4..f8b93d6 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -103,6 +103,7 @@ extern void __init efi_set_executable(efi_memory_desc_t 
*md, bool executable);
 extern int __init efi_memblock_x86_reserve_range(void);
 extern pgd_t * __init efi_call_phys_prolog(void);
 extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
+extern void __init efi_print_memmap(void);
 extern void __init efi_unmap_memmap(void);
 extern void __init efi_memory_uc(u64 addr, unsigned long size);
 extern void __init efi_map_region(efi_memory_desc_t *md);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 1db84c0..1f95caf 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -222,7 +222,7 @@ int __init efi_memblock_x86_reserve_range(void)
return 0;
 }
 
-static void __init print_efi_memmap(void)
+void __init efi_print_memmap(void)
 {
 #ifdef EFI_DEBUG
efi_memory_desc_t *md;
@@ -524,7 +524,7 @@ void __init efi_init(void)
return;
 
if (efi_enabled(EFI_DBG))
-   print_efi_memmap();
+   efi_print_memmap();
 
efi_esrt_init();
 }
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v5][RESEND] perf, x86: Fix multi-segment problem of perf_event_intel_uncore

2015-09-23 Thread Taku Izumi
In multi-segment system, uncore devices may belong to buses whose segment
number is other than 0.

  
  0000:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...

In that case, relation of bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take account of PCI segment.
For example, bus 0000:ff and 0001:ff use the same entry of
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing segment-aware pci2phy_map instead.

 v4 -> v5:
   - Add initialization code of pci2phy_map when newly allocated at 
__find_pci2phy_map()

 v3 -> v4:
   - avoid GFP_ATOMIC allocation at __find_pci2phy_map()
   - Add missing pci_dev_put at snb_pci2phy_map_init()
   - Add missing raw_spin_unlock at snbep_pci2phy_map_init()

Signed-off-by: Taku Izumi 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 61 --
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  | 12 -
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 16 --
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 32 +---
 4 files changed, 106 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 560e525..61215a6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,59 @@ static struct event_constraint uncore_constraint_fixed =
 struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
 
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+   struct pci2phy_map *map;
+   int phys_id = -1;
+
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(bus)) {
+   phys_id = map->pbus_to_physid[bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
+
+   return phys_id;
+}
+
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+   struct pci2phy_map *map, *alloc = NULL;
+   int i;
+
+   lockdep_assert_held(&pci2phy_map_lock);
+
+lookup:
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == segment)
+   goto end;
+   }
+
+   if (!alloc) {
+   raw_spin_unlock(&pci2phy_map_lock);
+   alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+   raw_spin_lock(&pci2phy_map_lock);
+
+   if (!alloc)
+   return NULL;
+
+   goto lookup;
+   }
+
+   map = alloc;
+   alloc = NULL;
+   map->segment = segment;
+   for (i = 0; i < 256; i++)
+   map->pbus_to_physid[i] = -1;
+   list_add_tail(&map->list, &pci2phy_map_head);
+
+end:
+   kfree(alloc);
+   return map;
+}
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf)
 {
@@ -809,7 +863,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
int phys_id;
bool first_box = false;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,9 +910,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
-   int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   int i, cpu, phys_id;
bool last_box = false;
 
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 72c54c2..2f0a4a9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event

[PATCH v5][RESEND] perf, x86: Fix multi-segment problem of perf_event_intel_uncore

2015-09-23 Thread Taku Izumi
In multi-segment system, uncore devices may belong to buses whose segment
number is other than 0.

  
  0000:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...

In that case, relation of bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take account of PCI segment.
For example, bus 0000:ff and 0001:ff use the same entry of
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing segment-aware pci2phy_map instead.

 v4 -> v5:
   - Add initialization code of pci2phy_map when newly allocated at 
__find_pci2phy_map()

 v3 -> v4:
   - avoid GFP_ATOMIC allocation at __find_pci2phy_map()
   - Add missing pci_dev_put at snb_pci2phy_map_init()
   - Add missing raw_spin_unlock at snbep_pci2phy_map_init()

Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 61 --
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  | 12 -
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 16 --
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 32 +---
 4 files changed, 106 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 560e525..61215a6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,59 @@ static struct event_constraint uncore_constraint_fixed =
 struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
 
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+   struct pci2phy_map *map;
+   int phys_id = -1;
+
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(bus)) {
+   phys_id = map->pbus_to_physid[bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
+
+   return phys_id;
+}
+
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+   struct pci2phy_map *map, *alloc = NULL;
+   int i;
+
+   lockdep_assert_held(&pci2phy_map_lock);
+
+lookup:
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == segment)
+   goto end;
+   }
+
+   if (!alloc) {
+   raw_spin_unlock(&pci2phy_map_lock);
+   alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+   raw_spin_lock(&pci2phy_map_lock);
+
+   if (!alloc)
+   return NULL;
+
+   goto lookup;
+   }
+
+   map = alloc;
+   alloc = NULL;
+   map->segment = segment;
+   for (i = 0; i < 256; i++)
+   map->pbus_to_physid[i] = -1;
+   list_add_tail(&map->list, &pci2phy_map_head);
+
+end:
+   kfree(alloc);
+   return map;
+}
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf)
 {
@@ -809,7 +863,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
int phys_id;
bool first_box = false;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,9 +910,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
-   int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   int i, cpu, phys_id;
bool last_box = false;
 
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 72c54c2..2f0a4a9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+

[PATCH v4] perf, x86: Fix multi-segment problem of perf_event_intel_uncore

2015-09-16 Thread Taku Izumi
In multi-segment system, uncore devices may belong to buses whose segment
number is other than 0.

  
  0000:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...

In that case, relation of bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take account of PCI segment.
For example, bus 0000:ff and 0001:ff use the same entry of
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing segment-aware pci2phy_map instead.

 v3 -> v4:
   - avoid GFP_ATOMIC allocation at __find_pci2phy_map()
   - Add missing pci_dev_put at snb_pci2phy_map_init()
   - Add missing raw_spin_unlock at snbep_pci2phy_map_init()

Signed-off-by: Taku Izumi 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 58 --
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  | 12 -
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 16 --
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 32 +---
 4 files changed, 103 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 560e525..3fba445 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,56 @@ static struct event_constraint uncore_constraint_fixed =
 struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
 
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+   struct pci2phy_map *map;
+   int phys_id = -1;
+
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(bus)) {
+   phys_id = map->pbus_to_physid[bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
+
+   return phys_id;
+}
+
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+   struct pci2phy_map *map, *alloc = NULL;
+
+   lockdep_assert_held(&pci2phy_map_lock);
+
+lookup:
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == segment)
+   goto end;
+   }
+
+   if (!alloc) {
+   raw_spin_unlock(&pci2phy_map_lock);
+   alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+   raw_spin_lock(&pci2phy_map_lock);
+
+   if (!alloc)
+   return NULL;
+
+   goto lookup;
+   }
+
+   map = alloc;
+   alloc = NULL;
+   map->segment = segment;
+   list_add_tail(&map->list, &pci2phy_map_head);
+
+end:
+   kfree(alloc);
+   return map;
+}
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf)
 {
@@ -809,7 +860,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
int phys_id;
bool first_box = false;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,9 +907,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
-   int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   int i, cpu, phys_id;
bool last_box = false;
 
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 72c54c2..2f0a4a9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -117,6 +117,15 @@ struct uncore_event_desc {
const char *config;
 };
 
+struct pci2phy_map {
+   struct list_head list;
+   int segment;
+   int pbus_to_physid[2

[PATCH v4] perf, x86: Fix multi-segment problem of perf_event_intel_uncore

2015-09-16 Thread Taku Izumi
In multi-segment system, uncore devices may belong to buses whose segment
number is other than 0.

  
  0000:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...

In that case, relation of bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take account of PCI segment.
For example, bus 0000:ff and 0001:ff use the same entry of
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing segment-aware pci2phy_map instead.

 v3 -> v4:
   - avoid GFP_ATOMIC allocation at __find_pci2phy_map()
   - Add missing pci_dev_put at snb_pci2phy_map_init()
   - Add missing raw_spin_unlock at snbep_pci2phy_map_init()

Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 58 --
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  | 12 -
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 16 --
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 32 +---
 4 files changed, 103 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 560e525..3fba445 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,56 @@ static struct event_constraint uncore_constraint_fixed =
 struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
 
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+   struct pci2phy_map *map;
+   int phys_id = -1;
+
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(bus)) {
+   phys_id = map->pbus_to_physid[bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
+
+   return phys_id;
+}
+
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+   struct pci2phy_map *map, *alloc = NULL;
+
+   lockdep_assert_held(&pci2phy_map_lock);
+
+lookup:
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == segment)
+   goto end;
+   }
+
+   if (!alloc) {
+   raw_spin_unlock(&pci2phy_map_lock);
+   alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+   raw_spin_lock(&pci2phy_map_lock);
+
+   if (!alloc)
+   return NULL;
+
+   goto lookup;
+   }
+
+   map = alloc;
+   alloc = NULL;
+   map->segment = segment;
+   list_add_tail(&map->list, &pci2phy_map_head);
+
+end:
+   kfree(alloc);
+   return map;
+}
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf)
 {
@@ -809,7 +860,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
int phys_id;
bool first_box = false;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,9 +907,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
-   int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   int i, cpu, phys_id;
bool last_box = false;
 
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 72c54c2..2f0a4a9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -117,6 +117,15 @@ struct uncore_event_desc {
const char *config;
 };
 
+struct pci2phy_map {
+   struct list_head list;
+   int segm

[PATCH v3] perf, x86: Fix multi-segment problem of perf_event_intel_uncore

2015-09-03 Thread Taku Izumi
In multi-segment system, uncore devices may belong to buses whose segment
number is other than 0.

  
  0000:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...

In that case relation of bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take account of PCI segment.
For example, bus 0000:ff and 0001:ff use the same entry of
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing segment-aware pci2phy_map instead.

 v2 -> v3:
   - fix up according to Peter's comment
   - introduce __find_pci2phy_map() to avert repetition

Signed-off-by: Taku Izumi 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 45 --
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  | 12 +-
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 13 ++-
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 31 +++
 4 files changed, 87 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 560e525..1ddac35 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,43 @@ static struct event_constraint uncore_constraint_fixed =
 struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
 
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+   struct pci2phy_map *map;
+   int phys_id = -1;
+
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(bus)) {
+   phys_id = map->pbus_to_physid[bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
+
+   return phys_id;
+}
+
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+   struct pci2phy_map *map;
+
+   lockdep_assert_held(&pci2phy_map_lock);
+
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == segment)
+   return map;
+   }
+
+   map = kmalloc(sizeof(struct pci2phy_map), GFP_ATOMIC);
+   if (map) {
+   map->segment = segment;
+   list_add_tail(&map->list, &pci2phy_map_head);
+   }
+
+   return map;
+}
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf)
 {
@@ -809,7 +847,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
int phys_id;
bool first_box = false;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,9 +894,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
-   int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   int i, cpu, phys_id;
bool last_box = false;
 
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 72c54c2..2f0a4a9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -117,6 +117,15 @@ struct uncore_event_desc {
const char *config;
 };
 
+struct pci2phy_map {
+   struct list_head list;
+   int segment;
+   int pbus_to_physid[256];
+};
+
+int uncore_pcibus_to_physid(struct pci_bus *bus);
+struct pci2phy_map *__find_pci2phy_map(int segment);
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf);
 
@@ -317,7 +326,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, 
int idx);
 exter

[PATCH v3] perf, x86: Fix multi-segment problem of perf_event_intel_uncore

2015-09-03 Thread Taku Izumi
In multi-segment system, uncore devices may belong to buses whose segment
number is other than 0.

  
  0000:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...

In that case relation of bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take account of PCI segment.
For example, bus 0000:ff and 0001:ff use the same entry of
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing segment-aware pci2phy_map instead.

 v2 -> v3:
   - fix up according to Peter's comment
   - introduce __find_pci2phy_map() to avert repetition

Signed-off-by: Taku Izumi <izumi.t...@jp.fujitsu.com>
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 45 --
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  | 12 +-
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 13 ++-
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 31 +++
 4 files changed, 87 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 560e525..1ddac35 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,43 @@ static struct event_constraint uncore_constraint_fixed =
 struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
 
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+   struct pci2phy_map *map;
+   int phys_id = -1;
+
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(bus)) {
+   phys_id = map->pbus_to_physid[bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
+
+   return phys_id;
+}
+
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+   struct pci2phy_map *map;
+
+   lockdep_assert_held(&pci2phy_map_lock);
+
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == segment)
+   return map;
+   }
+
+   map = kmalloc(sizeof(struct pci2phy_map), GFP_ATOMIC);
+   if (map) {
+   map->segment = segment;
+   list_add_tail(&map->list, &pci2phy_map_head);
+   }
+
+   return map;
+}
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf)
 {
@@ -809,7 +847,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
int phys_id;
bool first_box = false;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,9 +894,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
-   int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   int i, cpu, phys_id;
bool last_box = false;
 
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 72c54c2..2f0a4a9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -117,6 +117,15 @@ struct uncore_event_desc {
const char *config;
 };
 
+struct pci2phy_map {
+   struct list_head list;
+   int segment;
+   int pbus_to_physid[256];
+};
+
+int uncore_pcibus_to_physid(struct pci_bus *bus);
+struct pci2phy_map *__find_pci2phy_map(int segment);
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf);
 
@@ -317,7 +326,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box

[PATCH v2][RESEND] perf, x86: Fix multi-segment problem of perf_event_intel_uncore

2015-08-26 Thread Taku Izumi
In multi-segment system, uncore devices may belong to buses whose segment
number is other than 0.

  
  0000:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...

In that case relation of bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take account of PCI segment.
For example, buses 0000:ff and 0001:ff use the same entry of
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing segment-aware pci2phy_map instead.

 v1 -> v2:
   - Extract method named uncore_pcibus_to_physid to avoid repetition of
 retrieving phys_id code

Signed-off-by: Taku Izumi 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 25 --
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  | 11 -
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 23 +-
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 53 --
 4 files changed, 94 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 21b5e38..0ed6f2b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,23 @@ static struct event_constraint uncore_constraint_fixed =
 struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
 
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+   int phys_id = -1;
+   struct pci2phy_map *map;
+
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(bus)) {
+   phys_id = map->pbus_to_physid[bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
+
+   return phys_id;
+}
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf)
 {
@@ -809,7 +827,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
int phys_id;
bool first_box = false;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,9 +874,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
-   int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   int i, cpu, phys_id;
bool last_box = false;
 
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 0f77f0a..6c96ee9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -117,6 +117,14 @@ struct uncore_event_desc {
const char *config;
 };
 
+struct pci2phy_map {
+   struct list_head list;
+   int segment;
+   int pbus_to_physid[256];
+};
+
+int uncore_pcibus_to_physid(struct pci_bus *bus);
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf);
 
@@ -317,7 +325,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, 
int idx);
 extern struct intel_uncore_type **uncore_msr_uncores;
 extern struct intel_uncore_type **uncore_pci_uncores;
 extern struct pci_driver *uncore_pci_driver;
-extern int uncore_pcibus_to_physid[256];
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
 extern struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 extern struct event_constraint uncore_constraint_empty;
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index b005a78..ccbc817 1

[PATCH v2 1/3] efi: Add EFI_MEMORY_MORE_RELIABLE support to efi_md_typeattr_format()

2015-08-26 Thread Taku Izumi
UEFI spec 2.5 introduces new Memory Attribute Definition named
EFI_MEMORY_MORE_RELIABLE. This patch adds this new attribute
support to efi_md_typeattr_format().

Signed-off-by: Taku Izumi 
---
 drivers/firmware/efi/efi.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index d6144e3..8124078 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -589,12 +589,14 @@ char * __init efi_md_typeattr_format(char *buf, size_t 
size,
attr = md->attribute;
if (attr & ~(EFI_MEMORY_UC | EFI_MEMORY_WC | EFI_MEMORY_WT |
 EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_WP |
-EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_RUNTIME))
+EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_RUNTIME |
+EFI_MEMORY_MORE_RELIABLE))
snprintf(pos, size, "|attr=0x%016llx]",
 (unsigned long long)attr);
else
-   snprintf(pos, size, "|%3s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]",
+   snprintf(pos, size, "|%3s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]",
 attr & EFI_MEMORY_RUNTIME ? "RUN" : "",
+attr & EFI_MEMORY_MORE_RELIABLE ? "MR" : "",
 attr & EFI_MEMORY_XP  ? "XP"  : "",
 attr & EFI_MEMORY_RP  ? "RP"  : "",
 attr & EFI_MEMORY_WP  ? "WP"  : "",
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 0/3] Introduce "efi_fake_mem_mirror" boot option

2015-08-26 Thread Taku Izumi
UEFI spec 2.5 introduces new Memory Attribute Definition named
EFI_MEMORY_MORE_RELIABLE which indicates which memory ranges are
mirrored. Now linux kernel can recognize which memory ranges are mirrored
by handling EFI_MEMORY_MORE_RELIABLE attributes.
However testing this feature necessitates boxes with UEFI spec 2.5 compliant
firmware.

This patchset introduces new boot option named "efi_fake_mem_mirror".
By specifying this parameter, you can mark specific memory as
mirrored memory. This is useful for debugging of Memory Address Range
Mirroring feature.

v1 -> v2:
 - change abbreviation of EFI_MEMORY_MORE_RELIABLE from "RELY" to "MR"
 - add patch (2/3) for changing abbreviation of EFI_MEMORY_RUNTIME
 - migrate some code from arch/x86/platform/efi/quirks to
   drivers/firmware/efi/fake_mem.c and create config EFI_FAKE_MEMMAP

Taku Izumi (3):
  efi: Add EFI_MEMORY_MORE_RELIABLE support to efi_md_typeattr_format()
  efi: Change abbreviation of EFI_MEMORY_RUNTIME from "RUN" to "RT"
  x86, efi: Add "efi_fake_mem_mirror" boot option

 Documentation/kernel-parameters.txt |   8 ++
 arch/x86/include/asm/efi.h  |   1 +
 arch/x86/kernel/setup.c |   4 +-
 arch/x86/platform/efi/efi.c |   2 +-
 drivers/firmware/efi/Kconfig|  12 +++
 drivers/firmware/efi/Makefile   |   1 +
 drivers/firmware/efi/efi.c  |   8 +-
 drivers/firmware/efi/fake_mem.c | 204 
 include/linux/efi.h |   6 ++
 9 files changed, 241 insertions(+), 5 deletions(-)
 create mode 100644 drivers/firmware/efi/fake_mem.c

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 2/3] efi: Change abbreviation of EFI_MEMORY_RUNTIME from "RUN" to "RT"

2015-08-26 Thread Taku Izumi
Now efi_md_typeattr_format() outputs "RUN" if passed EFI memory
descriptor has EFI_MEMORY_RUNTIME attribute. But "RT" is preferred
because it is shorter and clearer.

This patch changes abbreviation of EFI_MEMORY_RUNTIME from "RUN"
to "RT".

Suggested-by: Ard Biesheuvel 
Signed-off-by: Taku Izumi 
---
 drivers/firmware/efi/efi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 8124078..25b6477 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -594,8 +594,8 @@ char * __init efi_md_typeattr_format(char *buf, size_t size,
snprintf(pos, size, "|attr=0x%016llx]",
 (unsigned long long)attr);
else
-   snprintf(pos, size, "|%3s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]",
-attr & EFI_MEMORY_RUNTIME ? "RUN" : "",
+   snprintf(pos, size, "|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]",
+attr & EFI_MEMORY_RUNTIME ? "RT" : "",
 attr & EFI_MEMORY_MORE_RELIABLE ? "MR" : "",
 attr & EFI_MEMORY_XP  ? "XP"  : "",
 attr & EFI_MEMORY_RP  ? "RP"  : "",
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 3/3] x86, efi: Add "efi_fake_mem_mirror" boot option

2015-08-26 Thread Taku Izumi
This patch introduces new boot option named "efi_fake_mem_mirror".
By specifying this parameter, you can mark specific memory as
mirrored memory. This is useful for debugging of Address Range
Mirroring feature.

For example, if you specify "efi_fake_mem_mirror=2G@4G,2G@0x10a0000000",
the original (firmware provided) EFI memmap will be updated so that
the specified memory regions have EFI_MEMORY_MORE_RELIABLE attribute:

 <original EFI memmap>
   efi: mem00: [Boot Data  |  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000000000000-0x0000000000001000) (0MB)
   efi: mem01: [Loader Data|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000000001000-0x0000000000002000) (0MB)
   ...
   efi: mem35: [Boot Data  |  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000047ee6000-0x0000000048014000) (1MB)
   efi: mem36: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000100000000-0x00000020a0000000) (129536MB)
   efi: mem37: [Reserved   |RT|  |  |  |  |   |  |  |  |UC] 
range=[0x0000000060000000-0x0000000090000000) (768MB)

 <updated EFI memmap>
   efi: mem00: [Boot Data  |  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000000000000-0x0000000000001000) (0MB)
   efi: mem01: [Loader Data|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000000001000-0x0000000000002000) (0MB)
   ...
   efi: mem35: [Boot Data  |  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000047ee6000-0x0000000048014000) (1MB)
   efi: mem36: [Conventional Memory|  |MR|  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000100000000-0x0000000180000000) (2048MB)
   efi: mem37: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000000180000000-0x00000010a0000000) (61952MB)
   efi: mem38: [Conventional Memory|  |MR|  |  |  |   |WB|WT|WC|UC] 
range=[0x00000010a0000000-0x0000001120000000) (2048MB)
   efi: mem39: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0000001120000000-0x00000020a0000000) (63488MB)
   efi: mem40: [Reserved   |RT|  |  |  |  |   |  |  |  |UC] 
range=[0x0000000060000000-0x0000000090000000) (768MB)

And you will find that the following message is output:

   efi: Memory: 4096M/131455M mirrored memory

Signed-off-by: Taku Izumi 
---
 Documentation/kernel-parameters.txt |   8 ++
 arch/x86/include/asm/efi.h  |   1 +
 arch/x86/kernel/setup.c |   4 +-
 arch/x86/platform/efi/efi.c |   2 +-
 drivers/firmware/efi/Kconfig|  12 +++
 drivers/firmware/efi/Makefile   |   1 +
 drivers/firmware/efi/fake_mem.c | 204 
 include/linux/efi.h |   6 ++
 8 files changed, 236 insertions(+), 2 deletions(-)
 create mode 100644 drivers/firmware/efi/fake_mem.c

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 1d6f045..0efded6 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1092,6 +1092,14 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
you are really sure that your UEFI does sane gc and
fulfills the spec otherwise your board may brick.
 
+   efi_fake_mem_mirror=nn[KMG]@ss[KMG][,nn[KMG]@ss[KMG],..] [EFI; X86]
+   Mark specific memory as mirrored memory and update
+   EFI memory map.
+   Region of memory to be marked is from ss to ss+nn.
+   Using this parameter you can do debugging of Address
+   Range Mirroring feature even if your box doesn't support
+   it.
+
eisa_irq_edge=  [PARISC,HW]
See header of drivers/parisc/eisa.c.
 
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 155162e..479fd51 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -93,6 +93,7 @@ extern void __init efi_set_executable(efi_memory_desc_t *md, 
bool executable);
 extern int __init efi_memblock_x86_reserve_range(void);
 extern pgd_t * __init efi_call_phys_prolog(void);
 extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
+extern void __init print_efi_memmap(void);
 extern void __init efi_unmap_memmap(void);
 extern void __init efi_memory_uc(u64 addr, unsigned long size);
 extern void __init efi_map_region(efi_memory_desc_t *md);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 80f874b..e3ed628 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1104,8 +1104,10 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
 
-   if (efi_enabled(EFI_BOOT))
+   if (efi_enabled(EFI_BOOT)) {
+   efi_fake_memmap();
efi_find_mirror();
+   }
 
/*
 * The EFI specification says that boot service code won't be called
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index e4308fe..eee8068 100644
--- a/a

[PATCH v2 3/3] x86, efi: Add efi_fake_mem_mirror boot option

2015-08-26 Thread Taku Izumi
This patch introduces new boot option named "efi_fake_mem_mirror".
By specifying this parameter, you can mark specific memory as
mirrored memory. This is useful for debugging of Address Range
Mirroring feature.

For example, if you specify "efi_fake_mem_mirror=2G@4G,2G@0x10a0000000",
the original (firmware provided) EFI memmap will be updated so that
the specified memory regions have EFI_MEMORY_MORE_RELIABLE attribute:

 original EFI memmap
   efi: mem00: [Boot Data  |  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x-0x1000) (0MB)
   efi: mem01: [Loader Data|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x1000-0x2000) (0MB)
   ...
   efi: mem35: [Boot Data  |  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x47ee6000-0x48014000) (1MB)
   efi: mem36: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x0001-0x0020a000) (129536MB)
   efi: mem37: [Reserved   |RT|  |  |  |  |   |  |  |  |UC] 
range=[0x6000-0x9000) (768MB)

 updated EFI memmap
   efi: mem00: [Boot Data  |  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x-0x1000) (0MB)
   efi: mem01: [Loader Data|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x1000-0x2000) (0MB)
   ...
   efi: mem35: [Boot Data  |  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x47ee6000-0x48014000) (1MB)
   efi: mem36: [Conventional Memory|  |MR|  |  |  |   |WB|WT|WC|UC] 
range=[0x0001-0x00018000) (2048MB)
   efi: mem37: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x00018000-0x0010a000) (61952MB)
   efi: mem38: [Conventional Memory|  |MR|  |  |  |   |WB|WT|WC|UC] 
range=[0x0010a000-0x00112000) (2048MB)
   efi: mem39: [Conventional Memory|  |  |  |  |  |   |WB|WT|WC|UC] 
range=[0x00112000-0x0020a000) (63488MB)
   efi: mem40: [Reserved   |RT|  |  |  |  |   |  |  |  |UC] 
range=[0x6000-0x9000) (768MB)

And you will find that the following message is output:

   efi: Memory: 4096M/131455M mirrored memory

Signed-off-by: Taku Izumi izumi.t...@jp.fujitsu.com
---
 Documentation/kernel-parameters.txt |   8 ++
 arch/x86/include/asm/efi.h  |   1 +
 arch/x86/kernel/setup.c |   4 +-
 arch/x86/platform/efi/efi.c |   2 +-
 drivers/firmware/efi/Kconfig|  12 +++
 drivers/firmware/efi/Makefile   |   1 +
 drivers/firmware/efi/fake_mem.c | 204 
 include/linux/efi.h |   6 ++
 8 files changed, 236 insertions(+), 2 deletions(-)
 create mode 100644 drivers/firmware/efi/fake_mem.c

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 1d6f045..0efded6 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1092,6 +1092,14 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
you are really sure that your UEFI does sane gc and
fulfills the spec otherwise your board may brick.
 
+   efi_fake_mem_mirror=nn[KMG]@ss[KMG][,nn[KMG]@ss[KMG],..] [EFI; X86]
+   Mark specific memory as mirrored memory and update
+   EFI memory map.
+   Region of memory to be marked is from ss to ss+nn.
+   Using this parameter you can do debugging of Address
+   Range Mirroring feature even if your box doesn't support
+   it.
+
eisa_irq_edge=  [PARISC,HW]
See header of drivers/parisc/eisa.c.
 
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 155162e..479fd51 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -93,6 +93,7 @@ extern void __init efi_set_executable(efi_memory_desc_t *md, 
bool executable);
 extern int __init efi_memblock_x86_reserve_range(void);
 extern pgd_t * __init efi_call_phys_prolog(void);
 extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
+extern void __init print_efi_memmap(void);
 extern void __init efi_unmap_memmap(void);
 extern void __init efi_memory_uc(u64 addr, unsigned long size);
 extern void __init efi_map_region(efi_memory_desc_t *md);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 80f874b..e3ed628 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1104,8 +1104,10 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
 
-   if (efi_enabled(EFI_BOOT))
+   if (efi_enabled(EFI_BOOT)) {
+   efi_fake_memmap();
efi_find_mirror();
+   }
 
/*
 * The EFI specification says that boot service code won't be called
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c

[PATCH v2 2/3] efi: Change abbreviation of EFI_MEMORY_RUNTIME from RUN to RT

2015-08-26 Thread Taku Izumi
Now efi_md_typeattr_format() outputs "RUN" if passed EFI memory
descriptor has EFI_MEMORY_RUNTIME attribute. But "RT" is preferred
because it is shorter and clearer.

This patch changes abbreviation of EFI_MEMORY_RUNTIME from "RUN"
to "RT".

Suggested-by: Ard Biesheuvel ard.biesheu...@linaro.org
Signed-off-by: Taku Izumi izumi.t...@jp.fujitsu.com
---
 drivers/firmware/efi/efi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 8124078..25b6477 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -594,8 +594,8 @@ char * __init efi_md_typeattr_format(char *buf, size_t size,
snprintf(pos, size, "|attr=0x%016llx]",
 (unsigned long long)attr);
else
-   snprintf(pos, size, "|%3s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]",
-attr & EFI_MEMORY_RUNTIME ? "RUN" : "",
+   snprintf(pos, size, "|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]",
+attr & EFI_MEMORY_RUNTIME ? "RT" : "",
 attr & EFI_MEMORY_MORE_RELIABLE ? "MR" : "",
 attr & EFI_MEMORY_XP  ? "XP"  : "",
 attr & EFI_MEMORY_RP  ? "RP"  : "",
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 1/3] efi: Add EFI_MEMORY_MORE_RELIABLE support to efi_md_typeattr_format()

2015-08-26 Thread Taku Izumi
UEFI spec 2.5 introduces new Memory Attribute Definition named
EFI_MEMORY_MORE_RELIABLE. This patch adds this new attribute
support to efi_md_typeattr_format().

Signed-off-by: Taku Izumi izumi.t...@jp.fujitsu.com
---
 drivers/firmware/efi/efi.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index d6144e3..8124078 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -589,12 +589,14 @@ char * __init efi_md_typeattr_format(char *buf, size_t 
size,
attr = md->attribute;
if (attr & ~(EFI_MEMORY_UC | EFI_MEMORY_WC | EFI_MEMORY_WT |
 EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_WP |
-EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_RUNTIME))
+EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_RUNTIME |
+EFI_MEMORY_MORE_RELIABLE))
snprintf(pos, size, "|attr=0x%016llx]",
 (unsigned long long)attr);
else
-   snprintf(pos, size, "|%3s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]",
+   snprintf(pos, size, "|%3s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]",
 attr & EFI_MEMORY_RUNTIME ? "RUN" : "",
+attr & EFI_MEMORY_MORE_RELIABLE ? "MR" : "",
 attr & EFI_MEMORY_XP  ? "XP"  : "",
 attr & EFI_MEMORY_RP  ? "RP"  : "",
 attr & EFI_MEMORY_WP  ? "WP"  : "",
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 0/3] Introduce efi_fake_mem_mirror boot option

2015-08-26 Thread Taku Izumi
UEFI spec 2.5 introduces new Memory Attribute Definition named
EFI_MEMORY_MORE_RELIABLE which indicates which memory ranges are
mirrored. Now linux kernel can recognize which memory ranges are mirrored
by handling EFI_MEMORY_MORE_RELIABLE attributes.
However testing this feature necessitates boxes with UEFI spec 2.5 compliant
firmware.

This patchset introduces new boot option named "efi_fake_mem_mirror".
By specifying this parameter, you can mark specific memory as
mirrored memory. This is useful for debugging of Memory Address Range
Mirroring feature.

v1 -> v2:
 - change abbreviation of EFI_MEMORY_MORE_RELIABLE from "RELY" to "MR"
 - add patch (2/3) for changing abbreviation of EFI_MEMORY_RUNTIME
 - migrate some code from arch/x86/platform/efi/quirks to
   drivers/firmware/efi/fake_mem.c and create config EFI_FAKE_MEMMAP

Taku Izumi (3):
  efi: Add EFI_MEMORY_MORE_RELIABLE support to efi_md_typeattr_format()
  efi: Change abbreviation of EFI_MEMORY_RUNTIME from RUN to RT
  x86, efi: Add efi_fake_mem_mirror boot option

 Documentation/kernel-parameters.txt |   8 ++
 arch/x86/include/asm/efi.h  |   1 +
 arch/x86/kernel/setup.c |   4 +-
 arch/x86/platform/efi/efi.c |   2 +-
 drivers/firmware/efi/Kconfig|  12 +++
 drivers/firmware/efi/Makefile   |   1 +
 drivers/firmware/efi/efi.c  |   8 +-
 drivers/firmware/efi/fake_mem.c | 204 
 include/linux/efi.h |   6 ++
 9 files changed, 241 insertions(+), 5 deletions(-)
 create mode 100644 drivers/firmware/efi/fake_mem.c

-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2][RESEND] perf, x86: Fix multi-segment problem of perf_event_intel_uncore

2015-08-26 Thread Taku Izumi
In multi-segment system, uncore devices may belong to buses whose segment
number is other than 0.

  
  0000:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...

In that case relation of bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take account of PCI segment.
For example, buses 0000:ff and 0001:ff use the same entry of
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing segment-aware pci2phy_map instead.

 v1 -> v2:
   - Extract method named uncore_pcibus_to_physid to avoid repetition of
 retrieving phys_id code

Signed-off-by: Taku Izumi izumi.t...@jp.fujitsu.com
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 25 --
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  | 11 -
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 23 +-
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 53 --
 4 files changed, 94 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 21b5e38..0ed6f2b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,23 @@ static struct event_constraint uncore_constraint_fixed =
 struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
 
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+   int phys_id = -1;
+   struct pci2phy_map *map;
+
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(bus)) {
+   phys_id = map->pbus_to_physid[bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
+
+   return phys_id;
+}
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf)
 {
@@ -809,7 +827,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
int phys_id;
bool first_box = false;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,9 +874,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
-   int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   int i, cpu, phys_id;
bool last_box = false;
 
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 0f77f0a..6c96ee9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -117,6 +117,14 @@ struct uncore_event_desc {
const char *config;
 };
 
+struct pci2phy_map {
+   struct list_head list;
+   int segment;
+   int pbus_to_physid[256];
+};
+
+int uncore_pcibus_to_physid(struct pci_bus *bus);
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf);
 
@@ -317,7 +325,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, 
int idx);
 extern struct intel_uncore_type **uncore_msr_uncores;
 extern struct intel_uncore_type **uncore_pci_uncores;
 extern struct pci_driver *uncore_pci_driver;
-extern int uncore_pcibus_to_physid[256];
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
 extern struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 extern struct event_constraint uncore_constraint_empty;
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index b005a78..ccbc817 100644
--- a/arch/x86/kernel/cpu

[PATCH 0/2][RFC] Introduce "efi_fake_mem_mirror" boot option

2015-08-20 Thread Taku Izumi
UEFI spec 2.5 introduces new Memory Attribute Definition named
EFI_MEMORY_MORE_RELIABLE which indicates which memory ranges are
mirrored. Now linux kernel can recognize which memory ranges are mirrored
by handling EFI_MEMORY_MORE_RELIABLE attributes.
However testing this feature necessitates boxes with UEFI spec 2.5 compliant
firmware.

This patchset introduces new boot option named "efi_fake_mem_mirror".
By specifying this parameter, you can mark specific memory as
mirrored memory. This is useful for debugging of Memory Address Range
Mirroring feature.

Taku Izumi (2):
  efi: Add EFI_MEMORY_MORE_RELIABLE support to efi_md_typeattr_format()
  x86, efi: Add "efi_fake_mem_mirror" boot option

 Documentation/kernel-parameters.txt |   8 ++
 arch/x86/include/asm/efi.h  |   2 +
 arch/x86/kernel/setup.c |   4 +-
 arch/x86/platform/efi/efi.c |   2 +-
 arch/x86/platform/efi/quirks.c  | 169 
 drivers/firmware/efi/efi.c  |   6 +-
 6 files changed, 187 insertions(+), 4 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] efi: Add EFI_MEMORY_MORE_RELIABLE support to efi_md_typeattr_format()

2015-08-20 Thread Taku Izumi
UEFI spec 2.5 introduces new Memory Attribute Definition named
EFI_MEMORY_MORE_RELIABLE. This patch adds this new attribute
support to efi_md_typeattr_format().

Signed-off-by: Taku Izumi 
---
 drivers/firmware/efi/efi.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index d6144e3..aadc1c4 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -589,12 +589,14 @@ char * __init efi_md_typeattr_format(char *buf, size_t 
size,
attr = md->attribute;
if (attr & ~(EFI_MEMORY_UC | EFI_MEMORY_WC | EFI_MEMORY_WT |
 EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_WP |
-EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_RUNTIME))
+EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_RUNTIME |
+EFI_MEMORY_MORE_RELIABLE))
snprintf(pos, size, "|attr=0x%016llx]",
 (unsigned long long)attr);
else
-   snprintf(pos, size, "|%3s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]",
+   snprintf(pos, size, "|%3s|%4s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]",
 attr & EFI_MEMORY_RUNTIME ? "RUN" : "",
+attr & EFI_MEMORY_MORE_RELIABLE ? "RELY" : "",
 attr & EFI_MEMORY_XP  ? "XP"  : "",
 attr & EFI_MEMORY_RP  ? "RP"  : "",
 attr & EFI_MEMORY_WP  ? "WP"  : "",
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] x86, efi: Add "efi_fake_mem_mirror" boot option

2015-08-20 Thread Taku Izumi
This patch introduces new boot option named "efi_fake_mem_mirror".
By specifying this parameter, you can mark specific memory as
mirrored memory. This is useful for debugging of Address Range
Mirroring feature.

For example, if you specify "efi_fake_mem_mirror=2G@4G,2G@0x10a0000000",
the original (firmware provided) EFI memmap will be updated so that
the specified memory regions have EFI_MEMORY_MORE_RELIABLE attribute:

 
   efi: mem00: [Boot Data  |   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x-0x1000) (0MB)
   efi: mem01: [Loader Data|   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x1000-0x2000) (0MB)
   ...
   efi: mem35: [Boot Data  |   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x47ee6000-0x48014000) (1MB)
   efi: mem36: [Conventional Memory|   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x0001-0x0020a000) (129536MB)
   efi: mem37: [Reserved   |RUN||  |  |  |   |  |  |  |UC] 
range=[0x6000-0x9000) (768MB)

 
   efi: mem00: [Boot Data  |   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x-0x1000) (0MB)
   efi: mem01: [Loader Data|   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x1000-0x2000) (0MB)
   ...
   efi: mem35: [Boot Data  |   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x47ee6000-0x48014000) (1MB)
   efi: mem36: [Conventional Memory|   |RELY|  |  |  |   |WB|WT|WC|UC] 
range=[0x0001-0x00018000) (2048MB)
   efi: mem37: [Conventional Memory|   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x00018000-0x0010a000) (61952MB)
   efi: mem38: [Conventional Memory|   |RELY|  |  |  |   |WB|WT|WC|UC] 
range=[0x0010a000-0x00112000) (2048MB)
   efi: mem39: [Conventional Memory|   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x00112000-0x0020a000) (63488MB)
   efi: mem40: [Reserved   |RUN||  |  |  |   |  |  |  |UC] 
range=[0x6000-0x9000) (768MB)

And you will find that the following message is output:

   efi: Memory: 4096M/131455M mirrored memory

Signed-off-by: Taku Izumi 
---
 Documentation/kernel-parameters.txt |   8 ++
 arch/x86/include/asm/efi.h  |   2 +
 arch/x86/kernel/setup.c |   4 +-
 arch/x86/platform/efi/efi.c |   2 +-
 arch/x86/platform/efi/quirks.c  | 169 
 5 files changed, 183 insertions(+), 2 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 1d6f045..0efded6 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1092,6 +1092,14 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
you are really sure that your UEFI does sane gc and
fulfills the spec otherwise your board may brick.
 
+   efi_fake_mem_mirror=nn[KMG]@ss[KMG][,nn[KMG]@ss[KMG],..] [EFI; X86]
+   Mark specific memory as mirrored memory and update
+   EFI memory map.
+   Region of memory to be marked is from ss to ss+nn.
+   Using this parameter you can do debugging of Address
+   Range Mirroring feature even if your box doesn't support
+   it.
+
eisa_irq_edge=  [PARISC,HW]
See header of drivers/parisc/eisa.c.
 
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 155162e..50e53cc 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -93,6 +93,7 @@ extern void __init efi_set_executable(efi_memory_desc_t *md, 
bool executable);
 extern int __init efi_memblock_x86_reserve_range(void);
 extern pgd_t * __init efi_call_phys_prolog(void);
 extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
+extern void __init print_efi_memmap(void);
 extern void __init efi_unmap_memmap(void);
 extern void __init efi_memory_uc(u64 addr, unsigned long size);
 extern void __init efi_map_region(efi_memory_desc_t *md);
@@ -107,6 +108,7 @@ extern void __init efi_dump_pagetable(void);
 extern void __init efi_apply_memmap_quirks(void);
 extern int __init efi_reuse_config(u64 tables, int nr_tables);
 extern void efi_delete_dummy_variable(void);
+extern void __init efi_fake_memmap(void);
 
 struct efi_setup_data {
u64 fw_vendor;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 80f874b..e3ed628 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1104,8 +1104,10 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
 
-   if (efi_enabled(EFI_BOOT))
+   if (efi_enabled(EFI_BOOT)) {
+   efi_fake_memmap();
efi_find_mirror();
+   }
 
/*
 * The EF

[PATCH 1/2] efi: Add EFI_MEMORY_MORE_RELIABLE support to efi_md_typeattr_format()

2015-08-20 Thread Taku Izumi
UEFI spec 2.5 introduces new Memory Attribute Definition named
EFI_MEMORY_MORE_RELIABLE. This patch adds this new attribute
support to efi_md_typeattr_format().

Signed-off-by: Taku Izumi izumi.t...@jp.fujitsu.com
---
 drivers/firmware/efi/efi.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index d6144e3..aadc1c4 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -589,12 +589,14 @@ char * __init efi_md_typeattr_format(char *buf, size_t 
size,
attr = md->attribute;
if (attr & ~(EFI_MEMORY_UC | EFI_MEMORY_WC | EFI_MEMORY_WT |
 EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_WP |
-EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_RUNTIME))
+EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_RUNTIME |
+EFI_MEMORY_MORE_RELIABLE))
snprintf(pos, size, "|attr=0x%016llx]",
 (unsigned long long)attr);
else
-   snprintf(pos, size, "|%3s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]",
+   snprintf(pos, size, "|%3s|%4s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]",
 attr & EFI_MEMORY_RUNTIME ? "RUN" : "",
+attr & EFI_MEMORY_MORE_RELIABLE ? "RELY" : "",
 attr & EFI_MEMORY_XP  ? "XP"  : "",
 attr & EFI_MEMORY_RP  ? "RP"  : "",
 attr & EFI_MEMORY_WP  ? "WP"  : "",
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] x86, efi: Add efi_fake_mem_mirror boot option

2015-08-20 Thread Taku Izumi
This patch introduces new boot option named efi_fake_mem_mirror.
By specifying this parameter, you can mark specific memory as
mirrored memory. This is useful for debugging of Address Range
Mirroring feature.

For example, if you specify "efi_fake_mem_mirror=2G@4G,2G@0x10a000",
the original (firmware provided) EFI memmap will be updated so that
the specified memory regions have EFI_MEMORY_MORE_RELIABLE attribute:

 <original EFI memmap>
   efi: mem00: [Boot Data  |   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x-0x1000) (0MB)
   efi: mem01: [Loader Data|   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x1000-0x2000) (0MB)
   ...
   efi: mem35: [Boot Data  |   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x47ee6000-0x48014000) (1MB)
   efi: mem36: [Conventional Memory|   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x0001-0x0020a000) (129536MB)
   efi: mem37: [Reserved   |RUN||  |  |  |   |  |  |  |UC] 
range=[0x6000-0x9000) (768MB)

 <updated EFI memmap>
   efi: mem00: [Boot Data  |   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x-0x1000) (0MB)
   efi: mem01: [Loader Data|   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x1000-0x2000) (0MB)
   ...
   efi: mem35: [Boot Data  |   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x47ee6000-0x48014000) (1MB)
   efi: mem36: [Conventional Memory|   |RELY|  |  |  |   |WB|WT|WC|UC] 
range=[0x0001-0x00018000) (2048MB)
   efi: mem37: [Conventional Memory|   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x00018000-0x0010a000) (61952MB)
   efi: mem38: [Conventional Memory|   |RELY|  |  |  |   |WB|WT|WC|UC] 
range=[0x0010a000-0x00112000) (2048MB)
   efi: mem39: [Conventional Memory|   ||  |  |  |   |WB|WT|WC|UC] 
range=[0x00112000-0x0020a000) (63488MB)
   efi: mem40: [Reserved   |RUN||  |  |  |   |  |  |  |UC] 
range=[0x6000-0x9000) (768MB)

And you will find that the following message is output:

   efi: Memory: 4096M/131455M mirrored memory

Signed-off-by: Taku Izumi izumi.t...@jp.fujitsu.com
---
 Documentation/kernel-parameters.txt |   8 ++
 arch/x86/include/asm/efi.h  |   2 +
 arch/x86/kernel/setup.c |   4 +-
 arch/x86/platform/efi/efi.c |   2 +-
 arch/x86/platform/efi/quirks.c  | 169 
 5 files changed, 183 insertions(+), 2 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 1d6f045..0efded6 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1092,6 +1092,14 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
you are really sure that your UEFI does sane gc and
fulfills the spec otherwise your board may brick.
 
+   efi_fake_mem_mirror=nn[KMG]@ss[KMG][,nn[KMG]@ss[KMG],..] [EFI; X86]
+   Mark specific memory as mirrored memory and update
+   EFI memory map.
+   Region of memory to be marked is from ss to ss+nn.
+   Using this parameter you can do debugging of Address
+   Range Mirroring feature even if your box doesn't support
+   it.
+
eisa_irq_edge=  [PARISC,HW]
See header of drivers/parisc/eisa.c.
 
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 155162e..50e53cc 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -93,6 +93,7 @@ extern void __init efi_set_executable(efi_memory_desc_t *md, 
bool executable);
 extern int __init efi_memblock_x86_reserve_range(void);
 extern pgd_t * __init efi_call_phys_prolog(void);
 extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
+extern void __init print_efi_memmap(void);
 extern void __init efi_unmap_memmap(void);
 extern void __init efi_memory_uc(u64 addr, unsigned long size);
 extern void __init efi_map_region(efi_memory_desc_t *md);
@@ -107,6 +108,7 @@ extern void __init efi_dump_pagetable(void);
 extern void __init efi_apply_memmap_quirks(void);
 extern int __init efi_reuse_config(u64 tables, int nr_tables);
 extern void efi_delete_dummy_variable(void);
+extern void __init efi_fake_memmap(void);
 
 struct efi_setup_data {
u64 fw_vendor;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 80f874b..e3ed628 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1104,8 +1104,10 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
 
-   if (efi_enabled(EFI_BOOT))
+   if (efi_enabled(EFI_BOOT)) {
+   efi_fake_memmap();
efi_find_mirror

[PATCH 0/2][RFC] Introduce efi_fake_mem_mirror boot option

2015-08-20 Thread Taku Izumi
UEFI spec 2.5 introduces new Memory Attribute Definition named
EFI_MEMORY_MORE_RELIABLE which indicates which memory ranges are
mirrored. Now linux kernel can recognize which memory ranges are mirrored
by handling EFI_MEMORY_MORE_RELIABLE attributes.
However, testing this feature necessitates boxes with UEFI spec 2.5 compliant
firmware. 

This patchset introduces new boot option named efi_fake_mem_mirror.
By specifying this parameter, you can mark specific memory as
mirrored memory. This is useful for debugging of Memory Address Range
Mirroring feature.

Taku Izumi (2):
  efi: Add EFI_MEMORY_MORE_RELIABLE support to efi_md_typeattr_format()
  x86, efi: Add efi_fake_mem_mirror boot option

 Documentation/kernel-parameters.txt |   8 ++
 arch/x86/include/asm/efi.h  |   2 +
 arch/x86/kernel/setup.c |   4 +-
 arch/x86/platform/efi/efi.c |   2 +-
 arch/x86/platform/efi/quirks.c  | 169 
 drivers/firmware/efi/efi.c  |   6 +-
 6 files changed, 187 insertions(+), 4 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] perf: Fix multi-segment problem of perf_event_intel_uncore

2015-08-04 Thread Taku Izumi
In multi-segment system, uncore devices may belong to buses whose segment
number is other than 0.

  
  :ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03
  ...

In that case relation of bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take account of PCI segment.
For example, bus :ff and 0001:ff uses the same entry of
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing segment-aware pci2phy_map instead.

 v1->v2:
  - Extract method named uncore_pcibus_to_physid to avoid repetition of
retrieving phys_id code

Signed-off-by: Taku Izumi 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 25 --
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  | 11 -
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 23 +-
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 53 --
 4 files changed, 94 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 21b5e38..0ed6f2b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,23 @@ static struct event_constraint uncore_constraint_fixed =
 struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
 
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+   int phys_id = -1;
+   struct pci2phy_map *map;
+
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(bus)) {
+   phys_id = map->pbus_to_physid[bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
+
+   return phys_id;
+}
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf)
 {
@@ -809,7 +827,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
int phys_id;
bool first_box = false;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,9 +874,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
-   int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   int i, cpu, phys_id;
bool last_box = false;
 
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 0f77f0a..6c96ee9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -117,6 +117,14 @@ struct uncore_event_desc {
const char *config;
 };
 
+struct pci2phy_map {
+   struct list_head list;
+   int segment;
+   int pbus_to_physid[256];
+};
+
+int uncore_pcibus_to_physid(struct pci_bus *bus);
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf);
 
@@ -317,7 +325,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, 
int idx);
 extern struct intel_uncore_type **uncore_msr_uncores;
 extern struct intel_uncore_type **uncore_pci_uncores;
 extern struct pci_driver *uncore_pci_driver;
-extern int uncore_pcibus_to_physid[256];
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
 extern struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 extern struct event_constraint uncore_constraint_empty;
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index b005a78..ccbc817 10064

[PATCH v2] perf: Fix multi-segment problem of perf_event_intel_uncore

2015-08-04 Thread Taku Izumi
In multi-segment system, uncore devices may belong to buses whose segment
number is other than 0.

  
  :ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03
  ...

In that case relation of bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take account of PCI segment.
For example, bus :ff and 0001:ff uses the same entry of
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing segment-aware pci2phy_map instead.

 v1->v2:
  - Extract method named uncore_pcibus_to_physid to avoid repetition of
retrieving phys_id code

Signed-off-by: Taku Izumi izumi.t...@jp.fujitsu.com
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 25 --
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  | 11 -
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 23 +-
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 53 --
 4 files changed, 94 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 21b5e38..0ed6f2b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,23 @@ static struct event_constraint uncore_constraint_fixed =
 struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
 
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+   int phys_id = -1;
+   struct pci2phy_map *map;
+
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(bus)) {
+   phys_id = map->pbus_to_physid[bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
+
+   return phys_id;
+}
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf)
 {
@@ -809,7 +827,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
int phys_id;
bool first_box = false;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,9 +874,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
-   int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   int i, cpu, phys_id;
bool last_box = false;
 
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 0f77f0a..6c96ee9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -117,6 +117,14 @@ struct uncore_event_desc {
const char *config;
 };
 
+struct pci2phy_map {
+   struct list_head list;
+   int segment;
+   int pbus_to_physid[256];
+};
+
+int uncore_pcibus_to_physid(struct pci_bus *bus);
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf);
 
@@ -317,7 +325,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, 
int idx);
 extern struct intel_uncore_type **uncore_msr_uncores;
 extern struct intel_uncore_type **uncore_pci_uncores;
 extern struct pci_driver *uncore_pci_driver;
-extern int uncore_pcibus_to_physid[256];
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
 extern struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 extern struct event_constraint uncore_constraint_empty;
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index b005a78..ccbc817 100644
--- a/arch/x86/kernel/cpu

[PATCH] perf: Fix multi-segment problem of perf_event_intel_uncore

2015-06-30 Thread Taku Izumi
In multi-segment system, uncore devices may belong to buses whose segment
number is other than 0.

  
  :ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03
  ...

In that case relation of bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take account of PCI segment.
For example, bus :ff and 0001:ff uses the same entry of
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing segment-aware pci2phy_map instead.

Signed-off-by: Taku Izumi 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 27 +++---
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  |  9 -
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 23 +++-
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 41 ++
 4 files changed, 87 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 21b5e38..78c8686 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -806,10 +807,18 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
struct intel_uncore_pmu *pmu;
struct intel_uncore_box *box;
struct intel_uncore_type *type;
-   int phys_id;
+   int phys_id = -1;
bool first_box = false;
+   struct pci2phy_map *map;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(pdev->bus)) {
+   phys_id = map->pbus_to_physid[pdev->bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,8 +865,18 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
-   int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   int i, cpu, phys_id = -1;
bool last_box = false;
+   struct pci2phy_map *map;
+
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(pdev->bus)) {
+   phys_id = map->pbus_to_physid[pdev->bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
 
box = pci_get_drvdata(pdev);
if (!box) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 0f77f0a..0fb2a23 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -117,6 +117,12 @@ struct uncore_event_desc {
const char *config;
 };
 
+struct pci2phy_map {
+   struct list_head list;
+   int segment;
+   int pbus_to_physid[256];
+};
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf);
 
@@ -317,7 +323,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, 
int idx);
 extern struct intel_uncore_type **uncore_msr_uncores;
 extern struct intel_uncore_type **uncore_pci_uncores;
 extern struct pci_driver *uncore_pci_driver;
-extern int uncore_pcibus_to_physid[256];
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
 extern struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 extern struct event_constraint uncore_constraint_empty;
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index b005a78..ccbc817 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -402,14 +402,35 @@ static int snb_pci2phy_map_init(int devid)
 {
   

[PATCH] perf: Fix multi-segment problem of perf_event_intel_uncore

2015-06-30 Thread Taku Izumi
In multi-segment system, uncore devices may belong to buses whose segment
number is other than 0.

  
  :ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03
  ...

In that case relation of bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take account of PCI segment.
For example, bus :ff and 0001:ff uses the same entry of
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing segment-aware pci2phy_map instead.

Signed-off-by: Taku Izumi izumi.t...@jp.fujitsu.com
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 27 +++---
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  |  9 -
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 23 +++-
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 41 ++
 4 files changed, 87 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 21b5e38..78c8686 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -806,10 +807,18 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
struct intel_uncore_pmu *pmu;
struct intel_uncore_box *box;
struct intel_uncore_type *type;
-   int phys_id;
+   int phys_id = -1;
bool first_box = false;
+   struct pci2phy_map *map;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(pdev->bus)) {
+   phys_id = map->pbus_to_physid[pdev->bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,8 +865,18 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
-   int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   int i, cpu, phys_id = -1;
bool last_box = false;
+   struct pci2phy_map *map;
+
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(pdev->bus)) {
+   phys_id = map->pbus_to_physid[pdev->bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
 
box = pci_get_drvdata(pdev);
if (!box) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 0f77f0a..0fb2a23 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -117,6 +117,12 @@ struct uncore_event_desc {
const char *config;
 };
 
+struct pci2phy_map {
+   struct list_head list;
+   int segment;
+   int pbus_to_physid[256];
+};
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf);
 
@@ -317,7 +323,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, 
int idx);
 extern struct intel_uncore_type **uncore_msr_uncores;
 extern struct intel_uncore_type **uncore_pci_uncores;
 extern struct pci_driver *uncore_pci_driver;
-extern int uncore_pcibus_to_physid[256];
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
 extern struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 extern struct event_constraint uncore_constraint_empty;
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index b005a78..ccbc817 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -402,14 +402,35 @@ static int snb_pci2phy_map_init(int devid)
 {
struct pci_dev *dev = NULL;
int bus

Re: [PATCH 2/3 v2] Do not use acpi_device to find pci root bridge in _init code.

2012-10-14 Thread Taku Izumi
On Fri, 12 Oct 2012 20:34:20 +0800
Tang Chen  wrote:

> When the kernel is being initialized, and some hardwares are not added
> to system, there won't be acpi_device structs for these devices. But
> acpi_is_root_bridge() depends on acpi_device struct. As a result, all
> the not-added root bridge will not be judged as a root bridge in
> find_root_bridges(). And further more, no handle_hotplug_event_root()
> notifier will be installed for them.
> 
> This patch introduces a new api to find all root bridges in system by
> getting HID directly from ACPI namespace, not depending on acpi_device
> struct.

  How about squashing patch #2 into patch #1?
  In my opinion, the caller and the callee should live in the same patch.

  Best regards,
  Taku Izumi

> Signed-off-by: Tang Chen 
> Signed-off-by: Liu Jiang 
> ---
>  drivers/acpi/pci_root.c |   19 +++
>  1 files changed, 11 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
> index 6151d83..582eb11 100644
> --- a/drivers/acpi/pci_root.c
> +++ b/drivers/acpi/pci_root.c
> @@ -129,20 +129,23 @@ EXPORT_SYMBOL_GPL(acpi_get_pci_rootbridge_handle);
>   * acpi_is_root_bridge - determine whether an ACPI CA node is a PCI root 
> bridge
>   * @handle - the ACPI CA node in question.
>   *
> - * Note: we could make this API take a struct acpi_device * instead, but
> - * for now, it's more convenient to operate on an acpi_handle.
> + * Note: If a device is not added to the system yet, there won't be an
> + * acpi_device struct for it. So do not get HID and CID from acpi_device,
> + * get them from ACPI namespace directly.
>   */
>  int acpi_is_root_bridge(acpi_handle handle)
>  {
> - int ret;
> - struct acpi_device *device;
> + struct acpi_device_info *info;
> + acpi_status status;
>  
> - ret = acpi_bus_get_device(handle, &device);
> - if (ret)
> + status = acpi_get_object_info(handle, &info);
> + if (ACPI_FAILURE(status)) {
> + printk(KERN_ERR PREFIX "%s: Error reading"
> +"device info\n", __func__);
>   return 0;
> + }
>  
> - ret = acpi_match_device_ids(device, root_device_ids);
> - if (ret)
> + if (acpi_match_object_info_ids(info, root_device_ids))
>   return 0;
>   else
>   return 1;
> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 


-- 
Taku Izumi 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/3 v2] Do not use acpi_device to find pci root bridge in _init code.

2012-10-14 Thread Taku Izumi
On Fri, 12 Oct 2012 20:34:20 +0800
Tang Chen tangc...@cn.fujitsu.com wrote:

 When the kernel is being initialized, and some hardwares are not added
 to system, there won't be acpi_device structs for these devices. But
 acpi_is_root_bridge() depends on acpi_device struct. As a result, all
 the not-added root bridge will not be judged as a root bridge in
 find_root_bridges(). And further more, no handle_hotplug_event_root()
 notifier will be installed for them.
 
 This patch introduces a new api to find all root bridges in system by
 getting HID directly from ACPI namespace, not depending on acpi_device
 struct.

  How about squashing patch #2 into patch #1?
  In my opinion, the caller and the callee should live in the same patch.

  Best regards,
  Taku Izumi

 Signed-off-by: Tang Chen tangc...@cn.fujitsu.com
 Signed-off-by: Liu Jiang jiang@huawei.com
 ---
  drivers/acpi/pci_root.c |   19 +++
  1 files changed, 11 insertions(+), 8 deletions(-)
 
 diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
 index 6151d83..582eb11 100644
 --- a/drivers/acpi/pci_root.c
 +++ b/drivers/acpi/pci_root.c
 @@ -129,20 +129,23 @@ EXPORT_SYMBOL_GPL(acpi_get_pci_rootbridge_handle);
   * acpi_is_root_bridge - determine whether an ACPI CA node is a PCI root 
 bridge
   * @handle - the ACPI CA node in question.
   *
 - * Note: we could make this API take a struct acpi_device * instead, but
 - * for now, it's more convenient to operate on an acpi_handle.
 + * Note: If a device is not added to the system yet, there won't be an
 + * acpi_device struct for it. So do not get HID and CID from acpi_device,
 + * get them from ACPI namespace directly.
   */
  int acpi_is_root_bridge(acpi_handle handle)
  {
 - int ret;
 - struct acpi_device *device;
 + struct acpi_device_info *info;
 + acpi_status status;
  
 - ret = acpi_bus_get_device(handle, &device);
 - if (ret)
 + status = acpi_get_object_info(handle, &info);
 + if (ACPI_FAILURE(status)) {
 + printk(KERN_ERR PREFIX "%s: Error reading"
 +"device info\n", __func__);
   return 0;
 + }
  
 - ret = acpi_match_device_ids(device, root_device_ids);
 - if (ret)
 + if (acpi_match_object_info_ids(info, root_device_ids))
   return 0;
   else
   return 1;
 -- 
 1.7.1
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-kernel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 Please read the FAQ at  http://www.tux.org/lkml/
 


-- 
Taku Izumi izumi.t...@jp.fujitsu.com

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH 2/3] ACPIHP: ACPI system device hotplug slot enumerator

2012-08-03 Thread Taku Izumi
andled by acpiphp or pciehp drivers.
> +  */
> + if (type == ACPIHP_DEV_TYPE_HOST_BRIDGE)
> + return AE_CTRL_DEPTH;
> +
> + return AE_OK;
> +}
> +
> +/*
> + * Get types of child devices connected to this slot.
> + * We only care about CPU, memory, PCI host bridge and CONTAINER here.
> + * Values used here must be in consistence with acpihp_enum_get_slot_type().
> + */
> +static acpi_status __init
> +acpihp_enum_get_dev_type(acpi_handle handle, u32 lvl, void *context, void 
> **rv)
> +{
> + acpi_status status = AE_OK;
> + enum acpihp_dev_type type;
> + u32 *tp = (u32 *)rv;
> +
> + if (!acpihp_dev_get_type(handle, &type)) {
> + switch (type) {
> + case ACPIHP_DEV_TYPE_CPU:
> + *tp |= 0x0001;
> + status = AE_CTRL_DEPTH;
> + break;
> + case ACPIHP_DEV_TYPE_MEM:
> + *tp |= 0x0002;
> + status = AE_CTRL_DEPTH;
> + break;
> + case ACPIHP_DEV_TYPE_HOST_BRIDGE:
> + *tp |= 0x0004;
> + status = AE_CTRL_DEPTH;
> + break;
> + case ACPIHP_DEV_TYPE_CONTAINER:
> + *tp |= 0x0008;
> + break;
> + default:
> + break;
> + }
> + }
> +
> + return status;
> +}
> +
> +/*
> + * Guess type of a hotplug slot according to child devices connecting to it.
> + */
> +static enum acpihp_slot_type __init acpihp_enum_get_slot_type(u32 dev_types)
> +{
> +     BUG_ON(dev_types > 15);
> +
> + switch (dev_types) {
> + case 0:
> + /* Generic CONTAINER */
> + return ACPIHP_SLOT_TYPE_COMMON;
> + case 1:
> + /* Physical processor with logical CPUs */
> + return ACPIHP_SLOT_TYPE_CPU;
> + case 2:
> + /* Memory board/box with memory devices */
> + return ACPIHP_SLOT_TYPE_MEM;
> + case 3:
> + /* Physical processor with CPUs and memory controllers */
> + return ACPIHP_SLOT_TYPE_CPU;
> + case 4:
> + /* IO eXtension board/box with IO host bridges */
> + return ACPIHP_SLOT_TYPE_IOX;
> + case 7:
> + /* Physical processor with CPUs, IO host bridges and MCs. */
> + return ACPIHP_SLOT_TYPE_CPU;


   Why does this case return ACPIHP_SLOT_TYPE_CPU?
   I think it should be ACPIHP_SLOT_TYPE_COMMON or something else.
   By the way, how about simplifying the slot type categories?
   Do we need to differentiate cases 7, 8, 9, 11 and 15?
 
   Best regards,
   Taku Izumi


> + case 8:
> + /* Generic CONTAINER */
> + return ACPIHP_SLOT_TYPE_COMMON;
> + case 9:
> + /* System board with physical processors */
> + return ACPIHP_SLOT_TYPE_SYSTEM_BOARD;
> + case 11:
> + /* System board with physical processors and memory */
> + return ACPIHP_SLOT_TYPE_SYSTEM_BOARD;
> + case 15:
> + /* Node with processor, memory and IO host bridge */
> + return ACPIHP_SLOT_TYPE_NODE;
> + default:
> + return ACPIHP_SLOT_TYPE_UNKNOWN;
> + }
> +}
> +
> +/*
> + * Guess type of a hotplug slot according to the device type of the
> + * corresponding ACPI object itself.
> + */
> +static enum acpihp_slot_type __init
> +acpihp_enum_check_slot_self(struct acpihp_slot *slot)
> +{
> + enum acpihp_dev_type type;
> +
> + if (acpihp_dev_get_type(slot->handle, &type))
> + return ACPIHP_SLOT_TYPE_UNKNOWN;
> +
> + switch (type) {
> + case ACPIHP_DEV_TYPE_CPU:
> + /* Logical CPU used in virtualization environment */
> + return ACPIHP_SLOT_TYPE_CPU;
> + case ACPIHP_DEV_TYPE_MEM:
> + /* Memory board with single memory device */
> + return ACPIHP_SLOT_TYPE_MEM;
> + case ACPIHP_DEV_TYPE_HOST_BRIDGE:
> + /* IO eXtension board/box with single IO host bridge */
> + return ACPIHP_SLOT_TYPE_IOX;
> + default:
> + return ACPIHP_SLOT_TYPE_UNKNOWN;
> + }
> +}
> +
> +static int __init acpihp_enum_generate_slot_name(struct acpihp_slot *slot)
> +{
> + int found = 0;
> + struct list_head *list;
> + struct acpihp_slot_id *slot_id;
> + unsigned long long uid;
> +
> + /* Respect firmware settings if _UID return an integer. */
> + if (ACPI_SUCCESS(acpi_evaluate_integer(slot->handle, METHOD_NAME__UID,
> +NULL, &uid)))
> + 

Re: [RFC PATCH 2/3] ACPIHP: ACPI system device hotplug slot enumerator

2012-08-03 Thread Taku Izumi
;
 + break;
 + case ACPIHP_DEV_TYPE_HOST_BRIDGE:
 + *tp |= 0x0004;
 + status = AE_CTRL_DEPTH;
 + break;
 + case ACPIHP_DEV_TYPE_CONTAINER:
 + *tp |= 0x0008;
 + break;
 + default:
 + break;
 + }
 + }
 +
 + return status;
 +}
 +
 +/*
 + * Guess type of a hotplug slot according to child devices connecting to it.
 + */
 +static enum acpihp_slot_type __init acpihp_enum_get_slot_type(u32 dev_types)
 +{
 + BUG_ON(dev_types > 15);
 +
 + switch (dev_types) {
 + case 0:
 + /* Generic CONTAINER */
 + return ACPIHP_SLOT_TYPE_COMMON;
 + case 1:
 + /* Physical processor with logical CPUs */
 + return ACPIHP_SLOT_TYPE_CPU;
 + case 2:
 + /* Memory board/box with memory devices */
 + return ACPIHP_SLOT_TYPE_MEM;
 + case 3:
 + /* Physical processor with CPUs and memory controllers */
 + return ACPIHP_SLOT_TYPE_CPU;
 + case 4:
 + /* IO eXtension board/box with IO host bridges */
 + return ACPIHP_SLOT_TYPE_IOX;
 + case 7:
 + /* Physical processor with CPUs, IO host bridges and MCs. */
 + return ACPIHP_SLOT_TYPE_CPU;


   Why does this case return ACPIHP_SLOT_TYPE_CPU?
   I think it should be ACPIHP_SLOT_TYPE_COMMON or something else.
   By the way, how about simplifying the slot type categories?
   Do we need to differentiate cases 7, 8, 9, 11 and 15?
 
   Best regards,
   Taku Izumi


 + case 8:
 + /* Generic CONTAINER */
 + return ACPIHP_SLOT_TYPE_COMMON;
 + case 9:
 + /* System board with physical processors */
 + return ACPIHP_SLOT_TYPE_SYSTEM_BOARD;
 + case 11:
 + /* System board with physical processors and memory */
 + return ACPIHP_SLOT_TYPE_SYSTEM_BOARD;
 + case 15:
 + /* Node with processor, memory and IO host bridge */
 + return ACPIHP_SLOT_TYPE_NODE;
 + default:
 + return ACPIHP_SLOT_TYPE_UNKNOWN;
 + }
 +}
 +
 +/*
 + * Guess type of a hotplug slot according to the device type of the
 + * corresponding ACPI object itself.
 + */
 +static enum acpihp_slot_type __init
 +acpihp_enum_check_slot_self(struct acpihp_slot *slot)
 +{
 + enum acpihp_dev_type type;
 +
 + if (acpihp_dev_get_type(slot->handle, &type))
 + return ACPIHP_SLOT_TYPE_UNKNOWN;
 +
 + switch (type) {
 + case ACPIHP_DEV_TYPE_CPU:
 + /* Logical CPU used in virtualization environment */
 + return ACPIHP_SLOT_TYPE_CPU;
 + case ACPIHP_DEV_TYPE_MEM:
 + /* Memory board with single memory device */
 + return ACPIHP_SLOT_TYPE_MEM;
 + case ACPIHP_DEV_TYPE_HOST_BRIDGE:
 + /* IO eXtension board/box with single IO host bridge */
 + return ACPIHP_SLOT_TYPE_IOX;
 + default:
 + return ACPIHP_SLOT_TYPE_UNKNOWN;
 + }
 +}
 +
 +static int __init acpihp_enum_generate_slot_name(struct acpihp_slot *slot)
 +{
 + int found = 0;
 + struct list_head *list;
 + struct acpihp_slot_id *slot_id;
 + unsigned long long uid;
 +
 + /* Respect firmware settings if _UID return an integer. */
 + if (ACPI_SUCCESS(acpi_evaluate_integer(slot-handle, METHOD_NAME__UID,
 +NULL, uid)))
 + goto set_name;
 +
 + if (slot-parent)
 + list = slot-parent-slot_id_list;
 + else
 + list = slot_id_list;
 +
 + list_for_each_entry(slot_id, list, node)
 + if (slot_id-type == slot-type) {
 + found = 1;
 + break;
 + }
 + if (!found) {
 + slot_id = kzalloc(sizeof(struct acpihp_slot_id), GFP_KERNEL);
 + if (!slot_id) {
 + ACPIHP_DEBUG(fails to allocate slot instance ID.\n);
 + return -ENOMEM;
 + }
 + slot_id-type = slot-type;
 + list_add_tail(slot_id-node, list);
 + }
 +
 + uid = slot_id-instance_id++;
 +
 +set_name:
 + snprintf(slot-name, sizeof(slot-name) - 1, %s%02llx,
 +  acpihp_get_slot_type_name(slot-type), uid);
 + dev_set_name(slot-dev, %s, slot-name);
 +
 + return 0;
 +}
 +
 +/*
 + * Generate a meaningful name for the slot according to devices connecting
 + * to this slot
 + */
 +static int __init acpihp_enum_rename_slot(struct acpihp_slot *slot)
 +{
 + u32 child_types = 0;
 +
 + slot-type = acpihp_enum_check_slot_self(slot);
 + if (slot-type == ACPIHP_SLOT_TYPE_UNKNOWN) {
 + acpi_walk_namespace(ACPI_TYPE_DEVICE, slot-handle,
 + ACPI_UINT32_MAX, acpihp_enum_get_dev_type,
 + NULL, NULL, (void **)child_types);
 + acpi_walk_namespace

Re: [RFC PATCH 01/14] PCI: add pcie_flags into struct pci_dev to cache PCIe capabilities register

2012-07-11 Thread Taku Izumi
On Tue, 10 Jul 2012 23:54:02 +0800
Jiang Liu  wrote:

> From: Yijing Wang 
> 
> From: Yijing Wang 
> 
> Since PCI Express Capabilities Register is read only, cache its value
> into struct pci_dev to avoid repeatedly calling pci_read_config_*().
> 
> Signed-off-by: Yijing Wang 
> Signed-off-by: Jiang Liu 
> ---
>  drivers/pci/probe.c |1 +
>  include/linux/pci.h |1 +
>  2 files changed, 2 insertions(+)
> 
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index 6c143b4..65e82e3 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -929,6 +929,7 @@ void set_pcie_port_type(struct pci_dev *pdev)
>   pdev->is_pcie = 1;
>   pdev->pcie_cap = pos;
>   pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
> + pdev->pcie_flags = reg16;
>   pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
>   pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
>   pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 5faa831..f4a7ad6 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -258,6 +258,7 @@ struct pci_dev {
>   u8  pcie_mpss:3;/* PCI-E Max Payload Size Supported */
>   u8  rom_base_reg;   /* which config register controls the 
> ROM */
>   u8  pin;/* which interrupt pin this device uses 
> */
> + u16 pcie_flags; /* cached PCI-E Capabilities Register */

 "xxx_flags" sounds like a set of bit flags. This variable stores the value of
 the PCIe capability register, doesn't it? How about "pcie_cap_reg"?

>  
>   struct pci_driver *driver;  /* which driver has allocated this 
> device */
>   u64 dma_mask;   /* Mask of the bits of bus address this
> -- 
> 1.7.9.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


-- 
Taku Izumi 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH 01/14] PCI: add pcie_flags into struct pci_dev to cache PCIe capabilities register

2012-07-11 Thread Taku Izumi
On Tue, 10 Jul 2012 23:54:02 +0800
Jiang Liu liu...@gmail.com wrote:

 From: Yijing Wang wangyij...@huawei.com
 
 From: Yijing Wang wangyij...@huawei.com
 
 Since PCI Express Capabilities Register is read only, cache its value
 into struct pci_dev to avoid repeatedly calling pci_read_config_*().
 
 Signed-off-by: Yijing Wang wangyij...@huawei.com
 Signed-off-by: Jiang Liu liu...@gmail.com
 ---
  drivers/pci/probe.c |1 +
  include/linux/pci.h |1 +
  2 files changed, 2 insertions(+)
 
 diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
 index 6c143b4..65e82e3 100644
 --- a/drivers/pci/probe.c
 +++ b/drivers/pci/probe.c
 @@ -929,6 +929,7 @@ void set_pcie_port_type(struct pci_dev *pdev)
   pdev->is_pcie = 1;
   pdev->pcie_cap = pos;
   pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
 + pdev->pcie_flags = reg16;
   pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
   pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
   pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
 diff --git a/include/linux/pci.h b/include/linux/pci.h
 index 5faa831..f4a7ad6 100644
 --- a/include/linux/pci.h
 +++ b/include/linux/pci.h
 @@ -258,6 +258,7 @@ struct pci_dev {
   u8  pcie_mpss:3;/* PCI-E Max Payload Size Supported */
   u8  rom_base_reg;   /* which config register controls the 
 ROM */
   u8  pin;/* which interrupt pin this device uses 
 */
 + u16 pcie_flags; /* cached PCI-E Capabilities Register */

 "xxx_flags" sounds like a set of bit flags. This variable stores the value of
 the PCIe capability register, doesn't it? How about "pcie_cap_reg"?

  
   struct pci_driver *driver;  /* which driver has allocated this 
 device */
   u64 dma_mask;   /* Mask of the bits of bus address this
 -- 
 1.7.9.5
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-pci in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 


-- 
Taku Izumi izumi.t...@jp.fujitsu.com

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: performance-improvement-of-serial-console-via-virtual.patch added to -mm tree

2005-09-08 Thread Taku Izumi
Dear Russell:

>I don't think we want this.  With early serial console, tx_loadsz is
>not guaranteed to be initialised, and may in fact be zero.

>Plus there's no guarantee that the FIFOs will actually be enabled, so
>I think it's better that this patch doesn't go to mainline.

Our server has a virtual serial port, but its performance seems to be poor.
It takes 10 seconds to output 4000 characters (from the kernel) to the serial
console. With my patch applied, its performance improves to 0.4 seconds per
4000 characters of output, so I think it is useful to use the FIFO in the
serial8250_console_write function, as the transmit_chars function does. What
should I correct in order to use the FIFO?

Taku Izumi <[EMAIL PROTECTED]>


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: performance-improvement-of-serial-console-via-virtual.patch added to -mm tree

2005-09-08 Thread Taku Izumi
Dear Russell:

I don't think we want this.  With early serial console, tx_loadsz is
not guaranteed to be initialised, and may in fact be zero.

Plus there's no guarantee that the FIFOs will actually be enabled, so
I think it's better that this patch doesn't go to mainline.

Our server has a virtual serial port, but its performance seems to be poor.
It takes 10 seconds to output 4000 characters (from the kernel) to the serial
console. With my patch applied, its performance improves to 0.4 seconds per
4000 characters of output, so I think it is useful to use the FIFO in the
serial8250_console_write function, as the transmit_chars function does. What
should I correct in order to use the FIFO?

Taku Izumi [EMAIL PROTECTED]


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch] Peformance improvement of serial console via virtual serial port

2005-09-05 Thread Taku Izumi
Greetings.

This patch improves the performance of the serial console by using the FIFO.
I think the original serial driver is not efficient, but because the transfer
rate of a real serial port is low, this problem has not been exposed.

However, because the transfer rate of a virtual serial port (e.g. Serial Over
Ethernet) is higher than that of a real serial port, the problem becomes
visible there. Output via a virtual serial port is still slow with the
original serial driver; I think the original serial driver is the bottleneck.


Taku Izumi <[EMAIL PROTECTED]>

--patch>8

diff -Npur linux-2.6.9.org/drivers/serial/8250.c
linux-2.6.9/drivers/serial/8250.c
--- linux-2.6.9.org/drivers/serial/8250.c   2005-08-08
11:17:38.556373366 +0900
+++ linux-2.6.9/drivers/serial/8250.c   2005-08-08 11:41:03.759131389 +0900
@@ -1943,18 +1943,33 @@ serial8250_console_write(struct console
/*
 *  Now, do each character
 */
-   for (i = 0; i < count; i++, s++) {
-   wait_for_xmitr(up);
+   for (i = 0; i < count; ) {
+   int fifo ;

+   wait_for_xmitr(up);
+   fifo = up->tx_loadsz ;
/*
-*  Send the character out.
+*  Send the character out using FIFO.
 *  If a LF, also do CR...
 */
-   serial_out(up, UART_TX, *s);
-   if (*s == 10) {
-   wait_for_xmitr(up);
-   serial_out(up, UART_TX, 13);
-   }
+   do {
+   serial_out(up, UART_TX, *s);
+   fifo-- ;
+   if (*s == 10) {
+   if (fifo > 0) {
+   serial_out(up, UART_TX, 13);
+   fifo--;
+   } else {
+   /* No room to add CR */
+   wait_for_xmitr(up);
+   fifo = up->tx_loadsz ;
+   serial_out(up, UART_TX, 13);
+   fifo--;
+   }
+   }
+   i++ ;
+   s++ ;
+   } while (fifo > 0 && i < count ) ;
}

/*
--


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[Patch] Peformance improvement of serial console via virtual serial port

2005-09-05 Thread Taku Izumi
Greetings.

This patch improves the performance of the serial console by using the FIFO.
I think the original serial driver is not efficient, but because the transfer
rate of a real serial port is low, this problem has not been exposed.

However, because the transfer rate of a virtual serial port (e.g. Serial Over
Ethernet) is higher than that of a real serial port, the problem becomes
visible there. Output via a virtual serial port is still slow with the
original serial driver; I think the original serial driver is the bottleneck.


Taku Izumi [EMAIL PROTECTED]

--patch8

diff -Npur linux-2.6.9.org/drivers/serial/8250.c
linux-2.6.9/drivers/serial/8250.c
--- linux-2.6.9.org/drivers/serial/8250.c   2005-08-08
11:17:38.556373366 +0900
+++ linux-2.6.9/drivers/serial/8250.c   2005-08-08 11:41:03.759131389 +0900
@@ -1943,18 +1943,33 @@ serial8250_console_write(struct console
/*
 *  Now, do each character
 */
-   for (i = 0; i < count; i++, s++) {
-   wait_for_xmitr(up);
+   for (i = 0; i < count; ) {
+   int fifo ;

+   wait_for_xmitr(up);
+   fifo = up->tx_loadsz ;
/*
-*  Send the character out.
+*  Send the character out using FIFO.
 *  If a LF, also do CR...
 */
-   serial_out(up, UART_TX, *s);
-   if (*s == 10) {
-   wait_for_xmitr(up);
-   serial_out(up, UART_TX, 13);
-   }
+   do {
+   serial_out(up, UART_TX, *s);
+   fifo-- ;
+   if (*s == 10) {
+   if (fifo > 0) {
+   serial_out(up, UART_TX, 13);
+   fifo--;
+   } else {
+   /* No room to add CR */
+   wait_for_xmitr(up);
+   fifo = up->tx_loadsz ;
+   serial_out(up, UART_TX, 13);
+   fifo--;
+   }
+   }
+   i++ ;
+   s++ ;
+   } while (fifo > 0 && i < count ) ;
}

/*
--


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/