[PATCH -v3 11/14] x86: Only direct map addresses that are marked as E820_RAM

2012-09-04 Thread Yinghai Lu
From: Jacob Shin <jacob.s...@amd.com>

Currently direct mappings are created for [ 0 to max_low_pfn<<PAGE_SHIFT )
and [ 4GB to max_pfn<<PAGE_SHIFT ), which may include regions that are not
backed by actual DRAM. This is fine for holes under 4GB which are covered
by fixed and variable range MTRRs to be UC. However, we run into trouble
on higher memory addresses which cannot be covered by MTRRs.

Our system with 1TB of RAM has an e820 that looks like this:

 BIOS-e820: [mem 0x0000000000000000-0x00000000000983ff] usable
 BIOS-e820: [mem 0x0000000000098400-0x000000000009ffff] reserved
 BIOS-e820: [mem 0x00000000000d0000-0x00000000000fffff] reserved
 BIOS-e820: [mem 0x0000000000100000-0x00000000c7ebffff] usable
 BIOS-e820: [mem 0x00000000c7ec0000-0x00000000c7ed7fff] ACPI data
 BIOS-e820: [mem 0x00000000c7ed8000-0x00000000c7ed9fff] ACPI NVS
 BIOS-e820: [mem 0x00000000c7eda000-0x00000000c7ffffff] reserved
 BIOS-e820: [mem 0x00000000fec00000-0x00000000fec0ffff] reserved
 BIOS-e820: [mem 0x00000000fee00000-0x00000000fee00fff] reserved
 BIOS-e820: [mem 0x00000000fff00000-0x00000000ffffffff] reserved
 BIOS-e820: [mem 0x0000000100000000-0x000000e037ffffff] usable
 BIOS-e820: [mem 0x000000e038000000-0x000000fcffffffff] reserved
 BIOS-e820: [mem 0x0000010000000000-0x0000011ffeffffff] usable

and so direct mappings are created for huge memory hole between
0x000000e038000000 to 0x0000010000000000. Even though the kernel never
generates memory accesses in that region, since the page tables mark
them incorrectly as being WB, our (AMD) processor ends up causing a MCE
while doing some memory bookkeeping/optimizations around that area.

This patch iterates through e820 and only direct maps ranges that are
marked as E820_RAM, and keeps track of those pfn ranges. Depending on
the alignment of E820 ranges, this may possibly result in using smaller
size (i.e. 4K instead of 2M or 1G) page tables.

-v2: move changes from setup.c to mm/init.c, also use for_each_mem_pfn_range
instead.  - Yinghai Lu
-v3: add calculate_all_table_space_size() to get correct needed page table
size. - Yinghai Lu

Signed-off-by: Jacob Shin <jacob.s...@amd.com>
Signed-off-by: Yinghai Lu <ying...@kernel.org>
Reviewed-by: Pekka Enberg <penb...@kernel.org>
---
 arch/x86/include/asm/page_types.h |8 +--
 arch/x86/kernel/setup.c   |8 ++-
 arch/x86/mm/init.c|  119 +
 arch/x86/mm/init_64.c |6 +-
 4 files changed, 116 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/page_types.h 
b/arch/x86/include/asm/page_types.h
index 45aae6e..54c9787 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,13 +51,7 @@ static inline phys_addr_t get_max_mapped(void)
return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
 }
 
-static inline bool pfn_range_is_mapped(unsigned long start_pfn,
-   unsigned long end_pfn)
-{
-   return end_pfn <= max_low_pfn_mapped ||
-  (end_pfn > (1UL << (32 - PAGE_SHIFT)) &&
-   end_pfn <= max_pfn_mapped);
-}
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
 
 extern unsigned long init_memory_mapping(unsigned long start,
 unsigned long end);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index de72acc..9ccef07 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -115,9 +115,11 @@
 #include <asm/prom.h>
 
 /*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
+ * max_low_pfn_mapped: highest direct mapped pfn under 4GB
+ * max_pfn_mapped: highest direct mapped pfn over 4GB
+ *
+ * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
+ * represented by pfn_mapped
  */
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a475d7f..47b6e41 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -234,6 +234,38 @@ static unsigned long __init 
calculate_table_space_size(unsigned long begin,
return tables;
 }
 
+static unsigned long __init calculate_all_table_space_size(void)
+{
+   unsigned long start_pfn, end_pfn;
+   unsigned long tables;
+   int i;
+
+   /* the ISA range is always mapped regardless of memory holes */
+   tables = calculate_table_space_size(0, ISA_END_ADDRESS);
+
+   for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+   u64 start = start_pfn << PAGE_SHIFT;
+   u64 end = end_pfn << PAGE_SHIFT;
+
+   if (end <= ISA_END_ADDRESS)
+   continue;
+
+   if (start < ISA_END_ADDRESS)
+   start = ISA_END_ADDRESS;
+#ifdef CONFIG_X86_32
+   /* on 32 bit, we only map up to max_low_pfn */
+   if ((start >> PAGE_SHIFT) >= max_low_pfn)
+   continue;
+
+   if ((end >> PAGE_SHIFT) > max_low_pfn)
+   end = max_low_pfn << PAGE_SHIFT;
+#endif
+   tables += calculate_table_space_size(start, end);
+   }
+
+   return tables;
+}
+
 static void __init find_early_table_space(unsigned long start,
  unsigned long good_end,
  unsigned long tables)
@@ -249,6 +281,33 @@ static void __init find_early_table_space(unsigned long 
start,
pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
 }
 
+static struct range pfn_mapped[E820_X_MAX];
+static int nr_pfn_mapped;
+
+static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long 
end_pfn)
+{
+   nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
+nr_pfn_mapped, start_pfn, end_pfn);
+   nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
+
+   max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+
+   if (end_pfn <= (1UL << (32 - PAGE_SHIFT)))
+   max_low_pfn_mapped = max(max_low_pfn_mapped, end_pfn);
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+   int i;
+
+   for (i = 0; i < nr_pfn_mapped; i++)
+   if ((start_pfn >= pfn_mapped[i].start) &&
+   (end_pfn <= pfn_mapped[i].end))
+   return true;
+
+   return false;
+}
+
 /*
  * Setup the direct mapping of the physical memory at PAGE_OFFSET.
  * This runs before bootmem is initialized and gets pages directly from
@@ -280,9 +339,55 @@ unsigned long __init_refok init_memory_mapping(unsigned 
long start,
 
__flush_tlb_all();
 
+   add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
+
return ret >> PAGE_SHIFT;
 }
 
+/*
+ * Iterate through E820 memory map and create direct mappings for only E820_RAM
+ * regions. We cannot simply create direct mappings for all pfns from
+ * [0 to max_low_pfn) and [4GB to max_pfn) because parts of those ranges
+ * are not backed by actual E820_RAM.

[PATCH -v3 11/14] x86: Only direct map addresses that are marked as E820_RAM

2012-09-04 Thread Yinghai Lu
From: Jacob Shin <jacob.s...@amd.com>

Currently direct mappings are created for [ 0 to max_low_pfn<<PAGE_SHIFT )
and [ 4GB to max_pfn<<PAGE_SHIFT ), which may include regions that are not
backed by actual DRAM. This is fine for holes under 4GB which are covered
by fixed and variable range MTRRs to be UC. However, we run into trouble
on higher memory addresses which cannot be covered by MTRRs.

Our system with 1TB of RAM has an e820 that looks like this:

 BIOS-e820: [mem 0x0000000000000000-0x00000000000983ff] usable
 BIOS-e820: [mem 0x0000000000098400-0x000000000009ffff] reserved
 BIOS-e820: [mem 0x00000000000d0000-0x00000000000fffff] reserved
 BIOS-e820: [mem 0x0000000000100000-0x00000000c7ebffff] usable
 BIOS-e820: [mem 0x00000000c7ec0000-0x00000000c7ed7fff] ACPI data
 BIOS-e820: [mem 0x00000000c7ed8000-0x00000000c7ed9fff] ACPI NVS
 BIOS-e820: [mem 0x00000000c7eda000-0x00000000c7ffffff] reserved
 BIOS-e820: [mem 0x00000000fec00000-0x00000000fec0ffff] reserved
 BIOS-e820: [mem 0x00000000fee00000-0x00000000fee00fff] reserved
 BIOS-e820: [mem 0x00000000fff00000-0x00000000ffffffff] reserved
 BIOS-e820: [mem 0x0000000100000000-0x000000e037ffffff] usable
 BIOS-e820: [mem 0x000000e038000000-0x000000fcffffffff] reserved
 BIOS-e820: [mem 0x0000010000000000-0x0000011ffeffffff] usable

and so direct mappings are created for huge memory hole between
0x000000e038000000 to 0x0000010000000000. Even though the kernel never
generates memory accesses in that region, since the page tables mark
them incorrectly as being WB, our (AMD) processor ends up causing a MCE
while doing some memory bookkeeping/optimizations around that area.

This patch iterates through e820 and only direct maps ranges that are
marked as E820_RAM, and keeps track of those pfn ranges. Depending on
the alignment of E820 ranges, this may possibly result in using smaller
size (i.e. 4K instead of 2M or 1G) page tables.

-v2: move changes from setup.c to mm/init.c, also use for_each_mem_pfn_range
instead.  - Yinghai Lu
-v3: add calculate_all_table_space_size() to get correct needed page table
size. - Yinghai Lu

Signed-off-by: Jacob Shin <jacob.s...@amd.com>
Signed-off-by: Yinghai Lu <ying...@kernel.org>
Reviewed-by: Pekka Enberg <penb...@kernel.org>
---
 arch/x86/include/asm/page_types.h |8 +--
 arch/x86/kernel/setup.c   |8 ++-
 arch/x86/mm/init.c|  119 +
 arch/x86/mm/init_64.c |6 +-
 4 files changed, 116 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/page_types.h 
b/arch/x86/include/asm/page_types.h
index 45aae6e..54c9787 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,13 +51,7 @@ static inline phys_addr_t get_max_mapped(void)
return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
 }
 
-static inline bool pfn_range_is_mapped(unsigned long start_pfn,
-   unsigned long end_pfn)
-{
-   return end_pfn <= max_low_pfn_mapped ||
-  (end_pfn > (1UL << (32 - PAGE_SHIFT)) &&
-   end_pfn <= max_pfn_mapped);
-}
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
 
 extern unsigned long init_memory_mapping(unsigned long start,
 unsigned long end);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index de72acc..9ccef07 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -115,9 +115,11 @@
 #include <asm/prom.h>
 
 /*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
+ * max_low_pfn_mapped: highest direct mapped pfn under 4GB
+ * max_pfn_mapped: highest direct mapped pfn over 4GB
+ *
+ * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
+ * represented by pfn_mapped
  */
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a475d7f..47b6e41 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -234,6 +234,38 @@ static unsigned long __init 
calculate_table_space_size(unsigned long begin,
return tables;
 }
 
+static unsigned long __init calculate_all_table_space_size(void)
+{
+   unsigned long start_pfn, end_pfn;
+   unsigned long tables;
+   int i;
+
+   /* the ISA range is always mapped regardless of memory holes */
+   tables = calculate_table_space_size(0, ISA_END_ADDRESS);
+
+   for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+   u64 start = start_pfn << PAGE_SHIFT;
+   u64 end = end_pfn << PAGE_SHIFT;
+
+   if (end <= ISA_END_ADDRESS)
+   continue;
+
+   if (start < ISA_END_ADDRESS)
+   start = ISA_END_ADDRESS;
+#ifdef CONFIG_X86_32
+