The primary mapping of the kernel image is made using huge pages where
possible, mostly to minimize TLB pressure (only the entry text section
strictly requires 2 MiB alignment). This involves some rounding and
padding of the .text and .rodata sections, resulting in gaps. These
gaps are smaller than a huge page and are remapped using different
permissions, which fragments the huge page mappings at the edges of
those regions.
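
For instance, in the defconfig listing below, .text ends at
0xffffffff8231c000 and .rodata starts at the next 2 MiB boundary,
0xffffffff82400000, leaving a gap that can only be mapped with 4k
pages:

  0xffffffff82400000 - 0xffffffff8231c000 = 0xe4000 = 912 KiB

This is the 912K RW+NX pte region in the "Before" listing.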

Similarly, there is a gap between .data and .bss, where the init text
and data regions reside. This means that the end of the .data region and
the start of the .bss region are not covered by huge page mappings
either, even though both regions use the same permissions (RW+NX).
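
This can be seen in the pti=on "Before" listing further down, where
the freed init region splits the RW area and forces 4k mappings on
both sides (annotations mine):

  0xffffffff83400000-0xffffffff8342a000   168K  RW  NX pte  <- end of .data
  0xffffffff8342a000-0xffffffff836f3000  2852K        pte   <- freed init region
  0xffffffff836f3000-0xffffffff83800000  1076K  RW  NX pte  <- start of .bss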

Improve the situation by placing .data and .bss adjacently in the
linker map, and putting the init text and data regions after .rodata,
taking the place of the rodata/data gap. This results in one fewer gap
and a more efficient mapping of the .data and .bss regions.
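
Schematically, with other sections omitted:

  Before:  .text | .rodata | .data | init text+data | .bss
  After:   .text | .rodata | init text+data | .data | .bss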

To preserve the x86_64 ELF layout with PT_LOAD regions aligned to 2 MiB,
start the second ELF segment at .init.data and align it to 2 MiB.  The
resulting padding will be covered by the init region and will be freed
along with it after boot.
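
Concretely, the new segment boundary is expressed in the linker script
as follows (condensed from the vmlinux.lds.S hunks below):

  #define DATA_SEGMENT_START                \
          . = ALIGN(HPAGE_SIZE);            \
          __data_segment_start = .;

  ...
          DATA_SEGMENT_START
          INIT_DATA_SECTION(16) :data

This makes .init.data the first section of the second PT_LOAD segment,
with the ALIGN(HPAGE_SIZE) padding absorbed into the freeable init
region.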

defconfig + Clang 19:

Before:

  0xffffffff81000000-0xffffffff82200000    18M  ro  PSE  GLB x  pmd
  0xffffffff82200000-0xffffffff8231c000  1136K  ro       GLB x  pte
  0xffffffff8231c000-0xffffffff82400000   912K  RW       GLB NX pte
  0xffffffff82400000-0xffffffff82a00000     6M  ro  PSE  GLB NX pmd
  0xffffffff82a00000-0xffffffff82b40000  1280K  ro       GLB NX pte
  0xffffffff82b40000-0xffffffff82c00000   768K  RW       GLB NX pte
  0xffffffff82c00000-0xffffffff83400000     8M  RW  PSE  GLB NX pmd
  0xffffffff83400000-0xffffffff83800000     4M  RW       GLB NX pte

After:

  0xffffffff81000000-0xffffffff82200000    18M  ro  PSE  GLB x  pmd
  0xffffffff82200000-0xffffffff8231c000  1136K  ro       GLB x  pte
  0xffffffff8231c000-0xffffffff82400000   912K  RW       GLB NX pte
  0xffffffff82400000-0xffffffff82a00000     6M  ro  PSE  GLB NX pmd
  0xffffffff82a00000-0xffffffff82b40000  1280K  ro       GLB NX pte
  0xffffffff82b40000-0xffffffff82c00000   768K  RW       GLB NX pte
  0xffffffff82c00000-0xffffffff82e00000     2M  RW  PSE  GLB NX pmd
  0xffffffff82e00000-0xffffffff83000000     2M  RW       GLB NX pte
  0xffffffff83000000-0xffffffff83800000     8M  RW  PSE  GLB NX pmd

With the gaps removed/unmapped (pti=on):

Before:

  0xffffffff81000000-0xffffffff81200000     2M  ro  PSE  GLB x  pmd
  0xffffffff81200000-0xffffffff82200000    16M  ro  PSE      x  pmd
  0xffffffff82200000-0xffffffff8231c000  1136K  ro           x  pte
  0xffffffff8231c000-0xffffffff82400000   912K                  pte
  0xffffffff82400000-0xffffffff82a00000     6M  ro  PSE      NX pmd
  0xffffffff82a00000-0xffffffff82b40000  1280K  ro           NX pte
  0xffffffff82b40000-0xffffffff82c00000   768K                  pte
  0xffffffff82c00000-0xffffffff83400000     8M  RW  PSE      NX pmd
  0xffffffff83400000-0xffffffff8342a000   168K  RW           NX pte
  0xffffffff8342a000-0xffffffff836f3000  2852K                  pte
  0xffffffff836f3000-0xffffffff83800000  1076K  RW           NX pte

After:

  0xffffffff81000000-0xffffffff81200000     2M  ro  PSE  GLB x  pmd
  0xffffffff81200000-0xffffffff82200000    16M  ro  PSE      x  pmd
  0xffffffff82200000-0xffffffff8231c000  1136K  ro           x  pte
  0xffffffff8231c000-0xffffffff82400000   912K                  pte
  0xffffffff82400000-0xffffffff82a00000     6M  ro  PSE      NX pmd
  0xffffffff82a00000-0xffffffff82b40000  1280K  ro           NX pte
  0xffffffff82b40000-0xffffffff82e3d000  3060K                  pte
  0xffffffff82e3d000-0xffffffff83000000  1804K  RW           NX pte
  0xffffffff83000000-0xffffffff83800000     8M  RW  PSE      NX pmd

Signed-off-by: Ard Biesheuvel <[email protected]>
---
 arch/x86/kernel/vmlinux.lds.S | 91 +++++++++++---------
 arch/x86/mm/init_64.c         |  5 +-
 arch/x86/mm/pat/set_memory.c  |  2 +-
 3 files changed, 52 insertions(+), 46 deletions(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 3a24a3fc55f5..1dee2987c42b 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -61,12 +61,15 @@ const_cpu_current_top_of_stack = cpu_current_top_of_stack;
 #define X86_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
 
 #define X86_ALIGN_RODATA_END                                   \
-               . = ALIGN(HPAGE_SIZE);                          \
-               __end_rodata_hpage_align = .;                   \
-               __end_rodata_aligned = .;
+               . = ALIGN(PAGE_SIZE);                           \
+               __end_rodata_aligned = ALIGN(HPAGE_SIZE);
 
 #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
 #define ALIGN_ENTRY_TEXT_END   . = ALIGN(PMD_SIZE);
+
+#define DATA_SEGMENT_START                                     \
+       . = ALIGN(HPAGE_SIZE);                                  \
+       __data_segment_start = .;
 #else
 
 #define X86_ALIGN_RODATA_BEGIN
@@ -76,9 +79,14 @@ const_cpu_current_top_of_stack = cpu_current_top_of_stack;
 
 #define ALIGN_ENTRY_TEXT_BEGIN
 #define ALIGN_ENTRY_TEXT_END
+
+#define DATA_SEGMENT_START                                     \
+       . = ALIGN(PAGE_SIZE);                                   \
+       __data_segment_start = .;
 #endif
 
 #ifdef CONFIG_AMD_MEM_ENCRYPT
+
 /*
  * This section contains data which will be mapped as decrypted. Memory
  * encryption operates on a page basis. Make this section PMD-aligned
@@ -171,43 +179,6 @@ SECTIONS
        RO_DATA(PAGE_SIZE)
        X86_ALIGN_RODATA_END
 
-       /* Data */
-       .data : AT(ADDR(.data) - LOAD_OFFSET) {
-               /* Start of data section */
-               _sdata = .;
-
-               /* init_task */
-               INIT_TASK_DATA(THREAD_SIZE)
-
-               /* equivalent to task_pt_regs(&init_task) */
-               __top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE;
-
-#ifdef CONFIG_X86_32
-               /* 32 bit has nosave before _edata */
-               NOSAVE_DATA
-#endif
-
-               PAGE_ALIGNED_DATA(PAGE_SIZE)
-
-               CACHE_HOT_DATA(L1_CACHE_BYTES)
-
-               CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
-
-               DATA_DATA
-               CONSTRUCTORS
-               KEXEC_RELOCATE_KERNEL
-
-               /* rarely changed data like cpu maps */
-               READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
-
-               /* End of data section */
-               _edata = .;
-       } :data
-
-       BUG_TABLE
-
-       ORC_UNWIND_TABLE
-
        /* Init code and data - will be freed after init */
        . = ALIGN(PAGE_SIZE);
        .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
@@ -229,7 +200,8 @@ SECTIONS
                __inittext_end = .;
        }
 
-       INIT_DATA_SECTION(16)
+       DATA_SEGMENT_START
+       INIT_DATA_SECTION(16) :data
 
        .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
                __x86_cpu_dev_start = .;
@@ -358,6 +330,43 @@ SECTIONS
                __smp_locks_end = .;
        }
 
+       /* Data */
+       .data : AT(ADDR(.data) - LOAD_OFFSET) {
+               /* Start of data section */
+               _sdata = .;
+
+               /* init_task */
+               INIT_TASK_DATA(THREAD_SIZE)
+
+               /* equivalent to task_pt_regs(&init_task) */
+               __top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE;
+
+#ifdef CONFIG_X86_32
+               /* 32 bit has nosave before _edata */
+               NOSAVE_DATA
+#endif
+
+               PAGE_ALIGNED_DATA(PAGE_SIZE)
+
+               CACHE_HOT_DATA(L1_CACHE_BYTES)
+
+               CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
+
+               DATA_DATA
+               CONSTRUCTORS
+               KEXEC_RELOCATE_KERNEL
+
+               /* rarely changed data like cpu maps */
+               READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
+
+               /* End of data section */
+               _edata = .;
+       }
+
+       BUG_TABLE
+
+       ORC_UNWIND_TABLE
+
 #ifdef CONFIG_X86_64
        .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
                NOSAVE_DATA
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9983017ecbe0..6c2120dd5607 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1397,9 +1397,8 @@ void mark_rodata_ro(void)
 {
        unsigned long start = PFN_ALIGN(_text);
        unsigned long rodata_start = PFN_ALIGN(__start_rodata);
-       unsigned long end = (unsigned long)__end_rodata_hpage_align;
+       unsigned long end = (unsigned long)__end_rodata;
        unsigned long text_end = PFN_ALIGN(_etext);
-       unsigned long rodata_end = PFN_ALIGN(__end_rodata);
        unsigned long all_end;
 
        printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
@@ -1435,8 +1434,6 @@ void mark_rodata_ro(void)
 
        free_kernel_image_pages("unused kernel image (text/rodata gap)",
                                (void *)text_end, (void *)rodata_start);
-       free_kernel_image_pages("unused kernel image (rodata/data gap)",
-                               (void *)rodata_end, (void *)_sdata);
 }
 
 /*
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 6c6eb486f7a6..ad4d55f2413b 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -554,7 +554,7 @@ static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
 static pgprotval_t protect_kernel_text_ro(unsigned long start,
                                          unsigned long end)
 {
-       unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
+       unsigned long t_end = (unsigned long)__end_rodata - 1;
        unsigned long t_start = (unsigned long)_text;
        unsigned int level;
 
-- 
2.47.3

