[PATCH v5 09/15] riscv: extend execmem_params for generated code allocations

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

The memory allocations for kprobes and BPF on RISC-V are not placed in
the modules area; these custom allocations are implemented with
overrides of alloc_insn_page() and bpf_jit_alloc_exec().

Slightly reorder execmem_params initialization to support both 32- and
64-bit variants, define EXECMEM_KPROBES and EXECMEM_BPF ranges in
riscv::execmem_params and drop the overrides of alloc_insn_page() and
bpf_jit_alloc_exec().
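
With these ranges in place, the custom overrides can go away because the
generic helpers from the earlier patches in this series already do the
right thing. A rough sketch of what the dropped functions reduce to
(illustrative, not the literal generic code):

        /* generic kprobes: the EXECMEM_KPROBES range supplies
         * VMALLOC_START/VMALLOC_END and PAGE_KERNEL_READ_EXEC */
        void *alloc_insn_page(void)
        {
                return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
        }

        /* generic BPF core: the EXECMEM_BPF range supplies the
         * BPF_JIT_REGION_* bounds and PAGE_SIZE alignment */
        void *bpf_jit_alloc_exec(unsigned long size)
        {
                return execmem_alloc(EXECMEM_BPF, size);
        }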

Signed-off-by: Mike Rapoport (IBM) 
Reviewed-by: Alexandre Ghiti 
---
 arch/riscv/kernel/module.c | 28 +---
 arch/riscv/kernel/probes/kprobes.c | 10 --
 arch/riscv/net/bpf_jit_core.c  | 13 -
 3 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
index 182904127ba0..2ecbacbc9993 100644
--- a/arch/riscv/kernel/module.c
+++ b/arch/riscv/kernel/module.c
@@ -906,19 +906,41 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
return 0;
 }
 
-#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
+#ifdef CONFIG_MMU
 static struct execmem_info execmem_info __ro_after_init;
 
 struct execmem_info __init *execmem_arch_setup(void)
 {
+   unsigned long start, end;
+
+   if (IS_ENABLED(CONFIG_64BIT)) {
+   start = MODULES_VADDR;
+   end = MODULES_END;
+   } else {
+   start = VMALLOC_START;
+   end = VMALLOC_END;
+   }
+
execmem_info = (struct execmem_info){
.ranges = {
[EXECMEM_DEFAULT] = {
-   .start  = MODULES_VADDR,
-   .end= MODULES_END,
+   .start  = start,
+   .end= end,
.pgprot = PAGE_KERNEL,
.alignment = 1,
},
+   [EXECMEM_KPROBES] = {
+   .start  = VMALLOC_START,
+   .end= VMALLOC_END,
+   .pgprot = PAGE_KERNEL_READ_EXEC,
+   .alignment = 1,
+   },
+   [EXECMEM_BPF] = {
+   .start  = BPF_JIT_REGION_START,
+   .end= BPF_JIT_REGION_END,
+   .pgprot = PAGE_KERNEL,
+   .alignment = PAGE_SIZE,
+   },
},
};
 
diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c
index 2f08c14a933d..e64f2f3064eb 100644
--- a/arch/riscv/kernel/probes/kprobes.c
+++ b/arch/riscv/kernel/probes/kprobes.c
@@ -104,16 +104,6 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
return 0;
 }
 
-#ifdef CONFIG_MMU
-void *alloc_insn_page(void)
-{
-   return  __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
-GFP_KERNEL, PAGE_KERNEL_READ_EXEC,
-VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
-__builtin_return_address(0));
-}
-#endif
-
 /* install breakpoint in text */
 void __kprobes arch_arm_kprobe(struct kprobe *p)
 {
diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c
index 6b3acac30c06..e238fdbd5dbc 100644
--- a/arch/riscv/net/bpf_jit_core.c
+++ b/arch/riscv/net/bpf_jit_core.c
@@ -219,19 +219,6 @@ u64 bpf_jit_alloc_exec_limit(void)
return BPF_JIT_REGION_SIZE;
 }
 
-void *bpf_jit_alloc_exec(unsigned long size)
-{
-   return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
-   BPF_JIT_REGION_END, GFP_KERNEL,
-   PAGE_KERNEL, 0, NUMA_NO_NODE,
-   __builtin_return_address(0));
-}
-
-void bpf_jit_free_exec(void *addr)
-{
-   return vfree(addr);
-}
-
 void *bpf_arch_text_copy(void *dst, void *src, size_t len)
 {
int ret;
-- 
2.43.0



[PATCH v5 11/15] arch: make execmem setup available regardless of CONFIG_MODULES

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

execmem does not depend on modules; on the contrary, modules use
execmem.

To make execmem available when CONFIG_MODULES=n, for instance for
kprobes, split execmem_params initialization out from
arch/*/kernel/module.c and compile it when CONFIG_EXECMEM=y.
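
For most architectures the moved code lands in an existing arch/*/mm/
file under an #ifdef CONFIG_EXECMEM guard, as in the arm hunk below.
sparc instead gains a new arch/sparc/mm/execmem.c, so its two-line
Makefile change is presumably just wiring that file up, along the lines
of:

        obj-$(CONFIG_EXECMEM) += execmem.o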

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/arm/kernel/module.c   |  43 --
 arch/arm/mm/init.c |  45 +++
 arch/arm64/kernel/module.c | 140 -
 arch/arm64/mm/init.c   | 140 +
 arch/loongarch/kernel/module.c |  19 -
 arch/loongarch/mm/init.c   |  21 +
 arch/mips/kernel/module.c  |  22 --
 arch/mips/mm/init.c|  23 ++
 arch/nios2/kernel/module.c |  20 -
 arch/nios2/mm/init.c   |  21 +
 arch/parisc/kernel/module.c|  20 -
 arch/parisc/mm/init.c  |  23 +-
 arch/powerpc/kernel/module.c   |  63 ---
 arch/powerpc/mm/mem.c  |  64 +++
 arch/riscv/kernel/module.c |  44 ---
 arch/riscv/mm/init.c   |  45 +++
 arch/s390/kernel/module.c  |  27 ---
 arch/s390/mm/init.c|  30 +++
 arch/sparc/kernel/module.c |  19 -
 arch/sparc/mm/Makefile |   2 +
 arch/sparc/mm/execmem.c|  21 +
 arch/x86/kernel/module.c   |  27 ---
 arch/x86/mm/init.c |  29 +++
 23 files changed, 463 insertions(+), 445 deletions(-)
 create mode 100644 arch/sparc/mm/execmem.c

diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index a98fdf6ff26c..677f218f7e84 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -12,57 +12,14 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
-#include 
-#include 
 
 #include 
 #include 
 #include 
 #include 
 
-#ifdef CONFIG_XIP_KERNEL
-/*
- * The XIP kernel text is mapped in the module area for modules and
- * some other stuff to work without any indirect relocations.
- * MODULES_VADDR is redefined here and not in asm/memory.h to avoid
- * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off.
- */
-#undef MODULES_VADDR
-#define MODULES_VADDR  (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK)
-#endif
-
-#ifdef CONFIG_MMU
-static struct execmem_info execmem_info __ro_after_init;
-
-struct execmem_info __init *execmem_arch_setup(void)
-{
-   unsigned long fallback_start = 0, fallback_end = 0;
-
-   if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) {
-   fallback_start = VMALLOC_START;
-   fallback_end = VMALLOC_END;
-   }
-
-   execmem_info = (struct execmem_info){
-   .ranges = {
-   [EXECMEM_DEFAULT] = {
-   .start  = MODULES_VADDR,
-   .end= MODULES_END,
-   .pgprot = PAGE_KERNEL_EXEC,
-   .alignment = 1,
-   .fallback_start = fallback_start,
-   .fallback_end   = fallback_end,
-   },
-   },
-   };
-
-   return &execmem_info;
-}
-#endif
-
 bool module_init_section(const char *name)
 {
return strstarts(name, ".init") ||
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index e8c6f4be0ce1..5345d218899a 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 
 #include 
 #include 
@@ -486,3 +487,47 @@ void free_initrd_mem(unsigned long start, unsigned long end)
free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
+
+#ifdef CONFIG_EXECMEM
+
+#ifdef CONFIG_XIP_KERNEL
+/*
+ * The XIP kernel text is mapped in the module area for modules and
+ * some other stuff to work without any indirect relocations.
+ * MODULES_VADDR is redefined here and not in asm/memory.h to avoid
+ * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off.
+ */
+#undef MODULES_VADDR
+#define MODULES_VADDR  (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK)
+#endif
+
+#ifdef CONFIG_MMU
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
+{
+   unsigned long fallback_start = 0, fallback_end = 0;
+
+   if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) {
+   fallback_start = VMALLOC_START;
+   fallback_end = VMALLOC_END;
+   }
+
+   execmem_info = (struct execmem_info){
+   .ranges = {
+   [EXECMEM_DEFAULT] = {
+   .start  = MODULES_VADDR,
+   .end= MODULES_END,
+   .pgprot = PAGE_KERNEL_EXEC,
+   .alignment = 1,
+   .fallback_start = fallback_start,
+   

[PATCH v5 10/15] powerpc: extend execmem_params for kprobes allocations

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

powerpc overrides kprobes::alloc_insn_page() to remove writable
permissions when STRICT_MODULE_RWX is on.

Add definition of EXECMEM_KPROBES to execmem_params to allow using the
generic kprobes::alloc_insn_page() with the desired permissions.

As powerpc uses breakpoint instructions to inject kprobes, it does not
need to constrain kprobe allocations to the modules area and can use the
entire vmalloc address space.
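
The range's pgprot thus encodes what the removed override did
imperatively: with STRICT_MODULE_RWX the pages come back ROX straight
from the allocator and the set_memory_rox() fixup disappears.
Schematically (illustrative only, not literal kernel code):

        /* before: allocate writable, then flip the permissions */
        page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
        err = set_memory_rox((unsigned long)page, 1);

        /* after: the EXECMEM_KPROBES range carries PAGE_KERNEL_ROX,
         * so the allocation is read-only-executable from the start */
        page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);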

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/powerpc/kernel/kprobes.c | 20 
 arch/powerpc/kernel/module.c  |  7 +++
 2 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 9fcd01bb2ce6..14c5ddec3056 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -126,26 +126,6 @@ kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offse
return (kprobe_opcode_t *)(addr + offset);
 }
 
-void *alloc_insn_page(void)
-{
-   void *page;
-
-   page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
-   if (!page)
-   return NULL;
-
-   if (strict_module_rwx_enabled()) {
-   int err = set_memory_rox((unsigned long)page, 1);
-
-   if (err)
-   goto error;
-   }
-   return page;
-error:
-   execmem_free(page);
-   return NULL;
-}
-
 int arch_prepare_kprobe(struct kprobe *p)
 {
int ret = 0;
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index ac80559015a3..2a23cf7e141b 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -94,6 +94,7 @@ static struct execmem_info execmem_info __ro_after_init;
 
 struct execmem_info __init *execmem_arch_setup(void)
 {
+   pgprot_t kprobes_prot = strict_module_rwx_enabled() ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
    pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC;
unsigned long fallback_start = 0, fallback_end = 0;
unsigned long start, end;
@@ -132,6 +133,12 @@ struct execmem_info __init *execmem_arch_setup(void)
.fallback_start = fallback_start,
.fallback_end   = fallback_end,
},
+   [EXECMEM_KPROBES] = {
+   .start  = VMALLOC_START,
+   .end= VMALLOC_END,
+   .pgprot = kprobes_prot,
+   .alignment = 1,
+   },
[EXECMEM_MODULE_DATA] = {
.start  = VMALLOC_START,
.end= VMALLOC_END,
-- 
2.43.0




[PATCH v5 08/15] mm/execmem, arch: convert remaining overrides of module_alloc to execmem

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Extend execmem parameters to accommodate more complex overrides of
module_alloc() by architectures.

This includes specification of a fallback range required by arm, arm64
and powerpc, the EXECMEM_MODULE_DATA type required by powerpc, support
for allocation of the KASAN shadow required by s390 and x86, and support
for early initialization of execmem required by x86.

The core implementation of execmem_alloc() takes care of suppressing
warnings when the initial allocation fails but there is a fallback range
defined.
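
A minimal sketch of that core logic, assuming the fallback fields added
here (simplified, not the verbatim mm/execmem.c code):

        static void *__execmem_alloc(struct execmem_range *range, size_t size)
        {
                unsigned long vm_flags = 0;
                gfp_t gfp_flags = GFP_KERNEL;
                void *p;

                /* suppress the warning when a retry in the fallback
                 * range is possible */
                if (range->fallback_start)
                        gfp_flags |= __GFP_NOWARN;

                p = __vmalloc_node_range(size, range->alignment,
                                         range->start, range->end,
                                         gfp_flags, range->pgprot, vm_flags,
                                         NUMA_NO_NODE,
                                         __builtin_return_address(0));

                if (!p && range->fallback_start)
                        p = __vmalloc_node_range(size, range->alignment,
                                                 range->fallback_start,
                                                 range->fallback_end,
                                                 GFP_KERNEL, range->pgprot,
                                                 vm_flags, NUMA_NO_NODE,
                                                 __builtin_return_address(0));
                return p;
        }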

Signed-off-by: Mike Rapoport (IBM) 
Acked-by: Will Deacon 
---
 arch/Kconfig   |  6 +++
 arch/arm/kernel/module.c   | 41 ++---
 arch/arm64/kernel/module.c | 67 ++--
 arch/arm64/kernel/probes/kprobes.c |  7 ---
 arch/arm64/net/bpf_jit_comp.c  | 11 -
 arch/powerpc/kernel/module.c   | 60 -
 arch/s390/kernel/module.c  | 54 ++-
 arch/x86/Kconfig   |  1 +
 arch/x86/kernel/module.c   | 70 ++
 include/linux/execmem.h| 34 +++
 include/linux/moduleloader.h   | 12 -
 kernel/module/main.c   | 26 +++
 mm/execmem.c   | 70 +-
 mm/mm_init.c   |  2 +
 14 files changed, 259 insertions(+), 202 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 65afb1de48b3..7006f71f0110 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -960,6 +960,12 @@ config ARCH_WANTS_MODULES_DATA_IN_VMALLOC
  For architectures like powerpc/32 which have constraints on module
  allocation and need to allocate module data outside of module area.
 
+config ARCH_WANTS_EXECMEM_EARLY
+   bool
+   help
+ For architectures that might allocate executable memory early on
+ boot, for instance ftrace on x86.
+
 config HAVE_IRQ_EXIT_ON_IRQ_STACK
bool
help
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index e74d84f58b77..a98fdf6ff26c 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 
 #include 
 #include 
@@ -34,23 +35,31 @@
 #endif
 
 #ifdef CONFIG_MMU
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-   gfp_t gfp_mask = GFP_KERNEL;
-   void *p;
-
-   /* Silence the initial allocation */
-   if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
-   gfp_mask |= __GFP_NOWARN;
-
-   p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-   gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-   __builtin_return_address(0));
-   if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
-   return p;
-   return __vmalloc_node_range(size, 1,  VMALLOC_START, VMALLOC_END,
-   GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-   __builtin_return_address(0));
+   unsigned long fallback_start = 0, fallback_end = 0;
+
+   if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) {
+   fallback_start = VMALLOC_START;
+   fallback_end = VMALLOC_END;
+   }
+
+   execmem_info = (struct execmem_info){
+   .ranges = {
+   [EXECMEM_DEFAULT] = {
+   .start  = MODULES_VADDR,
+   .end= MODULES_END,
+   .pgprot = PAGE_KERNEL_EXEC,
+   .alignment = 1,
+   .fallback_start = fallback_start,
+   .fallback_end   = fallback_end,
+   },
+   },
+   };
+
+   return &execmem_info;
 }
 #endif
 
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index e92da4da1b2a..a52240ea084b 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 
 #include 
 #include 
@@ -108,41 +109,59 @@ static int __init module_init_limits(void)
 
return 0;
 }
-subsys_initcall(module_init_limits);
 
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-   void *p = NULL;
+   unsigned long fallback_start = 0, fallback_end = 0;
+   unsigned long start = 0, end = 0;
+
+   module_init_limits();
 
/*
 * Where possible, prefer to allocate within direct branch range of the
 * kernel such that no PLTs are necessary.
 */
if (module_direct_base) {
-   p = __vmalloc_node_range(size, MODULE_ALIGN,
-module_d


[PATCH v5 07/15] mm/execmem, arch: convert simple overrides of module_alloc to execmem

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Several architectures override module_alloc() only to define an address
range for code allocations different from the VMALLOC address space.

Provide a generic implementation in execmem that uses the parameters for
address space ranges, required alignment and page protections provided
by the architectures.

The architectures must fill the execmem_info structure and implement
execmem_arch_setup() that returns a pointer to that structure. This way
the execmem initialization won't be called from every architecture, but
rather from a central place, namely a core_initcall() in execmem.

execmem provides the execmem_alloc() API that wraps __vmalloc_node_range()
with the parameters defined by the architectures.  If an architecture does
not implement execmem_arch_setup(), execmem_alloc() falls back to
module_alloc().
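
That central place can be as small as the sketch below (assuming a
__weak default for architectures that do not provide the hook; details
may differ from the final mm/execmem.c):

        static struct execmem_info *execmem_info;

        struct execmem_info * __weak execmem_arch_setup(void)
        {
                return NULL;
        }

        static int __init execmem_init(void)
        {
                execmem_info = execmem_arch_setup();
                return 0;
        }
        core_initcall(execmem_init);

execmem_alloc() then uses execmem_info->ranges[type] when the pointer is
set and falls back to module_alloc() when it is NULL.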

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/loongarch/kernel/module.c | 19 +++--
 arch/mips/kernel/module.c  | 20 --
 arch/nios2/kernel/module.c | 21 +++---
 arch/parisc/kernel/module.c| 24 +++
 arch/riscv/kernel/module.c | 24 +++
 arch/sparc/kernel/module.c | 20 --
 include/linux/execmem.h| 41 +++
 mm/execmem.c   | 73 --
 8 files changed, 208 insertions(+), 34 deletions(-)

diff --git a/arch/loongarch/kernel/module.c b/arch/loongarch/kernel/module.c
index c7d0338d12c1..ca6dd7ea1610 100644
--- a/arch/loongarch/kernel/module.c
+++ b/arch/loongarch/kernel/module.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 #include 
 #include 
 #include 
@@ -490,10 +491,22 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
return 0;
 }
 
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-   GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0));
+   execmem_info = (struct execmem_info){
+   .ranges = {
+   [EXECMEM_DEFAULT] = {
+   .start  = MODULES_VADDR,
+   .end= MODULES_END,
+   .pgprot = PAGE_KERNEL,
+   .alignment = 1,
+   },
+   },
+   };
+
+   return &execmem_info;
 }
 
 static void module_init_ftrace_plt(const Elf_Ehdr *hdr,
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 9a6c96014904..59225a3cf918 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 #include 
 
 struct mips_hi16 {
@@ -32,11 +33,22 @@ static LIST_HEAD(dbe_list);
 static DEFINE_SPINLOCK(dbe_lock);
 
 #ifdef MODULES_VADDR
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-   GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
-   __builtin_return_address(0));
+   execmem_info = (struct execmem_info){
+   .ranges = {
+   [EXECMEM_DEFAULT] = {
+   .start  = MODULES_VADDR,
+   .end= MODULES_END,
+   .pgprot = PAGE_KERNEL,
+   .alignment = 1,
+   },
+   },
+   };
+
+   return &execmem_info;
 }
 #endif
 
diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c
index 9c97b7513853..0d1ee86631fc 100644
--- a/arch/nios2/kernel/module.c
+++ b/arch/nios2/kernel/module.c
@@ -18,15 +18,26 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 
 #include 
 
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-   GFP_KERNEL, PAGE_KERNEL_EXEC,
-   VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
-   __builtin_return_address(0));
+   execmem_info = (struct execmem_info){
+   .ranges = {
+   [EXECMEM_DEFAULT] = {
+   .start  = MODULES_VADDR,
+   .end= MODULES_END,
+   .pgprot = PAGE_KERNEL_EXEC,
+   .alignment = 1,
+   },
+   },
+   };
+
+   return &execmem_info;
 }
 
 int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab,
diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c
index d2


[PATCH v5 06/15] mm: introduce execmem_alloc() and execmem_free()

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

module_alloc() is used everywhere as a means to allocate memory for code.

Besides being semantically wrong, this unnecessarily ties all subsystems
that need to allocate code, such as ftrace, kprobes and BPF, to modules,
and puts the burden of code allocation on the modules code.

Several architectures override module_alloc() because of various
constraints on where the executable memory can be located, and this causes
additional obstacles for improvements of code allocation.

Start splitting code allocation from modules by introducing execmem_alloc()
and execmem_free() APIs.

Initially, execmem_alloc() is a wrapper for module_alloc() and
execmem_free() is a replacement for module_memfree(), to allow updating all
call sites to use the new APIs.

Since architectures define different restrictions on placement,
permissions, alignment and other parameters for memory that can be used by
different subsystems that allocate executable memory, execmem_alloc() takes
a type argument that will be used to identify the calling subsystem and to
allow architectures to define parameters for ranges suitable for that
subsystem.
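
At this point the implementation is deliberately a thin veneer over the
module allocator; a sketch of the initial mm/execmem.c (modulo details):

        void *execmem_alloc(enum execmem_type type, size_t size)
        {
                /* the type only identifies the caller for now; every
                 * allocation still goes through module_alloc() */
                return module_alloc(size);
        }

        void execmem_free(void *ptr)
        {
                /*
                 * This memory may be RO, and freeing RO memory in an
                 * interrupt is not supported by vmalloc.
                 */
                WARN_ON(in_interrupt());
                vfree(ptr);
        }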

No functional changes.

Signed-off-by: Mike Rapoport (IBM) 
Acked-by: Masami Hiramatsu (Google) 
---
 arch/powerpc/kernel/kprobes.c|  6 ++--
 arch/s390/kernel/ftrace.c|  4 +--
 arch/s390/kernel/kprobes.c   |  4 +--
 arch/s390/kernel/module.c|  5 +--
 arch/sparc/net/bpf_jit_comp_32.c |  8 ++---
 arch/x86/kernel/ftrace.c |  6 ++--
 arch/x86/kernel/kprobes/core.c   |  4 +--
 include/linux/execmem.h  | 57 
 include/linux/moduleloader.h |  3 --
 kernel/bpf/core.c|  6 ++--
 kernel/kprobes.c |  8 ++---
 kernel/module/Kconfig|  1 +
 kernel/module/main.c | 25 +-
 mm/Kconfig   |  3 ++
 mm/Makefile  |  1 +
 mm/execmem.c | 32 ++
 16 files changed, 128 insertions(+), 45 deletions(-)
 create mode 100644 include/linux/execmem.h
 create mode 100644 mm/execmem.c

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index bbca90a5e2ec..9fcd01bb2ce6 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -19,8 +19,8 @@
 #include 
 #include 
 #include 
-#include <linux/moduleloader.h>
 #include 
+#include <linux/execmem.h>
 #include 
 #include 
 #include 
@@ -130,7 +130,7 @@ void *alloc_insn_page(void)
 {
void *page;
 
-   page = module_alloc(PAGE_SIZE);
+   page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
if (!page)
return NULL;
 
@@ -142,7 +142,7 @@ void *alloc_insn_page(void)
}
return page;
 error:
-   module_memfree(page);
+   execmem_free(page);
return NULL;
 }
 
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index c46381ea04ec..798249ef5646 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -7,13 +7,13 @@
  *   Author(s): Martin Schwidefsky 
  */
 
-#include <linux/moduleloader.h>
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 #include 
 #include 
 #include 
@@ -220,7 +220,7 @@ static int __init ftrace_plt_init(void)
 {
const char *start, *end;
 
-   ftrace_plt = module_alloc(PAGE_SIZE);
+   ftrace_plt = execmem_alloc(EXECMEM_FTRACE, PAGE_SIZE);
if (!ftrace_plt)
panic("cannot allocate ftrace plt\n");
 
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index f0cf20d4b3c5..3c1b1be744de 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -9,7 +9,6 @@
 
 #define pr_fmt(fmt) "kprobes: " fmt
 
-#include <linux/moduleloader.h>
 #include 
 #include 
 #include 
@@ -21,6 +20,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 #include 
 #include 
 #include 
@@ -38,7 +38,7 @@ void *alloc_insn_page(void)
 {
void *page;
 
-   page = module_alloc(PAGE_SIZE);
+   page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
if (!page)
return NULL;
set_memory_rox((unsigned long)page, 1);
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 42215f9404af..ac97a905e8cd 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 #include 
 #include 
 #include 
@@ -76,7 +77,7 @@ void *module_alloc(unsigned long size)
 #ifdef CONFIG_FUNCTION_TRACER
 void module_arch_cleanup(struct module *mod)
 {
-   module_memfree(mod->arch.trampolines_start);
+   execmem_free(mod->arch.trampolines_start);
 }
 #endif
 
@@ -510,7 +511,7 @@ static int module_alloc_ftrace_hotpatch_trampolines(struct module *me,
 
size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size);
numpages = DIV_ROUND_UP(size, PAGE_SIZE);
-   start = module_alloc(numpages * PAGE_SIZE);
+   start = execmem_alloc(EXECMEM_FTRACE, numpages * PAGE_SIZE);
if 


[PATCH v5 05/15] module: make module_memory_{alloc,free} more self-contained

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Move the logic related to memory allocation and freeing into
module_memory_alloc() and module_memory_free().
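
The payoff shows up at the call sites in move_module(): the caller no
longer open-codes page alignment, zeroing and the kmemleak annotation,
and the error path frees by type alone. Roughly:

        ret = module_memory_alloc(mod, type);  /* aligns, zeroes, sets base */
        if (ret)
                goto out_enomem;       /* unwinds already-allocated types */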

Signed-off-by: Mike Rapoport (IBM) 
---
 kernel/module/main.c | 64 +++-
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/kernel/module/main.c b/kernel/module/main.c
index e1e8a7a9d6c1..5b82b069e0d3 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1203,15 +1203,44 @@ static bool mod_mem_use_vmalloc(enum mod_mem_type type)
mod_mem_type_is_core_data(type);
 }
 
-static void *module_memory_alloc(unsigned int size, enum mod_mem_type type)
+static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
 {
+   unsigned int size = PAGE_ALIGN(mod->mem[type].size);
+   void *ptr;
+
+   mod->mem[type].size = size;
+
if (mod_mem_use_vmalloc(type))
-   return vzalloc(size);
-   return module_alloc(size);
+   ptr = vmalloc(size);
+   else
+   ptr = module_alloc(size);
+
+   if (!ptr)
+   return -ENOMEM;
+
+   /*
+* The pointer to these blocks of memory are stored on the module
+* structure and we keep that around so long as the module is
+* around. We only free that memory when we unload the module.
+* Just mark them as not being a leak then. The .init* ELF
+* sections *do* get freed after boot so we *could* treat them
+* slightly differently with kmemleak_ignore() and only grey
+* them out as they work as typical memory allocations which
+* *do* eventually get freed, but let's just keep things simple
+* and avoid *any* false positives.
+*/
+   kmemleak_not_leak(ptr);
+
+   memset(ptr, 0, size);
+   mod->mem[type].base = ptr;
+
+   return 0;
 }
 
-static void module_memory_free(void *ptr, enum mod_mem_type type)
+static void module_memory_free(struct module *mod, enum mod_mem_type type)
 {
+   void *ptr = mod->mem[type].base;
+
if (mod_mem_use_vmalloc(type))
vfree(ptr);
else
@@ -1229,12 +1258,12 @@ static void free_mod_mem(struct module *mod)
/* Free lock-classes; relies on the preceding sync_rcu(). */
lockdep_free_key_range(mod_mem->base, mod_mem->size);
if (mod_mem->size)
-   module_memory_free(mod_mem->base, type);
+   module_memory_free(mod, type);
}
 
/* MOD_DATA hosts mod, so free it at last */
lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
-   module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA);
+   module_memory_free(mod, MOD_DATA);
 }
 
 /* Free a module, remove from lists, etc. */
@@ -2225,7 +2254,6 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 static int move_module(struct module *mod, struct load_info *info)
 {
int i;
-   void *ptr;
enum mod_mem_type t = 0;
int ret = -ENOMEM;
 
@@ -2234,26 +2262,12 @@ static int move_module(struct module *mod, struct load_info *info)
mod->mem[type].base = NULL;
continue;
}
-   mod->mem[type].size = PAGE_ALIGN(mod->mem[type].size);
-   ptr = module_memory_alloc(mod->mem[type].size, type);
-   /*
- * The pointer to these blocks of memory are stored on the module
- * structure and we keep that around so long as the module is
- * around. We only free that memory when we unload the module.
- * Just mark them as not being a leak then. The .init* ELF
- * sections *do* get freed after boot so we *could* treat them
- * slightly differently with kmemleak_ignore() and only grey
- * them out as they work as typical memory allocations which
- * *do* eventually get freed, but let's just keep things simple
- * and avoid *any* false positives.
-*/
-   kmemleak_not_leak(ptr);
-   if (!ptr) {
+
+   ret = module_memory_alloc(mod, type);
+   if (ret) {
t = type;
goto out_enomem;
}
-   memset(ptr, 0, mod->mem[type].size);
-   mod->mem[type].base = ptr;
}
 
/* Transfer each section which specifies SHF_ALLOC */
@@ -2296,7 +2310,7 @@ static int move_module(struct module *mod, struct load_info *info)
return 0;
 out_enomem:
for (t--; t >= 0; t--)
-   module_memory_free(mod->mem[t].base, t);
+   module_memory_free(mod, t);
return ret;
 }
 
-- 
2.43.0



[PATCH v5 04/15] sparc: simplify module_alloc()

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Define MODULES_VADDR and MODULES_END as VMALLOC_START and VMALLOC_END
for 32-bit and reduce module_alloc() to

__vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, ...)

as with the new defines the allocations become identical for both 32
and 64 bits.

While at it, drop the unused include of 

Suggested-by: Sam Ravnborg 
Signed-off-by: Mike Rapoport (IBM) 
---
 arch/sparc/include/asm/pgtable_32.h |  2 ++
 arch/sparc/kernel/module.c  | 25 +
 2 files changed, 3 insertions(+), 24 deletions(-)

diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index 9e85d57ac3f2..62bcafe38b1f 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -432,6 +432,8 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
 
#define VMALLOC_START   _AC(0xfe600000,UL)
#define VMALLOC_END _AC(0xffc00000,UL)
+#define MODULES_VADDR   VMALLOC_START
+#define MODULES_END VMALLOC_END
 
 /* We provide our own get_unmapped_area to cope with VA holes for userland */
 #define HAVE_ARCH_UNMAPPED_AREA
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
index 66c45a2764bc..d37adb2a0b54 100644
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -21,35 +21,12 @@
 
 #include "entry.h"
 
-#ifdef CONFIG_SPARC64
-
-#include 
-
-static void *module_map(unsigned long size)
+void *module_alloc(unsigned long size)
 {
-   if (PAGE_ALIGN(size) > MODULES_LEN)
-   return NULL;
return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
 }
-#else
-static void *module_map(unsigned long size)
-{
-   return vmalloc(size);
-}
-#endif /* CONFIG_SPARC64 */
-
-void *module_alloc(unsigned long size)
-{
-   void *ret;
-
-   ret = module_map(size);
-   if (ret)
-   memset(ret, 0, size);
-
-   return ret;
-}
 
 /* Make generic code ignore STT_REGISTER dummy undefined symbols.  */
 int module_frob_arch_sections(Elf_Ehdr *hdr,
-- 
2.43.0



[PATCH v5 03/15] nios2: define virtual address space for modules

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

nios2 uses kmalloc() to implement module_alloc() because CALL26/PCREL26
relocations cannot reach all of the vmalloc address space.

Define module space as 32MiB below the kernel base and switch nios2 to
use vmalloc for module allocations.

Suggested-by: Thomas Gleixner 
Acked-by: Dinh Nguyen 
Acked-by: Song Liu 
Signed-off-by: Mike Rapoport (IBM) 
---
 arch/nios2/include/asm/pgtable.h |  5 -
 arch/nios2/kernel/module.c   | 19 ---
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index d052dfcbe8d3..eab87c6beacb 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -25,7 +25,10 @@
 #include 
 
 #define VMALLOC_START  CONFIG_NIOS2_KERNEL_MMU_REGION_BASE
-#define VMALLOC_END(CONFIG_NIOS2_KERNEL_REGION_BASE - 1)
+#define VMALLOC_END(CONFIG_NIOS2_KERNEL_REGION_BASE - SZ_32M - 1)
+
+#define MODULES_VADDR  (CONFIG_NIOS2_KERNEL_REGION_BASE - SZ_32M)
+#define MODULES_END(CONFIG_NIOS2_KERNEL_REGION_BASE - 1)
 
 struct mm_struct;
 
diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c
index 76e0a42d6e36..9c97b7513853 100644
--- a/arch/nios2/kernel/module.c
+++ b/arch/nios2/kernel/module.c
@@ -21,23 +21,12 @@
 
 #include 
 
-/*
- * Modules should NOT be allocated with kmalloc for (obvious) reasons.
- * But we do it for now to avoid relocation issues. CALL26/PCREL26 cannot reach
- * from 0x80000000 (vmalloc area) to 0xc0000000 (kernel) (kmalloc returns
- * addresses in 0xc0000000)
- */
 void *module_alloc(unsigned long size)
 {
-   if (size == 0)
-   return NULL;
-   return kmalloc(size, GFP_KERNEL);
-}
-
-/* Free memory returned from module_alloc */
-void module_memfree(void *module_region)
-{
-   kfree(module_region);
+   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+   GFP_KERNEL, PAGE_KERNEL_EXEC,
+   VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
+   __builtin_return_address(0));
 }
 
 int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab,
-- 
2.43.0



[PATCH v5 02/15] mips: module: rename MODULE_START to MODULES_VADDR

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

and MODULE_END to MODULES_END to match other architectures that define
a custom address space for modules.

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/mips/include/asm/pgtable-64.h | 4 ++--
 arch/mips/kernel/module.c  | 4 ++--
 arch/mips/mm/fault.c   | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h
index 20ca48c1b606..c0109aff223b 100644
--- a/arch/mips/include/asm/pgtable-64.h
+++ b/arch/mips/include/asm/pgtable-64.h
@@ -147,8 +147,8 @@
 #if defined(CONFIG_MODULES) && defined(KBUILD_64BIT_SYM32) && \
VMALLOC_START != CKSSEG
 /* Load modules into 32bit-compatible segment. */
-#define MODULE_START   CKSSEG
-#define MODULE_END (FIXADDR_START-2*PAGE_SIZE)
+#define MODULES_VADDR  CKSSEG
+#define MODULES_END(FIXADDR_START-2*PAGE_SIZE)
 #endif
 
 #define pte_ERROR(e) \
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 7b2fbaa9cac5..9a6c96014904 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -31,10 +31,10 @@ struct mips_hi16 {
 static LIST_HEAD(dbe_list);
 static DEFINE_SPINLOCK(dbe_lock);
 
-#ifdef MODULE_START
+#ifdef MODULES_VADDR
 void *module_alloc(unsigned long size)
 {
-   return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
+   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
 }
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index aaa9a242ebba..37fedeaca2e9 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -83,8 +83,8 @@ static void __do_page_fault(struct pt_regs *regs, unsigned 
long write,
 
if (unlikely(address >= VMALLOC_START && address <= VMALLOC_END))
goto VMALLOC_FAULT_TARGET;
-#ifdef MODULE_START
-   if (unlikely(address >= MODULE_START && address < MODULE_END))
+#ifdef MODULES_VADDR
+   if (unlikely(address >= MODULES_VADDR && address < MODULES_END))
goto VMALLOC_FAULT_TARGET;
 #endif
 
-- 
2.43.0




[PATCH v5 01/15] arm64: module: remove unneeded call to kasan_alloc_module_shadow()

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Since commit f6f37d9320a1 ("arm64: select KASAN_VMALLOC for SW/HW_TAGS
modes") KASAN_VMALLOC is always enabled when KASAN is on. This means
that allocations in module_alloc() will be tracked by KASAN protection
for vmalloc() and that kasan_alloc_module_shadow() will always be an
empty inline and there is no point in calling it.

Drop meaningless call to kasan_alloc_module_shadow() from
module_alloc().

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/arm64/kernel/module.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 47e0be610bb6..e92da4da1b2a 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -141,11 +141,6 @@ void *module_alloc(unsigned long size)
__func__);
}
 
-   if (p && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) {
-   vfree(p);
-   return NULL;
-   }
-
/* Memory is intended to be executable, reset the pointer tag. */
return kasan_reset_tag(p);
 }
-- 
2.43.0




[PATCH v5 00/15] mm: jit/text allocator

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Hi,

Since v3 I looked into making execmem more of a utility toolbox, as we
discussed at LPC with Mark Rutland, but it was getting hairier than
having a struct describing architecture constraints and a type identifying
the consumer of execmem.

And I do think that having the description of architecture constraints for
allocations of executable memory in a single place is better than having it
spread all over the place.

The patches are available via git:
https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=execmem/v5

v5 changes:
* rebase on v6.9-rc4 to avoid a conflict in kprobes
* add copyrights to mm/execmem.c (Luis)
* fix spelling (Ingo)
* define MODULES_VADDR for sparc (Sam)
* consistently initialize struct execmem_info (Peter)
* reduce #ifdefs in function bodies in kprobes (Masami) 

v4: https://lore.kernel.org/all/20240411160051.2093261-1-r...@kernel.org
* rebase on v6.9-rc2
* rename execmem_params to execmem_info and execmem_arch_params() to
  execmem_arch_setup()
* use single execmem_alloc() API instead of execmem_{text,data}_alloc() (Song)
* avoid extra copy of execmem parameters (Rick)
* run execmem_init() as core_initcall() except for the architectures that
  may allocate text really early (currently only x86) (Will)
* add acks for some of arm64 and riscv changes, thanks Will and Alexandre
* new commits:
  - drop call to kasan_alloc_module_shadow() on arm64 because it's not
needed anymore
  - rename MODULE_START to MODULES_VADDR on MIPS
  - use CONFIG_EXECMEM instead of CONFIG_MODULES on powerpc as per Christophe:
https://lore.kernel.org/all/79062fa3-3402-47b3-8920-9231ad05e...@csgroup.eu/

v3: https://lore.kernel.org/all/20230918072955.2507221-1-r...@kernel.org
* add type parameter to execmem allocation APIs
* remove BPF dependency on modules

v2: https://lore.kernel.org/all/20230616085038.4121892-1-r...@kernel.org
* Separate "module" and "others" allocations with execmem_text_alloc()
and jit_text_alloc()
* Drop ROX entailment on x86
* Add ack for nios2 changes, thanks Dinh Nguyen

v1: https://lore.kernel.org/all/20230601101257.530867-1-r...@kernel.org

= Cover letter from v1 (slightly updated) =

module_alloc() is used everywhere as a means to allocate memory for code.

Besides being semantically wrong, this unnecessarily ties all subsystems
that need to allocate code, such as ftrace, kprobes and BPF, to modules and
puts the burden of code allocation on the modules code.

Several architectures override module_alloc() because of various
constraints where the executable memory can be located and this causes
additional obstacles for improvements of code allocation.

A centralized infrastructure for code allocation allows allocations of
executable memory as ROX, and future optimizations such as caching large
pages for better iTLB performance and providing sub-page allocations for
users that only need small jit code snippets.

Rick Edgecombe proposed perm_alloc extension to vmalloc [1] and Song Liu
proposed execmem_alloc [2], but both these approaches were targeting BPF
allocations and lacked the ground work to abstract executable allocations
and split them from the modules core.

Thomas Gleixner suggested to express module allocation restrictions and
requirements as struct mod_alloc_type_params [3] that would define ranges,
protections and other parameters for different types of allocations used by
modules and following that suggestion Song separated allocations of
different types in modules (commit ac3b43283923 ("module: replace
module_layout with module_memory")) and posted "Type aware module
allocator" set [4].

I liked the idea of parametrising code allocation requirements as a
structure, but I believe the original proposal and Song's module allocator
were too module-centric, so I came up with these patches.

This set splits code allocation from modules by introducing the
execmem_alloc() and execmem_free() APIs, replaces call sites of
module_alloc() and module_memfree() with the new APIs and implements core
text and related allocations in a central place.

Instead of architecture specific overrides for module_alloc(), the
architectures that require non-default behaviour for text allocation must
fill the execmem_info structure and implement execmem_arch_setup() that returns
a pointer to that structure. If an architecture does not implement
execmem_arch_setup(), the defaults compatible with the current
modules::module_alloc() are used.
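
For illustration, a minimal sketch of such an override, assuming the
execmem_info/execmem_range layout used in this series (start/end of the
range, page protection and alignment); real architectures will of course
plug in their own ranges and protections:

static struct execmem_info execmem_info __ro_after_init;

struct execmem_info __init *execmem_arch_setup(void)
{
    execmem_info = (struct execmem_info){
        .ranges = {
            [EXECMEM_DEFAULT] = {
                /* e.g. a dedicated modules area */
                .start     = MODULES_VADDR,
                .end       = MODULES_END,
                .pgprot    = PAGE_KERNEL,
                .alignment = 1,
            },
        },
    };

    return &execmem_info;
}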

Since architectures define different restrictions on placement,
permissions, alignment and other parameters for memory that can be used by
different subsystems that allocate executable memory, execmem APIs
take a type argument that will be used to identify the calling subsystem
and to allow architectures to define parameters for ranges suitable for that
subsystem.
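
As a usage sketch, this is how a consumer like kprobes could allocate its
instruction pages with the new API (enum and function names as introduced
by this series):

void *alloc_insn_page(void)
{
    /* placed according to the architecture's EXECMEM_KPROBES range */
    return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
}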

The new infrastructure allows decoupling of BPF, kprobes and ftrace from
modules, and mos


Re: [PATCH v4 14/15] kprobes: remove dependency on CONFIG_MODULES

2024-04-20 Thread Mike Rapoport
On Sat, Apr 20, 2024 at 06:15:00PM +0900, Masami Hiramatsu wrote:
> On Sat, 20 Apr 2024 10:33:38 +0300
> Mike Rapoport  wrote:
> 
> > On Fri, Apr 19, 2024 at 03:59:40PM +, Christophe Leroy wrote:
> > > 
> > > 
> > > Le 19/04/2024 à 17:49, Mike Rapoport a écrit :
> > > > Hi Masami,
> > > > 
> > > > On Thu, Apr 18, 2024 at 06:16:15AM +0900, Masami Hiramatsu wrote:
> > > >> Hi Mike,
> > > >>
> > > >> On Thu, 11 Apr 2024 19:00:50 +0300
> > > >> Mike Rapoport  wrote:
> > > >>
> > > >>> From: "Mike Rapoport (IBM)" 
> > > >>>
> > > >>> kprobes depended on CONFIG_MODULES because it has to allocate memory 
> > > >>> for
> > > >>> code.
> > > >>>
> > > >>> Since code allocations are now implemented with execmem, kprobes can 
> > > >>> be
> > > >>> enabled in non-modular kernels.
> > > >>>
> > > >>> Add #ifdef CONFIG_MODULE guards for the code dealing with kprobes 
> > > >>> inside
> > > >>> modules, make CONFIG_KPROBES select CONFIG_EXECMEM and drop the
> > > >>> dependency of CONFIG_KPROBES on CONFIG_MODULES.
> > > >>
> > > >> Thanks for this work, but this conflicts with the latest fix in 
> > > >> v6.9-rc4.
> > > >> Also, can you use IS_ENABLED(CONFIG_MODULES) instead of #ifdefs in
> > > >> function body? We have enough dummy functions for that, so it should
> > > >> not make a problem.
> > > > 
> > > > The code in check_kprobe_address_safe() that gets the module and checks 
> > > > for
> > > > __init functions does not compile with IS_ENABLED(CONFIG_MODULES).
> > > > I can pull it out to a helper or leave #ifdef in the function body,
> > > > whichever you prefer.
> > > 
> > > As far as I can see, the only problem is MODULE_STATE_COMING.
> > > Can we move 'enum module_state' out of #ifdef CONFIG_MODULES in module.h  
> > > ?
> > 
> > There's dereference of 'struct module' there:
> >  
> > (*probed_mod)->state != MODULE_STATE_COMING) {
> > ...
> > }
> > 
> > so moving out 'enum module_state' won't be enough.
> 
> Hmm, this part should be inline functions like;
> 
> #ifdef CONFIG_MODULES
> static inline bool module_is_coming(struct module *mod)
> {
>   return mod->state == MODULE_STATE_COMING;
> }
> #else
> #define module_is_coming(mod) (false)

I'd prefer

static inline bool module_is_coming(struct module *mod)
{
return false;
}

> #endif
>
> Then we don't need the enum.
> Thank you,
> 
> -- 
> Masami Hiramatsu (Google) 

-- 
Sincerely yours,
Mike.




Re: [PATCH v4 14/15] kprobes: remove dependency on CONFIG_MODULES

2024-04-20 Thread Mike Rapoport
On Fri, Apr 19, 2024 at 03:59:40PM +, Christophe Leroy wrote:
> 
> 
> Le 19/04/2024 à 17:49, Mike Rapoport a écrit :
> > Hi Masami,
> > 
> > On Thu, Apr 18, 2024 at 06:16:15AM +0900, Masami Hiramatsu wrote:
> >> Hi Mike,
> >>
> >> On Thu, 11 Apr 2024 19:00:50 +0300
> >> Mike Rapoport  wrote:
> >>
> >>> From: "Mike Rapoport (IBM)" 
> >>>
> >>> kprobes depended on CONFIG_MODULES because it has to allocate memory for
> >>> code.
> >>>
> >>> Since code allocations are now implemented with execmem, kprobes can be
> >>> enabled in non-modular kernels.
> >>>
> >>> Add #ifdef CONFIG_MODULE guards for the code dealing with kprobes inside
> >>> modules, make CONFIG_KPROBES select CONFIG_EXECMEM and drop the
> >>> dependency of CONFIG_KPROBES on CONFIG_MODULES.
> >>
> >> Thanks for this work, but this conflicts with the latest fix in v6.9-rc4.
> >> Also, can you use IS_ENABLED(CONFIG_MODULES) instead of #ifdefs in
> >> function body? We have enough dummy functions for that, so it should
> >> not make a problem.
> > 
> > The code in check_kprobe_address_safe() that gets the module and checks for
> > __init functions does not compile with IS_ENABLED(CONFIG_MODULES).
> > I can pull it out to a helper or leave #ifdef in the function body,
> > whichever you prefer.
> 
> As far as I can see, the only problem is MODULE_STATE_COMING.
> Can we move 'enum module_state' out of #ifdef CONFIG_MODULES in module.h  ?

There's dereference of 'struct module' there:
 
(*probed_mod)->state != MODULE_STATE_COMING) {
...
}

so moving out 'enum module_state' won't be enough.
 
> >   
> >> -- 
> >> Masami Hiramatsu
> > 

-- 
Sincerely yours,
Mike.




Re: [PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()

2024-04-19 Thread Mike Rapoport
On Fri, Apr 19, 2024 at 02:42:16PM -0700, Song Liu wrote:
> On Fri, Apr 19, 2024 at 1:00 PM Mike Rapoport  wrote:
> >
> > On Fri, Apr 19, 2024 at 10:32:39AM -0700, Song Liu wrote:
> > > On Fri, Apr 19, 2024 at 10:03 AM Mike Rapoport  wrote:
> > > [...]
> > > > > >
> > > > > > [1] 
> > > > > > https://lore.kernel.org/all/20240411160526.2093408-1-r...@kernel.org
> > > > >
> > > > > For the ROX to work, we need different users (module text, kprobe, 
> > > > > etc.) to have
> > > > > the same execmem_range. From [1]:
> > > > >
> > > > > static void *execmem_cache_alloc(struct execmem_range *range, size_t 
> > > > > size)
> > > > > {
> > > > > ...
> > > > >p = __execmem_cache_alloc(size);
> > > > >if (p)
> > > > >return p;
> > > > >   err = execmem_cache_populate(range, size);
> > > > > ...
> > > > > }
> > > > >
> > > > > We are calling __execmem_cache_alloc() without range. For this to 
> > > > > work,
> > > > > we can only call execmem_cache_alloc() with one execmem_range.
> > > >
> > > > Actually, on x86 this will "just work" because everything shares the 
> > > > same
> > > > address space :)
> > > >
> > > > The 2M pages in the cache will be in the modules space, so
> > > > __execmem_cache_alloc() will always return memory from that address 
> > > > space.
> > > >
> > > > For other architectures this indeed needs to be fixed with passing the
> > > > range to __execmem_cache_alloc() and limiting search in the cache for 
> > > > that
> > > > range.
> > >
> > > I think we at least need the "map to" concept (initially proposed by 
> > > Thomas)
> > > to get this work. For example, EXECMEM_BPF and EXECMEM_KPROBE
> > > maps to EXECMEM_MODULE_TEXT, so that all these actually share
> > > the same range.
> >
> > Why?
> 
> IIUC, we need to update __execmem_cache_alloc() to take a range pointer as
> input. module text will use "range" for EXECMEM_MODULE_TEXT, while kprobe
> will use "range" for EXECMEM_KPROBE. Without "map to" concept or sharing
> the "range" object, we will have to compare different range parameters to 
> check
> we can share cached pages between module text and kprobe, which is not
> efficient. Did I miss something?

We can always share large ROX pages as long as they are within the correct
address space. The permissions for them are ROX and the alignment
differences are due to KASAN and this is handled during allocation of the
large page to refill the cache. __execmem_cache_alloc() only needs to limit
the search to the address space of the range.

And regardless, the way we deal with sharing of the cache can be sorted
out later.

> Thanks,
> Song

-- 
Sincerely yours,
Mike.




Re: [PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()

2024-04-19 Thread Mike Rapoport
On Fri, Apr 19, 2024 at 10:32:39AM -0700, Song Liu wrote:
> On Fri, Apr 19, 2024 at 10:03 AM Mike Rapoport  wrote:
> [...]
> > > >
> > > > [1] https://lore.kernel.org/all/20240411160526.2093408-1-r...@kernel.org
> > >
> > > For the ROX to work, we need different users (module text, kprobe, etc.) 
> > > to have
> > > the same execmem_range. From [1]:
> > >
> > > static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
> > > {
> > > ...
> > >p = __execmem_cache_alloc(size);
> > >if (p)
> > >return p;
> > >   err = execmem_cache_populate(range, size);
> > > ...
> > > }
> > >
> > > We are calling __execmem_cache_alloc() without range. For this to work,
> > > we can only call execmem_cache_alloc() with one execmem_range.
> >
> > Actually, on x86 this will "just work" because everything shares the same
> > address space :)
> >
> > The 2M pages in the cache will be in the modules space, so
> > __execmem_cache_alloc() will always return memory from that address space.
> >
> > For other architectures this indeed needs to be fixed with passing the
> > range to __execmem_cache_alloc() and limiting search in the cache for that
> > range.
> 
> I think we at least need the "map to" concept (initially proposed by Thomas)
> to get this work. For example, EXECMEM_BPF and EXECMEM_KPROBE
> maps to EXECMEM_MODULE_TEXT, so that all these actually share
> the same range.

Why?
 
> Does this make sense?
> 
> Song

-- 
Sincerely yours,
Mike.




Re: [PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()

2024-04-19 Thread Mike Rapoport
On Fri, Apr 19, 2024 at 08:54:40AM -0700, Song Liu wrote:
> On Thu, Apr 18, 2024 at 11:56 PM Mike Rapoport  wrote:
> >
> > On Thu, Apr 18, 2024 at 02:01:22PM -0700, Song Liu wrote:
> > > On Thu, Apr 18, 2024 at 10:54 AM Mike Rapoport  wrote:
> > > >
> > > > On Thu, Apr 18, 2024 at 09:13:27AM -0700, Song Liu wrote:
> > > > > On Thu, Apr 18, 2024 at 8:37 AM Mike Rapoport  wrote:
> > > > > > > >
> > > > > > > > I'm looking at execmem_types more as definition of the 
> > > > > > > > consumers, maybe I
> > > > > > > > should have named the enum execmem_consumer at the first place.
> > > > > > >
> > > > > > > I think looking at execmem_type from consumers' point of view adds
> > > > > > > unnecessary complexity. IIUC, for most (if not all) archs, 
> > > > > > > ftrace, kprobe,
> > > > > > > and bpf (and maybe also module text) all have the same 
> > > > > > > requirements.
> > > > > > > Did I miss something?
> > > > > >
> > > > > > It's enough to have one architecture with different constraints for 
> > > > > > kprobes
> > > > > > and bpf to warrant a type for each.
> > > > >
> > > > > AFAICT, some of these constraints can be changed without too much 
> > > > > work.
> > > >
> > > > But why?
> > > > I honestly don't understand what you are trying to optimize here. A few
> > > > lines of initialization in execmem_info?
> > >
> > > IIUC, having separate EXECMEM_BPF and EXECMEM_KPROBE makes it
> > > harder for bpf and kprobe to share the same ROX page. In many use cases,
> > > a 2MiB page (assuming x86_64) is enough for all BPF, kprobe, ftrace, and
> > > module text. It is not efficient if we have to allocate separate pages 
> > > for each
> > > of these use cases. If this is not a problem, the current approach works.
> >
> > The caching of large ROX pages does not need to be per type.
> >
> > In the POC I've posted for caching of large ROX pages on x86 [1], the cache 
> > is
> > global and to make kprobes and bpf use it it's enough to set a flag in
> > execmem_info.
> >
> > [1] https://lore.kernel.org/all/20240411160526.2093408-1-r...@kernel.org
> 
> For the ROX to work, we need different users (module text, kprobe, etc.) to 
> have
> the same execmem_range. From [1]:
> 
> static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
> {
> ...
>p = __execmem_cache_alloc(size);
>if (p)
>return p;
>   err = execmem_cache_populate(range, size);
> ...
> }
> 
> We are calling __execmem_cache_alloc() without range. For this to work,
> we can only call execmem_cache_alloc() with one execmem_range.

Actually, on x86 this will "just work" because everything shares the same
address space :)

The 2M pages in the cache will be in the modules space, so
__execmem_cache_alloc() will always return memory from that address space.

For other architectures this indeed needs to be fixed by passing the
range to __execmem_cache_alloc() and limiting the search in the cache to
that range.
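
A possible shape of that fix, sketched against the snippet quoted above
(the extra range parameter to __execmem_cache_alloc() is an assumption,
not code from the POC):

static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
    void *p;
    int err;

    /* only consider cached areas within range->start .. range->end */
    p = __execmem_cache_alloc(range, size);
    if (p)
        return p;

    err = execmem_cache_populate(range, size);
    if (err)
        return NULL;

    return __execmem_cache_alloc(range, size);
}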
 
> Did I miss something?
> 
> Thanks,
> Song

-- 
Sincerely yours,
Mike.




Re: [PATCH v1 0/5] arm64/mm: uffd write-protect and soft-dirty tracking

2024-04-19 Thread Mike Rapoport
On Fri, Apr 19, 2024 at 11:45:14AM +0200, David Hildenbrand wrote:
> On 19.04.24 10:33, Shivansh Vij wrote:
> > > On 19/04/2024 08:43, Ryan Roberts wrote:
> > > > Hi All,
> > > > 
> > > > This series adds uffd write-protect and soft-dirty tracking support for 
> > > > arm64. I
> > > > consider the soft-dirty support (patches 3 and 4) as RFC - see 
> > > > rationale below.
> > > > 
> > > > That said, these are the last 2 SW bits and we may want to keep 1 bit 
> > > > in reserve
> > > > for future use. soft-dirty is only used for CRIU to my knowledge, and 
> > > > it is
> > > > thought that their use case could be solved with the more generic 
> > > > uffd-wp. So
> > > > unless somebody makes a clear case for the inclusion of soft-dirty 
> > > > support, we
> > > > are probably better off dropping patches 3 and 4 and keeping bit 63 for 
> > > > future
> > > > use. Although note that the most recent attempt to add soft-dirty for 
> > > > arm64 was
> > > > last month [1] so I'd like to give Shivansh Vij the opportunity to make 
> > > > the
> > > > case.
> > 
> > Appreciate the opportunity to provide input here.
> > 
> > I picked option one (dirty tracking in arm) because it seems to be the
> > simplest way to move forward, whereas it would be a relatively heavy
> > effort to add uffd-wp support to CRIU.
> > 
> > From a performance perspective I am also a little worried that uffd
> > will be slower than just tracking the dirty bits asynchronously with
> > sw dirty, but maybe that's not as much of a concern with the addition
> > of uffd-wp async.
> > 
> > With all this being said, I'll defer to the wisdom of the crowd about
> > which approach makes more sense - after all, with this patch we should
> > get uffd-wp support on arm so at least there will be _a_ way forward
> > for CRIU (albeit one requiring slightly more work).
> 
> Ccing Mike and Peter. In 2017, Mike gave a presentation "Memory tracking for
> iterative container migration"[1] at LPC
> 
> Some key points are still true I think:
> (1) More flexible and robust than soft-dirty
> (2) May obsolete soft-dirty
> 
> We further recently added a new UFFD_FEATURE_WP_ASYNC feature as part of
> [2], because getting soft-dirty return reliable results in some cases turned
> out rather hard to fix.
> 
> We might still have to optimize that approach for some very sparse large
> VMAs, but that should be solvable.
> 
>  "The major defect of this approach of dirty tracking is we need to
>  populate the pgtables when tracking starts. Soft-dirty doesn't do it
>  like that. It's unwanted in the case where the range of memory to track
>  is huge and unpopulated (e.g., tracking updates on a 10G file with
>  mmap() on top, without having any page cache installed yet). One way to
>  improve this is to allow pte markers exist for larger than PTE level
>  for PMD+. That will not change the interface if to implemented, so we
>  can leave that for later.")[3]
> 
> 
> If we can avoid adding soft-dirty on arm64 that would be great. This will
> require work on the CRIU side. One downside of uffd-wp is that it is
> currently not as widely available across architectures as soft-dirty.

Using uffd-wp instead of soft-dirty in CRIU will require quite some work on
the CRIU side and probably on the kernel side too.

And as of now we'll anyway have to maintain soft-dirty because powerpc and
s390 don't have uffd-wp.

With UFFD_FEATURE_WP_ASYNC the concern that uffd-wp will be slower than
soft-dirty probably doesn't exist, but we won't know for sure until
somebody tries.
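
For reference, the userspace side of async tracking is fairly small; a
hedged sketch of registering a range for async write-protect tracking
(names per the mainline uffd uAPI, error handling omitted):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/userfaultfd.h>

static int uffd_wp_async_start(void *area, unsigned long len)
{
    struct uffdio_api api = {
        .api = UFFD_API,
        .features = UFFD_FEATURE_WP_ASYNC,
    };
    struct uffdio_register reg = {
        .range = { .start = (unsigned long)area, .len = len },
        .mode = UFFDIO_REGISTER_MODE_WP,
    };
    struct uffdio_writeprotect wp = {
        .range = { .start = (unsigned long)area, .len = len },
        .mode = UFFDIO_WRITEPROTECT_MODE_WP,
    };
    int fd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);

    ioctl(fd, UFFDIO_API, &api);
    ioctl(fd, UFFDIO_REGISTER, &reg);
    /* writes resolve automatically by clearing the wp bit,
     * no fault handler thread is needed */
    ioctl(fd, UFFDIO_WRITEPROTECT, &wp);

    return fd;
}

Dirty pages are then read back via the pagemap interface (the PAGEMAP_SCAN
ioctl) rather than delivered as uffd events, which is what makes the async
mode comparable to soft-dirty.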

But there were other limitations; the most prominent was checkpointing an
application that uses uffd. If CRIU is to use uffd-wp for tracking of the
dirty pages, there should be some support for multiple uffd contexts for a
VMA and that's surely a lot of work.

> But I'll throw in another idea: do we really need soft-dirty and uffd-wp to
> exist at the same time in the same process (or the VMA?). In theory, we

For instance to have dirty memory tracking in CRIU for an application that
uses uffd-wp :) 

> could have a VMA flag that defines the semantics of the bit and simply have
> arch code use a single, abstracted PTE bit. Requires a bit more work,
> though, but the benefit would be that architectures that do support soft-dirty
> could support uffd-wp.
> 
> [1] 
> https://blog.linuxplumbersconf.org/2017/ocw//system/presentations/4724/original/Memory%20tracking%20for%20iterative%20container%20migration.pdf
> [2] 
> https://lore.kernel.org/all/20230821141518.870589-1-usama.an...@collabora.com/
> [3] 
> https://lore.kernel.org/all/20230821141518.870589-2-usama.an...@collabora.com/
> 
> -- 
> Cheers,
> 
> David / dhildenb
> 

-- 
Sincerely yours,
Mike.



Re: [PATCH v4 14/15] kprobes: remove dependency on CONFIG_MODULES

2024-04-19 Thread Mike Rapoport
Hi Masami,

On Thu, Apr 18, 2024 at 06:16:15AM +0900, Masami Hiramatsu wrote:
> Hi Mike,
> 
> On Thu, 11 Apr 2024 19:00:50 +0300
> Mike Rapoport  wrote:
> 
> > From: "Mike Rapoport (IBM)" 
> > 
> > kprobes depended on CONFIG_MODULES because it has to allocate memory for
> > code.
> > 
> > Since code allocations are now implemented with execmem, kprobes can be
> > enabled in non-modular kernels.
> > 
> > Add #ifdef CONFIG_MODULE guards for the code dealing with kprobes inside
> > modules, make CONFIG_KPROBES select CONFIG_EXECMEM and drop the
> > dependency of CONFIG_KPROBES on CONFIG_MODULES.
> 
> Thanks for this work, but this conflicts with the latest fix in v6.9-rc4.
> Also, can you use IS_ENABLED(CONFIG_MODULES) instead of #ifdefs in
> function body? We have enough dummy functions for that, so it should
> not make a problem.

The code in check_kprobe_address_safe() that gets the module and checks for
__init functions does not compile with IS_ENABLED(CONFIG_MODULES). 
I can pull it out to a helper or leave #ifdef in the function body,
whichever you prefer.
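
A hypothetical shape of such a helper (the name and exact placement are
invented here for illustration):

#ifdef CONFIG_MODULES
static bool kprobe_on_freed_module_init(struct kprobe *p, struct module *mod)
{
    /* probing __init text is only safe while the module is COMING */
    return within_module_init((unsigned long)p->addr, mod) &&
           mod->state != MODULE_STATE_COMING;
}
#else
static bool kprobe_on_freed_module_init(struct kprobe *p, struct module *mod)
{
    return false;
}
#endif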
 
> -- 
> Masami Hiramatsu

-- 
Sincerely yours,
Mike.




Re: [PATCH v3] arm64: hibernate: Fix level3 translation fault in swsusp_save()

2024-04-19 Thread Mike Rapoport
On Wed, Apr 17, 2024 at 10:52:48AM +0800, Yaxiong Tian wrote:
> On ARM64 machines using UEFI, can_set_direct_map() may return false
> depending on the kernel configuration or the boot command line, e.g. with
> CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT, CONFIG_KFENCE and
> CONFIG_RODATA_FULL_DEFAULT_ENABLED all disabled, or with rodata=off and
> debug_pagealloc=off set in grub and CONFIG_KFENCE disabled. In that case
> swsusp_save() will fail because no page tables exist for the nomap
> memory, such as:
> 
> [   48.532162] Unable to handle kernel paging request at virtual address 
> ff80
> [   48.532162] Mem abort info:
> [   48.532162]   ESR = 0x9607
> [   48.532162]   EC = 0x25: DABT (current EL), IL = 32 bits
> [   48.532162]   SET = 0, FnV = 0
> [   48.532162]   EA = 0, S1PTW = 0
> [   48.532162]   FSC = 0x07: level 3 translation fault
> [   48.532162] Data abort info:
> [   48.532162]   ISV = 0, ISS = 0x0007, ISS2 = 0x
> [   48.532162]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> [   48.532162]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> [   48.532162] swapper pgtable: 4k pages, 39-bit VAs, pgdp=eeb0b000
> [   48.532162] [ff80] pgd=18217fff9803, p4d=18217fff9803, 
> pud=18217fff9803, pmd=18217fff8803, pte=
> [   48.532162] Internal error: Oops: 9607 [#1] SMP
> [   48.532162] Internal error: Oops: 9607 [#1] SMP
> [   48.532162] Modules linked in: xt_multiport ipt_REJECT nf_reject_ipv4 
> xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c 
> iptable_filter bpfilter rfkill at803x snd_hda_codec_hdmi snd_hda_intel 
> snd_intel_dspcfg dwmac_generic stmmac_platform snd_hda_codec stmmac joydev 
> pcs_xpcs snd_hda_core phylink ppdev lp parport ramoops reed_solomon ip_tables 
> x_tables nls_iso8859_1 vfat multipath linear amdgpu amdxcp drm_exec gpu_sched 
> drm_buddy hid_generic usbhid hid radeon video drm_suballoc_helper 
> drm_ttm_helper ttm i2c_algo_bit drm_display_helper cec drm_kms_helper drm
> [   48.532162] CPU: 0 PID: 3663 Comm: systemd-sleep Not tainted 6.6.2+ #76
> [   48.532162] Source Version: 4e22ed63a0a48e7a7cff9b98b7806d8d4add7dc0
> [   48.532162] Hardware name: Greatwall GW-XX-XXX/GW-XX-XXX, BIOS 
> KunLun BIOS V4.0 01/19/2021
> [   48.532162] pstate: 63c5 (nZCv DAIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [   48.532162] pc : swsusp_save+0x280/0x538
> [   48.532162] lr : swsusp_save+0x280/0x538
> [   48.532162] sp : ffa034a3fa40
> [   48.532162] x29: ffa034a3fa40 x28: ff801000 x27: 
> 
> [   48.532162] x26: ff800140 x25: ffc08113e248 x24: 
> 
> [   48.532162] x23: 0008 x22: ffc08113e280 x21: 
> 000c69f2
> [   48.532162] x20: ff80 x19: ffc081ae2500 x18: 
> 
> [   48.532162] x17: 662074736420 x16: 3030303030303030 x15: 
> 3038
> [   48.532162] x14: 0b69 x13: ff9f89088530 x12: 
> ffea
> [   48.532162] x11: 7fff x10: 7fff x9 : 
> ffc08193f0d0
> [   48.532162] x8 : 000bffe8 x7 : c0007fff x6 : 
> 0001
> [   48.532162] x5 : ffa0fff09dc8 x4 :  x3 : 
> 0027
> [   48.532162] x2 :  x1 :  x0 : 
> 004e
> [   48.532162] Call trace:
> [   48.532162]  swsusp_save+0x280/0x538
> [   48.532162]  swsusp_arch_suspend+0x148/0x190
> [   48.532162]  hibernation_snapshot+0x240/0x39c
> [   48.532162]  hibernate+0xc4/0x378
> [   48.532162]  state_store+0xf0/0x10c
> [   48.532162]  kobj_attr_store+0x14/0x24
> 
> This issue can be reproduced in QEMU using UEFI when booting with
> rodata=off and debug_pagealloc=off in grub and CONFIG_KFENCE disabled.
> 
> This is because in swsusp_save()->copy_data_pages()->page_is_saveable(),
> kernel_page_present() presumes that a page is present when
> can_set_direct_map() returns false, even for NOMAP ranges. So NOMAP pages
> will be saved as well and later cause a level 3 translation fault when
> these pages are accessed.
> 
> Since the NOMAP regions are now marked as PageReserved(), pfn walkers
> and the rest of core mm will treat them as unusable memory. So these
> regions should not be saved during hibernation.
> 
> This problem may be caused by changes to pfn_valid() logic in commit
> a7d9f306ba70 ("arm64: drop pfn_valid_within() and simplify pfn_valid()").
> 
> Following other architectures, drop the !can_set_direct_map() condition
> in kernel_page_present(), so that page_is_saveable() will skip these pages.
> 
> Fixes: a7d9f306ba70 ("arm64: drop pfn_valid_within() and simplify 
> pfn_valid()")
> 
> Suggested-by: Mike Rapoport 
> Suggested

Re: [PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()

2024-04-19 Thread Mike Rapoport
On Thu, Apr 18, 2024 at 02:01:22PM -0700, Song Liu wrote:
> On Thu, Apr 18, 2024 at 10:54 AM Mike Rapoport  wrote:
> >
> > On Thu, Apr 18, 2024 at 09:13:27AM -0700, Song Liu wrote:
> > > On Thu, Apr 18, 2024 at 8:37 AM Mike Rapoport  wrote:
> > > > > >
> > > > > > I'm looking at execmem_types more as definition of the consumers, 
> > > > > > maybe I
> > > > > > should have named the enum execmem_consumer in the first place.
> > > > >
> > > > > I think looking at execmem_type from consumers' point of view adds
> > > > > unnecessary complexity. IIUC, for most (if not all) archs, ftrace, 
> > > > > kprobe,
> > > > > and bpf (and maybe also module text) all have the same requirements.
> > > > > Did I miss something?
> > > >
> > > > It's enough to have one architecture with different constraints for 
> > > > kprobes
> > > > and bpf to warrant a type for each.
> > >
> > > AFAICT, some of these constraints can be changed without too much work.
> >
> > But why?
> > I honestly don't understand what you are trying to optimize here. A few
> > lines of initialization in execmem_info?
> 
> IIUC, having separate EXECMEM_BPF and EXECMEM_KPROBE makes it
> harder for bpf and kprobe to share the same ROX page. In many use cases,
> a 2MiB page (assuming x86_64) is enough for all BPF, kprobe, ftrace, and
> module text. It is not efficient if we have to allocate separate pages for 
> each
> of these use cases. If this is not a problem, the current approach works.

The caching of large ROX pages does not need to be per type. 

In the POC I've posted for caching of large ROX pages on x86 [1], the cache is
global, and to make kprobes and bpf use it, it's enough to set a flag in
execmem_info.

[1] https://lore.kernel.org/all/20240411160526.2093408-1-r...@kernel.org
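
Roughly, the kprobes entry in an architecture's execmem_info might then
look like this (the .flags name and value are illustrative of the POC
direction, not code from this series):

    [EXECMEM_KPROBES] = {
        .start     = MODULES_VADDR,
        .end       = MODULES_END,
        .pgprot    = PAGE_KERNEL_ROX,
        .alignment = 1,
        /* opt this consumer into the shared large-page cache */
        .flags     = EXECMEM_ROX_CACHE,
    },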

> Thanks,
> Song

-- 
Sincerely yours,
Mike.




Re: [RFC PATCH 3/7] module: [

2024-04-18 Thread Mike Rapoport
On Thu, Apr 18, 2024 at 10:31:16PM +0300, Nadav Amit wrote:
> 
> > On 18 Apr 2024, at 13:20, Mike Rapoport  wrote:
> > 
> > On Tue, Apr 16, 2024 at 12:36:08PM +0300, Nadav Amit wrote:
> >> 
> >> 
> >> 
> >> I might be missing something, but it seems a bit racy.
> >> 
> >> IIUC, module_finalize() calls alternatives_smp_module_add(). At this
> >> point, since you don’t hold the text_mutex, some might do text_poke(),
> >> e.g., by enabling/disabling static-key, and the update would be
> >> overwritten. No?
> > 
> > Right :(
> > Even worse, for UP case alternatives_smp_unlock() will "patch" still empty
> > area.
> > 
> > So I'm thinking about calling alternatives_smp_module_add() from an
> > additional callback after the execmem_update_copy().
> > 
> > Does it make sense to you?
> 
> Going over the code again - I might have just been wrong: I confused the
> alternatives and the jump-label mechanisms (as they do share a lot of
> code and characteristics).
> 
> The jump-labels are updated when prepare_coming_module() is called, which
> happens after post_relocation() [which means they would be updated using
> text_poke() “inefficiently” but should be safe].
> 
> The “alternatives” appear only to use text_poke() (in contrast to
> text_poke_early()) from a few very specific flows, e.g., 
> common_cpu_up() -> alternatives_enable_smp().
> 
> Do those flows pose a problem after boot?

Yes, common_cpu_up() is called on CPU hotplug, so it's possible to have a
race between alternatives_smp_module_add() and common_cpu_up() ->
alternatives_enable_smp().

And in UP case alternatives_smp_module_add() will call
alternatives_smp_unlock() that will patch module text before it is updated.

> Anyhow, sorry for the noise.

On the contrary, I would have missed it.

-- 
Sincerely yours,
Mike.




Re: [PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()

2024-04-18 Thread Mike Rapoport
On Thu, Apr 18, 2024 at 09:13:27AM -0700, Song Liu wrote:
> On Thu, Apr 18, 2024 at 8:37 AM Mike Rapoport  wrote:
> > > >
> > > > I'm looking at execmem_types more as a definition of the consumers;
> > > > maybe I should have named the enum execmem_consumer in the first place.
> > >
> > > I think looking at execmem_type from consumers' point of view adds
> > > unnecessary complexity. IIUC, for most (if not all) archs, ftrace, kprobe,
> > > and bpf (and maybe also module text) all have the same requirements.
> > > Did I miss something?
> >
> > It's enough to have one architecture with different constraints for kprobes
> > and bpf to warrant a type for each.
> 
> AFAICT, some of these constraints can be changed without too much work.

But why?
I honestly don't understand what you are trying to optimize here. A few
lines of initialization in execmem_info?
What is the advantage in forcing architectures to impose limits on
kprobes or bpf allocations?

> > Where do you see unnecessary complexity?
> >
> > > IOW, we have
> > >
> > > enum execmem_type {
> > > EXECMEM_DEFAULT,
> > > EXECMEM_TEXT,
> > > EXECMEM_KPROBES = EXECMEM_TEXT,
> > > EXECMEM_FTRACE = EXECMEM_TEXT,
> > > EXECMEM_BPF = EXECMEM_TEXT,  /* we may end up without
> > > _KPROBE, _FTRACE, _BPF */
> > > EXECMEM_DATA,  /* rw */
> > > EXECMEM_RO_DATA,
> > > EXECMEM_RO_AFTER_INIT,
> > > EXECMEM_TYPE_MAX,
> > > };
> > >
> > > Does this make sense?
> >
> > How do you suggest to deal with e.g. riscv that has separate address spaces
> > for modules, kprobes and bpf?
> 
> IIUC, modules and bpf use the same address space on riscv

Not exactly, bpf is a subset of modules on riscv.
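
For reference, this is (paraphrased) how 64-bit riscv defines the BPF JIT
region in arch/riscv/include/asm/pgtable.h -- the top 128M of the modules
range:

	#define BPF_JIT_REGION_SIZE	(SZ_128M)
	#define BPF_JIT_REGION_START	(BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
	#define BPF_JIT_REGION_END	(MODULES_END)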

> while kprobes use vmalloc address.

The whole point of using the entire vmalloc range for kprobes is to avoid
polluting the limited modules space.
 
> Thanks,
> Song

-- 
Sincerely yours,
Mike.


Re: [PATCH v4 14/15] kprobes: remove dependency on CONFIG_MODULES

2024-04-18 Thread Mike Rapoport
Hi Masami,

On Thu, Apr 18, 2024 at 06:16:15AM +0900, Masami Hiramatsu wrote:
> Hi Mike,
> 
> On Thu, 11 Apr 2024 19:00:50 +0300
> Mike Rapoport  wrote:
> 
> > From: "Mike Rapoport (IBM)" 
> > 
> > kprobes depended on CONFIG_MODULES because it has to allocate memory for
> > code.
> > 
> > Since code allocations are now implemented with execmem, kprobes can be
> > enabled in non-modular kernels.
> > 
> > Add #ifdef CONFIG_MODULES guards for the code dealing with kprobes inside
> > modules, make CONFIG_KPROBES select CONFIG_EXECMEM and drop the
> > dependency of CONFIG_KPROBES on CONFIG_MODULES.
> 
> Thanks for this work, but this conflicts with the latest fix in v6.9-rc4.
> Also, can you use IS_ENABLED(CONFIG_MODULES) instead of #ifdefs in
> function bodies? We have enough dummy functions for that, so it should
> not be a problem.

I'll rebase and will try to reduce ifdefery where possible.
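
For instance, a check that today needs guards (an illustrative pattern,
not a specific hunk from the series):

	#ifdef CONFIG_MODULES
	if (__module_text_address((unsigned long)addr))
		goto out;
	#endif

can usually become:

	if (IS_ENABLED(CONFIG_MODULES) &&
	    __module_text_address((unsigned long)addr))
		goto out;

provided the referenced symbols have !CONFIG_MODULES stubs, so the compiler
can still parse the branch and then discard it as dead code.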

> Thank you,
> 
> > 
> > Signed-off-by: Mike Rapoport (IBM) 
> > ---
> >  arch/Kconfig|  2 +-
> >  kernel/kprobes.c| 43 +
> >  kernel/trace/trace_kprobe.c | 11 ++
> >  3 files changed, 37 insertions(+), 19 deletions(-)
> > 
> 
> -- 
> Masami Hiramatsu

-- 
Sincerely yours,
Mike.



Re: [PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()

2024-04-18 Thread Mike Rapoport
On Wed, Apr 17, 2024 at 04:32:49PM -0700, Song Liu wrote:
> On Tue, Apr 16, 2024 at 12:23 AM Mike Rapoport  wrote:
> >
> > On Mon, Apr 15, 2024 at 06:36:39PM +0100, Mark Rutland wrote:
> > > On Mon, Apr 15, 2024 at 09:52:41AM +0200, Peter Zijlstra wrote:
> > > > On Thu, Apr 11, 2024 at 07:00:41PM +0300, Mike Rapoport wrote:
> > > > > +/**
> > > > > + * enum execmem_type - types of executable memory ranges
> > > > > + *
> > > > > + * There are several subsystems that allocate executable memory.
> > > > > + * Architectures define different restrictions on placement,
> > > > > + * permissions, alignment and other parameters for memory that can 
> > > > > be used
> > > > > + * by these subsystems.
> > > > > + * Types in this enum identify subsystems that allocate executable 
> > > > > memory
> > > > > + * and let architectures define parameters for ranges suitable for
> > > > > + * allocations by each subsystem.
> > > > > + *
> > > > > + * @EXECMEM_DEFAULT: default parameters that would be used for types 
> > > > > that
> > > > > + * are not explicitly defined.
> > > > > + * @EXECMEM_MODULE_TEXT: parameters for module text sections
> > > > > + * @EXECMEM_KPROBES: parameters for kprobes
> > > > > + * @EXECMEM_FTRACE: parameters for ftrace
> > > > > + * @EXECMEM_BPF: parameters for BPF
> > > > > + * @EXECMEM_TYPE_MAX:
> > > > > + */
> > > > > +enum execmem_type {
> > > > > + EXECMEM_DEFAULT,
> > > > > + EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
> > > > > + EXECMEM_KPROBES,
> > > > > + EXECMEM_FTRACE,
> > > > > + EXECMEM_BPF,
> > > > > + EXECMEM_TYPE_MAX,
> > > > > +};
> > > >
> > > > Can we please get a break-down of how all these types are actually
> > > > different from one another?
> > > >
> > > > I'm thinking some platforms have a tiny immediate space (arm64 comes to
> > > > mind) and has less strict placement constraints for some of them?
> > >
> > > Yeah, and really I'd *much* rather deal with that in arch code, as I have 
> > > said
> > > several times.
> > >
> > > For arm64 we have two basic restrictions:
> > >
> > > 1) Direct branches can go +/-128M
> > >We can expand this range by having direct branches go to PLTs, at a
> > >performance cost.
> > >
> > > 2) PREL32 relocations can go +/-2G
> > >We cannot expand this further.
> > >
> > > * We don't need to allocate memory for ftrace. We do not use trampolines.
> > >
> > > * Kprobes XOL areas don't care about either of those; we don't place any
> > >   PC-relative instructions in those. Maybe we want to in future.
> > >
> > > * Modules care about both; we'd *prefer* to place them within +/-128M of 
> > > all
> > >   other kernel/module code, but if there's no space we can use PLTs and 
> > > expand
> > >   that to +/-2G. Since modules can reference other modules, that ends up
> > >   actually being halved, and modules have to fit within some 2G window 
> > > that
> > >   also covers the kernel.
> 
> Is +/- 2G enough for all realistic use cases? If so, I guess we don't
> really need
> EXECMEM_ANYWHERE below?
> 
> > >
> > > * I'm not sure about BPF's requirements; it seems happy doing the same as
> > >   modules.
> >
> > BPF are happy with vmalloc().
> >
> > > So if we *must* use a common execmem allocator, what we'd really want is 
> > > our own
> > > types, e.g.
> > >
> > >   EXECMEM_ANYWHERE
> > >   EXECMEM_NOPLT
> > >   EXECMEM_PREL32
> > >
> > > ... and then we use those in arch code to implement module_alloc() and 
> > > friends.
> >
> > I'm looking at execmem_types more as a definition of the consumers; maybe I
> > should have named the enum execmem_consumer in the first place.
> 
> I think looking at execmem_type from consumers' point of view adds
> unnecessary complexity. IIUC, for most (if not all) archs, ftrace, kprobe,
> and bpf (and maybe also module text) all have the same requirements.
> Did I miss something?

It's enough to have one architecture with different constraints for kprobes
and bpf to warrant a type for each.

Where do you see unnecessary complexity?
 
> IOW, we have
> 
> enum execmem_type {
> EXECMEM_DEFAULT,
> EXECMEM_TEXT,
> EXECMEM_KPROBES = EXECMEM_TEXT,
> EXECMEM_FTRACE = EXECMEM_TEXT,
> EXECMEM_BPF = EXECMEM_TEXT,  /* we may end up without
> _KPROBE, _FTRACE, _BPF */
> EXECMEM_DATA,  /* rw */
> EXECMEM_RO_DATA,
> EXECMEM_RO_AFTER_INIT,
> EXECMEM_TYPE_MAX,
> };
> 
> Does this make sense?
 
How do you suggest we deal with, e.g., riscv, which has separate address
spaces for modules, kprobes and bpf?

> Thanks,
> Song

-- 
Sincerely yours,
Mike.


Re: [RFC PATCH 6/7] execmem: add support for cache of large ROX pages

2024-04-18 Thread Mike Rapoport
On Tue, Apr 16, 2024 at 09:52:34AM +0200, Peter Zijlstra wrote:
> On Mon, Apr 15, 2024 at 08:00:26PM +0300, Mike Rapoport wrote:
> > On Mon, Apr 15, 2024 at 12:47:50PM +0200, Peter Zijlstra wrote:
> > > On Thu, Apr 11, 2024 at 07:05:25PM +0300, Mike Rapoport wrote:
> > > 
> > > > To populate the cache, a writable large page is allocated from vmalloc 
> > > > with
> > > > VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped 
> > > > as
> > > > ROX.
> > > 
> > > > +static void execmem_invalidate(void *ptr, size_t size, bool writable)
> > > > +{
> > > > +   if (execmem_info->invalidate)
> > > > +   execmem_info->invalidate(ptr, size, writable);
> > > > +   else
> > > > +   memset(ptr, 0, size);
> > > > +}
> > > 
> > > +static void execmem_invalidate(void *ptr, size_t size, bool writeable)
> > > +{
> > > +   /* fill memory with INT3 instructions */
> > > +   if (writeable)
> > > +   memset(ptr, 0xcc, size);
> > > +   else
> > > +   text_poke_set(ptr, 0xcc, size);
> > > +}
> > > 
> > > Thing is, 0xcc (aka INT3_INSN_OPCODE) is not an invalid instruction.
> > > It raises #BP not #UD.
> > 
> > Do you mean that _invalidate is a poor name choice or that it's necessary
> > to use an instruction that raises #UD?
> 
> Poor naming, mostly. #BP handler will still scream bloody murder if the
> site is otherwise unclaimed.
> 
> It just isn't an invalid instruction.

Well, execmem_fill_with_insns_screaming_bloody_murder seems too long; how
about execmem_fill_trapping_insns?
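
I.e. the x86 override quoted above would keep its body and only change its
name (sketch):

	static void execmem_fill_trapping_insns(void *ptr, size_t size,
						bool writable)
	{
		/* fill memory with INT3 instructions */
		if (writable)
			memset(ptr, 0xcc, size);
		else
			text_poke_set(ptr, 0xcc, size);
	}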

-- 
Sincerely yours,
Mike.



Re: [RFC PATCH 3/7] module: prepare to handle ROX allocations for text

2024-04-18 Thread Mike Rapoport
On Tue, Apr 16, 2024 at 12:36:08PM +0300, Nadav Amit wrote:
> 
> 
> > On 11 Apr 2024, at 19:05, Mike Rapoport  wrote:
> > 
> > @@ -2440,7 +2479,24 @@ static int post_relocation(struct module *mod, const 
> > struct load_info *info)
> > add_kallsyms(mod, info);
> > 
> > /* Arch-specific module finalizing. */
> > -   return module_finalize(info->hdr, info->sechdrs, mod);
> > +   ret = module_finalize(info->hdr, info->sechdrs, mod);
> > +   if (ret)
> > +   return ret;
> > +
> > +   for_each_mod_mem_type(type) {
> > +   struct module_memory *mem = &mod->mem[type];
> > +
> > +   if (mem->is_rox) {
> > +   if (!execmem_update_copy(mem->base, mem->rw_copy,
> > +mem->size))
> > +   return -ENOMEM;
> > +
> > +   vfree(mem->rw_copy);
> > +   mem->rw_copy = NULL;
> > +   }
> > +   }
> > +
> > +   return 0;
> > }
> 
> I might be missing something, but it seems a bit racy.
> 
> IIUC, module_finalize() calls alternatives_smp_module_add(). At this
> point, since you don’t hold the text_mutex, some might do text_poke(),
> e.g., by enabling/disabling static-key, and the update would be
> overwritten. No?

Right :(
Even worse, in the UP case alternatives_smp_unlock() will "patch" a still
empty area.

So I'm thinking about calling alternatives_smp_module_add() from an
additional callback after the execmem_update_copy().

Does it make sense to you?
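
Roughly (a sketch; module_post_finalize() is a made-up name for the new
callback):

	static int post_relocation(struct module *mod, const struct load_info *info)
	{
		/* ... as in the hunk above, up to and including module_finalize() ... */

		for_each_mod_mem_type(type) {
			struct module_memory *mem = &mod->mem[type];

			if (mem->is_rox) {
				if (!execmem_update_copy(mem->base, mem->rw_copy,
							 mem->size))
					return -ENOMEM;

				vfree(mem->rw_copy);
				mem->rw_copy = NULL;
			}
		}

		/* runs only after the ROX area holds the final text */
		return module_post_finalize(info->hdr, info->sechdrs, mod);
	}

with alternatives_smp_module_add() moved from module_finalize() to
module_post_finalize().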

-- 
Sincerely yours,
Mike.


Re: [PATCH v20 2/5] ring-buffer: Introducing ring-buffer mapping functions

2024-04-18 Thread Mike Rapoport
On Sat, Apr 06, 2024 at 06:36:46PM +0100, Vincent Donnefort wrote:
> In preparation for allowing the user-space to map a ring-buffer, add
> a set of mapping functions:
> 
>   ring_buffer_{map,unmap}()
> 
> And controls on the ring-buffer:
> 
>   ring_buffer_map_get_reader()  /* swap reader and head */
> 
> Mapping the ring-buffer also involves:
> 
>   A unique ID for each subbuf of the ring-buffer, currently they are
>   only identified through their in-kernel VA.
> 
>   A meta-page, where ring-buffer statistics and a description of the
>   current reader are stored
> 
> The linear mapping exposes the meta-page, and each subbuf of the
> ring-buffer, ordered following their unique ID, assigned during the
> first mapping.
> 
> Once mapped, no subbuf can get in or out of the ring-buffer: the buffer
> size will remain unmodified and the splice enabling functions will in
> reality simply memcpy the data instead of swapping subbufs.
> 
> CC: 
> Signed-off-by: Vincent Donnefort 
> 
> diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
> index dc5ae4e96aee..96d2140b471e 100644
> --- a/include/linux/ring_buffer.h
> +++ b/include/linux/ring_buffer.h
> @@ -6,6 +6,8 @@
>  #include 
>  #include 
>  
> +#include 
> +
>  struct trace_buffer;
>  struct ring_buffer_iter;
>  
> @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct 
> hlist_node *node);
>  #define trace_rb_cpu_prepare NULL
>  #endif
>  
> +int ring_buffer_map(struct trace_buffer *buffer, int cpu,
> + struct vm_area_struct *vma);
> +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu);
> +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu);
>  #endif /* _LINUX_RING_BUFFER_H */
> diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
> new file mode 100644
> index ..ffcd8dfcaa4f
> --- /dev/null
> +++ b/include/uapi/linux/trace_mmap.h
> @@ -0,0 +1,46 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _TRACE_MMAP_H_
> +#define _TRACE_MMAP_H_
> +
> +#include 
> +
> +/**
> + * struct trace_buffer_meta - Ring-buffer Meta-page description
> + * @meta_page_size:  Size of this meta-page.
> + * @meta_struct_len: Size of this structure.
> + * @subbuf_size: Size of each sub-buffer.
> + * @nr_subbufs:  Number of subbufs in the ring-buffer, including 
> the reader.
> + * @reader.lost_events:  Number of events lost at the time of the reader 
> swap.
> + * @reader.id:   subbuf ID of the current reader. ID range [0 : 
> @nr_subbufs - 1]
> + * @reader.read: Number of bytes read on the reader subbuf.
> + * @flags:   Placeholder for now, 0 until new features are supported.
> + * @entries: Number of entries in the ring-buffer.
> + * @overrun: Number of entries lost in the ring-buffer.
> + * @read:Number of entries that have been read.
> + * @Reserved1:   Reserved for future use.
> + * @Reserved2:   Reserved for future use.
> + */
> +struct trace_buffer_meta {
> + __u32   meta_page_size;
> + __u32   meta_struct_len;
> +
> + __u32   subbuf_size;
> + __u32   nr_subbufs;
> +
> + struct {
> + __u64   lost_events;
> + __u32   id;
> + __u32   read;
> + } reader;
> +
> + __u64   flags;
> +
> + __u64   entries;
> + __u64   overrun;
> + __u64   read;
> +
> + __u64   Reserved1;
> + __u64   Reserved2;

Why do you need reserved fields? This structure always resides in the
beginning of a page and the rest of the page is essentially "reserved".
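
To illustrate: userspace maps the meta-page at offset 0 and can already
detect layout growth via meta_struct_len (an illustrative userspace snippet,
not part of the patch):

	struct trace_buffer_meta *meta =
		mmap(NULL, page_size, PROT_READ, MAP_SHARED, trace_fd, 0);

	if (meta->meta_struct_len > sizeof(*meta))
		;	/* a newer kernel appended fields after this struct */

so new fields can simply grow into the page tail when they are needed.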

> +};
> +
> +#endif /* _TRACE_MMAP_H_ */
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index cc9ebe593571..793ecc454039 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c

... 

> +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
> +unsigned long *subbuf_ids)
> +{
> + struct trace_buffer_meta *meta = cpu_buffer->meta_page;
> + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
> + struct buffer_page *first_subbuf, *subbuf;
> + int id = 0;
> +
> + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
> + cpu_buffer->reader_page->id = id++;
> +
> + first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
> + do {
> + if (WARN_ON(id >= nr_subbufs))
> + break;
> +
> + subbuf_ids[id] = (unsigned long)subbuf->page;
> + subbuf->id = id;
> +
> + rb_inc_page(&subbuf);
> + id++;
> + } while (subbuf != first_subbuf);
> +
> + /* install subbuf ID to kern VA translation */
> + cpu_buffer->subbuf_ids = subbuf_ids;
> +
> + /* __rb_map_vma() pads the meta-page to align it with the sub-buffers */
> + meta->meta_page_size = PAGE_SIZE << cpu_buffer->buffer->subbuf_order;

Isn't this a 

Re: [PATCH v20 0/5] Introducing trace buffer mapping by user-space

2024-04-16 Thread Mike Rapoport
(added linux-mm)

On Wed, Apr 10, 2024 at 01:56:12PM -0400, Steven Rostedt wrote:
> 
> Hi Andrew, et al.
> 
> Linus said it's a hard requirement that this code gets an Acked-by (or
> Reviewed-by) from the memory sub-maintainers before he will accept it.
> He was upset that we faulted in pages one at a time instead of mapping it
> in one go:
> 
>   
> https://lore.kernel.org/all/CAHk-=wh5wWeib7+kVHpBVtUn7kx7GGadWqb5mW5FYTdewEfL=w...@mail.gmail.com/
> 
> Could you take a look at patches 1-3 to make sure they look sane from a
> memory management point of view?
> 
> I really want this applied in the next merge window.
> 
> Thanks!
> 
> -- Steve
> 
> 
> On Sat,  6 Apr 2024 18:36:44 +0100
> Vincent Donnefort  wrote:
> 
> > The tracing ring-buffers can be stored on disk or sent to the network
> > without any copy via splice. However, the latter doesn't allow real-time
> > processing of the traces. A solution is to give userspace direct access
> > to the ring-buffer pages via a mapping. An application can now become a
> > consumer of the ring-buffer, in a similar fashion to what trace_pipe
> > offers.
> > 
> > Support for this new feature can already be found in libtracefs from
> > version 1.8, when built with EXTRA_CFLAGS=-DFORCE_MMAP_ENABLE.
> > 
> > Vincent
> > 
> > v19 -> v20:
> >   * Fix typos in documentation.
> >   * Remove useless mmap open and fault callbacks.
> >   * add mm.h include for vm_insert_pages
> > 
> > v18 -> v19:
> >   * Use VM_PFNMAP and vm_insert_pages
> >   * Allocate ring-buffer subbufs with __GFP_COMP
> >   * Pad the meta-page with the zero-page to align on the subbuf_order
> >   * Extend the ring-buffer test with mmap() dedicated suite
> > 
> > v17 -> v18:
> >   * Fix lockdep_assert_held
> >   * Fix spin_lock_init typo
> >   * Fix CONFIG_TRACER_MAX_TRACE typo
> > 
> > v16 -> v17:
> >   * Documentation and comments improvements.
> >   * Create get/put_snapshot_map() for clearer code.
> >   * Replace kzalloc with kcalloc.
> >   * Fix -ENOMEM handling in rb_alloc_meta_page().
> >   * Move flush(cpu_buffer->reader_page) behind the reader lock.
> >   * Move all inc/dec of cpu_buffer->mapped behind reader lock and buffer
> > mutex. (removes READ_ONCE/WRITE_ONCE accesses).
> > 
> > v15 -> v16:
> >   * Add comment for the dcache flush.
> >   * Remove now unnecessary WRITE_ONCE for the meta-page.
> > 
> > v14 -> v15:
> >   * Add meta-page and reader-page flush. Intends to fix the mapping
> > for VIVT and aliasing-VIPT data caches.
> >   * -EPERM on VM_EXEC.
> >   * Fix build warning !CONFIG_TRACER_MAX_TRACE.
> > 
> > v13 -> v14:
> >   * All cpu_buffer->mapped readers use READ_ONCE (except for swap_cpu)
> >   * on unmap, sync meta-page teardown with the reader_lock instead of
> > the synchronize_rcu.
> >   * Add a dedicated spinlock for trace_array ->snapshot and ->mapped.
> > (intends to fix a lockdep issue)
> >   * Add kerneldoc for flags and Reserved fields.
> >   * Add kselftest for snapshot/map mutual exclusion.
> > 
> > v12 -> v13:
> >   * Swap subbufs_{touched,lost} for Reserved fields.
> >   * Add a flag field in the meta-page.
> >   * Fix CONFIG_TRACER_MAX_TRACE.
> >   * Rebase on top of trace/urgent.
> >   * Add a comment for try_unregister_trigger()
> > 
> > v11 -> v12:
> >   * Fix code sample mmap bug.
> >   * Add logging in sample code.
> >   * Reset tracer in selftest.
> >   * Add a refcount for the snapshot users.
> >   * Prevent mapping when there are snapshot users and vice versa.
> >   * Refine the meta-page.
> >   * Fix types in the meta-page.
> >   * Collect Reviewed-by.
> > 
> > v10 -> v11:
> >   * Add Documentation and code sample.
> >   * Add a selftest.
> >   * Move all the update to the meta-page into a single
> > rb_update_meta_page().
> >   * rb_update_meta_page() is now called from
> > ring_buffer_map_get_reader() to fix NOBLOCK callers.
> >   * kerneldoc for struct trace_meta_page.
> >   * Add a patch to zero all the ring-buffer allocations.
> > 
> > v9 -> v10:
> >   * Refactor rb_update_meta_page()
> >   * In-loop declaration for foreach_subbuf_page()
> >   * Check for cpu_buffer->mapped overflow
> > 
> > v8 -> v9:
> >   * Fix the unlock path in ring_buffer_map()
> >   * Fix cpu_buffer cast with rb_work_rq->is_cpu_buffer
> >   * Rebase on linux-trace/for-next 
> > (3cb3091138ca0921c4569bcf7ffa062519639b6a)
> > 
> > v7 -> v8:
> >   * Drop the subbufs renaming into bpages
> >   * Use subbuf as a name when relevant
> > 
> > v6 -> v7:
> >   * Rebase onto lore.kernel.org/lkml/20231215175502.106587...@goodmis.org/
> >   * Support for subbufs
> >   * Rename subbufs into bpages
> > 
> > v5 -> v6:
> >   * Rebase on next-20230802.
> >   * (unsigned long) -> (void *) cast for virt_to_page().
> >   * Add a wait for the GET_READER_PAGE ioctl.
> >   * Move writer fields update (overrun/pages_lost/entries/pages_touched)
> > in the irq_work.
> >   * Rearrange id in struct buffer_page.
> >   * Rearrange the meta-page.
> >   * ring_buffer_meta_page -> 

Re: [PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()

2024-04-16 Thread Mike Rapoport
On Mon, Apr 15, 2024 at 06:36:39PM +0100, Mark Rutland wrote:
> On Mon, Apr 15, 2024 at 09:52:41AM +0200, Peter Zijlstra wrote:
> > On Thu, Apr 11, 2024 at 07:00:41PM +0300, Mike Rapoport wrote:
> > > +/**
> > > + * enum execmem_type - types of executable memory ranges
> > > + *
> > > + * There are several subsystems that allocate executable memory.
> > > + * Architectures define different restrictions on placement,
> > > + * permissions, alignment and other parameters for memory that can be 
> > > used
> > > + * by these subsystems.
> > > + * Types in this enum identify subsystems that allocate executable memory
> > > + * and let architectures define parameters for ranges suitable for
> > > + * allocations by each subsystem.
> > > + *
> > > + * @EXECMEM_DEFAULT: default parameters that would be used for types that
> > > + * are not explicitly defined.
> > > + * @EXECMEM_MODULE_TEXT: parameters for module text sections
> > > + * @EXECMEM_KPROBES: parameters for kprobes
> > > + * @EXECMEM_FTRACE: parameters for ftrace
> > > + * @EXECMEM_BPF: parameters for BPF
> > > + * @EXECMEM_TYPE_MAX:
> > > + */
> > > +enum execmem_type {
> > > + EXECMEM_DEFAULT,
> > > + EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
> > > + EXECMEM_KPROBES,
> > > + EXECMEM_FTRACE,
> > > + EXECMEM_BPF,
> > > + EXECMEM_TYPE_MAX,
> > > +};
> > 
> > Can we please get a break-down of how all these types are actually
> > different from one another?
> > 
> > I'm thinking some platforms have a tiny immediate space (arm64 comes to
> > mind) and has less strict placement constraints for some of them?
> 
> Yeah, and really I'd *much* rather deal with that in arch code, as I have said
> several times.
> 
> For arm64 we have two basic restrictions: 
> 
> 1) Direct branches can go +/-128M
>We can expand this range by having direct branches go to PLTs, at a
>performance cost.
> 
> 2) PREL32 relocations can go +/-2G
>We cannot expand this further.
> 
> * We don't need to allocate memory for ftrace. We do not use trampolines.
> 
> * Kprobes XOL areas don't care about either of those; we don't place any
>   PC-relative instructions in those. Maybe we want to in future.
> 
> * Modules care about both; we'd *prefer* to place them within +/-128M of all
>   other kernel/module code, but if there's no space we can use PLTs and expand
>   that to +/-2G. Since modules can reference other modules, that ends up
>   actually being halved, and modules have to fit within some 2G window that
>   also covers the kernel.
> 
> * I'm not sure about BPF's requirements; it seems happy doing the same as
>   modules.

BPF are happy with vmalloc().
 
> So if we *must* use a common execmem allocator, what we'd really want is our 
> own
> types, e.g.
> 
>   EXECMEM_ANYWHERE
>   EXECMEM_NOPLT
>   EXECMEM_PREL32
> 
> ... and then we use those in arch code to implement module_alloc() and 
> friends.

I'm looking at execmem_types more as a definition of the consumers; maybe I
should have named the enum execmem_consumer in the first place.

And the arch constraints defined in struct execmem_range describe how memory
should be allocated for each consumer.

These constraints are defined early at boot and remain static, so
initializing them once and letting a common allocator use them makes
perfect sense to me.
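
The flow in this series is roughly (a condensed sketch of mm/execmem.c from
the patches, not a verbatim copy):

	void __init execmem_init(void)
	{
		struct execmem_info *info = execmem_arch_setup();

		/* fill ranges the arch left undefined from EXECMEM_DEFAULT,
		 * then publish the table */
		execmem_info = info;
	}

after which execmem_alloc(type, size) only needs to look up
execmem_info->ranges[type] and call into vmalloc with those parameters.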

I agree that fallback_{start,end} are not ideal, but we have 3
architectures that have a preferred and a secondary range for modules. And
arm and powerpc use the same logic for kprobes as well; I don't see why this
code should be duplicated.

And, for instance, if you decide to place PC-relative instructions in
kprobes XOL areas, you'd only need to update the execmem_range for kprobes
to be more like the range for modules.
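
E.g. on arm64 that could look like (an illustrative sketch;
module_direct_base/module_plt_base are arm64's module placement bases, and
whether kprobes would want exactly these bounds is hypothetical):

	[EXECMEM_KPROBES] = {
		.start		= module_direct_base,
		.end		= module_direct_base + SZ_128M,
		.fallback_start	= module_plt_base,
		.fallback_end	= module_plt_base + SZ_2G,
		.pgprot		= PAGE_KERNEL_ROX,
		.alignment	= 1,
	},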

With a central allocator it's easier to deal with things like
VM_FLUSH_RESET_PERMS and caching of ROX memory, and I think it will be more
maintainable than having module_alloc(), alloc_insn_page() and
bpf_jit_alloc_exec() spread all over the place.
 
> Mark.

-- 
Sincerely yours,
Mike.



Re: [RFC PATCH 5/7] x86/module: prepare module loading for ROX allocations of text

2024-04-15 Thread Mike Rapoport
On Mon, Apr 15, 2024 at 12:43:16PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 11, 2024 at 07:05:24PM +0300, Mike Rapoport wrote:
> > diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> > index 45a280f2161c..b4d6868df573 100644
> > --- a/arch/x86/kernel/alternative.c
> > +++ b/arch/x86/kernel/alternative.c
> 
> > @@ -504,17 +513,17 @@ void __init_or_module noinline 
> > apply_alternatives(struct alt_instr *start,
> >  *   patch if feature is *NOT* present.
> >  */
> > if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
> > -   optimize_nops_inplace(instr, a->instrlen);
> > +   optimize_nops_inplace(wr_instr, a->instrlen);
> > continue;
> > }
> >  
> > -   DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: 
> > (%px, len: %d) flags: 0x%x",
> > +   DPRINTK(ALT, "feat: %d*32+%d, old: (%px (%px) len: %d), repl: 
> > (%px (%px), len: %d) flags: 0x%x",
> > a->cpuid >> 5,
> > a->cpuid & 0x1f,
> > -   instr, instr, a->instrlen,
> > -   replacement, a->replacementlen, a->flags);
> > +   instr, wr_instr, a->instrlen,
> > +   replacement, wr_replacement, a->replacementlen, 
> > a->flags);
> 
> I think this, and

I've found printing both addresses handy while debugging, but no strong
feelings here.
 
> >  
> > -   memcpy(insn_buff, replacement, a->replacementlen);
> > +   memcpy(insn_buff, wr_replacement, a->replacementlen);
> > insn_buff_sz = a->replacementlen;
> >  
> > if (a->flags & ALT_FLAG_DIRECT_CALL) {
> > @@ -528,11 +537,11 @@ void __init_or_module noinline 
> > apply_alternatives(struct alt_instr *start,
> >  
> > apply_relocation(insn_buff, a->instrlen, instr, replacement, 
> > a->replacementlen);
> >  
> > -   DUMP_BYTES(ALT, instr, a->instrlen, "%px:   old_insn: ", instr);
> > +   DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px:   old_insn: ", 
> > instr);
> 
> this, want to remain as is. 

here wr_instr is the buffer to dump:

DUMP_BYTES(type, buf, len, fmt, args...)

rather than an address, which remained 'instr'.
 
> > DUMP_BYTES(ALT, replacement, a->replacementlen, "%px:   
> > rpl_insn: ", replacement);
> > DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", 
> > instr);
> >  
> > -   text_poke_early(instr, insn_buff, insn_buff_sz);
> > +   text_poke_early(wr_instr, insn_buff, insn_buff_sz);
> > }
> >  
> > kasan_enable_current();
> 
> The rationale being that we then print an address that can be correlated
> to the kernel image (provided one either kills kaslr or adjusts for it).

-- 
Sincerely yours,
Mike.



Re: [RFC PATCH 6/7] execmem: add support for cache of large ROX pages

2024-04-15 Thread Mike Rapoport
On Mon, Apr 15, 2024 at 12:47:50PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 11, 2024 at 07:05:25PM +0300, Mike Rapoport wrote:
> 
> > To populate the cache, a writable large page is allocated from vmalloc with
> > VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as
> > ROX.
> 
> > +static void execmem_invalidate(void *ptr, size_t size, bool writable)
> > +{
> > +   if (execmem_info->invalidate)
> > +   execmem_info->invalidate(ptr, size, writable);
> > +   else
> > +   memset(ptr, 0, size);
> > +}
> 
> +static void execmem_invalidate(void *ptr, size_t size, bool writeable)
> +{
> +   /* fill memory with INT3 instructions */
> +   if (writeable)
> +   memset(ptr, 0xcc, size);
> +   else
> +   text_poke_set(ptr, 0xcc, size);
> +}
> 
> Thing is, 0xcc (aka INT3_INSN_OPCODE) is not an invalid instruction.
> It raises #BP not #UD.

Do you mean that _invalidate is a poor name choice or that it's necessary
to use an instruction that raises #UD?
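
For reference: INT3 is the single byte 0xcc and raises #BP, while the
canonical invalid instruction on x86 is UD2, the two-byte sequence
0x0f 0x0b, which raises #UD. A #UD-based filler would look roughly like
this (a sketch, modulo the writable/text_poke_set() split of the INT3
version):

	static void execmem_fill_with_ud2(void *ptr, size_t size)
	{
		static const u8 ud2[2] = { 0x0f, 0x0b };
		size_t i;

		for (i = 0; i + sizeof(ud2) <= size; i += sizeof(ud2))
			memcpy(ptr + i, ud2, sizeof(ud2));
	}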

-- 
Sincerely yours,
Mike.



Re: [PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()

2024-04-15 Thread Mike Rapoport
On Mon, Apr 15, 2024 at 09:52:41AM +0200, Peter Zijlstra wrote:
> On Thu, Apr 11, 2024 at 07:00:41PM +0300, Mike Rapoport wrote:
> > +/**
> > + * enum execmem_type - types of executable memory ranges
> > + *
> > + * There are several subsystems that allocate executable memory.
> > + * Architectures define different restrictions on placement,
> > + * permissions, alignment and other parameters for memory that can be used
> > + * by these subsystems.
> > + * Types in this enum identify subsystems that allocate executable memory
> > + * and let architectures define parameters for ranges suitable for
> > + * allocations by each subsystem.
> > + *
> > + * @EXECMEM_DEFAULT: default parameters that would be used for types that
> > + * are not explicitly defined.
> > + * @EXECMEM_MODULE_TEXT: parameters for module text sections
> > + * @EXECMEM_KPROBES: parameters for kprobes
> > + * @EXECMEM_FTRACE: parameters for ftrace
> > + * @EXECMEM_BPF: parameters for BPF
> > + * @EXECMEM_TYPE_MAX:
> > + */
> > +enum execmem_type {
> > +   EXECMEM_DEFAULT,
> > +   EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
> > +   EXECMEM_KPROBES,
> > +   EXECMEM_FTRACE,
> > +   EXECMEM_BPF,
> > +   EXECMEM_TYPE_MAX,
> > +};
> 
> Can we please get a break-down of how all these types are actually
> different from one another?
> 
> I'm thinking some platforms have a tiny immediate space (arm64 comes to
> mind) and has less strict placement constraints for some of them?

loongarch, mips, nios2 and sparc define a modules address space different
from vmalloc and use it for modules, kprobes and bpf (where supported).

parisc uses the vmalloc range for everything, but it sets permissions to
PAGE_KERNEL_RWX because its PAGE_KERNEL_EXEC is read-only and it lacks
set_memory_* APIs.

arm has an address space for modules, but it falls back to the entire
vmalloc range with CONFIG_ARM_MODULE_PLTS=y.

arm64 uses different ranges for modules and bpf/kprobes. For kprobes it
does vmalloc(PAGE_KERNEL_ROX) and for bpf just plain vmalloc().
For modules arm64 first tries to allocate from the 128M below kernel_end and
if that fails it uses 2G below kernel_end as a fallback.

powerpc uses vmalloc space for everything for some configurations. For
book3s-32 and 8xx it defines two ranges that are used for module text,
kprobes and bpf and the module data can be allocated anywhere in vmalloc.

riscv has an address space for modules, a different address space for bpf
and uses vmalloc space for kprobes.

s390 and x86 have modules address space and use that space for all
executable allocations.

The EXECMEM_FTRACE type is only used on s390 and x86 and for now it's there
more for completeness than to denote special constraints or properties.

-- 
Sincerely yours,
Mike.


Re: [RFC PATCH 5/7] x86/module: prepare module loading for ROX allocations of text

2024-04-14 Thread Mike Rapoport
On Fri, Apr 12, 2024 at 11:08:00AM +0200, Ingo Molnar wrote:
> 
> * Mike Rapoport  wrote:
> 
> > for (s = start; s < end; s++) {
> > void *addr = (void *)s + *s;
> > +   void *wr_addr = addr + module_writable_offset(mod, addr);
> 
> So instead of repeating this pattern in a dozen places, why not use a 
> simpler method:
> 
>   void *wr_addr = module_writable_address(mod, addr);
> 
> or so, since we have to pass 'addr' to the module code anyway.

Agree.
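
I.e. something like (a trivial sketch wrapping the existing helper):

	static inline void *module_writable_address(struct module *mod,
						    void *addr)
	{
		return addr + module_writable_offset(mod, addr);
	}

so the offset arithmetic stays in one place and the call sites shrink to a
single call.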
 
> The text patching code is pretty complex already.
> 
> Thanks,
> 
>   Ingo

-- 
Sincerely yours,
Mike.


Re: [RFC PATCH 2/7] mm: vmalloc: don't account for number of nodes for HUGE_VMAP allocations

2024-04-14 Thread Mike Rapoport
On Fri, Apr 12, 2024 at 06:07:19AM +, Christophe Leroy wrote:
> 
> 
> Le 11/04/2024 à 18:05, Mike Rapoport a écrit :
> > From: "Mike Rapoport (IBM)" 
> > 
> > vmalloc allocations with VM_ALLOW_HUGE_VMAP that do not explicitly
> > specify node ID will use huge pages only if size_per_node is larger than
> > PMD_SIZE.
> > Still the actual allocated memory is not distributed between nodes and
> > there is no advantage in such an approach.
> > On the contrary, BPF allocates PMD_SIZE * num_possible_nodes() for each
> > new bpf_prog_pack, while it could do with PMD_SIZE'ed packs.
> > 
> > Don't account for number of nodes for VM_ALLOW_HUGE_VMAP with
> > NUMA_NO_NODE and use huge pages whenever the requested allocation size
> > is larger than PMD_SIZE.
> 
> Patch looks ok but message is confusing. We also use huge pages at PTE 
> size, for instance 512k pages or 16k pages on powerpc 8xx, while 
> PMD_SIZE is 4M.

Ok, I'll rephrase.
 
> Christophe
> 
> > 
> > Signed-off-by: Mike Rapoport (IBM) 
> > ---
> >   mm/vmalloc.c | 9 ++---
> >   1 file changed, 2 insertions(+), 7 deletions(-)
> > 
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index 22aa63f4ef63..5fc8b514e457 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -3737,8 +3737,6 @@ void *__vmalloc_node_range(unsigned long size, 
> > unsigned long align,
> > }
> >   
> > if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
> > -   unsigned long size_per_node;
> > -
> > /*
> >  * Try huge pages. Only try for PAGE_KERNEL allocations,
> >  * others like modules don't yet expect huge pages in
> > @@ -3746,13 +3744,10 @@ void *__vmalloc_node_range(unsigned long size, 
> > unsigned long align,
> >  * supporting them.
> >  */
> >   
> > -   size_per_node = size;
> > -   if (node == NUMA_NO_NODE)
> > -   size_per_node /= num_online_nodes();
> > -   if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
> > +   if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE)
> > shift = PMD_SHIFT;
> > else
> > -   shift = arch_vmap_pte_supported_shift(size_per_node);
> > +   shift = arch_vmap_pte_supported_shift(size);
> >   
> > align = max(real_align, 1UL << shift);
> > size = ALIGN(real_size, 1UL << shift);

-- 
Sincerely yours,
Mike.



Re: [PATCH v4 06/15] mm/execmem, arch: convert simple overrides of module_alloc to execmem

2024-04-14 Thread Mike Rapoport
On Thu, Apr 11, 2024 at 10:53:46PM +0200, Sam Ravnborg wrote:
> Hi Mike.
> 
> On Thu, Apr 11, 2024 at 07:00:42PM +0300, Mike Rapoport wrote:
> > From: "Mike Rapoport (IBM)" 
> > 
> > Several architectures override module_alloc() only to define address
> > range for code allocations different than VMALLOC address space.
> > 
> > Provide a generic implementation in execmem that uses the parameters for
> > address space ranges, required alignment and page protections provided
> > by architectures.
> > 
> > The architectures must fill execmem_info structure and implement
> > execmem_arch_setup() that returns a pointer to that structure. This way the
> > execmem initialization won't be called from every architecture, but rather
> > from a central place, namely a core_initcall() in execmem.
> > 
> > The execmem provides execmem_alloc() API that wraps __vmalloc_node_range()
> > with the parameters defined by the architectures.  If an architecture does
> > not implement execmem_arch_setup(), execmem_alloc() will fall back to
> > module_alloc().
> > 
> > Signed-off-by: Mike Rapoport (IBM) 
> > ---
> 
> This code snippet could be more readable ...
> > diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
> > index 66c45a2764bc..b70047f944cc 100644
> > --- a/arch/sparc/kernel/module.c
> > +++ b/arch/sparc/kernel/module.c
> > @@ -14,6 +14,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  
> >  #include 
> >  #include 
> > @@ -21,34 +22,26 @@
> >  
> >  #include "entry.h"
> >  
> > +static struct execmem_info execmem_info __ro_after_init = {
> > +   .ranges = {
> > +   [EXECMEM_DEFAULT] = {
> >  #ifdef CONFIG_SPARC64
> > -
> > -#include 
> > -
> > -static void *module_map(unsigned long size)
> > -{
> > -   if (PAGE_ALIGN(size) > MODULES_LEN)
> > -   return NULL;
> > -   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> > -   GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> > -   __builtin_return_address(0));
> > -}
> > +   .start = MODULES_VADDR,
> > +   .end = MODULES_END,
> >  #else
> > -static void *module_map(unsigned long size)
> > +   .start = VMALLOC_START,
> > +   .end = VMALLOC_END,
> > +#endif
> > +   .alignment = 1,
> > +   },
> > +   },
> > +};
> > +
> > +struct execmem_info __init *execmem_arch_setup(void)
> >  {
> > -   return vmalloc(size);
> > -}
> > -#endif /* CONFIG_SPARC64 */
> > -
> > -void *module_alloc(unsigned long size)
> > -{
> > -   void *ret;
> > -
> > -   ret = module_map(size);
> > -   if (ret)
> > -   memset(ret, 0, size);
> > +   execmem_info.ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL;
> >  
> > -   return ret;
> > +   return &execmem_info;
> >  }
> >  
> >  /* Make generic code ignore STT_REGISTER dummy undefined symbols.  */
> 
> ... if the following was added:
> 
> diff --git a/arch/sparc/include/asm/pgtable_32.h 
> b/arch/sparc/include/asm/pgtable_32.h
> index 9e85d57ac3f2..62bcafe38b1f 100644
> --- a/arch/sparc/include/asm/pgtable_32.h
> +++ b/arch/sparc/include/asm/pgtable_32.h
> @@ -432,6 +432,8 @@ static inline int io_remap_pfn_range(struct 
> vm_area_struct *vma,
> 
> >  #define VMALLOC_START   _AC(0xfe600000,UL)
> >  #define VMALLOC_END _AC(0xffc00000,UL)
> +#define MODULES_VADDR   VMALLOC_START
> +#define MODULES_END VMALLOC_END
> 
> 
> Then the #ifdef CONFIG_SPARC64 could be dropped and the code would be
> the same for 32 and 64 bits.
 
Yeah, the #ifdef there can be dropped even regardless of execmem.
I'll add a patch for that.

> Just a drive-by comment.
> 
>   Sam
> 

-- 
Sincerely yours,
Mike.



Re: [PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()

2024-04-14 Thread Mike Rapoport
On Fri, Apr 12, 2024 at 11:16:10AM +0200, Ingo Molnar wrote:
> 
> * Mike Rapoport  wrote:
> 
> > +/**
> > + * enum execmem_type - types of executable memory ranges
> > + *
> > + * There are several subsystems that allocate executable memory.
> > + * Architectures define different restrictions on placement,
> > + * permissions, alignment and other parameters for memory that can be used
> > + * by these subsystems.
> > + * Types in this enum identify subsystems that allocate executable memory
> > + * and let architectures define parameters for ranges suitable for
> > + * allocations by each subsystem.
> > + *
> > + * @EXECMEM_DEFAULT: default parameters that would be used for types that
> > + * are not explcitly defined.
> > + * @EXECMEM_MODULE_TEXT: parameters for module text sections
> > + * @EXECMEM_KPROBES: parameters for kprobes
> > + * @EXECMEM_FTRACE: parameters for ftrace
> > + * @EXECMEM_BPF: parameters for BPF
> > + * @EXECMEM_TYPE_MAX:
> > + */
> > +enum execmem_type {
> > +   EXECMEM_DEFAULT,
> > +   EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
> > +   EXECMEM_KPROBES,
> > +   EXECMEM_FTRACE,
> > +   EXECMEM_BPF,
> > +   EXECMEM_TYPE_MAX,
> > +};
> 
> s/explcitly
>  /explicitly
 
Sure, thanks

> Thanks,
> 
>   Ingo

-- 
Sincerely yours,
Mike.



Re: [PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()

2024-04-14 Thread Mike Rapoport
On Thu, Apr 11, 2024 at 12:42:05PM -0700, Luis Chamberlain wrote:
> On Thu, Apr 11, 2024 at 07:00:41PM +0300, Mike Rapoport wrote:
> > From: "Mike Rapoport (IBM)" 
> > 
> > module_alloc() is used everywhere as a means to allocate memory for code.
> > 
> > Besides being semantically wrong, this unnecessarily ties all subsystems
> > that need to allocate code, such as ftrace, kprobes and BPF, to modules and
> > puts the burden of code allocation on the modules code.
> > 
> > Several architectures override module_alloc() because of various
> > constraints where the executable memory can be located and this causes
> > additional obstacles for improvements of code allocation.
> > 
> > Start splitting code allocation from modules by introducing execmem_alloc()
> > and execmem_free() APIs.
> > 
> > Initially, execmem_alloc() is a wrapper for module_alloc() and
> > execmem_free() is a replacement of module_memfree() to allow updating all
> > call sites to use the new APIs.
> > 
> > Since architectures define different restrictions on placement,
> > permissions, alignment and other parameters for memory that can be used by
> > different subsystems that allocate executable memory, execmem_alloc() takes
> > a type argument, that will be used to identify the calling subsystem and to
> > allow architectures define parameters for ranges suitable for that
> > subsystem.
> 
> It would be good to describe this as a non-functional change.

Ok.
 
> > Signed-off-by: Mike Rapoport (IBM) 
> > ---
> 
> > diff --git a/mm/execmem.c b/mm/execmem.c
> > new file mode 100644
> > index ..ed2ea41a2543
> > --- /dev/null
> > +++ b/mm/execmem.c
> > @@ -0,0 +1,26 @@
> > +// SPDX-License-Identifier: GPL-2.0
> 
> And this just needs to copy over the copyright notices from the main.c file.

Will do.
 
>   Luis

-- 
Sincerely yours,
Mike.



Re: [POC][RFC][PATCH 1/2] mm/x86: Add wildcard * option as memmap=nn*align:name

2024-04-12 Thread Mike Rapoport
On Tue, Apr 09, 2024 at 04:41:24PM -0700, Kees Cook wrote:
> On Tue, Apr 09, 2024 at 07:11:56PM -0400, Steven Rostedt wrote:
> > On Tue, 9 Apr 2024 15:23:07 -0700
> > Kees Cook  wrote:
> > 
> > > Do we need to involve e820 at all? I think it might be possible to just
> > > have pstore call request_mem_region() very early? Or does KASLR make
> > > that unstable?
> > 
> > Yeah, would that give the same physical memory each boot, and can we
> > guarantee that KASLR will not map the kernel over the previous location?
> 
> Hm, no, for physical memory it needs to get excluded very early, which
> means e820.

Whatever memory is reserved in arch/x86/kernel/e820.c, that happens after
KASLR, so to begin with, a new memmap parameter should also be added to
parse_memmap in arch/x86/boot/compressed/kaslr.c to ensure the same
physical address will be available after KASLR.

More generally, memmap= is x86-specific and a bit of a hack.
Why not add a new kernel parameter that is parsed in, say, mm/mm_init.c,
creates the mmap_map (or whatever it will be named) and reserves that
memory in memblock rather than in e820?

This still will require update to arch/x86/boot/compressed/kaslr.c of
course.
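
As a sketch of that direction (the parameter name here is made up and the
kaslr.c side is omitted):

/* mm/mm_init.c -- hypothetical "persistmem=size@start" parameter */
static int __init parse_persistmem(char *p)
{
	unsigned long long size, start;
	char *endp;

	size = memparse(p, &endp);
	if (*endp++ != '@')
		return -EINVAL;
	start = memparse(endp, NULL);

	/* keep the range away from the page allocator on every boot */
	memblock_reserve(start, size);
	return 0;
}
early_param("persistmem", parse_persistmem);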

> So, yeah, your proposal makes sense. I'm not super excited
> about this being x86-only though. What does arm64 do for memmap?
> 
> -- 
> Kees Cook
> 

-- 
Sincerely yours,
Mike.



[RFC PATCH 7/7] x86/module: enable ROX caches for module text

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Enable execmem's cache of PMD_SIZE'ed pages mapped as ROX for module
text allocations.

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/x86/mm/init.c | 29 +
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8e8cd0de3af6..049a8b4c64e2 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1102,9 +1102,23 @@ unsigned long arch_max_swapfile_size(void)
 #endif
 
 #ifdef CONFIG_EXECMEM
+static void execmem_invalidate(void *ptr, size_t size, bool writeable)
+{
+   /* fill memory with INT3 instructions */
+   if (writeable)
+   memset(ptr, 0xcc, size);
+   else
+   text_poke_set(ptr, 0xcc, size);
+}
+
 static struct execmem_info execmem_info __ro_after_init = {
+   .invalidate = execmem_invalidate,
.ranges = {
-   [EXECMEM_DEFAULT] = {
+   [EXECMEM_MODULE_TEXT] = {
+   .flags = EXECMEM_KASAN_SHADOW | EXECMEM_ROX_CACHE,
+   .alignment = MODULE_ALIGN,
+   },
+   [EXECMEM_KPROBES...EXECMEM_MODULE_DATA] = {
.flags = EXECMEM_KASAN_SHADOW,
.alignment = MODULE_ALIGN,
},
@@ -1119,9 +1133,16 @@ struct execmem_info __init *execmem_arch_setup(void)
offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
 
start = MODULES_VADDR + offset;
-   execmem_info.ranges[EXECMEM_DEFAULT].start = start;
-   execmem_info.ranges[EXECMEM_DEFAULT].end = MODULES_END;
-   execmem_info.ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL;
+
+   for (int i = EXECMEM_MODULE_TEXT; i < EXECMEM_TYPE_MAX; i++) {
+   struct execmem_range *r = &execmem_info.ranges[i];
+
+   r->start = start;
+   r->end = MODULES_END;
+   r->pgprot = PAGE_KERNEL;
+   }
+
+   execmem_info.ranges[EXECMEM_MODULE_TEXT].pgprot = PAGE_KERNEL_ROX;
 
return &execmem_info;
 }
-- 
2.43.0




[RFC PATCH 6/7] execmem: add support for cache of large ROX pages

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Using large pages to map text areas reduces iTLB pressure and improves
performance.

Extend execmem_alloc() with an ability to use PMD_SIZE'ed pages with ROX
permissions as a cache for smaller allocations.

To populate the cache, a writable large page is allocated from vmalloc with
VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as
ROX.

Portions of that large page are handed out to execmem_alloc() callers
without any changes to the permissions.

When the memory is freed with execmem_free() it is invalidated again so
that it won't contain stale instructions.

The cache is enabled when an architecture sets EXECMEM_ROX_CACHE flag in
definition of an execmem_range.
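
In pseudo-code, populating the cache boils down to the following (a
simplified sketch of what this patch implements; error handling and the
maple tree bookkeeping are left out, and execmem_cache_add() stands in for
the insertion into free_areas):

static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
	void *p;

	/* grab a writable huge mapping from vmalloc */
	p = execmem_vmalloc(range, PMD_SIZE, PAGE_KERNEL,
			    VM_ALLOW_HUGE_VMAP);
	if (!p)
		return -ENOMEM;

	/* make sure the area contains no stale instructions */
	execmem_invalidate(p, PMD_SIZE, /* writable = */ true);

	/* remap it read-only-executable ... */
	set_memory_rox((unsigned long)p, PMD_SIZE >> PAGE_SHIFT);

	/* ... and hand portions of it out via the free areas tree */
	return execmem_cache_add(p, PMD_SIZE);
}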

Signed-off-by: Mike Rapoport (IBM) 
---
 include/linux/execmem.h |   2 +
 mm/execmem.c| 267 ++--
 2 files changed, 262 insertions(+), 7 deletions(-)

diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 9d22999dbd7d..06f678e6fe55 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -77,12 +77,14 @@ struct execmem_range {
 
 /**
  * struct execmem_info - architecture parameters for code allocations
+ * @invalidate: set memory to contain invalid instructions
  * @ranges: array of parameter sets defining architecture specific
  * parameters for executable memory allocations. The ranges that are not
  * explicitly initialized by an architecture use parameters defined for
  * @EXECMEM_DEFAULT.
  */
 struct execmem_info {
+   void (*invalidate)(void *ptr, size_t size, bool writable);
struct execmem_rangeranges[EXECMEM_TYPE_MAX];
 };
 
diff --git a/mm/execmem.c b/mm/execmem.c
index c920d2b5a721..716fba68ab0e 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -1,30 +1,88 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
+#include 
+
+#include "internal.h"
+
 static struct execmem_info *execmem_info __ro_after_init;
 static struct execmem_info default_execmem_info __ro_after_init;
 
-static void *__execmem_alloc(struct execmem_range *range, size_t size)
+struct execmem_cache {
+   struct mutex mutex;
+   struct maple_tree busy_areas;
+   struct maple_tree free_areas;
+};
+
+static struct execmem_cache execmem_cache = {
+   .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
+   .busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
+execmem_cache.mutex),
+   .free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN,
+execmem_cache.mutex),
+};
+
+static void execmem_cache_clean(struct work_struct *work)
+{
+   struct maple_tree *free_areas = &execmem_cache.free_areas;
+   struct mutex *mutex = &execmem_cache.mutex;
+   MA_STATE(mas, free_areas, 0, ULONG_MAX);
+   void *area;
+
+   mutex_lock(mutex);
+   mas_for_each(&mas, area, ULONG_MAX) {
+   size_t size;
+
+   if (!xa_is_value(area))
+   continue;
+
+   size = xa_to_value(area);
+
+   if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(mas.index, PMD_SIZE)) {
+   void *ptr = (void *)mas.index;
+
+   mas_erase(&mas);
+   vfree(ptr);
+   }
+   }
+   mutex_unlock(mutex);
+}
+
+static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
+
+static void execmem_invalidate(void *ptr, size_t size, bool writable)
+{
+   if (execmem_info->invalidate)
+   execmem_info->invalidate(ptr, size, writable);
+   else
+   memset(ptr, 0, size);
+}
+
+static void *execmem_vmalloc(struct execmem_range *range, size_t size,
+pgprot_t pgprot, unsigned long vm_flags)
 {
bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
-   unsigned long vm_flags  = VM_FLUSH_RESET_PERMS;
gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
+   unsigned int align = range->alignment;
unsigned long start = range->start;
unsigned long end = range->end;
-   unsigned int align = range->alignment;
-   pgprot_t pgprot = range->pgprot;
void *p;
 
if (kasan)
vm_flags |= VM_DEFER_KMEMLEAK;
 
-   p = __vmalloc_node_range(size, align, start, end, gfp_flags,
-pgprot, vm_flags, NUMA_NO_NODE,
+   if (vm_flags & VM_ALLOW_HUGE_VMAP)
+   align = PMD_SIZE;
+
+   p = __vmalloc_node_range(size, align, start, end, gfp_flags, pgprot,
+vm_flags, NUMA_NO_NODE,
 __builtin_return_address(0));
if (!p && range->fallback_start) {
start = range->fallback_start;
@@ -44,6 +102,199 @@ static void *__execmem_alloc(struct execmem_range *range, 
size_t size)
 

[RFC PATCH 5/7] x86/module: prepare module loading for ROX allocations of text

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

When module text memory will be allocated with ROX permissions, the
memory at the actual address where the module will live will contain
invalid instructions and there will be a writable copy that contains the
actual module code.

Update relocations and alternatives patching to deal with it.

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/um/kernel/um_arch.c   |  11 ++-
 arch/x86/entry/vdso/vma.c  |   3 +-
 arch/x86/include/asm/alternative.h |  14 +--
 arch/x86/kernel/alternative.c  | 152 +
 arch/x86/kernel/ftrace.c   |  41 +---
 arch/x86/kernel/module.c   |  17 ++--
 6 files changed, 140 insertions(+), 98 deletions(-)

diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index c5ca89e62552..5183c955974e 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -437,24 +437,25 @@ void __init arch_cpu_finalize_init(void)
os_check_bugs();
 }
 
-void apply_seal_endbr(s32 *start, s32 *end)
+void apply_seal_endbr(s32 *start, s32 *end, struct module *mod)
 {
 }
 
-void apply_retpolines(s32 *start, s32 *end)
+void apply_retpolines(s32 *start, s32 *end, struct module *mod)
 {
 }
 
-void apply_returns(s32 *start, s32 *end)
+void apply_returns(s32 *start, s32 *end, struct module *mod)
 {
 }
 
 void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
-  s32 *start_cfi, s32 *end_cfi)
+  s32 *start_cfi, s32 *end_cfi, struct module *mod)
 {
 }
 
-void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
+void apply_alternatives(struct alt_instr *start, struct alt_instr *end,
+   struct module *mod)
 {
 }
 
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 6d83ceb7f1ba..31412adef5d2 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -51,7 +51,8 @@ int __init init_vdso_image(const struct vdso_image *image)
 
apply_alternatives((struct alt_instr *)(image->data + image->alt),
   (struct alt_instr *)(image->data + image->alt +
-   image->alt_len));
+   image->alt_len),
+  NULL);
 
return 0;
 }
diff --git a/arch/x86/include/asm/alternative.h 
b/arch/x86/include/asm/alternative.h
index fcd20c6dc7f9..6f4b0776fc89 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -96,16 +96,16 @@ extern struct alt_instr __alt_instructions[], 
__alt_instructions_end[];
  * instructions were patched in already:
  */
 extern int alternatives_patched;
+struct module;
 
 extern void alternative_instructions(void);
-extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
-extern void apply_retpolines(s32 *start, s32 *end);
-extern void apply_returns(s32 *start, s32 *end);
-extern void apply_seal_endbr(s32 *start, s32 *end);
+extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end,
+  struct module *mod);
+extern void apply_retpolines(s32 *start, s32 *end, struct module *mod);
+extern void apply_returns(s32 *start, s32 *end, struct module *mod);
+extern void apply_seal_endbr(s32 *start, s32 *end, struct module *mod);
 extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine,
- s32 *start_cfi, s32 *end_cfi);
-
-struct module;
+ s32 *start_cfi, s32 *end_cfi, struct module *mod);
 
 struct callthunk_sites {
s32 *call_start, *call_end;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 45a280f2161c..b4d6868df573 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -462,7 +462,8 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, 
struct alt_instr *a)
  * to refetch changed I$ lines.
  */
 void __init_or_module noinline apply_alternatives(struct alt_instr *start,
- struct alt_instr *end)
+ struct alt_instr *end,
+ struct module *mod)
 {
struct alt_instr *a;
u8 *instr, *replacement;
@@ -490,10 +491,18 @@ void __init_or_module noinline apply_alternatives(struct 
alt_instr *start,
 * order.
 */
for (a = start; a < end; a++) {
+   unsigned long ins_offs, repl_offs;
int insn_buff_sz = 0;
+   u8 *wr_instr, *wr_replacement;
 
instr = (u8 *)&a->instr_offset + a->instr_offset;
+   ins_offs =  module_writable_offset(mod, instr);
+   wr_instr = instr + ins_offs;
+
replacement = (u8 *)&a->repl_offset + a->repl_offset;
+   repl_offs = module_writable_offset(mod, replacement);
+

[RFC PATCH 4/7] ftrace: Add swap_func to ftrace_process_locs()

2024-04-11 Thread Mike Rapoport
From: Song Liu 

ftrace_process_locs sorts the module's mcount locations, which reside in RO
memory. Add a ftrace_swap_func so that archs can use an RO-memory-poke
function to do the sorting.
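
An architecture whose ftrace records live in read-only memory could then
override it roughly as below (a sketch that assumes an x86-style
text_poke_copy(); this override is not part of the patch itself):

void ftrace_swap_func(void *a, void *b, int n)
{
	unsigned long t1, t2;

	WARN_ON_ONCE(n != sizeof(t1));

	t1 = *(unsigned long *)a;
	t2 = *(unsigned long *)b;

	/* write through the text poking machinery, not plain stores */
	text_poke_copy(a, &t2, sizeof(t2));
	text_poke_copy(b, &t1, sizeof(t1));
}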

Signed-off-by: Song Liu 
Signed-off-by: Mike Rapoport (IBM) 
---
 include/linux/ftrace.h |  2 ++
 kernel/trace/ftrace.c  | 13 -
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 54d53f345d14..54393ce57f08 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1172,4 +1172,6 @@ unsigned long arch_syscall_addr(int nr);
 
 #endif /* CONFIG_FTRACE_SYSCALLS */
 
+void ftrace_swap_func(void *a, void *b, int n);
+
 #endif /* _LINUX_FTRACE_H */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index da1710499698..95f11c8cdc5e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6480,6 +6480,17 @@ static void test_is_sorted(unsigned long *start, 
unsigned long count)
 }
 #endif
 
+void __weak ftrace_swap_func(void *a, void *b, int n)
+{
+   unsigned long t;
+
+   WARN_ON_ONCE(n != sizeof(t));
+
+   t = *((unsigned long *)a);
+   *(unsigned long *)a = *(unsigned long *)b;
+   *(unsigned long *)b = t;
+}
+
 static int ftrace_process_locs(struct module *mod,
   unsigned long *start,
   unsigned long *end)
@@ -6507,7 +6518,7 @@ static int ftrace_process_locs(struct module *mod,
 */
if (!IS_ENABLED(CONFIG_BUILDTIME_MCOUNT_SORT) || mod) {
sort(start, count, sizeof(*start),
-ftrace_cmp_ips, NULL);
+ftrace_cmp_ips, ftrace_swap_func);
} else {
test_is_sorted(start, count);
}
-- 
2.43.0




[RFC PATCH 3/7] module: prepare to handle ROX allocations for text

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

In order to support ROX allocations for module text, it is necessary to
handle modifications to the code, such as relocations and alternatives
patching, without write access to that memory.

One option is to use text patching, but this would make module loading
extremely slow and would expose executable code that is not yet in its
final form.

A better way is to have memory allocated with ROX permissions contain
invalid instructions and keep a writable, but not executable copy of the
module text. The relocations and alternative patches would be done on the
writable copy using the addresses of the ROX memory.
Once the module is completely ready, the updated text will be copied to ROX
memory using text patching in one go and the writable copy will be freed.

Add support for that to module initialization code and provide necessary
interfaces in execmem.
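
On the module loader side the resulting flow is roughly (a condensed
sketch of the changes below, not literal code):

	/* while the module is being formed, patch the writable copy */
	void *wr_loc = loc + module_writable_offset(mod, loc);

	*(u32 *)wr_loc = relocated_value;

	/* when everything is in place, push the copy into the ROX
	 * mapping and drop the scratch buffer */
	execmem_update_copy(mem->base, mem->rw_copy, mem->size);
	vfree(mem->rw_copy);
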
---
 include/linux/execmem.h| 23 +
 include/linux/module.h | 11 ++
 kernel/module/main.c   | 70 ++
 kernel/module/strict_rwx.c |  3 ++
 mm/execmem.c   | 11 ++
 5 files changed, 111 insertions(+), 7 deletions(-)

diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index ffd0d12feef5..9d22999dbd7d 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -46,9 +46,11 @@ enum execmem_type {
 /**
  * enum execmem_range_flags - options for executable memory allocations
  * @EXECMEM_KASAN_SHADOW:  allocate kasan shadow
+ * @EXECMEM_ROX_CACHE: allocated memory should be mapped as read-only executable
  */
 enum execmem_range_flags {
EXECMEM_KASAN_SHADOW= (1 << 0),
+   EXECMEM_ROX_CACHE   = (1 << 1),
 };
 
 /**
@@ -123,6 +125,27 @@ void *execmem_alloc(enum execmem_type type, size_t size);
  */
 void execmem_free(void *ptr);
 
+/**
+ * execmem_update_copy - copy an update to executable memory
+ * @dst:  destination address to update
+ * @src:  source address containing the data
+ * @size: how many bytes of memory should be copied
+ *
+ * Copy @size bytes from @src to @dst using text poking if the memory at
+ * @dst is read-only.
+ *
+ * Return: a pointer to @dst or NULL on error
+ */
+void *execmem_update_copy(void *dst, const void *src, size_t size);
+
+/**
+ * execmem_is_rox - check if execmem is read-only
+ * @type: the execmem type to check
+ *
+ * Return: %true if the @type is read-only, %false if it's writable
+ */
+bool execmem_is_rox(enum execmem_type type);
+
 #ifdef CONFIG_ARCH_WANTS_EXECMEM_EARLY
 void execmem_early_init(void);
 #else
diff --git a/include/linux/module.h b/include/linux/module.h
index 1153b0d99a80..3df3202680a2 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -361,6 +361,8 @@ enum mod_mem_type {
 
 struct module_memory {
void *base;
+   void *rw_copy;
+   bool is_rox;
unsigned int size;
 
 #ifdef CONFIG_MODULES_TREE_LOOKUP
@@ -368,6 +370,15 @@ struct module_memory {
 #endif
 };
 
+#ifdef CONFIG_MODULES
+unsigned long module_writable_offset(struct module *mod, void *loc);
+#else
+static inline unsigned long module_writable_offset(struct module *mod, void *loc)
+{
+   return 0;
+}
+#endif
+
 #ifdef CONFIG_MODULES_TREE_LOOKUP
 /* Only touch one cacheline for common rbtree-for-core-layout case. */
 #define __module_memory_align cacheline_aligned
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 91e185607d4b..f83fbb9c95ee 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1188,6 +1188,21 @@ void __weak module_arch_freeing_init(struct module *mod)
 {
 }
 
+unsigned long module_writable_offset(struct module *mod, void *loc)
+{
+   if (!mod)
+   return 0;
+
+   for_class_mod_mem_type(type, text) {
+   struct module_memory *mem = &mod->mem[type];
+
+   if (loc >= mem->base && loc < mem->base + mem->size)
+   return (unsigned long)(mem->rw_copy - mem->base);
+   }
+
+   return 0;
+}
+
 static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
 {
unsigned int size = PAGE_ALIGN(mod->mem[type].size);
@@ -1205,6 +1220,23 @@ static int module_memory_alloc(struct module *mod, enum 
mod_mem_type type)
if (!ptr)
return -ENOMEM;
 
+   mod->mem[type].base = ptr;
+
+   if (execmem_is_rox(execmem_type)) {
+   ptr = vzalloc(size);
+
+   if (!ptr) {
+   execmem_free(mod->mem[type].base);
+   return -ENOMEM;
+   }
+
+   mod->mem[type].rw_copy = ptr;
+   mod->mem[type].is_rox = true;
+   } else {
+   mod->mem[type].rw_copy = mod->mem[type].base;
+   memset(mod->mem[type].base, 0, size);
+   }
+
/*
 * The pointer to these blocks of memory are stored on the module
 * structure and w

[RFC PATCH 2/7] mm: vmalloc: don't account for number of nodes for HUGE_VMAP allocations

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

vmalloc allocations with VM_ALLOW_HUGE_VMAP that do not explicitly
specify node ID will use huge pages only if size_per_node is larger than
PMD_SIZE.
Still the actual allocated memory is not distributed between nodes and
there is no advantage in such approach.
On the contrary, BPF allocates PMD_SIZE * num_possible_nodes() for each
new bpf_prog_pack, while it could do with PMD_SIZE'ed packs.

Don't account for number of nodes for VM_ALLOW_HUGE_VMAP with
NUMA_NO_NODE and use huge pages whenever the requested allocation size
is larger than PMD_SIZE.

Signed-off-by: Mike Rapoport (IBM) 
---
 mm/vmalloc.c | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 22aa63f4ef63..5fc8b514e457 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3737,8 +3737,6 @@ void *__vmalloc_node_range(unsigned long size, unsigned 
long align,
}
 
if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
-   unsigned long size_per_node;
-
/*
 * Try huge pages. Only try for PAGE_KERNEL allocations,
 * others like modules don't yet expect huge pages in
@@ -3746,13 +3744,10 @@ void *__vmalloc_node_range(unsigned long size, unsigned 
long align,
 * supporting them.
 */
 
-   size_per_node = size;
-   if (node == NUMA_NO_NODE)
-   size_per_node /= num_online_nodes();
-   if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
+   if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE)
shift = PMD_SHIFT;
else
-   shift = arch_vmap_pte_supported_shift(size_per_node);
+   shift = arch_vmap_pte_supported_shift(size);
 
align = max(real_align, 1UL << shift);
size = ALIGN(real_size, 1UL << shift);
-- 
2.43.0




[RFC PATCH 1/7] asm-generic: introduce text-patching.h

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Several architectures support text patching, but they name the header
files that declare patching functions differently.

Make all such headers consistently named text-patching.h and add an empty
header in asm-generic for architectures that do not support text patching.
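
The generic header is essentially empty; a sketch of what it likely
contains (the file itself is not visible in the excerpt below):

/* include/asm-generic/text-patching.h */
#ifndef _ASM_GENERIC_TEXT_PATCHING_H
#define _ASM_GENERIC_TEXT_PATCHING_H

/* empty: architectures without text patching need no declarations */

#endif /* _ASM_GENERIC_TEXT_PATCHING_H */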

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/alpha/include/asm/Kbuild |  1 +
 arch/arc/include/asm/Kbuild   |  1 +
 arch/arm/include/asm/{patch.h => text-patching.h} |  0
 arch/arm/kernel/ftrace.c  |  2 +-
 arch/arm/kernel/jump_label.c  |  2 +-
 arch/arm/kernel/kgdb.c|  2 +-
 arch/arm/kernel/patch.c   |  2 +-
 arch/arm/probes/kprobes/core.c|  2 +-
 arch/arm/probes/kprobes/opt-arm.c |  2 +-
 .../include/asm/{patching.h => text-patching.h}   |  0
 arch/arm64/kernel/ftrace.c|  2 +-
 arch/arm64/kernel/jump_label.c|  2 +-
 arch/arm64/kernel/kgdb.c  |  2 +-
 arch/arm64/kernel/patching.c  |  2 +-
 arch/arm64/kernel/probes/kprobes.c|  2 +-
 arch/arm64/kernel/traps.c |  2 +-
 arch/arm64/net/bpf_jit_comp.c |  2 +-
 arch/csky/include/asm/Kbuild  |  1 +
 arch/hexagon/include/asm/Kbuild   |  1 +
 arch/loongarch/include/asm/Kbuild |  1 +
 arch/m68k/include/asm/Kbuild  |  1 +
 arch/microblaze/include/asm/Kbuild|  1 +
 arch/mips/include/asm/Kbuild  |  1 +
 arch/nios2/include/asm/Kbuild |  1 +
 arch/openrisc/include/asm/Kbuild  |  1 +
 .../include/asm/{patch.h => text-patching.h}  |  0
 arch/parisc/kernel/ftrace.c   |  2 +-
 arch/parisc/kernel/jump_label.c   |  2 +-
 arch/parisc/kernel/kgdb.c |  2 +-
 arch/parisc/kernel/kprobes.c  |  2 +-
 arch/parisc/kernel/patch.c|  2 +-
 arch/powerpc/include/asm/kprobes.h|  2 +-
 .../asm/{code-patching.h => text-patching.h}  |  0
 arch/powerpc/kernel/crash_dump.c  |  2 +-
 arch/powerpc/kernel/epapr_paravirt.c  |  2 +-
 arch/powerpc/kernel/jump_label.c  |  2 +-
 arch/powerpc/kernel/kgdb.c|  2 +-
 arch/powerpc/kernel/kprobes.c |  2 +-
 arch/powerpc/kernel/module_32.c   |  2 +-
 arch/powerpc/kernel/module_64.c   |  2 +-
 arch/powerpc/kernel/optprobes.c   |  2 +-
 arch/powerpc/kernel/process.c |  2 +-
 arch/powerpc/kernel/security.c|  2 +-
 arch/powerpc/kernel/setup_32.c|  2 +-
 arch/powerpc/kernel/setup_64.c|  2 +-
 arch/powerpc/kernel/static_call.c |  2 +-
 arch/powerpc/kernel/trace/ftrace.c|  2 +-
 arch/powerpc/kernel/trace/ftrace_64_pg.c  |  2 +-
 arch/powerpc/lib/code-patching.c  |  2 +-
 arch/powerpc/lib/feature-fixups.c |  2 +-
 arch/powerpc/lib/test-code-patching.c |  2 +-
 arch/powerpc/lib/test_emulate_step.c  |  2 +-
 arch/powerpc/mm/book3s32/mmu.c|  2 +-
 arch/powerpc/mm/book3s64/hash_utils.c |  2 +-
 arch/powerpc/mm/book3s64/slb.c|  2 +-
 arch/powerpc/mm/kasan/init_32.c   |  2 +-
 arch/powerpc/mm/mem.c |  2 +-
 arch/powerpc/mm/nohash/44x.c  |  2 +-
 arch/powerpc/mm/nohash/book3e_pgtable.c   |  2 +-
 arch/powerpc/mm/nohash/tlb.c  |  2 +-
 arch/powerpc/net/bpf_jit_comp.c   |  2 +-
 arch/powerpc/perf/8xx-pmu.c   |  2 +-
 arch/powerpc/perf/core-book3s.c   |  2 +-
 arch/powerpc/platforms/85xx/smp.c |  2 +-
 arch/powerpc/platforms/86xx/mpc86xx_smp.c |  2 +-
 arch/powerpc/platforms/cell/smp.c |  2 +-
 arch/powerpc/platforms/powermac/smp.c |  2 +-
 arch/powerpc/platforms/powernv/idle.c |  2 +-
 arch/powerpc/platforms/powernv/smp.c  |  2 +-
 arch/powerpc/platforms/pseries/smp.c  |  2 +-
 arch/powerpc/xmon/xmon.c  |  2 +-
 arch/riscv/errata/andes/errata.c  |  2 +-
 arch/riscv/errata/sifive/errata.c |  2 +-
 arch/riscv/errata/thead/errata.c  |  2 +-
 .../include/asm/{patch.h => text-patching.h}  |  0
 arch/riscv/include/asm/uprobes.h  |  2 +-
 arch/riscv/kernel/alternative.c   |  2 +-
 arch/riscv/kernel/cpufeature.c|  3 ++-
 arch/riscv/kernel/ftrace.c|

[RFC PATCH 0/7] x86/module: use large ROX pages for text allocations

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Hi,

These patches add support for using large ROX pages for allocations of
executable memory on x86.

They address Andy's comments [1] about having executable mappings for code
that was not completely formed.

The approach taken is to allocate ROX memory along with writable but not
executable memory and use the writable copy to perform relocations and
alternatives patching. After the module text gets into its final shape, the
contents of the writable memory is copied into the actual ROX location
using text poking.

The allocations of the ROX memory use vmalloc with VM_ALLOW_HUGE_VMAP to
allocate PMD-aligned memory, fill that memory with invalid instructions and
in the end remap it as ROX. Portions of these large pages are handed out to
execmem_alloc() callers without any changes to the permissions. When the
memory is freed with execmem_free() it is invalidated again so that it
won't contain stale instructions.

The module memory allocation, x86 code dealing with relocations and
alternatives patching takes into account the existence of the two copies,
the writable memory and the ROX memory at the actual allocated virtual
address.

This is an early RFC; it is not well tested and there is a lot of room for
improvement. For example, the locking of execmem_cache can be made more
fine-grained, freeing of PMD-sized pages from execmem_cache can be
implemented with a shrinker, and the large pages can be removed from the
direct map when they are added to the cache and restored there when they
are freed from the cache.

Still, I'd like to hear feedback on the approach in general before moving
forward with polishing the details.

The series applies on top of v4 of "jit/text allocator" [2] and also
available at git:

https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=execmem/v4%2bx86-rox

[1] 
https://lore.kernel.org/all/a17c65c6-863f-4026-9c6f-a04b659e9...@app.fastmail.com
[2] https://lore.kernel.org/linux-mm/20240411160051.2093261-1-r...@kernel.org 

Mike Rapoport (IBM) (6):
  asm-generic: introduce text-patching.h
  mm: vmalloc: don't account for number of nodes for HUGE_VMAP allocations
  module: prepare to handle ROX allocations for text
  x86/module: prepare module loading for ROX allocations of text
  execmem: add support for cache of large ROX pages
  x86/module: enable ROX caches for module text

Song Liu (1):
  ftrace: Add swap_func to ftrace_process_locs()

 arch/alpha/include/asm/Kbuild |   1 +
 arch/arc/include/asm/Kbuild   |   1 +
 .../include/asm/{patch.h => text-patching.h}  |   0
 arch/arm/kernel/ftrace.c  |   2 +-
 arch/arm/kernel/jump_label.c  |   2 +-
 arch/arm/kernel/kgdb.c|   2 +-
 arch/arm/kernel/patch.c   |   2 +-
 arch/arm/probes/kprobes/core.c|   2 +-
 arch/arm/probes/kprobes/opt-arm.c |   2 +-
 .../asm/{patching.h => text-patching.h}   |   0
 arch/arm64/kernel/ftrace.c|   2 +-
 arch/arm64/kernel/jump_label.c|   2 +-
 arch/arm64/kernel/kgdb.c  |   2 +-
 arch/arm64/kernel/patching.c  |   2 +-
 arch/arm64/kernel/probes/kprobes.c|   2 +-
 arch/arm64/kernel/traps.c |   2 +-
 arch/arm64/net/bpf_jit_comp.c |   2 +-
 arch/csky/include/asm/Kbuild  |   1 +
 arch/hexagon/include/asm/Kbuild   |   1 +
 arch/loongarch/include/asm/Kbuild |   1 +
 arch/m68k/include/asm/Kbuild  |   1 +
 arch/microblaze/include/asm/Kbuild|   1 +
 arch/mips/include/asm/Kbuild  |   1 +
 arch/nios2/include/asm/Kbuild |   1 +
 arch/openrisc/include/asm/Kbuild  |   1 +
 .../include/asm/{patch.h => text-patching.h}  |   0
 arch/parisc/kernel/ftrace.c   |   2 +-
 arch/parisc/kernel/jump_label.c   |   2 +-
 arch/parisc/kernel/kgdb.c |   2 +-
 arch/parisc/kernel/kprobes.c  |   2 +-
 arch/parisc/kernel/patch.c|   2 +-
 arch/powerpc/include/asm/kprobes.h|   2 +-
 .../asm/{code-patching.h => text-patching.h}  |   0
 arch/powerpc/kernel/crash_dump.c  |   2 +-
 arch/powerpc/kernel/epapr_paravirt.c  |   2 +-
 arch/powerpc/kernel/jump_label.c  |   2 +-
 arch/powerpc/kernel/kgdb.c|   2 +-
 arch/powerpc/kernel/kprobes.c |   2 +-
 arch/powerpc/kernel/module_32.c   |   2 +-
 arch/powerpc/kernel/module_64.c   |   2 +-
 arch/powerpc/kernel/optprobes.c   |   2 +-
 arch/powerpc/kernel/process.c |   2 +-
 arch/powerpc/kernel/security.c|   2 +-
 arch/powerpc/kernel/setup_32.c|   2 +-
 arch/powerpc/kernel/setup_64.c|   2 +-
 arch/powerpc/kernel

[PATCH v4 15/15] bpf: remove CONFIG_BPF_JIT dependency on CONFIG_MODULES of

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

BPF just-in-time compiler depended on CONFIG_MODULES because it used
module_alloc() to allocate memory for the generated code.

Since code allocations are now implemented with execmem, drop dependency of
CONFIG_BPF_JIT on CONFIG_MODULES and make it select CONFIG_EXECMEM.

Suggested-by: Björn Töpel 
Signed-off-by: Mike Rapoport (IBM) 
---
 kernel/bpf/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index bc25f5098a25..f999e4e0b344 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -43,7 +43,7 @@ config BPF_JIT
bool "Enable BPF Just In Time compiler"
depends on BPF
depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
-   depends on MODULES
+   select EXECMEM
help
  BPF programs are normally handled by a BPF interpreter. This option
  allows the kernel to generate native code when a program is loaded
-- 
2.43.0




[PATCH v4 14/15] kprobes: remove dependency on CONFIG_MODULES

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

kprobes depended on CONFIG_MODULES because it has to allocate memory for
code.

Since code allocations are now implemented with execmem, kprobes can be
enabled in non-modular kernels.

Add #ifdef CONFIG_MODULES guards for the code dealing with kprobes inside
modules, make CONFIG_KPROBES select CONFIG_EXECMEM and drop the
dependency of CONFIG_KPROBES on CONFIG_MODULES.
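
With that, a configuration like the following becomes valid (an
illustrative fragment):

# CONFIG_MODULES is not set
CONFIG_KPROBES=y
# EXECMEM is selected automatically by KPROBES:
CONFIG_EXECMEM=y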

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/Kconfig|  2 +-
 kernel/kprobes.c| 43 +
 kernel/trace/trace_kprobe.c | 11 ++
 3 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index bc9e8e5dccd5..68177adf61a0 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -52,9 +52,9 @@ config GENERIC_ENTRY
 
 config KPROBES
bool "Kprobes"
-   depends on MODULES
depends on HAVE_KPROBES
select KALLSYMS
+   select EXECMEM
select TASKS_RCU if PREEMPTION
help
  Kprobes allows you to trap at almost any kernel address and
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 047ca629ce49..90c056853e6f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1580,6 +1580,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
goto out;
}
 
+#ifdef CONFIG_MODULES
/* Check if 'p' is probing a module. */
*probed_mod = __module_text_address((unsigned long) p->addr);
if (*probed_mod) {
@@ -1603,6 +1604,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
ret = -ENOENT;
}
}
+#endif
+
 out:
preempt_enable();
jump_label_unlock();
@@ -2482,24 +2485,6 @@ int kprobe_add_area_blacklist(unsigned long start, 
unsigned long end)
return 0;
 }
 
-/* Remove all symbols in given area from kprobe blacklist */
-static void kprobe_remove_area_blacklist(unsigned long start, unsigned long 
end)
-{
-   struct kprobe_blacklist_entry *ent, *n;
-
-   list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) {
-   if (ent->start_addr < start || ent->start_addr >= end)
-   continue;
-   list_del(&ent->list);
-   kfree(ent);
-   }
-}
-
-static void kprobe_remove_ksym_blacklist(unsigned long entry)
-{
-   kprobe_remove_area_blacklist(entry, entry + 1);
-}
-
 int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
   char *type, char *sym)
 {
@@ -2564,6 +2549,25 @@ static int __init populate_kprobe_blacklist(unsigned 
long *start,
return ret ? : arch_populate_kprobe_blacklist();
 }
 
+#ifdef CONFIG_MODULES
+/* Remove all symbols in given area from kprobe blacklist */
+static void kprobe_remove_area_blacklist(unsigned long start, unsigned long 
end)
+{
+   struct kprobe_blacklist_entry *ent, *n;
+
+   list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) {
+   if (ent->start_addr < start || ent->start_addr >= end)
+   continue;
+   list_del(&ent->list);
+   kfree(ent);
+   }
+}
+
+static void kprobe_remove_ksym_blacklist(unsigned long entry)
+{
+   kprobe_remove_area_blacklist(entry, entry + 1);
+}
+
 static void add_module_kprobe_blacklist(struct module *mod)
 {
unsigned long start, end;
@@ -2665,6 +2669,7 @@ static struct notifier_block kprobe_module_nb = {
.notifier_call = kprobes_module_callback,
.priority = 0
 };
+#endif
 
 void kprobe_free_init_mem(void)
 {
@@ -2724,8 +2729,10 @@ static int __init init_kprobes(void)
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
+#ifdef CONFIG_MODULES
if (!err)
err = register_module_notifier(&kprobe_module_nb);
+#endif
 
kprobes_initialized = (err == 0);
kprobe_sysctls_init();
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 14099cc17fc9..f0610137d6a3 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -111,6 +111,7 @@ static nokprobe_inline bool 
trace_kprobe_within_module(struct trace_kprobe *tk,
return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
 }
 
+#ifdef CONFIG_MODULES
 static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
 {
char *p;
@@ -129,6 +130,12 @@ static nokprobe_inline bool 
trace_kprobe_module_exist(struct trace_kprobe *tk)
 
return ret;
 }
+#else
+static inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
+{
+   return false;
+}
+#endif
 
 static bool trace_kprobe_is_busy(struct dyn_event *ev)
 {
@@ -670,6 +677,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
return ret;
 }
 
+#ifdef CONFIG_MODULES
 /* Module notifier call back, checking event on the module */
 stat

[PATCH v4 13/15] powerpc: use CONFIG_EXECMEM instead of CONFIG_MODULES where appropriate

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

There are places where CONFIG_MODULES guards the code that depends on
memory allocation being done with module_alloc().

Replace CONFIG_MODULES with CONFIG_EXECMEM in such places.

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/powerpc/Kconfig | 2 +-
 arch/powerpc/include/asm/kasan.h | 2 +-
 arch/powerpc/kernel/head_8xx.S   | 4 ++--
 arch/powerpc/kernel/head_book3s_32.S | 6 +++---
 arch/powerpc/lib/code-patching.c | 2 +-
 arch/powerpc/mm/book3s32/mmu.c   | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1c4be3373686..2e586733a464 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -285,7 +285,7 @@ config PPC
select IOMMU_HELPER if PPC64
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
-   select KASAN_VMALLOCif KASAN && MODULES
+   select KASAN_VMALLOCif KASAN && EXECMEM
select LOCK_MM_AND_FIND_VMA
select MMU_GATHER_PAGE_SIZE
select MMU_GATHER_RCU_TABLE_FREE
diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
index 365d2720097c..b5bbb94c51f6 100644
--- a/arch/powerpc/include/asm/kasan.h
+++ b/arch/powerpc/include/asm/kasan.h
@@ -19,7 +19,7 @@
 
 #define KASAN_SHADOW_SCALE_SHIFT   3
 
-#if defined(CONFIG_MODULES) && defined(CONFIG_PPC32)
+#if defined(CONFIG_EXECMEM) && defined(CONFIG_PPC32)
 #define KASAN_KERN_START   ALIGN_DOWN(PAGE_OFFSET - SZ_256M, SZ_256M)
 #else
 #define KASAN_KERN_START   PAGE_OFFSET
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 647b0b445e89..edc479a7c2bc 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -199,12 +199,12 @@ instruction_counter:
mfspr   r10, SPRN_SRR0  /* Get effective address of fault */
INVALIDATE_ADJACENT_PAGES_CPU15(r10, r11)
mtspr   SPRN_MD_EPN, r10
-#ifdef CONFIG_MODULES
+#ifdef CONFIG_EXECMEM
mfcrr11
compare_to_kernel_boundary r10, r10
 #endif
mfspr   r10, SPRN_M_TWB /* Get level 1 table */
-#ifdef CONFIG_MODULES
+#ifdef CONFIG_EXECMEM
blt+3f
rlwinm  r10, r10, 0, 20, 31
orisr10, r10, (swapper_pg_dir - PAGE_OFFSET)@ha
diff --git a/arch/powerpc/kernel/head_book3s_32.S 
b/arch/powerpc/kernel/head_book3s_32.S
index c1d89764dd22..57196883a00e 100644
--- a/arch/powerpc/kernel/head_book3s_32.S
+++ b/arch/powerpc/kernel/head_book3s_32.S
@@ -419,14 +419,14 @@ InstructionTLBMiss:
  */
/* Get PTE (linux-style) and check access */
mfspr   r3,SPRN_IMISS
-#ifdef CONFIG_MODULES
+#ifdef CONFIG_EXECMEM
lis r1, TASK_SIZE@h /* check if kernel address */
cmplw   0,r1,r3
 #endif
mfspr   r2, SPRN_SDR1
li  r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC
rlwinm  r2, r2, 28, 0xf000
-#ifdef CONFIG_MODULES
+#ifdef CONFIG_EXECMEM
li  r0, 3
bgt-112f
lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha   /* if kernel address, 
use */
@@ -442,7 +442,7 @@ InstructionTLBMiss:
andc.   r1,r1,r2/* check access & ~permission */
bne-InstructionAddressInvalid /* return if access not permitted */
/* Convert linux-style PTE to low word of PPC-style PTE */
-#ifdef CONFIG_MODULES
+#ifdef CONFIG_EXECMEM
rlwimi  r2, r0, 0, 31, 31   /* userspace ? -> PP lsb */
 #endif
ori r1, r1, 0xe06   /* clear out reserved bits */
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index c6ab46156cda..7af791446ddf 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -225,7 +225,7 @@ void __init poking_init(void)
 
 static unsigned long get_patch_pfn(void *addr)
 {
-   if (IS_ENABLED(CONFIG_MODULES) && is_vmalloc_or_module_addr(addr))
+   if (IS_ENABLED(CONFIG_EXECMEM) && is_vmalloc_or_module_addr(addr))
return vmalloc_to_pfn(addr);
else
return __pa_symbol(addr) >> PAGE_SHIFT;
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 100f999871bc..625fe7d08e06 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -184,7 +184,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, 
unsigned long top)
 
 static bool is_module_segment(unsigned long addr)
 {
-   if (!IS_ENABLED(CONFIG_MODULES))
+   if (!IS_ENABLED(CONFIG_EXECMEM))
return false;
if (addr < ALIGN_DOWN(MODULES_VADDR, SZ_256M))
return false;
-- 
2.43.0




[PATCH v4 12/15] x86/ftrace: enable dynamic ftrace without CONFIG_MODULES

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Dynamic ftrace must allocate memory for code and this was impossible
without CONFIG_MODULES.

With execmem separated from the modules code, execmem_alloc() is
available regardless of CONFIG_MODULES.

Remove dependency of dynamic ftrace on CONFIG_MODULES and make
CONFIG_DYNAMIC_FTRACE select CONFIG_EXECMEM in Kconfig.

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/x86/Kconfig |  1 +
 arch/x86/kernel/ftrace.c | 10 --
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e87ddbdaaeb2..5100a769ffda 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -34,6 +34,7 @@ config X86_64
select SWIOTLB
select ARCH_HAS_ELFCORE_COMPAT
select ZONE_DMA32
+   select EXECMEM if DYNAMIC_FTRACE
 
 config FORCE_DYNAMIC_FTRACE
def_bool y
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index c8ddb7abda7c..8da0e66ca22d 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -261,8 +261,6 @@ void arch_ftrace_update_code(int command)
 /* Currently only x86_64 supports dynamic trampolines */
 #ifdef CONFIG_X86_64
 
-#ifdef CONFIG_MODULES
-/* Module allocation simplifies allocating memory for code */
 static inline void *alloc_tramp(unsigned long size)
 {
return execmem_alloc(EXECMEM_FTRACE, size);
@@ -271,14 +269,6 @@ static inline void tramp_free(void *tramp)
 {
execmem_free(tramp);
 }
-#else
-/* Trampolines can only be created if modules are supported */
-static inline void *alloc_tramp(unsigned long size)
-{
-   return NULL;
-}
-static inline void tramp_free(void *tramp) { }
-#endif
 
 /* Defined as markers to the end of the ftrace default trampolines */
 extern void ftrace_regs_caller_end(void);
-- 
2.43.0




[PATCH v4 11/15] arch: make execmem setup available regardless of CONFIG_MODULES

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

execmem does not depend on modules; on the contrary, modules use
execmem.

To make execmem available when CONFIG_MODULES=n, for instance for
kprobes, split execmem_params initialization out from
arch/*/kernel/module.c and compile it when CONFIG_EXECMEM=y.
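
As an illustration of the sparc side (the hunk for the new
arch/sparc/mm/execmem.c is not shown in this excerpt), the moved setup
would look roughly like this sketch, following the same pattern as the
other architectures:

#include <linux/mm.h>
#include <linux/execmem.h>

static struct execmem_info execmem_info __ro_after_init = {
    .ranges = {
        [EXECMEM_DEFAULT] = {
            .start = MODULES_VADDR,
            .end = MODULES_END,
            .pgprot = PAGE_KERNEL,
            .alignment = 1,
        },
    },
};

struct execmem_info __init *execmem_arch_setup(void)
{
    return &execmem_info;
}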

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/arm/kernel/module.c   |  40 --
 arch/arm/mm/init.c |  40 ++
 arch/arm64/kernel/module.c | 136 -
 arch/arm64/mm/init.c   | 136 +
 arch/loongarch/kernel/module.c |  18 -
 arch/loongarch/mm/init.c   |  20 +
 arch/mips/kernel/module.c  |  21 -
 arch/mips/mm/init.c|  22 ++
 arch/nios2/kernel/module.c |  18 -
 arch/nios2/mm/init.c   |  19 +
 arch/parisc/kernel/module.c|  19 -
 arch/parisc/mm/init.c  |  22 +-
 arch/powerpc/kernel/module.c   |  63 ---
 arch/powerpc/mm/mem.c  |  64 
 arch/riscv/kernel/module.c |  40 --
 arch/riscv/mm/init.c   |  41 ++
 arch/s390/kernel/module.c  |  25 --
 arch/s390/mm/init.c|  28 +++
 arch/sparc/kernel/module.c |  23 --
 arch/sparc/mm/Makefile |   2 +
 arch/sparc/mm/execmem.c|  25 ++
 arch/x86/kernel/module.c   |  25 --
 arch/x86/mm/init.c |  27 +++
 23 files changed, 445 insertions(+), 429 deletions(-)
 create mode 100644 arch/sparc/mm/execmem.c

diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index 32974758c73b..677f218f7e84 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -12,54 +12,14 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
-#include 
-#include 
 
 #include 
 #include 
 #include 
 #include 
 
-#ifdef CONFIG_XIP_KERNEL
-/*
- * The XIP kernel text is mapped in the module area for modules and
- * some other stuff to work without any indirect relocations.
- * MODULES_VADDR is redefined here and not in asm/memory.h to avoid
- * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off.
- */
-#undef MODULES_VADDR
-#define MODULES_VADDR  (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK)
-#endif
-
-#ifdef CONFIG_MMU
-static struct execmem_info execmem_info __ro_after_init = {
-   .ranges = {
-   [EXECMEM_DEFAULT] = {
-   .start = MODULES_VADDR,
-   .end = MODULES_END,
-   .alignment = 1,
-   },
-   },
-};
-
-struct execmem_info __init *execmem_arch_setup(void)
-{
-   struct execmem_range *r = &execmem_info.ranges[EXECMEM_DEFAULT];
-
-   r->pgprot = PAGE_KERNEL_EXEC;
-
-   if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) {
-   r->fallback_start = VMALLOC_START;
-   r->fallback_end = VMALLOC_END;
-   }
-
-   return &execmem_info;
-}
-#endif
-
 bool module_init_section(const char *name)
 {
return strstarts(name, ".init") ||
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index e8c6f4be0ce1..e54338825156 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 
 #include 
 #include 
@@ -486,3 +487,42 @@ void free_initrd_mem(unsigned long start, unsigned long 
end)
free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
+
+#ifdef CONFIG_EXECMEM
+#ifdef CONFIG_XIP_KERNEL
+/*
+ * The XIP kernel text is mapped in the module area for modules and
+ * some other stuff to work without any indirect relocations.
+ * MODULES_VADDR is redefined here and not in asm/memory.h to avoid
+ * recompiling the whole kernel when CONFIG_XIP_KERNEL is turned on/off.
+ */
+#undef MODULES_VADDR
+#define MODULES_VADDR  (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK)
+#endif
+
+#ifdef CONFIG_MMU
+static struct execmem_info execmem_info __ro_after_init = {
+   .ranges = {
+   [EXECMEM_DEFAULT] = {
+   .start = MODULES_VADDR,
+   .end = MODULES_END,
+   .alignment = 1,
+   },
+   },
+};
+
+struct execmem_info __init *execmem_arch_setup(void)
+{
+   struct execmem_range *r = &execmem_info.ranges[EXECMEM_DEFAULT];
+
+   r->pgprot = PAGE_KERNEL_EXEC;
+
+   if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) {
+   r->fallback_start = VMALLOC_START;
+   r->fallback_end = VMALLOC_END;
+   }
+
+   return &execmem_info;
+}
+#endif
+#endif /* CONFIG_EXECMEM */
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index aa9e2b3d7459..36b25af56324 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -12,154 +12,18 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
-#include 
-#include 
 
 #include 
 #include 
 #include 
 #include 
 
-st

[PATCH v4 10/15] powerpc: extend execmem_params for kprobes allocations

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

powerpc overrides kprobes::alloc_insn_page() to remove writable
permissions when STRICT_MODULE_RWX is on.

Add definition of EXECMEM_KPROBES to execmem_params to allow using the
generic kprobes::alloc_insn_page() with the desired permissions.

As powerpc uses breakpoint instructions to inject kprobes, it does not
need to constrain kprobe allocations to the modules area and can use the
entire vmalloc address space.
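
With the override gone, kprobes falls back to the generic
alloc_insn_page() which, as of patch 05 in this series, boils down to
roughly:

void *alloc_insn_page(void)
{
    return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
}

so the powerpc-specific behaviour is now fully described by the
EXECMEM_KPROBES range defined below.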

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/powerpc/kernel/kprobes.c | 20 
 arch/powerpc/kernel/module.c  | 11 +++
 2 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 9fcd01bb2ce6..14c5ddec3056 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -126,26 +126,6 @@ kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long 
addr, unsigned long offse
return (kprobe_opcode_t *)(addr + offset);
 }
 
-void *alloc_insn_page(void)
-{
-   void *page;
-
-   page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
-   if (!page)
-   return NULL;
-
-   if (strict_module_rwx_enabled()) {
-   int err = set_memory_rox((unsigned long)page, 1);
-
-   if (err)
-   goto error;
-   }
-   return page;
-error:
-   execmem_free(page);
-   return NULL;
-}
-
 int arch_prepare_kprobe(struct kprobe *p)
 {
int ret = 0;
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index 5a1d0490c831..a1eaa74f2d41 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -95,6 +95,9 @@ static struct execmem_info execmem_info __ro_after_init = {
[EXECMEM_DEFAULT] = {
.alignment = 1,
},
+   [EXECMEM_KPROBES] = {
+   .alignment = 1,
+   },
[EXECMEM_MODULE_DATA] = {
.alignment = 1,
},
@@ -137,5 +140,13 @@ struct execmem_info __init *execmem_arch_setup(void)
 
text->pgprot = prot;
 
+   execmem_info.ranges[EXECMEM_KPROBES].start = VMALLOC_START;
+   execmem_info.ranges[EXECMEM_KPROBES].end = VMALLOC_END;
+
+   if (strict_module_rwx_enabled())
+   execmem_info.ranges[EXECMEM_KPROBES].pgprot = PAGE_KERNEL_ROX;
+   else
+   execmem_info.ranges[EXECMEM_KPROBES].pgprot = PAGE_KERNEL_EXEC;
+
return &execmem_info;
 }
-- 
2.43.0




[PATCH v4 09/15] riscv: extend execmem_params for generated code allocations

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

The memory allocations for kprobes and BPF on RISC-V are not placed in
the modules area and these custom allocations are implemented with
overrides of alloc_insn_page() and bpf_jit_alloc_exec().

Slightly reorder execmem_params initialization to support both 32 and 64
bit variants, define EXECMEM_KPROBES and EXECMEM_BPF ranges in
riscv::execmem_params and drop overrides of alloc_insn_page() and
bpf_jit_alloc_exec().

Signed-off-by: Mike Rapoport (IBM) 
Reviewed-by: Alexandre Ghiti 
---
 arch/riscv/kernel/module.c | 21 -
 arch/riscv/kernel/probes/kprobes.c | 10 --
 arch/riscv/net/bpf_jit_core.c  | 13 -
 3 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
index ad32e2a8621a..aad158bb2022 100644
--- a/arch/riscv/kernel/module.c
+++ b/arch/riscv/kernel/module.c
@@ -906,20 +906,39 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char 
*strtab,
return 0;
 }
 
-#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
+#ifdef CONFIG_MMU
 static struct execmem_info execmem_info __ro_after_init = {
.ranges = {
[EXECMEM_DEFAULT] = {
.pgprot = PAGE_KERNEL,
.alignment = 1,
},
+   [EXECMEM_KPROBES] = {
+   .pgprot = PAGE_KERNEL_READ_EXEC,
+   .alignment = 1,
+   },
+   [EXECMEM_BPF] = {
+   .pgprot = PAGE_KERNEL,
+   .alignment = PAGE_SIZE,
+   },
},
 };
 
 struct execmem_info __init *execmem_arch_setup(void)
 {
+#ifdef CONFIG_64BIT
execmem_info.ranges[EXECMEM_DEFAULT].start = MODULES_VADDR;
execmem_info.ranges[EXECMEM_DEFAULT].end = MODULES_END;
+#else
+   execmem_info.ranges[EXECMEM_DEFAULT].start = VMALLOC_START;
+   execmem_info.ranges[EXECMEM_DEFAULT].end = VMALLOC_END;
+#endif
+
+   execmem_info.ranges[EXECMEM_KPROBES].start = VMALLOC_START;
+   execmem_info.ranges[EXECMEM_KPROBES].end = VMALLOC_END;
+
+   execmem_info.ranges[EXECMEM_BPF].start = BPF_JIT_REGION_START;
+   execmem_info.ranges[EXECMEM_BPF].end = BPF_JIT_REGION_END;
 
return &execmem_info;
 }
diff --git a/arch/riscv/kernel/probes/kprobes.c 
b/arch/riscv/kernel/probes/kprobes.c
index 2f08c14a933d..e64f2f3064eb 100644
--- a/arch/riscv/kernel/probes/kprobes.c
+++ b/arch/riscv/kernel/probes/kprobes.c
@@ -104,16 +104,6 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
return 0;
 }
 
-#ifdef CONFIG_MMU
-void *alloc_insn_page(void)
-{
-   return  __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
-GFP_KERNEL, PAGE_KERNEL_READ_EXEC,
-VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
-__builtin_return_address(0));
-}
-#endif
-
 /* install breakpoint in text */
 void __kprobes arch_arm_kprobe(struct kprobe *p)
 {
diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c
index 6b3acac30c06..e238fdbd5dbc 100644
--- a/arch/riscv/net/bpf_jit_core.c
+++ b/arch/riscv/net/bpf_jit_core.c
@@ -219,19 +219,6 @@ u64 bpf_jit_alloc_exec_limit(void)
return BPF_JIT_REGION_SIZE;
 }
 
-void *bpf_jit_alloc_exec(unsigned long size)
-{
-   return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
-   BPF_JIT_REGION_END, GFP_KERNEL,
-   PAGE_KERNEL, 0, NUMA_NO_NODE,
-   __builtin_return_address(0));
-}
-
-void bpf_jit_free_exec(void *addr)
-{
-   return vfree(addr);
-}
-
 void *bpf_arch_text_copy(void *dst, void *src, size_t len)
 {
int ret;
-- 
2.43.0




[PATCH v4 08/15] arm64: extend execmem_info for generated code allocations

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

The memory allocations for kprobes and BPF on arm64 can be placed
anywhere in vmalloc address space and currently this is implemented with
overrides of alloc_insn_page() and bpf_jit_alloc_exec() in arm64.

Define EXECMEM_KPROBES and EXECMEM_BPF ranges in arm64::execmem_info and
drop overrides of alloc_insn_page() and bpf_jit_alloc_exec().

Signed-off-by: Mike Rapoport (IBM) 
Acked-by: Will Deacon 
---
 arch/arm64/kernel/module.c | 14 ++
 arch/arm64/kernel/probes/kprobes.c |  7 ---
 arch/arm64/net/bpf_jit_comp.c  | 11 ---
 3 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index a377a3217cf2..aa9e2b3d7459 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -115,6 +115,12 @@ static struct execmem_info execmem_info __ro_after_init = {
[EXECMEM_DEFAULT] = {
.alignment = MODULE_ALIGN,
},
+   [EXECMEM_KPROBES] = {
+   .alignment = 1,
+   },
+   [EXECMEM_BPF] = {
+   .alignment = 1,
+   },
},
 };
 
@@ -143,6 +149,14 @@ struct execmem_info __init *execmem_arch_setup(void)
r->end = module_plt_base + SZ_2G;
}
 
+   execmem_info.ranges[EXECMEM_KPROBES].pgprot = PAGE_KERNEL_ROX;
+   execmem_info.ranges[EXECMEM_KPROBES].start = VMALLOC_START;
+   execmem_info.ranges[EXECMEM_KPROBES].end = VMALLOC_END;
+
+   execmem_info.ranges[EXECMEM_BPF].pgprot = PAGE_KERNEL;
+   execmem_info.ranges[EXECMEM_BPF].start = VMALLOC_START;
+   execmem_info.ranges[EXECMEM_BPF].end = VMALLOC_END;
+
return &execmem_info;
 }
 
diff --git a/arch/arm64/kernel/probes/kprobes.c 
b/arch/arm64/kernel/probes/kprobes.c
index 327855a11df2..4268678d0e86 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -129,13 +129,6 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
return 0;
 }
 
-void *alloc_insn_page(void)
-{
-   return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
-   GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
-   NUMA_NO_NODE, __builtin_return_address(0));
-}
-
 /* arm kprobe: install breakpoint in text */
 void __kprobes arch_arm_kprobe(struct kprobe *p)
 {
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 122021f9bdfc..456f5af239fc 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1793,17 +1793,6 @@ u64 bpf_jit_alloc_exec_limit(void)
return VMALLOC_END - VMALLOC_START;
 }
 
-void *bpf_jit_alloc_exec(unsigned long size)
-{
-   /* Memory is intended to be executable, reset the pointer tag. */
-   return kasan_reset_tag(vmalloc(size));
-}
-
-void bpf_jit_free_exec(void *addr)
-{
-   return vfree(addr);
-}
-
 /* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. */
 bool bpf_jit_supports_subprog_tailcalls(void)
 {
-- 
2.43.0




[PATCH v4 07/15] mm/execmem, arch: convert remaining overrides of module_alloc to execmem

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Extend execmem parameters to accommodate more complex overrides of
module_alloc() by architectures.

This includes specification of a fallback range required by arm, arm64
and powerpc, EXECMEM_MODULE_DATA type required by powerpc, support for
allocation of KASAN shadow required by s390 and x86, and support for
early initialization of execmem required by x86.

The core implementation of execmem_alloc() takes care of suppressing
warnings when the initial allocation fails but there is a fallback range
defined.
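
As a simplified sketch (execmem_alloc_range() is an illustrative name,
not the exact patch code), the fallback handling in the core looks like:

static void *execmem_alloc_range(struct execmem_range *range, size_t size)
{
    gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
    unsigned long start = range->start;
    unsigned long end = range->end;
    void *p;

    p = __vmalloc_node_range(size, range->alignment, start, end,
                             gfp_flags, range->pgprot,
                             VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
                             __builtin_return_address(0));
    if (!p && range->fallback_start) {
        /* retry from the fallback range before warning */
        start = range->fallback_start;
        end = range->fallback_end;
        p = __vmalloc_node_range(size, range->alignment, start, end,
                                 gfp_flags, range->pgprot,
                                 VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
                                 __builtin_return_address(0));
    }
    if (!p)
        pr_warn_ratelimited("execmem: unable to allocate memory\n");

    return p;
}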

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/Kconfig |  6 
 arch/arm/kernel/module.c | 38 +++-
 arch/arm64/kernel/module.c   | 49 -
 arch/powerpc/kernel/module.c | 58 ++
 arch/s390/kernel/module.c| 52 +++
 arch/x86/Kconfig |  1 +
 arch/x86/kernel/module.c | 62 ++--
 include/linux/execmem.h  | 34 ++
 include/linux/moduleloader.h | 12 ---
 kernel/module/main.c | 26 --
 mm/execmem.c | 70 +---
 mm/mm_init.c |  2 ++
 12 files changed, 228 insertions(+), 182 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 9f066785bb71..bc9e8e5dccd5 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -960,6 +960,12 @@ config ARCH_WANTS_MODULES_DATA_IN_VMALLOC
  For architectures like powerpc/32 which have constraints on module
  allocation and need to allocate module data outside of module area.
 
+config ARCH_WANTS_EXECMEM_EARLY
+   bool
+   help
+ For architectures that might allocate executable memory early during
+ boot, for instance ftrace on x86.
+
 config HAVE_IRQ_EXIT_ON_IRQ_STACK
bool
help
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index e74d84f58b77..32974758c73b 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 
 #include 
 #include 
@@ -34,23 +35,28 @@
 #endif
 
 #ifdef CONFIG_MMU
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init = {
+   .ranges = {
+   [EXECMEM_DEFAULT] = {
+   .start = MODULES_VADDR,
+   .end = MODULES_END,
+   .alignment = 1,
+   },
+   },
+};
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-   gfp_t gfp_mask = GFP_KERNEL;
-   void *p;
-
-   /* Silence the initial allocation */
-   if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
-   gfp_mask |= __GFP_NOWARN;
-
-   p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-   gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-   __builtin_return_address(0));
-   if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
-   return p;
-   return __vmalloc_node_range(size, 1,  VMALLOC_START, VMALLOC_END,
-   GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-   __builtin_return_address(0));
+   struct execmem_range *r = &execmem_info.ranges[EXECMEM_DEFAULT];
+
+   r->pgprot = PAGE_KERNEL_EXEC;
+
+   if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) {
+   r->fallback_start = VMALLOC_START;
+   r->fallback_end = VMALLOC_END;
+   }
+
+   return &execmem_info;
 }
 #endif
 
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index e92da4da1b2a..a377a3217cf2 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 
 #include 
 #include 
@@ -108,41 +109,41 @@ static int __init module_init_limits(void)
 
return 0;
 }
-subsys_initcall(module_init_limits);
 
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init = {
+   .ranges = {
+   [EXECMEM_DEFAULT] = {
+   .alignment = MODULE_ALIGN,
+   },
+   },
+};
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-   void *p = NULL;
+   struct execmem_range *r = &execmem_info.ranges[EXECMEM_DEFAULT];
+
+   module_init_limits();
+
+   r->pgprot = PAGE_KERNEL;
 
/*
 * Where possible, prefer to allocate within direct branch range of the
 * kernel such that no PLTs are necessary.
 */
if (module_direct_base) {
-   p = __vmalloc_node_range(size, MODULE_ALIGN,
-module_direct_base,
-module_direct_base + SZ_128M,
-GFP_KERNEL | __GFP_NOWARN,
-PAGE_KERNEL, 0, NUMA_NO_NODE,
-__bu

[PATCH v4 06/15] mm/execmem, arch: convert simple overrides of module_alloc to execmem

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Several architectures override module_alloc() only to define an address
range for code allocations different from the VMALLOC address space.

Provide a generic implementation in execmem that uses the parameters for
address space ranges, required alignment and page protections provided
by architectures.

The architectures must fill execmem_info structure and implement
execmem_arch_setup() that returns a pointer to that structure. This way the
execmem initialization won't be called from every architecture, but rather
from a central place, namely a core_initcall() in execmem.

The execmem provides execmem_alloc() API that wraps __vmalloc_node_range()
with the parameters defined by the architectures.  If an architecture does
not implement execmem_arch_setup(), execmem_alloc() will fall back to
module_alloc().
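
A minimal sketch of the resulting generic path (simplified; every range
parameter below is filled in by the architecture via execmem_arch_setup()):

void *execmem_alloc(enum execmem_type type, size_t size)
{
    struct execmem_range *range = &execmem_info->ranges[type];

    return __vmalloc_node_range(size, range->alignment, range->start,
                                range->end, GFP_KERNEL, range->pgprot,
                                VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
                                __builtin_return_address(0));
}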

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/loongarch/kernel/module.c | 18 +++--
 arch/mips/kernel/module.c  | 19 +++--
 arch/nios2/kernel/module.c | 19 ++---
 arch/parisc/kernel/module.c| 23 +++
 arch/riscv/kernel/module.c | 21 +++---
 arch/sparc/kernel/module.c | 41 ---
 include/linux/execmem.h| 41 +++
 mm/execmem.c   | 73 --
 8 files changed, 202 insertions(+), 53 deletions(-)

diff --git a/arch/loongarch/kernel/module.c b/arch/loongarch/kernel/module.c
index c7d0338d12c1..78c6a68f6c3c 100644
--- a/arch/loongarch/kernel/module.c
+++ b/arch/loongarch/kernel/module.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 #include 
 #include 
 #include 
@@ -490,10 +491,21 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char 
*strtab,
return 0;
 }
 
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init = {
+   .ranges = {
+   [EXECMEM_DEFAULT] = {
+   .pgprot = PAGE_KERNEL,
+   .alignment = 1,
+   },
+   },
+};
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-   GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, 
__builtin_return_address(0));
+   execmem_info.ranges[EXECMEM_DEFAULT].start = MODULES_VADDR;
+   execmem_info.ranges[EXECMEM_DEFAULT].end = MODULES_END;
+
+   return &execmem_info;
 }
 
 static void module_init_ftrace_plt(const Elf_Ehdr *hdr,
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 9a6c96014904..50505e910763 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 #include 
 
 struct mips_hi16 {
@@ -32,11 +33,21 @@ static LIST_HEAD(dbe_list);
 static DEFINE_SPINLOCK(dbe_lock);
 
 #ifdef MODULES_VADDR
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init = {
+   .ranges = {
+   [EXECMEM_DEFAULT] = {
+   .start = MODULES_VADDR,
+   .end = MODULES_END,
+   .alignment = 1,
+   },
+   },
+};
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-   GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
-   __builtin_return_address(0));
+   execmem_info.ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL;
+
+   return &execmem_info;
 }
 #endif
 
diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c
index 9c97b7513853..2b68ef8aad42 100644
--- a/arch/nios2/kernel/module.c
+++ b/arch/nios2/kernel/module.c
@@ -18,15 +18,24 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 
 #include 
 
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init = {
+   .ranges = {
+   [EXECMEM_DEFAULT] = {
+   .start = MODULES_VADDR,
+   .end = MODULES_END,
+   .pgprot = PAGE_KERNEL_EXEC,
+   .alignment = 1,
+   },
+   },
+};
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-   GFP_KERNEL, PAGE_KERNEL_EXEC,
-   VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
-   __builtin_return_address(0));
+   return &execmem_info;
 }
 
 int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab,
diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c
index d214bbe3c2af..721324c42b7d 100644
--- a/arch/parisc/kernel/module.c
+++ b/arch/parisc/kernel/module.c
@@ -49,6 +49,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 
 #include 
 #include 
@@ -173,15 +174,21 @@ static inline int reassemble_22(int as22)
((as22 & 0x0003ff) <

[PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

module_alloc() is used everywhere as a means to allocate memory for code.

Besides being semantically wrong, this unnecessarily ties all subsystems
that need to allocate code, such as ftrace, kprobes and BPF to modules and
puts the burden of code allocation to the modules code.

Several architectures override module_alloc() because of various
constraints where the executable memory can be located and this causes
additional obstacles for improvements of code allocation.

Start splitting code allocation from modules by introducing execmem_alloc()
and execmem_free() APIs.

Initially, execmem_alloc() is a wrapper for module_alloc() and
execmem_free() is a replacement of module_memfree() to allow updating all
call sites to use the new APIs.

Since architectures define different restrictions on placement,
permissions, alignment and other parameters for memory that can be used by
different subsystems that allocate executable memory, execmem_alloc() takes
a type argument that will be used to identify the calling subsystem and to
allow architectures to define parameters for ranges suitable for that
subsystem.
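
At this stage mm/execmem.c is intentionally a thin shim, roughly:

void *execmem_alloc(enum execmem_type type, size_t size)
{
    return module_alloc(size);
}

void execmem_free(void *ptr)
{
    /*
     * This memory may be RO, and freeing RO memory in an interrupt
     * is not supported by vmalloc.
     */
    WARN_ON(in_interrupt());
    vfree(ptr);
}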

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/powerpc/kernel/kprobes.c|  6 ++--
 arch/s390/kernel/ftrace.c|  4 +--
 arch/s390/kernel/kprobes.c   |  4 +--
 arch/s390/kernel/module.c|  5 +--
 arch/sparc/net/bpf_jit_comp_32.c |  8 ++---
 arch/x86/kernel/ftrace.c |  6 ++--
 arch/x86/kernel/kprobes/core.c   |  4 +--
 include/linux/execmem.h  | 57 
 include/linux/moduleloader.h |  3 --
 kernel/bpf/core.c|  6 ++--
 kernel/kprobes.c |  8 ++---
 kernel/module/Kconfig|  1 +
 kernel/module/main.c | 25 +-
 mm/Kconfig   |  3 ++
 mm/Makefile  |  1 +
 mm/execmem.c | 26 +++
 16 files changed, 122 insertions(+), 45 deletions(-)
 create mode 100644 include/linux/execmem.h
 create mode 100644 mm/execmem.c

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index bbca90a5e2ec..9fcd01bb2ce6 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -19,8 +19,8 @@
 #include 
 #include 
 #include 
-#include <linux/moduleloader.h>
 #include 
+#include <linux/execmem.h>
 #include 
 #include 
 #include 
@@ -130,7 +130,7 @@ void *alloc_insn_page(void)
 {
void *page;
 
-   page = module_alloc(PAGE_SIZE);
+   page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
if (!page)
return NULL;
 
@@ -142,7 +142,7 @@ void *alloc_insn_page(void)
}
return page;
 error:
-   module_memfree(page);
+   execmem_free(page);
return NULL;
 }
 
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index c46381ea04ec..798249ef5646 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -7,13 +7,13 @@
  *   Author(s): Martin Schwidefsky 
  */
 
-#include <linux/moduleloader.h>
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 #include 
 #include 
 #include 
@@ -220,7 +220,7 @@ static int __init ftrace_plt_init(void)
 {
const char *start, *end;
 
-   ftrace_plt = module_alloc(PAGE_SIZE);
+   ftrace_plt = execmem_alloc(EXECMEM_FTRACE, PAGE_SIZE);
if (!ftrace_plt)
panic("cannot allocate ftrace plt\n");
 
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index f0cf20d4b3c5..3c1b1be744de 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -9,7 +9,6 @@
 
 #define pr_fmt(fmt) "kprobes: " fmt
 
-#include <linux/moduleloader.h>
 #include 
 #include 
 #include 
@@ -21,6 +20,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 #include 
 #include 
 #include 
@@ -38,7 +38,7 @@ void *alloc_insn_page(void)
 {
void *page;
 
-   page = module_alloc(PAGE_SIZE);
+   page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
if (!page)
return NULL;
set_memory_rox((unsigned long)page, 1);
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 42215f9404af..ac97a905e8cd 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include <linux/execmem.h>
 #include 
 #include 
 #include 
@@ -76,7 +77,7 @@ void *module_alloc(unsigned long size)
 #ifdef CONFIG_FUNCTION_TRACER
 void module_arch_cleanup(struct module *mod)
 {
-   module_memfree(mod->arch.trampolines_start);
+   execmem_free(mod->arch.trampolines_start);
 }
 #endif
 
@@ -510,7 +511,7 @@ static int module_alloc_ftrace_hotpatch_trampolines(struct 
module *me,
 
size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size);
numpages = DIV_ROUND_UP(size, PAGE_SIZE);
-   start = module_alloc(numpages * PAGE_SIZE);
+   start = execmem_alloc(EXECMEM_FTRACE, numpages * PAGE_SIZE);
if (!start)
return -ENOMEM;
set_memory_rox((uns

[PATCH v4 04/15] module: make module_memory_{alloc,free} more self-contained

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Move the logic related to the memory allocation and freeing into
module_memory_alloc() and module_memory_free().

Signed-off-by: Mike Rapoport (IBM) 
---
 kernel/module/main.c | 64 +++-
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/kernel/module/main.c b/kernel/module/main.c
index e1e8a7a9d6c1..5b82b069e0d3 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1203,15 +1203,44 @@ static bool mod_mem_use_vmalloc(enum mod_mem_type type)
mod_mem_type_is_core_data(type);
 }
 
-static void *module_memory_alloc(unsigned int size, enum mod_mem_type type)
+static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
 {
+   unsigned int size = PAGE_ALIGN(mod->mem[type].size);
+   void *ptr;
+
+   mod->mem[type].size = size;
+
if (mod_mem_use_vmalloc(type))
-   return vzalloc(size);
-   return module_alloc(size);
+   ptr = vmalloc(size);
+   else
+   ptr = module_alloc(size);
+
+   if (!ptr)
+   return -ENOMEM;
+
+   /*
+* The pointer to these blocks of memory are stored on the module
+* structure and we keep that around so long as the module is
+* around. We only free that memory when we unload the module.
+* Just mark them as not being a leak then. The .init* ELF
+* sections *do* get freed after boot so we *could* treat them
+* slightly differently with kmemleak_ignore() and only grey
+* them out as they work as typical memory allocations which
+* *do* eventually get freed, but let's just keep things simple
+* and avoid *any* false positives.
+*/
+   kmemleak_not_leak(ptr);
+
+   memset(ptr, 0, size);
+   mod->mem[type].base = ptr;
+
+   return 0;
 }
 
-static void module_memory_free(void *ptr, enum mod_mem_type type)
+static void module_memory_free(struct module *mod, enum mod_mem_type type)
 {
+   void *ptr = mod->mem[type].base;
+
if (mod_mem_use_vmalloc(type))
vfree(ptr);
else
@@ -1229,12 +1258,12 @@ static void free_mod_mem(struct module *mod)
/* Free lock-classes; relies on the preceding sync_rcu(). */
lockdep_free_key_range(mod_mem->base, mod_mem->size);
if (mod_mem->size)
-   module_memory_free(mod_mem->base, type);
+   module_memory_free(mod, type);
}
 
/* MOD_DATA hosts mod, so free it at last */
lockdep_free_key_range(mod->mem[MOD_DATA].base, 
mod->mem[MOD_DATA].size);
-   module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA);
+   module_memory_free(mod, MOD_DATA);
 }
 
 /* Free a module, remove from lists, etc. */
@@ -2225,7 +2254,6 @@ static int find_module_sections(struct module *mod, 
struct load_info *info)
 static int move_module(struct module *mod, struct load_info *info)
 {
int i;
-   void *ptr;
enum mod_mem_type t = 0;
int ret = -ENOMEM;
 
@@ -2234,26 +2262,12 @@ static int move_module(struct module *mod, struct 
load_info *info)
mod->mem[type].base = NULL;
continue;
}
-   mod->mem[type].size = PAGE_ALIGN(mod->mem[type].size);
-   ptr = module_memory_alloc(mod->mem[type].size, type);
-   /*
- * The pointer to these blocks of memory are stored on the 
module
- * structure and we keep that around so long as the module is
- * around. We only free that memory when we unload the module.
- * Just mark them as not being a leak then. The .init* ELF
- * sections *do* get freed after boot so we *could* treat them
- * slightly differently with kmemleak_ignore() and only grey
- * them out as they work as typical memory allocations which
- * *do* eventually get freed, but let's just keep things simple
- * and avoid *any* false positives.
-*/
-   kmemleak_not_leak(ptr);
-   if (!ptr) {
+
+   ret = module_memory_alloc(mod, type);
+   if (ret) {
t = type;
goto out_enomem;
}
-   memset(ptr, 0, mod->mem[type].size);
-   mod->mem[type].base = ptr;
}
 
/* Transfer each section which specifies SHF_ALLOC */
@@ -2296,7 +2310,7 @@ static int move_module(struct module *mod, struct 
load_info *info)
return 0;
 out_enomem:
for (t--; t >= 0; t--)
-   module_memory_free(mod->mem[t].base, t);
+   module_memory_free(mod, t);
return ret;
 }
 
-- 
2.43.0




[PATCH v4 03/15] nios2: define virtual address space for modules

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

nios2 uses kmalloc() to implement module_alloc() because CALL26/PCREL26
cannot reach all of vmalloc address space.

Define module space as 32MiB below the kernel base and switch nios2 to
use vmalloc for module allocations.

Suggested-by: Thomas Gleixner 
Acked-by: Dinh Nguyen 
Acked-by: Song Liu 
Signed-off-by: Mike Rapoport (IBM) 
---
 arch/nios2/include/asm/pgtable.h |  5 -
 arch/nios2/kernel/module.c   | 19 ---
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index d052dfcbe8d3..eab87c6beacb 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -25,7 +25,10 @@
 #include 
 
 #define VMALLOC_START  CONFIG_NIOS2_KERNEL_MMU_REGION_BASE
-#define VMALLOC_END(CONFIG_NIOS2_KERNEL_REGION_BASE - 1)
+#define VMALLOC_END(CONFIG_NIOS2_KERNEL_REGION_BASE - SZ_32M - 1)
+
+#define MODULES_VADDR  (CONFIG_NIOS2_KERNEL_REGION_BASE - SZ_32M)
+#define MODULES_END(CONFIG_NIOS2_KERNEL_REGION_BASE - 1)
 
 struct mm_struct;
 
diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c
index 76e0a42d6e36..9c97b7513853 100644
--- a/arch/nios2/kernel/module.c
+++ b/arch/nios2/kernel/module.c
@@ -21,23 +21,12 @@
 
 #include 
 
-/*
- * Modules should NOT be allocated with kmalloc for (obvious) reasons.
- * But we do it for now to avoid relocation issues. CALL26/PCREL26 cannot reach
- * from 0x8000 (vmalloc area) to 0xc (kernel) (kmalloc returns
- * addresses in 0xc000)
- */
 void *module_alloc(unsigned long size)
 {
-   if (size == 0)
-   return NULL;
-   return kmalloc(size, GFP_KERNEL);
-}
-
-/* Free memory returned from module_alloc */
-void module_memfree(void *module_region)
-{
-   kfree(module_region);
+   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+   GFP_KERNEL, PAGE_KERNEL_EXEC,
+   VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
+   __builtin_return_address(0));
 }
 
 int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab,
-- 
2.43.0




[PATCH v4 02/15] mips: module: rename MODULE_START to MODULES_VADDR

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

and MODULE_END to MODULES_END to match other architectures that define
custom address space for modules.

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/mips/include/asm/pgtable-64.h | 4 ++--
 arch/mips/kernel/module.c  | 4 ++--
 arch/mips/mm/fault.c   | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/mips/include/asm/pgtable-64.h 
b/arch/mips/include/asm/pgtable-64.h
index 20ca48c1b606..c0109aff223b 100644
--- a/arch/mips/include/asm/pgtable-64.h
+++ b/arch/mips/include/asm/pgtable-64.h
@@ -147,8 +147,8 @@
 #if defined(CONFIG_MODULES) && defined(KBUILD_64BIT_SYM32) && \
VMALLOC_START != CKSSEG
 /* Load modules into 32bit-compatible segment. */
-#define MODULE_START   CKSSEG
-#define MODULE_END (FIXADDR_START-2*PAGE_SIZE)
+#define MODULES_VADDR  CKSSEG
+#define MODULES_END(FIXADDR_START-2*PAGE_SIZE)
 #endif
 
 #define pte_ERROR(e) \
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 7b2fbaa9cac5..9a6c96014904 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -31,10 +31,10 @@ struct mips_hi16 {
 static LIST_HEAD(dbe_list);
 static DEFINE_SPINLOCK(dbe_lock);
 
-#ifdef MODULE_START
+#ifdef MODULES_VADDR
 void *module_alloc(unsigned long size)
 {
-   return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
+   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
 }
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index aaa9a242ebba..37fedeaca2e9 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -83,8 +83,8 @@ static void __do_page_fault(struct pt_regs *regs, unsigned 
long write,
 
if (unlikely(address >= VMALLOC_START && address <= VMALLOC_END))
goto VMALLOC_FAULT_TARGET;
-#ifdef MODULE_START
-   if (unlikely(address >= MODULE_START && address < MODULE_END))
+#ifdef MODULES_VADDR
+   if (unlikely(address >= MODULES_VADDR && address < MODULES_END))
goto VMALLOC_FAULT_TARGET;
 #endif
 
-- 
2.43.0




[PATCH v4 01/15] arm64: module: remove unneeded call to kasan_alloc_module_shadow()

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Since commit f6f37d9320a1 ("arm64: select KASAN_VMALLOC for SW/HW_TAGS
modes") KASAN_VMALLOC is always enabled when KASAN is on. This means
that allocations in module_alloc() will be tracked by KASAN protection
for vmalloc() and that kasan_alloc_module_shadow() will always be an
empty inline, so there is no point in calling it.

Drop meaningless call to kasan_alloc_module_shadow() from
module_alloc().

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/arm64/kernel/module.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 47e0be610bb6..e92da4da1b2a 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -141,11 +141,6 @@ void *module_alloc(unsigned long size)
__func__);
}
 
-   if (p && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) {
-   vfree(p);
-   return NULL;
-   }
-
/* Memory is intended to be executable, reset the pointer tag. */
return kasan_reset_tag(p);
 }
-- 
2.43.0




[PATCH v4 00/15] mm: jit/text allocator

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Hi,

Since v3 I looked into making execmem more of a utility toolbox, as we
discussed at LPC with Mark Rutland, but it was getting hairier than
having a struct describing architecture constraints and a type identifying
the consumer of execmem.

And I do think that having the description of architecture constraints for
allocations of executable memory in a single place is better than having it
spread all over the place.

The patches available via git:
https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=execmem/v4

v4 changes:
* rebase on v6.9-rc2
* rename execmem_params to execmem_info and execmem_arch_params() to
  execmem_arch_setup()
* use single execmem_alloc() API instead of execmem_{text,data}_alloc() (Song)
* avoid extra copy of execmem parameters (Rick)
* run execmem_init() as core_initcall() except for the architectures that
  may allocate text really early (currently only x86) (Will)
* add acks for some of arm64 and riscv changes, thanks Will and Alexandre
* new commits:
  - drop call to kasan_alloc_module_shadow() on arm64 because it's not
needed anymore
  - rename MODULE_START to MODULES_VADDR on MIPS
  - use CONFIG_EXECMEM instead of CONFIG_MODULES on powerpc as per Christophe:
https://lore.kernel.org/all/79062fa3-3402-47b3-8920-9231ad05e...@csgroup.eu/

v3: https://lore.kernel.org/all/20230918072955.2507221-1-r...@kernel.org
* add type parameter to execmem allocation APIs
* remove BPF dependency on modules

v2: https://lore.kernel.org/all/20230616085038.4121892-1-r...@kernel.org
* Separate "module" and "others" allocations with execmem_text_alloc()
and jit_text_alloc()
* Drop ROX entailment on x86
* Add ack for nios2 changes, thanks Dinh Nguyen

v1: https://lore.kernel.org/all/20230601101257.530867-1-r...@kernel.org

= Cover letter from v1 (slightly updated) =

module_alloc() is used everywhere as a means to allocate memory for code.

Besides being semantically wrong, this unnecessarily ties all subsystems
that need to allocate code, such as ftrace, kprobes and BPF to modules and
puts the burden of code allocation to the modules code.

Several architectures override module_alloc() because of various
constraints where the executable memory can be located and this causes
additional obstacles for improvements of code allocation.

A centralized infrastructure for code allocation allows allocations of
executable memory as ROX, and future optimizations such as caching large
pages for better iTLB performance and providing sub-page allocations for
users that only need small jit code snippets.

Rick Edgecombe proposed perm_alloc extension to vmalloc [1] and Song Liu
proposed execmem_alloc [2], but both these approaches were targeting BPF
allocations and lacked the ground work to abstract executable allocations
and split them from the modules core.

Thomas Gleixner suggested to express module allocation restrictions and
requirements as struct mod_alloc_type_params [3] that would define ranges,
protections and other parameters for different types of allocations used by
modules and following that suggestion Song separated allocations of
different types in modules (commit ac3b43283923 ("module: replace
module_layout with module_memory")) and posted "Type aware module
allocator" set [4].

I liked the idea of parametrising code allocation requirements as a
structure, but I believe the original proposal and Song's module allocator
were too module-centric, so I came up with these patches.

This set splits code allocation from modules by introducing execmem_alloc()
and execmem_free() APIs, replaces call sites of module_alloc() and
module_memfree() with the new APIs and implements core text and related
allocations in a central place.

Instead of architecture specific overrides for module_alloc(), the
architectures that require non-default behaviour for text allocation must
fill execmem_info structure and implement execmem_arch_setup() that returns
a pointer to that structure. If an architecture does not implement
execmem_arch_setup(), the defaults compatible with the current
modules::module_alloc() are used.

Since architectures define different restrictions on placement,
permissions, alignment and other parameters for memory that can be used by
different subsystems that allocate executable memory, execmem APIs
take a type argument that will be used to identify the calling subsystem
and to allow architectures to define parameters for ranges suitable for that
subsystem.
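
An illustrative call site (hypothetical, for flavor only):

    /* the type selects the arch-defined range and permissions */
    void *slot = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);

    if (slot)
        execmem_free(slot);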

The new infrastructure allows decoupling of BPF, kprobes and ftrace from
modules, and most importantly it paves the way for ROX allocations for
executable memory.

[1] 
https://lore.kernel.org/lkml/20201120202426.18009-1-rick.p.edgeco...@intel.com/
[2] https://lore.kernel.org/all/20221107223921.3451913-1-s...@kernel.org/
[3] https://lore.kernel.org/all/87v8mndy3y.ffs@tglx/
[4] https://lore.kernel.org/all/20230526051529.3387103-1-s

[RFC PATCH 7/7] x86/module: enable ROX caches for module text

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Enable execmem's cache of PMD_SIZE'ed pages mapped as ROX for module
text allocations.

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/x86/mm/init.c | 29 +
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8e8cd0de3af6..049a8b4c64e2 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1102,9 +1102,23 @@ unsigned long arch_max_swapfile_size(void)
 #endif
 
 #ifdef CONFIG_EXECMEM
+static void execmem_invalidate(void *ptr, size_t size, bool writeable)
+{
+   /* fill memory with INT3 instructions */
+   if (writeable)
+   memset(ptr, 0xcc, size);
+   else
+   text_poke_set(ptr, 0xcc, size);
+}
+
 static struct execmem_info execmem_info __ro_after_init = {
+   .invalidate = execmem_invalidate,
.ranges = {
-   [EXECMEM_DEFAULT] = {
+   [EXECMEM_MODULE_TEXT] = {
+   .flags = EXECMEM_KASAN_SHADOW | EXECMEM_ROX_CACHE,
+   .alignment = MODULE_ALIGN,
+   },
+   [EXECMEM_KPROBES...EXECMEM_MODULE_DATA] = {
.flags = EXECMEM_KASAN_SHADOW,
.alignment = MODULE_ALIGN,
},
@@ -1119,9 +1133,16 @@ struct execmem_info __init *execmem_arch_setup(void)
offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
 
start = MODULES_VADDR + offset;
-   execmem_info.ranges[EXECMEM_DEFAULT].start = start;
-   execmem_info.ranges[EXECMEM_DEFAULT].end = MODULES_END;
-   execmem_info.ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL;
+
+   for (int i = EXECMEM_MODULE_TEXT; i < EXECMEM_TYPE_MAX; i++) {
+   struct execmem_range *r = &execmem_info.ranges[i];
+
+   r->start = start;
+   r->end = MODULES_END;
+   r->pgprot = PAGE_KERNEL;
+   }
+
+   execmem_info.ranges[EXECMEM_MODULE_TEXT].pgprot = PAGE_KERNEL_ROX;
 
return &execmem_info;
 }
-- 
2.43.0



[RFC PATCH 6/7] execmem: add support for cache of large ROX pages

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Using large pages to map text areas reduces iTLB pressure and improves
performance.

Extend execmem_alloc() with the ability to use PMD_SIZE'ed pages with ROX
permissions as a cache for smaller allocations.

To populate the cache, a writable large page is allocated from vmalloc with
VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as
ROX.

Portions of that large page are handed out to execmem_alloc() callers
without any changes to the permissions.

When the memory is freed with execmem_free() it is invalidated again so
that it won't contain stale instructions.

The cache is enabled when an architecture sets the EXECMEM_ROX_CACHE flag
in the definition of an execmem_range.
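
In outline, populating the cache works roughly like the sketch below
(execmem_cache_populate() is an illustrative name; locking against
execmem_cache.mutex and vm_struct bookkeeping are omitted):

static int execmem_cache_populate(struct execmem_range *range)
{
    void *p = execmem_vmalloc(range, PMD_SIZE, PAGE_KERNEL,
                              VM_ALLOW_HUGE_VMAP);

    if (!p)
        return -ENOMEM;

    /* fill with invalid instructions while the mapping is writable */
    execmem_invalidate(p, PMD_SIZE, true);

    /* seal the page ROX and publish it in the free areas tree */
    set_memory_rox((unsigned long)p, PMD_SIZE >> PAGE_SHIFT);

    return mtree_store_range(&execmem_cache.free_areas, (unsigned long)p,
                             (unsigned long)p + PMD_SIZE - 1,
                             xa_mk_value(PMD_SIZE), GFP_KERNEL);
}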

Signed-off-by: Mike Rapoport (IBM) 
---
 include/linux/execmem.h |   2 +
 mm/execmem.c| 267 ++--
 2 files changed, 262 insertions(+), 7 deletions(-)

diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 9d22999dbd7d..06f678e6fe55 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -77,12 +77,14 @@ struct execmem_range {
 
 /**
  * struct execmem_info - architecture parameters for code allocations
+ * @invalidate: set memory to contain invalid instructions
  * @ranges: array of parameter sets defining architecture specific
  * parameters for executable memory allocations. The ranges that are not
  * explicitly initialized by an architecture use parameters defined for
  * @EXECMEM_DEFAULT.
  */
 struct execmem_info {
+   void (*invalidate)(void *ptr, size_t size, bool writable);
struct execmem_rangeranges[EXECMEM_TYPE_MAX];
 };
 
diff --git a/mm/execmem.c b/mm/execmem.c
index c920d2b5a721..716fba68ab0e 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -1,30 +1,88 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
+#include 
+
+#include "internal.h"
+
 static struct execmem_info *execmem_info __ro_after_init;
 static struct execmem_info default_execmem_info __ro_after_init;
 
-static void *__execmem_alloc(struct execmem_range *range, size_t size)
+struct execmem_cache {
+   struct mutex mutex;
+   struct maple_tree busy_areas;
+   struct maple_tree free_areas;
+};
+
+static struct execmem_cache execmem_cache = {
+   .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
+   .busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
+execmem_cache.mutex),
+   .free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN,
+execmem_cache.mutex),
+};
+
+static void execmem_cache_clean(struct work_struct *work)
+{
+   struct maple_tree *free_areas = &execmem_cache.free_areas;
+   struct mutex *mutex = &execmem_cache.mutex;
+   MA_STATE(mas, free_areas, 0, ULONG_MAX);
+   void *area;
+
+   mutex_lock(mutex);
+   mas_for_each(&mas, area, ULONG_MAX) {
+   size_t size;
+
+   if (!xa_is_value(area))
+   continue;
+
+   size = xa_to_value(area);
+
+   if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(mas.index, 
PMD_SIZE)) {
+   void *ptr = (void *)mas.index;
+
+   mas_erase(&mas);
+   vfree(ptr);
+   }
+   }
+   mutex_unlock(mutex);
+}
+
+static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
+
+static void execmem_invalidate(void *ptr, size_t size, bool writable)
+{
+   if (execmem_info->invalidate)
+   execmem_info->invalidate(ptr, size, writable);
+   else
+   memset(ptr, 0, size);
+}
+
+static void *execmem_vmalloc(struct execmem_range *range, size_t size,
+pgprot_t pgprot, unsigned long vm_flags)
 {
bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
-   unsigned long vm_flags  = VM_FLUSH_RESET_PERMS;
gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
+   unsigned int align = range->alignment;
unsigned long start = range->start;
unsigned long end = range->end;
-   unsigned int align = range->alignment;
-   pgprot_t pgprot = range->pgprot;
void *p;
 
if (kasan)
vm_flags |= VM_DEFER_KMEMLEAK;
 
-   p = __vmalloc_node_range(size, align, start, end, gfp_flags,
-pgprot, vm_flags, NUMA_NO_NODE,
+   if (vm_flags & VM_ALLOW_HUGE_VMAP)
+   align = PMD_SIZE;
+
+   p = __vmalloc_node_range(size, align, start, end, gfp_flags, pgprot,
+vm_flags, NUMA_NO_NODE,
 __builtin_return_address(0));
if (!p && range->fallback_start) {
start = range->fallback_start;
@@ -44,6 +102,199 @@ static void *__execmem_alloc(struct execmem_range *range, 
size_t size)
 

[RFC PATCH 5/7] x86/module: prepare module loading for ROX allocations of text

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

When module text memory is allocated with ROX permissions, the
memory at the actual address where the module will live will contain
invalid instructions, and there will be a writable copy that contains
the actual module code.

Update relocations and alternatives patching to deal with it.

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/um/kernel/um_arch.c   |  11 ++-
 arch/x86/entry/vdso/vma.c  |   3 +-
 arch/x86/include/asm/alternative.h |  14 +--
 arch/x86/kernel/alternative.c  | 152 +
 arch/x86/kernel/ftrace.c   |  41 +---
 arch/x86/kernel/module.c   |  17 ++--
 6 files changed, 140 insertions(+), 98 deletions(-)

diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index c5ca89e62552..5183c955974e 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -437,24 +437,25 @@ void __init arch_cpu_finalize_init(void)
os_check_bugs();
 }
 
-void apply_seal_endbr(s32 *start, s32 *end)
+void apply_seal_endbr(s32 *start, s32 *end, struct module *mod)
 {
 }
 
-void apply_retpolines(s32 *start, s32 *end)
+void apply_retpolines(s32 *start, s32 *end, struct module *mod)
 {
 }
 
-void apply_returns(s32 *start, s32 *end)
+void apply_returns(s32 *start, s32 *end, struct module *mod)
 {
 }
 
 void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
-  s32 *start_cfi, s32 *end_cfi)
+  s32 *start_cfi, s32 *end_cfi, struct module *mod)
 {
 }
 
-void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
+void apply_alternatives(struct alt_instr *start, struct alt_instr *end,
+   struct module *mod)
 {
 }
 
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 6d83ceb7f1ba..31412adef5d2 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -51,7 +51,8 @@ int __init init_vdso_image(const struct vdso_image *image)
 
apply_alternatives((struct alt_instr *)(image->data + image->alt),
   (struct alt_instr *)(image->data + image->alt +
-   image->alt_len));
+   image->alt_len),
+  NULL);
 
return 0;
 }
diff --git a/arch/x86/include/asm/alternative.h 
b/arch/x86/include/asm/alternative.h
index fcd20c6dc7f9..6f4b0776fc89 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -96,16 +96,16 @@ extern struct alt_instr __alt_instructions[], 
__alt_instructions_end[];
  * instructions were patched in already:
  */
 extern int alternatives_patched;
+struct module;
 
 extern void alternative_instructions(void);
-extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
-extern void apply_retpolines(s32 *start, s32 *end);
-extern void apply_returns(s32 *start, s32 *end);
-extern void apply_seal_endbr(s32 *start, s32 *end);
+extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end,
+  struct module *mod);
+extern void apply_retpolines(s32 *start, s32 *end, struct module *mod);
+extern void apply_returns(s32 *start, s32 *end, struct module *mod);
+extern void apply_seal_endbr(s32 *start, s32 *end, struct module *mod);
 extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine,
- s32 *start_cfi, s32 *end_cfi);
-
-struct module;
+ s32 *start_cfi, s32 *end_cfi, struct module *mod);
 
 struct callthunk_sites {
s32 *call_start, *call_end;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 45a280f2161c..b4d6868df573 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -462,7 +462,8 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, 
struct alt_instr *a)
  * to refetch changed I$ lines.
  */
 void __init_or_module noinline apply_alternatives(struct alt_instr *start,
- struct alt_instr *end)
+ struct alt_instr *end,
+ struct module *mod)
 {
struct alt_instr *a;
u8 *instr, *replacement;
@@ -490,10 +491,18 @@ void __init_or_module noinline apply_alternatives(struct 
alt_instr *start,
 * order.
 */
for (a = start; a < end; a++) {
+   unsigned long ins_offs, repl_offs;
int insn_buff_sz = 0;
+   u8 *wr_instr, *wr_replacement;
 
instr = (u8 *)&a->instr_offset + a->instr_offset;
+   ins_offs = module_writable_offset(mod, instr);
+   wr_instr = instr + ins_offs;
+
replacement = (u8 *)&a->repl_offset + a->repl_offset;
+   repl_offs = module_writable_offset(mod, replacement);
+

[RFC PATCH 4/7] ftrace: Add swap_func to ftrace_process_locs()

2024-04-11 Thread Mike Rapoport
From: Song Liu 

ftrace_process_locs() sorts the module mcount locations, which live in RO
memory. Add ftrace_swap_func() so that archs can use an RO-memory-poke
function to do the sorting.

Signed-off-by: Song Liu 
Signed-off-by: Mike Rapoport (IBM) 
---
 include/linux/ftrace.h |  2 ++
 kernel/trace/ftrace.c  | 13 -
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 54d53f345d14..54393ce57f08 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1172,4 +1172,6 @@ unsigned long arch_syscall_addr(int nr);
 
 #endif /* CONFIG_FTRACE_SYSCALLS */
 
+void ftrace_swap_func(void *a, void *b, int n);
+
 #endif /* _LINUX_FTRACE_H */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index da1710499698..95f11c8cdc5e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6480,6 +6480,17 @@ static void test_is_sorted(unsigned long *start, 
unsigned long count)
 }
 #endif
 
+void __weak ftrace_swap_func(void *a, void *b, int n)
+{
+   unsigned long t;
+
+   WARN_ON_ONCE(n != sizeof(t));
+
+   t = *((unsigned long *)a);
+   *(unsigned long *)a = *(unsigned long *)b;
+   *(unsigned long *)b = t;
+}
+
 static int ftrace_process_locs(struct module *mod,
   unsigned long *start,
   unsigned long *end)
@@ -6507,7 +6518,7 @@ static int ftrace_process_locs(struct module *mod,
 */
if (!IS_ENABLED(CONFIG_BUILDTIME_MCOUNT_SORT) || mod) {
sort(start, count, sizeof(*start),
-ftrace_cmp_ips, NULL);
+ftrace_cmp_ips, ftrace_swap_func);
} else {
test_is_sorted(start, count);
}
-- 
2.43.0



[RFC PATCH 3/7] module: prepare to handle ROX allocations for text

2024-04-11 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

In order to support ROX allocations for module text, it is necessary to
handle modifications to the code, such as relocations and alternatives
patching, without write access to that memory.

One option is to use text patching, but this would make module loading
extremely slow and would expose executable code that is not yet in its final form.

A better way is to have memory allocated with ROX permissions contain
invalid instructions and keep a writable, but not executable copy of the
module text. The relocations and alternative patches would be done on the
writable copy using the addresses of the ROX memory.
Once the module is completely ready, the updated text will be copied to ROX
memory using text patching in one go and the writable copy will be freed.

Add support for that to module initialization code and provide necessary
interfaces in execmem.
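
Once the writable copy is fully prepared, the module core pushes it into
the ROX mapping in one go, along the lines of this sketch (error
handling omitted):

    for_each_mod_mem_type(type) {
        struct module_memory *mem = &mod->mem[type];

        if (!mem->is_rox)
            continue;

        /* text-patch the ROX region from the prepared copy */
        execmem_update_copy(mem->base, mem->rw_copy, mem->size);
        vfree(mem->rw_copy);
        mem->rw_copy = NULL;
    }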
---
 include/linux/execmem.h| 23 +
 include/linux/module.h | 11 ++
 kernel/module/main.c   | 70 ++
 kernel/module/strict_rwx.c |  3 ++
 mm/execmem.c   | 11 ++
 5 files changed, 111 insertions(+), 7 deletions(-)

diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index ffd0d12feef5..9d22999dbd7d 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -46,9 +46,11 @@ enum execmem_type {
 /**
  * enum execmem_range_flags - options for executable memory allocations
  * @EXECMEM_KASAN_SHADOW:  allocate kasan shadow
+ * @EXECMEM_ROX_CACHE: allocate memory from a cache of large ROX pages
  */
 enum execmem_range_flags {
EXECMEM_KASAN_SHADOW= (1 << 0),
+   EXECMEM_ROX_CACHE   = (1 << 1),
 };
 
 /**
@@ -123,6 +125,27 @@ void *execmem_alloc(enum execmem_type type, size_t size);
  */
 void execmem_free(void *ptr);
 
+/**
+ * execmem_update_copy - copy an update to executable memory
+ * @dst:  destination address to update
+ * @src:  source address containing the data
+ * @size: how many bytes of memory should be copied
+ *
+ * Copy @size bytes from @src to @dst using text poking if the memory at
+ * @dst is read-only.
+ *
+ * Return: a pointer to @dst or NULL on error
+ */
+void *execmem_update_copy(void *dst, const void *src, size_t size);
+
+/**
+ * execmem_is_rox - check if execmem is read-only
+ * @type: the execmem type to check
+ *
+ * Return: %true if the @type is read-only, %false if it's writable
+ */
+bool execmem_is_rox(enum execmem_type type);
+
 #ifdef CONFIG_ARCH_WANTS_EXECMEM_EARLY
 void execmem_early_init(void);
 #else
diff --git a/include/linux/module.h b/include/linux/module.h
index 1153b0d99a80..3df3202680a2 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -361,6 +361,8 @@ enum mod_mem_type {
 
 struct module_memory {
void *base;
+   void *rw_copy;
+   bool is_rox;
unsigned int size;
 
 #ifdef CONFIG_MODULES_TREE_LOOKUP
@@ -368,6 +370,15 @@ struct module_memory {
 #endif
 };
 
+#ifdef CONFIG_MODULES
+unsigned long module_writable_offset(struct module *mod, void *loc);
+#else
+static inline unsigned long module_writable_offset(struct module *mod, void 
*loc)
+{
+   return 0;
+}
+#endif
+
 #ifdef CONFIG_MODULES_TREE_LOOKUP
 /* Only touch one cacheline for common rbtree-for-core-layout case. */
 #define __module_memory_align cacheline_aligned
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 91e185607d4b..f83fbb9c95ee 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1188,6 +1188,21 @@ void __weak module_arch_freeing_init(struct module *mod)
 {
 }
 
+unsigned long module_writable_offset(struct module *mod, void *loc)
+{
+   if (!mod)
+   return 0;
+
+   for_class_mod_mem_type(type, text) {
+   struct module_memory *mem = &mod->mem[type];
+
+   if (loc >= mem->base && loc < mem->base + mem->size)
+   return (unsigned long)(mem->rw_copy - mem->base);
+   }
+
+   return 0;
+}
+
 static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
 {
unsigned int size = PAGE_ALIGN(mod->mem[type].size);
@@ -1205,6 +1220,23 @@ static int module_memory_alloc(struct module *mod, enum 
mod_mem_type type)
if (!ptr)
return -ENOMEM;
 
+   mod->mem[type].base = ptr;
+
+   if (execmem_is_rox(execmem_type)) {
+   ptr = vzalloc(size);
+
+   if (!ptr) {
+   execmem_free(mod->mem[type].base);
+   return -ENOMEM;
+   }
+
+   mod->mem[type].rw_copy = ptr;
+   mod->mem[type].is_rox = true;
+   } else {
+   mod->mem[type].rw_copy = mod->mem[type].base;
+   memset(mod->mem[type].base, 0, size);
+   }
+
/*
 * The pointer to these blocks of memory are stored on the module
 * structure and w
