[RFC PATCH v2 5/7] pmpool: Update device tree on kexec
From: Stanislav Kinsburskii Introduce a pmpool kexec fdt notifier that enables pmpool to pass its metadata, including the bitmap address, to the new kernel during kexec. Signed-off-by: Stanislav Kinsburskii --- mm/Kconfig |1 + mm/pmpool.c | 64 ++- 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index e7c10094fb10..1eefdd4c82ba 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -925,6 +925,7 @@ config CMA_AREAS config PMPOOL bool "Persistent memory pool support" select CMA + select LIBFDT help This option adds support for CMA-based persistent memory pool feature, which provides pages allocation and freeing from a set of diff --git a/mm/pmpool.c b/mm/pmpool.c index 12a8cac75558..f2173db782d6 100644 --- a/mm/pmpool.c +++ b/mm/pmpool.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,59 @@ static int __init default_pmpool_fixup_cma(void) } postcore_initcall(default_pmpool_fixup_cma); +static int pmpool_fdt_update(struct notifier_block *nb, unsigned long val, +void *data) +{ + void *fdt = data; + int node, status; + + if (!fdt) + goto err; + + node = fdt_subnode_offset(fdt, 0, "chosen"); + if (node < 0) { + node = fdt_add_subnode(fdt, 0, "chosen"); + if (node < 0) + goto err; + } + + node = fdt_add_subnode(fdt, node, "default_pmpool"); + if (node == -FDT_ERR_EXISTS) + return 0; + if (node < 0) + goto err; + + status = fdt_setprop(fdt, node, "compatible", +"pmpool", sizeof("pmpool")); + if (status) + goto err; + + status = fdt_setprop_u64(fdt, node, "bitmap", +virt_to_phys(default_pmpool->cma->bitmap)); + if (status) + goto err; + + status = fdt_setprop_u64(fdt, node, "size", +default_pmpool->cma->count << PAGE_SHIFT); + if (status) + goto err; + + status = fdt_setprop_u64(fdt, node, "base", +default_pmpool->cma->base_pfn << PAGE_SHIFT); + if (status) + goto err; + + return NOTIFY_DONE; + +err: + pr_err("failed to update fdt\n"); + return NOTIFY_DONE; +} + +static struct notifier_block 
pmpool_kexec_fdt_nb = { + .notifier_call = pmpool_fdt_update, +}; + static int __init parse_pmpool_opt(char *str) { static struct pmpool pmpool; @@ -80,10 +134,16 @@ static int __init parse_pmpool_opt(char *str) return 0; } + err = register_kexec_fdt_notifier(_kexec_fdt_nb); + if (err) { + pr_err("failed to register kexec fdt notifier: %d\n", err); + goto free_memblock; + } + err = cma_init_reserved_mem(base, size, 0, "pmpool", ); if (err) { pr_err("failed to initialize CMA: %d\n", err); - goto free_memblock; + goto notifier_unregister; } pr_info("default memory pool is created: %#llx-%#llx\n", @@ -93,6 +153,8 @@ static int __init parse_pmpool_opt(char *str) return 0; +notifier_unregister: + unregister_kexec_fdt_notifier(_kexec_fdt_nb); free_memblock: memblock_phys_free(base, size); return 0; ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[RFC PATCH v2 7/7] Drivers: hv: Allocate persistent pages for root partition
Deposited pages are owned by the hypervisor. Accessing them can trigger a kernel panic due to a general protection fault. This patch ensures that pages for the root partition are allocated from the persistent memory pool. This allocation guarantees stability post-kexec, protecting hypervisor-deposited pages from unintended reuse by the new kernel. Signed-off-by: Stanislav Kinsburskii --- drivers/hv/hv_common.c | 13 ++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 335aec5ec504..a81c5613e745 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -426,7 +426,10 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) order = 31 - __builtin_clz(num_pages); while (1) { - pages[i] = alloc_pages_node(node, GFP_KERNEL, order); + if (paritition_id == hv_current_partition_id) + pages[i] = pmpool_alloc(1 << order); + else + pages[i] = alloc_pages_node(node, GFP_KERNEL, order); if (pages[i]) break; if (!order) { @@ -471,8 +474,12 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) err_free_allocations: for (i = 0; i < num_allocations; ++i) { base_pfn = page_to_pfn(pages[i]); - for (j = 0; j < counts[i]; ++j) - __free_page(pfn_to_page(base_pfn + j)); + for (j = 0; j < counts[i]; ++j) { + if (paritition_id == hv_current_partition_id) + pmpool_release(pages[i], counts[i]); + else + __free_page(pfn_to_page(base_pfn + j)); + } } free_buf: ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[RFC PATCH v2 6/7] pmpool: Restore state from device tree post-kexec
From: Stanislav Kinsburskii Retrieve the pmpool bitmap from metadata in the fdt passed over kexec, bypassing the need for reinitialization. This ensures the seamless transfer of the pmpool state across kexec. Signed-off-by: Stanislav Kinsburskii --- mm/pmpool.c | 46 ++ 1 file changed, 46 insertions(+) diff --git a/mm/pmpool.c b/mm/pmpool.c index f2173db782d6..6c1a28fd3493 100644 --- a/mm/pmpool.c +++ b/mm/pmpool.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "cma.h" @@ -49,11 +50,56 @@ static void pmpool_fixup_cma(struct cma *cma) pr_info("CMA bitmap moved to %#llx\n", virt_to_phys(cma->bitmap)); } +static int pmpool_fdt_restore(struct cma *cma) +{ + struct device_node *dn; + u64 val; + + dn = of_find_compatible_node(NULL, NULL, "pmpool"); + if (!dn) + return -ENOENT; + + if (of_property_read_u64(dn, "base", )) { + pr_err("invalid fdt: no base\n"); + return -EINVAL; + } + if (val != PFN_PHYS(cma->base_pfn)) { + pr_err("fdt base doesn't match: %#llx != %#llx\n", + val, PFN_PHYS(cma->base_pfn)); + return -EINVAL; + } + + if (of_property_read_u64(dn, "size", )) { + pr_err("invalid fdt: no size\n"); + return -EINVAL; + } + if (val != (cma->count << PAGE_SHIFT)) { + pr_err("fdt size doesn't match: %#llx != %#lx\n", + val, cma->count << PAGE_SHIFT); + return -EINVAL; + } + + if (of_property_read_u64(dn, "bitmap", )) { + pr_err("invalid fdt: no bitmap\n"); + return -EINVAL; + } + + pr_info("CMA bitmap restored to %#llx\n", val); + + bitmap_free(cma->bitmap); + cma->bitmap = phys_to_virt(val); + + return 0; +} + static int __init default_pmpool_fixup_cma(void) { if (!default_pmpool) return 0; + if (!pmpool_fdt_restore(default_pmpool->cma)) + return 0; + pmpool_fixup_cma(default_pmpool->cma); return 0; } ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[RFC PATCH v2 4/7] pmpool: Introduce persistent memory pool
From: Stanislav Kinsburskii This patch introduces a memory allocator specifically tailored for persistent memory within the kernel. The allocator maintains kernel-specific states like DMA passthrough device states, IOMMU state, and more across kexec. The current implementation provides a foundation for custom solutions that may be developed in the future. Although the design is kept concise and straightforward to encourage discussion and feedback, it remains fully functional. The persistent memory pool builds upon the contiguous memory allocator (CMA) and ensures CMA state persistence across kexec by incorporating the CMA bitmap into the memory region. Potential applications include: 1. Enabling various in-kernel entities to allocate persistent pages from a unified memory pool, obviating the need for reserving multiple regions. 2. For in-kernel components that need the allocation address to be retained on kernel kexec, this address can be exposed to user space and subsequently passed through the command line. 3. Distinct subsystems or drivers can set aside their region, allocating a segment for their persistent memory pool, suitable for uses such as file systems, key-value stores, and other applications. 
Signed-off-by: Stanislav Kinsburskii --- include/linux/pmpool.h | 22 +++ mm/Kconfig |8 mm/Makefile|1 mm/pmpool.c| 100 4 files changed, 131 insertions(+) create mode 100644 include/linux/pmpool.h create mode 100644 mm/pmpool.c diff --git a/include/linux/pmpool.h b/include/linux/pmpool.h new file mode 100644 index ..b41f16fa9660 --- /dev/null +++ b/include/linux/pmpool.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PMPOOL_H +#define _PMPOOL_H + +struct page; + +#if defined(CONFIG_PMPOOL) +struct page *pmpool_alloc(unsigned long count); +bool pmpool_release(struct page *pages, unsigned long count); +#else +static inline struct page *pmpool_alloc(unsigned long count) +{ + return NULL; +} +static inline bool pmpool_release(struct page *pages, unsigned long count) +{ + return false; +} +#endif + +#endif /* _PMPOOL_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 09130434e30d..e7c10094fb10 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -922,6 +922,14 @@ config CMA_AREAS If unsure, leave the default value "7" in UMA and "19" in NUMA. +config PMPOOL + bool "Persistent memory pool support" + select CMA + help + This option adds support for CMA-based persistent memory pool + feature, which provides pages allocation and freeing from a set of + persistent memory ranges, deposited to the memory pool. 
+ config MEM_SOFT_DIRTY bool "Track memory changes" depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS diff --git a/mm/Makefile b/mm/Makefile index 678530a07326..8d3579e58c2c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -139,3 +139,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o +obj-$(CONFIG_PMPOOL) += pmpool.o diff --git a/mm/pmpool.c b/mm/pmpool.c new file mode 100644 index ..12a8cac75558 --- /dev/null +++ b/mm/pmpool.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "pmpool: " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include "cma.h" + +struct pmpool { + struct cma *cma; +}; + +static struct pmpool *default_pmpool; + +bool pmpool_release(struct page *pages, unsigned long count) +{ + if (!default_pmpool) + return false; + + return cma_release(default_pmpool->cma, pages, count); +} + +struct page *pmpool_alloc(unsigned long count) +{ + if (!default_pmpool) + return NULL; + + return cma_alloc(default_pmpool->cma, count, 0, true); +} + +static void pmpool_fixup_cma(struct cma *cma) +{ + unsigned long bitmap_size; + + bitmap_free(cma->bitmap); + cma->bitmap = phys_to_virt(PFN_PHYS(cma->base_pfn)); + + bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)); + memset(cma->bitmap, 0, bitmap_size); + bitmap_set(cma->bitmap, 0, PAGE_ALIGN(bitmap_size) >> PAGE_SHIFT); + + pr_info("CMA bitmap moved to %#llx\n", virt_to_phys(cma->bitmap)); +} + +static int __init default_pmpool_fixup_cma(void) +{ + if (!default_pmpool) + return 0; + + pmpool_fixup_cma(default_pmpool->cma); + return 0; +} +postcore_initcall(default_pmpool_fixup_cma); + +static int __init parse_pmpool_opt(char *str) +{ + static struct pmpool pmpool; + phys_addr_t base, size; + int err; + + /* Format is pmpool=, */ + base = memparse(str, ); + size = memparse(str + 1, NULL); + +
[RFC PATCH v2 3/7] x86: kexec: Enable fdt modification in callbacks
From: Stanislav Kinsburskii This option allows kernel subsystems to modify (or create, if necessary) the Flattened Device Tree (fdt) using registered callbacks and then pass the modified version to the new kernel. Signed-off-by: Stanislav Kinsburskii --- arch/x86/Kconfig |8 +++ arch/x86/kernel/kexec-bzimage64.c | 41 - 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index efb472e267ec..90da51fbb8f8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2078,6 +2078,14 @@ config KEXEC_FILE_FDT This option enables passing existent Flattened Device Tree to the new kernel when kexec is invoked by the file based system call. +config KEXEC_FILE_FDT_CALLBACK + bool "Enable kexec fdt modification support" + depends on KEXEC_FILE_FDT + select LIBFDT + help + This option enables Flattened Device Tree modification (and creation + if needed) by kernel subsystems, registered corresponding callback. + config ARCH_HAS_KEXEC_PURGATORY def_bool KEXEC_FILE diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index ab9ae02c9a5f..3c6df28d3637 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -384,11 +384,50 @@ static int bzImage64_probe(const char *buf, unsigned long len) return ret; } #ifdef CONFIG_KEXEC_FILE_FDT +#ifdef CONFIG_KEXEC_FILE_FDT_CALLBACK +static void *fdt_get_runtime(void) +{ + void *fdt; + size_t fdt_size = SZ_2M; + int status; + + /* It's nothing to do without existent fdt and any callbacks */ + if (!initial_boot_params && kexec_fdt_notify_list_empty()) + return NULL; + + fdt = kzalloc(fdt_size, GFP_KERNEL); + if (!fdt) + return NULL; + + if (initial_boot_params) + status = fdt_open_into(initial_boot_params, fdt, fdt_size); + else + status = fdt_create_empty_tree(fdt, fdt_size); + if (status != 0) { + pr_err("failed to get fdt\n"); + goto free_fdt; + } + + status = kexec_fdt_notify(fdt); + if (status) { + pr_err("fdt notification failed\n"); + goto 
free_fdt; + } + + fdt_pack(fdt); + + return fdt; + +free_fdt: + kfree(fdt); + return NULL; +} +#else static void *fdt_get_runtime(void) { return initial_boot_params; } - +#endif static int kexec_setup_fdt(struct kexec_buf *kbuf, struct boot_params *params) { void *fdt; ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[RFC PATCH v2 1/7] kexec_file: Add fdt modification callback support
From: Stanislav Kinsburskii Introduce primitives to: - Register and unregister callbacks for flattened device tree (fdt) modifications. - Invoke all registered callbacks. - Check for any registered callbacks. These enhancements enable the use of a device tree to store kernel bits. Signed-off-by: Stanislav Kinsburskii --- include/linux/kexec.h |7 +++ kernel/kexec_file.c | 24 2 files changed, 31 insertions(+) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 22b5cd24f581..c9c70551796d 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -282,6 +282,13 @@ arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, return -ENOEXEC; } #endif + +struct notifier_block; +extern int register_kexec_fdt_notifier(struct notifier_block *nb); +extern int unregister_kexec_fdt_notifier(struct notifier_block *nb); +extern bool kexec_fdt_notify_list_empty(void); +extern int kexec_fdt_notify(void *fdt); + #endif /* CONFIG_KEXEC_FILE */ #ifdef CONFIG_KEXEC_ELF diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 881ba0d1714c..f9245d5e4459 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -43,6 +43,30 @@ static int kexec_calculate_store_digests(struct kimage *image); /* Maximum size in bytes for kernel/initrd files. 
*/ #define KEXEC_FILE_SIZE_MAXmin_t(s64, 4LL << 30, SSIZE_MAX) +static BLOCKING_NOTIFIER_HEAD(kexec_fdt_notify_list); + +bool kexec_fdt_notify_list_empty(void) +{ + return kexec_fdt_notify_list.head == NULL; +} + +int kexec_fdt_notify(void *fdt) +{ + return blocking_notifier_call_chain(_fdt_notify_list, 0, fdt); +} + +int register_kexec_fdt_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(_fdt_notify_list, nb); +} +EXPORT_SYMBOL(register_kexec_fdt_notifier); + +int unregister_kexec_fdt_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(_fdt_notify_list, nb); +} +EXPORT_SYMBOL(unregister_kexec_fdt_notifier); + /* * Currently this is the only default function that is exported as some * architectures need it to do additional handlings. ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[RFC PATCH v2 0/7] Introduce persistent memory pool
This series introduces a memory allocator specifically tailored for persistent memory within the kernel. The allocator maintains kernel-specific states like DMA passthrough device states, IOMMU state, and more across kexec. The current implementation provides a foundation for custom solutions that may be developed in the future. Although the design is kept concise and straightforward to encourage discussion and feedback, it remains fully functional. The persistent memory pool builds upon the contiguous memory allocator (CMA) and ensures CMA state persistence across kexec by incorporating the CMA bitmap into the memory region instead of allocating it from kernel memory. Persistent memory pool metadata is passed across kexec using a Flattened Device Tree, which is added as another kexec segment on the x86 architecture. Potential applications include: 1. Enabling various in-kernel entities to allocate persistent pages from a unified memory pool, obviating the need for reserving multiple regions. 2. For in-kernel components that need the allocation address to be retained on kernel kexec, this address can be exposed to user space and subsequently passed through the command line. 3. Distinct subsystems or drivers can set aside their region, allocating a segment for their persistent memory pool, suitable for uses such as file systems, key-value stores, and other applications. Notes: 1. The last patch of the series represents a use case for the feature. However, the patch won't compile and is for illustrative purposes only as the code being patched hasn't been merged yet. 2. The code being patched is currently under review by the community. The series is named "Introduce /dev/mshv drivers": https://lkml.org/lkml/2023/9/22/1117 Changes since v1: 1. Persistent memory pool is now a wrapper on top of CMA instead of being a new allocator. 2. 
Persistent memory pool metadata doesn't belong to the pool anymore and is now passed via Flattened Device Tree instead over kexec to the new kernel. The following series implements... --- Stanislav Kinsburskii (7): kexec_file: Add fdt modification callback support x86: kexec: Transfer existing fdt to the new kernel x86: kexec: Enable fdt modification in callbacks pmpool: Introduce persistent memory pool pmpool: Update device tree on kexec pmpool: Restore state from device tree post-kexec Drivers: hv: Allocate persistent pages for root partition arch/x86/Kconfig | 16 +++ arch/x86/kernel/kexec-bzimage64.c | 97 + drivers/hv/hv_common.c| 13 ++ include/linux/kexec.h |7 + include/linux/pmpool.h| 22 kernel/kexec_file.c | 24 mm/Kconfig|9 ++ mm/Makefile |1 mm/pmpool.c | 208 + 9 files changed, 394 insertions(+), 3 deletions(-) create mode 100644 include/linux/pmpool.h create mode 100644 mm/pmpool.c ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[RFC PATCH v2 2/7] x86: kexec: Transfer existing fdt to the new kernel
From: Stanislav Kinsburskii Enable passing of the Flattened Device Tree (fdt) over kexec for x86 architecture, as outlined in Documentation/x86/booting-dt.rst. Signed-off-by: Stanislav Kinsburskii --- arch/x86/Kconfig |8 + arch/x86/kernel/kexec-bzimage64.c | 58 + 2 files changed, 66 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e36261b4ea14..efb472e267ec 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2070,6 +2070,14 @@ config KEXEC_FILE for kernel and initramfs as opposed to list of segments as accepted by previous system call. +config KEXEC_FILE_FDT + bool "Pass fdt over kexec" + depends on KEXEC_FILE && X86_64 + depends on OF_FLATTREE + help + This option enables passing existent Flattened Device Tree to the new + kernel when kexec is invoked by the file based system call. + config ARCH_HAS_KEXEC_PURGATORY def_bool KEXEC_FILE diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index a61c12c01270..ab9ae02c9a5f 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include @@ -381,7 +383,59 @@ static int bzImage64_probe(const char *buf, unsigned long len) return ret; } +#ifdef CONFIG_KEXEC_FILE_FDT +static void *fdt_get_runtime(void) +{ + return initial_boot_params; +} + +static int kexec_setup_fdt(struct kexec_buf *kbuf, struct boot_params *params) +{ + void *fdt; + struct setup_data *sd; + unsigned long fdt_load_addr, fdt_sz; + int ret; + + fdt = fdt_get_runtime(); + if (!fdt) + return 0; + + fdt_sz = fdt_totalsize(fdt); + + kbuf->bufsz = kbuf->memsz = sizeof(struct setup_data) + fdt_sz; + + sd = kzalloc(kbuf->bufsz, GFP_KERNEL); + if (!sd) + return -ENOMEM; + + kbuf->buffer = sd; + kbuf->buf_align = PAGE_SIZE; + kbuf->buf_min = MIN_INITRD_LOAD_ADDR; + kbuf->mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_add_buffer(kbuf); + if (ret) + return ret; + + fdt_load_addr = kbuf->mem; + pr_debug("Loaded fdt 
at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + fdt_load_addr, fdt_sz, fdt_sz); + + sd->type = SETUP_DTB; + sd->len = fdt_sz; + memcpy(sd->data, fdt, fdt_sz); + + sd->next = params->hdr.setup_data; + params->hdr.setup_data = fdt_load_addr; + + return 0; +} +#else +static int kexec_setup_fdt(struct kexec_buf *kbuf, struct boot_params *params) +{ + return 0; +} +#endif static void *bzImage64_load(struct kimage *image, char *kernel, unsigned long kernel_len, char *initrd, unsigned long initrd_len, char *cmdline, @@ -561,6 +615,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel, if (ret) goto out_free_params; + ret = kexec_setup_fdt(, params); + if (ret) + goto out_free_params; + /* Allocate loader specific data */ ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL); if (!ldata) { ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [systemd-devel] [PATCH 0/1] x86/kexec: UKI support
On Mon Sep 18, 2023 at 6:41 PM EEST, Dimitri John Ledkov wrote: > On Tue, 12 Sept 2023 at 11:38, Jarkko Sakkinen wrote: > > > > On Tue Sep 12, 2023 at 2:20 AM EEST, Neal Gompa wrote: > On Mon, Sep 11, > > 2023 at 7:15 PM Jarkko Sakkinen wrote: > > > > On Sat Sep 9, 2023 at 7:18 PM EEST, Jan Hendrik Farr wrote: > > > > > Hello, > > > > > > > > > > this patch implements UKI support for kexec_file_load. It will > > > > > require support > > > > > in the kexec-tools userspace utility. For testing purposes the > > > > > following can be used: > > > > > https://github.com/Cydox/kexec-test/ > > > > > > > > > > There has been discussion on this topic in an issue on GitHub that is > > > > > linked below > > > > > for reference. > > > > > > > > > > > > > > > Some links: > > > > > - Related discussion: https://github.com/systemd/systemd/issues/28538 > > > > > - Documentation of UKIs: > > > > > https://uapi-group.org/specifications/specs/unified_kernel_image/ > > > > > > > > > > Jan Hendrik Farr (1): > > > > > x86/kexec: UKI support > > > > > > > > > > arch/x86/include/asm/kexec-uki.h | 7 ++ > > > > > arch/x86/include/asm/parse_pefile.h| 32 +++ > > > > > arch/x86/kernel/Makefile | 2 + > > > > > arch/x86/kernel/kexec-uki.c| 113 > > > > > + > > > > > arch/x86/kernel/machine_kexec_64.c | 2 + > > > > > arch/x86/kernel/parse_pefile.c | 110 > > > > > crypto/asymmetric_keys/mscode_parser.c | 2 +- > > > > > crypto/asymmetric_keys/verify_pefile.c | 110 +++- > > > > > crypto/asymmetric_keys/verify_pefile.h | 16 > > > > > 9 files changed, 278 insertions(+), 116 deletions(-) > > > > > create mode 100644 arch/x86/include/asm/kexec-uki.h > > > > > create mode 100644 arch/x86/include/asm/parse_pefile.h > > > > > create mode 100644 arch/x86/kernel/kexec-uki.c > > > > > create mode 100644 arch/x86/kernel/parse_pefile.c > > > > > > > > > > -- > > > > > 2.40.1 > > > > > > > > What the heck is UKI? > > > > > > Unified Kernel Images. 
More details available here: > > > https://uapi-group.org/specifications/specs/unified_kernel_image/ > > > > > > It's a way of creating initramfs-style images as fully generic, > > > reproducible images that can be built server-side. > > > > You can build today a kernel with these compiled in: > > > > 1. EFI stub > > 2. initeramfs > > 3. cmdline > > > > Why another way (and label 'UKI') for a pre-existing feature? > > > > In Ubuntu, we have considered to use the existing kernel features > before going off to use UKI. Here are some of the reasons why we > didn't opt to use the kernel builtin things: > 1) we wanted to have ability to have TPM measured kernel commandline > performed before kernel is being executed, which is what sd-stub > provides us OK this does make a lot of sense. > 2) we wanted to have ability to update / regenerate initrd, without > rebuilding kernel. Thus whenever userspace in the initrd needs > updating, we can generate new initrd for existing kernel build, create > new kernel.efi, whilst using existing .linux / vmlinuz build. I don't > believe it is currently trivial to relink vmlinuz with builtin initrd. > 3) licensing wise it was not clear if initrd has to be GPLv2 > compatible when linked inside vmlinuz, or if it can contain GPLv3 / > LGPLv3 userspace code - with UKI it is believed unambigiously true, > because vmlinuz boots by itself standalone and is compiled separately > of the UKI. Right UKI wraps kernel and kernel is a "leaf object". > 4) we wanted to have ability to override cmdline via kernel args > without secureboot, and use stock cmdline args under secureboot, to > allow debugging & production behaviour from a single signed kernel.efi > (that was custom development, and could be done in the stock vmlinuz > too). > 5) obvious mention, the intention here is to have TPM PCR measurements > and Secureboot signature for vmlinuz and initrd and cmdline and dtb. 
> There is otherwise no support for standalone signed initrd, cmdline, > dtb today. Nor does vendoring it into vmlinuz achieves this to the > same extent (and ease of predicting for sealing / resealing purposes). ok > 6) in Ubuntu kernel.efi also has sbat section for targeted revocations > (discussed separately elsewhere) > > Overall, it is mostly about flexibility to be able to reuse the same > initrd against multiple kernel builds, or update use multiple initrd > against the same kernel build. This is imho the biggest issue with > using initrd built-into the vmlinuz itself. > Resource wise, the initrd passed in via kernel.efi can be freed, as > far as I understand. I don't know if the one built-into the vmlinuz is > freeable. > > Improving design to do something else instead of UKI would be > welcomed. Or for example improving the zimg linus upstream format to > be a partial or a valid UKI would help as well. For example, building > the kernel built-in initrd
Re: [PATCH 0/2] Sign the Image which is zboot's payload
Hi Dave, On Fri, 22 Sep 2023 13:41:22 +0800 Dave Young wrote: > Hi Jan, > > On Fri, 22 Sept 2023 at 13:19, Jan Hendrik Farr wrote: > > > > Hi Pingfan! > > > > On 21 21:37:01, Pingfan Liu wrote: > > > From: Pingfan Liu > > > > > > > > For security boot, the vmlinuz.efi will be signed so UEFI boot loader > > > can check against it. But at present, there is no signature for kexec > > > file load, this series makes a signature on the zboot's payload -- Image > > > before it is compressed. As a result, the kexec-tools parses and > > > decompresses the Image.gz to get the Image, which has signature and can > > > be checked against during kexec file load > > > > I missed some of the earlier discussion about this zboot kexec support. > > So just let me know if I'm missing something here. You were exploring > > these two options in getting this supported: > > > > 1. Making kexec_file_load do all the work. > > > > This option makes the signature verification easy. kexec_file_load > > checks the signature on the pe file and then extracts it and does the > > kexec. > > > > This is similar to how I'm approaching UKI support in [1]. > > > > 2. Extract in userspace and pass decompressed kernel to kexec_file_load > > > > This options requires the decompressed kernel to have a valid signature on > > it. That's why this patch adds the ability to add that signature to the > > kernel contained inside the zboot image. > > > > This option would not make sense for UKI support as it would not > > validate the signature with respect to the initrd and cmdline that it > > contains. > > Another possibility for the cmdline could be using the bootconfig > facility which was > introduced for boot time tracking: > Documentation/admin-guide/bootconfig.rst > > So the initrd+cmdline can be signed as well. Has this been discussed > before for UKI? Not that I know of. But I'm not sure if the bootconfig the way it works today does the trick. 
For one, the bootconfig is simply glued to the end of the initrd. But that makes it part of the UKI as well. So there is no added gain. Plus, adding the cmdline to the UKI was done on purpose to prevent any unauthorized editing. That basically means that any change to the cmdline needs to be signed as well. But I don't see any signature verification while processing the bootconfig. Finally, the bootconfig is set up too late in the boot process, in particular after setup_arch, which reserves the crashkernel memory and needs to parse the kernel command line for that. An even more extreme example is the decompressor phase on s390. There the command line is parsed as well. And that is code that runs before start_kernel. All in all I don't believe that using the bootconfig adds much benefit for the UKI. Thanks Philipp
Re: [PATCH v2] Crash: add lock to serialize crash hotplug handling
On 9/24/23 22:07, Baoquan He wrote: Eric reported that handling corresponding crash hotplug event can be failed easily when many memory hotplug event are notified in a short period. They failed because failing to take __kexec_lock. === [ 78.714569] Fallback order for Node 0: 0 [ 78.714575] Built 1 zonelists, mobility grouping on. Total pages: 1817886 [ 78.717133] Policy zone: Normal [ 78.724423] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate [ 78.727207] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate [ 80.056643] PEFILE: Unsigned PE binary === The memory hotplug events are notified very quickly and very many, while the handling of crash hotplug is much slower relatively. So the atomic variable __kexec_lock and kexec_trylock() can't guarantee the serialization of crash hotplug handling. Here, add a new mutex lock __crash_hotplug_lock to serialize crash hotplug handling specifically. This doesn't impact the usage of __kexec_lock. Signed-off-by: Baoquan He --- v1->v2: - Move mutex lock definition into CONFIG_CRASH_HOTPLUG ifdeffery scope in kernel/crash_core.c because the lock is only needed and used in that scope. Suggested by Eric. kernel/crash_core.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 03a7932cde0a..5951d6366b72 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -739,6 +739,17 @@ subsys_initcall(crash_notes_memory_init); #undef pr_fmt #define pr_fmt(fmt) "crash hp: " fmt +/* + * Different than kexec/kdump loading/unloading/jumping/shrinking which + * usually rarely happen, there will be many crash hotplug events notified + * during one short period, e.g one memory board is hot added and memory + * regions are online. So mutex lock __crash_hotplug_lock is used to + * serialize the crash hotplug handling specifically. 
+ */ +DEFINE_MUTEX(__crash_hotplug_lock); +#define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock) +#define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock) + /* * This routine utilized when the crash_hotplug sysfs node is read. * It reflects the kernel's ability/permission to update the crash @@ -783,9 +794,11 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) { struct kimage *image; + crash_hotplug_lock(); /* Obtain lock while changing crash information */ if (!kexec_trylock()) { pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n"); + crash_hotplug_unlock(); return; } @@ -852,6 +865,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) out: /* Release lock now that update complete */ kexec_unlock(); + crash_hotplug_unlock(); } static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v) The crash_check_update_elfcorehdr() also has kexec_trylock() and needs similar treatment. Userspace (i.e. udev rule processing) and kernel (crash hotplug infrastructure) need to be protected/serialized from one another. Eric ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2 0/2] x86/kexec: UKI Support
Hi Jan, On Thu, 21 Sep 2023 00:02:25 +0200 Jan Hendrik Farr wrote: [...] > > Maybe we should do a BoF at LPC to discuss this further? > > I definitely won't be at LPC, is it possible to join virtually? Yes, LPC will be hybrid again this year. Virtual access costs $50 although you can apply for a 50% discount when you are a non-professional. https://lpc.events/event/17/page/212-attend Thanks Philipp ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 0/2] Sign the Image which is zboot's payload
On Mon, 25 Sept 2023 at 03:01, Pingfan Liu wrote: > > On Fri, Sep 22, 2023 at 1:19 PM Jan Hendrik Farr wrote: > > ... > > I missed some of the earlier discussion about this zboot kexec support. > > So just let me know if I'm missing something here. You were exploring > > these two options in getting this supported: > > > > 1. Making kexec_file_load do all the work. > > > > This option makes the signature verification easy. kexec_file_load > > checks the signature on the pe file and then extracts it and does the > > kexec. > > > > This is similar to how I'm approaching UKI support in [1]. > > > > Yes, that is my original try. > > > 2. Extract in userspace and pass decompressed kernel to kexec_file_load > > > > This option requires the decompressed kernel to have a valid signature on > > it. That's why this patch adds the ability to add that signature to the > > kernel contained inside the zboot image. > > > > You got it. > > > This option would not make sense for UKI support as it would not > > validate the signature with respect to the initrd and cmdline that it > > contains. Am I correct in thinking that there is no similar issue with > > zboot images? They don't contain any more information besides the kernel > > that is intended to be securely signed, right? Do you have a reference > > If using my second method, it means to unpack the UKI image in user > space, and pass the kernel image, initrd and cmdline through > kexec_file_load interface. If the UKI can have signature on the initrd > and cmdline, we extend the capability of that interface to check those > verification. > > > for the zboot image layout somewhere? > > > > Sorry that maybe there is no document. I understand them through the code. 
> The zboot image, aka, vmlinuz.efi looks like: > PE header, which is formed manually in arch/arm64/kernel/head.S > EFI decompressor, which consists of > drivers/firmware/efi/libstub/zboot.c and libstub > Image.gz, which is formed by compressing Image as instructed in Makefile.zboot > > Indeed, this is currently only documented in code. zboot is a PE executable that decompresses the kernel and boots it, but it also carries the base and size of the compressed payload in its header, along with the compression type so non-EFI loaders can run it as well (QEMU implements this for gzip on arm64) > > > I hesitate to post this series, > > > > I appreciate you sending it, it's helping the discussion along. > > Absolutely. RFCs are important because nobody knows how exactly the code will look until someone takes the time to implement it. So your work on this is much appreciated, even if we may decide to take another approach down the road. > > > [...] since Ard has recommended using an > > > emulated UEFI boot service to resolve the UKI kexec load problem [1]. > > > since on aarch64, vmlinuz.efi has faced the similar issue at present. > > > But anyway, I have a crude outline of it and am sending it out for > > > discussion. > > > > The more I'm thinking about it, the more I like Ard's idea. There's now > > already two different formats trying to be added to kexec that are > > pretty different from each other, yet they both have the UEFI interface > > in common. I think if the kernel supported kexec'ing EFI applications > > that would be a more flexible and forward-looking approach. It's a > > Yes, I agree. That method is attractive, originally I had a try when > Ard suggested it but there was no clear boundary on which boot service > should be implemented for zboot, so I did not move on along that > direction. > > Now, UKI poses another challenge to kexec_file_load, and seems to > require more than zboot. 
And it appears that Ard's approach is a > silver bullet for that issue. > Yes, it looks appealing but it will take some time to iterate on ideas and converge on an implementation. > > standard that both zboot and UKI as well as all future formats for UEFI > > platforms will support anyways. So while it's more work right now to > > implement, I think it'll likely pay off. > > > > It is significantly more work than the other options though. So I think > > before work is started on it, it would be nice to get some type of > > consensus on these things (not an exhaustive list, please feel free to > > add to it): > > > > I try to answer part of the questions. > > > 1. Is it the right approach? It adds a significant amount of userspace > > API. > > My crude assumption: this new stub will replace the purgatory, and I > am not sure whether kexec-tools source tree will accommodate it. It > can be signed and checked during the kexec_file_load. > > > 2. What subset of the UEFI spec needs/should to be supported? > > 3. Can we let runtime services still be handled by the firmware after > > exiting boot services? > > I think the runtime services survive through the kexec process. It is > derived from the real firmware, not related with this stub > Yes, this should be possible. > > 4. How can we debug the stubs that are
[PATCHv7 4/4] powerpc/setup: alloc extra paca_ptrs to hold boot_cpuid
paca_ptrs should be large enough to hold the boot_cpuid, hence, its lower boundary is set to the bigger one between boot_cpuid+1 and nr_cpus. On the other hand, some kernel component: -1. the timer assumes cpu0 online since the timer_list->flags subfield 'TIMER_CPUMASK' is zero if not initialized to a proper present cpu. -2. power9_idle_stop() assumes the primary thread's paca is allocated. Hence lift nr_cpu_ids from one to two to ensure cpu0 is onlined, if the boot cpu is not cpu0. Result: When nr_cpus=1, taskset -c 14 bash -c 'echo c > /proc/sysrq-trigger' the kdump kernel brings up two cpus. While when taskset -c 4 bash -c 'echo c > /proc/sysrq-trigger', the kdump kernel brings up one cpu. Signed-off-by: Pingfan Liu Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Mahesh Salgaonkar Cc: Wen Xiong Cc: Baoquan He Cc: Ming Lei Cc: kexec@lists.infradead.org To: linuxppc-...@lists.ozlabs.org --- arch/powerpc/kernel/paca.c | 10 ++ arch/powerpc/kernel/prom.c | 9 ++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index cda4e00b67c1..91e2401de1bd 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -242,9 +242,10 @@ static int __initdata paca_struct_size; void __init allocate_paca_ptrs(void) { - paca_nr_cpu_ids = nr_cpu_ids; + int n = (boot_cpuid + 1) > nr_cpu_ids ? (boot_cpuid + 1) : nr_cpu_ids; - paca_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids; + paca_nr_cpu_ids = n; + paca_ptrs_size = sizeof(struct paca_struct *) * n; paca_ptrs = memblock_alloc_raw(paca_ptrs_size, SMP_CACHE_BYTES); if (!paca_ptrs) panic("Failed to allocate %d bytes for paca pointers\n", @@ -287,13 +288,14 @@ void __init allocate_paca(int cpu) void __init free_unused_pacas(void) { int new_ptrs_size; + int n = (boot_cpuid + 1) > nr_cpu_ids ? 
(boot_cpuid + 1) : nr_cpu_ids; - new_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids; + new_ptrs_size = sizeof(struct paca_struct *) * n; if (new_ptrs_size < paca_ptrs_size) memblock_phys_free(__pa(paca_ptrs) + new_ptrs_size, paca_ptrs_size - new_ptrs_size); - paca_nr_cpu_ids = nr_cpu_ids; + paca_nr_cpu_ids = n; paca_ptrs_size = new_ptrs_size; #ifdef CONFIG_PPC_64S_HASH_MMU diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 87272a2d8c10..15c994f54bf9 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -362,9 +362,12 @@ static int __init early_init_dt_scan_cpus(unsigned long node, */ boot_cpuid = i; found = true; - /* This works around the hole in paca_ptrs[]. */ - if (nr_cpu_ids < nthreads) - set_nr_cpu_ids(nthreads); + /* +* Ideally, nr_cpus=1 can be achieved if each kernel +* component does not assume cpu0 is onlined. +*/ + if (boot_cpuid != 0 && nr_cpu_ids < 2) + set_nr_cpu_ids(2); } #ifdef CONFIG_SMP /* logical cpu id is always 0 on UP kernels */ -- 2.31.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCHv7 1/4] powerpc/setup : Enable boot_cpu_hwid for PPC32
In order to identify the boot cpu, its intserv[] should be recorded and checked in smp_setup_cpu_maps(). smp_setup_cpu_maps() is shared between PPC64 and PPC32. Since PPC64 has already used boot_cpu_hwid to carry that information, enabling this variable on PPC32 so later it can also be used to carry that information for PPC32 in the coming patch. Signed-off-by: Pingfan Liu Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Mahesh Salgaonkar Cc: Wen Xiong Cc: Baoquan He Cc: Ming Lei Cc: kexec@lists.infradead.org To: linuxppc-...@lists.ozlabs.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202309130232.n2rewhbv-...@intel.com/ --- arch/powerpc/include/asm/smp.h | 2 +- arch/powerpc/kernel/prom.c | 3 +-- arch/powerpc/kernel/setup-common.c | 2 -- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 576d0e15..5db9178cc800 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -26,7 +26,7 @@ #include extern int boot_cpuid; -extern int boot_cpu_hwid; /* PPC64 only */ +extern int boot_cpu_hwid; extern int spinning_secondaries; extern u32 *cpu_to_phys_id; extern bool coregroup_enabled; diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 0b5878c3125b..ec82f5bda908 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -372,8 +372,7 @@ static int __init early_init_dt_scan_cpus(unsigned long node, be32_to_cpu(intserv[found_thread])); boot_cpuid = found; - if (IS_ENABLED(CONFIG_PPC64)) - boot_cpu_hwid = be32_to_cpu(intserv[found_thread]); + boot_cpu_hwid = be32_to_cpu(intserv[found_thread]); /* * PAPR defines "logical" PVR values for cpus that diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index d2a446216444..1b19a9815672 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -87,9 +87,7 @@ 
EXPORT_SYMBOL(machine_id); int boot_cpuid = -1; EXPORT_SYMBOL_GPL(boot_cpuid); -#ifdef CONFIG_PPC64 int boot_cpu_hwid = -1; -#endif /* * These are used in binfmt_elf.c to put aux entries on the stack -- 2.31.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCHv7 2/4] powerpc/setup: Loosen the mapping between cpu logical id and its seq in dt
*** Idea *** For kexec -p, the boot cpu may not be cpu0, which causes a problem when allocating memory for paca_ptrs[]. However, in theory, there is no requirement to assign a cpu's logical id according to its order of appearance in the device tree. But there are helpers such as cpu_first_thread_sibling(), which make assumptions about the mapping inside a core. Hence this patch partially loosens the mapping, i.e. it unbinds the mapping of cores while keeping the mapping inside a core. *** Implement *** At this early stage, there is plenty of memory to utilize. Hence, this patch allocates interim memory to link the cpu info on a list, then reorders cpus by changing the list head. As a result, there is a rotate shift between the sequence number in dt and the cpu logical number. *** Result *** After this patch, a boot-cpu's logical id will always be mapped into the range [0,threads_per_core). Besides this, at this phase, all threads in the boot core are forced to be onlined. This restriction will be lifted in a later patch with extra effort.
Signed-off-by: Pingfan Liu Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Mahesh Salgaonkar Cc: Wen Xiong Cc: Baoquan He Cc: Ming Lei Cc: kexec@lists.infradead.org To: linuxppc-...@lists.ozlabs.org --- arch/powerpc/kernel/prom.c | 25 + arch/powerpc/kernel/setup-common.c | 87 +++--- 2 files changed, 85 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index ec82f5bda908..87272a2d8c10 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -76,7 +76,9 @@ u64 ppc64_rma_size; unsigned int boot_cpu_node_count __ro_after_init; #endif static phys_addr_t first_memblock_size; +#ifdef CONFIG_SMP static int __initdata boot_cpu_count; +#endif static int __init early_parse_mem(char *p) { @@ -331,8 +333,7 @@ static int __init early_init_dt_scan_cpus(unsigned long node, const __be32 *intserv; int i, nthreads; int len; - int found = -1; - int found_thread = 0; + bool found = false; /* We are scanning "cpu" nodes only */ if (type == NULL || strcmp(type, "cpu") != 0) @@ -355,8 +356,15 @@ static int __init early_init_dt_scan_cpus(unsigned long node, for (i = 0; i < nthreads; i++) { if (be32_to_cpu(intserv[i]) == fdt_boot_cpuid_phys(initial_boot_params)) { - found = boot_cpu_count; - found_thread = i; + /* +* always map the boot-cpu logical id into the +* range of [0, thread_per_core) +*/ + boot_cpuid = i; + found = true; + /* This works around the hole in paca_ptrs[]. 
*/ + if (nr_cpu_ids < nthreads) + set_nr_cpu_ids(nthreads); } #ifdef CONFIG_SMP /* logical cpu id is always 0 on UP kernels */ @@ -365,14 +373,13 @@ static int __init early_init_dt_scan_cpus(unsigned long node, } /* Not the boot CPU */ - if (found < 0) + if (!found) return 0; - DBG("boot cpu: logical %d physical %d\n", found, - be32_to_cpu(intserv[found_thread])); - boot_cpuid = found; + DBG("boot cpu: logical %d physical %d\n", boot_cpuid, + be32_to_cpu(intserv[boot_cpuid])); - boot_cpu_hwid = be32_to_cpu(intserv[found_thread]); + boot_cpu_hwid = be32_to_cpu(intserv[boot_cpuid]); /* * PAPR defines "logical" PVR values for cpus that diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 1b19a9815672..f6d32324b5a5 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -425,6 +426,13 @@ static void __init cpu_init_thread_core_maps(int tpc) u32 *cpu_to_phys_id = NULL; +struct interrupt_server_node { + struct list_head node; + boolavail; + int len; + __be32 *intserv; +}; + /** * setup_cpu_maps - initialize the following cpu maps: * cpu_possible_mask @@ -446,11 +454,16 @@ u32 *cpu_to_phys_id = NULL; void __init smp_setup_cpu_maps(void) { struct device_node *dn; - int cpu = 0; - int nthreads = 1; + int shift = 0, cpu = 0; + int j, nthreads = 1; + int len; + struct interrupt_server_node *intserv_node, *n; + struct list_head *bt_node, head; + bool avail, found_boot_cpu = false; DBG("smp_setup_cpu_maps()\n"); + INIT_LIST_HEAD(); cpu_to_phys_id = memblock_alloc(nr_cpu_ids * sizeof(u32), __alignof__(u32)); if (!cpu_to_phys_id) @@ -460,7 +473,6 @@ void __init
[PATCHv7 3/4] powerpc/setup: Handle the case when boot_cpuid greater than nr_cpus
If the boot_cpuid is not smaller than nr_cpus, it requires extra effort to ensure the boot_cpu is in cpu_present_mask. This can be achieved by reserving the last quota for the boot cpu. Note: the restriction on nr_cpus will be lifted with more effort in the next patch. Signed-off-by: Pingfan Liu Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Mahesh Salgaonkar Cc: Wen Xiong Cc: Baoquan He Cc: Ming Lei Cc: kexec@lists.infradead.org To: linuxppc-...@lists.ozlabs.org --- arch/powerpc/kernel/setup-common.c | 25 ++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index f6d32324b5a5..a72d00a6cff2 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -454,8 +454,8 @@ struct interrupt_server_node { void __init smp_setup_cpu_maps(void) { struct device_node *dn; - int shift = 0, cpu = 0; - int j, nthreads = 1; + int terminate, shift = 0, cpu = 0; + int j, bt_thread = 0, nthreads = 1; int len; struct interrupt_server_node *intserv_node, *n; struct list_head *bt_node, head; @@ -518,6 +518,7 @@ void __init smp_setup_cpu_maps(void) for (j = 0 ; j < nthreads; j++) { if (be32_to_cpu(intserv[j]) == boot_cpu_hwid) { bt_node = _node->node; + bt_thread = j; found_boot_cpu = true; /* * Record the round-shift between dt @@ -537,11 +538,21 @@ void __init smp_setup_cpu_maps(void) /* Select the primary thread, the boot cpu's slibing, as the logic 0 */ list_add_tail(, bt_node); pr_info("the round shift between dt seq and the cpu logic number: %d\n", shift); + terminate = nr_cpu_ids; list_for_each_entry(intserv_node, , node) { + j = 0; + /* Choose a start point to cover the boot cpu */ + if (nr_cpu_ids - 1 < bt_thread) { + /* +* The processor core puts assumption on the thread id, +* not to breach the assumption. 
+*/ + terminate = nr_cpu_ids - 1; + } avail = intserv_node->avail; nthreads = intserv_node->len / sizeof(int); - for (j = 0; j < nthreads && cpu < nr_cpu_ids; j++) { + for (; j < nthreads && cpu < terminate; j++) { set_cpu_present(cpu, avail); set_cpu_possible(cpu, true); cpu_to_phys_id[cpu] = be32_to_cpu(intserv_node->intserv[j]); @@ -549,6 +560,14 @@ void __init smp_setup_cpu_maps(void) j, cpu, be32_to_cpu(intserv[j])); cpu++; } + /* Online the boot cpu */ + if (nr_cpu_ids - 1 < bt_thread) { + set_cpu_present(bt_thread, avail); + set_cpu_possible(bt_thread, true); + cpu_to_phys_id[bt_thread] = be32_to_cpu(intserv_node->intserv[bt_thread]); + DBG("thread %d -> cpu %d (hard id %d)\n", + bt_thread, bt_thread, be32_to_cpu(intserv[bt_thread])); + } } list_for_each_entry_safe(intserv_node, n, , node) { -- 2.31.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCHv7 0/4] enable nr_cpus for powerpc
Since my last v4 [1], the code has undergone great changes. The paca[] array has been reorganized and indexed by paca_ptrs[], which dramatically decreases the memory consumption even if there are many non-present cpus in the middle. However, reordering the logical cpu numbers can further decrease the size of paca_ptrs[] in the kdump case. So I keep [2/4], which rotate-shifts the cpu's sequence number in the device tree to obtain the logical cpu id. Patches [3-4/4] make it possible to decrease nr_cpus to less than or equal to two. [1]: https://lore.kernel.org/linuxppc-dev/1520829790-14029-1-git-send-email-kernelf...@gmail.com/ --- v6 -> v7 Add [1/4], which fixes a compilation error on PPC32 Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Mahesh Salgaonkar Cc: Wen Xiong Cc: Baoquan He Cc: Ming Lei Cc: kexec@lists.infradead.org To: linuxppc-...@lists.ozlabs.org Pingfan Liu (4): powerpc/setup : Enable boot_cpu_hwid for PPC32 powerpc/setup: Loosen the mapping between cpu logical id and its seq in dt powerpc/setup: Handle the case when boot_cpuid greater than nr_cpus powerpc/setup: alloc extra paca_ptrs to hold boot_cpuid arch/powerpc/include/asm/smp.h | 2 +- arch/powerpc/kernel/paca.c | 10 +-- arch/powerpc/kernel/prom.c | 29 +--- arch/powerpc/kernel/setup-common.c | 108 +++-- 4 files changed, 114 insertions(+), 35 deletions(-) -- 2.31.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec