[RFC PATCH v2 5/7] pmpool: Update device tree on kexec

2023-09-25 Thread Stanislav Kinsburskii
From: Stanislav Kinsburskii 

Introduce a pmpool kexec fdt notifier that enables pmpool to pass its
metadata, including the bitmap address, to the new kernel during kexec.

Signed-off-by: Stanislav Kinsburskii 
---
 mm/Kconfig  |1 +
 mm/pmpool.c |   64 ++-
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index e7c10094fb10..1eefdd4c82ba 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -925,6 +925,7 @@ config CMA_AREAS
 config PMPOOL
bool "Persistent memory pool support"
select CMA
+   select LIBFDT
help
  This option adds support for CMA-based persistent memory pool
  feature, which provides pages allocation and freeing from a set of
diff --git a/mm/pmpool.c b/mm/pmpool.c
index 12a8cac75558..f2173db782d6 100644
--- a/mm/pmpool.c
+++ b/mm/pmpool.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -58,6 +59,59 @@ static int __init default_pmpool_fixup_cma(void)
 }
 postcore_initcall(default_pmpool_fixup_cma);
 
+static int pmpool_fdt_update(struct notifier_block *nb, unsigned long val,
+void *data)
+{
+   void *fdt = data;
+   int node, status;
+
+   if (!fdt)
+   goto err;
+
+   node = fdt_subnode_offset(fdt, 0, "chosen");
+   if (node < 0) {
+   node = fdt_add_subnode(fdt, 0, "chosen");
+   if (node < 0)
+   goto err;
+   }
+
+   node = fdt_add_subnode(fdt, node, "default_pmpool");
+   if (node == -FDT_ERR_EXISTS)
+   return 0;
+   if (node < 0)
+   goto err;
+
+   status = fdt_setprop(fdt, node, "compatible",
+"pmpool", sizeof("pmpool"));
+   if (status)
+   goto err;
+
+   status = fdt_setprop_u64(fdt, node, "bitmap",
+virt_to_phys(default_pmpool->cma->bitmap));
+   if (status)
+   goto err;
+
+   status = fdt_setprop_u64(fdt, node, "size",
+default_pmpool->cma->count << PAGE_SHIFT);
+   if (status)
+   goto err;
+
+   status = fdt_setprop_u64(fdt, node, "base",
+default_pmpool->cma->base_pfn << PAGE_SHIFT);
+   if (status)
+   goto err;
+
+   return NOTIFY_DONE;
+
+err:
+   pr_err("failed to update fdt\n");
+   return NOTIFY_DONE;
+}
+
+static struct notifier_block pmpool_kexec_fdt_nb = {
+   .notifier_call  = pmpool_fdt_update,
+};
+
 static int __init parse_pmpool_opt(char *str)
 {
static struct pmpool pmpool;
@@ -80,10 +134,16 @@ static int __init parse_pmpool_opt(char *str)
return 0;
}
 
+   err = register_kexec_fdt_notifier(_kexec_fdt_nb);
+   if (err) {
+   pr_err("failed to register kexec fdt notifier: %d\n", err);
+   goto free_memblock;
+   }
+
err = cma_init_reserved_mem(base, size, 0, "pmpool", );
if (err) {
pr_err("failed to initialize CMA: %d\n", err);
-   goto free_memblock;
+   goto notifier_unregister;
}
 
pr_info("default memory pool is created: %#llx-%#llx\n",
@@ -93,6 +153,8 @@ static int __init parse_pmpool_opt(char *str)
 
return 0;
 
+notifier_unregister:
+   unregister_kexec_fdt_notifier(_kexec_fdt_nb);
 free_memblock:
memblock_phys_free(base, size);
return 0;



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[RFC PATCH v2 7/7] Drivers: hv: Allocate persistent pages for root partition

2023-09-25 Thread Stanislav Kinsburskii
Deposited pages are owned by the hypervisor. Accessing them can trigger a
kernel panic due to a general protection fault.

This patch ensures that pages for the root partition are allocated from the
persistent memory pool. This allocation guarantees stability post-kexec,
protecting hypervisor-deposited pages from unintended reuse by the new
kernel.

Signed-off-by: Stanislav Kinsburskii 
---
 drivers/hv/hv_common.c |   13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 335aec5ec504..a81c5613e745 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -426,7 +426,10 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 
num_pages)
order = 31 - __builtin_clz(num_pages);
 
while (1) {
-   pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
+   if (paritition_id == hv_current_partition_id)
+   pages[i] = pmpool_alloc(1 << order);
+   else
+   pages[i] = alloc_pages_node(node, GFP_KERNEL, 
order);
if (pages[i])
break;
if (!order) {
@@ -471,8 +474,12 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 
num_pages)
 err_free_allocations:
for (i = 0; i < num_allocations; ++i) {
base_pfn = page_to_pfn(pages[i]);
-   for (j = 0; j < counts[i]; ++j)
-   __free_page(pfn_to_page(base_pfn + j));
+   for (j = 0; j < counts[i]; ++j) {
+   if (paritition_id == hv_current_partition_id)
+   pmpool_release(pages[i], counts[i]);
+   else
+   __free_page(pfn_to_page(base_pfn + j));
+   }
}
 
 free_buf:



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[RFC PATCH v2 6/7] pmpool: Restore state from device tree post-kexec

2023-09-25 Thread Stanislav Kinsburskii
From: Stanislav Kinsburskii 

Retrieve the pmpool bitmap from metadata in the fdt passed over kexec,
bypassing the need for reinitialization. This ensures the seamless transfer
of the pmpool state across kexec.

Signed-off-by: Stanislav Kinsburskii 
---
 mm/pmpool.c |   46 ++
 1 file changed, 46 insertions(+)

diff --git a/mm/pmpool.c b/mm/pmpool.c
index f2173db782d6..6c1a28fd3493 100644
--- a/mm/pmpool.c
+++ b/mm/pmpool.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "cma.h"
@@ -49,11 +50,56 @@ static void pmpool_fixup_cma(struct cma *cma)
pr_info("CMA bitmap moved to %#llx\n", virt_to_phys(cma->bitmap));
 }
 
+static int pmpool_fdt_restore(struct cma *cma)
+{
+   struct device_node *dn;
+   u64 val;
+
+   dn = of_find_compatible_node(NULL, NULL, "pmpool");
+   if (!dn)
+   return -ENOENT;
+
+   if (of_property_read_u64(dn, "base", )) {
+   pr_err("invalid fdt: no base\n");
+   return -EINVAL;
+   }
+   if (val != PFN_PHYS(cma->base_pfn)) {
+   pr_err("fdt base doesn't match: %#llx != %#llx\n",
+   val, PFN_PHYS(cma->base_pfn));
+   return -EINVAL;
+   }
+
+   if (of_property_read_u64(dn, "size", )) {
+   pr_err("invalid fdt: no size\n");
+   return -EINVAL;
+   }
+   if (val != (cma->count << PAGE_SHIFT)) {
+   pr_err("fdt size doesn't match: %#llx != %#lx\n",
+   val, cma->count << PAGE_SHIFT);
+   return -EINVAL;
+   }
+
+   if (of_property_read_u64(dn, "bitmap", )) {
+   pr_err("invalid fdt: no bitmap\n");
+   return -EINVAL;
+   }
+
+   pr_info("CMA bitmap restored to %#llx\n", val);
+
+   bitmap_free(cma->bitmap);
+   cma->bitmap = phys_to_virt(val);
+
+   return 0;
+}
+
 static int __init default_pmpool_fixup_cma(void)
 {
if (!default_pmpool)
return 0;
 
+   if (!pmpool_fdt_restore(default_pmpool->cma))
+   return 0;
+
pmpool_fixup_cma(default_pmpool->cma);
return 0;
 }



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[RFC PATCH v2 4/7] pmpool: Introduce persistent memory pool

2023-09-25 Thread Stanislav Kinsburskii
From: Stanislav Kinsburskii 

This patch introduces a memory allocator specifically tailored for
persistent memory within the kernel. The allocator maintains
kernel-specific states like DMA passthrough device states, IOMMU state, and
more across kexec.

The current implementation provides a foundation for custom solutions that
may be developed in the future. Although the design is kept concise and
straightforward to encourage discussion and feedback, it remains fully
functional.

The persistent memory pool builds upon the contiguous memory allocator
(CMA) and ensures CMA state persistence across kexec by incorporating the
CMA bitmap into the memory region.

Potential applications include:

  1. Enabling various in-kernel entities to allocate persistent pages from
 a unified memory pool, obviating the need for reserving multiple
 regions.

  2. For in-kernel components that need the allocation address to be
 retained on kernel kexec, this address can be exposed to user space
 and subsequently passed through the command line.

  3. Distinct subsystems or drivers can set aside their region, allocating
 a segment for their persistent memory pool, suitable for uses such as
 file systems, key-value stores, and other applications.

Signed-off-by: Stanislav Kinsburskii 
---
 include/linux/pmpool.h |   22 +++
 mm/Kconfig |8 
 mm/Makefile|1 
 mm/pmpool.c|  100 
 4 files changed, 131 insertions(+)
 create mode 100644 include/linux/pmpool.h
 create mode 100644 mm/pmpool.c

diff --git a/include/linux/pmpool.h b/include/linux/pmpool.h
new file mode 100644
index ..b41f16fa9660
--- /dev/null
+++ b/include/linux/pmpool.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _PMPOOL_H
+#define _PMPOOL_H
+
+struct page;
+
+#if defined(CONFIG_PMPOOL)
+struct page *pmpool_alloc(unsigned long count);
+bool pmpool_release(struct page *pages, unsigned long count);
+#else
+static inline struct page *pmpool_alloc(unsigned long count)
+{
+   return NULL;
+}
+static inline bool pmpool_release(struct page *pages, unsigned long count)
+{
+   return false;
+}
+#endif
+
+#endif /* _PMPOOL_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 09130434e30d..e7c10094fb10 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -922,6 +922,14 @@ config CMA_AREAS
 
  If unsure, leave the default value "7" in UMA and "19" in NUMA.
 
+config PMPOOL
+   bool "Persistent memory pool support"
+   select CMA
+   help
+ This option adds support for CMA-based persistent memory pool
+ feature, which provides pages allocation and freeing from a set of
+ persistent memory ranges, deposited to the memory pool.
+
 config MEM_SOFT_DIRTY
bool "Track memory changes"
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
diff --git a/mm/Makefile b/mm/Makefile
index 678530a07326..8d3579e58c2c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -139,3 +139,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
 obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
+obj-$(CONFIG_PMPOOL) += pmpool.o
diff --git a/mm/pmpool.c b/mm/pmpool.c
new file mode 100644
index ..12a8cac75558
--- /dev/null
+++ b/mm/pmpool.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "pmpool: " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "cma.h"
+
+struct pmpool {
+   struct cma *cma;
+};
+
+static struct pmpool *default_pmpool;
+
+bool pmpool_release(struct page *pages, unsigned long count)
+{
+   if (!default_pmpool)
+   return false;
+
+   return cma_release(default_pmpool->cma, pages, count);
+}
+
+struct page *pmpool_alloc(unsigned long count)
+{
+   if (!default_pmpool)
+   return NULL;
+
+   return cma_alloc(default_pmpool->cma, count, 0, true);
+}
+
+static void pmpool_fixup_cma(struct cma *cma)
+{
+   unsigned long bitmap_size;
+
+   bitmap_free(cma->bitmap);
+   cma->bitmap = phys_to_virt(PFN_PHYS(cma->base_pfn));
+
+   bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma));
+   memset(cma->bitmap, 0, bitmap_size);
+   bitmap_set(cma->bitmap, 0, PAGE_ALIGN(bitmap_size) >> PAGE_SHIFT);
+
+   pr_info("CMA bitmap moved to %#llx\n", virt_to_phys(cma->bitmap));
+}
+
+static int __init default_pmpool_fixup_cma(void)
+{
+   if (!default_pmpool)
+   return 0;
+
+   pmpool_fixup_cma(default_pmpool->cma);
+   return 0;
+}
+postcore_initcall(default_pmpool_fixup_cma);
+
+static int __init parse_pmpool_opt(char *str)
+{
+   static struct pmpool pmpool;
+   phys_addr_t base, size;
+   int err;
+
+   /* Format is pmpool=, */
+   base = memparse(str, );
+   size = memparse(str + 1, NULL);
+
+   

[RFC PATCH v2 3/7] x86: kexec: Enable fdt modification in callbacks

2023-09-25 Thread Stanislav Kinsburskii
From: Stanislav Kinsburskii 

This option allows kernel subsystems to modify (or create, if necessary)
the Flattened Device Tree (fdt) using registered callbacks and then pass
the modified version to the new kernel.

Signed-off-by: Stanislav Kinsburskii 
---
 arch/x86/Kconfig  |8 +++
 arch/x86/kernel/kexec-bzimage64.c |   41 -
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index efb472e267ec..90da51fbb8f8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2078,6 +2078,14 @@ config KEXEC_FILE_FDT
  This option enables passing existent Flattened Device Tree to the new
  kernel when kexec is invoked by the file based system call.
 
+config KEXEC_FILE_FDT_CALLBACK
+   bool "Enable kexec fdt modification support"
+   depends on KEXEC_FILE_FDT
+   select LIBFDT
+   help
+ This option enables Flattened Device Tree modification (and creation
+ if needed) by kernel subsystems, registered corresponding callback.
+
 config ARCH_HAS_KEXEC_PURGATORY
def_bool KEXEC_FILE
 
diff --git a/arch/x86/kernel/kexec-bzimage64.c 
b/arch/x86/kernel/kexec-bzimage64.c
index ab9ae02c9a5f..3c6df28d3637 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -384,11 +384,50 @@ static int bzImage64_probe(const char *buf, unsigned long 
len)
return ret;
 }
 #ifdef CONFIG_KEXEC_FILE_FDT
+#ifdef CONFIG_KEXEC_FILE_FDT_CALLBACK
+static void *fdt_get_runtime(void)
+{
+   void *fdt;
+   size_t fdt_size = SZ_2M;
+   int status;
+
+   /* It's nothing to do without existent fdt and any callbacks */
+   if (!initial_boot_params && kexec_fdt_notify_list_empty())
+   return NULL;
+
+   fdt = kzalloc(fdt_size, GFP_KERNEL);
+   if (!fdt)
+   return NULL;
+
+   if (initial_boot_params)
+   status = fdt_open_into(initial_boot_params, fdt, fdt_size);
+   else
+   status = fdt_create_empty_tree(fdt, fdt_size);
+   if (status != 0) {
+   pr_err("failed to get fdt\n");
+   goto free_fdt;
+   }
+
+   status = kexec_fdt_notify(fdt);
+   if (status) {
+   pr_err("fdt notification failed\n");
+   goto free_fdt;
+   }
+
+   fdt_pack(fdt);
+
+   return fdt;
+
+free_fdt:
+   kfree(fdt);
+   return NULL;
+}
+#else
 static void *fdt_get_runtime(void)
 {
return initial_boot_params;
 }
-
+#endif
 static int kexec_setup_fdt(struct kexec_buf *kbuf, struct boot_params *params)
 {
void *fdt;



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[RFC PATCH v2 1/7] kexec_file: Add fdt modification callback support

2023-09-25 Thread Stanislav Kinsburskii
From: Stanislav Kinsburskii 

Introduce primitives to:
- Register and unregister callbacks for flattened device tree (fdt)
  modifications.
- Invoke all registered callbacks.
- Check for any registered callbacks.

These enhancements enable the use of a device tree to store kernel bits.

Signed-off-by: Stanislav Kinsburskii 
---
 include/linux/kexec.h |7 +++
 kernel/kexec_file.c   |   24 
 2 files changed, 31 insertions(+)

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 22b5cd24f581..c9c70551796d 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -282,6 +282,13 @@ arch_kexec_apply_relocations(struct purgatory_info *pi, 
Elf_Shdr *section,
return -ENOEXEC;
 }
 #endif
+
+struct notifier_block;
+extern int register_kexec_fdt_notifier(struct notifier_block *nb);
+extern int unregister_kexec_fdt_notifier(struct notifier_block *nb);
+extern bool kexec_fdt_notify_list_empty(void);
+extern int kexec_fdt_notify(void *fdt);
+
 #endif /* CONFIG_KEXEC_FILE */
 
 #ifdef CONFIG_KEXEC_ELF
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 881ba0d1714c..f9245d5e4459 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -43,6 +43,30 @@ static int kexec_calculate_store_digests(struct kimage 
*image);
 /* Maximum size in bytes for kernel/initrd files. */
 #define KEXEC_FILE_SIZE_MAXmin_t(s64, 4LL << 30, SSIZE_MAX)
 
+static BLOCKING_NOTIFIER_HEAD(kexec_fdt_notify_list);
+
+bool kexec_fdt_notify_list_empty(void)
+{
+   return kexec_fdt_notify_list.head == NULL;
+}
+
+int kexec_fdt_notify(void *fdt)
+{
+   return blocking_notifier_call_chain(_fdt_notify_list, 0, fdt);
+}
+
+int register_kexec_fdt_notifier(struct notifier_block *nb)
+{
+   return blocking_notifier_chain_register(_fdt_notify_list, nb);
+}
+EXPORT_SYMBOL(register_kexec_fdt_notifier);
+
+int unregister_kexec_fdt_notifier(struct notifier_block *nb)
+{
+   return blocking_notifier_chain_unregister(_fdt_notify_list, nb);
+}
+EXPORT_SYMBOL(unregister_kexec_fdt_notifier);
+
 /*
  * Currently this is the only default function that is exported as some
  * architectures need it to do additional handlings.



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[RFC PATCH v2 0/7] Introduce persistent memory pool

2023-09-25 Thread Stanislav Kinsburskii
This patch introduces a memory allocator specifically tailored for
persistent memory within the kernel. The allocator maintains
kernel-specific states like DMA passthrough device states, IOMMU state, and
more across kexec.

The current implementation provides a foundation for custom solutions that
may be developed in the future. Although the design is kept concise and
straightforward to encourage discussion and feedback, it remains fully
functional.

The persistent memory pool builds upon the contiguous memory allocator
(CMA) and ensures CMA state persistence across kexec by incorporating the
CMA bitmap into the memory region instead of allocating it from kernel
memory.

Persistent memory pool metadata is passed across kexec by using Flattened
Device Tree, which is added as another kexec segment for x86 architecture.

Potential applications include:

  1. Enabling various in-kernel entities to allocate persistent pages from
 a unified memory pool, obviating the need for reserving multiple
 regions.

  2. For in-kernel components that need the allocation address to be
 retained on kernel kexec, this address can be exposed to user space
 and subsequently passed through the command line.

  3. Distinct subsystems or drivers can set aside their region, allocating
 a segment for their persistent memory pool, suitable for uses such as
 file systems, key-value stores, and other applications.

Notes:

  1. The last patch of the series represents a use case for the feature.
 However, the patch won't compile and is for illustrative purposes only
 as the code being patched hasn't been merged yet.

  2. The code being patched is currently under review by the community. The
 series is named "Introduce /dev/mshv drivers":

 https://lkml.org/lkml/2023/9/22/1117


Changes since v1:

  1. Persistent memory pool is now a wrapper on top of CMA instead of being a
 new allocator.

  2. Persistent memory pool metadata doesn't belong to the pool anymore and
     is now instead passed over kexec to the new kernel via the Flattened
     Device Tree.

The following series implements...

---

Stanislav Kinsburskii (7):
  kexec_file: Add fdt modification callback support
  x86: kexec: Transfer existing fdt to the new kernel
  x86: kexec: Enable fdt modification in callbacks
  pmpool: Introduce persistent memory pool
  pmpool: Update device tree on kexec
  pmpool: Restore state from device tree post-kexec
  Drivers: hv: Allocate persistent pages for root partition


 arch/x86/Kconfig  |   16 +++
 arch/x86/kernel/kexec-bzimage64.c |   97 +
 drivers/hv/hv_common.c|   13 ++
 include/linux/kexec.h |7 +
 include/linux/pmpool.h|   22 
 kernel/kexec_file.c   |   24 
 mm/Kconfig|9 ++
 mm/Makefile   |1 
 mm/pmpool.c   |  208 +
 9 files changed, 394 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/pmpool.h
 create mode 100644 mm/pmpool.c


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[RFC PATCH v2 2/7] x86: kexec: Transfer existing fdt to the new kernel

2023-09-25 Thread Stanislav Kinsburskii
From: Stanislav Kinsburskii 

Enable passing of the Flattened Device Tree (fdt) over kexec for x86
architecture, as outlined in Documentation/x86/booting-dt.rst.

Signed-off-by: Stanislav Kinsburskii 
---
 arch/x86/Kconfig  |8 +
 arch/x86/kernel/kexec-bzimage64.c |   58 +
 2 files changed, 66 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e36261b4ea14..efb472e267ec 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2070,6 +2070,14 @@ config KEXEC_FILE
  for kernel and initramfs as opposed to list of segments as
  accepted by previous system call.
 
+config KEXEC_FILE_FDT
+   bool "Pass fdt over kexec"
+   depends on KEXEC_FILE && X86_64
+   depends on OF_FLATTREE
+   help
+ This option enables passing existent Flattened Device Tree to the new
+ kernel when kexec is invoked by the file based system call.
+
 config ARCH_HAS_KEXEC_PURGATORY
def_bool KEXEC_FILE
 
diff --git a/arch/x86/kernel/kexec-bzimage64.c 
b/arch/x86/kernel/kexec-bzimage64.c
index a61c12c01270..ab9ae02c9a5f 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -18,6 +18,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -381,7 +383,59 @@ static int bzImage64_probe(const char *buf, unsigned long 
len)
 
return ret;
 }
+#ifdef CONFIG_KEXEC_FILE_FDT
+static void *fdt_get_runtime(void)
+{
+   return initial_boot_params;
+}
+
+static int kexec_setup_fdt(struct kexec_buf *kbuf, struct boot_params *params)
+{
+   void *fdt;
+   struct setup_data *sd;
+   unsigned long fdt_load_addr, fdt_sz;
+   int ret;
+
+   fdt = fdt_get_runtime();
+   if (!fdt)
+   return 0;
+
+   fdt_sz = fdt_totalsize(fdt);
+
+   kbuf->bufsz = kbuf->memsz = sizeof(struct setup_data) + fdt_sz;
+
+   sd = kzalloc(kbuf->bufsz, GFP_KERNEL);
+   if (!sd)
+   return -ENOMEM;
+
+   kbuf->buffer = sd;
+   kbuf->buf_align = PAGE_SIZE;
+   kbuf->buf_min = MIN_INITRD_LOAD_ADDR;
+   kbuf->mem = KEXEC_BUF_MEM_UNKNOWN;
+   ret = kexec_add_buffer(kbuf);
+   if (ret)
+   return ret;
+
+   fdt_load_addr = kbuf->mem;
 
+   pr_debug("Loaded fdt at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+   fdt_load_addr, fdt_sz, fdt_sz);
+
+   sd->type = SETUP_DTB;
+   sd->len = fdt_sz;
+   memcpy(sd->data, fdt, fdt_sz);
+
+   sd->next = params->hdr.setup_data;
+   params->hdr.setup_data = fdt_load_addr;
+
+   return 0;
+}
+#else
+static int kexec_setup_fdt(struct kexec_buf *kbuf, struct boot_params *params)
+{
+   return 0;
+}
+#endif
 static void *bzImage64_load(struct kimage *image, char *kernel,
unsigned long kernel_len, char *initrd,
unsigned long initrd_len, char *cmdline,
@@ -561,6 +615,10 @@ static void *bzImage64_load(struct kimage *image, char 
*kernel,
if (ret)
goto out_free_params;
 
+   ret = kexec_setup_fdt(, params);
+   if (ret)
+   goto out_free_params;
+
/* Allocate loader specific data */
ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL);
if (!ldata) {



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [systemd-devel] [PATCH 0/1] x86/kexec: UKI support

2023-09-25 Thread Jarkko Sakkinen
On Mon Sep 18, 2023 at 6:41 PM EEST, Dimitri John Ledkov wrote:
> On Tue, 12 Sept 2023 at 11:38, Jarkko Sakkinen  wrote:
> >
> > On Tue Sep 12, 2023 at 2:20 AM EEST, Neal Gompa wrote: > On Mon, Sep 11, 
> > 2023 at 7:15 PM Jarkko Sakkinen  wrote:
> > > > On Sat Sep 9, 2023 at 7:18 PM EEST, Jan Hendrik Farr wrote:
> > > > > Hello,
> > > > >
> > > > > this patch implements UKI support for kexec_file_load. It will 
> > > > > require support
> > > > > in the kexec-tools userspace utility. For testing purposes the 
> > > > > following can be used:
> > > > > https://github.com/Cydox/kexec-test/
> > > > >
> > > > > There has been discussion on this topic in an issue on GitHub that is 
> > > > > linked below
> > > > > for reference.
> > > > >
> > > > >
> > > > > Some links:
> > > > > - Related discussion: https://github.com/systemd/systemd/issues/28538
> > > > > - Documentation of UKIs: 
> > > > > https://uapi-group.org/specifications/specs/unified_kernel_image/
> > > > >
> > > > > Jan Hendrik Farr (1):
> > > > >   x86/kexec: UKI support
> > > > >
> > > > >  arch/x86/include/asm/kexec-uki.h   |   7 ++
> > > > >  arch/x86/include/asm/parse_pefile.h|  32 +++
> > > > >  arch/x86/kernel/Makefile   |   2 +
> > > > >  arch/x86/kernel/kexec-uki.c| 113 
> > > > > +
> > > > >  arch/x86/kernel/machine_kexec_64.c |   2 +
> > > > >  arch/x86/kernel/parse_pefile.c | 110 
> > > > >  crypto/asymmetric_keys/mscode_parser.c |   2 +-
> > > > >  crypto/asymmetric_keys/verify_pefile.c | 110 +++-
> > > > >  crypto/asymmetric_keys/verify_pefile.h |  16 
> > > > >  9 files changed, 278 insertions(+), 116 deletions(-)
> > > > >  create mode 100644 arch/x86/include/asm/kexec-uki.h
> > > > >  create mode 100644 arch/x86/include/asm/parse_pefile.h
> > > > >  create mode 100644 arch/x86/kernel/kexec-uki.c
> > > > >  create mode 100644 arch/x86/kernel/parse_pefile.c
> > > > >
> > > > > --
> > > > > 2.40.1
> > > >
> > > > What the heck is UKI?
> > >
> > > Unified Kernel Images. More details available here:
> > > https://uapi-group.org/specifications/specs/unified_kernel_image/
> > >
> > > It's a way of creating initramfs-style images as fully generic,
> > > reproducible images that can be built server-side.
> >
> > You can build today a kernel with these compiled in:
> >
> > 1. EFI stub
> > 2. initeramfs
> > 3. cmdline
> >
> > Why another way (and label 'UKI') for a pre-existing feature?
> >
>
> In Ubuntu, we have considered to use the existing kernel features
> before going off to use UKI. Here are some of the reasons why we
> didn't opt to use the kernel builtin things:
> 1) we wanted to have ability to have TPM measured kernel commandline
> performed before kernel is being executed, which is what sd-stub
> provides us

OK this does make a lot of sense.

> 2) we wanted to have ability to update / regenerate initrd, without
> rebuilding kernel. Thus whenever userspace in the initrd needs
> updating, we can generate new initrd for existing kernel build, create
> new kernel.efi, whilst using existing .linux / vmlinuz build. I don't
> believe it is currently trivial to relink vmlinuz with builtin initrd.
> 3) licensing wise it was not clear if initrd has to be GPLv2
> compatible when linked inside vmlinuz, or if it can contain GPLv3 /
> LGPLv3 userspace code - with UKI it is believed unambigiously true,
> because vmlinuz boots by itself standalone and is compiled separately
> of the UKI.

Right UKI wraps kernel and kernel is a "leaf object".

> 4) we wanted to have ability to override cmdline via kernel args
> without secureboot, and use stock cmdline args under secureboot, to
> allow debugging & production behaviour from a single signed kernel.efi
> (that was custom development, and could be done in the stock vmlinuz
> too).
> 5) obvious mention, the intention here is to have TPM PCR measurements
> and Secureboot signature for vmlinuz and initrd and cmdline and dtb.
> There is otherwise no support for standalone signed initrd, cmdline,
> dtb today. Nor does vendoring it into vmlinuz achieves this to the
> same extent (and ease of predicting for sealing / resealing purposes).

ok

> 6) in Ubuntu kernel.efi also has sbat section for targeted revocations
> (discussed separately elsewhere)
>
> Overall, it is mostly about flexibility to be able to reuse the same
> initrd against multiple kernel builds, or update use multiple initrd
> against the same kernel build. This is imho the biggest issue with
> using initrd built-into the vmlinuz itself.
> Resource wise, the initrd passed in via kernel.efi can be freed, as
> far as I understand. I don't know if the one built-into the vmlinuz is
> freeable.
>
> Improving design to do something else instead of UKI would be
> welcomed. Or for example improving the zimg linus upstream format to
> be a partial or a valid UKI would help as well. For example, building
> the kernel built-in initrd 

Re: [PATCH 0/2] Sign the Image which is zboot's payload

2023-09-25 Thread Philipp Rudo
Hi Dave,

On Fri, 22 Sep 2023 13:41:22 +0800
Dave Young  wrote:

> Hi Jan,
> 
> On Fri, 22 Sept 2023 at 13:19, Jan Hendrik Farr  wrote:
> >
> > Hi Pingfan!
> >
> > On 21 21:37:01, Pingfan Liu wrote:  
> > > From: Pingfan Liu 
> > >  
> >  
> > > For security boot, the vmlinuz.efi will be signed so UEFI boot loader
> > > can check against it. But at present, there is no signature for kexec
> > > file load, this series makes a signature on the zboot's payload -- Image
> > > before it is compressed. As a result, the kexec-tools parses and
> > > decompresses the Image.gz to get the Image, which has signature and can
> > > be checked against during kexec file load  
> >
> > I missed some of the earlier discussion about this zboot kexec support.
> > So just let me know if I'm missing something here. You were exploring
> > these two options in getting this supported:
> >
> > 1. Making kexec_file_load do all the work.
> >
> > This option makes the signature verification easy. kexec_file_load
> > checks the signature on the pe file and then extracts it and does the
> > kexec.
> >
> > This is similar to how I'm approaching UKI support in [1].
> >
> > 2. Extract in userspace and pass decompressed kernel to kexec_file_load
> >
> > This options requires the decompressed kernel to have a valid signature on
> > it. That's why this patch adds the ability to add that signature to the
> > kernel contained inside the zboot image.
> >
> > This option would not make sense for UKI support as it would not
> > validate the signature with respect to the initrd and cmdline that it
> > contains.  
> 
> Another possibility for the cmdline could be using the bootconfig
> facility which was
> introduced for boot time tracking:
> Documentation/admin-guide/bootconfig.rst
> 
> So the initrd+cmdline can be signed as well.  Has this been discussed
> before for UKI?

Not that I know of. But I'm not sure if the bootconfig the way it works
today does the trick.

For one the bootconfig is simply glued to the end of the initrd. But
that makes it part of the UKI as well. So there is no added gain.

Plus, adding the cmdline to the UKI was done on purpose to prevent any
unauthorized editing. That basically means that any change to the
cmdline needs to be signed as well. But I don't see any signature
verification while processing the bootconfig. 

Finally the bootconfig is set up too late in the boot process,
in particular after setup_arch which reserves the crashkernel
memory and needs to parse the kernel command line for that. An even more
extreme example is the decompressor phase on s390. There the command
line is parsed as well. And that is code that runs before start_kernel.

All in all I don't believe that using the bootconfig adds much benefit
for the UKI.

Thanks
Philipp



Re: [PATCH v2] Crash: add lock to serialize crash hotplug handling

2023-09-25 Thread Eric DeVolder




On 9/24/23 22:07, Baoquan He wrote:

Eric reported that handling of the corresponding crash hotplug events can
easily fail when many memory hotplug events are notified in a short
period. The handling fails because it cannot take __kexec_lock.

===
[   78.714569] Fallback order for Node 0: 0
[   78.714575] Built 1 zonelists, mobility grouping on.  Total pages: 1817886
[   78.717133] Policy zone: Normal
[   78.724423] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate
[   78.727207] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate
[   80.056643] PEFILE: Unsigned PE binary
===

Memory hotplug events arrive quickly and in large numbers, while the
handling of crash hotplug is comparatively slow. So the atomic variable
__kexec_lock and kexec_trylock() can't guarantee the serialization of
crash hotplug handling.

Here, add a new mutex lock __crash_hotplug_lock to serialize crash
hotplug handling specifically. This doesn't impact the usage of
__kexec_lock.

Signed-off-by: Baoquan He 
---
v1->v2:
  - Move mutex lock definition into CONFIG_CRASH_HOTPLUG ifdeffery
scope in kernel/crash_core.c because the lock is only needed and
used in that scope. Suggested by Eric.

  kernel/crash_core.c | 14 ++
  1 file changed, 14 insertions(+)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 03a7932cde0a..5951d6366b72 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -739,6 +739,17 @@ subsys_initcall(crash_notes_memory_init);
  #undef pr_fmt
  #define pr_fmt(fmt) "crash hp: " fmt
  
+/*

+ * Different than kexec/kdump loading/unloading/jumping/shrinking which
+ * usually rarely happen, there will be many crash hotplug events notified
+ * during one short period, e.g one memory board is hot added and memory
+ * regions are online. So mutex lock  __crash_hotplug_lock is used to
+ * serialize the crash hotplug handling specifically.
+ */
+DEFINE_MUTEX(__crash_hotplug_lock);
+#define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock)
+#define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock)
+
  /*
   * This routine utilized when the crash_hotplug sysfs node is read.
   * It reflects the kernel's ability/permission to update the crash
@@ -783,9 +794,11 @@ static void crash_handle_hotplug_event(unsigned int 
hp_action, unsigned int cpu)
  {
struct kimage *image;
  
+	crash_hotplug_lock();

/* Obtain lock while changing crash information */
if (!kexec_trylock()) {
pr_info("kexec_trylock() failed, elfcorehdr may be 
inaccurate\n");
+   crash_hotplug_unlock();
return;
}
  
@@ -852,6 +865,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)

  out:
/* Release lock now that update complete */
kexec_unlock();
+   crash_hotplug_unlock();
  }
  
  static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)


crash_check_update_elfcorehdr() also uses kexec_trylock() and needs similar
treatment. Userspace (i.e. udev rule processing) and the kernel (crash hotplug
infrastructure) need to be protected/serialized from one another.

Eric

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v2 0/2] x86/kexec: UKI Support

2023-09-25 Thread Philipp Rudo
Hi Jan,

On Thu, 21 Sep 2023 00:02:25 +0200
Jan Hendrik Farr  wrote:

[...]

> > Maybe we should do a BoF at LPC to discuss this further?  
> 
> > I definitely won't be at LPC, is it possible to join virtually?

Yes, LPC will be hybrid again this year. Virtual access costs $50,
although you can apply for a 50% discount if you are a
non-professional.

https://lpc.events/event/17/page/212-attend

Thanks
Philipp


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH 0/2] Sign the Image which is zboot's payload

2023-09-25 Thread Ard Biesheuvel
On Mon, 25 Sept 2023 at 03:01, Pingfan Liu  wrote:
>
> On Fri, Sep 22, 2023 at 1:19 PM Jan Hendrik Farr  wrote:
> >
...
> > I missed some of the earlier discussion about this zboot kexec support.
> > So just let me know if I'm missing something here. You were exploring
> > these two options in getting this supported:
> >
> > 1. Making kexec_file_load do all the work.
> >
> > This option makes the signature verification easy. kexec_file_load
> > checks the signature on the pe file and then extracts it and does the
> > kexec.
> >
> > This is similar to how I'm approaching UKI support in [1].
> >
>
> Yes, that is my original try.
>
> > 2. Extract in userspace and pass decompressed kernel to kexec_file_load
> >
> > This option requires the decompressed kernel to have a valid signature on
> > it. That's why this patch adds the ability to add that signature to the
> > kernel contained inside the zboot image.
> >
>
> You got it.
>
> > This option would not make sense for UKI support as it would not
> > validate the signature with respect to the initrd and cmdline that it
> > contains. Am I correct in thinking that there is no similar issue with
> > zboot images? They don't contain any more information besides the kernel
> > that is intended to be securely signed, right? Do you have a reference
>
> If using my second method, it means unpacking the UKI image in user
> space, and passing the kernel image, initrd and cmdline through the
> kexec_file_load interface. If the UKI can carry signatures for the initrd
> and cmdline, we can extend the capability of that interface to verify
> them as well.
>
> > for the zboot image layout somewhere?
> >
>
> Sorry that maybe there is no document. I understand them through the code.
> The zboot image, aka, vmlinuz.efi looks like:
> PE header, which is formed manually in arch/arm64/kernel/head.S
> EFI decompressor, which consists of
> drivers/firmware/efi/libstub/zboot.c and libstub
> Image.gz, which is formed by compressing Image as instructed in Makefile.zboot
>
>

Indeed, this is currently only documented in code. zboot is a PE
executable that decompresses the kernel and boots it, but it also
carries the base and size of the compressed payload in its header,
along with the compression type so non-EFI loaders can run it as well
(QEMU implements this for gzip on arm64)

> > > I hesitate to post this series,
> >
> > I appreciate you sending it, it's helping the discussion along.
> >

Absolutely. RFCs are important because nobody knows how exactly the
code will look until someone takes the time to implement it. So your
work on this is much appreciated, even if we may decide to take
another approach down the road.

> > > [...] since Ard has recommended using an
> > > emulated UEFI boot service to resolve the UKI kexec load problem [1].
> > > since on aarch64, vmlinuz.efi has faced the similar issue at present.
> > > But anyway, I have a crude outline of it and am sending it out for
> > > discussion.
> >
> > The more I'm thinking about it, the more I like Ard's idea. There's now
> > already two different formats trying to be added to kexec that are
> > pretty different from each other, yet they both have the UEFI interface
> > in common. I think if the kernel supported kexec'ing EFI applications
> > that would be a more flexible and forward-looking approach. It's a
>
> Yes, I agree. That method is attractive, originally I had a try when
> Ard suggested it but there was no clear boundary on which boot service
> should be implemented for zboot, so I did not move on along that
> direction.
>
> Now, UKI poses another challenge to kexec_file_load, and seems to
> require more than zboot. And it appears that Ard's approach is a
> silver bullet for that issue.
>

Yes, it looks appealing but it will take some time to iterate on ideas
and converge on an implementation.

> > standard that both zboot and UKI as well as all future formats for UEFI
> > platforms will support anyways. So while it's more work right now to
> > implement, I think it'll likely pay off.
> >
> > It is significantly more work than the other options though. So I think
> > before work is started on it, it would be nice to get some type of
> > consensus on these things (not an exhaustive list, please feel free to
> > add to it):
> >
>
> I try to answer part of the questions.
>
> > 1. Is it the right approach? It adds a significant amount of userspace
> > API.
>
> My crude assumption: this new stub will replace the purgatory, and I
> am not sure whether kexec-tools source tree will accommodate it. It
> can be signed and checked during the kexec_file_load.
>
> > 2. What subset of the UEFI spec needs to/should be supported?
> > 3. Can we let runtime services still be handled by the firmware after
> > exiting boot services?
>
> I think the runtime services survive through the kexec process. It is
> derived from the real firmware, not related with this stub
>

Yes, this should be possible.

> > 4. How can we debug the stubs that are 

[PATCHv7 4/4] powerpc/setup: alloc extra paca_ptrs to hold boot_cpuid

2023-09-25 Thread Pingfan Liu
paca_ptrs should be large enough to hold the boot_cpuid, hence, its
lower boundary is set to the bigger one between boot_cpuid+1 and
nr_cpus.

On the other hand, some kernel components make assumptions about cpu0:
  1. The timer assumes cpu0 is online, since the timer_list->flags subfield
     'TIMER_CPUMASK' is zero if not initialized to a proper present cpu.
  2. power9_idle_stop() assumes the primary thread's paca is allocated.

Hence lift nr_cpu_ids from one to two to ensure cpu0 is onlined, if the
boot cpu is not cpu0.

Result:
When nr_cpus=1, taskset -c 14 bash -c 'echo c > /proc/sysrq-trigger'
the kdump kernel brings up two cpus.
While when taskset -c 4 bash -c 'echo c > /proc/sysrq-trigger',
the kdump kernel brings up one cpu.

Signed-off-by: Pingfan Liu 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Mahesh Salgaonkar 
Cc: Wen Xiong 
Cc: Baoquan He 
Cc: Ming Lei 
Cc: kexec@lists.infradead.org
To: linuxppc-...@lists.ozlabs.org
---
 arch/powerpc/kernel/paca.c | 10 ++
 arch/powerpc/kernel/prom.c |  9 ++---
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index cda4e00b67c1..91e2401de1bd 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -242,9 +242,10 @@ static int __initdata paca_struct_size;
 
 void __init allocate_paca_ptrs(void)
 {
-   paca_nr_cpu_ids = nr_cpu_ids;
+   int n = (boot_cpuid + 1) > nr_cpu_ids ? (boot_cpuid + 1) : nr_cpu_ids;
 
-   paca_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids;
+   paca_nr_cpu_ids = n;
+   paca_ptrs_size = sizeof(struct paca_struct *) * n;
paca_ptrs = memblock_alloc_raw(paca_ptrs_size, SMP_CACHE_BYTES);
if (!paca_ptrs)
panic("Failed to allocate %d bytes for paca pointers\n",
@@ -287,13 +288,14 @@ void __init allocate_paca(int cpu)
 void __init free_unused_pacas(void)
 {
int new_ptrs_size;
+   int n = (boot_cpuid + 1) > nr_cpu_ids ? (boot_cpuid + 1) : nr_cpu_ids;
 
-   new_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids;
+   new_ptrs_size = sizeof(struct paca_struct *) * n;
if (new_ptrs_size < paca_ptrs_size)
memblock_phys_free(__pa(paca_ptrs) + new_ptrs_size,
   paca_ptrs_size - new_ptrs_size);
 
-   paca_nr_cpu_ids = nr_cpu_ids;
+   paca_nr_cpu_ids = n;
paca_ptrs_size = new_ptrs_size;
 
 #ifdef CONFIG_PPC_64S_HASH_MMU
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 87272a2d8c10..15c994f54bf9 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -362,9 +362,12 @@ static int __init early_init_dt_scan_cpus(unsigned long 
node,
 */
boot_cpuid = i;
found = true;
-   /* This works around the hole in paca_ptrs[]. */
-   if (nr_cpu_ids < nthreads)
-   set_nr_cpu_ids(nthreads);
+   /*
+* Ideally, nr_cpus=1 can be achieved if each kernel
+* component does not assume cpu0 is onlined.
+*/
+   if (boot_cpuid != 0 && nr_cpu_ids < 2)
+   set_nr_cpu_ids(2);
}
 #ifdef CONFIG_SMP
/* logical cpu id is always 0 on UP kernels */
-- 
2.31.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCHv7 1/4] powerpc/setup : Enable boot_cpu_hwid for PPC32

2023-09-25 Thread Pingfan Liu
In order to identify the boot cpu, its intserv[] should be recorded and
checked in smp_setup_cpu_maps().

smp_setup_cpu_maps() is shared between PPC64 and PPC32. Since PPC64
already uses boot_cpu_hwid to carry that information, enable this
variable on PPC32 as well, so that it can carry the same information
for PPC32 in the coming patch.

Signed-off-by: Pingfan Liu 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Mahesh Salgaonkar 
Cc: Wen Xiong 
Cc: Baoquan He 
Cc: Ming Lei 
Cc: kexec@lists.infradead.org
To: linuxppc-...@lists.ozlabs.org
Reported-by: kernel test robot 
Closes: 
https://lore.kernel.org/oe-kbuild-all/202309130232.n2rewhbv-...@intel.com/
---
 arch/powerpc/include/asm/smp.h | 2 +-
 arch/powerpc/kernel/prom.c | 3 +--
 arch/powerpc/kernel/setup-common.c | 2 --
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 576d0e15..5db9178cc800 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -26,7 +26,7 @@
 #include 
 
 extern int boot_cpuid;
-extern int boot_cpu_hwid; /* PPC64 only */
+extern int boot_cpu_hwid;
 extern int spinning_secondaries;
 extern u32 *cpu_to_phys_id;
 extern bool coregroup_enabled;
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 0b5878c3125b..ec82f5bda908 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -372,8 +372,7 @@ static int __init early_init_dt_scan_cpus(unsigned long 
node,
be32_to_cpu(intserv[found_thread]));
boot_cpuid = found;
 
-   if (IS_ENABLED(CONFIG_PPC64))
-   boot_cpu_hwid = be32_to_cpu(intserv[found_thread]);
+   boot_cpu_hwid = be32_to_cpu(intserv[found_thread]);
 
/*
 * PAPR defines "logical" PVR values for cpus that
diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index d2a446216444..1b19a9815672 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -87,9 +87,7 @@ EXPORT_SYMBOL(machine_id);
 int boot_cpuid = -1;
 EXPORT_SYMBOL_GPL(boot_cpuid);
 
-#ifdef CONFIG_PPC64
 int boot_cpu_hwid = -1;
-#endif
 
 /*
  * These are used in binfmt_elf.c to put aux entries on the stack
-- 
2.31.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCHv7 2/4] powerpc/setup: Loosen the mapping between cpu logical id and its seq in dt

2023-09-25 Thread Pingfan Liu
*** Idea ***
For kexec -p, the boot cpu may not be cpu0, which causes a problem when
allocating memory for paca_ptrs[]. However, in theory, there is no
requirement to assign a cpu's logical id according to its sequence in the
device tree. But there are helpers such as cpu_first_thread_sibling(),
which make assumptions about the mapping inside a core. Hence partially
loosen the mapping, i.e. unbind the mapping of cores while keeping the
mapping inside a core.

*** Implement ***
At this early stage, there is plenty of memory to utilize. Hence, this
patch allocates interim memory to link the cpu info on a list, then
reorders the cpus by changing the list head. As a result, there is a rotate
shift between the sequence number in dt and the cpu logical number.

*** Result ***
After this patch, a boot-cpu's logical id will always be mapped into the
range [0,threads_per_core).

Besides this, at this phase, all threads in the boot core are forced to
be onlined. This restriction will be lifted in a later patch with
extra effort.

Signed-off-by: Pingfan Liu 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Mahesh Salgaonkar 
Cc: Wen Xiong 
Cc: Baoquan He 
Cc: Ming Lei 
Cc: kexec@lists.infradead.org
To: linuxppc-...@lists.ozlabs.org
---
 arch/powerpc/kernel/prom.c | 25 +
 arch/powerpc/kernel/setup-common.c | 87 +++---
 2 files changed, 85 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index ec82f5bda908..87272a2d8c10 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -76,7 +76,9 @@ u64 ppc64_rma_size;
 unsigned int boot_cpu_node_count __ro_after_init;
 #endif
 static phys_addr_t first_memblock_size;
+#ifdef CONFIG_SMP
 static int __initdata boot_cpu_count;
+#endif
 
 static int __init early_parse_mem(char *p)
 {
@@ -331,8 +333,7 @@ static int __init early_init_dt_scan_cpus(unsigned long 
node,
const __be32 *intserv;
int i, nthreads;
int len;
-   int found = -1;
-   int found_thread = 0;
+   bool found = false;
 
/* We are scanning "cpu" nodes only */
if (type == NULL || strcmp(type, "cpu") != 0)
@@ -355,8 +356,15 @@ static int __init early_init_dt_scan_cpus(unsigned long 
node,
for (i = 0; i < nthreads; i++) {
if (be32_to_cpu(intserv[i]) ==
fdt_boot_cpuid_phys(initial_boot_params)) {
-   found = boot_cpu_count;
-   found_thread = i;
+   /*
+* always map the boot-cpu logical id into the
+* range of [0, thread_per_core)
+*/
+   boot_cpuid = i;
+   found = true;
+   /* This works around the hole in paca_ptrs[]. */
+   if (nr_cpu_ids < nthreads)
+   set_nr_cpu_ids(nthreads);
}
 #ifdef CONFIG_SMP
/* logical cpu id is always 0 on UP kernels */
@@ -365,14 +373,13 @@ static int __init early_init_dt_scan_cpus(unsigned long 
node,
}
 
/* Not the boot CPU */
-   if (found < 0)
+   if (!found)
return 0;
 
-   DBG("boot cpu: logical %d physical %d\n", found,
-   be32_to_cpu(intserv[found_thread]));
-   boot_cpuid = found;
+   DBG("boot cpu: logical %d physical %d\n", boot_cpuid,
+   be32_to_cpu(intserv[boot_cpuid]));
 
-   boot_cpu_hwid = be32_to_cpu(intserv[found_thread]);
+   boot_cpu_hwid = be32_to_cpu(intserv[boot_cpuid]);
 
/*
 * PAPR defines "logical" PVR values for cpus that
diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index 1b19a9815672..f6d32324b5a5 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -425,6 +426,13 @@ static void __init cpu_init_thread_core_maps(int tpc)
 
 u32 *cpu_to_phys_id = NULL;
 
+struct interrupt_server_node {
+   struct list_head node;
+   boolavail;
+   int len;
+   __be32 *intserv;
+};
+
 /**
  * setup_cpu_maps - initialize the following cpu maps:
  *  cpu_possible_mask
@@ -446,11 +454,16 @@ u32 *cpu_to_phys_id = NULL;
 void __init smp_setup_cpu_maps(void)
 {
struct device_node *dn;
-   int cpu = 0;
-   int nthreads = 1;
+   int shift = 0, cpu = 0;
+   int j, nthreads = 1;
+   int len;
+   struct interrupt_server_node *intserv_node, *n;
+   struct list_head *bt_node, head;
+   bool avail, found_boot_cpu = false;
 
DBG("smp_setup_cpu_maps()\n");
 
+   INIT_LIST_HEAD();
cpu_to_phys_id = memblock_alloc(nr_cpu_ids * sizeof(u32),
__alignof__(u32));
if (!cpu_to_phys_id)
@@ -460,7 +473,6 @@ void __init 

[PATCHv7 3/4] powerpc/setup: Handle the case when boot_cpuid greater than nr_cpus

2023-09-25 Thread Pingfan Liu
If the boot_cpuid is not smaller than nr_cpus, extra effort is required to
ensure the boot_cpu is in cpu_present_mask. This can be achieved by
reserving the last slot for the boot cpu.

Note: the restriction on nr_cpus will be lifted with more effort in the
next patch

Signed-off-by: Pingfan Liu 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Mahesh Salgaonkar 
Cc: Wen Xiong 
Cc: Baoquan He 
Cc: Ming Lei 
Cc: kexec@lists.infradead.org
To: linuxppc-...@lists.ozlabs.org
---
 arch/powerpc/kernel/setup-common.c | 25 ++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index f6d32324b5a5..a72d00a6cff2 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -454,8 +454,8 @@ struct interrupt_server_node {
 void __init smp_setup_cpu_maps(void)
 {
struct device_node *dn;
-   int shift = 0, cpu = 0;
-   int j, nthreads = 1;
+   int terminate, shift = 0, cpu = 0;
+   int j, bt_thread = 0, nthreads = 1;
int len;
struct interrupt_server_node *intserv_node, *n;
struct list_head *bt_node, head;
@@ -518,6 +518,7 @@ void __init smp_setup_cpu_maps(void)
for (j = 0 ; j < nthreads; j++) {
if (be32_to_cpu(intserv[j]) == boot_cpu_hwid) {
bt_node = _node->node;
+   bt_thread = j;
found_boot_cpu = true;
/*
 * Record the round-shift between dt
@@ -537,11 +538,21 @@ void __init smp_setup_cpu_maps(void)
/* Select the primary thread, the boot cpu's slibing, as the logic 0 */
list_add_tail(, bt_node);
pr_info("the round shift between dt seq and the cpu logic number: 
%d\n", shift);
+   terminate = nr_cpu_ids;
list_for_each_entry(intserv_node, , node) {
 
+   j = 0;
+   /* Choose a start point to cover the boot cpu */
+   if (nr_cpu_ids - 1 < bt_thread) {
+   /*
+* The processor core puts assumption on the thread id,
+* not to breach the assumption.
+*/
+   terminate = nr_cpu_ids - 1;
+   }
avail = intserv_node->avail;
nthreads = intserv_node->len / sizeof(int);
-   for (j = 0; j < nthreads && cpu < nr_cpu_ids; j++) {
+   for (; j < nthreads && cpu < terminate; j++) {
set_cpu_present(cpu, avail);
set_cpu_possible(cpu, true);
cpu_to_phys_id[cpu] = 
be32_to_cpu(intserv_node->intserv[j]);
@@ -549,6 +560,14 @@ void __init smp_setup_cpu_maps(void)
j, cpu, be32_to_cpu(intserv[j]));
cpu++;
}
+   /* Online the boot cpu */
+   if (nr_cpu_ids - 1 < bt_thread) {
+   set_cpu_present(bt_thread, avail);
+   set_cpu_possible(bt_thread, true);
+   cpu_to_phys_id[bt_thread] = 
be32_to_cpu(intserv_node->intserv[bt_thread]);
+   DBG("thread %d -> cpu %d (hard id %d)\n",
+   bt_thread, bt_thread, 
be32_to_cpu(intserv[bt_thread]));
+   }
}
 
list_for_each_entry_safe(intserv_node, n, , node) {
-- 
2.31.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCHv7 0/4] enable nr_cpus for powerpc

2023-09-25 Thread Pingfan Liu
Since my last v4 [1], the code has undergone great changes. The paca[]
array has been reorganized and indexed by paca_ptrs[], which
dramatically decreases the memory consumption even if there are many
unpresent cpus in the middle.

However, reordering the logical cpu numbers can further decrease the
size of paca_ptrs[] in the kdump case. So I keep [2/4], which
rotate-shifts the cpu's sequence number in the device tree to obtain the
logical cpu id.

Patch [3-4/4] make efforts to decrease the nr_cpus to be less than or
equal to two.

[1]: 
https://lore.kernel.org/linuxppc-dev/1520829790-14029-1-git-send-email-kernelf...@gmail.com/
---
v6 -> v7
  Add [1/4], which fixes compilation error on PPC32

Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Mahesh Salgaonkar 
Cc: Wen Xiong 
Cc: Baoquan He 
Cc: Ming Lei 
Cc: kexec@lists.infradead.org
To: linuxppc-...@lists.ozlabs.org


Pingfan Liu (4):
  powerpc/setup : Enable boot_cpu_hwid for PPC32
  powerpc/setup: Loosen the mapping between cpu logical id and its seq
in dt
  powerpc/setup: Handle the case when boot_cpuid greater than nr_cpus
  powerpc/setup: alloc extra paca_ptrs to hold boot_cpuid

 arch/powerpc/include/asm/smp.h |   2 +-
 arch/powerpc/kernel/paca.c |  10 +--
 arch/powerpc/kernel/prom.c |  29 +---
 arch/powerpc/kernel/setup-common.c | 108 +++--
 4 files changed, 114 insertions(+), 35 deletions(-)

-- 
2.31.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec