This patch adds the core infrastructure to generate Kexec HandOver metadata. Kexec HandOver is a mechanism that allows Linux to preserve state - arbitrary properties as well as memory locations - across kexec.
It does so using 3 concepts: 1) Device Tree - Every KHO kexec carries a KHO specific flattened device tree blob that describes the state of the system. Device drivers can register to KHO to serialize their state before kexec. 2) Mem cache - A memblocks like structure that contains full page ranges of reservations. These can not be part of the architectural reservations, because they differ on every kexec. 3) Scratch Region - A CMA region that we allocate in the first kernel. CMA gives us the guarantee that no handover pages land in that region, because handover pages must be at a static physical memory location. We use this region as the place to load future kexec images into which then won't collide with any handover data. Signed-off-by: Alexander Graf <g...@amazon.com> --- v1 -> v2: - s/kho_reserve/kho_reserve_scratch/g - Move kho enums out of ifdef --- Documentation/ABI/testing/sysfs-kernel-kho | 53 +++ .../admin-guide/kernel-parameters.txt | 10 + MAINTAINERS | 1 + include/linux/kexec.h | 24 ++ include/uapi/linux/kexec.h | 6 + kernel/Makefile | 1 + kernel/kexec_kho_out.c | 316 ++++++++++++++++++ 7 files changed, 411 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-kho create mode 100644 kernel/kexec_kho_out.c diff --git a/Documentation/ABI/testing/sysfs-kernel-kho b/Documentation/ABI/testing/sysfs-kernel-kho new file mode 100644 index 000000000000..f69e7b81a337 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-kho @@ -0,0 +1,53 @@ +What: /sys/kernel/kho/active +Date: December 2023 +Contact: Alexander Graf <g...@amazon.com> +Description: + Kexec HandOver (KHO) allows Linux to transition the state of + compatible drivers into the next kexec'ed kernel. To do so, + device drivers will serialize their current state into a DT. + While the state is serialized, they are unable to perform + any modifications to state that was serialized, such as + handed over memory allocations. + + When this file contains "1", the system is in the transition + state. When contains "0", it is not. To switch between the + two states, echo the respective number into this file. + +What: /sys/kernel/kho/dt_max +Date: December 2023 +Contact: Alexander Graf <g...@amazon.com> +Description: + KHO needs to allocate a buffer for the DT that gets + generated before it knows the final size. By default, it + will allocate 10 MiB for it. You can write to this file + to modify the size of that allocation. + +What: /sys/kernel/kho/scratch_len +Date: December 2023 +Contact: Alexander Graf <g...@amazon.com> +Description: + To support continuous KHO kexecs, we need to reserve a + physically contiguous memory region that will always stay + available for future kexec allocations. This file describes + the length of that memory region. Kexec user space tooling + can use this to determine where it should place its payload + images. + +What: /sys/kernel/kho/scratch_phys +Date: December 2023 +Contact: Alexander Graf <g...@amazon.com> +Description: + To support continuous KHO kexecs, we need to reserve a + physically contiguous memory region that will always stay + available for future kexec allocations. This file describes + the physical location of that memory region. Kexec user space + tooling can use this to determine where it should place its + payload images. + +What: /sys/kernel/kho/dt +Date: December 2023 +Contact: Alexander Graf <g...@amazon.com> +Description: + When KHO is active, the kernel exposes the generated DT that + carries its current KHO state in this file. Kexec user space + tooling can use this as input file for the KHO payload image. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index cc9cc33d1121..f8785df17d67 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2515,6 +2515,16 @@ kgdbwait [KGDB] Stop kernel execution and enter the kernel debugger at the earliest opportunity. + kho_scratch=n[KMG] [KEXEC] Sets the size of the KHO scratch + region. The KHO scratch region is a physically + memory range that can only be used for non-kernel + allocations. That way, even when memory is heavily + fragmented with handed over memory, kexec will always + be able to find contiguous memory to place the next + kernel for kexec into. + + The default is 0. + kmac= [MIPS] Korina ethernet MAC address. Configure the RouterBoard 532 series on-chip Ethernet adapter MAC address. diff --git a/MAINTAINERS b/MAINTAINERS index 391bbb855cbe..6ec4be8874b9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11824,6 +11824,7 @@ M: Eric Biederman <ebied...@xmission.com> L: kexec@lists.infradead.org S: Maintained W: http://kernel.org/pub/linux/utils/kernel/kexec/ +F: Documentation/ABI/testing/sysfs-kernel-kho F: include/linux/kexec.h F: include/uapi/linux/kexec.h F: kernel/kexec* diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 400cb6c02176..19ffc00b5e7b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -21,6 +21,8 @@ #include <uapi/linux/kexec.h> #include <linux/verification.h> +#include <linux/libfdt.h> +#include <linux/notifier.h> extern note_buf_t __percpu *crash_notes; @@ -523,6 +525,28 @@ void set_kexec_sig_enforced(void); static inline void set_kexec_sig_enforced(void) {} #endif +/* Notifier index */ +enum kho_event { + KEXEC_KHO_DUMP = 0, + KEXEC_KHO_ABORT = 1, +}; + +#ifdef CONFIG_KEXEC_KHO +extern phys_addr_t kho_scratch_phys; +extern phys_addr_t kho_scratch_len; + +/* egest handover metadata */ +void kho_reserve_scratch(void); +int register_kho_notifier(struct notifier_block *nb); +int unregister_kho_notifier(struct notifier_block *nb); +bool kho_is_active(void); +#else +static inline void kho_reserve_scratch(void) {} +static inline int register_kho_notifier(struct notifier_block *nb) { return -EINVAL; } +static inline int unregister_kho_notifier(struct notifier_block *nb) { return -EINVAL; } +static inline bool kho_is_active(void) { return false; } +#endif + #endif /* !defined(__ASSEBMLY__) */ #endif /* LINUX_KEXEC_H */ diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index c17bb096ea68..ad9e95b88b34 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -50,6 +50,12 @@ /* The artificial cap on the number of segments passed to kexec_load. */ #define KEXEC_SEGMENT_MAX 16 +/* KHO passes an array of kho_mem as "mem cache" to the new kernel */ +struct kho_mem { + __u64 addr; + __u64 len; +}; + #ifndef __KERNEL__ /* * This structure is used to hold the arguments that are used when diff --git a/kernel/Makefile b/kernel/Makefile index ce105a5558fc..b182b7b4e7d1 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -73,6 +73,7 @@ obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o +obj-$(CONFIG_KEXEC_KHO) += kexec_kho_out.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/kexec_kho_out.c b/kernel/kexec_kho_out.c new file mode 100644 index 000000000000..765cf6ba7a46 --- /dev/null +++ b/kernel/kexec_kho_out.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_kho_out.c - kexec handover code to egest metadata. + * Copyright (C) 2023 Alexander Graf <g...@amazon.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cma.h> +#include <linux/kexec.h> +#include <linux/device.h> +#include <linux/compiler.h> +#include <linux/kmsg_dump.h> + +struct kho_out { + struct kobject *kobj; + bool active; + struct cma *cma; + struct blocking_notifier_head chain_head; + void *dt; + u64 dt_len; + u64 dt_max; + struct mutex lock; +}; + +static struct kho_out kho = { + .dt_max = (1024 * 1024 * 10), + .chain_head = BLOCKING_NOTIFIER_INIT(kho.chain_head), + .lock = __MUTEX_INITIALIZER(kho.lock), +}; + +/* + * Size for scratch (non-KHO) memory. With KHO enabled, memory can become + * fragmented because KHO regions may be anywhere in physical address + * space. The scratch region gives us a safe zone that we will never see + * KHO allocations from. This is where we can later safely load our new kexec + * images into. + */ +static phys_addr_t kho_scratch_size __initdata; + +int register_kho_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&kho.chain_head, nb); +} +EXPORT_SYMBOL_GPL(register_kho_notifier); + +int unregister_kho_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&kho.chain_head, nb); +} +EXPORT_SYMBOL_GPL(unregister_kho_notifier); + +bool kho_is_active(void) +{ + return kho.active; +} +EXPORT_SYMBOL_GPL(kho_is_active); + +static ssize_t raw_read(struct file *file, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t pos, size_t count) +{ + mutex_lock(&kho.lock); + memcpy(buf, attr->private + pos, count); + mutex_unlock(&kho.lock); + + return count; +} + +static BIN_ATTR(dt, 0400, raw_read, NULL, 0); + +static int kho_expose_dt(void *fdt) +{ + long fdt_len = fdt_totalsize(fdt); + int err; + + kho.dt = fdt; + kho.dt_len = fdt_len; + + bin_attr_dt.size = fdt_totalsize(fdt); + bin_attr_dt.private = fdt; + err = sysfs_create_bin_file(kho.kobj, &bin_attr_dt); + + return err; +} + +static void kho_abort(void) +{ + if (!kho.active) + return; + + sysfs_remove_bin_file(kho.kobj, &bin_attr_dt); + + kvfree(kho.dt); + kho.dt = NULL; + kho.dt_len = 0; + + blocking_notifier_call_chain(&kho.chain_head, KEXEC_KHO_ABORT, NULL); + + kho.active = false; +} + +static int kho_serialize(void) +{ + void *fdt = NULL; + int err; + + kho.active = true; + err = -ENOMEM; + + fdt = kvmalloc(kho.dt_max, GFP_KERNEL); + if (!fdt) + goto out; + + if (fdt_create(fdt, kho.dt_max)) { + err = -EINVAL; + goto out; + } + + err = fdt_finish_reservemap(fdt); + if (err) + goto out; + + err = fdt_begin_node(fdt, ""); + if (err) + goto out; + + err = fdt_property_string(fdt, "compatible", "kho-v1"); + if (err) + goto out; + + /* Loop through all kho dump functions */ + err = blocking_notifier_call_chain(&kho.chain_head, KEXEC_KHO_DUMP, fdt); + err = notifier_to_errno(err); + if (err) + goto out; + + /* Close / */ + err = fdt_end_node(fdt); + if (err) + goto out; + + err = fdt_finish(fdt); + if (err) + goto out; + + if (WARN_ON(fdt_check_header(fdt))) { + err = -EINVAL; + goto out; + } + + err = kho_expose_dt(fdt); + +out: + if (err) { + pr_err("kho failed to serialize state: %d", err); + kho_abort(); + } + return err; +} + +/* Handling for /sys/kernel/kho */ + +#define KHO_ATTR_RO(_name) static struct kobj_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400) +#define KHO_ATTR_RW(_name) static struct kobj_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600) + +static ssize_t active_store(struct kobject *dev, struct kobj_attribute *attr, + const char *buf, size_t size) +{ + ssize_t retsize = size; + bool val = false; + int ret; + + if (kstrtobool(buf, &val) < 0) + return -EINVAL; + + if (!kho_scratch_len) + return -ENOMEM; + + mutex_lock(&kho.lock); + if (val != kho.active) { + if (val) { + ret = kho_serialize(); + if (ret) { + retsize = -EINVAL; + goto out; + } + } else { + kho_abort(); + } + } + +out: + mutex_unlock(&kho.lock); + return retsize; +} + +static ssize_t active_show(struct kobject *dev, struct kobj_attribute *attr, + char *buf) +{ + ssize_t ret; + + mutex_lock(&kho.lock); + ret = sysfs_emit(buf, "%d\n", kho.active); + mutex_unlock(&kho.lock); + + return ret; +} +KHO_ATTR_RW(active); + +static ssize_t dt_max_store(struct kobject *dev, struct kobj_attribute *attr, + const char *buf, size_t size) +{ + u64 val; + + if (kstrtoull(buf, 0, &val)) + return -EINVAL; + + kho.dt_max = val; + + return size; +} + +static ssize_t dt_max_show(struct kobject *dev, struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "0x%llx\n", kho.dt_max); +} +KHO_ATTR_RW(dt_max); + +static ssize_t scratch_len_show(struct kobject *dev, struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "0x%llx\n", kho_scratch_len); +} +KHO_ATTR_RO(scratch_len); + +static ssize_t scratch_phys_show(struct kobject *dev, struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "0x%llx\n", kho_scratch_phys); +} +KHO_ATTR_RO(scratch_phys); + +static __init int kho_out_init(void) +{ + int ret = 0; + + kho.kobj = kobject_create_and_add("kho", kernel_kobj); + if (!kho.kobj) { + ret = -ENOMEM; + goto err; + } + + ret = sysfs_create_file(kho.kobj, &active_attr.attr); + if (ret) + goto err; + + ret = sysfs_create_file(kho.kobj, &dt_max_attr.attr); + if (ret) + goto err; + + ret = sysfs_create_file(kho.kobj, &scratch_phys_attr.attr); + if (ret) + goto err; + + ret = sysfs_create_file(kho.kobj, &scratch_len_attr.attr); + if (ret) + goto err; + +err: + return ret; +} +late_initcall(kho_out_init); + +static int __init early_kho_scratch(char *p) +{ + kho_scratch_size = memparse(p, &p); + return 0; +} +early_param("kho_scratch", early_kho_scratch); + +/** + * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec + * + * With KHO we can preserve arbitrary pages in the system. To ensure we still + * have a large contiguous region of memory when we search the physical address + * space for target memory, let's make sure we always have a large CMA region + * active. This CMA region will only be used for movable pages which are not a + * problem for us during KHO because we can just move them somewhere else. + */ +__init void kho_reserve_scratch(void) +{ + int r; + + if (kho_get_fdt()) { + /* + * We came from a previous KHO handover, so we already have + * a known good scratch region that we preserve. No need to + * allocate another. + */ + return; + } + + /* Only allocate KHO scratch memory when we're asked to */ + if (!kho_scratch_size) + return; + + r = cma_declare_contiguous_nid(0, kho_scratch_size, 0, PAGE_SIZE, 0, + false, "kho", &kho.cma, NUMA_NO_NODE); + if (WARN_ON(r)) + return; + + kho_scratch_phys = cma_get_base(kho.cma); + kho_scratch_len = cma_get_size(kho.cma); +} -- 2.40.1 Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B Sitz: Berlin Ust-ID: DE 289 237 879