[PATCH 1/2] x86/apic: add a more generic early_probe
From: Kairui Song There is only one early apic driver probe method: acpi_madt_oem_check, which is used by ACPI MADT init path only. Some apic drivers' early probe doesn't need ACPI info. Even when probed from ACPI subsystem, the ACPI info is simply ignored. So add a more generic early_probe method, which can be used by MPTABLE parse later. Signed-off-by: Kairui Song --- arch/x86/include/asm/apic.h | 6 ++ arch/x86/kernel/apic/probe_64.c | 16 arch/x86/kernel/apic/x2apic_cluster.c | 8 +++- arch/x86/kernel/apic/x2apic_phys.c| 8 +++- 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bd8ae0a7010a..cd3266fbfa63 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -310,6 +310,7 @@ struct apic { /* Probe, setup and smpboot functions */ int (*probe)(void); + int (*early_probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); int (*apic_id_valid)(u32 apicid); int (*apic_id_registered)(void); @@ -498,6 +499,11 @@ extern void acpi_wake_cpu_handler_update(wakeup_cpu_handler handler); extern int default_apic_id_valid(u32 apicid); extern int default_acpi_madt_oem_check(char *, char *); extern void default_setup_apic_routing(void); +#ifdef CONFIG_X86_64 +extern void apic_early_probe(void); +#else +static inline void apic_early_probe(void) { } +#endif extern u32 apic_default_calc_apicid(unsigned int cpu); extern u32 apic_flat_calc_apicid(unsigned int cpu); diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index c46720f185c0..3f600c421f07 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -13,6 +13,22 @@ #include "local.h" +void __init apic_early_probe(void) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->early_probe && (*drv)->early_probe()) { + if (apic != *drv) { + apic = *drv; + pr_info("Switched to APIC driver %s.\n", + apic->name); + } + 
break; + } + } +} + /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index e696e22d0531..02eb8ea9a5b5 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -26,11 +26,16 @@ static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks); static struct cluster_mask *cluster_hotplug_mask; -static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int x2apic_early_probe(void) { return x2apic_enabled(); } +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_early_probe(); +} + static void x2apic_send_IPI(int cpu, int vector) { u32 dest = x86_cpu_to_logical_apicid[cpu]; @@ -197,6 +202,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, + .early_probe= x2apic_early_probe, .acpi_madt_oem_check= x2apic_acpi_madt_oem_check, .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6bde05a86b4e..c4dd4ec0f1ac 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -34,11 +34,16 @@ static bool x2apic_fadt_phys(void) return false; } -static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int x2apic_early_probe(void) { return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys()); } +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_early_probe(); +} + static void x2apic_send_IPI(int cpu, int vector) { u32 dest = per_cpu(x86_cpu_to_apicid, cpu); @@ -156,6 +161,7 @@ static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", .probe = x2apic_phys_probe, + .early_probe= x2apic_early_probe, 
.acpi_madt_oem_check= x2apic_acpi_madt_oem_check, .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, -- 2.35.2 ___
[PATCH 2/2] x86/mpparse, kexec: probe apic driver early for x2apic
From: Kairui Song Following kernel panic is observed when doing kdump/kexec on virtual machines that uses MPTABLE, not ACPI MADT, and supports x2apic: Intel MultiProcessor Specification v1.4 MPTABLE: OEM ID: BOCHSCPU MPTABLE: Product ID: 0.1 MPTABLE: APIC at: 0xFEE0 BUG: unable to handle page fault for address: ff5fc020 #PF: supervisor read access in kernel mode #PF: error_code(0x) - not-present page PGD 25e15067 P4D 25e15067 PUD 25e17067 PMD 25e18067 PTE 0 Oops: [#1] SMP NOPTI CPU: 0 PID: 0 Comm: swapper Not tainted 5.14.10-300.fc35.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1.fc35 04/01/2014 RIP: 0010:native_apic_mem_read+0x2/0x10 Code: 14 25 20 cd e3 82 c3 90 bf 30 08 00 00 ff 14 25 18 cd e3 82 c3 cc cc cc 89 ff 89 b7 00 c0 5f ff c3 0f 1f 80 00 00 00 00 89 ff <8b> 87 00 c0 5f ff c3 0f 1f 80 00 00 00 0 RSP: :82e03e18 EFLAGS: 00010046 RAX: 81064840 RBX: ff240b6c RCX: 82f17428 RDX: c000dfff RSI: dfff RDI: 0020 RBP: 88802320 R08: R09: 82e03c50 R10: 82e03c48 R11: 82f47468 R12: ff240b40 R13: ff200b30 R14: R15: 00d4 FS: () GS:8365b000() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: ff5fc020 CR3: 25e1 CR4: 06b0 Call Trace: ? read_apic_id+0x15/0x30 ? register_lapic_address+0x76/0x97 ? default_get_smp_config+0x28b/0x42d ? dmi_check_system+0x1c/0x60 ? acpi_boot_init+0x1d/0x4c3 ? setup_arch+0xb37/0xc2a ? slab_is_available+0x5/0x10 ? start_kernel+0x61/0x980 ? load_ucode_bsp+0x4c/0xcd ? secondary_startup_64_no_verify+0xc2/0xcb Modules linked in: CR2: ff5fc020 random: get_random_bytes called from oops_exit+0x35/0x60 with crng_init=0 ---[ end trace c9e569df3bdbefd3 ]--- The panic happens within following init code: setup_arch() check_x2apic() <-- x2apic is enabled by first kernel before kexec, this set x2apic_mode = 1, make sure later probes will recognize pre-enabled x2apic. acpi_boot_init(); <-- If ACPI MADT is in use, this will switch apic driver to x2apic, but it will do nothing with MPTABLE. 
x86_dtb_init(); get_smp_config(); default_get_smp_config(); <-- MPTABLE setup. check_physptr(); smp_read_mpc(); register_lapic_address(); <-- * panic here * init_apic_mappings(); The problem here is that MPTABLE setup calls register_lapic_address(), which is still using the apic_flat driver, and accesses the apic MMIO interface. But the address is never mapped for pre-enabled x2apic, since commit 0450193bffed6 ("x86, x2apic: Don't map lapic addr for preenabled x2apic systems"), so it panics. Simply mapping it won't work either: in x2apic mode the MMIO interface is not usable (Intel SDM Volume 3A 10.12.2), so later setup will still fail with other errors. So it needs to do a proper apic driver probe and switch to the x2apic driver to perform MSR operations instead. Such an issue is currently only seen with kdump/kexec, where the first kernel enabled x2apic and kept it enabled for the 2nd kernel. This can be easily reproduced with qemu-kvm: use -no-acpi and enable x2apic, so x2apic with MPTABLE will be in use, then trigger kdump/kexec. Signed-off-by: Kairui Song --- arch/x86/kernel/mpparse.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index fed721f90116..7658c8184e8c 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -202,8 +202,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) return 0; /* Initialize the lapic mapping */ - if (!acpi_lapic) + if (!acpi_lapic) { + apic_early_probe(); register_lapic_address(mpc->lapic); + } if (early) return 1; -- 2.35.2 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 0/2] x86/mpparse, kexec: Fix kdump/kexec kernel panic with MPTABLE and x2apic
From: Kairui Song Following kernel panic is observed when doing kdump/kexec on qemu-kvm VMs that uses MPTABLE, not ACPI MADT, and supports x2apic: Intel MultiProcessor Specification v1.4 MPTABLE: OEM ID: BOCHSCPU MPTABLE: Product ID: 0.1 MPTABLE: APIC at: 0xFEE0 BUG: unable to handle page fault for address: ff5fc020 #PF: supervisor read access in kernel mode #PF: error_code(0x) - not-present page PGD 25e15067 P4D 25e15067 PUD 25e17067 PMD 25e18067 PTE 0 Oops: [#1] SMP NOPTI CPU: 0 PID: 0 Comm: swapper Not tainted 5.14.10-300.fc35.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1.fc35 04/01/2014 RIP: 0010:native_apic_mem_read+0x2/0x10 Code: 14 25 20 cd e3 82 c3 90 bf 30 08 00 00 ff 14 25 18 cd e3 82 c3 cc cc cc 89 ff 89 b7 00 c0 5f ff c3 0f 1f 80 00 00 00 00 89 ff <8b> 87 00 c0 5f ff c3 0f 1f 80 00 00 00 0 RSP: :82e03e18 EFLAGS: 00010046 RAX: 81064840 RBX: ff240b6c RCX: 82f17428 RDX: c000dfff RSI: dfff RDI: 0020 RBP: 88802320 R08: R09: 82e03c50 R10: 82e03c48 R11: 82f47468 R12: ff240b40 R13: ff200b30 R14: R15: 00d4 FS: () GS:8365b000() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: ff5fc020 CR3: 25e1 CR4: 06b0 Call Trace: ? read_apic_id+0x15/0x30 ? register_lapic_address+0x76/0x97 ? default_get_smp_config+0x28b/0x42d ? dmi_check_system+0x1c/0x60 ? acpi_boot_init+0x1d/0x4c3 ? setup_arch+0xb37/0xc2a ? slab_is_available+0x5/0x10 ? start_kernel+0x61/0x980 ? load_ucode_bsp+0x4c/0xcd ? secondary_startup_64_no_verify+0xc2/0xcb Modules linked in: CR2: ff5fc020 random: get_random_bytes called from oops_exit+0x35/0x60 with crng_init=0 ---[ end trace c9e569df3bdbefd3 ]--- It turns out MPTABLE doesn't play well with pre-enabled x2apic mode, this series extend the apic driver interface and let MPTABLE parse probe the driver properly. This can be easily reproduced with qemu-kvm, use -no-acpi and enable x2apic, so x2apic with MPTABLE will be in use, then trigger kdump/kexec. 
Kairui Song (2): x86, apic: add a more generic early_probe x86/mpparse, kexec: probe apic driver early for x2apic arch/x86/include/asm/apic.h | 6 ++ arch/x86/kernel/apic/probe_64.c | 16 arch/x86/kernel/apic/x2apic_cluster.c | 8 +++- arch/x86/kernel/apic/x2apic_phys.c| 8 +++- arch/x86/kernel/mpparse.c | 4 +++- 5 files changed, 39 insertions(+), 3 deletions(-) -- 2.35.2 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: Setting "orig_video_isVGA" when handing off Linux framebuffer
Hi Benjamin, Sorry for the late reply, I missed your email in my inbox. On Wed, May 5, 2021 at 7:10 AM Benjamin Moody wrote: > > Hi, > > In regard to how kexec hands off the framebuffer to the newly-booted > kernel: > > Commit 060eee589dd1 (2018-01-28) added the "blindly try old boot time > video type" behavior, without doing any checking to see if the > framebuffer is compatible with the stated format. > > Commit fb5a8792e6e4 (2019-03-05) made this behavior conditional on the > --reuse-video-type option. The commit message observes that: > > Currently kernel hanging is inspected on some hyper-v VMs after this > commit, because hyperv_fb will mimic EFI (or VESA) VGA on first boot > up, but after the real driver is loaded, it will switch to new mode > and no longer compatible with EFI/VESA VGA. Keep setting > orig_video_isVGA to EFI/VESA VGA flag will get wrong driver loaded > and try to manipulate the framebuffer in a wrong way. > > It's clear to me that various bad things *might* happen if kexec > pretends that the framebuffer is "VESA-compatible" or "EFI-compatible" > when in fact it isn't. > > Yet, in many cases, the Linux framebuffer is VESA/EFI-compatible, at > least to the extent that blindly setting orig_video_isVGA = 0x23 or > 0x70 results in a usable display. So I have to wonder, in the > situation mentioned above: > > - was the framebuffer not in a compatible format to begin with? > > - was the framebuffer address not correctly reported by the existing >kernel driver? > > - did the original bootloader give wrong information and somehow that >broke the newly booted kernel? > > Kairui, can you please clarify what sort of kernel hangs you were > seeing and what specific hardware and drivers you were using? > For the commit fb5a8792e6e4, the problem is only observed with hyperv_fb, and it's a HyperV VM. 
The framebuffer was VESA compatible when the machine had just booted, but after the hyperv_fb driver is loaded, it will ask the hypervisor to relocate the framebuffer to a new location and in a new format. A later kernel commit, 3cb73bc3fa2a3cb80b88aa63b48409939e0d996b, fixed the kernel side issue that after the relocation, the framebuffer address is not updated in boot_params. It was not updated before this kernel commit. Before that, the old boot_params would contain an invalid address and cause failures in the newly booted kernel. And I also remember that blindly setting orig_video_isVGA will cause strange errors on some random graphics cards. If we can't make sure it's really VGA, this field is better left zero, so the kernel won't use it as a VGA framebuffer. For your case, you mentioned "'fix.id' is not "VESA VGA" or "EFI VGA", but rather "inteldrmfb" or "i915drmfb"". 'fix.id' can change after boot. I'm not familiar with heads or coreboot, but I guess the first kernel you booted has intel drm drivers loaded? Maybe you can try either not loading the intel drm driver in the first kernel (so the framebuffer is always being used in a VESA/EFI compatible way), or ensuring the same driver is loaded in the newly booted kernel (this way the driver will reinitialize the framebuffer anyway, even if it's not set in boot_params). > Benjamin Moody > -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v4 1/1] kernel/crash_core: Add crashkernel=auto for vmcore creation
On Wed, Feb 24, 2021 at 1:45 AM Saeed Mirzamohammadi wrote: > > This adds crashkernel=auto feature to configure reserved memory for > vmcore creation. CONFIG_CRASH_AUTO_STR is defined to be set for > different kernel distributions and different archs based on their > needs. > > Signed-off-by: Saeed Mirzamohammadi > Signed-off-by: John Donnelly > Tested-by: John Donnelly > --- > Documentation/admin-guide/kdump/kdump.rst | 3 ++- > .../admin-guide/kernel-parameters.txt | 6 ++ > arch/Kconfig | 20 +++ > kernel/crash_core.c | 7 +++ > 4 files changed, 35 insertions(+), 1 deletion(-) > > diff --git a/Documentation/admin-guide/kdump/kdump.rst > b/Documentation/admin-guide/kdump/kdump.rst > index 75a9dd98e76e..ae030111e22a 100644 > --- a/Documentation/admin-guide/kdump/kdump.rst > +++ b/Documentation/admin-guide/kdump/kdump.rst > @@ -285,7 +285,8 @@ This would mean: > 2) if the RAM size is between 512M and 2G (exclusive), then reserve 64M > 3) if the RAM size is larger than 2G, then reserve 128M > > - > +Or you can use crashkernel=auto to choose the crash kernel memory size > +based on the recommended configuration set for each arch. > > Boot into System Kernel > === > diff --git a/Documentation/admin-guide/kernel-parameters.txt > b/Documentation/admin-guide/kernel-parameters.txt > index 9e3cdb271d06..a5deda5c85fe 100644 > --- a/Documentation/admin-guide/kernel-parameters.txt > +++ b/Documentation/admin-guide/kernel-parameters.txt > @@ -747,6 +747,12 @@ > a memory unit (amount[KMG]). See also > Documentation/admin-guide/kdump/kdump.rst for an > example. > > + crashkernel=auto > + [KNL] This parameter will set the reserved memory for > + the crash kernel based on the value of the > CRASH_AUTO_STR > + that is the best effort estimation for each arch. See > also > + arch/Kconfig for further details. > + > crashkernel=size[KMG],high > [KNL, X86-64] range could be above 4G. 
Allow kernel > to allocate physical memory region from top, so could > diff --git a/arch/Kconfig b/arch/Kconfig > index 24862d15f3a3..23d047548772 100644 > --- a/arch/Kconfig > +++ b/arch/Kconfig > @@ -14,6 +14,26 @@ menu "General architecture-dependent options" > config CRASH_CORE > bool > > +config CRASH_AUTO_STR > + string "Memory reserved for crash kernel" > + depends on CRASH_CORE > + default "1G-64G:128M,64G-1T:256M,1T-:512M" > + help > + This configures the reserved memory dependent > + on the value of System RAM. The syntax is: > + crashkernel=:[,:,...][@offset] > + range=start-[end] > + > + For example: > + crashkernel=512M-2G:64M,2G-:128M > + > + This would mean: > + > + 1) if the RAM is smaller than 512M, then don't reserve anything > +(this is the "rescue" case) > + 2) if the RAM size is between 512M and 2G (exclusive), then > reserve 64M > + 3) if the RAM size is larger than 2G, then reserve 128M > + > config KEXEC_CORE > select CRASH_CORE > bool > diff --git a/kernel/crash_core.c b/kernel/crash_core.c > index 825284baaf46..90f9e4bb6704 100644 > --- a/kernel/crash_core.c > +++ b/kernel/crash_core.c > @@ -7,6 +7,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -250,6 +251,12 @@ static int __init __parse_crashkernel(char *cmdline, > if (suffix) > return parse_crashkernel_suffix(ck_cmdline, crash_size, > suffix); > +#ifdef CONFIG_CRASH_AUTO_STR > + if (strncmp(ck_cmdline, "auto", 4) == 0) { > + ck_cmdline = CONFIG_CRASH_AUTO_STR; > + pr_info("Using crashkernel=auto, the size chosen is a best > effort estimation.\n"); > + } > +#endif > /* > * if the commandline contains a ':', then that's the extended > * syntax -- if not, it must be the classic syntax > -- > 2.27.0 > > > ___ > kexec mailing list > kexec@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/kexec > Thanks for help pushing the crashkernel=auto to upstream This patch works well. 
Tested-by: Kairui Song -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v3 1/1] kernel/crash_core: Add crashkernel=auto for vmcore creation
int __init __parse_crashkernel(char *cmdline, > > if (suffix) > > return parse_crashkernel_suffix(ck_cmdline, crash_size, > > suffix); > > +#ifdef CONFIG_CRASH_AUTO_STR > > + if (strncmp(ck_cmdline, "auto", 4) == 0) { > > + ck_cmdline = CONFIG_CRASH_AUTO_STR; > > + pr_info("Using crashkernel=auto, the size chosen is a best > > effort estimation.\n"); > > + } > > +#endif > > /* > >* if the commandline contains a ':', then that's the extended > >* syntax -- if not, it must be the classic syntax > > -- > > 2.27.0 > > > > > ___ > kexec mailing list > kexec@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/kexec > -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/1] kexec-tools: fix build on pre 4.4 kernels
On 2/5/21 4:15 PM, Federico Pellegrin wrote: kexec build will fail on older kernels (pre 4.4) as the define VIDEO_CAPABILITY_64BIT_BASE was not present at that time. This patch adds it, as per linux/include/uapi/linux/screen_info.h, if not present. Signed-off-by: Federico Pellegrin --- kexec/arch/i386/x86-linux-setup.c | 4 1 file changed, 4 insertions(+) diff --git a/kexec/arch/i386/x86-linux-setup.c b/kexec/arch/i386/x86-linux-setup.c index 76e1185..ab54a4a 100644 --- a/kexec/arch/i386/x86-linux-setup.c +++ b/kexec/arch/i386/x86-linux-setup.c @@ -37,6 +37,10 @@ #include "x86-linux-setup.h" #include "../../kexec/kexec-syscall.h" +#ifndef VIDEO_CAPABILITY_64BIT_BASE +#define VIDEO_CAPABILITY_64BIT_BASE (1 << 1) /* Frame buffer base is 64-bit */ +#endif + void init_linux_parameters(struct x86_linux_param_header *real_mode) { /* Fill in the values that are usually provided by the kernel. */ Thanks for the fix, I didn't notice pre 4.4 kernels don't have this defined when I submitted that patch. Reviewed-by: Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/1] kernel/crash_core.c - Add crashkernel=auto for x86 and ARM
On Fri, Nov 20, 2020 at 4:28 AM Saeed Mirzamohammadi wrote: > > Hi, > > And I think crashkernel=auto could be used as an indicator that user > want the kernel to control the crashkernel size, so some further work > could be done to adjust the crashkernel more accordingly. eg. when > memory encryption is enabled, increase the crashkernel value for the > auto estimation, as it's known to consume more crashkernel memory. > > Thanks for the suggestion! I tried to keep it simple and leave it to the user > to change Kconfig in case a different range is needed. Based on experience, > these ranges work well for most of the regular cases. Yes, I think the current implementation is a very good start. There are some use cases where the kernel is expected to reserve more memory, like: - when memory encryption is enabled, an extra swiotlb size of memory should be reserved - on ppc, fadump will expect more memory to be reserved I believe there are a lot more cases like these. I tried to come up with some patches to let the kernel reserve more memory automatically when such conditions are detected, but changing the crashkernel= specified value is really weird. But if we have a crashkernel=auto, then having the kernel automatically reserve more memory will make sense. > But why not make it arch-independent? This crashkernel=auto idea > should simply work with every arch. > > > Thanks! I’ll be making it arch-independent in the v2 patch. > > > #include > #include > @@ -41,6 +42,15 @@ static int __init parse_crashkernel_mem(char *cmdline, >unsigned long long *crash_base) > { >char *cur = cmdline, *tmp; > + unsigned long long total_mem = system_ram; > + > + /* > +* Firmware sometimes reserves some memory regions for it's own use. > +* so we get less than actual system memory size. > +* Workaround this by round up the total size to 128M which is > +* enough for most test cases.
> +*/ > + total_mem = roundup(total_mem, SZ_128M); > > > I think this rounding may be better moved to the arch specified part > where parse_crashkernel is called? > > > Thanks for the suggestion. Could you please elaborate why do we need to do > that? Every arch gets their total memory value using different methods, (just check every parse_crashkernel call, and the system_ram param is filled in many different ways), so I'm really not sure if this rounding is always suitable. > > Thanks, > Saeed > > -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/1] kernel/crash_core.c - Add crashkernel=auto for x86 and ARM
; help > Enable bzImage signature verification support. > > -config CRASH_DUMP > +menuconfig CRASH_DUMP > bool "kernel crash dumps" > depends on X86_64 || (X86_32 && HIGHMEM) > help > @@ -2049,6 +2049,30 @@ config CRASH_DUMP > (CONFIG_RELOCATABLE=y). > For more details see Documentation/admin-guide/kdump/kdump.rst > > +if CRASH_DUMP > + > +config CRASH_AUTO_STR > +string "Memory reserved for crash kernel" if X86_64 > + depends on CRASH_DUMP > +default "1G-64G:128M,64G-1T:256M,1T-:512M" > + help > + This configures the reserved memory dependent > + on the value of System RAM. The syntax is: > + crashkernel=:[,:,...][@offset] > + range=start-[end] > + > + For example: > + crashkernel=512M-2G:64M,2G-:128M > + > + This would mean: > + > + 1) if the RAM is smaller than 512M, then don't reserve anything > +(this is the "rescue" case) > + 2) if the RAM size is between 512M and 2G (exclusive), then > reserve 64M > + 3) if the RAM size is larger than 2G, then reserve 128M > + > +endif # CRASH_DUMP > + > config KEXEC_JUMP > bool "kexec jump" > depends on KEXEC && HIBERNATION > diff --git a/arch/x86/configs/x86_64_defconfig > b/arch/x86/configs/x86_64_defconfig > index 9936528e1939..7a87fbecf40b 100644 > --- a/arch/x86/configs/x86_64_defconfig > +++ b/arch/x86/configs/x86_64_defconfig > @@ -33,6 +33,7 @@ CONFIG_EFI_MIXED=y > CONFIG_HZ_1000=y > CONFIG_KEXEC=y > CONFIG_CRASH_DUMP=y > +# CONFIG_CRASH_AUTO_STR is not set > CONFIG_HIBERNATION=y > CONFIG_PM_DEBUG=y > CONFIG_PM_TRACE_RTC=y > diff --git a/kernel/crash_core.c b/kernel/crash_core.c > index 106e4500fd53..a44cd9cc12c4 100644 > --- a/kernel/crash_core.c > +++ b/kernel/crash_core.c > @@ -7,6 +7,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -41,6 +42,15 @@ static int __init parse_crashkernel_mem(char *cmdline, > unsigned long long *crash_base) > { > char *cur = cmdline, *tmp; > + unsigned long long total_mem = system_ram; > + > + /* > +* Firmware sometimes reserves some memory regions for 
it's own use. > +* so we get less than actual system memory size. > +* Workaround this by round up the total size to 128M which is > +* enough for most test cases. > +*/ > + total_mem = roundup(total_mem, SZ_128M); I think this rounding may be better moved to the arch specified part where parse_crashkernel is called? > > /* for each entry of the comma-separated list */ > do { > @@ -85,13 +95,13 @@ static int __init parse_crashkernel_mem(char *cmdline, > return -EINVAL; > } > cur = tmp; > - if (size >= system_ram) { > + if (size >= total_mem) { > pr_warn("crashkernel: invalid size\n"); > return -EINVAL; > } > > /* match ? */ > - if (system_ram >= start && system_ram < end) { > + if (total_mem >= start && total_mem < end) { > *crash_size = size; > break; > } > @@ -250,6 +260,12 @@ static int __init __parse_crashkernel(char *cmdline, > if (suffix) > return parse_crashkernel_suffix(ck_cmdline, crash_size, > suffix); > +#ifdef CONFIG_CRASH_AUTO_STR > + if (strncmp(ck_cmdline, "auto", 4) == 0) { > + ck_cmdline = CONFIG_CRASH_AUTO_STR; > + pr_info("Using crashkernel=auto, the size chosen is a best > effort estimation.\n"); > + } > +#endif > /* > * if the commandline contains a ':', then that's the extended > * syntax -- if not, it must be the classic syntax > -- > 2.18.4 > -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 2/2] hyperv_fb: Update screen_info after removing old framebuffer
On a gen2 HyperV VM, hyperv_fb will remove the old framebuffer; the newly allocated framebuffer address could be at a different location, and it's no longer a VGA framebuffer. Update screen_info so that after kexec, the kernel won't try to reuse the old invalid framebuffer address as VGA. Signed-off-by: Kairui Song --- drivers/video/fbdev/hyperv_fb.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c index 02411d89cb46..e36fb1a0ecdb 100644 --- a/drivers/video/fbdev/hyperv_fb.c +++ b/drivers/video/fbdev/hyperv_fb.c @@ -1114,8 +1114,15 @@ static int hvfb_getmem(struct hv_device *hdev, struct fb_info *info) getmem_done: remove_conflicting_framebuffers(info->apertures, KBUILD_MODNAME, false); - if (!gen2vm) + + if (gen2vm) { + /* framebuffer is reallocated, clear screen_info to avoid misuse from kexec */ + screen_info.lfb_size = 0; + screen_info.lfb_base = 0; + screen_info.orig_video_isVGA = 0; + } else { pci_dev_put(pdev); + } kfree(info->apertures); return 0; -- 2.28.0 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 0/2] x86/hyperv: fix kexec/kdump hang on some VMs
On some HyperV machines, if kexec_file_load is used to load the kexec kernel, second kernel could hang with following stacktrace: [0.591705] efifb: probing for efifb [0.596869] efifb: framebuffer at 0xf800, using 3072k, total 3072k [0.605894] efifb: mode is 1024x768x32, linelength=4096, pages=1 [0.617926] efifb: scrolling: redraw [0.622715] efifb: Truecolor: size=8:8:8:8, shift=24:16:8:0 [ 28.039046] watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [swapper/0:1] [ 28.039046] Modules linked in: [ 28.039046] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.18.0-230.el8.x86_64 #1 [ 28.039046] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.0 12/17/2019 [ 28.039046] RIP: 0010:cfb_imageblit+0x450/0x4c0 [ 28.039046] Code: 89 f8 b9 08 00 00 00 48 89 04 24 eb 2d 41 0f be 30 29 e9 4c 8d 5f 04 d3 fe 44 21 ee 41 8b 04 b6 44 21 c8 89 c6 44 31 d6 89 37 <85> c9 75 09 49 83 c0 01 b9 08 00 00 00 4c 89 df 48 39 df 75 ce 83 [ 28.039046] RSP: 0018:c9087830 EFLAGS: 00010246 ORIG_RAX: ff12 [ 28.039046] RAX: RBX: c9542000 RCX: 0003 [ 28.039046] RDX: 000e RSI: RDI: c9541bf0 [ 28.039046] RBP: 0001 R08: 8880f555c8df R09: 00aa [ 28.039046] R10: R11: c9541bf4 R12: 1000 [ 28.039046] R13: 0001 R14: 81e9a460 R15: 8880f555c880 [ 28.039046] FS: () GS:8880f100() knlGS: [ 28.039046] CS: 0010 DS: ES: CR0: 80050033 [ 28.039046] CR2: 7f7b223b8000 CR3: f3a0a004 CR4: 003606b0 [ 28.039046] DR0: DR1: DR2: [ 28.039046] DR3: DR6: fffe0ff0 DR7: 0400 [ 28.039046] Call Trace: [ 28.039046] bit_putcs+0x2a1/0x550 [ 28.039046] ? fbcon_switch+0x33e/0x5b0 [ 28.039046] ? bit_clear+0x120/0x120 [ 28.039046] fbcon_putcs+0xe7/0x100 [ 28.039046] do_update_region+0x154/0x1a0 [ 28.039046] redraw_screen+0x209/0x240 [ 28.039046] ? 
vc_do_resize+0x5c9/0x660 [ 28.039046] fbcon_prepare_logo+0x3b3/0x430 [ 28.039046] fbcon_init+0x436/0x630 [ 28.039046] visual_init+0xce/0x130 [ 28.039046] do_bind_con_driver+0x1df/0x2d0 [ 28.039046] do_take_over_console+0x113/0x180 [ 28.039046] do_fbcon_takeover+0x58/0xb0 [ 28.039046] register_framebuffer+0x225/0x2f0 [ 28.039046] efifb_probe.cold.5+0x51a/0x55d [ 28.039046] platform_drv_probe+0x38/0x90 [ 28.039046] really_probe+0x212/0x440 [ 28.039046] driver_probe_device+0x49/0xc0 [ 28.039046] device_driver_attach+0x50/0x60 [ 28.039046] __driver_attach+0x61/0x130 [ 28.039046] ? device_driver_attach+0x60/0x60 [ 28.039046] bus_for_each_dev+0x77/0xc0 [ 28.039046] ? klist_add_tail+0x57/0x70 [ 28.039046] bus_add_driver+0x14d/0x1e0 [ 28.039046] ? vesafb_driver_init+0x13/0x13 [ 28.039046] ? do_early_param+0x91/0x91 [ 28.039046] driver_register+0x6b/0xb0 [ 28.039046] ? vesafb_driver_init+0x13/0x13 [ 28.039046] do_one_initcall+0x46/0x1c3 [ 28.039046] ? do_early_param+0x91/0x91 [ 28.039046] kernel_init_freeable+0x1b4/0x25d [ 28.039046] ? rest_init+0xaa/0xaa [ 28.039046] kernel_init+0xa/0xfa [ 28.039046] ret_from_fork+0x35/0x40 The root cause is that the hyperv_fb driver will relocate the framebuffer address in the first kernel, but kexec_file_load simply reuses the old framebuffer info from boot_params, which is now invalid, so the second kernel will write to an invalid framebuffer address. This series fixes this problem by: 1. Let kexec_file_load use the updated copy of screen_info. Instead of using boot_params.screen_info, use the globally available screen_info variable (which is just a copy of boot_params.screen_info on x86). This variable could be updated by arch independent drivers. Just keeping this variable updated should be a good way to keep screen_info consistent across kexec. 2. Let hyperv_fb clean the screen_info copy when the boot framebuffer is relocated outside the old framebuffer.
After the relocation, the framebuffer is no longer a VGA framebuffer, so just clean it up should be good. Kairui Song (2): x86/kexec: Use up-to-dated screen_info copy to fill boot params hyperv_fb: Update screen_info after removing old framebuffer arch/x86/kernel/kexec-bzimage64.c | 3 +-- drivers/video/fbdev/hyperv_fb.c | 8 2 files changed, 9 insertions(+), 2 deletions(-) -- 2.28.0 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 1/2] x86/kexec: Use up-to-dated screen_info copy to fill boot params
kexec_file_load now just reuses the old boot_params.screen_info. But if drivers have changed the hardware state, boot_params.screen_info could contain invalid info. For example, the video type might no longer be VGA, or the frame buffer address may have changed. If the kexec kernel keeps using the old screen_info, the kexec'ed kernel may attempt to write to an invalid framebuffer memory region. There are two screen_info globally available, boot_params.screen_info and screen_info. The latter is a copy, and could be updated by drivers. So let kexec_file_load use the updated copy. Signed-off-by: Kairui Song --- arch/x86/kernel/kexec-bzimage64.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 57c2ecf43134..ce831f9448e7 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -200,8 +200,7 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; /* Copying screen_info will do? */ - memcpy(&params->screen_info, &boot_params.screen_info, - sizeof(struct screen_info)); + memcpy(&params->screen_info, &screen_info, sizeof(struct screen_info)); /* Fill in memsize later */ params->screen_info.ext_mem_k = 0; -- 2.28.0 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC PATCH 0/3] Add writing support to vmcore for reusing oldmem
On Thu, Sep 10, 2020 at 12:43 AM Kairui Song wrote: > > On Wed, Sep 9, 2020 at 10:04 PM Eric W. Biederman > wrote: > > > > Kairui Song writes: > > > > > Currently vmcore only supports reading, this patch series is an RFC > > > to add writing support to vmcore. It's x86_64 only yet, I'll add other > > > architecture later if there is no problem with this idea. > > > > > > My purpose of adding writing support is to reuse the crashed kernel's > > > old memory in kdump kernel, reduce kdump memory pressure, and > > > allow kdump to run with a smaller crashkernel reservation. > > > > > > This is doable because in most cases, after kernel panic, user only > > > interested in the crashed kernel itself, and userspace/cache/free > > > memory pages are not dumped. `makedumpfile` is widely used to skip > > > these pages. Kernel pages usually only take a small part of > > > the whole old memory. So there will be many reusable pages. > > > > > > By adding writing support, userspace then can use these pages as a fast > > > and temporary storage. This helps reduce memory pressure in many ways. > > > > > > For example, I've written a POC program based on this, it will find > > > the reusable pages, and creates an NBD device which maps to these pages. > > > The NBD device can then be used as swap, or to hold some temp files > > > which previouly live in RAM. > > > > > > The link of the POC tool: https://github.com/ryncsn/kdumpd > > > > A couple of thoughts. > > 1) Unless I am completely mistaken treating this as a exercise in > >memory hotplug would be much simpler. > > > >AKA just plug in the memory that is not needed as part of the kdump. > > > >I see below that you have problems doing this because > >of fragmentation. I still think hotplug is doable using some > >kind of fragmented memory zone. > > > > 2) The purpose of the memory reservation is because hardware is > >still potentially running agains the memory of the old kernel. 
> > > >By the time we have brought up a new kernel enough of the hardware > >may have been reinitialized that we don't have to worry about > >hardware randomly dma'ing into the memory used by the old kernel. > > > >With IOMMUs and care we may be able to guarantee for some machine > >configurations it is impossible for DMA to come from some piece of > >hardware that is present but the kernel does not have a driver > >loaded for.\ > > > > I really do not like this approach because it is fundamentlly doing the > > wrong thing. Adding write support to read-only drivers. I do not see > > anywhere that you even mentioned the hard problem and the reason we > > reserve memory in the first place. Hardware spontaneously DMA'ing onto > > it. > > > That POC tool looks ugly for now as it only a draft to prove this > works, sorry about it. > > For the patch, yes, it is expecting IOMMU to lower the chance of > potential DMA issue, and expecting DMA will not hit userspace/free > page, or at least won't override a massive amount of reusable old > memory. And I thought about some solutions for the potential DMA > issue. > > As old memories are used as a block device, which is proxied by > userspace, so upon each IO, the userspace tool could do an integrity > check of the corresponding data stored in old mem, and keep multiple > copies of the data. (eg. use 512M of old memory to hold a 128M block > device). These copies will be kept far away from each other regarding > the physical memory location. The reusable old memories are sparse so > the actual memory containing the data should be also sparse. > So if some part is corrupted, it is still recoverable. Unless the DMA > went very wrong and wiped a large region of memory, but if such thing > happens, it's most likely kernel pages are also being wiped by DMA, so > the vmcore is already corrupted and kdump may not help. 
But at least > it won't fail silently, the userspace tool can still do something like > dump some available data to an easy to setup target. > > And also that's one of the reasons not using old memory as kdump's > memory directly. > > > > It's have been a long time issue that kdump suffers from OOM issue > > > with limited crashkernel memory. So reusing old memory could be very > > > helpful. > > > > There is a very fine line here between reusing existing code (aka > > drivers and userspace) and
Re: [RFC PATCH 0/3] Add writing support to vmcore for reusing oldmem
On Wed, Sep 9, 2020 at 10:04 PM Eric W. Biederman wrote: > > Kairui Song writes: > > > Currently vmcore only supports reading, this patch series is an RFC > > to add writing support to vmcore. It's x86_64 only yet, I'll add other > > architecture later if there is no problem with this idea. > > > > My purpose of adding writing support is to reuse the crashed kernel's > > old memory in kdump kernel, reduce kdump memory pressure, and > > allow kdump to run with a smaller crashkernel reservation. > > > > This is doable because in most cases, after kernel panic, user only > > interested in the crashed kernel itself, and userspace/cache/free > > memory pages are not dumped. `makedumpfile` is widely used to skip > > these pages. Kernel pages usually only take a small part of > > the whole old memory. So there will be many reusable pages. > > > > By adding writing support, userspace then can use these pages as a fast > > and temporary storage. This helps reduce memory pressure in many ways. > > > > For example, I've written a POC program based on this, it will find > > the reusable pages, and creates an NBD device which maps to these pages. > > The NBD device can then be used as swap, or to hold some temp files > > which previouly live in RAM. > > > > The link of the POC tool: https://github.com/ryncsn/kdumpd > > A couple of thoughts. > 1) Unless I am completely mistaken treating this as a exercise in >memory hotplug would be much simpler. > >AKA just plug in the memory that is not needed as part of the kdump. > >I see below that you have problems doing this because >of fragmentation. I still think hotplug is doable using some >kind of fragmented memory zone. > > 2) The purpose of the memory reservation is because hardware is >still potentially running agains the memory of the old kernel. 
> >By the time we have brought up a new kernel enough of the hardware >may have been reinitialized that we don't have to worry about >hardware randomly dma'ing into the memory used by the old kernel. > >With IOMMUs and care we may be able to guarantee for some machine >configurations it is impossible for DMA to come from some piece of >hardware that is present but the kernel does not have a driver >loaded for.\ > > I really do not like this approach because it is fundamentlly doing the > wrong thing. Adding write support to read-only drivers. I do not see > anywhere that you even mentioned the hard problem and the reason we > reserve memory in the first place. Hardware spontaneously DMA'ing onto > it. > That POC tool looks ugly for now as it only a draft to prove this works, sorry about it. For the patch, yes, it is expecting IOMMU to lower the chance of potential DMA issue, and expecting DMA will not hit userspace/free page, or at least won't override a massive amount of reusable old memory. And I thought about some solutions for the potential DMA issue. As old memories are used as a block device, which is proxied by userspace, so upon each IO, the userspace tool could do an integrity check of the corresponding data stored in old mem, and keep multiple copies of the data. (eg. use 512M of old memory to hold a 128M block device). These copies will be kept far away from each other regarding the physical memory location. The reusable old memories are sparse so the actual memory containing the data should be also sparse. So if some part is corrupted, it is still recoverable. Unless the DMA went very wrong and wiped a large region of memory, but if such thing happens, it's most likely kernel pages are also being wiped by DMA, so the vmcore is already corrupted and kdump may not help. But at least it won't fail silently, the userspace tool can still do something like dump some available data to an easy to setup target. 
And also that's one of the reasons not using old memory as kdump's memory directly. > > It's have been a long time issue that kdump suffers from OOM issue > > with limited crashkernel memory. So reusing old memory could be very > > helpful. > > There is a very fine line here between reusing existing code (aka > drivers and userspace) and doing something that should work. > > It might make sense to figure out what is using so much memory > that an OOM is triggered. > > Ages ago I did something that was essentially dumping the kernels printk > buffer to the serial console in case of a crash and I had things down to > something comparatively miniscule like 8M or less. > > My memory is that historically it has been high performance scsi raid > drivers or something like that, that are behind the need to have
[RFC PATCH 3/3] x86_64: implement copy_to_oldmem_page
Previous commit introduced writing support for vmcore, it requires per-architecture implementation for the writing function. Signed-off-by: Kairui Song --- arch/x86/kernel/crash_dump_64.c | 49 +++-- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 045e82e8945b..ec80da75b287 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -13,7 +13,7 @@ static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, unsigned long offset, int userbuf, - bool encrypted) + bool encrypted, bool is_write) { void *vaddr; @@ -28,13 +28,25 @@ static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user((void __user *)buf, vaddr + offset, csize)) { - iounmap((void __iomem *)vaddr); - return -EFAULT; + if (is_write) { + if (userbuf) { + if (copy_from_user(vaddr + offset, (void __user *)buf, csize)) { + iounmap((void __iomem *)vaddr); + return -EFAULT; + } + } else { + memcpy(vaddr + offset, buf, csize); } - } else - memcpy(buf, vaddr + offset, csize); + } else { + if (userbuf) { + if (copy_to_user((void __user *)buf, vaddr + offset, csize)) { + iounmap((void __iomem *)vaddr); + return -EFAULT; + } + } else { + memcpy(buf, vaddr + offset, csize); + } + } set_iounmap_nonlazy(); iounmap((void __iomem *)vaddr); @@ -57,7 +69,7 @@ static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, unsigned long offset, int userbuf) { - return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false); + return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false, false); } /** @@ -68,7 +80,26 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, unsigned long offset, int userbuf) { - return 
__copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); + return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true, false); +} + +/** + * copy_to_oldmem_page - similar to copy_oldmem_page but in opposite direction. + */ +ssize_t copy_to_oldmem_page(unsigned long pfn, char *src, size_t csize, + unsigned long offset, int userbuf) +{ + return __copy_oldmem_page(pfn, src, csize, offset, userbuf, false, true); +} + +/** + * copy_to_oldmem_page_encrypted - similar to copy_oldmem_page_encrypted but + * in opposite direction. + */ +ssize_t copy_to_oldmem_page_encrypted(unsigned long pfn, char *src, size_t csize, + unsigned long offset, int userbuf) +{ + return __copy_oldmem_page(pfn, src, csize, offset, userbuf, true, true); } ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) -- 2.26.2 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[RFC PATCH 2/3] vmcore: Add interface to write to old mem
vmcore is used as the interface to access crashed kernel's memory in kdump, and currently vmcore only supports reading. Adding writing support is useful for enabling userspace making better use of the old memory. For kdump, `makedumpfile` is widely used to reduce the dumped vmcore size, and in most setup, it will drop user space memory, caches. This means these memory pages are reusable. Kdump runs in limited pre-reserved memory region, so if these old memory pages are reused, it can help reduce memory pressure in kdump kernel, hence allow first kernel to reserve less memory for kdump. Adding write support to vmcore is the first step, then user space can do IO on the old mem. There are multiple ways to reuse the memory, for example, userspace can register a NBD device, and redirect the IO on the device to old memory. The NBD device can be used as swap, or used to hold some temp files. Signed-off-by: Kairui Song --- fs/proc/vmcore.c | 129 + include/linux/crash_dump.h | 18 -- 2 files changed, 131 insertions(+), 16 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 124c2066f3e5..23acc0f2ecd7 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -103,9 +103,9 @@ static int pfn_is_ram(unsigned long pfn) } /* Reads a page from the oldmem device from given offset. 
*/ -ssize_t read_from_oldmem(char *buf, size_t count, -u64 *ppos, int userbuf, -bool encrypted) +static ssize_t oldmem_rw_page(char *buf, size_t count, + u64 *ppos, int userbuf, + bool encrypted, bool is_write) { unsigned long pfn, offset; size_t nr_bytes, to_copy = count; @@ -119,20 +119,33 @@ ssize_t read_from_oldmem(char *buf, size_t count, /* If pfn is not ram, return zeros for sparse dump files */ if (pfn_is_ram(pfn) == 0) { - memset(buf, 0, nr_bytes); - } else { - if (encrypted) - tmp = copy_oldmem_page_encrypted(pfn, buf, -nr_bytes, -offset, -userbuf); + if (is_write) + return -EINVAL; else - tmp = copy_oldmem_page(pfn, buf, nr_bytes, - offset, userbuf); + memset(buf, 0, nr_bytes); + } else { + if (encrypted) { + tmp = is_write ? + copy_to_oldmem_page_encrypted(pfn, buf, + nr_bytes, + offset, + userbuf) : + copy_oldmem_page_encrypted(pfn, buf, + nr_bytes, + offset, + userbuf); + } else { + tmp = is_write ? + copy_to_oldmem_page(pfn, buf, nr_bytes, + offset, userbuf) : + copy_oldmem_page(pfn, buf, nr_bytes, + offset, userbuf); + } if (tmp < 0) return tmp; } + *ppos += nr_bytes; buf += nr_bytes; to_copy -= nr_bytes; @@ -143,6 +156,22 @@ ssize_t read_from_oldmem(char *buf, size_t count, return count; } +/* Reads a page from the oldmem device from given offset. */ +ssize_t read_from_oldmem(char *buf, size_t count, +u64 *ppos, int userbuf, +bool encrypted) +{ + return oldmem_rw_page(buf, count, ppos, userbuf, encrypted, 0); +} + +/* Writes a page to the oldmem device of given offset. */ +ssize_t write_to_oldmem(char *buf, size_t count, + u64 *ppos, int userbuf, + bool encrypted) +{ + return oldmem_rw_page(buf, count, ppos, userbuf, encrypted, 1); +} + /* * Architectures may override this function to allocate ELF header in 2nd kernel */ @@ -184,6 +213,26 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, return remap_pfn_range(vma, from, pfn, size, prot); } +/* + * Architectures which support wr
[RFC PATCH 0/3] Add writing support to vmcore for reusing oldmem
Currently vmcore only supports reading, this patch series is an RFC to add writing support to vmcore. It's x86_64 only yet, I'll add other architecture later if there is no problem with this idea. My purpose of adding writing support is to reuse the crashed kernel's old memory in kdump kernel, reduce kdump memory pressure, and allow kdump to run with a smaller crashkernel reservation. This is doable because in most cases, after kernel panic, user only interested in the crashed kernel itself, and userspace/cache/free memory pages are not dumped. `makedumpfile` is widely used to skip these pages. Kernel pages usually only take a small part of the whole old memory. So there will be many reusable pages. By adding writing support, userspace then can use these pages as a fast and temporary storage. This helps reduce memory pressure in many ways. For example, I've written a POC program based on this, it will find the reusable pages, and creates an NBD device which maps to these pages. The NBD device can then be used as swap, or to hold some temp files which previouly live in RAM. The link of the POC tool: https://github.com/ryncsn/kdumpd I tested it on x86_64 on latest Fedora by using it as swap with following step in kdump kernel: 1. Install this tool in kdump initramfs 2. Execute following command in kdump: /sbin/modprobe nbd nbds_max=1 /bin/kdumpd & /sbin/mkswap /dev/nbd0 /sbin/swapon /dev/nbd0 3. Observe the swap is being used: SwapTotal:131068 kB SwapFree: 121852 kB It helped to reduce the crashkernel from 168M to 110M for a successful kdump run over NFSv3. There are still many workitems that could be done based on this idea, eg. move the initramfs content to the old memory, which may help reduce another ~10-20M of memory. It's have been a long time issue that kdump suffers from OOM issue with limited crashkernel memory. So reusing old memory could be very helpful. This method have it's limitation: - Swap only works for userspace. 
But kdump userspace is a major memory consumer, so in general this should be helpful enough. - For users who want to dump the whole memory area, this won't help as there is no reusable page. I've tried other ways to improve the crashkernel value, eg. - Reserve some smaller memory segments in first kernel for crashkernel: It's only a suppliment of the default crashkernel reservation and only make crashkernel value more adjustable, still not solving the real problem. - Reuse old memory, but hotplug chunk of reusable old memory into kdump kernel's memory: It's hard to find large chunk of continuous memory, especially on systems with heavy workload, the reusable regions could be very fragmental. So it can only hotplug small fragments of memories, which looks hackish, and may have a high page table overhead. - Implement the old memory based based block device as a kernel module. It doesn't looks good to have a module for this sole usage and it don't have much performance/implementation advantage compared to this RFC. Besides, keeping all the complex logic of parsing reusing old memory logic in userspace seems a better idea. And as a plus, this could make it more doable and reasonable to have n crashkernel=auto param. If there is a swap, then userspace will have less memory pressure. crashkernel=auto can focus on the kernel usage. Kairui Song (3): vmcore: simplify read_from_olemem vmcore: Add interface to write to old mem x86_64: implement copy_to_oldmem_page arch/x86/kernel/crash_dump_64.c | 49 -- fs/proc/vmcore.c| 154 ++-- include/linux/crash_dump.h | 18 +++- 3 files changed, 180 insertions(+), 41 deletions(-) -- 2.26.2 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[RFC PATCH 1/3] vmcore: simplify read_from_oldmem
Simplify the code logic, also helps reduce object size and stack usage. Stack usage: Before: fs/proc/vmcore.c:106:9:read_from_oldmem.part.0 80 static fs/proc/vmcore.c:106:9:read_from_oldmem 16 static After: fs/proc/vmcore.c:106:9:read_from_oldmem 80 static Size of vmcore.o: textdata bss dec hex filename Before: 7677 109 8878741ec2 fs/proc/vmcore.o After: 7669 109 8878661eba fs/proc/vmcore.o Signed-off-by: Kairui Song --- fs/proc/vmcore.c | 27 ++- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index c3a345c28a93..124c2066f3e5 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -108,25 +108,19 @@ ssize_t read_from_oldmem(char *buf, size_t count, bool encrypted) { unsigned long pfn, offset; - size_t nr_bytes; - ssize_t read = 0, tmp; + size_t nr_bytes, to_copy = count; + ssize_t tmp; - if (!count) - return 0; - - offset = (unsigned long)(*ppos % PAGE_SIZE); + offset = (unsigned long)(*ppos & (PAGE_SIZE - 1)); pfn = (unsigned long)(*ppos / PAGE_SIZE); - do { - if (count > (PAGE_SIZE - offset)) - nr_bytes = PAGE_SIZE - offset; - else - nr_bytes = count; + while (to_copy) { + nr_bytes = min(to_copy, PAGE_SIZE - offset); /* If pfn is not ram, return zeros for sparse dump files */ - if (pfn_is_ram(pfn) == 0) + if (pfn_is_ram(pfn) == 0) { memset(buf, 0, nr_bytes); - else { + } else { if (encrypted) tmp = copy_oldmem_page_encrypted(pfn, buf, nr_bytes, @@ -140,14 +134,13 @@ ssize_t read_from_oldmem(char *buf, size_t count, return tmp; } *ppos += nr_bytes; - count -= nr_bytes; buf += nr_bytes; - read += nr_bytes; + to_copy -= nr_bytes; ++pfn; offset = 0; - } while (count); + } - return read; + return count; } /* -- 2.26.2 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
On Thu, Jul 23, 2020 at 8:00 AM Bjorn Helgaas wrote: > > On Wed, Jul 22, 2020 at 03:50:48PM -0600, Jerry Hoemann wrote: > > On Wed, Jul 22, 2020 at 10:21:23AM -0500, Bjorn Helgaas wrote: > > > On Wed, Jul 22, 2020 at 10:52:26PM +0800, Kairui Song wrote: > > > > > I think I didn't make one thing clear, The PCI UR error never arrives > > > > in kernel, it's the iLo BMC on that HPE machine caught the error, and > > > > send kernel an NMI. kernel is panicked by NMI, I'm still trying to > > > > figure out why the NMI hanged kernel, even with panic=-1, > > > > panic_on_io_nmi, panic_on_unknown_nmi all set. But if we can avoid the > > > > NMI by shutdown the devices in right order, that's also a solution. > > ACPI v6.3, chapter 18, does mention NMIs several times, e.g., Table > 18-394 and sec 18.4. I'm not familiar enough with APEI to know > whether Linux correctly supports all those cases. Maybe this is a > symptom that we don't? > > > > I'm not sure how much sympathy to have for this situation. A PCIe UR > > > is fatal for the transaction and maybe even the device, but from the > > > overall system point of view, it *should* be a recoverable error and > > > we shouldn't panic. > > > > > > Errors like that should be reported via the normal AER or ACPI/APEI > > > mechanisms. It sounds like in this case, the platform has decided > > > these aren't enough and it is trying to force a reboot? If this is > > > "special" platform behavior, I'm not sure how much we need to cater > > > for it. > > > > Are these AER errors the type processed by the GHES code? 
> > My understanding from ACPI v6.3, sec 18.3.2, is that the Hardware > Error Source Table may contain Error Source Descriptors of types like: > > IA-32 Machine Check Exception > IA-32 Corrected Machine Check > IA-32 Non-Maskable Interrupt > PCIe Root Port AER > PCIe Device AER > Generic Hardware Error Source (GHES) > Hardware Error Notification > IA-32 Deferred Machine Check > > I would naively expect PCIe UR errors to be reported via one of the > PCIe Error Sources, not GHES, but maybe there's some reason to use > GHES. > > The kernel should already know how to deal with the PCIe AER errors, > but we'd have to add new device-specific code to handle things > reported via GHES, along the lines of what Shiju is doing here: > > https://lore.kernel.org/r/20200722104245.1060-1-shiju.j...@huawei.com > > > I'll note that RedHat runs their crash kernel with: hest_disable. > > So, the ghes code is disabled in the crash kernel. > > That would disable all the HEST error sources, including the PCIe AER > ones as well as GHES ones. If we turn off some of the normal error > handling mechanisms, I guess we have to expect that some errors won't > be handled correctly. Hi, that's true, hest_disable is added by default to reduce memory usage in special cases. But even if I remove hest_disable and have GHES enabled, but the hanging issue still exists, from the iLO console log, it's still sending an NMI to kernel, and kernel hanged. The NMI won't hang the kernel for 100 percent, sometime it will just panic and reboot and sometimes it hangs. This behavior didn't change after/before enabled the GHES. Maybe this is a "special platform behavior". I'm also not 100 percent sure if/how we can cover this in a good way for now. I'll try to figure how the NMI actually hanged the kernel and see if it could be fixed in other ways. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
On Fri, Mar 6, 2020 at 5:38 PM Baoquan He wrote: > > On 03/04/20 at 08:53pm, Deepa Dinamani wrote: > > On Wed, Mar 4, 2020 at 7:53 PM Baoquan He wrote: > > > > > > +Joerg to CC. > > > > > > On 03/03/20 at 01:01pm, Deepa Dinamani wrote: > > > > I looked at this some more. Looks like we do not clear irqs when we do > > > > a kexec reboot. And, the bootup code maintains the same table for the > > > > kexec-ed kernel. I'm looking at the following code in > > > > > > I guess you are talking about kdump reboot here, right? Kexec and kdump > > > boot take the similar mechanism, but differ a little. > > > > Right I meant kdump kernel here. And, clearly the is_kdump_kernel() case > > below. > > > > > > > > > intel_irq_remapping.c: > > > > > > > > if (ir_pre_enabled(iommu)) { > > > > if (!is_kdump_kernel()) { > > > > pr_warn("IRQ remapping was enabled on %s but > > > > we are not in kdump mode\n", > > > > iommu->name); > > > > clear_ir_pre_enabled(iommu); > > > > iommu_disable_irq_remapping(iommu); > > > > } else if (iommu_load_old_irte(iommu)) > > > > > > Here, it's for kdump kernel to copy old ir table from 1st kernel. > > > > Correct. > > > > > > pr_err("Failed to copy IR table for %s from > > > > previous kernel\n", > > > >iommu->name); > > > > else > > > > pr_info("Copied IR table for %s from previous > > > > kernel\n", > > > > iommu->name); > > > > } > > > > > > > > Would cleaning the interrupts(like in the non kdump path above) just > > > > before shutdown help here? This should clear the interrupts enabled > > > > for all the devices in the current kernel. So when kdump kernel > > > > starts, it starts clean. This should probably help block out the > > > > interrupts from a device that does not have a driver. > > > > > > I think stopping those devices out of control from continue sending > > > interrupts is a good idea. While not sure if only clearing the interrupt > > > will be enough. 
Those devices which will be initialized by their driver > > > will brake, but devices which drivers are not loaded into kdump kernel > > > may continue acting. Even though interrupts are cleaning at this time, > > > the on-flight DMA could continue triggerring interrupt since the ir > > > table and iopage table are rebuilt. > > > > This should be handled by the IOMMU, right? And, hence you are getting > > UR. This seems like the correct execution flow to me. > > Sorry for late reply. > Yes, this is initializing IOMMU device. > > > > > Anyway, you could just test this theory by removing the > > is_kdump_kernel() check above and see if it solves your problem. > > Obviously, check the VT-d spec to figure out the exact sequence to > > turn off the IR. > > OK, I will talk to Kairui and get a machine to test it. Thanks for your > nice idea, if you have a draft patch, we are happy to test it. > > > > > Note that the device that is causing the problem here is a legit > > device. We want to have interrupts from devices we don't know about > > blocked anyway because we can have compromised firmware/ devices that > > could cause a DoS attack. So blocking the unwanted interrupts seems > > like the right thing to do here. > > Kairui said it's a device which driver is not loaded in kdump kernel > because it's not needed by kdump. We try to only load kernel modules > which are needed, e.g one device is the dump target, its driver has to > be loaded in. In this case, the device is more like a out of control > device to kdump kernel. > Hi Bao, Deepa, sorry for this very late response. The test machine was not available for sometime, and I restarted to work on this problem. For the workaround mention by Deepa (by remote the is_kdump_kernel() check), it didn't work, the machine still hangs upon shutdown. The devices that were left in an unknown state and sending interrupt could be a problem, but it's irrelevant to this hanging problem. 
I think I didn't make one thing clear: the PCI UR error never arrives in the kernel. It's the iLO BMC on that HPE machine that caught the error and sent the kernel an NMI. The kernel is panicked by the NMI; I'm still trying to figure out why the NMI hung the kernel, even with panic=-1, panic_on_io_nmi, and panic_on_unknown_nmi all set. But if we can avoid the NMI by shutting down the devices in the right order, that's also a solution. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH] crash_dump: remove saved_max_pfn
This variable is no longer used. saved_max_pfn was originally introduced in commit 92aa63a5a1bf ("[PATCH] kdump: Retrieve saved max pfn"), used to make sure that the user does not try to read the physical memory beyond saved_max_pfn. But since commit 921d58c0e699 ("vmcore: remove saved_max_pfn check") it's no longer used for the check. The only user left was the Calgary IOMMU, which started using it in commit 95b68dec0d52 ("calgary iommu: use the first kernels TCE tables in kdump"). But again, recently in commit 90dc392fc445 ("x86: Remove the calgary IOMMU driver"), the Calgary IOMMU was removed and this variable no longer has any users. So just remove it. Signed-off-by: Kairui Song --- arch/x86/kernel/e820.c | 8 include/linux/crash_dump.h | 2 -- kernel/crash_dump.c| 6 -- 3 files changed, 16 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c5399e80c59c..4d13c57f370a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -910,14 +910,6 @@ static int __init parse_memmap_one(char *p) return -EINVAL; if (!strncmp(p, "exactmap", 8)) { -#ifdef CONFIG_CRASH_DUMP - /* -* If we are doing a crash dump, we still need to know -* the real memory size before the original memory map is -* reset. 
-*/ - saved_max_pfn = e820__end_of_ram_pfn(); -#endif e820_table->nr_entries = 0; userdef = 1; return 0; diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 4664fc1871de..bc156285d097 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -97,8 +97,6 @@ extern void unregister_oldmem_pfn_is_ram(void); static inline bool is_kdump_kernel(void) { return 0; } #endif /* CONFIG_CRASH_DUMP */ -extern unsigned long saved_max_pfn; - /* Device Dump information to be filled by drivers */ struct vmcoredd_data { char dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Unique name of the dump */ diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 9c23ae074b40..92da32275af5 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -5,12 +5,6 @@ #include #include -/* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; - /* * stores the physical address of elf header of crash image * -- 2.25.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] swiotlb: Allow swiotlb to live at pre-defined address
ff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c > > index c19379fabd20..83da0caa2f93 100644 > > --- a/kernel/dma/swiotlb.c > > +++ b/kernel/dma/swiotlb.c > > @@ -46,6 +46,7 @@ > > #include > > #include > > #include > > +#include > > > > #define CREATE_TRACE_POINTS > > #include > > @@ -102,6 +103,12 @@ unsigned int max_segment; > > #define INVALID_PHYS_ADDR (~(phys_addr_t)0) > > static phys_addr_t *io_tlb_orig_addr; > > > > +/* > > + * The TLB phys addr may be defined on the command line. Store it here if > > it is. > > + */ > > +static phys_addr_t io_tlb_addr = INVALID_PHYS_ADDR; > > + > > + > > /* > > * Protect the above data structures in the map and unmap calls > > */ > > @@ -119,11 +126,23 @@ setup_io_tlb_npages(char *str) > > } > > if (*str == ',') > > ++str; > > - if (!strcmp(str, "force")) { > > + if (!strncmp(str, "force", 5)) { > > swiotlb_force = SWIOTLB_FORCE; > > - } else if (!strcmp(str, "noforce")) { > > + str += 5; > > + } else if (!strncmp(str, "noforce", 7)) { > > swiotlb_force = SWIOTLB_NO_FORCE; > > io_tlb_nslabs = 1; > > + str += 7; > > + } > > + > > + if (*str == ',') > > + ++str; > > + if (!strncmp(str, "addr=", 5)) { > > + char *addrstr = str + 5; > > + > > + io_tlb_addr = kstrtoul(addrstr, 0, &str); > > + if (addrstr == str) > > + io_tlb_addr = INVALID_PHYS_ADDR; > > } > > > > return 0; > > @@ -239,6 +258,25 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned > > long nslabs, int verbose) > > return 0; > > } > > > > +static int __init swiotlb_init_io(int verbose, unsigned long bytes) > > +{ > > + unsigned __iomem char *vstart; > > + > > + if (io_tlb_addr == INVALID_PHYS_ADDR) > > + return -EINVAL; > > + > > + vstart = memremap(io_tlb_addr, bytes, MEMREMAP_WB); > > + if (!vstart) > > + return -EINVAL; > > + > > + if (swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) { > > + memunmap(vstart); > > + return -EINVAL; > > + } > > + > > + return 0; > > +} > > + > > /* > > * Statically reserve bounce buffer space and initialize 
bounce buffer data > > * structures for the software IO TLB used to implement the DMA API. > > @@ -257,6 +295,10 @@ swiotlb_init(int verbose) > > > > bytes = io_tlb_nslabs << IO_TLB_SHIFT; > > > > + /* Map IO TLB from device memory */ > > + if (!swiotlb_init_io(verbose, bytes)) > > + return; > > + > > /* Get IO TLB memory from the low pages */ > > vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE); > > if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) > > -- > > 2.16.4 > > > > > > > > > > Amazon Development Center Germany GmbH > > Krausenstr. 38 > > 10117 Berlin > > Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss > > Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B > > Sitz: Berlin > > Ust-ID: DE 289 237 879 > > > > > > > > Thanks > Dave > -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
Hi, Thanks for the reply. I don't have any better idea than this RFC patch yet. The patch is on hold, as the previous discussion suggested that it just works around the problem; the real fix should be to let the crash kernel load every required kernel module and reset whichever hardware is not in a good state. However, users may struggle to find out which driver is actually needed, and it's not practical to load all drivers in the kdump kernel (actually, kdump has been trying to load as few drivers as possible to save memory). So, as Dave Y suggested in another reply, would it be better to apply this quirk with a kernel param controlling it? If such a problem happens, the option could be turned on as a fix. On Sun, Feb 23, 2020 at 12:59 AM Bjorn Helgaas wrote: > > [+cc Khalid, Deepa, Randy, Dave, Myron] > > On Thu, Dec 26, 2019 at 03:21:18AM +0800, Kairui Song wrote: > > There are reports about kdump hang upon reboot on some HPE machines, > > kernel hanged when trying to shutdown a PCIe port, an uncorrectable > > error occurred and crashed the system. > > Did we ever make progress on this? This definitely sounds like a > problem that needs to be fixed, but I don't see a resolution here. 
> > > On the machine I can reproduce this issue, part of the topology > > looks like this: > > > > [:00]-+-00.0 Intel Corporation Xeon E7 v3/Xeon E5 v3/Core i7 DMI2 > > +-01.0-[02]-- > > +-01.1-[05]-- > > +-02.0-[06]--+-00.0 Emulex Corporation OneConnect NIC (Skyhawk) > > |+-00.1 Emulex Corporation OneConnect NIC (Skyhawk) > > |+-00.2 Emulex Corporation OneConnect NIC (Skyhawk) > > |+-00.3 Emulex Corporation OneConnect NIC (Skyhawk) > > |+-00.4 Emulex Corporation OneConnect NIC (Skyhawk) > > |+-00.5 Emulex Corporation OneConnect NIC (Skyhawk) > > |+-00.6 Emulex Corporation OneConnect NIC (Skyhawk) > > |\-00.7 Emulex Corporation OneConnect NIC (Skyhawk) > > +-02.1-[0f]-- > > +-02.2-[07]00.0 Hewlett-Packard Company Smart Array Gen9 > > Controllers > > > > When shuting down PCIe port :00:02.2 or :00:02.0, the machine > > will hang, depend on which device is reinitialized in kdump kernel. > > > > If force remove unused device then trigger kdump, the problem will never > > happen: > > > > echo 1 > /sys/bus/pci/devices/\:00\:02.2/\:07\:00.0/remove > > echo c > /proc/sysrq-trigger > > > > ... Kdump save vmcore through network, the NIC get reinitialized and > > hpsa is untouched. Then reboot with no problem. (If hpsa is used > > instead, shutdown the NIC in first kernel will help) > > > > The cause is that some devices are enabled by the first kernel, but it > > don't have the chance to shutdown the device, and kdump kernel is not > > aware of it, unless it reinitialize the device. > > > > Upon reboot, kdump kernel will skip downstream device shutdown and > > clears its bridge's master bit directly. The downstream device could > > error out as it can still send requests but upstream refuses it. > > > > So for kdump, let kernel read the correct hardware power state on boot, > > and always clear the bus master bit of PCI device upon shutdown if the > > device is on. 
PCIe port driver will always shutdown all downstream > > devices first, so this should ensure all downstream devices have bus > > master bit off before clearing the bridge's bus master bit. > > > > Signed-off-by: Kairui Song > > --- > > drivers/pci/pci-driver.c | 11 --- > > drivers/pci/quirks.c | 20 > > 2 files changed, 28 insertions(+), 3 deletions(-) > > > > diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c > > index 0454ca0e4e3f..84a7fd643b4d 100644 > > --- a/drivers/pci/pci-driver.c > > +++ b/drivers/pci/pci-driver.c > > @@ -18,6 +18,7 @@ > > #include > > #include > > #include > > +#include > > #include "pci.h" > > #include "pcie/portdrv.h" > > > > @@ -488,10 +489,14 @@ static void pci_device_shutdown(struct device *dev) > >* If this is a kexec reboot, turn off Bus Master bit on the > >* device to tell it to not continue to do DMA. Don't touch > >* devices in D3cold or unknown states. > > - * If it is not a kexec reboot, firmware will hit the PCI > > - * devices with big hammer and stop their DMA any
Re: [PATCH] makedumpfile: Remove duplicated variable declarations
On Thu, Jan 30, 2020 at 12:28 AM HAGIO KAZUHITO(萩尾 一仁) wrote: > > Hi Kairui, > > Thank you for the patch. > > > -Original Message- > > When building on Fedora 32, following error is observed: > > > > /usr/bin/ld: > > erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2010: > > multiple definition of `crash_reserved_mem_nr'; > > elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2010: > > first > > defined here > > /usr/bin/ld: > > erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2009: > > multiple definition of `crash_reserved_mem'; > > elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2009: > > first > > defined here > > /usr/bin/ld: > > erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1278: > > multiple definition of `parallel_info_t'; > > elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1278: > > first > > defined here > > /usr/bin/ld: > > erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1265: > > multiple definition of `splitting_info_t'; > > elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1265: > > first > > defined here > > > > And apparently, these variables are wrongly declared multiple times. So > > remove duplicated declaration. 
> > > > Signed-off-by: Kairui Song > > --- > > makedumpfile.c | 2 ++ > > makedumpfile.h | 10 ++ > > 2 files changed, 8 insertions(+), 4 deletions(-) > > > > diff --git a/makedumpfile.c b/makedumpfile.c > > index e290fbd..9aad77b 100644 > > --- a/makedumpfile.c > > +++ b/makedumpfile.c > > @@ -34,6 +34,8 @@ struct array_table array_table; > > struct number_table number_table; > > struct srcfile_table srcfile_table; > > struct save_control sc; > > +struct parallel_info parallel_info_t; > > +struct splitting_infosplitting_info_t; > > > > struct vm_table vt = { 0 }; > > struct DumpInfo *info = NULL; > > diff --git a/makedumpfile.h b/makedumpfile.h > > index 68d9691..614764c 100644 > > --- a/makedumpfile.h > > +++ b/makedumpfile.h > > @@ -1262,7 +1262,8 @@ struct splitting_info { > > mdf_pfn_t end_pfn; > > off_t offset_eraseinfo; > > unsigned long size_eraseinfo; > > -} splitting_info_t; > > +}; > > +extern struct splitting_info splitting_info_t; > > Interestingly, it seems that the splitting_info_t and parallel_info_t should > have been typedef'd because of their names ending with _t and not being used > as variable. (We use info->splitting_info and info->parallel_info.) > > So, is the following patch OK? then I can modify your patch. > Hi, Thanks for the review, and yes it's definitely OK to change the patch in this way. I just took a brief look at the code, and modified it in the way that actually change nothing. And after a second look, indeed they are never used as variable, only used as parameters of sizeof(). So actually can we just get rid of them, and use sizeof(struct parallel_info) and sizeof(struct splitting_info) instead? It may be even simpler. I'm OK with either way. 
> --- a/makedumpfile.h > +++ b/makedumpfile.h > @@ -1255,7 +1255,7 @@ struct makedumpfile_data_header { > int64_t buf_size; > }; > > -struct splitting_info { > +typedef struct splitting_info { > char*name_dumpfile; > int fd_bitmap; > mdf_pfn_t start_pfn; > @@ -1264,7 +1264,7 @@ struct splitting_info { > unsigned long size_eraseinfo; > } splitting_info_t; > > -struct parallel_info { > +typedef struct parallel_info { > int fd_memory; > int fd_bitmap_memory; > int fd_bitmap; > @@ -2006,8 +2006,8 @@ struct memory_range { > }; > > #define CRASH_RESERVED_MEM_NR 8 > -struct memory_range crash_reserved_mem[CRASH_RESERVED_MEM_NR]; > -int crash_reserved_mem_nr; > +extern struct memory_range crash_reserved_mem[CRASH_RESERVED_MEM_NR]; > +extern int crash_reserved_mem_nr; > > unsigned long read_vmcoreinfo_symbol(char *str_symbol); > int readmem(int type_addr, unsigned long long addr, void *bufptr, size_t > size); > > > Thanks, > Kazu > > >
[PATCH] kexec-tools: Remove duplicated variable declarations
When building kexec-tools for Fedora 32, following error is observed: /usr/bin/ld: kexec/arch/x86_64/kexec-bzImage64.o:(.bss+0x0): multiple definition of `bzImage_support_efi_boot'; kexec/arch/i386/kexec-bzImage.o:(.bss+0x0): first defined here /builddir/build/BUILD/kexec-tools-2.0.20/kexec/arch/arm/../../fs2dt.h:33: multiple definition of `my_debug'; kexec/fs2dt.o:/builddir/build/BUILD/kexec-tools-2.0.20/kexec/fs2dt.h:33: first defined here /builddir/build/BUILD/kexec-tools-2.0.20/kexec/arch/arm64/kexec-arm64.h:68: multiple definition of `arm64_mem'; kexec/fs2dt.o:/builddir/build/BUILD/kexec-tools-2.0.20/././kexec/arch/arm64/kexec-arm64.h:68: first defined here /builddir/build/BUILD/kexec-tools-2.0.20/kexec/arch/arm64/kexec-arm64.h:54: multiple definition of `initrd_size'; kexec/fs2dt.o:/builddir/build/BUILD/kexec-tools-2.0.20/././kexec/arch/arm64/kexec-arm64.h:54: first defined here /builddir/build/BUILD/kexec-tools-2.0.20/kexec/arch/arm64/kexec-arm64.h:53: multiple definition of `initrd_base'; kexec/fs2dt.o:/builddir/build/BUILD/kexec-tools-2.0.20/././kexec/arch/arm64/kexec-arm64.h:53: first defined here And apparently, these variables are wrongly declared multiple times. So remove duplicated declaration. Signed-off-by: Kairui Song --- kexec/arch/arm64/kexec-arm64.h | 6 +++--- kexec/arch/ppc64/kexec-elf-ppc64.c | 2 -- kexec/arch/x86_64/kexec-bzImage64.c | 1 - kexec/fs2dt.h | 2 +- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/kexec/arch/arm64/kexec-arm64.h b/kexec/arch/arm64/kexec-arm64.h index 628de79..ed447ac 100644 --- a/kexec/arch/arm64/kexec-arm64.h +++ b/kexec/arch/arm64/kexec-arm64.h @@ -50,8 +50,8 @@ int zImage_arm64_load(int argc, char **argv, const char *kernel_buf, void zImage_arm64_usage(void); -off_t initrd_base; -off_t initrd_size; +extern off_t initrd_base; +extern off_t initrd_size; /** * struct arm64_mem - Memory layout info. 
@@ -65,7 +65,7 @@ struct arm64_mem { }; #define arm64_mem_ngv UINT64_MAX -struct arm64_mem arm64_mem; +extern struct arm64_mem arm64_mem; uint64_t get_phys_offset(void); uint64_t get_vp_offset(void); diff --git a/kexec/arch/ppc64/kexec-elf-ppc64.c b/kexec/arch/ppc64/kexec-elf-ppc64.c index 3510b70..695b8b0 100644 --- a/kexec/arch/ppc64/kexec-elf-ppc64.c +++ b/kexec/arch/ppc64/kexec-elf-ppc64.c @@ -44,8 +44,6 @@ uint64_t initrd_base, initrd_size; unsigned char reuse_initrd = 0; const char *ramdisk; -/* Used for enabling printing message from purgatory code */ -int my_debug = 0; int elf_ppc64_probe(const char *buf, off_t len) { diff --git a/kexec/arch/x86_64/kexec-bzImage64.c b/kexec/arch/x86_64/kexec-bzImage64.c index 8edb3e4..ba8dc48 100644 --- a/kexec/arch/x86_64/kexec-bzImage64.c +++ b/kexec/arch/x86_64/kexec-bzImage64.c @@ -42,7 +42,6 @@ #include static const int probe_debug = 0; -int bzImage_support_efi_boot; int bzImage64_probe(const char *buf, off_t len) { diff --git a/kexec/fs2dt.h b/kexec/fs2dt.h index 7633273..fe24931 100644 --- a/kexec/fs2dt.h +++ b/kexec/fs2dt.h @@ -30,7 +30,7 @@ extern struct bootblock bb[1]; /* Used for enabling printing message from purgatory code * Only has implemented for PPC64 */ -int my_debug; +extern int my_debug; extern int dt_no_old_root; void reserve(unsigned long long where, unsigned long long length); -- 2.24.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH] makedumpfile: Remove duplicated variable declarations
When building on Fedora 32, following error is observed: /usr/bin/ld: erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2010: multiple definition of `crash_reserved_mem_nr'; elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2010: first defined here /usr/bin/ld: erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2009: multiple definition of `crash_reserved_mem'; elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2009: first defined here /usr/bin/ld: erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1278: multiple definition of `parallel_info_t'; elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1278: first defined here /usr/bin/ld: erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1265: multiple definition of `splitting_info_t'; elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1265: first defined here And apparently, these variables are wrongly declared multiple times. So remove duplicated declaration. 
Signed-off-by: Kairui Song --- makedumpfile.c | 2 ++ makedumpfile.h | 10 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/makedumpfile.c b/makedumpfile.c index e290fbd..9aad77b 100644 --- a/makedumpfile.c +++ b/makedumpfile.c @@ -34,6 +34,8 @@ struct array_tablearray_table; struct number_tablenumber_table; struct srcfile_table srcfile_table; struct save_controlsc; +struct parallel_info parallel_info_t; +struct splitting_info splitting_info_t; struct vm_tablevt = { 0 }; struct DumpInfo*info = NULL; diff --git a/makedumpfile.h b/makedumpfile.h index 68d9691..614764c 100644 --- a/makedumpfile.h +++ b/makedumpfile.h @@ -1262,7 +1262,8 @@ struct splitting_info { mdf_pfn_t end_pfn; off_t offset_eraseinfo; unsigned long size_eraseinfo; -} splitting_info_t; +}; +extern struct splitting_info splitting_info_t; struct parallel_info { int fd_memory; @@ -1275,7 +1276,8 @@ struct parallel_info { #ifdef USELZO lzo_bytep wrkmem; #endif -} parallel_info_t; +}; +extern struct parallel_info parallel_info_t; struct ppc64_vmemmap { unsigned long phys; @@ -2006,8 +2008,8 @@ struct memory_range { }; #define CRASH_RESERVED_MEM_NR 8 -struct memory_range crash_reserved_mem[CRASH_RESERVED_MEM_NR]; -int crash_reserved_mem_nr; +extern struct memory_range crash_reserved_mem[CRASH_RESERVED_MEM_NR]; +extern int crash_reserved_mem_nr; unsigned long read_vmcoreinfo_symbol(char *str_symbol); int readmem(int type_addr, unsigned long long addr, void *bufptr, size_t size); -- 2.24.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
On Thu, Jan 16, 2020 at 1:31 AM Khalid Aziz wrote: > > On 1/13/20 10:07 AM, Kairui Song wrote: > > On Sun, Jan 12, 2020 at 2:33 AM Deepa Dinamani > > wrote: > >> > >>> Hi, there are some previous works about this issue, reset PCI devices > >>> in kdump kernel to stop ongoing DMA: > >>> > >>> [v7,0/5] Reset PCIe devices to address DMA problem on kdump with iommu > >>> https://lore.kernel.org/patchwork/cover/343767/ > >>> > >>> [v2] PCI: Reset PCIe devices to stop ongoing DMA > >>> https://lore.kernel.org/patchwork/patch/379191/ > >>> > >>> And didn't get merged, that patch are trying to fix some DMAR error > >>> problem, but resetting devices is a bit too destructive, and the > >>> problem is later fixed in IOMMU side. And in most case the DMA seems > >>> harmless, as they targets first kernel's memory and kdump kernel only > >>> live in crash memory. > >> > >> I was going to ask the same. If the kdump kernel had IOMMU on, would > >> that still be a problem? > > > > It will still fail, doing DMA is not a problem, it only go wrong when > > a device's upstream bridge is mistakenly shutdown before the device > > shutdown. > > > >> > >>> Also, by the time kdump kernel is able to scan and reset devices, > >>> there are already a very large time window where things could go > >>> wrong. > >>> > >>> The currently problem observed only happens upon kdump kernel > >>> shutdown, as the upper bridge is disabled before the device is > >>> disabledm so DMA will raise error. It's more like a problem of wrong > >>> device shutting down order. > >> > >> The way it was described earlier "During this time, the SUT sometimes > >> gets a PCI error that raises an NMI." suggests that it isn't really > >> restricted to kexec/kdump. > >> Any attached device without an active driver might attempt spurious or > >> malicious DMA and trigger the same during normal operation. > >> Do you have available some more reporting of what happens during the > >> PCIe error handling? 
> > > > Let me add more info about this: > > > > On the machine where I can reproduce this issue, the first kernel > > always runs fine, and kdump kernel works fine during dumping the > > vmcore, even if I keep the kdump kernel running for hours, nothing > > goes wrong. If there are DMA during normal operation that will cause > > problem, this should have exposed it. > > > > This is the part that is puzzling me. Error shows up only when kdump > kernel is being shut down. kdump kernel can run for hours without this > issue. What is the operation from downstream device that is resulting in > uncorrectable error - is it indeed a DMA request? Why does that > operation from downstream device not happen until shutdown? > > I just want to make sure we fix the right problem in the right way. > Actually the device could keep sending requests with no problem while the kdump kernel is running, e.g. keep sending DMA; all DMA targets the first kernel's system memory, so kdump runs fine as long as nothing touches the reserved crash memory. The error is reported by the port: when it is shut down with its bus master bit cleared, a downstream request will cause an error. I'm not sure what request it really is either; it could depend on the device. On that machine, the error could be reproduced when either the NIC or the HPSA is not reset in kdump, and from the bug report, the reporter used a different NIC card and it's also reproducible. The NIC is much less likely to cause a bridge error though (HPSA is about 7/10 reproducible, NIC is about 3/10), so the devices could send different requests but fail in the same way (UR error reported from the bridge). I will try to do more debugging, but I'm not sure how I can intercept the PCIe operations to get some info about what is actually causing the issue. Do you have any suggestions? -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
On Wed, Jan 15, 2020 at 9:17 AM Deepa Dinamani wrote: > > On Mon, Jan 13, 2020 at 9:07 AM Kairui Song wrote: > > > > On Sun, Jan 12, 2020 at 2:33 AM Deepa Dinamani > > wrote: > > > > > > > Hi, there are some previous works about this issue, reset PCI devices > > > > in kdump kernel to stop ongoing DMA: > > > > > > > > [v7,0/5] Reset PCIe devices to address DMA problem on kdump with iommu > > > > https://lore.kernel.org/patchwork/cover/343767/ > > > > > > > > [v2] PCI: Reset PCIe devices to stop ongoing DMA > > > > https://lore.kernel.org/patchwork/patch/379191/ > > > > > > > > And didn't get merged, that patch are trying to fix some DMAR error > > > > problem, but resetting devices is a bit too destructive, and the > > > > problem is later fixed in IOMMU side. And in most case the DMA seems > > > > harmless, as they targets first kernel's memory and kdump kernel only > > > > live in crash memory. > > > > > > I was going to ask the same. If the kdump kernel had IOMMU on, would > > > that still be a problem? > > > > It will still fail, doing DMA is not a problem, it only go wrong when > > a device's upstream bridge is mistakenly shutdown before the device > > shutdown. > > > > > > > > > Also, by the time kdump kernel is able to scan and reset devices, > > > > there are already a very large time window where things could go > > > > wrong. > > > > > > > > The currently problem observed only happens upon kdump kernel > > > > shutdown, as the upper bridge is disabled before the device is > > > > disabledm so DMA will raise error. It's more like a problem of wrong > > > > device shutting down order. > > > > > > The way it was described earlier "During this time, the SUT sometimes > > > gets a PCI error that raises an NMI." suggests that it isn't really > > > restricted to kexec/kdump. > > > Any attached device without an active driver might attempt spurious or > > > malicious DMA and trigger the same during normal operation. 
> > > Do you have available some more reporting of what happens during the > > > PCIe error handling? > > > > Let me add more info about this: > > > > On the machine where I can reproduce this issue, the first kernel > > always runs fine, and kdump kernel works fine during dumping the > > vmcore, even if I keep the kdump kernel running for hours, nothing > > goes wrong. If there are DMA during normal operation that will cause > > problem, this should have exposed it. > > > > The problem only occur when kdump kernel try to reboot, no matter how > > long the kdump kernel have been running (few minutes or hours). The > > machine is dead after printing: > > [ 101.438300] reboot: Restarting system^M > > [ 101.455360] reboot: machine restart^M > > > > And I can find following logs happend just at that time, in the > > "Integrated Management Log" from the iLO web interface: > > 1254 OS 12/25/2019 09:08 12/25/2019 09:08 1 User Remotely Initiated NMI > > Switch > > 1253 System Error 12/25/2019 09:08 12/25/2019 09:08 1 An Unrecoverable > > System Error (NMI) has occurred (Service Information: 0x, > > 0x) > > 1252 PCI Bus 12/25/2019 09:07 12/25/2019 09:07 1 Uncorrectable PCI > > Express Error (Embedded device, Bus 0, Device 2, Function 2, Error > > status 0x0010) > > 1251 System Error 12/25/2019 09:07 12/25/2019 09:07 1 Unrecoverable > > System Error (NMI) has occurred. 
System Firmware will log additional > > details in a separate IML entry if possible > > 1250 PCI Bus 12/25/2019 09:07 12/25/2019 09:07 1 PCI Bus Error (Slot > > 0, Bus 0, Device 2, Function 2) > > > > And the topology is: > > [:00]-+-00.0 Intel Corporation Xeon E7 v3/Xeon E5 v3/Core i7 DMI2 > > +-01.0-[02]-- > > +-01.1-[05]-- > > +-02.0-[06]--+-00.0 Emulex Corporation OneConnect NIC (Skyhawk) > > |+-00.1 Emulex Corporation OneConnect NIC (Skyhawk) > > |+-00.2 Emulex Corporation OneConnect NIC (Skyhawk) > > |+-00.3 Emulex Corporation OneConnect NIC (Skyhawk) > > |+-00.4 Emulex Corporation OneConnect NIC (Skyhawk) > > |+-00.5 Emulex Corporation OneConnect NIC (Skyhawk) > > |+-00.6 Emulex Corporation OneConnect NIC (S
Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
On Sun, Jan 12, 2020 at 2:33 AM Deepa Dinamani wrote: > > > Hi, there are some previous works about this issue, reset PCI devices > > in kdump kernel to stop ongoing DMA: > > > > [v7,0/5] Reset PCIe devices to address DMA problem on kdump with iommu > > https://lore.kernel.org/patchwork/cover/343767/ > > > > [v2] PCI: Reset PCIe devices to stop ongoing DMA > > https://lore.kernel.org/patchwork/patch/379191/ > > > > And didn't get merged, that patch are trying to fix some DMAR error > > problem, but resetting devices is a bit too destructive, and the > > problem is later fixed in IOMMU side. And in most case the DMA seems > > harmless, as they targets first kernel's memory and kdump kernel only > > live in crash memory. > > I was going to ask the same. If the kdump kernel had IOMMU on, would > that still be a problem? It will still fail, doing DMA is not a problem, it only go wrong when a device's upstream bridge is mistakenly shutdown before the device shutdown. > > > Also, by the time kdump kernel is able to scan and reset devices, > > there are already a very large time window where things could go > > wrong. > > > > The currently problem observed only happens upon kdump kernel > > shutdown, as the upper bridge is disabled before the device is > > disabledm so DMA will raise error. It's more like a problem of wrong > > device shutting down order. > > The way it was described earlier "During this time, the SUT sometimes > gets a PCI error that raises an NMI." suggests that it isn't really > restricted to kexec/kdump. > Any attached device without an active driver might attempt spurious or > malicious DMA and trigger the same during normal operation. > Do you have available some more reporting of what happens during the > PCIe error handling? 
Let me add more info about this: On the machine where I can reproduce this issue, the first kernel always runs fine, and kdump kernel works fine during dumping the vmcore, even if I keep the kdump kernel running for hours, nothing goes wrong. If there are DMA during normal operation that will cause problem, this should have exposed it. The problem only occur when kdump kernel try to reboot, no matter how long the kdump kernel have been running (few minutes or hours). The machine is dead after printing: [ 101.438300] reboot: Restarting system^M [ 101.455360] reboot: machine restart^M And I can find following logs happend just at that time, in the "Integrated Management Log" from the iLO web interface: 1254 OS 12/25/2019 09:08 12/25/2019 09:08 1 User Remotely Initiated NMI Switch 1253 System Error 12/25/2019 09:08 12/25/2019 09:08 1 An Unrecoverable System Error (NMI) has occurred (Service Information: 0x, 0x) 1252 PCI Bus 12/25/2019 09:07 12/25/2019 09:07 1 Uncorrectable PCI Express Error (Embedded device, Bus 0, Device 2, Function 2, Error status 0x0010) 1251 System Error 12/25/2019 09:07 12/25/2019 09:07 1 Unrecoverable System Error (NMI) has occurred. 
System Firmware will log additional details in a separate IML entry if possible 1250 PCI Bus 12/25/2019 09:07 12/25/2019 09:07 1 PCI Bus Error (Slot 0, Bus 0, Device 2, Function 2) And the topology is: [:00]-+-00.0 Intel Corporation Xeon E7 v3/Xeon E5 v3/Core i7 DMI2 +-01.0-[02]-- +-01.1-[05]-- +-02.0-[06]--+-00.0 Emulex Corporation OneConnect NIC (Skyhawk) |+-00.1 Emulex Corporation OneConnect NIC (Skyhawk) |+-00.2 Emulex Corporation OneConnect NIC (Skyhawk) |+-00.3 Emulex Corporation OneConnect NIC (Skyhawk) |+-00.4 Emulex Corporation OneConnect NIC (Skyhawk) |+-00.5 Emulex Corporation OneConnect NIC (Skyhawk) |+-00.6 Emulex Corporation OneConnect NIC (Skyhawk) |\-00.7 Emulex Corporation OneConnect NIC (Skyhawk) +-02.1-[0f]-- +-02.2-[07]00.0 Hewlett-Packard Company Smart Array Gen9 Controllers It's a bridge reporting the error. It should be an unsupported request error, because the downstream device is still alive and sending requests, but the port has bus mastering off. If I manually shut down the "Smart Array" (HPSA) device before the kdump reboot, it will always reboot just fine. And as the patch description said, the HPSA is used in the first kernel, but didn't get reset in the kdump kernel because its driver is not loaded. When shutting down a bridge, the kernel should shut down the downstream devices first, and then shut down the bridge and clear its bus master bit. But in the kdump case, the kernel skipped some device shutdowns because their drivers were not loaded, and the kernel doesn't know they are enabled. This problem is not limited to HPSA; the NIC listed in the above topology may also make the bridge error out, if HPSA gets loaded in the kdump kernel and the NIC gets ignored. > > "The reaction to the NMI that the kdump kernel takes is problematic." > Or the NMI should not have been triggered to begin with? Where do
Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
On Sat, Jan 11, 2020 at 11:46 AM Khalid Aziz wrote: > > On 1/10/20 5:50 PM, Baoquan He wrote: > > On 01/10/20 at 05:18pm, Khalid Aziz wrote: > >> On 1/10/20 4:00 PM, Jerry Hoemann wrote: > >>> On Fri, Jan 10, 2020 at 03:25:36PM -0700, Khalid Aziz and Shuah Khan > >>> wrote: > >>>> On 1/10/20 2:42 PM, Bjorn Helgaas wrote: > >>>>> [+cc Deepa (also working in this area)] > >>>>> > >>>>> On Thu, Dec 26, 2019 at 03:21:18AM +0800, Kairui Song wrote: > >>>>>> There are reports about kdump hang upon reboot on some HPE machines, > >>>>>> kernel hanged when trying to shutdown a PCIe port, an uncorrectable > >>>>>> error occurred and crashed the system. > >>>>> > >>>>> Details? Do you have URLs for bug reports, dmesg logs, etc? > >>>>> > >>>>>> On the machine I can reproduce this issue, part of the topology > >>>>>> looks like this: > >>>>>> > >>>>>> [:00]-+-00.0 Intel Corporation Xeon E7 v3/Xeon E5 v3/Core i7 DMI2 > >>>>>> +-01.0-[02]-- > >>>>>> +-01.1-[05]-- > >>>>>> +-02.0-[06]--+-00.0 tEmulex Corporation OneConnect NIC > >>>>>> (Skyhawk) > >>>>>> |+-00.1 Emulex Corporation OneConnect NIC > >>>>>> (Skyhawk) > >>>>>> |+-00.2 Emulex Corporation OneConnect NIC > >>>>>> (Skyhawk) > >>>>>> |+-00.3 Emulex Corporation OneConnect NIC > >>>>>> (Skyhawk) > >>>>>> |+-00.4 Emulex Corporation OneConnect NIC > >>>>>> (Skyhawk) > >>>>>> |+-00.5 Emulex Corporation OneConnect NIC > >>>>>> (Skyhawk) > >>>>>> |+-00.6 Emulex Corporation OneConnect NIC > >>>>>> (Skyhawk) > >>>>>> |\-00.7 Emulex Corporation OneConnect NIC > >>>>>> (Skyhawk) > >>>>>> +-02.1-[0f]-- > >>>>>> +-02.2-[07]00.0 Hewlett-Packard Company Smart Array > >>>>>> Gen9 Controllers > >>>>>> > >>>>>> When shutting down PCIe port :00:02.2 or :00:02.0, the machine > >>>>>> will hang, depend on which device is reinitialized in kdump kernel. 
> >>>>>> > >>>>>> If force remove unused device then trigger kdump, the problem will > >>>>>> never > >>>>>> happen: > >>>>>> > >>>>>> echo 1 > /sys/bus/pci/devices/\:00\:02.2/\:07\:00.0/remove > >>>>>> echo c > /proc/sysrq-trigger > >>>>>> > >>>>>> ... Kdump save vmcore through network, the NIC get reinitialized > >>>>>> and > >>>>>> hpsa is untouched. Then reboot with no problem. (If hpsa is used > >>>>>> instead, shutdown the NIC in first kernel will help) > >>>>>> > >>>>>> The cause is that some devices are enabled by the first kernel, but it > >>>>>> don't have the chance to shutdown the device, and kdump kernel is not > >>>>>> aware of it, unless it reinitialize the device. > >>>>>> > >>>>>> Upon reboot, kdump kernel will skip downstream device shutdown and > >>>>>> clears its bridge's master bit directly. The downstream device could > >>>>>> error out as it can still send requests but upstream refuses it. > >>>>> > >>>>> Can you help me understand the sequence of events? If I understand > >>>>> correctly, the desired sequence is: > >>>>> > >>>>> - user kernel boots > >>>>> - user kernel panics and kexecs to kdump kernel > >>>>> - kdump kernel writes vmcore to network or disk > >>>>> - kdump kernel reboots > >>>>> - user kernel boots > >>>>> > >>>>> But the problem is that as part of the kdump kernel reboot, > >>>>> > >>>>> - kdump kernel disables bus mastering for a Root Por
Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
On Sat, Jan 11, 2020 at 8:45 AM Baoquan He wrote: > On 01/10/20 at 04:00pm, Jerry Hoemann wrote: > > > I am not understanding this failure mode either. That code in > > > pci_device_shutdown() was added originally to address this very issue. > > > The patch 4fc9bbf98fd6 ("PCI: Disable Bus Master only on kexec reboot") > > > shut down any errant DMAs from PCI devices as we kexec a new kernel. In > > > this new patch, this is the same code path that will be taken again when > > > kdump kernel is shutting down. If the errant DMA problem was not fixed > > > by clearing Bus Master bit in this path when kdump kernel was being > > > kexec'd, why does the same code path work the second time around when > > > kdump kernel is shutting down? Is there more going on that we don't > > > understand? > > > > > > > Khalid, > > > > I don't believe we execute that code path in the crash case. > > > > The variable kexec_in_progress is set true in kernel_kexec() before > > calling > > machine_kexec(). This is the fast reboot case. > > > > I don't see kexec_in_progress set true elsewhere. > > > > > > The code path for crash is different. > > > > For instance, panic() will call > > -> __crash_kexec() which calls > > -> machine_kexec(). > > > > So the setting of kexec_in_progress is bypassed. > > Yeah, it's a differet behaviour than kexec case. I talked to Kairui, the > patch log may be not very clear. Below is summary I got from my > understanding about this issue: > > ~~~ > Problem: > > When crash is triggered, system jumps into kdump kernel to collect > vmcore and dump out. After dumping is finished, kdump kernel will try > ty reboot to normal kernel. This hang happened during kdump kernel > rebooting, when dumping is network dumping, e.g ssh/nfs, local storage > is HPSA. > > Root cause: > > When configuring network dumping, only network driver modules are added > into kdump initramfs. However, the storage HPSA pcie device is enabled > in 1st kernel, its status is PCI_D3hot. 
When crashed system jumps to kdump > kernel, we didn't shutdown any device for safety and efficiency. Then > during kdump kernel boot up, the pci scan will get hpsa device and only > initialize its status as pci_dev->current_state = PCI_UNKNOWN. This > pci_dev->current_state will be manipulated by the relevant device > driver. So HPSA device will never have chance to calibrate its status, > and can't be shut down by pci_device_shutdown() called by reboot > service. It's still PCI_D3hot, then crash happened when system try to > shutdown its upper bridge. > > Fix: > > Here, Kairui uses a quirk to get PM state and mask off value bigger than > PCI_D3cold. Means, all devices will get PM state > pci_dev->current_state = PCI_D0 or PCI_D3hot Or to put it simple, I just synced the actual PM state into pci_dev->current_state using a quirk, for kdump kernel only. > Finally, during kdump > reboot stage, this device can be shut down successfully by clearing its > master bit. > > ~~~ > > About this patch, I think the quirk getting active PM state for all devices > may be risky, it will impact normal kernel too which doesn't have this issue. > > Wondering if there's any other way to fix or work around it. > Thank you for the detailed description! -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
On Sat, Jan 11, 2020 at 5:42 AM Bjorn Helgaas wrote: > > Can you help me understand the sequence of events? If I understand > correctly, the desired sequence is: > > - user kernel boots > - user kernel panics and kexecs to kdump kernel One important thing needs to be mentioned here: the user kernel kexecs into the kdump kernel using the fast path, which does very few things and leaves all the PCI devices untouched. If they are on, or doing DMA, they will just keep doing that; nothing will stop them. In most cases the ongoing DMA seems harmless though, as the kdump kernel only lives in reserved crash memory. > - kdump kernel writes vmcore to network or disk > - kdump kernel reboots > - user kernel boots > > But the problem is that as part of the kdump kernel reboot, > > - kdump kernel disables bus mastering for a Root Port > - device below the Root Port attempts DMA > - Root Port receives DMA transaction, handles it as Unsupported > Request, sends UR Completion to device > - device signals uncorrectable error > - uncorrectable error causes a crash (Or a hang? You mention both > and I'm not sure which it is) > > Is that right so far? Yes, everything else is correct. On the machine where I can reproduce it, the system just hangs; even the serial console is dead with no output. > > > So for kdump, let kernel read the correct hardware power state on boot, > > and always clear the bus master bit of PCI device upon shutdown if the > > device is on. PCIe port driver will always shutdown all downstream > > devices first, so this should ensure all downstream devices have bus > > master bit off before clearing the bridge's bus master bit. 
> > > > Signed-off-by: Kairui Song > > --- > > drivers/pci/pci-driver.c | 11 --- > > drivers/pci/quirks.c | 20 > > 2 files changed, 28 insertions(+), 3 deletions(-) > > > > diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c > > index 0454ca0e4e3f..84a7fd643b4d 100644 > > --- a/drivers/pci/pci-driver.c > > +++ b/drivers/pci/pci-driver.c > > @@ -18,6 +18,7 @@ > > #include > > #include > > #include > > +#include > > #include "pci.h" > > #include "pcie/portdrv.h" > > > > @@ -488,10 +489,14 @@ static void pci_device_shutdown(struct device *dev) > >* If this is a kexec reboot, turn off Bus Master bit on the > >* device to tell it to not continue to do DMA. Don't touch > >* devices in D3cold or unknown states. > > - * If it is not a kexec reboot, firmware will hit the PCI > > - * devices with big hammer and stop their DMA any way. > > + * If this is kdump kernel, also turn off Bus Master, the device > > + * could be activated by previous crashed kernel and may block > > + * it's upstream from shutting down. > > + * Else, firmware will hit the PCI devices with big hammer > > + * and stop their DMA any way. > >*/ > > - if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot)) > > + if ((kexec_in_progress || is_kdump_kernel()) && > > + pci_dev->current_state <= PCI_D3hot) > > pci_clear_master(pci_dev); > > I'm clearly missing something because this will turn off bus mastering > in cases where we previously left it enabled. > > I was assuming the crash was related to a device doing DMA when the > Root Port had bus mastering disabled. But that must be wrong. That is just what is happening. When kdump kernel try to reboot, it only cleared bus mastering bit of the Root Port, ignoring enabled device under it, because it's not the kdump kernel that enabled the device, it's the first kernel enabled it, and kdump kernel don't know it. > > I'd like to understand the crash/hang better because the quirk > especially is hard to connect to anything. 
If the crash is because of > an AER or other PCIe error, maybe another possibility is that we could > handle it better or disable signaling of it or something. > If we can solve the problem by properly shutting down the devices in the right order, then maybe it is better not to disable any error handling features? Otherwise the kernel might miss some real hardware issue. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
On Thu, Dec 26, 2019 at 3:21 AM Kairui Song wrote: > > There are reports about kdump hang upon reboot on some HPE machines, > kernel hanged when trying to shutdown a PCIe port, an uncorrectable > error occurred and crashed the system. > > On the machine I can reproduce this issue, part of the topology > looks like this: > > [:00]-+-00.0 Intel Corporation Xeon E7 v3/Xeon E5 v3/Core i7 DMI2 > +-01.0-[02]-- > +-01.1-[05]-- > +-02.0-[06]--+-00.0 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.1 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.2 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.3 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.4 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.5 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.6 Emulex Corporation OneConnect NIC (Skyhawk) > |\-00.7 Emulex Corporation OneConnect NIC (Skyhawk) > +-02.1-[0f]-- > +-02.2-[07]00.0 Hewlett-Packard Company Smart Array Gen9 > Controllers > > When shuting down PCIe port :00:02.2 or :00:02.0, the machine > will hang, depend on which device is reinitialized in kdump kernel. > > If force remove unused device then trigger kdump, the problem will never > happen: > > echo 1 > /sys/bus/pci/devices/\:00\:02.2/\:07\:00.0/remove > echo c > /proc/sysrq-trigger > > ... Kdump save vmcore through network, the NIC get reinitialized and > hpsa is untouched. Then reboot with no problem. (If hpsa is used > instead, shutdown the NIC in first kernel will help) > > The cause is that some devices are enabled by the first kernel, but it > don't have the chance to shutdown the device, and kdump kernel is not > aware of it, unless it reinitialize the device. > > Upon reboot, kdump kernel will skip downstream device shutdown and > clears its bridge's master bit directly. The downstream device could > error out as it can still send requests but upstream refuses it. 
> > So for kdump, let kernel read the correct hardware power state on boot, > and always clear the bus master bit of PCI device upon shutdown if the > device is on. PCIe port driver will always shutdown all downstream > devices first, so this should ensure all downstream devices have bus > master bit off before clearing the bridge's bus master bit. > > Signed-off-by: Kairui Song > --- > drivers/pci/pci-driver.c | 11 --- > drivers/pci/quirks.c | 20 > 2 files changed, 28 insertions(+), 3 deletions(-) > > diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c > index 0454ca0e4e3f..84a7fd643b4d 100644 > --- a/drivers/pci/pci-driver.c > +++ b/drivers/pci/pci-driver.c > @@ -18,6 +18,7 @@ > #include > #include > #include > +#include > #include "pci.h" > #include "pcie/portdrv.h" > > @@ -488,10 +489,14 @@ static void pci_device_shutdown(struct device *dev) > * If this is a kexec reboot, turn off Bus Master bit on the > * device to tell it to not continue to do DMA. Don't touch > * devices in D3cold or unknown states. > -* If it is not a kexec reboot, firmware will hit the PCI > -* devices with big hammer and stop their DMA any way. > +* If this is kdump kernel, also turn off Bus Master, the device > +* could be activated by previous crashed kernel and may block > +* it's upstream from shutting down. > +* Else, firmware will hit the PCI devices with big hammer > +* and stop their DMA any way. 
> */ > - if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot)) > + if ((kexec_in_progress || is_kdump_kernel()) && > + pci_dev->current_state <= PCI_D3hot) > pci_clear_master(pci_dev); > } > > diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c > index 4937a088d7d8..c65d11ab3939 100644 > --- a/drivers/pci/quirks.c > +++ b/drivers/pci/quirks.c > @@ -28,6 +28,7 @@ > #include > #include > #include > +#include > #include/* isa_dma_bridge_buggy */ > #include "pci.h" > > @@ -192,6 +193,25 @@ static int __init pci_apply_final_quirks(void) > } > fs_initcall_sync(pci_apply_final_quirks); > > +/* > + * Read the device state even if it's not enabled. The device could be > + * activated by previous crashed kernel, this will read and correct the > + * cached state. > + */ > +static void quirk_r
[RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
There are reports about kdump hang upon reboot on some HPE machines, kernel hanged when trying to shutdown a PCIe port, an uncorrectable error occurred and crashed the system. On the machine I can reproduce this issue, part of the topology looks like this: [:00]-+-00.0 Intel Corporation Xeon E7 v3/Xeon E5 v3/Core i7 DMI2 +-01.0-[02]-- +-01.1-[05]-- +-02.0-[06]--+-00.0 Emulex Corporation OneConnect NIC (Skyhawk) |+-00.1 Emulex Corporation OneConnect NIC (Skyhawk) |+-00.2 Emulex Corporation OneConnect NIC (Skyhawk) |+-00.3 Emulex Corporation OneConnect NIC (Skyhawk) |+-00.4 Emulex Corporation OneConnect NIC (Skyhawk) |+-00.5 Emulex Corporation OneConnect NIC (Skyhawk) |+-00.6 Emulex Corporation OneConnect NIC (Skyhawk) |\-00.7 Emulex Corporation OneConnect NIC (Skyhawk) +-02.1-[0f]-- +-02.2-[07]00.0 Hewlett-Packard Company Smart Array Gen9 Controllers When shuting down PCIe port :00:02.2 or :00:02.0, the machine will hang, depend on which device is reinitialized in kdump kernel. If force remove unused device then trigger kdump, the problem will never happen: echo 1 > /sys/bus/pci/devices/\:00\:02.2/\:07\:00.0/remove echo c > /proc/sysrq-trigger ... Kdump save vmcore through network, the NIC get reinitialized and hpsa is untouched. Then reboot with no problem. (If hpsa is used instead, shutdown the NIC in first kernel will help) The cause is that some devices are enabled by the first kernel, but it don't have the chance to shutdown the device, and kdump kernel is not aware of it, unless it reinitialize the device. Upon reboot, kdump kernel will skip downstream device shutdown and clears its bridge's master bit directly. The downstream device could error out as it can still send requests but upstream refuses it. So for kdump, let kernel read the correct hardware power state on boot, and always clear the bus master bit of PCI device upon shutdown if the device is on. 
PCIe port driver will always shutdown all downstream devices first, so this should ensure all downstream devices have bus master bit off before clearing the bridge's bus master bit. Signed-off-by: Kairui Song --- drivers/pci/pci-driver.c | 11 --- drivers/pci/quirks.c | 20 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 0454ca0e4e3f..84a7fd643b4d 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "pci.h" #include "pcie/portdrv.h" @@ -488,10 +489,14 @@ static void pci_device_shutdown(struct device *dev) * If this is a kexec reboot, turn off Bus Master bit on the * device to tell it to not continue to do DMA. Don't touch * devices in D3cold or unknown states. -* If it is not a kexec reboot, firmware will hit the PCI -* devices with big hammer and stop their DMA any way. +* If this is kdump kernel, also turn off Bus Master, the device +* could be activated by previous crashed kernel and may block +* it's upstream from shutting down. +* Else, firmware will hit the PCI devices with big hammer +* and stop their DMA any way. */ - if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot)) + if ((kexec_in_progress || is_kdump_kernel()) && + pci_dev->current_state <= PCI_D3hot) pci_clear_master(pci_dev); } diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 4937a088d7d8..c65d11ab3939 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -28,6 +28,7 @@ #include #include #include +#include #include/* isa_dma_bridge_buggy */ #include "pci.h" @@ -192,6 +193,25 @@ static int __init pci_apply_final_quirks(void) } fs_initcall_sync(pci_apply_final_quirks); +/* + * Read the device state even if it's not enabled. The device could be + * activated by previous crashed kernel, this will read and correct the + * cached state. 
+ */ +static void quirk_read_pm_state_in_kdump(struct pci_dev *dev) +{ + u16 pmcsr; + + if (!is_kdump_kernel()) + return; + + if (dev->pm_cap) { + pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); + dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK); + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, quirk_read_pm_state_in_kdump); + /* * Decoding should be disabled for a PCI device during BAR sizing to avoid * conflict. But doing so may cause problems on host bridge and perhaps other
Re: [PATCH v3 2/2] x86/kdump: Reserve extra memory when SME or SEV is active
On Tue, Oct 15, 2019 at 10:18 AM Dave Young wrote: > > On 10/14/19 at 07:05pm, Dave Young wrote: > > On 10/12/19 at 05:24pm, Kairui Song wrote: > > > On 9/27/19 1:42 PM, Dave Young wrote: > > > > On 09/25/19 at 06:36pm, Kairui Song wrote: > > > > > On Wed, Sep 11, 2019 at 1:56 PM Ingo Molnar wrote: > > > > > > * Kairui Song wrote: > > > > > > > > > > > > > Since commit c7753208a94c ("x86, swiotlb: Add memory encryption > > > > > > > support"), > > > > > > > SWIOTLB will be enabled even if there is less than 4G of memory > > > > > > > when SME > > > > > > > is active, to support DMA of devices that not support address > > > > > > > with the > > > > > > > encrypt bit. > > > > > > > > > > > > > > And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if > > > > > > > SME is > > > > > > > active") make the kernel keep SWIOTLB enabled even if there is an > > > > > > > IOMMU. > > > > > > > > > > > > > > Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory > > > > > > > encryption") will always force SWIOTLB to be enabled when SEV is > > > > > > > active > > > > > > > in all cases. > > > > > > > > > > > > > > Now, when either SME or SEV is active, SWIOTLB will be force > > > > > > > enabled, > > > > > > > and this is also true for kdump kernel. As a result kdump kernel > > > > > > > will > > > > > > > run out of already scarce pre-reserved memory easily. > > > > > > > > > > > > > > So when SME/SEV is active, reserve extra memory for SWIOTLB to > > > > > > > ensure > > > > > > > kdump kernel have enough memory, except when > > > > > > > "crashkernel=size[KMG],high" > > > > > > > is specified or any offset is used. As for the high reservation > > > > > > > case, an > > > > > > > extra low memory region will always be reserved and that is > > > > > > > enough for > > > > > > > SWIOTLB. 
Else if the offset format is used, user should be fully > > > > > > > aware > > > > > > > of any possible kdump kernel memory requirement and have to > > > > > > > organize the > > > > > > > memory usage carefully. > > > > > > > > > > > > > > Signed-off-by: Kairui Song > > > > > > > --- > > > > > > > arch/x86/kernel/setup.c | 20 +--- > > > > > > > 1 file changed, 17 insertions(+), 3 deletions(-) > > > > > > > > > > > > > > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c > > > > > > > index 71f20bb18cb0..ee6a2f1e2226 100644 > > > > > > > --- a/arch/x86/kernel/setup.c > > > > > > > +++ b/arch/x86/kernel/setup.c > > > > > > > @@ -530,7 +530,7 @@ static int __init > > > > > > > crashkernel_find_region(unsigned long long *crash_base, > > > > > > > unsigned long long > > > > > > > *crash_size, > > > > > > > bool high) > > > > > > > { > > > > > > > - unsigned long long base, size; > > > > > > > + unsigned long long base, size, mem_enc_req = 0; > > > > > > > > > > > > > >base = *crash_base; > > > > > > >size = *crash_size; > > > > > > > @@ -561,11 +561,25 @@ static int __init > > > > > > > crashkernel_find_region(unsigned long long *crash_base, > > > > > > >if (high) > > > > > > >goto high_reserve; > > > > > > > > > > > > > > + /* > > > > > > > + * When SME/SEV is active and not using high reserve, > > > > > > > + * it will always required an extra SWIOTLB region. > > > > > > > + */ > > > > >
Re: [PATCH v3 2/2] x86/kdump: Reserve extra memory when SME or SEV is active
On 9/27/19 1:42 PM, Dave Young wrote: On 09/25/19 at 06:36pm, Kairui Song wrote: On Wed, Sep 11, 2019 at 1:56 PM Ingo Molnar wrote: * Kairui Song wrote: Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"), SWIOTLB will be enabled even if there is less than 4G of memory when SME is active, to support DMA of devices that not support address with the encrypt bit. And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is active") make the kernel keep SWIOTLB enabled even if there is an IOMMU. Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory encryption") will always force SWIOTLB to be enabled when SEV is active in all cases. Now, when either SME or SEV is active, SWIOTLB will be force enabled, and this is also true for kdump kernel. As a result kdump kernel will run out of already scarce pre-reserved memory easily. So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure kdump kernel have enough memory, except when "crashkernel=size[KMG],high" is specified or any offset is used. As for the high reservation case, an extra low memory region will always be reserved and that is enough for SWIOTLB. Else if the offset format is used, user should be fully aware of any possible kdump kernel memory requirement and have to organize the memory usage carefully. 
Signed-off-by: Kairui Song --- arch/x86/kernel/setup.c | 20 +--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 71f20bb18cb0..ee6a2f1e2226 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -530,7 +530,7 @@ static int __init crashkernel_find_region(unsigned long long *crash_base, unsigned long long *crash_size, bool high) { - unsigned long long base, size; + unsigned long long base, size, mem_enc_req = 0; base = *crash_base; size = *crash_size; @@ -561,11 +561,25 @@ static int __init crashkernel_find_region(unsigned long long *crash_base, if (high) goto high_reserve; + /* + * When SME/SEV is active and not using high reserve, + * it will always required an extra SWIOTLB region. + */ + if (mem_encrypt_active()) + mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M); + base = memblock_find_in_range(CRASH_ALIGN, - CRASH_ADDR_LOW_MAX, size, + CRASH_ADDR_LOW_MAX, + size + mem_enc_req, CRASH_ALIGN); Hi Ingo, I re-read my previous reply, it's long and tedious, let me try to make a more effective reply: What sizes are we talking about here? The size here is how much memory will be reserved for kdump kernel, to ensure kdump kernel and userspace can run without OOM. - What is the possible size range of swiotlb_size_or_default() swiotlb_size_or_default() returns the swiotlb size, it's specified by user using swiotlb=, or default size (64MB) - What is the size of CRASH_ADDR_LOW_MAX (the old limit)? It's 4G. - Why do we replace one fixed limit with another fixed limit instead of accurately sizing the area, with each required feature adding its own requirement to the reservation size? It's quite hard to "accurately sizing the area". No way to tell the exact amount of memory kdump needs, we can only estimate. Kdump kernel use different cmdline, drivers and components will have special handling for kdump, and userspace is totally different. 
Agreed with the above, but specifically for the problem in this patch there should be other ways. First thought about doing generic handling in swiotlb part, and do something like kdump_memory_reserve(size) Ingo suggested, but according to you swiotlb init is late, so it can not increase the size, OTOH if reserve another region for kdump in swiotlb will cause other issues. So let's think about other improvement, for example to see if you can call kdump_memory_reserve(size) in AMD SME init path, for example in mem_encrypt_init(), is it before crashkernel reservation? If doable it will be at least cleaner than the code in this patch. Thanks Dave How about something simple as following code? The logic and new function is as simple as possible, just always reserve extra low memory when SME/SEV is active, ignore the high/low reservation case. It will waste some memory with SME and high reservation though. I was hesitating a lot about this series; one thing I'm thinking about is what the point of the "crashkernel=" argument is, if the crashkernel value could be adjusted accordingly, the value specified will seem more meaningless or confusing... And currently there isn't anything like crashkernel=auto or anything similar to le
Re: [PATCH v3 2/2] x86/kdump: Reserve extra memory when SME or SEV is active
On Fri, Sep 27, 2019 at 1:42 PM Dave Young wrote: > > On 09/25/19 at 06:36pm, Kairui Song wrote: > > On Wed, Sep 11, 2019 at 1:56 PM Ingo Molnar wrote: > > > * Kairui Song wrote: > > > > > > > Since commit c7753208a94c ("x86, swiotlb: Add memory encryption > > > > support"), > > > > SWIOTLB will be enabled even if there is less than 4G of memory when SME > > > > is active, to support DMA of devices that not support address with the > > > > encrypt bit. > > > > > > > > And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is > > > > active") make the kernel keep SWIOTLB enabled even if there is an IOMMU. > > > > > > > > Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory > > > > encryption") will always force SWIOTLB to be enabled when SEV is active > > > > in all cases. > > > > > > > > Now, when either SME or SEV is active, SWIOTLB will be force enabled, > > > > and this is also true for kdump kernel. As a result kdump kernel will > > > > run out of already scarce pre-reserved memory easily. > > > > > > > > So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure > > > > kdump kernel have enough memory, except when > > > > "crashkernel=size[KMG],high" > > > > is specified or any offset is used. As for the high reservation case, an > > > > extra low memory region will always be reserved and that is enough for > > > > SWIOTLB. Else if the offset format is used, user should be fully aware > > > > of any possible kdump kernel memory requirement and have to organize the > > > > memory usage carefully. 
> > > > > > > > Signed-off-by: Kairui Song > > > > --- > > > > arch/x86/kernel/setup.c | 20 +--- > > > > 1 file changed, 17 insertions(+), 3 deletions(-) > > > > > > > > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c > > > > index 71f20bb18cb0..ee6a2f1e2226 100644 > > > > --- a/arch/x86/kernel/setup.c > > > > +++ b/arch/x86/kernel/setup.c > > > > @@ -530,7 +530,7 @@ static int __init crashkernel_find_region(unsigned > > > > long long *crash_base, > > > > unsigned long long *crash_size, > > > > bool high) > > > > { > > > > - unsigned long long base, size; > > > > + unsigned long long base, size, mem_enc_req = 0; > > > > > > > > base = *crash_base; > > > > size = *crash_size; > > > > @@ -561,11 +561,25 @@ static int __init > > > > crashkernel_find_region(unsigned long long *crash_base, > > > > if (high) > > > > goto high_reserve; > > > > > > > > + /* > > > > + * When SME/SEV is active and not using high reserve, > > > > + * it will always required an extra SWIOTLB region. > > > > + */ > > > > + if (mem_encrypt_active()) > > > > + mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M); > > > > + > > > > base = memblock_find_in_range(CRASH_ALIGN, > > > > - CRASH_ADDR_LOW_MAX, size, > > > > + CRASH_ADDR_LOW_MAX, > > > > + size + mem_enc_req, > > > > CRASH_ALIGN); > > > > > > > Hi Ingo, > > > > I re-read my previous reply, it's long and tedious, let me try to make > > a more effective reply: > > > > > What sizes are we talking about here? > > > > The size here is how much memory will be reserved for kdump kernel, to > > ensure kdump kernel and userspace can run without OOM. > > > > > > > > - What is the possible size range of swiotlb_size_or_default() > > > > swiotlb_size_or_default() returns the swiotlb size, it's specified by > > user using swiotlb=, or default size (64MB) > > > > > > > > - What is the size of CRASH_ADDR_LOW_MAX (the old limit)? > > > > It's 4G. 
> > > > > > > > - Why do we replace one fixed limit with another fixed limit instead of > > > accurately sizing the area, with each required feature adding its own > > > require
Re: [PATCH v3 2/2] x86/kdump: Reserve extra memory when SME or SEV is active
On Wed, Sep 11, 2019 at 1:56 PM Ingo Molnar wrote: > * Kairui Song wrote: > > > Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"), > > SWIOTLB will be enabled even if there is less than 4G of memory when SME > > is active, to support DMA of devices that not support address with the > > encrypt bit. > > > > And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is > > active") make the kernel keep SWIOTLB enabled even if there is an IOMMU. > > > > Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory > > encryption") will always force SWIOTLB to be enabled when SEV is active > > in all cases. > > > > Now, when either SME or SEV is active, SWIOTLB will be force enabled, > > and this is also true for kdump kernel. As a result kdump kernel will > > run out of already scarce pre-reserved memory easily. > > > > So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure > > kdump kernel have enough memory, except when "crashkernel=size[KMG],high" > > is specified or any offset is used. As for the high reservation case, an > > extra low memory region will always be reserved and that is enough for > > SWIOTLB. Else if the offset format is used, user should be fully aware > > of any possible kdump kernel memory requirement and have to organize the > > memory usage carefully. 
> > > > Signed-off-by: Kairui Song > > --- > > arch/x86/kernel/setup.c | 20 +--- > > 1 file changed, 17 insertions(+), 3 deletions(-) > > > > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c > > index 71f20bb18cb0..ee6a2f1e2226 100644 > > --- a/arch/x86/kernel/setup.c > > +++ b/arch/x86/kernel/setup.c > > @@ -530,7 +530,7 @@ static int __init crashkernel_find_region(unsigned long > > long *crash_base, > > unsigned long long *crash_size, > > bool high) > > { > > - unsigned long long base, size; > > + unsigned long long base, size, mem_enc_req = 0; > > > > base = *crash_base; > > size = *crash_size; > > @@ -561,11 +561,25 @@ static int __init crashkernel_find_region(unsigned > > long long *crash_base, > > if (high) > > goto high_reserve; > > > > + /* > > + * When SME/SEV is active and not using high reserve, > > + * it will always required an extra SWIOTLB region. > > + */ > > + if (mem_encrypt_active()) > > + mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M); > > + > > base = memblock_find_in_range(CRASH_ALIGN, > > - CRASH_ADDR_LOW_MAX, size, > > + CRASH_ADDR_LOW_MAX, > > + size + mem_enc_req, > > CRASH_ALIGN); > Hi Ingo, I re-read my previous reply, it's long and tedious, let me try to make a more effective reply: > What sizes are we talking about here? The size here is how much memory will be reserved for kdump kernel, to ensure kdump kernel and userspace can run without OOM. > > - What is the possible size range of swiotlb_size_or_default() swiotlb_size_or_default() returns the swiotlb size, it's specified by user using swiotlb=, or default size (64MB) > > - What is the size of CRASH_ADDR_LOW_MAX (the old limit)? It's 4G. > > - Why do we replace one fixed limit with another fixed limit instead of > accurately sizing the area, with each required feature adding its own > requirement to the reservation size? It's quite hard to "accurately sizing the area". No way to tell the exact amount of memory kdump needs, we can only estimate. 
Kdump kernel use different cmdline, drivers and components will have special handling for kdump, and userspace is totally different. > > I.e. please engineer this into a proper solution instead of just > modifying it around the edges. > > For example have you considered adding some sort of > kdump_memory_reserve(size) facility, which increases the reservation size > as something like SWIOTLB gets activated? That would avoid the ugly > mem_encrypt_active() flag, it would just automagically work. My first attempt is increase crashkernel memory as swiotlb is activated. There are problems. First, SME/SEV is currently the only case that both kernel require SWIOTLB, for most other case, it's wasting memory. If we don't care about the memory waste, it has to check/reserve/free crashkernel memory at three different poin
Re: [PATCH v3 0/2] x86/kdump: Reserve extra memory when SME or SEV is active
On Wed, Sep 18, 2019 at 3:55 PM Dave Young wrote: > > On 09/12/19 at 12:23am, Kairui Song wrote: > > On Wednesday, September 11, 2019, Ingo Molnar wrote: > > > > > > * Kairui Song wrote: > > > > > >> Since commit c7753208a94c ("x86, swiotlb: Add memory encryption > > support"), > > >> SWIOTLB will be enabled even if there is less than 4G of memory when SME > > >> is active, to support DMA of devices that not support address with the > > >> encrypt bit. > > >> > > >> And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is > > >> active") make the kernel keep SWIOTLB enabled even if there is an IOMMU. > > >> > > >> Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory > > >> encryption") will always force SWIOTLB to be enabled when SEV is active > > >> in all cases. > > >> > > >> Now, when either SME or SEV is active, SWIOTLB will be force enabled, > > >> and this is also true for kdump kernel. As a result kdump kernel will > > >> run out of already scarce pre-reserved memory easily. > > >> > > >> So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure > > >> kdump kernel have enough memory, except when "crashkernel=size[KMG],high" > > >> is specified or any offset is used. As for the high reservation case, an > > >> extra low memory region will always be reserved and that is enough for > > >> SWIOTLB. Else if the offset format is used, user should be fully aware > > >> of any possible kdump kernel memory requirement and have to organize the > > >> memory usage carefully. 
> > >> > > >> Signed-off-by: Kairui Song > > >> --- > > >> arch/x86/kernel/setup.c | 20 +--- > > >> 1 file changed, 17 insertions(+), 3 deletions(-) > > >> > > >> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c > > >> index 71f20bb18cb0..ee6a2f1e2226 100644 > > >> --- a/arch/x86/kernel/setup.c > > >> +++ b/arch/x86/kernel/setup.c > > >> @@ -530,7 +530,7 @@ static int __init crashkernel_find_region(unsigned > > long long *crash_base, > > >> unsigned long long *crash_size, > > >> bool high) > > >> { > > >> - unsigned long long base, size; > > >> + unsigned long long base, size, mem_enc_req = 0; > > >> > > >> base = *crash_base; > > >> size = *crash_size; > > >> @@ -561,11 +561,25 @@ static int __init crashkernel_find_region(unsigned > > long long *crash_base, > > >> if (high) > > >> goto high_reserve; > > >> > > >> + /* > > >> + * When SME/SEV is active and not using high reserve, > > >> + * it will always required an extra SWIOTLB region. > > >> + */ > > >> + if (mem_encrypt_active()) > > >> + mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M); > > >> + > > >> base = memblock_find_in_range(CRASH_ALIGN, > > >> - CRASH_ADDR_LOW_MAX, size, > > >> + CRASH_ADDR_LOW_MAX, > > >> + size + mem_enc_req, > > >> CRASH_ALIGN); > > > > > > What sizes are we talking about here? > > > > > > - What is the possible size range of swiotlb_size_or_default() > > > > > > - What is the size of CRASH_ADDR_LOW_MAX (the old limit)? > > > > > > - Why do we replace one fixed limit with another fixed limit instead of > > > accurately sizing the area, with each required feature adding its own > > > requirement to the reservation size? > > > > > > I.e. please engineer this into a proper solution instead of just > > > modifying it around the edges. > > > > > > For example have you considered adding some sort of > > > kdump_memory_reserve(size) facility, which increases the reservation size > > > as something like SWIOTLB gets activated? 
That would avoid the ugly > > > mem_encrypt_active() flag, it would just automagically work. > > > > Hi, thanks for the suggestions, actually I did try to workout a better > > resolution, at least for SW
[PATCH v3 0/2] x86/kdump: Reserve extra memory when SME or SEV is active
This series lets the kernel reserve extra memory for kdump when SME or SEV is active. When SME or SEV is active, SWIOTLB will always be force-enabled, and this is also true for the kdump kernel. As a result, the kdump kernel will easily run out of the already scarce pre-reserved memory. So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure the kdump kernel has enough memory, except when "crashkernel=size[KMG],high" is specified or any offset is used. With high reservation, an extra low memory region will always be reserved and that is enough for SWIOTLB. With the offset format, the user should be fully aware of any possible kdump kernel memory requirement and has to organize the memory usage carefully. Patch 1/2 simply splits some code out of reserve_crashkernel, to prepare for the change in the next patch. Patch 2/2 lets crashkernel reserve extra memory when SME or SEV is active, and explains in more detail the history of why this change is introduced. Update from V2: - Refactor and split some function out of reserve_crashkernel to make it cleaner, as suggested by Borislav Petkov - Split into 2 patches Update from V1: - Use mem_encrypt_active() instead of "sme_active() || sev_active()" - Don't reserve extra memory when ",high" or "@offset" is used, and don't print redundant message. - Fix coding style problem Kairui Song (2): x86/kdump: Split some code out of reserve_crashkernel x86/kdump: Reserve extra memory when SME or SEV is active arch/x86/kernel/setup.c | 106 1 file changed, 74 insertions(+), 32 deletions(-) -- 2.21.0
[PATCH v3 2/2] x86/kdump: Reserve extra memory when SME or SEV is active
Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"), SWIOTLB will be enabled even if there is less than 4G of memory when SME is active, to support DMA of devices that not support address with the encrypt bit. And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is active") make the kernel keep SWIOTLB enabled even if there is an IOMMU. Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory encryption") will always force SWIOTLB to be enabled when SEV is active in all cases. Now, when either SME or SEV is active, SWIOTLB will be force enabled, and this is also true for kdump kernel. As a result kdump kernel will run out of already scarce pre-reserved memory easily. So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure kdump kernel have enough memory, except when "crashkernel=size[KMG],high" is specified or any offset is used. As for the high reservation case, an extra low memory region will always be reserved and that is enough for SWIOTLB. Else if the offset format is used, user should be fully aware of any possible kdump kernel memory requirement and have to organize the memory usage carefully. Signed-off-by: Kairui Song --- arch/x86/kernel/setup.c | 20 +--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 71f20bb18cb0..ee6a2f1e2226 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -530,7 +530,7 @@ static int __init crashkernel_find_region(unsigned long long *crash_base, unsigned long long *crash_size, bool high) { - unsigned long long base, size; + unsigned long long base, size, mem_enc_req = 0; base = *crash_base; size = *crash_size; @@ -561,11 +561,25 @@ static int __init crashkernel_find_region(unsigned long long *crash_base, if (high) goto high_reserve; + /* +* When SME/SEV is active and not using high reserve, +* it will always required an extra SWIOTLB region. 
+*/ + if (mem_encrypt_active()) + mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M); + base = memblock_find_in_range(CRASH_ALIGN, - CRASH_ADDR_LOW_MAX, size, + CRASH_ADDR_LOW_MAX, + size + mem_enc_req, CRASH_ALIGN); - if (base) + if (base) { + if (mem_enc_req) { + pr_info("Memory encryption is active, crashkernel needs %ldMB extra memory\n", + (unsigned long)(mem_enc_req >> 20)); + size += mem_enc_req; + } goto found; + } high_reserve: /* Try high reserve */ -- 2.21.0
[PATCH v3 1/2] x86/kdump: Split some code out of reserve_crashkernel
Split out the code related to finding suitable region for kdump out of reserve_crashkernel, clean up and refactor for further change, no feature change. Signed-off-by: Kairui Song --- arch/x86/kernel/setup.c | 92 +++-- 1 file changed, 60 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bbe35bf879f5..71f20bb18cb0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -526,6 +526,63 @@ static int __init reserve_crashkernel_low(void) return 0; } +static int __init crashkernel_find_region(unsigned long long *crash_base, + unsigned long long *crash_size, + bool high) +{ + unsigned long long base, size; + + base = *crash_base; + size = *crash_size; + + /* +* base == 0 means: find the address automatically, else just +* verify the region is useable +*/ + if (base) { + unsigned long long start; + + start = memblock_find_in_range(base, base + size, + size, 1 << 20); + if (start != base) { + pr_info("crashkernel reservation failed - memory is in use.\n"); + return -1; + } + return 0; + } + + /* +* crashkernel=x,high reserves memory over 4G, also allocates +* 256M extra low memory for DMA buffers and swiotlb. +* But the extra memory is not required for all machines. +* So try low memory first and fall back to high memory +* unless "crashkernel=size[KMG],high" is specified. 
+*/ + if (high) + goto high_reserve; + + base = memblock_find_in_range(CRASH_ALIGN, + CRASH_ADDR_LOW_MAX, size, + CRASH_ALIGN); + if (base) + goto found; + +high_reserve: + /* Try high reserve */ + base = memblock_find_in_range(CRASH_ALIGN, + CRASH_ADDR_HIGH_MAX, size, + CRASH_ALIGN); + if (base) + goto found; + + pr_info("crashkernel reservation failed - No suitable area found.\n"); + return -1; +found: + *crash_base = base; + *crash_size = size; + return 0; +} + static void __init reserve_crashkernel(void) { unsigned long long crash_size, crash_base, total_mem; @@ -550,39 +607,10 @@ static void __init reserve_crashkernel(void) return; } - /* 0 means: find the address automatically */ - if (!crash_base) { - /* -* Set CRASH_ADDR_LOW_MAX upper bound for crash memory, -* crashkernel=x,high reserves memory over 4G, also allocates -* 256M extra low memory for DMA buffers and swiotlb. -* But the extra memory is not required for all machines. -* So try low memory first and fall back to high memory -* unless "crashkernel=size[KMG],high" is specified. -*/ - if (!high) - crash_base = memblock_find_in_range(CRASH_ALIGN, - CRASH_ADDR_LOW_MAX, - crash_size, CRASH_ALIGN); - if (!crash_base) - crash_base = memblock_find_in_range(CRASH_ALIGN, - CRASH_ADDR_HIGH_MAX, - crash_size, CRASH_ALIGN); - if (!crash_base) { - pr_info("crashkernel reservation failed - No suitable area found.\n"); - return; - } - } else { - unsigned long long start; + ret = crashkernel_find_region(&crash_base, &crash_size, high); + if (ret) + return; - start = memblock_find_in_range(crash_base, - crash_base + crash_size, - crash_size, 1 << 20); - if (start != crash_base) { - pr_info("crashkernel reservation failed - memory is in use.\n"); - return; - } - } ret = memblock_reserve(crash_base, crash_size); if (ret) { pr_err("%s: Error reserving crashkernel memblock.\n", __func__); -- 2.21.0
Re: [PATCH v2] x86/kdump: Reserve extra memory when SME or SEV is active
On 8/31/19 12:45 AM, Borislav Petkov wrote: On Mon, Aug 26, 2019 at 12:45:35PM +0800, Kairui Song wrote: Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"), SWIOTLB will be enabled even if there is less than 4G of memory when SME is active, to support DMA of devices that not support address with the encrypt bit. And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is active") make the kernel keep SWIOTLB enabled even if there is an IOMMU. Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory encryption") will always force SWIOTLB to be enabled when SEV is active in all cases. Now, when either SME or SEV is active, SWIOTLB will be force enabled, and this is also true for kdump kernel. As a result kdump kernel will run out of already scarce pre-reserved memory easily. So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure kdump kernel have enough memory, except when "crashkernel=size[KMG],high" is specified or any offset is used. As for the high reservation case, an extra low memory region will always be reserved and that is enough for SWIOTLB. Else if the offset format is used, user should be fully aware of any possible kdump kernel memory requirement and have to organize the memory usage carefully. Signed-off-by: Kairui Song --- Update from V1: - Use mem_encrypt_active() instead of "sme_active() || sev_active()" - Don't reserve extra memory when ",high" or "@offset" is used, and don't print redundant message. 
- Fix coding style problem arch/x86/kernel/setup.c | 31 --- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bbe35bf879f5..221beb10c55d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -528,7 +528,7 @@ static int __init reserve_crashkernel_low(void) static void __init reserve_crashkernel(void) { - unsigned long long crash_size, crash_base, total_mem; + unsigned long long crash_size, crash_base, total_mem, mem_enc_req; bool high = false; int ret; @@ -550,6 +550,15 @@ static void __init reserve_crashkernel(void) return; } + /* +* When SME/SEV is active, it will always required an extra SWIOTLB +* region. +*/ + if (mem_encrypt_active()) + mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M); + else + mem_enc_req = 0; Hmm, ugly. I agree with this, but didn't have a better idea about how toimprove it, so thanks for the suggestions below. You set mem_enc_reg here ... + /* 0 means: find the address automatically */ if (!crash_base) { /* @@ -563,11 +572,19 @@ static void __init reserve_crashkernel(void) if (!high) crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_LOW_MAX, - crash_size, CRASH_ALIGN); - if (!crash_base) + crash_size + mem_enc_req, + CRASH_ALIGN); + /* +* For high reservation, an extra low memory for SWIOTLB will +* always be reserved later, so no need to reserve extra +* memory for memory encryption case here. +*/ + if (!crash_base) { + mem_enc_req = 0; ... but you clear it here... crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_HIGH_MAX, crash_size, CRASH_ALIGN); + } if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; @@ -575,6 +592,7 @@ static void __init reserve_crashkernel(void) } else { unsigned long long start; + mem_enc_req = 0; ... and here... 
start = memblock_find_in_range(crash_base, crash_base + crash_size, crash_size, 1 << 20); @@ -583,6 +601,13 @@ static void __init reserve_crashkernel(void) return; } } + + if (mem_enc_req) { + pr_info("Memory encryption is active, crashkernel needs %ldMB extra memory\n", + (unsigned long)(mem_enc_req >> 20)); + crash_size += mem_enc_req; + } ... and then you report only when it is still set. How about you carve out that if (!crash_base) { ... } else { } piece into a separate function without any fu
Re: [PATCH v2] x86/kdump: Reserve extra memory when SME or SEV is active
On Mon, Aug 26, 2019 at 12:46 PM Kairui Song wrote: > > Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"), > SWIOTLB will be enabled even if there is less than 4G of memory when SME > is active, to support DMA of devices that not support address with the > encrypt bit. > > And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is > active") make the kernel keep SWIOTLB enabled even if there is an IOMMU. > > Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory > encryption") will always force SWIOTLB to be enabled when SEV is active > in all cases. > > Now, when either SME or SEV is active, SWIOTLB will be force enabled, > and this is also true for kdump kernel. As a result kdump kernel will > run out of already scarce pre-reserved memory easily. > > So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure > kdump kernel have enough memory, except when "crashkernel=size[KMG],high" > is specified or any offset is used. As for the high reservation case, an > extra low memory region will always be reserved and that is enough for > SWIOTLB. Else if the offset format is used, user should be fully aware > of any possible kdump kernel memory requirement and have to organize the > memory usage carefully. > > Signed-off-by: Kairui Song > > --- > Update from V1: > - Use mem_encrypt_active() instead of "sme_active() || sev_active()" > - Don't reserve extra memory when ",high" or "@offset" is used, and > don't print redundant message. 
> - Fix coding style problem > > arch/x86/kernel/setup.c | 31 --- > 1 file changed, 28 insertions(+), 3 deletions(-) > > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c > index bbe35bf879f5..221beb10c55d 100644 > --- a/arch/x86/kernel/setup.c > +++ b/arch/x86/kernel/setup.c > @@ -528,7 +528,7 @@ static int __init reserve_crashkernel_low(void) > > static void __init reserve_crashkernel(void) > { > - unsigned long long crash_size, crash_base, total_mem; > + unsigned long long crash_size, crash_base, total_mem, mem_enc_req; > bool high = false; > int ret; > > @@ -550,6 +550,15 @@ static void __init reserve_crashkernel(void) > return; > } > > + /* > +* When SME/SEV is active, it will always required an extra SWIOTLB > +* region. > +*/ > + if (mem_encrypt_active()) > + mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M); > + else > + mem_enc_req = 0; > + > /* 0 means: find the address automatically */ > if (!crash_base) { > /* > @@ -563,11 +572,19 @@ static void __init reserve_crashkernel(void) > if (!high) > crash_base = memblock_find_in_range(CRASH_ALIGN, > CRASH_ADDR_LOW_MAX, > - crash_size, CRASH_ALIGN); > - if (!crash_base) > + crash_size + mem_enc_req, > + CRASH_ALIGN); > + /* > +* For high reservation, an extra low memory for SWIOTLB will > +* always be reserved later, so no need to reserve extra > +* memory for memory encryption case here. 
> +*/ > + if (!crash_base) { > + mem_enc_req = 0; > crash_base = memblock_find_in_range(CRASH_ALIGN, > CRASH_ADDR_HIGH_MAX, > crash_size, CRASH_ALIGN); > + } > if (!crash_base) { > pr_info("crashkernel reservation failed - No suitable > area found.\n"); > return; > @@ -575,6 +592,7 @@ static void __init reserve_crashkernel(void) > } else { > unsigned long long start; > > + mem_enc_req = 0; > start = memblock_find_in_range(crash_base, >crash_base + crash_size, >crash_size, 1 << 20); > @@ -583,6 +601,13 @@ static void __init reserve_crashkernel(void) > return; > } > } > + > + if (mem_enc_req) { > + pr_info("Memory encryption is active, crashkernel needs %ldMB > extra memory\n", > + (unsigned long)(mem_enc_req >> 20)); > + crash_size += mem_enc_req; > + } > + > ret = memblock_reserve(crash_base, crash_size); > if (ret) { > pr_err("%s: Error reserving crashkernel memblock.\n", > __func__); > -- > 2.21.0 > Hi Tom, any comment about V2? -- Best Regards, Kairui Song
[PATCH v2] x86/kdump: Reserve extra memory when SME or SEV is active
Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"), SWIOTLB will be enabled even if there is less than 4G of memory when SME is active, to support DMA of devices that not support address with the encrypt bit. And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is active") make the kernel keep SWIOTLB enabled even if there is an IOMMU. Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory encryption") will always force SWIOTLB to be enabled when SEV is active in all cases. Now, when either SME or SEV is active, SWIOTLB will be force enabled, and this is also true for kdump kernel. As a result kdump kernel will run out of already scarce pre-reserved memory easily. So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure kdump kernel have enough memory, except when "crashkernel=size[KMG],high" is specified or any offset is used. As for the high reservation case, an extra low memory region will always be reserved and that is enough for SWIOTLB. Else if the offset format is used, user should be fully aware of any possible kdump kernel memory requirement and have to organize the memory usage carefully. Signed-off-by: Kairui Song --- Update from V1: - Use mem_encrypt_active() instead of "sme_active() || sev_active()" - Don't reserve extra memory when ",high" or "@offset" is used, and don't print redundant message. 
- Fix coding style problem arch/x86/kernel/setup.c | 31 --- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bbe35bf879f5..221beb10c55d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -528,7 +528,7 @@ static int __init reserve_crashkernel_low(void) static void __init reserve_crashkernel(void) { - unsigned long long crash_size, crash_base, total_mem; + unsigned long long crash_size, crash_base, total_mem, mem_enc_req; bool high = false; int ret; @@ -550,6 +550,15 @@ static void __init reserve_crashkernel(void) return; } + /* +* When SME/SEV is active, it will always required an extra SWIOTLB +* region. +*/ + if (mem_encrypt_active()) + mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M); + else + mem_enc_req = 0; + /* 0 means: find the address automatically */ if (!crash_base) { /* @@ -563,11 +572,19 @@ static void __init reserve_crashkernel(void) if (!high) crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_LOW_MAX, - crash_size, CRASH_ALIGN); - if (!crash_base) + crash_size + mem_enc_req, + CRASH_ALIGN); + /* +* For high reservation, an extra low memory for SWIOTLB will +* always be reserved later, so no need to reserve extra +* memory for memory encryption case here. 
+*/ + if (!crash_base) { + mem_enc_req = 0; crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_HIGH_MAX, crash_size, CRASH_ALIGN); + } if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; @@ -575,6 +592,7 @@ static void __init reserve_crashkernel(void) } else { unsigned long long start; + mem_enc_req = 0; start = memblock_find_in_range(crash_base, crash_base + crash_size, crash_size, 1 << 20); @@ -583,6 +601,13 @@ static void __init reserve_crashkernel(void) return; } } + + if (mem_enc_req) { + pr_info("Memory encryption is active, crashkernel needs %ldMB extra memory\n", + (unsigned long)(mem_enc_req >> 20)); + crash_size += mem_enc_req; + } + ret = memblock_reserve(crash_base, crash_size); if (ret) { pr_err("%s: Error reserving crashkernel memblock.\n", __func__); -- 2.21.0
Re: [PATCH] x86/kdump: Reserve extra memory when SME or SEV is active
On Thu, Aug 22, 2019 at 10:35 PM Lendacky, Thomas wrote: > > On 8/21/19 9:53 PM, Kairui Song wrote: > > Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"), > > SWIOTLB will be enabled even if there is less than 4G of memory when SME > > is active, to support DMA of devices that not support address with the > > encrypt bit. > > > > And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is > > active") make the kernel keep SWIOTLB enabled even if there is an IOMMU. > > > > Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory > > encryption") will always force SWIOTLB to be enabled when SEV is active > > in all cases. > > > > Now, when either SME or SEV is active, SWIOTLB will be force enabled, > > and this is also true for kdump kernel. As a result kdump kernel will > > run out of already scarce pre-reserved memory easily. > > > > So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure > > kdump kernel have enough memory, except when "crashkernel=size[KMG],high" > > is specified or any offset is used. As for the high reservation case, an > > extra low memory region will always be reserved and that is enough for > > SWIOTLB. Else if the offset format is used, user should be fully aware > > of any possible kdump kernel memory requirement and have to organize the > > memory usage carefully. 
> > > > Signed-off-by: Kairui Song > > --- > > arch/x86/kernel/setup.c | 26 +++--- > > 1 file changed, 23 insertions(+), 3 deletions(-) > > > > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c > > index bbe35bf879f5..ed91fa9d9f6e 100644 > > --- a/arch/x86/kernel/setup.c > > +++ b/arch/x86/kernel/setup.c > > @@ -528,7 +528,7 @@ static int __init reserve_crashkernel_low(void) > > > > static void __init reserve_crashkernel(void) > > { > > - unsigned long long crash_size, crash_base, total_mem; > > + unsigned long long crash_size, crash_base, total_mem, mem_enc_req; > > bool high = false; > > int ret; > > > > @@ -550,6 +550,17 @@ static void __init reserve_crashkernel(void) > > return; > > } > > > > + /* > > + * When SME/SEV is active, it will always required an extra SWIOTLB > > + * region. > > + */ > > + if (sme_active() || sev_active()) { > > You can use mem_encrypt_active() here in place of the two checks. That's a very good suggestion. > > > + mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M); > > + pr_info("Memory encryption is active, crashkernel needs %ldMB > > extra memory\n", > > + (unsigned long)(mem_enc_req >> 20)); > > There is a point below where you zero out this value, so should this > be issued later only if mem_enc_req is non-zero? Yes that's true, but currently if zero out this value when ",high" is used, then an extra low memory region will be reserved, so this message will not be very confusing I think? as the required extra memory is now in the low memory region. And for the "@offset" case this could be a hint for users. And if the reserve failed due to enlarged crashkernel size, the user may also be better aware of what is causing the failure by this message. > > Also, looks like one too many tabs. > > > + } else > > Since you used braces on the if path, you need braces on the else path. OK, will fix the code style issues. 
> > Thanks, > Tom > > > + mem_enc_req = 0; > > + > > /* 0 means: find the address automatically */ > > if (!crash_base) { > > /* > > @@ -563,11 +574,19 @@ static void __init reserve_crashkernel(void) > > if (!high) > > crash_base = memblock_find_in_range(CRASH_ALIGN, > > CRASH_ADDR_LOW_MAX, > > - crash_size, CRASH_ALIGN); > > - if (!crash_base) > > + crash_size + mem_enc_req, > > + CRASH_ALIGN); > > + /* > > + * For high reservation, an extra low memory for SWIOTLB will > > + * always be reserved later, so no need to reserve extra > > + * memory for memory encryption case here. > > + */ > > + if (!crash_base) { > > +
[PATCH] x86/kdump: Reserve extra memory when SME or SEV is active
Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"), SWIOTLB will be enabled even if there is less than 4G of memory when SME is active, to support DMA of devices that not support address with the encrypt bit. And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is active") make the kernel keep SWIOTLB enabled even if there is an IOMMU. Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory encryption") will always force SWIOTLB to be enabled when SEV is active in all cases. Now, when either SME or SEV is active, SWIOTLB will be force enabled, and this is also true for kdump kernel. As a result kdump kernel will run out of already scarce pre-reserved memory easily. So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure kdump kernel have enough memory, except when "crashkernel=size[KMG],high" is specified or any offset is used. As for the high reservation case, an extra low memory region will always be reserved and that is enough for SWIOTLB. Else if the offset format is used, user should be fully aware of any possible kdump kernel memory requirement and have to organize the memory usage carefully. Signed-off-by: Kairui Song --- arch/x86/kernel/setup.c | 26 +++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bbe35bf879f5..ed91fa9d9f6e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -528,7 +528,7 @@ static int __init reserve_crashkernel_low(void) static void __init reserve_crashkernel(void) { - unsigned long long crash_size, crash_base, total_mem; + unsigned long long crash_size, crash_base, total_mem, mem_enc_req; bool high = false; int ret; @@ -550,6 +550,17 @@ static void __init reserve_crashkernel(void) return; } + /* +* When SME/SEV is active, it will always required an extra SWIOTLB +* region. 
+*/ + if (sme_active() || sev_active()) { + mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M); + pr_info("Memory encryption is active, crashkernel needs %ldMB extra memory\n", + (unsigned long)(mem_enc_req >> 20)); + } else + mem_enc_req = 0; + /* 0 means: find the address automatically */ if (!crash_base) { /* @@ -563,11 +574,19 @@ static void __init reserve_crashkernel(void) if (!high) crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_LOW_MAX, - crash_size, CRASH_ALIGN); - if (!crash_base) + crash_size + mem_enc_req, + CRASH_ALIGN); + /* +* For high reservation, an extra low memory for SWIOTLB will +* always be reserved later, so no need to reserve extra +* memory for memory encryption case here. +*/ + if (!crash_base) { + mem_enc_req = 0; crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_HIGH_MAX, crash_size, CRASH_ALIGN); + } if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; @@ -583,6 +602,7 @@ static void __init reserve_crashkernel(void) return; } } + crash_size += mem_enc_req; ret = memblock_reserve(crash_base, crash_size); if (ret) { pr_err("%s: Error reserving crashkernel memblock.\n", __func__); -- 2.21.0
[PATCH] x86: Fix broken multiboot2 building for i386
When building for i386, an error occured: kexec/arch/i386/kexec-x86.c:39:22: error: 'multiboot2_x86_probe' undeclared here (not in a function); did you mean 'multiboot_x86_probe'? 39 | { "multiboot2-x86", multiboot2_x86_probe, multiboot2_x86_load, | ^~~~ | multiboot_x86_probe kexec/arch/i386/kexec-x86.c:39:44: error: 'multiboot2_x86_load' undeclared here (not in a function); did you mean 'multiboot_x86_load'? 39 | { "multiboot2-x86", multiboot2_x86_probe, multiboot2_x86_load, |^~~ |multiboot_x86_load kexec/arch/i386/kexec-x86.c:40:4: error: 'multiboot2_x86_usage' undeclared here (not in a function); did you mean 'multiboot_x86_usage'? 40 |multiboot2_x86_usage }, |^~~~ |multiboot_x86_usage Fix this issue by putting the definition in the right header, also tidy up Makefile. Fixes: 22a2ed55132e ("x86: Support multiboot2 images") Signed-off-by: Kairui Song --- kexec/arch/i386/Makefile | 2 +- kexec/arch/i386/kexec-x86.h | 5 + kexec/arch/x86_64/kexec-x86_64.h | 5 - 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kexec/arch/i386/Makefile b/kexec/arch/i386/Makefile index 105cefd..f486103 100644 --- a/kexec/arch/i386/Makefile +++ b/kexec/arch/i386/Makefile @@ -7,6 +7,7 @@ i386_KEXEC_SRCS += kexec/arch/i386/kexec-elf-x86.c i386_KEXEC_SRCS += kexec/arch/i386/kexec-elf-rel-x86.c i386_KEXEC_SRCS += kexec/arch/i386/kexec-bzImage.c i386_KEXEC_SRCS += kexec/arch/i386/kexec-multiboot-x86.c +i386_KEXEC_SRCS += kexec/arch/i386/kexec-mb2-x86.c i386_KEXEC_SRCS += kexec/arch/i386/kexec-beoboot-x86.c i386_KEXEC_SRCS += kexec/arch/i386/kexec-nbi.c i386_KEXEC_SRCS += kexec/arch/i386/x86-linux-setup.c @@ -14,7 +15,6 @@ i386_KEXEC_SRCS += kexec/arch/i386/crashdump-x86.c dist += kexec/arch/i386/Makefile $(i386_KEXEC_SRCS)\ kexec/arch/i386/crashdump-x86.h \ - kexec/arch/i386/kexec-mb2-x86.c \ kexec/arch/i386/kexec-x86.h \ kexec/arch/i386/x86-linux-setup.h \ kexec/arch/i386/include/arch/options.h diff --git a/kexec/arch/i386/kexec-x86.h b/kexec/arch/i386/kexec-x86.h index 
1b58c3b..16d0f6c 100644 --- a/kexec/arch/i386/kexec-x86.h +++ b/kexec/arch/i386/kexec-x86.h @@ -60,6 +60,11 @@ int multiboot_x86_load(int argc, char **argv, const char *buf, off_t len, struct kexec_info *info); void multiboot_x86_usage(void); +int multiboot2_x86_load(int argc, char **argv, const char *buf, off_t len, + struct kexec_info *info); +void multiboot2_x86_usage(void); +int multiboot2_x86_probe(const char *buf, off_t buf_len); + int elf_x86_probe(const char *buf, off_t len); int elf_x86_load(int argc, char **argv, const char *buf, off_t len, struct kexec_info *info); diff --git a/kexec/arch/x86_64/kexec-x86_64.h b/kexec/arch/x86_64/kexec-x86_64.h index 21c3a73..4cdeffb 100644 --- a/kexec/arch/x86_64/kexec-x86_64.h +++ b/kexec/arch/x86_64/kexec-x86_64.h @@ -33,9 +33,4 @@ int bzImage64_load(int argc, char **argv, const char *buf, off_t len, struct kexec_info *info); void bzImage64_usage(void); -int multiboot2_x86_load(int argc, char **argv, const char *buf, off_t len, - struct kexec_info *info); -void multiboot2_x86_usage(void); -int multiboot2_x86_probe(const char *buf, off_t buf_len); - #endif /* KEXEC_X86_64_H */ -- 2.21.0 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH kexec-tools] x86: re-order includes to avoid duplicate struct e820entry
On Wed, Jul 10, 2019 at 4:11 PM Simon Horman wrote: > > On Wed, Jul 03, 2019 at 10:04:32AM +0200, Simon Horman wrote: > > xenctrl.h defines struct e820entry as: > > > > if defined(__i386__) || defined(__x86_64__) > > ... > > #define E820_RAM1 > > ... > > struct e820entry { > > uint64_t addr; > > uint64_t size; > > uint32_t type; > > } __attribute__((packed)); > > ... > > #endif > > > > $ dpkg-query -S /usr/include/xenctrl.h > > libxen-dev:amd64: /usr/include/xenctrl.h > > $ dpkg-query -W libxen-dev:amd64 > > libxen-dev:amd64 4.8.5+shim4.10.2+xsa282-1+deb9u11 > > > > ./include/x86/x86-linux.h defines struct e820entry as: > > > > #ifndef E820_RAM > > struct e820entry { > > uint64_t addr; /* start of memory segment */ > > uint64_t size; /* size of memory segment */ > > uint32_t type; /* type of memory segment */ > > #define E820_RAM1 > > ... > > } __attribute__((packed)); > > #endif > > > > Since cedeee0a3007 ("x86: Introduce helpers for getting RSDP address") > > ./kexec/arch/i386/kexec-x86-common.c includes > > > > +#include "x86-linux-setup.h" > >#include "../../kexec-xen.h" > > > > When xenctrl.h is present the above results in: > > > > $ gcc > > ... > > In file included from kexec/arch/i386/../../kexec-xen.h:5:0, > > from kexec/arch/i386/kexec-x86-common.c:43: > > /usr/include/xenctrl.h:1271:8: error: redefinition of 'struct e820entry' > > struct e820entry { > > ^ > > > > In file included from kexec/arch/i386/x86-linux-setup.h:3:0, > > from kexec/arch/i386/kexec-x86-common.c:42: > > ./include/x86/x86-linux.h:16:8: note: originally defined here > > struct e820entry { > > ^ > > ... > > $ gcc --version | head -1 > > gcc (Debian 6.3.0-18+deb9u1) 6.3.0 20170516 > > > > To militate this this problem re-order the includes so that > > x86-linux.h is included after xenctrl.h and thus > > struct e820entry will only be defined once due to it > > being devined conditionally in x86-linux.h. 
> > > > In practice the definitions are the same so it should > > not matter which is chosen. > > > > It also seems rather unpleasent to me to need to play > > with include ordering. Perhaps a better solution in the longer > > term would be to rename the local definition of struct e820entry. > > > > Fixes: cedeee0a3007 ("x86: Introduce helpers for getting RSDP address") > > Signed-off-by: Simon Horman > > I have applied this change. > Thanks for the fix, it looks good, so the "move the helpers to x86-linux-setup.c" patch should be not needed now. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH kexec-tools] x86: re-order includes to avoid duplicate struct e820entry
break; + } + } + fclose(fp); + + return acpi_rsdp; +} + +uint64_t get_acpi_rsdp(void) +{ + uint64_t acpi_rsdp = 0; + + acpi_rsdp = bootparam_get_acpi_rsdp(); + + if (!acpi_rsdp) + acpi_rsdp = efi_get_acpi_rsdp(); + + return acpi_rsdp; +} void setup_linux_system_parameters(struct kexec_info *info, struct x86_linux_param_header *real_mode) { diff --git a/kexec/arch/i386/x86-linux-setup.h b/kexec/arch/i386/x86-linux-setup.h index 0c651e5..1e81805 100644 --- a/kexec/arch/i386/x86-linux-setup.h +++ b/kexec/arch/i386/x86-linux-setup.h @@ -22,7 +22,7 @@ static inline void setup_linux_bootloader_parameters( void setup_linux_system_parameters(struct kexec_info *info, struct x86_linux_param_header *real_mode); int get_bootparam(void *buf, off_t offset, size_t size); - +uint64_t get_acpi_rsdp(void); #define SETUP_BASE0x9 #define KERN32_BASE 0x10 /* 1MB */ -- 2.21.0 Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] x86/kexec: Add ACPI NVS region to the ident map
On Mon, Jun 10, 2019 at 5:52 PM Borislav Petkov wrote: > > On Mon, Jun 10, 2019 at 03:36:17PM +0800, Kairui Song wrote: > > With the recent addition of RSDP parsing in decompression stage, kexec > > kernel now needs ACPI tables to be covered by the identity mapping. > > And in commit 6bbeb276b71f ("x86/kexec: Add the EFI system tables and > > ACPI tables to the ident map"), ACPI tables memory region was added to > > the ident map. > > > > But on some machines, there is only ACPI NVS memory region, and the ACPI > > tables is located in the NVS region instead. In such case second kernel > > *are* located - plural. > > > will still fail when trying to access ACPI tables. > > > > So, to fix the problem, add NVS memory region in the ident map as well. > > > > Fixes: 6bbeb276b71f ("x86/kexec: Add the EFI system tables and ACPI tables > > to the ident map") > > Suggested-by: Junichi Nomura > > Signed-off-by: Kairui Song > > --- > > > > Tested with my laptop and VM, on top of current tip:x86/boot. > > You tested this in a VM and not on the *actual* machine with the NVS > region? > > This is a joke, right? > Hi Boris, unfortunately I don't have a real machine which only has the NVS region. I did fake the memmap to emulate such a problem, but I can't really promise this will fix the real case. So I can only state that it won't break anything that is already working. And I'm asking Junichi to give it a try, as he reported this issue on the machines he has. -- Best Regards, Kairui Song
Re: [PATCH] x86/kexec: Add ACPI NVS region to the ident map
On Mon, Jun 10, 2019 at 3:37 PM Kairui Song wrote: > > With the recent addition of RSDP parsing in decompression stage, kexec > kernel now needs ACPI tables to be covered by the identity mapping. > And in commit 6bbeb276b71f ("x86/kexec: Add the EFI system tables and > ACPI tables to the ident map"), ACPI tables memory region was added to > the ident map. > > But on some machines, there is only ACPI NVS memory region, and the ACPI > tables is located in the NVS region instead. In such case second kernel > will still fail when trying to access ACPI tables. > > So, to fix the problem, add NVS memory region in the ident map as well. > > Fixes: 6bbeb276b71f ("x86/kexec: Add the EFI system tables and ACPI tables to > the ident map") > Suggested-by: Junichi Nomura > Signed-off-by: Kairui Song > --- > > Tested with my laptop and VM, on top of current tip:x86/boot. > > arch/x86/kernel/machine_kexec_64.c | 18 +++--- > 1 file changed, 15 insertions(+), 3 deletions(-) > > diff --git a/arch/x86/kernel/machine_kexec_64.c > b/arch/x86/kernel/machine_kexec_64.c > index 3c77bdf7b32a..a406602fdb3c 100644 > --- a/arch/x86/kernel/machine_kexec_64.c > +++ b/arch/x86/kernel/machine_kexec_64.c > @@ -54,14 +54,26 @@ static int mem_region_callback(struct resource *res, void > *arg) > static int > map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) > { > - unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; > + int ret; > + unsigned long flags; > struct init_pgtable_data data; > > data.info = info; > data.level4p = level4p; > flags = IORESOURCE_MEM | IORESOURCE_BUSY; > - return walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, > - &data, mem_region_callback); > + > + ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, > + &data, mem_region_callback); > + if (ret && ret != -EINVAL) > + return ret; > + > + /* ACPI tables could be located in ACPI Non-volatile Storage region */ > + ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, > + &data, 
mem_region_callback); > + if (ret && ret != -EINVAL) > + return ret; > + > + return 0; > } > #else > static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { > return 0; } > -- > 2.21.0 > Hi, could you help test the tip branch with this applied? This should fix all the issues, I can't find any other issues now. Thanks. -- Best Regards, Kairui Song
Re: [PATCH v3 0/4] x86: Always try to fill acpi_rsdp_addr in boot params
On Fri, May 31, 2019 at 5:27 PM Simon Horman wrote: > > On Fri, May 24, 2019 at 02:23:17PM +0800, Kairui Song wrote: > > This patch sync the behavior of user space kexec and kexec_file_load, > > they will both fill the boot_params.acpi_rsdp_addr with a valid RSDP > > value, to make sure second kernel can always get the RSDP consistently. > > > > This will make it effortless to boot newer version of kernel (5.0+) > > without specifying acpi_rsdp= cmdline on EFI system even with EFI > > service disabled. Should not change any behavior with older kernels. > > > > Update from V2: > > - Drop unneeded 'packed' attribute for boot parameters structure > > - Don't trust kernel cmdline as a reliable acpi rsdp source > > > > Update from V1: > > - Split into multiple patches for a cleaner structure, content is not > > changed. > > Thanks Kairui, > > applied. Hi Simon, I still haven't seen this series merged in the git repo. Is there any update? -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH] x86/kexec: Add ACPI NVS region to the ident map
With the recent addition of RSDP parsing in decompression stage, kexec kernel now needs ACPI tables to be covered by the identity mapping. And in commit 6bbeb276b71f ("x86/kexec: Add the EFI system tables and ACPI tables to the ident map"), ACPI tables memory region was added to the ident map. But on some machines, there is only ACPI NVS memory region, and the ACPI tables is located in the NVS region instead. In such case second kernel will still fail when trying to access ACPI tables. So, to fix the problem, add NVS memory region in the ident map as well. Fixes: 6bbeb276b71f ("x86/kexec: Add the EFI system tables and ACPI tables to the ident map") Suggested-by: Junichi Nomura Signed-off-by: Kairui Song --- Tested with my laptop and VM, on top of current tip:x86/boot. arch/x86/kernel/machine_kexec_64.c | 18 +++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 3c77bdf7b32a..a406602fdb3c 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -54,14 +54,26 @@ static int mem_region_callback(struct resource *res, void *arg) static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { - unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; + int ret; + unsigned long flags; struct init_pgtable_data data; data.info = info; data.level4p = level4p; flags = IORESOURCE_MEM | IORESOURCE_BUSY; - return walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, - &data, mem_region_callback); + + ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, + &data, mem_region_callback); + if (ret && ret != -EINVAL) + return ret; + + /* ACPI tables could be located in ACPI Non-volatile Storage region */ + ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, + &data, mem_region_callback); + if (ret && ret != -EINVAL) + return ret; + + return 0; } #else static int map_acpi_tables(struct x86_mapping_info *info, pgd_t 
*level4p) { return 0; } -- 2.21.0
Re: [PATCH] x86: Clear isVGA flag if current fb driver is mimicking VGA
On Fri, May 31, 2019 at 5:29 PM Simon Horman wrote: > > On Fri, Nov 23, 2018 at 05:28:01PM +0800, Kairui Song wrote: > > Some device (eg. hyperv_fb) will mimic EFI (or VESA) VGA on first boot > > up, but after the real driver is loaded, it will switch to new mode > > and no longer compatible with EFI/VESA VGA. Keep setting > > orig_video_isVGA to EFI/VESA VGA flag will get wrong driver loaded and > > try to manipulate the framebuffer in a wrong way. > > > > As we have already take care of "VESA VGA" and "EFI VGA", just set the > > orig_video_isVGA to 0 for any other driver reports as EFI/VESA VGA but > > is not EFI/VESA VGA. > > > > Signed-off-by: Kairui Song > > Sorry for letting this slip through the cracks. > Please let me know if this is still relevant. Hi Simon, after fb5a879 ("x86: Introduce a new option --reuse-video-type") in kexec-tools, this patch is no longer needed. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] x86: Handle 64bit framebuffer memory address properly
On Fri, May 31, 2019 at 5:29 PM Simon Horman wrote: > > On Fri, Nov 23, 2018 at 05:26:33PM +0800, Kairui Song wrote: > > In a EFI system, the frame buffer address is 64bit, so currently > > if the address is beyond 4G, kexec will set wrong address due to > > truncate. > > > > Linux kernel commit ae2ee627dc87 ('efifb: Add support for 64-bit > > frame buffer addresses') added support for 64bit frame buffer > > address, an 'ext_lfb_base' field is added as the upper 32-bits of > > the frame buffer, and introduced a new capability flag > > 'VIDEO_TYPE_CAPABILITY_64BIT_BASE' to indicate if the extend field is > > used. > > > > This patch adopts this change, set proper extent address and capability > > flag when the address is beyond 4G. > > > > Signed-off-by: Kairui Song > > Sorry for letting this slip through the cracks. > Please let me know if this is still relevant. Hi Simon, I checked the kexec-tools repo and this patch is merged already; maybe you replied to the wrong mail? -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v4] vmcore: Add a kernel parameter novmcoredd
Since commit 2724273e8fd0 ("vmcore: add API to collect hardware dump in second kernel"), drivers is allowed to add device related dump data to vmcore as they want by using the device dump API. This have a potential issue, the data is stored in memory, drivers may append too much data and use too much memory. The vmcore is typically used in a kdump kernel which runs in a pre-reserved small chunk of memory. So as a result it will make kdump unusable at all due to OOM issues. So introduce new 'novmcoredd' command line option. User can disable device dump to reduce memory usage. This is helpful if device dump is using too much memory, disabling device dump could make sure a regular vmcore without device dump data is still available. Signed-off-by: Kairui Song --- Update from V3: - Use novmcoredd instead of vmcore_device_dump. Use vmcore_device_dump and make it off by default is confusing, novmcoredd is a cleaner way to let user space be able to disable device dump to save memory. Update from V2: - Improve related docs Update from V1: - Use bool parameter to turn it on/off instead of letting user give the size limit. Size of device dump is hard to determine. Documentation/admin-guide/kernel-parameters.txt | 11 +++ fs/proc/Kconfig | 3 ++- fs/proc/vmcore.c| 8 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..1b900d262680 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2872,6 +2872,17 @@ /sys/module/printk/parameters/console_suspend) to turn on/off it dynamically. + novmcoredd [KNL,KDUMP] + Disable device dump. Device dump allows drivers to + append dump data to vmcore so you can collect driver + specified debug info. The drivers could append the + data without any limit, and the data is stored in + memory, this may bring a significant memory stress. 
+ Disable device dump can help save memory but driver + debug data will be no longer available. + Only available when CONFIG_PROC_VMCORE_DEVICE_DUMP + is set. + noaliencache[MM, NUMA, SLAB] Disables the allocation of alien caches in the slab allocator. Saves per-node memory, but will impact performance. diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 817c02b13b1d..62b19162d198 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -57,7 +57,8 @@ config PROC_VMCORE_DEVICE_DUMP snapshot. If you say Y here, the collected device dumps will be added - as ELF notes to /proc/vmcore. + as ELF notes to /proc/vmcore. You can still disabled device + dump by command line option 'novmcoredd'. config PROC_SYSCTL bool "Sysctl support (/proc/sys)" if EXPERT diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 3fe90443c1bb..e815fd035fc0 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -53,6 +53,9 @@ static struct proc_dir_entry *proc_vmcore; /* Device Dump list and mutex to synchronize access to list */ static LIST_HEAD(vmcoredd_list); static DEFINE_MUTEX(vmcoredd_mutex); + +static bool vmcoredd_disabled; +core_param(novmcoredd, vmcoredd_disabled, bool, 0); #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ /* Device Dump Size */ @@ -1451,6 +1454,11 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) size_t data_size; int ret; + if (vmcoredd_disabled) { + pr_err_once("Device dump is disabled\n"); + return -EINVAL; + } + if (!data || !strlen(data->dump_name) || !data->vmcoredd_callback || !data->size) return -EINVAL; -- 2.21.0 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v6 1/2] x86/kexec: Build identity mapping for EFI systab and ACPI tables
On Wed, May 22, 2019 at 2:09 AM Borislav Petkov wrote: > > On Tue, May 21, 2019 at 05:02:59PM +0800, Kairui Song wrote: > > Hi Boris, would you prefer to just fold Junichi update patch into the > > previous one or I should send an updated patch? > > Please send a patch ontop after Ingo queues your old one, which should > happen soon. This way it would also document the fact that there are > machines with NVS regions only. > > Thx. > Hi, I still haven't seen any tip branch pick up this patch. Is there any update? -- Best Regards, Kairui Song
Re: [PATCH v3] vmcore: Add a kernel parameter vmcore_device_dump
On Mon, May 27, 2019 at 2:45 AM Bhupesh Sharma wrote: > > On Fri, May 24, 2019 at 6:25 PM Dave Young wrote: > > > > On 05/24/19 at 02:29pm, Kairui Song wrote: > > > Since commit 2724273e8fd0 ("vmcore: add API to collect hardware dump in > > > second kernel"), drivers is allowed to add device related dump data to > > > vmcore as they want by using the device dump API. This have a potential > > > issue, the data is stored in memory, drivers may append too much data > > > and use too much memory. The vmcore is typically used in a kdump kernel > > > which runs in a pre-reserved small chunk of memory. So as a result it > > > will make kdump unusable at all due to OOM issues. > > > > > > So introduce new vmcore_device_dump= kernel parameter, and disable > > > device dump by default. User can enable it only if device dump data is > > > required for debugging, and have the chance to increase the kdump > > > reserved memory accordingly before device dump fails kdump. > > > > > > Signed-off-by: Kairui Song > > > > > > --- > > > > > > Update from V2: > > > - Improve related docs > > > > > > Update from V1: > > > - Use bool parameter to turn it on/off instead of letting user give > > > the size limit. Size of device dump is hard to determine. > > > > > > Documentation/admin-guide/kernel-parameters.txt | 14 ++ > > > fs/proc/Kconfig | 6 -- > > > fs/proc/vmcore.c| 13 + > > > 3 files changed, 31 insertions(+), 2 deletions(-) > > > > > > diff --git a/Documentation/admin-guide/kernel-parameters.txt > > > b/Documentation/admin-guide/kernel-parameters.txt > > > index 138f6664b2e2..3706ad9e1d97 100644 > > > --- a/Documentation/admin-guide/kernel-parameters.txt > > > +++ b/Documentation/admin-guide/kernel-parameters.txt > > > @@ -5078,6 +5078,20 @@ > > > decrease the size and leave more room for directly > > > mapped kernel RAM. > > > > > > + vmcore_device_dump= [KNL,KDUMP] > > > + Format: {"off" | "on"} > > > + Depends on CONFIG_PROC_VMCORE_DEVICE_DUMP. 
> > > + This parameter allows enable or disable device dump > > > + for vmcore on kernel start-up. > > > + Device dump allows drivers to append dump data to > > > + vmcore so you can collect driver specified debug > > > info. > > > + Note that the drivers could append the data without > > > + any limit, and the data is stored in memory, this > > > may > > > + bring a significant memory stress. If you want to > > > turn > > > + on this option, make sure you have reserved enough > > > memory > > > + with crashkernel= parameter. > > > + default: off > > > + > > > vmcp_cma=nn[MG] [KNL,S390] > > > Sets the memory size reserved for contiguous memory > > > allocations for the vmcp device driver. > > > diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig > > > index 817c02b13b1d..1a7a38976bb0 100644 > > > --- a/fs/proc/Kconfig > > > +++ b/fs/proc/Kconfig > > > @@ -56,8 +56,10 @@ config PROC_VMCORE_DEVICE_DUMP > > > recovery kernel's initramfs to collect its underlying device > > > snapshot. > > > > > > - If you say Y here, the collected device dumps will be added > > > - as ELF notes to /proc/vmcore. > > > + If you say Y here, a new kernel parameter 'vmcore_device_dump' > > > + will be available. You can then enable device dump by passing > > > > "a new kernel parameter 'vmcore_device_dump' will be available" is not > > necessary, "new" is a not a clear word. I suggest to remove this > > sentence. > > > > s/You can then/You can > > I agree with Dave. We are just trying to say here that even if > CONFIG_PROC_VMCORE_DEVICE_DUMP is set to Y, one can still disable the > device dump feature by passing parameter 'vmcore_device_dump=off' to > the kernel. > > May be you can use the wording I mentioned in the v2 patch review, > which tried to convey a similar meaning. > > With the change addressed: > Reviewed-by: Bhupesh Sharma > > Thanks, > Bhupesh > OK, How about: If you say Y here, device dump is still disabled by default. 
You can enable device dump by passing 'vmcore_device_dump=on' to kernel, the collected device dumps will be added as ELF notes to /proc/vmcore. If you think this is good I'll send V4 including the changes. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v3] vmcore: Add a kernel parameter vmcore_device_dump
Since commit 2724273e8fd0 ("vmcore: add API to collect hardware dump in second kernel"), drivers is allowed to add device related dump data to vmcore as they want by using the device dump API. This have a potential issue, the data is stored in memory, drivers may append too much data and use too much memory. The vmcore is typically used in a kdump kernel which runs in a pre-reserved small chunk of memory. So as a result it will make kdump unusable at all due to OOM issues. So introduce new vmcore_device_dump= kernel parameter, and disable device dump by default. User can enable it only if device dump data is required for debugging, and have the chance to increase the kdump reserved memory accordingly before device dump fails kdump. Signed-off-by: Kairui Song --- Update from V2: - Improve related docs Update from V1: - Use bool parameter to turn it on/off instead of letting user give the size limit. Size of device dump is hard to determine. Documentation/admin-guide/kernel-parameters.txt | 14 ++ fs/proc/Kconfig | 6 -- fs/proc/vmcore.c| 13 + 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..3706ad9e1d97 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5078,6 +5078,20 @@ decrease the size and leave more room for directly mapped kernel RAM. + vmcore_device_dump= [KNL,KDUMP] + Format: {"off" | "on"} + Depends on CONFIG_PROC_VMCORE_DEVICE_DUMP. + This parameter allows enable or disable device dump + for vmcore on kernel start-up. + Device dump allows drivers to append dump data to + vmcore so you can collect driver specified debug info. + Note that the drivers could append the data without + any limit, and the data is stored in memory, this may + bring a significant memory stress. 
If you want to turn + on this option, make sure you have reserved enough memory + with crashkernel= parameter. + default: off + vmcp_cma=nn[MG] [KNL,S390] Sets the memory size reserved for contiguous memory allocations for the vmcp device driver. diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 817c02b13b1d..1a7a38976bb0 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -56,8 +56,10 @@ config PROC_VMCORE_DEVICE_DUMP recovery kernel's initramfs to collect its underlying device snapshot. - If you say Y here, the collected device dumps will be added - as ELF notes to /proc/vmcore. + If you say Y here, a new kernel parameter 'vmcore_device_dump' + will be available. You can then enable device dump by passing + 'vmcore_device_dump=on' to kernel, the collected device dumps + will be added as ELF notes to /proc/vmcore. config PROC_SYSCTL bool "Sysctl support (/proc/sys)" if EXPERT diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 3fe90443c1bb..d1b608b0efad 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -53,6 +53,8 @@ static struct proc_dir_entry *proc_vmcore; /* Device Dump list and mutex to synchronize access to list */ static LIST_HEAD(vmcoredd_list); static DEFINE_MUTEX(vmcoredd_mutex); + +static bool vmcoredd_enabled; #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ /* Device Dump Size */ @@ -1451,6 +1453,11 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) size_t data_size; int ret; + if (!vmcoredd_enabled) { + pr_err_once("Device dump is disabled\n"); + return -EINVAL; + } + if (!data || !strlen(data->dump_name) || !data->vmcoredd_callback || !data->size) return -EINVAL; @@ -1502,6 +1509,12 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) return ret; } EXPORT_SYMBOL(vmcore_add_device_dump); + +static int __init vmcoredd_parse_cmdline(char *arg) +{ + return kstrtobool(arg, &vmcoredd_enabled); +} +__setup("vmcore_device_dump=", vmcoredd_parse_cmdline); #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ /* Free all dumps in vmcore device dump 
list */ -- 2.21.0
[PATCH v3 0/4] x86: Always try to fill acpi_rsdp_addr in boot params
This patch sync the behavior of user space kexec and kexec_file_load, they will both fill the boot_params.acpi_rsdp_addr with a valid RSDP value, to make sure second kernel can always get the RSDP consistently. This will make it effortless to boot newer version of kernel (5.0+) without specifying acpi_rsdp= cmdline on EFI system even with EFI service disabled. Should not change any behavior with older kernels. Update from V2: - Drop unneeded 'packed' attribute for boot parameters structure - Don't trust kernel cmdline as a reliable acpi rsdp source Update from V1: - Split into multiple patches for a cleaner structure, content is not changed. Kairui Song (4): x86: Update boot parameters defination x86: Introduce helpers for getting RSDP address x86: Always try to fill acpi_rsdp_addr in boot params crashdump/x86: Use new introduce helper for getting RSDP include/x86/x86-linux.h| 6 +++-- kexec/arch/i386/crashdump-x86.c| 34 +++ kexec/arch/i386/kexec-x86-common.c | 43 ++ kexec/arch/i386/kexec-x86.h| 1 + kexec/arch/i386/x86-linux-setup.c | 6 +++-- kexec/arch/i386/x86-linux-setup.h | 1 + 6 files changed, 62 insertions(+), 29 deletions(-) -- 2.21.0 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v3 3/4] x86: Always try to fill acpi_rsdp_addr in boot params
Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address from boot params if available"), kernel accept an acpi_rsdp_addr param in boot_params. So fill in this parameter unconditionally, ensure second kernel always get the right RSDP address consistently, and boot well on EFI system even with EFI service disabled. User no longer need to change the kernel cmdline to workaround the missing RSDP issue. For older version of kernels (Before 5.0), there won't be any change of behavior. Signed-off-by: Kairui Song --- kexec/arch/i386/x86-linux-setup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kexec/arch/i386/x86-linux-setup.c b/kexec/arch/i386/x86-linux-setup.c index 5ca7c25..5b00b42 100644 --- a/kexec/arch/i386/x86-linux-setup.c +++ b/kexec/arch/i386/x86-linux-setup.c @@ -901,4 +901,7 @@ void setup_linux_system_parameters(struct kexec_info *info, /* fill the EDD information */ setup_edd_info(real_mode); + + /* Always try to fill acpi_rsdp_addr */ + real_mode->acpi_rsdp_addr = get_acpi_rsdp(); } -- 2.21.0 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v3 4/4] crashdump/x86: Use new introduce helper for getting RSDP
Use the new introduce helper for getting RSDP, this ensures RSDP is always accessible and avoid code duplication. Signed-off-by: Kairui Song --- kexec/arch/i386/crashdump-x86.c | 34 + 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/kexec/arch/i386/crashdump-x86.c b/kexec/arch/i386/crashdump-x86.c index 140f45b..a2aea31 100644 --- a/kexec/arch/i386/crashdump-x86.c +++ b/kexec/arch/i386/crashdump-x86.c @@ -787,35 +787,19 @@ static int sysfs_efi_runtime_map_exist(void) /* Appends 'acpi_rsdp=' commandline for efi boot crash dump */ static void cmdline_add_efi(char *cmdline) { - FILE *fp; - int cmdlen, len; - char line[MAX_LINE], *s; - const char *acpis = " acpi_rsdp="; + uint64_t acpi_rsdp; + char acpi_rsdp_buf[MAX_LINE]; - fp = fopen("/sys/firmware/efi/systab", "r"); - if (!fp) - return; + acpi_rsdp = get_acpi_rsdp(); - while(fgets(line, sizeof(line), fp) != 0) { - /* ACPI20= always goes before ACPI= */ - if ((strstr(line, "ACPI20=")) || (strstr(line, "ACPI="))) { - line[strlen(line) - 1] = '\0'; - s = strchr(line, '='); - s += 1; - len = strlen(s) + strlen(acpis); - cmdlen = strlen(cmdline) + len; - if (cmdlen > (COMMAND_LINE_SIZE - 1)) - die("Command line overflow\n"); - strcat(cmdline, acpis); - strcat(cmdline, s); - dbgprintf("Command line after adding efi\n"); - dbgprintf("%s\n", cmdline); + if (!acpi_rsdp) + return; - break; - } - } + sprintf(acpi_rsdp_buf, " acpi_rsdp=0x%lx", acpi_rsdp); + if (strlen(cmdline) + strlen(acpi_rsdp_buf) > (COMMAND_LINE_SIZE - 1)) + die("Command line overflow\n"); - fclose(fp); + strcat(cmdline, acpi_rsdp_buf); } static void get_backup_area(struct kexec_info *info, -- 2.21.0 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v3 2/4] x86: Introduce helpers for getting RSDP address
On x86 RSDP is fundamental for booting the machine. When second kernel is incapable of parsing the RSDP address (eg. kexec next kernel on an EFI system with EFI service disabled), kexec should prepare the RSDP address for second kernel. Introduce helpers for getting RSDP from multiple sources, including boot params and EFI firmware. For legacy BIOS interface, there is no better way to find the RSDP address rather than scanning the memory region and search for it, and this will always be done by the kernel as a fallback, so this is no need to try to get the RSDP address for that case. Signed-off-by: Kairui Song --- kexec/arch/i386/kexec-x86-common.c | 43 ++ kexec/arch/i386/kexec-x86.h| 1 + kexec/arch/i386/x86-linux-setup.c | 3 +-- kexec/arch/i386/x86-linux-setup.h | 1 + 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/kexec/arch/i386/kexec-x86-common.c b/kexec/arch/i386/kexec-x86-common.c index de99758..5c55ec8 100644 --- a/kexec/arch/i386/kexec-x86-common.c +++ b/kexec/arch/i386/kexec-x86-common.c @@ -39,6 +39,7 @@ #include "../../firmware_memmap.h" #include "../../crashdump.h" #include "kexec-x86.h" +#include "x86-linux-setup.h" #include "../../kexec-xen.h" /* Used below but not present in (older?) 
xenctrl.h */ @@ -392,4 +393,46 @@ int get_memory_ranges(struct memory_range **range, int *ranges, return ret; } +static uint64_t bootparam_get_acpi_rsdp(void) { + uint64_t acpi_rsdp = 0; + off_t offset = offsetof(struct x86_linux_param_header, acpi_rsdp_addr); + if (get_bootparam(&acpi_rsdp, offset, sizeof(acpi_rsdp))) + return 0; + + return acpi_rsdp; +} + +static uint64_t efi_get_acpi_rsdp(void) { + FILE *fp; + char line[MAX_LINE], *s; + uint64_t acpi_rsdp = 0; + + fp = fopen("/sys/firmware/efi/systab", "r"); + if (!fp) + return acpi_rsdp; + + while(fgets(line, sizeof(line), fp) != 0) { + /* ACPI20= always goes before ACPI= */ + if ((strstr(line, "ACPI20=")) || (strstr(line, "ACPI="))) { + s = strchr(line, '=') + 1; + sscanf(s, "0x%lx", &acpi_rsdp); + break; + } + } + fclose(fp); + + return acpi_rsdp; +} + +uint64_t get_acpi_rsdp(void) +{ + uint64_t acpi_rsdp = 0; + + acpi_rsdp = bootparam_get_acpi_rsdp(); + + if (!acpi_rsdp) + acpi_rsdp = efi_get_acpi_rsdp(); + + return acpi_rsdp; +} diff --git a/kexec/arch/i386/kexec-x86.h b/kexec/arch/i386/kexec-x86.h index c2bcd37..1b58c3b 100644 --- a/kexec/arch/i386/kexec-x86.h +++ b/kexec/arch/i386/kexec-x86.h @@ -86,4 +86,5 @@ int nbi_load(int argc, char **argv, const char *buf, off_t len, void nbi_usage(void); extern unsigned xen_e820_to_kexec_type(uint32_t type); +extern uint64_t get_acpi_rsdp(void); #endif /* KEXEC_X86_H */ diff --git a/kexec/arch/i386/x86-linux-setup.c b/kexec/arch/i386/x86-linux-setup.c index 8fad115..5ca7c25 100644 --- a/kexec/arch/i386/x86-linux-setup.c +++ b/kexec/arch/i386/x86-linux-setup.c @@ -123,7 +123,6 @@ void setup_linux_bootloader_parameters_high( cmdline_ptr[cmdline_len - 1] = '\0'; } -static int get_bootparam(void *buf, off_t offset, size_t size); static int setup_linux_vesafb(struct x86_linux_param_header *real_mode) { struct fb_fix_screeninfo fix; @@ -452,7 +451,7 @@ char *find_mnt_by_fsname(char *fsname) return mntdir; } -static int get_bootparam(void *buf, off_t offset, size_t size) 
+int get_bootparam(void *buf, off_t offset, size_t size) { int data_file; char *debugfs_mnt, *sysfs_mnt; diff --git a/kexec/arch/i386/x86-linux-setup.h b/kexec/arch/i386/x86-linux-setup.h index f5d23d3..0c651e5 100644 --- a/kexec/arch/i386/x86-linux-setup.h +++ b/kexec/arch/i386/x86-linux-setup.h @@ -21,6 +21,7 @@ static inline void setup_linux_bootloader_parameters( } void setup_linux_system_parameters(struct kexec_info *info, struct x86_linux_param_header *real_mode); +int get_bootparam(void *buf, off_t offset, size_t size); #define SETUP_BASE0x9 -- 2.21.0 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v3 1/4] x86: Update boot parameters defination
Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address from boot params if available"), kernel accept a acpi_rsdp_addr param in boot_params. Sync the x86_linux_param_header to support this param. Signed-off-by: Kairui Song --- include/x86/x86-linux.h | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/x86/x86-linux.h b/include/x86/x86-linux.h index 352ea02..9646102 100644 --- a/include/x86/x86-linux.h +++ b/include/x86/x86-linux.h @@ -45,7 +45,6 @@ struct apm_bios_info { uint16_t cseg_len; /* 0x4e */ uint16_t cseg_16_len; /* 0x50 */ uint16_t dseg_len; /* 0x52 */ - uint8_t reserved[44]; /* 0x54 */ }; /* @@ -113,12 +112,15 @@ struct x86_linux_param_header { uint8_t reserved4[2]; /* 0x3e -- 0x3f reserved for future expansion */ struct apm_bios_info apm_bios_info; /* 0x40 */ + uint8_t reserved4_1[28]; /* 0x54 */ + uint64_t acpi_rsdp_addr;/* 0x70 */ + uint8_t reserved4_2[8];/* 0x78 */ struct drive_info_struct drive_info;/* 0x80 */ struct sys_desc_table sys_desc_table; /* 0xa0 */ uint32_t ext_ramdisk_image; /* 0xc0 */ uint32_t ext_ramdisk_size; /* 0xc4 */ uint32_t ext_cmd_line_ptr; /* 0xc8 */ - uint8_t reserved4_1[0x1c0 - 0xcc]; /* 0xe4 */ + uint8_t reserved4_3[0x1c0 - 0xcc]; /* 0xe4 */ uint8_t efi_info[32]; /* 0x1c0 */ uint32_t alt_mem_k; /* 0x1e0 */ uint8_t reserved5[4]; /* 0x1e4 */ -- 2.21.0 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2 1/4] x86: Update boot parameters definition
On Thu, May 23, 2019 at 11:01 AM Dave Young wrote: > > Hi Kairui > On 05/14/19 at 01:09pm, Kairui Song wrote: > > Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address > > from boot params if available"), kernel accept a acpi_rsdp_addr param in > > boot_params. Sync the x86_linux_param_header to support this param. > > > > Signed-off-by: Kairui Song > > --- > > include/x86/x86-linux.h | 8 +--- > > 1 file changed, 5 insertions(+), 3 deletions(-) > > > > diff --git a/include/x86/x86-linux.h b/include/x86/x86-linux.h > > index 352ea02..a5d8df8 100644 > > --- a/include/x86/x86-linux.h > > +++ b/include/x86/x86-linux.h > > @@ -45,8 +45,7 @@ struct apm_bios_info { > > uint16_t cseg_len; /* 0x4e */ > > uint16_t cseg_16_len; /* 0x50 */ > > uint16_t dseg_len; /* 0x52 */ > > - uint8_t reserved[44]; /* 0x54 */ > > -}; > > +} __attribute__((packed)); > > It should be good to keep same as the kernel header without packed > attribute. > > is it possible to sync the latest mainline uapi bootparam header file, > maybe after this series get solved? > Hi Dave, I can remove the packed attr then, just ensure it won't break anything, a bit paranoid maybe... About sync the whole structure, the problem is kexec tools' header structure is different so that will change a lot. Maybe could be discussed seperately. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2 2/4] x86: Introduce helpers for getting RSDP address
On Thu, May 23, 2019 at 11:43 AM Kairui Song wrote: > > On Thu, May 23, 2019 at 11:16 AM Dave Young wrote: > > > > On 05/14/19 at 01:09pm, Kairui Song wrote: > > > On x86 RSDP is fundamental for booting the machine. When second kernel > > > is incapable of parsing the RSDP address (eg. kexec next kernel on an EFI > > > system with EFI service disabled), kexec should prepare the RSDP address > > > for second kernel. > > > > > > Introduce helpers for getting RSDP from multiple sources, including boot > > > params, cmdline and EFI firmware. > > > > > > For legacy BIOS interface, there is no better way to find the RSDP address > > > rather than scanning the memory region and search for it, and this will > > > always be done by the kernel as a fallback, so this is no need to try to > > > get the RSDP address for that case. > > > > > > Signed-off-by: Kairui Song > > > --- > > > kexec/arch/i386/kexec-x86-common.c | 60 ++ > > > kexec/arch/i386/kexec-x86.h| 1 + > > > kexec/arch/i386/x86-linux-setup.c | 3 +- > > > kexec/arch/i386/x86-linux-setup.h | 1 + > > > 4 files changed, 63 insertions(+), 2 deletions(-) > > > > > > diff --git a/kexec/arch/i386/kexec-x86-common.c > > > b/kexec/arch/i386/kexec-x86-common.c > > > index de99758..4b8eb26 100644 > > > --- a/kexec/arch/i386/kexec-x86-common.c > > > +++ b/kexec/arch/i386/kexec-x86-common.c > > > @@ -39,6 +39,7 @@ > > > #include "../../firmware_memmap.h" > > > #include "../../crashdump.h" > > > #include "kexec-x86.h" > > > +#include "x86-linux-setup.h" > > > #include "../../kexec-xen.h" > > > > > > /* Used below but not present in (older?) 
xenctrl.h */ > > > @@ -392,4 +393,63 @@ int get_memory_ranges(struct memory_range **range, > > > int *ranges, > > > return ret; > > > } > > > > > > +static uint64_t cmdline_get_acpi_rsdp(void) { > > > + uint64_t acpi_rsdp = 0; > > > + char *tmp_cmdline, *rsdp_param; > > > > > > + tmp_cmdline = get_command_line(); > > > + rsdp_param = strstr(tmp_cmdline, "acpi_rsdp="); > > > > strstr will locate the first acpi_rsdp, what about multiple acpi_rsdp > > provided? > > Good catch, should always use the latest acpi_rsdp provided, will fix that. > > > > > BTW, if one provide a wrong adress in acpi_rsdp= cmdline then it is not > > usable. > > > > I think in that case kernel will not boot. If kexec is available then > it means a right value is given. > After double check the kernel will boot even wrong acpi_rsdp is given, so I'll drop this part. boot_params in newer kernel will be enough to make sure kexec loop boot with EFI disabled won't fail. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2] vmcore: Add a kernel cmdline vmcore_device_dump
On Wed, May 22, 2019 at 1:38 PM Dave Young wrote: > > On 05/20/19 at 02:18pm, Kairui Song wrote: > > Since commit 2724273e8fd0 ('vmcore: add API to collect hardware dump in > > second kernel'), drivers is allowed to add device related dump data to > > vmcore as they want by using the device dump API. This have a potential > > issue, the data is stored in memory, drivers may append too much data > > and use too much memory. The vmcore is typically used in a kdump kernel > > which runs in a pre-reserved small chunk of memory. So as a result it > > will make kdump unusable at all due to OOM issues. > > > > So introduce new vmcore_device_dump= kernel parameter, and disable > > device dump by default. User can enable it only if device dump data is > > required for debugging, and have the chance to increase the kdump > > reserved memory accordingly before device dump fails kdump. > > > > Signed-off-by: Kairui Song > > --- > > Update from V1: > > - Use bool parameter to turn it on/off instead of letting user give > > the size limit. Size of device dump is hard to determine. > > > > Documentation/admin-guide/kernel-parameters.txt | 15 +++ > > fs/proc/vmcore.c| 13 + > > 2 files changed, 28 insertions(+) > > > > diff --git a/Documentation/admin-guide/kernel-parameters.txt > > b/Documentation/admin-guide/kernel-parameters.txt > > index 43176340c73d..2d48e39fd080 100644 > > --- a/Documentation/admin-guide/kernel-parameters.txt > > +++ b/Documentation/admin-guide/kernel-parameters.txt > > @@ -5062,6 +5062,21 @@ > > decrease the size and leave more room for directly > > mapped kernel RAM. > > > > + vmcore_device_dump= > > + [VMCORE] > > It looks better to have above two line merged in one line, also use > [KNL, KDUMP] will be better. > > > + Format: {"off" | "on"} > > + If CONFIG_PROC_VMCORE_DEVICE_DUMP is set, > > + this parameter allows enable or disable device dump > > + for vmcore. 
> > + Device dump allows drivers to append dump data to > > + vmcore so you can collect driver specified debug info. > > + Note that the drivers could append the data without > > + any limit, and the data is stored in memory, this may > > + bring a significant memory stress. If you want to turn > > + on this option, make sure you have reserved enough > > memory > > + with crashkernel= parameter. > > + default: off > > + > > vmcp_cma=nn[MG] [KNL,S390] > > Sets the memory size reserved for contiguous memory > > allocations for the vmcp device driver. > > diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c > > index 3fe90443c1bb..d1b608b0efad 100644 > > --- a/fs/proc/vmcore.c > > +++ b/fs/proc/vmcore.c > > @@ -53,6 +53,8 @@ static struct proc_dir_entry *proc_vmcore; > > /* Device Dump list and mutex to synchronize access to list */ > > static LIST_HEAD(vmcoredd_list); > > static DEFINE_MUTEX(vmcoredd_mutex); > > + > > +static bool vmcoredd_enabled; > > #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ > > > > /* Device Dump Size */ > > @@ -1451,6 +1453,11 @@ int vmcore_add_device_dump(struct vmcoredd_data > > *data) > > size_t data_size; > > int ret; > > > > + if (!vmcoredd_enabled) { > > + pr_err_once("Device dump is disabled\n"); > > + return -EINVAL; > > + } > > + > > if (!data || !strlen(data->dump_name) || > > !data->vmcoredd_callback || !data->size) > > return -EINVAL; > > @@ -1502,6 +1509,12 @@ int vmcore_add_device_dump(struct vmcoredd_data > > *data) > > return ret; > > } > > EXPORT_SYMBOL(vmcore_add_device_dump); > > + > > +static int __init vmcoredd_parse_cmdline(char *arg) > > +{ > > + return kstrtobool(arg, &vmcoredd_enabled); > > +} > > +__setup("vmcore_device_dump=", vmcoredd_parse_cmdline); > > #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ > > > > /* Free all dumps in vmcore device dump list */ > > -- > > 2.21.0 > > > > Thanks > Dave Good suggestion, I'll update in V3. -- Best Regards, Kairui Song
Re: [PATCH v2] vmcore: Add a kernel cmdline vmcore_device_dump
On Thu, May 23, 2019 at 2:44 AM Bhupesh Sharma wrote: > > On 05/20/2019 11:48 AM, Kairui Song wrote: > > Since commit 2724273e8fd0 ('vmcore: add API to collect hardware dump in > > second kernel'), drivers is allowed to add device related dump data to > > vmcore as they want by using the device dump API. This have a potential > > issue, the data is stored in memory, drivers may append too much data > > and use too much memory. The vmcore is typically used in a kdump kernel > > which runs in a pre-reserved small chunk of memory. So as a result it > > will make kdump unusable at all due to OOM issues. > > > > So introduce new vmcore_device_dump= kernel parameter, and disable > > device dump by default. User can enable it only if device dump data is > > required for debugging, and have the chance to increase the kdump > > reserved memory accordingly before device dump fails kdump. > > > > Signed-off-by: Kairui Song > > --- > > Update from V1: > >- Use bool parameter to turn it on/off instead of letting user give > > the size limit. Size of device dump is hard to determine. > > > > Documentation/admin-guide/kernel-parameters.txt | 15 +++ > > fs/proc/vmcore.c| 13 + > > 2 files changed, 28 insertions(+) > > > > diff --git a/Documentation/admin-guide/kernel-parameters.txt > > b/Documentation/admin-guide/kernel-parameters.txt > > index 43176340c73d..2d48e39fd080 100644 > > --- a/Documentation/admin-guide/kernel-parameters.txt > > +++ b/Documentation/admin-guide/kernel-parameters.txt > > @@ -5062,6 +5062,21 @@ > > decrease the size and leave more room for directly > > mapped kernel RAM. > > > > + vmcore_device_dump= > > + [VMCORE] > > + Format: {"off" | "on"} > > + If CONFIG_PROC_VMCORE_DEVICE_DUMP is set, > > + this parameter allows enable or disable device dump > > + for vmcore. 
> > We can add a simpler description here, something like: > Depends on CONFIG_PROC_VMCORE_DEVICE_DUMP > > > + Device dump allows drivers to append dump data to > > + vmcore so you can collect driver specified debug info. > > + Note that the drivers could append the data without > > + any limit, and the data is stored in memory, this may > > + bring a significant memory stress. If you want to turn > > + on this option, make sure you have reserved enough > > memory > > + with crashkernel= parameter. > > + default: off > > ... and massage the rest of text accordingly. > > Better to also modify the help text for 'PROC_VMCORE_DEVICE_DUMP' config > option defined in 'fs/proc/Kconfig'. Something like: > > config PROC_VMCORE_DEVICE_DUMP > bool "Device Hardware/Firmware Log Collection" > <..snip..> > If you say Y here, the collected device dumps will be added > as ELF notes to /proc/vmcore. > > If this option is selected, device dump collection can still be > disabled by passing vmcore_device_dump=off to the kernel. > > See config INTEL_IOMMU_DEFAULT_ON in 'drivers/iommu/Kconfig' as an example. > Good suggestion! I'll update in V3. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2 2/4] x86: Introduce helpers for getting RSDP address
On Thu, May 23, 2019 at 11:16 AM Dave Young wrote: > > On 05/14/19 at 01:09pm, Kairui Song wrote: > > On x86 RSDP is fundamental for booting the machine. When second kernel > > is incapable of parsing the RSDP address (eg. kexec next kernel on an EFI > > system with EFI service disabled), kexec should prepare the RSDP address > > for second kernel. > > > > Introduce helpers for getting RSDP from multiple sources, including boot > > params, cmdline and EFI firmware. > > > > For legacy BIOS interface, there is no better way to find the RSDP address > > rather than scanning the memory region and search for it, and this will > > always be done by the kernel as a fallback, so this is no need to try to > > get the RSDP address for that case. > > > > Signed-off-by: Kairui Song > > --- > > kexec/arch/i386/kexec-x86-common.c | 60 ++ > > kexec/arch/i386/kexec-x86.h| 1 + > > kexec/arch/i386/x86-linux-setup.c | 3 +- > > kexec/arch/i386/x86-linux-setup.h | 1 + > > 4 files changed, 63 insertions(+), 2 deletions(-) > > > > diff --git a/kexec/arch/i386/kexec-x86-common.c > > b/kexec/arch/i386/kexec-x86-common.c > > index de99758..4b8eb26 100644 > > --- a/kexec/arch/i386/kexec-x86-common.c > > +++ b/kexec/arch/i386/kexec-x86-common.c > > @@ -39,6 +39,7 @@ > > #include "../../firmware_memmap.h" > > #include "../../crashdump.h" > > #include "kexec-x86.h" > > +#include "x86-linux-setup.h" > > #include "../../kexec-xen.h" > > > > /* Used below but not present in (older?) xenctrl.h */ > > @@ -392,4 +393,63 @@ int get_memory_ranges(struct memory_range **range, int > > *ranges, > > return ret; > > } > > > > +static uint64_t cmdline_get_acpi_rsdp(void) { > > + uint64_t acpi_rsdp = 0; > > + char *tmp_cmdline, *rsdp_param; > > > > + tmp_cmdline = get_command_line(); > > + rsdp_param = strstr(tmp_cmdline, "acpi_rsdp="); > > strstr will locate the first acpi_rsdp, what about multiple acpi_rsdp > provided? Good catch, should always use the latest acpi_rsdp provided, will fix that. 
> > BTW, if one provide a wrong adress in acpi_rsdp= cmdline then it is not > usable. > I think in that case kernel will not boot. If kexec is available then it means a right value is given. > So not sure if adding this cmdline param is necessary, maybe only add > efi case will be reliable. Adding the cmdline param ensure kexec boot loop won't fail. eg. in an older version kernel booted with kexec, and have EFI disabled, then cmdline is the only source for getting and storing the RSDP address. > > > + > > + if (rsdp_param) > > + sscanf(rsdp_param, "acpi_rsdp=%lx", &acpi_rsdp); > > + > > + free(tmp_cmdline); > > + return acpi_rsdp; > > +} > > + > > +static uint64_t bootparam_get_acpi_rsdp(void) { > > + uint64_t acpi_rsdp = 0; > > + off_t offset = offsetof(struct x86_linux_param_header, > > acpi_rsdp_addr); > > + > > + if (get_bootparam(&acpi_rsdp, offset, sizeof(acpi_rsdp))) > > + return 0; > > + > > + return acpi_rsdp; > > +} > > + > > +static uint64_t efi_get_acpi_rsdp(void) { > > + FILE *fp; > > + char line[MAX_LINE], *s; > > + uint64_t acpi_rsdp = 0; > > + > > + fp = fopen("/sys/firmware/efi/systab", "r"); > > + if (!fp) > > + return acpi_rsdp; > > + > > + while(fgets(line, sizeof(line), fp) != 0) { > > + /* ACPI20= always goes before ACPI= */ > > + if ((strstr(line, "ACPI20=")) || (strstr(line, "ACPI="))) { > > + s = strchr(line, '=') + 1; > > + sscanf(s, "0x%lx", &acpi_rsdp); > > + break; > > + } > > + } > > + fclose(fp); > > + > > + return acpi_rsdp; > > +} > > + > > +uint64_t get_acpi_rsdp(void) > > +{ > > + uint64_t acpi_rsdp = 0; > > + > > + acpi_rsdp = cmdline_get_acpi_rsdp(); > > + > > + if (!acpi_rsdp) > > + acpi_rsdp = bootparam_get_acpi_rsdp(); > > + > > + if (!acpi_rsdp) > > + acpi_rsdp = efi_get_acpi_rsdp(); > > + > > + return acpi_rsdp; > > +} > > diff --git a/kexec/arch/i386/kexec-x86.h b/kexec/arch/i386/kexec-x86.h > > index c2bcd37..1b58c
Re: [PATCH v6 1/2] x86/kexec: Build identity mapping for EFI systab and ACPI tables
On Wed, May 15, 2019 at 3:10 PM Junichi Nomura wrote: > > On 5/15/19 3:58 PM, Borislav Petkov wrote: > > On Wed, May 15, 2019 at 05:17:19AM +, Junichi Nomura wrote: > >> Hi Kairui, > >> > >> On 5/13/19 5:02 PM, Baoquan He wrote: > >>> On 05/13/19 at 09:50am, Borislav Petkov wrote: > >>>> On Mon, May 13, 2019 at 03:32:54PM +0800, Baoquan He wrote: > >>>> So we're going to try it again this cycle and if there's no fallout, it > >>>> will go upstream. If not, it will have to be fixed. The usual thing. > >>>> > >>>> And I don't care if Kairui's patch fixes this one problem - judging by > >>>> the fragility of this whole thing, it should be hammered on one more > >>>> cycle on as many boxes as possible to make sure there's no other SNAFUs. > >>>> > >>>> So go test it on more machines instead. I've pushed it here: > >>>> > >>>> https://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git/log/?h=next-merge-window > >>> > >>> Pingfan has got a machine to reproduce the kexec breakage issue, and > >>> applying these two patches fix it. He planned to paste the test result. > >>> I will ask him to try this branch if he has time, or I can get his > >>> machine to test. > >>> > >>> Junichi, also have a try on Boris's branch in NEC's test environment? > >> > >> while the patch set works on most of the machines I'm testing around, > >> I found kexec(1) fails to load kernel on a few machines if this patch > >> is applied. Those machines don't have IORES_DESC_ACPI_TABLES region > >> and have ACPI tables in IORES_DESC_ACPI_NV_STORAGE region instead. > > > > Why? What kind of machines are those? > > I don't know. They are just general purpose Xeon-based servers > and not some special purpose machines. So I guess there are other > such machines in the wild. > Hi, I think it's reasonable to update the patch to include the NV_STORAGE regions as well, most likely the firmware only provided NV_STORAGE region? Can you help confirm if the e820 didn't contain ACPI data, and only ACPI NVS? 
I had a try with this update patch, it worked and didn't break anything. Hi Boris, would you prefer to just fold Junichi update patch into the previous one or I should send an updated patch? -- Best Regards, Kairui Song
Re: [RFC PATCH] vmcore: Add a kernel cmdline device_dump_limit
On Mon, May 20, 2019 at 1:55 PM Bhupesh Sharma wrote: > > On 05/16/2019 01:49 PM, Kairui Song wrote: > > On Fri, May 10, 2019 at 7:17 PM Bhupesh Sharma wrote: > >> > >> Hi Kairui, > >> > >> Thanks for the patch. Please see my comments in-line: > >> > >> On 05/10/2019 03:50 PM, Kairui Song wrote: > >>> Device dump allow drivers to add device related dump data to vmcore as > >>> they want. This have a potential issue, the data is stored in memory, > >>> drivers may append too much data and use too much memory. The vmcore is > >>> typically used in a kdump kernel which runs in a pre-reserved small > >>> chunk of memory. So as a result it will make kdump unusable at all due > >>> to OOM issues. > >>> > >>> So introduce new device_dump_limit= kernel parameter, and set the > >>> default limit to 0, so device dump is not enabled unless user specify > >>> the accetable maxiam > >> > >> acceptable maximum > > > > Will fix this typo. > > Ok. > > >>> memory usage for device dump data. In this way user > >>> will also have the chance to adjust the kdump reserved memory > >>> accordingly. > >> > >> Hmmm., this doesn't give much confidence with the > >> PROC_VMCORE_DEVICE_DUMP feature in its current shape. Rather shouldn't > >> we be enabling config PROC_VMCORE_DEVICE_DUMP only under EXPERT mode for > >> now, considering that this feature needs further thrashing and testing > >> with real setups including platforms where drivers append large amounts > >> of data to vmcore: > > > > I think no need to move it to expert mode, just leave it disabled by > > default should be better, that should be enough to make sure driver > > won't append that much memory and cause OOM, while it could still be > > enabled without changing the kernel, so this feature won't bring extra > > risk, and could be enabled anytime easily. > > I have seen some arm64 users report issues on mailing lists with > PROC_VMCORE_DEVICE_DUMP enabled as this causes frequent OOM in the arm64 > crash dump kernel. 
> > I think they are using this infrastructure to extend/enable device > driver debugging on some arm64 platforms and finding issues with the > crash dump kernel. > > I will do some analysis later-on (when I get some spare time) and post a > patch (if needed) to put the same under EXPERT mode for now. > > >> diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig > >> index 817c02b13b1d..c47a12cf7fc0 100644 > >> --- a/fs/proc/Kconfig > >> +++ b/fs/proc/Kconfig > >> @@ -45,7 +45,7 @@ config PROC_VMCORE > >>Exports the dump image of crashed kernel in ELF format. > >> > >>config PROC_VMCORE_DEVICE_DUMP > >> - bool "Device Hardware/Firmware Log Collection" > >> + bool "Device Hardware/Firmware Log Collection" if EXPERT > >> depends on PROC_VMCORE > >> default n > >> help > >> @@ -59,6 +59,12 @@ config PROC_VMCORE_DEVICE_DUMP > >> If you say Y here, the collected device dumps will be added > >> as ELF notes to /proc/vmcore. > >> > >> + Considering that there can be device drivers which append > >> + large amounts of data to vmcore, you should say N here unless > >> + you are reserving a large chunk of memory for crashdump > >> + kernel, because otherwise the crashdump kernel might become > >> + unusable due to OOM issues. > >> + > >> > >> May be you can add a 'Fixes:' tag here. > > > > Problem is previous commit seems not broken, just bring extra memory > > stress. Is "Fixes:" tag suitable for this commit? > > I think since the earlier patch causes an OOM, it would be better to > atleast mention it in the git log (for easier git bisect later on). > > If not the 'Fixes:' tag may be we can use a 'Since commit ..' like > wording in the commit log. > > >>> Signed-off-by: Kairui Song > >>> --- > >>>fs/proc/vmcore.c | 20 > >>>1 file changed, 20 insertions(+) > >>> > >>> diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c > >>> index 3fe90443c1bb..e28695ef2439 100644 > >>> --- a/fs/proc/vmcore.c > >>> +++ b/fs/proc/vmcore.c > >>> @@ -53,6 +53,9 @@ static str
[PATCH v2] vmcore: Add a kernel cmdline vmcore_device_dump
Since commit 2724273e8fd0 ('vmcore: add API to collect hardware dump in second kernel'), drivers is allowed to add device related dump data to vmcore as they want by using the device dump API. This have a potential issue, the data is stored in memory, drivers may append too much data and use too much memory. The vmcore is typically used in a kdump kernel which runs in a pre-reserved small chunk of memory. So as a result it will make kdump unusable at all due to OOM issues. So introduce new vmcore_device_dump= kernel parameter, and disable device dump by default. User can enable it only if device dump data is required for debugging, and have the chance to increase the kdump reserved memory accordingly before device dump fails kdump. Signed-off-by: Kairui Song --- Update from V1: - Use bool parameter to turn it on/off instead of letting user give the size limit. Size of device dump is hard to determine. Documentation/admin-guide/kernel-parameters.txt | 15 +++ fs/proc/vmcore.c| 13 + 2 files changed, 28 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 43176340c73d..2d48e39fd080 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5062,6 +5062,21 @@ decrease the size and leave more room for directly mapped kernel RAM. + vmcore_device_dump= + [VMCORE] + Format: {"off" | "on"} + If CONFIG_PROC_VMCORE_DEVICE_DUMP is set, + this parameter allows enable or disable device dump + for vmcore. + Device dump allows drivers to append dump data to + vmcore so you can collect driver specified debug info. + Note that the drivers could append the data without + any limit, and the data is stored in memory, this may + bring a significant memory stress. If you want to turn + on this option, make sure you have reserved enough memory + with crashkernel= parameter. 
+ default: off + vmcp_cma=nn[MG] [KNL,S390] Sets the memory size reserved for contiguous memory allocations for the vmcp device driver. diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 3fe90443c1bb..d1b608b0efad 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -53,6 +53,8 @@ static struct proc_dir_entry *proc_vmcore; /* Device Dump list and mutex to synchronize access to list */ static LIST_HEAD(vmcoredd_list); static DEFINE_MUTEX(vmcoredd_mutex); + +static bool vmcoredd_enabled; #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ /* Device Dump Size */ @@ -1451,6 +1453,11 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) size_t data_size; int ret; + if (!vmcoredd_enabled) { + pr_err_once("Device dump is disabled\n"); + return -EINVAL; + } + if (!data || !strlen(data->dump_name) || !data->vmcoredd_callback || !data->size) return -EINVAL; @@ -1502,6 +1509,12 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) return ret; } EXPORT_SYMBOL(vmcore_add_device_dump); + +static int __init vmcoredd_parse_cmdline(char *arg) +{ + return kstrtobool(arg, &vmcoredd_enabled); +} +__setup("vmcore_device_dump=", vmcoredd_parse_cmdline); #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ /* Free all dumps in vmcore device dump list */ -- 2.21.0
Re: [RFC PATCH] vmcore: Add a kernel cmdline device_dump_limit
On Fri, May 10, 2019 at 7:17 PM Bhupesh Sharma wrote: > > Hi Kairui, > > Thanks for the patch. Please see my comments in-line: > > On 05/10/2019 03:50 PM, Kairui Song wrote: > > Device dump allow drivers to add device related dump data to vmcore as > > they want. This have a potential issue, the data is stored in memory, > > drivers may append too much data and use too much memory. The vmcore is > > typically used in a kdump kernel which runs in a pre-reserved small > > chunk of memory. So as a result it will make kdump unusable at all due > > to OOM issues. > > > > So introduce new device_dump_limit= kernel parameter, and set the > > default limit to 0, so device dump is not enabled unless user specify > > the accetable maxiam > > acceptable maximum Will fix this typo. > > > memory usage for device dump data. In this way user > > will also have the chance to adjust the kdump reserved memory > > accordingly. > > Hmmm., this doesn't give much confidence with the > PROC_VMCORE_DEVICE_DUMP feature in its current shape. Rather shouldn't > we be enabling config PROC_VMCORE_DEVICE_DUMP only under EXPERT mode for > now, considering that this feature needs further thrashing and testing > with real setups including platforms where drivers append large amounts > of data to vmcore: I think no need to move it to expert mode, just leave it disabled by default should be better, that should be enough to make sure driver won't append that much memory and cause OOM, while it could still be enabled without changing the kernel, so this feature won't bring extra risk, and could be enabled anytime easily. > > diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig > index 817c02b13b1d..c47a12cf7fc0 100644 > --- a/fs/proc/Kconfig > +++ b/fs/proc/Kconfig > @@ -45,7 +45,7 @@ config PROC_VMCORE > Exports the dump image of crashed kernel in ELF format. 
> > config PROC_VMCORE_DEVICE_DUMP > - bool "Device Hardware/Firmware Log Collection" > + bool "Device Hardware/Firmware Log Collection" if EXPERT > depends on PROC_VMCORE > default n > help > @@ -59,6 +59,12 @@ config PROC_VMCORE_DEVICE_DUMP >If you say Y here, the collected device dumps will be added >as ELF notes to /proc/vmcore. > > + Considering that there can be device drivers which append > + large amounts of data to vmcore, you should say N here unless > + you are reserving a large chunk of memory for crashdump > + kernel, because otherwise the crashdump kernel might become > + unusable due to OOM issues. > + > > May be you can add a 'Fixes:' tag here. Problem is previous commit seems not broken, just bring extra memory stress. Is "Fixes:" tag suitable for this commit? > > > Signed-off-by: Kairui Song > > --- > > fs/proc/vmcore.c | 20 > > 1 file changed, 20 insertions(+) > > > > diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c > > index 3fe90443c1bb..e28695ef2439 100644 > > --- a/fs/proc/vmcore.c > > +++ b/fs/proc/vmcore.c > > @@ -53,6 +53,9 @@ static struct proc_dir_entry *proc_vmcore; > > /* Device Dump list and mutex to synchronize access to list */ > > static LIST_HEAD(vmcoredd_list); > > static DEFINE_MUTEX(vmcoredd_mutex); > > + > > +/* Device Dump Limit */ > > +static size_t vmcoredd_limit; > > #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ > > > > /* Device Dump Size */ > > @@ -1465,6 +1468,11 @@ int vmcore_add_device_dump(struct vmcoredd_data > > *data) > > data_size = roundup(sizeof(struct vmcoredd_header) + data->size, > > PAGE_SIZE); > > > > + if (vmcoredd_orig_sz + data_size >= vmcoredd_limit) { > > + ret = -ENOMEM; > > Should we be adding a WARN() here to let the user know that the device > dump data will not be available in vmcore? Yes, that could be very helpful. How about pr_err_once? 
WARN is too noise, just give a hint to the user that device dump is disabled should be enough, so user will know why device dump data is not present and will just enable it. > > > + goto out_err; > > + } > > + > > /* Allocate buffer for driver's to write their dumps */ > > buf = vmcore_alloc_buf(data_size); > > if (!buf) { > > @@ -1502,6 +1510,18 @@ int vmcore_add_device_dump(struct vmcoredd_data > > *data) > > return ret; > > } > > EXPORT_SYMBOL(vmcore_add_device_dump); > > + > > +static int __init parse_vmc
Re: [PATCH v6 1/2] x86/kexec: Build identity mapping for EFI systab and ACPI tables
00 > [0.696330][T1] FS: () GS:8c6bd600() > knlGS: > [0.697330][T1] CS: 0010 DS: ES: CR0: 80050033 > [0.698330][T1] CR2: 8c6bde5ff000 CR3: 00015700e001 CR4: > 000606f0 > [0.699334][T1] Kernel panic - not syncing: Attempted to kill init! > exitcode=0x000b > [0.700328][ T1] ---[ end Kernel panic - not syncing: Attempted to kill > init! exitcode=0x000b ]--- > > Thanks > Dave I can confirm as I got same result on my T420. next-merge-window branch fails both normal boot and kexec... I didn't manage to get a working serial console, but the behavior is the same so should be the same issue. Also after "git cherry-pick de01951c8d40^..next-merge-window" on master branch, it worked well, so the patch should be good. -- Best Regards, Kairui Song
[PATCH v2 4/4] crashdump/x86: Use new introduced helper for getting RSDP
Use the new introduced helper for getting RSDP, this ensures RSDP is always accessible and avoid code duplication. Signed-off-by: Kairui Song --- kexec/arch/i386/crashdump-x86.c | 34 + 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/kexec/arch/i386/crashdump-x86.c b/kexec/arch/i386/crashdump-x86.c index 140f45b..a2aea31 100644 --- a/kexec/arch/i386/crashdump-x86.c +++ b/kexec/arch/i386/crashdump-x86.c @@ -787,35 +787,19 @@ static int sysfs_efi_runtime_map_exist(void) /* Appends 'acpi_rsdp=' commandline for efi boot crash dump */ static void cmdline_add_efi(char *cmdline) { - FILE *fp; - int cmdlen, len; - char line[MAX_LINE], *s; - const char *acpis = " acpi_rsdp="; + uint64_t acpi_rsdp; + char acpi_rsdp_buf[MAX_LINE]; - fp = fopen("/sys/firmware/efi/systab", "r"); - if (!fp) - return; + acpi_rsdp = get_acpi_rsdp(); - while(fgets(line, sizeof(line), fp) != 0) { - /* ACPI20= always goes before ACPI= */ - if ((strstr(line, "ACPI20=")) || (strstr(line, "ACPI="))) { - line[strlen(line) - 1] = '\0'; - s = strchr(line, '='); - s += 1; - len = strlen(s) + strlen(acpis); - cmdlen = strlen(cmdline) + len; - if (cmdlen > (COMMAND_LINE_SIZE - 1)) - die("Command line overflow\n"); - strcat(cmdline, acpis); - strcat(cmdline, s); - dbgprintf("Command line after adding efi\n"); - dbgprintf("%s\n", cmdline); + if (!acpi_rsdp) + return; - break; - } - } + sprintf(acpi_rsdp_buf, " acpi_rsdp=0x%lx", acpi_rsdp); + if (strlen(cmdline) + strlen(acpi_rsdp_buf) > (COMMAND_LINE_SIZE - 1)) + die("Command line overflow\n"); - fclose(fp); + strcat(cmdline, acpi_rsdp_buf); } static void get_backup_area(struct kexec_info *info, -- 2.20.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v2 3/4] x86: Always try to fill acpi_rsdp_addr in boot params
Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address from boot params if available"), the kernel accepts an acpi_rsdp_addr param in boot_params. So fill in this parameter unconditionally, to ensure the second kernel always gets the right RSDP address consistently and boots well on EFI systems even with EFI services disabled. Users no longer need to change the kernel cmdline to work around the missing RSDP issue. For older kernel versions (before 5.0), there won't be any change of behavior. Signed-off-by: Kairui Song --- kexec/arch/i386/x86-linux-setup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kexec/arch/i386/x86-linux-setup.c b/kexec/arch/i386/x86-linux-setup.c index 5ca7c25..5b00b42 100644 --- a/kexec/arch/i386/x86-linux-setup.c +++ b/kexec/arch/i386/x86-linux-setup.c @@ -901,4 +901,7 @@ void setup_linux_system_parameters(struct kexec_info *info, /* fill the EDD information */ setup_edd_info(real_mode); + + /* Always try to fill acpi_rsdp_addr */ + real_mode->acpi_rsdp_addr = get_acpi_rsdp(); } -- 2.20.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v2 1/4] x86: Update boot parameters definition
Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address from boot params if available"), the kernel accepts an acpi_rsdp_addr param in boot_params. Sync the x86_linux_param_header to support this param. Signed-off-by: Kairui Song --- include/x86/x86-linux.h | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/x86/x86-linux.h b/include/x86/x86-linux.h index 352ea02..a5d8df8 100644 --- a/include/x86/x86-linux.h +++ b/include/x86/x86-linux.h @@ -45,8 +45,7 @@ struct apm_bios_info { uint16_t cseg_len; /* 0x4e */ uint16_t cseg_16_len; /* 0x50 */ uint16_t dseg_len; /* 0x52 */ - uint8_t reserved[44]; /* 0x54 */ -}; +} __attribute__((packed)); /* * EDD stuff @@ -113,12 +112,15 @@ struct x86_linux_param_header { uint8_t reserved4[2]; /* 0x3e -- 0x3f reserved for future expansion */ struct apm_bios_info apm_bios_info; /* 0x40 */ + uint8_t reserved4_1[28]; /* 0x54 */ + uint64_t acpi_rsdp_addr;/* 0x70 */ + uint8_t reserved4_2[8];/* 0x78 */ struct drive_info_struct drive_info;/* 0x80 */ struct sys_desc_table sys_desc_table; /* 0xa0 */ uint32_t ext_ramdisk_image; /* 0xc0 */ uint32_t ext_ramdisk_size; /* 0xc4 */ uint32_t ext_cmd_line_ptr; /* 0xc8 */ - uint8_t reserved4_1[0x1c0 - 0xcc]; /* 0xe4 */ + uint8_t reserved4_3[0x1c0 - 0xcc]; /* 0xe4 */ uint8_t efi_info[32]; /* 0x1c0 */ uint32_t alt_mem_k; /* 0x1e0 */ uint8_t reserved5[4]; /* 0x1e4 */ -- 2.20.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v2 0/4] x86: Always try to fill acpi_rsdp_addr in boot params
This patch series syncs the behavior of user space kexec and kexec_file_load: both will fill boot_params.acpi_rsdp_addr with a valid RSDP value, to make sure the second kernel can always get the RSDP consistently. This makes it effortless to boot newer kernel versions (5.0+) without specifying acpi_rsdp= on the cmdline on EFI systems, even with EFI services disabled. It should not change any behavior with older kernels. Update from V1: - Split into multiple patches for a cleaner structure; content is not changed. Kairui Song (4): x86: Update boot parameters definition x86: Introduce helpers for getting RSDP address x86: Always try to fill acpi_rsdp_addr in boot params crashdump/x86: Use new introduce helper for getting RSDP include/x86/x86-linux.h| 8 ++-- kexec/arch/i386/crashdump-x86.c| 34 + kexec/arch/i386/kexec-x86-common.c | 60 ++ kexec/arch/i386/kexec-x86.h| 1 + kexec/arch/i386/x86-linux-setup.c | 6 ++- kexec/arch/i386/x86-linux-setup.h | 1 + 6 files changed, 80 insertions(+), 30 deletions(-) -- 2.20.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v2 2/4] x86: Introduce helpers for getting RSDP address
On x86, the RSDP is fundamental for booting the machine. When the second kernel is incapable of finding the RSDP address (e.g. kexec-ing the next kernel on an EFI system with EFI services disabled), kexec should prepare the RSDP address for the second kernel. Introduce helpers for getting the RSDP from multiple sources, including boot params, cmdline and EFI firmware. For the legacy BIOS interface, there is no better way to find the RSDP address than scanning the memory region and searching for it, and this will always be done by the kernel as a fallback, so there is no need to try to get the RSDP address for that case. Signed-off-by: Kairui Song --- kexec/arch/i386/kexec-x86-common.c | 60 ++ kexec/arch/i386/kexec-x86.h| 1 + kexec/arch/i386/x86-linux-setup.c | 3 +- kexec/arch/i386/x86-linux-setup.h | 1 + 4 files changed, 63 insertions(+), 2 deletions(-) diff --git a/kexec/arch/i386/kexec-x86-common.c b/kexec/arch/i386/kexec-x86-common.c index de99758..4b8eb26 100644 --- a/kexec/arch/i386/kexec-x86-common.c +++ b/kexec/arch/i386/kexec-x86-common.c @@ -39,6 +39,7 @@ #include "../../firmware_memmap.h" #include "../../crashdump.h" #include "kexec-x86.h" +#include "x86-linux-setup.h" #include "../../kexec-xen.h" /* Used below but not present in (older?) 
xenctrl.h */ @@ -392,4 +393,63 @@ int get_memory_ranges(struct memory_range **range, int *ranges, return ret; } +static uint64_t cmdline_get_acpi_rsdp(void) { + uint64_t acpi_rsdp = 0; + char *tmp_cmdline, *rsdp_param; + tmp_cmdline = get_command_line(); + rsdp_param = strstr(tmp_cmdline, "acpi_rsdp="); + + if (rsdp_param) + sscanf(rsdp_param, "acpi_rsdp=%lx", &acpi_rsdp); + + free(tmp_cmdline); + return acpi_rsdp; +} + +static uint64_t bootparam_get_acpi_rsdp(void) { + uint64_t acpi_rsdp = 0; + off_t offset = offsetof(struct x86_linux_param_header, acpi_rsdp_addr); + + if (get_bootparam(&acpi_rsdp, offset, sizeof(acpi_rsdp))) + return 0; + + return acpi_rsdp; +} + +static uint64_t efi_get_acpi_rsdp(void) { + FILE *fp; + char line[MAX_LINE], *s; + uint64_t acpi_rsdp = 0; + + fp = fopen("/sys/firmware/efi/systab", "r"); + if (!fp) + return acpi_rsdp; + + while(fgets(line, sizeof(line), fp) != 0) { + /* ACPI20= always goes before ACPI= */ + if ((strstr(line, "ACPI20=")) || (strstr(line, "ACPI="))) { + s = strchr(line, '=') + 1; + sscanf(s, "0x%lx", &acpi_rsdp); + break; + } + } + fclose(fp); + + return acpi_rsdp; +} + +uint64_t get_acpi_rsdp(void) +{ + uint64_t acpi_rsdp = 0; + + acpi_rsdp = cmdline_get_acpi_rsdp(); + + if (!acpi_rsdp) + acpi_rsdp = bootparam_get_acpi_rsdp(); + + if (!acpi_rsdp) + acpi_rsdp = efi_get_acpi_rsdp(); + + return acpi_rsdp; +} diff --git a/kexec/arch/i386/kexec-x86.h b/kexec/arch/i386/kexec-x86.h index c2bcd37..1b58c3b 100644 --- a/kexec/arch/i386/kexec-x86.h +++ b/kexec/arch/i386/kexec-x86.h @@ -86,4 +86,5 @@ int nbi_load(int argc, char **argv, const char *buf, off_t len, void nbi_usage(void); extern unsigned xen_e820_to_kexec_type(uint32_t type); +extern uint64_t get_acpi_rsdp(void); #endif /* KEXEC_X86_H */ diff --git a/kexec/arch/i386/x86-linux-setup.c b/kexec/arch/i386/x86-linux-setup.c index 8fad115..5ca7c25 100644 --- a/kexec/arch/i386/x86-linux-setup.c +++ b/kexec/arch/i386/x86-linux-setup.c @@ -123,7 +123,6 @@ void 
setup_linux_bootloader_parameters_high( cmdline_ptr[cmdline_len - 1] = '\0'; } -static int get_bootparam(void *buf, off_t offset, size_t size); static int setup_linux_vesafb(struct x86_linux_param_header *real_mode) { struct fb_fix_screeninfo fix; @@ -452,7 +451,7 @@ char *find_mnt_by_fsname(char *fsname) return mntdir; } -static int get_bootparam(void *buf, off_t offset, size_t size) +int get_bootparam(void *buf, off_t offset, size_t size) { int data_file; char *debugfs_mnt, *sysfs_mnt; diff --git a/kexec/arch/i386/x86-linux-setup.h b/kexec/arch/i386/x86-linux-setup.h index f5d23d3..0c651e5 100644 --- a/kexec/arch/i386/x86-linux-setup.h +++ b/kexec/arch/i386/x86-linux-setup.h @@ -21,6 +21,7 @@ static inline void setup_linux_bootloader_parameters( } void setup_linux_system_parameters(struct kexec_info *info, struct x86_linux_param_header *real_mode); +int get_bootparam(void *buf, off_t offset, size_t size); #define SETUP_BASE0x9 -- 2.20.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] kexec/x86: Unconditionally add the acpi_rsdp command line
On Fri, Mar 15, 2019 at 5:36 PM Lianbo Jiang wrote: > > The Linux kernel commit 3a63f70bf4c3 introduces the early parsing > of the RSDP. This means that boot loader must either set the > boot_params.acpi_rsdp_addr or pass a command line 'acpi_rsdp=xxx' > to tell the RDSP physical address. > > Currently, kexec neither sets the boot_params.acpi_rsdp or passes > acpi_rsdp command line if it sees the first kernel support efi > runtime. This is causing the second kernel boot failure. > The EFI runtime is not available so early in the boot process so > unconditionally pass the 'acpi_rsdp=xxx' to the second kernel. > > Signed-off-by: Lianbo Jiang > Signed-off-by: Brijesh Singh > --- > kexec/arch/i386/crashdump-x86.c | 17 + > 1 file changed, 1 insertion(+), 16 deletions(-) > > diff --git a/kexec/arch/i386/crashdump-x86.c b/kexec/arch/i386/crashdump-x86.c > index 140f45b..a29b15b 100644 > --- a/kexec/arch/i386/crashdump-x86.c > +++ b/kexec/arch/i386/crashdump-x86.c > @@ -35,7 +35,6 @@ > #include > #include > #include > -#include > #include "../../kexec.h" > #include "../../kexec-elf.h" > #include "../../kexec-syscall.h" > @@ -772,18 +771,6 @@ static enum coretype get_core_type(struct crash_elf_info > *elf_info, > } > } > > -static int sysfs_efi_runtime_map_exist(void) > -{ > - DIR *dir; > - > - dir = opendir("/sys/firmware/efi/runtime-map"); > - if (!dir) > - return 0; > - > - closedir(dir); > - return 1; > -} > - > /* Appends 'acpi_rsdp=' commandline for efi boot crash dump */ > static void cmdline_add_efi(char *cmdline) > { > @@ -978,9 +965,7 @@ int load_crashdump_segments(struct kexec_info *info, > char* mod_cmdline, > dbgprintf("Created elf header segment at 0x%lx\n", elfcorehdr); > if (delete_memmap(memmap_p, &nr_memmap, elfcorehdr, memsz) < 0) > return -1; > - if (!bzImage_support_efi_boot || arch_options.noefi || > - !sysfs_efi_runtime_map_exist()) > - cmdline_add_efi(mod_cmdline); > + cmdline_add_efi(mod_cmdline); > cmdline_add_elfcorehdr(mod_cmdline, elfcorehdr); > 
> /* Inform second kernel about the presence of ACPI tables. */ > -- > 2.17.1 > > > ___ > kexec mailing list > kexec@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/kexec Hi Lianbo, I've sent another patch similiar to yours: [PATCH] x86: Always try to fill acpi_rsdp_addr in boot params I'll update V2 and your use case should also be covered in that patch, as we have talked in IRC previously, thanks! -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC PATCH] vmcore: Add a kernel cmdline device_dump_limit
On Mon, May 13, 2019 at 9:52 AM Dave Young wrote: > > On 05/10/19 at 06:20pm, Kairui Song wrote: > > Device dump allow drivers to add device related dump data to vmcore as > > they want. This have a potential issue, the data is stored in memory, > > drivers may append too much data and use too much memory. The vmcore is > > typically used in a kdump kernel which runs in a pre-reserved small > > chunk of memory. So as a result it will make kdump unusable at all due > > to OOM issues. > > > > So introduce new device_dump_limit= kernel parameter, and set the > > default limit to 0, so device dump is not enabled unless user specify > > the accetable maxiam memory usage for device dump data. In this way user > > will also have the chance to adjust the kdump reserved memory > > accordingly. > > The device dump is only affective in kdump 2nd kernel, so add the > limitation seems not useful. One is hard to know the correct size > unless one does some crash test. If one did the test and want to eanble > the device dump he needs increase crashkernel= size in 1st kernel and > add the limit param in 2nd kernel. > > So a global on/off param sounds easier and better, something like > vmcore_device_dump=on (default is off) Yes, on/off could be another way to solve this issue. The size limit could bring more flexibility (if device dump is not asking for too much memory then it would just work), but it does bring extra complexity. Considering it's actually hard to know how much memory is needed for the device dump drivers to work, I'll update the patch to use an on/off cmdline option then. 
> > > > > Signed-off-by: Kairui Song > > --- > > fs/proc/vmcore.c | 20 > > 1 file changed, 20 insertions(+) > > > > diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c > > index 3fe90443c1bb..e28695ef2439 100644 > > --- a/fs/proc/vmcore.c > > +++ b/fs/proc/vmcore.c > > @@ -53,6 +53,9 @@ static struct proc_dir_entry *proc_vmcore; > > /* Device Dump list and mutex to synchronize access to list */ > > static LIST_HEAD(vmcoredd_list); > > static DEFINE_MUTEX(vmcoredd_mutex); > > + > > +/* Device Dump Limit */ > > +static size_t vmcoredd_limit; > > #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ > > > > /* Device Dump Size */ > > @@ -1465,6 +1468,11 @@ int vmcore_add_device_dump(struct vmcoredd_data > > *data) > > data_size = roundup(sizeof(struct vmcoredd_header) + data->size, > > PAGE_SIZE); > > > > + if (vmcoredd_orig_sz + data_size >= vmcoredd_limit) { > > + ret = -ENOMEM; > > + goto out_err; > > + } > > + > > /* Allocate buffer for driver's to write their dumps */ > > buf = vmcore_alloc_buf(data_size); > > if (!buf) { > > @@ -1502,6 +1510,18 @@ int vmcore_add_device_dump(struct vmcoredd_data > > *data) > > return ret; > > } > > EXPORT_SYMBOL(vmcore_add_device_dump); > > + > > +static int __init parse_vmcoredd_limit(char *arg) > > +{ > > + char *end; > > + > > + if (!arg) > > + return -EINVAL; > > + vmcoredd_limit = memparse(arg, &end); > > + return end > arg ? 0 : -EINVAL; > > + > > +} > > +__setup("device_dump_limit=", parse_vmcoredd_limit); > > #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ > > > > /* Free all dumps in vmcore device dump list */ > > -- > > 2.20.1 > > > > Thanks > Dave -- Best Regards, Kairui Song
[RFC PATCH] vmcore: Add a kernel cmdline device_dump_limit
Device dump allows drivers to add device-related dump data to the vmcore as they want. This has a potential issue: the data is stored in memory, so drivers may append too much data and use too much memory. The vmcore is typically used in a kdump kernel which runs in a small pre-reserved chunk of memory, so as a result this can make kdump completely unusable due to OOM issues. So introduce a new device_dump_limit= kernel parameter, and set the default limit to 0, so device dump is not enabled unless the user specifies the acceptable maximum memory usage for device dump data. This way the user also has the chance to adjust the kdump reserved memory accordingly. Signed-off-by: Kairui Song --- fs/proc/vmcore.c | 20 1 file changed, 20 insertions(+) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 3fe90443c1bb..e28695ef2439 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -53,6 +53,9 @@ static struct proc_dir_entry *proc_vmcore; /* Device Dump list and mutex to synchronize access to list */ static LIST_HEAD(vmcoredd_list); static DEFINE_MUTEX(vmcoredd_mutex); + +/* Device Dump Limit */ +static size_t vmcoredd_limit; #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ /* Device Dump Size */ @@ -1465,6 +1468,11 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) data_size = roundup(sizeof(struct vmcoredd_header) + data->size, PAGE_SIZE); + if (vmcoredd_orig_sz + data_size >= vmcoredd_limit) { + ret = -ENOMEM; + goto out_err; + } + /* Allocate buffer for driver's to write their dumps */ buf = vmcore_alloc_buf(data_size); if (!buf) { @@ -1502,6 +1510,18 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) return ret; } EXPORT_SYMBOL(vmcore_add_device_dump); + +static int __init parse_vmcoredd_limit(char *arg) +{ + char *end; + + if (!arg) + return -EINVAL; + vmcoredd_limit = memparse(arg, &end); + return end > arg ? 
0 : -EINVAL; + +} +__setup("device_dump_limit=", parse_vmcoredd_limit); #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ /* Free all dumps in vmcore device dump list */ -- 2.20.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] x86/kexec: always ensure EFI systab region is mapped
On Wed, Apr 24, 2019, 03:46 Baoquan He wrote: > > On 04/24/19 at 02:18pm, Dave Young wrote: > > On 04/24/19 at 01:41pm, Baoquan He wrote: > > > On 04/24/19 at 02:47am, Junichi Nomura wrote: > > > > On 4/24/19 2:15 AM, Kairui Song wrote: > > > > > On Mon, Apr 22, 2019 at 11:21 PM Junichi Nomura > > > > > wrote: > > > > >> Is the mapping of ACPI tables just by luck, too? > > > > > > > > > > Good question, they should have same issue with systab, I ignored > > > > > this one. > > > > > Then in first kernel when doing kexec it should ensure both ACPI > > > > > tables and the EFI systab are mapped, that should cover everything and > > > > > make it work. > > > > > > > > Right. > > > > > > > > > Is there anything else missing? > > > > No, as far as I looked around get_rsdp_addr(). > > > > > > Have made a draft patch to build ident mapping for ACPI tables too, it's > > > based on Kairui's patch. Dave has tested on his t400s laptop, and > > > passed. Please check if this adding is OK. > > > > > > Kairui, you can add this into your patch to make a new one and resend. > > > Or I can combine them and send for you today. > > > > > Since I can not reproduce the acpi table accessing fault with Kairui's > > patch, > > the test is just sanity testing on same hardware. But the patch looks > > good. > > Yes, usually vendor will put these efi systab, ACPI tables together. See > the regions you listed on your t420 laptop in another mail: > da99f000 - dae9efff Reserved (efi systab fall in this region) > daf9f000 - daffefff ACPI tables > > We build 1:1 mapping for kexec kernel down to PMD level. Means for a > region, it will align starting address down to PMD size, and align end > address up to PMD size. So the end of efi systab, 0xdae9efff, will cause > mapping built for the 2MB area, 0xdae0-0xdaf0. Clearly ACPI > tables are covered by that PMD entry. That's why only efi systab > mapping is built, accessing ACPI tables doesn't cause error. 
> > But we can't assume they will be put together always, so need map ACPI > tables too. > > > > > With Kairui's fix+ this acpi fix and Junichi's patch everything works. > > Can anyone send them for example patch 1/2: kexec early mapping for > > efi/acpi, patch 2/2: Junichi's previous patch. > > Kairui is having a workshop in the US, I can make a patchset to > include these two patches. > > For patch 1/2, I will combine the patch Kairui posted and my draft patch, > Kairui is the author certainly, since he debugged and found out the root > cause, and posted v1 when I was on vacation last week. > > For patch 2/2, I think the version Boris organized is good. > http://lkml.kernel.org/r/20190416095209.gg27...@zn.tnic > Thanks a lot Bao! I was offline for about 1 day due to timezone and flight, I have no problem with this and the ACPI mapping part looks good to me. ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] x86/kexec: always ensure EFI systab region is mapped
On Mon, Apr 22, 2019 at 11:21 PM Junichi Nomura wrote: > > On 4/22/19 6:28 PM, Kairui Song wrote: > > The reason is the systab region is not mapped by the identity mapping > > provided by kexec. Currently kexec only create identity mapping for > > mem regions, wihch won't cover the systab. So second kernel will be > > accessing a not mapped memory region and cause fault. > > But as kexec tend to pad the map region up to PUD size, the > > systab could be included in the map by accident, so it worked on > > some machines, but that will be broken easily and unstable. > > Is the mapping of ACPI tables just by luck, too? > Good question, they should have same issue with systab, I ignored this one. Then in first kernel when doing kexec it should ensure both ACPI tables and the EFI systab are mapped, that should cover everything and make it work. Is there anything else missing? -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH] x86/kexec: always ensure EFI systab region is mapped
This is a fix needed for "x86/boot: Use efi_setup_data for searching RSDP on kexec-ed kernels"; that patch causes kexec to reset the system on some machines. The reason is that the systab region is not mapped by the identity mapping provided by kexec. Currently kexec only creates identity mappings for mem regions, which won't cover the systab. So the second kernel will access an unmapped memory region and fault. But as kexec tends to pad the mapped region up to PUD size, the systab could be included in the map by accident, so it worked on some machines; that is fragile and can break easily. To fix it, treat the systab specially: always map the systab region on EFI systems as long as there is a valid systab address. Signed-off-by: Kairui Song --- arch/x86/kernel/machine_kexec_64.c | 40 ++ 1 file changed, 40 insertions(+) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index ceba408ea982..d5da54893f97 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -113,6 +114,37 @@ static void *alloc_pgt_page(void *data) return p; } +#ifdef CONFIG_EFI +static int init_efi_systab_pgtable(struct x86_mapping_info *info, + pgd_t *level4p) +{ + unsigned long mstart, mend; + + if (!efi_enabled(EFI_BOOT)) + return 0; + + mstart = (boot_params.efi_info.efi_systab | + ((u64)boot_params.efi_info.efi_systab_hi<<32)); + + if (efi_enabled(EFI_64BIT)) + mend = mstart + sizeof(efi_system_table_64_t); + else + mend = mstart + sizeof(efi_system_table_32_t); + + if (mstart) + return kernel_ident_mapping_init(info, + level4p, mstart, mend); + + return 0; +} +#else +static inline int init_efi_systab_pgtable(struct x86_mapping_info *info, + pgd_t *level4p) +{ + return 0; +} +#endif + static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { struct x86_mapping_info info = { @@ -159,6 +191,14 @@ static int 
init_pgtable(struct kimage *image, unsigned long start_pgtable) return result; } + /* +* Prepare EFI systab mapping for kexec kernel, systab is not +* covered by pfn_mapped. +*/ + result = init_efi_systab_pgtable(&info, level4p); + if (result) + return result; + return init_transition_pgtable(image, level4p); } -- 2.20.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC PATCH] kexec, x86/boot: map systab region in identity mapping before accessing it
On Fri, Apr 19, 2019 at 7:34 PM Borislav Petkov wrote: > > On Fri, Apr 19, 2019 at 07:20:06PM +0800, Kairui Song wrote: > > Thanks for the declaration Bao, I can verify on the machine I have, > > the issue still exist without kaslr. Currently, we read rsdp in early > > code and fill in boot_params unconditional, so it will read from the > > systab anyway. > > Yes, and in the future, info required by the kexec'ed kernel - like the > EFI systab address or even whether the kernel has been kexec'ed or comes > from cold boot - should be passed in boot_params. So that we don't have > to do all that ugly dancing in early code. > > > Yes, kexec only cover RAM in the ident map it prepared for second > > kernel, but the systab could be in reserved region, so if it didn't > > fall into the 1G padding by accident it will fail when reading from > > it. Fix in early code could make sure 2nd kernel always work. Or > > should we treat it specially in kexec mapping prepare code? > > Yes, we should. As I said, this is not early boot code's problem but the > kexec setup code's problem. > > If the new kernel cannot get RSDP that early, then it should fail the > same way it failed before. That early RDSP parsing was added for the > movable regions thing working with KASLR. > > If it can't get a RDSP for whatever reason, then if KASLR selects > a region overlapping with the movable regions, then it is the old > behavior. > > Ok? > OK. And then fix the mapping issue in 1st kernel is the right way, I'll skip the update for the early code mapping thing. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC PATCH] kexec, x86/boot: map systab region in identity mapping before accessing it
On Fri, Apr 19, 2019 at 6:50 PM Baoquan He wrote: > > On 04/19/19 at 12:17pm, Borislav Petkov wrote: > > Breaking thread because this one got too big. > > > > On Fri, Apr 19, 2019 at 04:34:58PM +0800, Kairui Song wrote: > > > There are two approach to fix it, detect if the systab is mapped, and > > > avoid reading it if not. > > > > Ok, so tglx and I discussed this situation which is slowly getting out > > of hand with all the tinkering. > > > > So, here's what we should do - scream loudly now if some of this doesn't > > make any sense. > > > > 1. Junichi's patch should get the systab check above added and sent to > > 5.1 so that at least some EFI kexecing can work with 5.1 > > Talked with Kairui privately just now. Seems Junichi's patch need add > this systab mapping. Since the systab region is not mapped on some > machines. Those machine don't have this issue because they got systab > region luckily coverred by 1 GB page mapping in 1st kernel before > kexec jumping. > > This issue should happen whether it is KASLR kernel or not KASLR kernel. Thanks for the declaration Bao, I can verify on the machine I have, the issue still exist without kaslr. Currently, we read rsdp in early code and fill in boot_params unconditional, so it will read from the systab anyway. > > > > > 2. Then, the fact whether the kernel has been kexec'ed and which > > addresses it should use early, should all be passed through boot_params > > which is either setup by kexec(1) or by the first kernel itself, in the > > kexec_file_load() case. > > Seems no better way to check if it's kexec-ed kernel, except of the > setup data checking of kexec-ed kernel. > > It may happen in both kexec_load or kexec_file_load, since we build > ident mapping of kexec for RAM in 1st kernel. For kexec_file_load newer kernel will fill in the acpi_rsdp in boot_params so it bypassed the kexec_get_rsdp_addr (which will read from systab). 
The problem is not fixed, systab mapping still missing, but not likely to happen with kexec_file_load on newer kernel. > > > > > > the systab region is not mapped by the identity mapping provided by > > > kexec. > > > > 3. Then that needs to be fixed in the first kernel as it is a > > shortcoming of us starting to parse systab very early. It is the kexec > > setup code's problem not the early compressed stage's problem that the > > EFI systab is not mapped. > > Yeah, adding the systab mapping looks good. Kairui put it in > decompressing stage just because he wants to cover the case in which the > old kernel kexec jumping to 2nd kernel. Now it seems not very > reasonable, we also have the new kernel kexec jumping to old 1nd kernel. Yes, kexec only cover RAM in the ident map it prepared for second kernel, but the systab could be in reserved region, so if it didn't fall into the 1G padding by accident it will fail when reading from it. Fix in early code could make sure 2nd kernel always work. Or should we treat it specially in kexec mapping prepare code? > > Thanks > Baoquan -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC PATCH] kexec, x86/boot: map systab region in identity mapping before accessing it
On Fri, Apr 19, 2019 at 4:58 PM Baoquan He wrote: > > On 04/19/19 at 04:34pm, Kairui Song wrote: > > /* Locates and clears a region for a new top level page table. */ > > void initialize_identity_maps(void) > > { > > - /* If running as an SEV guest, the encryption mask is required. */ > > - set_sev_encryption_mask(); > > - > > - /* Exclude the encryption mask from __PHYSICAL_MASK */ > > - physical_mask &= ~sme_me_mask; > > - > > - /* Init mapping_info with run-time function/buffer pointers. */ > > - mapping_info.alloc_pgt_page = alloc_pgt_page; > > - mapping_info.context = &pgt_data; > > - mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; > > - mapping_info.kernpg_flag = _KERNPG_TABLE; > > - > > - /* > > - * It should be impossible for this not to already be true, > > - * but since calling this a second time would rewind the other > > - * counters, let's just make sure this is reset too. > > - */ > > - pgt_data.pgt_buf_offset = 0; > > - > > - /* > > - * If we came here via startup_32(), cr3 will be _pgtable already > > - * and we must append to the existing area instead of entirely > > - * overwriting it. > > - * > > - * With 5-level paging, we use '_pgtable' to allocate the p4d page > > table, > > - * the top-level page table is allocated separately. > > - * > > - * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level > > - * cases. On 4-level paging it's equal to 'top_level_pgt'. 
> > - */ > > - top_level_pgt = read_cr3_pa(); > > - if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) { > > - debug_putstr("booted via startup_32()\n"); > > - pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; > > - pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; > > - memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); > > - } else { > > - debug_putstr("booted via startup_64()\n"); > > - pgt_data.pgt_buf = _pgtable; > > - pgt_data.pgt_buf_size = BOOT_PGT_SIZE; > > - memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); > > + top_level_pgt = early_boot_top_pgt; > > + if ((p4d_t *)top_level_pgt != (p4d_t *)_pgtable) > > top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data); > > Kairui, will you make a patchset to include these changes separately > later on? I don't get the purposes of code changes. E.g here, I > don't know why you introduce a new variable early_boot_top_pgt, and > allocate the page table, even though they have been done in the old > initialize_identity_maps(). > > Thanks > Baoquan > OK, right, it's not a good idea to mess up things together, I'll resend the patch, and will sent the cleanup separately. Without clean up it may bring in some extra burden with certain kernel config, but that should be OK for the fix. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[RFC PATCH] kexec, x86/boot: map systab region in identity mapping before accessing it
The previous patch "x86/boot: Use efi_setup_data for searching RSDP on kexec-ed kernels" always resets some machines. This is a follow-up to that patch. The reason is that, by default, the systab region is not mapped by the identity mapping provided by kexec, so the kernel accesses an unmapped memory region and faults. But as kexec tends to pad the mapped region up to PUD or PMD size, the systab could be included in the map by accident, so it worked on some machines; that is fragile and can break easily. There are two approaches to fix it. One is to detect whether the systab is mapped, and avoid reading it if not. The other is to ensure the region is mapped, either by checking and mapping the systab in the first kernel before kexec, or by mapping the systab in early code before reading it. Mapping in the early code should cover every case (otherwise booting from an older kernel will also fail). This patch is a draft implementation of that: it adds a helper (add_identity_map_pgd) which can be used to add extra identity mappings at a very early stage, and calls it before reading the systab. There should be no need to unmap it, as the early page table will be discarded later. Some refactoring is included, which introduces a lot of changes: some page-table-related code is moved from kaslr_64.c to pgtable_64.c. If the approach goes well, I could prepare separate cleanup patches. 
Signed-off-by: Kairui Song --- arch/x86/boot/compressed/acpi.c | 5 + arch/x86/boot/compressed/kaslr_64.c | 109 + arch/x86/boot/compressed/misc.c | 2 + arch/x86/boot/compressed/pgtable.h| 11 +++ arch/x86/boot/compressed/pgtable_64.c | 131 +- arch/x86/include/asm/boot.h | 8 +- 6 files changed, 156 insertions(+), 110 deletions(-) diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 8cecce1ac0cd..a513b0f9bfda 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -2,6 +2,7 @@ #define BOOT_CTYPE_H #include "misc.h" #include "error.h" +#include "pgtable.h" #include "../string.h" #include @@ -134,6 +135,10 @@ static acpi_physical_address kexec_get_rsdp_addr(void) if (!systab) error("EFI system table not found in kexec boot_params."); + add_identity_map_pgd((unsigned long)systab, +(unsigned long)systab + sizeof(*systab), +early_boot_top_pgt); + return __efi_get_rsdp_addr((unsigned long)esd->tables, systab->nr_tables, true); } #else diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c index 748456c365f4..ec7093e192bf 100644 --- a/arch/x86/boot/compressed/kaslr_64.c +++ b/arch/x86/boot/compressed/kaslr_64.c @@ -8,121 +8,21 @@ * Copyright (C) 2016 Kees Cook */ -/* - * Since we're dealing with identity mappings, physical and virtual - * addresses are the same, so override these defines which are ultimately - * used by the headers in misc.h. - */ -#define __pa(x) ((unsigned long)(x)) -#define __va(x) ((void *)((unsigned long)(x))) - -/* No PAGE_TABLE_ISOLATION support needed either: */ -#undef CONFIG_PAGE_TABLE_ISOLATION - #include "misc.h" - -/* These actually do the work of building the kernel identity maps. */ -#include -#include -/* Use the static base for this part of the boot process */ -#undef __PAGE_OFFSET -#define __PAGE_OFFSET __PAGE_OFFSET_BASE -#include "../../mm/ident_map.c" +#include "pgtable.h" /* Used by pgtable.h asm code to force instruction serialization. 
*/ unsigned long __force_order; -/* Used to track our page table allocation area. */ -struct alloc_pgt_data { - unsigned char *pgt_buf; - unsigned long pgt_buf_size; - unsigned long pgt_buf_offset; -}; - -/* - * Allocates space for a page table entry, using struct alloc_pgt_data - * above. Besides the local callers, this is used as the allocation - * callback in mapping_info below. - */ -static void *alloc_pgt_page(void *context) -{ - struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; - unsigned char *entry; - - /* Validate there is space available for a new page. */ - if (pages->pgt_buf_offset >= pages->pgt_buf_size) { - debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); - debug_putaddr(pages->pgt_buf_offset); - debug_putaddr(pages->pgt_buf_size); - return NULL; - } - - entry = pages->pgt_buf + pages->pgt_buf_offset; - pages->pgt_buf_offset += PAGE_SIZE; - - return entry; -} - -/* Used to track our allocated page tables. */ -static struct alloc_pgt_data pgt_data; - /* The top level page table entry pointer. */ static unsigned long top_level_pgt; -phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; - -
Re: [PATCH] x86/boot: Use efi_setup_data for searching RSDP on kexec-ed kernels
On Wed, Apr 17, 2019 at 12:57 PM Dave Young wrote: > > On 04/17/19 at 09:38am, Dave Young wrote: > > On 04/16/19 at 03:22pm, Borislav Petkov wrote: > > > On Tue, Apr 16, 2019 at 07:41:33PM +0800, Dave Young wrote: > > > > On 04/16/19 at 11:52am, Borislav Petkov wrote: > > > > > I'll queue the below in the next days if there are no more complaints: > > > > > > > > As for the kexec breakage, even with the V3 patch, kexec still hangs on > > > > a Lenovo T420 laptop. Kairui also reproduced the problem. So can we > > > > wait a few days see if we can make some progress to find the cause? > > > > > > How is applying this patch going to change anything? > > > > > > I was told that the breakage is there even without it... > > > > Without this patch, the bug happens in the efi_get_rsdp.. function, this > > patch tries to fix that by adding kexec_get.. but the new introduced > > kexec_* function does not work on some laptops, so it is not a 100% good > > fix, I hoped we can get it working for all known issues. But if we can > > not do it eg. within one week we can go with this version and leave the > > laptop issue as a known issue. 
> > > > Latest debugging status: > > Kexec boot works with commenting out some code like below, so the guid > cmp (memcmp) caused a system reset), still need to find out why: > > diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c > index d9f9abd63c68..13e7a23ae94c 100644 > --- a/arch/x86/boot/compressed/acpi.c > +++ b/arch/x86/boot/compressed/acpi.c > @@ -95,10 +95,12 @@ __efi_get_rsdp_addr(unsigned long config_tables, unsigned > int nr_tables, > table = tbl->table; > } > > +/* > if (!(efi_guidcmp(guid, ACPI_TABLE_GUID))) > rsdp_addr = table; > else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID))) > return table; > +*/ > } > > return rsdp_addr; > @@ -291,9 +293,10 @@ acpi_physical_address get_rsdp_addr(void) > if (!pa) > pa = kexec_get_rsdp_addr(); > > +/* > if (!pa) > pa = efi_get_rsdp_addr(); > - > +*/ > if (!pa) > pa = bios_get_rsdp_addr(); > > Hi Dave, for this case I think it's just because GCC will found the loop does nothing, and optimize out the whole loop in __efi_get_rsdp_addr and will no longer read the actual nr_table value. I can fix the boot error on T420 with your patch, but if I add anything, like a hardcode value assignment with the right value for acpi_rsdp in the loop, it will reset the machine. But set acpi_rsdp with a right initial value out side the loop works fine. If the loop condition is false, then there should be no difference between just comment out the line you mentioned and add an assignment. Else it just assign the value multiple times, not very reasonable but shouldn't fail. And, I inspected the generated ASM code also suggest the same thing. So still, access the systab memory is the cause of the system reset on certain machines. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] x86: Always try to fill acpi_rsdp_addr in boot params
On Thu, Apr 4, 2019 at 3:25 PM Dave Young wrote: > > Hello Kairui > On 03/28/19 at 05:49pm, Kairui Song wrote: > > Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address > > from boot params if available"), kernel accept a acpi_rsdp_addr param in > > boot_params. Sync the x86_linux_param_header to support this param. > > > > And previously we are already appending 'acpi_rsdp=' command line only for > > loading crash kernel on EFI systems, it will be better to try to set the > > boot param for any kernel get loaded, to help the kernel finding the > > RSDP value more stably. Otherwise if the user decide to disable EFI > > service in second kernel, it will fail to boot. > > > > There is no better way to find the RSDP address from legacy BIOS > > interface rather than scanning the memory region and search for it, > > which will always be done by the kernel as a fallback, so we only > > look for RSDP in previous boot params, cmdline and EFI firmware. > > It would be good to always pass acpi_rsdp= kernel cmdline in case > efi=old_map. (or maybe efi=noruntime as well, but I did not remember > the behavior of noruntime now), no matter kexec or kdump.. > > And if you want, maybe fill the boot_params instead of passing cmdline > for new kernel which supports the new boot_param field. > > Split the patch to small patches would be better. > > Thanks > Dave Thanks for the review! I'll update in V2 accordingly. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] x86: Always try to fill acpi_rsdp_addr in boot params
On Thu, Mar 28, 2019 at 5:49 PM Kairui Song wrote: > > Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address > from boot params if available"), kernel accept a acpi_rsdp_addr param in > boot_params. Sync the x86_linux_param_header to support this param. > > And previously we are already appending 'acpi_rsdp=' command line only for > loading crash kernel on EFI systems, it will be better to try to set the > boot param for any kernel get loaded, to help the kernel finding the > RSDP value more stably. Otherwise if the user decide to disable EFI > service in second kernel, it will fail to boot. > > There is no better way to find the RSDP address from legacy BIOS > interface rather than scanning the memory region and search for it, > which will always be done by the kernel as a fallback, so we only > look for RSDP in previous boot params, cmdline and EFI firmware. > > Signed-off-by: Kairui Song > --- > include/x86/x86-linux.h| 8 ++-- > kexec/arch/i386/crashdump-x86.c| 34 + > kexec/arch/i386/kexec-x86-common.c | 60 ++ > kexec/arch/i386/kexec-x86.h| 1 + > kexec/arch/i386/x86-linux-setup.c | 6 ++- > kexec/arch/i386/x86-linux-setup.h | 1 + > 6 files changed, 80 insertions(+), 30 deletions(-) > > diff --git a/include/x86/x86-linux.h b/include/x86/x86-linux.h > index 352ea02..a5d8df8 100644 > --- a/include/x86/x86-linux.h > +++ b/include/x86/x86-linux.h > @@ -45,8 +45,7 @@ struct apm_bios_info { > uint16_t cseg_len; /* 0x4e */ > uint16_t cseg_16_len; /* 0x50 */ > uint16_t dseg_len; /* 0x52 */ > - uint8_t reserved[44]; /* 0x54 */ > -}; > +} __attribute__((packed)); > > /* > * EDD stuff > @@ -113,12 +112,15 @@ struct x86_linux_param_header { > uint8_t reserved4[2]; /* 0x3e -- 0x3f reserved for > future expansion */ > > struct apm_bios_info apm_bios_info; /* 0x40 */ > + uint8_t reserved4_1[28]; /* 0x54 */ > + uint64_t acpi_rsdp_addr;/* 0x70 */ > + uint8_t reserved4_2[8];/* 0x78 */ > struct drive_info_struct drive_info;/* 0x80 */ > struct sys_desc_table 
sys_desc_table; /* 0xa0 */ > uint32_t ext_ramdisk_image; /* 0xc0 */ > uint32_t ext_ramdisk_size; /* 0xc4 */ > uint32_t ext_cmd_line_ptr; /* 0xc8 */ > - uint8_t reserved4_1[0x1c0 - 0xcc]; /* 0xe4 */ > + uint8_t reserved4_3[0x1c0 - 0xcc]; /* 0xe4 */ > uint8_t efi_info[32]; /* 0x1c0 */ > uint32_t alt_mem_k; /* 0x1e0 */ > uint8_t reserved5[4]; /* 0x1e4 */ > diff --git a/kexec/arch/i386/crashdump-x86.c b/kexec/arch/i386/crashdump-x86.c > index 140f45b..262d157 100644 > --- a/kexec/arch/i386/crashdump-x86.c > +++ b/kexec/arch/i386/crashdump-x86.c > @@ -787,35 +787,19 @@ static int sysfs_efi_runtime_map_exist(void) > /* Appends 'acpi_rsdp=' commandline for efi boot crash dump */ > static void cmdline_add_efi(char *cmdline) > { > - FILE *fp; > - int cmdlen, len; > - char line[MAX_LINE], *s; > - const char *acpis = " acpi_rsdp="; > + int cmdlen; > + uint64_t acpi_rsdp; > > - fp = fopen("/sys/firmware/efi/systab", "r"); > - if (!fp) > - return; > + acpi_rsdp = get_acpi_rsdp(); > + cmdlen = strlen(cmdline); > > - while(fgets(line, sizeof(line), fp) != 0) { > - /* ACPI20= always goes before ACPI= */ > - if ((strstr(line, "ACPI20=")) || (strstr(line, "ACPI="))) { > - line[strlen(line) - 1] = '\0'; > - s = strchr(line, '='); > - s += 1; > - len = strlen(s) + strlen(acpis); > - cmdlen = strlen(cmdline) + len; > - if (cmdlen > (COMMAND_LINE_SIZE - 1)) > - die("Command line overflow\n"); > - strcat(cmdline, acpis); > - strcat(cmdline, s); > - dbgprintf("Command line after adding efi\n"); > - dbgprintf("%s\n", cmdline); > + if (!acpi_rsdp) > + return; > > - break; > - } > - } > + if (cmdlen + sizeof(" acpi_rsdp=0x") + 16 > (COMMAND_LINE_SIZE - 1)) > + die("Command
[PATCH] x86: Always try to fill acpi_rsdp_addr in boot params
Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address from boot params if available"), kernel accept a acpi_rsdp_addr param in boot_params. Sync the x86_linux_param_header to support this param. And previously we are already appending 'acpi_rsdp=' command line only for loading crash kernel on EFI systems, it will be better to try to set the boot param for any kernel get loaded, to help the kernel finding the RSDP value more stably. Otherwise if the user decide to disable EFI service in second kernel, it will fail to boot. There is no better way to find the RSDP address from legacy BIOS interface rather than scanning the memory region and search for it, which will always be done by the kernel as a fallback, so we only look for RSDP in previous boot params, cmdline and EFI firmware. Signed-off-by: Kairui Song --- include/x86/x86-linux.h| 8 ++-- kexec/arch/i386/crashdump-x86.c| 34 + kexec/arch/i386/kexec-x86-common.c | 60 ++ kexec/arch/i386/kexec-x86.h| 1 + kexec/arch/i386/x86-linux-setup.c | 6 ++- kexec/arch/i386/x86-linux-setup.h | 1 + 6 files changed, 80 insertions(+), 30 deletions(-) diff --git a/include/x86/x86-linux.h b/include/x86/x86-linux.h index 352ea02..a5d8df8 100644 --- a/include/x86/x86-linux.h +++ b/include/x86/x86-linux.h @@ -45,8 +45,7 @@ struct apm_bios_info { uint16_t cseg_len; /* 0x4e */ uint16_t cseg_16_len; /* 0x50 */ uint16_t dseg_len; /* 0x52 */ - uint8_t reserved[44]; /* 0x54 */ -}; +} __attribute__((packed)); /* * EDD stuff @@ -113,12 +112,15 @@ struct x86_linux_param_header { uint8_t reserved4[2]; /* 0x3e -- 0x3f reserved for future expansion */ struct apm_bios_info apm_bios_info; /* 0x40 */ + uint8_t reserved4_1[28]; /* 0x54 */ + uint64_t acpi_rsdp_addr;/* 0x70 */ + uint8_t reserved4_2[8];/* 0x78 */ struct drive_info_struct drive_info;/* 0x80 */ struct sys_desc_table sys_desc_table; /* 0xa0 */ uint32_t ext_ramdisk_image; /* 0xc0 */ uint32_t ext_ramdisk_size; /* 0xc4 */ uint32_t ext_cmd_line_ptr; /* 0xc8 */ - uint8_t 
reserved4_1[0x1c0 - 0xcc]; /* 0xe4 */ + uint8_t reserved4_3[0x1c0 - 0xcc]; /* 0xe4 */ uint8_t efi_info[32]; /* 0x1c0 */ uint32_t alt_mem_k; /* 0x1e0 */ uint8_t reserved5[4]; /* 0x1e4 */ diff --git a/kexec/arch/i386/crashdump-x86.c b/kexec/arch/i386/crashdump-x86.c index 140f45b..262d157 100644 --- a/kexec/arch/i386/crashdump-x86.c +++ b/kexec/arch/i386/crashdump-x86.c @@ -787,35 +787,19 @@ static int sysfs_efi_runtime_map_exist(void) /* Appends 'acpi_rsdp=' commandline for efi boot crash dump */ static void cmdline_add_efi(char *cmdline) { - FILE *fp; - int cmdlen, len; - char line[MAX_LINE], *s; - const char *acpis = " acpi_rsdp="; + int cmdlen; + uint64_t acpi_rsdp; - fp = fopen("/sys/firmware/efi/systab", "r"); - if (!fp) - return; + acpi_rsdp = get_acpi_rsdp(); + cmdlen = strlen(cmdline); - while(fgets(line, sizeof(line), fp) != 0) { - /* ACPI20= always goes before ACPI= */ - if ((strstr(line, "ACPI20=")) || (strstr(line, "ACPI="))) { - line[strlen(line) - 1] = '\0'; - s = strchr(line, '='); - s += 1; - len = strlen(s) + strlen(acpis); - cmdlen = strlen(cmdline) + len; - if (cmdlen > (COMMAND_LINE_SIZE - 1)) - die("Command line overflow\n"); - strcat(cmdline, acpis); - strcat(cmdline, s); - dbgprintf("Command line after adding efi\n"); - dbgprintf("%s\n", cmdline); + if (!acpi_rsdp) + return; - break; - } - } + if (cmdlen + sizeof(" acpi_rsdp=0x") + 16 > (COMMAND_LINE_SIZE - 1)) + die("Command line overflow\n"); - fclose(fp); + sprintf(cmdline + cmdlen, " acpi_rsdp=0x%016lx", acpi_rsdp); } static void get_backup_area(struct kexec_info *info, diff --git a/kexec/arch/i386/kexec-x86-common.c b/kexec/arch/i386/kexec-x86-common.c index de99758..4b8eb26 100644 --- a/kexec/arch/i386/kexec-x86-common.c +++ b/kexec/arch/i386/kexec-x86-common.c @@ -39,6 +39,7 @@ #include "../../firmware_memmap.h" #include "../../crashdump.h" #include "kexec-x86.h" +#include "x86-linux-
Re: [PATCH] x86/boot: Use EFI setup data if provided
On Mon, Mar 25, 2019 at 2:20 PM Dave Young wrote: > > On 03/25/19 at 02:01pm, Dave Young wrote: > > On 03/25/19 at 12:27am, Junichi Nomura wrote: > > > On Fri, Mar 22, 2019 at 04:23:28PM +0100, Borislav Petkov wrote: > > > > On Fri, Mar 22, 2019 at 11:03:43AM +, Junichi Nomura wrote: > > > > > Commit 3a63f70bf4c3a ("x86/boot: Early parse RSDP and save it in > > > > > boot_params") broke kexec boot on EFI systems. efi_get_rsdp_addr() > > > > > in the early parsing code tries to search RSDP from EFI table but > > > > > whose address is virtual. > > > > > > > > > > Since kexec(1) provides physical address of config_table via > > > > > boot_params, > > > > > efi_get_rsdp_addr() should look for setup_data in the same way as > > > > > efi_systab_init() in arch/x86/platform/efi/efi.c does. > > > > > > > > If the kexec kernel should continue to use efi_systab_init() then you > > > > should make efi_get_rsdp_addr() exit early in the kexec-ed kernel. > > > > > > I'm not sure which way kexec devel is going. Added kexec list. > > > Here is the version that exits early in efi_get_rsdp_addr(). > > > > > > [PATCH] x86/boot: Don't try to search RSDP from EFI when kexec-booted > > > > > > Commit 3a63f70bf4c3a ("x86/boot: Early parse RSDP and save it in > > > boot_params") broke kexec boot on EFI systems. efi_get_rsdp_addr() > > > in the early parsing code tries to search RSDP from EFI table but > > > whose address is virtual. > > > > > > Normally kexec(1) provides physical address of config_table via > > > boot_params > > > and EFI code uses that during initialization. > > > For the early boot code, we just exit efi_get_rsdp_addr() early if the > > > kernel > > > is booted by kexec. 
> > > > > > Fixes: 3a63f70bf4c3a ("x86/boot: Early parse RSDP and save it in > > > boot_params") > > > Signed-off-by: Jun'ichi Nomura > > > Cc: Chao Fan > > > Cc: Borislav Petkov > > > > > > diff --git a/arch/x86/boot/compressed/acpi.c > > > b/arch/x86/boot/compressed/acpi.c > > > index 0ef4ad5..1cefc43 100644 > > > --- a/arch/x86/boot/compressed/acpi.c > > > +++ b/arch/x86/boot/compressed/acpi.c > > > @@ -44,6 +44,24 @@ static acpi_physical_address get_acpi_rsdp(void) > > > return addr; > > > } > > > > > > +static bool is_kexec_booted(void) > > > +{ > > > + struct setup_data *data; > > > + > > > + /* > > > +* kexec-tools provides EFI setup data so that kexec-ed kernel > > > +* can find proper tables. > > > +*/ > > > + data = (struct setup_data *) boot_params->hdr.setup_data; > > > + while (data) { > > > + if (data->type == SETUP_EFI) > > > + return true; > > > + data = (struct setup_data *) data->next; > > > + } > > > + > > > + return false; > > > +} > > > + > > > /* Search EFI system tables for RSDP. */ > > > static acpi_physical_address efi_get_rsdp_addr(void) > > > { > > > @@ -57,6 +75,10 @@ static acpi_physical_address efi_get_rsdp_addr(void) > > > int size, i; > > > char *sig; > > > > > > + /* If the system is kexec-booted, poking EFI systab may not work. */ > > > + if (is_kexec_booted()) > > > + return 0; > > > + > > > ei = &boot_params->efi_info; > > > sig = (char *)&ei->efi_loader_signature; > > > > > > > > > ___ > > > kexec mailing list > > > kexec@lists.infradead.org > > > http://lists.infradead.org/mailman/listinfo/kexec > > > > Good catch, this way looks good to me. But the function > > is_kexec_booted can be compiled when #ifdef CONFIG_EFI > > > > Otherwise: > > > > Acked-by: Dave Young > > > > Hold on, I replied too quick. One question is does the above patch > passed your test? It can workaround and skip the wrong phys addr > issue, but the acpi early parsing still fails because efi_get_rsdp_addr > return 0? 
> > If this is the case you may need go with your old patch. > > I think normally people do not see this bug, because kernel will set the > rsdp in boot_params->acpi_rsdp_addr. Maybe you are testing with > different kernel versions, eg. > > old kernel kexec to new kernel. > > And the old kernel does not set boot_params->acpi_rsdp_addr > > Is this correct? > > Thanks > Dave Hi Dave, actually only kexec_file_load will always set the boot_params->acpi_rsdp_addr. Can't guarantee how user space tools will prepare the boot_prams if kexec_load is used, so it's should very likely to happen. And for the patch, I also think the first patch looks better, if we just return 0 early in efi_get_rsdp_addr aren't we still failing to parse the rsdp in early code? -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v2] x86: Introduce a new option --reuse-video-type
After commit 060eee58 "x86: use old screen_info if needed", kexec-tools will force use old screen_info and vga type if failed to determine current vga type. But it is not always a good idea. Currently kernel hanging is inspected on some hyper-v VMs after this commit, because hyperv_fb will mimic EFI (or VESA) VGA on first boot up, but after the real driver is loaded, it will switch to new mode and no longer compatible with EFI/VESA VGA. Keep setting orig_video_isVGA to EFI/VESA VGA flag will get wrong driver loaded and try to manipulate the framebuffer in a wrong way. We can't ensure this won't happen on other framebuffer drivers, But it's a helpful feature if the framebuffer drivers just work. So this patch introduce a --reuse-video-type options to let user decide if the old screen_info should be used unconditionally or not. Signed-off-by: Kairui Song --- Update from V1: - Fix a fd leak - Rename the option from --force-vga to --reuse-video-type kexec/arch/i386/include/arch/options.h | 2 ++ kexec/arch/i386/kexec-x86.h| 1 + kexec/arch/i386/x86-linux-setup.c | 8 ++-- kexec/arch/x86_64/kexec-x86_64.c | 5 + 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/kexec/arch/i386/include/arch/options.h b/kexec/arch/i386/include/arch/options.h index c113a83..0e57951 100644 --- a/kexec/arch/i386/include/arch/options.h +++ b/kexec/arch/i386/include/arch/options.h @@ -32,6 +32,7 @@ #define OPT_ENTRY_32BIT(OPT_ARCH_MAX+10) #define OPT_PASS_MEMMAP_CMDLINE(OPT_ARCH_MAX+11) #define OPT_NOEFI (OPT_ARCH_MAX+12) +#define OPT_REUSE_VIDEO_TYPE (OPT_ARCH_MAX+13) /* Options relevant to the architecture (excluding loader-specific ones): */ #define KEXEC_ARCH_OPTIONS \ @@ -45,6 +46,7 @@ { "elf64-core-headers", 0, 0, OPT_ELF64_CORE }, \ { "pass-memmap-cmdline", 0, 0, OPT_PASS_MEMMAP_CMDLINE }, \ { "noefi", 0, 0, OPT_NOEFI}, \ + { "reuse-video-type", 0, 0, OPT_REUSE_VIDEO_TYPE }, \ #define KEXEC_ARCH_OPT_STR KEXEC_OPT_STR "" diff --git a/kexec/arch/i386/kexec-x86.h 
b/kexec/arch/i386/kexec-x86.h index 51855f8..c2bcd37 100644 --- a/kexec/arch/i386/kexec-x86.h +++ b/kexec/arch/i386/kexec-x86.h @@ -52,6 +52,7 @@ struct arch_options_t { enum coretype core_header_type; uint8_t pass_memmap_cmdline; uint8_t noefi; + uint8_t reuse_video_type; }; int multiboot_x86_probe(const char *buf, off_t len); diff --git a/kexec/arch/i386/x86-linux-setup.c b/kexec/arch/i386/x86-linux-setup.c index 1bd408b..8fad115 100644 --- a/kexec/arch/i386/x86-linux-setup.c +++ b/kexec/arch/i386/x86-linux-setup.c @@ -144,7 +144,7 @@ static int setup_linux_vesafb(struct x86_linux_param_header *real_mode) } else if (0 == strcmp(fix.id, "EFI VGA")) { /* VIDEO_TYPE_EFI */ real_mode->orig_video_isVGA = 0x70; - } else { + } else if (arch_options.reuse_video_type) { int err; off_t offset = offsetof(typeof(*real_mode), orig_video_isVGA); @@ -152,6 +152,10 @@ static int setup_linux_vesafb(struct x86_linux_param_header *real_mode) err = get_bootparam(&real_mode->orig_video_isVGA, offset, 1); if (err) goto out; + } else { + real_mode->orig_video_isVGA = 0; + close(fd); + return 0; } close(fd); @@ -844,7 +848,7 @@ void setup_linux_system_parameters(struct kexec_info *info, setup_subarch(real_mode); if (bzImage_support_efi_boot && !arch_options.noefi) setup_efi_info(info, real_mode); - + /* Default screen size */ real_mode->orig_x = 0; real_mode->orig_y = 0; diff --git a/kexec/arch/x86_64/kexec-x86_64.c b/kexec/arch/x86_64/kexec-x86_64.c index 041b007..ccdc980 100644 --- a/kexec/arch/x86_64/kexec-x86_64.c +++ b/kexec/arch/x86_64/kexec-x86_64.c @@ -55,6 +55,7 @@ void arch_usage(void) " --console-serial Enable the serial console\n" " --pass-memmap-cmdline Pass memory map via command line in kexec on panic case\n" " --noefi Disable efi support\n" + " --reuse-video-typeReuse old boot time video type blindly\n" ); } @@ -67,6 +68,7 @@ struct arch_options_t arch_options = { .core_header_type = CORE_TYPE_ELF64, .pass_memmap_cmdline = 0, .noefi = 0, + .reuse_video_type = 0, }; int 
arch_process_options(int argc, char **argv) @@ -136,6 +138,9 @@ int arch_process_options(int argc, char **argv) case OPT_NOEFI: arch_options.noefi = 1;
Re: [PATCH] x86: Introduce a new option --force-vga
On Tue, Mar 5, 2019 at 6:09 PM Dave Young wrote: > > On 03/05/19 at 04:24pm, Kairui Song wrote: > > On Mon, Mar 4, 2019 at 2:30 PM Dave Young wrote: > > > > > > On 02/28/19 at 06:07pm, Kairui Song wrote: > > > > After commit 060eee58 "x86: use old screen_info if needed", kexec-tools > > > > will force use old screen_info and vga type if failed to determine > > > > current vga type. But it is not always a good idea. > > > > > > > > Currently kernel hanging is inspected on some hyper-v VMs after this > > > > commit, because hyperv_fb will mimic EFI (or VESA) VGA on first boot > > > > up, but after the real driver is loaded, it will switch to new mode > > > > and no longer compatible with EFI/VESA VGA. Keep setting > > > > orig_video_isVGA to EFI/VESA VGA flag will get wrong driver loaded and > > > > try to manipulate the framebuffer in a wrong way. > > > > > > > > We can't ensure this won't happen on other framebuffer drivers, But > > > > it's a helpful feature if the framebuffer drivers just work. So this > > > > patch introduce a --force-vga options to let user decide if the > > > > old screen_info should be used unconditional or not. > > > > > > It looks good to me except the option name, because vga usually means > > > the specific vga video type. But here you are enforcing to reuse the > > > first > > > kernel original video type. > > > > > > It would be better to use --reuse-video-type or --force-orig-video, etc.. > > > > > > > Thanks for the review, the naming is not very good indeed, will update > > the patch. How about just --reuse-video? This should be general enough > > and clear. > > Hmm, I feel --reuse-video-type is clearer, --reuse-video seems not > clear :) > OK, will use --reuse-video-type then. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] x86: Introduce a new option --force-vga
On Mon, Mar 4, 2019 at 2:30 PM Dave Young wrote: > > On 02/28/19 at 06:07pm, Kairui Song wrote: > > After commit 060eee58 "x86: use old screen_info if needed", kexec-tools > > will force use old screen_info and vga type if failed to determine > > current vga type. But it is not always a good idea. > > > > Currently kernel hanging is inspected on some hyper-v VMs after this > > commit, because hyperv_fb will mimic EFI (or VESA) VGA on first boot > > up, but after the real driver is loaded, it will switch to new mode > > and no longer compatible with EFI/VESA VGA. Keep setting > > orig_video_isVGA to EFI/VESA VGA flag will get wrong driver loaded and > > try to manipulate the framebuffer in a wrong way. > > > > We can't ensure this won't happen on other framebuffer drivers, But > > it's a helpful feature if the framebuffer drivers just work. So this > > patch introduce a --force-vga options to let user decide if the > > old screen_info should be used unconditional or not. > > It looks good to me except the option name, because vga usually means > the specific vga video type. But here you are enforcing to reuse the first > kernel original video type. > > It would be better to use --reuse-video-type or --force-orig-video, etc.. > Thanks for the review, the naming is not very good indeed, will update the patch. How about just --reuse-video? This should be general enough and clear. -- Best Regards, Kairui Song ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH] x86: Introduce a new option --force-vga
After commit 060eee58 "x86: use old screen_info if needed", kexec-tools will force use old screen_info and vga type if failed to determine current vga type. But it is not always a good idea. Currently kernel hanging is inspected on some hyper-v VMs after this commit, because hyperv_fb will mimic EFI (or VESA) VGA on first boot up, but after the real driver is loaded, it will switch to new mode and no longer compatible with EFI/VESA VGA. Keep setting orig_video_isVGA to EFI/VESA VGA flag will get wrong driver loaded and try to manipulate the framebuffer in a wrong way. We can't ensure this won't happen on other framebuffer drivers, But it's a helpful feature if the framebuffer drivers just work. So this patch introduce a --force-vga options to let user decide if the old screen_info should be used unconditional or not. Signed-off-by: Kairui Song --- kexec/arch/i386/include/arch/options.h | 2 ++ kexec/arch/i386/kexec-x86.h| 1 + kexec/arch/i386/x86-linux-setup.c | 7 +-- kexec/arch/x86_64/kexec-x86_64.c | 5 + 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/kexec/arch/i386/include/arch/options.h b/kexec/arch/i386/include/arch/options.h index c113a83..7667cf4 100644 --- a/kexec/arch/i386/include/arch/options.h +++ b/kexec/arch/i386/include/arch/options.h @@ -32,6 +32,7 @@ #define OPT_ENTRY_32BIT(OPT_ARCH_MAX+10) #define OPT_PASS_MEMMAP_CMDLINE(OPT_ARCH_MAX+11) #define OPT_NOEFI (OPT_ARCH_MAX+12) +#define OPT_FORCE_VGA (OPT_ARCH_MAX+13) /* Options relevant to the architecture (excluding loader-specific ones): */ #define KEXEC_ARCH_OPTIONS \ @@ -45,6 +46,7 @@ { "elf64-core-headers", 0, 0, OPT_ELF64_CORE }, \ { "pass-memmap-cmdline", 0, 0, OPT_PASS_MEMMAP_CMDLINE }, \ { "noefi", 0, 0, OPT_NOEFI}, \ + { "force-vga", 0, 0, OPT_FORCE_VGA }, \ #define KEXEC_ARCH_OPT_STR KEXEC_OPT_STR "" diff --git a/kexec/arch/i386/kexec-x86.h b/kexec/arch/i386/kexec-x86.h index 51855f8..d16679f 100644 --- a/kexec/arch/i386/kexec-x86.h +++ b/kexec/arch/i386/kexec-x86.h @@ -52,6 
+52,7 @@ struct arch_options_t { enum coretype core_header_type; uint8_t pass_memmap_cmdline; uint8_t noefi; + uint8_t force_vga; }; int multiboot_x86_probe(const char *buf, off_t len); diff --git a/kexec/arch/i386/x86-linux-setup.c b/kexec/arch/i386/x86-linux-setup.c index 1bd408b..0e92d26 100644 --- a/kexec/arch/i386/x86-linux-setup.c +++ b/kexec/arch/i386/x86-linux-setup.c @@ -144,7 +144,7 @@ static int setup_linux_vesafb(struct x86_linux_param_header *real_mode) } else if (0 == strcmp(fix.id, "EFI VGA")) { /* VIDEO_TYPE_EFI */ real_mode->orig_video_isVGA = 0x70; - } else { + } else if (arch_options.force_vga) { int err; off_t offset = offsetof(typeof(*real_mode), orig_video_isVGA); @@ -152,6 +152,9 @@ static int setup_linux_vesafb(struct x86_linux_param_header *real_mode) err = get_bootparam(&real_mode->orig_video_isVGA, offset, 1); if (err) goto out; + } else { + real_mode->orig_video_isVGA = 0; + return 0; } close(fd); @@ -844,7 +847,7 @@ void setup_linux_system_parameters(struct kexec_info *info, setup_subarch(real_mode); if (bzImage_support_efi_boot && !arch_options.noefi) setup_efi_info(info, real_mode); - + /* Default screen size */ real_mode->orig_x = 0; real_mode->orig_y = 0; diff --git a/kexec/arch/x86_64/kexec-x86_64.c b/kexec/arch/x86_64/kexec-x86_64.c index 041b007..2e54381 100644 --- a/kexec/arch/x86_64/kexec-x86_64.c +++ b/kexec/arch/x86_64/kexec-x86_64.c @@ -55,6 +55,7 @@ void arch_usage(void) " --console-serial Enable the serial console\n" " --pass-memmap-cmdline Pass memory map via command line in kexec on panic case\n" " --noefi Disable efi support\n" + " --force-vga Enabled vga blindly whenever possible \n" ); } @@ -67,6 +68,7 @@ struct arch_options_t arch_options = { .core_header_type = CORE_TYPE_ELF64, .pass_memmap_cmdline = 0, .noefi = 0, + .force_vga = 0, }; int arch_process_options(int argc, char **argv) @@ -136,6 +138,9 @@ int arch_process_options(int argc, char **argv) case OPT_NOEFI: arch_options.noefi = 1; break; + case 
OPT_FORCE_VGA: + arch_options.force_vga = 1; + break; } } /*
[PATCH] x86, kexec_file_load: fill in acpi_rsdp_addr boot param unconditionally
When efi=noruntime or efi=old_map is used, EFI services won't be available in the second kernel, therefore the second kernel will not be able to get the ACPI RSDP address from firmware by calling EFI services so it won't boot. Previously we were expecting the user to set the acpi_rsdp= on kernel command line for second kernel as there was no other way to pass RSDP address to second kernel. After commit e6e094e053af ("x86/acpi, x86/boot: Take RSDP address from boot params if available"), now it's possible to set an acpi_rsdp_addr parameter in the boot_params passed to second kernel, and kernel will prefer using this value for the RSDP address when it's set. And with commit 3a63f70bf4c3 ("x86/boot: Early parse RSDP and save it in boot_params"), now the acpi_rsdp_addr will always be filled with valid RSDP address. So we just fill in that value for second kernel's boot_params unconditionally, this ensures second kernel always uses the same RSDP value as the first kernel. Tested with an EFI enabled KVM VM with efi=noruntime. Signed-off-by: Kairui Song --- This is an update of part of the patch series: "[PATCH v3 0/3] make kexec work with efi=noruntime or efi=old_map." But "[PATCH v3 1/3] x86, kexec_file_load: Don't setup EFI info if EFI runtime is not enabled" is already in [tip:x86/urgent], and with Chao's commit 3a63f70bf4c3 in [tip:x86/boot], we can just fill in acpi_rsdp_addr boot param unconditionally to fix the problem, so I only update and resend this patch. 
arch/x86/kernel/kexec-bzimage64.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 53917a3ebf94..3611946dc7ea 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -218,6 +218,9 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, params->screen_info.ext_mem_k = 0; params->alt_mem_k = 0; + /* Always fill in RSDP, it's either 0 or a valid value */ + params->acpi_rsdp_addr = boot_params.acpi_rsdp_addr; + /* Default APM info */ memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info)); @@ -256,7 +259,6 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz, efi_setup_data_offset); #endif - /* Setup EDD info */ memcpy(params->eddbuf, boot_params.eddbuf, EDDMAXNR * sizeof(struct edd_info)); -- 2.20.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec