[PATCH 1/2] x86/apic: add a more generic early_probe

2022-06-08 Thread Kairui Song
From: Kairui Song 

There is only one early apic driver probe method: acpi_madt_oem_check,
which is used by ACPI MADT init path only.

Some apic drivers' early probes don't need ACPI info: even when probed
from the ACPI subsystem, the ACPI info is simply ignored. So add a more
generic early_probe method, which can be used by the MPTABLE parser later.

Signed-off-by: Kairui Song 
---
 arch/x86/include/asm/apic.h   |  6 ++
 arch/x86/kernel/apic/probe_64.c   | 16 
 arch/x86/kernel/apic/x2apic_cluster.c |  8 +++-
 arch/x86/kernel/apic/x2apic_phys.c|  8 +++-
 4 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index bd8ae0a7010a..cd3266fbfa63 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -310,6 +310,7 @@ struct apic {
 
/* Probe, setup and smpboot functions */
int (*probe)(void);
+   int (*early_probe)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
int (*apic_id_valid)(u32 apicid);
int (*apic_id_registered)(void);
@@ -498,6 +499,11 @@ extern void 
acpi_wake_cpu_handler_update(wakeup_cpu_handler handler);
 extern int default_apic_id_valid(u32 apicid);
 extern int default_acpi_madt_oem_check(char *, char *);
 extern void default_setup_apic_routing(void);
+#ifdef CONFIG_X86_64
+extern void apic_early_probe(void);
+#else
+static inline void apic_early_probe(void) { }
+#endif
 
 extern u32 apic_default_calc_apicid(unsigned int cpu);
 extern u32 apic_flat_calc_apicid(unsigned int cpu);
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index c46720f185c0..3f600c421f07 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -13,6 +13,22 @@
 
 #include "local.h"
 
+void __init apic_early_probe(void)
+{
+   struct apic **drv;
+
+   for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+   if ((*drv)->early_probe && (*drv)->early_probe()) {
+   if (apic != *drv) {
+   apic = *drv;
+   pr_info("Switched to APIC driver %s.\n",
+   apic->name);
+   }
+   break;
+   }
+   }
+}
+
 /*
  * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
  */
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c 
b/arch/x86/kernel/apic/x2apic_cluster.c
index e696e22d0531..02eb8ea9a5b5 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -26,11 +26,16 @@ static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
 static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks);
 static struct cluster_mask *cluster_hotplug_mask;
 
-static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int x2apic_early_probe(void)
 {
return x2apic_enabled();
 }
 
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+   return x2apic_early_probe();
+}
+
 static void x2apic_send_IPI(int cpu, int vector)
 {
u32 dest = x86_cpu_to_logical_apicid[cpu];
@@ -197,6 +202,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
 
.name   = "cluster x2apic",
.probe  = x2apic_cluster_probe,
+   .early_probe= x2apic_early_probe,
.acpi_madt_oem_check= x2apic_acpi_madt_oem_check,
.apic_id_valid  = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c 
b/arch/x86/kernel/apic/x2apic_phys.c
index 6bde05a86b4e..c4dd4ec0f1ac 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -34,11 +34,16 @@ static bool x2apic_fadt_phys(void)
return false;
 }
 
-static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int x2apic_early_probe(void)
 {
return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys());
 }
 
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+   return x2apic_early_probe();
+}
+
 static void x2apic_send_IPI(int cpu, int vector)
 {
u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
@@ -156,6 +161,7 @@ static struct apic apic_x2apic_phys __ro_after_init = {
 
.name   = "physical x2apic",
.probe  = x2apic_phys_probe,
+   .early_probe= x2apic_early_probe,
.acpi_madt_oem_check= x2apic_acpi_madt_oem_check,
.apic_id_valid  = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
-- 
2.35.2


___

[PATCH 2/2] x86/mpparse, kexec: probe apic driver early for x2apic

2022-06-08 Thread Kairui Song
From: Kairui Song 

The following kernel panic is observed when doing kdump/kexec on
virtual machines that use MPTABLE, not ACPI MADT, and support x2apic:

  Intel MultiProcessor Specification v1.4
  MPTABLE: OEM ID: BOCHSCPU
  MPTABLE: Product ID: 0.1
  MPTABLE: APIC at: 0xFEE0
  BUG: unable to handle page fault for address: ff5fc020
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x) - not-present page
  PGD 25e15067 P4D 25e15067 PUD 25e17067 PMD 25e18067 PTE 0
  Oops:  [#1] SMP NOPTI
  CPU: 0 PID: 0 Comm: swapper Not tainted 5.14.10-300.fc35.x86_64 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1.fc35 
04/01/2014
  RIP: 0010:native_apic_mem_read+0x2/0x10
  Code: 14 25 20 cd e3 82 c3 90 bf 30 08 00 00 ff 14 25 18 cd e3 82 c3 cc cc cc 
89 ff 89 b7 00 c0 5f ff c3 0f 1f 80 00 00 00 00 89 ff <8b> 87 00 c0 5f ff c3 0f 
1f 80 00 00 00 0
  RSP: :82e03e18 EFLAGS: 00010046
  RAX: 81064840 RBX: ff240b6c RCX: 82f17428
  RDX: c000dfff RSI: dfff RDI: 0020
  RBP: 88802320 R08:  R09: 82e03c50
  R10: 82e03c48 R11: 82f47468 R12: ff240b40
  R13: ff200b30 R14:  R15: 00d4
  FS:  () GS:8365b000() knlGS:
  CS:  0010 DS:  ES:  CR0: 80050033
  CR2: ff5fc020 CR3: 25e1 CR4: 06b0
  Call Trace:
   ? read_apic_id+0x15/0x30
   ? register_lapic_address+0x76/0x97
   ? default_get_smp_config+0x28b/0x42d
   ? dmi_check_system+0x1c/0x60
   ? acpi_boot_init+0x1d/0x4c3
   ? setup_arch+0xb37/0xc2a
   ? slab_is_available+0x5/0x10
   ? start_kernel+0x61/0x980
   ? load_ucode_bsp+0x4c/0xcd
   ? secondary_startup_64_no_verify+0xc2/0xcb
  Modules linked in:
  CR2: ff5fc020
  random: get_random_bytes called from oops_exit+0x35/0x60 with crng_init=0
  ---[ end trace c9e569df3bdbefd3 ]---

The panic happens within following init code:
setup_arch()
  
  check_x2apic() <-- x2apic is enabled by first kernel before kexec,
 this set x2apic_mode = 1, make sure later probes
 will recognize pre-enabled x2apic.
  
  acpi_boot_init();  <-- If ACPI MADT is in use, this will switch apic driver
 to x2apic, but it will do nothing with MPTABLE.
  x86_dtb_init();
  get_smp_config();
default_get_smp_config();  <-- MPTABLE setup.
  check_physptr();
smp_read_mpc();
  register_lapic_address(); <-- * panic here *
  init_apic_mappings();
  

The problem here is that MPTABLE setup calls register_lapic_address(),
which is still using the apic_flat driver and accesses the apic MMIO
interface. But the address is never mapped for pre-enabled x2apic since
commit 0450193bffed6 ("x86, x2apic: Don't map lapic addr for preenabled
x2apic systems"), so it panics.

Simply mapping it won't work either: in x2apic mode the MMIO interface is
not usable (Intel SDM Volume 3A 10.12.2), so later setup will still fail
with other errors. So it needs to do a proper apic driver probe and switch
to the x2apic driver to perform MSR operations instead.

This issue is currently only seen with kdump/kexec, where the first kernel
enabled x2apic and left it enabled for the 2nd kernel.

This can be easily reproduced with qemu-kvm, use -no-acpi and enable
x2apic, so x2apic with MPTABLE will be in use, then trigger kdump/kexec.

Signed-off-by: Kairui Song 
---
 arch/x86/kernel/mpparse.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index fed721f90116..7658c8184e8c 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -202,8 +202,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, 
unsigned early)
return 0;
 
/* Initialize the lapic mapping */
-   if (!acpi_lapic)
+   if (!acpi_lapic) {
+   apic_early_probe();
register_lapic_address(mpc->lapic);
+   }
 
if (early)
return 1;
-- 
2.35.2


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/2] x86/mpparse, kexec: Fix kdump/kexec kernel panic with MPTABLE and x2apic

2022-06-08 Thread Kairui Song
From: Kairui Song 

The following kernel panic is observed when doing kdump/kexec on
qemu-kvm VMs that use MPTABLE, not ACPI MADT, and support x2apic:

  Intel MultiProcessor Specification v1.4
  MPTABLE: OEM ID: BOCHSCPU
  MPTABLE: Product ID: 0.1
  MPTABLE: APIC at: 0xFEE0
  BUG: unable to handle page fault for address: ff5fc020
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x) - not-present page
  PGD 25e15067 P4D 25e15067 PUD 25e17067 PMD 25e18067 PTE 0
  Oops:  [#1] SMP NOPTI
  CPU: 0 PID: 0 Comm: swapper Not tainted 5.14.10-300.fc35.x86_64 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1.fc35 
04/01/2014
  RIP: 0010:native_apic_mem_read+0x2/0x10
  Code: 14 25 20 cd e3 82 c3 90 bf 30 08 00 00 ff 14 25 18 cd e3 82 c3 cc cc cc 
89 ff 89 b7 00 c0 5f ff c3 0f 1f 80 00 00 00 00 89 ff <8b> 87 00 c0 5f ff c3 0f 
1f 80 00 00 00 0
  RSP: :82e03e18 EFLAGS: 00010046
  RAX: 81064840 RBX: ff240b6c RCX: 82f17428
  RDX: c000dfff RSI: dfff RDI: 0020
  RBP: 88802320 R08:  R09: 82e03c50
  R10: 82e03c48 R11: 82f47468 R12: ff240b40
  R13: ff200b30 R14:  R15: 00d4
  FS:  () GS:8365b000() knlGS:
  CS:  0010 DS:  ES:  CR0: 80050033
  CR2: ff5fc020 CR3: 25e1 CR4: 06b0
  Call Trace:
   ? read_apic_id+0x15/0x30
   ? register_lapic_address+0x76/0x97
   ? default_get_smp_config+0x28b/0x42d
   ? dmi_check_system+0x1c/0x60
   ? acpi_boot_init+0x1d/0x4c3
   ? setup_arch+0xb37/0xc2a
   ? slab_is_available+0x5/0x10
   ? start_kernel+0x61/0x980
   ? load_ucode_bsp+0x4c/0xcd
   ? secondary_startup_64_no_verify+0xc2/0xcb
  Modules linked in:
  CR2: ff5fc020
  random: get_random_bytes called from oops_exit+0x35/0x60 with crng_init=0
  ---[ end trace c9e569df3bdbefd3 ]---

It turns out MPTABLE doesn't play well with pre-enabled x2apic mode.
This series extends the apic driver interface and lets the MPTABLE parser
probe the driver properly.

This can be easily reproduced with qemu-kvm, use -no-acpi and enable
x2apic, so x2apic with MPTABLE will be in use, then trigger kdump/kexec.

Kairui Song (2):
  x86, apic: add a more generic early_probe
  x86/mpparse, kexec: probe apic driver early for x2apic

 arch/x86/include/asm/apic.h   |  6 ++
 arch/x86/kernel/apic/probe_64.c   | 16 
 arch/x86/kernel/apic/x2apic_cluster.c |  8 +++-
 arch/x86/kernel/apic/x2apic_phys.c|  8 +++-
 arch/x86/kernel/mpparse.c |  4 +++-
 5 files changed, 39 insertions(+), 3 deletions(-)

-- 
2.35.2


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: Setting "orig_video_isVGA" when handing off Linux framebuffer

2021-05-18 Thread Kairui Song
 Hi Benjamin,

Sorry for the late reply, I missed your email in my inbox.

On Wed, May 5, 2021 at 7:10 AM Benjamin Moody  wrote:
>
> Hi,
>
> In regard to how kexec hands off the framebuffer to the newly-booted
> kernel:
>
> Commit 060eee589dd1 (2018-01-28) added the "blindly try old boot time
> video type" behavior, without doing any checking to see if the
> framebuffer is compatible with the stated format.
>
> Commit fb5a8792e6e4 (2019-03-05) made this behavior conditional on the
> --reuse-video-type option.  The commit message observes that:
>
> Currently kernel hanging is inspected on some hyper-v VMs after this
> commit, because hyperv_fb will mimic EFI (or VESA) VGA on first boot
> up, but after the real driver is loaded, it will switch to new mode
> and no longer compatible with EFI/VESA VGA. Keep setting
> orig_video_isVGA to EFI/VESA VGA flag will get wrong driver loaded
> and try to manipulate the framebuffer in a wrong way.
>
> It's clear to me that various bad things *might* happen if kexec
> pretends that the framebuffer is "VESA-compatible" or "EFI-compatible"
> when in fact it isn't.
>
> Yet, in many cases, the Linux framebuffer is VESA/EFI-compatible, at
> least to the extent that blindly setting orig_video_isVGA = 0x23 or
> 0x70 results in a usable display.  So I have to wonder, in the
> situation mentioned above:
>
>  - was the framebuffer not in a compatible format to begin with?
>
>  - was the framebuffer address not correctly reported by the existing
>kernel driver?
>
>  - did the original bootloader give wrong information and somehow that
>broke the newly booted kernel?
>
> Kairui, can you please clarify what sort of kernel hangs you were
> seeing and what specific hardware and drivers you were using?
>

For the commit fb5a8792e6e4, the problem is only observed with
hyperv_fb, and it's a HyperV VM.
The framebuffer was VESA compatible when the machine just booted, but
after hyperv_fb driver is loaded, it will ask the hypervisor to
relocate the framebuffer in a new location and in a new format.

In a later kernel commit 3cb73bc3fa2a3cb80b88aa63b48409939e0d996b, it
fixed the kernel side issue that after the relocation, the framebuffer
address is not updated in boot_params. It was not updated before this
kernel commit. Before that, the old boot_params will contain an
invalid address and cause failures in the new booted kernel.

I also remember that blindly setting orig_video_isVGA will cause
strange errors on some random graphics cards. If we can't make sure
it's really VGA, this field is better left zero, so the kernel won't use
it as a VGA framebuffer.

For your case, you mentioned "'fix.id' is not "VESA VGA" or "EFI VGA",
but rather "inteldrmfb" or "i915drmfb"". 'fix.id' can change after
boot. I'm not familiar with heads or coreboot, but I guess the first
kernel you booted has the intel drm driver loaded? Maybe you can try
either not loading the intel drm driver in the first kernel (so the
framebuffer is always used in a VESA/EFI compatible way), or ensuring
the same driver is loaded in the newly booted kernel (this way the
driver will reinitialize the framebuffer anyway, even if it's not set
in boot_params).

> Benjamin Moody
>


-- 
Best Regards,
Kairui Song


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v4 1/1] kernel/crash_core: Add crashkernel=auto for vmcore creation

2021-02-23 Thread Kairui Song
On Wed, Feb 24, 2021 at 1:45 AM Saeed Mirzamohammadi
 wrote:
>
> This adds crashkernel=auto feature to configure reserved memory for
> vmcore creation. CONFIG_CRASH_AUTO_STR is defined to be set for
> different kernel distributions and different archs based on their
> needs.
>
> Signed-off-by: Saeed Mirzamohammadi 
> Signed-off-by: John Donnelly 
> Tested-by: John Donnelly 
> ---
>  Documentation/admin-guide/kdump/kdump.rst |  3 ++-
>  .../admin-guide/kernel-parameters.txt |  6 ++
>  arch/Kconfig  | 20 +++
>  kernel/crash_core.c   |  7 +++
>  4 files changed, 35 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/admin-guide/kdump/kdump.rst 
> b/Documentation/admin-guide/kdump/kdump.rst
> index 75a9dd98e76e..ae030111e22a 100644
> --- a/Documentation/admin-guide/kdump/kdump.rst
> +++ b/Documentation/admin-guide/kdump/kdump.rst
> @@ -285,7 +285,8 @@ This would mean:
>  2) if the RAM size is between 512M and 2G (exclusive), then reserve 64M
>  3) if the RAM size is larger than 2G, then reserve 128M
>
> -
> +Or you can use crashkernel=auto to choose the crash kernel memory size
> +based on the recommended configuration set for each arch.
>
>  Boot into System Kernel
>  ===
> diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> b/Documentation/admin-guide/kernel-parameters.txt
> index 9e3cdb271d06..a5deda5c85fe 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -747,6 +747,12 @@
> a memory unit (amount[KMG]). See also
> Documentation/admin-guide/kdump/kdump.rst for an 
> example.
>
> +   crashkernel=auto
> +   [KNL] This parameter will set the reserved memory for
> +   the crash kernel based on the value of the 
> CRASH_AUTO_STR
> +   that is the best effort estimation for each arch. See 
> also
> +   arch/Kconfig for further details.
> +
> crashkernel=size[KMG],high
> [KNL, X86-64] range could be above 4G. Allow kernel
> to allocate physical memory region from top, so could
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 24862d15f3a3..23d047548772 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -14,6 +14,26 @@ menu "General architecture-dependent options"
>  config CRASH_CORE
> bool
>
> +config CRASH_AUTO_STR
> +   string "Memory reserved for crash kernel"
> +   depends on CRASH_CORE
> +   default "1G-64G:128M,64G-1T:256M,1T-:512M"
> +   help
> + This configures the reserved memory dependent
> + on the value of System RAM. The syntax is:
> + crashkernel=:[,:,...][@offset]
> + range=start-[end]
> +
> + For example:
> + crashkernel=512M-2G:64M,2G-:128M
> +
> + This would mean:
> +
> + 1) if the RAM is smaller than 512M, then don't reserve anything
> +(this is the "rescue" case)
> + 2) if the RAM size is between 512M and 2G (exclusive), then 
> reserve 64M
> + 3) if the RAM size is larger than 2G, then reserve 128M
> +
>  config KEXEC_CORE
> select CRASH_CORE
> bool
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 825284baaf46..90f9e4bb6704 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -7,6 +7,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  #include 
>  #include 
> @@ -250,6 +251,12 @@ static int __init __parse_crashkernel(char *cmdline,
> if (suffix)
> return parse_crashkernel_suffix(ck_cmdline, crash_size,
> suffix);
> +#ifdef CONFIG_CRASH_AUTO_STR
> +   if (strncmp(ck_cmdline, "auto", 4) == 0) {
> +   ck_cmdline = CONFIG_CRASH_AUTO_STR;
> +   pr_info("Using crashkernel=auto, the size chosen is a best 
> effort estimation.\n");
> +   }
> +#endif
> /*
>  * if the commandline contains a ':', then that's the extended
>  * syntax -- if not, it must be the classic syntax
> --
> 2.27.0
>
>
> ___
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
>

Thanks for helping to push crashkernel=auto upstream.
This patch works well.

Tested-by: Kairui Song 


--
Best Regards,
Kairui Song


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v3 1/1] kernel/crash_core: Add crashkernel=auto for vmcore creation

2021-02-23 Thread Kairui Song
int __init __parse_crashkernel(char *cmdline,
> >   if (suffix)
> >   return parse_crashkernel_suffix(ck_cmdline, crash_size,
> >   suffix);
> > +#ifdef CONFIG_CRASH_AUTO_STR
> > + if (strncmp(ck_cmdline, "auto", 4) == 0) {
> > + ck_cmdline = CONFIG_CRASH_AUTO_STR;
> > + pr_info("Using crashkernel=auto, the size chosen is a best 
> > effort estimation.\n");
> > + }
> > +#endif
> >   /*
> >* if the commandline contains a ':', then that's the extended
> >* syntax -- if not, it must be the classic syntax
> > --
> > 2.27.0
> >
>
>
> ___
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
>


-- 
Best Regards,
Kairui Song


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH 1/1] kexec-tools: fix build on pre 4.4 kernels

2021-02-23 Thread Kairui Song

On 2/5/21 4:15 PM, Federico Pellegrin wrote:

kexec build will fail on older kernels (pre 4.4) as the define
VIDEO_CAPABILITY_64BIT_BASE was not present at that time.

This patch adds it, as per linux/include/uapi/linux/screen_info.h,
if not present.

Signed-off-by: Federico Pellegrin 
---
  kexec/arch/i386/x86-linux-setup.c | 4 
  1 file changed, 4 insertions(+)

diff --git a/kexec/arch/i386/x86-linux-setup.c 
b/kexec/arch/i386/x86-linux-setup.c
index 76e1185..ab54a4a 100644
--- a/kexec/arch/i386/x86-linux-setup.c
+++ b/kexec/arch/i386/x86-linux-setup.c
@@ -37,6 +37,10 @@
  #include "x86-linux-setup.h"
  #include "../../kexec/kexec-syscall.h"
  
+#ifndef VIDEO_CAPABILITY_64BIT_BASE

+#define VIDEO_CAPABILITY_64BIT_BASE (1 << 1) /* Frame buffer base is 
64-bit */
+#endif
+
  void init_linux_parameters(struct x86_linux_param_header *real_mode)
  {
/* Fill in the values that are usually provided by the kernel. */


Thanks for the fix, I didn't notice pre 4.4 kernels don't have this defined
when I submitted that patch.

Reviewed-by: Kairui Song 


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH 1/1] kernel/crash_core.c - Add crashkernel=auto for x86 and ARM

2020-11-20 Thread Kairui Song
On Fri, Nov 20, 2020 at 4:28 AM Saeed Mirzamohammadi
 wrote:
>
> Hi,
>
> And I think crashkernel=auto could be used as an indicator that user
> want the kernel to control the crashkernel size, so some further work
> could be done to adjust the crashkernel more accordingly. eg. when
> memory encryption is enabled, increase the crashkernel value for the
> auto estimation, as it's known to consume more crashkernel memory.
>
> Thanks for the suggestion! I tried to keep it simple and leave it to the user 
> to change Kconfig in case a different range is needed. Based on experience, 
> these ranges work well for most of the regular cases.

Yes, I think the current implementation is a very good start.

There are some use cases, where kernel is expected to reserve more memory, like:
- when memory encryption is enabled, an extra swiotlb size of memory
should be reserved
- on ppc, fadump will expect more memory to be reserved

I believe there are a lot more cases like these.
I tried to come up with some patches to let the kernel reserve more
memory automatically, when such conditions are detected, but changing
the crashkernel= specified value is really weird.

But if we have crashkernel=auto, then having the kernel automatically
reserve more memory will make sense.

> But why not make it arch-independent? This crashkernel=auto idea
> should simply work with every arch.
>
>
> Thanks! I’ll be making it arch-independent in the v2 patch.
>
>
> #include 
> #include 
> @@ -41,6 +42,15 @@ static int __init parse_crashkernel_mem(char *cmdline,
>unsigned long long *crash_base)
> {
>char *cur = cmdline, *tmp;
> +   unsigned long long total_mem = system_ram;
> +
> +   /*
> +* Firmware sometimes reserves some memory regions for it's own use.
> +* so we get less than actual system memory size.
> +* Workaround this by round up the total size to 128M which is
> +* enough for most test cases.
> +*/
> +   total_mem = roundup(total_mem, SZ_128M);
>
>
> I think this rounding may be better moved to the arch specified part
> where parse_crashkernel is called?
>
>
> Thanks for the suggestion. Could you please elaborate why do we need to do 
> that?

Every arch gets their total memory value using different methods,
(just check every parse_crashkernel call, and the system_ram param is
filled in many different ways), so I'm really not sure if this
rounding is always suitable.

>
> Thanks,
> Saeed
>
>
--
Best Regards,
Kairui Song


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH 1/1] kernel/crash_core.c - Add crashkernel=auto for x86 and ARM

2020-11-18 Thread Kairui Song
; help
>   Enable bzImage signature verification support.
>
> -config CRASH_DUMP
> +menuconfig CRASH_DUMP
> bool "kernel crash dumps"
> depends on X86_64 || (X86_32 && HIGHMEM)
> help
> @@ -2049,6 +2049,30 @@ config CRASH_DUMP
>   (CONFIG_RELOCATABLE=y).
>   For more details see Documentation/admin-guide/kdump/kdump.rst
>
> +if CRASH_DUMP
> +
> +config CRASH_AUTO_STR
> +string "Memory reserved for crash kernel" if X86_64
> +   depends on CRASH_DUMP
> +default "1G-64G:128M,64G-1T:256M,1T-:512M"
> +   help
> + This configures the reserved memory dependent
> + on the value of System RAM. The syntax is:
> + crashkernel=:[,:,...][@offset]
> + range=start-[end]
> +
> + For example:
> + crashkernel=512M-2G:64M,2G-:128M
> +
> + This would mean:
> +
> + 1) if the RAM is smaller than 512M, then don't reserve anything
> +(this is the "rescue" case)
> + 2) if the RAM size is between 512M and 2G (exclusive), then 
> reserve 64M
> + 3) if the RAM size is larger than 2G, then reserve 128M
> +
> +endif # CRASH_DUMP
> +
>  config KEXEC_JUMP
> bool "kexec jump"
> depends on KEXEC && HIBERNATION
> diff --git a/arch/x86/configs/x86_64_defconfig 
> b/arch/x86/configs/x86_64_defconfig
> index 9936528e1939..7a87fbecf40b 100644
> --- a/arch/x86/configs/x86_64_defconfig
> +++ b/arch/x86/configs/x86_64_defconfig
> @@ -33,6 +33,7 @@ CONFIG_EFI_MIXED=y
>  CONFIG_HZ_1000=y
>  CONFIG_KEXEC=y
>  CONFIG_CRASH_DUMP=y
> +# CONFIG_CRASH_AUTO_STR is not set
>  CONFIG_HIBERNATION=y
>  CONFIG_PM_DEBUG=y
>  CONFIG_PM_TRACE_RTC=y
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 106e4500fd53..a44cd9cc12c4 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -7,6 +7,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  #include 
>  #include 
> @@ -41,6 +42,15 @@ static int __init parse_crashkernel_mem(char *cmdline,
> unsigned long long *crash_base)
>  {
> char *cur = cmdline, *tmp;
> +   unsigned long long total_mem = system_ram;
> +
> +   /*
> +* Firmware sometimes reserves some memory regions for it's own use.
> +* so we get less than actual system memory size.
> +* Workaround this by round up the total size to 128M which is
> +* enough for most test cases.
> +*/
> +   total_mem = roundup(total_mem, SZ_128M);

I think this rounding may be better moved to the arch specified part
where parse_crashkernel is called?

>
> /* for each entry of the comma-separated list */
> do {
> @@ -85,13 +95,13 @@ static int __init parse_crashkernel_mem(char *cmdline,
> return -EINVAL;
> }
> cur = tmp;
> -   if (size >= system_ram) {
> +   if (size >= total_mem) {
> pr_warn("crashkernel: invalid size\n");
> return -EINVAL;
> }
>
> /* match ? */
> -   if (system_ram >= start && system_ram < end) {
> +   if (total_mem >= start && total_mem < end) {
>     *crash_size = size;
> break;
> }
> @@ -250,6 +260,12 @@ static int __init __parse_crashkernel(char *cmdline,
> if (suffix)
> return parse_crashkernel_suffix(ck_cmdline, crash_size,
> suffix);
> +#ifdef CONFIG_CRASH_AUTO_STR
> +   if (strncmp(ck_cmdline, "auto", 4) == 0) {
> +   ck_cmdline = CONFIG_CRASH_AUTO_STR;
> +   pr_info("Using crashkernel=auto, the size chosen is a best 
> effort estimation.\n");
> +   }
> +#endif
> /*
>  * if the commandline contains a ':', then that's the extended
>  * syntax -- if not, it must be the classic syntax
> --
> 2.18.4
>


--
Best Regards,
Kairui Song


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/2] hyperv_fb: Update screen_info after removing old framebuffer

2020-10-14 Thread Kairui Song
On a gen2 HyperV VM, hyperv_fb will remove the old framebuffer. The
newly allocated framebuffer address could be at a different location,
and it's no longer a VGA framebuffer. Update screen_info
so that after kexec, the kernel won't try to reuse the old invalid
framebuffer address as VGA.

Signed-off-by: Kairui Song 
---
 drivers/video/fbdev/hyperv_fb.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c
index 02411d89cb46..e36fb1a0ecdb 100644
--- a/drivers/video/fbdev/hyperv_fb.c
+++ b/drivers/video/fbdev/hyperv_fb.c
@@ -1114,8 +1114,15 @@ static int hvfb_getmem(struct hv_device *hdev, struct 
fb_info *info)
 getmem_done:
remove_conflicting_framebuffers(info->apertures,
KBUILD_MODNAME, false);
-   if (!gen2vm)
+
+   if (gen2vm) {
+   /* framebuffer is reallocated, clear screen_info to avoid 
misuse from kexec */
+   screen_info.lfb_size = 0;
+   screen_info.lfb_base = 0;
+   screen_info.orig_video_isVGA = 0;
+   } else {
pci_dev_put(pdev);
+   }
kfree(info->apertures);
 
return 0;
-- 
2.28.0


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 0/2] x86/hyperv: fix kexec/kdump hang on some VMs

2020-10-14 Thread Kairui Song
On some HyperV machines, if kexec_file_load is used to load the kexec
kernel, the second kernel could hang with the following stacktrace:

[0.591705] efifb: probing for efifb
[0.596869] efifb: framebuffer at 0xf800, using 3072k, total 3072k
[0.605894] efifb: mode is 1024x768x32, linelength=4096, pages=1
[0.617926] efifb: scrolling: redraw
[0.622715] efifb: Truecolor: size=8:8:8:8, shift=24:16:8:0
[   28.039046] watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [swapper/0:1]
[   28.039046] Modules linked in:
[   28.039046] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.18.0-230.el8.x86_64 
#1
[   28.039046] Hardware name: Microsoft Corporation Virtual Machine/Virtual 
Machine, BIOS Hyper-V UEFI Release v4.0 12/17/2019
[   28.039046] RIP: 0010:cfb_imageblit+0x450/0x4c0
[   28.039046] Code: 89 f8 b9 08 00 00 00 48 89 04 24 eb 2d 41 0f be 30 29 e9 
4c 8d 5f 04 d3 fe 44 21 ee 41 8b 04 b6 44 21 c8 89 c6 44 31 d6 89 37 <85> c9 75 
09 49 83 c0 01 b9 08 00 00 00 4c 89 df 48 39 df 75 ce 83
[   28.039046] RSP: 0018:c9087830 EFLAGS: 00010246 ORIG_RAX: 
ff12
[   28.039046] RAX:  RBX: c9542000 RCX: 0003
[   28.039046] RDX: 000e RSI:  RDI: c9541bf0
[   28.039046] RBP: 0001 R08: 8880f555c8df R09: 00aa
[   28.039046] R10:  R11: c9541bf4 R12: 1000
[   28.039046] R13: 0001 R14: 81e9a460 R15: 8880f555c880
[   28.039046] FS:  () GS:8880f100() 
knlGS:
[   28.039046] CS:  0010 DS:  ES:  CR0: 80050033
[   28.039046] CR2: 7f7b223b8000 CR3: f3a0a004 CR4: 003606b0
[   28.039046] DR0:  DR1:  DR2: 
[   28.039046] DR3:  DR6: fffe0ff0 DR7: 0400
[   28.039046] Call Trace:
[   28.039046]  bit_putcs+0x2a1/0x550
[   28.039046]  ? fbcon_switch+0x33e/0x5b0
[   28.039046]  ? bit_clear+0x120/0x120
[   28.039046]  fbcon_putcs+0xe7/0x100
[   28.039046]  do_update_region+0x154/0x1a0
[   28.039046]  redraw_screen+0x209/0x240
[   28.039046]  ? vc_do_resize+0x5c9/0x660
[   28.039046]  fbcon_prepare_logo+0x3b3/0x430
[   28.039046]  fbcon_init+0x436/0x630
[   28.039046]  visual_init+0xce/0x130
[   28.039046]  do_bind_con_driver+0x1df/0x2d0
[   28.039046]  do_take_over_console+0x113/0x180
[   28.039046]  do_fbcon_takeover+0x58/0xb0
[   28.039046]  register_framebuffer+0x225/0x2f0
[   28.039046]  efifb_probe.cold.5+0x51a/0x55d
[   28.039046]  platform_drv_probe+0x38/0x90
[   28.039046]  really_probe+0x212/0x440
[   28.039046]  driver_probe_device+0x49/0xc0
[   28.039046]  device_driver_attach+0x50/0x60
[   28.039046]  __driver_attach+0x61/0x130
[   28.039046]  ? device_driver_attach+0x60/0x60
[   28.039046]  bus_for_each_dev+0x77/0xc0
[   28.039046]  ? klist_add_tail+0x57/0x70
[   28.039046]  bus_add_driver+0x14d/0x1e0
[   28.039046]  ? vesafb_driver_init+0x13/0x13
[   28.039046]  ? do_early_param+0x91/0x91
[   28.039046]  driver_register+0x6b/0xb0
[   28.039046]  ? vesafb_driver_init+0x13/0x13
[   28.039046]  do_one_initcall+0x46/0x1c3
[   28.039046]  ? do_early_param+0x91/0x91
[   28.039046]  kernel_init_freeable+0x1b4/0x25d
[   28.039046]  ? rest_init+0xaa/0xaa
[   28.039046]  kernel_init+0xa/0xfa
[   28.039046]  ret_from_fork+0x35/0x40

The root cause is that the hyperv_fb driver will relocate the
framebuffer address in the first kernel, but kexec_file_load simply reuses
the old framebuffer info from boot_params, which is now invalid, so the
second kernel will write to an invalid framebuffer address.

This series fix this problem by:

1. Let kexec_file_load use the updated copy of screen_info.

  Instead of using boot_params.screen_info, use the globally available
  screen_info variable (which is just a copy of
  boot_params.screen_info on x86). This variable could be updated
  by arch independent drivers. Keeping this variable updated should
  be a good way to keep screen_info consistent across kexec.

2. Let hyperv_fb clean the screen_info copy when the boot framebuffer
  is relocated outside the old framebuffer.

  After the relocation, the framebuffer is no longer a VGA
  framebuffer, so just clean it up should be good.

Kairui Song (2):
  x86/kexec: Use up-to-dated screen_info copy to fill boot params
  hyperv_fb: Update screen_info after removing old framebuffer

 arch/x86/kernel/kexec-bzimage64.c | 3 +--
 drivers/video/fbdev/hyperv_fb.c   | 8 
 2 files changed, 9 insertions(+), 2 deletions(-)

-- 
2.28.0


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 1/2] x86/kexec: Use up-to-dated screen_info copy to fill boot params

2020-10-14 Thread Kairui Song
kexec_file_load currently just reuses the old boot_params.screen_info.
But if drivers have changed the hardware state, boot_params.screen_info
could contain invalid info.

For example, the video type might be no longer VGA, or frame buffer
address changed. If kexec kernel keep using the old screen_info,
kexec'ed kernel may attempt to write to an invalid framebuffer
memory region.

There are two screen_info globally available, boot_params.screen_info
and screen_info. Later one is a copy, and could be updated by drivers.

So let kexec_file_load use the updated copy.

Signed-off-by: Kairui Song 
---
 arch/x86/kernel/kexec-bzimage64.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kernel/kexec-bzimage64.c 
b/arch/x86/kernel/kexec-bzimage64.c
index 57c2ecf43134..ce831f9448e7 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -200,8 +200,7 @@ setup_boot_parameters(struct kimage *image, struct 
boot_params *params,
params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch;
 
/* Copying screen_info will do? */
-   memcpy(&params->screen_info, &boot_params.screen_info,
-   sizeof(struct screen_info));
+   memcpy(&params->screen_info, &screen_info, sizeof(struct screen_info));
 
/* Fill in memsize later */
params->screen_info.ext_mem_k = 0;
-- 
2.28.0


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [RFC PATCH 0/3] Add writing support to vmcore for reusing oldmem

2020-09-21 Thread Kairui Song
On Thu, Sep 10, 2020 at 12:43 AM Kairui Song  wrote:
>
> On Wed, Sep 9, 2020 at 10:04 PM Eric W. Biederman  
> wrote:
> >
> > Kairui Song  writes:
> >
> > > Currently vmcore only supports reading, this patch series is an RFC
> > > to add writing support to vmcore. It's x86_64 only yet, I'll add other
> > > architecture later if there is no problem with this idea.
> > >
> > > My purpose of adding writing support is to reuse the crashed kernel's
> > > old memory in kdump kernel, reduce kdump memory pressure, and
> > > allow kdump to run with a smaller crashkernel reservation.
> > >
> > > This is doable because in most cases, after kernel panic, user only
> > > interested in the crashed kernel itself, and userspace/cache/free
> > > memory pages are not dumped. `makedumpfile` is widely used to skip
> > > these pages. Kernel pages usually only take a small part of
> > > the whole old memory. So there will be many reusable pages.
> > >
> > > By adding writing support, userspace then can use these pages as a fast
> > > and temporary storage. This helps reduce memory pressure in many ways.
> > >
> > > For example, I've written a POC program based on this, it will find
> > > the reusable pages, and creates an NBD device which maps to these pages.
> > > The NBD device can then be used as swap, or to hold some temp files
> > > which previouly live in RAM.
> > >
> > > The link of the POC tool: https://github.com/ryncsn/kdumpd
> >
> > A couple of thoughts.
> > 1) Unless I am completely mistaken treating this as a exercise in
> >memory hotplug would be much simpler.
> >
> >AKA just plug in the memory that is not needed as part of the kdump.
> >
> >I see below that you have problems doing this because
> >of fragmentation.  I still think hotplug is doable using some
> >kind of fragmented memory zone.
> >
> > 2) The purpose of the memory reservation is because hardware is
> >still potentially running agains the memory of the old kernel.
> >
> >By the time we have brought up a new kernel enough of the hardware
> >may have been reinitialized that we don't have to worry about
> >hardware randomly dma'ing into the memory used by the old kernel.
> >
> >With IOMMUs and care we may be able to guarantee for some machine
> >configurations it is impossible for DMA to come from some piece of
> >hardware that is present but the kernel does not have a driver
> >loaded for.\
> >
> > I really do not like this approach because it is fundamentlly doing the
> > wrong thing.  Adding write support to read-only drivers.  I do not see
> > anywhere that you even mentioned the hard problem and the reason we
> > reserve memory in the first place.  Hardware spontaneously DMA'ing onto
> > it.
> >
> That POC tool looks ugly for now as it only a draft to prove this
> works, sorry about it.
>
> For the patch, yes, it is expecting IOMMU to lower the chance of
> potential DMA issue, and expecting DMA will not hit userspace/free
> page, or at least won't override a massive amount of reusable old
> memory. And I thought about some solutions for the potential DMA
> issue.
>
> As old memories are used as a block device, which is proxied by
> userspace, so upon each IO, the userspace tool could do an integrity
> check of the corresponding data stored in old mem, and keep multiple
> copies of the data. (eg. use 512M of old memory to hold a 128M block
> device). These copies will be kept far away from each other regarding
> the physical memory location. The reusable old memories are sparse so
> the actual memory containing the data should be also sparse.
> So if some part is corrupted, it is still recoverable. Unless the DMA
> went very wrong and wiped a large region of memory, but if such thing
> happens, it's most likely kernel pages are also being wiped by DMA, so
> the vmcore is already corrupted and kdump may not help. But at least
> it won't fail silently, the userspace tool can still do something like
> dump some available data to an easy to setup target.
>
> And also that's one of the reasons not using old memory as kdump's
> memory directly.
>
> > > It's have been a long time issue that kdump suffers from OOM issue
> > > with limited crashkernel memory. So reusing old memory could be very
> > > helpful.
> >
> > There is a very fine line here between reusing existing code (aka
> > drivers and userspace) and 

Re: [RFC PATCH 0/3] Add writing support to vmcore for reusing oldmem

2020-09-09 Thread Kairui Song
On Wed, Sep 9, 2020 at 10:04 PM Eric W. Biederman  wrote:
>
> Kairui Song  writes:
>
> > Currently vmcore only supports reading, this patch series is an RFC
> > to add writing support to vmcore. It's x86_64 only yet, I'll add other
> > architecture later if there is no problem with this idea.
> >
> > My purpose of adding writing support is to reuse the crashed kernel's
> > old memory in kdump kernel, reduce kdump memory pressure, and
> > allow kdump to run with a smaller crashkernel reservation.
> >
> > This is doable because in most cases, after kernel panic, user only
> > interested in the crashed kernel itself, and userspace/cache/free
> > memory pages are not dumped. `makedumpfile` is widely used to skip
> > these pages. Kernel pages usually only take a small part of
> > the whole old memory. So there will be many reusable pages.
> >
> > By adding writing support, userspace then can use these pages as a fast
> > and temporary storage. This helps reduce memory pressure in many ways.
> >
> > For example, I've written a POC program based on this, it will find
> > the reusable pages, and creates an NBD device which maps to these pages.
> > The NBD device can then be used as swap, or to hold some temp files
> > which previouly live in RAM.
> >
> > The link of the POC tool: https://github.com/ryncsn/kdumpd
>
> A couple of thoughts.
> 1) Unless I am completely mistaken treating this as a exercise in
>memory hotplug would be much simpler.
>
>AKA just plug in the memory that is not needed as part of the kdump.
>
>I see below that you have problems doing this because
>of fragmentation.  I still think hotplug is doable using some
>kind of fragmented memory zone.
>
> 2) The purpose of the memory reservation is because hardware is
>still potentially running agains the memory of the old kernel.
>
>By the time we have brought up a new kernel enough of the hardware
>may have been reinitialized that we don't have to worry about
>hardware randomly dma'ing into the memory used by the old kernel.
>
>With IOMMUs and care we may be able to guarantee for some machine
>configurations it is impossible for DMA to come from some piece of
>hardware that is present but the kernel does not have a driver
>loaded for.\
>
> I really do not like this approach because it is fundamentlly doing the
> wrong thing.  Adding write support to read-only drivers.  I do not see
> anywhere that you even mentioned the hard problem and the reason we
> reserve memory in the first place.  Hardware spontaneously DMA'ing onto
> it.
>
That POC tool looks ugly for now as it is only a draft to prove this
works, sorry about that.

For the patch, yes, it expects the IOMMU to lower the chance of
potential DMA issues, and expects that DMA will not hit userspace/free
pages, or at least won't overwrite a massive amount of reusable old
memory. And I thought about some solutions for the potential DMA
issue.

As old memories are used as a block device, which is proxied by
userspace, so upon each IO, the userspace tool could do an integrity
check of the corresponding data stored in old mem, and keep multiple
copies of the data. (eg. use 512M of old memory to hold a 128M block
device). These copies will be kept far away from each other regarding
the physical memory location. The reusable old memories are sparse so
the actual memory containing the data should be also sparse.
So if some part is corrupted, it is still recoverable. Unless the DMA
went very wrong and wiped a large region of memory, but if such thing
happens, it's most likely kernel pages are also being wiped by DMA, so
the vmcore is already corrupted and kdump may not help. But at least
it won't fail silently, the userspace tool can still do something like
dump some available data to an easy to setup target.

And also that's one of the reasons not using old memory as kdump's
memory directly.

> > It's have been a long time issue that kdump suffers from OOM issue
> > with limited crashkernel memory. So reusing old memory could be very
> > helpful.
>
> There is a very fine line here between reusing existing code (aka
> drivers and userspace) and doing something that should work.
>
> It might make sense to figure out what is using so much memory
> that an OOM is triggered.
>
> Ages ago I did something that was essentially dumping the kernels printk
> buffer to the serial console in case of a crash and I had things down to
> something comparatively miniscule like 8M or less.
>
> My memory is that historically it has been high performance scsi raid
> drivers or something like that, that are behind the need to have

[RFC PATCH 3/3] x86_64: implement copy_to_oldmem_page

2020-09-09 Thread Kairui Song
The previous commit introduced writing support for vmcore; it requires
a per-architecture implementation of the writing function.

Signed-off-by: Kairui Song 
---
 arch/x86/kernel/crash_dump_64.c | 49 +++--
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045e82e8945b..ec80da75b287 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -13,7 +13,7 @@
 
 static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
  unsigned long offset, int userbuf,
- bool encrypted)
+ bool encrypted, bool is_write)
 {
void  *vaddr;
 
@@ -28,13 +28,25 @@ static ssize_t __copy_oldmem_page(unsigned long pfn, char 
*buf, size_t csize,
if (!vaddr)
return -ENOMEM;
 
-   if (userbuf) {
-   if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
-   iounmap((void __iomem *)vaddr);
-   return -EFAULT;
+   if (is_write) {
+   if (userbuf) {
+   if (copy_from_user(vaddr + offset, (void __user *)buf, 
csize)) {
+   iounmap((void __iomem *)vaddr);
+   return -EFAULT;
+   }
+   } else {
+   memcpy(vaddr + offset, buf, csize);
}
-   } else
-   memcpy(buf, vaddr + offset, csize);
+   } else {
+   if (userbuf) {
+   if (copy_to_user((void __user *)buf, vaddr + offset, 
csize)) {
+   iounmap((void __iomem *)vaddr);
+   return -EFAULT;
+   }
+   } else {
+   memcpy(buf, vaddr + offset, csize);
+   }
+   }
 
set_iounmap_nonlazy();
iounmap((void __iomem *)vaddr);
@@ -57,7 +69,7 @@ static ssize_t __copy_oldmem_page(unsigned long pfn, char 
*buf, size_t csize,
 ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
 unsigned long offset, int userbuf)
 {
-   return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false);
+   return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false, 
false);
 }
 
 /**
@@ -68,7 +80,26 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, 
size_t csize,
 ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
   unsigned long offset, int userbuf)
 {
-   return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
+   return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true, 
false);
+}
+
+/**
+ * copy_to_oldmem_page - similar to copy_oldmem_page but in opposite direction.
+ */
+ssize_t copy_to_oldmem_page(unsigned long pfn, char *src, size_t csize,
+   unsigned long offset, int userbuf)
+{
+   return __copy_oldmem_page(pfn, src, csize, offset, userbuf, false, 
true);
+}
+
+/**
+ * copy_to_oldmem_page_encrypted - similar to copy_oldmem_page_encrypted but
+ * in opposite direction.
+ */
+ssize_t copy_to_oldmem_page_encrypted(unsigned long pfn, char *src, size_t 
csize,
+   unsigned long offset, int userbuf)
+{
+   return __copy_oldmem_page(pfn, src, csize, offset, userbuf, true, true);
 }
 
 ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
-- 
2.26.2


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[RFC PATCH 2/3] vmcore: Add interface to write to old mem

2020-09-09 Thread Kairui Song
vmcore is used as the interface to access crashed kernel's memory in
kdump, and currently vmcore only supports reading.

Adding writing support is useful for enabling userspace to make better
use of the old memory.

For kdump, `makedumpfile` is widely used to reduce the dumped vmcore
size, and in most setups, it will drop user space memory and caches. This
means these memory pages are reusable.

Kdump runs in a limited pre-reserved memory region, so if these old memory
pages are reused, it can help reduce memory pressure in the kdump kernel,
hence allowing the first kernel to reserve less memory for kdump.

Adding write support to vmcore is the first step, then user space can
do IO on the old mem. There are multiple ways to reuse the memory, for
example, userspace can register a NBD device, and redirect the IO on the
device to old memory. The NBD device can be used as swap, or used to
hold some temp files.

Signed-off-by: Kairui Song 
---
 fs/proc/vmcore.c   | 129 +
 include/linux/crash_dump.h |  18 --
 2 files changed, 131 insertions(+), 16 deletions(-)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 124c2066f3e5..23acc0f2ecd7 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -103,9 +103,9 @@ static int pfn_is_ram(unsigned long pfn)
 }
 
 /* Reads a page from the oldmem device from given offset. */
-ssize_t read_from_oldmem(char *buf, size_t count,
-u64 *ppos, int userbuf,
-bool encrypted)
+static ssize_t oldmem_rw_page(char *buf, size_t count,
+ u64 *ppos, int userbuf,
+ bool encrypted, bool is_write)
 {
unsigned long pfn, offset;
size_t nr_bytes, to_copy = count;
@@ -119,20 +119,33 @@ ssize_t read_from_oldmem(char *buf, size_t count,
 
/* If pfn is not ram, return zeros for sparse dump files */
if (pfn_is_ram(pfn) == 0) {
-   memset(buf, 0, nr_bytes);
-   } else {
-   if (encrypted)
-   tmp = copy_oldmem_page_encrypted(pfn, buf,
-nr_bytes,
-offset,
-userbuf);
+   if (is_write)
+   return -EINVAL;
else
-   tmp = copy_oldmem_page(pfn, buf, nr_bytes,
-  offset, userbuf);
+   memset(buf, 0, nr_bytes);
+   } else {
+   if (encrypted) {
+   tmp = is_write ?
+   copy_to_oldmem_page_encrypted(pfn, buf,
+ nr_bytes,
+ offset,
+ userbuf) :
+   copy_oldmem_page_encrypted(pfn, buf,
+  nr_bytes,
+  offset,
+  userbuf);
+   } else {
+   tmp = is_write ?
+   copy_to_oldmem_page(pfn, buf, nr_bytes,
+   offset, userbuf) :
+   copy_oldmem_page(pfn, buf, nr_bytes,
+   offset, userbuf);
+   }
 
if (tmp < 0)
return tmp;
}
+
*ppos += nr_bytes;
buf += nr_bytes;
to_copy -= nr_bytes;
@@ -143,6 +156,22 @@ ssize_t read_from_oldmem(char *buf, size_t count,
return count;
 }
 
+/* Reads a page from the oldmem device from given offset. */
+ssize_t read_from_oldmem(char *buf, size_t count,
+u64 *ppos, int userbuf,
+bool encrypted)
+{
+   return oldmem_rw_page(buf, count, ppos, userbuf, encrypted, 0);
+}
+
+/* Writes a page to the oldmem device of given offset. */
+ssize_t write_to_oldmem(char *buf, size_t count,
+   u64 *ppos, int userbuf,
+   bool encrypted)
+{
+   return oldmem_rw_page(buf, count, ppos, userbuf, encrypted, 1);
+}
+
 /*
  * Architectures may override this function to allocate ELF header in 2nd 
kernel
  */
@@ -184,6 +213,26 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct 
*vma,
return remap_pfn_range(vma, from, pfn, size, prot);
 }
 
+/*
+ * Architectures which support wr

[RFC PATCH 0/3] Add writing support to vmcore for reusing oldmem

2020-09-09 Thread Kairui Song
Currently vmcore only supports reading, this patch series is an RFC
to add writing support to vmcore. It's x86_64 only yet, I'll add other
architecture later if there is no problem with this idea.

My purpose of adding writing support is to reuse the crashed kernel's
old memory in kdump kernel, reduce kdump memory pressure, and
allow kdump to run with a smaller crashkernel reservation.

This is doable because in most cases, after a kernel panic, users are only
interested in the crashed kernel itself, and userspace/cache/free
memory pages are not dumped. `makedumpfile` is widely used to skip
these pages. Kernel pages usually only take a small part of
the whole old memory. So there will be many reusable pages.

By adding writing support, userspace then can use these pages as a fast
and temporary storage. This helps reduce memory pressure in many ways.

For example, I've written a POC program based on this: it finds
the reusable pages and creates an NBD device which maps to these pages.
The NBD device can then be used as swap, or to hold some temp files
which previously lived in RAM.

The link of the POC tool: https://github.com/ryncsn/kdumpd

I tested it on x86_64 on the latest Fedora by using it as swap with
the following steps in the kdump kernel:

  1. Install this tool in kdump initramfs
  2. Execute following command in kdump:
 /sbin/modprobe nbd nbds_max=1
 /bin/kdumpd &
 /sbin/mkswap /dev/nbd0
 /sbin/swapon /dev/nbd0
  3. Observe the swap is being used:
 SwapTotal:131068 kB
 SwapFree: 121852 kB

It helped to reduce the crashkernel from 168M to 110M for a successful
kdump run over NFSv3. There are still many workitems that could be done
based on this idea, eg. move the initramfs content to the old memory,
which may help reduce another ~10-20M of memory.

It has been a long-standing issue that kdump suffers from OOM
with limited crashkernel memory. So reusing old memory could be very
helpful.

This method has its limitations:
- Swap only works for userspace. But kdump userspace is a major memory
  consumer, so in general this should be helpful enough.
- For users who want to dump the whole memory area, this won't help as
  there are no reusable pages.

I've tried other ways to improve the crashkernel value, e.g.
- Reserve some smaller memory segments in the first kernel for crashkernel: It's
  only a supplement to the default crashkernel reservation and only makes the
  crashkernel value more adjustable, still not solving the real problem.

- Reuse old memory, but hotplug chunks of reusable old memory into
  the kdump kernel's memory:
  It's hard to find large chunks of contiguous memory, especially on
  systems with heavy workloads; the reusable regions could be very
  fragmented. So it can only hotplug small fragments of memory,
  which looks hackish, and may have a high page table overhead.

- Implement the old-memory-based block device as a kernel
  module. It doesn't look good to have a module for this sole
  usage, and it doesn't have much performance/implementation advantage
  compared to this RFC.

Besides, keeping all the complex logic of parsing and reusing old memory
in userspace seems a better idea.

And as a plus, this could make it more doable and reasonable to
have a crashkernel=auto param. If there is a swap, then userspace
will have less memory pressure. crashkernel=auto can focus on the
kernel usage.

Kairui Song (3):
  vmcore: simplify read_from_olemem
  vmcore: Add interface to write to old mem
  x86_64: implement copy_to_oldmem_page

 arch/x86/kernel/crash_dump_64.c |  49 --
 fs/proc/vmcore.c| 154 ++--
 include/linux/crash_dump.h  |  18 +++-
 3 files changed, 180 insertions(+), 41 deletions(-)

-- 
2.26.2


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[RFC PATCH 1/3] vmcore: simplify read_from_olemem

2020-09-09 Thread Kairui Song
Simplify the code logic; this also helps reduce object size and stack usage.

Stack usage:
  Before: fs/proc/vmcore.c:106:9:read_from_oldmem.part.0  80 static
  fs/proc/vmcore.c:106:9:read_from_oldmem 16 static
  After:  fs/proc/vmcore.c:106:9:read_from_oldmem 80 static

Size of vmcore.o:
  textdata bss dec hex filename
  Before: 7677 109  8878741ec2 fs/proc/vmcore.o
  After:  7669 109  8878661eba fs/proc/vmcore.o

Signed-off-by: Kairui Song 
---
 fs/proc/vmcore.c | 27 ++-
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index c3a345c28a93..124c2066f3e5 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -108,25 +108,19 @@ ssize_t read_from_oldmem(char *buf, size_t count,
 bool encrypted)
 {
unsigned long pfn, offset;
-   size_t nr_bytes;
-   ssize_t read = 0, tmp;
+   size_t nr_bytes, to_copy = count;
+   ssize_t tmp;
 
-   if (!count)
-   return 0;
-
-   offset = (unsigned long)(*ppos % PAGE_SIZE);
+   offset = (unsigned long)(*ppos & (PAGE_SIZE - 1));
pfn = (unsigned long)(*ppos / PAGE_SIZE);
 
-   do {
-   if (count > (PAGE_SIZE - offset))
-   nr_bytes = PAGE_SIZE - offset;
-   else
-   nr_bytes = count;
+   while (to_copy) {
+   nr_bytes = min(to_copy, PAGE_SIZE - offset);
 
/* If pfn is not ram, return zeros for sparse dump files */
-   if (pfn_is_ram(pfn) == 0)
+   if (pfn_is_ram(pfn) == 0) {
memset(buf, 0, nr_bytes);
-   else {
+   } else {
if (encrypted)
tmp = copy_oldmem_page_encrypted(pfn, buf,
 nr_bytes,
@@ -140,14 +134,13 @@ ssize_t read_from_oldmem(char *buf, size_t count,
return tmp;
}
*ppos += nr_bytes;
-   count -= nr_bytes;
buf += nr_bytes;
-   read += nr_bytes;
+   to_copy -= nr_bytes;
++pfn;
offset = 0;
-   } while (count);
+   }
 
-   return read;
+   return count;
 }
 
 /*
-- 
2.26.2


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel

2020-07-23 Thread Kairui Song
On Thu, Jul 23, 2020 at 8:00 AM Bjorn Helgaas  wrote:
>
> On Wed, Jul 22, 2020 at 03:50:48PM -0600, Jerry Hoemann wrote:
> > On Wed, Jul 22, 2020 at 10:21:23AM -0500, Bjorn Helgaas wrote:
> > > On Wed, Jul 22, 2020 at 10:52:26PM +0800, Kairui Song wrote:
>
> > > > I think I didn't make one thing clear, The PCI UR error never arrives
> > > > in kernel, it's the iLo BMC on that HPE machine caught the error, and
> > > > send kernel an NMI. kernel is panicked by NMI, I'm still trying to
> > > > figure out why the NMI hanged kernel, even with panic=-1,
> > > > panic_on_io_nmi, panic_on_unknown_nmi all set. But if we can avoid the
> > > > NMI by shutdown the devices in right order, that's also a solution.
>
> ACPI v6.3, chapter 18, does mention NMIs several times, e.g., Table
> 18-394 and sec 18.4.  I'm not familiar enough with APEI to know
> whether Linux correctly supports all those cases.  Maybe this is a
> symptom that we don't?
>
> > > I'm not sure how much sympathy to have for this situation.  A PCIe UR
> > > is fatal for the transaction and maybe even the device, but from the
> > > overall system point of view, it *should* be a recoverable error and
> > > we shouldn't panic.
> > >
> > > Errors like that should be reported via the normal AER or ACPI/APEI
> > > mechanisms.  It sounds like in this case, the platform has decided
> > > these aren't enough and it is trying to force a reboot?  If this is
> > > "special" platform behavior, I'm not sure how much we need to cater
> > > for it.
> >
> > Are these AER errors the type processed by the GHES code?
>
> My understanding from ACPI v6.3, sec 18.3.2, is that the Hardware
> Error Source Table may contain Error Source Descriptors of types like:
>
>   IA-32 Machine Check Exception
>   IA-32 Corrected Machine Check
>   IA-32 Non-Maskable Interrupt
>   PCIe Root Port AER
>   PCIe Device AER
>   Generic Hardware Error Source (GHES)
>   Hardware Error Notification
>   IA-32 Deferred Machine Check
>
> I would naively expect PCIe UR errors to be reported via one of the
> PCIe Error Sources, not GHES, but maybe there's some reason to use
> GHES.
>
> The kernel should already know how to deal with the PCIe AER errors,
> but we'd have to add new device-specific code to handle things
> reported via GHES, along the lines of what Shiju is doing here:
>
>   https://lore.kernel.org/r/20200722104245.1060-1-shiju.j...@huawei.com
>
> > I'll note that RedHat runs their crash kernel with:  hest_disable.
> > So, the ghes code is disabled in the crash kernel.
>
> That would disable all the HEST error sources, including the PCIe AER
> ones as well as GHES ones.  If we turn off some of the normal error
> handling mechanisms, I guess we have to expect that some errors won't
> be handled correctly.


Hi, that's true, hest_disable is added by default to reduce memory
usage in special cases.
But even if I remove hest_disable and have GHES enabled, the
hanging issue still exists: according to the iLO console log, it's still
sending an NMI to the kernel, and the kernel hangs.

The NMI doesn't hang the kernel 100 percent of the time; sometimes it will
just panic and reboot, and sometimes it hangs. This behavior is the same
with or without GHES enabled.

Maybe this is a "special platform behavior". I'm also not 100 percent
sure if/how we can cover this in a good way for now.
I'll try to figure out how the NMI actually hangs the kernel and see if
it could be fixed in other ways.

-- 
Best Regards,
Kairui Song


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel

2020-07-22 Thread Kairui Song
On Fri, Mar 6, 2020 at 5:38 PM Baoquan He  wrote:
>
> On 03/04/20 at 08:53pm, Deepa Dinamani wrote:
> > On Wed, Mar 4, 2020 at 7:53 PM Baoquan He  wrote:
> > >
> > > +Joerg to CC.
> > >
> > > On 03/03/20 at 01:01pm, Deepa Dinamani wrote:
> > > > I looked at this some more. Looks like we do not clear irqs when we do
> > > > a kexec reboot. And, the bootup code maintains the same table for the
> > > > kexec-ed kernel. I'm looking at the following code in
> > >
> > > I guess you are talking about kdump reboot here, right? Kexec and kdump
> > > boot take the similar mechanism, but differ a little.
> >
> > Right I meant kdump kernel here. And, clearly the is_kdump_kernel() case 
> > below.
> >
> > >
> > > > intel_irq_remapping.c:
> > > >
> > > > if (ir_pre_enabled(iommu)) {
> > > > if (!is_kdump_kernel()) {
> > > > pr_warn("IRQ remapping was enabled on %s but
> > > > we are not in kdump mode\n",
> > > > iommu->name);
> > > > clear_ir_pre_enabled(iommu);
> > > > iommu_disable_irq_remapping(iommu);
> > > > } else if (iommu_load_old_irte(iommu))
> > >
> > > Here, it's for kdump kernel to copy old ir table from 1st kernel.
> >
> > Correct.
> >
> > > > pr_err("Failed to copy IR table for %s from
> > > > previous kernel\n",
> > > >iommu->name);
> > > > else
> > > > pr_info("Copied IR table for %s from previous 
> > > > kernel\n",
> > > > iommu->name);
> > > > }
> > > >
> > > > Would cleaning the interrupts(like in the non kdump path above) just
> > > > before shutdown help here? This should clear the interrupts enabled
> > > > for all the devices in the current kernel. So when kdump kernel
> > > > starts, it starts clean. This should probably help block out the
> > > > interrupts from a device that does not have a driver.
> > >
> > > I think stopping those devices out of control from continue sending
> > > interrupts is a good idea. While not sure if only clearing the interrupt
> > > will be enough. Those devices which will be initialized by their driver
> > > will brake, but devices which drivers are not loaded into kdump kernel
> > > may continue acting. Even though interrupts are cleaning at this time,
> > > the on-flight DMA could continue triggerring interrupt since the ir
> > > table and iopage table are rebuilt.
> >
> > This should be handled by the IOMMU, right? And, hence you are getting
> > UR. This seems like the correct execution flow to me.
>
> Sorry for late reply.
> Yes, this is initializing IOMMU device.
>
> >
> > Anyway, you could just test this theory by removing the
> > is_kdump_kernel() check above and see if it solves your problem.
> > Obviously, check the VT-d spec to figure out the exact sequence to
> > turn off the IR.
>
> OK, I will talk to Kairui and get a machine to test it. Thanks for your
> nice idea, if you have a draft patch, we are happy to test it.
>
> >
> > Note that the device that is causing the problem here is a legit
> > device. We want to have interrupts from devices we don't know about
> > blocked anyway because we can have compromised firmware/ devices that
> > could cause a DoS attack. So blocking the unwanted interrupts seems
> > like the right thing to do here.
>
> Kairui said it's a device which driver is not loaded in kdump kernel
> because it's not needed by kdump. We try to only load kernel modules
> which are needed, e.g one device is the dump target, its driver has to
> be loaded in. In this case, the device is more like a out of control
> device to kdump kernel.
>

Hi Bao, Deepa, sorry for this very late response. The test machine was
not available for some time, and I have now resumed work on this problem.

As for the workaround mentioned by Deepa (removing the is_kdump_kernel()
check), it didn't work; the machine still hangs upon shutdown.
The devices that were left in an unknown state and keep sending interrupts
could be a problem, but that is unrelated to this hanging problem.

I think I didn't make one thing clear: the PCI UR error never arrives
in the kernel. It's the iLO BMC on that HPE machine that caught the error
and sent the kernel an NMI. The kernel is panicked by the NMI; I'm still
trying to figure out why the NMI hung the kernel, even with panic=-1,
panic_on_io_nmi, panic_on_unknown_nmi all set. But if we can avoid the
NMI by shutting down the devices in the right order, that's also a solution.

--
Best Regards,
Kairui Song


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH] crash_dump: remove saved_max_pfn

2020-03-30 Thread Kairui Song
This variable is no longer used.

saved_max_pfn was originally introduced in commit 92aa63a5a1bf ("[PATCH]
kdump: Retrieve saved max pfn"), and was used to make sure that the user
does not try to read the physical memory beyond saved_max_pfn. But since
commit 921d58c0e699 ("vmcore: remove saved_max_pfn check")
it's no longer used for the check.

The only user left was the Calgary IOMMU, which started using it in
commit 95b68dec0d52 ("calgary iommu: use the first kernels TCE tables
in kdump"). But recently, in commit 90dc392fc445 ("x86: Remove
the calgary IOMMU driver"), the Calgary IOMMU was removed, so this variable
no longer has any user.

So just remove it.

Signed-off-by: Kairui Song 
---
 arch/x86/kernel/e820.c | 8 
 include/linux/crash_dump.h | 2 --
 kernel/crash_dump.c| 6 --
 3 files changed, 16 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c5399e80c59c..4d13c57f370a 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -910,14 +910,6 @@ static int __init parse_memmap_one(char *p)
return -EINVAL;
 
if (!strncmp(p, "exactmap", 8)) {
-#ifdef CONFIG_CRASH_DUMP
-   /*
-* If we are doing a crash dump, we still need to know
-* the real memory size before the original memory map is
-* reset.
-*/
-   saved_max_pfn = e820__end_of_ram_pfn();
-#endif
e820_table->nr_entries = 0;
userdef = 1;
return 0;
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 4664fc1871de..bc156285d097 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -97,8 +97,6 @@ extern void unregister_oldmem_pfn_is_ram(void);
 static inline bool is_kdump_kernel(void) { return 0; }
 #endif /* CONFIG_CRASH_DUMP */
 
-extern unsigned long saved_max_pfn;
-
 /* Device Dump information to be filled by drivers */
 struct vmcoredd_data {
char dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Unique name of the dump */
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index 9c23ae074b40..92da32275af5 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -5,12 +5,6 @@
 #include 
 #include 
 
-/*
- * If we have booted due to a crash, max_pfn will be a very low value. We need
- * to know the amount of memory that the previous kernel used.
- */
-unsigned long saved_max_pfn;
-
 /*
  * stores the physical address of elf header of crash image
  *
-- 
2.25.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH] swiotlb: Allow swiotlb to live at pre-defined address

2020-03-29 Thread Kairui Song
ff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> > index c19379fabd20..83da0caa2f93 100644
> > --- a/kernel/dma/swiotlb.c
> > +++ b/kernel/dma/swiotlb.c
> > @@ -46,6 +46,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >
> >  #define CREATE_TRACE_POINTS
> >  #include 
> > @@ -102,6 +103,12 @@ unsigned int max_segment;
> >  #define INVALID_PHYS_ADDR (~(phys_addr_t)0)
> >  static phys_addr_t *io_tlb_orig_addr;
> >
> > +/*
> > + * The TLB phys addr may be defined on the command line. Store it here if 
> > it is.
> > + */
> > +static phys_addr_t io_tlb_addr = INVALID_PHYS_ADDR;
> > +
> > +
> >  /*
> >   * Protect the above data structures in the map and unmap calls
> >   */
> > @@ -119,11 +126,23 @@ setup_io_tlb_npages(char *str)
> >   }
> >   if (*str == ',')
> >   ++str;
> > - if (!strcmp(str, "force")) {
> > + if (!strncmp(str, "force", 5)) {
> >   swiotlb_force = SWIOTLB_FORCE;
> > - } else if (!strcmp(str, "noforce")) {
> > + str += 5;
> > + } else if (!strncmp(str, "noforce", 7)) {
> >   swiotlb_force = SWIOTLB_NO_FORCE;
> >   io_tlb_nslabs = 1;
> > + str += 7;
> > + }
> > +
> > + if (*str == ',')
> > + ++str;
> > + if (!strncmp(str, "addr=", 5)) {
> > + char *addrstr = str + 5;
> > +
> > + io_tlb_addr = kstrtoul(addrstr, 0, &str);
> > + if (addrstr == str)
> > + io_tlb_addr = INVALID_PHYS_ADDR;
> >   }
> >
> >   return 0;
> > @@ -239,6 +258,25 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned 
> > long nslabs, int verbose)
> >   return 0;
> >  }
> >
> > +static int __init swiotlb_init_io(int verbose, unsigned long bytes)
> > +{
> > + unsigned __iomem char *vstart;
> > +
> > + if (io_tlb_addr == INVALID_PHYS_ADDR)
> > + return -EINVAL;
> > +
> > + vstart = memremap(io_tlb_addr, bytes, MEMREMAP_WB);
> > + if (!vstart)
> > + return -EINVAL;
> > +
> > + if (swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) {
> > + memunmap(vstart);
> > + return -EINVAL;
> > + }
> > +
> > + return 0;
> > +}
> > +
> >  /*
> >   * Statically reserve bounce buffer space and initialize bounce buffer data
> >   * structures for the software IO TLB used to implement the DMA API.
> > @@ -257,6 +295,10 @@ swiotlb_init(int verbose)
> >
> >   bytes = io_tlb_nslabs << IO_TLB_SHIFT;
> >
> > + /* Map IO TLB from device memory */
> > + if (!swiotlb_init_io(verbose, bytes))
> > + return;
> > +
> >   /* Get IO TLB memory from the low pages */
> >   vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE);
> >   if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
> > --
> > 2.16.4
> >
> >
> >
> >
> > Amazon Development Center Germany GmbH
> > Krausenstr. 38
> > 10117 Berlin
> > Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
> > Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
> > Sitz: Berlin
> > Ust-ID: DE 289 237 879
> >
> >
> >
>
> Thanks
> Dave
>


-- 
Best Regards,
Kairui Song


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel

2020-02-24 Thread Kairui Song
Hi,

Thanks for the reply. I don't have any better idea than this RFC patch
yet. The patch is on hold because the previous discussion suggested it
only works around the problem; the real fix would be to let the crash
kernel load every required kernel module and reset whatever hardware is
not in a good state. However, users may struggle to find out which
drivers are actually needed, and it's not practical to load all drivers
in the kdump kernel (in fact, kdump has been trying to load as few
drivers as possible to save memory).

So, as Dave Y suggested in another reply, would it be better to apply
this quirk with a kernel parameter controlling it? If such a problem
happens, the option could be turned on as a fix.


On Sun, Feb 23, 2020 at 12:59 AM Bjorn Helgaas  wrote:
>
> [+cc Khalid, Deepa, Randy, Dave, Myron]
>
> On Thu, Dec 26, 2019 at 03:21:18AM +0800, Kairui Song wrote:
> > There are reports about kdump hang upon reboot on some HPE machines,
> > kernel hanged when trying to shutdown a PCIe port, an uncorrectable
> > error occurred and crashed the system.
>
> Did we ever make progress on this?  This definitely sounds like a
> problem that needs to be fixed, but I don't see a resolution here.
>
> > On the machine I can reproduce this issue, part of the topology
> > looks like this:
> >
> > [:00]-+-00.0  Intel Corporation Xeon E7 v3/Xeon E5 v3/Core i7 DMI2
> >   +-01.0-[02]--
> >   +-01.1-[05]--
> >   +-02.0-[06]--+-00.0  Emulex Corporation OneConnect NIC (Skyhawk)
> >   |+-00.1  Emulex Corporation OneConnect NIC (Skyhawk)
> >   |+-00.2  Emulex Corporation OneConnect NIC (Skyhawk)
> >   |+-00.3  Emulex Corporation OneConnect NIC (Skyhawk)
> >   |+-00.4  Emulex Corporation OneConnect NIC (Skyhawk)
> >   |+-00.5  Emulex Corporation OneConnect NIC (Skyhawk)
> >   |+-00.6  Emulex Corporation OneConnect NIC (Skyhawk)
> >   |\-00.7  Emulex Corporation OneConnect NIC (Skyhawk)
> >   +-02.1-[0f]--
> >   +-02.2-[07]00.0  Hewlett-Packard Company Smart Array Gen9 
> > Controllers
> >
> > When shuting down PCIe port :00:02.2 or :00:02.0, the machine
> > will hang, depend on which device is reinitialized in kdump kernel.
> >
> > If force remove unused device then trigger kdump, the problem will never
> > happen:
> >
> > echo 1 > /sys/bus/pci/devices/\:00\:02.2/\:07\:00.0/remove
> > echo c > /proc/sysrq-trigger
> >
> > ... Kdump save vmcore through network, the NIC get reinitialized and
> > hpsa is untouched. Then reboot with no problem. (If hpsa is used
> > instead, shutdown the NIC in first kernel will help)
> >
> > The cause is that some devices are enabled by the first kernel, but it
> > don't have the chance to shutdown the device, and kdump kernel is not
> > aware of it, unless it reinitialize the device.
> >
> > Upon reboot, kdump kernel will skip downstream device shutdown and
> > clears its bridge's master bit directly. The downstream device could
> > error out as it can still send requests but upstream refuses it.
> >
> > So for kdump, let kernel read the correct hardware power state on boot,
> > and always clear the bus master bit of PCI device upon shutdown if the
> > device is on. PCIe port driver will always shutdown all downstream
> > devices first, so this should ensure all downstream devices have bus
> > master bit off before clearing the bridge's bus master bit.
> >
> > Signed-off-by: Kairui Song 
> > ---
> >  drivers/pci/pci-driver.c | 11 ---
> >  drivers/pci/quirks.c | 20 
> >  2 files changed, 28 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
> > index 0454ca0e4e3f..84a7fd643b4d 100644
> > --- a/drivers/pci/pci-driver.c
> > +++ b/drivers/pci/pci-driver.c
> > @@ -18,6 +18,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include "pci.h"
> >  #include "pcie/portdrv.h"
> >
> > @@ -488,10 +489,14 @@ static void pci_device_shutdown(struct device *dev)
> >* If this is a kexec reboot, turn off Bus Master bit on the
> >* device to tell it to not continue to do DMA. Don't touch
> >* devices in D3cold or unknown states.
> > -  * If it is not a kexec reboot, firmware will hit the PCI
> > -  * devices with big hammer and stop their DMA any 

Re: [PATCH] makedumpfile: Remove duplicated variable declarations

2020-01-29 Thread Kairui Song
On Thu, Jan 30, 2020 at 12:28 AM HAGIO KAZUHITO(萩尾 一仁)
 wrote:
>
> Hi Kairui,
>
> Thank you for the patch.
>
> > -Original Message-
> > When building on Fedora 32, following error is observed:
> >
> > /usr/bin/ld:
> > erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2010:
> > multiple definition of `crash_reserved_mem_nr';
> > elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2010:
> >  first
> > defined here
> > /usr/bin/ld:
> > erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2009:
> > multiple definition of `crash_reserved_mem';
> > elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2009:
> >  first
> > defined here
> > /usr/bin/ld:
> > erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1278:
> > multiple definition of `parallel_info_t';
> > elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1278:
> >  first
> > defined here
> > /usr/bin/ld:
> > erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1265:
> > multiple definition of `splitting_info_t';
> > elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1265:
> >  first
> > defined here
> >
> > And apparently, these variables are wrongly declared multiple times. So
> > remove duplicated declaration.
> >
> > Signed-off-by: Kairui Song 
> > ---
> >  makedumpfile.c |  2 ++
> >  makedumpfile.h | 10 ++
> >  2 files changed, 8 insertions(+), 4 deletions(-)
> >
> > diff --git a/makedumpfile.c b/makedumpfile.c
> > index e290fbd..9aad77b 100644
> > --- a/makedumpfile.c
> > +++ b/makedumpfile.c
> > @@ -34,6 +34,8 @@ struct array_table  array_table;
> >  struct number_table  number_table;
> >  struct srcfile_table srcfile_table;
> >  struct save_control  sc;
> > +struct parallel_info parallel_info_t;
> > +struct splitting_infosplitting_info_t;
> >
> >  struct vm_table  vt = { 0 };
> >  struct DumpInfo  *info = NULL;
> > diff --git a/makedumpfile.h b/makedumpfile.h
> > index 68d9691..614764c 100644
> > --- a/makedumpfile.h
> > +++ b/makedumpfile.h
> > @@ -1262,7 +1262,8 @@ struct splitting_info {
> >   mdf_pfn_t   end_pfn;
> >   off_t   offset_eraseinfo;
> >   unsigned long   size_eraseinfo;
> > -} splitting_info_t;
> > +};
> > +extern struct splitting_info splitting_info_t;
>
> Interestingly, it seems that the splitting_info_t and parallel_info_t should
> have been typedef'd because of their names ending with _t and not being used
> as variable.  (We use info->splitting_info and info->parallel_info.)
>
> So, is the following patch OK? then I can modify your patch.
>

Hi,

Thanks for the review, and yes, it's definitely OK to change the patch
in this way. I only took a brief look at the code and modified it in
a way that changes no behavior. After a second look, indeed
they are never used as variables, only as operands of sizeof().

So actually can we just get rid of them, and use sizeof(struct
parallel_info) and sizeof(struct splitting_info) instead? It may be
even simpler.

I'm OK with either way.

> --- a/makedumpfile.h
> +++ b/makedumpfile.h
> @@ -1255,7 +1255,7 @@ struct makedumpfile_data_header {
> int64_t buf_size;
>  };
>
> -struct splitting_info {
> +typedef struct splitting_info {
> char*name_dumpfile;
> int fd_bitmap;
> mdf_pfn_t   start_pfn;
> @@ -1264,7 +1264,7 @@ struct splitting_info {
> unsigned long   size_eraseinfo;
>  } splitting_info_t;
>
> -struct parallel_info {
> +typedef struct parallel_info {
> int fd_memory;
> int fd_bitmap_memory;
> int fd_bitmap;
> @@ -2006,8 +2006,8 @@ struct memory_range {
>  };
>
>  #define CRASH_RESERVED_MEM_NR   8
> -struct memory_range crash_reserved_mem[CRASH_RESERVED_MEM_NR];
> -int crash_reserved_mem_nr;
> +extern struct memory_range crash_reserved_mem[CRASH_RESERVED_MEM_NR];
> +extern int crash_reserved_mem_nr;
>
>  unsigned long read_vmcoreinfo_symbol(char *str_symbol);
>  int readmem(int type_addr, unsigned long long addr, void *bufptr, size_t 
> size);
>
>
> Thanks,
> Kazu
>
> >

[PATCH] kexec-tools: Remove duplicated variable declarations

2020-01-28 Thread Kairui Song
When building kexec-tools for Fedora 32, following error is observed:

/usr/bin/ld: kexec/arch/x86_64/kexec-bzImage64.o:(.bss+0x0): multiple 
definition of `bzImage_support_efi_boot';
kexec/arch/i386/kexec-bzImage.o:(.bss+0x0): first defined here

/builddir/build/BUILD/kexec-tools-2.0.20/kexec/arch/arm/../../fs2dt.h:33: 
multiple definition of `my_debug';
kexec/fs2dt.o:/builddir/build/BUILD/kexec-tools-2.0.20/kexec/fs2dt.h:33: first 
defined here

/builddir/build/BUILD/kexec-tools-2.0.20/kexec/arch/arm64/kexec-arm64.h:68: 
multiple definition of `arm64_mem';
kexec/fs2dt.o:/builddir/build/BUILD/kexec-tools-2.0.20/././kexec/arch/arm64/kexec-arm64.h:68:
 first defined here

/builddir/build/BUILD/kexec-tools-2.0.20/kexec/arch/arm64/kexec-arm64.h:54: 
multiple definition of `initrd_size';
kexec/fs2dt.o:/builddir/build/BUILD/kexec-tools-2.0.20/././kexec/arch/arm64/kexec-arm64.h:54:
 first defined here

/builddir/build/BUILD/kexec-tools-2.0.20/kexec/arch/arm64/kexec-arm64.h:53: 
multiple definition of `initrd_base';
kexec/fs2dt.o:/builddir/build/BUILD/kexec-tools-2.0.20/././kexec/arch/arm64/kexec-arm64.h:53:
 first defined here

And apparently, these variables are wrongly declared multiple times. So
remove duplicated declaration.

Signed-off-by: Kairui Song 
---
 kexec/arch/arm64/kexec-arm64.h  | 6 +++---
 kexec/arch/ppc64/kexec-elf-ppc64.c  | 2 --
 kexec/arch/x86_64/kexec-bzImage64.c | 1 -
 kexec/fs2dt.h   | 2 +-
 4 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/kexec/arch/arm64/kexec-arm64.h b/kexec/arch/arm64/kexec-arm64.h
index 628de79..ed447ac 100644
--- a/kexec/arch/arm64/kexec-arm64.h
+++ b/kexec/arch/arm64/kexec-arm64.h
@@ -50,8 +50,8 @@ int zImage_arm64_load(int argc, char **argv, const char 
*kernel_buf,
 void zImage_arm64_usage(void);
 
 
-off_t initrd_base;
-off_t initrd_size;
+extern off_t initrd_base;
+extern off_t initrd_size;
 
 /**
  * struct arm64_mem - Memory layout info.
@@ -65,7 +65,7 @@ struct arm64_mem {
 };
 
 #define arm64_mem_ngv UINT64_MAX
-struct arm64_mem arm64_mem;
+extern struct arm64_mem arm64_mem;
 
 uint64_t get_phys_offset(void);
 uint64_t get_vp_offset(void);
diff --git a/kexec/arch/ppc64/kexec-elf-ppc64.c 
b/kexec/arch/ppc64/kexec-elf-ppc64.c
index 3510b70..695b8b0 100644
--- a/kexec/arch/ppc64/kexec-elf-ppc64.c
+++ b/kexec/arch/ppc64/kexec-elf-ppc64.c
@@ -44,8 +44,6 @@
 uint64_t initrd_base, initrd_size;
 unsigned char reuse_initrd = 0;
 const char *ramdisk;
-/* Used for enabling printing message from purgatory code */
-int my_debug = 0;
 
 int elf_ppc64_probe(const char *buf, off_t len)
 {
diff --git a/kexec/arch/x86_64/kexec-bzImage64.c 
b/kexec/arch/x86_64/kexec-bzImage64.c
index 8edb3e4..ba8dc48 100644
--- a/kexec/arch/x86_64/kexec-bzImage64.c
+++ b/kexec/arch/x86_64/kexec-bzImage64.c
@@ -42,7 +42,6 @@
 #include 
 
 static const int probe_debug = 0;
-int bzImage_support_efi_boot;
 
 int bzImage64_probe(const char *buf, off_t len)
 {
diff --git a/kexec/fs2dt.h b/kexec/fs2dt.h
index 7633273..fe24931 100644
--- a/kexec/fs2dt.h
+++ b/kexec/fs2dt.h
@@ -30,7 +30,7 @@ extern struct bootblock bb[1];
 
 /* Used for enabling printing message from purgatory code
  * Only has implemented for PPC64 */
-int my_debug;
+extern int my_debug;
 extern int dt_no_old_root;
 
 void reserve(unsigned long long where, unsigned long long length);
-- 
2.24.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH] makedumpfile: Remove duplicated variable declarations

2020-01-28 Thread Kairui Song
When building on Fedora 32, following error is observed:

/usr/bin/ld: 
erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2010:
multiple definition of `crash_reserved_mem_nr'; 
elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2010:
 first defined here
/usr/bin/ld: 
erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2009:
multiple definition of `crash_reserved_mem'; 
elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:2009:
 first defined here
/usr/bin/ld: 
erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1278:
multiple definition of `parallel_info_t'; 
elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1278:
 first defined here
/usr/bin/ld: 
erase_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1265:
multiple definition of `splitting_info_t'; 
elf_info.o:/builddir/build/BUILD/kexec-tools-2.0.20/makedumpfile-1.6.7/makedumpfile.h:1265:
 first defined here

And apparently, these variables are wrongly declared multiple times. So
remove duplicated declaration.

Signed-off-by: Kairui Song 
---
 makedumpfile.c |  2 ++
 makedumpfile.h | 10 ++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/makedumpfile.c b/makedumpfile.c
index e290fbd..9aad77b 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -34,6 +34,8 @@ struct array_tablearray_table;
 struct number_tablenumber_table;
 struct srcfile_table   srcfile_table;
 struct save_controlsc;
+struct parallel_info   parallel_info_t;
+struct splitting_info  splitting_info_t;
 
 struct vm_tablevt = { 0 };
 struct DumpInfo*info = NULL;
diff --git a/makedumpfile.h b/makedumpfile.h
index 68d9691..614764c 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -1262,7 +1262,8 @@ struct splitting_info {
mdf_pfn_t   end_pfn;
off_t   offset_eraseinfo;
unsigned long   size_eraseinfo;
-} splitting_info_t;
+};
+extern struct splitting_info splitting_info_t;
 
 struct parallel_info {
int fd_memory;
@@ -1275,7 +1276,8 @@ struct parallel_info {
 #ifdef USELZO
lzo_bytep   wrkmem;
 #endif
-} parallel_info_t;
+};
+extern struct parallel_info parallel_info_t;
 
 struct ppc64_vmemmap {
unsigned long   phys;
@@ -2006,8 +2008,8 @@ struct memory_range {
 };
 
 #define CRASH_RESERVED_MEM_NR   8
-struct memory_range crash_reserved_mem[CRASH_RESERVED_MEM_NR];
-int crash_reserved_mem_nr;
+extern struct memory_range crash_reserved_mem[CRASH_RESERVED_MEM_NR];
+extern int crash_reserved_mem_nr;
 
 unsigned long read_vmcoreinfo_symbol(char *str_symbol);
 int readmem(int type_addr, unsigned long long addr, void *bufptr, size_t size);
-- 
2.24.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel

2020-01-15 Thread Kairui Song
On Thu, Jan 16, 2020 at 1:31 AM Khalid Aziz  wrote:
>
> On 1/13/20 10:07 AM, Kairui Song wrote:
> > On Sun, Jan 12, 2020 at 2:33 AM Deepa Dinamani  
> > wrote:
> >>
> >>> Hi, there are some previous works about this issue, reset PCI devices
> >>> in kdump kernel to stop ongoing DMA:
> >>>
> >>> [v7,0/5] Reset PCIe devices to address DMA problem on kdump with iommu
> >>> https://lore.kernel.org/patchwork/cover/343767/
> >>>
> >>> [v2] PCI: Reset PCIe devices to stop ongoing DMA
> >>> https://lore.kernel.org/patchwork/patch/379191/
> >>>
> >>> And didn't get merged, that patch are trying to fix some DMAR error
> >>> problem, but resetting devices is a bit too destructive, and the
> >>> problem is later fixed in IOMMU side. And in most case the DMA seems
> >>> harmless, as they targets first kernel's memory and kdump kernel only
> >>> live in crash memory.
> >>
> >> I was going to ask the same. If the kdump kernel had IOMMU on, would
> >> that still be a problem?
> >
> > It will still fail, doing DMA is not a problem, it only go wrong when
> > a device's upstream bridge is mistakenly shutdown before the device
> > shutdown.
> >
> >>
> >>> Also, by the time kdump kernel is able to scan and reset devices,
> >>> there are already a very large time window where things could go
> >>> wrong.
> >>>
> >>> The currently problem observed only happens upon kdump kernel
> >>> shutdown, as the upper bridge is disabled before the device is
> >>> disabledm so DMA will raise error. It's more like a problem of wrong
> >>> device shutting down order.
> >>
> >> The way it was described earlier "During this time, the SUT sometimes
> >> gets a PCI error that raises an NMI." suggests that it isn't really
> >> restricted to kexec/kdump.
> >> Any attached device without an active driver might attempt spurious or
> >> malicious DMA and trigger the same during normal operation.
> >> Do you have available some more reporting of what happens during the
> >> PCIe error handling?
> >
> > Let me add more info about this:
> >
> > On the machine where I can reproduce this issue, the first kernel
> > always runs fine, and kdump kernel works fine during dumping the
> > vmcore, even if I keep the kdump kernel running for hours, nothing
> > goes wrong. If there are DMA during normal operation that will cause
> > problem, this should have exposed it.
> >
>
> This is the part that is puzzling me. Error shows up only when kdump
> kernel is being shut down. kdump kernel can run for hours without this
> issue. What is the operation from downstream device that is resulting in
> uncorrectable error - is it indeed a DMA request? Why does that
> operation from downstream device not happen until shutdown?
>
> I just want to make sure we fix the right problem in the right way.
>

Actually, the device can keep sending requests with no problem while the
kdump kernel is running, e.g. it keeps doing DMA, and all DMA targets the
first kernel's system memory, so kdump runs fine as long as nothing
touches the reserved crash memory. The error is reported by the port:
when it is shut down its bus master bit is cleared, and any downstream
request then causes an error.

I'm not sure what request it really is either; it could depend on the
device. On that machine, the error can be reproduced when either the NIC
or the HPSA is not reset in kdump, and according to the bug report, the
reporter used a different NIC and it was also reproducible.
The NIC is much less likely to cause the bridge error though (HPSA is
about 7/10 reproducible, NIC is about 3/10), so devices may send
different requests but fail in the same way (UR error reported by
the bridge).

I will try to do more debugging, but I'm not sure how I can intercept the
PCIe operations to get some info about what is actually causing the
issue. Do you have any suggestions?

-- 
Best Regards,
Kairui Song


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel

2020-01-14 Thread Kairui Song
On Wed, Jan 15, 2020 at 9:17 AM Deepa Dinamani  wrote:
>
> On Mon, Jan 13, 2020 at 9:07 AM Kairui Song  wrote:
> >
> > On Sun, Jan 12, 2020 at 2:33 AM Deepa Dinamani  
> > wrote:
> > >
> > > > Hi, there are some previous works about this issue, reset PCI devices
> > > > in kdump kernel to stop ongoing DMA:
> > > >
> > > > [v7,0/5] Reset PCIe devices to address DMA problem on kdump with iommu
> > > > https://lore.kernel.org/patchwork/cover/343767/
> > > >
> > > > [v2] PCI: Reset PCIe devices to stop ongoing DMA
> > > > https://lore.kernel.org/patchwork/patch/379191/
> > > >
> > > > And didn't get merged, that patch are trying to fix some DMAR error
> > > > problem, but resetting devices is a bit too destructive, and the
> > > > problem is later fixed in IOMMU side. And in most case the DMA seems
> > > > harmless, as they targets first kernel's memory and kdump kernel only
> > > > live in crash memory.
> > >
> > > I was going to ask the same. If the kdump kernel had IOMMU on, would
> > > that still be a problem?
> >
> > It will still fail, doing DMA is not a problem, it only go wrong when
> > a device's upstream bridge is mistakenly shutdown before the device
> > shutdown.
> >
> > >
> > > > Also, by the time kdump kernel is able to scan and reset devices,
> > > > there are already a very large time window where things could go
> > > > wrong.
> > > >
> > > > The currently problem observed only happens upon kdump kernel
> > > > shutdown, as the upper bridge is disabled before the device is
> > > > disabledm so DMA will raise error. It's more like a problem of wrong
> > > > device shutting down order.
> > >
> > > The way it was described earlier "During this time, the SUT sometimes
> > > gets a PCI error that raises an NMI." suggests that it isn't really
> > > restricted to kexec/kdump.
> > > Any attached device without an active driver might attempt spurious or
> > > malicious DMA and trigger the same during normal operation.
> > > Do you have available some more reporting of what happens during the
> > > PCIe error handling?
> >
> > Let me add more info about this:
> >
> > On the machine where I can reproduce this issue, the first kernel
> > always runs fine, and kdump kernel works fine during dumping the
> > vmcore, even if I keep the kdump kernel running for hours, nothing
> > goes wrong. If there are DMA during normal operation that will cause
> > problem, this should have exposed it.
> >
> > The problem only occur when kdump kernel try to reboot, no matter how
> > long the kdump kernel have been running (few minutes or hours). The
> > machine is dead after printing:
> > [  101.438300] reboot: Restarting system^M
> > [  101.455360] reboot: machine restart^M
> >
> > And I can find following logs happend just at that time, in the
> > "Integrated Management Log" from the iLO web interface:
> > 1254 OS 12/25/2019 09:08 12/25/2019 09:08 1 User Remotely Initiated NMI 
> > Switch
> > 1253 System Error 12/25/2019 09:08 12/25/2019 09:08 1 An Unrecoverable
> > System Error (NMI) has occurred (Service Information: 0x,
> > 0x)
> > 1252 PCI Bus 12/25/2019 09:07 12/25/2019 09:07 1 Uncorrectable PCI
> > Express Error (Embedded device, Bus 0, Device 2, Function 2, Error
> > status 0x0010)
> > 1251 System Error 12/25/2019 09:07 12/25/2019 09:07 1 Unrecoverable
> > System Error (NMI) has occurred.  System Firmware will log additional
> > details in a separate IML entry if possible
> > 1250 PCI Bus 12/25/2019 09:07 12/25/2019 09:07 1 PCI Bus Error (Slot
> > 0, Bus 0, Device 2, Function 2)
> >
> > And the topology is:
> > [:00]-+-00.0  Intel Corporation Xeon E7 v3/Xeon E5 v3/Core i7 DMI2
> >   +-01.0-[02]--
> >   +-01.1-[05]--
> >   +-02.0-[06]--+-00.0  Emulex Corporation OneConnect NIC (Skyhawk)
> >   |+-00.1  Emulex Corporation OneConnect NIC (Skyhawk)
> >   |+-00.2  Emulex Corporation OneConnect NIC (Skyhawk)
> >   |+-00.3  Emulex Corporation OneConnect NIC (Skyhawk)
> >   |+-00.4  Emulex Corporation OneConnect NIC (Skyhawk)
> >   |+-00.5  Emulex Corporation OneConnect NIC (Skyhawk)
> >   |+-00.6  Emulex Corporation OneConnect NIC (S

Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel

2020-01-13 Thread Kairui Song
On Sun, Jan 12, 2020 at 2:33 AM Deepa Dinamani  wrote:
>
> > Hi, there are some previous works about this issue, reset PCI devices
> > in kdump kernel to stop ongoing DMA:
> >
> > [v7,0/5] Reset PCIe devices to address DMA problem on kdump with iommu
> > https://lore.kernel.org/patchwork/cover/343767/
> >
> > [v2] PCI: Reset PCIe devices to stop ongoing DMA
> > https://lore.kernel.org/patchwork/patch/379191/
> >
> > And didn't get merged, that patch are trying to fix some DMAR error
> > problem, but resetting devices is a bit too destructive, and the
> > problem is later fixed in IOMMU side. And in most case the DMA seems
> > harmless, as they targets first kernel's memory and kdump kernel only
> > live in crash memory.
>
> I was going to ask the same. If the kdump kernel had IOMMU on, would
> that still be a problem?

It will still fail. Doing DMA is not the problem; it only goes wrong when
a device's upstream bridge is mistakenly shut down before the device
itself is shut down.

>
> > Also, by the time kdump kernel is able to scan and reset devices,
> > there are already a very large time window where things could go
> > wrong.
> >
> > The currently problem observed only happens upon kdump kernel
> > shutdown, as the upper bridge is disabled before the device is
> > disabledm so DMA will raise error. It's more like a problem of wrong
> > device shutting down order.
>
> The way it was described earlier "During this time, the SUT sometimes
> gets a PCI error that raises an NMI." suggests that it isn't really
> restricted to kexec/kdump.
> Any attached device without an active driver might attempt spurious or
> malicious DMA and trigger the same during normal operation.
> Do you have available some more reporting of what happens during the
> PCIe error handling?

Let me add more info about this:

On the machine where I can reproduce this issue, the first kernel
always runs fine, and the kdump kernel works fine while dumping the
vmcore; even if I keep the kdump kernel running for hours, nothing
goes wrong. If DMA during normal operation caused the problem,
this should have exposed it.

The problem only occurs when the kdump kernel tries to reboot, no matter
how long the kdump kernel has been running (a few minutes or hours). The
machine is dead after printing:
[  101.438300] reboot: Restarting system^M
[  101.455360] reboot: machine restart^M

And I can find the following logs, which happened just at that time, in
the "Integrated Management Log" from the iLO web interface:
1254 OS 12/25/2019 09:08 12/25/2019 09:08 1 User Remotely Initiated NMI Switch
1253 System Error 12/25/2019 09:08 12/25/2019 09:08 1 An Unrecoverable
System Error (NMI) has occurred (Service Information: 0x,
0x)
1252 PCI Bus 12/25/2019 09:07 12/25/2019 09:07 1 Uncorrectable PCI
Express Error (Embedded device, Bus 0, Device 2, Function 2, Error
status 0x0010)
1251 System Error 12/25/2019 09:07 12/25/2019 09:07 1 Unrecoverable
System Error (NMI) has occurred.  System Firmware will log additional
details in a separate IML entry if possible
1250 PCI Bus 12/25/2019 09:07 12/25/2019 09:07 1 PCI Bus Error (Slot
0, Bus 0, Device 2, Function 2)

And the topology is:
[:00]-+-00.0  Intel Corporation Xeon E7 v3/Xeon E5 v3/Core i7 DMI2
  +-01.0-[02]--
  +-01.1-[05]--
  +-02.0-[06]--+-00.0  Emulex Corporation OneConnect NIC (Skyhawk)
  |+-00.1  Emulex Corporation OneConnect NIC (Skyhawk)
  |+-00.2  Emulex Corporation OneConnect NIC (Skyhawk)
  |+-00.3  Emulex Corporation OneConnect NIC (Skyhawk)
  |+-00.4  Emulex Corporation OneConnect NIC (Skyhawk)
  |+-00.5  Emulex Corporation OneConnect NIC (Skyhawk)
  |+-00.6  Emulex Corporation OneConnect NIC (Skyhawk)
  |\-00.7  Emulex Corporation OneConnect NIC (Skyhawk)
  +-02.1-[0f]--
  +-02.2-[07]00.0  Hewlett-Packard Company Smart Array
Gen9 Controllers

It's a bridge reporting the error. It should be an unsupported request
error, because the downstream device is still alive and sending requests,
but the port has bus mastering off. If I manually shut down the "Smart
Array" (HPSA) device before the kdump reboot, it always reboots just
fine.

And as the patch description said, the HPSA is used in the first kernel,
but doesn't get reset in the kdump kernel because its driver is not loaded.
When shutting down a bridge, the kernel should shut down the downstream
devices first, and then shut down the bridge and clear its bus master bit.
But in the kdump case, the kernel skips some device shutdowns because
their drivers are not loaded, and the kernel doesn't know they are enabled.

This problem is not limited to the HPSA; the NIC listed in the above
topology may also make the bridge error out, if the HPSA driver gets
loaded in the kdump kernel and the NIC gets ignored.

>
> "The reaction to the NMI that the kdump kernel takes is problematic."
> Or the NMI should not have been triggered to begin with? Where do

Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel

2020-01-11 Thread Kairui Song
On Sat, Jan 11, 2020 at 11:46 AM Khalid Aziz  wrote:
>
> On 1/10/20 5:50 PM, Baoquan He wrote:
> > On 01/10/20 at 05:18pm, Khalid Aziz wrote:
> >> On 1/10/20 4:00 PM, Jerry Hoemann wrote:
> >>> On Fri, Jan 10, 2020 at 03:25:36PM -0700, Khalid Aziz and Shuah Khan 
> >>> wrote:
> >>>> On 1/10/20 2:42 PM, Bjorn Helgaas wrote:
> >>>>> [+cc Deepa (also working in this area)]
> >>>>>
> >>>>> On Thu, Dec 26, 2019 at 03:21:18AM +0800, Kairui Song wrote:
> >>>>>> There are reports about kdump hang upon reboot on some HPE machines,
> >>>>>> kernel hanged when trying to shutdown a PCIe port, an uncorrectable
> >>>>>> error occurred and crashed the system.
> >>>>>
> >>>>> Details?  Do you have URLs for bug reports, dmesg logs, etc?
> >>>>>
> >>>>>> On the machine I can reproduce this issue, part of the topology
> >>>>>> looks like this:
> >>>>>>
> >>>>>> [:00]-+-00.0  Intel Corporation Xeon E7 v3/Xeon E5 v3/Core i7 DMI2
> >>>>>>   +-01.0-[02]--
> >>>>>>   +-01.1-[05]--
> >>>>>>   +-02.0-[06]--+-00.0  Emulex Corporation OneConnect NIC 
> >>>>>> (Skyhawk)
> >>>>>>   |+-00.1  Emulex Corporation OneConnect NIC 
> >>>>>> (Skyhawk)
> >>>>>>   |+-00.2  Emulex Corporation OneConnect NIC 
> >>>>>> (Skyhawk)
> >>>>>>   |+-00.3  Emulex Corporation OneConnect NIC 
> >>>>>> (Skyhawk)
> >>>>>>   |+-00.4  Emulex Corporation OneConnect NIC 
> >>>>>> (Skyhawk)
> >>>>>>   |+-00.5  Emulex Corporation OneConnect NIC 
> >>>>>> (Skyhawk)
> >>>>>>   |+-00.6  Emulex Corporation OneConnect NIC 
> >>>>>> (Skyhawk)
> >>>>>>   |\-00.7  Emulex Corporation OneConnect NIC 
> >>>>>> (Skyhawk)
> >>>>>>   +-02.1-[0f]--
> >>>>>>   +-02.2-[07]00.0  Hewlett-Packard Company Smart Array 
> >>>>>> Gen9 Controllers
> >>>>>>
> >>>>>> When shutting down PCIe port :00:02.2 or :00:02.0, the machine
> >>>>>> will hang, depend on which device is reinitialized in kdump kernel.
> >>>>>>
> >>>>>> If force remove unused device then trigger kdump, the problem will 
> >>>>>> never
> >>>>>> happen:
> >>>>>>
> >>>>>> echo 1 > /sys/bus/pci/devices/\:00\:02.2/\:07\:00.0/remove
> >>>>>> echo c > /proc/sysrq-trigger
> >>>>>>
> >>>>>> ... Kdump save vmcore through network, the NIC get reinitialized 
> >>>>>> and
> >>>>>> hpsa is untouched. Then reboot with no problem. (If hpsa is used
> >>>>>> instead, shutdown the NIC in first kernel will help)
> >>>>>>
> >>>>>> The cause is that some devices are enabled by the first kernel, but it
> >>>>>> don't have the chance to shutdown the device, and kdump kernel is not
> >>>>>> aware of it, unless it reinitialize the device.
> >>>>>>
> >>>>>> Upon reboot, kdump kernel will skip downstream device shutdown and
> >>>>>> clears its bridge's master bit directly. The downstream device could
> >>>>>> error out as it can still send requests but upstream refuses it.
> >>>>>
> >>>>> Can you help me understand the sequence of events?  If I understand
> >>>>> correctly, the desired sequence is:
> >>>>>
> >>>>>   - user kernel boots
> >>>>>   - user kernel panics and kexecs to kdump kernel
> >>>>>   - kdump kernel writes vmcore to network or disk
> >>>>>   - kdump kernel reboots
> >>>>>   - user kernel boots
> >>>>>
> >>>>> But the problem is that as part of the kdump kernel reboot,
> >>>>>
> >>>>>   - kdump kernel disables bus mastering for a Root Por

Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel

2020-01-11 Thread Kairui Song
On Sat, Jan 11, 2020 at 8:45 AM Baoquan He  wrote:
> On 01/10/20 at 04:00pm, Jerry Hoemann wrote:
> > > I am not understanding this failure mode either. That code in
> > > pci_device_shutdown() was added originally to address this very issue.
> > > The patch 4fc9bbf98fd6 ("PCI: Disable Bus Master only on kexec reboot")
> > > shut down any errant DMAs from PCI devices as we kexec a new kernel. In
> > > this new patch, this is the same code path that will be taken again when
> > > kdump kernel is shutting down. If the errant DMA problem was not fixed
> > > by clearing Bus Master bit in this path when kdump kernel was being
> > > kexec'd, why does the same code path work the second time around when
> > > kdump kernel is shutting down? Is there more going on that we don't
> > > understand?
> > >
> >
> >   Khalid,
> >
> >   I don't believe we execute that code path in the crash case.
> >
> >   The variable kexec_in_progress is set true in kernel_kexec() before 
> > calling
> >   machine_kexec().  This is the fast reboot case.
> >
> >   I don't see kexec_in_progress set true elsewhere.
> >
> >
> >   The code path for crash is different.
> >
> >   For instance, panic() will call
> >   -> __crash_kexec()  which calls
> >   -> machine_kexec().
> >
> >  So the setting of kexec_in_progress is bypassed.
>
> Yeah, it's a different behaviour than kexec case. I talked to Kairui, the
> patch log may be not very clear. Below is summary I got from my
> understanding about this issue:
>
> ~~~
> Problem:
>
> When crash is triggered, system jumps into kdump kernel to collect
> vmcore and dump out. After dumping is finished, kdump kernel will try
> to reboot to normal kernel. This hang happened during kdump kernel
> rebooting, when dumping is network dumping, e.g ssh/nfs, local storage
> is HPSA.
>
> Root cause:
>
> When configuring network dumping, only network driver modules are added
> into kdump initramfs. However, the storage HPSA pcie device is enabled
> in 1st kernel, its status is PCI_D3hot. When crashed system jumps to kdump
> kernel, we didn't shutdown any device for safety and efficiency. Then
> during kdump kernel boot up, the pci scan will get hpsa device and only
> initialize its status as pci_dev->current_state = PCI_UNKNOWN. This
> pci_dev->current_state will be manipulated by the relevant device
> driver. So HPSA device will never have chance to calibrate its status,
> and can't be shut down by pci_device_shutdown() called by reboot
> service. It's still PCI_D3hot, then crash happened when system try to
> shutdown its upper bridge.
>
> Fix:
>
> Here, Kairui uses a quirk to get PM state and mask off value bigger than
> PCI_D3cold. Means, all devices will get PM state
> pci_dev->current_state = PCI_D0 or PCI_D3hot

Or to put it simple, I just synced the actual PM state into
pci_dev->current_state using a quirk, for kdump kernel only.

> Finally, during kdump
> reboot stage, this device can be shut down successfully by clearing its
> master bit.
>
> ~~~
>
> About this patch, I think the quirk getting active PM state for all devices
> may be risky, it will impact normal kernel too which doesn't have this issue.
>
> Wondering if there's any other way to fix or work around it.
>

Thank you for the detailed description!

-- 
Best Regards,
Kairui Song


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel

2020-01-11 Thread Kairui Song
On Sat, Jan 11, 2020 at 5:42 AM Bjorn Helgaas  wrote:
>
> Can you help me understand the sequence of events?  If I understand
> correctly, the desired sequence is:
>
>   - user kernel boots
>   - user kernel panics and kexecs to kdump kernel

One important thing needs to be mentioned here: user kernel kexecs into
kdump kernel using the fast path, which does very few things, and
leave all the PCI devices untouched. If they are on, or doing DMA,
will just keep doing that, nothing will stop them.

In most cases the on going DMA seems harmless though, as kdump kernel
only live in reserved crash memory.

>   - kdump kernel writes vmcore to network or disk
>   - kdump kernel reboots
>   - user kernel boots
>
> But the problem is that as part of the kdump kernel reboot,
>
>   - kdump kernel disables bus mastering for a Root Port
>   - device below the Root Port attempts DMA
>   - Root Port receives DMA transaction, handles it as Unsupported
> Request, sends UR Completion to device
>   - device signals uncorrectable error
>   - uncorrectable error causes a crash (Or a hang?  You mention both
> and I'm not sure which it is)
>
> Is that right so far?

Yes everything else all correct. On the machine I can reproduce it,
system just hanged, even serial console is dead with no output.

>
> > So for kdump, let kernel read the correct hardware power state on boot,
> > and always clear the bus master bit of PCI device upon shutdown if the
> > device is on. PCIe port driver will always shutdown all downstream
> > devices first, so this should ensure all downstream devices have bus
> > master bit off before clearing the bridge's bus master bit.
> >
> > Signed-off-by: Kairui Song 
> > ---
> >  drivers/pci/pci-driver.c | 11 ---
> >  drivers/pci/quirks.c | 20 
> >  2 files changed, 28 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
> > index 0454ca0e4e3f..84a7fd643b4d 100644
> > --- a/drivers/pci/pci-driver.c
> > +++ b/drivers/pci/pci-driver.c
> > @@ -18,6 +18,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include "pci.h"
> >  #include "pcie/portdrv.h"
> >
> > @@ -488,10 +489,14 @@ static void pci_device_shutdown(struct device *dev)
> >* If this is a kexec reboot, turn off Bus Master bit on the
> >* device to tell it to not continue to do DMA. Don't touch
> >* devices in D3cold or unknown states.
> > -  * If it is not a kexec reboot, firmware will hit the PCI
> > -  * devices with big hammer and stop their DMA any way.
> > +  * If this is kdump kernel, also turn off Bus Master, the device
> > +  * could be activated by previous crashed kernel and may block
> > +  * it's upstream from shutting down.
> > +  * Else, firmware will hit the PCI devices with big hammer
> > +  * and stop their DMA any way.
> >*/
> > - if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot))
> > + if ((kexec_in_progress || is_kdump_kernel()) &&
> > + pci_dev->current_state <= PCI_D3hot)
> >   pci_clear_master(pci_dev);
>
> I'm clearly missing something because this will turn off bus mastering
> in cases where we previously left it enabled.
>
> I was assuming the crash was related to a device doing DMA when the
> Root Port had bus mastering disabled.  But that must be wrong.

That is just what is happening. When kdump kernel try to reboot, it
only cleared bus mastering bit of the Root Port, ignoring enabled
device under it, because it's not the kdump kernel that enabled the
device, it's the first kernel enabled it, and kdump kernel don't know
it.

>
> I'd like to understand the crash/hang better because the quirk
> especially is hard to connect to anything.  If the crash is because of
> an AER or other PCIe error, maybe another possibility is that we could
> handle it better or disable signaling of it or something.
>

Maybe if we can solve the problem by properly shutdown the devices in
right order, then better don't disable any error handling features? Or
kernel might miss some real hardware issue.

--
Best Regards,
Kairui Song


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel

2020-01-03 Thread Kairui Song
On Thu, Dec 26, 2019 at 3:21 AM Kairui Song  wrote:
>
> There are reports about kdump hang upon reboot on some HPE machines,
> kernel hanged when trying to shutdown a PCIe port, an uncorrectable
> error occurred and crashed the system.
>
> On the machine I can reproduce this issue, part of the topology
> looks like this:
>
> [:00]-+-00.0  Intel Corporation Xeon E7 v3/Xeon E5 v3/Core i7 DMI2
>   +-01.0-[02]--
>   +-01.1-[05]--
>   +-02.0-[06]--+-00.0  Emulex Corporation OneConnect NIC (Skyhawk)
>   |+-00.1  Emulex Corporation OneConnect NIC (Skyhawk)
>   |+-00.2  Emulex Corporation OneConnect NIC (Skyhawk)
>   |+-00.3  Emulex Corporation OneConnect NIC (Skyhawk)
>   |+-00.4  Emulex Corporation OneConnect NIC (Skyhawk)
>   |+-00.5  Emulex Corporation OneConnect NIC (Skyhawk)
>   |+-00.6  Emulex Corporation OneConnect NIC (Skyhawk)
>   |\-00.7  Emulex Corporation OneConnect NIC (Skyhawk)
>   +-02.1-[0f]--
>   +-02.2-[07]00.0  Hewlett-Packard Company Smart Array Gen9 
> Controllers
>
> When shutting down PCIe port :00:02.2 or :00:02.0, the machine
> will hang, depend on which device is reinitialized in kdump kernel.
>
> If force remove unused device then trigger kdump, the problem will never
> happen:
>
> echo 1 > /sys/bus/pci/devices/\:00\:02.2/\:07\:00.0/remove
> echo c > /proc/sysrq-trigger
>
> ... Kdump save vmcore through network, the NIC get reinitialized and
> hpsa is untouched. Then reboot with no problem. (If hpsa is used
> instead, shutdown the NIC in first kernel will help)
>
> The cause is that some devices are enabled by the first kernel, but it
> don't have the chance to shutdown the device, and kdump kernel is not
> aware of it, unless it reinitialize the device.
>
> Upon reboot, kdump kernel will skip downstream device shutdown and
> clears its bridge's master bit directly. The downstream device could
> error out as it can still send requests but upstream refuses it.
>
> So for kdump, let kernel read the correct hardware power state on boot,
> and always clear the bus master bit of PCI device upon shutdown if the
> device is on. PCIe port driver will always shutdown all downstream
> devices first, so this should ensure all downstream devices have bus
> master bit off before clearing the bridge's bus master bit.
>
> Signed-off-by: Kairui Song 
> ---
>  drivers/pci/pci-driver.c | 11 ---
>  drivers/pci/quirks.c | 20 
>  2 files changed, 28 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
> index 0454ca0e4e3f..84a7fd643b4d 100644
> --- a/drivers/pci/pci-driver.c
> +++ b/drivers/pci/pci-driver.c
> @@ -18,6 +18,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "pci.h"
>  #include "pcie/portdrv.h"
>
> @@ -488,10 +489,14 @@ static void pci_device_shutdown(struct device *dev)
>  * If this is a kexec reboot, turn off Bus Master bit on the
>  * device to tell it to not continue to do DMA. Don't touch
>  * devices in D3cold or unknown states.
> -* If it is not a kexec reboot, firmware will hit the PCI
> -* devices with big hammer and stop their DMA any way.
> +* If this is kdump kernel, also turn off Bus Master, the device
> +* could be activated by previous crashed kernel and may block
> +* it's upstream from shutting down.
> +* Else, firmware will hit the PCI devices with big hammer
> +* and stop their DMA any way.
>  */
> -   if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot))
> +   if ((kexec_in_progress || is_kdump_kernel()) &&
> +   pci_dev->current_state <= PCI_D3hot)
> pci_clear_master(pci_dev);
>  }
>
> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
> index 4937a088d7d8..c65d11ab3939 100644
> --- a/drivers/pci/quirks.c
> +++ b/drivers/pci/quirks.c
> @@ -28,6 +28,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include/* isa_dma_bridge_buggy */
>  #include "pci.h"
>
> @@ -192,6 +193,25 @@ static int __init pci_apply_final_quirks(void)
>  }
>  fs_initcall_sync(pci_apply_final_quirks);
>
> +/*
> + * Read the device state even if it's not enabled. The device could be
> + * activated by previous crashed kernel, this will read and correct the
> + * cached state.
> + */
> +static void quirk_r

[RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel

2019-12-25 Thread Kairui Song
There are reports about kdump hang upon reboot on some HPE machines,
kernel hanged when trying to shutdown a PCIe port, an uncorrectable
error occurred and crashed the system.

On the machine I can reproduce this issue, part of the topology
looks like this:

[:00]-+-00.0  Intel Corporation Xeon E7 v3/Xeon E5 v3/Core i7 DMI2
  +-01.0-[02]--
  +-01.1-[05]--
  +-02.0-[06]--+-00.0  Emulex Corporation OneConnect NIC (Skyhawk)
  |+-00.1  Emulex Corporation OneConnect NIC (Skyhawk)
  |+-00.2  Emulex Corporation OneConnect NIC (Skyhawk)
  |+-00.3  Emulex Corporation OneConnect NIC (Skyhawk)
  |+-00.4  Emulex Corporation OneConnect NIC (Skyhawk)
  |+-00.5  Emulex Corporation OneConnect NIC (Skyhawk)
  |+-00.6  Emulex Corporation OneConnect NIC (Skyhawk)
  |\-00.7  Emulex Corporation OneConnect NIC (Skyhawk)
  +-02.1-[0f]--
  +-02.2-[07]00.0  Hewlett-Packard Company Smart Array Gen9 
Controllers

When shutting down PCIe port :00:02.2 or :00:02.0, the machine
will hang, depend on which device is reinitialized in kdump kernel.

If force remove unused device then trigger kdump, the problem will never
happen:

echo 1 > /sys/bus/pci/devices/\:00\:02.2/\:07\:00.0/remove
echo c > /proc/sysrq-trigger

... Kdump save vmcore through network, the NIC get reinitialized and
hpsa is untouched. Then reboot with no problem. (If hpsa is used
instead, shutdown the NIC in first kernel will help)

The cause is that some devices are enabled by the first kernel, but it
don't have the chance to shutdown the device, and kdump kernel is not
aware of it, unless it reinitialize the device.

Upon reboot, kdump kernel will skip downstream device shutdown and
clears its bridge's master bit directly. The downstream device could
error out as it can still send requests but upstream refuses it.

So for kdump, let kernel read the correct hardware power state on boot,
and always clear the bus master bit of PCI device upon shutdown if the
device is on. PCIe port driver will always shutdown all downstream
devices first, so this should ensure all downstream devices have bus
master bit off before clearing the bridge's bus master bit.

Signed-off-by: Kairui Song 
---
 drivers/pci/pci-driver.c | 11 ---
 drivers/pci/quirks.c | 20 
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 0454ca0e4e3f..84a7fd643b4d 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "pci.h"
 #include "pcie/portdrv.h"
 
@@ -488,10 +489,14 @@ static void pci_device_shutdown(struct device *dev)
 * If this is a kexec reboot, turn off Bus Master bit on the
 * device to tell it to not continue to do DMA. Don't touch
 * devices in D3cold or unknown states.
-* If it is not a kexec reboot, firmware will hit the PCI
-* devices with big hammer and stop their DMA any way.
+* If this is kdump kernel, also turn off Bus Master, the device
+* could be activated by previous crashed kernel and may block
+* it's upstream from shutting down.
+* Else, firmware will hit the PCI devices with big hammer
+* and stop their DMA any way.
 */
-   if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot))
+   if ((kexec_in_progress || is_kdump_kernel()) &&
+   pci_dev->current_state <= PCI_D3hot)
pci_clear_master(pci_dev);
 }
 
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 4937a088d7d8..c65d11ab3939 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -28,6 +28,7 @@
 #include 
 #include 
 #include 
+#include 
 #include/* isa_dma_bridge_buggy */
 #include "pci.h"
 
@@ -192,6 +193,25 @@ static int __init pci_apply_final_quirks(void)
 }
 fs_initcall_sync(pci_apply_final_quirks);
 
+/*
+ * Read the device state even if it's not enabled. The device could be
+ * activated by previous crashed kernel, this will read and correct the
+ * cached state.
+ */
+static void quirk_read_pm_state_in_kdump(struct pci_dev *dev)
+{
+   u16 pmcsr;
+
+   if (!is_kdump_kernel())
+   return;
+
+   if (dev->pm_cap) {
+   pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
+   dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
+   }
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, quirk_read_pm_state_in_kdump);
+
 /*
  * Decoding should be disabled for a PCI device during BAR sizing to avoid
  * conflict. But doing so may cause problems on host bridge and perhaps other

Re: [PATCH v3 2/2] x86/kdump: Reserve extra memory when SME or SEV is active

2019-10-15 Thread Kairui Song
On Tue, Oct 15, 2019 at 10:18 AM Dave Young  wrote:
>
> On 10/14/19 at 07:05pm, Dave Young wrote:
> > On 10/12/19 at 05:24pm, Kairui Song wrote:
> > > On 9/27/19 1:42 PM, Dave Young wrote:
> > > > On 09/25/19 at 06:36pm, Kairui Song wrote:
> > > > > On Wed, Sep 11, 2019 at 1:56 PM Ingo Molnar  wrote:
> > > > > > * Kairui Song  wrote:
> > > > > >
> > > > > > > Since commit c7753208a94c ("x86, swiotlb: Add memory encryption 
> > > > > > > support"),
> > > > > > > SWIOTLB will be enabled even if there is less than 4G of memory 
> > > > > > > when SME
> > > > > > > is active, to support DMA of devices that not support address 
> > > > > > > with the
> > > > > > > encrypt bit.
> > > > > > >
> > > > > > > And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if 
> > > > > > > SME is
> > > > > > > active") make the kernel keep SWIOTLB enabled even if there is an 
> > > > > > > IOMMU.
> > > > > > >
> > > > > > > Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory
> > > > > > > encryption") will always force SWIOTLB to be enabled when SEV is 
> > > > > > > active
> > > > > > > in all cases.
> > > > > > >
> > > > > > > Now, when either SME or SEV is active, SWIOTLB will be force 
> > > > > > > enabled,
> > > > > > > and this is also true for kdump kernel. As a result kdump kernel 
> > > > > > > will
> > > > > > > run out of already scarce pre-reserved memory easily.
> > > > > > >
> > > > > > > So when SME/SEV is active, reserve extra memory for SWIOTLB to 
> > > > > > > ensure
> > > > > > > kdump kernel have enough memory, except when 
> > > > > > > "crashkernel=size[KMG],high"
> > > > > > > is specified or any offset is used. As for the high reservation 
> > > > > > > case, an
> > > > > > > extra low memory region will always be reserved and that is 
> > > > > > > enough for
> > > > > > > SWIOTLB. Else if the offset format is used, user should be fully 
> > > > > > > aware
> > > > > > > of any possible kdump kernel memory requirement and have to 
> > > > > > > organize the
> > > > > > > memory usage carefully.
> > > > > > >
> > > > > > > Signed-off-by: Kairui Song 
> > > > > > > ---
> > > > > > >   arch/x86/kernel/setup.c | 20 +---
> > > > > > >   1 file changed, 17 insertions(+), 3 deletions(-)
> > > > > > >
> > > > > > > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> > > > > > > index 71f20bb18cb0..ee6a2f1e2226 100644
> > > > > > > --- a/arch/x86/kernel/setup.c
> > > > > > > +++ b/arch/x86/kernel/setup.c
> > > > > > > @@ -530,7 +530,7 @@ static int __init 
> > > > > > > crashkernel_find_region(unsigned long long *crash_base,
> > > > > > >  unsigned long long 
> > > > > > > *crash_size,
> > > > > > >  bool high)
> > > > > > >   {
> > > > > > > - unsigned long long base, size;
> > > > > > > + unsigned long long base, size, mem_enc_req = 0;
> > > > > > >
> > > > > > >base = *crash_base;
> > > > > > >size = *crash_size;
> > > > > > > @@ -561,11 +561,25 @@ static int __init 
> > > > > > > crashkernel_find_region(unsigned long long *crash_base,
> > > > > > >if (high)
> > > > > > >goto high_reserve;
> > > > > > >
> > > > > > > + /*
> > > > > > > +  * When SME/SEV is active and not using high reserve,
> > > > > > > +  * it will always required an extra SWIOTLB region.
> > > > > > > +  */
> > > > > 

Re: [PATCH v3 2/2] x86/kdump: Reserve extra memory when SME or SEV is active

2019-10-12 Thread Kairui Song

On 9/27/19 1:42 PM, Dave Young wrote:

On 09/25/19 at 06:36pm, Kairui Song wrote:

On Wed, Sep 11, 2019 at 1:56 PM Ingo Molnar  wrote:

* Kairui Song  wrote:


Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"),
SWIOTLB will be enabled even if there is less than 4G of memory when SME
is active, to support DMA of devices that not support address with the
encrypt bit.

And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is
active") make the kernel keep SWIOTLB enabled even if there is an IOMMU.

Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory
encryption") will always force SWIOTLB to be enabled when SEV is active
in all cases.

Now, when either SME or SEV is active, SWIOTLB will be force enabled,
and this is also true for kdump kernel. As a result kdump kernel will
run out of already scarce pre-reserved memory easily.

So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure
kdump kernel have enough memory, except when "crashkernel=size[KMG],high"
is specified or any offset is used. As for the high reservation case, an
extra low memory region will always be reserved and that is enough for
SWIOTLB. Else if the offset format is used, user should be fully aware
of any possible kdump kernel memory requirement and have to organize the
memory usage carefully.

Signed-off-by: Kairui Song 
---
  arch/x86/kernel/setup.c | 20 +---
  1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 71f20bb18cb0..ee6a2f1e2226 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -530,7 +530,7 @@ static int __init crashkernel_find_region(unsigned long 
long *crash_base,
 unsigned long long *crash_size,
 bool high)
  {
- unsigned long long base, size;
+ unsigned long long base, size, mem_enc_req = 0;

   base = *crash_base;
   size = *crash_size;
@@ -561,11 +561,25 @@ static int __init crashkernel_find_region(unsigned long 
long *crash_base,
   if (high)
   goto high_reserve;

+ /*
+  * When SME/SEV is active and not using high reserve,
+  * it will always required an extra SWIOTLB region.
+  */
+ if (mem_encrypt_active())
+ mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M);
+
   base = memblock_find_in_range(CRASH_ALIGN,
-   CRASH_ADDR_LOW_MAX, size,
+   CRASH_ADDR_LOW_MAX,
+   size + mem_enc_req,
 CRASH_ALIGN);




Hi Ingo,

I re-read my previous reply, it's long and tedious, let me try to make
a more effective reply:


What sizes are we talking about here?


The size here is how much memory will be reserved for kdump kernel, to
ensure kdump kernel and userspace can run without OOM.



- What is the possible size range of swiotlb_size_or_default()


swiotlb_size_or_default() returns the swiotlb size, it's specified by
user using swiotlb=, or default size (64MB)



- What is the size of CRASH_ADDR_LOW_MAX (the old limit)?


It's 4G.



- Why do we replace one fixed limit with another fixed limit instead of
   accurately sizing the area, with each required feature adding its own
   requirement to the reservation size?


It's quite hard to "accurately sizing the area".

No way to tell the exact amount of memory kdump needs, we can only estimate.
Kdump kernel use different cmdline, drivers and components will have
special handling for kdump, and userspace is totally different.


Agreed with the above, but specific to the problem in this patch,
there should be other ways.

First thought about doing generic handling in swiotlb part, and do
something like kdump_memory_reserve(size) Ingo suggested,  but according
to you swiotlb init is late, so it can not increase the size, OTOH if
reserve another region for kdump in swiotlb will cause other issues.

So let's think about other improvement, for example to see if you can
call kdump_memory_reserve(size) in AMD SME init path, for example in
mem_encrypt_init(), is it before crashkernel reservation?

If doable it will be at least cleaner than the code in this patch.

Thanks
Dave



How about something simple as following code? The logic and new function is as 
simple as
possible, just always reserve extra low memory when SME/SEV is active, ignore 
the high/low
reservation case. It will waste some memory with SME and high reservation 
though.

Was hesitating a lot about this series, one thing I'm thinking is that what is 
the point
of "crashkernel=" argument, if the crashkernel value could be adjusted 
accordingly, the value
specified will seem more meaningless or confusing...

And currently there isn't anything like crashkernel=auto or anything similar 
to le

Re: [PATCH v3 2/2] x86/kdump: Reserve extra memory when SME or SEV is active

2019-09-27 Thread Kairui Song
On Fri, Sep 27, 2019 at 1:42 PM Dave Young  wrote:
>
> On 09/25/19 at 06:36pm, Kairui Song wrote:
> > On Wed, Sep 11, 2019 at 1:56 PM Ingo Molnar  wrote:
> > > * Kairui Song  wrote:
> > >
> > > > Since commit c7753208a94c ("x86, swiotlb: Add memory encryption 
> > > > support"),
> > > > SWIOTLB will be enabled even if there is less than 4G of memory when SME
> > > > is active, to support DMA of devices that not support address with the
> > > > encrypt bit.
> > > >
> > > > And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is
> > > > active") make the kernel keep SWIOTLB enabled even if there is an IOMMU.
> > > >
> > > > Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory
> > > > encryption") will always force SWIOTLB to be enabled when SEV is active
> > > > in all cases.
> > > >
> > > > Now, when either SME or SEV is active, SWIOTLB will be force enabled,
> > > > and this is also true for kdump kernel. As a result kdump kernel will
> > > > run out of already scarce pre-reserved memory easily.
> > > >
> > > > So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure
> > > > kdump kernel have enough memory, except when 
> > > > "crashkernel=size[KMG],high"
> > > > is specified or any offset is used. As for the high reservation case, an
> > > > extra low memory region will always be reserved and that is enough for
> > > > SWIOTLB. Else if the offset format is used, user should be fully aware
> > > > of any possible kdump kernel memory requirement and have to organize the
> > > > memory usage carefully.
> > > >
> > > > Signed-off-by: Kairui Song 
> > > > ---
> > > >  arch/x86/kernel/setup.c | 20 +---
> > > >  1 file changed, 17 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> > > > index 71f20bb18cb0..ee6a2f1e2226 100644
> > > > --- a/arch/x86/kernel/setup.c
> > > > +++ b/arch/x86/kernel/setup.c
> > > > @@ -530,7 +530,7 @@ static int __init crashkernel_find_region(unsigned 
> > > > long long *crash_base,
> > > > unsigned long long *crash_size,
> > > > bool high)
> > > >  {
> > > > - unsigned long long base, size;
> > > > + unsigned long long base, size, mem_enc_req = 0;
> > > >
> > > >   base = *crash_base;
> > > >   size = *crash_size;
> > > > @@ -561,11 +561,25 @@ static int __init 
> > > > crashkernel_find_region(unsigned long long *crash_base,
> > > >   if (high)
> > > >   goto high_reserve;
> > > >
> > > > + /*
> > > > +  * When SME/SEV is active and not using high reserve,
> > > > +  * it will always required an extra SWIOTLB region.
> > > > +  */
> > > > + if (mem_encrypt_active())
> > > > + mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M);
> > > > +
> > > >   base = memblock_find_in_range(CRASH_ALIGN,
> > > > -   CRASH_ADDR_LOW_MAX, size,
> > > > +   CRASH_ADDR_LOW_MAX,
> > > > +   size + mem_enc_req,
> > > > CRASH_ALIGN);
> > >
> >
> > Hi Ingo,
> >
> > I re-read my previous reply, it's long and tedious, let me try to make
> > a more effective reply:
> >
> > > What sizes are we talking about here?
> >
> > The size here is how much memory will be reserved for kdump kernel, to
> > ensure kdump kernel and userspace can run without OOM.
> >
> > >
> > > - What is the possible size range of swiotlb_size_or_default()
> >
> > swiotlb_size_or_default() returns the swiotlb size, it's specified by
> > user using swiotlb=, or default size (64MB)
> >
> > >
> > > - What is the size of CRASH_ADDR_LOW_MAX (the old limit)?
> >
> > It's 4G.
> >
> > >
> > > - Why do we replace one fixed limit with another fixed limit instead of
> > >   accurately sizing the area, with each required feature adding its own
> > >   require

Re: [PATCH v3 2/2] x86/kdump: Reserve extra memory when SME or SEV is active

2019-09-25 Thread Kairui Song
On Wed, Sep 11, 2019 at 1:56 PM Ingo Molnar  wrote:
> * Kairui Song  wrote:
>
> > Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"),
> > SWIOTLB will be enabled even if there is less than 4G of memory when SME
> > is active, to support DMA of devices that not support address with the
> > encrypt bit.
> >
> > And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is
> > active") make the kernel keep SWIOTLB enabled even if there is an IOMMU.
> >
> > Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory
> > encryption") will always force SWIOTLB to be enabled when SEV is active
> > in all cases.
> >
> > Now, when either SME or SEV is active, SWIOTLB will be force enabled,
> > and this is also true for kdump kernel. As a result kdump kernel will
> > run out of already scarce pre-reserved memory easily.
> >
> > So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure
> > kdump kernel have enough memory, except when "crashkernel=size[KMG],high"
> > is specified or any offset is used. As for the high reservation case, an
> > extra low memory region will always be reserved and that is enough for
> > SWIOTLB. Else if the offset format is used, user should be fully aware
> > of any possible kdump kernel memory requirement and have to organize the
> > memory usage carefully.
> >
> > Signed-off-by: Kairui Song 
> > ---
> >  arch/x86/kernel/setup.c | 20 +---
> >  1 file changed, 17 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> > index 71f20bb18cb0..ee6a2f1e2226 100644
> > --- a/arch/x86/kernel/setup.c
> > +++ b/arch/x86/kernel/setup.c
> > @@ -530,7 +530,7 @@ static int __init crashkernel_find_region(unsigned long 
> > long *crash_base,
> > unsigned long long *crash_size,
> > bool high)
> >  {
> > - unsigned long long base, size;
> > + unsigned long long base, size, mem_enc_req = 0;
> >
> >   base = *crash_base;
> >   size = *crash_size;
> > @@ -561,11 +561,25 @@ static int __init crashkernel_find_region(unsigned 
> > long long *crash_base,
> >   if (high)
> >   goto high_reserve;
> >
> > + /*
> > +  * When SME/SEV is active and not using high reserve,
> > +  * it will always required an extra SWIOTLB region.
> > +  */
> > + if (mem_encrypt_active())
> > + mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M);
> > +
> >   base = memblock_find_in_range(CRASH_ALIGN,
> > -   CRASH_ADDR_LOW_MAX, size,
> > +   CRASH_ADDR_LOW_MAX,
> > +   size + mem_enc_req,
> > CRASH_ALIGN);
>

Hi Ingo,

I re-read my previous reply, it's long and tedious, let me try to make
a more effective reply:

> What sizes are we talking about here?

The size here is how much memory will be reserved for kdump kernel, to
ensure kdump kernel and userspace can run without OOM.

>
> - What is the possible size range of swiotlb_size_or_default()

swiotlb_size_or_default() returns the swiotlb size, it's specified by
user using swiotlb=, or default size (64MB)

>
> - What is the size of CRASH_ADDR_LOW_MAX (the old limit)?

It's 4G.

>
> - Why do we replace one fixed limit with another fixed limit instead of
>   accurately sizing the area, with each required feature adding its own
>   requirement to the reservation size?

It's quite hard to "accurately size the area".

There is no way to tell the exact amount of memory kdump needs; we can only
estimate. The kdump kernel uses a different cmdline, drivers and components
have special handling for kdump, and the userspace is totally different.

>
> I.e. please engineer this into a proper solution instead of just
> modifying it around the edges.
>
> For example have you considered adding some sort of
> kdump_memory_reserve(size) facility, which increases the reservation size
> as something like SWIOTLB gets activated? That would avoid the ugly
> mem_encrypt_active() flag, it would just automagically work.

My first attempt was to increase crashkernel memory as swiotlb is
activated. There are problems.

First, SME/SEV is currently the only case where both kernels require
SWIOTLB; for most other cases, it's wasting memory.

If we don't care about the memory waste, it has to check/reserve/free
crashkernel memory at three different poin

Re: [PATCH v3 0/2] x86/kdump: Reserve extra memory when SME or SEV is active

2019-09-18 Thread Kairui Song
On Wed, Sep 18, 2019 at 3:55 PM Dave Young  wrote:
>
> On 09/12/19 at 12:23am, Kairui Song wrote:
> > On Wednesday, September 11, 2019, Ingo Molnar  wrote:
> > >
> > > * Kairui Song  wrote:
> > >
> > >> Since commit c7753208a94c ("x86, swiotlb: Add memory encryption
> > support"),
> > >> SWIOTLB will be enabled even if there is less than 4G of memory when SME
> > >> is active, to support DMA of devices that not support address with the
> > >> encrypt bit.
> > >>
> > >> And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is
> > >> active") make the kernel keep SWIOTLB enabled even if there is an IOMMU.
> > >>
> > >> Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory
> > >> encryption") will always force SWIOTLB to be enabled when SEV is active
> > >> in all cases.
> > >>
> > >> Now, when either SME or SEV is active, SWIOTLB will be force enabled,
> > >> and this is also true for kdump kernel. As a result kdump kernel will
> > >> run out of already scarce pre-reserved memory easily.
> > >>
> > >> So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure
> > >> kdump kernel have enough memory, except when "crashkernel=size[KMG],high"
> > >> is specified or any offset is used. As for the high reservation case, an
> > >> extra low memory region will always be reserved and that is enough for
> > >> SWIOTLB. Else if the offset format is used, user should be fully aware
> > >> of any possible kdump kernel memory requirement and have to organize the
> > >> memory usage carefully.
> > >>
> > >> Signed-off-by: Kairui Song 
> > >> ---
> > >>  arch/x86/kernel/setup.c | 20 +---
> > >>  1 file changed, 17 insertions(+), 3 deletions(-)
> > >>
> > >> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> > >> index 71f20bb18cb0..ee6a2f1e2226 100644
> > >> --- a/arch/x86/kernel/setup.c
> > >> +++ b/arch/x86/kernel/setup.c
> > >> @@ -530,7 +530,7 @@ static int __init crashkernel_find_region(unsigned
> > long long *crash_base,
> > >> unsigned long long *crash_size,
> > >> bool high)
> > >>  {
> > >> - unsigned long long base, size;
> > >> + unsigned long long base, size, mem_enc_req = 0;
> > >>
> > >>   base = *crash_base;
> > >>   size = *crash_size;
> > >> @@ -561,11 +561,25 @@ static int __init crashkernel_find_region(unsigned
> > long long *crash_base,
> > >>   if (high)
> > >>   goto high_reserve;
> > >>
> > >> + /*
> > >> +  * When SME/SEV is active and not using high reserve,
> > >> +  * it will always required an extra SWIOTLB region.
> > >> +  */
> > >> + if (mem_encrypt_active())
> > >> + mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M);
> > >> +
> > >>   base = memblock_find_in_range(CRASH_ALIGN,
> > >> -   CRASH_ADDR_LOW_MAX, size,
> > >> +   CRASH_ADDR_LOW_MAX,
> > >> +   size + mem_enc_req,
> > >> CRASH_ALIGN);
> > >
> > > What sizes are we talking about here?
> > >
> > > - What is the possible size range of swiotlb_size_or_default()
> > >
> > > - What is the size of CRASH_ADDR_LOW_MAX (the old limit)?
> > >
> > > - Why do we replace one fixed limit with another fixed limit instead of
> > >   accurately sizing the area, with each required feature adding its own
> > >   requirement to the reservation size?
> > >
> > > I.e. please engineer this into a proper solution instead of just
> > > modifying it around the edges.
> > >
> > > For example have you considered adding some sort of
> > > kdump_memory_reserve(size) facility, which increases the reservation size
> > > as something like SWIOTLB gets activated? That would avoid the ugly
> > > mem_encrypt_active() flag, it would just automagically work.
> >
> > Hi, thanks for the suggestions, actually I did try to workout a better
> > resolution, at least for SW

[PATCH v3 0/2] x86/kdump: Reserve extra memory when SME or SEV is active

2019-09-10 Thread Kairui Song
This series lets the kernel reserve extra memory for kdump when SME or SEV
is active.

When SME or SEV is active, SWIOTLB will always be force enabled, and
this is also true for kdump kernel. As a result kdump kernel will
run out of already scarce pre-reserved memory easily.

So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure
kdump kernel have enough memory, except when "crashkernel=size[KMG],high"
is specified or any offset is used. With high reservation an extra low
memory region will always be reserved and that is enough for SWIOTLB.
With offset format, user should be fully aware of any possible kdump
kernel memory requirement and have to organize the memory usage carefully.

Patch 1/2 simply splits some code out of reserve_crashkernel, in
preparation for the change in the next patch.

Patch 2/2 will let crashkernel reserve extra memory when SME or SEV is
active, and explains more details and history about why this change is
introduced.

Update from V2:
- Refactor and split some function out of reserve_crashkernel to make
  it cleaner, as suggested by Borislav Petkov
- Split into 2 patches

Update from V1:
- Use mem_encrypt_active() instead of "sme_active() || sev_active()"
- Don't reserve extra memory when ",high" or "@offset" is used, and
don't print redundant message.
- Fix coding style problem

Kairui Song (2):
  x86/kdump: Split some code out of reserve_crashkernel
  x86/kdump: Reserve extra memory when SME or SEV is active

 arch/x86/kernel/setup.c | 106 
 1 file changed, 74 insertions(+), 32 deletions(-)

-- 
2.21.0



[PATCH v3 2/2] x86/kdump: Reserve extra memory when SME or SEV is active

2019-09-10 Thread Kairui Song
Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"),
SWIOTLB will be enabled even if there is less than 4G of memory when SME
is active, to support DMA of devices that not support address with the
encrypt bit.

And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is
active") make the kernel keep SWIOTLB enabled even if there is an IOMMU.

Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory
encryption") will always force SWIOTLB to be enabled when SEV is active
in all cases.

Now, when either SME or SEV is active, SWIOTLB will be force enabled,
and this is also true for kdump kernel. As a result kdump kernel will
run out of already scarce pre-reserved memory easily.

So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure
kdump kernel have enough memory, except when "crashkernel=size[KMG],high"
is specified or any offset is used. As for the high reservation case, an
extra low memory region will always be reserved and that is enough for
SWIOTLB. Else if the offset format is used, user should be fully aware
of any possible kdump kernel memory requirement and have to organize the
memory usage carefully.

Signed-off-by: Kairui Song 
---
 arch/x86/kernel/setup.c | 20 +---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 71f20bb18cb0..ee6a2f1e2226 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -530,7 +530,7 @@ static int __init crashkernel_find_region(unsigned long 
long *crash_base,
  unsigned long long *crash_size,
  bool high)
 {
-   unsigned long long base, size;
+   unsigned long long base, size, mem_enc_req = 0;
 
base = *crash_base;
size = *crash_size;
@@ -561,11 +561,25 @@ static int __init crashkernel_find_region(unsigned long 
long *crash_base,
if (high)
goto high_reserve;
 
+   /*
+* When SME/SEV is active and not using high reserve,
+* it will always required an extra SWIOTLB region.
+*/
+   if (mem_encrypt_active())
+   mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M);
+
base = memblock_find_in_range(CRASH_ALIGN,
- CRASH_ADDR_LOW_MAX, size,
+ CRASH_ADDR_LOW_MAX,
+ size + mem_enc_req,
  CRASH_ALIGN);
-   if (base)
+   if (base) {
+   if (mem_enc_req) {
+   pr_info("Memory encryption is active, crashkernel needs 
%ldMB extra memory\n",
+   (unsigned long)(mem_enc_req >> 20));
+   size += mem_enc_req;
+   }
goto found;
+   }
 
 high_reserve:
/* Try high reserve */
-- 
2.21.0



[PATCH v3 1/2] x86/kdump: Split some code out of reserve_crashkernel

2019-09-10 Thread Kairui Song
Split out the code related to finding suitable region for kdump out of
reserve_crashkernel, clean up and refactor for further change, no feature
change.

Signed-off-by: Kairui Song 
---
 arch/x86/kernel/setup.c | 92 +++--
 1 file changed, 60 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bbe35bf879f5..71f20bb18cb0 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -526,6 +526,63 @@ static int __init reserve_crashkernel_low(void)
return 0;
 }
 
+static int __init crashkernel_find_region(unsigned long long *crash_base,
+ unsigned long long *crash_size,
+ bool high)
+{
+   unsigned long long base, size;
+
+   base = *crash_base;
+   size = *crash_size;
+
+   /*
+* base == 0 means: find the address automatically, else just
+* verify the region is useable
+*/
+   if (base) {
+   unsigned long long start;
+
+   start = memblock_find_in_range(base, base + size,
+  size, 1 << 20);
+   if (start != base) {
+   pr_info("crashkernel reservation failed - memory is in 
use.\n");
+   return -1;
+   }
+   return 0;
+   }
+
+   /*
+* crashkernel=x,high reserves memory over 4G, also allocates
+* 256M extra low memory for DMA buffers and swiotlb.
+* But the extra memory is not required for all machines.
+* So try low memory first and fall back to high memory
+* unless "crashkernel=size[KMG],high" is specified.
+*/
+   if (high)
+   goto high_reserve;
+
+   base = memblock_find_in_range(CRASH_ALIGN,
+ CRASH_ADDR_LOW_MAX, size,
+ CRASH_ALIGN);
+   if (base)
+   goto found;
+
+high_reserve:
+   /* Try high reserve */
+   base = memblock_find_in_range(CRASH_ALIGN,
+ CRASH_ADDR_HIGH_MAX, size,
+ CRASH_ALIGN);
+   if (base)
+   goto found;
+
+   pr_info("crashkernel reservation failed - No suitable area found.\n");
+   return -1;
+found:
+   *crash_base = base;
+   *crash_size = size;
+   return 0;
+}
+
 static void __init reserve_crashkernel(void)
 {
unsigned long long crash_size, crash_base, total_mem;
@@ -550,39 +607,10 @@ static void __init reserve_crashkernel(void)
return;
}
 
-   /* 0 means: find the address automatically */
-   if (!crash_base) {
-   /*
-* Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
-* crashkernel=x,high reserves memory over 4G, also allocates
-* 256M extra low memory for DMA buffers and swiotlb.
-* But the extra memory is not required for all machines.
-* So try low memory first and fall back to high memory
-* unless "crashkernel=size[KMG],high" is specified.
-*/
-   if (!high)
-   crash_base = memblock_find_in_range(CRASH_ALIGN,
-   CRASH_ADDR_LOW_MAX,
-   crash_size, CRASH_ALIGN);
-   if (!crash_base)
-   crash_base = memblock_find_in_range(CRASH_ALIGN,
-   CRASH_ADDR_HIGH_MAX,
-   crash_size, CRASH_ALIGN);
-   if (!crash_base) {
-   pr_info("crashkernel reservation failed - No suitable 
area found.\n");
-   return;
-   }
-   } else {
-   unsigned long long start;
+   ret = crashkernel_find_region(&crash_base, &crash_size, high);
+   if (ret)
+   return;
 
-   start = memblock_find_in_range(crash_base,
-  crash_base + crash_size,
-  crash_size, 1 << 20);
-   if (start != crash_base) {
-   pr_info("crashkernel reservation failed - memory is in 
use.\n");
-   return;
-   }
-   }
ret = memblock_reserve(crash_base, crash_size);
if (ret) {
pr_err("%s: Error reserving crashkernel memblock.\n", __func__);
-- 
2.21.0



Re: [PATCH v2] x86/kdump: Reserve extra memory when SME or SEV is active

2019-09-02 Thread Kairui Song

On 8/31/19 12:45 AM, Borislav Petkov wrote:

On Mon, Aug 26, 2019 at 12:45:35PM +0800, Kairui Song wrote:

Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"),
SWIOTLB will be enabled even if there is less than 4G of memory when SME
is active, to support DMA of devices that not support address with the
encrypt bit.

And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is
active") make the kernel keep SWIOTLB enabled even if there is an IOMMU.

Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory
encryption") will always force SWIOTLB to be enabled when SEV is active
in all cases.

Now, when either SME or SEV is active, SWIOTLB will be force enabled,
and this is also true for kdump kernel. As a result kdump kernel will
run out of already scarce pre-reserved memory easily.

So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure
kdump kernel have enough memory, except when "crashkernel=size[KMG],high"
is specified or any offset is used. As for the high reservation case, an
extra low memory region will always be reserved and that is enough for
SWIOTLB. Else if the offset format is used, user should be fully aware
of any possible kdump kernel memory requirement and have to organize the
memory usage carefully.

Signed-off-by: Kairui Song 

---
Update from V1:
- Use mem_encrypt_active() instead of "sme_active() || sev_active()"
- Don't reserve extra memory when ",high" or "@offset" is used, and
   don't print redundant message.
- Fix coding style problem

  arch/x86/kernel/setup.c | 31 ---
  1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bbe35bf879f5..221beb10c55d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -528,7 +528,7 @@ static int __init reserve_crashkernel_low(void)
  
  static void __init reserve_crashkernel(void)

  {
-   unsigned long long crash_size, crash_base, total_mem;
+   unsigned long long crash_size, crash_base, total_mem, mem_enc_req;
bool high = false;
int ret;
  
@@ -550,6 +550,15 @@ static void __init reserve_crashkernel(void)

return;
}
  
+	/*

+* When SME/SEV is active, it will always required an extra SWIOTLB
+* region.
+*/
+   if (mem_encrypt_active())
+   mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M);
+   else
+   mem_enc_req = 0;


Hmm, ugly.


I agree with this, but didn't have a better idea about how to improve it, so
thanks for the suggestions below.



You set mem_enc_req here ...


+
/* 0 means: find the address automatically */
if (!crash_base) {
/*
@@ -563,11 +572,19 @@ static void __init reserve_crashkernel(void)
if (!high)
crash_base = memblock_find_in_range(CRASH_ALIGN,
CRASH_ADDR_LOW_MAX,
-   crash_size, CRASH_ALIGN);
-   if (!crash_base)
+   crash_size + mem_enc_req,
+   CRASH_ALIGN);
+   /*
+* For high reservation, an extra low memory for SWIOTLB will
+* always be reserved later, so no need to reserve extra
+* memory for memory encryption case here.
+*/
+   if (!crash_base) {
+   mem_enc_req = 0;


... but you clear it here...


crash_base = memblock_find_in_range(CRASH_ALIGN,
CRASH_ADDR_HIGH_MAX,
crash_size, CRASH_ALIGN);
+   }
if (!crash_base) {
pr_info("crashkernel reservation failed - No suitable area 
found.\n");
return;
@@ -575,6 +592,7 @@ static void __init reserve_crashkernel(void)
} else {
unsigned long long start;
  
+		mem_enc_req = 0;


... and here...


start = memblock_find_in_range(crash_base,
   crash_base + crash_size,
   crash_size, 1 << 20);
@@ -583,6 +601,13 @@ static void __init reserve_crashkernel(void)
return;
}
}
+
+   if (mem_enc_req) {
+   pr_info("Memory encryption is active, crashkernel needs %ldMB extra 
memory\n",
+   (unsigned long)(mem_enc_req >> 20));
+   crash_size += mem_enc_req;
+   }


... and then you report only when it is still set.

How about you carve out that if (!crash_base) { ... } else { } piece
into a separate function without any fu

Re: [PATCH v2] x86/kdump: Reserve extra memory when SME or SEV is active

2019-08-26 Thread Kairui Song
On Mon, Aug 26, 2019 at 12:46 PM Kairui Song  wrote:
>
> Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"),
> SWIOTLB will be enabled even if there is less than 4G of memory when SME
> is active, to support DMA of devices that not support address with the
> encrypt bit.
>
> And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is
> active") make the kernel keep SWIOTLB enabled even if there is an IOMMU.
>
> Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory
> encryption") will always force SWIOTLB to be enabled when SEV is active
> in all cases.
>
> Now, when either SME or SEV is active, SWIOTLB will be force enabled,
> and this is also true for kdump kernel. As a result kdump kernel will
> run out of already scarce pre-reserved memory easily.
>
> So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure
> kdump kernel have enough memory, except when "crashkernel=size[KMG],high"
> is specified or any offset is used. As for the high reservation case, an
> extra low memory region will always be reserved and that is enough for
> SWIOTLB. Else if the offset format is used, user should be fully aware
> of any possible kdump kernel memory requirement and have to organize the
> memory usage carefully.
>
> Signed-off-by: Kairui Song 
>
> ---
> Update from V1:
> - Use mem_encrypt_active() instead of "sme_active() || sev_active()"
> - Don't reserve extra memory when ",high" or "@offset" is used, and
>   don't print redundant message.
> - Fix coding style problem
>
>  arch/x86/kernel/setup.c | 31 ---
>  1 file changed, 28 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index bbe35bf879f5..221beb10c55d 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -528,7 +528,7 @@ static int __init reserve_crashkernel_low(void)
>
>  static void __init reserve_crashkernel(void)
>  {
> -   unsigned long long crash_size, crash_base, total_mem;
> +   unsigned long long crash_size, crash_base, total_mem, mem_enc_req;
> bool high = false;
> int ret;
>
> @@ -550,6 +550,15 @@ static void __init reserve_crashkernel(void)
> return;
> }
>
> +   /*
> +* When SME/SEV is active, it will always required an extra SWIOTLB
> +* region.
> +*/
> +   if (mem_encrypt_active())
> +   mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M);
> +   else
> +   mem_enc_req = 0;
> +
> /* 0 means: find the address automatically */
> if (!crash_base) {
> /*
> @@ -563,11 +572,19 @@ static void __init reserve_crashkernel(void)
> if (!high)
> crash_base = memblock_find_in_range(CRASH_ALIGN,
> CRASH_ADDR_LOW_MAX,
> -   crash_size, CRASH_ALIGN);
> -   if (!crash_base)
> +   crash_size + mem_enc_req,
> +   CRASH_ALIGN);
> +   /*
> +* For high reservation, an extra low memory for SWIOTLB will
> +* always be reserved later, so no need to reserve extra
> +* memory for memory encryption case here.
> +*/
> +   if (!crash_base) {
> +   mem_enc_req = 0;
> crash_base = memblock_find_in_range(CRASH_ALIGN,
> CRASH_ADDR_HIGH_MAX,
> crash_size, CRASH_ALIGN);
> +   }
> if (!crash_base) {
> pr_info("crashkernel reservation failed - No suitable 
> area found.\n");
> return;
> @@ -575,6 +592,7 @@ static void __init reserve_crashkernel(void)
> } else {
> unsigned long long start;
>
> +   mem_enc_req = 0;
> start = memblock_find_in_range(crash_base,
>crash_base + crash_size,
>crash_size, 1 << 20);
> @@ -583,6 +601,13 @@ static void __init reserve_crashkernel(void)
> return;
> }
> }
> +
> +   if (mem_enc_req) {
> +   pr_info("Memory encryption is active, crashkernel needs %ldMB 
> extra memory\n",
> +   (unsigned long)(mem_enc_req >> 20));
> +   crash_size += mem_enc_req;
> +   }
> +
> ret = memblock_reserve(crash_base, crash_size);
> if (ret) {
> pr_err("%s: Error reserving crashkernel memblock.\n", 
> __func__);
> --
> 2.21.0
>

Hi Tom, any comment about V2?

--
Best Regards,
Kairui Song


[PATCH v2] x86/kdump: Reserve extra memory when SME or SEV is active

2019-08-25 Thread Kairui Song
Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"),
SWIOTLB will be enabled even if there is less than 4G of memory when SME
is active, to support DMA of devices that not support address with the
encrypt bit.

And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is
active") make the kernel keep SWIOTLB enabled even if there is an IOMMU.

Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory
encryption") will always force SWIOTLB to be enabled when SEV is active
in all cases.

Now, when either SME or SEV is active, SWIOTLB will be force enabled,
and this is also true for kdump kernel. As a result kdump kernel will
run out of already scarce pre-reserved memory easily.

So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure
kdump kernel have enough memory, except when "crashkernel=size[KMG],high"
is specified or any offset is used. As for the high reservation case, an
extra low memory region will always be reserved and that is enough for
SWIOTLB. Else if the offset format is used, user should be fully aware
of any possible kdump kernel memory requirement and have to organize the
memory usage carefully.

Signed-off-by: Kairui Song 

---
Update from V1:
- Use mem_encrypt_active() instead of "sme_active() || sev_active()"
- Don't reserve extra memory when ",high" or "@offset" is used, and
  don't print redundant message.
- Fix coding style problem

 arch/x86/kernel/setup.c | 31 ---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bbe35bf879f5..221beb10c55d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -528,7 +528,7 @@ static int __init reserve_crashkernel_low(void)
 
 static void __init reserve_crashkernel(void)
 {
-   unsigned long long crash_size, crash_base, total_mem;
+   unsigned long long crash_size, crash_base, total_mem, mem_enc_req;
bool high = false;
int ret;
 
@@ -550,6 +550,15 @@ static void __init reserve_crashkernel(void)
return;
}
 
+   /*
+* When SME/SEV is active, it will always required an extra SWIOTLB
+* region.
+*/
+   if (mem_encrypt_active())
+   mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M);
+   else
+   mem_enc_req = 0;
+
/* 0 means: find the address automatically */
if (!crash_base) {
/*
@@ -563,11 +572,19 @@ static void __init reserve_crashkernel(void)
if (!high)
crash_base = memblock_find_in_range(CRASH_ALIGN,
CRASH_ADDR_LOW_MAX,
-   crash_size, CRASH_ALIGN);
-   if (!crash_base)
+   crash_size + mem_enc_req,
+   CRASH_ALIGN);
+   /*
+* For high reservation, an extra low memory for SWIOTLB will
+* always be reserved later, so no need to reserve extra
+* memory for memory encryption case here.
+*/
+   if (!crash_base) {
+   mem_enc_req = 0;
crash_base = memblock_find_in_range(CRASH_ALIGN,
CRASH_ADDR_HIGH_MAX,
crash_size, CRASH_ALIGN);
+   }
if (!crash_base) {
pr_info("crashkernel reservation failed - No suitable 
area found.\n");
return;
@@ -575,6 +592,7 @@ static void __init reserve_crashkernel(void)
} else {
unsigned long long start;
 
+   mem_enc_req = 0;
start = memblock_find_in_range(crash_base,
   crash_base + crash_size,
   crash_size, 1 << 20);
@@ -583,6 +601,13 @@ static void __init reserve_crashkernel(void)
return;
}
}
+
+   if (mem_enc_req) {
+   pr_info("Memory encryption is active, crashkernel needs %ldMB 
extra memory\n",
+   (unsigned long)(mem_enc_req >> 20));
+   crash_size += mem_enc_req;
+   }
+
ret = memblock_reserve(crash_base, crash_size);
if (ret) {
pr_err("%s: Error reserving crashkernel memblock.\n", __func__);
-- 
2.21.0



Re: [PATCH] x86/kdump: Reserve extra memory when SME or SEV is active

2019-08-22 Thread Kairui Song
On Thu, Aug 22, 2019 at 10:35 PM Lendacky, Thomas
 wrote:
>
> On 8/21/19 9:53 PM, Kairui Song wrote:
> > Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"),
> > SWIOTLB will be enabled even if there is less than 4G of memory when SME
> > is active, to support DMA of devices that not support address with the
> > encrypt bit.
> >
> > And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is
> > active") make the kernel keep SWIOTLB enabled even if there is an IOMMU.
> >
> > Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory
> > encryption") will always force SWIOTLB to be enabled when SEV is active
> > in all cases.
> >
> > Now, when either SME or SEV is active, SWIOTLB will be force enabled,
> > and this is also true for kdump kernel. As a result kdump kernel will
> > run out of already scarce pre-reserved memory easily.
> >
> > So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure
> > kdump kernel have enough memory, except when "crashkernel=size[KMG],high"
> > is specified or any offset is used. As for the high reservation case, an
> > extra low memory region will always be reserved and that is enough for
> > SWIOTLB. Else if the offset format is used, user should be fully aware
> > of any possible kdump kernel memory requirement and have to organize the
> > memory usage carefully.
> >
> > Signed-off-by: Kairui Song 
> > ---
> >  arch/x86/kernel/setup.c | 26 +++---
> >  1 file changed, 23 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> > index bbe35bf879f5..ed91fa9d9f6e 100644
> > --- a/arch/x86/kernel/setup.c
> > +++ b/arch/x86/kernel/setup.c
> > @@ -528,7 +528,7 @@ static int __init reserve_crashkernel_low(void)
> >
> >  static void __init reserve_crashkernel(void)
> >  {
> > - unsigned long long crash_size, crash_base, total_mem;
> > + unsigned long long crash_size, crash_base, total_mem, mem_enc_req;
> >   bool high = false;
> >   int ret;
> >
> > @@ -550,6 +550,17 @@ static void __init reserve_crashkernel(void)
> >   return;
> >   }
> >
> > + /*
> > +  * When SME/SEV is active, it will always required an extra SWIOTLB
> > +  * region.
> > +  */
> > + if (sme_active() || sev_active()) {
>
> You can use mem_encrypt_active() here in place of the two checks.

That's a very good suggestion.

>
> > + mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M);
> > + pr_info("Memory encryption is active, crashkernel needs %ldMB 
> > extra memory\n",
> > + (unsigned long)(mem_enc_req >> 20));
>
> There is a point below where you zero out this value, so should this
> be issued later only if mem_enc_req is non-zero?

Yes that's true, but currently if zero out this value when ",high" is
used, then an extra low memory region will be reserved, so this
message will not be very confusing I think? as the required extra
memory is now in the low memory region. And for the "@offset" case
this could be a hint for users. And if the reserve failed due to
enlarged crashkernel size, the user may also be better aware of what
is causing the failure by this message.

>
> Also, looks like one too many tabs.
>
> > + } else
>
> Since you used braces on the if path, you need braces on the else path.

OK, will fix the code style issues.

>
> Thanks,
> Tom
>
> > + mem_enc_req = 0;
> > +
> >   /* 0 means: find the address automatically */
> >   if (!crash_base) {
> >   /*
> > @@ -563,11 +574,19 @@ static void __init reserve_crashkernel(void)
> >   if (!high)
> >   crash_base = memblock_find_in_range(CRASH_ALIGN,
> >   CRASH_ADDR_LOW_MAX,
> > - crash_size, CRASH_ALIGN);
> > - if (!crash_base)
> > + crash_size + mem_enc_req,
> > + CRASH_ALIGN);
> > + /*
> > +  * For high reservation, an extra low memory for SWIOTLB will
> > +  * always be reserved later, so no need to reserve extra
> > +  * memory for memory encryption case here.
> > +  */
> > + if (!crash_base) {
> > +   

[PATCH] x86/kdump: Reserve extra memory when SME or SEV is active

2019-08-21 Thread Kairui Song
Since commit c7753208a94c ("x86, swiotlb: Add memory encryption support"),
SWIOTLB will be enabled even if there is less than 4G of memory when SME
is active, to support DMA of devices that not support address with the
encrypt bit.

And commit aba2d9a6385a ("iommu/amd: Do not disable SWIOTLB if SME is
active") make the kernel keep SWIOTLB enabled even if there is an IOMMU.

Then commit d7b417fa08d1 ("x86/mm: Add DMA support for SEV memory
encryption") will always force SWIOTLB to be enabled when SEV is active
in all cases.

Now, when either SME or SEV is active, SWIOTLB will be force enabled,
and this is also true for kdump kernel. As a result kdump kernel will
run out of already scarce pre-reserved memory easily.

So when SME/SEV is active, reserve extra memory for SWIOTLB to ensure
kdump kernel have enough memory, except when "crashkernel=size[KMG],high"
is specified or any offset is used. As for the high reservation case, an
extra low memory region will always be reserved and that is enough for
SWIOTLB. Else if the offset format is used, user should be fully aware
of any possible kdump kernel memory requirement and have to organize the
memory usage carefully.

Signed-off-by: Kairui Song 
---
 arch/x86/kernel/setup.c | 26 +++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bbe35bf879f5..ed91fa9d9f6e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -528,7 +528,7 @@ static int __init reserve_crashkernel_low(void)
 
 static void __init reserve_crashkernel(void)
 {
-   unsigned long long crash_size, crash_base, total_mem;
+   unsigned long long crash_size, crash_base, total_mem, mem_enc_req;
bool high = false;
int ret;
 
@@ -550,6 +550,17 @@ static void __init reserve_crashkernel(void)
return;
}
 
+   /*
+* When SME/SEV is active, it will always required an extra SWIOTLB
+* region.
+*/
+   if (sme_active() || sev_active()) {
+   mem_enc_req = ALIGN(swiotlb_size_or_default(), SZ_1M);
+   pr_info("Memory encryption is active, crashkernel needs %ldMB 
extra memory\n",
+   (unsigned long)(mem_enc_req >> 20));
+   } else
+   mem_enc_req = 0;
+
/* 0 means: find the address automatically */
if (!crash_base) {
/*
@@ -563,11 +574,19 @@ static void __init reserve_crashkernel(void)
if (!high)
crash_base = memblock_find_in_range(CRASH_ALIGN,
CRASH_ADDR_LOW_MAX,
-   crash_size, CRASH_ALIGN);
-   if (!crash_base)
+   crash_size + mem_enc_req,
+   CRASH_ALIGN);
+   /*
+* For high reservation, an extra low memory for SWIOTLB will
+* always be reserved later, so no need to reserve extra
+* memory for memory encryption case here.
+*/
+   if (!crash_base) {
+   mem_enc_req = 0;
crash_base = memblock_find_in_range(CRASH_ALIGN,
CRASH_ADDR_HIGH_MAX,
crash_size, CRASH_ALIGN);
+   }
if (!crash_base) {
pr_info("crashkernel reservation failed - No suitable 
area found.\n");
return;
@@ -583,6 +602,7 @@ static void __init reserve_crashkernel(void)
return;
}
}
+   crash_size += mem_enc_req;
ret = memblock_reserve(crash_base, crash_size);
if (ret) {
pr_err("%s: Error reserving crashkernel memblock.\n", __func__);
-- 
2.21.0



[PATCH] x86: Fix broken multiboot2 buliding for i386

2019-07-31 Thread Kairui Song
When building for i386, an error occurred:

kexec/arch/i386/kexec-x86.c:39:22: error: 'multiboot2_x86_probe'
undeclared here (not in a function); did you mean 'multiboot_x86_probe'?
39 |  { "multiboot2-x86", multiboot2_x86_probe, multiboot2_x86_load,
   |  ^~~~
   |  multiboot_x86_probe

kexec/arch/i386/kexec-x86.c:39:44: error: 'multiboot2_x86_load'
undeclared here (not in a function); did you mean 'multiboot_x86_load'?
39 |  { "multiboot2-x86", multiboot2_x86_probe, multiboot2_x86_load,
   |^~~
   |multiboot_x86_load
kexec/arch/i386/kexec-x86.c:40:4: error: 'multiboot2_x86_usage'
 undeclared here (not in a function); did you mean 'multiboot_x86_usage'?
40 |multiboot2_x86_usage },
   |^~~~
   |multiboot_x86_usage

Fix this issue by moving the declarations to the right header; also tidy
up the Makefile.

Fixes: 22a2ed55132e ("x86: Support multiboot2 images")
Signed-off-by: Kairui Song 
---
 kexec/arch/i386/Makefile | 2 +-
 kexec/arch/i386/kexec-x86.h  | 5 +
 kexec/arch/x86_64/kexec-x86_64.h | 5 -
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/kexec/arch/i386/Makefile b/kexec/arch/i386/Makefile
index 105cefd..f486103 100644
--- a/kexec/arch/i386/Makefile
+++ b/kexec/arch/i386/Makefile
@@ -7,6 +7,7 @@ i386_KEXEC_SRCS += kexec/arch/i386/kexec-elf-x86.c
 i386_KEXEC_SRCS += kexec/arch/i386/kexec-elf-rel-x86.c
 i386_KEXEC_SRCS += kexec/arch/i386/kexec-bzImage.c
 i386_KEXEC_SRCS += kexec/arch/i386/kexec-multiboot-x86.c
+i386_KEXEC_SRCS += kexec/arch/i386/kexec-mb2-x86.c
 i386_KEXEC_SRCS += kexec/arch/i386/kexec-beoboot-x86.c
 i386_KEXEC_SRCS += kexec/arch/i386/kexec-nbi.c
 i386_KEXEC_SRCS += kexec/arch/i386/x86-linux-setup.c
@@ -14,7 +15,6 @@ i386_KEXEC_SRCS += kexec/arch/i386/crashdump-x86.c
 
 dist += kexec/arch/i386/Makefile $(i386_KEXEC_SRCS)\
kexec/arch/i386/crashdump-x86.h \
-   kexec/arch/i386/kexec-mb2-x86.c \
kexec/arch/i386/kexec-x86.h \
kexec/arch/i386/x86-linux-setup.h   \
kexec/arch/i386/include/arch/options.h
diff --git a/kexec/arch/i386/kexec-x86.h b/kexec/arch/i386/kexec-x86.h
index 1b58c3b..16d0f6c 100644
--- a/kexec/arch/i386/kexec-x86.h
+++ b/kexec/arch/i386/kexec-x86.h
@@ -60,6 +60,11 @@ int multiboot_x86_load(int argc, char **argv, const char 
*buf, off_t len,
struct kexec_info *info);
 void multiboot_x86_usage(void);
 
+int multiboot2_x86_load(int argc, char **argv, const char *buf, off_t len,
+   struct kexec_info *info);
+void multiboot2_x86_usage(void);
+int multiboot2_x86_probe(const char *buf, off_t buf_len);
+
 int elf_x86_probe(const char *buf, off_t len);
 int elf_x86_load(int argc, char **argv, const char *buf, off_t len,
struct kexec_info *info);
diff --git a/kexec/arch/x86_64/kexec-x86_64.h b/kexec/arch/x86_64/kexec-x86_64.h
index 21c3a73..4cdeffb 100644
--- a/kexec/arch/x86_64/kexec-x86_64.h
+++ b/kexec/arch/x86_64/kexec-x86_64.h
@@ -33,9 +33,4 @@ int bzImage64_load(int argc, char **argv, const char *buf, 
off_t len,
struct kexec_info *info);
 void bzImage64_usage(void);
 
-int multiboot2_x86_load(int argc, char **argv, const char *buf, off_t len,
-   struct kexec_info *info);
-void multiboot2_x86_usage(void);
-int multiboot2_x86_probe(const char *buf, off_t buf_len);
-
 #endif /* KEXEC_X86_64_H */
-- 
2.21.0


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH kexec-tools] x86: re-order includes to avoid duplicate struct e820entry

2019-07-10 Thread Kairui Song
On Wed, Jul 10, 2019 at 4:11 PM Simon Horman  wrote:
>
> On Wed, Jul 03, 2019 at 10:04:32AM +0200, Simon Horman wrote:
> > xenctrl.h defines struct e820entry as:
> >
> >   if defined(__i386__) || defined(__x86_64__)
> >   ...
> >   #define E820_RAM1
> >   ...
> >   struct e820entry {
> >   uint64_t addr;
> >   uint64_t size;
> >   uint32_t type;
> >   } __attribute__((packed));
> >   ...
> >   #endif
> >
> >  $ dpkg-query -S /usr/include/xenctrl.h
> >  libxen-dev:amd64: /usr/include/xenctrl.h
> >  $  dpkg-query -W libxen-dev:amd64
> >  libxen-dev:amd64 4.8.5+shim4.10.2+xsa282-1+deb9u11
> >
> > ./include/x86/x86-linux.h defines struct e820entry as:
> >
> >   #ifndef E820_RAM
> >   struct e820entry {
> >   uint64_t addr;  /* start of memory segment */
> >   uint64_t size;  /* size of memory segment */
> >   uint32_t type;  /* type of memory segment */
> >   #define E820_RAM1
> >   ...
> >   } __attribute__((packed));
> >   #endif
> >
> > Since cedeee0a3007 ("x86: Introduce helpers for getting RSDP address")
> > ./kexec/arch/i386/kexec-x86-common.c includes
> >
> >   +#include "x86-linux-setup.h"
> >#include "../../kexec-xen.h"
> >
> > When xenctrl.h is present the above results in:
> >
> >  $ gcc
> >  ...
> >  In file included from kexec/arch/i386/../../kexec-xen.h:5:0,
> >   from kexec/arch/i386/kexec-x86-common.c:43:
> >  /usr/include/xenctrl.h:1271:8: error: redefinition of 'struct e820entry'
> >   struct e820entry {
> >  ^
> >
> >  In file included from kexec/arch/i386/x86-linux-setup.h:3:0,
> >   from kexec/arch/i386/kexec-x86-common.c:42:
> >  ./include/x86/x86-linux.h:16:8: note: originally defined here
> >   struct e820entry {
> >  ^
> >  ...
> >  $ gcc --version | head -1
> >  gcc (Debian 6.3.0-18+deb9u1) 6.3.0 20170516
> >
> > To mitigate this problem re-order the includes so that
> > x86-linux.h is included after xenctrl.h and thus
> > struct e820entry will only be defined once due to it
> > being defined conditionally in x86-linux.h.
> >
> > In practice the definitions are the same so it should
> > not matter which is chosen.
> >
> > It also seems rather unpleasant to me to need to play
> > with include ordering. Perhaps a better solution in the longer
> > term would be to rename the local definition of struct e820entry.
> >
> > Fixes: cedeee0a3007 ("x86: Introduce helpers for getting RSDP address")
> > Signed-off-by: Simon Horman 
>
> I have applied this change.
>

Thanks for the fix, it looks good, so the "move the helpers to
x86-linux-setup.c" patch should not be needed now.

-- 
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH kexec-tools] x86: re-order includes to avoid duplicate struct e820entry

2019-07-03 Thread Kairui Song
break;
+ }
+ }
+ fclose(fp);
+
+ return acpi_rsdp;
+}
+
+uint64_t get_acpi_rsdp(void)
+{
+ uint64_t acpi_rsdp = 0;
+
+ acpi_rsdp = bootparam_get_acpi_rsdp();
+
+ if (!acpi_rsdp)
+ acpi_rsdp = efi_get_acpi_rsdp();
+
+ return acpi_rsdp;
+}
 void setup_linux_system_parameters(struct kexec_info *info,
 struct x86_linux_param_header *real_mode)
 {
diff --git a/kexec/arch/i386/x86-linux-setup.h
b/kexec/arch/i386/x86-linux-setup.h
index 0c651e5..1e81805 100644
--- a/kexec/arch/i386/x86-linux-setup.h
+++ b/kexec/arch/i386/x86-linux-setup.h
@@ -22,7 +22,7 @@ static inline void setup_linux_bootloader_parameters(
 void setup_linux_system_parameters(struct kexec_info *info,
  struct x86_linux_param_header *real_mode);
 int get_bootparam(void *buf, off_t offset, size_t size);
-
+uint64_t get_acpi_rsdp(void);

 #define SETUP_BASE0x9
 #define KERN32_BASE  0x10 /* 1MB */
-- 
2.21.0

Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH] x86/kexec: Add ACPI NVS region to the ident map

2019-06-10 Thread Kairui Song
On Mon, Jun 10, 2019 at 5:52 PM Borislav Petkov  wrote:
>
> On Mon, Jun 10, 2019 at 03:36:17PM +0800, Kairui Song wrote:
> > With the recent addition of RSDP parsing in decompression stage, kexec
> > kernel now needs ACPI tables to be covered by the identity mapping.
> > And in commit 6bbeb276b71f ("x86/kexec: Add the EFI system tables and
> > ACPI tables to the ident map"), ACPI tables memory region was added to
> > the ident map.
> >
> > But on some machines, there is only ACPI NVS memory region, and the ACPI
> > tables is located in the NVS region instead. In such case second kernel
>
> *are* located - plural.
>
> > will still fail when trying to access ACPI tables.
> >
> > So, to fix the problem, add NVS memory region in the ident map as well.
> >
> > Fixes: 6bbeb276b71f ("x86/kexec: Add the EFI system tables and ACPI tables 
> > to the ident map")
> > Suggested-by: Junichi Nomura 
> > Signed-off-by: Kairui Song 
> > ---
> >
> > Tested with my laptop and VM, on top of current tip:x86/boot.
>
> You tested this in a VM and not on the *actual* machine with the NVS
> region?
>
> This is a joke, right?
>

Hi Boris, unfortunately I don't have a real machine which only has
the NVS region.
I did fake the memmap to emulate the problem, but can't really promise
this will fix the real case.
So I can only declare that it won't break anything that is already working.
And I'm asking Junichi to give it a try, as he reported this issue on the
machines he has.

-- 
Best Regards,
Kairui Song


Re: [PATCH] x86/kexec: Add ACPI NVS region to the ident map

2019-06-10 Thread Kairui Song
On Mon, Jun 10, 2019 at 3:37 PM Kairui Song  wrote:
>
> With the recent addition of RSDP parsing in decompression stage, kexec
> kernel now needs ACPI tables to be covered by the identity mapping.
> And in commit 6bbeb276b71f ("x86/kexec: Add the EFI system tables and
> ACPI tables to the ident map"), ACPI tables memory region was added to
> the ident map.
>
> But on some machines, there is only ACPI NVS memory region, and the ACPI
> tables is located in the NVS region instead. In such case second kernel
> will still fail when trying to access ACPI tables.
>
> So, to fix the problem, add NVS memory region in the ident map as well.
>
> Fixes: 6bbeb276b71f ("x86/kexec: Add the EFI system tables and ACPI tables to 
> the ident map")
> Suggested-by: Junichi Nomura 
> Signed-off-by: Kairui Song 
> ---
>
> Tested with my laptop and VM, on top of current tip:x86/boot.
>
>  arch/x86/kernel/machine_kexec_64.c | 18 +++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kernel/machine_kexec_64.c 
> b/arch/x86/kernel/machine_kexec_64.c
> index 3c77bdf7b32a..a406602fdb3c 100644
> --- a/arch/x86/kernel/machine_kexec_64.c
> +++ b/arch/x86/kernel/machine_kexec_64.c
> @@ -54,14 +54,26 @@ static int mem_region_callback(struct resource *res, void 
> *arg)
>  static int
>  map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p)
>  {
> -   unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> +   int ret;
> +   unsigned long flags;
> struct init_pgtable_data data;
>
> data.info = info;
> data.level4p = level4p;
> flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> -   return walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1,
> -  &data, mem_region_callback);
> +
> +   ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1,
> + &data, mem_region_callback);
> +   if (ret && ret != -EINVAL)
> +   return ret;
> +
> +   /* ACPI tables could be located in ACPI Non-volatile Storage region */
> +   ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1,
> + &data, mem_region_callback);
> +   if (ret && ret != -EINVAL)
> +   return ret;
> +
> +   return 0;
>  }
>  #else
>  static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { 
> return 0; }
> --
> 2.21.0
>

Hi, could you help test the tip branch with this applied? This should
fix all the issues, I can't find any other issues now. Thanks.


--
Best Regards,
Kairui Song


Re: [PATCH v3 0/4] x86: Always try to fill acpi_rsdp_addr in boot params

2019-06-10 Thread Kairui Song
On Fri, May 31, 2019 at 5:27 PM Simon Horman  wrote:
>
> On Fri, May 24, 2019 at 02:23:17PM +0800, Kairui Song wrote:
> > This patch sync the behavior of user space kexec and kexec_file_load,
> > they will both fill the boot_params.acpi_rsdp_addr with a valid RSDP
> > value, to make sure second kernel can always get the RSDP consistently.
> >
> > This will make it effortless to boot newer version of kernel (5.0+)
> > without specifying acpi_rsdp= cmdline on EFI system even with EFI
> > service disabled. Should not change any behavior with older kernels.
> >
> > Update from V2:
> >   - Drop unneeded 'packed' attribute for boot parameters structure
> >   - Don't trust kernel cmdline as a reliable acpi rsdp source
> >
> > Update from V1:
> >   - Split into multiple patches for a cleaner structure, content is not
> > changed.
>
> Thanks Kairui,
>
> applied.

Hi Simon,

I still haven't seen this series get merged in the git repo yet, any update?

--
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH] x86/kexec: Add ACPI NVS region to the ident map

2019-06-10 Thread Kairui Song
With the recent addition of RSDP parsing in decompression stage, kexec
kernel now needs ACPI tables to be covered by the identity mapping.
And in commit 6bbeb276b71f ("x86/kexec: Add the EFI system tables and
ACPI tables to the ident map"), ACPI tables memory region was added to
the ident map.

But on some machines, there is only ACPI NVS memory region, and the ACPI
tables is located in the NVS region instead. In such case second kernel
will still fail when trying to access ACPI tables.

So, to fix the problem, add NVS memory region in the ident map as well.

Fixes: 6bbeb276b71f ("x86/kexec: Add the EFI system tables and ACPI tables to 
the ident map")
Suggested-by: Junichi Nomura 
Signed-off-by: Kairui Song 
---

Tested with my laptop and VM, on top of current tip:x86/boot.

 arch/x86/kernel/machine_kexec_64.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index 3c77bdf7b32a..a406602fdb3c 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -54,14 +54,26 @@ static int mem_region_callback(struct resource *res, void 
*arg)
 static int
 map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p)
 {
-   unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+   int ret;
+   unsigned long flags;
struct init_pgtable_data data;
 
data.info = info;
data.level4p = level4p;
flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-   return walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1,
-  &data, mem_region_callback);
+
+   ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1,
+ &data, mem_region_callback);
+   if (ret && ret != -EINVAL)
+   return ret;
+
+   /* ACPI tables could be located in ACPI Non-volatile Storage region */
+   ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1,
+ &data, mem_region_callback);
+   if (ret && ret != -EINVAL)
+   return ret;
+
+   return 0;
 }
 #else
 static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { 
return 0; }
-- 
2.21.0



Re: [PATCH] x86: Clear isVGA flag if current fb driver is mimicking VGA

2019-05-31 Thread Kairui Song
On Fri, May 31, 2019 at 5:29 PM Simon Horman  wrote:
>
> On Fri, Nov 23, 2018 at 05:28:01PM +0800, Kairui Song wrote:
> > Some device (eg. hyperv_fb) will mimic EFI (or VESA) VGA on first boot
> > up, but after the real driver is loaded, it will switch to new mode
> > and no longer compatible with EFI/VESA VGA. Keep setting
> > orig_video_isVGA to EFI/VESA VGA flag will get wrong driver loaded and
> > try to manipulate the framebuffer in a wrong way.
> >
> > As we have already take care of "VESA VGA" and "EFI VGA", just set the
> > orig_video_isVGA to 0 for any other driver reports as EFI/VESA VGA but
> > is not EFI/VESA VGA.
> >
> > Signed-off-by: Kairui Song 
>
> Sorry for letting this slip through the cracks.
> Please let me know if this is still relevant.

Hi Simon, after fb5a879 ("x86: Introduce a new option
--reuse-video-type") in kexec-tools, this patch is no longer needed.


--
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH] x86: Handle 64bit framebuffer memory address properly

2019-05-31 Thread Kairui Song
On Fri, May 31, 2019 at 5:29 PM Simon Horman  wrote:
>
> On Fri, Nov 23, 2018 at 05:26:33PM +0800, Kairui Song wrote:
> > In a EFI system, the frame buffer address is 64bit, so currently
> > if the address is beyound 4G, kexec will set wrong address due to
> > truncate.
> >
> > Linux kernel commit ae2ee627dc87 ('efifb: Add support for 64-bit
> > frame buffer addresses') added support for 64bit frame buffer
> > address, an 'ext_lfb_base' field is added as the upper 32-bits of
> > the frame buffer, and introduced a new capability flag
> > 'VIDEO_TYPE_CAPABILITY_64BIT_BASE' to indicate if the extend field is
> > used.
> >
> > This patch adopts this change, set proper extent address and capability
> > flag when the address is beyound 4G.
> >
> > Signed-off-by: Kairui Song 
>
> Sorry for letting this slip through the cracks.
> Please let me know if this is still relevant.

Hi Simon, I checked kexec-tools repo and this patch is merged already,
maybe you replied the wrong mail?


--
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v4] vmcore: Add a kernel parameter novmcoredd

2019-05-28 Thread Kairui Song
Since commit 2724273e8fd0 ("vmcore: add API to collect hardware dump in
second kernel"), drivers are allowed to add device related dump data to
vmcore as they want by using the device dump API. This has a potential
issue: the data is stored in memory, so drivers may append too much data
and use too much memory. The vmcore is typically used in a kdump kernel
which runs in a pre-reserved small chunk of memory. So as a result it
can make kdump completely unusable due to OOM issues.

So introduce a new 'novmcoredd' command line option. Users can disable
device dump to reduce memory usage. This is helpful if device dump is
using too much memory: disabling device dump makes sure a regular
vmcore without device dump data is still available.

Signed-off-by: Kairui Song 

---
 Update from V3:
  - Use novmcoredd instead of vmcore_device_dump. Use
vmcore_device_dump and make it off by default is confusing,
novmcoredd is a cleaner way to let user space be able to disable
device dump to save memory.

 Update from V2:
  - Improve related docs

 Update from V1:
  - Use bool parameter to turn it on/off instead of letting user give
the size limit. Size of device dump is hard to determine.

 Documentation/admin-guide/kernel-parameters.txt | 11 +++
 fs/proc/Kconfig |  3 ++-
 fs/proc/vmcore.c|  8 
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 138f6664b2e2..1b900d262680 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2872,6 +2872,17 @@
/sys/module/printk/parameters/console_suspend) to
turn on/off it dynamically.
 
+   novmcoredd  [KNL,KDUMP]
+   Disable device dump. Device dump allows drivers to
+   append dump data to vmcore so you can collect driver
+   specified debug info. The drivers could append the
+   data without any limit, and the data is stored in
+   memory, this may bring a significant memory stress.
+   Disable device dump can help save memory but driver
+   debug data will be no longer available.
+   Only available when CONFIG_PROC_VMCORE_DEVICE_DUMP
+   is set.
+
noaliencache[MM, NUMA, SLAB] Disables the allocation of alien
caches in the slab allocator.  Saves per-node memory,
but will impact performance.
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 817c02b13b1d..62b19162d198 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -57,7 +57,8 @@ config PROC_VMCORE_DEVICE_DUMP
  snapshot.
 
  If you say Y here, the collected device dumps will be added
- as ELF notes to /proc/vmcore.
+ as ELF notes to /proc/vmcore. You can still disabled device
+ dump by command line option 'novmcoredd'.
 
 config PROC_SYSCTL
bool "Sysctl support (/proc/sys)" if EXPERT
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 3fe90443c1bb..e815fd035fc0 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -53,6 +53,9 @@ static struct proc_dir_entry *proc_vmcore;
 /* Device Dump list and mutex to synchronize access to list */
 static LIST_HEAD(vmcoredd_list);
 static DEFINE_MUTEX(vmcoredd_mutex);
+
+static bool vmcoredd_disabled;
+core_param(novmcoredd, vmcoredd_disabled, bool, 0);
 #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
 
 /* Device Dump Size */
@@ -1451,6 +1454,11 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
size_t data_size;
int ret;
 
+   if (vmcoredd_disabled) {
+   pr_err_once("Device dump is disabled\n");
+   return -EINVAL;
+   }
+
if (!data || !strlen(data->dump_name) ||
!data->vmcoredd_callback || !data->size)
return -EINVAL;
-- 
2.21.0


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v6 1/2] x86/kexec: Build identity mapping for EFI systab and ACPI tables

2019-05-27 Thread Kairui Song
On Wed, May 22, 2019 at 2:09 AM Borislav Petkov  wrote:
>
> On Tue, May 21, 2019 at 05:02:59PM +0800, Kairui Song wrote:
> > Hi Boris, would you prefer to just fold Junichi update patch into the
> > previous one or I should send an updated patch?
>
> Please send a patch ontop after Ingo queues your old one, which should
> happen soon. This way it would also document the fact that there are
> machines with NVS regions only.
>
> Thx.
>

Hi, so far I still haven't seen any tip branch pick up this patch,
any update?

--
Best Regards,
Kairui Song


Re: [PATCH v3] vmcore: Add a kernel parameter vmcore_device_dump

2019-05-26 Thread Kairui Song
On Mon, May 27, 2019 at 2:45 AM Bhupesh Sharma  wrote:
>
> On Fri, May 24, 2019 at 6:25 PM Dave Young  wrote:
> >
> > On 05/24/19 at 02:29pm, Kairui Song wrote:
> > > Since commit 2724273e8fd0 ("vmcore: add API to collect hardware dump in
> > > second kernel"), drivers is allowed to add device related dump data to
> > > vmcore as they want by using the device dump API. This have a potential
> > > issue, the data is stored in memory, drivers may append too much data
> > > and use too much memory. The vmcore is typically used in a kdump kernel
> > > which runs in a pre-reserved small chunk of memory. So as a result it
> > > will make kdump unusable at all due to OOM issues.
> > >
> > > So introduce new vmcore_device_dump= kernel parameter, and disable
> > > device dump by default. User can enable it only if device dump data is
> > > required for debugging, and have the chance to increase the kdump
> > > reserved memory accordingly before device dump fails kdump.
> > >
> > > Signed-off-by: Kairui Song 
> > >
> > > ---
> > >
> > >  Update from V2:
> > >   - Improve related docs
> > >
> > >  Update from V1:
> > >   - Use bool parameter to turn it on/off instead of letting user give
> > > the size limit. Size of device dump is hard to determine.
> > >
> > >  Documentation/admin-guide/kernel-parameters.txt | 14 ++
> > >  fs/proc/Kconfig |  6 --
> > >  fs/proc/vmcore.c| 13 +
> > >  3 files changed, 31 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> > > b/Documentation/admin-guide/kernel-parameters.txt
> > > index 138f6664b2e2..3706ad9e1d97 100644
> > > --- a/Documentation/admin-guide/kernel-parameters.txt
> > > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > > @@ -5078,6 +5078,20 @@
> > >   decrease the size and leave more room for directly
> > >   mapped kernel RAM.
> > >
> > > + vmcore_device_dump= [KNL,KDUMP]
> > > + Format: {"off" | "on"}
> > > + Depends on CONFIG_PROC_VMCORE_DEVICE_DUMP.
> > > + This parameter allows enable or disable device dump
> > > + for vmcore on kernel start-up.
> > > + Device dump allows drivers to append dump data to
> > > + vmcore so you can collect driver specified debug 
> > > info.
> > > + Note that the drivers could append the data without
> > > + any limit, and the data is stored in memory, this 
> > > may
> > > + bring a significant memory stress. If you want to 
> > > turn
> > > + on this option, make sure you have reserved enough 
> > > memory
> > > + with crashkernel= parameter.
> > > + default: off
> > > +
> > >   vmcp_cma=nn[MG] [KNL,S390]
> > >   Sets the memory size reserved for contiguous memory
> > >   allocations for the vmcp device driver.
> > > diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
> > > index 817c02b13b1d..1a7a38976bb0 100644
> > > --- a/fs/proc/Kconfig
> > > +++ b/fs/proc/Kconfig
> > > @@ -56,8 +56,10 @@ config PROC_VMCORE_DEVICE_DUMP
> > > recovery kernel's initramfs to collect its underlying device
> > > snapshot.
> > >
> > > -   If you say Y here, the collected device dumps will be added
> > > -   as ELF notes to /proc/vmcore.
> > > +   If you say Y here, a new kernel parameter 'vmcore_device_dump'
> > > +   will be available. You can then enable device dump by passing
> >
> > "a new kernel parameter 'vmcore_device_dump' will be available" is not
> > necessary, "new" is a not a clear word.  I suggest to remove this
> > sentence.
> >
> > s/You can then/You can
>
> I agree with Dave. We are just trying to say here that even if
> CONFIG_PROC_VMCORE_DEVICE_DUMP is set to Y, one can still disable the
> device dump feature by passing parameter 'vmcore_device_dump=off' to
> the kernel.
>
> May be you can use the wording I mentioned in the v2 patch review,
> which tried to convey a similar meaning.
>
> With the change addressed:
> Reviewed-by: Bhupesh Sharma 
>
> Thanks,
> Bhupesh
>
OK, How about:

  If you say Y here, device dump is still disabled by default.
  You can enable device dump by passing 'vmcore_device_dump=on'
  to kernel, the collected device dumps will be added as ELF
  notes to /proc/vmcore.

If you think this is good I'll send V4 including the changes.

-- 
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v3] vmcore: Add a kernel parameter vmcore_device_dump

2019-05-23 Thread Kairui Song
Since commit 2724273e8fd0 ("vmcore: add API to collect hardware dump in
second kernel"), drivers are allowed to add device related dump data to
vmcore as they want by using the device dump API. This has a potential
issue: the data is stored in memory, so drivers may append too much data
and use too much memory. The vmcore is typically used in a kdump kernel
which runs in a pre-reserved small chunk of memory. So as a result it
can make kdump completely unusable due to OOM issues.

So introduce new vmcore_device_dump= kernel parameter, and disable
device dump by default. User can enable it only if device dump data is
required for debugging, and have the chance to increase the kdump
reserved memory accordingly before device dump fails kdump.

Signed-off-by: Kairui Song 

---

 Update from V2:
  - Improve related docs

 Update from V1:
  - Use bool parameter to turn it on/off instead of letting user give
the size limit. Size of device dump is hard to determine.

 Documentation/admin-guide/kernel-parameters.txt | 14 ++
 fs/proc/Kconfig |  6 --
 fs/proc/vmcore.c| 13 +
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 138f6664b2e2..3706ad9e1d97 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5078,6 +5078,20 @@
decrease the size and leave more room for directly
mapped kernel RAM.
 
+   vmcore_device_dump= [KNL,KDUMP]
+   Format: {"off" | "on"}
+   Depends on CONFIG_PROC_VMCORE_DEVICE_DUMP.
+   This parameter allows enable or disable device dump
+   for vmcore on kernel start-up.
+   Device dump allows drivers to append dump data to
+   vmcore so you can collect driver specified debug info.
+   Note that the drivers could append the data without
+   any limit, and the data is stored in memory, this may
+   bring a significant memory stress. If you want to turn
+   on this option, make sure you have reserved enough 
memory
+   with crashkernel= parameter.
+   default: off
+
vmcp_cma=nn[MG] [KNL,S390]
Sets the memory size reserved for contiguous memory
allocations for the vmcp device driver.
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 817c02b13b1d..1a7a38976bb0 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -56,8 +56,10 @@ config PROC_VMCORE_DEVICE_DUMP
  recovery kernel's initramfs to collect its underlying device
  snapshot.
 
- If you say Y here, the collected device dumps will be added
- as ELF notes to /proc/vmcore.
+ If you say Y here, a new kernel parameter 'vmcore_device_dump'
+ will be available. You can then enable device dump by passing
+ 'vmcore_device_dump=on' to kernel, the collected device dumps
+ will be added as ELF notes to /proc/vmcore.
 
 config PROC_SYSCTL
bool "Sysctl support (/proc/sys)" if EXPERT
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 3fe90443c1bb..d1b608b0efad 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -53,6 +53,8 @@ static struct proc_dir_entry *proc_vmcore;
 /* Device Dump list and mutex to synchronize access to list */
 static LIST_HEAD(vmcoredd_list);
 static DEFINE_MUTEX(vmcoredd_mutex);
+
+static bool vmcoredd_enabled;
 #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
 
 /* Device Dump Size */
@@ -1451,6 +1453,11 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
size_t data_size;
int ret;
 
+   if (!vmcoredd_enabled) {
+   pr_err_once("Device dump is disabled\n");
+   return -EINVAL;
+   }
+
if (!data || !strlen(data->dump_name) ||
!data->vmcoredd_callback || !data->size)
return -EINVAL;
@@ -1502,6 +1509,12 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
return ret;
 }
 EXPORT_SYMBOL(vmcore_add_device_dump);
+
+static int __init vmcoredd_parse_cmdline(char *arg)
+{
+   return kstrtobool(arg, &vmcoredd_enabled);
+}
+__setup("vmcore_device_dump=", vmcoredd_parse_cmdline);
 #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
 
 /* Free all dumps in vmcore device dump list */
-- 
2.21.0



[PATCH v3 0/4] x86: Always try to fill acpi_rsdp_addr in boot params

2019-05-23 Thread Kairui Song
This patch series syncs the behavior of user space kexec and
kexec_file_load: both will fill boot_params.acpi_rsdp_addr with a valid
RSDP value, to make sure the second kernel can always get the RSDP
consistently.

This will make it effortless to boot newer versions of the kernel (5.0+)
without specifying acpi_rsdp= on the cmdline on EFI systems, even with EFI
services disabled. It should not change any behavior with older kernels.

Update from V2:
  - Drop unneeded 'packed' attribute for boot parameters structure
  - Don't trust kernel cmdline as a reliable acpi rsdp source

Update from V1:
  - Split into multiple patches for a cleaner structure, content is not
changed.

Kairui Song (4):
  x86: Update boot parameters defination
  x86: Introduce helpers for getting RSDP address
  x86: Always try to fill acpi_rsdp_addr in boot params
  crashdump/x86: Use new introduce helper for getting RSDP

 include/x86/x86-linux.h|  6 +++--
 kexec/arch/i386/crashdump-x86.c| 34 +++
 kexec/arch/i386/kexec-x86-common.c | 43 ++
 kexec/arch/i386/kexec-x86.h|  1 +
 kexec/arch/i386/x86-linux-setup.c  |  6 +++--
 kexec/arch/i386/x86-linux-setup.h  |  1 +
 6 files changed, 62 insertions(+), 29 deletions(-)

-- 
2.21.0


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v3 3/4] x86: Always try to fill acpi_rsdp_addr in boot params

2019-05-23 Thread Kairui Song
Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address
from boot params if available"), the kernel accepts an acpi_rsdp_addr param
in boot_params. So fill in this parameter unconditionally, to ensure the
second kernel always gets the right RSDP address consistently and boots well
on EFI systems even with EFI services disabled. Users no longer need to
change the kernel cmdline to work around the missing RSDP issue.

For older kernel versions (before 5.0), there won't be any change in
behavior.

Signed-off-by: Kairui Song 
---
 kexec/arch/i386/x86-linux-setup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kexec/arch/i386/x86-linux-setup.c 
b/kexec/arch/i386/x86-linux-setup.c
index 5ca7c25..5b00b42 100644
--- a/kexec/arch/i386/x86-linux-setup.c
+++ b/kexec/arch/i386/x86-linux-setup.c
@@ -901,4 +901,7 @@ void setup_linux_system_parameters(struct kexec_info *info,
 
/* fill the EDD information */
setup_edd_info(real_mode);
+
+   /* Always try to fill acpi_rsdp_addr */
+   real_mode->acpi_rsdp_addr = get_acpi_rsdp();
 }
-- 
2.21.0


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v3 4/4] crashdump/x86: Use new introduce helper for getting RSDP

2019-05-23 Thread Kairui Song
Use the newly introduced helper for getting the RSDP; this ensures the RSDP
is always accessible and avoids code duplication.

Signed-off-by: Kairui Song 
---
 kexec/arch/i386/crashdump-x86.c | 34 +
 1 file changed, 9 insertions(+), 25 deletions(-)

diff --git a/kexec/arch/i386/crashdump-x86.c b/kexec/arch/i386/crashdump-x86.c
index 140f45b..a2aea31 100644
--- a/kexec/arch/i386/crashdump-x86.c
+++ b/kexec/arch/i386/crashdump-x86.c
@@ -787,35 +787,19 @@ static int sysfs_efi_runtime_map_exist(void)
 /* Appends 'acpi_rsdp=' commandline for efi boot crash dump */
 static void cmdline_add_efi(char *cmdline)
 {
-   FILE *fp;
-   int cmdlen, len;
-   char line[MAX_LINE], *s;
-   const char *acpis = " acpi_rsdp=";
+   uint64_t acpi_rsdp;
+   char acpi_rsdp_buf[MAX_LINE];
 
-   fp = fopen("/sys/firmware/efi/systab", "r");
-   if (!fp)
-   return;
+   acpi_rsdp = get_acpi_rsdp();
 
-   while(fgets(line, sizeof(line), fp) != 0) {
-   /* ACPI20= always goes before ACPI= */
-   if ((strstr(line, "ACPI20=")) || (strstr(line, "ACPI="))) {
-   line[strlen(line) - 1] = '\0';
-   s = strchr(line, '=');
-   s += 1;
-   len = strlen(s) + strlen(acpis);
-   cmdlen = strlen(cmdline) + len;
-   if (cmdlen > (COMMAND_LINE_SIZE - 1))
-   die("Command line overflow\n");
-   strcat(cmdline, acpis);
-   strcat(cmdline, s);
-   dbgprintf("Command line after adding efi\n");
-   dbgprintf("%s\n", cmdline);
+   if (!acpi_rsdp)
+   return;
 
-   break;
-   }
-   }
+   sprintf(acpi_rsdp_buf, " acpi_rsdp=0x%lx", acpi_rsdp);
+   if (strlen(cmdline) + strlen(acpi_rsdp_buf) > (COMMAND_LINE_SIZE - 1))
+   die("Command line overflow\n");
 
-   fclose(fp);
+   strcat(cmdline, acpi_rsdp_buf);
 }
 
 static void get_backup_area(struct kexec_info *info,
-- 
2.21.0


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v3 2/4] x86: Introduce helpers for getting RSDP address

2019-05-23 Thread Kairui Song
On x86, the RSDP is fundamental for booting the machine. When the second
kernel is incapable of finding the RSDP address (e.g. kexec'ing the next
kernel on an EFI system with EFI services disabled), kexec should prepare
the RSDP address for the second kernel.

Introduce helpers for getting RSDP from multiple sources, including boot
params and EFI firmware.

For the legacy BIOS interface, there is no better way to find the RSDP
address than scanning the memory region and searching for it, and this will
always be done by the kernel as a fallback, so there is no need to try to
get the RSDP address for that case.

Signed-off-by: Kairui Song 
---
 kexec/arch/i386/kexec-x86-common.c | 43 ++
 kexec/arch/i386/kexec-x86.h|  1 +
 kexec/arch/i386/x86-linux-setup.c  |  3 +--
 kexec/arch/i386/x86-linux-setup.h  |  1 +
 4 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/kexec/arch/i386/kexec-x86-common.c 
b/kexec/arch/i386/kexec-x86-common.c
index de99758..5c55ec8 100644
--- a/kexec/arch/i386/kexec-x86-common.c
+++ b/kexec/arch/i386/kexec-x86-common.c
@@ -39,6 +39,7 @@
 #include "../../firmware_memmap.h"
 #include "../../crashdump.h"
 #include "kexec-x86.h"
+#include "x86-linux-setup.h"
 #include "../../kexec-xen.h"
 
 /* Used below but not present in (older?) xenctrl.h */
@@ -392,4 +393,46 @@ int get_memory_ranges(struct memory_range **range, int 
*ranges,
return ret;
 }
 
+static uint64_t bootparam_get_acpi_rsdp(void) {
+   uint64_t acpi_rsdp = 0;
+   off_t offset = offsetof(struct x86_linux_param_header, acpi_rsdp_addr);
 
+   if (get_bootparam(&acpi_rsdp, offset, sizeof(acpi_rsdp)))
+   return 0;
+
+   return acpi_rsdp;
+}
+
+static uint64_t efi_get_acpi_rsdp(void) {
+   FILE *fp;
+   char line[MAX_LINE], *s;
+   uint64_t acpi_rsdp = 0;
+
+   fp = fopen("/sys/firmware/efi/systab", "r");
+   if (!fp)
+   return acpi_rsdp;
+
+   while(fgets(line, sizeof(line), fp) != 0) {
+   /* ACPI20= always goes before ACPI= */
+   if ((strstr(line, "ACPI20=")) || (strstr(line, "ACPI="))) {
+   s = strchr(line, '=') + 1;
+   sscanf(s, "0x%lx", &acpi_rsdp);
+   break;
+   }
+   }
+   fclose(fp);
+
+   return acpi_rsdp;
+}
+
+uint64_t get_acpi_rsdp(void)
+{
+   uint64_t acpi_rsdp = 0;
+
+   acpi_rsdp = bootparam_get_acpi_rsdp();
+
+   if (!acpi_rsdp)
+   acpi_rsdp = efi_get_acpi_rsdp();
+
+   return acpi_rsdp;
+}
diff --git a/kexec/arch/i386/kexec-x86.h b/kexec/arch/i386/kexec-x86.h
index c2bcd37..1b58c3b 100644
--- a/kexec/arch/i386/kexec-x86.h
+++ b/kexec/arch/i386/kexec-x86.h
@@ -86,4 +86,5 @@ int nbi_load(int argc, char **argv, const char *buf, off_t 
len,
 void nbi_usage(void);
 
 extern unsigned xen_e820_to_kexec_type(uint32_t type);
+extern uint64_t get_acpi_rsdp(void);
 #endif /* KEXEC_X86_H */
diff --git a/kexec/arch/i386/x86-linux-setup.c 
b/kexec/arch/i386/x86-linux-setup.c
index 8fad115..5ca7c25 100644
--- a/kexec/arch/i386/x86-linux-setup.c
+++ b/kexec/arch/i386/x86-linux-setup.c
@@ -123,7 +123,6 @@ void setup_linux_bootloader_parameters_high(
cmdline_ptr[cmdline_len - 1] = '\0';
 }
 
-static int get_bootparam(void *buf, off_t offset, size_t size);
 static int setup_linux_vesafb(struct x86_linux_param_header *real_mode)
 {
struct fb_fix_screeninfo fix;
@@ -452,7 +451,7 @@ char *find_mnt_by_fsname(char *fsname)
return mntdir;
 }
 
-static int get_bootparam(void *buf, off_t offset, size_t size)
+int get_bootparam(void *buf, off_t offset, size_t size)
 {
int data_file;
char *debugfs_mnt, *sysfs_mnt;
diff --git a/kexec/arch/i386/x86-linux-setup.h 
b/kexec/arch/i386/x86-linux-setup.h
index f5d23d3..0c651e5 100644
--- a/kexec/arch/i386/x86-linux-setup.h
+++ b/kexec/arch/i386/x86-linux-setup.h
@@ -21,6 +21,7 @@ static inline void setup_linux_bootloader_parameters(
 }
 void setup_linux_system_parameters(struct kexec_info *info,
struct x86_linux_param_header *real_mode);
+int get_bootparam(void *buf, off_t offset, size_t size);
 
 
 #define SETUP_BASE0x9
-- 
2.21.0


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v3 1/4] x86: Update boot parameters defination

2019-05-23 Thread Kairui Song
Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address
from boot params if available"), the kernel accepts an acpi_rsdp_addr param
in boot_params. Sync the x86_linux_param_header to support this param.

Signed-off-by: Kairui Song 
---
 include/x86/x86-linux.h | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/x86/x86-linux.h b/include/x86/x86-linux.h
index 352ea02..9646102 100644
--- a/include/x86/x86-linux.h
+++ b/include/x86/x86-linux.h
@@ -45,7 +45,6 @@ struct apm_bios_info {
uint16_t cseg_len;  /* 0x4e */
uint16_t cseg_16_len;   /* 0x50 */
uint16_t dseg_len;  /* 0x52 */
-   uint8_t  reserved[44];  /* 0x54 */
 };
 
 /*
@@ -113,12 +112,15 @@ struct x86_linux_param_header {
uint8_t  reserved4[2];  /* 0x3e -- 0x3f reserved for 
future expansion */
 
struct apm_bios_info apm_bios_info; /* 0x40 */
+   uint8_t  reserved4_1[28];   /* 0x54 */
+   uint64_t acpi_rsdp_addr;/* 0x70 */
+   uint8_t  reserved4_2[8];/* 0x78 */
struct drive_info_struct drive_info;/* 0x80 */
struct sys_desc_table sys_desc_table;   /* 0xa0 */
uint32_t ext_ramdisk_image; /* 0xc0 */
uint32_t ext_ramdisk_size;  /* 0xc4 */
uint32_t ext_cmd_line_ptr;  /* 0xc8 */
-   uint8_t reserved4_1[0x1c0 - 0xcc];  /* 0xe4 */
+   uint8_t reserved4_3[0x1c0 - 0xcc];  /* 0xe4 */
uint8_t efi_info[32];   /* 0x1c0 */
uint32_t alt_mem_k; /* 0x1e0 */
uint8_t  reserved5[4];  /* 0x1e4 */
-- 
2.21.0


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v2 1/4] x86: Update boot parameters defination

2019-05-23 Thread Kairui Song
On Thu, May 23, 2019 at 11:01 AM Dave Young  wrote:
>
> Hi Kairui
> On 05/14/19 at 01:09pm, Kairui Song wrote:
> > Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address
> > from boot params if available"), kernel accept a acpi_rsdp_addr param in
> > boot_params. Sync the x86_linux_param_header to support this param.
> >
> > Signed-off-by: Kairui Song 
> > ---
> >  include/x86/x86-linux.h | 8 +---
> >  1 file changed, 5 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/x86/x86-linux.h b/include/x86/x86-linux.h
> > index 352ea02..a5d8df8 100644
> > --- a/include/x86/x86-linux.h
> > +++ b/include/x86/x86-linux.h
> > @@ -45,8 +45,7 @@ struct apm_bios_info {
> >   uint16_t cseg_len;  /* 0x4e */
> >   uint16_t cseg_16_len;   /* 0x50 */
> >   uint16_t dseg_len;  /* 0x52 */
> > - uint8_t  reserved[44];  /* 0x54 */
> > -};
> > +} __attribute__((packed));
>
> It should be good to keep same as the kernel header without packed
> attribute.
>
> is it possible to sync the latest mainline uapi bootparam header file,
> maybe after this series get solved?
>

Hi Dave,

I can remove the packed attr then, just ensure it won't break
anything, a bit paranoid maybe...

About syncing the whole structure: the problem is that the kexec-tools
header structure is different, so that would change a lot. Maybe it could
be discussed separately.

-- 
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v2 2/4] x86: Introduce helpers for getting RSDP address

2019-05-23 Thread Kairui Song
On Thu, May 23, 2019 at 11:43 AM Kairui Song  wrote:
>
> On Thu, May 23, 2019 at 11:16 AM Dave Young  wrote:
> >
> > On 05/14/19 at 01:09pm, Kairui Song wrote:
> > > On x86 RSDP is fundamental for booting the machine. When second kernel
> > > is incapable of parsing the RSDP address (eg. kexec next kernel on an EFI
> > > system with EFI service disabled), kexec should prepare the RSDP address
> > > for second kernel.
> > >
> > > Introduce helpers for getting RSDP from multiple sources, including boot
> > > params, cmdline and EFI firmware.
> > >
> > > For legacy BIOS interface, there is no better way to find the RSDP address
> > > rather than scanning the memory region and search for it, and this will
> > > always be done by the kernel as a fallback, so this is no need to try to
> > > get the RSDP address for that case.
> > >
> > > Signed-off-by: Kairui Song 
> > > ---
> > >  kexec/arch/i386/kexec-x86-common.c | 60 ++
> > >  kexec/arch/i386/kexec-x86.h|  1 +
> > >  kexec/arch/i386/x86-linux-setup.c  |  3 +-
> > >  kexec/arch/i386/x86-linux-setup.h  |  1 +
> > >  4 files changed, 63 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/kexec/arch/i386/kexec-x86-common.c 
> > > b/kexec/arch/i386/kexec-x86-common.c
> > > index de99758..4b8eb26 100644
> > > --- a/kexec/arch/i386/kexec-x86-common.c
> > > +++ b/kexec/arch/i386/kexec-x86-common.c
> > > @@ -39,6 +39,7 @@
> > >  #include "../../firmware_memmap.h"
> > >  #include "../../crashdump.h"
> > >  #include "kexec-x86.h"
> > > +#include "x86-linux-setup.h"
> > >  #include "../../kexec-xen.h"
> > >
> > >  /* Used below but not present in (older?) xenctrl.h */
> > > @@ -392,4 +393,63 @@ int get_memory_ranges(struct memory_range **range, 
> > > int *ranges,
> > >   return ret;
> > >  }
> > >
> > > +static uint64_t cmdline_get_acpi_rsdp(void) {
> > > + uint64_t acpi_rsdp = 0;
> > > + char *tmp_cmdline, *rsdp_param;
> > >
> > > + tmp_cmdline = get_command_line();
> > > + rsdp_param = strstr(tmp_cmdline, "acpi_rsdp=");
> >
> > strstr will locate the first acpi_rsdp, what about multiple acpi_rsdp
> > provided?
>
> Good catch, should always use the latest acpi_rsdp provided, will fix that.
>
> >
> > BTW, if one provide a wrong adress in acpi_rsdp= cmdline then it is not
> > usable.
> >
>
> I think in that case kernel will not boot. If kexec is available then
> it means a right value is given.
>

After double-checking, the kernel will boot even if a wrong acpi_rsdp is
given, so I'll drop this part. boot_params in newer kernels will be enough
to make sure a kexec boot loop with EFI disabled won't fail.

-- 
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v2] vmcore: Add a kernel cmdline vmcore_device_dump

2019-05-23 Thread Kairui Song
On Wed, May 22, 2019 at 1:38 PM Dave Young  wrote:
>
> On 05/20/19 at 02:18pm, Kairui Song wrote:
> > Since commit 2724273e8fd0 ('vmcore: add API to collect hardware dump in
> > second kernel'), drivers is allowed to add device related dump data to
> > vmcore as they want by using the device dump API. This have a potential
> > issue, the data is stored in memory, drivers may append too much data
> > and use too much memory. The vmcore is typically used in a kdump kernel
> > which runs in a pre-reserved small chunk of memory. So as a result it
> > will make kdump unusable at all due to OOM issues.
> >
> > So introduce new vmcore_device_dump= kernel parameter, and disable
> > device dump by default. User can enable it only if device dump data is
> > required for debugging, and have the chance to increase the kdump
> > reserved memory accordingly before device dump fails kdump.
> >
> > Signed-off-by: Kairui Song 
> > ---
> >  Update from V1:
> >   - Use bool parameter to turn it on/off instead of letting user give
> > the size limit. Size of device dump is hard to determine.
> >
> >  Documentation/admin-guide/kernel-parameters.txt | 15 +++
> >  fs/proc/vmcore.c| 13 +
> >  2 files changed, 28 insertions(+)
> >
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> > b/Documentation/admin-guide/kernel-parameters.txt
> > index 43176340c73d..2d48e39fd080 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -5062,6 +5062,21 @@
> >   decrease the size and leave more room for directly
> >   mapped kernel RAM.
> >
> > + vmcore_device_dump=
> > + [VMCORE]
>
> It looks better to have above two line merged in one line, also use
> [KNL, KDUMP] will be better.
>
> > + Format: {"off" | "on"}
> > + If CONFIG_PROC_VMCORE_DEVICE_DUMP is set,
> > + this parameter allows enable or disable device dump
> > + for vmcore.
> > + Device dump allows drivers to append dump data to
> > + vmcore so you can collect driver specified debug info.
> > + Note that the drivers could append the data without
> > + any limit, and the data is stored in memory, this may
> > + bring a significant memory stress. If you want to turn
> > + on this option, make sure you have reserved enough 
> > memory
> > + with crashkernel= parameter.
> > + default: off
> > +
> >   vmcp_cma=nn[MG] [KNL,S390]
> >   Sets the memory size reserved for contiguous memory
> >   allocations for the vmcp device driver.
> > diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
> > index 3fe90443c1bb..d1b608b0efad 100644
> > --- a/fs/proc/vmcore.c
> > +++ b/fs/proc/vmcore.c
> > @@ -53,6 +53,8 @@ static struct proc_dir_entry *proc_vmcore;
> >  /* Device Dump list and mutex to synchronize access to list */
> >  static LIST_HEAD(vmcoredd_list);
> >  static DEFINE_MUTEX(vmcoredd_mutex);
> > +
> > +static bool vmcoredd_enabled;
> >  #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
> >
> >  /* Device Dump Size */
> > @@ -1451,6 +1453,11 @@ int vmcore_add_device_dump(struct vmcoredd_data 
> > *data)
> >   size_t data_size;
> >   int ret;
> >
> > + if (!vmcoredd_enabled) {
> > + pr_err_once("Device dump is disabled\n");
> > + return -EINVAL;
> > + }
> > +
> >   if (!data || !strlen(data->dump_name) ||
> >   !data->vmcoredd_callback || !data->size)
> >   return -EINVAL;
> > @@ -1502,6 +1509,12 @@ int vmcore_add_device_dump(struct vmcoredd_data 
> > *data)
> >   return ret;
> >  }
> >  EXPORT_SYMBOL(vmcore_add_device_dump);
> > +
> > +static int __init vmcoredd_parse_cmdline(char *arg)
> > +{
> > + return kstrtobool(arg, &vmcoredd_enabled);
> > +}
> > +__setup("vmcore_device_dump=", vmcoredd_parse_cmdline);
> >  #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
> >
> >  /* Free all dumps in vmcore device dump list */
> > --
> > 2.21.0
> >
>
> Thanks
> Dave

Good suggestion, I'll update in V3.

-- 
Best Regards,
Kairui Song


Re: [PATCH v2] vmcore: Add a kernel cmdline vmcore_device_dump

2019-05-23 Thread Kairui Song
On Thu, May 23, 2019 at 2:44 AM Bhupesh Sharma  wrote:
>
> On 05/20/2019 11:48 AM, Kairui Song wrote:
> > Since commit 2724273e8fd0 ('vmcore: add API to collect hardware dump in
> > second kernel'), drivers is allowed to add device related dump data to
> > vmcore as they want by using the device dump API. This have a potential
> > issue, the data is stored in memory, drivers may append too much data
> > and use too much memory. The vmcore is typically used in a kdump kernel
> > which runs in a pre-reserved small chunk of memory. So as a result it
> > will make kdump unusable at all due to OOM issues.
> >
> > So introduce new vmcore_device_dump= kernel parameter, and disable
> > device dump by default. User can enable it only if device dump data is
> > required for debugging, and have the chance to increase the kdump
> > reserved memory accordingly before device dump fails kdump.
> >
> > Signed-off-by: Kairui Song 
> > ---
> >   Update from V1:
> >- Use bool parameter to turn it on/off instead of letting user give
> >  the size limit. Size of device dump is hard to determine.
> >
> >   Documentation/admin-guide/kernel-parameters.txt | 15 +++
> >   fs/proc/vmcore.c| 13 +
> >   2 files changed, 28 insertions(+)
> >
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> > b/Documentation/admin-guide/kernel-parameters.txt
> > index 43176340c73d..2d48e39fd080 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -5062,6 +5062,21 @@
> >   decrease the size and leave more room for directly
> >   mapped kernel RAM.
> >
> > + vmcore_device_dump=
> > + [VMCORE]
> > + Format: {"off" | "on"}
> > + If CONFIG_PROC_VMCORE_DEVICE_DUMP is set,
> > + this parameter allows enable or disable device dump
> > + for vmcore.
>
> We can add a simpler description here, something like:
> Depends on CONFIG_PROC_VMCORE_DEVICE_DUMP
>
> > + Device dump allows drivers to append dump data to
> > + vmcore so you can collect driver specified debug info.
> > + Note that the drivers could append the data without
> > + any limit, and the data is stored in memory, this may
> > + bring a significant memory stress. If you want to turn
> > + on this option, make sure you have reserved enough 
> > memory
> > + with crashkernel= parameter.
> > + default: off
>
> ... and massage the rest of text accordingly.
>
> Better to also modify the help text for 'PROC_VMCORE_DEVICE_DUMP' config
> option defined in 'fs/proc/Kconfig'. Something like:
>
> config PROC_VMCORE_DEVICE_DUMP
> bool "Device Hardware/Firmware Log Collection"
> <..snip..>
>   If you say Y here, the collected device dumps will be added
>   as ELF notes to /proc/vmcore.
>
>   If this option is selected, device dump collection can still be
> disabled by passing vmcore_device_dump=off to the kernel.
>
> See config INTEL_IOMMU_DEFAULT_ON in 'drivers/iommu/Kconfig' as an example.
>

Good suggestion! I'll update in V3.

-- 
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH v2 2/4] x86: Introduce helpers for getting RSDP address

2019-05-22 Thread Kairui Song
On Thu, May 23, 2019 at 11:16 AM Dave Young  wrote:
>
> On 05/14/19 at 01:09pm, Kairui Song wrote:
> > On x86 RSDP is fundamental for booting the machine. When second kernel
> > is incapable of parsing the RSDP address (eg. kexec next kernel on an EFI
> > system with EFI service disabled), kexec should prepare the RSDP address
> > for second kernel.
> >
> > Introduce helpers for getting RSDP from multiple sources, including boot
> > params, cmdline and EFI firmware.
> >
> > For legacy BIOS interface, there is no better way to find the RSDP address
> > rather than scanning the memory region and search for it, and this will
> > always be done by the kernel as a fallback, so this is no need to try to
> > get the RSDP address for that case.
> >
> > Signed-off-by: Kairui Song 
> > ---
> >  kexec/arch/i386/kexec-x86-common.c | 60 ++
> >  kexec/arch/i386/kexec-x86.h|  1 +
> >  kexec/arch/i386/x86-linux-setup.c  |  3 +-
> >  kexec/arch/i386/x86-linux-setup.h  |  1 +
> >  4 files changed, 63 insertions(+), 2 deletions(-)
> >
> > diff --git a/kexec/arch/i386/kexec-x86-common.c 
> > b/kexec/arch/i386/kexec-x86-common.c
> > index de99758..4b8eb26 100644
> > --- a/kexec/arch/i386/kexec-x86-common.c
> > +++ b/kexec/arch/i386/kexec-x86-common.c
> > @@ -39,6 +39,7 @@
> >  #include "../../firmware_memmap.h"
> >  #include "../../crashdump.h"
> >  #include "kexec-x86.h"
> > +#include "x86-linux-setup.h"
> >  #include "../../kexec-xen.h"
> >
> >  /* Used below but not present in (older?) xenctrl.h */
> > @@ -392,4 +393,63 @@ int get_memory_ranges(struct memory_range **range, int 
> > *ranges,
> >   return ret;
> >  }
> >
> > +static uint64_t cmdline_get_acpi_rsdp(void) {
> > + uint64_t acpi_rsdp = 0;
> > + char *tmp_cmdline, *rsdp_param;
> >
> > + tmp_cmdline = get_command_line();
> > + rsdp_param = strstr(tmp_cmdline, "acpi_rsdp=");
>
> strstr will locate the first acpi_rsdp, what about multiple acpi_rsdp
> provided?

Good catch, should always use the latest acpi_rsdp provided, will fix that.

>
> BTW, if one provide a wrong adress in acpi_rsdp= cmdline then it is not
> usable.
>

I think in that case kernel will not boot. If kexec is available then
it means a right value is given.

> So not sure if adding this cmdline param is necessary, maybe only add
> efi case will be reliable.

Adding the cmdline param ensure kexec boot loop won't fail. eg. in an
older version kernel booted with kexec, and have EFI disabled, then
cmdline is the only source for getting and storing the RSDP address.

>
> > +
> > + if (rsdp_param)
> > + sscanf(rsdp_param, "acpi_rsdp=%lx", &acpi_rsdp);
> > +
> > + free(tmp_cmdline);
> > + return acpi_rsdp;
> > +}
> > +
> > +static uint64_t bootparam_get_acpi_rsdp(void) {
> > + uint64_t acpi_rsdp = 0;
> > + off_t offset = offsetof(struct x86_linux_param_header, 
> > acpi_rsdp_addr);
> > +
> > + if (get_bootparam(&acpi_rsdp, offset, sizeof(acpi_rsdp)))
> > + return 0;
> > +
> > + return acpi_rsdp;
> > +}
> > +
> > +static uint64_t efi_get_acpi_rsdp(void) {
> > + FILE *fp;
> > + char line[MAX_LINE], *s;
> > + uint64_t acpi_rsdp = 0;
> > +
> > + fp = fopen("/sys/firmware/efi/systab", "r");
> > + if (!fp)
> > + return acpi_rsdp;
> > +
> > + while(fgets(line, sizeof(line), fp) != 0) {
> > + /* ACPI20= always goes before ACPI= */
> > + if ((strstr(line, "ACPI20=")) || (strstr(line, "ACPI="))) {
> > + s = strchr(line, '=') + 1;
> > + sscanf(s, "0x%lx", &acpi_rsdp);
> > + break;
> > + }
> > + }
> > + fclose(fp);
> > +
> > + return acpi_rsdp;
> > +}
> > +
> > +uint64_t get_acpi_rsdp(void)
> > +{
> > + uint64_t acpi_rsdp = 0;
> > +
> > + acpi_rsdp = cmdline_get_acpi_rsdp();
> > +
> > + if (!acpi_rsdp)
> > + acpi_rsdp = bootparam_get_acpi_rsdp();
> > +
> > + if (!acpi_rsdp)
> > + acpi_rsdp = efi_get_acpi_rsdp();
> > +
> > + return acpi_rsdp;
> > +}
> > diff --git a/kexec/arch/i386/kexec-x86.h b/kexec/arch/i386/kexec-x86.h
> > index c2bcd37..1b58c

Re: [PATCH v6 1/2] x86/kexec: Build identity mapping for EFI systab and ACPI tables

2019-05-21 Thread Kairui Song
On Wed, May 15, 2019 at 3:10 PM Junichi Nomura  wrote:
>
> On 5/15/19 3:58 PM, Borislav Petkov wrote:
> > On Wed, May 15, 2019 at 05:17:19AM +, Junichi Nomura wrote:
> >> Hi Kairui,
> >>
> >> On 5/13/19 5:02 PM, Baoquan He wrote:
> >>> On 05/13/19 at 09:50am, Borislav Petkov wrote:
> >>>> On Mon, May 13, 2019 at 03:32:54PM +0800, Baoquan He wrote:
> >>>> So we're going to try it again this cycle and if there's no fallout, it
> >>>> will go upstream. If not, it will have to be fixed. The usual thing.
> >>>>
> >>>> And I don't care if Kairui's patch fixes this one problem - judging by
> >>>> the fragility of this whole thing, it should be hammered on one more
> >>>> cycle on as many boxes as possible to make sure there's no other SNAFUs.
> >>>>
> >>>> So go test it on more machines instead. I've pushed it here:
> >>>>
> >>>> https://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git/log/?h=next-merge-window
> >>>
> >>> Pingfan has got a machine to reproduce the kexec breakage issue, and
> >>> applying these two patches fix it. He planned to paste the test result.
> >>> I will ask him to try this branch if he has time, or I can get his
> >>> machine to test.
> >>>
> >>> Junichi, also have a try on Boris's branch in NEC's test environment?
> >>
> >> while the patch set works on most of the machines I'm testing around,
> >> I found kexec(1) fails to load kernel on a few machines if this patch
> >> is applied.  Those machines don't have IORES_DESC_ACPI_TABLES region
> >> and have ACPI tables in IORES_DESC_ACPI_NV_STORAGE region instead.
> >
> > Why? What kind of machines are those?
>
> I don't know.  They are just general purpose Xeon-based servers
> and not some special purpose machines.  So I guess there are other
> such machines in the wild.
>

Hi, I think it's reasonable to update the patch to include the
NV_STORAGE regions as well, most likely the firmware only provided
NV_STORAGE region? Can you help confirm if the e820 didn't contain
ACPI data, and only ACPI NVS?

I had a try with this update patch, it worked and didn't break anything.

Hi Boris, would you prefer to just fold Junichi update patch into the
previous one or I should send an updated patch?


--
Best Regards,
Kairui Song


Re: [RFC PATCH] vmcore: Add a kernel cmdline device_dump_limit

2019-05-19 Thread Kairui Song
On Mon, May 20, 2019 at 1:55 PM Bhupesh Sharma  wrote:
>
> On 05/16/2019 01:49 PM, Kairui Song wrote:
> > On Fri, May 10, 2019 at 7:17 PM Bhupesh Sharma  wrote:
> >>
> >> Hi Kairui,
> >>
> >> Thanks for the patch. Please see my comments in-line:
> >>
> >> On 05/10/2019 03:50 PM, Kairui Song wrote:
> >>> Device dump allow drivers to add device related dump data to vmcore as
> >>> they want. This have a potential issue, the data is stored in memory,
> >>> drivers may append too much data and use too much memory. The vmcore is
> >>> typically used in a kdump kernel which runs in a pre-reserved small
> >>> chunk of memory. So as a result it will make kdump unusable at all due
> >>> to OOM issues.
> >>>
> >>> So introduce new device_dump_limit= kernel parameter, and set the
> >>> default limit to 0, so device dump is not enabled unless user specify
> >>> the accetable maxiam
> >>
> >>  acceptable maximum
> >
> > Will fix this typo.
>
> Ok.
>
> >>> memory usage for device dump data. In this way user
> >>> will also have the chance to adjust the kdump reserved memory
> >>> accordingly.
> >>
> >> Hmmm., this doesn't give much confidence with the
> >> PROC_VMCORE_DEVICE_DUMP feature in its current shape. Rather shouldn't
> >> we be enabling config PROC_VMCORE_DEVICE_DUMP only under EXPERT mode for
> >> now, considering that this feature needs further thrashing and testing
> >> with real setups including platforms where drivers append large amounts
> >> of data to vmcore:
> >
> > I think no need to move it to expert mode, just leave it disabled by
> > default should be better, that should be enough to make sure driver
> > won't append that much memory and cause OOM, while it could still be
> > enabled without changing the kernel, so this feature won't bring extra
> > risk, and could be enabled anytime easily.
>
> I have seen some arm64 users report issues on mailing lists with
> PROC_VMCORE_DEVICE_DUMP enabled as this causes frequent OOM in the arm64
> crash dump kernel.
>
> I think they are using this infrastructure to extend/enable device
> driver debugging on some arm64 platforms and finding issues with the
> crash dump kernel.
>
> I will do some analysis later-on (when I get some spare time) and post a
> patch (if needed) to put the same under EXPERT mode for now.
>
> >> diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
> >> index 817c02b13b1d..c47a12cf7fc0 100644
> >> --- a/fs/proc/Kconfig
> >> +++ b/fs/proc/Kconfig
> >> @@ -45,7 +45,7 @@ config PROC_VMCORE
> >>Exports the dump image of crashed kernel in ELF format.
> >>
> >>config PROC_VMCORE_DEVICE_DUMP
> >> -   bool "Device Hardware/Firmware Log Collection"
> >> +   bool "Device Hardware/Firmware Log Collection" if EXPERT
> >>   depends on PROC_VMCORE
> >>   default n
> >>   help
> >> @@ -59,6 +59,12 @@ config PROC_VMCORE_DEVICE_DUMP
> >> If you say Y here, the collected device dumps will be added
> >> as ELF notes to /proc/vmcore.
> >>
> >> + Considering that there can be device drivers which append
> >> + large amounts of data to vmcore, you should say N here unless
> >> + you are reserving a large chunk of memory for crashdump
> >> + kernel, because otherwise the crashdump kernel might become
> >> + unusable due to OOM issues.
> >> +
> >>
> >> May be you can add a 'Fixes:' tag here.
> >
> > Problem is previous commit seems not broken, just bring extra memory
> > stress. Is "Fixes:" tag suitable for this commit?
>
> I think since the earlier patch causes an OOM, it would be better to
> atleast mention it in the git log (for easier git bisect later on).
>
> If not the 'Fixes:' tag may be we can use a 'Since commit ..' like
> wording in the commit log.
>
> >>> Signed-off-by: Kairui Song 
> >>> ---
> >>>fs/proc/vmcore.c | 20 
> >>>1 file changed, 20 insertions(+)
> >>>
> >>> diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
> >>> index 3fe90443c1bb..e28695ef2439 100644
> >>> --- a/fs/proc/vmcore.c
> >>> +++ b/fs/proc/vmcore.c
> >>> @@ -53,6 +53,9 @@ static str

[PATCH v2] vmcore: Add a kernel cmdline vmcore_device_dump

2019-05-19 Thread Kairui Song
Since commit 2724273e8fd0 ('vmcore: add API to collect hardware dump in
second kernel'), drivers are allowed to add device-related dump data to
vmcore as they want by using the device dump API. This has a potential
issue: the data is stored in memory, so drivers may append too much data
and use too much memory. The vmcore is typically used in a kdump kernel
which runs in a small pre-reserved chunk of memory. So as a result it
can make kdump unusable due to OOM issues.

So introduce a new vmcore_device_dump= kernel parameter, and disable
device dump by default. Users can enable it only if device dump data is
required for debugging, and have the chance to increase the kdump
reserved memory accordingly before device dump makes kdump fail.

Signed-off-by: Kairui Song 
---
 Update from V1:
  - Use bool parameter to turn it on/off instead of letting user give
the size limit. Size of device dump is hard to determine.

 Documentation/admin-guide/kernel-parameters.txt | 15 +++
 fs/proc/vmcore.c| 13 +
 2 files changed, 28 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 43176340c73d..2d48e39fd080 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5062,6 +5062,21 @@
decrease the size and leave more room for directly
mapped kernel RAM.
 
+   vmcore_device_dump=
+   [VMCORE]
+   Format: {"off" | "on"}
+   If CONFIG_PROC_VMCORE_DEVICE_DUMP is set,
+   this parameter allows enable or disable device dump
+   for vmcore.
+   Device dump allows drivers to append dump data to
+   vmcore so you can collect driver specified debug info.
+   Note that the drivers could append the data without
+   any limit, and the data is stored in memory, this may
+   bring a significant memory stress. If you want to turn
+   on this option, make sure you have reserved enough 
memory
+   with crashkernel= parameter.
+   default: off
+
vmcp_cma=nn[MG] [KNL,S390]
Sets the memory size reserved for contiguous memory
allocations for the vmcp device driver.
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 3fe90443c1bb..d1b608b0efad 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -53,6 +53,8 @@ static struct proc_dir_entry *proc_vmcore;
 /* Device Dump list and mutex to synchronize access to list */
 static LIST_HEAD(vmcoredd_list);
 static DEFINE_MUTEX(vmcoredd_mutex);
+
+static bool vmcoredd_enabled;
 #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
 
 /* Device Dump Size */
@@ -1451,6 +1453,11 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
size_t data_size;
int ret;
 
+   if (!vmcoredd_enabled) {
+   pr_err_once("Device dump is disabled\n");
+   return -EINVAL;
+   }
+
if (!data || !strlen(data->dump_name) ||
!data->vmcoredd_callback || !data->size)
return -EINVAL;
@@ -1502,6 +1509,12 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
return ret;
 }
 EXPORT_SYMBOL(vmcore_add_device_dump);
+
+static int __init vmcoredd_parse_cmdline(char *arg)
+{
+   return kstrtobool(arg, &vmcoredd_enabled);
+}
+__setup("vmcore_device_dump=", vmcoredd_parse_cmdline);
 #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
 
 /* Free all dumps in vmcore device dump list */
-- 
2.21.0



Re: [RFC PATCH] vmcore: Add a kernel cmdline device_dump_limit

2019-05-16 Thread Kairui Song
On Fri, May 10, 2019 at 7:17 PM Bhupesh Sharma  wrote:
>
> Hi Kairui,
>
> Thanks for the patch. Please see my comments in-line:
>
> On 05/10/2019 03:50 PM, Kairui Song wrote:
> > Device dump allow drivers to add device related dump data to vmcore as
> > they want. This have a potential issue, the data is stored in memory,
> > drivers may append too much data and use too much memory. The vmcore is
> > typically used in a kdump kernel which runs in a pre-reserved small
> > chunk of memory. So as a result it will make kdump unusable at all due
> > to OOM issues.
> >
> > So introduce new device_dump_limit= kernel parameter, and set the
> > default limit to 0, so device dump is not enabled unless user specify
> > the accetable maxiam
>
> acceptable maximum

Will fix this typo.

>
> > memory usage for device dump data. In this way user
> > will also have the chance to adjust the kdump reserved memory
> > accordingly.
>
> Hmmm., this doesn't give much confidence with the
> PROC_VMCORE_DEVICE_DUMP feature in its current shape. Rather shouldn't
> we be enabling config PROC_VMCORE_DEVICE_DUMP only under EXPERT mode for
> now, considering that this feature needs further thrashing and testing
> with real setups including platforms where drivers append large amounts
> of data to vmcore:

I think there is no need to move it to expert mode; just leaving it
disabled by default should be better. That should be enough to make sure
drivers won't append that much data and cause OOM, while it could still
be enabled without changing the kernel, so this feature won't bring extra
risk, and could be enabled anytime easily.

>
> diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
> index 817c02b13b1d..c47a12cf7fc0 100644
> --- a/fs/proc/Kconfig
> +++ b/fs/proc/Kconfig
> @@ -45,7 +45,7 @@ config PROC_VMCORE
>   Exports the dump image of crashed kernel in ELF format.
>
>   config PROC_VMCORE_DEVICE_DUMP
> -   bool "Device Hardware/Firmware Log Collection"
> +   bool "Device Hardware/Firmware Log Collection" if EXPERT
>  depends on PROC_VMCORE
>  default n
>  help
> @@ -59,6 +59,12 @@ config PROC_VMCORE_DEVICE_DUMP
>If you say Y here, the collected device dumps will be added
>as ELF notes to /proc/vmcore.
>
> + Considering that there can be device drivers which append
> + large amounts of data to vmcore, you should say N here unless
> + you are reserving a large chunk of memory for crashdump
> + kernel, because otherwise the crashdump kernel might become
> +     unusable due to OOM issues.
> +
>
> May be you can add a 'Fixes:' tag here.

The problem is that the previous commit doesn't seem broken, it just
brings extra memory stress. Is a "Fixes:" tag suitable for this commit?

>
> > Signed-off-by: Kairui Song 
> > ---
> >   fs/proc/vmcore.c | 20 
> >   1 file changed, 20 insertions(+)
> >
> > diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
> > index 3fe90443c1bb..e28695ef2439 100644
> > --- a/fs/proc/vmcore.c
> > +++ b/fs/proc/vmcore.c
> > @@ -53,6 +53,9 @@ static struct proc_dir_entry *proc_vmcore;
> >   /* Device Dump list and mutex to synchronize access to list */
> >   static LIST_HEAD(vmcoredd_list);
> >   static DEFINE_MUTEX(vmcoredd_mutex);
> > +
> > +/* Device Dump Limit */
> > +static size_t vmcoredd_limit;
> >   #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
> >
> >   /* Device Dump Size */
> > @@ -1465,6 +1468,11 @@ int vmcore_add_device_dump(struct vmcoredd_data 
> > *data)
> >   data_size = roundup(sizeof(struct vmcoredd_header) + data->size,
> >   PAGE_SIZE);
> >
> > + if (vmcoredd_orig_sz + data_size >= vmcoredd_limit) {
> > + ret = -ENOMEM;
>
> Should we be adding a WARN() here to let the user know that the device
> dump data will not be available in vmcore?

Yes, that could be very helpful. How about pr_err_once? WARN is too
noisy; just giving a hint to the user that device dump is disabled
should be enough, so the user will know why device dump data is not
present and can just enable it.

>
> > + goto out_err;
> > + }
> > +
> >   /* Allocate buffer for driver's to write their dumps */
> >   buf = vmcore_alloc_buf(data_size);
> >   if (!buf) {
> > @@ -1502,6 +1510,18 @@ int vmcore_add_device_dump(struct vmcoredd_data 
> > *data)
> >   return ret;
> >   }
> >   EXPORT_SYMBOL(vmcore_add_device_dump);
> > +
> > +static int __init parse_vmc

Re: [PATCH v6 1/2] x86/kexec: Build identity mapping for EFI systab and ACPI tables

2019-05-14 Thread Kairui Song
00
> [0.696330][T1] FS:  () GS:8c6bd600() 
> knlGS:
> [0.697330][T1] CS:  0010 DS:  ES:  CR0: 80050033
> [0.698330][T1] CR2: 8c6bde5ff000 CR3: 00015700e001 CR4: 
> 000606f0
> [0.699334][T1] Kernel panic - not syncing: Attempted to kill init! 
> exitcode=0x000b
> [0.700328][    T1] ---[ end Kernel panic - not syncing: Attempted to kill 
> init! exitcode=0x000b ]---
>
> Thanks
> Dave

I can confirm as I got same result on my T420. next-merge-window
branch fails both normal boot and kexec...
I didn't manage to get a working serial console, but the behavior is
the same so should be the same issue.

Also after "git cherry-pick de01951c8d40^..next-merge-window" on
master branch, it worked well, so the patch should be good.

--
Best Regards,
Kairui Song


[PATCH v2 4/4] crashdump/x86: Use new introduced helper for getting RSDP

2019-05-13 Thread Kairui Song
Use the newly introduced helper for getting RSDP; this ensures RSDP is
always accessible and avoids code duplication.

Signed-off-by: Kairui Song 
---
 kexec/arch/i386/crashdump-x86.c | 34 +
 1 file changed, 9 insertions(+), 25 deletions(-)

diff --git a/kexec/arch/i386/crashdump-x86.c b/kexec/arch/i386/crashdump-x86.c
index 140f45b..a2aea31 100644
--- a/kexec/arch/i386/crashdump-x86.c
+++ b/kexec/arch/i386/crashdump-x86.c
@@ -787,35 +787,19 @@ static int sysfs_efi_runtime_map_exist(void)
 /* Appends 'acpi_rsdp=' commandline for efi boot crash dump */
 static void cmdline_add_efi(char *cmdline)
 {
-   FILE *fp;
-   int cmdlen, len;
-   char line[MAX_LINE], *s;
-   const char *acpis = " acpi_rsdp=";
+   uint64_t acpi_rsdp;
+   char acpi_rsdp_buf[MAX_LINE];
 
-   fp = fopen("/sys/firmware/efi/systab", "r");
-   if (!fp)
-   return;
+   acpi_rsdp = get_acpi_rsdp();
 
-   while(fgets(line, sizeof(line), fp) != 0) {
-   /* ACPI20= always goes before ACPI= */
-   if ((strstr(line, "ACPI20=")) || (strstr(line, "ACPI="))) {
-   line[strlen(line) - 1] = '\0';
-   s = strchr(line, '=');
-   s += 1;
-   len = strlen(s) + strlen(acpis);
-   cmdlen = strlen(cmdline) + len;
-   if (cmdlen > (COMMAND_LINE_SIZE - 1))
-   die("Command line overflow\n");
-   strcat(cmdline, acpis);
-   strcat(cmdline, s);
-   dbgprintf("Command line after adding efi\n");
-   dbgprintf("%s\n", cmdline);
+   if (!acpi_rsdp)
+   return;
 
-   break;
-   }
-   }
+   sprintf(acpi_rsdp_buf, " acpi_rsdp=0x%lx", acpi_rsdp);
+   if (strlen(cmdline) + strlen(acpi_rsdp_buf) > (COMMAND_LINE_SIZE - 1))
+   die("Command line overflow\n");
 
-   fclose(fp);
+   strcat(cmdline, acpi_rsdp_buf);
 }
 
 static void get_backup_area(struct kexec_info *info,
-- 
2.20.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v2 3/4] x86: Always try to fill acpi_rsdp_addr in boot params

2019-05-13 Thread Kairui Song
Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address
from boot params if available"), the kernel accepts an acpi_rsdp_addr param
in boot_params. So fill in this parameter unconditionally, to ensure the
second kernel always gets the right RSDP address consistently, and boots
well on EFI systems even with EFI services disabled. Users no longer need
to change the kernel cmdline to work around the missing RSDP issue.

For older kernel versions (before 5.0), there won't be any change in
behavior.

Signed-off-by: Kairui Song 
---
 kexec/arch/i386/x86-linux-setup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kexec/arch/i386/x86-linux-setup.c 
b/kexec/arch/i386/x86-linux-setup.c
index 5ca7c25..5b00b42 100644
--- a/kexec/arch/i386/x86-linux-setup.c
+++ b/kexec/arch/i386/x86-linux-setup.c
@@ -901,4 +901,7 @@ void setup_linux_system_parameters(struct kexec_info *info,
 
/* fill the EDD information */
setup_edd_info(real_mode);
+
+   /* Always try to fill acpi_rsdp_addr */
+   real_mode->acpi_rsdp_addr = get_acpi_rsdp();
 }
-- 
2.20.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v2 1/4] x86: Update boot parameters definition

2019-05-13 Thread Kairui Song
Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address
from boot params if available"), the kernel accepts an acpi_rsdp_addr param in
boot_params. Sync the x86_linux_param_header to support this param.

Signed-off-by: Kairui Song 
---
 include/x86/x86-linux.h | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/x86/x86-linux.h b/include/x86/x86-linux.h
index 352ea02..a5d8df8 100644
--- a/include/x86/x86-linux.h
+++ b/include/x86/x86-linux.h
@@ -45,8 +45,7 @@ struct apm_bios_info {
uint16_t cseg_len;  /* 0x4e */
uint16_t cseg_16_len;   /* 0x50 */
uint16_t dseg_len;  /* 0x52 */
-   uint8_t  reserved[44];  /* 0x54 */
-};
+} __attribute__((packed));
 
 /*
  * EDD stuff
@@ -113,12 +112,15 @@ struct x86_linux_param_header {
uint8_t  reserved4[2];  /* 0x3e -- 0x3f reserved for 
future expansion */
 
struct apm_bios_info apm_bios_info; /* 0x40 */
+   uint8_t  reserved4_1[28];   /* 0x54 */
+   uint64_t acpi_rsdp_addr;/* 0x70 */
+   uint8_t  reserved4_2[8];/* 0x78 */
struct drive_info_struct drive_info;/* 0x80 */
struct sys_desc_table sys_desc_table;   /* 0xa0 */
uint32_t ext_ramdisk_image; /* 0xc0 */
uint32_t ext_ramdisk_size;  /* 0xc4 */
uint32_t ext_cmd_line_ptr;  /* 0xc8 */
-   uint8_t reserved4_1[0x1c0 - 0xcc];  /* 0xe4 */
+   uint8_t reserved4_3[0x1c0 - 0xcc];  /* 0xe4 */
uint8_t efi_info[32];   /* 0x1c0 */
uint32_t alt_mem_k; /* 0x1e0 */
uint8_t  reserved5[4];  /* 0x1e4 */
-- 
2.20.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v2 0/4] x86: Always try to fill acpi_rsdp_addr in boot params

2019-05-13 Thread Kairui Song
This patch series syncs the behavior of user space kexec and
kexec_file_load: they will both fill boot_params.acpi_rsdp_addr with a
valid RSDP value, to make sure the second kernel can always get the RSDP
consistently.

This will make it effortless to boot newer kernel versions (5.0+)
without specifying acpi_rsdp= on the cmdline on EFI systems, even with
EFI services disabled. It should not change any behavior with older kernels.

Update from V1:
  - Split into multiple patches for a cleaner structure, content is not
changed.

Kairui Song (4):
  x86: Update boot parameters definition
  x86: Introduce helpers for getting RSDP address
  x86: Always try to fill acpi_rsdp_addr in boot params
  crashdump/x86: Use new introduced helper for getting RSDP

 include/x86/x86-linux.h|  8 ++--
 kexec/arch/i386/crashdump-x86.c| 34 +
 kexec/arch/i386/kexec-x86-common.c | 60 ++
 kexec/arch/i386/kexec-x86.h|  1 +
 kexec/arch/i386/x86-linux-setup.c  |  6 ++-
 kexec/arch/i386/x86-linux-setup.h  |  1 +
 6 files changed, 80 insertions(+), 30 deletions(-)

-- 
2.20.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v2 2/4] x86: Introduce helpers for getting RSDP address

2019-05-13 Thread Kairui Song
On x86, RSDP is fundamental for booting the machine. When the second kernel
is incapable of parsing the RSDP address (e.g. kexec of the next kernel on
an EFI system with EFI services disabled), kexec should prepare the RSDP
address for the second kernel.

Introduce helpers for getting RSDP from multiple sources, including boot
params, cmdline and EFI firmware.

For the legacy BIOS interface, there is no better way to find the RSDP
address than scanning the memory region and searching for it, and this will
always be done by the kernel as a fallback, so there is no need to try to
get the RSDP address for that case.

Signed-off-by: Kairui Song 
---
 kexec/arch/i386/kexec-x86-common.c | 60 ++
 kexec/arch/i386/kexec-x86.h|  1 +
 kexec/arch/i386/x86-linux-setup.c  |  3 +-
 kexec/arch/i386/x86-linux-setup.h  |  1 +
 4 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/kexec/arch/i386/kexec-x86-common.c 
b/kexec/arch/i386/kexec-x86-common.c
index de99758..4b8eb26 100644
--- a/kexec/arch/i386/kexec-x86-common.c
+++ b/kexec/arch/i386/kexec-x86-common.c
@@ -39,6 +39,7 @@
 #include "../../firmware_memmap.h"
 #include "../../crashdump.h"
 #include "kexec-x86.h"
+#include "x86-linux-setup.h"
 #include "../../kexec-xen.h"
 
 /* Used below but not present in (older?) xenctrl.h */
@@ -392,4 +393,63 @@ int get_memory_ranges(struct memory_range **range, int 
*ranges,
return ret;
 }
 
+static uint64_t cmdline_get_acpi_rsdp(void) {
+   uint64_t acpi_rsdp = 0;
+   char *tmp_cmdline, *rsdp_param;
 
+   tmp_cmdline = get_command_line();
+   rsdp_param = strstr(tmp_cmdline, "acpi_rsdp=");
+
+   if (rsdp_param)
+   sscanf(rsdp_param, "acpi_rsdp=%lx", &acpi_rsdp);
+
+   free(tmp_cmdline);
+   return acpi_rsdp;
+}
+
+static uint64_t bootparam_get_acpi_rsdp(void) {
+   uint64_t acpi_rsdp = 0;
+   off_t offset = offsetof(struct x86_linux_param_header, acpi_rsdp_addr);
+
+   if (get_bootparam(&acpi_rsdp, offset, sizeof(acpi_rsdp)))
+   return 0;
+
+   return acpi_rsdp;
+}
+
+static uint64_t efi_get_acpi_rsdp(void) {
+   FILE *fp;
+   char line[MAX_LINE], *s;
+   uint64_t acpi_rsdp = 0;
+
+   fp = fopen("/sys/firmware/efi/systab", "r");
+   if (!fp)
+   return acpi_rsdp;
+
+   while(fgets(line, sizeof(line), fp) != 0) {
+   /* ACPI20= always goes before ACPI= */
+   if ((strstr(line, "ACPI20=")) || (strstr(line, "ACPI="))) {
+   s = strchr(line, '=') + 1;
+   sscanf(s, "0x%lx", &acpi_rsdp);
+   break;
+   }
+   }
+   fclose(fp);
+
+   return acpi_rsdp;
+}
+
+uint64_t get_acpi_rsdp(void)
+{
+   uint64_t acpi_rsdp = 0;
+
+   acpi_rsdp = cmdline_get_acpi_rsdp();
+
+   if (!acpi_rsdp)
+   acpi_rsdp = bootparam_get_acpi_rsdp();
+
+   if (!acpi_rsdp)
+   acpi_rsdp = efi_get_acpi_rsdp();
+
+   return acpi_rsdp;
+}
diff --git a/kexec/arch/i386/kexec-x86.h b/kexec/arch/i386/kexec-x86.h
index c2bcd37..1b58c3b 100644
--- a/kexec/arch/i386/kexec-x86.h
+++ b/kexec/arch/i386/kexec-x86.h
@@ -86,4 +86,5 @@ int nbi_load(int argc, char **argv, const char *buf, off_t 
len,
 void nbi_usage(void);
 
 extern unsigned xen_e820_to_kexec_type(uint32_t type);
+extern uint64_t get_acpi_rsdp(void);
 #endif /* KEXEC_X86_H */
diff --git a/kexec/arch/i386/x86-linux-setup.c 
b/kexec/arch/i386/x86-linux-setup.c
index 8fad115..5ca7c25 100644
--- a/kexec/arch/i386/x86-linux-setup.c
+++ b/kexec/arch/i386/x86-linux-setup.c
@@ -123,7 +123,6 @@ void setup_linux_bootloader_parameters_high(
cmdline_ptr[cmdline_len - 1] = '\0';
 }
 
-static int get_bootparam(void *buf, off_t offset, size_t size);
 static int setup_linux_vesafb(struct x86_linux_param_header *real_mode)
 {
struct fb_fix_screeninfo fix;
@@ -452,7 +451,7 @@ char *find_mnt_by_fsname(char *fsname)
return mntdir;
 }
 
-static int get_bootparam(void *buf, off_t offset, size_t size)
+int get_bootparam(void *buf, off_t offset, size_t size)
 {
int data_file;
char *debugfs_mnt, *sysfs_mnt;
diff --git a/kexec/arch/i386/x86-linux-setup.h 
b/kexec/arch/i386/x86-linux-setup.h
index f5d23d3..0c651e5 100644
--- a/kexec/arch/i386/x86-linux-setup.h
+++ b/kexec/arch/i386/x86-linux-setup.h
@@ -21,6 +21,7 @@ static inline void setup_linux_bootloader_parameters(
 }
 void setup_linux_system_parameters(struct kexec_info *info,
struct x86_linux_param_header *real_mode);
+int get_bootparam(void *buf, off_t offset, size_t size);
 
 
 #define SETUP_BASE0x9
-- 
2.20.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH] kexec/x86: Unconditionally add the acpi_rsdp command line

2019-05-12 Thread Kairui Song
On Fri, Mar 15, 2019 at 5:36 PM Lianbo Jiang  wrote:
>
> The Linux kernel commit 3a63f70bf4c3 introduces the early parsing
> of the RSDP. This means that boot loader must either set the
> boot_params.acpi_rsdp_addr or pass a command line 'acpi_rsdp=xxx'
> to tell the RDSP physical address.
>
> Currently, kexec neither sets the boot_params.acpi_rsdp or passes
> acpi_rsdp command line if it sees the first kernel support efi
> runtime. This is causing the second kernel boot failure.
> The EFI runtime is not available so early in the boot process so
> unconditionally pass the 'acpi_rsdp=xxx' to the second kernel.
>
> Signed-off-by: Lianbo Jiang 
> Signed-off-by: Brijesh Singh 
> ---
>  kexec/arch/i386/crashdump-x86.c | 17 +
>  1 file changed, 1 insertion(+), 16 deletions(-)
>
> diff --git a/kexec/arch/i386/crashdump-x86.c b/kexec/arch/i386/crashdump-x86.c
> index 140f45b..a29b15b 100644
> --- a/kexec/arch/i386/crashdump-x86.c
> +++ b/kexec/arch/i386/crashdump-x86.c
> @@ -35,7 +35,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
>  #include "../../kexec.h"
>  #include "../../kexec-elf.h"
>  #include "../../kexec-syscall.h"
> @@ -772,18 +771,6 @@ static enum coretype get_core_type(struct crash_elf_info 
> *elf_info,
> }
>  }
>
> -static int sysfs_efi_runtime_map_exist(void)
> -{
> -   DIR *dir;
> -
> -   dir = opendir("/sys/firmware/efi/runtime-map");
> -   if (!dir)
> -   return 0;
> -
> -   closedir(dir);
> -   return 1;
> -}
> -
>  /* Appends 'acpi_rsdp=' commandline for efi boot crash dump */
>  static void cmdline_add_efi(char *cmdline)
>  {
> @@ -978,9 +965,7 @@ int load_crashdump_segments(struct kexec_info *info, 
> char* mod_cmdline,
> dbgprintf("Created elf header segment at 0x%lx\n", elfcorehdr);
> if (delete_memmap(memmap_p, &nr_memmap, elfcorehdr, memsz) < 0)
> return -1;
> -   if (!bzImage_support_efi_boot || arch_options.noefi ||
> -   !sysfs_efi_runtime_map_exist())
> -   cmdline_add_efi(mod_cmdline);
> +   cmdline_add_efi(mod_cmdline);
> cmdline_add_elfcorehdr(mod_cmdline, elfcorehdr);
>
> /* Inform second kernel about the presence of ACPI tables. */
> --
> 2.17.1
>
>
> ___
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec

Hi Lianbo,

I've sent another patch similar to yours:
[PATCH] x86: Always try to fill acpi_rsdp_addr in boot params

I'll update it in V2, and your use case should also be covered by that
patch, as we discussed on IRC previously, thanks!

--
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [RFC PATCH] vmcore: Add a kernel cmdline device_dump_limit

2019-05-12 Thread Kairui Song
On Mon, May 13, 2019 at 9:52 AM Dave Young  wrote:
>
> On 05/10/19 at 06:20pm, Kairui Song wrote:
> > Device dump allow drivers to add device related dump data to vmcore as
> > they want. This have a potential issue, the data is stored in memory,
> > drivers may append too much data and use too much memory. The vmcore is
> > typically used in a kdump kernel which runs in a pre-reserved small
> > chunk of memory. So as a result it will make kdump unusable at all due
> > to OOM issues.
> >
> > So introduce new device_dump_limit= kernel parameter, and set the
> > default limit to 0, so device dump is not enabled unless user specify
> > the accetable maxiam memory usage for device dump data. In this way user
> > will also have the chance to adjust the kdump reserved memory
> > accordingly.
>
> The device dump is only affective in kdump 2nd kernel, so add the
> limitation seems not useful.  One is hard to know the correct size
> unless one does some crash test.  If one did the test and want to eanble
> the device dump he needs increase crashkernel= size in 1st kernel and
> add the limit param in 2nd kernel.
>
> So a global on/off param sounds easier and better, something like
> vmcore_device_dump=on  (default is off)

Yes, on/off could be another way to solve this issue. The size limit
could bring more flexibility: if device dump is not asking for too
much memory then it would just work, but it does bring extra complexity.
Considering it's actually hard to know how much memory is needed for
the device dump drivers to work, I'll update to use the on/off cmdline
then.

>
> >
> > Signed-off-by: Kairui Song 
> > ---
> >  fs/proc/vmcore.c | 20 
> >  1 file changed, 20 insertions(+)
> >
> > diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
> > index 3fe90443c1bb..e28695ef2439 100644
> > --- a/fs/proc/vmcore.c
> > +++ b/fs/proc/vmcore.c
> > @@ -53,6 +53,9 @@ static struct proc_dir_entry *proc_vmcore;
> >  /* Device Dump list and mutex to synchronize access to list */
> >  static LIST_HEAD(vmcoredd_list);
> >  static DEFINE_MUTEX(vmcoredd_mutex);
> > +
> > +/* Device Dump Limit */
> > +static size_t vmcoredd_limit;
> >  #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
> >
> >  /* Device Dump Size */
> > @@ -1465,6 +1468,11 @@ int vmcore_add_device_dump(struct vmcoredd_data 
> > *data)
> >   data_size = roundup(sizeof(struct vmcoredd_header) + data->size,
> >   PAGE_SIZE);
> >
> > + if (vmcoredd_orig_sz + data_size >= vmcoredd_limit) {
> > + ret = -ENOMEM;
> > + goto out_err;
> > + }
> > +
> >   /* Allocate buffer for driver's to write their dumps */
> >   buf = vmcore_alloc_buf(data_size);
> >   if (!buf) {
> > @@ -1502,6 +1510,18 @@ int vmcore_add_device_dump(struct vmcoredd_data 
> > *data)
> >   return ret;
> >  }
> >  EXPORT_SYMBOL(vmcore_add_device_dump);
> > +
> > +static int __init parse_vmcoredd_limit(char *arg)
> > +{
> > + char *end;
> > +
> > + if (!arg)
> > + return -EINVAL;
> > + vmcoredd_limit = memparse(arg, &end);
> > + return end > arg ? 0 : -EINVAL;
> > +
> > +}
> > +__setup("device_dump_limit=", parse_vmcoredd_limit);
> >  #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
> >
> >  /* Free all dumps in vmcore device dump list */
> > --
> > 2.20.1
> >
>
> Thanks
> Dave



-- 
Best Regards,
Kairui Song


[RFC PATCH] vmcore: Add a kernel cmdline device_dump_limit

2019-05-10 Thread Kairui Song
Device dump allows drivers to add device related dump data to vmcore as
they want. This has a potential issue: the data is stored in memory, so
drivers may append too much data and use too much memory. The vmcore is
typically used in a kdump kernel which runs in a pre-reserved small
chunk of memory. As a result, it can make kdump completely unusable due
to OOM issues.

So introduce a new device_dump_limit= kernel parameter, and set the
default limit to 0, so device dump is not enabled unless the user specifies
the acceptable maximum memory usage for device dump data. In this way the
user will also have the chance to adjust the kdump reserved memory
accordingly.

Signed-off-by: Kairui Song 
---
 fs/proc/vmcore.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 3fe90443c1bb..e28695ef2439 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -53,6 +53,9 @@ static struct proc_dir_entry *proc_vmcore;
 /* Device Dump list and mutex to synchronize access to list */
 static LIST_HEAD(vmcoredd_list);
 static DEFINE_MUTEX(vmcoredd_mutex);
+
+/* Device Dump Limit */
+static size_t vmcoredd_limit;
 #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
 
 /* Device Dump Size */
@@ -1465,6 +1468,11 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
data_size = roundup(sizeof(struct vmcoredd_header) + data->size,
PAGE_SIZE);
 
+   if (vmcoredd_orig_sz + data_size >= vmcoredd_limit) {
+   ret = -ENOMEM;
+   goto out_err;
+   }
+
/* Allocate buffer for driver's to write their dumps */
buf = vmcore_alloc_buf(data_size);
if (!buf) {
@@ -1502,6 +1510,18 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
return ret;
 }
 EXPORT_SYMBOL(vmcore_add_device_dump);
+
+static int __init parse_vmcoredd_limit(char *arg)
+{
+   char *end;
+
+   if (!arg)
+   return -EINVAL;
+   vmcoredd_limit = memparse(arg, &end);
+   return end > arg ? 0 : -EINVAL;
+
+}
+__setup("device_dump_limit=", parse_vmcoredd_limit);
 #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
 
 /* Free all dumps in vmcore device dump list */
-- 
2.20.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH] x86/kexec: always ensure EFI systab region is mapped

2019-04-24 Thread Kairui Song
On Wed, Apr 24, 2019, 03:46 Baoquan He  wrote:
>
> On 04/24/19 at 02:18pm, Dave Young wrote:
> > On 04/24/19 at 01:41pm, Baoquan He wrote:
> > > On 04/24/19 at 02:47am, Junichi Nomura wrote:
> > > > On 4/24/19 2:15 AM, Kairui Song wrote:
> > > > > On Mon, Apr 22, 2019 at 11:21 PM Junichi Nomura 
> > > > >  wrote:
> > > > >> Is the mapping of ACPI tables just by luck, too?
> > > > >
> > > > > Good question, they should have same issue with systab, I ignored 
> > > > > this one.
> > > > > Then in first kernel when doing kexec it should ensure both ACPI
> > > > > tables and the EFI systab are mapped, that should cover everything and
> > > > > make it work.
> > > >
> > > > Right.
> > > >
> > > > > Is there anything else missing?
> > > > No, as far as I looked around get_rsdp_addr().
> > >
> > > Have made a draft patch to build ident mapping for ACPI tables too, it's
> > > based on Kairui's patch. Dave has tested on his t400s laptop, and
> > > passed. Please check if this adding is OK.
> > >
> > > Kairui, you can add this into your patch to make a new one and resend.
> > > Or I can combine them and send for you today.
>
> >
> > Since I can not reproduce the acpi table accessing fault with Kairui's 
> > patch,
> > the test is just sanity testing on same hardware. But the patch looks
> > good.
>
> Yes, usually vendor will put these efi systab, ACPI tables together. See
> the regions you listed on your t420 laptop in another mail:
> da99f000 - dae9efff Reserved (efi systab fall in this region)
> daf9f000 - daffefff ACPI tables
>
> We build 1:1 mapping for kexec kernel down to PMD level. Means for a
> region, it will align starting address down to PMD size, and align end
> address up to PMD size. So the end of efi systab, 0xdae9efff, will cause
> mapping built for the 2MB area, 0xdae0-0xdaf0. Clearly ACPI
> tables are covered by that PMD entry. That's why only efi systab
> mapping is built, accessing ACPI tables doesn't cause error.
>
> But we can't assume they will be put together always, so need map ACPI
> tables too.
>
> >
> > With Kairui's fix+ this acpi fix and Junichi's patch everything works.
> > Can anyone send them for example patch 1/2: kexec early mapping for
> > efi/acpi,  patch 2/2: Junichi's previous patch.
>
> Kairui is having a workshop in the US, I can make a patchset to
> include these two patches.
>
> For patch 1/2, I will combine the patch Kairui posted and my draft patch,
> Kairui is the author certainly, since he debugged and found out the root
> cause, and posted v1 when I was on vacation last week.
>
> For patch 2/2, I think the version Boris organized is good.
> http://lkml.kernel.org/r/20190416095209.gg27...@zn.tnic
>

Thanks a lot Bao! I was offline for about 1 day due to timezone and
flight, I have no problem with this and the ACPI mapping part looks
good to me.

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH] x86/kexec: always ensure EFI systab region is mapped

2019-04-23 Thread Kairui Song
On Mon, Apr 22, 2019 at 11:21 PM Junichi Nomura  wrote:
>
> On 4/22/19 6:28 PM, Kairui Song wrote:
> > The reason is the systab region is not mapped by the identity mapping
> > provided by kexec. Currently kexec only create identity mapping for
> > mem regions, wihch won't cover the systab. So second kernel will be
> > accessing a not mapped memory region and cause fault.
> > But as kexec tend to pad the map region up to PUD size, the
> > systab could be included in the map by accident, so it worked on
> > some machines, but that will be broken easily and unstable.
>
> Is the mapping of ACPI tables just by luck, too?
>

Good question, they should have the same issue as the systab; I overlooked
this one. Then in the first kernel, when doing kexec, it should ensure both
the ACPI tables and the EFI systab are mapped; that should cover everything
and make it work.
Is there anything else missing?

-- 
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH] x86/kexec: always ensure EFI systab region is mapped

2019-04-22 Thread Kairui Song
This is a fix needed for: "x86/boot: Use efi_setup_data for searching
RSDP on kexec-ed kernels"; that patch causes kexec to reset the system
on some machines.

The reason is that the systab region is not mapped by the identity mapping
provided by kexec. Currently kexec only creates identity mappings for
mem regions, which won't cover the systab. So the second kernel will be
accessing an unmapped memory region, which causes a fault.
But as kexec tends to pad the mapped region up to PUD size, the
systab could be included in the map by accident, so it worked on
some machines, but that is fragile and can break easily.

To fix it, just treat the systab specially: always map the systab region
unconditionally on EFI systems as long as there is a valid systab
address.

Signed-off-by: Kairui Song 
---
 arch/x86/kernel/machine_kexec_64.c | 40 ++
 1 file changed, 40 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index ceba408ea982..d5da54893f97 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -113,6 +114,37 @@ static void *alloc_pgt_page(void *data)
return p;
 }
 
+#ifdef CONFIG_EFI
+static int init_efi_systab_pgtable(struct x86_mapping_info *info,
+  pgd_t *level4p)
+{
+   unsigned long mstart, mend;
+
+   if (!efi_enabled(EFI_BOOT))
+   return 0;
+
+   mstart = (boot_params.efi_info.efi_systab |
+   ((u64)boot_params.efi_info.efi_systab_hi<<32));
+
+   if (efi_enabled(EFI_64BIT))
+   mend = mstart + sizeof(efi_system_table_64_t);
+   else
+   mend = mstart + sizeof(efi_system_table_32_t);
+
+   if (mstart)
+   return kernel_ident_mapping_init(info,
+   level4p, mstart, mend);
+
+   return 0;
+}
+#else
+static inline int init_efi_systab_pgtable(struct x86_mapping_info *info,
+ pgd_t *level4p)
+{
+   return 0;
+}
+#endif
+
 static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 {
struct x86_mapping_info info = {
@@ -159,6 +191,14 @@ static int init_pgtable(struct kimage *image, unsigned 
long start_pgtable)
return result;
}
 
+   /*
+* Prepare EFI systab mapping for kexec kernel, systab is not
+* covered by pfn_mapped.
+*/
+   result = init_efi_systab_pgtable(&info, level4p);
+   if (result)
+   return result;
+
return init_transition_pgtable(image, level4p);
 }
 
-- 
2.20.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [RFC PATCH] kexec, x86/boot: map systab region in identity mapping before accessing it

2019-04-19 Thread Kairui Song
On Fri, Apr 19, 2019 at 7:34 PM Borislav Petkov  wrote:
>
> On Fri, Apr 19, 2019 at 07:20:06PM +0800, Kairui Song wrote:
> > Thanks for the declaration Bao, I can verify on the machine I have,
> > the issue still exist without kaslr. Currently, we read rsdp in early
> > code and fill in boot_params unconditional, so it will read from the
> > systab anyway.
>
> Yes, and in the future, info required by the kexec'ed kernel - like the
> EFI systab address or even whether the kernel has been kexec'ed or comes
> from cold boot - should be passed in boot_params. So that we don't have
> to do all that ugly dancing in early code.
>
> > Yes, kexec only cover RAM in the ident map it prepared for second
> > kernel, but the systab could be in reserved region, so if it didn't
> > fall into the 1G padding by accident it will fail when reading from
> > it. Fix in early code could make sure 2nd kernel always work. Or
> > should we treat it specially in kexec mapping prepare code?
>
> Yes, we should. As I said, this is not early boot code's problem but the
> kexec setup code's problem.
>
> If the new kernel cannot get RSDP that early, then it should fail the
> same way it failed before. That early RDSP parsing was added for the
> movable regions thing working with KASLR.
>
> If it can't get a RDSP for whatever reason, then if KASLR selects
> a region overlapping with the movable regions, then it is the old
> behavior.
>
> Ok?
>

OK. Then fixing the mapping issue in the 1st kernel is the right way;
I'll skip the update for the early code mapping thing.


--
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [RFC PATCH] kexec, x86/boot: map systab region in identity mapping before accessing it

2019-04-19 Thread Kairui Song
 On Fri, Apr 19, 2019 at 6:50 PM Baoquan He  wrote:
>
> On 04/19/19 at 12:17pm, Borislav Petkov wrote:
> > Breaking thread because this one got too big.
> >
> > On Fri, Apr 19, 2019 at 04:34:58PM +0800, Kairui Song wrote:
> > > There are two approach to fix it, detect if the systab is mapped, and
> > > avoid reading it if not.
> >
> > Ok, so tglx and I discussed this situation which is slowly getting out
> > of hand with all the tinkering.
> >
> > So, here's what we should do - scream loudly now if some of this doesn't
> > make any sense.
> >
> > 1. Junichi's patch should get the systab check above added and sent to
> > 5.1 so that at least some EFI kexecing can work with 5.1
>
> Talked with Kairui privately just now. Seems Junichi's patch need add
> this systab mapping. Since the systab region is not mapped on some
> machines. Those machine don't have this issue because they got systab
> region luckily coverred by 1 GB page mapping in 1st kernel before
> kexec jumping.
>
> This issue should happen whether it is KASLR kernel or not KASLR kernel.

Thanks for the declaration Bao, I can verify on the machine I have that
the issue still exists without kaslr. Currently, we read rsdp in early
code and fill in boot_params unconditionally, so it will read from the
systab anyway.

>
> >
> > 2. Then, the fact whether the kernel has been kexec'ed and which
> > addresses it should use early, should all be passed through boot_params
> > which is either setup by kexec(1) or by the first kernel itself, in the
> > kexec_file_load() case.
>
> Seems no better way to check if it's kexec-ed kernel, except of the
> setup data checking of kexec-ed kernel.
>
> It may happen in both kexec_load or kexec_file_load, since we build
> ident mapping of kexec for RAM in 1st kernel.

For kexec_file_load, a newer kernel will fill in the acpi_rsdp in
boot_params so it bypasses kexec_get_rsdp_addr (which would read
from systab). The problem is not fixed, the systab mapping is still missing,
but it is not likely to happen with kexec_file_load on a newer kernel.

>
> >
> > > the systab region is not mapped by the identity mapping provided by
> > > kexec.
> >
> > 3. Then that needs to be fixed in the first kernel as it is a
> > shortcoming of us starting to parse systab very early. It is the kexec
> > setup code's problem not the early compressed stage's problem that the
> > EFI systab is not mapped.
>
> Yeah, adding the systab mapping looks good. Kairui put it in
> decompressing stage just because he wants to cover the case in which the
> old kernel kexec jumping to 2nd kernel. Now it seems not very
> reasonable, we also have the new kernel kexec jumping to old 1nd kernel.

Yes, kexec only covers RAM in the ident map it prepares for the second
kernel, but the systab could be in a reserved region, so if it doesn't
fall into the 1G padding by accident it will fail when reading from
it. A fix in early code could make sure the 2nd kernel always works. Or
should we treat it specially in the kexec mapping preparation code?

>
> Thanks
> Baoquan
--
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [RFC PATCH] kexec, x86/boot: map systab region in identity mapping before accessing it

2019-04-19 Thread Kairui Song
On Fri, Apr 19, 2019 at 4:58 PM Baoquan He  wrote:
>
> On 04/19/19 at 04:34pm, Kairui Song wrote:
> >  /* Locates and clears a region for a new top level page table. */
> >  void initialize_identity_maps(void)
> >  {
> > - /* If running as an SEV guest, the encryption mask is required. */
> > - set_sev_encryption_mask();
> > -
> > - /* Exclude the encryption mask from __PHYSICAL_MASK */
> > - physical_mask &= ~sme_me_mask;
> > -
> > - /* Init mapping_info with run-time function/buffer pointers. */
> > - mapping_info.alloc_pgt_page = alloc_pgt_page;
> > - mapping_info.context = &pgt_data;
> > - mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
> > - mapping_info.kernpg_flag = _KERNPG_TABLE;
> > -
> > - /*
> > -  * It should be impossible for this not to already be true,
> > -  * but since calling this a second time would rewind the other
> > -  * counters, let's just make sure this is reset too.
> > -  */
> > - pgt_data.pgt_buf_offset = 0;
> > -
> > - /*
> > -  * If we came here via startup_32(), cr3 will be _pgtable already
> > -  * and we must append to the existing area instead of entirely
> > -  * overwriting it.
> > -  *
> > -  * With 5-level paging, we use '_pgtable' to allocate the p4d page 
> > table,
> > -  * the top-level page table is allocated separately.
> > -  *
> > -  * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
> > -  * cases. On 4-level paging it's equal to 'top_level_pgt'.
> > -  */
> > - top_level_pgt = read_cr3_pa();
> > - if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
> > - debug_putstr("booted via startup_32()\n");
> > - pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
> > - pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
> > - memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
> > - } else {
> > - debug_putstr("booted via startup_64()\n");
> > - pgt_data.pgt_buf = _pgtable;
> > - pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
> > - memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
> > + top_level_pgt = early_boot_top_pgt;
> > + if ((p4d_t *)top_level_pgt != (p4d_t *)_pgtable)
> >   top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
>
> Kairui, will you make a patchset to include these changes separately
> later on? I don't get the purposes of code changes. E.g here, I
> don't know why you introduce a new variable early_boot_top_pgt, and
> allocate the page table, even though they have been done in the old
> initialize_identity_maps().
>
> Thanks
> Baoquan
>

OK, right, it's not a good idea to mix things together. I'll
resend the patch, and will send the cleanup separately. Without the
cleanup it may bring in some extra burden with certain kernel configs, but
that should be OK for the fix.

-- 
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[RFC PATCH] kexec, x86/boot: map systab region in identity mapping before accessing it

2019-04-19 Thread Kairui Song
The previous patch "x86/boot: Use efi_setup_data for searching RSDP on
kexec-ed kernels" always reset some machines. This is a follow up of
that patch.

The reason is that, by default, the systab region is not mapped by the
identity mapping provided by kexec. So the kernel will be accessing an
unmapped memory region and cause a fault. But as kexec tends to pad the mapped
region up to PUD or PMD size, the systab could be included in
the map by accident, so it worked on some machines, but that is
fragile and can break easily.

There are two approaches to fix it. One is to detect if the systab is mapped, and
avoid reading it if not. Another one is to ensure the region is mapped, either by
checking and mapping the systab in the first kernel before kexec, or by mapping
the systab in early code before reading it.

Mapping in the early code should cover every case (else boot from an
older kernel will also fail). This patch is a draft of implementing it.

Just added a helper (add_identity_map_pgd) which could be used to add
extra identity mapping in very early stage. And call it before reading
systab. There should be no need to unmap it as the early page table will
be discarded later.

But some refactoring is included, which introduced a lot of changes,
moving some page table related code from kaslr_64.c to pgtable_64.c. If
the approach goes well, I could prepare separate cleanup patches.

Signed-off-by: Kairui Song 
---
 arch/x86/boot/compressed/acpi.c   |   5 +
 arch/x86/boot/compressed/kaslr_64.c   | 109 +
 arch/x86/boot/compressed/misc.c   |   2 +
 arch/x86/boot/compressed/pgtable.h|  11 +++
 arch/x86/boot/compressed/pgtable_64.c | 131 +-
 arch/x86/include/asm/boot.h   |   8 +-
 6 files changed, 156 insertions(+), 110 deletions(-)

diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index 8cecce1ac0cd..a513b0f9bfda 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -2,6 +2,7 @@
 #define BOOT_CTYPE_H
 #include "misc.h"
 #include "error.h"
+#include "pgtable.h"
 #include "../string.h"
 
 #include 
@@ -134,6 +135,10 @@ static acpi_physical_address kexec_get_rsdp_addr(void)
if (!systab)
error("EFI system table not found in kexec boot_params.");
 
+   add_identity_map_pgd((unsigned long)systab,
+(unsigned long)systab + sizeof(*systab),
+early_boot_top_pgt);
+
return __efi_get_rsdp_addr((unsigned long)esd->tables, 
systab->nr_tables, true);
 }
 #else
diff --git a/arch/x86/boot/compressed/kaslr_64.c 
b/arch/x86/boot/compressed/kaslr_64.c
index 748456c365f4..ec7093e192bf 100644
--- a/arch/x86/boot/compressed/kaslr_64.c
+++ b/arch/x86/boot/compressed/kaslr_64.c
@@ -8,121 +8,21 @@
  * Copyright (C)  2016  Kees Cook
  */
 
-/*
- * Since we're dealing with identity mappings, physical and virtual
- * addresses are the same, so override these defines which are ultimately
- * used by the headers in misc.h.
- */
-#define __pa(x)  ((unsigned long)(x))
-#define __va(x)  ((void *)((unsigned long)(x)))
-
-/* No PAGE_TABLE_ISOLATION support needed either: */
-#undef CONFIG_PAGE_TABLE_ISOLATION
-
 #include "misc.h"
-
-/* These actually do the work of building the kernel identity maps. */
-#include 
-#include 
-/* Use the static base for this part of the boot process */
-#undef __PAGE_OFFSET
-#define __PAGE_OFFSET __PAGE_OFFSET_BASE
-#include "../../mm/ident_map.c"
+#include "pgtable.h"
 
 /* Used by pgtable.h asm code to force instruction serialization. */
 unsigned long __force_order;
 
-/* Used to track our page table allocation area. */
-struct alloc_pgt_data {
-   unsigned char *pgt_buf;
-   unsigned long pgt_buf_size;
-   unsigned long pgt_buf_offset;
-};
-
-/*
- * Allocates space for a page table entry, using struct alloc_pgt_data
- * above. Besides the local callers, this is used as the allocation
- * callback in mapping_info below.
- */
-static void *alloc_pgt_page(void *context)
-{
-   struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context;
-   unsigned char *entry;
-
-   /* Validate there is space available for a new page. */
-   if (pages->pgt_buf_offset >= pages->pgt_buf_size) {
-   debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
-   debug_putaddr(pages->pgt_buf_offset);
-   debug_putaddr(pages->pgt_buf_size);
-   return NULL;
-   }
-
-   entry = pages->pgt_buf + pages->pgt_buf_offset;
-   pages->pgt_buf_offset += PAGE_SIZE;
-
-   return entry;
-}
-
-/* Used to track our allocated page tables. */
-static struct alloc_pgt_data pgt_data;
-
 /* The top level page table entry pointer. */
 static unsigned long top_level_pgt;
 
-phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
-
-

Re: [PATCH] x86/boot: Use efi_setup_data for searching RSDP on kexec-ed kernels

2019-04-16 Thread Kairui Song
On Wed, Apr 17, 2019 at 12:57 PM Dave Young  wrote:
>
> On 04/17/19 at 09:38am, Dave Young wrote:
> > On 04/16/19 at 03:22pm, Borislav Petkov wrote:
> > > On Tue, Apr 16, 2019 at 07:41:33PM +0800, Dave Young wrote:
> > > > On 04/16/19 at 11:52am, Borislav Petkov wrote:
> > > > > I'll queue the below in the next days if there are no more complaints:
> > > >
> > > > As for the kexec breakage, even with the V3 patch, kexec still hangs on
> > > > a Lenovo T420 laptop.  Kairui also reproduced the problem. So can we
> > > > wait a few days see if we can make some progress to find the cause?
> > >
> > > How is applying this patch going to change anything?
> > >
> > > I was told that the breakage is there even without it...
> >
> > Without this patch, the bug happens in the efi_get_rsdp.. function, this
> > patch tries to fix that by adding kexec_get.. but the new introduced
> > kexec_* function does not work on some laptops, so it is not a 100% good
> > fix, I hoped we can get it working for all known issues.  But if we can
> > not do it eg. within one week we can go with this version and leave the
> > laptop issue as a known issue.
> >
>
> Latest debugging status:
>
> Kexec boot works with commenting out some code like below, so the guid
> cmp (memcmp) caused a system reset), still need to find out why:
>
> diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
> index d9f9abd63c68..13e7a23ae94c 100644
> --- a/arch/x86/boot/compressed/acpi.c
> +++ b/arch/x86/boot/compressed/acpi.c
> @@ -95,10 +95,12 @@ __efi_get_rsdp_addr(unsigned long config_tables, unsigned 
> int nr_tables,
> table = tbl->table;
> }
>
> +/*
> if (!(efi_guidcmp(guid, ACPI_TABLE_GUID)))
> rsdp_addr = table;
> else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID)))
> return table;
> +*/
> }
>
> return rsdp_addr;
> @@ -291,9 +293,10 @@ acpi_physical_address get_rsdp_addr(void)
> if (!pa)
> pa = kexec_get_rsdp_addr();
>
> +/*
> if (!pa)
> pa = efi_get_rsdp_addr();
> -
> +*/
> if (!pa)
> pa = bios_get_rsdp_addr();
>
>

Hi Dave, for this case I think it's just because GCC finds that the
loop does nothing, optimizes out the whole loop in
__efi_get_rsdp_addr, and so no longer reads the actual nr_tables value.

I can fix the boot error on T420 with your patch, but if I add
anything, like a hardcoded value assignment with the right value for
acpi_rsdp in the loop, it will reset the machine. But setting acpi_rsdp
to the right initial value outside the loop works fine.
If the loop condition is false, then there should be no difference
between just commenting out the lines you mentioned and adding an assignment.
Otherwise it just assigns the value multiple times, which is not very
reasonable but shouldn't fail.

And the generated ASM code I inspected also suggests the same thing.
So accessing the systab memory is still the cause of the system reset on
certain machines.

-- 
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH] x86: Always to to fill acpi_rsdp_addr in boot params

2019-04-04 Thread Kairui Song
On Thu, Apr 4, 2019 at 3:25 PM Dave Young  wrote:
>
> Hello Kairui
> On 03/28/19 at 05:49pm, Kairui Song wrote:
> > Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address
> > from boot params if available"), kernel accept a acpi_rsdp_addr param in
> > boot_params. Sync the x86_linux_param_header to support this param.
> >
> > And previously we are already appending 'acpi_rsdp=' command line only for
> > loading crash kernel on EFI systems, it will be better to try to set the
> > boot param for any kernel get loaded, to help the kernel finding the
> > RSDP value more stably. Otherwise if the user decide to disable EFI
> > service in second kernel, it will fail to boot.
> >
> > There is no better way to find the RSDP address from legacy BIOS
> > interface rather than scanning the memory region and search for it,
> > which will always be done by the kernel as a fallback, so we only
> > look for RSDP in previous boot params, cmdline and EFI firmware.
>
> It would be good to always pass acpi_rsdp= kernel cmdline in case
> efi=old_map.  (or maybe efi=noruntime as well, but I did not remember
> the behavior of noruntime now), no matter kexec or kdump..
>
> And if you want, maybe fill the boot_params instead of passing cmdline
> for new kernel which supports the new boot_param field.
>
> Split the patch to small patches would be better.
>
> Thanks
> Dave

Thanks for the review!

I'll update in V2 accordingly.

-- 
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH] x86: Always to to fill acpi_rsdp_addr in boot params

2019-03-28 Thread Kairui Song
On Thu, Mar 28, 2019 at 5:49 PM Kairui Song  wrote:
>
> Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address
> from boot params if available"), kernel accept a acpi_rsdp_addr param in
> boot_params. Sync the x86_linux_param_header to support this param.
>
> And previously we are already appending 'acpi_rsdp=' command line only for
> loading crash kernel on EFI systems, it will be better to try to set the
> boot param for any kernel get loaded, to help the kernel finding the
> RSDP value more stably. Otherwise if the user decide to disable EFI
> service in second kernel, it will fail to boot.
>
> There is no better way to find the RSDP address from legacy BIOS
> interface rather than scanning the memory region and search for it,
> which will always be done by the kernel as a fallback, so we only
> look for RSDP in previous boot params, cmdline and EFI firmware.
>
> Signed-off-by: Kairui Song 
> ---
>  include/x86/x86-linux.h|  8 ++--
>  kexec/arch/i386/crashdump-x86.c| 34 +
>  kexec/arch/i386/kexec-x86-common.c | 60 ++
>  kexec/arch/i386/kexec-x86.h|  1 +
>  kexec/arch/i386/x86-linux-setup.c  |  6 ++-
>  kexec/arch/i386/x86-linux-setup.h  |  1 +
>  6 files changed, 80 insertions(+), 30 deletions(-)
>
> diff --git a/include/x86/x86-linux.h b/include/x86/x86-linux.h
> index 352ea02..a5d8df8 100644
> --- a/include/x86/x86-linux.h
> +++ b/include/x86/x86-linux.h
> @@ -45,8 +45,7 @@ struct apm_bios_info {
> uint16_t cseg_len;  /* 0x4e */
> uint16_t cseg_16_len;   /* 0x50 */
> uint16_t dseg_len;  /* 0x52 */
> -   uint8_t  reserved[44];  /* 0x54 */
> -};
> +} __attribute__((packed));
>
>  /*
>   * EDD stuff
> @@ -113,12 +112,15 @@ struct x86_linux_param_header {
> uint8_t  reserved4[2];  /* 0x3e -- 0x3f reserved for 
> future expansion */
>
> struct apm_bios_info apm_bios_info; /* 0x40 */
> +   uint8_t  reserved4_1[28];   /* 0x54 */
> +   uint64_t acpi_rsdp_addr;/* 0x70 */
> +   uint8_t  reserved4_2[8];/* 0x78 */
> struct drive_info_struct drive_info;/* 0x80 */
> struct sys_desc_table sys_desc_table;   /* 0xa0 */
> uint32_t ext_ramdisk_image; /* 0xc0 */
> uint32_t ext_ramdisk_size;  /* 0xc4 */
> uint32_t ext_cmd_line_ptr;  /* 0xc8 */
> -   uint8_t reserved4_1[0x1c0 - 0xcc];  /* 0xe4 */
> +   uint8_t reserved4_3[0x1c0 - 0xcc];  /* 0xe4 */
> uint8_t efi_info[32];   /* 0x1c0 */
> uint32_t alt_mem_k; /* 0x1e0 */
> uint8_t  reserved5[4];  /* 0x1e4 */
> diff --git a/kexec/arch/i386/crashdump-x86.c b/kexec/arch/i386/crashdump-x86.c
> index 140f45b..262d157 100644
> --- a/kexec/arch/i386/crashdump-x86.c
> +++ b/kexec/arch/i386/crashdump-x86.c
> @@ -787,35 +787,19 @@ static int sysfs_efi_runtime_map_exist(void)
>  /* Appends 'acpi_rsdp=' commandline for efi boot crash dump */
>  static void cmdline_add_efi(char *cmdline)
>  {
> -   FILE *fp;
> -   int cmdlen, len;
> -   char line[MAX_LINE], *s;
> -   const char *acpis = " acpi_rsdp=";
> +   int cmdlen;
> +   uint64_t acpi_rsdp;
>
> -   fp = fopen("/sys/firmware/efi/systab", "r");
> -   if (!fp)
> -   return;
> +   acpi_rsdp = get_acpi_rsdp();
> +   cmdlen = strlen(cmdline);
>
> -   while(fgets(line, sizeof(line), fp) != 0) {
> -   /* ACPI20= always goes before ACPI= */
> -   if ((strstr(line, "ACPI20=")) || (strstr(line, "ACPI="))) {
> -   line[strlen(line) - 1] = '\0';
> -   s = strchr(line, '=');
> -   s += 1;
> -   len = strlen(s) + strlen(acpis);
> -   cmdlen = strlen(cmdline) + len;
> -   if (cmdlen > (COMMAND_LINE_SIZE - 1))
> -   die("Command line overflow\n");
> -   strcat(cmdline, acpis);
> -   strcat(cmdline, s);
> -   dbgprintf("Command line after adding efi\n");
> -   dbgprintf("%s\n", cmdline);
> +   if (!acpi_rsdp)
> +   return;
>
> -   break;
> -   }
> -   }
> +   if (cmdlen + sizeof(" acpi_rsdp=0x") + 16 > (COMMAND_LINE_SIZE - 1))
> +   die("Command 

[PATCH] x86: Always to to fill acpi_rsdp_addr in boot params

2019-03-28 Thread Kairui Song
Since kernel commit e6e094e053af75 ("x86/acpi, x86/boot: Take RSDP address
from boot params if available"), kernel accept a acpi_rsdp_addr param in
boot_params. Sync the x86_linux_param_header to support this param.

And previously we are already appending 'acpi_rsdp=' command line only for
loading crash kernel on EFI systems, it will be better to try to set the
boot param for any kernel get loaded, to help the kernel finding the
RSDP value more stably. Otherwise if the user decide to disable EFI
service in second kernel, it will fail to boot.

There is no better way to find the RSDP address from legacy BIOS
interface rather than scanning the memory region and search for it,
which will always be done by the kernel as a fallback, so we only
look for RSDP in previous boot params, cmdline and EFI firmware.

Signed-off-by: Kairui Song 
---
 include/x86/x86-linux.h|  8 ++--
 kexec/arch/i386/crashdump-x86.c| 34 +
 kexec/arch/i386/kexec-x86-common.c | 60 ++
 kexec/arch/i386/kexec-x86.h|  1 +
 kexec/arch/i386/x86-linux-setup.c  |  6 ++-
 kexec/arch/i386/x86-linux-setup.h  |  1 +
 6 files changed, 80 insertions(+), 30 deletions(-)

diff --git a/include/x86/x86-linux.h b/include/x86/x86-linux.h
index 352ea02..a5d8df8 100644
--- a/include/x86/x86-linux.h
+++ b/include/x86/x86-linux.h
@@ -45,8 +45,7 @@ struct apm_bios_info {
uint16_t cseg_len;  /* 0x4e */
uint16_t cseg_16_len;   /* 0x50 */
uint16_t dseg_len;  /* 0x52 */
-   uint8_t  reserved[44];  /* 0x54 */
-};
+} __attribute__((packed));
 
 /*
  * EDD stuff
@@ -113,12 +112,15 @@ struct x86_linux_param_header {
uint8_t  reserved4[2];  /* 0x3e -- 0x3f reserved for 
future expansion */
 
struct apm_bios_info apm_bios_info; /* 0x40 */
+   uint8_t  reserved4_1[28];   /* 0x54 */
+   uint64_t acpi_rsdp_addr;/* 0x70 */
+   uint8_t  reserved4_2[8];/* 0x78 */
struct drive_info_struct drive_info;/* 0x80 */
struct sys_desc_table sys_desc_table;   /* 0xa0 */
uint32_t ext_ramdisk_image; /* 0xc0 */
uint32_t ext_ramdisk_size;  /* 0xc4 */
uint32_t ext_cmd_line_ptr;  /* 0xc8 */
-   uint8_t reserved4_1[0x1c0 - 0xcc];  /* 0xe4 */
+   uint8_t reserved4_3[0x1c0 - 0xcc];  /* 0xe4 */
uint8_t efi_info[32];   /* 0x1c0 */
uint32_t alt_mem_k; /* 0x1e0 */
uint8_t  reserved5[4];  /* 0x1e4 */
diff --git a/kexec/arch/i386/crashdump-x86.c b/kexec/arch/i386/crashdump-x86.c
index 140f45b..262d157 100644
--- a/kexec/arch/i386/crashdump-x86.c
+++ b/kexec/arch/i386/crashdump-x86.c
@@ -787,35 +787,19 @@ static int sysfs_efi_runtime_map_exist(void)
 /* Appends 'acpi_rsdp=' commandline for efi boot crash dump */
 static void cmdline_add_efi(char *cmdline)
 {
-   FILE *fp;
-   int cmdlen, len;
-   char line[MAX_LINE], *s;
-   const char *acpis = " acpi_rsdp=";
+   int cmdlen;
+   uint64_t acpi_rsdp;
 
-   fp = fopen("/sys/firmware/efi/systab", "r");
-   if (!fp)
-   return;
+   acpi_rsdp = get_acpi_rsdp();
+   cmdlen = strlen(cmdline);
 
-   while(fgets(line, sizeof(line), fp) != 0) {
-   /* ACPI20= always goes before ACPI= */
-   if ((strstr(line, "ACPI20=")) || (strstr(line, "ACPI="))) {
-   line[strlen(line) - 1] = '\0';
-   s = strchr(line, '=');
-   s += 1;
-   len = strlen(s) + strlen(acpis);
-   cmdlen = strlen(cmdline) + len;
-   if (cmdlen > (COMMAND_LINE_SIZE - 1))
-   die("Command line overflow\n");
-   strcat(cmdline, acpis);
-   strcat(cmdline, s);
-   dbgprintf("Command line after adding efi\n");
-   dbgprintf("%s\n", cmdline);
+   if (!acpi_rsdp)
+   return;
 
-   break;
-   }
-   }
+   if (cmdlen + sizeof(" acpi_rsdp=0x") + 16 > (COMMAND_LINE_SIZE - 1))
+   die("Command line overflow\n");
 
-   fclose(fp);
+   sprintf(cmdline + cmdlen, " acpi_rsdp=0x%016lx", acpi_rsdp);
 }
 
 static void get_backup_area(struct kexec_info *info,
diff --git a/kexec/arch/i386/kexec-x86-common.c 
b/kexec/arch/i386/kexec-x86-common.c
index de99758..4b8eb26 100644
--- a/kexec/arch/i386/kexec-x86-common.c
+++ b/kexec/arch/i386/kexec-x86-common.c
@@ -39,6 +39,7 @@
 #include "../../firmware_memmap.h"
 #include "../../crashdump.h"
 #include "kexec-x86.h"
+#include "x86-linux-

Re: [PATCH] x86/boot: Use EFI setup data if provided

2019-03-24 Thread Kairui Song
On Mon, Mar 25, 2019 at 2:20 PM Dave Young  wrote:
>
> On 03/25/19 at 02:01pm, Dave Young wrote:
> > On 03/25/19 at 12:27am, Junichi Nomura wrote:
> > > On Fri, Mar 22, 2019 at 04:23:28PM +0100, Borislav Petkov wrote:
> > > > On Fri, Mar 22, 2019 at 11:03:43AM +, Junichi Nomura wrote:
> > > > > Commit 3a63f70bf4c3a ("x86/boot: Early parse RSDP and save it in
> > > > > boot_params") broke kexec boot on EFI systems.  efi_get_rsdp_addr()
> > > > > in the early parsing code tries to search RSDP from EFI table but
> > > > > whose address is virtual.
> > > > >
> > > > > Since kexec(1) provides physical address of config_table via 
> > > > > boot_params,
> > > > > efi_get_rsdp_addr() should look for setup_data in the same way as
> > > > > efi_systab_init() in arch/x86/platform/efi/efi.c does.
> > > >
> > > > If the kexec kernel should continue to use efi_systab_init() then you
> > > > should make efi_get_rsdp_addr() exit early in the kexec-ed kernel.
> > >
> > > I'm not sure which way kexec devel is going. Added kexec list.
> > > Here is the version that exits early in efi_get_rsdp_addr().
> > >
> > > [PATCH] x86/boot: Don't try to search RSDP from EFI when kexec-booted
> > >
> > > Commit 3a63f70bf4c3a ("x86/boot: Early parse RSDP and save it in
> > > boot_params") broke kexec boot on EFI systems.  efi_get_rsdp_addr()
> > > in the early parsing code tries to search RSDP from EFI table but
> > > whose address is virtual.
> > >
> > > Normally kexec(1) provides physical address of config_table via 
> > > boot_params
> > > and EFI code uses that during initialization.
> > > For the early boot code, we just exit efi_get_rsdp_addr() early if the 
> > > kernel
> > > is booted by kexec.
> > >
> > > Fixes: 3a63f70bf4c3a ("x86/boot: Early parse RSDP and save it in 
> > > boot_params")
> > > Signed-off-by: Jun'ichi Nomura 
> > > Cc: Chao Fan 
> > > Cc: Borislav Petkov 
> > >
> > > diff --git a/arch/x86/boot/compressed/acpi.c 
> > > b/arch/x86/boot/compressed/acpi.c
> > > index 0ef4ad5..1cefc43 100644
> > > --- a/arch/x86/boot/compressed/acpi.c
> > > +++ b/arch/x86/boot/compressed/acpi.c
> > > @@ -44,6 +44,24 @@ static acpi_physical_address get_acpi_rsdp(void)
> > > return addr;
> > >  }
> > >
> > > +static bool is_kexec_booted(void)
> > > +{
> > > +   struct setup_data *data;
> > > +
> > > +   /*
> > > +* kexec-tools provides EFI setup data so that kexec-ed kernel
> > > +* can find proper tables.
> > > +*/
> > > +   data = (struct setup_data *) boot_params->hdr.setup_data;
> > > +   while (data) {
> > > +   if (data->type == SETUP_EFI)
> > > +   return true;
> > > +   data = (struct setup_data *) data->next;
> > > +   }
> > > +
> > > +   return false;
> > > +}
> > > +
> > >  /* Search EFI system tables for RSDP. */
> > >  static acpi_physical_address efi_get_rsdp_addr(void)
> > >  {
> > > @@ -57,6 +75,10 @@ static acpi_physical_address efi_get_rsdp_addr(void)
> > > int size, i;
> > > char *sig;
> > >
> > > +   /* If the system is kexec-booted, poking EFI systab may not work. */
> > > +   if (is_kexec_booted())
> > > +   return 0;
> > > +
> > > ei = &boot_params->efi_info;
> > > sig = (char *)&ei->efi_loader_signature;
> > >
> > >
> > > ___
> > > kexec mailing list
> > > kexec@lists.infradead.org
> > > http://lists.infradead.org/mailman/listinfo/kexec
> >
> > Good catch, this way looks good to me.  But the function
> > is_kexec_booted can be compiled when #ifdef CONFIG_EFI
> >
> > Otherwise:
> >
> > Acked-by: Dave Young 
> >
>
> Hold on, I replied too quick.  One question is does the above patch
> passed your test?   It can workaround and skip the wrong phys addr
> issue, but the acpi early parsing still fails because efi_get_rsdp_addr
> return 0?
>
> If this is the case you may need go with your old patch.
>
> I think normally people do not see this bug, because kernel will set the
> rsdp in boot_params->acpi_rsdp_addr.  Maybe you are testing with
> different kernel versions, eg.
>
> old kernel kexec to new kernel.
>
> And the old kernel does not set boot_params->acpi_rsdp_addr
>
> Is this correct?
>
> Thanks
> Dave

Hi Dave, actually only kexec_file_load will always set the
boot_params->acpi_rsdp_addr. We can't guarantee how user space tools will
prepare the boot_params if kexec_load is used, so it's very
likely to happen.

And for the patch, I also think the first patch looks better, if we
just return 0 early in efi_get_rsdp_addr aren't we still failing to
parse the rsdp in early code?

-- 
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH v2] x86: Introduce a new option --reuse-video-type

2019-03-05 Thread Kairui Song
After commit 060eee58 "x86: use old screen_info if needed", kexec-tools
will force the use of the old screen_info and vga type if it fails to
determine the current vga type. But that is not always a good idea.

Currently kernel hangs are observed on some hyper-v VMs after this
commit, because hyperv_fb will mimic EFI (or VESA) VGA on first boot
up, but after the real driver is loaded, it will switch to a new mode
and is no longer compatible with EFI/VESA VGA. Continuing to set
orig_video_isVGA to the EFI/VESA VGA flag will get the wrong driver loaded,
which will try to manipulate the framebuffer in a wrong way.

We can't ensure this won't happen on other framebuffer drivers, but
it's a helpful feature if the framebuffer drivers just work. So this
patch introduces a --reuse-video-type option to let the user decide if the
old screen_info should be used unconditionally or not.

Signed-off-by: Kairui Song 

---

Update from V1:
- Fix a fd leak
- Rename the option from --force-vga to --reuse-video-type

 kexec/arch/i386/include/arch/options.h | 2 ++
 kexec/arch/i386/kexec-x86.h| 1 +
 kexec/arch/i386/x86-linux-setup.c  | 8 ++--
 kexec/arch/x86_64/kexec-x86_64.c   | 5 +
 4 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/kexec/arch/i386/include/arch/options.h 
b/kexec/arch/i386/include/arch/options.h
index c113a83..0e57951 100644
--- a/kexec/arch/i386/include/arch/options.h
+++ b/kexec/arch/i386/include/arch/options.h
@@ -32,6 +32,7 @@
 #define OPT_ENTRY_32BIT(OPT_ARCH_MAX+10)
 #define OPT_PASS_MEMMAP_CMDLINE(OPT_ARCH_MAX+11)
 #define OPT_NOEFI  (OPT_ARCH_MAX+12)
+#define OPT_REUSE_VIDEO_TYPE   (OPT_ARCH_MAX+13)
 
 /* Options relevant to the architecture (excluding loader-specific ones): */
 #define KEXEC_ARCH_OPTIONS \
@@ -45,6 +46,7 @@
{ "elf64-core-headers", 0, 0, OPT_ELF64_CORE }, \
{ "pass-memmap-cmdline", 0, 0, OPT_PASS_MEMMAP_CMDLINE }, \
{ "noefi", 0, 0, OPT_NOEFI}, \
+   { "reuse-video-type", 0, 0, OPT_REUSE_VIDEO_TYPE }, \
 
 #define KEXEC_ARCH_OPT_STR KEXEC_OPT_STR ""
 
diff --git a/kexec/arch/i386/kexec-x86.h b/kexec/arch/i386/kexec-x86.h
index 51855f8..c2bcd37 100644
--- a/kexec/arch/i386/kexec-x86.h
+++ b/kexec/arch/i386/kexec-x86.h
@@ -52,6 +52,7 @@ struct arch_options_t {
enum coretype   core_header_type;
uint8_t pass_memmap_cmdline;
uint8_t noefi;
+   uint8_t reuse_video_type;
 };
 
 int multiboot_x86_probe(const char *buf, off_t len);
diff --git a/kexec/arch/i386/x86-linux-setup.c 
b/kexec/arch/i386/x86-linux-setup.c
index 1bd408b..8fad115 100644
--- a/kexec/arch/i386/x86-linux-setup.c
+++ b/kexec/arch/i386/x86-linux-setup.c
@@ -144,7 +144,7 @@ static int setup_linux_vesafb(struct x86_linux_param_header 
*real_mode)
} else if (0 == strcmp(fix.id, "EFI VGA")) {
/* VIDEO_TYPE_EFI */
real_mode->orig_video_isVGA = 0x70;
-   } else {
+   } else if (arch_options.reuse_video_type) {
int err;
off_t offset = offsetof(typeof(*real_mode), orig_video_isVGA);
 
@@ -152,6 +152,10 @@ static int setup_linux_vesafb(struct 
x86_linux_param_header *real_mode)
err = get_bootparam(&real_mode->orig_video_isVGA, offset, 1);
if (err)
goto out;
+   } else {
+   real_mode->orig_video_isVGA = 0;
+   close(fd);
+   return 0;
}
close(fd);
 
@@ -844,7 +848,7 @@ void setup_linux_system_parameters(struct kexec_info *info,
setup_subarch(real_mode);
if (bzImage_support_efi_boot && !arch_options.noefi)
setup_efi_info(info, real_mode);
-   
+
/* Default screen size */
real_mode->orig_x = 0;
real_mode->orig_y = 0;
diff --git a/kexec/arch/x86_64/kexec-x86_64.c b/kexec/arch/x86_64/kexec-x86_64.c
index 041b007..ccdc980 100644
--- a/kexec/arch/x86_64/kexec-x86_64.c
+++ b/kexec/arch/x86_64/kexec-x86_64.c
@@ -55,6 +55,7 @@ void arch_usage(void)
" --console-serial  Enable the serial console\n"
" --pass-memmap-cmdline Pass memory map via command 
line in kexec on panic case\n"
" --noefi   Disable efi support\n"
+   " --reuse-video-typeReuse old boot time video type 
blindly\n"
);
 }
 
@@ -67,6 +68,7 @@ struct arch_options_t arch_options = {
.core_header_type = CORE_TYPE_ELF64,
.pass_memmap_cmdline = 0,
.noefi = 0,
+   .reuse_video_type = 0,
 };
 
 int arch_process_options(int argc, char **argv)
@@ -136,6 +138,9 @@ int arch_process_options(int argc, char **argv)
case OPT_NOEFI:
arch_options.noefi = 1;
  

Re: [PATCH] x86: Introdudce a new option --force-vga

2019-03-05 Thread Kairui Song
On Tue, Mar 5, 2019 at 6:09 PM Dave Young  wrote:
>
> On 03/05/19 at 04:24pm, Kairui Song wrote:
> > On Mon, Mar 4, 2019 at 2:30 PM Dave Young  wrote:
> > >
> > > On 02/28/19 at 06:07pm, Kairui Song wrote:
> > > > After commit 060eee58 "x86: use old screen_info if needed", kexec-tools
> > > > will force use old screen_info and vga type if failed to determine
> > > > current vga type. But it is not always a good idea.
> > > >
> > > > Currently kernel hanging is inspected on some hyper-v VMs after this
> > > > commit, because hyperv_fb will mimic EFI (or VESA) VGA on first boot
> > > > up, but after the real driver is loaded, it will switch to new mode
> > > > and no longer compatible with EFI/VESA VGA. Keep setting
> > > > orig_video_isVGA to EFI/VESA VGA flag will get wrong driver loaded and
> > > > try to manipulate the framebuffer in a wrong way.
> > > >
> > > > We can't ensure this won't happen on other framebuffer drivers, But
> > > > it's a helpful feature if the framebuffer drivers just work. So this
> > > > patch introduce a --force-vga options to let user decide if the
> > > > old screen_info should be used unconditional or not.
> > >
> > > It looks good to me except the option name, because vga usually means
> > > the specific vga video type.  But here you are enforcing to reuse the 
> > > first
> > > kernel original video type.
> > >
> > > It would be better to use --reuse-video-type or --force-orig-video, etc..
> > >
> >
> > Thanks for the review, the naming is not very good indeed, will update
> > the patch. How about just --reuse-video? This should be general enough
> > and clear.
>
> Hmm, I feel --reuse-video-type is clearer,  --reuse-video seems not
> clear :)
>

OK, will use --reuse-video-type then.

-- 
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH] x86: Introdudce a new option --force-vga

2019-03-05 Thread Kairui Song
On Mon, Mar 4, 2019 at 2:30 PM Dave Young  wrote:
>
> On 02/28/19 at 06:07pm, Kairui Song wrote:
> > After commit 060eee58 "x86: use old screen_info if needed", kexec-tools
> > will force use old screen_info and vga type if failed to determine
> > current vga type. But it is not always a good idea.
> >
> > Currently kernel hanging is inspected on some hyper-v VMs after this
> > commit, because hyperv_fb will mimic EFI (or VESA) VGA on first boot
> > up, but after the real driver is loaded, it will switch to new mode
> > and no longer compatible with EFI/VESA VGA. Keep setting
> > orig_video_isVGA to EFI/VESA VGA flag will get wrong driver loaded and
> > try to manipulate the framebuffer in a wrong way.
> >
> > We can't ensure this won't happen on other framebuffer drivers, But
> > it's a helpful feature if the framebuffer drivers just work. So this
> > patch introduce a --force-vga options to let user decide if the
> > old screen_info should be used unconditional or not.
>
> It looks good to me except the option name, because vga usually means
> the specific vga video type.  But here you are enforcing to reuse the first
> kernel original video type.
>
> It would be better to use --reuse-video-type or --force-orig-video, etc..
>

Thanks for the review, the naming is not very good indeed, will update
the patch. How about just --reuse-video? This should be general enough
and clear.

-- 
Best Regards,
Kairui Song

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH] x86: Introdudce a new option --force-vga

2019-02-28 Thread Kairui Song
After commit 060eee58 "x86: use old screen_info if needed", kexec-tools
will force the use of the old screen_info and vga type if it fails to
determine the current vga type. But this is not always a good idea.

Currently, kernel hangs are observed on some hyper-v VMs after this
commit, because hyperv_fb will mimic EFI (or VESA) VGA on first boot
up, but after the real driver is loaded, it will switch to a new mode
and is no longer compatible with EFI/VESA VGA. Continuing to set
orig_video_isVGA to the EFI/VESA VGA flag will get the wrong driver
loaded, which then tries to manipulate the framebuffer in a wrong way.

We can't ensure this won't happen on other framebuffer drivers, but
it's a helpful feature if the framebuffer drivers just work. So this
patch introduces a --force-vga option to let the user decide
whether the old screen_info should be used unconditionally or not.

Signed-off-by: Kairui Song 
---
 kexec/arch/i386/include/arch/options.h | 2 ++
 kexec/arch/i386/kexec-x86.h| 1 +
 kexec/arch/i386/x86-linux-setup.c  | 7 +--
 kexec/arch/x86_64/kexec-x86_64.c   | 5 +
 4 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/kexec/arch/i386/include/arch/options.h 
b/kexec/arch/i386/include/arch/options.h
index c113a83..7667cf4 100644
--- a/kexec/arch/i386/include/arch/options.h
+++ b/kexec/arch/i386/include/arch/options.h
@@ -32,6 +32,7 @@
 #define OPT_ENTRY_32BIT(OPT_ARCH_MAX+10)
 #define OPT_PASS_MEMMAP_CMDLINE(OPT_ARCH_MAX+11)
 #define OPT_NOEFI  (OPT_ARCH_MAX+12)
+#define OPT_FORCE_VGA  (OPT_ARCH_MAX+13)
 
 /* Options relevant to the architecture (excluding loader-specific ones): */
 #define KEXEC_ARCH_OPTIONS \
@@ -45,6 +46,7 @@
{ "elf64-core-headers", 0, 0, OPT_ELF64_CORE }, \
{ "pass-memmap-cmdline", 0, 0, OPT_PASS_MEMMAP_CMDLINE }, \
{ "noefi", 0, 0, OPT_NOEFI}, \
+   { "force-vga",  0, 0, OPT_FORCE_VGA },  \
 
 #define KEXEC_ARCH_OPT_STR KEXEC_OPT_STR ""
 
diff --git a/kexec/arch/i386/kexec-x86.h b/kexec/arch/i386/kexec-x86.h
index 51855f8..d16679f 100644
--- a/kexec/arch/i386/kexec-x86.h
+++ b/kexec/arch/i386/kexec-x86.h
@@ -52,6 +52,7 @@ struct arch_options_t {
enum coretype   core_header_type;
uint8_t pass_memmap_cmdline;
uint8_t noefi;
+   uint8_t force_vga;
 };
 
 int multiboot_x86_probe(const char *buf, off_t len);
diff --git a/kexec/arch/i386/x86-linux-setup.c 
b/kexec/arch/i386/x86-linux-setup.c
index 1bd408b..0e92d26 100644
--- a/kexec/arch/i386/x86-linux-setup.c
+++ b/kexec/arch/i386/x86-linux-setup.c
@@ -144,7 +144,7 @@ static int setup_linux_vesafb(struct x86_linux_param_header 
*real_mode)
} else if (0 == strcmp(fix.id, "EFI VGA")) {
/* VIDEO_TYPE_EFI */
real_mode->orig_video_isVGA = 0x70;
-   } else {
+   } else if (arch_options.force_vga) {
int err;
off_t offset = offsetof(typeof(*real_mode), orig_video_isVGA);
 
@@ -152,6 +152,9 @@ static int setup_linux_vesafb(struct x86_linux_param_header 
*real_mode)
err = get_bootparam(&real_mode->orig_video_isVGA, offset, 1);
if (err)
goto out;
+   } else {
+   real_mode->orig_video_isVGA = 0;
+   return 0;
}
close(fd);
 
@@ -844,7 +847,7 @@ void setup_linux_system_parameters(struct kexec_info *info,
setup_subarch(real_mode);
if (bzImage_support_efi_boot && !arch_options.noefi)
setup_efi_info(info, real_mode);
-   
+
/* Default screen size */
real_mode->orig_x = 0;
real_mode->orig_y = 0;
diff --git a/kexec/arch/x86_64/kexec-x86_64.c b/kexec/arch/x86_64/kexec-x86_64.c
index 041b007..2e54381 100644
--- a/kexec/arch/x86_64/kexec-x86_64.c
+++ b/kexec/arch/x86_64/kexec-x86_64.c
@@ -55,6 +55,7 @@ void arch_usage(void)
" --console-serial  Enable the serial console\n"
" --pass-memmap-cmdline Pass memory map via command 
line in kexec on panic case\n"
" --noefi   Disable efi support\n"
+   " --force-vga   Enabled vga blindly whenever 
possible \n"
);
 }
 
@@ -67,6 +68,7 @@ struct arch_options_t arch_options = {
.core_header_type = CORE_TYPE_ELF64,
.pass_memmap_cmdline = 0,
.noefi = 0,
+   .force_vga = 0,
 };
 
 int arch_process_options(int argc, char **argv)
@@ -136,6 +138,9 @@ int arch_process_options(int argc, char **argv)
case OPT_NOEFI:
arch_options.noefi = 1;
break;
+   case OPT_FORCE_VGA:
+   arch_options.force_vga = 1;
+   break;
}
}
/*

[PATCH] x86, kexec_file_load: fill in acpi_rsdp_addr boot param unconditionally

2019-02-04 Thread Kairui Song
When efi=noruntime or efi=old_map is used, EFI services won't be available
in the second kernel, therefore the second kernel will not be able to get
the ACPI RSDP address from firmware by calling EFI services, so it won't
boot. Previously we expected the user to set acpi_rsdp= on the kernel
command line for the second kernel, as there was no other way to
pass the RSDP address to the second kernel.

After commit e6e094e053af ("x86/acpi, x86/boot: Take RSDP address from
boot params if available"), now it's possible to set an acpi_rsdp_addr
parameter in the boot_params passed to second kernel, and kernel will
prefer using this value for the RSDP address when it's set.

And with commit 3a63f70bf4c3 ("x86/boot: Early parse RSDP and save it in
boot_params"), the acpi_rsdp_addr will now always be filled with a valid
RSDP address. So we just fill in that value for the second kernel's
boot_params unconditionally; this ensures the second kernel always uses
the same RSDP value as the first kernel.

Tested with an EFI enabled KVM VM with efi=noruntime.

Signed-off-by: Kairui Song 
---

This is an update of part of the patch series: "[PATCH v3 0/3] make kexec
work with efi=noruntime or efi=old_map."

But "[PATCH v3 1/3] x86, kexec_file_load: Don't setup EFI info if EFI
runtime is not enabled" is already in [tip:x86/urgent], and with Chao's
commit 3a63f70bf4c3 in [tip:x86/boot], we can just fill in acpi_rsdp_addr
boot param unconditionally to fix the problem, so I only update and resend
this patch.

 arch/x86/kernel/kexec-bzimage64.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/kexec-bzimage64.c 
b/arch/x86/kernel/kexec-bzimage64.c
index 53917a3ebf94..3611946dc7ea 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -218,6 +218,9 @@ setup_boot_parameters(struct kimage *image, struct 
boot_params *params,
params->screen_info.ext_mem_k = 0;
params->alt_mem_k = 0;
 
+   /* Always fill in RSDP, it's either 0 or a valid value */
+   params->acpi_rsdp_addr = boot_params.acpi_rsdp_addr;
+
/* Default APM info */
memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info));
 
@@ -256,7 +259,6 @@ setup_boot_parameters(struct kimage *image, struct 
boot_params *params,
setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
efi_setup_data_offset);
 #endif
-
/* Setup EDD info */
memcpy(params->eddbuf, boot_params.eddbuf,
EDDMAXNR * sizeof(struct edd_info));
-- 
2.20.1


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


  1   2   >