[PATCH v2 1/2] x86/kexec: Remove unnecessary arch_kexec_kernel_image_load()
From: Bjorn Helgaas The x86 implementation of arch_kexec_kernel_image_load() is functionally identical to the generic arch_kexec_kernel_image_load(): arch_kexec_kernel_image_load# x86 if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); return image->fops->load(image, image->kernel_buf, ...) arch_kexec_kernel_image_load# generic kexec_image_load_default if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); return image->fops->load(image, image->kernel_buf, ...) Remove the x86-specific version and use the generic arch_kexec_kernel_image_load(). No functional change intended. Signed-off-by: Bjorn Helgaas --- arch/x86/include/asm/kexec.h | 3 --- arch/x86/kernel/machine_kexec_64.c | 11 --- include/linux/kexec.h | 2 -- 3 files changed, 16 deletions(-) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index a3760ca796aa..5b77bbc28f96 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -200,9 +200,6 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *symtab); #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add -void *arch_kexec_kernel_image_load(struct kimage *image); -#define arch_kexec_kernel_image_load arch_kexec_kernel_image_load - int arch_kimage_file_post_load_cleanup(struct kimage *image); #define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup #endif diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 0611fd83858e..1a3e2c05a8a5 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -374,17 +374,6 @@ void machine_kexec(struct kimage *image) /* arch-dependent functionality related to kexec file-based syscall */ #ifdef CONFIG_KEXEC_FILE -void *arch_kexec_kernel_image_load(struct kimage *image) -{ - if (!image->fops || !image->fops->load) - return ERR_PTR(-ENOEXEC); - - return image->fops->load(image, image->kernel_buf, -image->kernel_buf_len, image->initrd_buf, -image->initrd_buf_len, image->cmdline_buf, -image->cmdline_buf_len); -} - /* * Apply purgatory relocations. * diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 6883c5922701..4746bc9d39c9 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -207,12 +207,10 @@ static inline int arch_kimage_file_post_load_cleanup(struct kimage *image) } #endif -#ifndef arch_kexec_kernel_image_load static inline void *arch_kexec_kernel_image_load(struct kimage *image) { return kexec_image_load_default(image); } -#endif #ifdef CONFIG_KEXEC_SIG #ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION -- 2.25.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v2 0/2] kexec: Remove unnecessary arch hook
From: Bjorn Helgaas There are no arch-specific things in arch_kexec_kernel_image_load(), so remove it and just use the generic version. v1 is at: https://lore.kernel.org/all/20221215182339.129803-1-helg...@kernel.org/ This v2 is trivially rebased to v6.3-rc1 and the commit log expanded slightly. Bjorn Helgaas (2): x86/kexec: Remove unnecessary arch_kexec_kernel_image_load() kexec: Remove unnecessary arch_kexec_kernel_image_load() arch/x86/include/asm/kexec.h | 3 --- arch/x86/kernel/machine_kexec_64.c | 11 --- include/linux/kexec.h | 8 kernel/kexec_file.c| 6 +++--- 4 files changed, 3 insertions(+), 25 deletions(-) -- 2.25.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v2 2/2] kexec: Remove unnecessary arch_kexec_kernel_image_load()
From: Bjorn Helgaas arch_kexec_kernel_image_load() only calls kexec_image_load_default(), and there are no arch-specific implementations. Remove the unnecessary arch_kexec_kernel_image_load() and make kexec_image_load_default() static. No functional change intended. Signed-off-by: Bjorn Helgaas --- include/linux/kexec.h | 6 -- kernel/kexec_file.c | 6 +++--- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 4746bc9d39c9..22b5cd24f581 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -190,7 +190,6 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, void *buf, unsigned int size, bool get_value); void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); -void *kexec_image_load_default(struct kimage *image); #ifndef arch_kexec_kernel_image_probe static inline int @@ -207,11 +206,6 @@ static inline int arch_kimage_file_post_load_cleanup(struct kimage *image) } #endif -static inline void *arch_kexec_kernel_image_load(struct kimage *image) -{ - return kexec_image_load_default(image); -} - #ifdef CONFIG_KEXEC_SIG #ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f1a0e4e3fb5c..f989f5f1933b 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -65,7 +65,7 @@ int kexec_image_probe_default(struct kimage *image, void *buf, return ret; } -void *kexec_image_load_default(struct kimage *image) +static void *kexec_image_load_default(struct kimage *image) { if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); @@ -249,8 +249,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* IMA needs to pass the measurement list to the next kernel. */ ima_add_kexec_buffer(image); - /* Call arch image load handlers */ - ldata = arch_kexec_kernel_image_load(image); + /* Call image load handler */ + ldata = kexec_image_load_default(image); if (IS_ERR(ldata)) { ret = PTR_ERR(ldata); -- 2.25.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 0/2] kexec: Remove unnecessary arch hook
On Sat, Dec 17, 2022 at 05:48:51PM +0800, Baoquan He wrote: > On 12/15/22 at 12:23pm, Bjorn Helgaas wrote: > > From: Bjorn Helgaas > > > > There are no arch-specific things in arch_kexec_kernel_image_load(), so > > remove it and just use the generic version. > > I ever posted below patch to do the same thing, Andrew only picked the > memory leak fixing patch. > > [PATCH v2 2/2] kexec_file: clean up arch_kexec_kernel_image_load > https://lore.kernel.org/all/20220223113225.63106-3-...@redhat.com/T/#u Indeed! Sorry, I wasn't aware of your previous work. If you repost it, cc me and I'll be glad to help review it. > > Bjorn Helgaas (2): > > x86/kexec: Remove unnecessary arch_kexec_kernel_image_load() > > kexec: Remove unnecessary arch_kexec_kernel_image_load() > > > > arch/x86/include/asm/kexec.h | 3 --- > > arch/x86/kernel/machine_kexec_64.c | 11 --- > > include/linux/kexec.h | 8 > > kernel/kexec_file.c| 6 +++--- > > 4 files changed, 3 insertions(+), 25 deletions(-) ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 2/2] kexec: Remove unnecessary arch_kexec_kernel_image_load()
From: Bjorn Helgaas arch_kexec_kernel_image_load() only calls kexec_image_load_default(), and there are no arch-specific implementations. Remove the unnecessary arch_kexec_kernel_image_load() and make kexec_image_load_default() static . No functional change intended. Signed-off-by: Bjorn Helgaas kexec: make static Signed-off-by: Bjorn Helgaas --- include/linux/kexec.h | 6 -- kernel/kexec_file.c | 6 +++--- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index c08d5d52223a..8844e7debfa4 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -190,7 +190,6 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, void *buf, unsigned int size, bool get_value); void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); -void *kexec_image_load_default(struct kimage *image); #ifndef arch_kexec_kernel_image_probe static inline int @@ -207,11 +206,6 @@ static inline int arch_kimage_file_post_load_cleanup(struct kimage *image) } #endif -static inline void *arch_kexec_kernel_image_load(struct kimage *image) -{ - return kexec_image_load_default(image); -} - #ifdef CONFIG_KEXEC_SIG #ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index dd5983010b7b..39ddf09ab573 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -65,7 +65,7 @@ int kexec_image_probe_default(struct kimage *image, void *buf, return ret; } -void *kexec_image_load_default(struct kimage *image) +static void *kexec_image_load_default(struct kimage *image) { if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); @@ -249,8 +249,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* IMA needs to pass the measurement list to the next kernel. */ ima_add_kexec_buffer(image); - /* Call arch image load handlers */ - ldata = arch_kexec_kernel_image_load(image); + /* Call image load handler */ + ldata = kexec_image_load_default(image); if (IS_ERR(ldata)) { ret = PTR_ERR(ldata); -- 2.25.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 0/2] kexec: Remove unnecessary arch hook
From: Bjorn Helgaas There are no arch-specific things in arch_kexec_kernel_image_load(), so remove it and just use the generic version. Bjorn Helgaas (2): x86/kexec: Remove unnecessary arch_kexec_kernel_image_load() kexec: Remove unnecessary arch_kexec_kernel_image_load() arch/x86/include/asm/kexec.h | 3 --- arch/x86/kernel/machine_kexec_64.c | 11 --- include/linux/kexec.h | 8 kernel/kexec_file.c| 6 +++--- 4 files changed, 3 insertions(+), 25 deletions(-) -- 2.25.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 1/2] x86/kexec: Remove unnecessary arch_kexec_kernel_image_load()
From: Bjorn Helgaas The x86 implementation of arch_kexec_kernel_image_load() is functionally identical to the generic arch_kexec_kernel_image_load(). Remove it and use the generic arch_kexec_kernel_image_load(). No functional change intended. Signed-off-by: Bjorn Helgaas --- arch/x86/include/asm/kexec.h | 3 --- arch/x86/kernel/machine_kexec_64.c | 11 --- include/linux/kexec.h | 2 -- 3 files changed, 16 deletions(-) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index a3760ca796aa..5b77bbc28f96 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -200,9 +200,6 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *symtab); #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add -void *arch_kexec_kernel_image_load(struct kimage *image); -#define arch_kexec_kernel_image_load arch_kexec_kernel_image_load - int arch_kimage_file_post_load_cleanup(struct kimage *image); #define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup #endif diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 0611fd83858e..1a3e2c05a8a5 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -374,17 +374,6 @@ void machine_kexec(struct kimage *image) /* arch-dependent functionality related to kexec file-based syscall */ #ifdef CONFIG_KEXEC_FILE -void *arch_kexec_kernel_image_load(struct kimage *image) -{ - if (!image->fops || !image->fops->load) - return ERR_PTR(-ENOEXEC); - - return image->fops->load(image, image->kernel_buf, -image->kernel_buf_len, image->initrd_buf, -image->initrd_buf_len, image->cmdline_buf, -image->cmdline_buf_len); -} - /* * Apply purgatory relocations. * diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 5dd4343c1bbe..c08d5d52223a 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -207,12 +207,10 @@ static inline int arch_kimage_file_post_load_cleanup(struct kimage *image) } #endif -#ifndef arch_kexec_kernel_image_load static inline void *arch_kexec_kernel_image_load(struct kimage *image) { return kexec_image_load_default(image); } -#endif #ifdef CONFIG_KEXEC_SIG #ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION -- 2.25.1 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/3] x86/quirks: Scan all busses for early PCI quirks
On Wed, Nov 18, 2020 at 07:36:08PM -0300, Guilherme Piccoli wrote: > Thanks a lot Bjorn! I confess except for PPC64 Server machines, I > never saw other "domains" or segments. Is it common in x86 to have > that? The early_quirks() are restricted to the first segment, no > matter how many host bridges we have in segment ? I don't know whether it's *common* to have multiple domains on x86, but they're definitely used on large systems. This includes some lspci info from an HPE Superdome Flex system: https://lore.kernel.org/lkml/5350e02a-6457-41a8-8f33-af67bfdae...@fb.com/ The early quirks in arch/x86/kernel/early-quirks.c (not the DECLARE_PCI_FIXUP_EARLY quirks in drivers/pci/quirks.c) are restricted to segment 0, no matter how many host bridges there are. This is because they use read_pci_config_16(), which uses a config access mechanism that has no provision for a segment number. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/3] x86/quirks: Scan all busses for early PCI quirks
On Tue, Nov 17, 2020 at 09:04:07AM -0300, Guilherme Piccoli wrote: > Also, taking here the opportunity to clarify my understanding about > the limitations of that approach: Bjorn, in our reproducer machine we > had 3 parents in the PCI tree (as per lspci -t), :00, :ff and > :80 - are those all under "segment 0" as per your verbiage? Yes. The "" is the PCI segment (or "domain" in the Linux PCI core). It's common on x86 to have multiple host bridges in segment . ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/3] x86/quirks: Scan all busses for early PCI quirks
On Mon, Nov 16, 2020 at 05:31:36PM -0300, Guilherme G. Piccoli wrote: > First of all, thanks everybody for the great insights/discussion! This > thread ended-up being a great learning for (at least) me. > > Given the big amount of replies and intermixed comments, I wouldn't be > able to respond inline to all, so I'll try another approach below. > > > From Bjorn: > "I think [0] proposes using early_quirks() to disable MSIs at boot-time. > That doesn't seem like a robust solution because (a) the problem affects > all arches but early_quirks() is x86-specific and (b) even on x86 > early_quirks() only works for PCI segment 0 because it relies on the > 0xCF8/0xCFC I/O ports." > > Ah. I wasn't aware of that limitation, I thought enhancing the > early_quirks() search to go through all buses would fix that, thanks for > the clarification! And again, worth to clarify that this is not a > problem affecting all arches _in practice_ - PowerPC for example has the > FW primitives allowing a powerful PCI controller (out-of-band) reset, > preventing this kind of issue usually. > > [0] > https://lore.kernel.org/linux-pci/20181018183721.27467-1-gpicc...@canonical.com > > > From Bjorn: > "A crash_device_shutdown() could do something at the host bridge level > if that's possible, or reset/disable bus mastering/disable MSI/etc on > individual PCI devices if necessary." > > From Lukas: > "Guilherme's original patches from 2018 iterate over all 256 PCI buses. > That might impact boot time negatively. The reason he has to do that is > because the crashing kernel doesn't know which devices exist and which > have interrupts enabled. However the crashing kernel has that > information. It should either disable interrupts itself or pass the > necessary information to the crashing kernel as setup_data or whatever. I don't think passing the device information to the kdump kernel is really practical. The kdump kernel would use it to do PCI config writes to disable MSIs before enabling IRQs, and it doesn't know how to access config space that early. We could invent special "early config access" things, but that gets really complicated really fast. Config access depends on ACPI MCFG tables, firmware interfaces, and in many cases, on the native host bridge drivers in drivers/pci/controllers/. I think we need to disable MSIs in the crashing kernel before the kexec. It adds a little more code in the crash_kexec() path, but it seems like a worthwhile tradeoff. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/3] x86/quirks: Scan all busses for early PCI quirks
[+cc Rafael for question about ACPI method for PCI host bridge reset] On Sat, Nov 14, 2020 at 09:58:08PM +0100, Thomas Gleixner wrote: > On Sat, Nov 14 2020 at 14:39, Bjorn Helgaas wrote: > > On Sat, Nov 14, 2020 at 12:40:10AM +0100, Thomas Gleixner wrote: > >> On Sat, Nov 14 2020 at 00:31, Thomas Gleixner wrote: > >> > On Fri, Nov 13 2020 at 10:46, Bjorn Helgaas wrote: > >> >> pci_device_shutdown() still clears the Bus Master Enable bit if we're > >> >> doing a kexec and the device is in D0-D3hot, which should also disable > >> >> MSI/MSI-X. Why doesn't this solve the problem? Is this because the > >> >> device causing the storm was in PCI_UNKNOWN state? > >> > > >> > That's indeed a really good question. > >> > >> So we do that on kexec, but is that true when starting a kdump kernel > >> from a kernel crash? I doubt it. > > > > Ah, right, I bet that's it, thanks. The kdump path is basically this: > > > > crash_kexec > > machine_kexec > > > > while the usual kexec path is: > > > > kernel_kexec > > kernel_restart_prepare > > device_shutdown > > while (!list_empty(&devices_kset->list)) > > dev->bus->shutdown > > pci_device_shutdown# pci_bus_type.shutdown > > machine_kexec > > > > So maybe we need to explore doing some or all of device_shutdown() in > > the crash_kexec() path as well as in the kernel_kexec() path. > > The problem is that if the machine crashed anything you try to attempt > before starting the crash kernel is reducing the chance that the crash > kernel actually starts. Right. > Is there something at the root bridge level which allows to tell the > underlying busses to shut up, reset or go into a defined state? That > might avoid chasing lists which might be already unreliable. Maybe we need some kind of crash_device_shutdown() that does the minimal thing to protect the kdump kernel from devices. The programming model for conventional PCI host bridges and PCIe Root Complexes is device-specific since they're outside the PCI domain. There probably *are* ways to do those things, but you would need a native host bridge driver or something like an ACPI method. I'm not aware of an ACPI way to do this, but I added Rafael in case he is. A crash_device_shutdown() could do something at the host bridge level if that's possible, or reset/disable bus mastering/disable MSI/etc on individual PCI devices if necessary. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/3] x86/quirks: Scan all busses for early PCI quirks
On Sat, Nov 14, 2020 at 12:40:10AM +0100, Thomas Gleixner wrote: > Bjorn, > > On Sat, Nov 14 2020 at 00:31, Thomas Gleixner wrote: > > On Fri, Nov 13 2020 at 10:46, Bjorn Helgaas wrote: > >> pci_device_shutdown() still clears the Bus Master Enable bit if we're > >> doing a kexec and the device is in D0-D3hot, which should also disable > >> MSI/MSI-X. Why doesn't this solve the problem? Is this because the > >> device causing the storm was in PCI_UNKNOWN state? > > > > That's indeed a really good question. > > So we do that on kexec, but is that true when starting a kdump kernel > from a kernel crash? I doubt it. Ah, right, I bet that's it, thanks. The kdump path is basically this: crash_kexec machine_kexec while the usual kexec path is: kernel_kexec kernel_restart_prepare device_shutdown while (!list_empty(&devices_kset->list)) dev->bus->shutdown pci_device_shutdown# pci_bus_type.shutdown machine_kexec So maybe we need to explore doing some or all of device_shutdown() in the crash_kexec() path as well as in the kernel_kexec() path. ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/3] x86/quirks: Scan all busses for early PCI quirks
On Fri, Nov 06, 2020 at 10:14:14AM -0300, Guilherme G. Piccoli wrote: > On 23/10/2018 14:03, Bjorn Helgaas wrote: > > On Mon, Oct 22, 2018 at 05:35:06PM -0300, Guilherme G. Piccoli wrote: > >> On 18/10/2018 19:15, Bjorn Helgaas wrote: > >>> On Thu, Oct 18, 2018 at 03:37:19PM -0300, Guilherme G. Piccoli wrote: > >>> [...] > >> I understand your point, but I think this is inherently an architecture > >> problem. No matter what solution we decide for, it'll need to be applied > >> in early boot time, like before the PCI layer gets initialized. > > > > This is the part I want to know more about. Apparently there's some > > event X between early_quirks() and the PCI device enumeration, and we > > must disable MSIs before X: > > > > setup_arch() > > early_quirks() # arch/x86/kernel/early-quirks.c > > early_pci_clear_msi() > > ... > > X > > ... > > pci_scan_root_bus_bridge() > > ... > > DECLARE_PCI_FIXUP_EARLY # drivers/pci/quirks.c > > > > I want to know specifically what X is. If we don't know what X is and > > all we know is "we have to disable MSIs earlier than PCI init", then > > we're likely to break things again in the future by changing the order > > of disabling MSIs and whatever X is. > > Hi Bjorn (and all CCed), I'm sorry to necro-bump a thread >2 years > later, but recent discussions led to a better understanding of this 'X' > point, thanks to Thomas! > > For those that deleted this thread from their email clients, it's > available in [0] - the summary is that we faced an IRQ storm really > early in boot, due to a bogus PCIe device MSI behavior, when booting a > kdump kernel. This led the machine to get stuck in the boot and we > couldn't kdump. The solution hereby proposed is to clear MSI interrupts > early in x86, if a parameter is provided. I don't have the reproducer > anymore and it was pretty hard to reproduce in virtual environments. > > So, about the 'X' Bjorn, in another recent thread about IRQ storms [1], > Thomas clarified that and after a brief discussion, it seems there's no > better way to prevent the MSI storm other than clearing the MSI > capability early in boot. As discussed both here and in thread [1], this > is indeed a per-architecture issue (powerpc is not subject to that, due > to a better FW reset mechanism), so I think we still could benefit in > having this idea implemented upstream, at least in x86 (we could expand > to other architectures if desired, in the future). > > As a "test" data point, this was implemented in Ubuntu (same 3 patches > present in this series) for ~2 years and we haven't received bug reports > - I'm saying that because I understand your concerns about expanding the > early PCI quirks scope. > > Let me know your thoughts. I'd suggest all to read thread [1], which > addresses a similar issue but in a different "moment" of the system boot > and provides some more insight on why the early MSI clearing seems to > make sense. I guess Thomas' patch [2] (from thread [1]) doesn't solve this problem? I think [0] proposes using early_quirks() to disable MSIs at boot-time. That doesn't seem like a robust solution because (a) the problem affects all arches but early_quirks() is x86-specific and (b) even on x86 early_quirks() only works for PCI segment 0 because it relies on the 0xCF8/0xCFC I/O ports. If I understand Thomas' email correctly, the IRQ storm occurs here: start_kernel setup_arch early_quirks # x86-only ... read_pci_config_16(num, slot, func, PCI_VENDOR_ID) outl(..., 0xcf8) # PCI segment 0 only inw(0xcfc) local_irq_enable ... native_irq_enable asm("sti") # <-- enable IRQ, storm occurs native_irq_enable() happens long before we discover PCI host bridges and run the normal PCI quirks, so those would be too late to disable MSIs. It doesn't seem practical to disable MSIs in the kdump kernel at the PCI level. I was hoping we could disable them somewhere in the IRQ code, e.g., at IOAPICs, but I think Thomas is saying that's not feasible. It seems like the only option left is to disable MSIs before the kexec. We used to clear the MSI/MSI-X Enable bits in pci_device_shutdown(), but that broke console devices that relied on MSI and caused "nobody cared" warnings when the devices fell back to using INTx, so fda78d7a0ead ("PCI/MSI: Stop disabling MSI/MSI-X in pci_device_shutdown()") left them unchanged. pci_device_shutdown(
Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
On Wed, Jul 22, 2020 at 03:50:48PM -0600, Jerry Hoemann wrote: > On Wed, Jul 22, 2020 at 10:21:23AM -0500, Bjorn Helgaas wrote: > > On Wed, Jul 22, 2020 at 10:52:26PM +0800, Kairui Song wrote: > > > I think I didn't make one thing clear, The PCI UR error never arrives > > > in kernel, it's the iLo BMC on that HPE machine caught the error, and > > > send kernel an NMI. kernel is panicked by NMI, I'm still trying to > > > figure out why the NMI hanged kernel, even with panic=-1, > > > panic_on_io_nmi, panic_on_unknown_nmi all set. But if we can avoid the > > > NMI by shutdown the devices in right order, that's also a solution. ACPI v6.3, chapter 18, does mention NMIs several times, e.g., Table 18-394 and sec 18.4. I'm not familiar enough with APEI to know whether Linux correctly supports all those cases. Maybe this is a symptom that we don't? > > I'm not sure how much sympathy to have for this situation. A PCIe UR > > is fatal for the transaction and maybe even the device, but from the > > overall system point of view, it *should* be a recoverable error and > > we shouldn't panic. > > > > Errors like that should be reported via the normal AER or ACPI/APEI > > mechanisms. It sounds like in this case, the platform has decided > > these aren't enough and it is trying to force a reboot? If this is > > "special" platform behavior, I'm not sure how much we need to cater > > for it. > > Are these AER errors the type processed by the GHES code? My understanding from ACPI v6.3, sec 18.3.2, is that the Hardware Error Source Table may contain Error Source Descriptors of types like: IA-32 Machine Check Exception IA-32 Corrected Machine Check IA-32 Non-Maskable Interrupt PCIe Root Port AER PCIe Device AER Generic Hardware Error Source (GHES) Hardware Error Notification IA-32 Deferred Machine Check I would naively expect PCIe UR errors to be reported via one of the PCIe Error Sources, not GHES, but maybe there's some reason to use GHES. The kernel should already know how to deal with the PCIe AER errors, but we'd have to add new device-specific code to handle things reported via GHES, along the lines of what Shiju is doing here: https://lore.kernel.org/r/20200722104245.1060-1-shiju.j...@huawei.com > I'll note that RedHat runs their crash kernel with: hest_disable. > So, the ghes code is disabled in the crash kernel. That would disable all the HEST error sources, including the PCIe AER ones as well as GHES ones. If we turn off some of the normal error handling mechanisms, I guess we have to expect that some errors won't be handled correctly. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
On Wed, Jul 22, 2020 at 10:52:26PM +0800, Kairui Song wrote: > On Fri, Mar 6, 2020 at 5:38 PM Baoquan He wrote: > > On 03/04/20 at 08:53pm, Deepa Dinamani wrote: > > > On Wed, Mar 4, 2020 at 7:53 PM Baoquan He wrote: > > > > On 03/03/20 at 01:01pm, Deepa Dinamani wrote: > > > > > I looked at this some more. Looks like we do not clear irqs > > > > > when we do a kexec reboot. And, the bootup code maintains > > > > > the same table for the kexec-ed kernel. I'm looking at the > > > > > following code in > > > > > > > > I guess you are talking about kdump reboot here, right? Kexec > > > > and kdump boot take the similar mechanism, but differ a > > > > little. > > > > > > Right I meant kdump kernel here. And, clearly the > > > is_kdump_kernel() case below. > > > > > > > > intel_irq_remapping.c: > > > > > > > > > > if (ir_pre_enabled(iommu)) { > > > > > if (!is_kdump_kernel()) { > > > > > pr_warn("IRQ remapping was enabled on %s but > > > > > we are not in kdump mode\n", > > > > > iommu->name); > > > > > clear_ir_pre_enabled(iommu); > > > > > iommu_disable_irq_remapping(iommu); > > > > > } else if (iommu_load_old_irte(iommu)) > > > > > > > > Here, it's for kdump kernel to copy old ir table from 1st kernel. > > > > > > Correct. > > > > > > > > pr_err("Failed to copy IR table for %s from > > > > > previous kernel\n", > > > > >iommu->name); > > > > > else > > > > > pr_info("Copied IR table for %s from previous > > > > > kernel\n", > > > > > iommu->name); > > > > > } > > > > > > > > > > Would cleaning the interrupts(like in the non kdump path > > > > > above) just before shutdown help here? This should clear the > > > > > interrupts enabled for all the devices in the current > > > > > kernel. So when kdump kernel starts, it starts clean. This > > > > > should probably help block out the interrupts from a device > > > > > that does not have a driver. > > > > > > > > I think stopping those devices out of control from continue > > > > sending interrupts is a good idea. While not sure if only > > > > clearing the interrupt will be enough. Those devices which > > > > will be initialized by their driver will brake, but devices > > > > which drivers are not loaded into kdump kernel may continue > > > > acting. Even though interrupts are cleaning at this time, the > > > > on-flight DMA could continue triggerring interrupt since the > > > > ir table and iopage table are rebuilt. > > > > > > This should be handled by the IOMMU, right? And, hence you are > > > getting UR. This seems like the correct execution flow to me. > > > > Sorry for late reply. > > Yes, this is initializing IOMMU device. > > > > > Anyway, you could just test this theory by removing the > > > is_kdump_kernel() check above and see if it solves your problem. > > > Obviously, check the VT-d spec to figure out the exact sequence to > > > turn off the IR. > > > > OK, I will talk to Kairui and get a machine to test it. Thanks for your > > nice idea, if you have a draft patch, we are happy to test it. > > > > > Note that the device that is causing the problem here is a legit > > > device. We want to have interrupts from devices we don't know about > > > blocked anyway because we can have compromised firmware/ devices that > > > could cause a DoS attack. So blocking the unwanted interrupts seems > > > like the right thing to do here. > > > > Kairui said it's a device which driver is not loaded in kdump kernel > > because it's not needed by kdump. We try to only load kernel modules > > which are needed, e.g one device is the dump target, its driver has to > > be loaded in. In this case, the device is more like a out of control > > device to kdump kernel. > > Hi Bao, Deepa, sorry for this very late response. The test machine was > not available for sometime, and I restarted to work on this problem. > > For the workaround mention by Deepa (by remote the is_kdump_kernel() > check), it didn't work, the machine still hangs upon shutdown. > The devices that were left in an unknown state and sending interrupt > could be a problem, but it's irrelevant to this hanging problem. > > I think I didn't make one thing clear, The PCI UR error never arrives > in kernel, it's the iLo BMC on that HPE machine caught the error, and > send kernel an NMI. kernel is panicked by NMI, I'm still trying to > figure out why the NMI hanged kernel, even with panic=-1, > panic_on_io_nmi, panic_on_unknown_nmi all set. But if we can avoid the > NMI by shutdown the devices in right order, that's also a solution. I'm not sure how much sympathy to have for this situation. A PCIe UR is fatal for the transaction and maybe even the device, but from the overall system point of view, it *should* be a recoverable error and
Re: [PATCH][v2] iommu: arm-smmu-v3: Copy SMMU table for kdump kernel
On Sun, Jun 07, 2020 at 02:00:35PM +0530, Prabhakar Kushwaha wrote: > On Thu, Jun 4, 2020 at 5:32 AM Bjorn Helgaas wrote: > > On Wed, Jun 03, 2020 at 11:12:48PM +0530, Prabhakar Kushwaha wrote: > > > On Sat, May 30, 2020 at 1:03 AM Bjorn Helgaas wrote: > > > > On Fri, May 29, 2020 at 07:48:10PM +0530, Prabhakar Kushwaha wrote: > > > > > diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c > > > > > index 117c0a2b2ba4..26b908f55aef 100644 > > > > > --- a/drivers/pci/pcie/err.c > > > > > +++ b/drivers/pci/pcie/err.c > > > > > @@ -66,6 +66,20 @@ static int report_error_detected(struct pci_dev > > > > > *dev, > > > > > if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { > > > > > vote = PCI_ERS_RESULT_NO_AER_DRIVER; > > > > > pci_info(dev, "can't recover (no > > > > > error_detected callback)\n"); > > > > > + > > > > > + pci_save_state(dev); > > > > > + pci_cfg_access_lock(dev); > > > > > + > > > > > + /* Quiesce the device completely */ > > > > > + pci_write_config_word(dev, PCI_COMMAND, > > > > > + PCI_COMMAND_INTX_DISABLE); > > > > > + if (!__pci_reset_function_locked(dev)) { > > > > > + vote = PCI_ERS_RESULT_RECOVERED; > > > > > + pci_info(dev, "recovered via pci level > > > > > reset\n"); > > > > > + } > > > > So I guess we *do* need to save the state before the reset and restore > > it (either that or enumerate the device from scratch just like we > > would if it had been hot-added). I'm not really thrilled with trying > > to save the state after the device has already reported an error. I'd > > rather do it earlier, maybe during enumeration, like in > > pci_init_capabilities(). But I don't understand all the subtleties of > > dev->state_saved, so that requires some legwork. > > I tried moving pci_save_state earlier. All observations are the same > as mentioned in earlier discussions. By "legwork", I didn't mean just trying things to see whether they seem to work. I meant researching the history to find out *why* it's designed the way it is so that when we change it, we don't break things. For example, these commits are obviously important to understand: aa8c6c93747f ("PCI PM: Restore standard config registers of all devices early") c82f63e411f1 ("PCI: check saved state before restore") 4b77b0a2ba27 ("PCI: Clear saved_state after the state has been restored") I think we need to step back and separate this AER issue from the whole SMMU table copying thing. Then do the research and start a new thread with a patch to fix just the AER issue. The ARM guys would probably be grateful to be dropped from the AER thread because it really has nothing to do with ARM. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH][v2] iommu: arm-smmu-v3: Copy SMMU table for kdump kernel
On Wed, Jun 03, 2020 at 11:12:48PM +0530, Prabhakar Kushwaha wrote: > On Sat, May 30, 2020 at 1:03 AM Bjorn Helgaas wrote: > > On Fri, May 29, 2020 at 07:48:10PM +0530, Prabhakar Kushwaha wrote: > > > On Thu, May 28, 2020 at 1:48 AM Bjorn Helgaas wrote: > > > > > > > > On Wed, May 27, 2020 at 05:14:39PM +0530, Prabhakar Kushwaha wrote: > > > > > On Fri, May 22, 2020 at 4:19 AM Bjorn Helgaas > > > > > wrote: > > > > > > On Thu, May 21, 2020 at 09:28:20AM +0530, Prabhakar Kushwaha wrote: > > > > > > > On Wed, May 20, 2020 at 4:52 AM Bjorn Helgaas > > > > > > > wrote: > > > > > > > > On Thu, May 14, 2020 at 12:47:02PM +0530, Prabhakar Kushwaha > > > > > > > > wrote: > > > > > > > > > On Wed, May 13, 2020 at 3:33 AM Bjorn Helgaas > > > > > > > > > wrote: > > > > > > > > > > On Mon, May 11, 2020 at 07:46:06PM -0700, Prabhakar > > > > > > > > > > Kushwaha wrote: > > > > > > > > > > > An SMMU Stream table is created by the primary kernel. > > > > > > > > > > > This table is > > > > > > > > > > > used by the SMMU to perform address translations for > > > > > > > > > > > device-originated > > > > > > > > > > > transactions. Any crash (if happened) launches the kdump > > > > > > > > > > > kernel which > > > > > > > > > > > re-creates the SMMU Stream table. New transactions will > > > > > > > > > > > be translated > > > > > > > > > > > via this new table.. > > > > > > > > > > > > > > > > > > > > > > There are scenarios, where devices are still having old > > > > > > > > > > > pending > > > > > > > > > > > transactions (configured in the primary kernel). These > > > > > > > > > > > transactions > > > > > > > > > > > come in-between Stream table creation and device-driver > > > > > > > > > > > probe. > > > > > > > > > > > As new stream table does not have entry for older > > > > > > > > > > > transactions, > > > > > > > > > > > it will be aborted by SMMU. > > > > > > > > > > > > > > > > > > > > > > Similar observations were found with PCIe-Intel 82576 > > > > > > > > > > > Gigabit > > > > > > > > > > > Network card. It sends old Memory Read transaction in > > > > > > > > > > > kdump kernel. > > > > > > > > > > > Transactions configured for older Stream table entries, > > > > > > > > > > > that do not > > > > > > > > > > > exist any longer in the new table, will cause a PCIe > > > > > > > > > > > Completion Abort. > > > > > > > > > > > > > > > > > > > > That sounds like exactly what we want, doesn't it? > > > > > > > > > > > > > > > > > > > > Or do you *want* DMA from the previous kernel to complete? > > > > > > > > > > That will > > > > > > > > > > read or scribble on something, but maybe that's not > > > > > > > > > > terrible as long > > > > > > > > > > as it's not memory used by the kdump kernel. > > > > > > > > > > > > > > > > > > Yes, Abort should happen. But it should happen in context of > > > > > > > > > driver. > > > > > > > > > But current abort is happening because of SMMU and no > > > > > > > > > driver/pcie > > > > > > > > > setup present at this moment. > > > > > > > > > > > > > > > > I don't understand what you mean by "in context of driver." > > > > > > > > The whole > > > > > > > > problem is that we can't control *when* the abort happens, so > >
Re: [PATCH][v2] iommu: arm-smmu-v3: Copy SMMU table for kdump kernel
On Fri, May 29, 2020 at 07:48:10PM +0530, Prabhakar Kushwaha wrote: > On Thu, May 28, 2020 at 1:48 AM Bjorn Helgaas wrote: > > > > On Wed, May 27, 2020 at 05:14:39PM +0530, Prabhakar Kushwaha wrote: > > > On Fri, May 22, 2020 at 4:19 AM Bjorn Helgaas wrote: > > > > On Thu, May 21, 2020 at 09:28:20AM +0530, Prabhakar Kushwaha wrote: > > > > > On Wed, May 20, 2020 at 4:52 AM Bjorn Helgaas > > > > > wrote: > > > > > > On Thu, May 14, 2020 at 12:47:02PM +0530, Prabhakar Kushwaha wrote: > > > > > > > On Wed, May 13, 2020 at 3:33 AM Bjorn Helgaas > > > > > > > wrote: > > > > > > > > On Mon, May 11, 2020 at 07:46:06PM -0700, Prabhakar Kushwaha > > > > > > > > wrote: > > > > > > > > > An SMMU Stream table is created by the primary kernel. This > > > > > > > > > table is > > > > > > > > > used by the SMMU to perform address translations for > > > > > > > > > device-originated > > > > > > > > > transactions. Any crash (if happened) launches the kdump > > > > > > > > > kernel which > > > > > > > > > re-creates the SMMU Stream table. New transactions will be > > > > > > > > > translated > > > > > > > > > via this new table.. > > > > > > > > > > > > > > > > > > There are scenarios, where devices are still having old > > > > > > > > > pending > > > > > > > > > transactions (configured in the primary kernel). These > > > > > > > > > transactions > > > > > > > > > come in-between Stream table creation and device-driver probe. > > > > > > > > > As new stream table does not have entry for older > > > > > > > > > transactions, > > > > > > > > > it will be aborted by SMMU. > > > > > > > > > > > > > > > > > > Similar observations were found with PCIe-Intel 82576 Gigabit > > > > > > > > > Network card. It sends old Memory Read transaction in kdump > > > > > > > > > kernel. > > > > > > > > > Transactions configured for older Stream table entries, that > > > > > > > > > do not > > > > > > > > > exist any longer in the new table, will cause a PCIe > > > > > > > > > Completion Abort. > > > > > > > > > > > > > > > > That sounds like exactly what we want, doesn't it? > > > > > > > > > > > > > > > > Or do you *want* DMA from the previous kernel to complete? > > > > > > > > That will > > > > > > > > read or scribble on something, but maybe that's not terrible as > > > > > > > > long > > > > > > > > as it's not memory used by the kdump kernel. > > > > > > > > > > > > > > Yes, Abort should happen. But it should happen in context of > > > > > > > driver. > > > > > > > But current abort is happening because of SMMU and no driver/pcie > > > > > > > setup present at this moment. > > > > > > > > > > > > I don't understand what you mean by "in context of driver." The > > > > > > whole > > > > > > problem is that we can't control *when* the abort happens, so it may > > > > > > happen in *any* context. It may happen when a NIC receives a packet > > > > > > or at some other unpredictable time. > > > > > > > > > > > > > Solution of this issue should be at 2 place > > > > > > > a) SMMU level: I still believe, this patch has potential to > > > > > > > overcome > > > > > > > issue till finally driver's probe takeover. > > > > > > > b) Device level: Even if something goes wrong. Driver/device > > > > > > > should > > > > > > > able to recover. > > > > > > > > > > > > > > > > Returned PCIe completion abort further leads to AER Errors > > > > > > > > > from APEI > > > > > >
Re: [PATCH][v2] iommu: arm-smmu-v3: Copy SMMU table for kdump kernel
On Wed, May 27, 2020 at 05:14:39PM +0530, Prabhakar Kushwaha wrote: > On Fri, May 22, 2020 at 4:19 AM Bjorn Helgaas wrote: > > On Thu, May 21, 2020 at 09:28:20AM +0530, Prabhakar Kushwaha wrote: > > > On Wed, May 20, 2020 at 4:52 AM Bjorn Helgaas wrote: > > > > On Thu, May 14, 2020 at 12:47:02PM +0530, Prabhakar Kushwaha wrote: > > > > > On Wed, May 13, 2020 at 3:33 AM Bjorn Helgaas > > > > > wrote: > > > > > > On Mon, May 11, 2020 at 07:46:06PM -0700, Prabhakar Kushwaha wrote: > > > > > > > An SMMU Stream table is created by the primary kernel. This table > > > > > > > is > > > > > > > used by the SMMU to perform address translations for > > > > > > > device-originated > > > > > > > transactions. Any crash (if happened) launches the kdump kernel > > > > > > > which > > > > > > > re-creates the SMMU Stream table. New transactions will be > > > > > > > translated > > > > > > > via this new table.. > > > > > > > > > > > > > > There are scenarios, where devices are still having old pending > > > > > > > transactions (configured in the primary kernel). These > > > > > > > transactions > > > > > > > come in-between Stream table creation and device-driver probe. > > > > > > > As new stream table does not have entry for older transactions, > > > > > > > it will be aborted by SMMU. > > > > > > > > > > > > > > Similar observations were found with PCIe-Intel 82576 Gigabit > > > > > > > Network card. It sends old Memory Read transaction in kdump > > > > > > > kernel. > > > > > > > Transactions configured for older Stream table entries, that do > > > > > > > not > > > > > > > exist any longer in the new table, will cause a PCIe Completion > > > > > > > Abort. > > > > > > > > > > > > That sounds like exactly what we want, doesn't it? > > > > > > > > > > > > Or do you *want* DMA from the previous kernel to complete? That > > > > > > will > > > > > > read or scribble on something, but maybe that's not terrible as long > > > > > > as it's not memory used by the kdump kernel. > > > > > > > > > > Yes, Abort should happen. But it should happen in context of driver. > > > > > But current abort is happening because of SMMU and no driver/pcie > > > > > setup present at this moment. > > > > > > > > I don't understand what you mean by "in context of driver." The whole > > > > problem is that we can't control *when* the abort happens, so it may > > > > happen in *any* context. It may happen when a NIC receives a packet > > > > or at some other unpredictable time. > > > > > > > > > Solution of this issue should be at 2 place > > > > > a) SMMU level: I still believe, this patch has potential to overcome > > > > > issue till finally driver's probe takeover. > > > > > b) Device level: Even if something goes wrong. Driver/device should > > > > > able to recover. > > > > > > > > > > > > Returned PCIe completion abort further leads to AER Errors from > > > > > > > APEI > > > > > > > Generic Hardware Error Source (GHES) with completion timeout. > > > > > > > A network device hang is observed even after continuous > > > > > > > reset/recovery from driver, Hence device is no more usable. > > > > > > > > > > > > The fact that the device is no longer usable is definitely a > > > > > > problem. > > > > > > But in principle we *should* be able to recover from these errors. > > > > > > If > > > > > > we could recover and reliably use the device after the error, that > > > > > > seems like it would be a more robust solution that having to add > > > > > > special cases in every IOMMU driver. > > > > > > > > > > > > If you have details about this sort of error, I'd like to try to fix > > > > > > it because we want to recover from that sort of error in normal > > &
Re: [PATCH][v2] iommu: arm-smmu-v3: Copy SMMU table for kdump kernel
On Thu, May 21, 2020 at 09:28:20AM +0530, Prabhakar Kushwaha wrote: > On Wed, May 20, 2020 at 4:52 AM Bjorn Helgaas wrote: > > On Thu, May 14, 2020 at 12:47:02PM +0530, Prabhakar Kushwaha wrote: > > > On Wed, May 13, 2020 at 3:33 AM Bjorn Helgaas wrote: > > > > On Mon, May 11, 2020 at 07:46:06PM -0700, Prabhakar Kushwaha wrote: > > > > > An SMMU Stream table is created by the primary kernel. This table is > > > > > used by the SMMU to perform address translations for device-originated > > > > > transactions. Any crash (if happened) launches the kdump kernel which > > > > > re-creates the SMMU Stream table. New transactions will be translated > > > > > via this new table.. > > > > > > > > > > There are scenarios, where devices are still having old pending > > > > > transactions (configured in the primary kernel). These transactions > > > > > come in-between Stream table creation and device-driver probe. > > > > > As new stream table does not have entry for older transactions, > > > > > it will be aborted by SMMU. > > > > > > > > > > Similar observations were found with PCIe-Intel 82576 Gigabit > > > > > Network card. It sends old Memory Read transaction in kdump kernel. > > > > > Transactions configured for older Stream table entries, that do not > > > > > exist any longer in the new table, will cause a PCIe Completion Abort. > > > > > > > > That sounds like exactly what we want, doesn't it? > > > > > > > > Or do you *want* DMA from the previous kernel to complete? That will > > > > read or scribble on something, but maybe that's not terrible as long > > > > as it's not memory used by the kdump kernel. > > > > > > Yes, Abort should happen. But it should happen in context of driver. > > > But current abort is happening because of SMMU and no driver/pcie > > > setup present at this moment. > > > > I don't understand what you mean by "in context of driver." The whole > > problem is that we can't control *when* the abort happens, so it may > > happen in *any* context. It may happen when a NIC receives a packet > > or at some other unpredictable time. > > > > > Solution of this issue should be at 2 place > > > a) SMMU level: I still believe, this patch has potential to overcome > > > issue till finally driver's probe takeover. > > > b) Device level: Even if something goes wrong. Driver/device should > > > able to recover. > > > > > > > > Returned PCIe completion abort further leads to AER Errors from APEI > > > > > Generic Hardware Error Source (GHES) with completion timeout. > > > > > A network device hang is observed even after continuous > > > > > reset/recovery from driver, Hence device is no more usable. > > > > > > > > The fact that the device is no longer usable is definitely a problem. > > > > But in principle we *should* be able to recover from these errors. If > > > > we could recover and reliably use the device after the error, that > > > > seems like it would be a more robust solution that having to add > > > > special cases in every IOMMU driver. > > > > > > > > If you have details about this sort of error, I'd like to try to fix > > > > it because we want to recover from that sort of error in normal > > > > (non-crash) situations as well. > > > > > > > Completion abort case should be gracefully handled. And device should > > > always remain usable. > > > > > > There are 2 scenario which I am testing with Ethernet card PCIe-Intel > > > 82576 Gigabit Network card. > > > > > > I) Crash testing using kdump root file system: De-facto scenario > > > - kdump file system does not have Ethernet driver > > > - A lot of AER prints [1], making it impossible to work on shell > > > of kdump root file system. > > > > In this case, I think report_error_detected() is deciding that because > > the device has no driver, we can't do anything. The flow is like > > this: > > > > aer_recover_work_func # aer_recover_work > > kfifo_get(aer_recover_ring, entry) > > dev = pci_get_domain_bus_and_slot > > cper_print_aer(dev, ...) > > pci_err("AER: aer_status:") > > pci_err("AER: [14] CmpltTO"
Re: [PATCH][v2] iommu: arm-smmu-v3: Copy SMMU table for kdump kernel
[+cc Sathy, Vijay, Myron] On Thu, May 14, 2020 at 12:47:02PM +0530, Prabhakar Kushwaha wrote: > On Wed, May 13, 2020 at 3:33 AM Bjorn Helgaas wrote: > > On Mon, May 11, 2020 at 07:46:06PM -0700, Prabhakar Kushwaha wrote: > > > An SMMU Stream table is created by the primary kernel. This table is > > > used by the SMMU to perform address translations for device-originated > > > transactions. Any crash (if happened) launches the kdump kernel which > > > re-creates the SMMU Stream table. New transactions will be translated > > > via this new table.. > > > > > > There are scenarios, where devices are still having old pending > > > transactions (configured in the primary kernel). These transactions > > > come in-between Stream table creation and device-driver probe. > > > As new stream table does not have entry for older transactions, > > > it will be aborted by SMMU. > > > > > > Similar observations were found with PCIe-Intel 82576 Gigabit > > > Network card. It sends old Memory Read transaction in kdump kernel. > > > Transactions configured for older Stream table entries, that do not > > > exist any longer in the new table, will cause a PCIe Completion Abort. > > > > That sounds like exactly what we want, doesn't it? > > > > Or do you *want* DMA from the previous kernel to complete? That will > > read or scribble on something, but maybe that's not terrible as long > > as it's not memory used by the kdump kernel. > > Yes, Abort should happen. But it should happen in context of driver. > But current abort is happening because of SMMU and no driver/pcie > setup present at this moment. I don't understand what you mean by "in context of driver." The whole problem is that we can't control *when* the abort happens, so it may happen in *any* context. It may happen when a NIC receives a packet or at some other unpredictable time. > Solution of this issue should be at 2 place > a) SMMU level: I still believe, this patch has potential to overcome > issue till finally driver's probe takeover. > b) Device level: Even if something goes wrong. Driver/device should > able to recover. > > > > Returned PCIe completion abort further leads to AER Errors from APEI > > > Generic Hardware Error Source (GHES) with completion timeout. > > > A network device hang is observed even after continuous > > > reset/recovery from driver, Hence device is no more usable. > > > > The fact that the device is no longer usable is definitely a problem. > > But in principle we *should* be able to recover from these errors. If > > we could recover and reliably use the device after the error, that > > seems like it would be a more robust solution that having to add > > special cases in every IOMMU driver. > > > > If you have details about this sort of error, I'd like to try to fix > > it because we want to recover from that sort of error in normal > > (non-crash) situations as well. > > > Completion abort case should be gracefully handled. And device should > always remain usable. > > There are 2 scenario which I am testing with Ethernet card PCIe-Intel > 82576 Gigabit Network card. > > I) Crash testing using kdump root file system: De-facto scenario > - kdump file system does not have Ethernet driver > - A lot of AER prints [1], making it impossible to work on shell > of kdump root file system. In this case, I think report_error_detected() is deciding that because the device has no driver, we can't do anything. The flow is like this: aer_recover_work_func # aer_recover_work kfifo_get(aer_recover_ring, entry) dev = pci_get_domain_bus_and_slot cper_print_aer(dev, ...) pci_err("AER: aer_status:") pci_err("AER: [14] CmpltTO") pci_err("AER: aer_layer=") if (AER_NONFATAL) pcie_do_recovery(dev, pci_channel_io_normal) status = CAN_RECOVER pci_walk_bus(report_normal_detected) report_error_detected if (!dev->driver) vote = NO_AER_DRIVER pci_info("can't recover (no error_detected callback)") *result = merge_result(*, NO_AER_DRIVER) # always NO_AER_DRIVER status is now NO_AER_DRIVER So pcie_do_recovery() does not call .report_mmio_enabled() or .slot_reset(), and status is not RECOVERED, so it skips .resume(). I don't remember the history there, but if a device has no driver and the device generates errors, it seems like we ought to be able to reset it. We should be able to field one (or a few) AER errors, reset the device, and you sho
Re: [PATCH][v2] iommu: arm-smmu-v3: Copy SMMU table for kdump kernel
[+cc linux-pci] On Mon, May 11, 2020 at 07:46:06PM -0700, Prabhakar Kushwaha wrote: > An SMMU Stream table is created by the primary kernel. This table is > used by the SMMU to perform address translations for device-originated > transactions. Any crash (if happened) launches the kdump kernel which > re-creates the SMMU Stream table. New transactions will be translated > via this new table. > > There are scenarios, where devices are still having old pending > transactions (configured in the primary kernel). These transactions > come in-between Stream table creation and device-driver probe. > As new stream table does not have entry for older transactions, > it will be aborted by SMMU. > > Similar observations were found with PCIe-Intel 82576 Gigabit > Network card. It sends old Memory Read transaction in kdump kernel. > Transactions configured for older Stream table entries, that do not > exist any longer in the new table, will cause a PCIe Completion Abort. That sounds like exactly what we want, doesn't it? Or do you *want* DMA from the previous kernel to complete? That will read or scribble on something, but maybe that's not terrible as long as it's not memory used by the kdump kernel. > Returned PCIe completion abort further leads to AER Errors from APEI > Generic Hardware Error Source (GHES) with completion timeout. > A network device hang is observed even after continuous > reset/recovery from driver, Hence device is no more usable. The fact that the device is no longer usable is definitely a problem. But in principle we *should* be able to recover from these errors. If we could recover and reliably use the device after the error, that seems like it would be a more robust solution that having to add special cases in every IOMMU driver. If you have details about this sort of error, I'd like to try to fix it because we want to recover from that sort of error in normal (non-crash) situations as well. > So, If we are in a kdump kernel try to copy SMMU Stream table from > primary/old kernel to preserve the mappings until the device driver > takes over. > > Signed-off-by: Prabhakar Kushwaha > --- > Changes for v2: Used memremap in-place of ioremap > > V2 patch has been sanity tested. > > V1 patch has been tested with > A) PCIe-Intel 82576 Gigabit Network card in following > configurations with "no AER error". Each iteration has > been tested on both Suse kdump rfs And default Centos distro rfs. > > 1) with 2 level stream table > >SMMU | Normal Ping | Flood Ping >- >Default Operation | 100 times | 10 times >- >IOMMU bypass | 41 times | 10 times >- > > 2) with Linear stream table. >- >SMMU | Normal Ping | Flood Ping >-- >Default Operation | 100 times | 10 times >-- >IOMMU bypass | 55 times | 10 times >--- > > B) This patch is also tested with Micron Technology Inc 9200 PRO NVMe > SSD card with 2 level stream table using "fio" in mixed read/write and > only read configurations. It is tested for both Default Operation and > IOMMU bypass mode for minimum 10 iterations across Centos kdump rfs and > default Centos ditstro rfs. > > This patch is not full proof solution. Issue can still come > from the point device is discovered and driver probe called. > This patch has reduced window of scenario from "SMMU Stream table > creation - device-driver" to "device discovery - device-driver". > Usually, device discovery to device-driver is very small time. So > the probability is very low. > > Note: device-discovery will overwrite existing stream table entries > with both SMMU stage as by-pass. > > > drivers/iommu/arm-smmu-v3.c | 36 +++- > 1 file changed, 35 insertions(+), 1 deletion(-) > > diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c > index 82508730feb7..d492d92c2dd7 100644 > --- a/drivers/iommu/arm-smmu-v3.c > +++ b/drivers/iommu/arm-smmu-v3.c > @@ -1847,7 +1847,13 @@ static void arm_smmu_write_strtab_ent(struct > arm_smmu_master *master, u32 sid, > break; > case STRTAB_STE_0_CFG_S1_TRANS: > case STRTAB_STE_0_CFG_S2_TRANS: > - ste_live = true; > + /* > + * As kdump kernel copy STE table from previous > + * kernel. It still may have valid stream table entries. > + * Forcing entry as false to
Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
[+cc Khalid, Deepa, Randy, Dave, Myron] On Thu, Dec 26, 2019 at 03:21:18AM +0800, Kairui Song wrote: > There are reports about kdump hang upon reboot on some HPE machines, > kernel hanged when trying to shutdown a PCIe port, an uncorrectable > error occurred and crashed the system. Did we ever make progress on this? This definitely sounds like a problem that needs to be fixed, but I don't see a resolution here. > On the machine I can reproduce this issue, part of the topology > looks like this: > > [:00]-+-00.0 Intel Corporation Xeon E7 v3/Xeon E5 v3/Core i7 DMI2 > +-01.0-[02]-- > +-01.1-[05]-- > +-02.0-[06]--+-00.0 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.1 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.2 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.3 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.4 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.5 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.6 Emulex Corporation OneConnect NIC (Skyhawk) > |\-00.7 Emulex Corporation OneConnect NIC (Skyhawk) > +-02.1-[0f]-- > +-02.2-[07]00.0 Hewlett-Packard Company Smart Array Gen9 > Controllers > > When shuting down PCIe port :00:02.2 or :00:02.0, the machine > will hang, depend on which device is reinitialized in kdump kernel. > > If force remove unused device then trigger kdump, the problem will never > happen: > > echo 1 > /sys/bus/pci/devices/\:00\:02.2/\:07\:00.0/remove > echo c > /proc/sysrq-trigger > > ... Kdump save vmcore through network, the NIC get reinitialized and > hpsa is untouched. Then reboot with no problem. (If hpsa is used > instead, shutdown the NIC in first kernel will help) > > The cause is that some devices are enabled by the first kernel, but it > don't have the chance to shutdown the device, and kdump kernel is not > aware of it, unless it reinitialize the device. > > Upon reboot, kdump kernel will skip downstream device shutdown and > clears its bridge's master bit directly. The downstream device could > error out as it can still send requests but upstream refuses it. > > So for kdump, let kernel read the correct hardware power state on boot, > and always clear the bus master bit of PCI device upon shutdown if the > device is on. PCIe port driver will always shutdown all downstream > devices first, so this should ensure all downstream devices have bus > master bit off before clearing the bridge's bus master bit. > > Signed-off-by: Kairui Song > --- > drivers/pci/pci-driver.c | 11 --- > drivers/pci/quirks.c | 20 > 2 files changed, 28 insertions(+), 3 deletions(-) > > diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c > index 0454ca0e4e3f..84a7fd643b4d 100644 > --- a/drivers/pci/pci-driver.c > +++ b/drivers/pci/pci-driver.c > @@ -18,6 +18,7 @@ > #include > #include > #include > +#include > #include "pci.h" > #include "pcie/portdrv.h" > > @@ -488,10 +489,14 @@ static void pci_device_shutdown(struct device *dev) >* If this is a kexec reboot, turn off Bus Master bit on the >* device to tell it to not continue to do DMA. Don't touch >* devices in D3cold or unknown states. > - * If it is not a kexec reboot, firmware will hit the PCI > - * devices with big hammer and stop their DMA any way. > + * If this is kdump kernel, also turn off Bus Master, the device > + * could be activated by previous crashed kernel and may block > + * it's upstream from shutting down. > + * Else, firmware will hit the PCI devices with big hammer > + * and stop their DMA any way. >*/ > - if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot)) > + if ((kexec_in_progress || is_kdump_kernel()) && > + pci_dev->current_state <= PCI_D3hot) > pci_clear_master(pci_dev); > } > > diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c > index 4937a088d7d8..c65d11ab3939 100644 > --- a/drivers/pci/quirks.c > +++ b/drivers/pci/quirks.c > @@ -28,6 +28,7 @@ > #include > #include > #include > +#include > #include /* isa_dma_bridge_buggy */ > #include "pci.h" > > @@ -192,6 +193,25 @@ static int __init pci_apply_final_quirks(void) > } > fs_initcall_sync(pci_apply_final_quirks); > > +/* > + * Read the device state even if it's not enabled. The device could be > + * activated by previous crashed kernel, this will read and correct the > + * cached state. > + */ > +static void quirk_read_pm_state_in_kdump(struct pci_dev *dev) > +{ > + u16 pmcsr; > + > + if (!is_kdump_kernel()) > + return; > + > + if (dev->pm_cap) { > + pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); > + dev->current_stat
Re: [PATCH] ata: ahci: Add shutdown to freeze hardware resources of ahci
[FYI, your reply was a multipart message (not a simple plain text message), so I think the mailing lists discarded it and it doesn't appear in the archives] On Thu, Jan 23, 2020 at 08:05:04PM +0530, Prabhakar Kushwaha wrote: > Thanks Bjorn for review. Reply is in-lined. > > On Thu, Jan 23, 2020 at 7:24 PM Bjorn Helgaas wrote: > > > On Thu, Jan 23, 2020 at 10:41:57AM +, Prabhakar Kushwaha wrote: > > > device_shutdown() called from reboot/power_shutdown expect all > > > devices to be shutdown. Same is true for ahci pci driver. > > > As no shutdown function was implemented ata subsystem remains > > > always alive and DMA/interrupt still active. > > > > > > It creates problem during kexec, here "M" bit is cleared to stop > > > DMA usage. > > > > Maybe this is obvious to AHCI folks, but I don't know what "M" bit you > > are referring to, and I don't see anything in the patch itself that > > looks like an "M" bit. I thought maybe you meant the "Bus Master > > Enable" bit (PCI_COMMAND_MASTER), but I don't see an obvious > > connection to that either. > > > I missed to explain M bit. Thanks for pointing it out. > M bit is PCI_COMMAND_MASTER bit i.e. Bus Master Enable. Please don't call it the "M" bit. Neither the Conventional PCI nor the PCIe specs call it that, and it could just as easily refer to the "Memory Space Enable" bit. Just call it the "Bus Master Enable" bit like the specs do. > There are 2 flow in kernel which calls device_shutdown() which inherently > call pci_device_shutdown()(). > a) reboot flow > b) kexec -e flow > > issue seen in kexec -e flow where hard-disk is not getting detected in > second kernel. > There is special code in pci_device_shutdown() which clear M bit during > kexec -e i.e. kexec_in_progress flag is set. > > if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot)) > pci_clear_master(pci_dev); > > There should not be any transaction after pci_clear_master() but i am > seeing filesystem sync function calls after this. > this is the reason shutdown function is added(this patch) which disable > interrupt, stop DMA and freeze SATA port before execution of > pci_clear_master() I don't see where your patch adds anything that clears Bus Master Enable. You're adding ahci_shutdown_one(), so pci_device_shutdown() will now call it, and maybe something inside ahci_shutdown_one() will clear Bus Master Enable, but I couldn't find it with a quick look. pci_device_shutdown() *does* clear it, of course, but only when kexec_in_progress, and it does that even without this patch. Just to be clear, I don't object to the patch itself -- that's an AHCI thing that I don't know much about. I'm just complaining because it's not obvious that your commit log describes what the patch actually does. ahci_shutdown_one() might indeed stop the AHCI port DMA, but I suspect it's doing that via the AHCI programming model, not by clearing Bus Master Enable. So maybe all you need to do is remove the references to BME. > After adding shutdown function, i can see hard-disk getting detected in > second kernel. > > > > Any further DMA transaction may cause instability and > > > the hard-disk may even not get detected for second kernel. > > > One of possible case is periodic file system sync. > > > > I think "may cause instability" and "disk may even not get detected" > > is too vague and hand-wavy to really add useful information to the > > commit log. > > > let me rework on this.. Thanks > > > > So defining ahci pci driver shutdown to freeze hardware (mask > > > interrupt, stop DMA engine and free DMA resources). ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] ata: ahci: Add shutdown to freeze hardware resources of ahci
On Thu, Jan 23, 2020 at 10:41:57AM +, Prabhakar Kushwaha wrote: > device_shutdown() called from reboot/power_shutdown expect all > devices to be shutdown. Same is true for ahci pci driver. > As no shutdown function was implemented ata subsystem remains > always alive and DMA/interrupt still active. > > It creates problem during kexec, here "M" bit is cleared to stop > DMA usage. Maybe this is obvious to AHCI folks, but I don't know what "M" bit you are referring to, and I don't see anything in the patch itself that looks like an "M" bit. I thought maybe you meant the "Bus Master Enable" bit (PCI_COMMAND_MASTER), but I don't see an obvious connection to that either. > Any further DMA transaction may cause instability and > the hard-disk may even not get detected for second kernel. > One of possible case is periodic file system sync. I think "may cause instability" and "disk may even not get detected" is too vague and hand-wavy to really add useful information to the commit log. > So defining ahci pci driver shutdown to freeze hardware (mask > interrupt, stop DMA engine and free DMA resources). > > Signed-off-by: Prabhakar Kushwaha > --- > drivers/ata/ahci.c| 8 > drivers/ata/libata-core.c | 21 + > include/linux/libata.h| 1 + > 3 files changed, 30 insertions(+) > > diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c > index 4bfd1b14b390..31fc934740b6 100644 > --- a/drivers/ata/ahci.c > +++ b/drivers/ata/ahci.c > @@ -81,6 +81,7 @@ enum board_ids { > > static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id > *ent); > static void ahci_remove_one(struct pci_dev *dev); > +static void ahci_shutdown_one(struct pci_dev *dev); > static int ahci_vt8251_hardreset(struct ata_link *link, unsigned int *class, >unsigned long deadline); > static int ahci_avn_hardreset(struct ata_link *link, unsigned int *class, > @@ -606,6 +607,7 @@ static struct pci_driver ahci_pci_driver = { > .id_table = ahci_pci_tbl, > .probe = ahci_init_one, > .remove = ahci_remove_one, > + .shutdown = ahci_shutdown_one, > .driver = { > .pm = &ahci_pci_pm_ops, > }, > @@ -626,6 +628,7 @@ MODULE_PARM_DESC(mobile_lpm_policy, "Default LPM policy > for mobile chipsets"); > static void ahci_pci_save_initial_config(struct pci_dev *pdev, >struct ahci_host_priv *hpriv) > { > + Spurious whitespace change, please remove. > if (pdev->vendor == PCI_VENDOR_ID_JMICRON && pdev->device == 0x2361) { > dev_info(&pdev->dev, "JMB361 has only one port\n"); > hpriv->force_port_map = 1; > @@ -1877,6 +1880,11 @@ static int ahci_init_one(struct pci_dev *pdev, const > struct pci_device_id *ent) > return 0; > } > > +static void ahci_shutdown_one(struct pci_dev *pdev) > +{ > + ata_pci_shutdown_one(pdev); > +} > + > static void ahci_remove_one(struct pci_dev *pdev) > { > pm_runtime_get_noresume(&pdev->dev); > diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c > index 6f4ab5c5b52d..42c8728f6117 100644 > --- a/drivers/ata/libata-core.c > +++ b/drivers/ata/libata-core.c > @@ -6767,6 +6767,26 @@ void ata_pci_remove_one(struct pci_dev *pdev) > ata_host_detach(host); > } > > +void ata_pci_shutdown_one(struct pci_dev *pdev) > +{ > + struct ata_host *host = pci_get_drvdata(pdev); > + int i; > + > + for (i = 0; i < host->n_ports; i++) { > + struct ata_port *ap = host->ports[i]; > + > + ap->pflags |= ATA_PFLAG_FROZEN; > + > + /* Disable port interrupts */ > + if (ap->ops->freeze) > + ap->ops->freeze(ap); > + > + /* Stop the port DMA engines */ > + if (ap->ops->port_stop) > + ap->ops->port_stop(ap); > + } > +} > + > /* move to PCI subsystem */ > int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits) > { > @@ -7387,6 +7407,7 @@ EXPORT_SYMBOL_GPL(ata_timing_cycle2mode); > > #ifdef CONFIG_PCI > EXPORT_SYMBOL_GPL(pci_test_config_bits); > +EXPORT_SYMBOL_GPL(ata_pci_shutdown_one); > EXPORT_SYMBOL_GPL(ata_pci_remove_one); > #ifdef CONFIG_PM > EXPORT_SYMBOL_GPL(ata_pci_device_do_suspend); > diff --git a/include/linux/libata.h b/include/linux/libata.h > index 2dbde119721d..bff539918d82 100644 > --- a/include/linux/libata.h > +++ b/include/linux/libata.h > @@ -1221,6 +1221,7 @@ struct pci_bits { > }; > > extern int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits > *bits); > +extern void ata_pci_shutdown_one(struct pci_dev *pdev); > extern void ata_pci_remove_one(struct pci_dev *pdev); > > #ifdef CONFIG_PM > -- > 2.17.1 > ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexe
Re: [RFC PATCH] PCI, kdump: Clear bus master bit upon shutdown in kdump kernel
[+cc Deepa (also working in this area)] On Thu, Dec 26, 2019 at 03:21:18AM +0800, Kairui Song wrote: > There are reports about kdump hang upon reboot on some HPE machines, > kernel hanged when trying to shutdown a PCIe port, an uncorrectable > error occurred and crashed the system. Details? Do you have URLs for bug reports, dmesg logs, etc? > On the machine I can reproduce this issue, part of the topology > looks like this: > > [:00]-+-00.0 Intel Corporation Xeon E7 v3/Xeon E5 v3/Core i7 DMI2 > +-01.0-[02]-- > +-01.1-[05]-- > +-02.0-[06]--+-00.0 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.1 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.2 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.3 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.4 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.5 Emulex Corporation OneConnect NIC (Skyhawk) > |+-00.6 Emulex Corporation OneConnect NIC (Skyhawk) > |\-00.7 Emulex Corporation OneConnect NIC (Skyhawk) > +-02.1-[0f]-- > +-02.2-[07]00.0 Hewlett-Packard Company Smart Array Gen9 > Controllers > > When shutting down PCIe port :00:02.2 or :00:02.0, the machine > will hang, depend on which device is reinitialized in kdump kernel. > > If force remove unused device then trigger kdump, the problem will never > happen: > > echo 1 > /sys/bus/pci/devices/\:00\:02.2/\:07\:00.0/remove > echo c > /proc/sysrq-trigger > > ... Kdump save vmcore through network, the NIC get reinitialized and > hpsa is untouched. Then reboot with no problem. (If hpsa is used > instead, shutdown the NIC in first kernel will help) > > The cause is that some devices are enabled by the first kernel, but it > don't have the chance to shutdown the device, and kdump kernel is not > aware of it, unless it reinitialize the device. > > Upon reboot, kdump kernel will skip downstream device shutdown and > clears its bridge's master bit directly. The downstream device could > error out as it can still send requests but upstream refuses it. Can you help me understand the sequence of events? If I understand correctly, the desired sequence is: - user kernel boots - user kernel panics and kexecs to kdump kernel - kdump kernel writes vmcore to network or disk - kdump kernel reboots - user kernel boots But the problem is that as part of the kdump kernel reboot, - kdump kernel disables bus mastering for a Root Port - device below the Root Port attempts DMA - Root Port receives DMA transaction, handles it as Unsupported Request, sends UR Completion to device - device signals uncorrectable error - uncorrectable error causes a crash (Or a hang? You mention both and I'm not sure which it is) Is that right so far? > So for kdump, let kernel read the correct hardware power state on boot, > and always clear the bus master bit of PCI device upon shutdown if the > device is on. PCIe port driver will always shutdown all downstream > devices first, so this should ensure all downstream devices have bus > master bit off before clearing the bridge's bus master bit. > > Signed-off-by: Kairui Song > --- > drivers/pci/pci-driver.c | 11 --- > drivers/pci/quirks.c | 20 > 2 files changed, 28 insertions(+), 3 deletions(-) > > diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c > index 0454ca0e4e3f..84a7fd643b4d 100644 > --- a/drivers/pci/pci-driver.c > +++ b/drivers/pci/pci-driver.c > @@ -18,6 +18,7 @@ > #include > #include > #include > +#include > #include "pci.h" > #include "pcie/portdrv.h" > > @@ -488,10 +489,14 @@ static void pci_device_shutdown(struct device *dev) >* If this is a kexec reboot, turn off Bus Master bit on the >* device to tell it to not continue to do DMA. Don't touch >* devices in D3cold or unknown states. > - * If it is not a kexec reboot, firmware will hit the PCI > - * devices with big hammer and stop their DMA any way. > + * If this is kdump kernel, also turn off Bus Master, the device > + * could be activated by previous crashed kernel and may block > + * it's upstream from shutting down. > + * Else, firmware will hit the PCI devices with big hammer > + * and stop their DMA any way. >*/ > - if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot)) > + if ((kexec_in_progress || is_kdump_kernel()) && > + pci_dev->current_state <= PCI_D3hot) > pci_clear_master(pci_dev); I'm clearly missing something because this will turn off bus mastering in cases where we previously left it enabled. I was assuming the crash was related to a device doing DMA when the Root Port had bus mastering disabled. But that must be wrong. I'd like to
Re: kexec -e not working: root disk not able to detect
[+cc Jens, ahci.c maintainer] On Mon, Jan 06, 2020 at 05:24:44PM +0530, Prabhakar Kushwaha wrote: > Hi All, > > I am trying kexec -e with latest kernel i.e. Linux-5.5.0-rc4. Here > second kernel is not able to detect/mount hard-disk having root file > system (INTEL SSDSC2BB240G7). > > [ 279.690575] ata1: softreset failed (1st FIS failed) > [ 279.695446] ata1: limiting SATA link speed to 3.0 Gbps > [ 280.910020] ata1: SATA link down (SStatus 0 SControl 320) > [ 282.626018] ata1: SATA link down (SStatus 0 SControl 300) > [ 282.631409] ata1: link online but 1 devices misclassified, retrying > [ 282.637665] ata1: reset failed (errno=-11), retrying in 9 secs > [ 298.294546] ata1: failed to reset engine (errno=-5) > [ 302.042967] ata1: softreset failed (1st FIS failed) > [ 308.798609] ata1: failed to reset engine (errno=-5) > [ 337.546605] ata1: softreset failed (1st FIS failed) > [ 337.551475] ata1: limiting SATA link speed to 3.0 Gbps > [ 338.766022] ata1: SATA link down (SStatus 0 SControl 320) > [ 339.270943] ata1: EH pending after 5 tries, giving up > > I found following two workaround for this issue. > A) Define ".shutdown" in driver/ata/ahci.c. > > reboot --> kernel_kexec() --> kernel_restart_prepare() --> > device_shutdown() --> pci_device_shutdown() --> ahci_shutdown_one() > --> new function > > diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c > index 4bfd1b14b390..50a101002885 100644 > --- a/drivers/ata/ahci.c > +++ b/drivers/ata/ahci.c > @@ -81,6 +81,7 @@ enum board_ids { > > static int ahci_init_one(struct pci_dev *pdev, const struct > pci_device_id *ent); > static void ahci_remove_one(struct pci_dev *dev); > +static void ahci_shutdown_one(struct pci_dev *dev); > static int ahci_vt8251_hardreset(struct ata_link *link, unsigned int *class, > unsigned long deadline); > static int ahci_avn_hardreset(struct ata_link *link, unsigned int *class, > @@ -606,6 +607,7 @@ static struct pci_driver ahci_pci_driver = { > .id_table = ahci_pci_tbl, > .probe = ahci_init_one, > .remove = ahci_remove_one, > + .shutdown = ahci_shutdown_one, > .driver = { > .pm = &ahci_pci_pm_ops, > }, > > +static void ahci_shutdown_one(struct pci_dev *pdev) > +{ > + pm_runtime_get_noresume(&pdev->dev); > + ata_pci_remove_one(pdev); > +} > + > Note: After defining shutdown, error related to file-system write > seen. Looks like even after device_shutdown, file system related > transaction goes to disk. > > B)) Commenting of pci_clear_master() from pci_device_shutdown() > reboot --> kernel_kexec() --> kernel_restart_prepare() --> > device_shutdown() --> pci_device_shutdown() > > diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c > index 0454ca0e4e3f..ddffaa9321bb 100644 > --- a/drivers/pci/pci-driver.c > +++ b/drivers/pci/pci-driver.c > @@ -481,8 +481,10 @@ static void pci_device_shutdown(struct device *dev) > /* > * If this is a kexec reboot, turn off Bus Master bit on the > @@ -491,8 +493,16 @@ static void pci_device_shutdown(struct device *dev) > * If it is not a kexec reboot, firmware will hit the PCI > * devices with big hammer and stop their DMA any way. > */ > > - if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot)) > -pci_clear_master(pci_dev); I doubt we would remove this without a much clearer justification. > Here pci_dev current_state. It is "0" i.e. D0. > > From A and B. Looks like even after pci_clear_master(), Some DMA > transactions going on PCIe device causing device in unstable. > Not sure if this is the reason and how to solve this problem. Is it possible the ahci driver depends on receiving the device with bus mastering already enabled? I would guess that would be the common case in a non-kexec boot -- the BIOS probably hands off the device with bus mastering enabled. ahci_init_one() does turn on bus mastering itself (it calls pci_set_master()), but it's near the end, do if anything before that depends on DMA, it wouldn't work. And I don't know how adding a shutdown method would also be a workaround. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v4 7/8] linux/log2.h: Fix 64bit calculations in roundup/down_pow_two()
You got the "n" on "down" in the subject, but still missing "of" ;) On Tue, Dec 03, 2019 at 12:47:40PM +0100, Nicolas Saenz Julienne wrote: > Some users need to make sure their rounding function accepts and returns > 64bit long variables regardless of the architecture. Sadly > roundup/rounddown_pow_two() takes and returns unsigned longs. It turns > out ilog2() already handles 32/64bit calculations properly, and being > the building block to the round functions we can rework them as a > wrapper around it. Missing "of" in the function names here. s/a wrapper/wrappers/ IIUC the point of this is that roundup_pow_of_two() returned "unsigned long", which can be either 32 or 64 bits (worth pointing out, I think), and many callers need something that returns "unsigned long long" (always 64 bits). It's a nice simplification to remove the "__" variants. Just as a casual reader of this commit message, I'd like to know why we had both the roundup and the __roundup versions in the first place, and why we no longer need both. > -#define roundup_pow_of_two(n)\ > -(\ > - __builtin_constant_p(n) ? ( \ > - (n == 1) ? 1 : \ > - (1UL << (ilog2((n) - 1) + 1)) \ > -) : \ > - __roundup_pow_of_two(n) \ > - ) > +#define roundup_pow_of_two(n) \ > +( \ > + (__builtin_constant_p(n) && ((n) == 1)) ? \ > + 1 : (1ULL << (ilog2((n) - 1) + 1))\ > +) Should the resulting type of this expression always be a ULL, even when n==1, i.e., should it be this? 1ULL : (1ULL << (ilog2((n) - 1) + 1))\ Or maybe there's no case where that makes a difference? Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/2 v6] x86/kexec_file: add e820 entry in case e820 type string matches to io resource name
On Thu, Nov 15, 2018 at 4:40 AM Borislav Petkov wrote: > > + Bjorn. > > On Thu, Nov 15, 2018 at 01:44:07PM +0800, lijiang wrote: > > At present, the upstream kernel does not pass the e820 reserved ranges to > > the > > second kernel, which might cause two problems: > > > > The first one is the MMCONFIG issue, the PCI MMCONFIG(extended mode) > > requires > > the reserved region otherwise it falls back to legacy mode, which might > > lead to > > the hot-plug device could not be recognized in kdump kernel. > > Well, this still doesn't explain it fully. Let's look at a box: > > [0.00] e820: BIOS-provided physical RAM map: > [0.00] BIOS-e820: [mem 0x-0x000997ff] usable > [0.00] BIOS-e820: [mem 0x00099800-0x0009] reserved > [0.00] BIOS-e820: [mem 0x000e-0x000f] reserved > [0.00] BIOS-e820: [mem 0x0010-0x65642fff] usable > [0.00] BIOS-e820: [mem 0x65643000-0x67fb8fff] reserved > [0.00] BIOS-e820: [mem 0x67fb9000-0x689e8fff] ACPI NVS > [0.00] BIOS-e820: [mem 0x689e9000-0x68bf5fff] ACPI > data > [0.00] BIOS-e820: [mem 0x68bf6000-0x6f7f] usable > [0.00] BIOS-e820: [mem 0x6f80-0x8fff] reserved > [0.00] BIOS-e820: [mem 0xfd00-0xfe7f] reserved > [0.00] BIOS-e820: [mem 0xfec0-0xfec00fff] reserved > [0.00] BIOS-e820: [mem 0xfec8-0xfed00fff] reserved > [0.00] BIOS-e820: [mem 0xff80-0x0001007f] reserved > [0.00] BIOS-e820: [mem 0x00010080-0x00603fff] usable > > this one has 8 reserved regions. Does that mean that we need to pass > them *all* 8 to the second kernel so that MMCONFIG works? > > Or is it only one reserved region which is needed for MMCONFIG? > > Bjorn, do you know what the detection logic should be to map the correct > reserved region (or regions) for MMCONFIG? MMCONFIG (aka ECAM) space is described in the ACPI MCFG table. The generic code to read that is in drivers/acpi/pci_mcfg.c (ignore all the quirks at the top) and the generic code to use it is drivers/pci/ecam.c. Unfortunately x86 doesn't use any of that generic path. It uses the same MCFG table, but it's parsed in arch/x86/pci/mmconfig-shared.c, and the code there checks to ensure the ECAM regions are reserved somehow by firmware, e.g., via the e820 table. There's a bunch of grungy device-dependent code there, too, possibly to work around firmware defects, or (just as likely) to compensate for Linux defects that were *attributed* to firmware. I think you should regard correct MCFG/ECAM usage in the kdump kernel as a requirement. If you don't have ECAM (a) PCI devices won't work at all on non-x86 systems that use only ECAM for config access, (b) you won't be able to access devices on non-0 segments (granted, there aren't very many of these yet, but there will be more in the future), and (c) you won't be able to access extended config space (addresses 0x100-0xfff), which means none of the Extended Capabilities will be available (AER, ACS, ATS, etc). Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/3] x86/quirks: Scan all busses for early PCI quirks
On Mon, Oct 22, 2018 at 05:35:06PM -0300, Guilherme G. Piccoli wrote: > On 18/10/2018 19:15, Bjorn Helgaas wrote: > > On Thu, Oct 18, 2018 at 03:37:19PM -0300, Guilherme G. Piccoli wrote: > > [...] > I understand your point, but I think this is inherently an architecture > problem. No matter what solution we decide for, it'll need to be applied > in early boot time, like before the PCI layer gets initialized. This is the part I want to know more about. Apparently there's some event X between early_quirks() and the PCI device enumeration, and we must disable MSIs before X: setup_arch() early_quirks() # arch/x86/kernel/early-quirks.c early_pci_clear_msi() ... X ... pci_scan_root_bus_bridge() ... DECLARE_PCI_FIXUP_EARLY # drivers/pci/quirks.c I want to know specifically what X is. If we don't know what X is and all we know is "we have to disable MSIs earlier than PCI init", then we're likely to break things again in the future by changing the order of disabling MSIs and whatever X is. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/3] x86/quirks: Scan all busses for early PCI quirks
On Thu, Oct 18, 2018 at 03:37:19PM -0300, Guilherme G. Piccoli wrote: > Recently was noticed in an HP GEN9 system that kdump couldn't succeed > due to an irq storm coming from an Intel NIC, narrowed down to be lack > of clearing the MSI/MSI-X enable bits during the kdump kernel boot. > For that, we need an early quirk to manually turn off MSI/MSI-X for > PCI devices - this was worked as an optional boot parameter in a > subsequent patch. > > Problem is that in our test system, the Intel NICs were not present in > any secondary bus under the first PCIe root complex, so they couldn't > be reached by the recursion in check_dev_quirk(). Modern systems, > specially with multi-processors and multiple NUMA nodes expose multiple > root complexes, describing more than one PCI hierarchy domain. Currently > the simple recursion present in the early-quirks code from x86 starts a > descending recursion from bus :00, and reach many other busses by > navigating this hierarchy walking through the bridges. This is not > enough in systems with more than one root complex/host bridge, since > the recursion won't "traverse" to other root complexes by starting > statically in :00 (for more details, see [0]). > > This patch hence implements the full bus/device/function scan in > early_quirks(), by checking all possible busses instead of using a > recursion based on the first root bus or limiting the search scope to > the first 32 busses (like it was done in the beginning [1]). I don't want to expand the early quirk infrastructure unless there is absolutely no other way to solve this. The early quirk stuff is x86-specific, and it's not obvious that this problem is x86-only. This patch scans buses 0-255, but still only in domain 0, so it won't help with even more complicated systems that use other domains. I'm not an IRQ expert, but it seems wrong to me that we are enabling this interrupt before we're ready for it. The MSI should target an IOAPIC. Can't that IOAPIC entry be masked until later? I guess the kdump kernel doesn't know what MSI address the device might be using. Could the IRQ core be more tolerant of this somehow, e.g., if it notices incoming interrupts with no handler, could it disable the IOAPIC entry and fall back to polling periodically until a handler is added? > [0] https://bugs.launchpad.net/bugs/1797990 > > [1] From historical perspective, early PCI scan dates back > to BitKeeper, added by Andi Kleen's "[PATCH] APIC fixes for x86-64", > on October/2003. It initially restricted the search to the first > 32 busses and slots. > > Due to a potential bug found in Nvidia chipsets, the scan > was changed to run only in the first root bus: see > commit 8659c406ade3 ("x86: only scan the root bus in early PCI quirks") > > Finally, secondary busses reachable from the 1st bus were re-added back by: > commit 850c321027c2 ("x86/quirks: Reintroduce scanning of secondary buses") > > Reported-by: Dan Streetman > Signed-off-by: Guilherme G. Piccoli > --- > arch/x86/kernel/early-quirks.c | 13 ++--- > 1 file changed, 6 insertions(+), 7 deletions(-) > > diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c > index 50d5848bf22e..fd50f9e21623 100644 > --- a/arch/x86/kernel/early-quirks.c > +++ b/arch/x86/kernel/early-quirks.c > @@ -731,7 +731,6 @@ static int __init check_dev_quirk(int num, int slot, int > func) > u16 vendor; > u16 device; > u8 type; > - u8 sec; > int i; > > class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); > @@ -760,11 +759,8 @@ static int __init check_dev_quirk(int num, int slot, int > func) > type = read_pci_config_byte(num, slot, func, > PCI_HEADER_TYPE); > > - if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) { > - sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS); > - if (sec > num) > - early_pci_scan_bus(sec); > - } > + if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) > + return -1; > > if (!(type & 0x80)) > return -1; > @@ -787,8 +783,11 @@ static void __init early_pci_scan_bus(int bus) > > void __init early_quirks(void) > { > + int bus; > + > if (!early_pci_allowed()) > return; > > - early_pci_scan_bus(0); > + for (bus = 0; bus < 256; bus++) > + early_pci_scan_bus(bus); > } > -- > 2.19.0 > ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/3] x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
On Mon, Oct 15, 2018 at 12:51:38PM +0800, Dave Young wrote: > On 09/30/18 at 05:27pm, Dave Young wrote: > > On 09/30/18 at 05:21pm, Dave Young wrote: > > > On 09/27/18 at 09:21am, Bjorn Helgaas wrote: > > > > From: Bjorn Helgaas > > > > > > > > The only use of KEXEC_BACKUP_SRC_END is as an argument to > > > > walk_system_ram_res(): > > > > > > > > int crash_load_segments(struct kimage *image) > > > > { > > > > ... > > > > walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, > > > > image, determine_backup_region); > > > > > > > > walk_system_ram_res() expects "start, end" arguments that are inclusive, > > > > i.e., the range to be walked includes both the start and end addresses. > > > > > > Looking at the function comment of find_next_iomem_res, the res->end > > > should be exclusive, am I missing something? > > > > Oops, you fix it in 2nd patch, I apparently miss that. > > > > Since the fix of checking the end is in another patch, probably merge > > these two patches so that they are in one patch to avoid break bisect. > > Not sure if above comment was missed, Boris, would you mind to fold the > patch 1 and 2? Sorry, I did miss this comment. Patch 2 was for the very specific case of a single-byte resource at the end address, which we probably never see in practice. For patch 1, the find_next_iomem_res() function comment had "[res->start.res->end)", but I think the code actually treated it as "[res->start.res->end]", so the comment was inaccurate. Before my patches we had: #define KEXEC_BACKUP_SRC_START (0UL) #define KEXEC_BACKUP_SRC_END(640 * 1024UL)# 0xa The intention is to search for system RAM resources that intersect this region: [mem 0x0-0x9] The call is: walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, ..., determine_backup_region); walk_system_ram_res(0, 0xa, ..., determine_backup_region); Assume iomem_resource contains this system RAM resource: [mem 0x9-0xa] In find_next_iomem_res(), the "res" input parameter is the region to search: res->start = 0; # KEXEC_BACKUP_SRC_START res->end = 0xa;# KEXEC_BACKUP_SRC_END In one of the loop iterations we find the [mem 0x9-0xa] resource (p): p->start = 0x9; p->end = 0xa; if (p->start > end) # 0x9 > 0xa? false if (p->end >= start && p->start < end) # 0xa >= 0 ? true # 0x9 < 0xa ? true break; # so we'll return part of "p" if (res->start < p->start) # 0x0 < 0x9 ? true res->start = 0x9; # trim beginning to p->start if (res->end > p->end) # 0xa > 0xa ? false So find_next_iomem_res() returns with this: res->start = 0x9;# trimmed to p->start res->end = 0xa;# unchanged from input [mem 0x9-0xa]# returned resource (res) and we call determine_backup_region(res), which sets: image->arch.backup_src_start = 0x9; image->arch.backup_src_sz = resource_size(res) # 0xa - 0x9 + 1 # (0x10001) This is incorrect. What we wanted was the part of [mem 0x9-0xa] that intersects the first 640K, i.e., [mem 0x9-0x9], but what we got was [mem 0x9-0xa], which is one byte too long. The resource returned find_next_iomem_res() always ends at the "res->end" supplied as an input parameter *unless* the input res->end is strictly greater than the p->end, when it is truncated to p->end. Bottom line, I don't think patches 1 and 2 need to be folded together because they fix different problems. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 3/3] resource: Fix find_next_iomem_res() iteration issue
On Fri, Sep 28, 2018 at 11:42 AM Borislav Petkov wrote: > > On Thu, Sep 27, 2018 at 09:22:09AM -0500, Bjorn Helgaas wrote: > > From: Bjorn Helgaas > > > > Previously find_next_iomem_res() used "*res" as both an input parameter for > > the range to search and the type of resource to search for, and an output > > parameter for the resource we found, which makes the interface confusing. > > > > The current callers use find_next_iomem_res() incorrectly because they > > allocate a single struct resource and use it for repeated calls to > > find_next_iomem_res(). When find_next_iomem_res() returns a resource, it > > overwrites the start, end, flags, and desc members of the struct. If we > > call find_next_iomem_res() again, we must update or restore these fields. > > The previous code restored res.start and res.end, but not res.flags or > > res.desc. > > ... which is a sure sign that the design of this thing is not the best one. > > > > > Since the callers did not restore res.flags, if they searched for flags > > IORESOURCE_MEM | IORESOURCE_BUSY and found a resource with flags > > IORESOURCE_MEM | IORESOURCE_BUSY | IORESOURCE_SYSRAM, the next search would > > incorrectly skip resources unless they were also marked as > > IORESOURCE_SYSRAM. > > Nice example! > > > Fix this by restructuring the interface so it takes explicit "start, end, > > flags" parameters and uses "*res" only as an output parameter. > > > > Original-patch: > > http://lore.kernel.org/lkml/20180921073211.20097-2-liji...@redhat.com > > Based-on-patch-by: Lianbo Jiang > > Signed-off-by: Bjorn Helgaas > > --- > > kernel/resource.c | 94 > > +++-- > > 1 file changed, 41 insertions(+), 53 deletions(-) > > > > diff --git a/kernel/resource.c b/kernel/resource.c > > index 155ec873ea4d..9891ea90cc8f 100644 > > --- a/kernel/resource.c > > +++ b/kernel/resource.c > > @@ -319,23 +319,26 @@ int release_resource(struct resource *old) > > EXPORT_SYMBOL(release_resource); > > > > /* > > I guess this could be made kernel-doc, since you're touching it... > > > - * Finds the lowest iomem resource existing within [res->start..res->end]. > > - * The caller must specify res->start, res->end, res->flags, and optionally > > - * desc. If found, returns 0, res is overwritten, if not found, returns > > -1. > > - * This function walks the whole tree and not just first level children > > until > > - * and unless first_level_children_only is true. > > + * Finds the lowest iomem resource that covers part of [start..end]. The > > + * caller must specify start, end, flags, and desc (which may be > > + * IORES_DESC_NONE). > > + * > > + * If a resource is found, returns 0 and *res is overwritten with the part > > + * of the resource that's within [start..end]; if none is found, returns > > + * -1. > > + * > > + * This function walks the whole tree and not just first level children > > + * unless first_level_children_only is true. > > ... and then prepend that with '@' - @first_level_children_only to refer > to the function parameter. > > > */ > > -static int find_next_iomem_res(struct resource *res, unsigned long desc, > > -bool first_level_children_only) > > +static int find_next_iomem_res(resource_size_t start, resource_size_t end, > > +unsigned long flags, unsigned long desc, > > +bool first_level_children_only, > > +struct resource *res) > > { > > - resource_size_t start, end; > > struct resource *p; > > bool sibling_only = false; > > > > BUG_ON(!res); > > - > > - start = res->start; > > - end = res->end; > > BUG_ON(start >= end); > > And since we're touching this, maybe replace that BUG_ON() fun with > simply return -EINVAL or some error code... > > > > > if (first_level_children_only) > > if (first_level_children_only) > sibling_only = true; > > So this is just silly - replacing a bool function param with a local bool > var. > > You could kill that, shorten first_level_children_only's name and use it > directly. > > Depending on how much cleanup it amounts to, you could make that a > separate cleanup patch ontop, to keep the changes from the cleanup > separate. > > > @@ -344,7 +347,7 @@ static in
[PATCH 3/3] resource: Fix find_next_iomem_res() iteration issue
From: Bjorn Helgaas Previously find_next_iomem_res() used "*res" as both an input parameter for the range to search and the type of resource to search for, and an output parameter for the resource we found, which makes the interface confusing. The current callers use find_next_iomem_res() incorrectly because they allocate a single struct resource and use it for repeated calls to find_next_iomem_res(). When find_next_iomem_res() returns a resource, it overwrites the start, end, flags, and desc members of the struct. If we call find_next_iomem_res() again, we must update or restore these fields. The previous code restored res.start and res.end, but not res.flags or res.desc. Since the callers did not restore res.flags, if they searched for flags IORESOURCE_MEM | IORESOURCE_BUSY and found a resource with flags IORESOURCE_MEM | IORESOURCE_BUSY | IORESOURCE_SYSRAM, the next search would incorrectly skip resources unless they were also marked as IORESOURCE_SYSRAM. Fix this by restructuring the interface so it takes explicit "start, end, flags" parameters and uses "*res" only as an output parameter. Original-patch: http://lore.kernel.org/lkml/20180921073211.20097-2-liji...@redhat.com Based-on-patch-by: Lianbo Jiang Signed-off-by: Bjorn Helgaas --- kernel/resource.c | 94 +++-- 1 file changed, 41 insertions(+), 53 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 155ec873ea4d..9891ea90cc8f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -319,23 +319,26 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); /* - * Finds the lowest iomem resource existing within [res->start..res->end]. - * The caller must specify res->start, res->end, res->flags, and optionally - * desc. If found, returns 0, res is overwritten, if not found, returns -1. - * This function walks the whole tree and not just first level children until - * and unless first_level_children_only is true. + * Finds the lowest iomem resource that covers part of [start..end]. The + * caller must specify start, end, flags, and desc (which may be + * IORES_DESC_NONE). + * + * If a resource is found, returns 0 and *res is overwritten with the part + * of the resource that's within [start..end]; if none is found, returns + * -1. + * + * This function walks the whole tree and not just first level children + * unless first_level_children_only is true. */ -static int find_next_iomem_res(struct resource *res, unsigned long desc, - bool first_level_children_only) +static int find_next_iomem_res(resource_size_t start, resource_size_t end, + unsigned long flags, unsigned long desc, + bool first_level_children_only, + struct resource *res) { - resource_size_t start, end; struct resource *p; bool sibling_only = false; BUG_ON(!res); - - start = res->start; - end = res->end; BUG_ON(start >= end); if (first_level_children_only) @@ -344,7 +347,7 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, read_lock(&resource_lock); for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { - if ((p->flags & res->flags) != res->flags) + if ((p->flags & flags) != flags) continue; if ((desc != IORES_DESC_NONE) && (desc != p->desc)) continue; @@ -359,32 +362,31 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, read_unlock(&resource_lock); if (!p) return -1; + /* copy data */ - if (res->start < p->start) - res->start = p->start; - if (res->end > p->end) - res->end = p->end; + res->start = max(start, p->start); + res->end = min(end, p->end); res->flags = p->flags; res->desc = p->desc; return 0; } -static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, -bool first_level_children_only, -void *arg, +static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, +unsigned long flags, unsigned long desc, +bool first_level_children_only, void *arg, int (*func)(struct resource *, void *)) { - u64 orig_end = res->end; + struct resource res; int ret = -1; - while ((res->start < res->end) && - !find_next_iomem_res(res, desc, first_level_children_only)) { - ret = (*func)(res, arg); + wh
[PATCH 2/3] resource: Include resource end in walk_*() interfaces
From: Bjorn Helgaas find_next_iomem_res() finds an iomem resource that covers part of a range described by "start, end". All callers expect that range to be inclusive, i.e., both start and end are included, but find_next_iomem_res() doesn't handle the end address correctly. If it finds an iomem resource that contains exactly the end address, it skips it, e.g., if "start, end" is [0x0-0x1] and there happens to be an iomem resource [mem 0x1-0x1] (the single byte at 0x1), we skip it: find_next_iomem_res(...) { start = 0x0; end = 0x1; for (p = next_resource(...)) { # p->start = 0x1; # p->end = 0x1; # we *should* return this resource, but this condition is false: if ((p->end >= start) && (p->start < end)) break; Adjust find_next_iomem_res() so it allows a resource that includes the single byte at the end of the range. This is a corner case that we probably don't see in practice. Fixes: 58c1b5b07907 ("[PATCH] memory hotadd fixes: find_next_system_ram catch range fix") Signed-off-by: Bjorn Helgaas --- kernel/resource.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 30e1bc68503b..155ec873ea4d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -319,7 +319,7 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); /* - * Finds the lowest iomem resource existing within [res->start.res->end). + * Finds the lowest iomem resource existing within [res->start..res->end]. * The caller must specify res->start, res->end, res->flags, and optionally * desc. If found, returns 0, res is overwritten, if not found, returns -1. * This function walks the whole tree and not just first level children until @@ -352,7 +352,7 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, p = NULL; break; } - if ((p->end >= start) && (p->start < end)) + if ((p->end >= start) && (p->start <= end)) break; } ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 0/3] find_next_iomem_res() fixes
These fix: - A kexec off-by-one error that we probably never see in practice (only affects a resource starting at exactly 640KB). - A corner case in walk_iomem_res_desc(), walk_system_ram_res(), etc that we probably also never see in practice (only affects a single byte resource at the very end of the region we're searching) - An issue in walk_iomem_res_desc() that apparently causes a kdump issue (see Lianbo's note at [1]). I think we need to fix the kdump issue either by these patches (specifically the last one) or by the patch Lianbo posted [2]. I'm hoping to avoid deciding and merging these myself, but I'm not sure who really wants to own kernel/resource.c. [1] https://lore.kernel.org/lkml/01551d06-c421-5df3-b19f-fc66f3639...@redhat.com [2] https://lore.kernel.org/lkml/20180921073211.20097-2-liji...@redhat.com --- Bjorn Helgaas (3): x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error resource: Include resource end in walk_*() interfaces resource: Fix find_next_iomem_res() iteration issue arch/x86/include/asm/kexec.h |2 - kernel/resource.c| 96 ++ 2 files changed, 43 insertions(+), 55 deletions(-) ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 1/3] x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
From: Bjorn Helgaas The only use of KEXEC_BACKUP_SRC_END is as an argument to walk_system_ram_res(): int crash_load_segments(struct kimage *image) { ... walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, image, determine_backup_region); walk_system_ram_res() expects "start, end" arguments that are inclusive, i.e., the range to be walked includes both the start and end addresses. KEXEC_BACKUP_SRC_END was previously defined as (640 * 1024UL), which is the first address *past* the desired 0-640KB range. Define KEXEC_BACKUP_SRC_END as (640 * 1024UL - 1) so the KEXEC_BACKUP_SRC region is [0-0x9], not [0-0xa]. Fixes: dd5f726076cc ("kexec: support for kexec on panic using new system call") Signed-off-by: Bjorn Helgaas --- arch/x86/include/asm/kexec.h |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index f327236f0fa7..5125fca472bb 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -67,7 +67,7 @@ struct kimage; /* Memory to backup during crash kdump */ #define KEXEC_BACKUP_SRC_START (0UL) -#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */ +#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */ /* * CPU does not save ss and sp on stack if execution is already ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 3/3] resource: Fix find_next_iomem_res() iteration issue
On Thu, Sep 27, 2018 at 01:27:41PM +0800, lijiang wrote: > 在 2018年09月25日 06:15, Bjorn Helgaas 写道: > > From: Bjorn Helgaas > > > > Previously find_next_iomem_res() used "*res" as both an input parameter for > > the range to search and the type of resource to search for, and an output > > parameter for the resource we found, which makes the interface confusing > > and hard to use correctly. > > > > All callers allocate a single struct resource and use it for repeated calls > > to find_next_iomem_res(). When find_next_iomem_res() returns a resource, > > it overwrites the start, end, flags, and desc members of the struct. If we > > call find_next_iomem_res() again, we must update or restore these fields. > > > > The callers (__walk_iomem_res_desc() and walk_system_ram_range()) do not > > restore res->flags, so if the caller is searching for flags of > > IORESOURCE_MEM | IORESOURCE_BUSY and finds a resource with flags of > > IORESOURCE_MEM | IORESOURCE_BUSY | IORESOURCE_SYSRAM, the next search will > > find only resources marked as IORESOURCE_SYSRAM. > > > > Fix this by restructuring the interface so it takes explicit "start, end, > > flags" parameters and uses "*res" only as an output parameter. > > Hi, Bjorn > I personally suggest that some comments might be added in the code, make it > clear > and easy to understand, then which could avoid the old confusion and more > code changes. Since I think the current interface (using *res as both input and output parameters that have very different meanings) is confusing, it's hard for *me* to write comments that make it less confusing, but of course, you're welcome to propose something. My opinion (probably not universally shared) is that my proposal would make the code more readable, and it's worth doing even though the diff is larger. Anyway, I'll post these patches independently and see if anybody else has an opinion. Bjorn > > Original-patch: > > http://lore.kernel.org/lkml/20180921073211.20097-2-liji...@redhat.com > > Based-on-patch-by: Lianbo Jiang > > Signed-off-by: Bjorn Helgaas > > --- > > kernel/resource.c | 94 > > +++-- > > 1 file changed, 41 insertions(+), 53 deletions(-) > > > > diff --git a/kernel/resource.c b/kernel/resource.c > > index 155ec873ea4d..9891ea90cc8f 100644 > > --- a/kernel/resource.c > > +++ b/kernel/resource.c > > @@ -319,23 +319,26 @@ int release_resource(struct resource *old) > > EXPORT_SYMBOL(release_resource); > > > > /* > > - * Finds the lowest iomem resource existing within [res->start..res->end]. > > - * The caller must specify res->start, res->end, res->flags, and optionally > > - * desc. If found, returns 0, res is overwritten, if not found, returns > > -1. > > - * This function walks the whole tree and not just first level children > > until > > - * and unless first_level_children_only is true. > > + * Finds the lowest iomem resource that covers part of [start..end]. The > > + * caller must specify start, end, flags, and desc (which may be > > + * IORES_DESC_NONE). > > + * > > + * If a resource is found, returns 0 and *res is overwritten with the part > > + * of the resource that's within [start..end]; if none is found, returns > > + * -1. > > + * > > + * This function walks the whole tree and not just first level children > > + * unless first_level_children_only is true. > > */ > > -static int find_next_iomem_res(struct resource *res, unsigned long desc, > > - bool first_level_children_only) > > +static int find_next_iomem_res(resource_size_t start, resource_size_t end, > > + unsigned long flags, unsigned long desc, > > + bool first_level_children_only, > > + struct resource *res) > > { > > - resource_size_t start, end; > > struct resource *p; > > bool sibling_only = false; > > > > BUG_ON(!res); > > - > > - start = res->start; > > - end = res->end; > > BUG_ON(start >= end); > > > > if (first_level_children_only) > > @@ -344,7 +347,7 @@ static int find_next_iomem_res(struct resource *res, > > unsigned long desc, > > read_lock(&resource_lock); > > > > for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { > > - if ((p->flags & res->flags) != res->flags) > > + if ((p->flags &
[PATCH 3/3] resource: Fix find_next_iomem_res() iteration issue
From: Bjorn Helgaas Previously find_next_iomem_res() used "*res" as both an input parameter for the range to search and the type of resource to search for, and an output parameter for the resource we found, which makes the interface confusing and hard to use correctly. All callers allocate a single struct resource and use it for repeated calls to find_next_iomem_res(). When find_next_iomem_res() returns a resource, it overwrites the start, end, flags, and desc members of the struct. If we call find_next_iomem_res() again, we must update or restore these fields. The callers (__walk_iomem_res_desc() and walk_system_ram_range()) do not restore res->flags, so if the caller is searching for flags of IORESOURCE_MEM | IORESOURCE_BUSY and finds a resource with flags of IORESOURCE_MEM | IORESOURCE_BUSY | IORESOURCE_SYSRAM, the next search will find only resources marked as IORESOURCE_SYSRAM. Fix this by restructuring the interface so it takes explicit "start, end, flags" parameters and uses "*res" only as an output parameter. Original-patch: http://lore.kernel.org/lkml/20180921073211.20097-2-liji...@redhat.com Based-on-patch-by: Lianbo Jiang Signed-off-by: Bjorn Helgaas --- kernel/resource.c | 94 +++-- 1 file changed, 41 insertions(+), 53 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 155ec873ea4d..9891ea90cc8f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -319,23 +319,26 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); /* - * Finds the lowest iomem resource existing within [res->start..res->end]. - * The caller must specify res->start, res->end, res->flags, and optionally - * desc. If found, returns 0, res is overwritten, if not found, returns -1. - * This function walks the whole tree and not just first level children until - * and unless first_level_children_only is true. + * Finds the lowest iomem resource that covers part of [start..end]. The + * caller must specify start, end, flags, and desc (which may be + * IORES_DESC_NONE). + * + * If a resource is found, returns 0 and *res is overwritten with the part + * of the resource that's within [start..end]; if none is found, returns + * -1. + * + * This function walks the whole tree and not just first level children + * unless first_level_children_only is true. */ -static int find_next_iomem_res(struct resource *res, unsigned long desc, - bool first_level_children_only) +static int find_next_iomem_res(resource_size_t start, resource_size_t end, + unsigned long flags, unsigned long desc, + bool first_level_children_only, + struct resource *res) { - resource_size_t start, end; struct resource *p; bool sibling_only = false; BUG_ON(!res); - - start = res->start; - end = res->end; BUG_ON(start >= end); if (first_level_children_only) @@ -344,7 +347,7 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, read_lock(&resource_lock); for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { - if ((p->flags & res->flags) != res->flags) + if ((p->flags & flags) != flags) continue; if ((desc != IORES_DESC_NONE) && (desc != p->desc)) continue; @@ -359,32 +362,31 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, read_unlock(&resource_lock); if (!p) return -1; + /* copy data */ - if (res->start < p->start) - res->start = p->start; - if (res->end > p->end) - res->end = p->end; + res->start = max(start, p->start); + res->end = min(end, p->end); res->flags = p->flags; res->desc = p->desc; return 0; } -static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, -bool first_level_children_only, -void *arg, +static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, +unsigned long flags, unsigned long desc, +bool first_level_children_only, void *arg, int (*func)(struct resource *, void *)) { - u64 orig_end = res->end; + struct resource res; int ret = -1; - while ((res->start < res->end) && - !find_next_iomem_res(res, desc, first_level_children_only)) { - ret = (*func)(res, arg); + while (start < end && + !find_next_iomem_res(start,
[PATCH 2/3] resource: Include resource end in walk_*() interfaces
From: Bjorn Helgaas find_next_iomem_res() finds an iomem resource that covers part of a range described by "start, end". All callers expect that range to be inclusive, i.e., both start and end are included, but find_next_iomem_res() doesn't handle the end address correctly. If it finds an iomem resource that contains exactly the end address, it skips it, e.g., if "start, end" is [0x0-0x1] and there happens to be an iomem resource [mem 0x1-0x1] (the single byte at 0x1), we skip it: find_next_iomem_res(...) { start = 0x0; end = 0x1; for (p = next_resource(...)) { # p->start = 0x1; # p->end = 0x1; # we *should* return this resource, but this condition is false: if ((p->end >= start) && (p->start < end)) break; Adjust find_next_iomem_res() so it allows a resource that includes the single byte at the end of the range. This is a corner case that we probably don't see in practice. Fixes: 58c1b5b07907 ("[PATCH] memory hotadd fixes: find_next_system_ram catch range fix") Signed-off-by: Bjorn Helgaas --- kernel/resource.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 30e1bc68503b..155ec873ea4d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -319,7 +319,7 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); /* - * Finds the lowest iomem resource existing within [res->start.res->end). + * Finds the lowest iomem resource existing within [res->start..res->end]. * The caller must specify res->start, res->end, res->flags, and optionally * desc. If found, returns 0, res is overwritten, if not found, returns -1. * This function walks the whole tree and not just first level children until @@ -352,7 +352,7 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, p = NULL; break; } - if ((p->end >= start) && (p->start < end)) + if ((p->end >= start) && (p->start <= end)) break; } ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 0/3] find_next_iomem_res() fixes
Hi Lianbo, These three patches are a possible replacement for your first patch ("[PATCH 1/3 v3] resource: fix an error which walks through iomem resources"). I think the interface of find_next_iomem_res() can be improved to make the code easier to read and also avoid the errors you're fixing. I can't test these, so they've only been compiled. If you can test them and if you like them, feel free to incorporate them into your series. If not, just drop them (but please at least fix the same error in walk_system_ram_range()). --- Bjorn Helgaas (3): x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error resource: Include resource end in walk_*() interfaces resource: Fix find_next_iomem_res() iteration issue arch/x86/include/asm/kexec.h |2 - kernel/resource.c| 96 ++ 2 files changed, 43 insertions(+), 55 deletions(-) ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 1/3] x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
From: Bjorn Helgaas The only use of KEXEC_BACKUP_SRC_END is as an argument to walk_system_ram_res(): int crash_load_segments(struct kimage *image) { ... walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, image, determine_backup_region); walk_system_ram_res() expects "start, end" arguments that are inclusive, i.e., the range to be walked includes both the start and end addresses. KEXEC_BACKUP_SRC_END was previously defined as (640 * 1024UL), which is the first address *past* the desired 0-64KB range. Define KEXEC_BACKUP_SRC_END as (640 * 1024UL - 1) so the KEXEC_BACKUP_SRC region is [0-0x], not [0-0x1]. Fixes: dd5f726076cc ("kexec: support for kexec on panic using new system call") Signed-off-by: Bjorn Helgaas --- arch/x86/include/asm/kexec.h |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index f327236f0fa7..5125fca472bb 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -67,7 +67,7 @@ struct kimage; /* Memory to backup during crash kdump */ #define KEXEC_BACKUP_SRC_START (0UL) -#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */ +#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */ /* * CPU does not save ss and sp on stack if execution is already ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/3 v3] resource: fix an error which walks through iomem resources
On Fri, Sep 21, 2018 at 03:32:09PM +0800, Lianbo Jiang wrote: > When we walk through iomem resources by calling walk_iomem_res_desc(), > the values of the function parameter may be modified in the while loop > of __walk_iomem_res_desc(), which will cause us to not get the desired > result in some cases. If I understand correctly, the issue is caused by the interaction between __walk_iomem_res_desc() and find_next_iomem_res() in this path: __walk_iomem_res_desc find_next_iomem_res res->flags = p->flags;# <-- problem This path is used by the following interfaces, and I think your patch would fix the issue for them: walk_iomem_res_desc() walk_system_ram_res() walk_mem_res() However, find_next_iomem_res() is also used directly by walk_system_ram_range(). I think that path has the same problem, and your patch does not fix that path. I have a few more comments related to the existing code that I'll post soon. > At present, it only restores the original value of res->end, but it > doesn't restore the original value of res->flags in the while loop of > __walk_iomem _res_desc(). Whenever the find_next_iomem_res() finds a > resource and returns the result, the original values of this resource > will be modified, which might lead to an error in the next loop. For > example: > > The original value of resource flags is: > res->flags=0x8200(initial value) > > p->flags _ 0x81000200 __ 0x8200 _ > / \ / \ > ||___A||_._|__B_|..___| > 00x > (memory address ranges) > > Note: if ((p->flags & res->flags) != res->flags) continue; > > When the resource A is found, the original value of this resource flags > will be changed to 0x81000200(res->flags=0x81000200), and continue to > look for the next resource, when the loop reaches resource B, it can not > get the resource B any more(you can refer to the for loop of find_next > _iomem_res()), because the value of conditional expression will become > true and will also jump the resource B. > > In fact, we should get the resource A and B when we walk through the > whole tree, but it only gets the resource A, the resource B is missed. > > Signed-off-by: Lianbo Jiang > --- > kernel/resource.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/kernel/resource.c b/kernel/resource.c > index 30e1bc68503b..f5d9fc70a04c 100644 > --- a/kernel/resource.c > +++ b/kernel/resource.c > @@ -375,6 +375,7 @@ static int __walk_iomem_res_desc(struct resource *res, > unsigned long desc, >int (*func)(struct resource *, void *)) > { > u64 orig_end = res->end; > + u64 orig_flags = res->flags; > int ret = -1; > > while ((res->start < res->end) && > @@ -385,6 +386,7 @@ static int __walk_iomem_res_desc(struct resource *res, > unsigned long desc, > > res->start = res->end + 1; > res->end = orig_end; > + res->flags = orig_flags; > } > > return ret; ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: pciehp 0000:00:1c.0:pcie004: Timeout on hotplug command 0x1038 (issued 65284 msec ago)
On Tue, May 01, 2018 at 01:59:20PM +0100, Marc Zyngier wrote: > On 01/05/18 13:38, Sinan Kaya wrote: > > +Marc, > > > > On 4/30/2018 5:27 PM, Sinan Kaya wrote: > >> On 4/30/2018 5:17 PM, Bjorn Helgaas wrote: > >>>> What should we do about this? > >>>> > >>>> Since there is an actual HW errata involved, should we quirk this > >>>> root port and not wait as if remove/shutdown doesn't exist? > >>> I was hoping to avoid a quirk because AFAIK all Intel parts have this > >>> issue so it will be an ongoing maintenance issue. I tried to avoid > >>> the timeout delays, e.g., with 40b960831cfa ("PCI: pciehp: Compute > >>> timeout from hotplug command start time"). > >>> > >>> But we still see the alarming messages, so we should probably add a > >>> quirk to get rid of those. > >>> > >>> But I haven't given up on the idea of getting rid of the > >>> pciehp_remove() path. I'm not convinced yet that we actually need to > >>> do anything to shut this device down. I don't like the assumption > >>> that kexec requires this. The kexec is fundamentally just a branch, > >>> and anything we do before the branch (i.e., in the old kernel), we > >>> should also be able to do after the branch (i.e., in the kexec-ed > >>> kernel). > >>> > >> > >> In my experience with kexec, MSI type edge interrupts are harmless. > >> You might just see a few unhandled interrupt messages during boot > >> if something is pending from the first kernel. > > Unfortunately, that's not always the case. > > A number of GICv3/v4 implementations (a very common interrupt controller > on ARM servers) cannot be disabled, which means they will keep writing > to their pending tables long after kexec will have started the new > kernel. And since we don't track memory allocation across kexec, you > end-up with significant chances of observing single bit corruption as > interrupts carry on being delivered. Oh, and you won't actually be able > to take MSIs because you can't even reprogram the damn thing. > > Yes, this can be considered a HW bug. > > >> It is the level interrupts that are more concerning. It remains pending > >> until the interrupt source is cleared. CPU never returns from the > >> interrupt handler to actually continue booting the second kernel. > > > > This makes me wonder why kexec doesn't disable all interrupt sources by > > itself instead of relying on the drivers shutdown routine. Some drivers > > don't even have a shutdown callback. Kexec could have done both as another > > example. Something like. > > > > 1. Call shutdown for all drivers if available. > > 2. Disable all interrupt sources in the interrupt controller > > 3. Start the new kernel. > > See above. Although you can shut off the end-point and to some extent > mask interrupts before jumping into the payload, it is not always > possible to go back to a reasonable state where you can take actually MSIs. This is exactly the sort of thing it would be nice to collect and document as part of the background of "why kexec works the way it does." It certainly helps explain things that are far from obvious if you don't have the background. ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: pciehp 0000:00:1c.0:pcie004: Timeout on hotplug command 0x1038 (issued 65284 msec ago)
On Mon, Apr 30, 2018 at 04:48:15PM -0400, Sinan Kaya wrote: > Bjorn, > > On 4/28/2018 9:03 AM, ok...@codeaurora.org wrote: > >> Hmm, if it is the remove() method then kexec does not use it. kexec use > >> the shutdown() method instead. I missed this details when I replied. > > > > Portdrv hooks up remove handler to shutdown. That's why remove is getting > > called. > > What should we do about this? > > Since there is an actual HW errata involved, should we quirk this > root port and not wait as if remove/shutdown doesn't exist? I was hoping to avoid a quirk because AFAIK all Intel parts have this issue so it will be an ongoing maintenance issue. I tried to avoid the timeout delays, e.g., with 40b960831cfa ("PCI: pciehp: Compute timeout from hotplug command start time"). But we still see the alarming messages, so we should probably add a quirk to get rid of those. But I haven't given up on the idea of getting rid of the pciehp_remove() path. I'm not convinced yet that we actually need to do anything to shut this device down. I don't like the assumption that kexec requires this. The kexec is fundamentally just a branch, and anything we do before the branch (i.e., in the old kernel), we should also be able to do after the branch (i.e., in the kexec-ed kernel). > Paul, > You might want to file a bugzilla so that we can keep our debug > efforts out of this list. ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: pciehp 0000:00:1c.0:pcie004: Timeout on hotplug command 0x1038 (issued 65284 msec ago)
[+cc Eric, Vivek, kexec list] On Fri, Apr 27, 2018 at 03:34:30PM -0400, Sinan Kaya wrote: > On 4/27/2018 3:22 PM, Bjorn Helgaas wrote: > > Sinan mooted the idea of using a "no-wait" path of sending the "don't > > generate hotplug interrupts" command. I think we should work on this > > idea a little more. If we're shutting down the whole system, I can't > > believe there's much value in *anything* we do in the pciehp_remove() > > path. > > > > Maybe we should just get rid of pciehp_remove() (and probably > > pcie_port_remove_service() and the other service driver remove methods) > > completely. That dates from when the service drivers could be modules that > > could be potentially unloaded, but unloading them hasn't been possible for > > years. > > Shutdown path is also used for kexec. Leaving hotplug interrupts > pending is dangerous for the newly loaded kernel as it leaves > spurious interrupts during the new kernel boot. > > I think we should always disable the hotplug interrupt on shutdown. > We might think of not waiting for command-completion as a > middle-ground or go to polling path instead of interrupts all the > time. Ah, I forgot about the kexec path. The kexec path is used for crashdump, too, so ideally the newly-loaded kernel would defend itself when possible so it doesn't depend on the original kernel doing things correctly. Seems like this question of whether to do things in the original kernel or the kexec-ed kernel comes up periodically, but I can never remember a definitive answer. My initial reaction is that it'd be nice if we didn't have to do *any* shutdown in the original kernel, but I'm sure there are reasons that's not practical. I copied Eric (kexec maintainer) and Vivek (contact listed in Documentation/kdump/kdump.txt) in case they have suggestions or would consider some sort of Documentation/ update. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v1 0/2] kexec: Remove "weak" annotations from headers
On Fri, Apr 13, 2018 at 11:08:20AM +0200, Philipp Rudo wrote: > Hi Bjorn, > > in recent patches AKASHI [1] and I [2] made some changes to the declarations > you are touching and already removed some of the weak statements. The patches > got accepted on linux-next and will (hopefully) be pulled for v4.17. So you > should prepare for some merge conflicts. Nevertheless three weak statements > still remain (arch_kexec_walk_mem & arch_kexec_apply_relocations*) so your > patch still makes totally sense. Thanks, I see those changes in v4.17-rc1. I'll update my patches and post a v2. ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v1 0/2] kexec: Remove "weak" annotations from headers
On Fri, Apr 13, 2018 at 05:29:08PM +0800, Baoquan He wrote: > Hi Bjorn, > > There are changes I have made to solve 5-level conflict with > kexec/kdump and also interface unification task, they will involve x86 > 64 only changes on these functions, I don't think we need remove them if > without any obvious impact or error reported. Removing the weak attribute from the declaration in the header file does not prevent you from defining a weak function later in the .c file. We should remove the weak attribute from the header file declaration because it can lead to non-obvious errors, e.g., calling the wrong version of the function. There's no build-time or run-time indication that this happens, so it's a real trap. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v1 2/2] kexec: Remove "weak" from arch_kexec_walk_mem() declaration
From: Bjorn Helgaas Weak header file declarations are error-prone because they make every definition weak, and the linker chooses one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). kernel/kexec_file.c contains a weak definition of arch_kexec_walk_mem() and arch/powerpc/kernel/machine_kexec_file_64.c contains a definition intended to be non-weak. But the annotation in the header file makes *both* definitions weak, so it's unclear which one will be used. Remove the "weak" attribute from the declaration so we always prefer a non-weak definition over the weak one. Fixes: 60fe3910bb02 ("kexec_file: Allow arch-specific memory walking for kexec_add_buffer") Signed-off-by: Bjorn Helgaas CC: sta...@vger.kernel.org # v4.10+ --- include/linux/kexec.h |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 8bf0ff90885c..e7db550c5fb6 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -159,8 +159,8 @@ struct kexec_buf { bool top_down; }; -int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)); +int arch_kexec_walk_mem(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)); extern int kexec_add_buffer(struct kexec_buf *kbuf); int kexec_locate_mem_hole(struct kexec_buf *kbuf); #endif /* CONFIG_KEXEC_FILE */ ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v1 0/2] kexec: Remove "weak" annotations from headers
"Weak" annotations in header files are error-prone because they make every definition weak. Remove them from include/linux/kexec.h. These were introduced in two separate commits, so this is in two patches so they can be easily backported to stable kernels (some of them date back to v4.3 and one only goes back to v4.10). --- Bjorn Helgaas (2): kexec: Remove "weak" from kexec_file function declarations kexec: Remove "weak" from arch_kexec_walk_mem() declaration include/linux/kexec.h | 24 1 file changed, 12 insertions(+), 12 deletions(-) ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v1 1/2] kexec: Remove "weak" from kexec_file function declarations
From: Bjorn Helgaas Weak header file declarations are error-prone because they make every definition weak, and the linker chooses one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). For the following functions: arch_kexec_kernel_image_probe() arch_kexec_kernel_image_load() arch_kimage_file_post_load_cleanup() arch_kexec_kernel_verify_sig() arch_kexec_apply_relocations_add() arch_kexec_apply_relocations() kernel/kexec_file.c contains weak definitions, and x86 and powerpc arch code contains definitions intended to be non-weak. But the annotations in the header file make *all* the definitions weak, so it's unclear which ones will be used. Remove the "weak" attribute from the declarations so we always prefer non-weak definitions over the weak ones. Fixes: a43cac0d9dc2 ("kexec: split kexec_file syscall code to kexec_file.c") Signed-off-by: Bjorn Helgaas CC: sta...@vger.kernel.org # v4.3+ --- include/linux/kexec.h | 20 ++-- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index f16f6ceb3875..8bf0ff90885c 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -277,16 +277,16 @@ int crash_shrink_memory(unsigned long new_size); size_t crash_get_memory_size(void); void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); -int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, -unsigned long buf_len); -void * __weak arch_kexec_kernel_image_load(struct kimage *image); -int __weak arch_kimage_file_post_load_cleanup(struct kimage *image); -int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, - unsigned long buf_len); -int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, - Elf_Shdr *sechdrs, unsigned int relsec); -int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec); +int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len); +void *arch_kexec_kernel_image_load(struct kimage *image); +int arch_kimage_file_post_load_cleanup(struct kimage *image); +int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, +unsigned long buf_len); +int arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, +unsigned int relsec); +int arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, +unsigned int relsec); void arch_kexec_protect_crashkres(void); void arch_kexec_unprotect_crashkres(void); ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: RFC on Kdump and PCIe on ARM64
On Thu, Mar 01, 2018 at 02:19:09PM -0500, Sinan Kaya wrote: > On 3/1/2018 2:05 PM, Bjorn Helgaas wrote: > > On Thu, Mar 01, 2018 at 12:44:26PM -0500, Sinan Kaya wrote: > >> Hi, > >> > >> We are seeing IOMMU faults when booting the kdump kernel on ARM64. > >> > >> [7.220162] arm-smmu-v3 arm-smmu-v3.0.auto: event 0x02 received: > >> [7.226123] arm-smmu-v3 arm-smmu-v3.0.auto: > >> 0x0102 > >> [7.232023] arm-smmu-v3 arm-smmu-v3.0.auto: > >> 0x > >> [7.237925] arm-smmu-v3 arm-smmu-v3.0.auto: > >> 0x > >> [7.243827] arm-smmu-v3 arm-smmu-v3.0.auto: > >> 0x > >> > >> This is Nate's interpretation of the fault: > >> > >> "The PCI device is sending transactions just after the SMMU was > >> reset/reinitialized which is problematic because the device has not > >> yet been added to the SMMU and thus should not be doing *any* DMA. > >> DMA from the PCI devices should be quiesced prior to starting the > >> crashdump kernel or you risk overwriting portions of memory you > >> meant to preserve. In this case the SMMU was actually doing you a > >> favor by blocking these errant DMA operations!!" > >> > >> I think this makes sense especially for the IOMMU enabled case on > >> the host where an IOVA can overlap with the region of memory kdump > >> reserved for itself. > >> > >> Apparently, there has been similar concerns in the past. > >> > >> https://www.fujitsu.com/jp/documents/products/software/os/linux/catalog/LinuxConJapan2013-Indoh.pdf > >> > >> and was not addressed globally due to IOMMU+PCI driver ordering > >> issues and bugs in HW due to hot reset. > >> > >> https://lkml.org/lkml/2012/8/3/160 > >> > >> Hot reset as mentioned is destructive and may not be the best > >> implementation choice. However, most of the modern endpoints > >> support PCIE function level reset. > >> > >> One other solution is for SMMUv3 driver to reserve the kdump used > >> IOVA addresses. > >> > >> Another solution is for the SMMUv3 driver to disable PCIe devices > >> behind the SMMU if it see SMMU is already enabled. > > > > What problem are you trying to solve? If the IOMMU is blocking DMA > > after the kdump kernel starts up, that sounds like the desired > > behavior. > > > > Three issues: > 1. I'm seeing a flood of SMMUv3 faults due to adapter using > addresses from the previous kernel. This might be OK. Yep. That's cosmetic and we could suppress the messages if they were a problem. Isn't part of the point of an IOMMU protection against malicious devices and drivers? If so, we should be able to withstand an arbitrary number of faults. > 2. When the SMMUv3 driver sees that it is enabled, it resets itself > and configures it one more time. > > [7.018304] arm-smmu-v3 arm-smmu-v3.0.auto: ias 44-bit, oas 44-bit > (features 0x1fef) > [7.026379] arm-smmu-v3 arm-smmu-v3.0.auto: SMMU currently enabled! > Resetting... > > From the moment IOMMU is disabled to the point where IOMMU get > enabled again, there is a potential for the PCIE device to corrupt > the kdump kernel memory as the bus master and memory enable bits are > left enabled. Do you really have to reset the IOMMU? Can you just give it new page tables that start out with all IOVAs from all devices being invalid, then add valid mappings as drivers need them (presumably after the driver has done whatever it needs to so the device stops using the old DMA addresses)? > [0.00] crashkernel reserved: 0x7fe0 - 0xffe0 > (2048 MB) > > This region happens to overlap with the IOVA addresses that SMMUv3 > driver on the main kernel is allocating. > > IOVA addresses start from 0x and get decremented on each > allocation. > > 3. The last one is adapter gets into fuzzy state due to not coming > out of clean state in the second time init and being rejected by > SMMUv3 multiple times. > > [ 16.093441] pci :01:00.0: aer_status: 0x0004, aer_mask: 0x > [ 16.099356] pci :01:00.0: Malformed TLP > [ 16.103522] pci :01:00.0: aer_layer=Transaction Layer, > aer_agent=Receiver ID > [ 16.110900] pci :01:00.0: aer_uncor_severity: 0x00062011 > [ 16.116543] pci :01:00.0: TLP Header: 0a00a000 8100 01010100 > I'm not clear on this. I don't remember what an IOMMU fault looks like to an Endpoint. Are you saying that if an Endpoint sees too many of those faults, it gets into this "fuzzy state" (whatever that is :))? Is this a hardware defect? Do we care (this is a kdump kernel, after all)? If we do care, can we fix the device by resetting it? Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: RFC on Kdump and PCIe on ARM64
[+cc Joerg, David, iommu list] On Thu, Mar 01, 2018 at 12:44:26PM -0500, Sinan Kaya wrote: > Hi, > > We are seeing IOMMU faults when booting the kdump kernel on ARM64. > > [7.220162] arm-smmu-v3 arm-smmu-v3.0.auto: event 0x02 received: > [7.226123] arm-smmu-v3 arm-smmu-v3.0.auto:0x0102 > [7.232023] arm-smmu-v3 arm-smmu-v3.0.auto:0x > [7.237925] arm-smmu-v3 arm-smmu-v3.0.auto:0x > [7.243827] arm-smmu-v3 arm-smmu-v3.0.auto:0x > > This is Nate's interpretation of the fault: > > "The PCI device is sending transactions just after the SMMU was > reset/reinitialized which is problematic because the device has not > yet been added to the SMMU and thus should not be doing *any* DMA. > DMA from the PCI devices should be quiesced prior to starting the > crashdump kernel or you risk overwriting portions of memory you > meant to preserve. In this case the SMMU was actually doing you a > favor by blocking these errant DMA operations!!" > > I think this makes sense especially for the IOMMU enabled case on > the host where an IOVA can overlap with the region of memory kdump > reserved for itself. > > Apparently, there has been similar concerns in the past. > > https://www.fujitsu.com/jp/documents/products/software/os/linux/catalog/LinuxConJapan2013-Indoh.pdf > > and was not addressed globally due to IOMMU+PCI driver ordering > issues and bugs in HW due to hot reset. > > https://lkml.org/lkml/2012/8/3/160 > > Hot reset as mentioned is destructive and may not be the best > implementation choice. However, most of the modern endpoints > support PCIE function level reset. > > One other solution is for SMMUv3 driver to reserve the kdump used > IOVA addresses. > > Another solution is for the SMMUv3 driver to disable PCIe devices > behind the SMMU if it see SMMU is already enabled. What problem are you trying to solve? If the IOMMU is blocking DMA after the kdump kernel starts up, that sounds like the desired behavior. ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 0/8] iommu/vt-d: Fix crash dump failure caused by legacy DMA/IO
On Wed, Oct 22, 2014 at 7:21 AM, Joerg Roedel wrote: > Hi Bjorn, > > On Tue, Oct 21, 2014 at 08:16:46PM -0600, Bjorn Helgaas wrote: >> I was looking at Zhen-Hua's recent patches, trying to figure out if I >> need to do anything with them. Resetting devices in the old kernel >> seems like a non-starter. Resetting devices in the new kernel, ..., >> well, maybe. It seems ugly, and it seems like the sort of problem >> that IOMMUs are designed to solve. > > Actually resetting the devices in the kdump kernel would be one of the > better solutions for this problem. When we have a generic way to stop > all in-flight DMA from the PCI endpoints we could safely disable and > then re-enable the IOMMU. > >> On Wed, Jul 2, 2014 at 7:32 AM, Joerg Roedel wrote: >> > That is a solution to prevent the in-flight DMA failures. But what >> > happens when there is some in-flight DMA to a disk to write some inodes >> > or a new superblock. Then this scratch address-space may cause >> > filesystem corruption at worst. >> >> This in-flight DMA is from a device programmed by the old kernel, and >> it would be reading data from the old kernel's buffers. I think >> you're suggesting that we might want that DMA read to complete so the >> device can update filesystem metadata? > > Well, it is not about updating filesystem metadata. In the kdump kernel > we have these options: > > 1) Disable the IOMMU. Problem here is, that DMA is now >untranslated, so that any in-flight DMA might read or write >from a random location in memory, corrupting the kdump or >even the new kexec kernel memory. So this is a non-starter. Agreed (at least if the IOMMU was enabled in the crashed kernel). > 2) Re-program the IOMMU to block all DMA. This is safer as it >does not corrupt any data in memory. But some devices react >very poorly on a master abort from the IOMMU, so bad that the >driver in the kdump kernel fails to initialize that device. >In this case taking dump itself might fail (and often does, >according to reports) Sounds like an option, even though broken devices work poorly. > 3) To prevent master aborts like in option (2), David suggested >to map the whole DMA address space to a scratch page. This >also prevents system memory corruption and the master aborts. >But the problem is, that in-flight DMA will now read all >zeros. This can corrupt disk and network data, at worst it >nulls out the superblocks of your filesystem and you lose all >data. So this is not an option too. Ah, yes, I see your point now. This allows corrupted data, e.g., all zeroes, to be written to disk or network after the kernel crash. I agree; this doesn't sound like a good option. And the proposal below is a 4th option (leave IOMMU enabled, reusing crashed kernel's mappings until drivers make new mappings). > What we currently do in the VT-d driver is a mixture of (1) and (2). The > VT-d driver disables the IOMMU hardware (opening a race window for > memory data corruption), re-initializes it to reject any ongoing DMA > (which causes master aborts for in-flight DMA) and re-enables the IOMMU > hardware. > > But this strategy fails in heavy IO environments quite often and we look > into ways to solve the problem, or at least improve the current > situation as good as we can. > > I talked to David about this at LPC and we came up with basically this > procedure: > > 1. If the VT-d driver finds the IOMMU enabled, it reuses its >root-context table. This way the IOMMU must not be disabled >and re-enabled, eliminating the race we have now. >Other data structures like the context-entries are copied >over from the old kernel. We basically keep all mappings >from the old kernel, allowing any in-flight DMA to succeed. >No memory or disk data corruption. If the crashed kernel had corrupted memory, couldn't an in-flight DMA read that corrupted data from memory and write it to disk? I guess you could argue that this is merely a race, and the in-flight DMA could just as easily have happened before the kernel crash, so there's always a window and the only question is whether it closes when the IOMMU driver starts up or when the device driver starts up. >The issue here is, that we modify data from the old kernel >which is about to be dumped. But there are ways to work >around that. > > 2. When a device driver issues the first dma_map command for a >d
Re: [PATCH 0/8] iommu/vt-d: Fix crash dump failure caused by legacy DMA/IO
[-cc Bill, +cc Zhen-Hua, Eric, Tom, Jerry] Hi Joerg, I was looking at Zhen-Hua's recent patches, trying to figure out if I need to do anything with them. Resetting devices in the old kernel seems like a non-starter. Resetting devices in the new kernel, ..., well, maybe. It seems ugly, and it seems like the sort of problem that IOMMUs are designed to solve. Anyway, I found this old discussion that I didn't quite understand: On Wed, Jul 2, 2014 at 7:32 AM, Joerg Roedel wrote: > On Wed, Apr 30, 2014 at 11:49:33AM +0100, David Woodhouse wrote: >> After the last round of this patchset, we discussed a potential >> improvement where you point every virtual bus address at the *same* >> physical scratch page. > > That is a solution to prevent the in-flight DMA failures. But what > happens when there is some in-flight DMA to a disk to write some inodes > or a new superblock. Then this scratch address-space may cause > filesystem corruption at worst. This in-flight DMA is from a device programmed by the old kernel, and it would be reading data from the old kernel's buffers. I think you're suggesting that we might want that DMA read to complete so the device can update filesystem metadata? I don't really understand that argument. Don't we usually want to stop any data from escaping the machine after a crash, on the theory that the old kernel is crashing because something is catastrophically wrong and we may have already corrupted things in memory? If so, allowing this old DMA to complete is just as likely to make things worse as to make them better. Without kdump, we likely would reboot through the BIOS and the device would get reset and the DMA would never happen at all. So if we made the dump kernel program the IOMMU to prevent the DMA, that seems like a similar situation. > So with this in mind I would prefer initially taking over the > page-tables from the old kernel before the device drivers re-initialize > the devices. This makes the dump kernel more dependent on data from the old kernel, which we obviously want to avoid when possible. I didn't find the previous discussion where pointing every virtual bus address at the same physical scratch page was proposed. Why was that better than programming the IOMMU to reject every DMA? Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2] PCI: Clear Bus Master bit only on kexec reboot
On Wed, Nov 27, 2013 at 3:19 PM, Khalid Aziz wrote: > Add a flag to tell the PCI subsystem that kernel is shutting down > in prepapration to kexec a kernel. Add code in PCI subsystem to use > this flag to clear Bus Master bit on PCI devices only in case of > kexec reboot. This fixes https://bugzilla.kernel.org/show_bug.cgi?id=63861 > and avoids any other issues caused by clearing Bus Master bit on PCI > devices in normal shutdown path. This patch is based on discussion at > http://marc.info/?l=linux-pci&m=138425645204355&w=2 > > Signed-off-by: Khalid Aziz > Acked-by: Konstantin Khlebnikov > Cc: sta...@vger.kernel.org Applied to my for-linus branch for v3.13, thanks! Bjorn > --- > Changes since v1: > - Moved kexec_in_progress flag from pci.h to kexec.h > - Changed the type for kexec_in_progress flag to bool > - Added #ifdef CONFIG_KEXEC to code in pci-driver.c to > ensure it builds with CONFIG_KEXEC not set. > > drivers/pci/pci-driver.c | 12 +--- > include/linux/kexec.h| 3 +++ > kernel/kexec.c | 4 > 3 files changed, 16 insertions(+), 3 deletions(-) > > diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c > index 9042fdb..8eca81a 100644 > --- a/drivers/pci/pci-driver.c > +++ b/drivers/pci/pci-driver.c > @@ -19,6 +19,7 @@ > #include > #include > #include > +#include > #include "pci.h" > > struct pci_dynid { > @@ -399,12 +400,17 @@ static void pci_device_shutdown(struct device *dev) > pci_msi_shutdown(pci_dev); > pci_msix_shutdown(pci_dev); > > +#ifdef CONFIG_KEXEC > /* > -* Turn off Bus Master bit on the device to tell it to not > -* continue to do DMA. Don't touch devices in D3cold or unknown > states. > +* If this is a kexec reboot, turn off Bus Master bit on the > +* device to tell it to not continue to do DMA. Don't touch > +* devices in D3cold or unknown states. > +* If it is not a kexec reboot, firmware will hit the PCI > +* devices with big hammer and stop their DMA any way. > */ > - if (pci_dev->current_state <= PCI_D3hot) > + if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot)) > pci_clear_master(pci_dev); > +#endif > } > > #ifdef CONFIG_PM > diff --git a/include/linux/kexec.h b/include/linux/kexec.h > index d78d28a..5fd33dc 100644 > --- a/include/linux/kexec.h > +++ b/include/linux/kexec.h > @@ -198,6 +198,9 @@ extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; > extern size_t vmcoreinfo_size; > extern size_t vmcoreinfo_max_size; > > +/* flag to track if kexec reboot is in progress */ > +extern bool kexec_in_progress; > + > int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, > unsigned long long *crash_size, unsigned long long > *crash_base); > int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, > diff --git a/kernel/kexec.c b/kernel/kexec.c > index 490afc0..d0d8fca 100644 > --- a/kernel/kexec.c > +++ b/kernel/kexec.c > @@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; > size_t vmcoreinfo_size; > size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); > > +/* Flag to indicate we are going to kexec a new kernel */ > +bool kexec_in_progress = false; > + > /* Location of the reserved area for the crash kernel */ > struct resource crashk_res = { > .name = "Crash kernel", > @@ -1675,6 +1678,7 @@ int kernel_kexec(void) > } else > #endif > { > + kexec_in_progress = true; > kernel_restart_prepare(NULL); > printk(KERN_EMERG "Starting new kernel\n"); > machine_shutdown(); > -- > 1.8.3.2 > ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [Help Test] kdump, x86, acpi: Reproduce CPU0 SMI corruption issue after unsetting BSP flag
On Tue, Aug 6, 2013 at 3:19 AM, HATAYAMA Daisuke wrote: > So, could anyone help testing the idea 2) above if you have which of > the following machines? (or other ones that can lead to the same bug) > > - HP Compaq 6910p > - HP Compaq 6710b > - HP Compaq 6710s > - HP Compaq 6510b > - HP Compaq 2510p Sorry, I don't have access to any of these machines any more, and I don't have any useful advice for you. ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2] PCI: Reset PCIe devices to stop ongoing DMA
[+cc Rafael, linux-acpi] On Tue, Jul 30, 2013 at 6:35 PM, Takao Indoh wrote: > On x86, currently IOMMU initialization run *after* PCI enumeration, but > what you are talking about is that it should be changed so that x86 > IOMMU initialization is done *before* PCI enumeration like sparc, right? Yes. I don't know whether or when that initialization order will ever be changed, but I do think we should avoid building more infrastructure that depends on the current order. Changing the order is a pretty big deal because it's a lot more than just the IOMMU. Basically I think we should be enumerating ACPI devices, including the IOMMU, before PCI devices, but there's a lot of legacy involved in that area. Added Rafael in case he has any thoughts. > Hmm, ok, I think I need to post attached patch to iommu list and > discuss it including current order of x86 IOMMU initialization. ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2] PCI: Reset PCIe devices to stop ongoing DMA
On Tue, Jul 30, 2013 at 12:09 AM, Takao Indoh wrote: > (2013/07/29 23:17), Bjorn Helgaas wrote: >> On Sun, Jul 28, 2013 at 6:37 PM, Takao Indoh >> wrote: >>> (2013/07/26 2:00), Bjorn Helgaas wrote: >>>> My point about IOMMU and PCI initialization order doesn't go away just >>>> because it doesn't fit "kdump policy." Having system initialization >>>> occur in a logical order is far more important than making kdump work. >>> >>> My next plan is as follows. I think this is matched to logical order >>> on boot. >>> >>> drivers/pci/pci.c >>> - Add function to reset bus, for example, pci_reset_bus(struct pci_bus *bus) >>> >>> drivers/iommu/intel-iommu.c >>> - On initialization, if IOMMU is already enabled, call this bus reset >>>function before disabling and re-enabling IOMMU. >> >> I raised this issue because of arches like sparc that enumerate the >> IOMMU before the PCI devices that use it. In that situation, I think >> you're proposing this: >> >>panic kernel >> enable IOMMU >> panic >>kdump kernel >> initialize IOMMU (already enabled) >>pci_reset_bus >>disable IOMMU >>enable IOMMU >> enumerate PCI devices >> >> But the problem is that when you call pci_reset_bus(), you haven't >> enumerated the PCI devices, so you don't know what to reset. > > Right, so my idea is adding reset code into "intel-iommu.c". intel-iommu > initialization is based on the assumption that enumeration of PCI devices > is already done. We can find target device from IOMMU page table instead > of scanning all devices in pci tree. > > Therefore, this idea is only for intel-iommu. Other architectures need > to implement their own reset code. That's my point. I'm opposed to adding code to PCI when it only benefits x86 and we know other arches will need a fundamentally different design. I would rather have a design that can work for all arches. If your implementation is totally implemented under arch/x86 (or in intel-iommu.c, I guess), I can't object as much. However, I think that eventually even x86 should enumerate the IOMMUs via ACPI before we enumerate PCI devices. It's pretty clear that's how BIOS designers expect the OS to work. For example, sec 8.7.3 of the Intel Virtualization Technology for Directed I/O spec, rev 1.3, shows the expectation that remapping hardware (IOMMU) is initialized before discovering the I/O hierarchy below a hot-added host bridge. Obviously you're not talking about a hot-add scenario, but I think the same sequence should apply at boot-time as well. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2] PCI: Reset PCIe devices to stop ongoing DMA
On Sun, Jul 28, 2013 at 6:37 PM, Takao Indoh wrote: > (2013/07/26 2:00), Bjorn Helgaas wrote: >> On Wed, Jul 24, 2013 at 12:29 AM, Takao Indoh >> wrote: >>> Sorry for letting this discussion slide, I was busy on other works:-( >>> Anyway, the summary of previous discussion is: >>> - My patch adds new initcall(fs_initcall) to reset all PCIe endpoints on >>>boot. This expects PCI enumeration is done before IOMMU >>>initialization as follows. >>> (1) PCI enumeration >>> (2) fs_initcall ---> device reset >>> (3) IOMMU initialization >>> - This works on x86, but does not work on other architecture because >>>IOMMU is initialized before PCI enumeration on some architectures. So, >>>device reset should be done where IOMMU is initialized instead of >>>initcall. >>> - Or, as another idea, we can reset devices in first kernel(panic kernel) >>> >>> Resetting devices in panic kernel is against kdump policy and seems not to >>> be good idea. So I think adding reset code into iommu initialization is >>> better. I'll post patches for that. >> >> Of course nobody *wants* to do anything in the panic kernel. But >> simply saying "it's against kdump policy and seems not to be a good >> idea" is not a technical argument. There are things that are >> impractical to do in the kdump kernel, so they have to be done in the >> panic kernel even though we know the kernel is unreliable and the >> attempt may fail. > > Accessing kernel data in panic kernel causes panic again, so > - Don't touch kernel data in panic situation > - Jump to kdump kernel as quickly as possible, and do things in safe > kernel > These are basic "kdump policy". Of course if there are any works which > we cannot do in kdump kernel and can do only in panic kernel, for > example saving registers or stopping cpus, we should do them in panic > kernel. > > Resetting devices in panic kernel is worth considering if we can safely > find pci_dev and reset it, but I have no idea how to do that because > for example struct pci_dev may be borken. Nobody can guarantee that the panic kernel can do *anything* safely because any arbitrary kernel data or text may be corrupted. But if you consider any specific data structure, e.g., CPU or PCI device lists, it's not very likely that it will be corrupted. >> My point about IOMMU and PCI initialization order doesn't go away just >> because it doesn't fit "kdump policy." Having system initialization >> occur in a logical order is far more important than making kdump work. > > My next plan is as follows. I think this is matched to logical order > on boot. > > drivers/pci/pci.c > - Add function to reset bus, for example, pci_reset_bus(struct pci_bus *bus) > > drivers/iommu/intel-iommu.c > - On initialization, if IOMMU is already enabled, call this bus reset > function before disabling and re-enabling IOMMU. I raised this issue because of arches like sparc that enumerate the IOMMU before the PCI devices that use it. In that situation, I think you're proposing this: panic kernel enable IOMMU panic kdump kernel initialize IOMMU (already enabled) pci_reset_bus disable IOMMU enable IOMMU enumerate PCI devices But the problem is that when you call pci_reset_bus(), you haven't enumerated the PCI devices, so you don't know what to reset. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2] PCI: Reset PCIe devices to stop ongoing DMA
On Wed, Jul 24, 2013 at 12:29 AM, Takao Indoh wrote: > Sorry for letting this discussion slide, I was busy on other works:-( > Anyway, the summary of previous discussion is: > - My patch adds new initcall(fs_initcall) to reset all PCIe endpoints on > boot. This expects PCI enumeration is done before IOMMU > initialization as follows. > (1) PCI enumeration > (2) fs_initcall ---> device reset > (3) IOMMU initialization > - This works on x86, but does not work on other architecture because > IOMMU is initialized before PCI enumeration on some architectures. So, > device reset should be done where IOMMU is initialized instead of > initcall. > - Or, as another idea, we can reset devices in first kernel(panic kernel) > > Resetting devices in panic kernel is against kdump policy and seems not to > be good idea. So I think adding reset code into iommu initialization is > better. I'll post patches for that. Of course nobody *wants* to do anything in the panic kernel. But simply saying "it's against kdump policy and seems not to be a good idea" is not a technical argument. There are things that are impractical to do in the kdump kernel, so they have to be done in the panic kernel even though we know the kernel is unreliable and the attempt may fail. My point about IOMMU and PCI initialization order doesn't go away just because it doesn't fit "kdump policy." Having system initialization occur in a logical order is far more important than making kdump work. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2] PCI: Reset PCIe devices to stop ongoing DMA
On Wed, Jun 12, 2013 at 8:44 PM, Takao Indoh wrote: > (2013/06/12 13:45), Bjorn Helgaas wrote: >> [+cc Vivek, Haren; sorry I didn't think to add you earlier] >> >> On Tue, Jun 11, 2013 at 12:08 AM, Takao Indoh >> wrote: >>> (2013/06/11 11:20), Bjorn Helgaas wrote: >> >>>> I'm not sure you need to reset legacy devices (or non-PCI devices) >>>> yet, but the current hook isn't anchored anywhere -- it's just an >>>> fs_initcall() that doesn't give the reader any clue about the >>>> connection between the reset and the problem it's solving. >>>> >>>> If we do something like this patch, I think it needs to be done at the >>>> point where we enable or disable the IOMMU. That way, it's connected >>>> to the important event, and there's a clue about how to make >>>> corresponding fixes for other IOMMUs. >>> >>> Ok. pci_iommu_init() is appropriate place to add this hook? >> >> I looked at various IOMMU init places today, and it's far more >> complicated and varied than I had hoped. >> >> This reset scheme depends on enumerating PCI devices before we >> initialize the IOMMU used by those devices. x86 works that way today, >> but not all architectures do (see the sparc pci_fire_pbm_init(), for > > Sorry, could you tell me which part depends on architecture? Your patch works if PCIe devices are reset before the kdump kernel enables the IOMMU. On x86, this is possible because PCI enumeration happens before the IOMMU initialization. On sparc, the IOMMU is initialized before PCI devices are enumerated, so there would still be a window where ongoing DMA could cause an IOMMU error. Of course, it might be possible to reorganize the sparc code to to the IOMMU init *after* it enumerates PCI devices. But I think that change would be hard to justify. And I think even on x86, it would be better if we did the IOMMU init before PCI enumeration -- the PCI devices depend on the IOMMU, so logically the IOMMU should be initialized first so the PCI devices can be associated with it as they are enumerated. >> example). And I think conceptually, the IOMMU should be enumerated >> and initialized *before* the devices that use it. >> >> So I'm uncomfortable with that aspect of this scheme. >> >> It would be at least conceivable to reset the devices in the system >> kernel, before the kexec. I know we want to do as little as possible >> in the crashing kernel, but it's at least a possibility, and it might >> be cleaner. > > I bet this will be not accepted by kdump maintainer. Everything in panic > kernel is unreliable. kdump is inherently unreliable. The kdump kernel doesn't start from an arbitrary machine state. We don't expect it to tolerate all CPUs running, for example. Maybe it should be expected to tolerate PCI devices running, either. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH bugfix 3.9] PCI: Don't try to disable Bus Master on disconnected PCI devices
[+cc linux-pci] On Mon, Apr 1, 2013 at 2:00 AM, Konstantin Khlebnikov wrote: > BUMP. This is degradation from 3.8, so this patch must be in 3.9. > > I still don't like this forced clearing bus-master bit. But this hack > definitely fixes problems in kexec, so there is reason to keep it here. Applied to for-linus for v3.9, thanks! > Konstantin Khlebnikov wrote: >> >> This is fix for commit 7897e6022761ace7377f0f784fca059da55f5d71 from >> v3.9-rc1 >> ("PCI: Disable Bus Master unconditionally in pci_device_shutdown()") >> in turn that was fix for b566a22c23327f18ce941ffad0ca907e50a53d41 from >> v3.5-rc1 >> ("PCI: disable Bus Master on PCI device shutdown") >> >> Unfortunately fixing one bug uncovers another: >> ->shutdown() callback might switch device to deep sleep state. >> PCI config space no longer available after that. >> >> Link: https://lkml.org/lkml/2013/3/12/529 >> Signed-off-by: Konstantin Khlebnikov >> Reported-and-Tested-by: Vivek Goyal >> Cc: Bjorn Helgaas >> Cc: Rafael J. Wysocki >> --- >> drivers/pci/pci-driver.c |5 +++-- >> 1 file changed, 3 insertions(+), 2 deletions(-) >> >> diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c >> index 1fa1e48..79277fb 100644 >> --- a/drivers/pci/pci-driver.c >> +++ b/drivers/pci/pci-driver.c >> @@ -390,9 +390,10 @@ static void pci_device_shutdown(struct device *dev) >> >> /* >> * Turn off Bus Master bit on the device to tell it to not >> -* continue to do DMA >> +* continue to do DMA. Don't touch devices in D3cold or unknown >> states. >> */ >> - pci_clear_master(pci_dev); >> + if (pci_dev->current_state <= PCI_D3hot) >> + pci_clear_master(pci_dev); >> } >> >> #ifdef CONFIG_PM >> > ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC 1/3] PCI/PM: Fix kexec for D3cold and bridge suspending
+cc Eric and kexec list On Mon, Sep 17, 2012 at 2:54 AM, Huang Ying wrote: > If PCI devices are put into D3cold before kexec, because the > configuration registers of PCI devices in D3cold are not accessible. > > And if PCI bridges are put into low power state before kexec, > configuration registers of PCI devices underneath the PCI bridges are > not accessible too. > > These will make some PCI devices can not be scanned after kexec, so > resume the PCI devices in D3cold or PCI bridges in low power state > before kexec. Don't we need to resume the device even without the kexec issue? And even if it's in D1 or D2? It looks to me like pci_msi_shutdown() (and probably drv->shutdown()) depend on the device being in D0. > Signed-off-by: Huang Ying > --- > drivers/pci/pci-driver.c |4 > 1 file changed, 4 insertions(+) > > --- a/drivers/pci/pci-driver.c > +++ b/drivers/pci/pci-driver.c > @@ -421,6 +421,10 @@ static void pci_device_shutdown(struct d > struct pci_dev *pci_dev = to_pci_dev(dev); > struct pci_driver *drv = pci_dev->driver; > > + /* Resume bridges and devices in D3cold for kexec to work properly */ > + if (pci_dev->current_state == PCI_D3cold || pci_dev->subordinate) > + pm_runtime_resume(dev); > + > if (drv && drv->shutdown) > drv->shutdown(pci_dev); > pci_msi_shutdown(pci_dev); ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [patch] ia64: make PA() work for both physical identity-mapped virtual addresses
On Monday 06 October 2008 3:32:10 pm Simon Horman wrote: > On Mon, Oct 06, 2008 at 09:24:03AM -0600, Bjorn Helgaas wrote: > > The EFI Runtime Services Table contains pointers to ia64 function > > descriptors. On existing, pre-Tiano, firmware, SetVirtualAddressMap() > > converts *all* these pointers from physical to virtual. On Tiano-based > > firmware, the pointer to the SetVirtualAddressMap() function descriptor > > is not converted, so it remains a physical pointer. > > > > The ia64 kexec purgatory patches the SetVirtualAddressMap() function > > descriptor so that when the new kernel calls SetVirtualAddressMap(), it > > never reaches firmware. Instead, it calls a dummy function that just > > returns success. > > > > Purgatory runs in physical mode, so it must convert the pointer from the > > RuntimeServicesTable to a physical address. This patch makes that > > conversion work both for old firmware (where the pointer is an identity- > > mapped virtual address) and new Tiano firmware (where the pointer is a > > physical address). > > > > Without this patch, kexec on Tiano firmware causes an MCA because > > ia64_env_setup() subtracts PAGE_OFFSET from a physical address and ends > > up with an invalid physical address. Referencing that address causes > > the MCA. > > > > Signed-off-by: Bjorn Helgaas <[EMAIL PROTECTED]> > > Thanks Bjorn, applied. Thanks! Any idea when you'll make another kexec-tools release? I have a Red Hat feature request in for RHEL6, and they'll need to know what kexec-tools version to look for. RH will get the latest for RHEL6, but I really don't know when they'll start putting that together. https://bugzilla.redhat.com/show_bug.cgi?id=463477 Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[patch] ia64: make PA() work for both physical identity-mapped virtual addresses
The EFI Runtime Services Table contains pointers to ia64 function descriptors. On existing, pre-Tiano, firmware, SetVirtualAddressMap() converts *all* these pointers from physical to virtual. On Tiano-based firmware, the pointer to the SetVirtualAddressMap() function descriptor is not converted, so it remains a physical pointer. The ia64 kexec purgatory patches the SetVirtualAddressMap() function descriptor so that when the new kernel calls SetVirtualAddressMap(), it never reaches firmware. Instead, it calls a dummy function that just returns success. Purgatory runs in physical mode, so it must convert the pointer from the RuntimeServicesTable to a physical address. This patch makes that conversion work both for old firmware (where the pointer is an identity- mapped virtual address) and new Tiano firmware (where the pointer is a physical address). Without this patch, kexec on Tiano firmware causes an MCA because ia64_env_setup() subtracts PAGE_OFFSET from a physical address and ends up with an invalid physical address. Referencing that address causes the MCA. Signed-off-by: Bjorn Helgaas <[EMAIL PROTECTED]> diff --git a/purgatory/arch/ia64/purgatory-ia64.c b/purgatory/arch/ia64/purgatory-ia64.c index b2fe6d4..acacb56 100644 --- a/purgatory/arch/ia64/purgatory-ia64.c +++ b/purgatory/arch/ia64/purgatory-ia64.c @@ -147,7 +147,7 @@ setup_arch(void) inline unsigned long PA(unsigned long addr) { - return addr - PAGE_OFFSET; + return addr & 0x0fffLL; } void ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: SetVirtualAddressMap() on Tiano EFI firmware
On Thursday 11 September 2008 02:23:58 pm Bjorn Helgaas wrote: > ... > The following patch makes kexec-tools work on either old or new > firmware. Masking out the top nibble of either a physical address > or a virtual address in region 7 results in a physical address. > > diff --git a/purgatory/arch/ia64/purgatory-ia64.c > b/purgatory/arch/ia64/purgatory-ia64.c > index b2fe6d4..acacb56 100644 > --- a/purgatory/arch/ia64/purgatory-ia64.c > +++ b/purgatory/arch/ia64/purgatory-ia64.c > @@ -147,7 +147,7 @@ setup_arch(void) > > inline unsigned long PA(unsigned long addr) > { > - return addr - PAGE_OFFSET; > + return addr & 0x0fffLL; > } > > void > I haven't seen any activity on this. Simon, if nobody objects to it, would you consider the patch above for kexec-tools? Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: SetVirtualAddressMap() on Tiano EFI firmware
On Friday 12 September 2008 12:59:53 pm Bjorn Helgaas wrote: > On Thursday 11 September 2008 03:59:54 pm Lombard, David N wrote: > > On Thu, Sep 11, 2008 at 01:23:58PM -0700, Bjorn Helgaas wrote: > > > Has kexec been tested on x86 with EFI firmware? > > > > > > I'm testing it on ia64 with Tiano-based EFI firmware, and I > > > tripped over an issue with SetVirtualAddressMap(). I would > > > expect a similar problem to happen on x86, but I don't see > > > any code to deal with it. > > > > Hmmm. I run on 32- and 64-bit kernels on a 32-bit IA EFI, and > > haven't seen this problem, including chains of kexec invocations. > > > > I'll take a closer look at what's going on on my systems. Any news on this? > Thanks. In case it wasn't clear, here's my confusion about x86: > > I don't see anything in kexec-tools that touches SetVirtualAddressMap(). > And I don't see anything in Linux that prevents x86 from calling > SetVirtualAddressMap() in the new kernel. > > If we do call SetVirtualAddressMap() in the new kernel, I would > expect it to return EFI_UNSUPPORTED, which should cause Linux to > panic. ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: SetVirtualAddressMap() on Tiano EFI firmware
On Thursday 11 September 2008 03:59:54 pm Lombard, David N wrote: > On Thu, Sep 11, 2008 at 01:23:58PM -0700, Bjorn Helgaas wrote: > > Has kexec been tested on x86 with EFI firmware? > > > > I'm testing it on ia64 with Tiano-based EFI firmware, and I > > tripped over an issue with SetVirtualAddressMap(). I would > > expect a similar problem to happen on x86, but I don't see > > any code to deal with it. > > Hmmm. I run on 32- and 64-bit kernels on a 32-bit IA EFI, and > haven't seen this problem, including chains of kexec invocations. > > I'll take a closer look at what's going on on my systems. Thanks. In case it wasn't clear, here's my confusion about x86: I don't see anything in kexec-tools that touches SetVirtualAddressMap(). And I don't see anything in Linux that prevents x86 from calling SetVirtualAddressMap() in the new kernel. If we do call SetVirtualAddressMap() in the new kernel, I would expect it to return EFI_UNSUPPORTED, which should cause Linux to panic. Bjorn ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
SetVirtualAddressMap() on Tiano EFI firmware
Has kexec been tested on x86 with EFI firmware? I'm testing it on ia64 with Tiano-based EFI firmware, and I tripped over an issue with SetVirtualAddressMap(). I would expect a similar problem to happen on x86, but I don't see any code to deal with it. The EFI SetVirtualAddressMap() interface is only supposed to be called once. Linux calls it once at boot-time, and if we kexec a new Linux kernel, the new kernel calls it again. The EFI spec suggests that if it *is* called twice, SetVirtualAddressMap() should return EFI_UNSUPPORTED. But we go to great pains in purgatory (ia64_env_setup()) to patch the function descriptor so that the new kernel's call never even makes it to the EFI routine. I don't know whether this is because SetVirtualAddressMap() blows up if we call it again, or whether we just didn't want to change Linux to ignore an EFI_UNSUPPORTED return value. On ia64, the SetVirtualAddressMap entry in the EFI Runtime Services Table points to a function descriptor, and ia64_env_setup() patches the code address in the descriptor so that when the new kernel thinks it's calling SetVirtualAddressMap(), it's instead calling a dummy function that immediately returns success. In most current ia64 EFI implementations, SetVirtualAddressMap() converts all pointers in the EFI Runtime Services Table from physical to virtual, including the SetVirtualAddressMap pointer itself. ia64_env_setup() (running in physical mode) converts the SetVirtualAddressMap pointer from virtual to physical by subtracting PAGE_OFFSET. But new Tiano-based EFI implementations do *not* convert the SetVirtualAddressMap pointer to virtual -- they leave it as physical. So when ia64_env_setup() subtracts PAGE_OFFSET from the physical address, it ends up with garbage, and patching the function descriptor causes an MCA. The following patch makes kexec-tools work on either old or new firmware. Masking out the top nibble of either a physical address or a virtual address in region 7 results in a physical address. diff --git a/purgatory/arch/ia64/purgatory-ia64.c b/purgatory/arch/ia64/purgatory-ia64.c index b2fe6d4..acacb56 100644 --- a/purgatory/arch/ia64/purgatory-ia64.c +++ b/purgatory/arch/ia64/purgatory-ia64.c @@ -147,7 +147,7 @@ setup_arch(void) inline unsigned long PA(unsigned long addr) { - return addr - PAGE_OFFSET; + return addr & 0x0fffLL; } void ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec