Re: [PATCH] s390/crash: Fix KEXEC_NOTE_BYTES definition
Am Fri, 9 Jun 2017 10:17:05 +0800 schrieb Xunlei Pang <xlp...@redhat.com>: > S390 KEXEC_NOTE_BYTES is not used by note_buf_t as before, which > is now defined as follows: > typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; > It was changed by the CONFIG_CRASH_CORE feature. > > This patch gets rid of all the old KEXEC_NOTE_BYTES stuff, and > renames KEXEC_NOTE_BYTES to CRASH_CORE_NOTE_BYTES for S390. > > Fixes: 692f66f26a4c ("crash: move crashkernel parsing and vmcore related code > under CONFIG_CRASH_CORE") > Cc: Dave Young <dyo...@redhat.com> > Cc: Dave Anderson <ander...@redhat.com> > Cc: Hari Bathini <hbath...@linux.vnet.ibm.com> > Cc: Gustavo Luiz Duarte <gustav...@linux.vnet.ibm.com> > Signed-off-by: Xunlei Pang <xlp...@redhat.com> Hello Xunlei, As you already know on s390 we create the ELF header in the new kernel. Therefore we don't use the per-cpu buffers for ELF notes to store the register state. For RHEL7 we still store the registers in machine_kexec.c:add_elf_notes(). Though we also use the ELF header from new kernel ... We assume your original problem with the "kmem -s" failure was caused by the memory overwrite due to the invalid size of the "crash_notes" per-cpu buffers. Therefore your patch looks good for RHEL7 but for upstream we propose the patch below. --- [PATCH] s390/crash: Remove unused KEXEC_NOTE_BYTES After commit 692f66f26a4c19 ("crash: move crashkernel parsing and vmcore related code under CONFIG_CRASH_CORE") the KEXEC_NOTE_BYTES macro is not used anymore and for s390 we create the ELF header in the new kernel anyway. Therefore remove the macro. 
Reported-by: Xunlei Pang <xp...@redhat.com> Reviewed-by: Mikhail Zaslonko <zaslo...@linux.vnet.ibm.com> Signed-off-by: Michael Holzheu <holz...@linux.vnet.ibm.com> --- arch/s390/include/asm/kexec.h | 18 -- include/linux/crash_core.h| 5 + include/linux/kexec.h | 9 - 3 files changed, 5 insertions(+), 27 deletions(-) diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index 2f924bc30e35..dccf24ee26d3 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -41,24 +41,6 @@ /* The native architecture */ #define KEXEC_ARCH KEXEC_ARCH_S390 -/* - * Size for s390x ELF notes per CPU - * - * Seven notes plus zero note at the end: prstatus, fpregset, timer, - * tod_cmp, tod_reg, control regs, and prefix - */ -#define KEXEC_NOTE_BYTES \ - (ALIGN(sizeof(struct elf_note), 4) * 8 + \ -ALIGN(sizeof("CORE"), 4) * 7 + \ -ALIGN(sizeof(struct elf_prstatus), 4) + \ -ALIGN(sizeof(elf_fpregset_t), 4) + \ -ALIGN(sizeof(u64), 4) + \ -ALIGN(sizeof(u64), 4) + \ -ALIGN(sizeof(u32), 4) + \ -ALIGN(sizeof(u64) * 16, 4) + \ -ALIGN(sizeof(u32), 4) \ - ) - /* Provide a dummy definition to avoid build failures. */ static inline void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs) { } diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 541a197ba4a2..4090a42578a8 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -10,6 +10,11 @@ #define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) #define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) +/* + * The per-cpu notes area is a list of notes terminated by a "NULL" + * note header. For kdump, the code in vmcore.c runs in the context + * of the second kernel to combine them into one note. 
+ */ #define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ CRASH_CORE_NOTE_NAME_BYTES + \ CRASH_CORE_NOTE_DESC_BYTES) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index c9481ebcbc0c..65888418fb69 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -63,15 +63,6 @@ #define KEXEC_CORE_NOTE_NAME CRASH_CORE_NOTE_NAME /* - * The per-cpu notes area is a list of notes terminated by a "NULL" - * note header. For kdump, the code in vmcore.c runs in the context - * of the second kernel to combine them into one note. - */ -#ifndef KEXEC_NOTE_BYTES -#define KEXEC_NOTE_BYTES CRASH_CORE_NOTE_BYTES -#endif - -/* * This structure is used to hold the arguments that are used when loading * kernel binaries. */ -- 2.11.2 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] s390/crash: Fix KEXEC_NOTE_BYTES definition
Hi Xunlei, Sorry for the late reply - I was on vacation up to now. Give us some time to look into this issue. Michael Am Fri, 9 Jun 2017 10:17:05 +0800 schrieb Xunlei Pang: > S390 KEXEC_NOTE_BYTES is not used by note_buf_t as before, which > is now defined as follows: > typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; > It was changed by the CONFIG_CRASH_CORE feature. > > This patch gets rid of all the old KEXEC_NOTE_BYTES stuff, and > renames KEXEC_NOTE_BYTES to CRASH_CORE_NOTE_BYTES for S390. > > Fixes: 692f66f26a4c ("crash: move crashkernel parsing and vmcore related code > under CONFIG_CRASH_CORE") > Cc: Dave Young > Cc: Dave Anderson > Cc: Hari Bathini > Cc: Gustavo Luiz Duarte > Signed-off-by: Xunlei Pang > --- > arch/s390/include/asm/kexec.h | 2 +- > include/linux/crash_core.h| 7 +++ > include/linux/kexec.h | 11 +-- > 3 files changed, 9 insertions(+), 11 deletions(-) > > diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h > index 2f924bc..352deb8 100644 > --- a/arch/s390/include/asm/kexec.h > +++ b/arch/s390/include/asm/kexec.h > @@ -47,7 +47,7 @@ > * Seven notes plus zero note at the end: prstatus, fpregset, timer, > * tod_cmp, tod_reg, control regs, and prefix > */ > -#define KEXEC_NOTE_BYTES \ > +#define CRASH_CORE_NOTE_BYTES \ > (ALIGN(sizeof(struct elf_note), 4) * 8 + \ >ALIGN(sizeof("CORE"), 4) * 7 + \ >ALIGN(sizeof(struct elf_prstatus), 4) + \ > diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h > index e9de6b4..dbc6e5c 100644 > --- a/include/linux/crash_core.h > +++ b/include/linux/crash_core.h > @@ -10,9 +10,16 @@ > #define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) > #define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) > > +/* > + * The per-cpu notes area is a list of notes terminated by a "NULL" > + * note header. For kdump, the code in vmcore.c runs in the context > + * of the second kernel to combine them into one note. 
> + */ > +#ifndef CRASH_CORE_NOTE_BYTES > #define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + > \ >CRASH_CORE_NOTE_NAME_BYTES + \ >CRASH_CORE_NOTE_DESC_BYTES) > +#endif > > #define VMCOREINFO_BYTESPAGE_SIZE > #define VMCOREINFO_NOTE_NAME"VMCOREINFO" > diff --git a/include/linux/kexec.h b/include/linux/kexec.h > index 3ea8275..133df03 100644 > --- a/include/linux/kexec.h > +++ b/include/linux/kexec.h > @@ -14,7 +14,6 @@ > > #if !defined(__ASSEMBLY__) > > -#include > #include > > #include > @@ -25,6 +24,7 @@ > #include > #include > #include > +#include > > /* Verify architecture specific macros are defined */ > > @@ -63,15 +63,6 @@ > #define KEXEC_CORE_NOTE_NAME CRASH_CORE_NOTE_NAME > > /* > - * The per-cpu notes area is a list of notes terminated by a "NULL" > - * note header. For kdump, the code in vmcore.c runs in the context > - * of the second kernel to combine them into one note. > - */ > -#ifndef KEXEC_NOTE_BYTES > -#define KEXEC_NOTE_BYTES CRASH_CORE_NOTE_BYTES > -#endif > - > -/* > * This structure is used to hold the arguments that are used when loading > * kernel binaries. > */ ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] makedumpfile: Error on re-filtering the dump file with no free pages
Am Thu, 18 May 2017 04:37:20 + schrieb Atsushi Kumagai: > Hello Zaslonko, > > > When re-filtering the dump file after the free pages have already been > > stripped out we get an error "Can't get necessary symbols for excluding > > free pages" if newly specified dump level is below 16 (preserves free > > pages). > > According to the code, the check for the new dump level is done BEFORE > > the new dump level is actually set (based on the dump level specified in > > the command and the one from the input dump file). > > Moving the new_dump_level calculation ahead would fix the error. > > Well, I guess the real problem is different. > The error you said is printed by exclude_free_page(): > > if ((SYMBOL(node_data) == NOT_FOUND_SYMBOL) > && (SYMBOL(pgdat_list) == NOT_FOUND_SYMBOL) > && (SYMBOL(contig_page_data) == NOT_FOUND_SYMBOL)) { > ERRMSG("Can't get necessary symbols for excluding free > pages.\n"); > return FALSE; > } > > I think the availability of these symbols are not related to free pages. > exclude_free_page() is called if info->page_is_buddy is null, I estimate that > this situation occurs only if the kernel is older(2.6.16 or before). > > However, I don't think you use such too old kernel, so I suspect that > setup_page_is_buddy() should be updated for newer kernel. Mikhail is on vacation now - so I try to explain: The test case is as follows: 1) We have a -d31 filtered dump "dump.d31" 2) We want to compress the dump with "makedumpfile -c dump.31 dump31.compressed" This fails with: makedumpfile -c dump.31 dump.31.compressed Excluding unnecessary pages: [100.0 %] exclude_free_page: Can't get necessary symbols for excluding free pages. dump_level is changed to 31, because dump.31 was created by dump_level(31). makedumpfile Failed. 
The problem is that setup_page_is_buddy() is not called in this case because info->dump_level is still 0 since it was not adjusted early enough: if (info->dump_level & DL_EXCLUDE_FREE) setup_page_is_buddy(); Because it is not set info->page_is_buddy is NULL and therefore the following if condition gets true: if ((info->dump_level & DL_EXCLUDE_FREE) && !info->page_is_buddy) if (!exclude_free_page(cycle)) return FALSE; Since we don't have the symbols in VMCOREINFO (and IMHO don't need it?) the exclude_free_page() function fails with the described error message. So our fix is to adjust the info->dump_level before setup_page_is_buddy() is called. Best Regards Michael > Could you tell me your kernel version and how to reproduce this issue ? ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v4 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
Am Thu, 20 Apr 2017 19:39:32 +0800 schrieb Xunlei Pang <xlp...@redhat.com>: > As Eric said, > "what we need to do is move the variable vmcoreinfo_note out > of the kernel's .bss section. And modify the code to regenerate > and keep this information in something like the control page. > > Definitely something like this needs a page all to itself, and ideally > far away from any other kernel data structures. I clearly was not > watching closely the data someone decided to keep this silly thing > in the kernel's .bss section." > > This patch allocates extra pages for these vmcoreinfo_XXX variables, > one advantage is that it enhances some safety of vmcoreinfo, because > vmcoreinfo now is kept far away from other kernel data structures. > > Suggested-by: Eric Biederman <ebied...@xmission.com> > Cc: Michael Holzheu <holz...@linux.vnet.ibm.com> > Cc: Juergen Gross <jgr...@suse.com> > Signed-off-by: Xunlei Pang <xlp...@redhat.com> Now s390 seems to work with the patches. I tested kdump and zfcpdump. 
Tested-by: Michael Holzheu <holz...@linux.vnet.ibm.com> > --- > v3->v4: > -Rebased on the latest linux-next > -Handle S390 vmcoreinfo_note properly > -Handle the newly-added xen/mmu_pv.c > > arch/ia64/kernel/machine_kexec.c | 5 - > arch/s390/kernel/machine_kexec.c | 1 + > arch/s390/kernel/setup.c | 6 -- > arch/x86/kernel/crash.c | 2 +- > arch/x86/xen/mmu_pv.c| 4 ++-- > include/linux/crash_core.h | 2 +- > kernel/crash_core.c | 27 +++ > kernel/ksysfs.c | 2 +- > 8 files changed, 29 insertions(+), 20 deletions(-) > > diff --git a/arch/ia64/kernel/machine_kexec.c > b/arch/ia64/kernel/machine_kexec.c > index 599507b..c14815d 100644 > --- a/arch/ia64/kernel/machine_kexec.c > +++ b/arch/ia64/kernel/machine_kexec.c > @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void) > #endif > } > > -phys_addr_t paddr_vmcoreinfo_note(void) > -{ > - return ia64_tpa((unsigned long)(char *)_note); > -} > - > diff --git a/arch/s390/kernel/machine_kexec.c > b/arch/s390/kernel/machine_kexec.c > index 49a6bd4..3d0b14a 100644 > --- a/arch/s390/kernel/machine_kexec.c > +++ b/arch/s390/kernel/machine_kexec.c > @@ -246,6 +246,7 @@ void arch_crash_save_vmcoreinfo(void) > VMCOREINFO_SYMBOL(lowcore_ptr); > VMCOREINFO_SYMBOL(high_memory); > VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); > + mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); > } > > void machine_shutdown(void) > diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c > index 3ae756c..3d1d808 100644 > --- a/arch/s390/kernel/setup.c > +++ b/arch/s390/kernel/setup.c > @@ -496,11 +496,6 @@ static void __init setup_memory_end(void) > pr_notice("The maximum memory size is %luMB\n", memory_end >> 20); > } > > -static void __init setup_vmcoreinfo(void) > -{ > - mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); > -} > - > #ifdef CONFIG_CRASH_DUMP > > /* > @@ -939,7 +934,6 @@ void __init setup_arch(char **cmdline_p) > #endif > > setup_resources(); > - setup_vmcoreinfo(); > setup_lowcore(); > 
smp_fill_possible_mask(); > cpu_detect_mhz_feature(); > diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c > index 22217ec..44404e2 100644 > --- a/arch/x86/kernel/crash.c > +++ b/arch/x86/kernel/crash.c > @@ -457,7 +457,7 @@ static int prepare_elf64_headers(struct crash_elf_data > *ced, > bufp += sizeof(Elf64_Phdr); > phdr->p_type = PT_NOTE; > phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); > - phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); > + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; > (ehdr->e_phnum)++; > > #ifdef CONFIG_X86_64 > diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c > index 9d9ae66..35543fa 100644 > --- a/arch/x86/xen/mmu_pv.c > +++ b/arch/x86/xen/mmu_pv.c > @@ -2723,8 +2723,8 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, > unsigned int order) > phys_addr_t paddr_vmcoreinfo_note(void) > { > if (xen_pv_domain()) > - return virt_to_machine(_note).maddr; > + return virt_to_machine(vmcoreinfo_note).maddr; > else > - return __pa_symbol(_note); > + return __pa(vmcoreinfo_note); > } > #endif /* CONFIG_KEXEC_CORE */ > diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h > index eb71a70..ba283a2 100644 > --- a/include/linux/crash
Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
Am Thu, 23 Mar 2017 17:23:53 +0800 schrieb Xunlei Pang <xp...@redhat.com>: > On 03/23/2017 at 04:48 AM, Michael Holzheu wrote: > > Am Wed, 22 Mar 2017 12:30:04 +0800 > > schrieb Dave Young <dyo...@redhat.com>: > > > >> On 03/21/17 at 10:18pm, Eric W. Biederman wrote: > >>> Dave Young <dyo...@redhat.com> writes: > >>> > > [snip] > > > >>>> I think makedumpfile is using it, but I also vote to remove the > >>>> CRASHTIME. It is better not to do this while crashing and a makedumpfile > >>>> userspace patch is needed to drop the use of it. > >>>> > >>>>> As we are looking at reliability concerns removing CRASHTIME should make > >>>>> everything in vmcoreinfo a boot time constant. Which should simplify > >>>>> everything considerably. > >>>> It is a nice improvement.. > >>> We also need to take a close look at what s390 is doing with vmcoreinfo. > >>> As apparently it is reading it in a different kind of crashdump process. > >> Yes, need careful review from s390 and maybe ppc64 especially about > >> patch 2/3, better to have comments from IBM about s390 dump tool and ppc > >> fadump. Added more cc. > > On s390 we have at least an issue with patch 1/3. For stand-alone dump > > and also because we create the ELF header for kdump in the new > > kernel we save the pointer to the vmcoreinfo note in the old kernel on a > > defined memory address in our absolute zero lowcore. > > > > This is done in arch/s390/kernel/setup.c: > > > > static void __init setup_vmcoreinfo(void) > > { > > mem_assign_absolute(S390_lowcore.vmcore_info, > > paddr_vmcoreinfo_note()); > > } > > > > Since with patch 1/3 paddr_vmcoreinfo_note() returns NULL at this point in > > time we have a problem here. 
> > > > To solve this - I think - we could move the initialization to > > arch/s390/kernel/machine_kexec.c: > > > > void arch_crash_save_vmcoreinfo(void) > > { > > VMCOREINFO_SYMBOL(lowcore_ptr); > > VMCOREINFO_SYMBOL(high_memory); > > VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); > > mem_assign_absolute(S390_lowcore.vmcore_info, > > paddr_vmcoreinfo_note()); > > } > > > > Probably related to this is my observation that patch 3/3 leads to > > an empty VMCOREINFO note for kdump on s390. The note is there ... > > > > # readelf -n /var/crash/127.0.0.1-2017-03-22-21:14:39/vmcore | grep VMCORE > > VMCOREINFO 0x068e Unknown note type: (0x) > > > > But it contains only zeros. > > Yes, this is a good catch, I will do more tests. Hello Xunlei, After spending some time on this, I now understood the problem: In patch 3/3 you copy vmcoreinfo into the control page before machine_kexec_prepare() is called. For s390 we give back all the crashkernel memory to the hypervisor before the new crashkernel is loaded: /* * Give back memory to hypervisor before new kdump is loaded */ static int machine_kexec_prepare_kdump(void) { #ifdef CONFIG_CRASH_DUMP if (MACHINE_IS_VM) diag10_range(PFN_DOWN(crashk_res.start), PFN_DOWN(crashk_res.end - crashk_res.start + 1)); return 0; #else return -EINVAL; #endif } So after machine_kexec_prepare_kdump() the contents of your control page is gone and therefore the vmcorinfo ELF note contains only zeros. If you call kimage_crash_copy_vmcoreinfo() after machine_kexec_prepare_kdump() the problem should be solved for s390. Regards Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
Am Wed, 22 Mar 2017 12:30:04 +0800 schrieb Dave Young: > On 03/21/17 at 10:18pm, Eric W. Biederman wrote: > > Dave Young writes: > > [snip] > > > I think makedumpfile is using it, but I also vote to remove the > > > CRASHTIME. It is better not to do this while crashing and a makedumpfile > > > userspace patch is needed to drop the use of it. > > > > > >> > > >> As we are looking at reliability concerns removing CRASHTIME should make > > >> everything in vmcoreinfo a boot time constant. Which should simplify > > >> everything considerably. > > > > > > It is a nice improvement.. > > > > We also need to take a close look at what s390 is doing with vmcoreinfo. > > As apparently it is reading it in a different kind of crashdump process. > > Yes, need careful review from s390 and maybe ppc64 especially about > patch 2/3, better to have comments from IBM about s390 dump tool and ppc > fadump. Added more cc. On s390 we have at least an issue with patch 1/3. For stand-alone dump and also because we create the ELF header for kdump in the new kernel we save the pointer to the vmcoreinfo note in the old kernel on a defined memory address in our absolute zero lowcore. This is done in arch/s390/kernel/setup.c: static void __init setup_vmcoreinfo(void) { mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); } Since with patch 1/3 paddr_vmcoreinfo_note() returns NULL at this point in time we have a problem here. To solve this - I think - we could move the initialization to arch/s390/kernel/machine_kexec.c: void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_SYMBOL(lowcore_ptr); VMCOREINFO_SYMBOL(high_memory); VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); } Probably related to this is my observation that patch 3/3 leads to an empty VMCOREINFO note for kdump on s390. The note is there ... 
# readelf -n /var/crash/127.0.0.1-2017-03-22-21:14:39/vmcore | grep VMCORE VMCOREINFO 0x068e Unknown note type: (0x) But it contains only zeros. Unfortunately I have not yet understood the reason for this. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2] s390/kexec: Consolidate crash_map/unmap_reserved_pages() and arch_kexec_protect(unprotect)_crashkres()
Hello Xunlei, On Tue, 5 Apr 2016 15:09:59 +0800 Xunlei Pang <xlp...@redhat.com> wrote: > Commit 3f625002581b ("kexec: introduce a protection mechanism > for the crashkernel reserved memory") is a similar mechanism > for protecting the crash kernel reserved memory to previous > crash_map/unmap_reserved_pages() implementation, the new one > is more generic in name and cleaner in code (besides, some > arch may not be allowed to unmap the pgtable). > > Therefore, this patch consolidates them, and uses the new > arch_kexec_protect(unprotect)_crashkres() to replace former > crash_map/unmap_reserved_pages() which by now has been only > used by S390. > > The consolidation work needs the crash memory to be mapped > initially, so get rid of S390 crash kernel memblock removal > in reserve_crashkernel(). If you fix this comment, I am fine with your patch. Acked-by: Michael Holzheu <holz...@linux.vnet.ibm.com> ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] s390/kexec: Consolidate crash_map/unmap_reserved_pages() and arch_kexec_protect(unprotect)_crashkres()
Hello Xunlei again, Some initial comments below... On Wed, 30 Mar 2016 19:47:21 +0800 Xunlei Pang <xlp...@redhat.com> wrote: > Commit 3f625002581b ("kexec: introduce a protection mechanism > for the crashkernel reserved memory") is a similar mechanism > for protecting the crash kernel reserved memory to previous > crash_map/unmap_reserved_pages() implementation, the new one > is more generic in name and cleaner in code (besides, some > arch may not be allowed to unmap the pgtable). > > Therefore, this patch consolidates them, and uses the new > arch_kexec_protect(unprotect)_crashkres() to replace former > crash_map/unmap_reserved_pages() which by now has been only > used by S390. > > The consolidation work needs the crash memory to be mapped > initially, so get rid of S390 crash kernel memblock removal > in reserve_crashkernel(). Once kdump kernel is loaded, the > new arch_kexec_protect_crashkres() implemented for S390 will > actually unmap the pgtable like before. > > The patch also fixed a S390 crash_shrink_memory() bad page warning > in passing due to not using memblock_reserve(): > BUG: Bad page state in process bash pfn:7e400 > page:03d101f9 count:0 mapcount:1 mapping: (null) index:0x0 > flags: 0x0() > page dumped because: nonzero mapcount > Modules linked in: ghash_s390 prng aes_s390 des_s390 des_generic > CPU: 0 PID: 1558 Comm: bash Not tainted 4.6.0-rc1-next-20160327 #1 >73007a58 73007ae8 0002 >73007b88 73007b00 73007b00 0022cf4e >00a579b8 007b0dd6 00791a8c >000b >73007b48 73007ae8 >070003d10001 00112f20 73007ae8 73007b48 > Call Trace: > ([<00112e0c>] show_trace+0x5c/0x78) > ([<00112ed4>] show_stack+0x6c/0xe8) > ([<003f28dc>] dump_stack+0x84/0xb8) > ([<00235454>] bad_page+0xec/0x158) > ([<002357a4>] free_pages_prepare+0x2e4/0x308) > ([<002383a2>] free_hot_cold_page+0x42/0x198) > ([<001c45e0>] crash_free_reserved_phys_range+0x60/0x88) > ([<001c49b0>] crash_shrink_memory+0xb8/0x1a0) > ([<0015bcae>] kexec_crash_size_store+0x46/0x60) > ([<0033d326>] 
kernfs_fop_write+0x136/0x180) > ([<002b253c>] __vfs_write+0x3c/0x100) > ([<002b35ce>] vfs_write+0x8e/0x190) > ([<002b4ca0>] SyS_write+0x60/0xd0) > ([<0063067c>] system_call+0x244/0x264) > > Cc: Michael Holzheu <holz...@linux.vnet.ibm.com> > Signed-off-by: Xunlei Pang <xlp...@redhat.com> > --- > Tested kexec/kdump on S390x > > arch/s390/kernel/machine_kexec.c | 86 > ++-- > arch/s390/kernel/setup.c | 7 ++-- > include/linux/kexec.h| 2 - > kernel/kexec.c | 12 -- > kernel/kexec_core.c | 11 + > 5 files changed, 54 insertions(+), 64 deletions(-) > > diff --git a/arch/s390/kernel/machine_kexec.c > b/arch/s390/kernel/machine_kexec.c > index 2f1b721..1ec6cfc 100644 > --- a/arch/s390/kernel/machine_kexec.c > +++ b/arch/s390/kernel/machine_kexec.c > @@ -35,6 +35,52 @@ extern const unsigned long long relocate_kernel_len; > #ifdef CONFIG_CRASH_DUMP > > /* > + * Map or unmap crashkernel memory > + */ > +static void crash_map_pages(int enable) > +{ > + unsigned long size = resource_size(_res); > + > + BUG_ON(crashk_res.start % KEXEC_CRASH_MEM_ALIGN || > +size % KEXEC_CRASH_MEM_ALIGN); > + if (enable) > + vmem_add_mapping(crashk_res.start, size); > + else { > + vmem_remove_mapping(crashk_res.start, size); > + if (size) > + os_info_crashkernel_add(crashk_res.start, size); > + else > + os_info_crashkernel_add(0, 0); > + } > +} Please do not move these functions in the file. If you leave it at their old location, the patch will be *much* smaller. > + > +/* > + * Map crashkernel memory > + */ > +static void crash_map_reserved_pages(void) > +{ > + crash_map_pages(1); > +} > + > +/* > + * Unmap crashkernel memory > + */ > +static void crash_unmap_reserved_pages(void) > +{ > + crash_map_pages(0); > +} > + > +void arch_kexec_protect_crashkres(void) > +{ > + crash_unmap_reserved_pages(); > +} > + > +void arch_kexec_unprotect_crashkres(void) > +{ > + crash_map_reserved_pages();
Re: [PATCH] s390/kexec: Consolidate crash_map/unmap_reserved_pages() and arch_kexec_protect(unprotect)_crashkres()
Hello Xunlei, This patch can has potential to create some funny side effects. Especially the change from memblock_remove() to memblock_reserve() and the later call of reserve_crashkernel(). Give me some time. I will look into this next week. Michael On Wed, 30 Mar 2016 19:47:21 +0800 Xunlei Pang <xlp...@redhat.com> wrote: > Commit 3f625002581b ("kexec: introduce a protection mechanism > for the crashkernel reserved memory") is a similar mechanism > for protecting the crash kernel reserved memory to previous > crash_map/unmap_reserved_pages() implementation, the new one > is more generic in name and cleaner in code (besides, some > arch may not be allowed to unmap the pgtable). > > Therefore, this patch consolidates them, and uses the new > arch_kexec_protect(unprotect)_crashkres() to replace former > crash_map/unmap_reserved_pages() which by now has been only > used by S390. > > The consolidation work needs the crash memory to be mapped > initially, so get rid of S390 crash kernel memblock removal > in reserve_crashkernel(). Once kdump kernel is loaded, the > new arch_kexec_protect_crashkres() implemented for S390 will > actually unmap the pgtable like before. 
> > The patch also fixed a S390 crash_shrink_memory() bad page warning > in passing due to not using memblock_reserve(): > BUG: Bad page state in process bash pfn:7e400 > page:03d101f9 count:0 mapcount:1 mapping: (null) index:0x0 > flags: 0x0() > page dumped because: nonzero mapcount > Modules linked in: ghash_s390 prng aes_s390 des_s390 des_generic > CPU: 0 PID: 1558 Comm: bash Not tainted 4.6.0-rc1-next-20160327 #1 >73007a58 73007ae8 0002 >73007b88 73007b00 73007b00 0022cf4e >00a579b8 007b0dd6 00791a8c >000b >73007b48 73007ae8 >070003d10001 00112f20 73007ae8 73007b48 > Call Trace: > ([<00112e0c>] show_trace+0x5c/0x78) > ([<00112ed4>] show_stack+0x6c/0xe8) > ([<003f28dc>] dump_stack+0x84/0xb8) > ([<00235454>] bad_page+0xec/0x158) > ([<002357a4>] free_pages_prepare+0x2e4/0x308) > ([<002383a2>] free_hot_cold_page+0x42/0x198) > ([<001c45e0>] crash_free_reserved_phys_range+0x60/0x88) > ([<001c49b0>] crash_shrink_memory+0xb8/0x1a0) > ([<0015bcae>] kexec_crash_size_store+0x46/0x60) > ([<0033d326>] kernfs_fop_write+0x136/0x180) > ([<002b253c>] __vfs_write+0x3c/0x100) > ([<002b35ce>] vfs_write+0x8e/0x190) > ([<002b4ca0>] SyS_write+0x60/0xd0) > ([<0063067c>] system_call+0x244/0x264) > > Cc: Michael Holzheu <holz...@linux.vnet.ibm.com> > Signed-off-by: Xunlei Pang <xlp...@redhat.com> > --- > Tested kexec/kdump on S390x > > arch/s390/kernel/machine_kexec.c | 86 > ++-- > arch/s390/kernel/setup.c | 7 ++-- > include/linux/kexec.h| 2 - > kernel/kexec.c | 12 -- > kernel/kexec_core.c | 11 + > 5 files changed, 54 insertions(+), 64 deletions(-) > > diff --git a/arch/s390/kernel/machine_kexec.c > b/arch/s390/kernel/machine_kexec.c > index 2f1b721..1ec6cfc 100644 > --- a/arch/s390/kernel/machine_kexec.c > +++ b/arch/s390/kernel/machine_kexec.c > @@ -35,6 +35,52 @@ extern const unsigned long long relocate_kernel_len; > #ifdef CONFIG_CRASH_DUMP > > /* > + * Map or unmap crashkernel memory > + */ > +static void crash_map_pages(int enable) > +{ > + unsigned long size = 
resource_size(_res); > + > + BUG_ON(crashk_res.start % KEXEC_CRASH_MEM_ALIGN || > +size % KEXEC_CRASH_MEM_ALIGN); > + if (enable) > + vmem_add_mapping(crashk_res.start, size); > + else { > + vmem_remove_mapping(crashk_res.start, size); > + if (size) > + os_info_crashkernel_add(crashk_res.start, size); > + else > + os_info_crashkernel_add(0, 0); > + } > +} > + > +/* > + * Map crashkernel memory > + */ > +static void crash_map_reserved_pages(void) > +{ > + crash_map_pages(1); > +} > + > +/* > + * Unmap crashkernel memory > + */ > +static void crash_unmap_reserved_pages(void) > +{ > + crash_map_pages(0); > +} > + > +void arch_kexec_protect_crashkres(void) > +{ > + crash_unmap_reserved_pages(); > +} > + > +void arch_kexec_unpr
Re: [PATCH] kexec: unmap reserved pages for each error-return way
On Thu, 28 Jan 2016 19:56:56 +0800 Xunlei Pang <xp...@redhat.com> wrote: > On 2016/01/28 at 18:32, Michael Holzheu wrote: > > On Wed, 27 Jan 2016 11:15:46 -0800 > > Andrew Morton <a...@linux-foundation.org> wrote: > > > >> On Wed, 27 Jan 2016 14:48:31 +0300 Dmitry Safonov <dsafo...@virtuozzo.com> > >> wrote: > >> > >>> For allocation of kimage failure or kexec_prepare or load segments > >>> errors there is no need to keep crashkernel memory mapped. > >>> It will affect only s390 as map/unmap hook defined only for it. > >>> As on unmap s390 also changes os_info structure let's check return code > >>> and add info only on success. > >>> > >> This conflicts (both mechanically and somewhat conceptually) with > >> Xunlei Pang's "kexec: Introduce a protection mechanism for the > >> crashkernel reserved memory" and "kexec: provide > >> arch_kexec_protect(unprotect)_crashkres()". > >> > >> http://ozlabs.org/~akpm/mmots/broken-out/kexec-introduce-a-protection-mechanism-for-the-crashkernel-reserved-memory.patch > >> http://ozlabs.org/~akpm/mmots/broken-out/kexec-introduce-a-protection-mechanism-for-the-crashkernel-reserved-memory-v4.patch > >> > >> and > >> > >> http://ozlabs.org/~akpm/mmots/broken-out/kexec-provide-arch_kexec_protectunprotect_crashkres.patch > >> http://ozlabs.org/~akpm/mmots/broken-out/kexec-provide-arch_kexec_protectunprotect_crashkres-v4.patch > > Hmm, It looks to me that arch_kexec_(un)protect_crashkres() has exactly > > the same semantics as crash_(un)map_reserved_pages(). > > > > On s390 we don't have the crashkernel memory mapped and therefore need > > crash_map_reserved_pages() before loading something into crashkernel > > memory. > > I don't know s390, just curious, if s390 doesn't have crash kernel memory > mapped, > what's the purpose of the commit(558df7209e) for s390 as the reserved crash > memory > with no kernel mapping already means the protection is on? 
When we reserve crashkernel memory on s390 ("crashkernel=" kernel parameter), we create a memory hole without page tables. Commit (558df7209e) was necessary to load a kernel/ramdisk into the memory hole with the kexec() system call. We create a temporary mapping with crash_map_reserved_pages(), then copy the kernel/ramdisk and finally remove the mapping again via crash_unmap_reserved_pages(). We did that all in order to protect the preloaded kernel and ramdisk. I forgot the details why commit(558df7209e) wasn't necessary before. AFAIK it became necessary because of some kdump (mmap?) rework. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] kexec: unmap reserved pages for each error-return way
On Wed, 27 Jan 2016 11:15:46 -0800 Andrew Mortonwrote: > On Wed, 27 Jan 2016 14:48:31 +0300 Dmitry Safonov > wrote: > > > For allocation of kimage failure or kexec_prepare or load segments > > errors there is no need to keep crashkernel memory mapped. > > It will affect only s390 as map/unmap hook defined only for it. > > As on unmap s390 also changes os_info structure let's check return code > > and add info only on success. > > > > This conflicts (both mechanically and somewhat conceptually) with > Xunlei Pang's "kexec: Introduce a protection mechanism for the > crashkernel reserved memory" and "kexec: provide > arch_kexec_protect(unprotect)_crashkres()". > > http://ozlabs.org/~akpm/mmots/broken-out/kexec-introduce-a-protection-mechanism-for-the-crashkernel-reserved-memory.patch > http://ozlabs.org/~akpm/mmots/broken-out/kexec-introduce-a-protection-mechanism-for-the-crashkernel-reserved-memory-v4.patch > > and > > http://ozlabs.org/~akpm/mmots/broken-out/kexec-provide-arch_kexec_protectunprotect_crashkres.patch > http://ozlabs.org/~akpm/mmots/broken-out/kexec-provide-arch_kexec_protectunprotect_crashkres-v4.patch Hmm, It looks to me that arch_kexec_(un)protect_crashkres() has exactly the same semantics as crash_(un)map_reserved_pages(). On s390 we don't have the crashkernel memory mapped and therefore need crash_map_reserved_pages() before loading something into crashkernel memory. Perhaps I missed something? Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] kexec: unmap reserved pages for each error-return way
On Thu, 28 Jan 2016 21:12:54 +0800 Xunlei Pang <xp...@redhat.com> wrote: > On 2016/01/28 at 20:44, Michael Holzheu wrote: > > On Thu, 28 Jan 2016 19:56:56 +0800 > > Xunlei Pang <xp...@redhat.com> wrote: > > > >> On 2016/01/28 at 18:32, Michael Holzheu wrote: > >>> On Wed, 27 Jan 2016 11:15:46 -0800 > >>> Andrew Morton <a...@linux-foundation.org> wrote: > >>> > >>>> On Wed, 27 Jan 2016 14:48:31 +0300 Dmitry Safonov > >>>> <dsafo...@virtuozzo.com> wrote: > >>>> > >>>>> For allocation of kimage failure or kexec_prepare or load segments > >>>>> errors there is no need to keep crashkernel memory mapped. > >>>>> It will affect only s390 as map/unmap hook defined only for it. > >>>>> As on unmap s390 also changes os_info structure let's check return code > >>>>> and add info only on success. > >>>>> > >>>> This conflicts (both mechanically and somewhat conceptually) with > >>>> Xunlei Pang's "kexec: Introduce a protection mechanism for the > >>>> crashkernel reserved memory" and "kexec: provide > >>>> arch_kexec_protect(unprotect)_crashkres()". > >>>> > >>>> http://ozlabs.org/~akpm/mmots/broken-out/kexec-introduce-a-protection-mechanism-for-the-crashkernel-reserved-memory.patch > >>>> http://ozlabs.org/~akpm/mmots/broken-out/kexec-introduce-a-protection-mechanism-for-the-crashkernel-reserved-memory-v4.patch > >>>> > >>>> and > >>>> > >>>> http://ozlabs.org/~akpm/mmots/broken-out/kexec-provide-arch_kexec_protectunprotect_crashkres.patch > >>>> http://ozlabs.org/~akpm/mmots/broken-out/kexec-provide-arch_kexec_protectunprotect_crashkres-v4.patch > >>> Hmm, It looks to me that arch_kexec_(un)protect_crashkres() has exactly > >>> the same semantics as crash_(un)map_reserved_pages(). > >>> > >>> On s390 we don't have the crashkernel memory mapped and therefore need > >>> crash_map_reserved_pages() before loading something into crashkernel > >>> memory. 
> >> I don't know s390, just curious, if s390 doesn't have crash kernel memory > >> mapped, > >> what's the purpose of the commit(558df7209e) for s390 as the reserved > >> crash memory > >> with no kernel mapping already means the protection is on? > > When we reserve crashkernel memory on s390 ("crashkernel=" kernel > > parameter), > > we create a memory hole without page tables. > > > > Commit (558df7209e) was necessary to load a kernel/ramdisk into > > the memory hole with the kexec() system call. > > > > We create a temporary mapping with crash_map_reserved_pages(), then > > copy the kernel/ramdisk and finally remove the mapping again > > via crash_unmap_reserved_pages(). > > Thanks for the explanation. > So, on s390 the physical memory address has the same value as its kernel > virtual address, > and kmap() actually returns the value of the physical address of the page, > right? Correct. On s390 kmap() always return the physical address of the page. We have an 1:1 mapping for all the physical memory. For this area virtual=real. In addition to that we have the vmalloc area above the 1:1 mapping where some of the memory is mapped a second time. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v2] kexec: fix mmap return code handling
Hi Simon again, After a bit more thinking: In theory mmap() could also return NULL. Therefore the following fix is probably the better one ... --- Subject: [PATCH] kexec: fix mmap return code handling When mmap fails, MAP_FAILED (that is, (void *) -1) is returned. Currently we assume that NULL is returned. Fix this and add the MAP_FAILED check. Fixes: 95741713e790 ("kexec/s390x: use mmap instead of read for slurp_file") Signed-off-by: Michael Holzheu <holz...@linux.vnet.ibm.com> diff --git a/kexec/kexec.c b/kexec/kexec.c index cf6e03d..f0bd527 100644 --- a/kexec/kexec.c +++ b/kexec/kexec.c @@ -573,7 +573,7 @@ static char *slurp_file_generic(const char *filename, off_t *r_size, buf = slurp_fd(fd, filename, size, &nread); } } - if (!buf) + if ((use_mmap && (buf == MAP_FAILED)) || (!use_mmap && (buf == NULL))) die("Cannot read %s", filename); if (nread != size) -- 2.3.9 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2] kexec: fix mmap return code handling
On Thu, 26 Nov 2015 19:02:28 +0100 Petr Tesarik <ptesa...@suse.cz> wrote: > On Thu, 26 Nov 2015 18:32:31 +0100 > Michael Holzheu <holz...@linux.vnet.ibm.com> wrote: > > > Hi Simon again, > > > > After a bit more thinking: In theory mmap() could also return NULL. > > Therefore the following fix is probably the better one ... > > No, if you let the kernel choose the address (i.e. call mmap with NULL > addr), it will return at least PAGE_SIZE (and a higher limit is usually > enforced by sys.vm.mmap_min_addr sysctl). Admittedly the limit is set > in arch-specific code, so theoretically there can be an architecture > which sets the limit to 0, but I doubt it, because it would break too > many assumptions in user space (for example gcc assumes that > dereferencing a NULL pointer terminates a process). > > In short, this other fix is just as good as the previous one. Hi Petr, Thanks for clarification! I still would vote for the second one ;-) Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH] kexec: fix mmap return code handling
Hi Simon, I made a mistake in my mmap patch, sorry for that! When mmap fails, MAP_FAILED (that is, (void *) -1) is returned. Currently we assume that NULL is returned. Fix this and add the MAP_FAILED check. Fixes: 95741713e790 ("kexec/s390x: use mmap instead of read for slurp_file") Signed-off-by: Michael Holzheu <holz...@linux.vnet.ibm.com> --- kexec/kexec.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kexec/kexec.c b/kexec/kexec.c index cf6e03d..02285fb 100644 --- a/kexec/kexec.c +++ b/kexec/kexec.c @@ -568,6 +568,8 @@ static char *slurp_file_generic(const char *filename, off_t *r_size, if (use_mmap) { buf = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + if (buf == MAP_FAILED) + buf = NULL; nread = size; } else { buf = slurp_fd(fd, filename, size, &nread); -- 2.3.9 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2] kexec/s390x: use mmap instead of read for slurp_file()
On Fri, 30 Oct 2015 10:03:22 +0800 Dave Youngwrote: > Hi, > [snip] > > --- a/kexec/arch/s390/kexec-image.c > > +++ b/kexec/arch/s390/kexec-image.c > > @@ -101,7 +101,7 @@ image_s390_load(int argc, char **argv, c > > * we load the ramdisk directly behind the image with 1 MiB alignment. > > */ > > if (ramdisk) { > > - rd_buffer = slurp_file(ramdisk, _len); > > + rd_buffer = slurp_file_mmap(ramdisk, _len); > > if (rd_buffer == NULL) { > > fprintf(stderr, "Could not read ramdisk.\n"); > > return -1; > > --- a/kexec/kexec.c > > +++ b/kexec/kexec.c > > @@ -29,6 +29,7 @@ > > #include > > #include > > #include > > +#include > > #include > > #include > > #ifndef _O_BINARY > > @@ -514,7 +515,8 @@ static char *slurp_fd(int fd, const char > > return buf; > > } > > > > -char *slurp_file(const char *filename, off_t *r_size) > > +static char *slurp_file_generic(const char *filename, off_t *r_size, > > + int use_mmap) > > Add a function comment about the argument use_mmap so that one knows > that it will be not used for character devices?# Good idea, I put comments before slurp_file() and slurp_file_mmap(). > > > { > > int fd; > > char *buf; > > @@ -552,11 +554,17 @@ char *slurp_file(const char *filename, o > > if (err < 0) > > die("Can not seek to the begin of file %s: %s\n", > > filename, strerror(errno)); > > + buf = slurp_fd(fd, filename, size, ); > > } else { > > size = stats.st_size; > > + if (use_mmap) { > > + buf = mmap(NULL, size, PROT_READ|PROT_WRITE, > > + MAP_PRIVATE, fd, 0); > > Probably map it as readonly is enough. Although I agree with you for the ramdisk case, I nevertheless would prefer to keep it writable to have the same semantics as for the malloc case. So do you agree with the patch below? 
Michael --- kexec/arch/s390/kexec-image.c |2 +- kexec/kexec.c | 31 --- kexec/kexec.h |1 + 3 files changed, 30 insertions(+), 4 deletions(-) --- a/kexec/arch/s390/kexec-image.c +++ b/kexec/arch/s390/kexec-image.c @@ -101,7 +101,7 @@ image_s390_load(int argc, char **argv, c * we load the ramdisk directly behind the image with 1 MiB alignment. */ if (ramdisk) { - rd_buffer = slurp_file(ramdisk, _len); + rd_buffer = slurp_file_mmap(ramdisk, _len); if (rd_buffer == NULL) { fprintf(stderr, "Could not read ramdisk.\n"); return -1; --- a/kexec/kexec.c +++ b/kexec/kexec.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #ifndef _O_BINARY @@ -514,7 +515,8 @@ static char *slurp_fd(int fd, const char return buf; } -char *slurp_file(const char *filename, off_t *r_size) +static char *slurp_file_generic(const char *filename, off_t *r_size, + int use_mmap) { int fd; char *buf; @@ -552,11 +554,17 @@ char *slurp_file(const char *filename, o if (err < 0) die("Can not seek to the begin of file %s: %s\n", filename, strerror(errno)); + buf = slurp_fd(fd, filename, size, ); } else { size = stats.st_size; + if (use_mmap) { + buf = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE, fd, 0); + nread = size; + } else { + buf = slurp_fd(fd, filename, size, ); + } } - - buf = slurp_fd(fd, filename, size, ); if (!buf) die("Cannot read %s", filename); @@ -567,6 +575,23 @@ char *slurp_file(const char *filename, o return buf; } +/* + * Read file into malloced buffer + */ +char *slurp_file(const char *filename, off_t *r_size) +{ + return slurp_file_generic(filename, r_size, 0); +} + +/* + * Map "normal" file or read "character device" into malloced buffer. + * You must not use free, realloc, etc. with the returned buffer. + */ +char *slurp_file_mmap(const char *filename, off_t *r_size) +{ + return slurp_file_generic(filename, r_size, 1); +} + /* This functions reads either specified number of bytes from the file or lesser if EOF is met. 
*/ --- a/kexec/kexec.h +++ b/kexec/kexec.h @@ -253,6 +253,7 @@ extern void die(const char *fmt, ...) extern void *xmalloc(size_t size); extern void *xrealloc(void *ptr, size_t size); extern char *slurp_file(const char *filename, off_t *r_size); +extern char *slurp_file_mmap(const char *filename, off_t *r_size); extern char *slurp_file_len(const char *filename, off_t size, off_t *nread);
[PATCH v3] kexec/s390x: use mmap instead of read for slurp_file()
The slurp_fd() function allocates memory and uses the read() system call. This results in double memory consumption for image and initrd: 1) Memory allocated in user space by the kexec tool 2) Memory allocated in kernel by the kexec() system call The following illustrates the use case that we have on s390x: 1) Boot a 4 GB Linux system 2) Copy kernel and 1,5 GB ramdisk from external source into tmpfs (ram) 3) Use kexec to boot kernel with ramdisk Therefore for kexec runtime we need: 1,5 GB (tmpfs) + 1,5 GB (kexec malloc) + 1,5 GB (kernel memory) = 4,5 GB This patch introduces slurp_file_mmap() which for "normal" files uses mmap() instead of malloc()/read(). This reduces the runtime memory consumption of the kexec tool as follows: 1,5 GB (tmpfs) + 1,5 GB (kernel memory) = 3 GB Signed-off-by: Michael Holzheu <holz...@linux.vnet.ibm.com> Reviewed-by: Dave Young <dyo...@redhat.com> --- kexec/arch/s390/kexec-image.c |2 +- kexec/kexec.c | 31 --- kexec/kexec.h |1 + 3 files changed, 30 insertions(+), 4 deletions(-) --- a/kexec/arch/s390/kexec-image.c +++ b/kexec/arch/s390/kexec-image.c @@ -101,7 +101,7 @@ image_s390_load(int argc, char **argv, c * we load the ramdisk directly behind the image with 1 MiB alignment. 
*/ if (ramdisk) { - rd_buffer = slurp_file(ramdisk, _len); + rd_buffer = slurp_file_mmap(ramdisk, _len); if (rd_buffer == NULL) { fprintf(stderr, "Could not read ramdisk.\n"); return -1; --- a/kexec/kexec.c +++ b/kexec/kexec.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #ifndef _O_BINARY @@ -514,7 +515,8 @@ static char *slurp_fd(int fd, const char return buf; } -char *slurp_file(const char *filename, off_t *r_size) +static char *slurp_file_generic(const char *filename, off_t *r_size, + int use_mmap) { int fd; char *buf; @@ -552,11 +554,17 @@ char *slurp_file(const char *filename, o if (err < 0) die("Can not seek to the begin of file %s: %s\n", filename, strerror(errno)); + buf = slurp_fd(fd, filename, size, ); } else { size = stats.st_size; + if (use_mmap) { + buf = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE, fd, 0); + nread = size; + } else { + buf = slurp_fd(fd, filename, size, ); + } } - - buf = slurp_fd(fd, filename, size, ); if (!buf) die("Cannot read %s", filename); @@ -567,6 +575,23 @@ char *slurp_file(const char *filename, o return buf; } +/* + * Read file into malloced buffer. + */ +char *slurp_file(const char *filename, off_t *r_size) +{ + return slurp_file_generic(filename, r_size, 0); +} + +/* + * Map "normal" file or read "character device" into malloced buffer. + * You must not use free, realloc, etc. for the returned buffer. + */ +char *slurp_file_mmap(const char *filename, off_t *r_size) +{ + return slurp_file_generic(filename, r_size, 1); +} + /* This functions reads either specified number of bytes from the file or lesser if EOF is met. */ --- a/kexec/kexec.h +++ b/kexec/kexec.h @@ -253,6 +253,7 @@ extern void die(const char *fmt, ...) 
extern void *xmalloc(size_t size); extern void *xrealloc(void *ptr, size_t size); extern char *slurp_file(const char *filename, off_t *r_size); +extern char *slurp_file_mmap(const char *filename, off_t *r_size); extern char *slurp_file_len(const char *filename, off_t size, off_t *nread); extern char *slurp_decompress_file(const char *filename, off_t *r_size); extern unsigned long virt_to_phys(unsigned long addr); ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2] kexec/s390x: use mmap instead of read for slurp_file()
On Thu, 29 Oct 2015 14:37:10 +0800 Dave Young <dyo...@redhat.com> wrote: > On 10/28/15 at 10:57am, Michael Holzheu wrote: > > On Wed, 28 Oct 2015 14:46:23 +0800 > > Dave Young <dyo...@redhat.com> wrote: > > > > > Hi, Michael > > > > > > > @@ -552,11 +563,18 @@ char *slurp_file(const char *filename, o > > > > if (err < 0) > > > > die("Can not seek to the begin of file %s: > > > > %s\n", > > > > filename, strerror(errno)); > > > > + buf = slurp_fd(fd, filename, size, , use_mmap); > > > > } else { > > > > size = stats.st_size; > > > > + if (use_mmap) { > > > > + buf = mmap(NULL, size, PROT_READ | PROT_WRITE, > > > > + MAP_PRIVATE, fd, 0); > > > > + nread = stats.st_size; > > > > + } else { > > > > + buf = slurp_fd(fd, filename, size, , 0); > > > > + } > > > > } > > > > > > Drop above changes and replace below lines with an extra use_mmap argument > > > should be enough? > > > > > > - buf = slurp_fd(fd, filename, size, ); > > > [snip] > > > > Hmm, I don't think so. > > > > In case of non-character devices I either mmap the file directly > > (use_mmap=true) > > or use "slurp_fd()" (use_mmap=false). So I can't unconditionaly use > > slurp_fd(). > > How about handle these in slurp_fd only? Directly return mmapped buf in case > use_mmap=1 there. I do not understand why use_mmap=1 but you still call read > syscall to read data into the mmapped buffer.. For the character device case (S_ISCHR(stats.st_mode)) we have to use the read() syscall path. With my patch I wanted to ensure that when calling slurp_file_mmap() we always return mmaped storage. Otherwise slurp_file_mmap() would return mmaped storage for files and malloced memory for character devices. As already noted this is only to be consistent and is not really required for our use case. So would you prefer the patch below? 
Michael --- kexec/arch/s390/kexec-image.c |2 +- kexec/kexec.c | 24 +--- kexec/kexec.h |1 + 3 files changed, 23 insertions(+), 4 deletions(-) --- a/kexec/arch/s390/kexec-image.c +++ b/kexec/arch/s390/kexec-image.c @@ -101,7 +101,7 @@ image_s390_load(int argc, char **argv, c * we load the ramdisk directly behind the image with 1 MiB alignment. */ if (ramdisk) { - rd_buffer = slurp_file(ramdisk, _len); + rd_buffer = slurp_file_mmap(ramdisk, _len); if (rd_buffer == NULL) { fprintf(stderr, "Could not read ramdisk.\n"); return -1; --- a/kexec/kexec.c +++ b/kexec/kexec.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #ifndef _O_BINARY @@ -514,7 +515,8 @@ static char *slurp_fd(int fd, const char return buf; } -char *slurp_file(const char *filename, off_t *r_size) +static char *slurp_file_generic(const char *filename, off_t *r_size, + int use_mmap) { int fd; char *buf; @@ -552,11 +554,17 @@ char *slurp_file(const char *filename, o if (err < 0) die("Can not seek to the begin of file %s: %s\n", filename, strerror(errno)); + buf = slurp_fd(fd, filename, size, ); } else { size = stats.st_size; + if (use_mmap) { + buf = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE, fd, 0); + nread = size; + } else { + buf = slurp_fd(fd, filename, size, ); + } } - - buf = slurp_fd(fd, filename, size, ); if (!buf) die("Cannot read %s", filename); @@ -567,6 +575,16 @@ char *slurp_file(const char *filename, o return buf; } +char *slurp_file(const char *filename, off_t *r_size) +{ + return slurp_file_generic(filename, r_size, 0); +} + +char *slurp_file_mmap(const char *filename, off_t *r_size) +{ + return slurp_file_generic(filename, r_size, 1); +} + /* This functions reads either specified number of bytes from the file or lesser if EOF is met. */ --- a/kexec/kexec.h +++ b/kexec/kexec.h @@ -253,6 +253,7 @@ extern void die(const char *fmt, ...) extern void
Re: [PATCH v2] kexec/s390x: use mmap instead of read for slurp_file()
On Wed, 28 Oct 2015 14:46:23 +0800 Dave Young wrote: > Hi, Michael > > > @@ -552,11 +563,18 @@ char *slurp_file(const char *filename, o > > if (err < 0) > > die("Can not seek to the begin of file %s: %s\n", > > filename, strerror(errno)); > > + buf = slurp_fd(fd, filename, size, &nread, use_mmap); > > } else { > > size = stats.st_size; > > + if (use_mmap) { > > + buf = mmap(NULL, size, PROT_READ | PROT_WRITE, > > + MAP_PRIVATE, fd, 0); > > + nread = stats.st_size; > > + } else { > > + buf = slurp_fd(fd, filename, size, &nread, 0); > > + } > > } > > Drop above changes and replace below lines with an extra use_mmap argument > should be enough? > > - buf = slurp_fd(fd, filename, size, &nread); > [snip] Hmm, I don't think so. In case of non-character devices I either mmap the file directly (use_mmap=true) or use "slurp_fd()" (use_mmap=false). So I can't unconditionally use slurp_fd(). The change in slurp_fd() to use anonymous mmap in case of use_mmap=true is not really necessary. I did it nevertheless for consistency. This ensures that the slurp_file_mmap() function *always* returns mmaped memory. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v2] kexec/s390x: use mmap instead of read for slurp_file()
On Mon, 26 Oct 2015 15:31:39 +0800 Dave Young <dyo...@redhat.com> wrote: [snip] > IMHO adding a slurp_file_mmap for s390x use is a better way since the > huge initramfs is not a general case. Ok, what about the following patch: --- [PATCH] kexec/s390x: use mmap instead of read for slurp_file() The slurp_fd() function allocates memory and uses the read() system call. This results in double memory consumption for image and initrd: 1) Memory allocated in user space by the kexec tool 2) Memory allocated in kernel by the kexec() system call Therefore use mmap() to reduce the runtime memory consumption of the kexec tool. The following use case illustrates the usefulness of this patch a bit more: 1) Boot a 4 GB Linux system 2) Read kernel and 1,5 GB ramdisk from external source into local tmpfs (ram) 3) kexec the kernel and ramdisk Without this patch for the kexec runtime we need: 1,5 GB (tmpfs) + 1,5 GB (kexec malloc) + 1,5 GB (kernel memory) = 4,5 GB Signed-off-by: Michael Holzheu <holz...@linux.vnet.ibm.com> --- kexec/arch/s390/kexec-image.c |2 +- kexec/kexec.c | 40 ++-- kexec/kexec.h |1 + 3 files changed, 36 insertions(+), 7 deletions(-) --- a/kexec/arch/s390/kexec-image.c +++ b/kexec/arch/s390/kexec-image.c @@ -101,7 +101,7 @@ image_s390_load(int argc, char **argv, c * we load the ramdisk directly behind the image with 1 MiB alignment. 
*/ if (ramdisk) { - rd_buffer = slurp_file(ramdisk, _len); + rd_buffer = slurp_file_mmap(ramdisk, _len); if (rd_buffer == NULL) { fprintf(stderr, "Could not read ramdisk.\n"); return -1; --- a/kexec/kexec.c +++ b/kexec/kexec.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #ifndef _O_BINARY @@ -481,13 +482,19 @@ static int add_backup_segments(struct ke return 0; } -static char *slurp_fd(int fd, const char *filename, off_t size, off_t *nread) +static char *slurp_fd(int fd, const char *filename, off_t size, off_t *nread, + int use_mmap) { char *buf; off_t progress; ssize_t result; - buf = xmalloc(size); + if (use_mmap) { + buf = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_ANON|MAP_PRIVATE, -1, 0); + } else { + buf = xmalloc(size); + } progress = 0; while (progress < size) { result = read(fd, buf + progress, size - progress); @@ -496,7 +503,10 @@ static char *slurp_fd(int fd, const char continue; fprintf(stderr, "Read on %s failed: %s\n", filename, strerror(errno)); - free(buf); + if (use_mmap) + munmap(buf, size); + else + free(buf); close(fd); return NULL; } @@ -514,7 +524,8 @@ static char *slurp_fd(int fd, const char return buf; } -char *slurp_file(const char *filename, off_t *r_size) +static char *slurp_file_generic(const char *filename, off_t *r_size, + int use_mmap) { int fd; char *buf; @@ -552,11 +563,18 @@ char *slurp_file(const char *filename, o if (err < 0) die("Can not seek to the begin of file %s: %s\n", filename, strerror(errno)); + buf = slurp_fd(fd, filename, size, , use_mmap); } else { size = stats.st_size; + if (use_mmap) { + buf = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + nread = stats.st_size; + } else { + buf = slurp_fd(fd, filename, size, , 0); + } } - buf = slurp_fd(fd, filename, size, ); if (!buf) die("Cannot read %s", filename); @@ -567,6 +585,16 @@ char *slurp_file(const char *filename, o return buf; } +char *slurp_file(const char *filename, off_t *r_size) +{ + return 
slurp_file_generic(filename, r_size, 0); +} + +char *slurp_file_mmap(const char *filename, off_t *r_size) +{ + return slurp_file_generic(filename, r_size, 1); +} + /* This functions reads either specified number of bytes from the file or lesser if EOF is met. */ @@ -583,7 +611,7 @@ char *slurp_file_len(const char *filenam return 0; } - return slurp_fd(fd, filename, size, nread); + return slurp_fd(fd, filename, size, nread, 0); } char *slurp_decompress_file(const char *filename, off_t *
Re: [PATCH] Revert "kexec: use mmap instead of read for slurp_file()"
On Fri, 23 Oct 2015 11:10:00 +0800 Dave Young wrote: > This reverts commit 7ab842d8a004f6cd75a9d7b3528e4a70819ce4ef. > > using mmap by default in slurp_file cause segment fault while later > reallocing dtb_buf during my arm kexec test. Sorry, I obviously missed that part. How can we fix that: - Create a separate function slurp_file_mmap() that is called by s390x? - Rework xmalloc/xrealloc to always use mmap() and mremap()? - ... Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] kexec: use mmap instead of read for slurp_file()
On Wed, 14 Oct 2015 17:05:52 -0700 Geoff Levand wrote: [snip] > > if (err < 0) > > > > > > die("Can not seek to the begin of file %s: %s\n", > > > > > > > > filename, strerror(errno)); > > +> > > buf = slurp_fd(fd, filename, size, &nread); > > > > } else { > > -> > > size = stats.st_size; > > +> > > size = nread = stats.st_size; > > +> > > buf = mmap(NULL, size, > > With this change the caller can't tell if buf was malloc'ed or mmaped. The > only safe thing it can do is to not call free() on the returned buf, this will > lead to memory leakage for malloc'ed buffers. I have read the code and have not found any free call. Therefore I assumed that the kexec approach is to not free the buffer *explicitly* and leave to the kernel to free it *automatically* at process exit. @Simon: Was this assumption wrong? Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH] kexec: use mmap instead of read for slurp_file()
The slurp_fd() function allocates memory and uses the read() system call. This results in double memory consumption for image and initrd: 1) Memory allocated in user space by the kexec tool 2) Memory allocated in kernel by the kexec() system call Therefore use mmap() for non-character devices to reduce the runtime memory consumption of the kexec tool. The following use case illustrates the usefulness of this patch a bit more: 1) Boot a 4 GB Linux system 2) Read kernel and 1,5 GB ramdisk from external source into local tmpfs (ram) 3) kexec the kernel and ramdisk Without this patch for the kexec runtime we need: 1,5 GB (tmpfs) + 1,5 GB (kexec malloc) + 1,5 GB (kernel memory) = 4,5 GB Signed-off-by: Michael Holzheu <holz...@linux.vnet.ibm.com> --- kexec/kexec.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kexec/kexec.c b/kexec/kexec.c index 8ce6885..fecf061 100644 --- a/kexec/kexec.c +++ b/kexec/kexec.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -552,11 +553,12 @@ char *slurp_file(const char *filename, off_t *r_size) if (err < 0) die("Can not seek to the begin of file %s: %s\n", filename, strerror(errno)); + buf = slurp_fd(fd, filename, size, ); } else { - size = stats.st_size; + size = nread = stats.st_size; + buf = mmap(NULL, size, + PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); } - - buf = slurp_fd(fd, filename, size, ); if (!buf) die("Cannot read %s", filename); -- 2.3.0 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [RFC PATCH] kexec: use mmap instead of read for slurp_file()
On Wed, 2 Sep 2015 10:07:21 +0900 Simon Horman wrote: [snip] > > The slurp_fd() function allocates memory and uses the read() system call. > > This results in double memory consumption for image and initrd: > > > > 1) Memory allocated in user space by the kexec tool > > 2) Memory allocated in kernel by the kexec() system call > > > > Therefore use mmap() for non-character devices to reduce the memory > > consumption of the kexec tool. > > I'm not opposed to this change but I also don't see a strong motivation for > it. I would imagine that the memory saving is not that large. And that the > memory consumption disappears when the presumably short-lived kexec process > exits. Correct, it will disappear. The reason for the patch is that we have the following scenario: 1) Boot a 4 GB Linux system 2) Read kernel and 1,5 GB ramdisk from external source into local tmpfs (ram) 3) kexec the kernel and ramdisk So without the mmap patch for the kexec runtime we need: 1,5 GB (tmpfs) + 1,5 GB (kexec malloc) + 1,5 GB (kernel memory) = 4,5 GB If we use mmap we only need 3 GB memory. Regards, Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
PING: [RFC PATCH] kexec: use mmap instead of read for slurp_file()
Hello Simon, Did you have time to look at my patch? Regards, Michael On Tue, 18 Aug 2015 18:17:23 +0200 Michael Holzheu holz...@linux.vnet.ibm.com wrote: The slurp_fd() function allocates memory and uses the read() system call. This results in double memory consumption for image and initrd: 1) Memory allocated in user space by the kexec tool 2) Memory allocated in kernel by the kexec() system call Therefore use mmap() for non-character devices to reduce the memory consumption of the kexec tool. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- kexec/kexec.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kexec/kexec.c b/kexec/kexec.c index 8ce6885..fecf061 100644 --- a/kexec/kexec.c +++ b/kexec/kexec.c @@ -26,6 +26,7 @@ #include stdlib.h #include errno.h #include limits.h +#include sys/mman.h #include sys/types.h #include sys/stat.h #include sys/reboot.h @@ -552,11 +553,12 @@ char *slurp_file(const char *filename, off_t *r_size) if (err 0) die(Can not seek to the begin of file %s: %s\n, filename, strerror(errno)); + buf = slurp_fd(fd, filename, size, nread); } else { - size = stats.st_size; + size = nread = stats.st_size; + buf = mmap(NULL, size, +PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); } - - buf = slurp_fd(fd, filename, size, nread); if (!buf) die(Cannot read %s, filename); ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[RFC PATCH] kexec: use mmap instead of read for slurp_file()
The slurp_fd() function allocates memory and uses the read() system call. This results in double memory consumption for image and initrd: 1) Memory allocated in user space by the kexec tool 2) Memory allocated in kernel by the kexec() system call Therefore use mmap() for non-character devices to reduce the memory consumption of the kexec tool. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- kexec/kexec.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kexec/kexec.c b/kexec/kexec.c index 8ce6885..fecf061 100644 --- a/kexec/kexec.c +++ b/kexec/kexec.c @@ -26,6 +26,7 @@ #include stdlib.h #include errno.h #include limits.h +#include sys/mman.h #include sys/types.h #include sys/stat.h #include sys/reboot.h @@ -552,11 +553,12 @@ char *slurp_file(const char *filename, off_t *r_size) if (err 0) die(Can not seek to the begin of file %s: %s\n, filename, strerror(errno)); + buf = slurp_fd(fd, filename, size, nread); } else { - size = stats.st_size; + size = nread = stats.st_size; + buf = mmap(NULL, size, + PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); } - - buf = slurp_fd(fd, filename, size, nread); if (!buf) die(Cannot read %s, filename); -- 2.3.0 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v4] kexec: Make a pair of map and unmap reserved pages when kdump fails to start
On Tue, 14 Jul 2015 10:09:20 -0400 Vivek Goyal vgo...@redhat.com wrote: On Fri, Jul 10, 2015 at 11:14:06AM +0200, Michael Holzheu wrote: [..] What about the following patch: --- diff --git a/kernel/kexec.c b/kernel/kexec.c index 7a36fdc..7837c4e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1236,10 +1236,68 @@ int kexec_load_disabled; static DEFINE_MUTEX(kexec_mutex); +static int __kexec_load(unsigned long entry, unsigned long nr_segments, [snip] + +failure_unmap_mem: I don't like this tag failure_unmap_mem. We are calling this both in success path as well as failure path. So why not simply call it out. Since the code is better readable now, I'm fine with out :-) + if (flags KEXEC_ON_CRASH) + crash_unmap_reserved_pages(); + kimage_free(image); Now kimage_free() is called with kexec_mutex held. Previously that was not the case. I hope that's not a problem. Yes, I noticed that. But also in the original code there is already one spot where kimage_free() is called under lock: /* * In case of crash, new kernel gets loaded in reserved region. It is * same memory where old crash kernel might be loaded. Free any * current crash dump kernel before we corrupt it. */ if (flags KEXEC_FILE_ON_CRASH) kimage_free(xchg(kexec_crash_image, NULL)); Therefore I thought it should be ok. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v3] kexec: Make a pair of map and unmap reserved pages when kdump fails to start
On Fri, 10 Jul 2015 12:05:27 +0800 Minfei Huang mnfhu...@gmail.com wrote: On 07/09/15 at 05:54P, Michael Holzheu wrote: On Tue, 7 Jul 2015 17:18:40 -0400 Vivek Goyal vgo...@redhat.com wrote: On Thu, Jul 02, 2015 at 09:45:52AM +0800, Minfei Huang wrote: [snip] I am thinking of moving kernel loading code in a separate function to make things little simpler. Right now it is confusing. Can you please test attached patch. I have only compile tested it. This is primarily doing what you are doing but in a separate function. It seems more readable now. The patch looks good to me. What about the following patch on top to make things even more readable? --- kernel/kexec.c | 50 +- 1 file changed, 17 insertions(+), 33 deletions(-) --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1236,14 +1236,18 @@ int kexec_load_disabled; static DEFINE_MUTEX(kexec_mutex); -static int __kexec_load(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, +static int __kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user * segments, unsigned long flags) { + struct kimage *image, **dest_image; unsigned long i; int result; - struct kimage *image; + + dest_image = (flags KEXEC_ON_CRASH) ? kexec_crash_image : kexec_image; + + if (nr_segments == 0) + return 0; It is fine, if nr_segments is 0. So we should deal with this case like original kexec code. if (flags KEXEC_ON_CRASH) { /* @@ -1251,7 +1255,6 @@ static int __kexec_load(struct kimage ** * crashes. Free any current crash dump kernel before * we corrupt it. 
*/ - kimage_free(xchg(kexec_crash_image, NULL)); } @@ -1267,30 +1270,29 @@ static int __kexec_load(struct kimage ** result = machine_kexec_prepare(image); if (result) - goto out; + goto fail; for (i = 0; i nr_segments; i++) { result = kimage_load_segment(image, image-segment[i]); if (result) - goto out; + goto fail; } - kimage_terminate(image); - *rimage = image; -out: + /* Install the new kernel, and uninstall the old */ + kimage_free(xchg(dest_image, image)); if (flags KEXEC_ON_CRASH) crash_unmap_reserved_pages(); - - /* Free image if there was an error */ - if (result) - kimage_free(image); + return 0; +fail: + if (flags KEXEC_ON_CRASH) + crash_unmap_reserved_pages(); + kimage_free(image); Kernel release image again Again? This is only done in the error case. , and will crash in here, since we do not assign the image to NULL when we release the image above. Good catch, I should have set image=NULL at the beginning of __kexec_load(). Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v4] kexec: Make a pair of map and unmap reserved pages when kdump fails to start
On Fri, 10 Jul 2015 13:12:17 +0800 Minfei Huang mnfhu...@gmail.com wrote: For some arch, kexec shall map the reserved pages, then use them, when we try to start the kdump service. Now kexec will never unmap the reserved pages, once it fails to continue starting the kdump service. So we make a pair of map/unmap reserved pages whatever kexec fails or not in code path. In order to make code readable, wrap a new function __kexec_load which contains all of the logic to deal with the image loading. Signed-off-by: Minfei Huang mnfhu...@gmail.com --- v3: - reconstruct the patch, wrap a new function to deal with the code logic, based on Vivek and Michael's patch v2: - replace the failure label with fail_unmap_pages v1: - reconstruct the patch code --- kernel/kexec.c | 112 - 1 file changed, 63 insertions(+), 49 deletions(-) diff --git a/kernel/kexec.c b/kernel/kexec.c index a785c10..2232c90 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1247,10 +1247,71 @@ int kexec_load_disabled; static DEFINE_MUTEX(kexec_mutex); +static int __kexec_load(unsigned long entry, unsigned long nr_segments, + struct kexec_segment __user *segments, + unsigned long flags) +{ + int result = 0; + struct kimage **dest_image, *image; + + dest_image = kexec_image; + + if (flags KEXEC_ON_CRASH) + dest_image = kexec_crash_image; + + if (nr_segments == 0) { + /* Install the new kernel, and Uninstall the old */ + image = xchg(dest_image, image); + kimage_free(image); Well this is wrong and should probably be: if (nr_segments == 0) { /* Uninstall image */ image = xchg(dest_image, NULL); kimage_free(image); + } else { + unsigned long i; + + if (flags KEXEC_ON_CRASH) { + /* [snip] + result = kimage_load_segment(image, image-segment[i]); + if (result) + goto failure_unmap_mem; + } + + kimage_terminate(image); + + /* Install the new kernel, and Uninstall the old */ Perhaps fix the comment: Remove superfluous blank and lowercase uninstall? 
+ image = xchg(dest_image, image); + +failure_unmap_mem: + if (flags KEXEC_ON_CRASH) + crash_unmap_reserved_pages(); + kimage_free(image); Here the update patch: --- diff --git a/kernel/kexec.c b/kernel/kexec.c index e686a39..2f5b4aa 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1249,8 +1249,8 @@ static int __kexec_load(unsigned long entry, unsigned long nr_segments, dest_image = kexec_crash_image; if (nr_segments == 0) { - /* Install the new kernel, and Uninstall the old */ - image = xchg(dest_image, image); + /* Uninstall image */ + image = xchg(dest_image, NULL); kimage_free(image); } else { unsigned long i; @@ -1287,7 +1287,7 @@ static int __kexec_load(unsigned long entry, unsigned long nr_segments, kimage_terminate(image); - /* Install the new kernel, and Uninstall the old */ + /* Install the new kernel, and uninstall the old */ image = xchg(dest_image, image); failure_unmap_mem: ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v4] kexec: Make a pair of map and unmap reserved pages when kdump fails to start
On Fri, 10 Jul 2015 11:14:06 +0200 Michael Holzheu holz...@linux.vnet.ibm.com wrote: On Fri, 10 Jul 2015 17:03:22 +0800 Minfei Huang mnfhu...@gmail.com wrote: [snip] +static int __kexec_load(unsigned long entry, unsigned long nr_segments, + struct kexec_segment __user *segments, + unsigned long flags) +{ + struct kimage **dest_image, *image; + unsigned long i; + int result; + + if (flags & KEXEC_ON_CRASH) + dest_image = &kexec_crash_image; + else + dest_image = &kexec_image; + + if (nr_segments == 0) { + /* Uninstall image */ + kfree(xchg(dest_image, NULL)); Sorry, too fast today... Should be of course not kfree, but: kimage_free(xchg(dest_image, NULL)); Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v3] kexec: Make a pair of map and unmap reserved pages when kdump fails to start
On Tue, 7 Jul 2015 17:18:40 -0400 Vivek Goyal vgo...@redhat.com wrote: On Thu, Jul 02, 2015 at 09:45:52AM +0800, Minfei Huang wrote: [snip] I am thinking of moving kernel loading code in a separate function to make things little simpler. Right now it is confusing. Can you please test attached patch. I have only compile tested it. This is primarily doing what you are doing but in a separate function. It seems more readable now. The patch looks good to me. What about the following patch on top to make things even more readable? --- kernel/kexec.c | 50 +- 1 file changed, 17 insertions(+), 33 deletions(-) --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1236,14 +1236,18 @@ int kexec_load_disabled; static DEFINE_MUTEX(kexec_mutex); -static int __kexec_load(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, +static int __kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user * segments, unsigned long flags) { + struct kimage *image, **dest_image; unsigned long i; int result; - struct kimage *image; + + dest_image = (flags KEXEC_ON_CRASH) ? kexec_crash_image : kexec_image; + + if (nr_segments == 0) + return 0; if (flags KEXEC_ON_CRASH) { /* @@ -1251,7 +1255,6 @@ static int __kexec_load(struct kimage ** * crashes. Free any current crash dump kernel before * we corrupt it. 
*/ - kimage_free(xchg(kexec_crash_image, NULL)); } @@ -1267,30 +1270,29 @@ static int __kexec_load(struct kimage ** result = machine_kexec_prepare(image); if (result) - goto out; + goto fail; for (i = 0; i nr_segments; i++) { result = kimage_load_segment(image, image-segment[i]); if (result) - goto out; + goto fail; } - kimage_terminate(image); - *rimage = image; -out: + /* Install the new kernel, and uninstall the old */ + kimage_free(xchg(dest_image, image)); if (flags KEXEC_ON_CRASH) crash_unmap_reserved_pages(); - - /* Free image if there was an error */ - if (result) - kimage_free(image); + return 0; +fail: + if (flags KEXEC_ON_CRASH) + crash_unmap_reserved_pages(); + kimage_free(image); return result; } SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) { - struct kimage **dest_image, *image; int result; /* We only trust the superuser with rebooting the system. */ @@ -1315,9 +1317,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned lon if (nr_segments KEXEC_SEGMENT_MAX) return -EINVAL; - image = NULL; - result = 0; - /* Because we write directly to the reserved memory * region when loading crash kernels we need a mutex here to * prevent multiple crash kernels from attempting to load @@ -1329,24 +1328,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned lon if (!mutex_trylock(kexec_mutex)) return -EBUSY; - dest_image = kexec_image; - if (flags KEXEC_ON_CRASH) - dest_image = kexec_crash_image; - /* Load new kernel */ - if (nr_segments 0) { - result = __kexec_load(image, entry, nr_segments, segments, - flags); - if (result) - goto out; - } - - /* Install the new kernel, and Uninstall the old */ - image = xchg(dest_image, image); - -out: + result = __kexec_load(entry, nr_segments, segments, flags); mutex_unlock(kexec_mutex); - kimage_free(image); return result; } ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2] kexec: Make a pair of map and unmap reserved pages when kdump fails to start
Hello Minfei, Regarding functionality your patch looks ok for me. But the code is not easy to read. What about replacing the failure label with fail_unmap_pages? Michael On Tue, 30 Jun 2015 13:44:46 +0800 Minfei Huang mnfhu...@gmail.com wrote: For some arch, kexec shall map the reserved pages, then use them, when we try to start the kdump service. Now kexec will never unmap the reserved pages, once it fails to continue starting the kdump service. Make a pair of reserved pages in kdump starting path, whatever kexec fails or not. Signed-off-by: Minfei Huang mnfhu...@gmail.com --- kernel/kexec.c | 26 ++ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/kernel/kexec.c b/kernel/kexec.c index 4589899..68f6dfb 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1291,35 +1291,37 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, */ kimage_free(xchg(kexec_crash_image, NULL)); - result = kimage_alloc_init(image, entry, nr_segments, -segments, flags); - crash_map_reserved_pages(); - } else { - /* Loading another kernel to reboot into. 
*/ - - result = kimage_alloc_init(image, entry, nr_segments, -segments, flags); } + + result = kimage_alloc_init(image, entry, nr_segments, + segments, flags); if (result) goto out; + if (flags KEXEC_ON_CRASH) + crash_map_reserved_pages(); + if (flags KEXEC_PRESERVE_CONTEXT) image-preserve_context = 1; result = machine_kexec_prepare(image); if (result) - goto out; + goto failure; for (i = 0; i nr_segments; i++) { result = kimage_load_segment(image, image-segment[i]); if (result) - goto out; + goto failure; } kimage_terminate(image); + +failure: if (flags KEXEC_ON_CRASH) crash_unmap_reserved_pages(); } - /* Install the new kernel, and Uninstall the old */ - image = xchg(dest_image, image); + + if (result == 0) + /* Install the new kernel, and Uninstall the old */ + image = xchg(dest_image, image); out: mutex_unlock(kexec_mutex); ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
On Thu, 12 Mar 2015 16:38:22 +0100 Petr Tesarik ptesa...@suse.cz wrote: On Mon, 9 Mar 2015 17:08:58 +0100 Michael Holzheu holz...@linux.vnet.ibm.com wrote: [snip] I counted the mmap and read system calls with perf stat: mmap unmap read =sum === mmap -d0482 443165 1090 mmap -d31 13454 1341416527033 non-mmap -d0 34 3 458917 458954 non-mmap -d3134 3 7427374310 If your VM has 1.5 GiB of RAM, then the numbers for -d0 look reasonable. I have 1792 MiB RAM. For -d31, we should be able to do better than this by allocating more cache slots and improving the algorithm. I originally didn't deem it worth the effort, but seeing almost 30 times more mmaps than with -d0 may change my mind. ok. Here the actual results I got with perf record: $ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f Output of perf report for mmap case: /* Most time spent for unmap in kernel */ 29.75% makedumpfile [kernel.kallsyms] [k] unmap_single_vma 9.84% makedumpfile [kernel.kallsyms] [k] remap_pfn_range 8.49% makedumpfile [kernel.kallsyms] [k] vm_normal_page /* Still some mmap overhead in makedumpfile readmem() */ 21.56% makedumpfile makedumpfile [.] readmem This number is interesting. Did you compile makedumpfile with optimizations? If yes, then this number probably includes some functions which were inlined. Yes, I used the default Makefile (-O2) so most functions are inlined. With -O0 I get the following: 15.35% makedumpfile libc-2.15.so [.] memcpy 2.14% makedumpfile makedumpfile [.] __exclude_unnecessary_pages 1.82% makedumpfile makedumpfile [.] test_bit 1.82% makedumpfile makedumpfile [.] set_bitmap_cyclic 1.32% makedumpfile makedumpfile [.] clear_bit_on_2nd_bitmap 1.32% makedumpfile makedumpfile [.] write_kdump_pages_cyclic 1.01% makedumpfile makedumpfile [.] is_on 0.88% makedumpfile makedumpfile [.] paddr_to_offset 0.75% makedumpfile makedumpfile [.] is_dumpable_cyclic 0.69% makedumpfile makedumpfile [.] exclude_range 0.63% makedumpfile makedumpfile [.] 
clear_bit_on_2nd_bitmap_for_kernel 0.63% makedumpfile [vdso] [.] __kernel_gettimeofday 0.57% makedumpfile makedumpfile [.] print_progress 0.50% makedumpfile makedumpfile [.] cache_search 8.49% makedumpfile makedumpfile [.] write_kdump_pages_cyclic Output of perf report for non-mmap case: /* Time spent for sys_read (that needs also two copy operations on s390 :() */ 25.32% makedumpfile [kernel.kallsyms] [k] memcpy_real 22.74% makedumpfile [kernel.kallsyms] [k] __copy_to_user /* readmem() for read path is cheaper ? */ 13.49% makedumpfile makedumpfile [.] write_kdump_pages_cyclic 4.53% makedumpfile makedumpfile [.] readmem Yes, much lower overhead of readmem is strange. For a moment I suspected wrong accounting of the page fault handler, but then I realized that for /proc/vmcore, all page table entries are created with the present bit set already, so there are no page faults... Right, as said below, perhaps it is the HW caching issue. I haven't had time yet to set up a system for reproduction, but I'll try to identify what's eating up the CPU time in readmem(). [...] I hope this analysis helps more than it confuses :-) As a conclusion, we could think of mapping larger chunks also for the fragmented case of -d 31 to reduce the amount of mmap/munmap calls. I agree in general. Memory mapped through /proc/vmcore does not increase run-time memory requirements, because it only adds a mapping to the old kernel's memory. At least you need the page table memory for the /proc/vmcore mapping, right? The only limiting factor is the virtual address space. On many architectures, this is no issue at all, and we could simply map the whole file at beginning. On some architectures, the virtual address space is smaller than possible physical RAM, so this approach would not work for them. Another open question was why the mmap case consumes more CPU time in readmem() than the read case. Our theory is that the first memory access is slower because it is not in the HW cache. 
For the mmap case userspace issues the first access (copy to makedumpfile cache) and for the read case the kernel issues the first access (memcpy_real/copy_to_user). Therefore the cache miss is accounted to userspace for mmap and to kernel for read. I have no idea how to measure this on s390. On x86_64 I would add some asm code to read TSC before and after the memory access instruction. I guess there is a similar counter on s390. Suggestions? On s390 under LPAR we have
Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
On Mon, 9 Mar 2015 17:08:58 +0100 Michael Holzheu holz...@linux.vnet.ibm.com wrote: Hello Petr, [snip] As a conclusion, we could think of mapping larger chunks also for the fragmented case of -d 31 to reduce the amount of mmap/munmap calls. FYI: I did some more tests and I am no longer sure if the above conclusion was correct. A simple copy program that reads or maps/unmaps every page from /proc/vmcore and then writes it to /dev/null is faster with mmap()/munmap() than with using read(): read: - # time ./copy /dev/null read real0m1.072s user0m0.010s sys 0m1.054s # perf stat -e syscalls:sys_enter_old_mmap,syscalls:sys_enter_munmap,syscalls:sys_enter_read ./copy /dev/null read 8 syscalls:sys_enter_old_mmap 1 syscalls:sys_enter_munmap 458753 syscalls:sys_enter_read 1.405457536 seconds time elapsed mmap: - # time ./copy /dev/null mmap real0m0.947s user0m0.314s sys 0m0.631s # perf stat -e syscalls:sys_enter_old_mmap,syscalls:sys_enter_munmap,syscalls:sys_enter_read ./copy /dev/null mmap 458760 syscalls:sys_enter_old_mmap 458753 syscalls:sys_enter_munmap 1 syscalls:sys_enter_read 1.175956735 seconds time elapsed Regards, Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2 0/8] Handle mmaped regions in cache
Hello Petr, With your patch we get 5-10 percent performance improvement on s390. Two examples: $ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f user sys = total = With patches 0.156 0.248 0.404 Without patches 0.168 0.274 0.442 - No idea why we save system time? $ time ./makedumpfile -d 0 /proc/vmcore /dev/null -f user sys = total = With patches 0.683 0.020 0.703 Without patches 0.714 0.019 0.733 Regards Michael On Fri, 6 Mar 2015 15:03:12 +0100 Petr Tesarik ptesa...@suse.cz wrote: Because all pages must go into the cache, data is unnecessarily copied from mmapped regions to cache. Avoid this copying by storing the mmapped regions directly in the cache. First, the cache code needs a clean up clarification of the concept, especially the meaning of the pending list (allocated cache entries whose content is not yet valid). Second, the cache must be able to handle differently sized objects so that it can store individual pages as well as mmapped regions. Last, the cache eviction code must be extended to allow either reusing the read buffer or unmapping the region. Changelog: v2: cache cleanup _and_ actuall mmap implementation v1: only the cache cleanup Petr Tesarik (8): cache: get rid of search loop in cache_add() cache: allow to return a page to the pool cache: do not allocate from the pending list cache: add hit/miss statistics to the final report cache: allocate buffers in one big chunk cache: allow arbitrary size of cache entries cache: store mapped regions directly in the cache cleanup: remove unused page_is_fractional cache.c| 81 + cache.h| 16 +-- elf_info.c | 16 --- elf_info.h | 2 - makedumpfile.c | 138 ++--- 5 files changed, 138 insertions(+), 115 deletions(-) ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
Hello Petr, With your patches I now used perf record and perf stat to check where the CPU time is consumed for -d31 and -d0. For -d31 the read case is better and for -d0 the mmap case is better. $ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f [--non-mmap] user sys = total === mmap 0.156 0.248 0.404 non-mmap 0.090 0.180 0.270 $ time ./makedumpfile -d 0 /proc/vmcore /dev/null -f [--non-mmap] user sys = total == mmap 0.637 0.018 0.655 non-mmap 0.275 1.153 1.428 As already said, we think the reason is that for -d0 we issue only a small number of mmap/munmap calls because the mmap chunks are larger than the read chunks. For -d31 memory is fragmented and we issue lots of small mmap/munmap calls. Because munmap (at least on s390) is a very expensive operation and we need two calls (mmap/munmap), the mmap mode is slower that the read mode. I counted the mmap and read system calls with perf stat: mmap unmap read =sum === mmap -d0482 443165 1090 mmap -d31 13454 1341416527033 non-mmap -d0 34 3 458917 458954 non-mmap -d3134 3 7427374310 Here the actual results I got with perf record: $ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f Output of perf report for mmap case: /* Most time spent for unmap in kernel */ 29.75% makedumpfile [kernel.kallsyms] [k] unmap_single_vma 9.84% makedumpfile [kernel.kallsyms] [k] remap_pfn_range 8.49% makedumpfile [kernel.kallsyms] [k] vm_normal_page /* Still some mmap overhead in makedumpfile readmem() */ 21.56% makedumpfile makedumpfile [.] readmem 8.49% makedumpfile makedumpfile [.] write_kdump_pages_cyclic Output of perf report for non-mmap case: /* Time spent for sys_read (that needs also two copy operations on s390 :() */ 25.32% makedumpfile [kernel.kallsyms] [k] memcpy_real 22.74% makedumpfile [kernel.kallsyms] [k] __copy_to_user /* readmem() for read path is cheaper ? */ 13.49% makedumpfile makedumpfile [.] write_kdump_pages_cyclic 4.53% makedumpfile makedumpfile [.] 
readmem $ time ./makedumpfile -d 0 /proc/vmcore /dev/null -f Output of perf report for mmap case: /* Almost no kernel time because we issue very few system calls */ 0.61% makedumpfile [kernel.kallsyms] [k] unmap_single_vma 0.61% makedumpfile [kernel.kallsyms] [k] sysc_do_svc /* Almost all time consumed in user space */ 84.64% makedumpfile makedumpfile [.] readmem 8.82% makedumpfile makedumpfile [.] write_cache Output of perf report for non-mmap case: /* Time spent for sys_read (that needs also two copy operations on s390) */ 31.50% makedumpfile [kernel.kallsyms] [k] memcpy_real 29.33% makedumpfile [kernel.kallsyms] [k] __copy_to_user /* Very little user space time */ 3.87% makedumpfile makedumpfile [.] write_cache 3.82% makedumpfile makedumpfile [.] readmem I hope this analysis helps more than it confuses :-) As a conclusion, we could think of mapping larger chunks also for the fragmented case of -d 31 to reduce the amount of mmap/munmap calls. Another open question was why the mmap case consumes more CPU time in readmem() than the read case. Our theory is that the first memory access is slower because it is not in the HW cache. For the mmap case userspace issues the first access (copy to makedumpfile cache) and for the read case the kernel issues the first access (memcpy_real/copy_to_user). Therefore the cache miss is accounted to userspace for mmap and to kernel for read. And last but not least, perhaps on s390 we could replace the bounce buffer used for memcpy_real()/copy_to_user() by some more intelligent solution. Best Regards Michael On Fri, 6 Mar 2015 15:03:12 +0100 Petr Tesarik ptesa...@suse.cz wrote: Because all pages must go into the cache, data is unnecessarily copied from mmapped regions to cache. Avoid this copying by storing the mmapped regions directly in the cache. First, the cache code needs a clean up clarification of the concept, especially the meaning of the pending list (allocated cache entries whose content is not yet valid). Second, the cache must be able to handle differently sized objects so that it can store individual pages as well as mmapped regions. Last, the cache eviction code must be extended to allow either reusing the read buffer or unmapping the region. Changelog: v2: cache cleanup _and_ actuall mmap implementation v1: only the cache cleanup Petr Tesarik (8): cache: get rid of search loop in cache_add() cache: allow to return a page to the pool cache: do not allocate from the pending list cache: add hit/miss statistics to the final report cache: allocate buffers in one big chunk cache: allow arbitrary size of cache entries cache: store mapped regions directly in the cache cleanup: remove unused page_is_fractional cache.c| 81 + cache.h| 16 +-- elf_info.c | 16 --- elf_info.h | 2 - makedumpfile.c | 138 ++--- 5 files changed, 138 insertions(+), 115 deletions(-) ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Second, the cache must be able to handle differently sized objects so that it can store individual pages as well as mmapped regions. Last, the cache eviction code must be extended to allow either reusing the read buffer or unmapping the region. Changelog: v2: cache cleanup _and_ actuall mmap implementation v1: only the cache cleanup Petr Tesarik (8): cache: get rid of search loop in cache_add() cache: allow to return a page to the pool cache: do not
Re: [PATCH] makedumpfile: Use file offset in initialize_mmap()
On Thu, 5 Mar 2015 22:30:05 +0100 Petr Tesarik ptesa...@suse.cz wrote: On Wed, 4 Mar 2015 13:44:18 +0100 Michael Holzheu holz...@linux.vnet.ibm.com wrote: [snip] What do you think? I'm not sure. Clearly, we should get rid of the temporary buffer. OTOH this slow-down should be observed on all architectures, not just s390. Now, mmap should have been implemented in the cache code, not above it. Since I wrote the cache, this task is probably up to me. Sounds good, thanks! Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] makedumpfile: Use file offset in initialize_mmap()
On Tue, 3 Mar 2015 11:07:50 +0100 Petr Tesarik ptesa...@suse.cz wrote: On Tue, 3 Mar 2015 10:15:43 +0100 Michael Holzheu holz...@linux.vnet.ibm.com wrote: [snip] I did a quick test with your patch and it looks like the mmap mode on my s390 system is slower than the read mode: That's sad. OTOH I had similar results on a file mmap some time ago. The cost of copying data was less than the cost of handling a series of minor page faults. I think we understood the problem: As for the read path, also for mmap the memory is copied into a temporary buffer: static int read_with_mmap(off_t offset, void *bufptr, ...) { ... memcpy(bufptr, info-mmap_buf + (offset - info-mmap_start_offset), read_size); Because on s390 copy_to_user() is as fast as userspace memcpy() we don't have any benefit here. The only saving is due to less mmap()/munmap() than read() system calls because bigger chunks are mapped than read. If you specify -d 31 the dump memory is fragmented and we have to issue more mmap()/munmap() calls and therefore also the system call overhead increases. If we really want to speed up the mmap path on s390 we probably have to get rid of the temporary buffer. What do you think? Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] makedumpfile: Use file offset in initialize_mmap()
Hello Petr, Thanks for the fix! Hard to believe that makedumpfile mmap mode on s390 has never worked. On Fri, 27 Feb 2015 13:14:09 +0100 Petr Tesarik ptesa...@suse.cz wrote: Hi all, update_mmap_range() expects a file offset as its first argument, but initialize_mmap() passes a physical address. Since the first segment usually starts at physical addr 0 on S/390, but there is no segment at file offset 0, update_mmap_range() fails, and makedumpfile falls back to read(). And for other architectures the wrong parameter was no problem? @Michael: I wonder how you actually tested the kernel mmap patches; this bug has prevented mmap on all my s390 systems... We tested /proc/vmcore mmap with our SCSI stand-alone dump (zfcpdump) and with small test programs that used mmap. I did a quick test with your patch and it looks like the mmap mode on my s390 system is slower than the read mode: # time ./makedumpfile -d 31 /proc/vmcore out real0m1.043s user0m0.187s sys 0m0.433s # time ./makedumpfile -d 31 /proc/vmcore out --non-mmap real0m0.767s user0m0.098s sys 0m0.364s We will have to look into this. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] makedumpfile: Enable --mem-usage for s390x
Hello Atsushi, On Thu, 30 Oct 2014 01:29:18 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: [snip] Now I don't plan to use is_iomem_phys_addr() for non s390x architectures, I prefer putting is_iomem_phys_addr() into arch/s390x.c as Baoquan commented before. Could you modify the patch ? or any questions ? I modified the patch and moved the is_iomem_phys_addr() function to s390x.c. I did not add __pa() because currently I don't see the need for it (?). The updated patch below applies on top of our Compile warnings on archs without get_versiondep_info() patch. Michael --- [PATCH] makedumpfile: Enable --mem-usage for s390x Replace is_vmalloc_addr() by is_phys_addr() and implement is_phys_addr() on s390x using /proc/iommem parsing to enable the new makedumpfile option --mem-usage. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- arch/s390x.c | 26 ++ elf_info.c |4 ++-- makedumpfile.h | 20 +--- 3 files changed, 41 insertions(+), 9 deletions(-) --- a/arch/s390x.c +++ b/arch/s390x.c @@ -308,4 +308,30 @@ vaddr_to_paddr_s390x(unsigned long vaddr return paddr; } +struct addr_check { + unsigned long addr; + int found; +}; + +static int phys_addr_callback(void *data, int nr, char *str, + unsigned long base, unsigned long length) +{ + struct addr_check *addr_check = data; + unsigned long addr = addr_check-addr; + + if (addr = base addr base + length) { + addr_check-found = 1; + return -1; + } + return 0; +} + +int is_iomem_phys_addr_s390x(unsigned long addr) +{ + struct addr_check addr_check = {addr, 0}; + + iomem_for_each_line(System RAM\n, phys_addr_callback, addr_check); + return addr_check.found; +} + #endif /* __s390x__ */ --- a/elf_info.c +++ b/elf_info.c @@ -854,7 +854,7 @@ int get_kcore_dump_loads(void) for (i = 0; i num_pt_loads; ++i) { struct pt_load_segment *p = pt_loads[i]; - if (is_vmalloc_addr(p-virt_start)) + if (!is_phys_addr(p-virt_start)) continue; loads++; } @@ -874,7 +874,7 @@ int get_kcore_dump_loads(void) for (i = 0, j = 0; i 
num_pt_loads; ++i) { struct pt_load_segment *p = pt_loads[i]; - if (is_vmalloc_addr(p-virt_start)) + if (!is_phys_addr(p-virt_start)) continue; if (j = loads) return FALSE; --- a/makedumpfile.h +++ b/makedumpfile.h @@ -767,7 +767,7 @@ unsigned long long vaddr_to_paddr_arm(un #define get_machdep_info() get_machdep_info_arm() #define get_versiondep_info() stub_true() #define vaddr_to_paddr(X) vaddr_to_paddr_arm(X) -#define is_vmalloc_addr(X) stub_true_ul(X) +#define is_phys_addr(X)stub_true_ul(X) #endif /* arm */ #ifdef __x86__ @@ -778,7 +778,7 @@ unsigned long long vaddr_to_paddr_x86(un #define get_machdep_info() get_machdep_info_x86() #define get_versiondep_info() get_versiondep_info_x86() #define vaddr_to_paddr(X) vaddr_to_paddr_x86(X) -#define is_vmalloc_addr(X) stub_true_ul(X) +#define is_phys_addr(X)stub_true_ul(X) #endif /* x86 */ #ifdef __x86_64__ @@ -791,7 +791,7 @@ unsigned long long vaddr_to_paddr_x86_64 #define get_machdep_info() get_machdep_info_x86_64() #define get_versiondep_info() get_versiondep_info_x86_64() #define vaddr_to_paddr(X) vaddr_to_paddr_x86_64(X) -#define is_vmalloc_addr(X) is_vmalloc_addr_x86_64(X) +#define is_phys_addr(X)(!is_vmalloc_addr_x86_64(X)) #endif /* x86_64 */ #ifdef __powerpc64__ /* powerpc64 */ @@ -802,7 +802,7 @@ unsigned long long vaddr_to_paddr_ppc64( #define get_machdep_info() get_machdep_info_ppc64() #define get_versiondep_info() get_versiondep_info_ppc64() #define vaddr_to_paddr(X) vaddr_to_paddr_ppc64(X) -#define is_vmalloc_addr(X) stub_true_ul(X) +#define is_phys_addr(X)stub_true_ul(X) #endif /* powerpc64 */ #ifdef __powerpc32__ /* powerpc32 */ @@ -812,17 +812,18 @@ unsigned long long vaddr_to_paddr_ppc(un #define get_machdep_info() get_machdep_info_ppc() #define get_versiondep_info() stub_true() #define vaddr_to_paddr(X) vaddr_to_paddr_ppc(X) -#define is_vmalloc_addr(X) stub_true_ul(X) +#define is_phys_addr(X)stub_true_ul(X) #endif /* powerpc32 */ #ifdef __s390x__ /* s390x */ int get_machdep_info_s390x(void); 
unsigned long long vaddr_to_paddr_s390x(unsigned long vaddr); +int is_iomem_phys_addr_s390x(unsigned long addr); #define get_phys_base()stub_true() #define get_machdep_info() get_machdep_info_s390x() #define get_versiondep_info() stub_true() #define vaddr_to_paddr(X) vaddr_to_paddr_s390x(X) -#define is_vmalloc_addr(X) stub_true_ul(X) +#define
Re: [PATCH] makedumpfile: Enable --mem-usage for s390x
Hello Atsushi, On Thu, 23 Oct 2014 06:56:47 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: On Tue, 14 Oct 2014 07:19:13 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: [snip] I noticed that is_vmalloc_addr_x86_64() can't be used as is_phys_addr() due to the kaslr issue. I fixed it in the common path as below, but --mem-usage still has the same issue since initial() will be invoked after get_kcore_dump_loads(). http://lists.infradead.org/pipermail/kexec/2014-October/012743.html I know it's so late, but I began to think the current implementation that invokes vaddr_to_paddr_XXX() before initial() is bad: show_mem_usage() + get_kcore_dump_loads() + process_dump_load() + vaddr_to_paddr_XXX() + initial() ... vaddr_to_paddr_XXX() does VtoP translation *properly*, so it needs several symbols. This design is OK since it's a general method. So the current implementation that uses vaddr_to_paddr_xxx() for --mem-usage *before* initial() is broken? Or does it currently works only by chance? OTOH, we need a kludge which can be used under any situations and should use it for --mem-usage: VtoP_kludge_s390x(unsigned long vaddr) { /* s390 has 1:1 VtoP mapping that starts with zero. */ return vaddr; } also x86_64's can be implemented like below: VtoP_kludge_x86_64(unsigned long vaddr) { /* if the address is direct mapped, it's easy to translate. */ unsigned long paddr = vaddr - PAGE_OFFSET; return paddr; } What about using the kernel name __pa(vaddr)? [snip] --- a/elf_info.c +++ b/elf_info.c @@ -854,7 +854,7 @@ int get_kcore_dump_loads(void) for (i = 0; i num_pt_loads; ++i) { struct pt_load_segment *p = pt_loads[i]; -if (is_vmalloc_addr(p-virt_start)) +if (!is_phys_addr(p-virt_start)) This part implicitly does VtoP translation. Actually it works for s390x but it's not suitable as a general code, so we should use VtoP_kludge(should be better name!) too. Then we can use is_iomem_phys_addr() also for other architecture. 
So how exactly should the code look like for non s390x architectures? Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: makedumpfile-1.5.7: Compile warnings on archs without get_versiondep_info()
On Tue, 21 Oct 2014 05:13:37 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: Hello Michael, I just noticed that makedumpfile-1.5.7 produces warnings on s390x and probably all other archs that have not defined get_versiondep_info(): Thanks for your reporting, does this patch help you ? Hello Atsushi, I had problems applying the patch but after manually adding the changes I still get the following warning: Thanks, Atsushi Kumagai From: Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp Date: Tue, 21 Oct 2014 11:11:46 +0900 Subject: [PATCH] Introduce stub method for machine dependent parts. Some machine dependent methods are implemented as the literal 1 since there is no need to do anything in their architectures. It's polite to replace them into an empty method, this will solve some compile warnings. Reported-by: Michael Holzheu holz...@linux.vnet.ibm.com Signed-off-by: Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp --- makedumpfile.h | 29 +++-- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/makedumpfile.h b/makedumpfile.h index a3342b5..5fda575 100644 --- a/makedumpfile.h +++ b/makedumpfile.h @@ -759,26 +759,27 @@ do { \ /* * The function of dependence on machine */ +static inline int stub_true() { return TRUE; } #ifdef __arm__ int get_phys_base_arm(void); int get_machdep_info_arm(void); unsigned long long vaddr_to_paddr_arm(unsigned long vaddr); #define get_phys_base() get_phys_base_arm() #define get_machdep_info() get_machdep_info_arm() -#define get_versiondep_info()TRUE +#define get_versiondep_info()stub_true() #define vaddr_to_paddr(X)vaddr_to_paddr_arm(X) -#define is_vmalloc_addr(X) TRUE +#define is_vmalloc_addr(X) stub_true(X) #endif /* arm */ #ifdef __x86__ int get_machdep_info_x86(void); int get_versiondep_info_x86(void); unsigned long long vaddr_to_paddr_x86(unsigned long vaddr); -#define get_phys_base() TRUE +#define get_phys_base() stub_true() #define get_machdep_info() get_machdep_info_x86() #define 
get_versiondep_info()get_versiondep_info_x86() #define vaddr_to_paddr(X)vaddr_to_paddr_x86(X) -#define is_vmalloc_addr(X) TRUE +#define is_vmalloc_addr(X) stub_true(X) #endif /* x86 */ #ifdef __x86_64__ @@ -798,31 +799,31 @@ unsigned long long vaddr_to_paddr_x86_64(unsigned long vaddr); int get_machdep_info_ppc64(void); int get_versiondep_info_ppc64(void); unsigned long long vaddr_to_paddr_ppc64(unsigned long vaddr); -#define get_phys_base() TRUE +#define get_phys_base() stub_true() #define get_machdep_info() get_machdep_info_ppc64() #define get_versiondep_info()get_versiondep_info_ppc64() #define vaddr_to_paddr(X)vaddr_to_paddr_ppc64(X) -#define is_vmalloc_addr(X) TRUE +#define is_vmalloc_addr(X) stub_true(X) #endif /* powerpc64 */ #ifdef __powerpc32__ /* powerpc32 */ int get_machdep_info_ppc(void); unsigned long long vaddr_to_paddr_ppc(unsigned long vaddr); -#define get_phys_base() TRUE +#define get_phys_base() stub_true() #define get_machdep_info() get_machdep_info_ppc() -#define get_versiondep_info()TRUE +#define get_versiondep_info()stub_true() #define vaddr_to_paddr(X)vaddr_to_paddr_ppc(X) -#define is_vmalloc_addr(X) TRUE +#define is_vmalloc_addr(X) stub_true(X) #endif /* powerpc32 */ #ifdef __s390x__ /* s390x */ int get_machdep_info_s390x(void); unsigned long long vaddr_to_paddr_s390x(unsigned long vaddr); -#define get_phys_base() TRUE +#define get_phys_base() stub_true() #define get_machdep_info() get_machdep_info_s390x() -#define get_versiondep_info()TRUE +#define get_versiondep_info()stub_true() #define vaddr_to_paddr(X)vaddr_to_paddr_s390x(X) -#define is_vmalloc_addr(X) TRUE +#define is_vmalloc_addr(X) stub_true(X) #endif /* s390x */ #ifdef __ia64__ /* ia64 */ @@ -831,10 +832,10 @@ int get_machdep_info_ia64(void); unsigned long long vaddr_to_paddr_ia64(unsigned long vaddr); #define get_machdep_info() get_machdep_info_ia64() #define get_phys_base() get_phys_base_ia64() -#define get_versiondep_info()TRUE +#define get_versiondep_info()stub_true() #define 
vaddr_to_paddr(X)vaddr_to_paddr_ia64(X) #define VADDR_REGION(X) (((unsigned long)(X)) REGION_SHIFT) -#define is_vmalloc_addr(X) TRUE +#define is_vmalloc_addr(X) stub_true(X) #endif /* ia64 */ typedef unsigned long long mdf_pfn_t; ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: makedumpfile-1.5.7: Compile warnings on archs without get_versiondep_info()
On Tue, 21 Oct 2014 05:13:37 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: Hello Michael, I just noticed that makedumpfile-1.5.7 produces warnings on s390x and probably all other archs that have not defined get_versiondep_info(): Thanks for your reporting, does this patch help you ? Hello Atsushi, Sorry, the first note was sent by accident... I had problems applying the patch but after manually adding the changes I still get the following warning: elf_info.c: In function 'get_kcore_dump_loads': elf_info.c:855:27: warning: unused variable 'p' [-Wunused-variable] struct pt_load_segment *p = pt_loads[i]; What about the following patch: --- makedumpfile.h | 30 -- 1 file changed, 16 insertions(+), 14 deletions(-) --- a/makedumpfile.h +++ b/makedumpfile.h @@ -759,26 +759,28 @@ do { \ /* * The function of dependence on machine */ +static inline int stub_true() { return TRUE; } +static inline int stub_true_ul(unsigned long x) { return TRUE; } #ifdef __arm__ int get_phys_base_arm(void); int get_machdep_info_arm(void); unsigned long long vaddr_to_paddr_arm(unsigned long vaddr); #define get_phys_base()get_phys_base_arm() #define get_machdep_info() get_machdep_info_arm() -#define get_versiondep_info() TRUE +#define get_versiondep_info() stub_true() #define vaddr_to_paddr(X) vaddr_to_paddr_arm(X) -#define is_vmalloc_addr(X) TRUE +#define is_vmalloc_addr(X) stub_true_ul(X) #endif /* arm */ #ifdef __x86__ int get_machdep_info_x86(void); int get_versiondep_info_x86(void); unsigned long long vaddr_to_paddr_x86(unsigned long vaddr); -#define get_phys_base()TRUE +#define get_phys_base()stub_true() #define get_machdep_info() get_machdep_info_x86() #define get_versiondep_info() get_versiondep_info_x86() #define vaddr_to_paddr(X) vaddr_to_paddr_x86(X) -#define is_vmalloc_addr(X) TRUE +#define is_vmalloc_addr(X) stub_true_ul(X) #endif /* x86 */ #ifdef __x86_64__ @@ -798,31 +800,31 @@ unsigned long long vaddr_to_paddr_x86_64 int get_machdep_info_ppc64(void); int 
get_versiondep_info_ppc64(void); unsigned long long vaddr_to_paddr_ppc64(unsigned long vaddr); -#define get_phys_base()TRUE +#define get_phys_base()stub_true() #define get_machdep_info() get_machdep_info_ppc64() #define get_versiondep_info() get_versiondep_info_ppc64() #define vaddr_to_paddr(X) vaddr_to_paddr_ppc64(X) -#define is_vmalloc_addr(X) TRUE +#define is_vmalloc_addr(X) stub_true_ul(X) #endif /* powerpc64 */ #ifdef __powerpc32__ /* powerpc32 */ int get_machdep_info_ppc(void); unsigned long long vaddr_to_paddr_ppc(unsigned long vaddr); -#define get_phys_base()TRUE +#define get_phys_base()stub_true() #define get_machdep_info() get_machdep_info_ppc() -#define get_versiondep_info() TRUE +#define get_versiondep_info() stub_true() #define vaddr_to_paddr(X) vaddr_to_paddr_ppc(X) -#define is_vmalloc_addr(X) TRUE +#define is_vmalloc_addr(X) stub_true_ul(X) #endif /* powerpc32 */ #ifdef __s390x__ /* s390x */ int get_machdep_info_s390x(void); unsigned long long vaddr_to_paddr_s390x(unsigned long vaddr); -#define get_phys_base()TRUE +#define get_phys_base()stub_true() #define get_machdep_info() get_machdep_info_s390x() -#define get_versiondep_info() TRUE +#define get_versiondep_info() stub_true() #define vaddr_to_paddr(X) vaddr_to_paddr_s390x(X) -#define is_vmalloc_addr(X) TRUE +#define is_vmalloc_addr(X) stub_true_ul(X) #endif /* s390x */ #ifdef __ia64__ /* ia64 */ @@ -831,10 +833,10 @@ int get_machdep_info_ia64(void); unsigned long long vaddr_to_paddr_ia64(unsigned long vaddr); #define get_machdep_info() get_machdep_info_ia64() #define get_phys_base()get_phys_base_ia64() -#define get_versiondep_info() TRUE +#define get_versiondep_info() stub_true() #define vaddr_to_paddr(X) vaddr_to_paddr_ia64(X) #define VADDR_REGION(X)(((unsigned long)(X)) REGION_SHIFT) -#define is_vmalloc_addr(X) TRUE +#define is_vmalloc_addr(X) stub_true_ul(X) #endif /* ia64 */ typedef unsigned long long mdf_pfn_t; ___ kexec mailing list kexec@lists.infradead.org 
http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] makedumpfile: Enable --mem-usage for s390x
On Tue, 14 Oct 2014 07:19:13 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: [snip] I understand why your patch works on s390x, so how about this way ? 1. Define is_phys_addr() for --mem-usage. 2. Implement is_phys_addr() using is_iomem_phys_addr() for s390x while x86_64 uses is_vmalloc_addr_x86_64(). 3. Use is_phys_addr() instead of is_vmalloc_addr() in get_kcore_dump_loads(). Hello Atsushi, Great, so let's do that. @Baoquan: If you want to use the is_iomem_phys_addr() function also for x86, you could perhaps add an additional patch. Here the updated patch: --- [PATCH] makedumpfile: Enable --mem-usage for s390x Replace is_vmalloc_addr() by is_phys_addr() and implement is_phys_addr() using /proc/iommem parsing to enable the new makedumpfile option --mem-usage on s390x. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- elf_info.c |4 ++-- makedumpfile.c | 26 ++ makedumpfile.h | 15 --- 3 files changed, 36 insertions(+), 9 deletions(-) --- a/elf_info.c +++ b/elf_info.c @@ -854,7 +854,7 @@ int get_kcore_dump_loads(void) for (i = 0; i num_pt_loads; ++i) { struct pt_load_segment *p = pt_loads[i]; - if (is_vmalloc_addr(p-virt_start)) + if (!is_phys_addr(p-virt_start)) continue; loads++; } @@ -874,7 +874,7 @@ int get_kcore_dump_loads(void) for (i = 0, j = 0; i num_pt_loads; ++i) { struct pt_load_segment *p = pt_loads[i]; - if (is_vmalloc_addr(p-virt_start)) + if (!is_phys_addr(p-virt_start)) continue; if (j = loads) return FALSE; --- a/makedumpfile.c +++ b/makedumpfile.c @@ -9227,6 +9227,32 @@ int is_crashkernel_mem_reserved(void) return !!crash_reserved_mem_nr; } +struct addr_check { + unsigned long addr; + int found; +}; + +static int phys_addr_callback(void *data, int nr, char *str, + unsigned long base, unsigned long length) +{ + struct addr_check *addr_check = data; + unsigned long addr = addr_check-addr; + + if (addr = base addr base + length) { + addr_check-found = 1; + return -1; + } + return 0; +} + +int is_iomem_phys_addr(unsigned long addr) 
+{ + struct addr_check addr_check = {addr, 0}; + + iomem_for_each_line(System RAM\n, phys_addr_callback, addr_check); + return addr_check.found; +} + static int get_page_offset(void) { struct utsname utsname; --- a/makedumpfile.h +++ b/makedumpfile.h @@ -765,7 +765,7 @@ unsigned long long vaddr_to_paddr_arm(un #define get_machdep_info() get_machdep_info_arm() #define get_versiondep_info() TRUE #define vaddr_to_paddr(X) vaddr_to_paddr_arm(X) -#define is_vmalloc_addr(X) TRUE +#define is_phys_addr(X)TRUE #endif /* arm */ #ifdef __x86__ @@ -776,7 +776,7 @@ unsigned long long vaddr_to_paddr_x86(un #define get_machdep_info() get_machdep_info_x86() #define get_versiondep_info() get_versiondep_info_x86() #define vaddr_to_paddr(X) vaddr_to_paddr_x86(X) -#define is_vmalloc_addr(X) TRUE +#define is_phys_addr(X)TRUE #endif /* x86 */ #ifdef __x86_64__ @@ -789,7 +789,7 @@ unsigned long long vaddr_to_paddr_x86_64 #define get_machdep_info() get_machdep_info_x86_64() #define get_versiondep_info() get_versiondep_info_x86_64() #define vaddr_to_paddr(X) vaddr_to_paddr_x86_64(X) -#define is_vmalloc_addr(X) is_vmalloc_addr_x86_64(X) +#define is_phys_addr(X)(!is_vmalloc_addr_x86_64(X) #endif /* x86_64 */ #ifdef __powerpc64__ /* powerpc64 */ @@ -800,7 +800,7 @@ unsigned long long vaddr_to_paddr_ppc64( #define get_machdep_info() get_machdep_info_ppc64() #define get_versiondep_info() get_versiondep_info_ppc64() #define vaddr_to_paddr(X) vaddr_to_paddr_ppc64(X) -#define is_vmalloc_addr(X) TRUE +#define is_phys_addr(X)TRUE #endif /* powerpc64 */ #ifdef __powerpc32__ /* powerpc32 */ @@ -810,7 +810,7 @@ unsigned long long vaddr_to_paddr_ppc(un #define get_machdep_info() get_machdep_info_ppc() #define get_versiondep_info() TRUE #define vaddr_to_paddr(X) vaddr_to_paddr_ppc(X) -#define is_vmalloc_addr(X) TRUE +#define is_phys_addr(X)TRUE #endif /* powerpc32 */ #ifdef __s390x__ /* s390x */ @@ -820,7 +820,7 @@ unsigned long long vaddr_to_paddr_s390x( #define get_machdep_info() 
get_machdep_info_s390x() #define get_versiondep_info() TRUE #define vaddr_to_paddr(X) vaddr_to_paddr_s390x(X) -#define is_vmalloc_addr(X) TRUE +#define is_phys_addr(X)is_iomem_phys_addr(X) #endif /* s390x */ #ifdef __ia64__ /* ia64 */ @@ -832,7 +832,7 @@ unsigned long long vaddr_to_paddr_ia64(u #define get_versiondep_info
Re: [PATCH] makedumpfile: Enable --mem-usage for s390x
On Thu, 9 Oct 2014 06:41:10 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: Hello, On Tue, 30 Sep 2014 17:02:01 +0800 Baoquan He b...@redhat.com wrote: On 09/29/14 at 03:14pm, Michael Holzheu wrote: Implement is_vmalloc_addr() using /proc/iommem parsing to enable the new makedumpfile option --mem-usage. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com Hi Michael, This idea looks good to me. One question, should it be put in arch/s390.c since this is only for s390? Then iomem_for_each_line() need be declared in makedumpfile.h . If later it's needed by other arch, can be taken out to makedumpfile.c, that should be better. Surely this is only my personal concern, if Atsushi like to accept it, I am fine too. Hello Atsushi, What is your preference regarding this question? Michael In the first place, this is_vmalloc_addr() for s390 isn't a good implementation because it works only on 1st kernel due to the dependence on /proc/iomem. is_vmalloc_addr_XXX() are general functions, they can be called from any path besides --mem-usage. I think the Michael's first idea below is better since it implements is_real_addr() only for --mem-usage as a common function for all architectures, it's explicit design. @@ -854,7 +854,7 @@ int get_kcore_dump_loads(void) for (i = 0; i num_pt_loads; ++i) { struct pt_load_segment *p = pt_loads[i]; -if (is_vmalloc_addr(p-virt_start)) +if (!is_real_addr(p-virt_start)) continue; loads++; } However, this code will not work since the argument of is_real_addr() must be physical address. Even unluckily, /proc/kcore's PT_LOAD looks useless for VtoP converting because PhysAddr is always 0: Program Headers: Type Offset VirtAddr PhysAddr FileSizMemSiz Flags Align NOTE 0x02a8 0x 0x 0x0a84 0x 0 LOAD 0x7f601000 0xff60 0x 0x1000 0x1000 RWE1000 LOAD 0x7fff81001000 0x8100 0x 0x00a1b000 0x00a1b000 RWE1000 LOAD 0x49001000 0xc900 0x 0x1fff 0x1fff RWE1000 LOAD 0x7fffa0001000 0xa000 0x 0x5f00 0x5f00 RWE1000 ... 
So the way using /proc/iomem seems inappropriate, we have to consider other approaches (but I still don't have any good ideas...) Hello Atsushi, Hmmm ok, sure. For x86 using /proc/iomem does not work because there is no 1:1 mapping for the kernel address space. The kernel/real memory is mapped somewhere at the end, right? For s390 we have a 1:1 mapping for the kernel physical memory that starts with zero. Therefore IMHO we could use /proc/iomem. For example, on my s390x system with 1GB memory and 256MB crashkernel: $ cat /proc/iomem -2fff : System RAM -007ddd4b : Kernel code 007ddd4c-00bfcc5f : Kernel data 00e18000-01c28b1f : Kernel bss 3000-3fff : Crash kernel $ objdump -h /proc/kcore ... Type Offset VirtAddr PhysAddr FileSizMemSiz Flags Align NOTE 0x0158 0x 0x 0x27fc 0x 0 LOAD 0x03e080003000 0x03e08000 0x 0x001f 0x001f RWE1000 LOAD 0x03ff80003000 0x03ff8000 0x 0x8000 0x8000 RWE1000 LOAD 0x3000 0x 0x 0x3000 0x3000 RWE1000 LOAD 0x03d13000 0x03d1 0x 0x00c0 0x00c0 RWE1000 So in that case every /proc/kcore load that is below 0x3000 must be real memory. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] makedumpfile: Enable --mem-usage for s390x
On Tue, 30 Sep 2014 17:02:01 +0800 Baoquan He b...@redhat.com wrote: On 09/29/14 at 03:14pm, Michael Holzheu wrote: Implement is_vmalloc_addr() using /proc/iommem parsing to enable the new makedumpfile option --mem-usage. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com Hi Michael, This idea looks good to me. One question, should it be put in arch/s390.c since this is only for s390? Then iomem_for_each_line() need be declared in makedumpfile.h . If later it's needed by other arch, can be taken out to makedumpfile.c, that should be better. Surely this is only my personal concern, if Atsushi like to accept it, I am fine too. Hello Atsushi, What is your preference regarding this question? Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: Add --mem-usage support for s390x
On Mon, 29 Sep 2014 17:04:32 +0800 Baoquan He b...@redhat.com wrote: On 09/26/14 at 01:34pm, Michael Holzheu wrote: On Fri, 26 Sep 2014 16:55:46 +0800 Isn't this a chicken-and-egg problem? In order to determine vmalloc start I have to be able to read memory. But in order to read memory I have to call get_kcore_dump_loads() first. What about using /proc/iomem to find out if an address is a real address? Well, that's good it works for s390. Anyway in get_kcore_dump_loads() it just gets the physical ram region, and filter out the unwanted region, so your method is good. In x86_64, the is_vmalloc_addr_x86_64 is not only filtering the vmalloc, but vmmemmap and modules_vaadr region. For simplicity it's only named as is_vmalloc_addr. Not sure if I understood, why ths is_real_addr() function does not work for x86_64. Also for x86 all three areas, vmalloc, vmemmap, and modules_vaddr, are virtual memory regions with addresses outside of the the memory ranges where /proc/iommem reports physical memory, right? So the new is_real_addr() function should return false for that areas. is_real_addr() should work for x86_64, this almost does the way kexec-tools is doing. Originally I just consider this for x86_64, skipped other ARCHs. From x86_64 point of view, processing kcore only need to pick up the program segments we want. If it's hard to handle it, I am fine with it that you use the is_real_addr to do it. But please don't use this name, it could be is_phy_addr() or somthing like that. Please post your patch and test x86_64 too and we can review it. In fact the other way is wrapping your is_real_addr() to is_vmalloc_addr_s390(), and make it work for s390. If later other ARCH also has this requirements, raise it to be common function. Anyway it's fine to me if it can work. Hi Baoquan, Because I don't have the possibility to test on x86_64 I decided to make a s390x only patch. I will send it with the next note. Thanks for your help! 
Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH] makedumpfile: Enable --mem-usage for s390x
Implement is_vmalloc_addr() using /proc/iomem parsing to enable the new makedumpfile option --mem-usage. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- makedumpfile.c | 26 ++ makedumpfile.h |3 ++- 2 files changed, 28 insertions(+), 1 deletion(-) --- a/makedumpfile.c +++ b/makedumpfile.c @@ -9227,6 +9227,32 @@ int is_crashkernel_mem_reserved(void) return !!crash_reserved_mem_nr; } +struct addr_check { + unsigned long addr; + int found; +}; + +static int phys_addr_callback(void *data, int nr, char *str, + unsigned long base, unsigned long length) +{ + struct addr_check *addr_check = data; + unsigned long addr = addr_check->addr; + + if (addr >= base && addr < base + length) { + addr_check->found = 1; + return -1; + } + return 0; +} + +int is_iomem_phys_addr(unsigned long addr) +{ + struct addr_check addr_check = {addr, 0}; + + iomem_for_each_line("System RAM\n", phys_addr_callback, &addr_check); + return addr_check.found; +} + static int get_page_offset(void) { struct utsname utsname; --- a/makedumpfile.h +++ b/makedumpfile.h @@ -820,7 +820,7 @@ unsigned long long vaddr_to_paddr_s390x( #define get_machdep_info() get_machdep_info_s390x() #define get_versiondep_info() TRUE #define vaddr_to_paddr(X) vaddr_to_paddr_s390x(X) -#define is_vmalloc_addr(X) TRUE +#define is_vmalloc_addr(X) (!is_iomem_phys_addr(X)) #endif /* s390x */ #ifdef __ia64__ /* ia64 */ @@ -1567,6 +1567,7 @@ int read_disk_dump_header(struct disk_du int read_kdump_sub_header(struct kdump_sub_header *kh, char *filename); void close_vmcoreinfo(void); int close_files_for_creating_dumpfile(void); +int is_iomem_phys_addr(unsigned long addr); /* ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: Add --mem-usage support for s390x
On Fri, 26 Sep 2014 16:55:46 +0800 Baoquan He b...@redhat.com wrote: On 09/26/14 at 10:10am, Michael Holzheu wrote: On Thu, 25 Sep 2014 17:44:12 +0800 Baoquan He b...@redhat.com wrote: On 09/24/14 at 05:19pm, Michael Holzheu wrote: On Tue, 23 Sep 2014 10:40:58 +0800 Baoquan He b...@redhat.com wrote: [snip] For s390x this is not so easy because vmalloc_start is dependent on the memory size of the system (see setup_memory_end() in arch/s390/kernel/setup.c). Unfortunately info-max_mapnr is not set at that time. I am not aware of s390 arch and memory layout. But I can explain what those versiondep_info are used for, hope they can help. In fact in x86_64, page_offset is got for set_kcore_vmcoreinfo(), there the vmcoreinfo_addr need be converted to kvaddr. Since vmcoreinfo_addr is a physical addr, we can't use it directly. And VMALLOC_START/VMEMMAP_START/MODULES_VADDR are all used to filter this virtual addr space region since our vmcore only care about the physical ram addr region. If you need get these before they are used for s390 arch. If necessary you can build a different code flow if you can achive the goal. All these are all used to get dumpable load segments from kcore. Isn't this a chicken-and-egg problem? In order to determine vmalloc start I have to be able to read memory. But in order to read memory I have to call get_kcore_dump_loads() first. What about using /proc/iomem to find out if an address is a real address? Well, that's good it works for s390. Anyway in get_kcore_dump_loads() it just gets the physical ram region, and filter out the unwanted region, so your method is good. In x86_64, the is_vmalloc_addr_x86_64 is not only filtering the vmalloc, but vmmemmap and modules_vaadr region. For simplicity it's only named as is_vmalloc_addr. Not sure if I understood, why ths is_real_addr() function does not work for x86_64. 
Also for x86 all three areas, vmalloc, vmemmap, and modules_vaddr, are virtual memory regions with addresses outside of the the memory ranges where /proc/iommem reports physical memory, right? So the new is_real_addr() function should return false for that areas. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: Add --mem-usage support for s390x
On Tue, 23 Sep 2014 10:40:58 +0800 Baoquan He b...@redhat.com wrote: On 09/22/14 at 05:02pm, Michael Holzheu wrote: Hello Baoquan, I looked into your patches and tried to add s390x support. My naive approach was to just enable the is_vmalloc_addr() for s390x: --- a/makedumpfile.h +++ b/makedumpfile.h @@ -814,13 +814,15 @@ unsigned long long vaddr_to_paddr_ppc(un #endif /* powerpc32 */ #ifdef __s390x__ /* s390x */ +int is_vmalloc_addr_s390x(ulong vaddr); int get_machdep_info_s390x(void); unsigned long long vaddr_to_paddr_s390x(unsigned long vaddr); #define get_phys_base()TRUE #define get_machdep_info() get_machdep_info_s390x() #define get_versiondep_info() TRUE #define vaddr_to_paddr(X) vaddr_to_paddr_s390x(X) -#define is_vmalloc_addr(X) TRUE +#define is_vmalloc_addr(X) is_vmalloc_addr_s390x(X) #endif /* s390x */ #ifdef __ia64__ /* ia64 */ Hi Michael, Please alse provide a get_versiondep_info_s390x since page_offset is needed in set_kcore_vmcoreinfo() and other information need it too, such as VMALLOC_START/VMEMMAP_START/MODULES_VADDR, if you want to provide a is_vmalloc_addr_s390x before initial() is called. Hello Baoquan, Thanks for the hint! I looked into the x86_64 implementation of get_versiondep_info() where version dependent constants are used for vmalloc_start and others. For s390x this is not so easy because vmalloc_start is dependent on the memory size of the system (see setup_memory_end() in arch/s390/kernel/setup.c). Unfortunately info-max_mapnr is not set at that time. Any ideas? Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 2/2] makedumpfile: Use max_pfn from mem_map array
On Tue, 1 Apr 2014 05:06:33 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: [...] OTOH, Michal's patch is still neded to fix non-Xen non-cyclic dumps. Yes, the fix for create_1st_bitmap() is still necessary. Michael, could you fix your patch ? We need to add the conditional check for Xen like below: + if (!is_xen_memory()) { +for (i = 0; i info-num_mem_map; i++) { + if (info-mem_map_data[i].mem_map == NOT_MEMMAP_ADDR) + continue; + max_pfn = MAX(max_pfn, info-mem_map_data[i].pfn_end); + } + info-max_mapnr = MIN(info-max_mapnr, max_pfn); + } Hello Atsushi and Petr, Based on the discussion I removed the checks in exclude_xen3_user_domain() and exclude_xen4_user_domain() and added the is_xen_memory() check int get_mem_map(). Here the updated patch: --- [PATCH] makedumpfile: Fix bitmap create for adjusted info-max_mapnr If info-max_mapnr has been adjusted, for example because the dumped system has specified the mem= kernel parameter, makedumpfile writes the following error messages for Xen dumps or when the --non-cyclic option has been specified: set_bitmap: Can't read the bitmap(/tmp/kdump_bitmapBsKAUe). Invalid argument Fix this and consider info-max_mapnr in the create_1st_bitmap() function. In addition to this, do not adjust max_mapnr for Xen dumps. For Xen info-max_mapnr gives the maximum machine PFN and the data in mem_section describes the Dom0 kernel memory map that gets initialized from info-dom0_mapnr. It may be substantially smaller than info-max_mapnr. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- makedumpfile.c | 15 ++- 1 file changed, 10 insertions(+), 5 deletions(-) --- a/makedumpfile.c +++ b/makedumpfile.c @@ -2868,12 +2868,14 @@ get_mem_map(void) * than is dumped. For example when mem= has been used for the * dumped system. 
*/ - for (i = 0; i < info->num_mem_map; i++) { - if (info->mem_map_data[i].mem_map == NOT_MEMMAP_ADDR) - continue; - max_pfn = MAX(max_pfn, info->mem_map_data[i].pfn_end); + if (!is_xen_memory()) { + for (i = 0; i < info->num_mem_map; i++) { + if (info->mem_map_data[i].mem_map == NOT_MEMMAP_ADDR) + continue; + max_pfn = MAX(max_pfn, info->mem_map_data[i].pfn_end); + } + info->max_mapnr = MIN(info->max_mapnr, max_pfn); } - info->max_mapnr = MIN(info->max_mapnr, max_pfn); return ret; } @@ -4402,6 +4404,9 @@ create_1st_bitmap(void) pfn_start = paddr_to_pfn(phys_start); pfn_end = paddr_to_pfn(phys_end); + if (pfn_start >= info->max_mapnr) + continue; + pfn_end = MIN(pfn_end, info->max_mapnr); for (pfn = pfn_start; pfn < pfn_end; pfn++) { set_bit_on_1st_bitmap(pfn); ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 2/2] makedumpfile: Use max_pfn from mem_map array
On Mon, 31 Mar 2014 09:48:05 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: [snip] That's because the bitmap length is calculated in prepare_bitmap_buffer using info-max_mapnr, but create_1st_bitmap() still loops over all PT_LOAD segments, calling set_bit_on_1st_bitmap() for each PFN. The offset may easily fall beyond the bitmap size. What about the following patch. It works for me when I specify the --non-cyclic option. Michael --- [PATCH] makedumpfile: Fix bitmap create for adjusted info-max_mapnr If info-max_mapnr has been adjusted, for example because the dumped system has specified the mem= kernel parameter, makedumpfile writes the following error messages for Xen dumps or when the --non-cyclic option has been specified: set_bitmap: Can't read the bitmap(/tmp/kdump_bitmap96s9V8). Success This looks confusing, is it an actual message ? I suppose it must be Invalid argument like Petr's log. Right, I get Invalid argument. No idea from where I pasted Success here... Fix this and consider info-max_mapnr in the create bitmap functions. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- [snip] I found another issue of truncating max_mapnr for Xen. The bitmap manages MFN(machine frame number) for Xen while __exclude_unnecessary_pages() treats PFN(guest-physical frame number). __exclude_unnecessary_pages() expects that all bits of PFNs are mapped in the bitmap even if it was reduced by truncated max_mapnr. However, PtoM mapping isn't linear(probably...), there is no guarantee that a set of continuous PFNs is mapped in a set of continuous MFNs. So the actual I/O offset can exceed the bitmap size when the bitmap size is reduced. In the first place, we shouldn't truncate max_mapnr based on dom0's mem_section since there are some domU's memories on Xen dumps. Now, I think a better way for Xen is just leaving max_mapnr as it is. Do you agree with my view ? I don't know the Xen details so I would leave it to Petr to answer this question. 
Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 2/2] makedumpfile: Use max_pfn from mem_map array
On Fri, 28 Mar 2014 12:00:47 +0100 Petr Tesarik ptesa...@suse.cz wrote: On Thu, 27 Mar 2014 14:54:41 +0100 Michael Holzheu holz...@linux.vnet.ibm.com wrote: [snip] Here the fixed patch: Thanks, I'll merge the fixed version into v1.5.6. Great! I'm sorry to spoil the party, but this patch broke Xen dumps for me. I'm getting an long series of these messages: set_bitmap: Can't read the bitmap(/tmp/kdump_bitmap91pbsO). Invalid argument set_bitmap: Can't read the bitmap(/tmp/kdump_bitmap91pbsO). Invalid argument set_bitmap: Can't read the bitmap(/tmp/kdump_bitmap91pbsO). Invalid argument ... In fact, it most likely broke all non-cyclic dumps. That's because the bitmap length is calculated in prepare_bitmap_buffer using info-max_mapnr, but create_1st_bitmap() still loops over all PT_LOAD segments, calling set_bit_on_1st_bitmap() for each PFN. The offset may easily fall beyond the bitmap size. What about the following patch. It works for me when I specify the --non-cyclic option. Michael --- [PATCH] makedumpfile: Fix bitmap create for adjusted info-max_mapnr If info-max_mapnr has been adjusted, for example because the dumped system has specified the mem= kernel parameter, makedumpfile writes the following error messages for Xen dumps or when the --non-cyclic option has been specified: set_bitmap: Can't read the bitmap(/tmp/kdump_bitmap96s9V8). Success Fix this and consider info-max_mapnr in the create bitmap functions. 
Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- makedumpfile.c |9 + 1 file changed, 9 insertions(+) --- a/makedumpfile.c +++ b/makedumpfile.c @@ -4402,6 +4402,9 @@ create_1st_bitmap(void) pfn_start = paddr_to_pfn(phys_start); pfn_end = paddr_to_pfn(phys_end); + if (pfn_start info-max_mapnr) + continue; + pfn_end = MIN(phys_end, info-max_mapnr); for (pfn = pfn_start; pfn pfn_end; pfn++) { set_bit_on_1st_bitmap(pfn); @@ -7511,6 +7514,9 @@ exclude_xen3_user_domain(void) pfn = paddr_to_pfn(phys_start); pfn_end = paddr_to_pfn(phys_end); size= pfn_end - pfn; + if (pfn info-max_mapnr) + continue; + pfn_end = MIN(phys_end, info-max_mapnr); for (j = 0; pfn pfn_end; pfn++, j++) { print_progress(PROGRESS_XEN_DOMAIN, j + (size * i), @@ -7575,6 +7581,9 @@ exclude_xen4_user_domain(void) pfn = paddr_to_pfn(phys_start); pfn_end = paddr_to_pfn(phys_end); size= pfn_end - pfn; + if (pfn info-max_mapnr) + continue; + pfn_end = MIN(phys_end, info-max_mapnr); for (j = 0; pfn pfn_end; pfn++, j++) { print_progress(PROGRESS_XEN_DOMAIN, j + (size * i), ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 2/2] makedumpfile: Use max_pfn from mem_map array
On Fri, 28 Mar 2014 12:00:47 +0100 Petr Tesarik ptesa...@suse.cz wrote: On Thu, 27 Mar 2014 14:54:41 +0100 Michael Holzheu holz...@linux.vnet.ibm.com wrote: On Thu, 27 Mar 2014 05:19:06 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: Hello Michael, On Wed, 26 Mar 2014 10:55:07 +0100 (a/T) HATAYAMA Daisuke d.hatay...@jp.fujitsu.com wrote: From: Michael Holzheu holz...@linux.vnet.ibm.com Subject: [PATCH 2/2] makedumpfile: Use max_pfn from mem_map array Date: Tue, 25 Mar 2014 17:14:20 +0100 [snip] With this patch makedumpfile gets the maximum page frame number from the mem_map array and adjusts info-max_mapnr if this value is smaller than the value calculated from the ELF header. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- makedumpfile.c | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) --- a/makedumpfile.c +++ b/makedumpfile.c @@ -2829,7 +2829,8 @@ get_mem_map_without_mm(void) int get_mem_map(void) { -int ret; +unsigned long max_pfn = 0; +int ret, i; Please define max_pfn as unsigned long long. Ok done. And for i, switch (get_mem_type()) { case SPARSEMEM: @@ -2861,6 +2862,17 @@ get_mem_map(void) ret = FALSE; break; } +/* + * Adjust max_mapnr for the case that Linux uses less memory + * than is dumped. For example when mem= has been used for the + * dumped system. + */ +for (i = 0; i info-num_mem_map; i++) { info-num_mem_map is defined as unsigned int. I guess some warning about comparison with different signedness occurs. Ah ok... With the default CFLAGS for makedumpfile-1.5.5 tarball I do not get any warning. When I add -W to CFLAGS, I get lots of warnings including the one you mentioned. Here the fixed patch: Thanks, I'll merge the fixed version into v1.5.6. Great! I'm sorry to spoil the party, but this patch broke Xen dumps for me. I'm getting an long series of these messages: set_bitmap: Can't read the bitmap(/tmp/kdump_bitmap91pbsO). Invalid argument set_bitmap: Can't read the bitmap(/tmp/kdump_bitmap91pbsO). 
Invalid argument set_bitmap: Can't read the bitmap(/tmp/kdump_bitmap91pbsO). Invalid argument ... In fact, it most likely broke all non-cyclic dumps. That's because the bitmap length is calculated in prepare_bitmap_buffer using info-max_mapnr, but create_1st_bitmap() still loops over all PT_LOAD segments, calling set_bit_on_1st_bitmap() for each PFN. The offset may easily fall beyond the bitmap size. Hello Petr, I can reproduce your issue with the --non-cyclic option and I will look into this. Thanks for testing this! Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 2/2] makedumpfile: Use max_pfn from mem_map array
On Thu, 27 Mar 2014 05:19:06 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: Hello Michael, On Wed, 26 Mar 2014 10:55:07 +0100 (a/T) HATAYAMA Daisuke d.hatay...@jp.fujitsu.com wrote: From: Michael Holzheu holz...@linux.vnet.ibm.com Subject: [PATCH 2/2] makedumpfile: Use max_pfn from mem_map array Date: Tue, 25 Mar 2014 17:14:20 +0100 [snip] With this patch makedumpfile gets the maximum page frame number from the mem_map array and adjusts info-max_mapnr if this value is smaller than the value calculated from the ELF header. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- makedumpfile.c | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) --- a/makedumpfile.c +++ b/makedumpfile.c @@ -2829,7 +2829,8 @@ get_mem_map_without_mm(void) int get_mem_map(void) { -int ret; +unsigned long max_pfn = 0; +int ret, i; Please define max_pfn as unsigned long long. Ok done. And for i, switch (get_mem_type()) { case SPARSEMEM: @@ -2861,6 +2862,17 @@ get_mem_map(void) ret = FALSE; break; } +/* + * Adjust max_mapnr for the case that Linux uses less memory + * than is dumped. For example when mem= has been used for the + * dumped system. + */ +for (i = 0; i info-num_mem_map; i++) { info-num_mem_map is defined as unsigned int. I guess some warning about comparison with different signedness occurs. Ah ok... With the default CFLAGS for makedumpfile-1.5.5 tarball I do not get any warning. When I add -W to CFLAGS, I get lots of warnings including the one you mentioned. Here the fixed patch: Thanks, I'll merge the fixed version into v1.5.6. Great! Thanks for your support! Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 2/2] makedumpfile: Use max_pfn from mem_map array
On Wed, 26 Mar 2014 10:55:07 +0100 (a/T) HATAYAMA Daisuke d.hatay...@jp.fujitsu.com wrote: From: Michael Holzheu holz...@linux.vnet.ibm.com Subject: [PATCH 2/2] makedumpfile: Use max_pfn from mem_map array Date: Tue, 25 Mar 2014 17:14:20 +0100 [snip] With this patch makedumpfile gets the maximum page frame number from the mem_map array and adjusts info-max_mapnr if this value is smaller than the value calculated from the ELF header. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- makedumpfile.c | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) --- a/makedumpfile.c +++ b/makedumpfile.c @@ -2829,7 +2829,8 @@ get_mem_map_without_mm(void) int get_mem_map(void) { - int ret; + unsigned long max_pfn = 0; + int ret, i; Please define max_pfn as unsigned long long. Ok done. And for i, switch (get_mem_type()) { case SPARSEMEM: @@ -2861,6 +2862,17 @@ get_mem_map(void) ret = FALSE; break; } + /* +* Adjust max_mapnr for the case that Linux uses less memory +* than is dumped. For example when mem= has been used for the +* dumped system. +*/ + for (i = 0; i info-num_mem_map; i++) { info-num_mem_map is defined as unsigned int. I guess some warning about comparison with different signedness occurs. Ah ok... With the default CFLAGS for makedumpfile-1.5.5 tarball I do not get any warning. When I add -W to CFLAGS, I get lots of warnings including the one you mentioned. Here the fixed patch: --- [PATCH 2/2] makedumpfile: Use max_pfn from mem_map array There are dump mechansims like s390 stand-alond dump or KVM virsh dump that write the physical memory of a machine and are not aware of the dumped operating system. For those dump mechanisms it can happen that for the Linux kernel of the dumped system the mem= kernel parameter has been specified. In this case max_mapnr that makedumpfile gets from the ELF header can be bigger than the maximum page frame number used by the dumped Linux kernel. 
With this patch makedumpfile gets the maximum page frame number from the mem_map array and adjusts info-max_mapnr if this value is smaller than the value calculated from the ELF header. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- makedumpfile.c | 13 + 1 file changed, 13 insertions(+) --- a/makedumpfile.c +++ b/makedumpfile.c @@ -2829,6 +2829,8 @@ get_mem_map_without_mm(void) int get_mem_map(void) { + unsigned long long max_pfn = 0; + unsigned int i; int ret; switch (get_mem_type()) { @@ -2861,6 +2863,17 @@ get_mem_map(void) ret = FALSE; break; } + /* +* Adjust max_mapnr for the case that Linux uses less memory +* than is dumped. For example when mem= has been used for the +* dumped system. +*/ + for (i = 0; i info-num_mem_map; i++) { + if (info-mem_map_data[i].mem_map == NOT_MEMMAP_ADDR) + continue; + max_pfn = MAX(max_pfn, info-mem_map_data[i].pfn_end); + } + info-max_mapnr = MIN(info-max_mapnr, max_pfn); return ret; } ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: makedumpfile: get_max_mapnr() from ELF header problem
On Tue, 25 Mar 2014 01:14:21 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: [snip] But it looks like get_mm_sparsemem() does not check for zero. The nr_to_section() function just returns an invalid address (something between 0 and 4096) for section in case we get zero from the mem_section entry. This is address is then used for calculating mem_map: In other architectures, the check by is_kaddr() avoids to read invalid address, but it doesn't do anything in the case of s390 due to the its memory management mechanism: s390x: Fix KVBASE to correct value fors390x architecture. http://lists.infradead.org/pipermail/kexec/2011-March/004930.html Right, for s390 the zero page is valid. Finally I've understood the cause of this issue completely, thanks for your report. mem_map = section_mem_map_addr(section); mem_map = sparse_decode_mem_map(mem_map, section_nr); With the patch below I could use makedumpfile (1.5.3) successfully on the 1TB dump with mem=1G. I attached the -D output that is created by makedumpfile with the patch. But compared to my first patch it takes much longer and the resulting dump is bigger (version 1.5.3): | Dump time | Dump size -+-+--- First patch | 10 sec | 124 MB Second patch | 87 minutes | 6348 MB No idea why the dump is bigger with the second patch. I think the time is consumed in write_kdump_pages_cyclic() by checking for zero pages for the whole range: I suppose this difference was resolved with the v2 of the second patch, right? Right, with the last patch the dump time and size were ok. [snip] So the first patch would be better for my scenario. What in particular are your concerns with that patch? I think the v2 second patch is a reasonable patch to fix the bug of get_mm_sparsemem(). Additionally, the latest patch you posted to adjust max_mapnr (which using mem_map_data[]) is acceptable instead of the first patch. So could you re-post the two as a formal patch set? I mean patch descriptions and your signature are needed. Ok great! 
I will resend the patches. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 0/2] makedumpfile: Add mem= handling for physical memory dumps
There are dump mechanisms like s390 stand-alone dump or KVM virsh dump that write the physical memory of a machine and that are not aware of the dumped operating system. If for the Linux kernel of the dumped system the mem= kernel parameter has been specified, the max_mapnr that makedumpfile gets from the ELF header can be bigger than the maximum page frame number used by the dumped Linux kernel. This can lead to makedumpfile errors on s390x and can also lead to extended dump times and sizes. The following two patches for version 1.5.5 fix these issues: Michael Holzheu (2): makedumpfile: Fix zero checking of get_mm_sparsemem() makedumpfile: Use max_pfn from mem_map array makedumpfile.c | 36 ++-- 1 file changed, 30 insertions(+), 6 deletions(-) ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 1/2] makedumpfile: Fix zero checking of get_mm_sparsemem()
Currently zero entries in the mem_section array and the section arrays from the mem_section entries are not checked correctly. The problem especially hits the s390x architecture because there the is_kvaddr() function returns true for the first memory page. So add missing zero checks and set mem_map to NOT_MEMMAP_ADDR for all found zero entries. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- makedumpfile.c | 22 +- 1 file changed, 17 insertions(+), 5 deletions(-) --- a/makedumpfile.c +++ b/makedumpfile.c @@ -2690,11 +2690,14 @@ nr_to_section(unsigned long nr, unsigned { unsigned long addr; - if (is_sparsemem_extreme()) + if (is_sparsemem_extreme()) { + if (mem_sec[SECTION_NR_TO_ROOT(nr)] == 0) + return NOT_KV_ADDR; addr = mem_sec[SECTION_NR_TO_ROOT(nr)] + (nr SECTION_ROOT_MASK()) * SIZE(mem_section); - else + } else { addr = SYMBOL(mem_section) + (nr * SIZE(mem_section)); + } if (!is_kvaddr(addr)) return NOT_KV_ADDR; @@ -2778,10 +2781,19 @@ get_mm_sparsemem(void) } for (section_nr = 0; section_nr num_section; section_nr++) { section = nr_to_section(section_nr, mem_sec); - mem_map = section_mem_map_addr(section); - mem_map = sparse_decode_mem_map(mem_map, section_nr); - if (!is_kvaddr(mem_map)) + if (section == NOT_KV_ADDR) { mem_map = NOT_MEMMAP_ADDR; + } else { + mem_map = section_mem_map_addr(section); + if (mem_map == 0) { + mem_map = NOT_MEMMAP_ADDR; + } else { + mem_map = sparse_decode_mem_map(mem_map, + section_nr); + if (!is_kvaddr(mem_map)) + mem_map = NOT_MEMMAP_ADDR; + } + } pfn_start = section_nr * PAGES_PER_SECTION(); pfn_end = pfn_start + PAGES_PER_SECTION(); if (info-max_mapnr pfn_end) ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 2/2] makedumpfile: Use max_pfn from mem_map array
There are dump mechansims like s390 stand-alond dump or KVM virsh dump that write the physical memory of a machine and are not aware of the dumped operating system. For those dump mechanisms it can happen that for the Linux kernel of the dumped system the mem= kernel parameter has been specified. In this case max_mapnr that makedumpfile gets from the ELF header can be bigger than the maximum page frame number used by the dumped Linux kernel. With this patch makedumpfile gets the maximum page frame number from the mem_map array and adjusts info-max_mapnr if this value is smaller than the value calculated from the ELF header. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- makedumpfile.c | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) --- a/makedumpfile.c +++ b/makedumpfile.c @@ -2829,7 +2829,8 @@ get_mem_map_without_mm(void) int get_mem_map(void) { - int ret; + unsigned long max_pfn = 0; + int ret, i; switch (get_mem_type()) { case SPARSEMEM: @@ -2861,6 +2862,17 @@ get_mem_map(void) ret = FALSE; break; } + /* +* Adjust max_mapnr for the case that Linux uses less memory +* than is dumped. For example when mem= has been used for the +* dumped system. +*/ + for (i = 0; i info-num_mem_map; i++) { + if (info-mem_map_data[i].mem_map == NOT_MEMMAP_ADDR) + continue; + max_pfn = MAX(max_pfn, info-mem_map_data[i].pfn_end); + } + info-max_mapnr = MIN(info-max_mapnr, max_pfn); return ret; } ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: makedumpfile: get_max_mapnr() from ELF header problem
On Wed, 19 Mar 2014 07:14:25 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: Hello Atsushi, I debugged my problem a bit further and tried to implement a function that gets the maximum page frame number from the Linux kernel memory management structures. I am no memory management expert, so the following patch probably is not complete, but at least for my setup it worked. The patch looks good for your case, but I don't think it's a proper approach for this problem. Hello Atsushi, If you don't like that solution, what about using the mem_map_data[] array of makedumpfile to adjust max_mapnr? The patch below also works fine for my dump. Michael --- makedumpfile.c | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) --- a/makedumpfile.c +++ b/makedumpfile.c @@ -2829,7 +2829,8 @@ get_mem_map_without_mm(void) int get_mem_map(void) { - int ret; + unsigned long max_pfn = 0; + int ret, i; switch (get_mem_type()) { case SPARSEMEM: @@ -2861,6 +2862,17 @@ get_mem_map(void) ret = FALSE; break; } + /* +* Adjust max_mapnr for the case that Linux uses less memory +* than is dumped. For example when mem= has been used for the +* dumped system. +*/ + for (i = 0; i info-num_mem_map; i++) { + if (info-mem_map_data[i].mem_map == NOT_MEMMAP_ADDR) + continue; + max_pfn = MAX(max_pfn, info-mem_map_data[i].pfn_end); + } + info-max_mapnr = max_pfn; return ret; } ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: makedumpfile: get_max_mapnr() from ELF header problem
On Wed, 19 Mar 2014 07:14:25 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: Hello Atsushi, I debugged my problem a bit further and tried to implement a function that gets the maximum page frame number from the Linux kernel memory management structures. I am no memory management expert, so the following patch probably is not complete, but at least for my setup it worked. The patch looks good for your case, but I don't think it's a proper approach for this problem. Now, I think this is a problem of get_mm_sparsemem() in makedumpfile. To say in more detail, the problem is wrong calculating the address of unused mem_map. Looking at the log you sent, some addresses of mem_map corresponding to unused pages look invalid like below: mem_map (256) mem_map: 8c0002018 pfn_start : 100 pfn_end: 101 mem_map (257) mem_map: 8184040 pfn_start : 101 pfn_end: 102 ... mem_map (544) mem_map: a82400012f14fffc pfn_start : 220 pfn_end: 221 ...(and more) However, makedumpfile should calculate such unused mem_map addresses as 0(NOT_MEMMAP_ADDR). Actually it works as expected at least in my environment(x86_64): ... mem_map (16) mem_map: 0 pfn_start : 8 pfn_end: 88000 mem_map (17) mem_map: 0 pfn_start : 88000 pfn_end: 9 ... makedumpfile get the address from mem_section.section_mem_map, it will be initialized with zero: [CONFIG_SPARSEMEM_EXTREAM] paging_init() sparse_memory_present_with_active_regions() memory_present() sparse_index_init() sparse_index_alloc() // allocate mem_section with kzalloc() makedumpfile assumes the value of unused mem_section will remain as 0, but I suspect this assumption may be broken in your environment. Hello Atshushi, I noticed that my last patch was not complete. It only checked the mem_section[] array for zero entries. But as you noticed, we also have to check the section array that we get from the mem_section entries. So I updated the patch. 
Michael --- makedumpfile.c | 22 +- 1 file changed, 17 insertions(+), 5 deletions(-) --- a/makedumpfile.c +++ b/makedumpfile.c @@ -2690,11 +2690,14 @@ nr_to_section(unsigned long nr, unsigned { unsigned long addr; - if (is_sparsemem_extreme()) + if (is_sparsemem_extreme()) { + if (mem_sec[SECTION_NR_TO_ROOT(nr)] == 0) + return NOT_KV_ADDR; addr = mem_sec[SECTION_NR_TO_ROOT(nr)] + (nr SECTION_ROOT_MASK()) * SIZE(mem_section); - else + } else { addr = SYMBOL(mem_section) + (nr * SIZE(mem_section)); + } if (!is_kvaddr(addr)) return NOT_KV_ADDR; @@ -2778,10 +2781,19 @@ get_mm_sparsemem(void) } for (section_nr = 0; section_nr num_section; section_nr++) { section = nr_to_section(section_nr, mem_sec); - mem_map = section_mem_map_addr(section); - mem_map = sparse_decode_mem_map(mem_map, section_nr); - if (!is_kvaddr(mem_map)) + if (section == NOT_KV_ADDR) { mem_map = NOT_MEMMAP_ADDR; + } else { + mem_map = section_mem_map_addr(section); + if (mem_map == 0) { + mem_map = NOT_MEMMAP_ADDR; + } else { + mem_map = sparse_decode_mem_map(mem_map, + section_nr); + if (!is_kvaddr(mem_map)) + mem_map = NOT_MEMMAP_ADDR; + } + } pfn_start = section_nr * PAGES_PER_SECTION(); pfn_end = pfn_start + PAGES_PER_SECTION(); if (info-max_mapnr pfn_end) ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: makedumpfile: get_max_mapnr() from ELF header problem
Hello Atsushi, I debugged my problem a bit further and tried to implement a function that gets the maximum page frame number from the Linux kernel memory management structures. I am no memory management expert, so the following patch probably is not complete, but at least for my setup it worked. Michael --- makedumpfile.c | 58 + 1 file changed, 58 insertions(+) --- a/makedumpfile.c +++ b/makedumpfile.c @@ -2029,6 +2029,48 @@ pgdat4: return SYMBOL(contig_page_data); } +int +get_max_pfn(void) +{ + unsigned long pgdat, node_start_pfn, node_spanned_pages, max_pfn = 0; + int num_nodes, node; + + if ((node = next_online_node(0)) 0) { + ERRMSG(Can't get next online node.\n); + return FALSE; + } + if (!(pgdat = next_online_pgdat(node))) { + ERRMSG(Can't get pgdat list.\n); + return FALSE; + } + for (num_nodes = 1; num_nodes = vt.numnodes; num_nodes++) { + if (!readmem(VADDR, pgdat + OFFSET(pglist_data.node_start_pfn), + node_start_pfn, sizeof node_start_pfn)) { + ERRMSG(Can't get node_start_pfn.\n); + return FALSE; + } + if (!readmem(VADDR, +pgdat + OFFSET(pglist_data.node_spanned_pages), +node_spanned_pages, sizeof node_spanned_pages)) { + ERRMSG(Can't get node_spanned_pages.\n); + return FALSE; + } + max_pfn = MAX(max_pfn, (node_start_pfn + node_spanned_pages)); + if (num_nodes vt.numnodes) { + if ((node = next_online_node(node + 1)) 0) { + ERRMSG(Can't get next online node.\n); + return FALSE; + } else if (!(pgdat = next_online_pgdat(node))) { + ERRMSG(Can't determine pgdat list (node %d).\n, + node); + return FALSE; + } + } + } + info-max_mapnr = max_pfn; + return TRUE; +} + void dump_mem_map(unsigned long long pfn_start, unsigned long long pfn_end, unsigned long mem_map, int num_mm) @@ -2853,6 +2908,9 @@ out: if (!get_numnodes()) return FALSE; + if (!get_max_pfn()) + return FALSE; + if (!get_mem_map()) return FALSE; ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: makedumpfile: get_max_mapnr() from ELF header problem
On Wed, 12 Mar 2014 06:01:47 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: (2014/02/28 21:41), Michael Holzheu wrote: Hello Atsushi, [snip] Looking into source code a little, max_mapnr is used only for calculating a size of two bitmaps. I guess there's any s390-specific issue. On second thought, Michael's log looks strange. Excluding unnecessary pages: [ 21 %] vtop_s390x: Address too big for the number of page table levels. readmem: Can't convert a virtual address(8000180104670) to physical address. readmem: type_addr: 0, addr:8000180104670, size:32768 __exclude_unnecessary_pages: Can't read the buffer of struct page. This message was shown during translating the virtual address of mem_map to the physical address: __exclude_unnecessary_pages(): if (!readmem(VADDR, mem_map, page_cache + (index_pg * SIZE(page)), SIZE(page) * pfn_mm)) { ERRMSG(Can't read the buffer of struct page.\n); return FALSE; } However, this should succeed even if mem= was specified because the corresponding page table must exist in the memory image since it was used by kernel actually. The address translation logic may have an issue. To be honest I don't really understand what happens when the error occurs. My test is a 1 TiB dump of a Linux system that has set mem=1G. 
With makedumpfile 1.5.3 I see the following stack backtrace: (gdb) bt #0 vtop_s390x (vaddr=2251803034918936) at ./arch/s390x.c:236 #1 0x8001de44 in vaddr_to_paddr_s390x (vaddr=2251803034918936) at ./arch/s390x.c:300 #2 0x8001fb50 in readmem (type_addr=0, addr=2251803034918936, bufptr=0x3ff6cf0, size=32768) at makedumpfile.c:349 #3 0x80034cf2 in __exclude_unnecessary_pages ( mem_map=2251803034918936, pfn_start=16777216, pfn_end=16842752) at makedumpfile.c:4189 #4 0x80035716 in exclude_unnecessary_pages_cyclic () at makedumpfile.c:4349 #5 0x800358e4 in update_cyclic_region (pfn=0) at makedumpfile.c:4380 #6 0x800384e0 in get_num_dumpable_cyclic () at makedumpfile.c:5060 #7 0x80036850 in create_dump_bitmap () at makedumpfile.c:4585 #8 0x800429c8 in create_dumpfile () at makedumpfile.c:7533 #9 0x800490fc in main (argc=5, argv=0x3fff3d8) at makedumpfile.c:8651 Looks like makdumpfile wants to read a virtual address 2251803034918936 (hex 0x8C0002018) which can't be resolved by the three level kernel page table (max is 4 TiB here). In the __exclude_unnecessary_pages() function the variables have the following values: (gdb) print pfn_end $1 = 16842752 (gdb) print pfn $2 = 16777216 (gdb) print pfn_start $3 = 16777216 (gdb) print mem_map $4 = 2251803034918936 I would appreciate any hints! Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: makedumpfile: get_max_mapnr() from ELF header problem
On Tue, 11 Mar 2014 06:22:41 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: On Mon, 3 Mar 2014 03:11:23 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: The tools do a physical memory detection and that defines the range of memory to be dumped and also defines the memory chunks for the ELF header. makedumpfile is designed for kdump, this means it relies on dependable ELF headers. If we support such an incorrect ELF header, makedumpfile has to get the actual memory map from vmcore (but I have no ideas how to do it now) and re-calculate all PT_LOAD regions with it. It sounds too much work for irregular case, I don't plan to take care of it now. Ok, fair. And I think we are not the only ones that have this problem. For example, the KVM virsh dump probably also has that problem. virsh dump seems to have the same issue as you said, but I suppose qemu developers don't worry about that because they are developing an original way to dump guest's memory in kdump-compressed format as dump-guest-memory command. It seems that they know such case is out of the scope of makedumpfile. Even if they create a kdump-compressed format dump, they (probably) do not filter while dumping. Therefore for large dumps post-processing with makedumpfile could still make sense, e.g. for transfering the dumps. Because qemu is not aware of kernel parameters this will also fail when mem= has been used. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: makedumpfile: get_max_mapnr() from ELF header problem
On Mon, 3 Mar 2014 03:11:23 + Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: Hello Michael, Hello Atsushi, On s390 we have the following little problem: We use hypervisor or stand-alone dump tools to create Linux system dumps. These tools do not know the kernel parameter line and dump the full physical memory. We use makedumpfile to filter those dumps. If a Linux system has specified the mem= parameter, the dump tools still dump the whole physical memory. I guess this is a problem of the tools, it sounds that the tools ignore the actual memory map and just make wrong ELF headers. How do the tools decide the range of System RAM to create ELF headers ? The tools do a physical memory detection and that defines the range of memory to be dumped and also defines the memory chunks for the ELF header. And I think we are not the only ones that have this problem. For example, the KVM virsh dump probably also has that problem. At least, if the tools respect the actual memory map like /proc/vmcore, it can create correct ELF headers and makedumpfile will work normally. As I said, the tools do not know the Linux memory map. They only know the physically available memory. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
makedumpfile: get_max_mapnr() from ELF header problem
Hello Atsushi, On s390 we have the following little problem: We use hypervisor or stand-alone dump tools to create Linux system dumps. These tools do not know the kernel parameter line and dump the full physical memory. We use makedumpfile to filter those dumps. If a Linux system has specified the mem= parameter, the dump tools still dump the whole physical memory. Unfortunately in get_max_mapnr() makedumpfile uses the ELF header to get the maximum page frame number. Since this is not the correct value in our case makedumpfile fails to filter the dump. We get the following error on s390 with makedumpfile version 1.5.3: makedumpfile -c -d 31 vmcore dump.kdump cyclic buffer size has been changed: 22156083 = 22156032 Excluding unnecessary pages: [ 21 %] vtop_s390x: Address too big for the number of page table levels. readmem: Can't convert a virtual address(8000180104670) to physical address. readmem: type_addr: 0, addr:8000180104670, size:32768 __exclude_unnecessary_pages: Can't read the buffer of struct page. Excluding unnecessary pages: [ 23 %] vtop_s390x: Address too big for the number of page table levels. readmem: Can't convert a virtual address(8000180104670) to physical address. readmem: type_addr: 0, addr:8000180104670, size:32768 __exclude_unnecessary_pages: Can't read the buffer of struct page. Since version 1.5.4 makedumpfile seems to loop in __exclude_unnecessary_pages(). We thought about several ways to fix this problem but have not found a good solution up to now. Do you have an idea how we could fix that? Best Regards, Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/2 V2] raw i/o and root device to use less memory
On Fri, 10 Jan 2014 11:58:30 -0600 Cliff Wickman c...@sgi.com wrote: [snip] Use O_DIRECT (raw) i/o for the dump and for the bitmaps file, so that writing to those files does not allocate kernel memory for page cache. Hello Cliff, Have you tested O_DIRECT together with the new vmcore mmap interface? IIRC when we tried O_DIRECT together with mmap for some reason it did not work. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 1/2 V2] raw i/o and root device to use less memory
On Mon, 13 Jan 2014 07:30:33 -0600 Cliff Wickman c...@sgi.com wrote: On Mon, Jan 13, 2014 at 10:58:31AM +0100, Michael Holzheu wrote: On Fri, 10 Jan 2014 11:58:30 -0600 Cliff Wickman c...@sgi.com wrote: [snip] Use O_DIRECT (raw) i/o for the dump and for the bitmaps file, so that writing to those files does not allocate kernel memory for page cache. Hello Cliff, Have you tested O_DIRECT together with the new vmcore mmap interface? IIRC when we tried O_DIRECT together with mmap for some reason it did not work. Michael Hi Michael, How new of a kernel do I need? i.e. how 'new' of an mmap interface? The latest kernel I used was 3.0.93 (sle11sp3). And makedumpfile was 1.5.5. I think sles11 SP3 does not support mmap for /proc/vmcore. Upstream you need at least kernel 3.11. For makedumpfile version 1.5.5 should be ok. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 0/6] kexec: A new system call to allow in kernel loading
On Fri, 22 Nov 2013 05:34:03 -0800 ebied...@xmission.com (Eric W. Biederman) wrote: Vivek Goyal vgo...@redhat.com writes: There is also a huge missing piece of this in that your purgatory is not checking a hash of the loaded image before jumping to it. Without that this is a huge regression at least for the kexec on panic case. We absolutely need to check that the kernel sitting around in memory has not been corrupted before we let it run very far. Agreed. This should not be hard. It is just a matter of calculating digest of segments. I will store it in kimage and verify digest again before passing control to control page. Will fix it in next version. Nak. The verification needs to happen in purgatory. The verification needs to happen in code whose runtime environment does not depend on random parts of the kernel. Anything else is a regression in maintainability and reliability. Hello Vivek, Just to be sure that you have not forgotten the following s390 detail: On s390 we first call purgatory with parameter 0 for doing the checksum test. If this fails, we can have as backup solution our traditional stand-alone dump. In case the checksum test was ok, we call purgatory a second time with parameter 1 which then starts kdump. Could you please ensure that this mechanism also works after your rework. Best Regards, Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 0/6] kexec: A new system call to allow in kernel loading
On Mon, 25 Nov 2013 10:36:20 -0500 Vivek Goyal vgo...@redhat.com wrote: On Mon, Nov 25, 2013 at 11:04:28AM +0100, Michael Holzheu wrote: On Fri, 22 Nov 2013 05:34:03 -0800 ebied...@xmission.com (Eric W. Biederman) wrote: Vivek Goyal vgo...@redhat.com writes: There is also a huge missing piece of this in that your purgatory is not checking a hash of the loaded image before jumping too it. Without that this is a huge regression at least for the kexec on panic case. We absolutely need to check that the kernel sitting around in memory has not been corrupted before we let it run very far. Agreed. This should not be hard. It is just a matter of calcualting digest of segments. I will store it in kimge and verify digest again before passing control to control page. Will fix it in next version. Nak. The verification needs to happen in purgatory. The verification needs to happen in code whose runtime environment is does not depend on random parts of the kernel. Anything else is a regression in maintainability and reliability. Hello Vivek, Just to be sure that you have not forgotten the following s390 detail: On s390 we first call purgatory with parameter 0 for doing the checksum test. If this fails, we can have as backup solution our traditional stand-alone dump. In case tha checksum test was ok, we call purgatory a second time with parameter 1 which then starts kdump. Could you please ensure that this mechanism also works after your rework. Hi Michael, All that logic in in arch dependent portion of s390? If yes, I am not touching any arch dependent part of s390 yet and only doing implementation of x86. Yes, part of s390 architecture code (kernel and kexec purgatory). kernel: --- arch/s390/kernel/machine_kexec.c: kdump_csum_valid() - rc = start_kdump(0); __do_machine_kdump() - start_kdump(1) kexec tools: purgatory/arch/s390/setup-s390.S cghi %r2,0 je verify_checksums Generic changes should be usable by s390 and you should be able to do same thing there. 
Though we are still debating whether segment checksum verification logic should be part of purgatory or core kernel. Yes, that was my concern. If you move the purgatory checksum logic to the kernel we probably have to consider our s390 checksum test. Thanks! Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 0/3] procfs, vmcore: fix regression of mmap on /proc/vmcore since v3.12-rc1
Hello Hatayama, We successfully tested your patches on s390, mmap for /proc/vmcore seems to work again. Thanks! Michael On Mon, 14 Oct 2013 18:36:06 +0900 HATAYAMA Daisuke d.hatay...@jp.fujitsu.com wrote: This patch set fixes regression of mmap on /proc/vmcore since v3.12-rc1. The primary one is the 2nd patch. The 1st patch is another bug I found during investigation and it affects mmap on /proc/vmcore even if only 2nd patch is applied, so fix it together in this patch set. The last patch is just for cleaning up of target function in both 1st and 2nd patches. --- HATAYAMA Daisuke (3): procfs: fix unintended truncation of returned mapped address procfs: Call default get_unmapped_area on MMU-present architectures procfs: clean up proc_reg_get_unmapped_area for 80-column limit fs/proc/inode.c | 20 ++-- 1 file changed, 14 insertions(+), 6 deletions(-) ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
mmap for /proc/vmcore broken since 3.12-rc1
Hello Alexey, Looks like the following commit broke mmap for /proc/vmcore: commit c4fe24485729fc2cbff324c111e67a1cc2f9adea Author: Alexey Dobriyan adobri...@gmail.com Date: Tue Aug 20 22:17:24 2013 +0300 sparc: fix PCI device proc file mmap(2) Because /proc/vmcore (fs/proc/vmcore.c) does not implement the get_unmapped_area() fops function mmap now always returns EIO. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v5 0/5] kdump: Allow ELF header creation in new kernel
On Fri, 14 Jun 2013 14:54:54 -0400 Vivek Goyal vgo...@redhat.com wrote: On Fri, Jun 07, 2013 at 06:55:56PM +0200, Michael Holzheu wrote: [..] In this patch series I did not include the discussed ELF header swap trick patch because with the ELF header read functions this patch currently is not necessary. Michael, Would be good to this change atleast in a separate patch series so that we have common mechanism across arches and understanding code becomes easier. Ok fine, we can do that in a separate patch series on top. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v5 1/5] vmcore: Introduce ELF header in new memory feature
On Fri, 14 Jun 2013 14:54:02 -0400 Vivek Goyal vgo...@redhat.com wrote: On Fri, Jun 07, 2013 at 06:55:57PM +0200, Michael Holzheu wrote: [..] @@ -935,10 +967,17 @@ static int __init vmcore_init(void) { int rc = 0; - /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ - if (!(is_vmcore_usable())) - return rc; + /* +* If elfcorehdr= has not been passed in cmdline, try to get the +* header from 2nd kernel, then capture the dump. +*/ + if (!(is_vmcore_usable())) { + rc = elfcorehdr_alloc(); + if (rc) + return rc; + } Hi Michael, Patch description says that elfcorehdr_alloc() returns the addr and size of elf headers. But that does not seem to be the case here. Has it been modified in later patches. Sorry, that is a relict of one of my previous experiments where I tried to implement elfcorehdr_addr() similar to the way as you suggest it now. Because elfcorehdr_addr is a global variable, I decided to not pass it in the functions. But of course I can change that again if you prefer that. Also will it be better if we call elfcorehdr_alloc() always and then check for is_vmcore_usable(). Something like. elfcorehdr_addr = elfcorehdr_alloc() if (elfcorehdr_addr ) return elfcorehdr_addr if (!(is_vmcore_usable())) return error Ok, but then I think elfcorehdr_alloc() should also return the elfcorehdr_size. 
So what about the following patch: --- fs/proc/vmcore.c | 65 + include/linux/crash_dump.h |6 2 files changed, 60 insertions(+), 11 deletions(-) --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -123,6 +123,36 @@ static ssize_t read_from_oldmem(char *bu return read; } +/* + * Architectures may override this function to allocate ELF header in 2nd kernel + */ +int __weak elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) +{ + return 0; +} + +/* + * Architectures may override this function to free header + */ +void __weak elfcorehdr_free(unsigned long long addr) +{} + +/* + * Architectures may override this function to read from ELF header + */ +ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0); +} + +/* + * Architectures may override this function to read from notes sections + */ +ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0); +} + /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. 
*/ @@ -322,7 +352,7 @@ static int __init update_note_header_siz notes_section = kmalloc(max_sz, GFP_KERNEL); if (!notes_section) return -ENOMEM; - rc = read_from_oldmem(notes_section, max_sz, offset, 0); + rc = elfcorehdr_read_notes(notes_section, max_sz, offset); if (rc 0) { kfree(notes_section); return rc; @@ -409,7 +439,8 @@ static int __init copy_notes_elf64(const if (phdr_ptr-p_type != PT_NOTE) continue; offset = phdr_ptr-p_offset; - rc = read_from_oldmem(notes_buf, phdr_ptr-p_memsz, offset, 0); + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr-p_memsz, + offset); if (rc 0) return rc; notes_buf += phdr_ptr-p_memsz; @@ -510,7 +541,7 @@ static int __init update_note_header_siz notes_section = kmalloc(max_sz, GFP_KERNEL); if (!notes_section) return -ENOMEM; - rc = read_from_oldmem(notes_section, max_sz, offset, 0); + rc = elfcorehdr_read_notes(notes_section, max_sz, offset); if (rc 0) { kfree(notes_section); return rc; @@ -597,7 +628,8 @@ static int __init copy_notes_elf32(const if (phdr_ptr-p_type != PT_NOTE) continue; offset = phdr_ptr-p_offset; - rc = read_from_oldmem(notes_buf, phdr_ptr-p_memsz, offset, 0); + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr-p_memsz, + offset); if (rc 0) return rc; notes_buf += phdr_ptr-p_memsz; @@ -793,7 +825,7 @@ static int __init parse_crash_elf64_head addr = elfcorehdr_addr; /* Read Elf header */ - rc = read_from_oldmem((char*)ehdr, sizeof(Elf64_Ehdr), addr, 0); + rc = elfcorehdr_read((char *)ehdr, sizeof(Elf64_Ehdr), addr); if (rc 0) return rc; @@ -820,7 +852,7 @@ static int __init
Re: [PATCH 0/2] kdump/mmap: Fix mmap of /proc/vmcore for s390
On Mon, 3 Jun 2013 11:59:40 -0400 Vivek Goyal vgo...@redhat.com wrote: On Mon, Jun 03, 2013 at 03:27:18PM +0200, Michael Holzheu wrote: [..] If not, how would remap_pfn_range() work with HSA region when /proc/vmcore is mmaped()? I am no memory management expert, so I discussed that with Martin Schwidefsky (s390 architecture maintainer). Perhaps something like the following could work: After vmcore_mmap() is called the HSA pages are not initially mapped in the page tables. So when user space accesses those parts of /proc/vmcore, a fault will be generated. We implement a mechanism that in this case the HSA is copied to a new page in the page cache and a mapping is created for it. Since the page is allocated in the page cache, it can be released afterwards by the kernel when we get memory pressure. Our current idea for such an implementation: * Create new address space (struct address_space) for /proc/vmcore. * Implement new vm_operations_struct vmcore_mmap_ops with new vmcore_fault() .fault callback for /proc/vmcore. * Set vma-vm_ops to vmcore_mmap_ops in mmap_vmcore(). * The vmcore_fault() function will get a new page cache page, copy HSA page to page cache page add it to vmcore address space. To see how this could work, we looked into the functions filemap_fault() in mm/filemap.c and relay_buf_fault() in kernel/relay.c. What do you think? I am not mm expert either but above proposal sounds reasonable to me. So remap_pfn_range() call will go in arch dependent code so that arch can decide which range can be mapped right away and which ranges will be filed in when fault happens? I am assuming that s390 will map everything except for pfn between 0 and HSA_SIZE. Yes, for [0 - HSA_SIZE] the fault handler will be called and for the rest we establish a mapping with remap_pfn_range() as it is currently done. Therefore no fault handler will be called for that part of /proc/vmcore. I will try to find out if it is doable that way. 
And regular s390 kdump will map everyting right away and will not have to rely on fault mechanism? Yes, as kdump on the other archs. Thanks Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 0/2] kdump/mmap: Fix mmap of /proc/vmcore for s390
On Thu, 30 May 2013 16:38:47 -0400 Vivek Goyal vgo...@redhat.com wrote: On Wed, May 29, 2013 at 01:51:44PM +0200, Michael Holzheu wrote: [..] START QUOTE [PATCH v3 1/3] kdump: Introduce ELF header in new memory feature Currently for s390 we create the ELF core header in the 2nd kernel with a small trick. We relocate the addresses in the ELF header in a way that for the /proc/vmcore code it seems to be in the 1st kernel (old) memory and the read_from_oldmem() returns the correct data. This allows the /proc/vmcore code to use the ELF header in the 2nd kernel. END QUOTE For our current zfcpdump project (see [PATCH 3/3]s390/kdump: Use vmcore for zfcpdump) we could no longer use this trick. Therefore we sent you the patches to get a clean interface for ELF header creation in the 2nd kernel. Hi Michael, Few more questions. - What's the difference between zfcpdump and kdump. I thought zfcpdump just boots specific kernel from fixed drive? If yes, why can't that kernel prepare headers in similar way as regular kdump kernel does and gain from kdump kernel swap trick? Correct, the zfcpdump kernel is booted from a fixed disk drive. The difference is that the zfcpdump HSA memory is not mapped into real memory. It is accessed using a read memory interface memcpy_hsa() that copies memory from the hypervisor owned HSA memory into the Linux memory. So it looks like the following: +--+ ++ | | memcpy_hsa() || | zfcpdump | -- | HSA memory | | | || +--+ ++ | | | old mem | | | +--+ In the copy_oldmem_page() function for zfcpdump we do the following: copy_oldmem_page_zfcpdump(...) { if (src ZFCPDUMP_HSA_SIZE) { if (memcpy_hsa(buf, src, csize, userbuf) 0) return -EINVAL; } else { if (userbuf) copy_to_user_real((buf, src, csize); else memcpy_real(buf, src, csize); } } So I think for zfcpdump we only can use the read() interface of /proc/vmcore. But this is sufficient for us since we also provide the s390 specific zfcpdump user space that copies /proc/vmcore. 
Also, we are accessing the contents of elf headers using physical address. If that's the case, does it make a difference if data is in old kernel's memory or new kernel's memory. We will use the physical address and create a temporary mapping and it should not make a difference whether same physical page is already mapped in current kernel or not. Only restriction this places is that all ELF header needs to be contiguous. I see that s390 code already creates elf headers using kzalloc_panic(). So memory allocated should by physically contiguous. So can't we just put __pa(elfcorebuf) in elfcorehdr_addr. And same is true for p_offset fields in PT_NOTE headers and everything should work fine? Only problem we can face is that at some point of time kzalloc() might not be able to contiguous memory request. We can handle that once s390 runs into those issues. You are anyway allocating memory using kzalloc(). And if this works for s390 kdump, it should work for zfcpdump too? So your suggestion is that copy_oldmem_page() should also be used for copying memory from the new kernel, correct? For kdump on s390 I think this will work with the new ELF header swap patch. With that patch access to [0, OLDMEM_SIZE] will uniquely identify an address in the new kernel and access to [OLDMEM_BASE, OLDMEM_BASE + OLDMEM_SIZE] will identify an address in the old kernel. For zfcpdump currently we add a load from [0, HSA_SIZE] where p_offset equals p_paddr. Therefore we can't distinguish in copy_oldmem_page() if we read from oldmem (HSA) or newmem. The range [0, HSA_SIZE] is used twice. As a workaroun we could use an artificial p_offset for the HSA memory chunk that is not used by the 1st kernel physical memory. This is not really beautiful, but probably doable. When I tried to implement this for kdump, I noticed another problem with the vmcore mmap patches. Our copy_oldmem_page() function uses memcpy_real() to access the old 1st kernel memory. 
This function switches to real mode and therefore does not require any page tables. But as a side effect of that we can't copy to vmalloc memory. The mmap patches use vmalloc memory for notes_buf. So currently using our copy_oldmem_page() fails here. If copy_oldmem_page() now also must be able to copy to vmalloc memory, we would have to add new code for that: * oldmem - newmem (real): Use direct memcpy_real() * oldmem - newmem (vmalloc): Use intermediate buffer with memcpy_real() * newmem - newmem: Use memcpy() What do you think? Best Regards, Michael ___ kexec mailing list kexec@lists.infradead.org http
Re: [PATCH 0/2] kdump/mmap: Fix mmap of /proc/vmcore for s390
On Tue, 28 May 2013 09:55:01 -0400 Vivek Goyal vgo...@redhat.com wrote: On Sat, May 25, 2013 at 02:52:17PM +0200, Michael Holzheu wrote: [snip] Besides of the newmem mechanism, for completeness, we also implemented the oldmem ELF header mechansim in kexec. But this is disabled by default. See: #ifdef WITH_ELF_HEADER in kexec/arch/s390/crashdump-s390.c Currently no distribution uses the oldmem mechanism. Hi Michael, Mechanism to read from newmem is not there yet (your patches are still being reviewed) and you mentioned that no distribution is building kexec-tools with WITH_ELF_HEADER on s390. So how things are currently working for kdump on s390? Hello Vivek, On s390, if we do not get the ELF header from the 1st kernel with elfcorehdr=, we build the ELF header in the 2nd kernel. This is currently the default because WITH_ELF_HEADER is not defined for kexec tools. START QUOTE [PATCH v3 1/3] kdump: Introduce ELF header in new memory feature Currently for s390 we create the ELF core header in the 2nd kernel with a small trick. We relocate the addresses in the ELF header in a way that for the /proc/vmcore code it seems to be in the 1st kernel (old) memory and the read_from_oldmem() returns the correct data. This allows the /proc/vmcore code to use the ELF header in the 2nd kernel. END QUOTE For our current zfcpdump project (see [PATCH 3/3]s390/kdump: Use vmcore for zfcpdump) we could no longer use this trick. Therefore we sent you the patches to get a clean interface for ELF header creation in the 2nd kernel. Therefore, if necessary, IMHO we can switch to the ELF header memory swap mechanism for s390 in the kernel. Of course we would then also have to adjust the (disabled) kexec code. So are you saying that s390 is ready to switch to mechanism of creating ELF headers in first kernel by kexec-tools and new kernel does not have to preare ELF headers? No, I meant that currently nobody is using the kexec tools ELF header creation in the 1st kernel on s390. 
We create the ELF header in the 2nd kernel (mainly because of our cpuplugd issue). Therefore, I think, we can safely change the ELF header creation in 2nd kernel to use your p_offset swap trick *and* we remove the swap code in the copy_oldmem_page() implementation (same kernel). Best Regards, Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH 0/2] kdump/mmap: Fix mmap of /proc/vmcore for s390
On Wed, 29 May 2013 12:23:26 -0400 Vivek Goyal vgo...@redhat.com wrote: On Wed, May 29, 2013 at 01:51:44PM +0200, Michael Holzheu wrote: On Tue, 28 May 2013 09:55:01 -0400 Vivek Goyal vgo...@redhat.com wrote: On Sat, May 25, 2013 at 02:52:17PM +0200, Michael Holzheu wrote: [snip] Besides of the newmem mechanism, for completeness, we also implemented the oldmem ELF header mechansim in kexec. But this is disabled by default. See: #ifdef WITH_ELF_HEADER in kexec/arch/s390/crashdump-s390.c Currently no distribution uses the oldmem mechanism. Hi Michael, Mechanism to read from newmem is not there yet (your patches are still being reviewed) and you mentioned that no distribution is building kexec-tools with WITH_ELF_HEADER on s390. So how things are currently working for kdump on s390? Hello Vivek, On s390, if we do not get the ELF header from the 1st kernel with elfcorehdr=, we build the ELF header in the 2nd kernel. This is currently the default because WITH_ELF_HEADER is not defined for kexec tools. START QUOTE [PATCH v3 1/3] kdump: Introduce ELF header in new memory feature Currently for s390 we create the ELF core header in the 2nd kernel with a small trick. We relocate the addresses in the ELF header in a way that for the /proc/vmcore code it seems to be in the 1st kernel (old) memory and the read_from_oldmem() returns the correct data. This allows the /proc/vmcore code to use the ELF header in the 2nd kernel. END QUOTE For our current zfcpdump project (see [PATCH 3/3]s390/kdump: Use vmcore for zfcpdump) we could no longer use this trick. Therefore we sent you the patches to get a clean interface for ELF header creation in the 2nd kernel. Therefore, if necessary, IMHO we can switch to the ELF header memory swap mechanism for s390 in the kernel. Of course we would then also have to adjust the (disabled) kexec code. 
So are you saying that s390 is ready to switch to mechanism of creating ELF headers in first kernel by kexec-tools and new kernel does not have to preare ELF headers? No, I meant that currently nobody is using the kexec tools ELF header creation in the 1st kernel on s390. We create the ELF header in the 2nd kernel (mainly because of our cpuplugd issue). Therefore, I think, we can safely change the ELF header creation in 2nd kernel to use your p_offset swap trick *and* we remove the swap code in the copy_oldmem_page() implementation (same kernel). Ok. Got it. So s390 can fix it in kernel without creating any backward compatibility issues (given the fact that nobody sees to be using kexec-tools to build headers). So please go ahead and fix it and that should solve your mmap() issue too. Also please fix kexec-tools and that change will not be backward compatible. Ok, I will do this. I think we should add this swap in ELF header patch to the kdump: Allow ELF header creation in new kernel patch series (on top of the mmap patch series). Because when I remove the swap code from copy_oldmem_page(), the old trick to access the ELF header in the first kernel memory will no longer work. Is that ok for you? Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 1/2] kdump/mmap: Introduce arch_oldmem_remap_pfn_range()
From: Jan Willeke will...@de.ibm.com Currently the /proc/vmcore mmap patches are not working on s390. The problem is that on s390 the kernel in not relocatable and therefore always runs in the lower memory area. Therefore for kdump on s390 we swap the lower memory area with the crashkernel area before starting the kdump kernel: [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] To fix /proc/vmcore mmap memory below OLDMEMSIZE needs to be mapped with OLDMEM_BASE as offset. To achieve that, a new weak function arch_oldmem_remap_pfn_range() is introduced. Signed-off-by: Jan Willeke will...@de.ibm.com Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- fs/proc/vmcore.c | 15 ++- include/linux/crash_dump.h | 5 + 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 80221d7..3eda0ac 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -123,6 +123,19 @@ static ssize_t read_from_oldmem(char *buf, size_t count, return read; } +/* + * Architetures may override this function to map oldmen + */ +int __weak arch_oldmem_remap_pfn_range(struct vm_area_struct *vma, + unsigned long from, + unsigned long pfn, + unsigned long size, + pgprot_t prot) +{ + return remap_pfn_range(vma, from, pfn, size, prot); +} + + /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. 
*/ @@ -267,7 +280,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) if (size tsz) tsz = size; paddr = m-paddr + start - m-offset; - if (remap_pfn_range(vma, vma-vm_start + len, + if (arch_oldmem_remap_pfn_range(vma, vma-vm_start + len, paddr PAGE_SHIFT, tsz, vma-vm_page_prot)) { do_munmap(vma-vm_mm, vma-vm_start, len); diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 37e4f8d..da300a7 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -14,6 +14,11 @@ extern unsigned long long elfcorehdr_size; extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); +extern int __weak arch_oldmem_remap_pfn_range(struct vm_area_struct *vma, + unsigned long from, + unsigned long pfn, + unsigned long size, + pgprot_t prot); /* Architecture code defines this if there are other possible ELF * machine types, e.g. on bi-arch capable hardware. */ -- 1.8.1.6 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH 0/2] kdump/mmap: Fix mmap of /proc/vmcore for s390
Hello Vivek and Hatayama, Currently the /proc/vmcore mmap patches are not working on s390. The problem is that on s390 the kernel in not relocatable and therefore always runs in the lower memory area. Therefore for kdump on s390 we swap the lower memory area with the crashkernel area before starting the kdump kernel: [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] To fix /proc/vmcore mmap memory below OLDMEMSIZE needs to be mapped with OLDMEM_BASE as offset. To achieve that, a new weak function arch_oldmem_remap_pfn_range() is introduced. If you agree with our approach, could you integrate the two patches into the mmap patch series? Best Regards, Michael --- Jan Willeke (2): kdump/mmap: Introduce arch_oldmem_remap_pfn_range() s390/kdump/mmap: Implement arch_oldmem_remap_pfn_range() for s390 arch/s390/kernel/crash_dump.c | 27 +++ fs/proc/vmcore.c | 15 ++- include/linux/crash_dump.h| 5 + 3 files changed, 46 insertions(+), 1 deletion(-) -- 1.8.1.6 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v3 2/3] s390/kdump: Use ELF header in new memory feature
This patch now exchanges the old relocate mechanism with the new arch function call override mechanism that allows to create the ELF core header in the 2nd kernel. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- arch/s390/kernel/crash_dump.c | 64 ++- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index f703d91..aeb1207 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -21,6 +21,9 @@ #define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) #define PTR_DIFF(x, y) ((unsigned long)(((char *) (x)) - ((unsigned long) (y +static size_t elfcorebuf_sz; +static char *elfcorebuf; + /* * Copy one page from oldmem * @@ -325,14 +328,6 @@ static int get_mem_chunk_cnt(void) } /* - * Relocate pointer in order to allow vmcore code access the data - */ -static inline unsigned long relocate(unsigned long addr) -{ - return OLDMEM_BASE + addr; -} - -/* * Initialize ELF loads (new kernel) */ static int loads_init(Elf64_Phdr *phdr, u64 loads_offset) @@ -383,7 +378,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) ptr = nt_vmcoreinfo(ptr); memset(phdr, 0, sizeof(*phdr)); phdr-p_type = PT_NOTE; - phdr-p_offset = relocate(notes_offset); + phdr-p_offset = notes_offset; phdr-p_filesz = (unsigned long) PTR_SUB(ptr, ptr_start); phdr-p_memsz = phdr-p_filesz; return ptr; @@ -392,7 +387,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) /* * Create ELF core header (new kernel) */ -static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) +static int s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) { Elf64_Phdr *phdr_notes, *phdr_loads; int mem_chunk_cnt; @@ -414,28 +409,59 @@ static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt); /* Init notes */ hdr_off = PTR_DIFF(ptr, hdr); - ptr = notes_init(phdr_notes, 
ptr, ((unsigned long) hdr) + hdr_off); + ptr = notes_init(phdr_notes, ptr, (unsigned long) hdr + hdr_off); /* Init loads */ hdr_off = PTR_DIFF(ptr, hdr); - loads_init(phdr_loads, ((unsigned long) hdr) + hdr_off); + loads_init(phdr_loads, hdr_off); *elfcorebuf_sz = hdr_off; - *elfcorebuf = (void *) relocate((unsigned long) hdr); + *elfcorebuf = hdr; BUG_ON(*elfcorebuf_sz alloc_size); + return 0; } /* - * Create kdump ELF core header in new kernel, if it has not been passed via - * the elfcorehdr kernel parameter + * Return address of ELF core header (new or old memory) */ -static int setup_kdump_elfcorehdr(void) +unsigned long long arch_get_crash_header(void) { - size_t elfcorebuf_sz; - char *elfcorebuf; + if (elfcorebuf) + return elfcorehdr_addr; + else + return __pa(elfcorebuf); +} +/* + * Free crash header + */ +void arch_free_crash_header(void) +{ + kfree(elfcorebuf); + elfcorebuf = 0; +} + +/* + * Read from crash header (new or old memory) + */ +ssize_t arch_read_from_crash_header(char *buf, size_t count, u64 *ppos) +{ + if (elfcorebuf) + memcpy(buf, (void *)*ppos, count); + else + copy_from_oldmem(buf, (void *)*ppos, count); + *ppos += count; + return count; +} + +/* + * Create kdump ELF core header in new kernel if it has not been passed via + * the elfcorehdr= kernel parameter + */ +static int setup_kdump_elfcorehdr(void) +{ if (!OLDMEM_BASE || is_kdump_kernel()) return -EINVAL; s390_elf_corehdr_create(elfcorebuf, elfcorebuf_sz); - elfcorehdr_addr = (unsigned long long) elfcorebuf; + elfcorehdr_addr = __pa(elfcorebuf); elfcorehdr_size = elfcorebuf_sz; return 0; } -- 1.8.1.6 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v2 0/3] kdump: Allow ELF header creation in new kernel
Hello Vivek, Here the 2nd version of the patch set. We tried to implement the function using the arch_get/free_crash_header() weak function as you suggested. We kept the ELFCORE_ADDR_NEWMEM define so that is_kdump_kernel() also works for our case. ChangeLog = v1 = v2) - Rebase 3.10-rc2 + vmcore mmap patches - Introduced arch_get/free_crash_header() and ELFCORE_ADDR_NEWMEM Feature Description === For s390 we want to use /proc/vmcore for our SCSI stand-alone dump (zfcpdump). We have support where the first HSA_SIZE bytes are saved into a hypervisor owned memory area (HSA) before the kdump kernel is booted. When the kdump kernel starts, it is restricted to use only HSA_SIZE bytes. The advantages of this mechanism are: * No crashkernel memory has to be defined in the old kernel. * Early boot problems (before kexec_load has been done) can be dumped * Non-Linux systems can be dumped. We modify the s390 copy_oldmem_page() function to read from the HSA memory if memory below HSA_SIZE bytes is requested. Since we cannot use the kexec tool to load the kernel in this scenario, we have to build the ELF header in the 2nd (kdump/new) kernel. So with the following patch set we would like to introduce the new function that the ELF header for /proc/vmcore can be created in the 2nd kernel memory. The following steps are done during zfcpdump execution: 1. Production system crashes 2. User boots a SCSI disk that has been prepared with the zfcpdump tool 3. Hypervisor saves CPU state of boot CPU and HSA_SIZE bytes of memory into HSA 4. Boot loader loads kernel into low memory area 5. Kernel boots and uses only HSA_SIZE bytes of memory 6. Kernel saves registers of non-boot CPUs 7. Kernel does memory detection for dump memory map 8. Kernel creates ELF header for /proc/vmcore 9. /proc/vmcore uses this header for initialization 10. 
The zfcpdump user space reads /proc/vmcore to write dump to SCSI disk - copy_oldmem_page() copies from HSA for memory below HSA_SIZE - copy_oldmem_page() copies from real memory for memory above HSA_SIZE --- Jan Willeke (1): s390/kdump: Use vmcore for zfcpdump Michael Holzheu (2): kdump: Introduce ELF header in new memory feature s390/kdump: Use arch function call for kdump arch/s390/Kconfig | 3 +- arch/s390/include/asm/sclp.h | 1 + arch/s390/kernel/crash_dump.c | 126 +++--- drivers/s390/char/zcore.c | 6 +- fs/proc/vmcore.c | 80 +-- include/linux/crash_dump.h| 3 + 6 files changed, 164 insertions(+), 55 deletions(-) -- 1.8.1.6 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v2 2/3] s390/kdump: Use arch function call for kdump
Currently for s390 we create the ELF core header in the 2nd kernel with a small trick. We relocate the addresses in the ELF header in a way that for the /proc/vmcore code it seems to be in the 1st kernel (old) memory and the read_from_oldmem() returns the correct data. This allows the /proc/vmcore code to use the ELF header in the 2nd kernel. This patch now exchanges the old mechanism with the new and much cleaner arch function call override feature that now offcially allows to create the ELF core header in the 2nd kernel. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- arch/s390/kernel/crash_dump.c | 52 --- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index f703d91..4fe7fa7 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -21,6 +21,9 @@ #define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) #define PTR_DIFF(x, y) ((unsigned long)(((char *) (x)) - ((unsigned long) (y +static size_t elfcorebuf_sz; +static char *elfcorebuf; + /* * Copy one page from oldmem * @@ -325,14 +328,6 @@ static int get_mem_chunk_cnt(void) } /* - * Relocate pointer in order to allow vmcore code access the data - */ -static inline unsigned long relocate(unsigned long addr) -{ - return OLDMEM_BASE + addr; -} - -/* * Initialize ELF loads (new kernel) */ static int loads_init(Elf64_Phdr *phdr, u64 loads_offset) @@ -383,7 +378,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) ptr = nt_vmcoreinfo(ptr); memset(phdr, 0, sizeof(*phdr)); phdr-p_type = PT_NOTE; - phdr-p_offset = relocate(notes_offset); + phdr-p_offset = notes_offset; phdr-p_filesz = (unsigned long) PTR_SUB(ptr, ptr_start); phdr-p_memsz = phdr-p_filesz; return ptr; @@ -392,7 +387,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) /* * Create ELF core header (new kernel) */ -static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) 
+static int arch_vmcore_get_elf_hdr(char **elfcorebuf, size_t *elfcorebuf_sz) { Elf64_Phdr *phdr_notes, *phdr_loads; int mem_chunk_cnt; @@ -414,13 +409,37 @@ static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt); /* Init notes */ hdr_off = PTR_DIFF(ptr, hdr); - ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); + ptr = notes_init(phdr_notes, ptr, (unsigned long) hdr + hdr_off); /* Init loads */ hdr_off = PTR_DIFF(ptr, hdr); - loads_init(phdr_loads, ((unsigned long) hdr) + hdr_off); + loads_init(phdr_loads, hdr_off); *elfcorebuf_sz = hdr_off; - *elfcorebuf = (void *) relocate((unsigned long) hdr); + *elfcorebuf = hdr; BUG_ON(*elfcorebuf_sz alloc_size); + return 0; +} + +/* + * If elfcorehdr= has been specified, return 1st kernel ELF header + * otherwise return the new 2nd kernel ELF header. + */ +unsigned long long arch_get_crash_header(void) +{ + if (elfcorehdr_addr != ELFCORE_ADDR_NEWMEM) + return elfcorehdr_addr; + else + return __pa(elfcorebuf); +} + +/* + * Free crash header + */ +void arch_free_crash_header(void) +{ + if (elfcorehdr_addr != ELFCORE_ADDR_NEWMEM) + return; + kfree(elfcorebuf); + elfcorebuf = 0; } /* @@ -429,13 +448,10 @@ static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) */ static int setup_kdump_elfcorehdr(void) { - size_t elfcorebuf_sz; - char *elfcorebuf; - if (!OLDMEM_BASE || is_kdump_kernel()) return -EINVAL; - s390_elf_corehdr_create(elfcorebuf, elfcorebuf_sz); - elfcorehdr_addr = (unsigned long long) elfcorebuf; + arch_vmcore_get_elf_hdr(elfcorebuf, elfcorebuf_sz); + elfcorehdr_addr = ELFCORE_ADDR_NEWMEM; elfcorehdr_size = elfcorebuf_sz; return 0; } -- 1.8.1.6 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH v2 1/3] kdump: Introduce ELF header in new memory feature
Currently for s390 we create the ELF core header in the 2nd kernel with a small trick. We relocate the addresses in the ELF header in a way that for the /proc/vmcore code it seems to be in the 1st kernel (old) memory and the read_from_oldmem() returns the correct data. This allows the /proc/vmcore code to use the ELF header in the 2nd kernel. This patch now exchanges the old mechanism with the new and much cleaner function call override feature that now offcially allows to create the ELF core header in the 2nd kernel. To use the new feature the following has to be done by the architecture backend code: * Set elfcorehdr_addr to ELFCORE_ADDR_NEWMEM - is_kdump_kernel() will return true * Override arch_get_crash_header() to return the address of the ELF header in new memory. * Override arch_free_crash_header() to free the memory of the ELF header in new memory. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- fs/proc/vmcore.c | 80 +++--- include/linux/crash_dump.h | 3 ++ 2 files changed, 65 insertions(+), 18 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 6ba32f8..d97ed31 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -123,6 +123,47 @@ static ssize_t read_from_oldmem(char *buf, size_t count, return read; } +/* + * Read from the current (new) kernel memory + */ +static ssize_t read_from_newmem(char *buf, size_t count, u64 *ppos, int userbuf) +{ + if (userbuf) { + if (copy_to_user(buf, (void *)*ppos, count)) + return -EFAULT; + } else { + memcpy(buf, (void *)*ppos, count); + } + *ppos += count; + return count; +} + +/* + * Provide access to the header + */ +static ssize_t read_from_crash_header(char *buf, size_t count, u64 *ppos, + int userbuf) +{ + if (elfcorehdr_addr == ELFCORE_ADDR_NEWMEM) + return read_from_newmem(buf, count, ppos, userbuf); + else + return read_from_oldmem(buf, count, ppos, userbuf); +} + +/* + * Architetures may override this function to get header address + */ +unsigned long long __weak 
arch_get_crash_header(void) +{ + return elfcorehdr_addr; +} + +/* + * Architetures may override this function to free header + */ +void __weak arch_free_crash_header(void) +{} + /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. */ @@ -356,7 +397,7 @@ static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr, notes_section = kmalloc(max_sz, GFP_KERNEL); if (!notes_section) return -ENOMEM; - rc = read_from_oldmem(notes_section, max_sz, offset, 0); + rc = read_from_crash_header(notes_section, max_sz, offset, 0); if (rc 0) { kfree(notes_section); return rc; @@ -412,7 +453,7 @@ static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf) notes_section = kmalloc(max_sz, GFP_KERNEL); if (!notes_section) return -ENOMEM; - rc = read_from_oldmem(notes_section, max_sz, offset, 0); + rc = read_from_crash_header(notes_section, max_sz, offset, 0); if (rc 0) { kfree(notes_section); return rc; @@ -428,8 +469,8 @@ static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf) nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); } offset = phdr_ptr-p_offset; - rc = read_from_oldmem(notes_buf + phdr_sz, real_sz, - offset, 0); + rc = read_from_crash_header(notes_buf + phdr_sz, real_sz, + offset, 0); if (rc 0) { kfree(notes_section); return rc; @@ -533,7 +574,7 @@ static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr, notes_section = kmalloc(max_sz, GFP_KERNEL); if (!notes_section) return -ENOMEM; - rc = read_from_oldmem(notes_section, max_sz, offset, 0); + rc = read_from_crash_header(notes_section, max_sz, offset, 0); if (rc 0) { kfree(notes_section); return rc; @@ -589,7 +630,7 @@ static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf) notes_section = kmalloc(max_sz, GFP_KERNEL); if (!notes_section) return -ENOMEM; - rc = read_from_oldmem(notes_section, max_sz, offset, 0); + rc = read_from_crash_header
[PATCH 1/4] kdump: Introduce ELFCORE_ADDR_NEWMEM
Currently vmcore gets the ELF header from oldmem using the global variable elfcorehdr_addr. This patch introduces a new possible value ELFCORE_ADDR_NEWMEM. This indicates that the ELF header is allocated in the new (2nd) kernel. In this case a new architecture function arch_vmcore_get_elf_hdr() is called to obtain address and length of the ELF header. The ELF header that is created in the 2nd kernel already contains the correct relative offsets in the ELF notes and loads sections. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- fs/proc/vmcore.c | 65 -- include/linux/crash_dump.h | 4 ++- 2 files changed, 66 insertions(+), 3 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 17f7e08..71db4e6 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -487,6 +487,18 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, } /* Sets offset fields of vmcore elements. */ +static void __init set_vmcore_list_offsets_newmem(struct list_head *vc_list) +{ + loff_t vmcore_off = elfcorebuf_sz; + struct vmcore *m; + + list_for_each_entry(m, vc_list, list) { + m-offset = vmcore_off; + vmcore_off += m-size; + } +} + +/* Sets offset fields of vmcore elements. 
*/ static void __init set_vmcore_list_offsets_elf64(char *elfptr, struct list_head *vc_list) { @@ -636,7 +648,7 @@ static int __init parse_crash_elf32_headers(void) return 0; } -static int __init parse_crash_elf_headers(void) +static int __init parse_crash_elf_headers_oldmem(void) { unsigned char e_ident[EI_NIDENT]; u64 addr; @@ -672,6 +684,52 @@ static int __init parse_crash_elf_headers(void) return 0; } +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +int __weak arch_vmcore_get_elf_hdr(char **elfcorebuf, size_t *elfcorebuf_sz) +{ + return -EOPNOTSUPP; +} + +static int parse_crash_elf_headers_newmem(void) +{ + unsigned char e_ident[EI_NIDENT]; + int rc; + + rc = arch_vmcore_get_elf_hdr(elfcorebuf, elfcorebuf_sz); + if (rc) + return rc; + memcpy(e_ident, elfcorebuf, EI_NIDENT); + if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { + pr_warn(Warning: Core image elf header not found\n); + rc = -EINVAL; + goto fail; + } + if (e_ident[EI_CLASS] == ELFCLASS64) { + rc = process_ptload_program_headers_elf64(elfcorebuf, + elfcorebuf_sz, + vmcore_list); + if (rc) + goto fail; + set_vmcore_list_offsets_newmem(vmcore_list); + vmcore_size = get_vmcore_size_elf64(elfcorebuf); + } else if (e_ident[EI_CLASS] == ELFCLASS32) { + rc = process_ptload_program_headers_elf32(elfcorebuf, + elfcorebuf_sz, + vmcore_list); + if (rc) + goto fail; + set_vmcore_list_offsets_newmem(vmcore_list); + vmcore_size = get_vmcore_size_elf32(elfcorebuf); + } + return 0; +fail: + kfree(elfcorebuf); + return rc; +} + /* Init function for vmcore module. 
*/ static int __init vmcore_init(void) { @@ -680,7 +738,10 @@ static int __init vmcore_init(void) /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ if (!(is_vmcore_usable())) return rc; - rc = parse_crash_elf_headers(); + if (elfcorehdr_addr == ELFCORE_ADDR_NEWMEM) + rc = parse_crash_elf_headers_newmem(); + else + rc = parse_crash_elf_headers_oldmem(); if (rc) { pr_warn(Kdump: vmcore not initialized\n); return rc; diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 37e4f8d..9424d4fc 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -8,10 +8,12 @@ #define ELFCORE_ADDR_MAX (-1ULL) #define ELFCORE_ADDR_ERR (-2ULL) +#define ELFCORE_ADDR_NEWMEM(-3ULL) extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; - +extern int arch_vmcore_get_elf_hdr(char **elfcorebuf, + size_t *elfcorebuf_sz); extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); -- 1.8.1.6 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] makedumpfile: Check dump file early
On Mon, 22 Apr 2013 09:01:23 +0900 HATAYAMA Daisuke d.hatay...@jp.fujitsu.com wrote: + if (info-flag_force) { + if (access(path, W_OK) == 0) + return TRUE; /* We have write permission */ + err_str = strerror(errno); + } else { + err_str = File exists; How about strerror(EEXIST)? It's better to avoid hard code to use the same string as what libc returns. Yes, this solution is better. Here the updated patch: --- makedumpfile.c | 37 - 1 file changed, 36 insertions(+), 1 deletion(-) --- a/makedumpfile.c +++ b/makedumpfile.c @@ -730,6 +730,24 @@ open_dump_file(void) } int +check_dump_file(const char *path) +{ + char *err_str; + + if (access(path, F_OK) != 0) + return TRUE; /* File does not exist */ + if (info-flag_force) { + if (access(path, W_OK) == 0) + return TRUE; /* We have write permission */ + err_str = strerror(errno); + } else { + err_str = strerror(EEXIST); + } + ERRMSG(Can't open the dump file (%s). %s\n, path, err_str); + return FALSE; +} + +int open_dump_bitmap(void) { int i, fd; @@ -8609,6 +8627,9 @@ main(int argc, char *argv[]) MSG(Try `makedumpfile --help' for more information.\n); goto out; } + if (!check_dump_file(info-name_dumpfile)) + goto out; + if (!open_files_for_rearranging_dumpdata()) goto out; @@ -8626,9 +8647,11 @@ main(int argc, char *argv[]) MSG(Try `makedumpfile --help' for more information.\n); goto out; } - if (!reassemble_dumpfile()) + if (!check_dump_file(info-name_dumpfile)) goto out; + if (!reassemble_dumpfile()) + goto out; MSG(\n); MSG(The dumpfile is saved to %s.\n, info-name_dumpfile); } else if (info-flag_dmesg) { @@ -8637,6 +8660,8 @@ main(int argc, char *argv[]) MSG(Try `makedumpfile --help' for more information.\n); goto out; } + if (!check_dump_file(info-name_dumpfile)) + goto out; if (!dump_dmesg()) goto out; @@ -8648,6 +8673,16 @@ main(int argc, char *argv[]) MSG(Try `makedumpfile --help' for more information.\n); goto out; } + if (info-flag_split) { + for (i = 0; i info-num_dumpfile; i++) { + if 
(!check_dump_file(SPLITTING_DUMPFILE(i))) + goto out; + } + } else { + if (!check_dump_file(info-name_dumpfile)) + goto out; + } + if (!create_dumpfile()) goto out; ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH] makedumpfile: Check dump file early
On s390 the makedumpfile tool sometimes is used directly by users on the command line. Currently the check if the dump file already exists is done after the filtering function has been called. Therefore, for large dumps the user has to wait for filtering and after some time he gets the error message open_dump_file: Can't open the dump file(out). File exists. This patch improves this by adding an early check for the existence of the dump file. In case the -f (force) option has been specified it is checked that an existing file is writable. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- makedumpfile.c | 37 - 1 file changed, 36 insertions(+), 1 deletion(-) --- a/makedumpfile.c +++ b/makedumpfile.c @@ -730,6 +730,24 @@ open_dump_file(void) } int +check_dump_file(const char *path) +{ + char *err_str; + + if (access(path, F_OK) != 0) + return TRUE; /* File does not exist */ + if (info-flag_force) { + if (access(path, W_OK) == 0) + return TRUE; /* We have write permission */ + err_str = strerror(errno); + } else { + err_str = File exists; + } + ERRMSG(Can't open the dump file (%s). 
%s\n, path, err_str); + return FALSE; +} + +int open_dump_bitmap(void) { int i, fd; @@ -8609,6 +8627,9 @@ main(int argc, char *argv[]) MSG(Try `makedumpfile --help' for more information.\n); goto out; } + if (!check_dump_file(info-name_dumpfile)) + goto out; + if (!open_files_for_rearranging_dumpdata()) goto out; @@ -8626,9 +8647,11 @@ main(int argc, char *argv[]) MSG(Try `makedumpfile --help' for more information.\n); goto out; } - if (!reassemble_dumpfile()) + if (!check_dump_file(info-name_dumpfile)) goto out; + if (!reassemble_dumpfile()) + goto out; MSG(\n); MSG(The dumpfile is saved to %s.\n, info-name_dumpfile); } else if (info-flag_dmesg) { @@ -8637,6 +8660,8 @@ main(int argc, char *argv[]) MSG(Try `makedumpfile --help' for more information.\n); goto out; } + if (!check_dump_file(info-name_dumpfile)) + goto out; if (!dump_dmesg()) goto out; @@ -8648,6 +8673,16 @@ main(int argc, char *argv[]) MSG(Try `makedumpfile --help' for more information.\n); goto out; } + if (info-flag_split) { + for (i = 0; i info-num_dumpfile; i++) { + if (!check_dump_file(SPLITTING_DUMPFILE(i))) + goto out; + } + } else { + if (!check_dump_file(info-name_dumpfile)) + goto out; + } + if (!create_dumpfile()) goto out; ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH] kexec/s390: Replace clgfi with cghi
The clgfi instruction needs at least z9 machine level. To allow kexec-tools to be compiled also for z900, this patch replaces clgfi with the older cghi instruction. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- purgatory/arch/s390/setup-s390.S |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: kexec-tools-2.0.3/purgatory/arch/s390/setup-s390.S === --- kexec-tools-2.0.3.orig/purgatory/arch/s390/setup-s390.S +++ kexec-tools-2.0.3/purgatory/arch/s390/setup-s390.S @@ -16,7 +16,7 @@ purgatory_start: larl%r15,lstack_end aghi%r15,-160 - clgfi %r2,0 + cghi%r2,0 je verify_checksums brasl %r14,purgatory ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] kexec/s390: Replace clgfi with cghi
Hello Simon, From the Principles of Operations (the s390 bible): The first operand is compared with the second operand, and the result is indicated in the condition code. For COMPARE LOGICAL IMMEDIATE (CLGFI), the first operand is treated as 64 bits, and the second operand is treated as 32 bits with 32 zeros appended on the left. This instruction works with 32 bit immediate values. For COMPARE HALFWORD IMMEDIATE (CGHI), the first operand is treated as a 64-bit signed binary integer. This instruction works only with 16 bit immediate values. See: http://pic.dhe.ibm.com/infocenter/ratdevz/v8r0/topic/com.ibm.tpf.toolkit.hlasm.doc/dz9zr006.pdf Because I only want to check against zero the two instructions will have the same effect in the purgatory code. Michael On Fri, 15 Mar 2013 16:57:33 +0100 Simon Horman ho...@verge.net.au wrote: On Fri, Mar 15, 2013 at 01:46:32PM +0100, Michael Holzheu wrote: The clgfi instruction needs at least z9 machine level. To allow kexec-tools compiled also with z900, this patch replaces clgfi with the older cghi instruction. Hi, could you update the changelog to include a brief description of the difference between the two instructions - I assume they work the same way in the context of this change. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- purgatory/arch/s390/setup-s390.S |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: kexec-tools-2.0.3/purgatory/arch/s390/setup-s390.S === --- kexec-tools-2.0.3.orig/purgatory/arch/s390/setup-s390.S +++ kexec-tools-2.0.3/purgatory/arch/s390/setup-s390.S @@ -16,7 +16,7 @@ purgatory_start: larl%r15,lstack_end aghi%r15,-160 - clgfi %r2,0 + cghi%r2,0 je verify_checksums brasl %r14,purgatory ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH] makedumpfile: s390x: Allow HW Change-bit override for page table entries
The HW Change-bit override (0x100) is used now for s390x. This patch allows page table entries that have set this bit. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- arch/s390x.c |4 ++-- makedumpfile.h |1 - 2 files changed, 2 insertions(+), 3 deletions(-) --- a/arch/s390x.c +++ b/arch/s390x.c @@ -195,10 +195,10 @@ static ulong _kl_pg_table_deref_s390x(un readmem(VADDR, table + offset, entry, sizeof(entry)); /* * Check if the page table entry could be read and doesn't have -* any of the reserved bits set. +* the reserved bit set. * Check if the page table entry has the invalid bit set. */ - if (entry (_PAGE_CO | _PAGE_ZERO | _PAGE_INVALID)) { + if (entry (_PAGE_ZERO | _PAGE_INVALID)) { ERRMSG(Invalid page table entry.\n); return 0; } --- a/makedumpfile.h +++ b/makedumpfile.h @@ -575,7 +575,6 @@ do { \ #define _SEGMENT_INDEX_SHIFT 20 /* Hardware bits in the page table entry */ -#define _PAGE_CO 0x100 /* HW Change-bit override */ #define _PAGE_ZERO 0x800 /* Bit pos 52 must conatin zero */ #define _PAGE_INVALID 0x400 /* HW invalid bit */ #define _PAGE_INDEX_SHIFT 12 ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] makedumpfile: s390x: Add 2GB frame support for page table walker
Hello Atsushi, On Mon, 22 Oct 2012 16:20:56 +0900 Atsushi Kumagai kumagai-atsu...@mxc.nes.nec.co.jp wrote: Hello Michael, On Fri, 19 Oct 2012 18:29:12 +0200 Michael Holzheu holz...@linux.vnet.ibm.com wrote: [snip] --- a/arch/s390x.c +++ b/arch/s390x.c @@ -247,6 +247,11 @@ vtop_s390x(unsigned long vaddr) return NOT_PADDR; } table = entry _REGION_ENTRY_ORIGIN; + if ((entry _REGION_ENTRY_LARGE) (level == 1)) { + table = ~0x7fffUL; + paddr = table + (vaddr 0x7ffUL); ^^^ There is 0x7ff(27bit) but the patch for crash uses 0x7fff(31bit). Which is correct ? You are right! This is a typo in the makedumpfile patch. It should be 0x7fff (31 bit). Thanks for your careful review! Here the corrected patch: --- arch/s390x.c | 6 ++ makedumpfile.h | 1 + 2 files changed, 7 insertions(+) diff --git a/arch/s390x.c b/arch/s390x.c index 02b9e63..5854f10 100644 --- a/arch/s390x.c +++ b/arch/s390x.c @@ -247,6 +247,11 @@ vtop_s390x(unsigned long vaddr) return NOT_PADDR; } table = entry _REGION_ENTRY_ORIGIN; + if ((entry _REGION_ENTRY_LARGE) (level == 1)) { + table = ~0x7fffUL; + paddr = table + (vaddr 0x7fffUL); + return paddr; + } len = RSG_TABLE_LENGTH(entry); level--; } @@ -257,6 +262,7 @@ vtop_s390x(unsigned long vaddr) * if no, then get the page table entry using PX index. 
*/ if (entry _SEGMENT_ENTRY_LARGE) { + table = ~_PAGE_BYTE_INDEX_MASK; paddr = table + (vaddr _PAGE_BYTE_INDEX_MASK); } else { entry = _kl_pg_table_deref_s390x(vaddr, diff --git a/makedumpfile.h b/makedumpfile.h index 2b88fa4..62bc5cc 100644 --- a/makedumpfile.h +++ b/makedumpfile.h @@ -563,6 +563,7 @@ do { \ #define _REGION_ENTRY_TYPE_MASK0x0c/* region table type mask */ #define _REGION_ENTRY_INVALID 0x20/* invalid region table entry */ #define _REGION_ENTRY_LENGTH 0x03/* region table length */ +#define _REGION_ENTRY_LARGE0x400 #define _REGION_OFFSET_MASK0x7ffUL /* region/segment table offset mask */ #define RSG_TABLE_LEVEL(x) (((x) _REGION_ENTRY_TYPE_MASK) 2) ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH v1 0/2] x86, apic: Disable BSP if boot cpu is AP
On Fri, 19 Oct 2012 11:17:53 -0400 Vivek Goyal vgo...@redhat.com wrote: [..] On Fri, Oct 19, 2012 at 12:20:54PM +0900, HATAYAMA Daisuke wrote: I am skeptical that this approach is going to fly in practice. Dumping huge images, processing and transferring these is not very practical. So I would rather narrow down the problem on a running system and take filtered dump of appropriate component where I suspect the problem is. [..] capability was the primary reason that s390 also wants to support kdump otherwise there firmware dumping mechanism was working just fine. I don't know s390 firmware dumping mechanism at all, but is it possble for s390 to filter crash dump even on firmware dumping mechanism? AFAIK, s390 dump mechanism could not filter dump and tha's the reason they wanted to support kdump and /proc/vmcore so that makedumpfile could filter it. I am CCing Michael Holzheu, who did the s390 kdump work. He can tell it better. Correct. The other s390 dump mechanisms (stand-alone and hypervisor dump) are not able to do filtering and therefore the handling of large dumps has been a problem for us. This was one of the main reasons to implement kdump on s390. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH] makedumpfile: s390x: Add 2GB frame support for page table walker
On s390 the zEC12 machines support 2GB frames. In order to walk page tables correctly add support to the page table walker function so it detects 2GB frames. Signed-off-by: Heiko Carstens heiko.carst...@de.ibm.com Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- arch/s390x.c |6 ++ makedumpfile.h |1 + 2 files changed, 7 insertions(+) --- a/arch/s390x.c +++ b/arch/s390x.c @@ -247,6 +247,11 @@ vtop_s390x(unsigned long vaddr) return NOT_PADDR; } table = entry _REGION_ENTRY_ORIGIN; + if ((entry _REGION_ENTRY_LARGE) (level == 1)) { + table = ~0x7fffUL; + paddr = table + (vaddr 0x7ffUL); + return paddr; + } len = RSG_TABLE_LENGTH(entry); level--; } @@ -257,6 +262,7 @@ vtop_s390x(unsigned long vaddr) * if no, then get the page table entry using PX index. */ if (entry _SEGMENT_ENTRY_LARGE) { + table = ~_PAGE_BYTE_INDEX_MASK; paddr = table + (vaddr _PAGE_BYTE_INDEX_MASK); } else { entry = _kl_pg_table_deref_s390x(vaddr, --- a/makedumpfile.h +++ b/makedumpfile.h @@ -563,6 +563,7 @@ do { \ #define _REGION_ENTRY_TYPE_MASK0x0c/* region table type mask */ #define _REGION_ENTRY_INVALID 0x20/* invalid region table entry */ #define _REGION_ENTRY_LENGTH 0x03/* region table length */ +#define _REGION_ENTRY_LARGE0x400 #define _REGION_OFFSET_MASK0x7ffUL /* region/segment table offset mask */ #define RSG_TABLE_LEVEL(x) (((x) _REGION_ENTRY_TYPE_MASK) 2) ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: makedumpfile memory usage grows with system memory size
Hello Kumagai, On Fri, 2012-04-06 at 17:09 +0900, Atsushi Kumagai wrote: Hello Michael, On Mon, 02 Apr 2012 19:15:33 +0200 Michael Holzheu holz...@linux.vnet.ibm.com wrote: Hello Ken'ichi, On Thu, 2012-03-29 at 17:09 +0900, Ken'ichi Ohmichi wrote: On Wed, 28 Mar 2012 17:22:04 -0400 makedumpfile uses the system memory of 2nd-kernel for a bitmap if RHEL. The bitmap represents each page of 1st-kernel is excluded or not. So the bitmap size depends on 1st-kernel's system memory. Does this mean that makedumpfile's memory demand linearly grows with 1 bit per page of 1-st kernel's memory? Yes, you are right. (Precisely, 2 bit per page.) Is that the exact factor, if /tmp is in memory? Or is there any other memory allocation that is not constant regarding the 1-st kernel memory size? bitmap file is main cause of memory consuming if 2nd kernel uses initramfs only. There are other parts where the size of allocated memory varies based on 1-st kernel memory size, but they don't have big influence. Thanks for the explanation. I ask because I want to exactly calculate the required size for the crashkernel parameter. On s390 the kdump kernel memory consumption is fix and not dependent on the 1st kernel memory size. So based on your explanation I, will use: crashkernel=base size + variable size where variable size = pages of 1st kernel * (2 + x) / 8 where x is the variable makedumpfile memory allocation that is on top of the bitmap allocation. What would be a good value for x? Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH] makedumpfile: Compile fix
From: Michael Holzheu holz...@linux.vnet.ibm.com When compiling makedumpfile git head or 1.4.1 on s390x, I get the following gcc error: # gcc -g -O2 -Wall -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -DVERSION='1.4.1' -DRELEASE_DATE='6 January 2012' -D__s390__ print_info.o dwarf_info.o elf_info.o erase_info.o sadump_info.o arch/arm.o arch/x86.o arch/x86_64.o arch/ia64.o arch/ppc64.o arch/s390x.o -o makedumpfile makedumpfile.c -static -ldw -lbz2 -lebl -ldl -lelf -lz In file included from makedumpfile.c:21:0: sadump_info.h:86:1: error: expected identifier or '(' before '{' token sadump_info.h:85:19: warning: 'sadump_num_online_cpus' used but never defined The attached patch fixes this problem. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- sadump_info.h |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- a/sadump_info.h +++ b/sadump_info.h @@ -82,7 +82,7 @@ static inline int sadump_initialize_bitm return FALSE; } -static inline int sadump_num_online_cpus(void); +static inline int sadump_num_online_cpus(void) { return 0; } ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
[PATCH] kdump: Define KEXEC_NOTE_BYTES arch specific for s390x
From: Michael Holzheu holz...@linux.vnet.ibm.com kdump only allocates memory for the prstatus ELF note. For s390x, besides of prstatus multiple ELF notes for various different register types are stored. Therefore the currently allocated memory is not sufficient. With this patch the KEXEC_NOTE_BYTES macro can be defined by architecture code and for s390x it is set to the correct size now. Signed-off-by: Michael Holzheu holz...@linux.vnet.ibm.com --- arch/s390/include/asm/kexec.h | 18 ++ include/linux/kexec.h |2 ++ 2 files changed, 20 insertions(+) --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -42,6 +42,24 @@ /* The native architecture */ #define KEXEC_ARCH KEXEC_ARCH_S390 +/* + * Size for s390x ELF notes per CPU + * + * Seven notes plus zero note at the end: prstatus, fpregset, timer, + * tod_cmp, tod_reg, control regs, and prefix + */ +#define KEXEC_NOTE_BYTES \ + (ALIGN(sizeof(struct elf_note), 4) * 8 + \ +ALIGN(sizeof(CORE), 4) * 7 + \ +ALIGN(sizeof(struct elf_prstatus), 4) + \ +ALIGN(sizeof(elf_fpregset_t), 4) + \ +ALIGN(sizeof(u64), 4) + \ +ALIGN(sizeof(u64), 4) + \ +ALIGN(sizeof(u32), 4) + \ +ALIGN(sizeof(u64) * 16, 4) + \ +ALIGN(sizeof(u32), 4) \ + ) + /* Provide a dummy definition to avoid build failures. */ static inline void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs) { } --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -50,9 +50,11 @@ * note header. For kdump, the code in vmcore.c runs in the context * of the second kernel to combine them into one note. */ +#ifndef KEXEC_NOTE_BYTES #define KEXEC_NOTE_BYTES ( (KEXEC_NOTE_HEAD_BYTES * 2) + \ KEXEC_CORE_NOTE_NAME_BYTES +\ KEXEC_CORE_NOTE_DESC_BYTES ) +#endif /* * This structure is used to hold the arguments that are used when loading ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] crash: s390x: Auto-detect the correct MAX_PHYSMEM_BITS used in vmcore being analyzed
Hi Dave, On Thu, 2011-12-22 at 08:47 -0500, Dave Anderson wrote: - Original Message - Hello Dave and Mahesh, We also tried to solve the problem and came up with an alternative solution. The function s390x_max_physmem_bits() solves the following equation: array_len == NR_MEM_SECTIONS / SECTIONS_PER_ROOT == 2^(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) / (PAGE_SIZE / sizeof(struct mem_section)) This leads to the following: MAX_PHYSMEM_BITS == SECTION_SIZE_BITS + log2(array_len) + log2(PAGE_SIZE) - log2(sizeof(struct mem_section)); I tested the patch with 42 and 46 bits and for both it seems to work. I will leave now for vacation and will return January the 9th 2012. I leave it to you which solution to take. No -- I leave to you guys to decide. I appreciate the simplicity of your solution, but Mahesh's patch is easier to understand. So please decide before you go home -- I was hoping to get a release out today! Unfortunately Mahesh is currently not online. We still have some time because Martin's kernel patch that introduces the change will go into Linux version 3.3. So perhaps you make your crash release without this patch. Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] crash: s390x: Auto-detect the correct MAX_PHYSMEM_BITS used in vmcore being analyzed
Hi Dave, On Thu, 2011-12-22 at 09:19 -0500, Dave Anderson wrote: - Original Message - Unfortunately Mahesh is currently not online. We still have some time because Martin's kernel patch that introduces the change will go into Linux version 3.3. So perhaps you make your crash release without this patch. Michael Tell you what -- I'm going to make a hybrid patch, using Mahesh's more understandable-yet-longer function, but with your verify_pfn() and STRUCT_SIZE_INIT(mem_section) movement, along with a default setting of 42 and a non-fatal WARNING message if things fail. I'll verify it on RHEL5 and RHEL6. If you want to change it later, that will be fine, too. Ok, I will have a look at your next crash release in the new year. Thanks! Michael ___ kexec mailing list kexec@lists.infradead.org http://lists.infradead.org/mailman/listinfo/kexec
Re: [PATCH] crash: s390x: Auto-detect the correct MAX_PHYSMEM_BITS used in vmcore being analyzed
Hello Dave and Mahesh, We also tried to solve the problem and came up with an alternative solution. The function s390x_max_physmem_bits() solves the following equation: array_len == NR_MEM_SECTIONS / SECTIONS_PER_ROOT == 2^(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) / (PAGE_SIZE / sizeof(struct mem_section)) This leads to the following: MAX_PHYSMEM_BITS == SECTION_SIZE_BITS + log2(array_len) + log2(PAGE_SIZE) - log2(sizeof(struct mem_section)); I tested the patch with 42 and 46 bits and for both it seems to work. I will leave now for vacation and will return January the 9th 2012. I leave it to you which solution to take. Merry Christmas! Michael PS: I think this patch also fixes a bug in verify_pfn()... BTOP vs. PTOB :-) --- kernel.c | 2 +- memory.c | 2 +- s390x.c | 17 +- 3 files changed, 18 insertions(+), 3 deletions(-) --- a/kernel.c +++ b/kernel.c @@ -275,7 +275,7 @@ kernel_init() MEMBER_OFFSET_INIT(prio_array_nr_active, prio_array, nr_active); STRUCT_SIZE_INIT(runqueue, rqstruct); STRUCT_SIZE_INIT(prio_array, prio_array); - + STRUCT_SIZE_INIT(mem_section, "mem_section"); MEMBER_OFFSET_INIT(rq_cfs, rq, cfs); /* --- a/memory.c +++ b/memory.c @@ -13601,7 +13601,7 @@ verify_pfn(ulong pfn) for (i = machdep->max_physmem_bits; i < machdep->bits; i++) mask |= ((physaddr_t)1 << i); - if (mask & BTOP(pfn)) + if (mask & PTOB(pfn)) return FALSE; return TRUE; --- a/s390x.c +++ b/s390x.c @@ -17,6 +17,7 @@ */ #ifdef S390X #include "elf.h" +#include <math.h> #include "defs.h" #include "netdump.h" @@ -283,6 +284,19 @@ static void s390x_process_elf_notes(void } /* + * The older s390x kernels use _MAX_PHYSMEM_BITS as 42 and the + * newer kernels use 46 bits. This function calculates the bits + * via a generic function. + */ +static int s390x_max_physmem_bits(void) +{ + unsigned long array_len = get_array_length("mem_section", NULL, 0); + + return _SECTION_SIZE_BITS + log2(array_len) + 12 /* log2(PAGE_SIZE) */ + - log2(SIZE(mem_section)); +} + +/* * Do all necessary machine-specific setup here.
This is called several * times during initialization. */ @@ -350,13 +364,14 @@ s390x_init(int when) if (!machdep->hz) machdep->hz = HZ; machdep->section_size_bits = _SECTION_SIZE_BITS; - machdep->max_physmem_bits = _MAX_PHYSMEM_BITS; + machdep->max_physmem_bits = s390x_max_physmem_bits(); s390x_offsets_init(); break; case POST_INIT: break; } + fprintf(stderr, "XXX %d\n", machdep->max_physmem_bits); } /* Hello Dave and Mahesh, We also tried to solve the problem and came up with an alternative solution. The function s390x_max_physmem_bits() solves the following equation: array_len == NR_MEM_SECTIONS / SECTIONS_PER_ROOT == 2^(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) / (PAGE_SIZE / sizeof(struct mem_section)) This leads to the following: MAX_PHYSMEM_BITS == SECTION_SIZE_BITS + log2(array_len) + log2(PAGE_SIZE) - log2(sizeof(struct mem_section)); I tested the patch with 42 and 46 bits and for both it seems to work. I will leave now for vacation and will return January the 9th 2012. I leave it to you which solution to take. Merry Christmas! Michael PS: I think this patch also fixes a bug in verify_pfn()... BTOP vs.
PTOB :-) --- kernel.c | 2 +- memory.c | 2 +- s390x.c | 17 +- 3 files changed, 18 insertions(+), 3 deletions(-) --- a/kernel.c +++ b/kernel.c @@ -275,7 +275,7 @@ kernel_init() MEMBER_OFFSET_INIT(prio_array_nr_active, prio_array, nr_active); STRUCT_SIZE_INIT(runqueue, rqstruct); STRUCT_SIZE_INIT(prio_array, prio_array); - + STRUCT_SIZE_INIT(mem_section, "mem_section"); MEMBER_OFFSET_INIT(rq_cfs, rq, cfs); /* --- a/memory.c +++ b/memory.c @@ -13601,7 +13601,7 @@ verify_pfn(ulong pfn) for (i = machdep->max_physmem_bits; i < machdep->bits; i++) mask |= ((physaddr_t)1 << i); - if (mask & BTOP(pfn)) + if (mask & PTOB(pfn)) return FALSE; return TRUE; --- a/s390x.c +++ b/s390x.c @@ -17,6 +17,7 @@ */ #ifdef S390X #include "elf.h" +#include <math.h> #include "defs.h" #include "netdump.h" @@ -283,6 +284,19 @@ static void s390x_process_elf_notes(void } /* + * The older s390x kernels use _MAX_PHYSMEM_BITS as 42 and the + * newer kernels use 46 bits. This function calculates the bits + * via a generic function. + */