[PATCH 0/3] makedumpfile/arm64: support flipped VA and 52-bit kernel VA

2021-01-28 Thread kazuhito . hagio
Pingfan, Bhupesh and arm64 folks,

Here is a modified series for arm64 5.4+ kernels.
I would like to proceed with this if there is no big issue, e.g. a regression.

changes from the RFC series:
- remove get_elf_loads() no longer used
- remove unnecessary 3/4 patch
- add get_kvbase_arm64() change
- add commit messages

Kazuhito Hagio (3):
  Use vmcoreinfo note in /proc/kcore for --mem-usage option
  arm64: Make use of NUMBER(VA_BITS) in vmcoreinfo
  arm64: support flipped VA and 52-bit kernel VA

 arch/arm64.c   | 159 +
 elf_info.c |  49 --
 elf_info.h |   1 -
 makedumpfile.c |  28 --
 makedumpfile.h |   1 +
 5 files changed, 149 insertions(+), 89 deletions(-)

-- 
2.9.3


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 2/3] arm64: Make use of NUMBER(VA_BITS) in vmcoreinfo

2021-01-28 Thread kazuhito . hagio
From: Kazuhito Hagio 

Make use of the NUMBER(VA_BITS) in vmcoreinfo, which was added by
kernel commit 20a166243328 (Linux 4.12 and later kernels), as the
current way of guessing VA_BITS does not work on Linux 5.4 and
later kernels.

Signed-off-by: Bhupesh Sharma 
Signed-off-by: Kazuhito Hagio 
---
 arch/arm64.c | 63 
 1 file changed, 42 insertions(+), 21 deletions(-)

diff --git a/arch/arm64.c b/arch/arm64.c
index 3d7b416..2916b4f 100644
--- a/arch/arm64.c
+++ b/arch/arm64.c
@@ -345,6 +345,43 @@ get_stext_symbol(void)
return(found ? kallsym : FALSE);
 }
 
+static int
+get_va_bits_from_stext_arm64(void)
+{
+   ulong _stext;
+
+   _stext = get_stext_symbol();
+   if (!_stext) {
+   ERRMSG("Can't get the symbol of _stext.\n");
+   return FALSE;
+   }
+
+   /*
+* Derive va_bits as per arch/arm64/Kconfig. Note that this is a
+* best case approximation at the moment, as there can be
+* inconsistencies in this calculation (for e.g., for 52-bit
+* kernel VA case, the 48th bit is set in the _stext symbol).
+*/
+   if ((_stext & PAGE_OFFSET_48) == PAGE_OFFSET_48) {
+   va_bits = 48;
+   } else if ((_stext & PAGE_OFFSET_47) == PAGE_OFFSET_47) {
+   va_bits = 47;
+   } else if ((_stext & PAGE_OFFSET_42) == PAGE_OFFSET_42) {
+   va_bits = 42;
+   } else if ((_stext & PAGE_OFFSET_39) == PAGE_OFFSET_39) {
+   va_bits = 39;
+   } else if ((_stext & PAGE_OFFSET_36) == PAGE_OFFSET_36) {
+   va_bits = 36;
+   } else {
+   ERRMSG("Cannot find a proper _stext for calculating VA_BITS\n");
+   return FALSE;
+   }
+
+   DEBUG_MSG("va_bits   : %d (guess from _stext)\n", va_bits);
+
+   return TRUE;
+}
+
 int
 get_machdep_info_arm64(void)
 {
@@ -398,27 +435,11 @@ get_xen_info_arm64(void)
 int
 get_versiondep_info_arm64(void)
 {
-   ulong _stext;
-
-   _stext = get_stext_symbol();
-   if (!_stext) {
-   ERRMSG("Can't get the symbol of _stext.\n");
-   return FALSE;
-   }
-
-   /* Derive va_bits as per arch/arm64/Kconfig */
-   if ((_stext & PAGE_OFFSET_36) == PAGE_OFFSET_36) {
-   va_bits = 36;
-   } else if ((_stext & PAGE_OFFSET_39) == PAGE_OFFSET_39) {
-   va_bits = 39;
-   } else if ((_stext & PAGE_OFFSET_42) == PAGE_OFFSET_42) {
-   va_bits = 42;
-   } else if ((_stext & PAGE_OFFSET_47) == PAGE_OFFSET_47) {
-   va_bits = 47;
-   } else if ((_stext & PAGE_OFFSET_48) == PAGE_OFFSET_48) {
-   va_bits = 48;
-   } else {
-   ERRMSG("Cannot find a proper _stext for calculating VA_BITS\n");
+   if (NUMBER(VA_BITS) != NOT_FOUND_NUMBER) {
+   va_bits = NUMBER(VA_BITS);
+   DEBUG_MSG("va_bits  : %d (vmcoreinfo)\n", va_bits);
+   } else if (get_va_bits_from_stext_arm64() == FALSE) {
+   ERRMSG("Can't determine va_bits.\n");
return FALSE;
}
 
-- 
2.9.3


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH 3/3] arm64: support flipped VA and 52-bit kernel VA

2021-01-28 Thread kazuhito . hagio
From: Kazuhito Hagio 

Linux 5.4 and later kernels for arm64 changed the kernel VA space
arrangement and introduced 52-bit kernel VAs by merging branch
commit b333b0ba2346.  Support 5.9+ kernels with vmcoreinfo entries
and 5.4+ kernels with best guessing.

However, the following conditions are not supported for now due to
no necessary information provided from kernel:
(1) 5.4 <= kernels <= 5.8 and
- if PA_BITS=52 && VA_BITS!=52
- with -x option if vabits_actual=52
(2) kernels < 5.4 with CONFIG_ARM64_USER_VA_BITS_52=y

(1) should be supported with kernel commit bbdbc11804ff and
1d50e5d0c5052 adding necessary information to vmcoreinfo.

Signed-off-by: Bhupesh Sharma 
Signed-off-by: Kazuhito Hagio 
---
 arch/arm64.c   | 100 -
 makedumpfile.c |   2 ++
 makedumpfile.h |   1 +
 3 files changed, 88 insertions(+), 15 deletions(-)

diff --git a/arch/arm64.c b/arch/arm64.c
index 2916b4f..0bb7230 100644
--- a/arch/arm64.c
+++ b/arch/arm64.c
@@ -47,6 +47,8 @@ typedef struct {
 static int lpa_52_bit_support_available;
 static int pgtable_level;
 static int va_bits;
+static int vabits_actual;
+static int flipped_va;
 static unsigned long kimage_voffset;
 
 #define SZ_4K  4096
@@ -58,7 +60,6 @@ static unsigned long kimage_voffset;
 #define PAGE_OFFSET_42 ((0xffffffffffffffffUL) << 42)
 #define PAGE_OFFSET_47 ((0xffffffffffffffffUL) << 47)
 #define PAGE_OFFSET_48 ((0xffffffffffffffffUL) << 48)
-#define PAGE_OFFSET_52 ((0xffffffffffffffffUL) << 52)
 
 #define pgd_val(x) ((x).pgd)
 #define pud_val(x) (pgd_val((x).pgd))
@@ -218,12 +219,20 @@ pmd_page_paddr(pmd_t pmd)
 #define pte_index(vaddr)   (((vaddr) >> PAGESHIFT()) & 
(PTRS_PER_PTE - 1))
 #define pte_offset(dir, vaddr) (pmd_page_paddr((*dir)) + 
pte_index(vaddr) * sizeof(pte_t))
 
+/*
+ * The linear kernel range starts at the bottom of the virtual address
+ * space. Testing the top bit for the start of the region is a
+ * sufficient check and avoids having to worry about the tag.
+ */
+#define is_linear_addr(addr)   (flipped_va ?   \
+   (!((unsigned long)(addr) & (1UL << (vabits_actual - 1)))) : \
+   (!!((unsigned long)(addr) & (1UL << (vabits_actual - 1)))))
+
 static unsigned long long
 __pa(unsigned long vaddr)
 {
-   if (kimage_voffset == NOT_FOUND_NUMBER ||
-   (vaddr >= PAGE_OFFSET))
-   return (vaddr - PAGE_OFFSET + info->phys_base);
+   if (kimage_voffset == NOT_FOUND_NUMBER || is_linear_addr(vaddr))
+   return ((vaddr & ~PAGE_OFFSET) + info->phys_base);
else
return (vaddr - kimage_voffset);
 }
@@ -253,6 +262,7 @@ static int calculate_plat_config(void)
(PAGESIZE() == SZ_64K && va_bits == 42)) {
pgtable_level = 2;
} else if ((PAGESIZE() == SZ_64K && va_bits == 48) ||
+   (PAGESIZE() == SZ_64K && va_bits == 52) ||
(PAGESIZE() == SZ_4K && va_bits == 39) ||
(PAGESIZE() == SZ_16K && va_bits == 47)) {
pgtable_level = 3;
@@ -263,6 +273,7 @@ static int calculate_plat_config(void)
PAGESIZE(), va_bits);
return FALSE;
}
+   DEBUG_MSG("pgtable_level: %d\n", pgtable_level);
 
return TRUE;
 }
@@ -270,6 +281,9 @@ static int calculate_plat_config(void)
 unsigned long
 get_kvbase_arm64(void)
 {
+   if (flipped_va)
+   return PAGE_OFFSET;
+
return (0xffffffffffffffffUL << va_bits);
 }
 
@@ -382,22 +396,54 @@ get_va_bits_from_stext_arm64(void)
return TRUE;
 }
 
+static void
+get_page_offset_arm64(void)
+{
+   ulong page_end;
+   int vabits_min;
+
+   /*
+* See arch/arm64/include/asm/memory.h for more details of
+* the PAGE_OFFSET calculation.
+*/
+   vabits_min = (va_bits > 48) ? 48 : va_bits;
+   page_end = -(1UL << (vabits_min - 1));
+
+   if (SYMBOL(_stext) > page_end) {
+   flipped_va = TRUE;
+   info->page_offset = -(1UL << vabits_actual);
+   } else {
+   flipped_va = FALSE;
+   info->page_offset = -(1UL << (vabits_actual - 1));
+   }
+
+   DEBUG_MSG("page_offset   : %lx (from page_end check)\n",
+   info->page_offset);
+}
+
 int
 get_machdep_info_arm64(void)
 {
+   /* Check if va_bits is still not initialized. If still 0, call
+* get_versiondep_info() to initialize the same.
+*/
+   if (!va_bits)
+   get_versiondep_info_arm64();
+
/* Determine if the PA address range is 52-bits: ARMv8.2-LPA */
if (NUMBER(MAX_PHYSMEM_BITS) != NOT_FOUND_NUMBER) {
 

[PATCH 1/3] Use vmcoreinfo note in /proc/kcore for --mem-usage option

2021-01-28 Thread kazuhito . hagio
From: Kazuhito Hagio 

kernel commit 23c85094fe18 added vmcoreinfo note to /proc/kcore.
Use the vmcoreinfo note to get necessary information, especially
page_offset and phys_base on arm64, for the --mem-usage option.

Signed-off-by: Kazuhito Hagio 
---
 elf_info.c | 49 -
 elf_info.h |  1 -
 makedumpfile.c | 26 +-
 3 files changed, 21 insertions(+), 55 deletions(-)

diff --git a/elf_info.c b/elf_info.c
index a6624b5..e8affb7 100644
--- a/elf_info.c
+++ b/elf_info.c
@@ -698,55 +698,6 @@ get_elf32_ehdr(int fd, char *filename, Elf32_Ehdr *ehdr)
return TRUE;
 }
 
-int
-get_elf_loads(int fd, char *filename)
-{
-   int i, j, phnum, elf_format;
-   Elf64_Phdr phdr;
-
-   /*
-* Check ELF64 or ELF32.
-*/
-   elf_format = check_elf_format(fd, filename, &phnum, &num_pt_loads);
-   if (elf_format == ELF64)
-   flags_memory |= MEMORY_ELF64;
-   else if (elf_format != ELF32)
-   return FALSE;
-
-   if (!num_pt_loads) {
-   ERRMSG("Can't get the number of PT_LOAD.\n");
-   return FALSE;
-   }
-
-   /*
-* The below file information will be used as /proc/vmcore.
-*/
-   fd_memory   = fd;
-   name_memory = filename;
-
-   pt_loads = calloc(sizeof(struct pt_load_segment), num_pt_loads);
-   if (pt_loads == NULL) {
-   ERRMSG("Can't allocate memory for the PT_LOAD. %s\n",
-   strerror(errno));
-   return FALSE;
-   }
-   for (i = 0, j = 0; i < phnum; i++) {
-   if (!get_phdr_memory(i, ))
-   return FALSE;
-
-   if (phdr.p_type != PT_LOAD)
-   continue;
-
-   if (j >= num_pt_loads)
-   return FALSE;
-   if (!dump_Elf_load(, j))
-   return FALSE;
-   j++;
-   }
-
-   return TRUE;
-}
-
 static int exclude_segment(struct pt_load_segment **pt_loads,
   unsigned int *num_pt_loads, uint64_t start, uint64_t 
end)
 {
diff --git a/elf_info.h b/elf_info.h
index d9b5d05..d5416b3 100644
--- a/elf_info.h
+++ b/elf_info.h
@@ -44,7 +44,6 @@ int get_elf64_ehdr(int fd, char *filename, Elf64_Ehdr *ehdr);
 int get_elf32_ehdr(int fd, char *filename, Elf32_Ehdr *ehdr);
 int get_elf_info(int fd, char *filename);
 void free_elf_info(void);
-int get_elf_loads(int fd, char *filename);
 int set_kcore_vmcoreinfo(uint64_t vmcoreinfo_addr, uint64_t vmcoreinfo_len);
 int get_kcore_dump_loads(void);
 
diff --git a/makedumpfile.c b/makedumpfile.c
index ba0003a..768eda4 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -11445,6 +11445,7 @@ int show_mem_usage(void)
 {
uint64_t vmcoreinfo_addr, vmcoreinfo_len;
struct cycle cycle = {0};
+   int vmcoreinfo = FALSE;
 
if (!is_crashkernel_mem_reserved()) {
ERRMSG("No memory is reserved for crashkernel!\n");
@@ -11456,9 +11457,22 @@ int show_mem_usage(void)
if (!open_files_for_creating_dumpfile())
return FALSE;
 
-   if (!get_elf_loads(info->fd_memory, info->name_memory))
+   if (!get_elf_info(info->fd_memory, info->name_memory))
return FALSE;
 
+   /*
+* /proc/kcore on Linux 4.19 and later kernels have vmcoreinfo note in
+* NOTE segment.  See commit 23c85094fe18.
+*/
+   if (has_vmcoreinfo()) {
+   off_t offset;
+   unsigned long size;
+
+   get_vmcoreinfo(&offset, &size);
+   vmcoreinfo = read_vmcoreinfo_from_vmcore(offset, size, FALSE);
+   DEBUG_MSG("Read vmcoreinfo from NOTE segment: %d\n", 
vmcoreinfo);
+   }
+
if (!get_page_offset())
return FALSE;
 
@@ -11466,11 +11480,13 @@ int show_mem_usage(void)
if (!get_phys_base())
return FALSE;
 
-   if (!get_sys_kernel_vmcoreinfo(&vmcoreinfo_addr, &vmcoreinfo_len))
-   return FALSE;
+   if (!vmcoreinfo) {
+   if (!get_sys_kernel_vmcoreinfo(&vmcoreinfo_addr, &vmcoreinfo_len))
+   return FALSE;
 
-   if (!set_kcore_vmcoreinfo(vmcoreinfo_addr, vmcoreinfo_len))
-   return FALSE;
+   if (!set_kcore_vmcoreinfo(vmcoreinfo_addr, vmcoreinfo_len))
+   return FALSE;
+   }
 
if (!initial())
return FALSE;
-- 
2.9.3


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[RFC PATCH 3/4] Get kernel version from OSRELEASE in vmcoreinfo

2021-01-14 Thread kazuhito . hagio
From: Kazuhito Hagio 

Signed-off-by: Kazuhito Hagio 
---
 makedumpfile.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/makedumpfile.c b/makedumpfile.c
index 199748b..fdfe437 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -1182,6 +1182,7 @@ check_release(void)
}
}
 
+   info->kernel_version = FALSE;
info->kernel_version = get_kernel_version(info->system_utsname.release);
if (info->kernel_version == FALSE) {
ERRMSG("Can't get the kernel version.\n");
@@ -2480,6 +2481,9 @@ read_vmcoreinfo_basic_info(void)
if (strlen(info->release))
continue;
strcpy(info->release, buf + strlen(STR_OSRELEASE));
+
+   if (!info->kernel_version)
+   info->kernel_version = 
get_kernel_version(info->release);
}
if (strncmp(buf, STR_PAGESIZE, strlen(STR_PAGESIZE)) == 0) {
page_size = strtol(buf+strlen(STR_PAGESIZE),&endp,10);
-- 
2.9.3


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[RFC PATCH 0/4] makedumpfile/arm64: support flipped VA and 52-bit kernel VA

2021-01-14 Thread kazuhito . hagio
Hi Bhupesh, Alexander, Pingfan and arm64 guys,

I've rewritten Bhupesh's v5 patch [1] in order to avoid some limitations and
now this patchset
- can be built on some old arm64 environments hopefully,
- can be built on x86_64 machines with TARGET=aarch64,
- can process vmcores on different version of kernel from captured version.

Could you test this?  The patchset is also on GitHub [2] and the commit
messages are to be written later.

However the following conditions are probably not supported:
- 5.11-rc is not taken into account
- 5.4 <= kernels <= 5.8 (*1)
  * if PA_BITS=52 && VA_BITS!=52
  * with -x option if vabits_actual=52
- kernels < 5.4 with CONFIG_ARM64_USER_VA_BITS_52=y (e.g. RHEL8 kernels)

(*1) supported with kernel commit bbdbc11804ff and 1d50e5d0c5052.

I think especially MAX_PHYSMEM_BITS cannot be guessed correctly only with
the entries in vmcoreinfo (even with Bhupesh's original patch), so basically
I would like distributions that uses 5.4 to 5.8 to backport Bhupesh's kernel
patches above if the patchset does not work well.

[1] http://lists.infradead.org/pipermail/kexec/2020-September/021336.html
[2] https://github.com/k-hagio/makedumpfile/commits/arm64.kh.test2

Kazuhito Hagio (4):
  Use ELF vmcoreinfo note for --mem-usage option
  arm64: use NUMBER(VA_BITS) in vmcoreinfo
  Get kernel version from OSRELEASE in vmcoreinfo
  arm64: support flipped VA and 52-bit kernel VA

 arch/arm64.c   | 155 -
 makedumpfile.c |  32 ++--
 makedumpfile.h |   1 +
 3 files changed, 149 insertions(+), 39 deletions(-)

-- 
2.9.3


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[RFC PATCH 4/4] arm64: support flipped VA and 52-bit kernel VA

2021-01-14 Thread kazuhito . hagio
From: Kazuhito Hagio 

Based on Bhupesh's patch and contains Pingfan's idea.

Signed-off-by: Bhupesh Sharma 
Signed-off-by: Kazuhito Hagio 
---
 arch/arm64.c   | 95 --
 makedumpfile.c |  2 ++
 makedumpfile.h |  1 +
 3 files changed, 83 insertions(+), 15 deletions(-)

diff --git a/arch/arm64.c b/arch/arm64.c
index 61ec89a..4ece19d 100644
--- a/arch/arm64.c
+++ b/arch/arm64.c
@@ -47,6 +47,8 @@ typedef struct {
 static int lpa_52_bit_support_available;
 static int pgtable_level;
 static int va_bits;
+static int vabits_actual;
+static int flipped_va;
 static unsigned long kimage_voffset;
 
 #define SZ_4K  4096
@@ -58,7 +60,6 @@ static unsigned long kimage_voffset;
 #define PAGE_OFFSET_42 ((0xffffffffffffffffUL) << 42)
 #define PAGE_OFFSET_47 ((0xffffffffffffffffUL) << 47)
 #define PAGE_OFFSET_48 ((0xffffffffffffffffUL) << 48)
-#define PAGE_OFFSET_52 ((0xffffffffffffffffUL) << 52)
 
 #define pgd_val(x) ((x).pgd)
 #define pud_val(x) (pgd_val((x).pgd))
@@ -218,12 +219,20 @@ pmd_page_paddr(pmd_t pmd)
 #define pte_index(vaddr)   (((vaddr) >> PAGESHIFT()) & 
(PTRS_PER_PTE - 1))
 #define pte_offset(dir, vaddr) (pmd_page_paddr((*dir)) + 
pte_index(vaddr) * sizeof(pte_t))
 
+/*
+ * The linear kernel range starts at the bottom of the virtual address
+ * space. Testing the top bit for the start of the region is a
+ * sufficient check and avoids having to worry about the tag.
+ */
+#define is_linear_addr(addr)   (flipped_va ?   \
+   (!((unsigned long)(addr) & (1UL << (vabits_actual - 1)))) : \
+   (!!((unsigned long)(addr) & (1UL << (vabits_actual - 1)))))
+
 static unsigned long long
 __pa(unsigned long vaddr)
 {
-   if (kimage_voffset == NOT_FOUND_NUMBER ||
-   (vaddr >= PAGE_OFFSET))
-   return (vaddr - PAGE_OFFSET + info->phys_base);
+   if (kimage_voffset == NOT_FOUND_NUMBER || is_linear_addr(vaddr))
+   return ((vaddr & ~PAGE_OFFSET) + info->phys_base);
else
return (vaddr - kimage_voffset);
 }
@@ -253,6 +262,7 @@ static int calculate_plat_config(void)
(PAGESIZE() == SZ_64K && va_bits == 42)) {
pgtable_level = 2;
} else if ((PAGESIZE() == SZ_64K && va_bits == 48) ||
+   (PAGESIZE() == SZ_64K && va_bits == 52) ||
(PAGESIZE() == SZ_4K && va_bits == 39) ||
(PAGESIZE() == SZ_16K && va_bits == 47)) {
pgtable_level = 3;
@@ -263,6 +273,7 @@ static int calculate_plat_config(void)
PAGESIZE(), va_bits);
return FALSE;
}
+   DEBUG_MSG("pgtable_level: %d\n", pgtable_level);
 
return TRUE;
 }
@@ -383,22 +394,54 @@ get_va_bits_from_stext_arm64(void)
return TRUE;
 }
 
+static void
+get_page_offset_arm64(void)
+{
+   ulong page_end;
+   int vabits_min;
+
+   /*
+* See arch/arm64/include/asm/memory.h for more details of
+* the PAGE_OFFSET calculation.
+*/
+   vabits_min = (va_bits > 48) ? 48 : va_bits;
+   page_end = -(1UL << (vabits_min - 1));
+
+   if (SYMBOL(_stext) > page_end) {
+   flipped_va = TRUE;
+   info->page_offset = -(1UL << vabits_actual);
+   } else {
+   flipped_va = FALSE;
+   info->page_offset = -(1UL << (vabits_actual - 1));
+   }
+
+   DEBUG_MSG("page_offset   : %lx (from page_end check)\n",
+   info->page_offset);
+}
+
 int
 get_machdep_info_arm64(void)
 {
+   /* Check if va_bits is still not initialized. If still 0, call
+* get_versiondep_info() to initialize the same.
+*/
+   if (!va_bits)
+   get_versiondep_info_arm64();
+
/* Determine if the PA address range is 52-bits: ARMv8.2-LPA */
if (NUMBER(MAX_PHYSMEM_BITS) != NOT_FOUND_NUMBER) {
info->max_physmem_bits = NUMBER(MAX_PHYSMEM_BITS);
+   DEBUG_MSG("max_physmem_bits : %ld (vmcoreinfo)\n", 
info->max_physmem_bits);
if (info->max_physmem_bits == 52)
lpa_52_bit_support_available = 1;
-   } else
-   info->max_physmem_bits = 48;
+   } else {
+   if (va_bits == 52)
+   info->max_physmem_bits = 52; /* just guess */
+   else
+   info->max_physmem_bits = 48;
 
-   /* Check if va_bits is still not initialized. If still 0, call
-* get_versiondep_info() to initialize the same.
-*/
-   if (!va_bits)
-   get_versiondep_info_arm64();
+   DEBUG_MSG("max_physmem_bits : %ld (guess)\n", info->max_physmem_bits);

[RFC PATCH 2/4] arm64: use NUMBER(VA_BITS) in vmcoreinfo

2021-01-14 Thread kazuhito . hagio
From: Kazuhito Hagio 

Signed-off-by: Bhupesh Sharma 
Signed-off-by: Kazuhito Hagio 
---
 arch/arm64.c | 64 
 1 file changed, 43 insertions(+), 21 deletions(-)

diff --git a/arch/arm64.c b/arch/arm64.c
index 3d7b416..61ec89a 100644
--- a/arch/arm64.c
+++ b/arch/arm64.c
@@ -345,6 +345,44 @@ get_stext_symbol(void)
return(found ? kallsym : FALSE);
 }
 
+static int
+get_va_bits_from_stext_arm64(void)
+{
+   ulong _stext;
+
+   _stext = get_stext_symbol();
+   if (!_stext) {
+   ERRMSG("Can't get the symbol of _stext.\n");
+   return FALSE;
+   }
+
+   /*
+* Derive va_bits as per arch/arm64/Kconfig. Note that this is a
+* best case approximation at the moment, as there can be
+* inconsistencies in this calculation (for e.g., for
+* 52-bit kernel VA case, the 48th bit is set in
+* the _stext symbol).
+*/
+   if ((_stext & PAGE_OFFSET_48) == PAGE_OFFSET_48) {
+   va_bits = 48;
+   } else if ((_stext & PAGE_OFFSET_47) == PAGE_OFFSET_47) {
+   va_bits = 47;
+   } else if ((_stext & PAGE_OFFSET_42) == PAGE_OFFSET_42) {
+   va_bits = 42;
+   } else if ((_stext & PAGE_OFFSET_39) == PAGE_OFFSET_39) {
+   va_bits = 39;
+   } else if ((_stext & PAGE_OFFSET_36) == PAGE_OFFSET_36) {
+   va_bits = 36;
+   } else {
+   ERRMSG("Cannot find a proper _stext for calculating VA_BITS\n");
+   return FALSE;
+   }
+
+   DEBUG_MSG("va_bits   : %d (approximation via _stext)\n", va_bits);
+
+   return TRUE;
+}
+
 int
 get_machdep_info_arm64(void)
 {
@@ -398,27 +436,11 @@ get_xen_info_arm64(void)
 int
 get_versiondep_info_arm64(void)
 {
-   ulong _stext;
-
-   _stext = get_stext_symbol();
-   if (!_stext) {
-   ERRMSG("Can't get the symbol of _stext.\n");
-   return FALSE;
-   }
-
-   /* Derive va_bits as per arch/arm64/Kconfig */
-   if ((_stext & PAGE_OFFSET_36) == PAGE_OFFSET_36) {
-   va_bits = 36;
-   } else if ((_stext & PAGE_OFFSET_39) == PAGE_OFFSET_39) {
-   va_bits = 39;
-   } else if ((_stext & PAGE_OFFSET_42) == PAGE_OFFSET_42) {
-   va_bits = 42;
-   } else if ((_stext & PAGE_OFFSET_47) == PAGE_OFFSET_47) {
-   va_bits = 47;
-   } else if ((_stext & PAGE_OFFSET_48) == PAGE_OFFSET_48) {
-   va_bits = 48;
-   } else {
-   ERRMSG("Cannot find a proper _stext for calculating VA_BITS\n");
+   if (NUMBER(VA_BITS) != NOT_FOUND_NUMBER) {
+   va_bits = NUMBER(VA_BITS);
+   DEBUG_MSG("va_bits  : %d (vmcoreinfo)\n", va_bits);
+   } else if (get_va_bits_from_stext_arm64() == FALSE) {
+   ERRMSG("Can't determine va_bits.\n");
return FALSE;
}
 
-- 
2.9.3


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[RFC PATCH 1/4] Use ELF vmcoreinfo note for --mem-usage option

2021-01-14 Thread kazuhito . hagio
From: Kazuhito Hagio 

Signed-off-by: Kazuhito Hagio 
---
 makedumpfile.c | 26 +-
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/makedumpfile.c b/makedumpfile.c
index ecd63fa..199748b 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -11426,6 +11426,7 @@ int show_mem_usage(void)
 {
uint64_t vmcoreinfo_addr, vmcoreinfo_len;
struct cycle cycle = {0};
+   int vmcoreinfo = FALSE;
 
if (!is_crashkernel_mem_reserved()) {
ERRMSG("No memory is reserved for crashkernel!\n");
@@ -11437,9 +11438,22 @@ int show_mem_usage(void)
if (!open_files_for_creating_dumpfile())
return FALSE;
 
-   if (!get_elf_loads(info->fd_memory, info->name_memory))
+   if (!get_elf_info(info->fd_memory, info->name_memory))
return FALSE;
 
+   /*
+* /proc/kcore on Linux 4.19 and later kernels have vmcoreinfo note in
+* NOTE segment.  See commit 23c85094fe18.
+*/
+   if (has_vmcoreinfo()) {
+   off_t offset;
+   unsigned long size;
+
+   get_vmcoreinfo(&offset, &size);
+   vmcoreinfo = read_vmcoreinfo_from_vmcore(offset, size, FALSE);
+   DEBUG_MSG("Read vmcoreinfo from NOTE segment: %d\n", 
vmcoreinfo);
+   }
+
if (!get_page_offset())
return FALSE;
 
@@ -11447,11 +11461,13 @@ int show_mem_usage(void)
if (!get_phys_base())
return FALSE;
 
-   if (!get_sys_kernel_vmcoreinfo(&vmcoreinfo_addr, &vmcoreinfo_len))
-   return FALSE;
+   if (!vmcoreinfo) {
+   if (!get_sys_kernel_vmcoreinfo(&vmcoreinfo_addr, &vmcoreinfo_len))
+   return FALSE;
 
-   if (!set_kcore_vmcoreinfo(vmcoreinfo_addr, vmcoreinfo_len))
-   return FALSE;
+   if (!set_kcore_vmcoreinfo(vmcoreinfo_addr, vmcoreinfo_len))
+   return FALSE;
+   }
 
if (!initial())
return FALSE;
-- 
2.9.3


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: makedumpfile and tmpfs contents

2020-12-11 Thread Kazuhito Hagio
Hi Rajen,

On Fri, Dec 11, 2020 at 11:36 PM Rajendra Dendukuri
 wrote:
>
> Hi,
>
> Is there a possibility or an option to recover contents of a tmpfs by
> the crashkernel after the original kernel has kdump'ed. The idea is to
> store a few other system log files stored in a tmpfs similar to how
> dmesg logs are recovered. Any other ideas to solve this requirement?

I think it would be too hard for makedumpfile to do this, but if you can use
the crash utility to extract contents of a tmpfs from a vmcore, my "cacheutils"
crash extension module [1] might be useful, since tmpfs uses page cache.

[1] https://github.com/k-hagio/crash-cacheutils

Thanks,
Kazu

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


Re: [PATCH] sadump, kaslr: fix failure of calculating kaslr_offset due to an sadump format restriction

2020-07-10 Thread Kazuhito Hagio
On Thu, Jul 9, 2020 at 6:30 PM HATAYAMA Daisuke  wrote:
>
> We faced recently a memory dump collected by sadump where unused part
> of register values are non-zero. For the crash dump, calculating
> kaslr_offset fails because it is based on the assumption that unused
> part of register values in the sadump format are always zero cleared.
>
> The problem is that used and unused part of register values are
> rigorously indistinguishable in the sadump format. Although there is
> kernel data structure that represents a map between logical cpu
> numbers and lapic ids, they cannot be used in order to calculate
> kaslr_offset.
>
> To fix this, we have no choice but use a trial-and-error approach: try
> to use each entry of register values in order until we find a good
> pair of cr3 and idtr by which we can refer to linux_banner symbol as
> expected.
>
> Signed-off-by: HATAYAMA Daisuke 

I've replaced some indent spaces with tabs and merged:
https://github.com/makedumpfile/makedumpfile/commit/3c0cf7a93cff83f1e711e241eb47fcb096a451c5

Thanks,
Kazu

> ---
>  sadump_info.c | 140 
> --
>  1 file changed, 97 insertions(+), 43 deletions(-)
>
> diff --git a/sadump_info.c b/sadump_info.c
> index 46867ce..aa9a048 100644
> --- a/sadump_info.c
> +++ b/sadump_info.c
> @@ -101,6 +101,7 @@ static int lookup_diskset(unsigned long long 
> whole_offset, int *diskid,
>   unsigned long long *disk_offset);
>  static int max_mask_cpu(void);
>  static int cpu_online_mask_init(void);
> +static int linux_banner_sanity_check(ulong cr3);
>  static int per_cpu_init(void);
>  static int get_data_from_elf_note_desc(const char *note_buf, uint32_t 
> n_descsz,
>char *name, uint32_t n_type, char 
> **data);
> @@ -1293,6 +1294,30 @@ finish:
> return ret;
>  }
>
> +static int linux_banner_sanity_check(ulong cr3)
> +{
> +   unsigned long linux_banner_paddr;
> +   char buf[sizeof("Linux version")];
> +
> +   linux_banner_paddr = vtop4_x86_64_pagetable(SYMBOL(linux_banner), 
> cr3);
> +   if (linux_banner_paddr == NOT_PADDR) {
> +   DEBUG_MSG("sadump: linux_banner address translation 
> failed\n");
> +   return FALSE;
> +   }
> +
> +   if (!readmem(PADDR, linux_banner_paddr, &buf, sizeof(buf))) {
> +   DEBUG_MSG("sadump: reading linux_banner failed\n");
> +   return FALSE;
> +   }
> +
> +   if (!STRNEQ(buf, "Linux version")) {
> +   DEBUG_MSG("sadump: linux_banner sanity check failed\n");
> +   return FALSE;
> +   }
> +
> +   return TRUE;
> +}
> +
>  /*
>   * Calculate kaslr_offset and phys_base
>   *
> @@ -1370,59 +1395,86 @@ calc_kaslr_offset(void)
>  {
> struct sadump_header *sh = si->sh_memory;
> uint64_t idtr = 0, cr3 = 0, idtr_paddr;
> -   struct sadump_smram_cpu_state smram, zero;
> +   struct sadump_smram_cpu_state smram;
> int apicid;
> unsigned long divide_error_vmcore, divide_error_vmlinux;
> unsigned long kaslr_offset, phys_base;
> unsigned long kaslr_offset_kdump, phys_base_kdump;
> +   int sanity_check_passed = FALSE;
>
> -   memset(, 0, sizeof(zero));
> for (apicid = 0; apicid < sh->nr_cpus; ++apicid) {
> -   if (!get_smram_cpu_state(apicid, )) {
> -   ERRMSG("get_smram_cpu_state error\n");
> +
> +DEBUG_MSG("sadump: apicid: %d\n", apicid);
> +
> +if (!get_smram_cpu_state(apicid, )) {
> +ERRMSG("get_smram_cpu_state error\n");
> +return FALSE;
> +}
> +
> +idtr = 
> ((uint64_t)smram.IdtUpper)<<32|(uint64_t)smram.IdtLower;
> +
> +if (!smram.Cr3 || !idtr) {
> +DEBUG_MSG("sadump: cr3: %lx idt: %lx, skipped\n",
> +  smram.Cr3,
> +  idtr);
> +continue;
> +}
> +
> +if ((SYMBOL(pti_init) != NOT_FOUND_SYMBOL) ||
> +(SYMBOL(kaiser_init) != NOT_FOUND_SYMBOL))
> +cr3 = smram.Cr3 & 
> ~(CR3_PCID_MASK|PTI_USER_PGTABLE_MASK);
> +else
> +cr3 = smram.Cr3 & ~CR3_PCID_MASK;
> +
> +/* Convert virtual address of IDT table to physical address 
> */
> +   idtr_paddr = vtop4_x86_64_pagetable(idtr, cr3);
> +if (idtr_paddr == NOT_PADDR) {
> +DEBUG_MSG("sadump: converting IDT physical address "
> + "failed.\n");
> +continue;
> +}
> +
> +   /* Now we can calculate kaslr_offset and phys_base */
> +   divide_error_vmlinux = SYMBOL(divide_error);
> +   divide_error_vmcore = get_vec0_addr(idtr_paddr);
> + 

RE: [PATCH] makedumpfile/s390: Add get_kaslr_offset() for s390x

2019-12-17 Thread Kazuhito Hagio
Hi Mikhail,

> -Original Message-
> Hi,
> 
> On 12.12.2019 17:12, Kazuhito Hagio wrote:
> > Hi Mikhail,
> >
> >> -Original Message-
> >> Hello Kazu,
> >>
> >> I think we can try to generalize the kaslr offset extraction.
> >> I won't speak for other architectures, but for s390 that 
> >> get_kaslr_offset_arm64()
> >> should work fine. The only concern of mine is this TODO statement:
> >>
> >> if (_text <= vaddr && vaddr <= _end) {
> >>DEBUG_MSG("info->kaslr_offset: %lx\n", info->kaslr_offset);
> >>return info->kaslr_offset;
> >>} else {
> >>/*
> >>* TODO: we need to check if it is vmalloc/vmmemmap/module
> >>* address, we will have different offset
> >>*/
> >>return 0;
> >> }
> >>
> >> Could you explain this one?
> >
> > Probably it was considered that the check would be needed to support
> > the whole KASLR behavior when get_kaslr_offset_x86_64() was written
> > originally.
> >
> > But in the current makedumpfile for x86_64 and arm64 supporting KASLR,
> > the offset we need is the one for symbol addresses in vmlinux only.
> > As I said below, module symbol addresses are retrieved from vmcore.
> > Other addresses should not be passed to the function for now, as far
> > as I know.
> >
> > So I think the TODO comment is confusing, and it would be better to
> > remove it or change it to something like:
> > /*
> >  * Returns 0 if vaddr does not need the offset to be added,
> >  * e.g. for module address.
> >  */
> >
> > But if s390 uses get_kaslr_offset() in its arch-specific code to
> > adjust addresses other than kernel text address, we might need to
> > modify it for s390, not generalize it.
> 
> Currently, s390 doesn't use get_kaslr_offset() in its arch-specific
> code.

OK, I pushed a patch that generalizes it to my test repository.
Could you enable s390 to use it and test?
https://github.com/k-hagio/makedumpfile/tree/add-get_kaslr_offset_general

Thanks,
Kazu

> 
> >
> > Thanks,
> > Kazu
> >
> >>
> >> Thanks,
> >> Mikhail
> >>
> >> On 09.12.2019 23:02, Kazuhito Hagio wrote:
> >>> Hi Mikhail,
> >>>
> >>> Sorry for late reply.
> >>>
> >>>> -Original Message-
> >>>> Since kernel v5.2 KASLR is supported on s390. In makedumpfile however no
> >>>> support has been added yet. This patch adds the arch specific function
> >>>> get_kaslr_offset() for s390x.
> >>>> Since the values in vmcoreinfo are already relocated, the patch is
> >>>> mainly relevant for vmlinux processing (-x option).
> >>>
> >>> In the current implementation of makedumpfile, the get_kaslr_offset(vaddr)
> >>> is supposed to return the KASLR offset only when the offset is needed to
> >>> add to the vaddr.  So generally symbols from kernel (vmlinux) need it, but
> >>> symbols from modules are resolved dynamically and don't need the offset.
> >> \>
> >>> This patch always returns the offset if any, as a result, I guess this 
> >>> patch
> >>> will not work as expected with module symbols in filter config file.
> >>>
> >>> So... How about making get_kaslr_offset_arm64() general for other archs
> >>> (get_kaslr_offset_general() or something), then using it also for s390?
> >>> If OK, I can do that generalization.
> >>>
> >>> Thanks,
> >>> Kazu
> >>>
> >>>>
> >>>> Signed-off-by: Philipp Rudo 
> >>>> Signed-off-by: Mikhail Zaslonko 
> >>>> ---
> >>>>  arch/s390x.c   | 32 
> >>>>  makedumpfile.h |  3 ++-
> >>>>  2 files changed, 34 insertions(+), 1 deletion(-)
> >>>>
> >>>> diff --git a/arch/s390x.c b/arch/s390x.c
> >>>> index bf9d58e..892df14 100644
> >>>> --- a/arch/s390x.c
> >>>> +++ b/arch/s390x.c
> >>>> @@ -122,6 +122,38 @@ get_machdep_info_s390x(void)
> >>>>  return TRUE;
> >>>>  }
> >>>>
> >>>> +unsigned long
> >>>> +get_kaslr_offset_s390x(unsigned long vaddr)
> >>>> +{
> >>>> +unsigned int i;
> >>>> +char buf[BUFSIZE_FGETS], *endp;
> >>>&

RE: [PATCH] makedumpfile/s390: Add get_kaslr_offset() for s390x

2019-12-12 Thread Kazuhito Hagio
Hi Mikhail,

> -Original Message-
> Hello Kazu,
> 
> I think we can try to generalize the kaslr offset extraction.
> I won't speak for other architectures, but for s390 that 
> get_kaslr_offset_arm64()
> should work fine. The only concern of mine is this TODO statement:
> 
> if (_text <= vaddr && vaddr <= _end) {
>   DEBUG_MSG("info->kaslr_offset: %lx\n", info->kaslr_offset);
>   return info->kaslr_offset;
>   } else {
>   /*
>   * TODO: we need to check if it is vmalloc/vmmemmap/module
>   * address, we will have different offset
>   */
>   return 0;
> }
> 
> Could you explain this one?

Probably it was considered that the check would be needed to support
the whole KASLR behavior when get_kaslr_offset_x86_64() was written
originally.

But in the current makedumpfile for x86_64 and arm64 supporting KASLR,
the offset we need is the one for symbol addresses in vmlinux only.
As I said below, module symbol addresses are retrieved from vmcore.
Other addresses should not be passed to the function for now, as far
as I know.

So I think the TODO comment is confusing, and it would be better to
remove it or change it to something like:
/*
 * Returns 0 if vaddr does not need the offset to be added,
 * e.g. for module address.
 */

But if s390 uses get_kaslr_offset() in its arch-specific code to
adjust addresses other than kernel text address, we might need to
modify it for s390, not generalize it.

Thanks,
Kazu

> 
> Thanks,
> Mikhail
> 
> On 09.12.2019 23:02, Kazuhito Hagio wrote:
> > Hi Mikhail,
> >
> > Sorry for late reply.
> >
> >> -Original Message-
> >> Since kernel v5.2 KASLR is supported on s390. In makedumpfile however no
> >> support has been added yet. This patch adds the arch specific function
> >> get_kaslr_offset() for s390x.
> >> Since the values in vmcoreinfo are already relocated, the patch is
> >> mainly relevant for vmlinux processing (-x option).
> >
> > In the current implementation of makedumpfile, the get_kaslr_offset(vaddr)
> > is supposed to return the KASLR offset only when the offset is needed to
> > add to the vaddr.  So generally symbols from kernel (vmlinux) need it, but
> > symbols from modules are resolved dynamically and don't need the offset.
> >
> > This patch always returns the offset if any, as a result, I guess this patch
> > will not work as expected with module symbols in filter config file.
> >
> > So... How about making get_kaslr_offset_arm64() general for other archs
> > (get_kaslr_offset_general() or something), then using it also for s390?
> > If OK, I can do that generalization.
> >
> > Thanks,
> > Kazu
> >
> >>
> >> Signed-off-by: Philipp Rudo 
> >> Signed-off-by: Mikhail Zaslonko 
> >> ---
> >>  arch/s390x.c   | 32 
> >>  makedumpfile.h |  3 ++-
> >>  2 files changed, 34 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/arch/s390x.c b/arch/s390x.c
> >> index bf9d58e..892df14 100644
> >> --- a/arch/s390x.c
> >> +++ b/arch/s390x.c
> >> @@ -122,6 +122,38 @@ get_machdep_info_s390x(void)
> >>return TRUE;
> >>  }
> >>
> >> +unsigned long
> >> +get_kaslr_offset_s390x(unsigned long vaddr)
> >> +{
> >> +  unsigned int i;
> >> +  char buf[BUFSIZE_FGETS], *endp;
> >> +
> >> +  if (!info->file_vmcoreinfo)
> >> +  return FALSE;
> >> +
> >> +  if (fseek(info->file_vmcoreinfo, 0, SEEK_SET) < 0) {
> >> +  ERRMSG("Can't seek the vmcoreinfo file(%s). %s\n",
> >> + info->name_vmcoreinfo, strerror(errno));
> >> +  return FALSE;
> >> +  }
> >> +
> >> +  while (fgets(buf, BUFSIZE_FGETS, info->file_vmcoreinfo)) {
> >> +  i = strlen(buf);
> >> +  if (!i)
> >> +  break;
> >> +  if (buf[i - 1] == '\n')
> >> +  buf[i - 1] = '\0';
> >> +  if (strncmp(buf, STR_KERNELOFFSET,
> >> +  strlen(STR_KERNELOFFSET)) == 0) {
> >> +  info->kaslr_offset =
> >> +  strtoul(buf + strlen(STR_KERNELOFFSET), &endp, 
> >> 16);
> >> +  DEBUG_MSG("info->kaslr_offset: %lx\n", 
> >> info->kaslr_offset);
> >> +  }
> >> +  }
> >> +
> >> +  return info->kaslr_offset;
> >> +}
> >> +
> >&

RE: [PATCH] makedumpfile: assign bitmap2 fd for sub process during refiltering

2019-12-11 Thread Kazuhito Hagio
Hi Pingfan,

> -Original Message-
> > Reading the code, I think
> > - the issue might occur not only in refiltering, but also the first 
> > filtering
> >   with --split and --work-dir option (forced non-cyclic mode).
> > - pefer to gather things for --split option into 
> > writeout_multiple_dumpfiles()
> >   if we can, for readability.
> Yes, all of the cases suffer from sharing fd across processes
> >
> > So does the following patch work for you and your test?
> > I could not have reproduced the issue yet.
> I tried to fetch a machine to test. It pass 50 times test with your
> patch. While if without this patch, it failed about 1 out of 4 times.

Good. Thank you for testing.

> > BTW, what do you see when the issue occurs? an error or broken dump?
> The test case is refiltering, "makedumpfile --split  -d 31
> /root/vmcore-p9b-21 dumpfile_{1,2,3} 2>&1"
> And it can not complete the dump.

Ah, I got it. I probably could reproduce the issue:
---
Excluding unnecessary pages   : [100.0 %] \
readpage_kdump_compressed: pfn(2fc1000) is excluded from vmcore.
readmem: type_addr: 1, addr:2fc100, size:4096
read_pfn: Can't get the page data.
...
writeout_multiple_dumpfiles: Child process(30993) finished incompletely.(256)

makedumpfile Failed.
---

Refiltering was the factor that makedumpfile failed, but I think this bug
also can break the dumpfile silently by excluding pages wrongly even in
first filtering.

I guess this bug was hard to debug. Thank you so much for working on this!
Modified your patch and pushed.
https://sourceforge.net/p/makedumpfile/code/ci/5519b3eba68544dc484d85e9540d440d93f8c924/

Thanks,
Kazu




___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH v4 0/4] makedumpfile/arm64: Add support for ARMv8.2 extensions

2019-12-10 Thread Kazuhito Hagio
> -Original Message-
> > -Original Message-
> > This is your makedumpfile pulled from sourceforge .
> >
> > It would be helpful if you bumped the VERSION and DATE to be certain we are 
> > using the correct pieces .
> 
> Good suggestion.
> 
> I wanted the command line that executed makedumpfile in debug message
> as well, so I'll think about adding them together.

Done.
https://sourceforge.net/p/makedumpfile/code/ci/180a3958c30d95cb1d8e8c341baaf267f7eaef89/

Thanks,
Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile/s390: Add get_kaslr_offset() for s390x

2019-12-09 Thread Kazuhito Hagio
Hi Mikhail,

Sorry for late reply.

> -Original Message-
> Since kernel v5.2 KASLR is supported on s390. In makedumpfile however no
> support has been added yet. This patch adds the arch specific function
> get_kaslr_offset() for s390x.
> Since the values in vmcoreinfo are already relocated, the patch is
> mainly relevant for vmlinux processing (-x option).

In the current implementation of makedumpfile, the get_kaslr_offset(vaddr)
is supposed to return the KASLR offset only when the offset is needed to
add to the vaddr.  So generally symbols from kernel (vmlinux) need it, but
symbols from modules are resolved dynamically and don't need the offset.

This patch always returns the offset if any, as a result, I guess this patch
will not work as expected with module symbols in filter config file.

So... How about making get_kaslr_offset_arm64() general for other archs
(get_kaslr_offset_general() or something), then using it also for s390?
If OK, I can do that generalization.

Thanks,
Kazu

> 
> Signed-off-by: Philipp Rudo 
> Signed-off-by: Mikhail Zaslonko 
> ---
>  arch/s390x.c   | 32 
>  makedumpfile.h |  3 ++-
>  2 files changed, 34 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/s390x.c b/arch/s390x.c
> index bf9d58e..892df14 100644
> --- a/arch/s390x.c
> +++ b/arch/s390x.c
> @@ -122,6 +122,38 @@ get_machdep_info_s390x(void)
>   return TRUE;
>  }
> 
> +unsigned long
> +get_kaslr_offset_s390x(unsigned long vaddr)
> +{
> + unsigned int i;
> + char buf[BUFSIZE_FGETS], *endp;
> +
> + if (!info->file_vmcoreinfo)
> + return FALSE;
> +
> + if (fseek(info->file_vmcoreinfo, 0, SEEK_SET) < 0) {
> + ERRMSG("Can't seek the vmcoreinfo file(%s). %s\n",
> +info->name_vmcoreinfo, strerror(errno));
> + return FALSE;
> + }
> +
> + while (fgets(buf, BUFSIZE_FGETS, info->file_vmcoreinfo)) {
> + i = strlen(buf);
> + if (!i)
> + break;
> + if (buf[i - 1] == '\n')
> + buf[i - 1] = '\0';
> + if (strncmp(buf, STR_KERNELOFFSET,
> + strlen(STR_KERNELOFFSET)) == 0) {
> + info->kaslr_offset =
> + strtoul(buf + strlen(STR_KERNELOFFSET), &endp, 
> 16);
> + DEBUG_MSG("info->kaslr_offset: %lx\n", 
> info->kaslr_offset);
> + }
> + }
> +
> + return info->kaslr_offset;
> +}
> +
>  static int
>  is_vmalloc_addr_s390x(unsigned long vaddr)
>  {
> diff --git a/makedumpfile.h b/makedumpfile.h
> index ac11e90..26f6247 100644
> --- a/makedumpfile.h
> +++ b/makedumpfile.h
> @@ -1071,11 +1071,12 @@ unsigned long long vaddr_to_paddr_ppc(unsigned long 
> vaddr);
>  int get_machdep_info_s390x(void);
>  unsigned long long vaddr_to_paddr_s390x(unsigned long vaddr);
>  int is_iomem_phys_addr_s390x(unsigned long addr);
> +unsigned long get_kaslr_offset_s390x(unsigned long vaddr);
>  #define find_vmemmap()   stub_false()
>  #define get_phys_base()  stub_true()
>  #define get_machdep_info()   get_machdep_info_s390x()
>  #define get_versiondep_info()stub_true()
> -#define get_kaslr_offset(X)  stub_false()
> +#define get_kaslr_offset(X)  get_kaslr_offset_s390x(X)
>  #define vaddr_to_paddr(X)vaddr_to_paddr_s390x(X)
>  #define paddr_to_vaddr(X)paddr_to_vaddr_general(X)
>  #define is_phys_addr(X)  is_iomem_phys_addr_s390x(X)
> --
> 2.17.1
> 



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile/Makefile: remove -lebl from LIBS

2019-12-09 Thread Kazuhito Hagio
> -Original Message-
> 
> On 12/07/2019 12:28 AM, Kazuhito Hagio wrote:
> >> -Original Message-
> >>
> >> On 12/05/2019 06:36 AM, Kazuhito Hagio wrote:
> >>> Hi Pingfan,
> >>>
> >>> Thank you for the patch.
> >>>
> >>>> -Original Message-
> >>>> since the following commit, -lebl has been removed from elfutils.
> >>>> commit b833c731359af12af9f16bcb621b3cdc170eafbc
> >>>> Author: Mark Wielaard 
> >>>> Date:   Thu Aug 29 23:34:11 2019 +0200
> >>>>
> >>>> libebl: Don't install libebl.a, libebl.h and remove backends from 
> >>>> spec.
> >>>>
> >>>> All archive members from libebl.a are now in libdw.a. We don't 
> >>>> generate
> >>>> separate backend shared libraries anymore. So remove them from the
> >>>> elfutils.spec file.
> >>>>
> >>>> Signed-off-by: Mark Wielaard 
> >>>>
> >>>> So remove it from LIBS for makedumpfile
> >>>
> >>> It seems that this is ok with the latest elfutils, but with older ones?
> >>> Is it possible to remove -lebl when elfutils does not have libebl.a?
> >> I have no idea about it for now. The method to check version depends on
> >> distribution. Is it doable by checking /usr/lib64/libebl ?
> >
> > We have 'try-run' function written by Petr in the Makefile, which checks
> > if clock_gettime() requies -lrt.  How about utilizing it like this?
> >
> > diff --git a/Makefile b/Makefile
> > index 1fdb6286e85d..d4d1fb563209 100644
> > --- a/Makefile
> > +++ b/Makefile
> > @@ -50,7 +50,7 @@ OBJ_PART=$(patsubst %.c,%.o,$(SRC_PART))
> >  SRC_ARCH = arch/arm.c arch/arm64.c arch/x86.c arch/x86_64.c arch/ia64.c 
> > arch/ppc64.c arch/s390x.c
> arch/ppc.c arch/sparc64.c
> >  OBJ_ARCH=$(patsubst %.c,%.o,$(SRC_ARCH))
> >
> > -LIBS = -ldw -lbz2 -lebl -ldl -lelf -lz
> > +LIBS = -ldw -lbz2 -ldl -lelf -lz
> >  ifneq ($(LINKTYPE), dynamic)
> >  LIBS := -static $(LIBS)
> >  endif
> > @@ -79,6 +79,11 @@ LINK_TEST_PROG="int clock_gettime(); int main(){ return 
> > clock_gettime(); }"
> >  LIBS := $(LIBS) $(call try-run,\
> > echo $(LINK_TEST_PROG) | $(CC) $(CFLAGS) -o "$$TMP" -x c -,,-lrt)
> >
> > +# elfutils-0.178 or later does not install libebl.a.
> > +LINK_TEST_PROG="int main() { return 0; }"
> > +LIBS := $(LIBS) $(call try-run,\
> > +   echo $(LINK_TEST_PROG) | $(CC) -o "$$TMP" -x c - -lebl,-lebl,)
> > +
> >  all: makedumpfile
> >
> >  $(OBJ_PART): $(SRC_PART)
> >
> >
> > If libebl.a does not exist (gcc with -lebl fails), it will not append
> > -lebl to LIBS.
> >
> Yes, it sounds a good idea.
> 
> Should I sumbit another patch or you will do by yourself?

Modified and applied.
https://sourceforge.net/p/makedumpfile/code/ci/71e798cb1b85e4879a19607ebb0a061cbc92f70f/

Thanks!
Kazu

> 
> Thanks,
> Pingfan
> 



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile: assign bitmap2 fd for sub process during refiltering

2019-12-09 Thread Kazuhito Hagio


> -Original Message-
> From: piliu 
> Sent: Monday, December 9, 2019 1:06 AM
> To: Hagio Kazuhito(萩尾 一仁) ; kexec@lists.infradead.org
> Subject: Re: [PATCH] makedumpfile: assign bitmap2 fd for sub process during 
> refiltering
> 
> 
> 
> On 12/07/2019 06:11 AM, Kazuhito Hagio wrote:
> > Hi Pingfan,
> >
> >> -Original Message-
> >> In refiltering mode, each sub process inherits bitmap2->fd from parent.
> >> Then they lseek()/read() on the same fd, which means that they interfere
> >> with each other.
> >>
> >> This breaks the purpose of SPLITTING_FD_BITMAP(i) for each sub process.
> >> Fix it by assigning a sub process dedicated fd to bitmap2->fd.
> >>
> >> Signed-off-by: Pingfan Liu 
> >
> > Thanks for the patch.
> > I'm still reading the code, but it might be better to apply this to 
> > bitmap1->fd
> > as well?  see you next week..
> Yes. Although during my test, bitmap1 is not touched, but it is a
> reasonable step to against any future bug.

Reading the code, I think
- the issue might occur not only in refiltering, but also the first filtering
  with --split and --work-dir option (forced non-cyclic mode).
- prefer to gather things for --split option into writeout_multiple_dumpfiles()
  if we can, for readability.

So does the following patch work for you and your test?
I could not have reproduced the issue yet.

diff --git a/makedumpfile.c b/makedumpfile.c
index b9e9dfbd45ba..674c6a00e2dd 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -10091,6 +10091,10 @@ writeout_multiple_dumpfiles(void)
info->split_start_pfn = SPLITTING_START_PFN(i);
info->split_end_pfn   = SPLITTING_END_PFN(i);
 
+   if (!info->flag_cyclic) {
+   info->bitmap1->fd = info->fd_bitmap;
+   info->bitmap2->fd = info->fd_bitmap;
+   }
if (!reopen_dump_memory())
exit(1);
if ((status = writeout_dumpfile()) == FALSE)


BTW, what do you see when the issue occurs? an error or broken dump?

Thanks,
Kazu

> 
> Thanks,
> Pingfan
> >
> > Thanks,
> > Kazu
> >
> >> ---
> >>  makedumpfile.c | 3 ++-
> >>  1 file changed, 2 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/makedumpfile.c b/makedumpfile.c
> >> index d76a435..1dc8640 100644
> >> --- a/makedumpfile.c
> >> +++ b/makedumpfile.c
> >> @@ -8857,7 +8857,8 @@ write_kdump_pages_and_bitmap_cyclic(struct 
> >> cache_data *cd_header, struct cache_d
> >>if (info->flag_cyclic) {
> >>if (!prepare_bitmap2_buffer())
> >>return FALSE;
> >> -  }
> >> +  } else if (info->flag_refiltering)
> >> +  info->bitmap2->fd = info->fd_bitmap;
> >>
> >>/*
> >> * Write pages and bitmap cyclically.
> >> --
> >> 2.7.5
> >>
> >
> >
> >
> > ___
> > kexec mailing list
> > kexec@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/kexec
> >
> 



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile: assign bitmap2 fd for sub process during refiltering

2019-12-06 Thread Kazuhito Hagio
Hi Pingfan,

> -Original Message-
> In refiltering mode, each sub process inherits bitmap2->fd from parent.
> Then they lseek()/read() on the same fd, which means that they interfere
> with each other.
> 
> This breaks the purpose of SPLITTING_FD_BITMAP(i) for each sub process.
> Fix it by assigning a sub process dedicated fd to bitmap2->fd.
> 
> Signed-off-by: Pingfan Liu 

Thanks for the patch.
I'm still reading the code, but it might be better to apply this to bitmap1->fd
as well?  see you next week..

Thanks,
Kazu

> ---
>  makedumpfile.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/makedumpfile.c b/makedumpfile.c
> index d76a435..1dc8640 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -8857,7 +8857,8 @@ write_kdump_pages_and_bitmap_cyclic(struct cache_data 
> *cd_header, struct cache_d
>   if (info->flag_cyclic) {
>   if (!prepare_bitmap2_buffer())
>   return FALSE;
> - }
> + } else if (info->flag_refiltering)
> + info->bitmap2->fd = info->fd_bitmap;
> 
>   /*
>* Write pages and bitmap cyclically.
> --
> 2.7.5
> 



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile/Makefile: remove -lebl from LIBS

2019-12-06 Thread Kazuhito Hagio
> -Original Message-
> 
> On 12/05/2019 06:36 AM, Kazuhito Hagio wrote:
> > Hi Pingfan,
> >
> > Thank you for the patch.
> >
> >> -Original Message-
> >> since the following commit, -lebl has been removed from elfutils.
> >> commit b833c731359af12af9f16bcb621b3cdc170eafbc
> >> Author: Mark Wielaard 
> >> Date:   Thu Aug 29 23:34:11 2019 +0200
> >>
> >> libebl: Don't install libebl.a, libebl.h and remove backends from spec.
> >>
> >> All archive members from libebl.a are now in libdw.a. We don't generate
> >> separate backend shared libraries anymore. So remove them from the
> >> elfutils.spec file.
> >>
> >> Signed-off-by: Mark Wielaard 
> >>
> >> So remove it from LIBS for makedumpfile
> >
> > It seems that this is ok with the latest elfutils, but with older ones?
> > Is it possible to remove -lebl when elfutils does not have libebl.a?
> I have no idea about it for now. The method to check version depends on
> distribution. Is it doable by checking /usr/lib64/libebl ?

We have 'try-run' function written by Petr in the Makefile, which checks
if clock_gettime() requires -lrt.  How about utilizing it like this?

diff --git a/Makefile b/Makefile
index 1fdb6286e85d..d4d1fb563209 100644
--- a/Makefile
+++ b/Makefile
@@ -50,7 +50,7 @@ OBJ_PART=$(patsubst %.c,%.o,$(SRC_PART))
 SRC_ARCH = arch/arm.c arch/arm64.c arch/x86.c arch/x86_64.c arch/ia64.c 
arch/ppc64.c arch/s390x.c arch/ppc.c arch/sparc64.c
 OBJ_ARCH=$(patsubst %.c,%.o,$(SRC_ARCH))
 
-LIBS = -ldw -lbz2 -lebl -ldl -lelf -lz
+LIBS = -ldw -lbz2 -ldl -lelf -lz
 ifneq ($(LINKTYPE), dynamic)
 LIBS := -static $(LIBS)
 endif
@@ -79,6 +79,11 @@ LINK_TEST_PROG="int clock_gettime(); int main(){ return 
clock_gettime(); }"
 LIBS := $(LIBS) $(call try-run,\
echo $(LINK_TEST_PROG) | $(CC) $(CFLAGS) -o "$$TMP" -x c -,,-lrt)
 
+# elfutils-0.178 or later does not install libebl.a.
+LINK_TEST_PROG="int main() { return 0; }"
+LIBS := $(LIBS) $(call try-run,\
+   echo $(LINK_TEST_PROG) | $(CC) -o "$$TMP" -x c - -lebl,-lebl,)
+
 all: makedumpfile
 
 $(OBJ_PART): $(SRC_PART)


If libebl.a does not exist (gcc with -lebl fails), it will not append
-lebl to LIBS.

Thanks,
Kazu

> 
> Thanks,
> Pingfan
> >
> > Thanks,
> > Kazu
> >
> >>
> >> Signed-off-by: Pingfan Liu 
> >> ---
> >>  Makefile | 2 +-
> >>  1 file changed, 1 insertion(+), 1 deletion(-)
> >>
> >> diff --git a/Makefile b/Makefile
> >> index 1fdb628..df21b93 100644
> >> --- a/Makefile
> >> +++ b/Makefile
> >> @@ -50,7 +50,7 @@ OBJ_PART=$(patsubst %.c,%.o,$(SRC_PART))
> >>  SRC_ARCH = arch/arm.c arch/arm64.c arch/x86.c arch/x86_64.c arch/ia64.c 
> >> arch/ppc64.c arch/s390x.c
> >> arch/ppc.c arch/sparc64.c
> >>  OBJ_ARCH=$(patsubst %.c,%.o,$(SRC_ARCH))
> >>
> >> -LIBS = -ldw -lbz2 -lebl -ldl -lelf -lz
> >> +LIBS = -ldw -lbz2 -ldl -lelf -lz
> >>  ifneq ($(LINKTYPE), dynamic)
> >>  LIBS := -static $(LIBS)
> >>  endif
> >> --
> >> 2.7.5
> >>
> >
> >
> >
> > ___
> > kexec mailing list
> > kexec@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/kexec
> >
> 



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH v4 0/4] makedumpfile/arm64: Add support for ARMv8.2 extensions

2019-12-05 Thread Kazuhito Hagio
> -Original Message-
> This is your makedumpfile pulled from sourceforge .
> 
> It would be helpful if you bumped the VERSION and DATE to be certain we are 
> using the correct pieces .

Good suggestion.

I wanted the command line that executed makedumpfile in debug message
as well, so I'll think about adding them together.

Thanks,
Kazu
___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH v4 3/4] makedumpfile/arm64: Add support for ARMv8.2-LVA (52-bit kernel VA support)

2019-12-05 Thread Kazuhito Hagio
> -Original Message-
> > > > +/*
> > > > + * The linear kernel range starts at the bottom of the virtual address
> > > > + * space. Testing the top bit for the start of the region is a
> > > > + * sufficient check and avoids having to worry about the tag.
> > > > + */
> > > > +#define is_linear_addr(addr)   (!(((unsigned long)addr) & (1UL << 
> > > > (vabits_actual - 1
> > >
> > > Does this check cover 5.3 or earlier kernels?
> > > There is no case that vabits_actual is zero?
> 
> We can set vabits_actual as va_bits for older kernels. That shouldn't
> be a big change.
> Will add it in v5. See more below ...
> 
> > As you know, 14c127c957c1 ("arm64: mm: Flip kernel VA space") changed
> > the check for linear address:
> >
> > -#define __is_lm_address(addr)  (!!((addr) & BIT(VA_BITS - 1)))
> > +#define __is_lm_address(addr)  (!((addr) & BIT(VA_BITS - 1)))
> >
> > so if we use the same check as kernel has, I think we will need the
> > former one to support earlier kernels.
> 
> See above, we can use va_bits where vabits_actual is not present.

Yes, but it is not the problem that I wanted to say here.

The problem is that, even if we set vabits_actual to va_bits, we cannot
determine whether an address is in linear map range with just one macro
for 5.3 and 5.4 kernels.

Because the bit value to be checked by the macro changed:

5.3 VA_BITS=48
  linear map : 0xffff800000000000 to 0xffffffffffffffff
5.4 VA_BITS=48
  linear map : 0xffff000000000000 to 0xffff7fffffffffff

or I missed something?

Thanks,
Kazu

> 
> > > > +
> > > >  static unsigned long long
> > > >  __pa(unsigned long vaddr)
> > > >  {
> > > > if (kimage_voffset == NOT_FOUND_NUMBER ||
> > > > -   (vaddr >= PAGE_OFFSET))
> > > > -   return (vaddr - PAGE_OFFSET + info->phys_base);
> > > > +   is_linear_addr(vaddr))
> > > > +   return (vaddr + info->phys_base - PAGE_OFFSET);
> > > > else
> > > > return (vaddr - kimage_voffset);
> > > >  }
> > > > @@ -253,6 +261,7 @@ static int calculate_plat_config(void)
> > > > (PAGESIZE() == SZ_64K && va_bits == 42)) {
> > > > pgtable_level = 2;
> > > > } else if ((PAGESIZE() == SZ_64K && va_bits == 48) ||
> > > > +   (PAGESIZE() == SZ_64K && va_bits == 52) ||
> > > > (PAGESIZE() == SZ_4K && va_bits == 39) ||
> > > > (PAGESIZE() == SZ_16K && va_bits == 47)) {
> > > > pgtable_level = 3;
> > > > @@ -287,6 +296,16 @@ get_phys_base_arm64(void)
> > > > return TRUE;
> > > > }
> > > >
> > > > +   /* If both vabits_actual and va_bits are now initialized, always
> > > > +* prefer vabits_actual over va_bits to calculate PAGE_OFFSET
> > > > +* value.
> > > > +*/
> > > > +   if (vabits_actual && va_bits && vabits_actual != va_bits) {
> > > > +   info->page_offset = (-(1UL << vabits_actual));
> > > > +   DEBUG_MSG("page_offset: %lx (via vabits_actual)\n",
> > > > +   info->page_offset);
> > > > +   }
> > > > +
> > >
> > > Is this for --mem-usage?
> > > If so, let's drop from this patch and think about it later because
> > > some additional base functions will be needed for the option, I think.
> 
> Ok.
> 
> > > > if (get_num_pt_loads() && PAGE_OFFSET) {
> > > > for (i = 0;
> > > > get_pt_load(i, _start, NULL, _start, NULL);
> > > > @@ -406,6 +425,73 @@ get_stext_symbol(void)
> > > > return(found ? kallsym : FALSE);
> > > >  }
> > > >
> > > > +static int
> > > > +get_va_bits_from_stext_arm64(void)
> > > > +{
> > > > +   ulong _stext;
> > > > +
> > > > +   _stext = get_stext_symbol();
> > > > +   if (!_stext) {
> > > > +   ERRMSG("Can't get the symbol of _stext.\n");
> > > > +   return FALSE;
> > > > +   }
> > > > +
> > > > +   /* Derive va_bits as per arch/arm64/Kconfig. Note that this is a
> > > > +* best case approximation at the moment, as there can be
> > > > +* inconsistencies in this calculation (for e.g., for
> > > > +* 52-bit kernel VA case, even the 48th bit might be set in
> > > > +* the _stext symbol).
> > > > +*
> > > > +* So, we need to rely on the actual VA_BITS symbol in the
> > > > +* vmcoreinfo for a accurate value.
> > > > +*
> > > > +* TODO: Improve this further once there is a closure with arm64
> > > > +* kernel maintainers on the same.
> > > > +*/
> > > > +   if ((_stext & PAGE_OFFSET_52) == PAGE_OFFSET_52) {
> > > > +   va_bits = 52;
> > > > +   } else if ((_stext & PAGE_OFFSET_48) == PAGE_OFFSET_48) {
> > > > +   va_bits = 48;
> > > > +   } else if ((_stext & PAGE_OFFSET_47) == PAGE_OFFSET_47) {
> > > > +   va_bits = 47;
> > > > +   } else if ((_stext & PAGE_OFFSET_42) == PAGE_OFFSET_42) {
> > > > +   va_bits = 42;
> > > > +   } else if ((_stext & PAGE_OFFSET_39) == PAGE_OFFSET_39) {
> > > > +   va_bits = 39;
> > > > + 

RE: [PATCH v4 2/4] makedumpfile/arm64: Add support for ARMv8.2-LPA (52-bit PA support)

2019-12-05 Thread Kazuhito Hagio
> -Original Message-
> Hi Kazu,
> 
> On Wed, Dec 4, 2019 at 11:07 PM Kazuhito Hagio  wrote:
> >
> > > -Original Message-
> > > ARMv8.2-LPA architecture extension (if available on underlying hardware)
> > > can support 52-bit physical addresses, while the kernel virtual
> > > addresses remain 48-bit.
> > >
> > > Make sure that we read the 52-bit PA address capability from
> > > 'MAX_PHYSMEM_BITS' variable (if available in vmcoreinfo) and
> > > accordingly change the pte_to_phy() mask values and also traverse
> > > the page-table walk accordingly.
> > >
> > > Also make sure that it works well for the existing 48-bit PA address
> > > platforms and also on environments which use newer kernels with 52-bit
> > > PA support but hardware which is not ARM8.2-LPA compliant.
> > >
> > > I have sent a kernel patch upstream to add 'MAX_PHYSMEM_BITS' to
> > > vmcoreinfo for arm64 (see [0]).
> > >
> > > This patch is in accordance with ARMv8 Architecture Reference Manual
> > > version D.a
> > >
> > > [0]. http://lists.infradead.org/pipermail/kexec/2019-November/023960.html
> > >
> > > Cc: Kazuhito Hagio 
> > > Cc: John Donnelly 
> > > Cc: kexec@lists.infradead.org
> > > Signed-off-by: Bhupesh Sharma 
> > > ---
> > >  arch/arm64.c | 292 
> > > +--
> > >  1 file changed, 204 insertions(+), 88 deletions(-)
> > >
> > > diff --git a/arch/arm64.c b/arch/arm64.c
> > > index 3516b340adfd..ecb19139e178 100644
> > > --- a/arch/arm64.c
> > > +++ b/arch/arm64.c
> > > @@ -39,72 +39,184 @@ typedef struct {
> > >   unsigned long pte;
> > >  } pte_t;
> > >
> >
> > > +#define __pte(x) ((pte_t) { (x) } )
> > > +#define __pmd(x) ((pmd_t) { (x) } )
> > > +#define __pud(x) ((pud_t) { (x) } )
> > > +#define __pgd(x) ((pgd_t) { (x) } )
> >
> > Is it possible to remove these macros?
> 
> Ok, will fix in v5.
> 
> > > +
> > > +static int lpa_52_bit_support_available;
> > >  static int pgtable_level;
> > >  static int va_bits;
> > >  static unsigned long kimage_voffset;
> > >
> > > -#define SZ_4K(4 * 1024)
> > > -#define SZ_16K   (16 * 1024)
> > > -#define SZ_64K   (64 * 1024)
> > > -#define SZ_128M  (128 * 1024 * 1024)
> > > +#define SZ_4K4096
> > > +#define SZ_16K   16384
> > > +#define SZ_64K   65536
> > >
> > > -#define PAGE_OFFSET_36 ((0xUL) << 36)
> > > -#define PAGE_OFFSET_39 ((0xUL) << 39)
> > > -#define PAGE_OFFSET_42 ((0xUL) << 42)
> > > -#define PAGE_OFFSET_47 ((0xUL) << 47)
> > > -#define PAGE_OFFSET_48 ((0xUL) << 48)
> > > +#define PAGE_OFFSET_36   ((0xUL) << 36)
> > > +#define PAGE_OFFSET_39   ((0xUL) << 39)
> > > +#define PAGE_OFFSET_42   ((0xUL) << 42)
> > > +#define PAGE_OFFSET_47   ((0xUL) << 47)
> > > +#define PAGE_OFFSET_48   ((0xUL) << 48)
> > > +#define PAGE_OFFSET_52   ((0xUL) << 52)
> > >
> > >  #define pgd_val(x)   ((x).pgd)
> > >  #define pud_val(x)   (pgd_val((x).pgd))
> > >  #define pmd_val(x)   (pud_val((x).pud))
> > >  #define pte_val(x)   ((x).pte)
> > >
> > > -#define PAGE_MASK(~(PAGESIZE() - 1))
> > > -#define PGDIR_SHIFT  ((PAGESHIFT() - 3) * pgtable_level + 3)
> > > -#define PTRS_PER_PGD (1 << (va_bits - PGDIR_SHIFT))
> > > -#define PUD_SHIFTget_pud_shift_arm64()
> > > -#define PUD_SIZE (1UL << PUD_SHIFT)
> > > -#define PUD_MASK (~(PUD_SIZE - 1))
> > > -#define PTRS_PER_PTE (1 << (PAGESHIFT() - 3))
> > > -#define PTRS_PER_PUD PTRS_PER_PTE
> > > -#define PMD_SHIFT((PAGESHIFT() - 3) * 2 + 3)
> > > -#define PMD_SIZE (1UL << PMD_SHIFT)
> > > -#define PMD_MASK (~(PMD_SIZE - 1))
> >
> > > +/

RE: [PATCH v4 1/4] tree-wide: Retrieve 'MAX_PHYSMEM_BITS' from vmcoreinfo (if available)

2019-12-05 Thread Kazuhito Hagio
Hi Bhupesh,

> -Original Message-
> > > -Original Message-
> > > This patch adds a common feature for archs (except arm64, for which
> > > similar support is added via subsequent patch) to retrieve
> > > 'MAX_PHYSMEM_BITS' from vmcoreinfo (if available).
> >
> > We already have the calibrate_machdep_info() function, which sets
> > info->max_physmem_bits from vmcoreinfo, so practically we don't need
> > to add this patch for the benefit.

I meant that we already have an arch-independent setter for 
info->max_physmem_bits:

 3714 int
 3715 calibrate_machdep_info(void)
 3716 {
 3717 if (NUMBER(MAX_PHYSMEM_BITS) > 0)
 3718 info->max_physmem_bits = NUMBER(MAX_PHYSMEM_BITS);
 3719 
 3720 if (NUMBER(SECTION_SIZE_BITS) > 0)
 3721 info->section_size_bits = NUMBER(SECTION_SIZE_BITS);
 3722 
 3723 return TRUE;
 3724 }

so if NUMBER(MAX_PHYSMEM_BITS) appears, it is automatically used in makedumpfile
without this patch 1/4.

Thanks,
Kazu

> 
> Since other user-space tools like crash use the 'MAX_PHYSMEM_BITS' value as 
> well
> it was agreed with the arm64 maintainers that it would be a good
> approach to export the
> same in vmcoreinfo and not use different methods to determine the same
> in user-space.
> 
> Take an example of the PPC makedumpfile implementation for example. It
> uses the following complex method of determining
> 'info->max_physmem_bits':
> int
> set_ppc64_max_physmem_bits(void)
> {
> long array_len = ARRAY_LENGTH(mem_section);
> /*
>  * The older ppc64 kernels uses _MAX_PHYSMEM_BITS as 42 and the
>  * newer kernels 3.7 onwards uses 46 bits.
>  */
> 
> info->max_physmem_bits  = _MAX_PHYSMEM_BITS_ORIG ;
> if ((array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT_EXTREME()))
> || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT(
> return TRUE;
> 
> info->max_physmem_bits  = _MAX_PHYSMEM_BITS_3_7;
> if ((array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT_EXTREME()))
> || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT(
> return TRUE;
> 
> info->max_physmem_bits  = _MAX_PHYSMEM_BITS_4_19;
> if ((array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT_EXTREME()))
> || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT(
> return TRUE;
> 
> info->max_physmem_bits  = _MAX_PHYSMEM_BITS_4_20;
> if ((array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT_EXTREME()))
> || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT(
> return TRUE;
> 
> return FALSE;
> }
> 
> This might need modification and introduction of another
> _MAX_PHYSMEM_BITS_x_y macro when this changes for a newer kernel
> version.
> 
> I think this makes the code error-prone and hard to read. Its much
> better to replace it with:
> /* Check if we can get MAX_PHYSMEM_BITS from vmcoreinfo */
> if (NUMBER(MAX_PHYSMEM_BITS) != NOT_FOUND_NUMBER) {
> info->max_physmem_bits = NUMBER(MAX_PHYSMEM_BITS);
> return TRUE;
> } else {
> ..
> }
> 
> I think it will reduce future reworks (as per kernel versions) and
> also reduce issues while backporting makedumpfile to older kernels.
> 
> What do you think?
> 
> Regards,
> Bhupesh
> > > I recently posted a kernel patch (see [0]) which appends
> > > 'MAX_PHYSMEM_BITS' to vmcoreinfo in the core code itself rather than
> > > in arch-specific code, so that user-space code can also benefit from
> > > this addition to the vmcoreinfo and use it as a standard way of
> > > determining 'SECTIONS_SHIFT' value in 'makedumpfile' utility.
> > >
> > > This patch ensures backward compatibility for kernel versions in which
> > > 'MAX_PHYSMEM_BITS' is not available in vmcoreinfo.
> > >
> > > [0]. http://lists.infradead.org/pipermail/kexec/2019-November/023960.html
> > >
> > > Cc: Kazuhito Hagio 
> > > Cc: John Donnelly 
> > > Cc: kexec@lists.infradead.org
> > > Signed-off-by: Bhupesh Sharma 
> > > ---
> > >  arch/arm.c |  8 +++-
> > >  arch/ia64.c|  7 ++-
> > >  arch/ppc.c |  8 +++-
> > >  arch/ppc64.c   | 49 -
> > >  arch/s390x.c   | 29 ++---
> > >  arch/sparc64.c |  9 +++--
> > >  arch/x86.c | 34 --
> > >  arch/x86_64.c  | 27 ---
> > >  8 files changed, 109 insertions(+), 62 deletions(-)
> > >
> > 

RE: [PATCH v4 3/4] makedumpfile/arm64: Add support for ARMv8.2-LVA (52-bit kernel VA support)

2019-12-05 Thread Kazuhito Hagio
> -Original Message-
> > -Original Message-
> > With ARMv8.2-LVA architecture extension availability, arm64 hardware
> > which supports this extension can support upto 52-bit virtual
> > addresses. It is specially useful for having a 52-bit user-space virtual
> > address space while the kernel can still retain 48-bit/52-bit virtual
> > addressing.
> >
> > Since at the moment we enable the support of this extension in the
> > kernel via a CONFIG flag (CONFIG_ARM64_VA_BITS_52), so there are
> > no clear mechanisms in user-space to determine this CONFIG
> > flag value and use it to determine the kernel-space VA address range
> > values.
> >
> > 'makedumpfile' can instead use 'TCR_EL1.T1SZ' value from vmcoreinfo
> > which indicates the size offset of the memory region addressed by
> > TTBR1_EL1 (and hence can be used for determining the
> > vabits_actual value).
> >
> > The user-space computation for determining whether an address lies in
> > the linear map range is the same as we have in kernel-space:
> >
> >   #define __is_lm_address(addr) (!(((u64)addr) & BIT(vabits_actual - 
> > 1)))
> >
> > I have sent a kernel patch upstream to add 'TCR_EL1.T1SZ' to
> > vmcoreinfo for arm64 (see [0]).
> >
> > This patch is in accordance with ARMv8 Architecture Reference Manual
> > version D.a
> >
> > Note that with these changes the '--mem-usage' option will not work
> > properly for arm64 (a subsequent patch in this series will address the
> > same) and there is a discussion on-going with the arm64 maintainers to
> > find a way-out for the same (via standard kernel symbols like _stext).
> >
> > [0].http://lists.infradead.org/pipermail/kexec/2019-November/023962.html
> >
> > Cc: Kazuhito Hagio 
> > Cc: John Donnelly 
> > Cc: kexec@lists.infradead.org
> > Signed-off-by: Bhupesh Sharma 
> > ---
> >  arch/arm64.c   | 148 
> > +
> >  makedumpfile.c |   2 +
> >  makedumpfile.h |   3 +-
> >  3 files changed, 122 insertions(+), 31 deletions(-)
> >
> > diff --git a/arch/arm64.c b/arch/arm64.c
> > index ecb19139e178..094d73b8a60f 100644
> > --- a/arch/arm64.c
> > +++ b/arch/arm64.c
> > @@ -47,6 +47,7 @@ typedef struct {
> >  static int lpa_52_bit_support_available;
> >  static int pgtable_level;
> >  static int va_bits;
> > +static int vabits_actual;
> >  static unsigned long kimage_voffset;
> >
> >  #define SZ_4K  4096
> > @@ -218,12 +219,19 @@ pmd_page_paddr(pmd_t pmd)
> >  #define pte_index(vaddr)   (((vaddr) >> PAGESHIFT()) & 
> > (PTRS_PER_PTE - 1))
> >  #define pte_offset(dir, vaddr) (pmd_page_paddr((*dir)) + 
> > pte_index(vaddr) * sizeof(pte_t))
> >
> > +/*
> > + * The linear kernel range starts at the bottom of the virtual address
> > + * space. Testing the top bit for the start of the region is a
> > + * sufficient check and avoids having to worry about the tag.
> > + */
> > +#define is_linear_addr(addr)   (!(((unsigned long)addr) & (1UL << 
> > (vabits_actual - 1
> 
> Does this check cover 5.3 or earlier kernels?
> There is no case that vabits_actual is zero?

As you know, 14c127c957c1 ("arm64: mm: Flip kernel VA space") changed
the check for linear address:

-#define __is_lm_address(addr)  (!!((addr) & BIT(VA_BITS - 1)))
+#define __is_lm_address(addr)  (!((addr) & BIT(VA_BITS - 1)))

so if we use the same check as kernel has, I think we will need the
former one to support earlier kernels.

> 
> > +
> >  static unsigned long long
> >  __pa(unsigned long vaddr)
> >  {
> > if (kimage_voffset == NOT_FOUND_NUMBER ||
> > -   (vaddr >= PAGE_OFFSET))
> > -   return (vaddr - PAGE_OFFSET + info->phys_base);
> > +   is_linear_addr(vaddr))
> > +   return (vaddr + info->phys_base - PAGE_OFFSET);
> > else
> > return (vaddr - kimage_voffset);
> >  }
> > @@ -253,6 +261,7 @@ static int calculate_plat_config(void)
> > (PAGESIZE() == SZ_64K && va_bits == 42)) {
> > pgtable_level = 2;
> > } else if ((PAGESIZE() == SZ_64K && va_bits == 48) ||
> > +   (PAGESIZE() == SZ_64K && va_bits == 52) ||
> > (PAGESIZE() == SZ_4K && va_bits == 39) ||
> > (PAGESIZE() == SZ_16K && va_bits == 47)) {
> >   

RE: [PATCH] makedumpfile/Makefile: remove -lebl from LIBS

2019-12-04 Thread Kazuhito Hagio
Hi Pingfan,

Thank you for the patch.

> -Original Message-
> since the following commit, -lebl has been removed from elfutils.
> commit b833c731359af12af9f16bcb621b3cdc170eafbc
> Author: Mark Wielaard 
> Date:   Thu Aug 29 23:34:11 2019 +0200
> 
> libebl: Don't install libebl.a, libebl.h and remove backends from spec.
> 
> All archive members from libebl.a are now in libdw.a. We don't generate
> separate backend shared libraries anymore. So remove them from the
> elfutils.spec file.
> 
> Signed-off-by: Mark Wielaard 
> 
> So remove it from LIBS for makedumpfile

It seems that this is ok with the latest elfutils, but with older ones?
Is it possible to remove -lebl when elfutils does not have libebl.a?

Thanks,
Kazu

> 
> Signed-off-by: Pingfan Liu 
> ---
>  Makefile | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/Makefile b/Makefile
> index 1fdb628..df21b93 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -50,7 +50,7 @@ OBJ_PART=$(patsubst %.c,%.o,$(SRC_PART))
>  SRC_ARCH = arch/arm.c arch/arm64.c arch/x86.c arch/x86_64.c arch/ia64.c 
> arch/ppc64.c arch/s390x.c
> arch/ppc.c arch/sparc64.c
>  OBJ_ARCH=$(patsubst %.c,%.o,$(SRC_ARCH))
> 
> -LIBS = -ldw -lbz2 -lebl -ldl -lelf -lz
> +LIBS = -ldw -lbz2 -ldl -lelf -lz
>  ifneq ($(LINKTYPE), dynamic)
>  LIBS := -static $(LIBS)
>  endif
> --
> 2.7.5
> 



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH v4 4/4] makedumpfile: Mark --mem-usage option unsupported for arm64

2019-12-04 Thread Kazuhito Hagio
> -Original Message-
> This patch marks '--mem-usage' option as unsupported for arm64
> architecture.
> 
> With the newer arm64 kernels supporting 48-bit/52-bit VA address spaces
> and keeping a single binary for supporting the same, the address of
> kernel symbols like _stext which could be earlier used to determine
> VA_BITS value, can no longer to determine whether VA_BITS is set to 48
> or 52 in the kernel space.

The --mem-usage option works with older arm64 kernels, so we should not
mark it unsupported for all arm64 kernels.

(If we use ELF note vmcoreinfo in kcore, is it possible to support the
option?  Let's think about it later..)

Thanks,
Kazu

> 
> Hence for now, it makes sense to mark '--mem-usage' option as
> unsupported for arm64 architecture until we have more clarity from arm64
> kernel maintainers on how to manage the same in future
> kernel/makedumpfile versions.
> 
> Cc: John Donnelly 
> Cc: Kazuhito Hagio 
> Cc: kexec@lists.infradead.org
> Signed-off-by: Bhupesh Sharma 
> ---
>  makedumpfile.c | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/makedumpfile.c b/makedumpfile.c
> index baf559e4d74e..ae60466a1e9c 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -11564,6 +11564,11 @@ main(int argc, char *argv[])
>   MSG("\n");
>   MSG("The dmesg log is saved to %s.\n", info->name_dumpfile);
>   } else if (info->flag_mem_usage) {
> +#ifdef __aarch64__
> + MSG("mem-usage not supported for arm64 architecure.\n");
> + goto out;
> +#endif
> +
>   if (!check_param_for_creating_dumpfile(argc, argv)) {
>   MSG("Commandline parameter is invalid.\n");
>   MSG("Try `makedumpfile --help' for more 
> information.\n");
> --
> 2.7.4
> 



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH v4 3/4] makedumpfile/arm64: Add support for ARMv8.2-LVA (52-bit kernel VA support)

2019-12-04 Thread Kazuhito Hagio
> -Original Message-
> With ARMv8.2-LVA architecture extension availability, arm64 hardware
> which supports this extension can support upto 52-bit virtual
> addresses. It is specially useful for having a 52-bit user-space virtual
> address space while the kernel can still retain 48-bit/52-bit virtual
> addressing.
> 
> Since at the moment we enable the support of this extension in the
> kernel via a CONFIG flag (CONFIG_ARM64_VA_BITS_52), so there are
> no clear mechanisms in user-space to determine this CONFIG
> flag value and use it to determine the kernel-space VA address range
> values.
> 
> 'makedumpfile' can instead use 'TCR_EL1.T1SZ' value from vmcoreinfo
> which indicates the size offset of the memory region addressed by
> TTBR1_EL1 (and hence can be used for determining the
> vabits_actual value).
> 
> The user-space computation for determining whether an address lies in
> the linear map range is the same as we have in kernel-space:
> 
>   #define __is_lm_address(addr)   (!(((u64)addr) & BIT(vabits_actual - 
> 1)))
> 
> I have sent a kernel patch upstream to add 'TCR_EL1.T1SZ' to
> vmcoreinfo for arm64 (see [0]).
> 
> This patch is in accordance with ARMv8 Architecture Reference Manual
> version D.a
> 
> Note that with these changes the '--mem-usage' option will not work
> properly for arm64 (a subsequent patch in this series will address the
> same) and there is a discussion on-going with the arm64 maintainers to
> find a way-out for the same (via standard kernel symbols like _stext).
> 
> [0].http://lists.infradead.org/pipermail/kexec/2019-November/023962.html
> 
> Cc: Kazuhito Hagio 
> Cc: John Donnelly 
> Cc: kexec@lists.infradead.org
> Signed-off-by: Bhupesh Sharma 
> ---
>  arch/arm64.c   | 148 
> +
>  makedumpfile.c |   2 +
>  makedumpfile.h |   3 +-
>  3 files changed, 122 insertions(+), 31 deletions(-)
> 
> diff --git a/arch/arm64.c b/arch/arm64.c
> index ecb19139e178..094d73b8a60f 100644
> --- a/arch/arm64.c
> +++ b/arch/arm64.c
> @@ -47,6 +47,7 @@ typedef struct {
>  static int lpa_52_bit_support_available;
>  static int pgtable_level;
>  static int va_bits;
> +static int vabits_actual;
>  static unsigned long kimage_voffset;
> 
>  #define SZ_4K4096
> @@ -218,12 +219,19 @@ pmd_page_paddr(pmd_t pmd)
>  #define pte_index(vaddr) (((vaddr) >> PAGESHIFT()) & 
> (PTRS_PER_PTE - 1))
>  #define pte_offset(dir, vaddr)   (pmd_page_paddr((*dir)) + 
> pte_index(vaddr) * sizeof(pte_t))
> 
> +/*
> + * The linear kernel range starts at the bottom of the virtual address
> + * space. Testing the top bit for the start of the region is a
> + * sufficient check and avoids having to worry about the tag.
> + */
> +#define is_linear_addr(addr) (!(((unsigned long)addr) & (1UL << 
> (vabits_actual - 1

Does this check cover 5.3 or earlier kernels?
There is no case that vabits_actual is zero?

> +
>  static unsigned long long
>  __pa(unsigned long vaddr)
>  {
>   if (kimage_voffset == NOT_FOUND_NUMBER ||
> - (vaddr >= PAGE_OFFSET))
> - return (vaddr - PAGE_OFFSET + info->phys_base);
> + is_linear_addr(vaddr))
> + return (vaddr + info->phys_base - PAGE_OFFSET);
>   else
>   return (vaddr - kimage_voffset);
>  }
> @@ -253,6 +261,7 @@ static int calculate_plat_config(void)
>   (PAGESIZE() == SZ_64K && va_bits == 42)) {
>   pgtable_level = 2;
>   } else if ((PAGESIZE() == SZ_64K && va_bits == 48) ||
> + (PAGESIZE() == SZ_64K && va_bits == 52) ||
>   (PAGESIZE() == SZ_4K && va_bits == 39) ||
>   (PAGESIZE() == SZ_16K && va_bits == 47)) {
>   pgtable_level = 3;
> @@ -287,6 +296,16 @@ get_phys_base_arm64(void)
>   return TRUE;
>   }
> 
> + /* If both vabits_actual and va_bits are now initialized, always
> +  * prefer vabits_actual over va_bits to calculate PAGE_OFFSET
> +  * value.
> +  */
> + if (vabits_actual && va_bits && vabits_actual != va_bits) {
> + info->page_offset = (-(1UL << vabits_actual));
> + DEBUG_MSG("page_offset: %lx (via vabits_actual)\n",
> + info->page_offset);
> + }
> +

Is this for --mem-usage?
If so, let's drop from this patch and think about it later because
some additional base functions will be needed for the option, I think.

>   if (get_num_pt_loads() &&

RE: [PATCH v4 2/4] makedumpfile/arm64: Add support for ARMv8.2-LPA (52-bit PA support)

2019-12-04 Thread Kazuhito Hagio
> -Original Message-
> ARMv8.2-LPA architecture extension (if available on underlying hardware)
> can support 52-bit physical addresses, while the kernel virtual
> addresses remain 48-bit.
> 
> Make sure that we read the 52-bit PA address capability from
> 'MAX_PHYSMEM_BITS' variable (if available in vmcoreinfo) and
> accordingly change the pte_to_phy() mask values and also traverse
> the page-table walk accordingly.
> 
> Also make sure that it works well for the existing 48-bit PA address
> platforms and also on environments which use newer kernels with 52-bit
> PA support but hardware which is not ARM8.2-LPA compliant.
> 
> I have sent a kernel patch upstream to add 'MAX_PHYSMEM_BITS' to
> vmcoreinfo for arm64 (see [0]).
> 
> This patch is in accordance with ARMv8 Architecture Reference Manual
> version D.a
> 
> [0]. http://lists.infradead.org/pipermail/kexec/2019-November/023960.html
> 
> Cc: Kazuhito Hagio 
> Cc: John Donnelly 
> Cc: kexec@lists.infradead.org
> Signed-off-by: Bhupesh Sharma 
> ---
>  arch/arm64.c | 292 
> +--
>  1 file changed, 204 insertions(+), 88 deletions(-)
> 
> diff --git a/arch/arm64.c b/arch/arm64.c
> index 3516b340adfd..ecb19139e178 100644
> --- a/arch/arm64.c
> +++ b/arch/arm64.c
> @@ -39,72 +39,184 @@ typedef struct {
>   unsigned long pte;
>  } pte_t;
> 

> +#define __pte(x) ((pte_t) { (x) } )
> +#define __pmd(x) ((pmd_t) { (x) } )
> +#define __pud(x) ((pud_t) { (x) } )
> +#define __pgd(x) ((pgd_t) { (x) } )

Is it possible to remove these macros?

> +
> +static int lpa_52_bit_support_available;
>  static int pgtable_level;
>  static int va_bits;
>  static unsigned long kimage_voffset;
> 
> -#define SZ_4K(4 * 1024)
> -#define SZ_16K   (16 * 1024)
> -#define SZ_64K   (64 * 1024)
> -#define SZ_128M  (128 * 1024 * 1024)
> +#define SZ_4K4096
> +#define SZ_16K   16384
> +#define SZ_64K   65536
> 
> -#define PAGE_OFFSET_36 ((0xUL) << 36)
> -#define PAGE_OFFSET_39 ((0xUL) << 39)
> -#define PAGE_OFFSET_42 ((0xUL) << 42)
> -#define PAGE_OFFSET_47 ((0xUL) << 47)
> -#define PAGE_OFFSET_48 ((0xUL) << 48)
> +#define PAGE_OFFSET_36   ((0xUL) << 36)
> +#define PAGE_OFFSET_39   ((0xUL) << 39)
> +#define PAGE_OFFSET_42   ((0xUL) << 42)
> +#define PAGE_OFFSET_47   ((0xUL) << 47)
> +#define PAGE_OFFSET_48   ((0xUL) << 48)
> +#define PAGE_OFFSET_52   ((0xUL) << 52)
> 
>  #define pgd_val(x)   ((x).pgd)
>  #define pud_val(x)   (pgd_val((x).pgd))
>  #define pmd_val(x)   (pud_val((x).pud))
>  #define pte_val(x)   ((x).pte)
> 
> -#define PAGE_MASK(~(PAGESIZE() - 1))
> -#define PGDIR_SHIFT  ((PAGESHIFT() - 3) * pgtable_level + 3)
> -#define PTRS_PER_PGD (1 << (va_bits - PGDIR_SHIFT))
> -#define PUD_SHIFTget_pud_shift_arm64()
> -#define PUD_SIZE (1UL << PUD_SHIFT)
> -#define PUD_MASK (~(PUD_SIZE - 1))
> -#define PTRS_PER_PTE (1 << (PAGESHIFT() - 3))
> -#define PTRS_PER_PUD PTRS_PER_PTE
> -#define PMD_SHIFT((PAGESHIFT() - 3) * 2 + 3)
> -#define PMD_SIZE (1UL << PMD_SHIFT)
> -#define PMD_MASK (~(PMD_SIZE - 1))

> +/* See 'include/uapi/linux/const.h' for definitions below */
> +#define __AC(X,Y)(X##Y)
> +#define _AC(X,Y) __AC(X,Y)
> +#define _AT(T,X) ((T)(X))
> +
> +/* See 'include/asm/pgtable-types.h' for definitions below */
> +typedef unsigned long pteval_t;
> +typedef unsigned long pmdval_t;
> +typedef unsigned long pudval_t;
> +typedef unsigned long pgdval_t;

Is it possible to remove these macros/typedefs as well?
I don't think they make the code easier to read..

Thanks,
Kazu

> +
> +#define PAGE_SHIFT   PAGESHIFT()
> +
> +/* See 'arch/arm64/include/asm/pgtable-hwdef.h' for definitions below */
> +
> +#define ARM64_HW_PGTABLE_LEVEL_SHIFT(n)  ((PAGE_SHIFT - 3) * (4 - (n)) + 
> 3)
> +
> +#define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3))
> +
> +/*
> + * PMD_SHIFT determines the size a level 2 page table entry can map.
> + */
> +#define PMD_SHIFTARM64_HW_PGTABLE_LEVEL_SHIFT(2)
> +#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)

RE: [PATCH v4 1/4] tree-wide: Retrieve 'MAX_PHYSMEM_BITS' from vmcoreinfo (if available)

2019-12-04 Thread Kazuhito Hagio
Hi Bhupesh,

Sorry for the late reply.

> -Original Message-
> This patch adds a common feature for archs (except arm64, for which
> similar support is added via subsequent patch) to retrieve
> 'MAX_PHYSMEM_BITS' from vmcoreinfo (if available).

We already have the calibrate_machdep_info() function, which sets
info->max_physmem_bits from vmcoreinfo, so practically we don't need
to add this patch for the benefit.

Thanks,
Kazu

> 
> I recently posted a kernel patch (see [0]) which appends
> 'MAX_PHYSMEM_BITS' to vmcoreinfo in the core code itself rather than
> in arch-specific code, so that user-space code can also benefit from
> this addition to the vmcoreinfo and use it as a standard way of
> determining 'SECTIONS_SHIFT' value in 'makedumpfile' utility.
> 
> This patch ensures backward compatibility for kernel versions in which
> 'MAX_PHYSMEM_BITS' is not available in vmcoreinfo.
> 
> [0]. http://lists.infradead.org/pipermail/kexec/2019-November/023960.html
> 
> Cc: Kazuhito Hagio 
> Cc: John Donnelly 
> Cc: kexec@lists.infradead.org
> Signed-off-by: Bhupesh Sharma 
> ---
>  arch/arm.c |  8 +++-
>  arch/ia64.c|  7 ++-
>  arch/ppc.c |  8 +++-
>  arch/ppc64.c   | 49 -
>  arch/s390x.c   | 29 ++---
>  arch/sparc64.c |  9 +++--
>  arch/x86.c | 34 --
>  arch/x86_64.c  | 27 ---
>  8 files changed, 109 insertions(+), 62 deletions(-)
> 
> diff --git a/arch/arm.c b/arch/arm.c
> index af7442ac70bf..33536fc4dfc9 100644
> --- a/arch/arm.c
> +++ b/arch/arm.c
> @@ -81,7 +81,13 @@ int
>  get_machdep_info_arm(void)
>  {
>   info->page_offset = SYMBOL(_stext) & 0xUL;
> - info->max_physmem_bits = _MAX_PHYSMEM_BITS;
> +
> + /* Check if we can get MAX_PHYSMEM_BITS from vmcoreinfo */
> + if (NUMBER(MAX_PHYSMEM_BITS) != NOT_FOUND_NUMBER)
> + info->max_physmem_bits = NUMBER(MAX_PHYSMEM_BITS);
> + else
> + info->max_physmem_bits = _MAX_PHYSMEM_BITS;
> +
>   info->kernel_start = SYMBOL(_stext);
>   info->section_size_bits = _SECTION_SIZE_BITS;
> 
> diff --git a/arch/ia64.c b/arch/ia64.c
> index 6c33cc7c8288..fb44dda47172 100644
> --- a/arch/ia64.c
> +++ b/arch/ia64.c
> @@ -85,7 +85,12 @@ get_machdep_info_ia64(void)
>   }
> 
>   info->section_size_bits = _SECTION_SIZE_BITS;
> - info->max_physmem_bits  = _MAX_PHYSMEM_BITS;
> +
> + /* Check if we can get MAX_PHYSMEM_BITS from vmcoreinfo */
> + if (NUMBER(MAX_PHYSMEM_BITS) != NOT_FOUND_NUMBER)
> + info->max_physmem_bits = NUMBER(MAX_PHYSMEM_BITS);
> + else
> + info->max_physmem_bits  = _MAX_PHYSMEM_BITS;
> 
>   return TRUE;
>  }
> diff --git a/arch/ppc.c b/arch/ppc.c
> index 37c6a3b60cd3..ed9447427a30 100644
> --- a/arch/ppc.c
> +++ b/arch/ppc.c
> @@ -31,7 +31,13 @@ get_machdep_info_ppc(void)
>   unsigned long vmlist, vmap_area_list, vmalloc_start;
> 
>   info->section_size_bits = _SECTION_SIZE_BITS;
> - info->max_physmem_bits  = _MAX_PHYSMEM_BITS;
> +
> + /* Check if we can get MAX_PHYSMEM_BITS from vmcoreinfo */
> + if (NUMBER(MAX_PHYSMEM_BITS) != NOT_FOUND_NUMBER)
> + info->max_physmem_bits = NUMBER(MAX_PHYSMEM_BITS);
> + else
> + info->max_physmem_bits  = _MAX_PHYSMEM_BITS;
> +
>   info->page_offset = __PAGE_OFFSET;
> 
>   if (SYMBOL(_stext) != NOT_FOUND_SYMBOL)
> diff --git a/arch/ppc64.c b/arch/ppc64.c
> index 9d8f2525f608..a3984eebdced 100644
> --- a/arch/ppc64.c
> +++ b/arch/ppc64.c
> @@ -466,30 +466,37 @@ int
>  set_ppc64_max_physmem_bits(void)
>  {
>   long array_len = ARRAY_LENGTH(mem_section);
> - /*
> -  * The older ppc64 kernels uses _MAX_PHYSMEM_BITS as 42 and the
> -  * newer kernels 3.7 onwards uses 46 bits.
> -  */
> -
> - info->max_physmem_bits  = _MAX_PHYSMEM_BITS_ORIG ;
> - if ((array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT_EXTREME()))
> - || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT(
> - return TRUE;
> -
> - info->max_physmem_bits  = _MAX_PHYSMEM_BITS_3_7;
> - if ((array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT_EXTREME()))
> - || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT(
> - return TRUE;
> 
> - info->max_physmem_bits  = _MAX_PHYSMEM_BITS_4_19;
> - if ((array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT_EXTREME()))
> - || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT()

RE: [PATCH v4 0/4] makedumpfile/arm64: Add support for ARMv8.2 extensions

2019-11-13 Thread Kazuhito Hagio
Hi Bhupesh,

Thanks for the updated patchset.

I'm taking a look at this, but I will be out of office from tomorrow
until Nov 29th, so please expect some (long) delays in my response..

Thanks,
Kazu

> -Original Message-
> Changes since v3:
> 
> - v3 can be seen here:
>   http://lists.infradead.org/pipermail/kexec/2019-March/022534.html
> - Added a new patch (via [PATCH 4/4]) which marks '--mem-usage' option as
>   unsupported for arm64 architecture. With the newer arm64 kernels
>   supporting 48-bit/52-bit VA address spaces and keeping a single
>   binary for supporting the same, the address of
>   kernel symbols like _stext, which could be earlier used to determine
>   VA_BITS value, can no longer to determine whether VA_BITS is set to 48
>   or 52 in the kernel space. Hence for now, it makes sense to mark
>   '--mem-usage' option as unsupported for arm64 architecture until
>   we have more clarity from arm64 kernel maintainers on how to manage
>   the same in future kernel/makedumpfile versions.
> 
> Changes since v2:
> 
> - v2 can be seen here:
>   http://lists.infradead.org/pipermail/kexec/2019-February/022456.html
> - I missed some comments from Kazu sent on the LVA v1 patch when I sent
>   out the v2. So, addressing them now in v3.
> - Also added a patch that adds a tree-wide feature to read
>   'MAX_PHYSMEM_BITS' from vmcoreinfo (if available).
> 
> Changes since v1:
> 
> - v1 was sent as two separate patches:
>   http://lists.infradead.org/pipermail/kexec/2019-February/022424.html
>   (ARMv8.2-LPA)
>   http://lists.infradead.org/pipermail/kexec/2019-February/022425.html
>   (ARMv8.2-LVA)
> - v2 combined the two in a single patchset and also addresses Kazu's
>   review comments.
> 
> This patchset adds support for ARMv8.2 extensions in makedumpfile code.
> I cover the following two cases with this patchset:
>  - 48-bit kernel VA + 52-bit PA (LPA)
>  - 52-bit kernel VA (LVA) + 52-bit PA (LPA)
>  - 48-bit kernel VA + 52-bit user-space VA (LVA)
>  - 52-bit kernel VA + 52-bit user-space VA (Full LVA)
> 
> This has been tested for the following user-cases:
> 1. Creating a dumpfile using /proc/vmcore,
> 2. Creating a dumpfile using /proc/kcore, and
> 3. Post-processing a vmcore.
> 
> I have tested this patchset on the following platforms, with kernels
> which support/do-not-support ARMv8.2 features:
> 1. CPUs which don't support ARMv8.2 features, e.g. qualcomm-amberwing,
>ampere-osprey.
> 2. Prototype models which support ARMv8.2 extensions (e.g. ARMv8 FVP
>simulation model).
> 
> Also a preparation patch has been added in this patchset which adds a
> common feature for archs (except arm64, for which similar support is
> added via subsequent patch) to retrieve 'MAX_PHYSMEM_BITS' from
> vmcoreinfo (if available).
> 
> I recently posted two kernel patches (see [0] and [1]) which append
> 'TCR_EL1.T1SZ' and 'MAX_PHYSMEM_BITS' to vmcoreinfo in the kernel
> code, so that user-space code can benefit from the same.
> 
> This patchset ensures backward compatibility for kernel versions in
> which 'TCR_EL1.T1SZ' and 'MAX_PHYSMEM_BITS' are not available in
> vmcoreinfo.
> 
> [0]. http://lists.infradead.org/pipermail/kexec/2019-November/023960.html
> [1]. http://lists.infradead.org/pipermail/kexec/2019-November/023962.html
> 
> Cc: John Donnelly 
> Cc: Kazuhito Hagio 
> Cc: kexec@lists.infradead.org
> 
> Bhupesh Sharma (4):
>   tree-wide: Retrieve 'MAX_PHYSMEM_BITS' from vmcoreinfo (if available)
>   makedumpfile/arm64: Add support for ARMv8.2-LPA (52-bit PA support)
>   makedumpfile/arm64: Add support for ARMv8.2-LVA (52-bit kernel VA
> support)
>   makedumpfile: Mark --mem-usage option unsupported for arm64
> 
>  arch/arm.c |   8 +-
>  arch/arm64.c   | 438 
> ++---
>  arch/ia64.c|   7 +-
>  arch/ppc.c |   8 +-
>  arch/ppc64.c   |  49 ---
>  arch/s390x.c   |  29 ++--
>  arch/sparc64.c |   9 +-
>  arch/x86.c |  34 +++--
>  arch/x86_64.c  |  27 ++--
>  makedumpfile.c |   7 +
>  makedumpfile.h |   3 +-
>  11 files changed, 439 insertions(+), 180 deletions(-)
> 
> --
> 2.7.4
> 
> 
> ___
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: makedumpfile: ELF format issues (RE: makedumpfile: Fix divide by zero in print_report())

2019-11-13 Thread Kazuhito Hagio
Hi Dave,

I think I've fixed the ELF issues which I could reproduce:
- wrong statistics
- e_phnum overflow

If you still see any problems with the latest makedumpfile,
please let me know.

Thanks,
Kazu

> -Original Message-
> > -Original Message-
> > >  > > There are some other failure cases with non-null data, so maybe 
> > > there's >1 bug here.
> > >  > > I've not seen an obvious pattern to this. eg...
> > >  > >
> > >  > > https://pastebin.com/2uM4sBCF
> > >  > >
> > >  >
> > >  > As for this case, I suspect that Elf64_Ehdr.e_phnum overflows
> > >  > (i.e. num_loads_dumpfile > 65535):
> > >
> > > Oh, good catch.  These are 256GB machines, so after discarding
> > > everything, that explains why we end up with so many sections.
> > > This also explains why it sometimes works I think, when the discarding
> > > manages to get the total nr headers <64k.
> 
> I also could reproduce this issue on a system with 192GB memory.
> The note was actually overwritten by the following program headers.
> -
> num_loads_dumpfile=76318# more than 64k
> ehdr64.e_phnum=10783# overflowed
> note.p_offset=0x93708 .p_filesz=0x2958  # The note data is at 0x93708
> note cd_header->offset=0x40
> ...
> head->off= 90040 load.p_addr= 44552e000 .p_off=  ed270060 ...
>^ # these headers overwrote the note data.
> head->off= a0040 load.p_addr= 44563 .p_off=  ed272060 ...
> ...
> The dumpfile is saved to dump.Ed25.devel.
> 
> makedumpfile Completed.
> 
> # readelf -a dump.Ed25.devel
> ...
>   Number of program headers: 10783
> ...
> Displaying notes found at file offset 0x00093708 with length 0x2958:
>   Owner Data size   Description
>0x0007   Unknown note type: (0xdbce6060)
>description data: 00 00 7a 39 fff2 ff8a 
> # ../crash vmlinux dump.Ed25.devel
> 
> WARNING: possibly corrupt Elf64_Nhdr: n_namesz: 4185522176 n_descsz: 3 
> n_type: f4000
> ...
> WARNING: cannot read linux_banner string
> crash: vmlinux and dump.Ed25.devel do not match!
> -
> 
> > I think this will be the one of the causes, and had a look at how
> > we can fix it.  If you get a vmcore where this pattern occurs,
> > you can try this tree:
> > https://github.com/k-hagio/makedumpfile/tree/support-extended-elf
> >
> > Then, the crash utility also needs a patch to support a dumpfile
> > that has more than 64k program headers:
> > https://github.com/k-hagio/crash/tree/support-extended-elf
> 
> These trees look to work well, though need more tests and tweaks.
> -
> # readelf -a dump.Ed25.test
> ...
>   Number of program headers: 65535 (76319)  <<-- note + loads
> ...
> Displaying notes found at file offset 0x00413748 with length 0x2958:
>   Owner Data size   Description
>   CORE 0x0150   NT_PRSTATUS (prstatus structure)
>   CORE 0x0150   NT_PRSTATUS (prstatus structure)
>   CORE 0x0150   NT_PRSTATUS (prstatus structure)
> ...
> # ../crash-test vmlinux dump.Ed25.test
> 
> crash-test> help -D
> vmcore_data:
>   flags: c0 (KDUMP_LOCAL|KDUMP_ELF64)
>ndfd: 3
> ofp: 3141560
> header_size: 4284576
>num_pt_load_segments: 76318   <<-- loads
>  pt_load_segment[0]:
> -
> 
> It is possible that the issue occurs on general systems if they have
> large memory, so I'm going to proceed with those patches.
> 
> Thanks,
> Kazu
> 


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile: Fix wrong statistics in ELF format mode

2019-11-12 Thread Kazuhito Hagio
> -Original Message-
> The -E option, which creates a dumpfile in ELF format, reports wrong
> statistics like the ones below, because:
>  (1) counts excluded pages repeatedly due to overlapped cycles
>  (2) does not calculate the number of memory hole pages in cyclic mode
>  (3) does not take account of the number of pages excluded actually
>  in ELF format, which excludes only contiguous 256 or more pages
>  that can be excluded.
> 
>   Original pages  : 0x
> Excluded pages   : 0x007daf05
>   Pages filled with zero  : 0x2dcc
>   Non-private cache pages : 0x000471d6
>   Private cache pages : 0x0001
>   User process data pages : 0x000147f1
>   Free pages  : 0x0077c771
>   Hwpoison pages  : 0x
>   Offline pages   : 0x
> Remaining pages  : 0xff8250fb
>   Memory Hole : 0x0044
>   --
>   Total pages : 0x0044
> 
> In order to fix this issue:
>  (1) start the first cycle from the start pfn of a segment to avoid
>  overlaps between cycles

finally I found a fault with this change.
Will merge the following patch into the original one.

diff --git a/makedumpfile.c b/makedumpfile.c
index 9569251ce0c7..ac19ed858416 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -56,8 +56,13 @@ static void first_cycle(mdf_pfn_t start, mdf_pfn_t max, 
struct cycle *cycle)
if (cycle->end_pfn > max)
cycle->end_pfn = max;
 
+   /*
+* Mitigate statistics problem in ELF dump mode.
+* A cycle must start with a pfn that is divisible by BITPERBYTE.
+* See create_bitmap_from_memhole().
+*/
if (info->flag_elf_dumpfile && cycle->start_pfn < start)
-   cycle->start_pfn = start;
+   cycle->start_pfn = round(start, BITPERBYTE);
 
cycle->exclude_pfn_start = 0;
cycle->exclude_pfn_end = 0;
@@ -7503,7 +7508,7 @@ get_loads_dumpfile_cyclic(void)
if (!create_2nd_bitmap())
return FALSE;
}
-   for (pfn = cycle.start_pfn; pfn < cycle.end_pfn; pfn++) 
{
+   for (pfn = MAX(pfn_start, cycle.start_pfn); pfn < 
cycle.end_pfn; pfn++) {
if (!is_dumpable(info->bitmap2, pfn, )) {
num_excluded++;
continue;
@@ -7598,7 +7603,7 @@ write_elf_pages_cyclic(struct cache_data *cd_header, 
struct cache_data *cd_page)
return FALSE;
}
 
-   for (pfn = cycle.start_pfn; pfn < cycle.end_pfn; pfn++) 
{
+   for (pfn = MAX(pfn_start, cycle.start_pfn); pfn < 
cycle.end_pfn; pfn++) {
if (info->flag_cyclic)
pfn_memhole--;


Thanks,
Kazu

>  (2) calculate the number of memory hole pages in cyclic mode
>  (3) introduce pfn_elf_excluded variable to store the actual number
>  of the excluded pages in ELF format
> 
> With the patch, a report message in ELF format mode becomes like this:
> 
>   Original pages  : 0x003f1538
> Excluded pages   : 0x003c8c9d
>in ELF format : 0x003c4319
>   Pages filled with zero  : 0x26d8
>   Non-private cache pages : 0x00047032
>   Private cache pages : 0x0001
>   User process data pages : 0x00014794
>   Free pages  : 0x0036adfe
>   Hwpoison pages  : 0x
>   Offline pages   : 0x
> Remaining pages  : 0x0002889b
>in ELF format : 0x0002d21f
> (The number of pages is reduced to 4%.)
>   Memory Hole : 0x0004eac8
>   --
>   Total pages : 0x00440000
> 
> where the "Excluded pages" and "Remaining pages" do not mean the
> actual numbers of excluded and remaining pages.  They are kept
> the same for reference.
> 
> Signed-off-by: Kazuhito Hagio 
> ---
>  makedumpfile.c | 27 +--
>  1 file changed, 25 insertions(+), 2 deletions(-)
> 
> diff --git a/makedumpfile.c b/makedumpfile.c
> index 4a000112ba59..9569251ce0c7 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -56,6 +56,9 @@ static void first_cycle(mdf_pfn_t start, mdf_pfn_t max, 
> struct cycle *cycle)
>   if (cycle->end_pfn > max)

[PATCH] makedumpfile: Add support for ELF extended numbering

2019-11-08 Thread Kazuhito Hagio
In ELF dump mode, since makedumpfile cannot handle more than PN_XNUM
(0x) program headers, if a resulting dumpfile needs such a number
of program headers, it creates a broken ELF dumpfile like this:

  # crash vmlinux dump.elf
  ...
  WARNING: possibly corrupt Elf64_Nhdr: n_namesz: 4185522176 n_descsz: 3 
n_type: f4000
  ...
  WARNING: cannot read linux_banner string
  crash: vmlinux and dump.elf do not match!

With this patch, if the actual number of program headers is PN_XNUM
or more, the e_phnum field of the ELF header is set to PN_XNUM, and
the actual number is set in the sh_info field of the section header
at index 0.

The section header is written just after the program headers, although
this order is not typical, for the sake of code simplicity.

Signed-off-by: Kazuhito Hagio 
---
 elf_info.c | 40 ---
 elf_info.h |  1 +
 makedumpfile.c | 73 --
 3 files changed, 91 insertions(+), 23 deletions(-)

diff --git a/elf_info.c b/elf_info.c
index 204bfbf69ae3..e9c267161bc2 100644
--- a/elf_info.c
+++ b/elf_info.c
@@ -123,8 +123,11 @@ check_elf_format(int fd, char *filename, int *phnum, 
unsigned int *num_load)
(*num_load) = 0;
if ((ehdr64.e_ident[EI_CLASS] == ELFCLASS64)
&& (ehdr32.e_ident[EI_CLASS] != ELFCLASS32)) {
-   (*phnum) = ehdr64.e_phnum;
-   for (i = 0; i < ehdr64.e_phnum; i++) {
+   if (!get_elf64_phnum(fd, filename, &ehdr64, phnum)) {
+   ERRMSG("Can't get phnum.\n");
+   return FALSE;
+   }
+   for (i = 0; i < (*phnum); i++) {
if (!get_elf64_phdr(fd, filename, i, )) {
ERRMSG("Can't find Phdr %d.\n", i);
return FALSE;
@@ -1035,6 +1038,34 @@ is_xen_memory(void)
return (flags_memory & MEMORY_XEN);
 }
 
+int
+get_elf64_phnum(int fd, char *filename, Elf64_Ehdr *ehdr, int *phnum)
+{
+   Elf64_Shdr shdr;
+
+   /*
+* Extended Numbering support
+* See include/uapi/linux/elf.h and elf(5) for more information.
+*/
+   if (ehdr->e_phnum == PN_XNUM) {
+   if (lseek(fd, ehdr->e_shoff, SEEK_SET) < 0) {
+   ERRMSG("Can't seek %s at 0x%lx. %s\n", filename,
+   ehdr->e_shoff, strerror(errno));
+   return FALSE;
+   }
+   if (read(fd, &shdr, ehdr->e_shentsize) != ehdr->e_shentsize) {
+   ERRMSG("Can't read %s at 0x%lx. %s\n", filename,
+   ehdr->e_shoff, strerror(errno));
+   return FALSE;
+   }
+
+   *phnum = shdr.sh_info;
+   } else
+   *phnum = ehdr->e_phnum;
+
+   return TRUE;
+}
+
 int
 get_phnum_memory(void)
 {
@@ -1047,7 +1078,10 @@ get_phnum_memory(void)
ERRMSG("Can't get ehdr64.\n");
return FALSE;
}
-   phnum = ehdr64.e_phnum;
if (!get_elf64_phnum(fd_memory, name_memory, &ehdr64, &phnum)) {
+   ERRMSG("Can't get phnum.\n");
+   return FALSE;
+   }
} else {/* ELF32 */
if (!get_elf32_ehdr(fd_memory, name_memory, )) {
ERRMSG("Can't get ehdr32.\n");
diff --git a/elf_info.h b/elf_info.h
index cd4ffa6feed3..934b60806a8b 100644
--- a/elf_info.h
+++ b/elf_info.h
@@ -54,6 +54,7 @@ int get_kcore_dump_loads(void);
 int is_elf64_memory(void);
 int is_xen_memory(void);
 
+int get_elf64_phnum(int fd, char *filename, Elf64_Ehdr *ehdr, int *phnum);
 int get_phnum_memory(void);
 int get_phdr_memory(int index, Elf64_Phdr *phdr);
 off_t get_offset_pt_load_memory(void);
diff --git a/makedumpfile.c b/makedumpfile.c
index 4a000112ba59..371c9a33b8ad 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -6862,10 +6862,17 @@ write_elf_header(struct cache_data *cd_header)
ERRMSG("Can't get ehdr64.\n");
goto out;
}
+
+   /* For PT_NOTE */
+   num_loads_dumpfile++;
+
/*
-* PT_NOTE(1) + PT_LOAD(1+)
+* Extended Numbering support
+* See include/uapi/linux/elf.h and elf(5) for more information.
 */
-   ehdr64.e_phnum = 1 + num_loads_dumpfile;
+   ehdr64.e_phnum = (num_loads_dumpfile >= PN_XNUM) ?
+   PN_XNUM : num_loads_dumpfile;
+
} else {/* ELF32 */
if (!get_elf32_ehdr(info->fd_memory,
info->name_memory, &ehdr32)) {
@@ -6878,20 +6885,6 @@ write_elf_header(struct ca

RE: makedumpfile: ELF format issues

2019-11-08 Thread Kazuhito Hagio


> -Original Message-
> > I dropped the ELF32 part from the crash patch, could you check this?
> > https://github.com/k-hagio/crash/tree/support-extended-elf.v2
> 
> Thanks Kazu -- the patch is queued for crash-7.2.8:
> 
>   
> https://github.com/crash-utility/crash/commit/c0bbd8fae4271159aee9e643350781909484c92f
> 
> Dave
> 

Thank you!

As for makedumpfile, I will take some more time to support refiltering
but it will not affect crash side.

Kazu

> 
> > This is for makedumpfile:
> > https://github.com/k-hagio/makedumpfile/tree/support-extended-elf.v2
> >
> > I will post the updated makedumpfile patch later for public review,
> > and would like to apply it next week.
> >
> > Thanks,
> > Kazu
> >
> >
> >
> 

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: makedumpfile: ELF format issues

2019-11-08 Thread Kazuhito Hagio
Hi Dave,

> -Original Message-
> > > It is possible that the issue occurs on general systems if they have
> > > large memory, so I'm going to proceed with those patches.
> >
> > Hi Kazu,
> >
> > Do you want me to go ahead with the crash utility patch?  It looks
> > safe enough to apply, and I did test it to make sure there were no
> > ill-effects with sample ELF dumpfiles.
> 
> Oh, thank you for your attention and testing.
> 
> I'm dropping the ELF32 parts of them, because I think they will not be
> used in the future.  (I estimate the theoretical minimum memory size
> that makedumpfile could use the extended numbering is 64GB+256MB on
> 4k page system.)

I dropped the ELF32 part from the crash patch, could you check this?
https://github.com/k-hagio/crash/tree/support-extended-elf.v2

This is for makedumpfile:
https://github.com/k-hagio/makedumpfile/tree/support-extended-elf.v2

I will post the updated makedumpfile patch later for public review,
and would like to apply it next week.

Thanks,
Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: makedumpfile: ELF format issues (RE: makedumpfile: Fix divide by zero in print_report()) (Kazuhito Hagio)

2019-11-07 Thread Kazuhito Hagio
Hi Dave,

> -Original Message-
> > > I think this will be the one of the causes, and had a look at how
> > > we can fix it.  If you get a vmcore where this pattern occurs,
> > > you can try this tree:
> > > https://github.com/k-hagio/makedumpfile/tree/support-extended-elf
> > >
> > > Then, the crash utility also needs a patch to support a dumpfile
> > > that has more than 64k program headers:
> > > https://github.com/k-hagio/crash/tree/support-extended-elf

> > It is possible that the issue occurs on general systems if they have
> > large memory, so I'm going to proceed with those patches.
> 
> Hi Kazu,
> 
> Do you want me to go ahead with the crash utility patch?  It looks
> safe enough to apply, and I did test it to make sure there were no
> ill-effects with sample ELF dumpfiles.

Oh, thank you for your attention and testing.

I'm dropping the ELF32 parts of them, because I think they will not be
used in the future.  (I estimate the theoretical minimum memory size
that makedumpfile could use the extended numbering is 64GB+256MB on
4k page system.)

I will let you know when it gets prepared.

Thanks!
Kazu
___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: makedumpfile: ELF format issues (RE: makedumpfile: Fix divide by zero in print_report())

2019-11-07 Thread Kazuhito Hagio
Hi,

> -Original Message-
> >  > > There are some other failure cases with non-null data, so maybe 
> > there's >1 bug here.
> >  > > I've not seen an obvious pattern to this. eg...
> >  > >
> >  > > https://pastebin.com/2uM4sBCF
> >  > >
> >  >
> >  > As for this case, I suspect that Elf64_Ehdr.e_phnum overflows
> >  > (i.e. num_loads_dumpfile > 65535):
> >
> > Oh, good catch.  These are 256GB machines, so after discarding
> > everything, that explains why we end up with so many sections.
> > This also explains why it sometimes works I think, when the discarding
> > manages to get the total nr headers <64k.

I also could reproduce this issue on a system with 192GB memory.
The note was actually overwritten by the following program headers.
-
num_loads_dumpfile=76318# more than 64k
ehdr64.e_phnum=10783# overflowed
note.p_offset=0x93708 .p_filesz=0x2958  # The note data is at 0x93708
note cd_header->offset=0x40
...
head->off= 90040 load.p_addr= 44552e000 .p_off=  ed270060 ...
   ^ # these headers overwrote the note data.
head->off= a0040 load.p_addr= 44563 .p_off=  ed272060 ...
...
The dumpfile is saved to dump.Ed25.devel.

makedumpfile Completed.

# readelf -a dump.Ed25.devel 
...
  Number of program headers: 10783
...
Displaying notes found at file offset 0x00093708 with length 0x2958:
  Owner Data size   Description
   0x0007   Unknown note type: (0xdbce6060)
   description data: 00 00 7a 39 fff2 ff8a 
# ../crash vmlinux dump.Ed25.devel

WARNING: possibly corrupt Elf64_Nhdr: n_namesz: 4185522176 n_descsz: 3 n_type: 
f4000
...
WARNING: cannot read linux_banner string
crash: vmlinux and dump.Ed25.devel do not match!
-

> I think this will be the one of the causes, and had a look at how
> we can fix it.  If you get a vmcore where this pattern occurs,
> you can try this tree:
> https://github.com/k-hagio/makedumpfile/tree/support-extended-elf
> 
> Then, the crash utility also needs a patch to support a dumpfile
> that has more than 64k program headers:
> https://github.com/k-hagio/crash/tree/support-extended-elf

These trees look to work well, though need more tests and tweaks.
-
# readelf -a dump.Ed25.test
...
  Number of program headers: 65535 (76319)  <<-- note + loads
...
Displaying notes found at file offset 0x00413748 with length 0x2958:
  Owner Data size   Description
  CORE 0x0150   NT_PRSTATUS (prstatus structure)
  CORE 0x0150   NT_PRSTATUS (prstatus structure)
  CORE 0x0150   NT_PRSTATUS (prstatus structure)
...
# ../crash-test vmlinux dump.Ed25.test

crash-test> help -D
vmcore_data: 
  flags: c0 (KDUMP_LOCAL|KDUMP_ELF64) 
   ndfd: 3
ofp: 3141560
header_size: 4284576
   num_pt_load_segments: 76318   <<-- loads
 pt_load_segment[0]:
-

It is possible that the issue occurs on general systems if they have
large memory, so I'm going to proceed with those patches.

Thanks,
Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile: Fix off-by-one issue in exclude_nodata_pages()

2019-11-04 Thread Kazuhito Hagio
Hi Mikhail,

> -Original Message-
> When building a dump bitmap (2nd bitmap) for the ELF dump, the last pfn
> of the cycle is always ignored in exclude_nodata_pages() function due to
> off-by-one error on cycle boundary check. Thus, the respective bit of
> the bitmap is never cleared.
> That can lead to the error when such a pfn should not be dumpable (e.g.
> the last pfn of the ELF-load of zero filesize). Based on the bit in the
> bitmap the page is treated as dumpable in write_elf_pages_cyclic() function
> and the follow on error is triggered in write_elf_load_segment() function
> due to the failing sanity check of paddr_to_offset2():

Good catch.  I could reproduce this issue with a vmcore on hand and
confirm that the patch fixed it.  Looks good to me, applied.

Thanks,
Kazu

> 
>$ makedumpfile -E dump.elf dump.elf.E
>Checking for memory holes : [100.0 %] |
>write_elf_load_segment: Can't convert physaddr(7000) to an offset.
>makedumpfile Failed.
> 
> Signed-off-by: Mikhail Zaslonko 
> ---
>  makedumpfile.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/makedumpfile.c b/makedumpfile.c
> index de0973f..4a00011 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -4740,7 +4740,7 @@ exclude_nodata_pages(struct cycle *cycle)
>   if (pfn < cycle->start_pfn)
>   pfn = cycle->start_pfn;
>   if (pfn_end >= cycle->end_pfn)
> - pfn_end = cycle->end_pfn - 1;
> + pfn_end = cycle->end_pfn;
>   while (pfn < pfn_end) {
>   clear_bit_on_2nd_bitmap(pfn, cycle);
>   ++pfn;
> --
> 2.17.1
> 
> 
> ___
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [Crash-utility] crash and makedumpfile with 5.3 missing memory in dump

2019-11-01 Thread Kazuhito Hagio
Hi,

> -Original Message-
> >> I'm trying to use crash to read a makedumpfile vmcore from 5.3, but I 
> >> always
> >> end up with an error when opening the dump.

As John mentioned, makedumpfile needs the following patch to work with
5.3 or later correctly.  This patch is in the devel branch now.
Please try the latest one.
https://sourceforge.net/p/makedumpfile/code/ci/7bdb468c2c99dd780c9a5321f93c79cbfdce2527/

Although the commit message says that it causes makedumpfile an error,
I saw some patterns later that makedumpfile created a broken dumpfile
without error.

(Usually I should have written "Required for kernel 5.3" in the commit message,
but forgot it at that time..)

Thanks,
Kazu

> >>
> >> I'm using the latest github crash
> >>
> >> crash 7.2.7++
> >> ...
> >>   crash: page excluded: kernel virtual address: 82110370  type:
> >>   "possible"
> >>   WARNING: cannot read cpu_possible_map
> >>   crash: page excluded: kernel virtual address: 82110360  type:
> >>   "present"
> >>   WARNING: cannot read cpu_present_map
> >>   crash: page excluded: kernel virtual address: 82110368  type:
> >>   "online"
> >>   WARNING: cannot read cpu_online_map
> >>   crash: page excluded: kernel virtual address: 82110358  type:
> >>   "active"
> >>   WARNING: cannot read cpu_active_map
> >>   crash: page excluded: kernel virtual address: 82011544  type:
> >>   "init_uts_ns"
> >>   crash: page excluded: kernel virtual address: 82110360  type:
> >>   "cpu_present_map"
> >>   crash: page excluded: kernel virtual address: 82110360  type:
> >>   "cpu_present_map"
> >>   WARNING: ORC unwinder: cannot read lookup_num_blocks
> >>   crash: seek error: kernel virtual address: 88822dffb000  type:
> >>   "memory section root table"
> >>
> >> The dump is created with the latest makedumpfile release
> >>
> >> makedumpfile: version 1.6.6 (released on 27 Jun 2019)
> >>
> >> It complains that it doesn't support the kernel
> >>
> >> Any ideas?
> >>
> >> -Andi
> >>



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: Makedumpfile help for 5.4.0.rc3 : Arm

2019-10-30 Thread Kazuhito Hagio


> -Original Message- 
> If there are corresponding kernel patches being considered to make
> makedumpfile work  on v8 and v8.x  systems I would be interesting in
> testing those also.
> 
> 
> makedumpfile should be able to work with a variety of VAbits as 48, or
> 52 settings.

Yes, Bhupesh will post his patches to do so.

The v3 patchset that was mentioned earlier is this:
https://lore.kernel.org/linux-arm-kernel/1553058574-18606-1-git-send-email-bhsha...@redhat.com/

We have to export things that we need and arm maintainers can export
to vmcoreinfo.

Thanks,
Kazu

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: Makedumpfile help for 5.4.0.rc3 : Arm

2019-10-30 Thread Kazuhito Hagio
Hi Bhupesh,

OK, thanks for letting me know that in your holidays. Have good ones.
I just thought that it might be good to share our understanding before
posting a kernel patch so that I can do something to support you if
need be.


Hi John,

You see that error with my test patch, which is only for testing though,
as I wrote below

> > And wrote a very draft patch, which works on a 5.4-rc4 kernel that
> > I modified to have NUMBER(vabits_actual) and NUMBER(MAX_PHYSMEM_BITS),

that patch requires a kernel fix to have these in vmcoreinfo.
Did you try it?

Thanks,
Kazu

> -Original Message-
> Hi Kazu,
> 
> Sorry for top posting, but I am on leave for Diwali Holidays and will
> return to the office in a couple of days.
> 
> I have the solution/patchset ready and I just need to finish the
> commit messages to make sure they are self-explanatory for upstream
> acceptance.
> 
> I will try to post them and also answer your email in detail when I
> return back from holidays.
> 
> Thanks for your patience.
> 
> Regards,
> Bhupesh
> 
> (Sent from my Android Phone)
> 
> On Tue, Oct 29, 2019 at 2:23 AM Kazuhito Hagio  wrote:
> >
> > Hi Bhupesh,
> >
> > > -Original Message-
> > > > > I am working on the changes in the vmcoreinfo after the 52-bit VA
> > > > > changes from Steve were accepted in Linux 5.4-rc1 ("Support for 52-bit
> > > > > virtual addressing in kernel space”).
> > > >
> > > >
> > > >I don’t see a commit with this title in linux-stable  ;  Could you 
> > > > be a little more specific what
> > > file  that was applied to ?
> > >
> > > You can have a look at the following commit (and dependencies) in
> > > Linus's tree 
> > > (git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git)
> > > which introduced 52-bit kernel VAs for arm64 architecture:
> > >
> > > commit b6d00d47e81a49f6cf462518c10408f37a3e6785
> > > Author: Steve Capper 
> > > Date:   Wed Aug 7 16:55:22 2019 +0100
> > >
> > > arm64: mm: Introduce 52-bit Kernel VAs
> >
> > Thank you for working on this.
> >
> > I've also had a look at them, and my understanding is that we need
> > in vmcoreinfo at least:
> >
> > - TCR_EL1.T1SZ (almost equals to vabits_actual) to determine:
> >   o PAGE_OFFSET
> >   o whether the kernel has the "flipped" linear map and the others.
> > We need a fix to __pa() with it. (see the patch below)
> >
> > - PA_BITS or MAX_PHYSMEM_BITS to determine:
> >   o whether SPARSEMEM_EXTREME or not, in is_sparsemem_extreme()
> >   o whether the kernel has 48-bit or 52-bit PA to switch
> > the calculation of pte-to-paddr. (this might be unnecessary?)
> >
> > Is this right?
> >
> > And wrote a very draft patch, which works on a 5.4-rc4 kernel that
> > I modified to have NUMBER(vabits_actual) and NUMBER(MAX_PHYSMEM_BITS),
> > with VA_BITS=48 config or VA_BITS=52 config running in 48-bit mode.
> > https://github.com/k-hagio/makedumpfile/commit/fd9d86ea05b38e9edbb8c0ac3ebd612d5d485df3
> >
> > (I don't intend to export them as they are, it's just for an experiment.
> > And no support for --mem-usage option, "real" 52-bit PA, and so on.)
> >
> > As for MAX_PHYSMEM_BITS, I don't stick to export it for all architectures,
> > although I told you that it would be better to do so in the past.
> > If it's hard to do so, it's fine with me to export it or something similar
> > only for arm64 for now..
> >
> > Your thoughts?
> >
> > Thanks,
> > Kazu
> >
> 

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: Makedumpfile help for 5.4.0.rc3 : Arm

2019-10-28 Thread Kazuhito Hagio
Hi Bhupesh,

> -Original Message-
> > > I am working on the changes in the vmcoreinfo after the 52-bit VA
> > > changes from Steve were accepted in Linux 5.4-rc1 ("Support for 52-bit
> > > virtual addressing in kernel space”).
> >
> >
> >I don’t see a commit with this title in linux-stable  ;  Could you be a 
> > little more specific what
> file  that was applied to ?
> 
> You can have a look at the following commit (and dependencies) in
> Linus's tree 
> (git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git)
> which introduced 52-bit kernel VAs for arm64 architecture:
> 
> commit b6d00d47e81a49f6cf462518c10408f37a3e6785
> Author: Steve Capper 
> Date:   Wed Aug 7 16:55:22 2019 +0100
> 
> arm64: mm: Introduce 52-bit Kernel VAs

Thank you for working on this.

I've also had a look at them, and my understanding is that we need
in vmcoreinfo at least:

- TCR_EL1.T1SZ (almost equals to vabits_actual) to determine:
  o PAGE_OFFSET
  o whether the kernel has the "flipped" linear map and the others.
We need a fix to __pa() with it. (see the patch below)

- PA_BITS or MAX_PHYSMEM_BITS to determine:
  o whether SPARSEMEM_EXTREME or not, in is_sparsemem_extreme()
  o whether the kernel has 48-bit or 52-bit PA to switch
the calculation of pte-to-paddr. (this might be unnecessary?)

Is this right?

And wrote a very draft patch, which works on a 5.4-rc4 kernel that
I modified to have NUMBER(vabits_actual) and NUMBER(MAX_PHYSMEM_BITS),
with VA_BITS=48 config or VA_BITS=52 config running in 48-bit mode.
https://github.com/k-hagio/makedumpfile/commit/fd9d86ea05b38e9edbb8c0ac3ebd612d5d485df3

(I don't intend to export them as they are, it's just for an experiment.
And no support for --mem-usage option, "real" 52-bit PA, and so on.)

As for MAX_PHYSMEM_BITS, I don't stick to export it for all architectures,
although I told you that it would be better to do so in the past.
If it's hard to do so, it's fine with me to export it or something similar
only for arm64 for now..

Your thoughts?

Thanks,
Kazu

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: makedumpfile: ELF format issues (RE: makedumpfile: Fix divide by zero in print_report())

2019-10-28 Thread Kazuhito Hagio
> -Original Message-
>  > > There are some other failure cases with non-null data, so maybe there's 
> >1 bug here.
>  > > I've not seen an obvious pattern to this. eg...
>  > >
>  > > https://pastebin.com/2uM4sBCF
>  > >
>  >
>  > As for this case, I suspect that Elf64_Ehdr.e_phnum overflows
>  > (i.e. num_loads_dumpfile > 65535):
> 
> Oh, good catch.  These are 256GB machines, so after discarding
> everything, that explains why we end up with so many sections.
> This also explains why it sometimes works I think, when the discarding
> manages to get the total nr headers <64k.

I think this will be the one of the causes, and had a look at how
we can fix it.  If you get a vmcore where this pattern occurs,
you can try this tree:
https://github.com/k-hagio/makedumpfile/tree/support-extended-elf

Then, the crash utility also needs a patch to support a dumpfile
that has more than 64k program headers:
https://github.com/k-hagio/crash/tree/support-extended-elf

Thanks,
Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: makedumpfile: ELF format issues (RE: makedumpfile: Fix divide by zero in print_report())

2019-10-17 Thread Kazuhito Hagio
> -Original Message-
> On Wed, Oct 09, 2019 at 08:03:51PM +, Kazuhito Hagio wrote:
> 
>  > In this case, was the "makedumpfile Completed." message emitted?
>  > It looks like the buffer of program headers was not written to the file..
>  >
>  > Anyway, a debugging patch attached below.
> 
> Our kdump tooling redirects makedumpfile output to dmesg, and unfortunately 
> this debug
> patch produces so much info it filled the ring buffer, so we didn't
> catch the beginning.

ah, if makedumpfile makes more than 64k program headers, the debug log
will be more than 8MB.  I should have told you this..

> I'll rework things so that it redirects to a file instead of dmesg, but
> it's going to take me a while to get that deployed and tested.

If your hosts have a big space enough, thare is another way that
you use cp for /proc/vmcore and use makedumpfile after reboot.
For example:

  # cp --sparse=always /proc/vmcore vmcore.cp
  reboot
  # makedumpfile -E -d 31 --message-level 31 --cyclic-buffer 4096 vmcore.cp 
dump.Ed31

where the --cyclic-buffer option is needed to behave as in the 2nd kernel
on one of your hosts:
  [   13.341818] Buffer size for the cyclic mode: 4194304

The captured vmcore.cp may be useful for trying a next patch first.

Thanks,
Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


makedumpfile: ELF format issues (RE: makedumpfile: Fix divide by zero in print_report())

2019-10-09 Thread Kazuhito Hagio
Hi Dave,

Thank you for the information.

> -Original Message-

> Common case seems to be:
> 
> ELF Header:
>   Magic:   7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00
>   Class: ELF64
>   Data:  2's complement, little endian
>   Version:   1 (current)
>   OS/ABI:UNIX - System V
>   ABI Version:   0
>   Type:  CORE (Core file)
>   Machine:   Advanced Micro Devices X86-64
>   Version:   0x1
>   Entry point address:   0x0
>   Start of program headers:  64 (bytes into file)
>   Start of section headers:  0 (bytes into file)
>   Flags: 0x0
>   Size of this header:   64 (bytes)
>   Size of program headers:   56 (bytes)
>   Number of program headers: 23881
>   Size of section headers:   0 (bytes)
>   Number of section headers: 0
>   Section header string table index: 0
> 
> There are no sections in this file.
> 
> There are no sections to group in this file.
> 
> Program Headers:
>   Type   Offset VirtAddr   PhysAddr
>  FileSizMemSiz  Flags  Align
>   NULL   0x 0x 0x
>  0x 0x 0
>   NULL   0x 0x 0x
>  0x 0x 0
>   NULL   0x 0x 0x
>  0x 0x 0
>   NULL   0x 0x 0x
>  0x 0x 0
>   NULL   0x 0x 0x
>  0x 0x 0
>   NULL   0x 0x 0x
>  0x 0x 0
>   NULL   0x 0x 0x
>  0x 0x 0
>   NULL   0x 0x 0x
>  0x 0x 0
>   NULL   0x 0x 0x
>  0x 0x 0
>   NULL   0x 0x 0x
> ... 
>   NULL   0x 0x 0x
>  0x 0x 0
>   NULL   0x 0x 0x
>  0x 0x 0
> 

In this case, was the "makedumpfile Completed." message emitted?
It looks like the buffer of program headers was not written to the file..

Anyway, a debugging patch attached below.

> There are some other failure cases with non-null data, so maybe there's >1 
> bug here.
> I've not seen an obvious pattern to this. eg...
> 
> https://pastebin.com/2uM4sBCF
> 

As for this case, I suspect that Elf64_Ehdr.e_phnum overflows
(i.e. num_loads_dumpfile > 65535):

 6851 /*
 6852  * Get the PT_LOAD number of the dumpfile.
 6853  */
 6854 if (!(num_loads_dumpfile = get_loads_dumpfile_cyclic())) {
 6855 ERRMSG("Can't get a number of PT_LOAD.\n");
 6856 goto out;
 6857 }
 6858 
 6859 if (is_elf64_memory()) { /* ELF64 */
 6860 if (!get_elf64_ehdr(info->fd_memory,
 6861 info->name_memory, &ehdr64)) {
 6862 ERRMSG("Can't get ehdr64.\n");
 6863 goto out;
 6864 }
 6865 /*
 6866  * PT_NOTE(1) + PT_LOAD(1+)
 6867  */
 6868 ehdr64.e_phnum = 1 + num_loads_dumpfile;

because e_phnum is uint16_t and the last LOAD of the dumpfile doesn't
reach up to the one of /proc/vmcore at all.

  LOAD   0x726029d4 0x88037ba1 0x00037ba1 <<-- 
paddr
 0x001c5000 0x004a9000  RWE0

[   12.743942] LOAD[ 6]1   408000 <<-- phys_end

If that is the case, it seems that we need to set it to PN_XNUM (0x)
and have an entry in section header table according to elf(5)..

> I'll put your patch on some of the affected hosts and see if this
> changes behaviour in any way.

If you can try the patch below, which includes the previous patch,
please show me:
- the debugging output of makedumpfile
- readelf -a vmcore
- ls -ls vmcore

Thanks,
Kazu


diff --git a/makedumpfile.c 

RE: makedumpfile: Fix divide by zero in print_report()

2019-10-07 Thread Kazuhito Hagio
> -Original Message-
> On Fri, Sep 27, 2019 at 08:39:04PM +, Kazuhito Hagio wrote:
>  > > -Original Message-
>  > > On Thu, Sep 26, 2019 at 06:41:48PM +, Kazuhito Hagio wrote:
>  > >
>  > >  > > -Original Message-
>  > >  > > If info->max_mapnr and pfn_memhole are equal, we divide by zero when
>  > >  > > trying determine the 'shrinking' value.
>  > >  > >
>  > >  > > On the system I saw this error, we arrived at this function with
>  > >  > > info->max_mapnr:0x0108 pfn_memhole:0x0108
>  > >  >
>  > >  > Thank you for the patch.
>  > >  > I suppose that you see the error with the -E option, right?
>  > >  >
>  > >  > It seems that the -E option has some problems with its statistics,
>  > >  > so I'm checking whether there is a better way to fix this.
>  > >
>  > > Yes, we use the -E option.
>  > > We manage to get useful info from the generated dump after this fix, so
>  > > it seems it only affects the statistics output.
>  >
>  > OK, the statistics in cyclic mode with the -E option is completely wrong
>  > but a possible fix is likely to affect the whole of cyclic processing, so
>  > I just cover the hole with your patch and leave the statistics problem as
>  > a known issue at this time.  I would revisit it when I have time.
>  >
>  > The patch was applied to the devel branch.
> 
> While this patch does avoid the divide by zero, some further analysis
> shows that there seems to be some deeper problem when we encounter this
> 'original pages = 0' situation.
> 
> Take a look at the attached output from makedumpfile.
> 
> Key part in the summary:
> 
> [  518.819690] Original pages  : 0x
> [  518.828894]   Excluded pages   : 0x03decd15
> [  518.838635] Pages filled with zero  : 0x000210ee
> [  518.849920] Non-private cache pages : 0x271a
> [  518.861218] Private cache pages : 0xda47
> [  518.872502] User process data pages : 0x03d6bdc8
> [  518.883786] Free pages  : 0x0004fcfe
> [  518.895070] Hwpoison pages  : 0x
> [  518.906356] Offline pages   : 0x
> [  518.917659]   Remaining pages  : 0xfc2132eb
> [  518.927398] Memory Hole : 0x0408
> 
> In this case, 'remaining pages' has gone negative which looks concerning.

This is the known issue that I wrote above and am looking for a safe fix.
How does this patch work?

--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -56,6 +56,9 @@ static void first_cycle(mdf_pfn_t start, mdf_pfn_t max, 
struct cycle *cycle)
if (cycle->end_pfn > max)
cycle->end_pfn = max;
 
+   if (cycle->start_pfn < start)
+   cycle->start_pfn = start;
+
cycle->exclude_pfn_start = 0;
cycle->exclude_pfn_end = 0;
 }
@@ -7595,6 +7598,9 @@ write_elf_pages_cyclic(struct cache_data *cd_header, 
struct cache_data *cd_page)
}
 
for (pfn = MAX(pfn_start, cycle.start_pfn); pfn < 
cycle.end_pfn; pfn++) {
+   if (info->flag_cyclic)
+   pfn_memhole--;
+
if (!is_dumpable(info->bitmap2, pfn, &cycle)) {
num_excluded++;
if ((pfn == pfn_end - 1) && frac_tail)

If it looks good, I'll look into its side effects further,
but might take some time..

> 
> And the crashdump seems corrupt:
> 
> 'crash' complains:
> WARNING: possibly corrupt Elf64_Nhdr: n_namesz: 2079035392 n_descsz: 3 
> n_type: 1000
> 
> vmcore-dmesg complains "Missing the log_buf symbol", even though the 
> makedumpfile log
> shows it was present at 822510a0
> 
> Readelf seems to think the notes sections are mangled.
> 
> # readelf -n vmcore
> 
> Displaying notes found at file offset 0x00015468 with length 0x556c:
>   Owner Data size   Description
>0x0007   Unknown note type: (0x727c79d4)
> readelf: vmcore: Warning: Corrupt note: name size is too big: 7beb9000
>   (NONE)   0x0003   Unknown note type: (0x1000)
> readelf: vmcore: Warning: Corrupt note: name size is too big: 55a000
>   (NONE)   0x   Unknown note type: (0x)
>   (NONE)   0x0001   Unknown note type: (0x0007)
> readelf: vmcore: Warning: note with invalid namesz and/or descsz

RE: makedumpfile: Fix divide by zero in print_report()

2019-09-27 Thread Kazuhito Hagio
> -Original Message-
> On Thu, Sep 26, 2019 at 06:41:48PM +, Kazuhito Hagio wrote:
> 
>  > > -Original Message-
>  > > If info->max_mapnr and pfn_memhole are equal, we divide by zero when
>  > > trying determine the 'shrinking' value.
>  > >
>  > > On the system I saw this error, we arrived at this function with
>  > > info->max_mapnr:0x0108 pfn_memhole:0x0108
>  >
>  > Thank you for the patch.
>  > I suppose that you see the error with the -E option, right?
>  >
>  > It seems that the -E option has some problems with its statistics,
>  > so I'm checking whether there is a better way to fix this.
> 
> Yes, we use the -E option.
> We manage to get useful info from the generated dump after this fix, so
> it seems it only affects the statistics output.

OK, the statistics in cyclic mode with the -E option is completely wrong
but a possible fix is likely to affect the whole of cyclic processing, so
I just cover the hole with your patch and leave the statistics problem as
a known issue at this time.  I would revisit it when I have time.

The patch was applied to the devel branch.

Thanks,
Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: makedumpfile: Fix divide by zero in print_report()

2019-09-26 Thread Kazuhito Hagio
Hi Dave,

> -Original Message-
> If info->max_mapnr and pfn_memhole are equal, we divide by zero when
> trying determine the 'shrinking' value.
> 
> On the system I saw this error, we arrived at this function with
> info->max_mapnr:0x0108 pfn_memhole:0x0108

Thank you for the patch.
I suppose that you see the error with the -E option, right?

It seems that the -E option has some problems with its statistics,
so I'm checking whether there is a better way to fix this.

Thanks,
Kazu

> 
> Change the code to only print out the shrinking value if it makes sense.
> 
> Signed-off-by: Dave Jones 
> 
> diff -wbBdu -urN makedumpfile-1.6.6/makedumpfile.c 
> src/makedumpfile-code/makedumpfile.c
> --- makedumpfile-1.6.6/makedumpfile.c 2019-06-27 08:42:40.0 -0400
> +++ makedumpfile-code/makedumpfile.c  2019-09-24 15:28:06.456549495 -0400
> @@ -9778,8 +9778,6 @@
> 
>   pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private
>   + pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
> - shrinking = (pfn_original - pfn_excluded) * 100;
> - shrinking = shrinking / pfn_original;
> 
>   REPORT_MSG("\n");
>   REPORT_MSG("Original pages  : 0x%016llx\n", pfn_original);
> @@ -9794,8 +9792,13 @@
>   REPORT_MSG("Offline pages   : 0x%016llx\n", pfn_offline);
>   REPORT_MSG("  Remaining pages  : 0x%016llx\n",
>   pfn_original - pfn_excluded);
> - REPORT_MSG("  (The number of pages is reduced to %lld%%.)\n",
> - shrinking);
> +
> + if (pfn_original != 0) {
> + shrinking = (pfn_original - pfn_excluded) * 100;
> + shrinking = shrinking / pfn_original;
> + REPORT_MSG("  (The number of pages is reduced to %lld%%.)\n",
> + shrinking);
> + }
>   REPORT_MSG("Memory Hole : 0x%016llx\n", pfn_memhole);
>   REPORT_MSG("--\n");
>   REPORT_MSG("Total pages : 0x%016llx\n", info->max_mapnr);



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [RFC PATCH] makedumpfile: exclude ZFS file cache pages

2019-08-06 Thread Kazuhito Hagio
Hi Don,

> -Original Message-
> From: Don Brady 
> 
> ZFS caches pages for file data in its Adaptive Replacement Cache (ARC).
> This cache is separate from the VFS page cache.  The amount of data
> cached can be significant and it would be ideal to exclude it from the
> crashdump file.  ZFS can tag these pages so they are easily identifiable
> from within makedumpfile.
> 
> ref https://github.com/zfsonlinux/zfs/pull/8899/files
> 
> Below is a suggested patch that can work in tandem with the above ZFS
> changes to exclude the ZFS ARC file data pages from a dump file.

Thank you for the patch.  The idea looks simple and good to me.

From makedumpfile's viewpoint, it would be better to be separated from
the dump level (DL_EXCLUDE_CACHE_PRI) because there may be a case that
we want to exclude the ZFS ARC pages only, it can exclude unexpected pages
by accident, and a similar request to exclude private pages that have
another tag value can happen.

So what about adding a new option to specify a tag value and exclude
the pages that have it?  For example:

  # makedumpfile -l -d 1 --private-page-filter 0x2F5ABDF11ECAC4E

For simplicity, it would be enough to accept only a single value, not
multiple values/times for now.

Thanks,
Kazu

> 
> Signed-off-by: Don Brady 
> ---
>  makedumpfile.c | 36 ++--
>  1 file changed, 34 insertions(+), 2 deletions(-)
> 
> diff --git a/makedumpfile.c b/makedumpfile.c
> index d76a435..b760934 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -85,6 +85,7 @@ mdf_pfn_t pfn_zero;
>  mdf_pfn_t pfn_memhole;
>  mdf_pfn_t pfn_cache;
>  mdf_pfn_t pfn_cache_private;
> +mdf_pfn_t pfn_zfs_arc_pages;
>  mdf_pfn_t pfn_user;
>  mdf_pfn_t pfn_free;
>  mdf_pfn_t pfn_hwpoison;
> @@ -282,6 +283,20 @@ is_cache_page(unsigned long flags)
>   return FALSE;
>  }
> 
> +#define  ZFS_ABD_FILE_CACHE  0x2F5ABDF11ECAC4E
> +
> +static int
> +is_zfs_cache_page(unsigned long flags, unsigned long private)
> +{
> + /*
> +  * ZFS cached file data resides in pages with a private tag
> +  */
> + if (isPrivate(flags) && private == ZFS_ABD_FILE_CACHE)
> + return TRUE;
> +
> + return FALSE;
> +}
> +
>  static inline unsigned long
>  calculate_len_buf_out(long page_size)
>  {
> @@ -6048,6 +6063,13 @@ __exclude_unnecessary_pages(unsigned long mem_map,
>   else
>   pfn_counter = _cache;
>   }
> + /*
> +  * Exclude ZFS ARC pages
> +  */
> + else if ((info->dump_level & DL_EXCLUDE_CACHE_PRI)
> + && is_zfs_cache_page(flags, private)) {
> + pfn_counter = _zfs_arc_pages;
> + }
>   /*
>* Exclude the data page of the user process.
>*  - anonymous pages
> @@ -7551,6 +7573,7 @@ write_elf_pages_cyclic(struct cache_data *cd_header, 
> struct cache_data *cd_page)
>   if (info->flag_cyclic) {
>   pfn_zero = pfn_cache = pfn_cache_private = 0;
>   pfn_user = pfn_free = pfn_hwpoison = pfn_offline = 0;
> + pfn_zfs_arc_pages = 0;
>   pfn_memhole = info->max_mapnr;
>   }
> 
> @@ -8833,6 +8856,7 @@ write_kdump_pages_and_bitmap_cyclic(struct cache_data 
> *cd_header, struct cache_d
>*/
>   pfn_zero = pfn_cache = pfn_cache_private = 0;
>   pfn_user = pfn_free = pfn_hwpoison = pfn_offline = 0;
> + pfn_zfs_arc_pages = 0;
>   pfn_memhole = info->max_mapnr;
> 
>   /*
> @@ -9777,7 +9801,8 @@ print_report(void)
>   pfn_original = info->max_mapnr - pfn_memhole;
> 
>   pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private
> - + pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
> + + pfn_user + pfn_free + pfn_hwpoison + pfn_offline
> + + pfn_zfs_arc_pages;
>   shrinking = (pfn_original - pfn_excluded) * 100;
>   shrinking = shrinking / pfn_original;
> 
> @@ -9788,6 +9813,9 @@ print_report(void)
>   REPORT_MSG("Non-private cache pages : 0x%016llx\n", pfn_cache);
>   REPORT_MSG("Private cache pages : 0x%016llx\n",
>   pfn_cache_private);
> + if (pfn_zfs_arc_pages != 0)
> + REPORT_MSG("ZFS ARC file data pages : 0x%016llx\n",
> + pfn_zfs_arc_pages);
>   REPORT_MSG("User process data pages : 0x%016llx\n", pfn_user);
>   REPORT_MSG("Free pages  : 0x%016llx\n", pfn_free);
>   REPORT_MSG("Hwpoison pages  : 0x%016llx\n", pfn_hwpoison);
> @@ -9819,7 +9847,8 @@ print_mem_usage(void)
>   pfn_original = info->max_mapnr - pfn_memhole;
> 
>   pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private
> - + pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
> + + pfn_user + pfn_free + pfn_hwpoison + pfn_offline
> + + pfn_zfs_arc_pages;
>   shrinking = (pfn_original - 

RE: [RFC] makedumpfile: exclude ZFS file cache pages

2019-07-22 Thread Kazuhito Hagio
> -Original Message-
> From: kexec  On Behalf Of Donald Brady
> Sent: Friday, July 19, 2019 3:44 PM
> To: kexec@lists.infradead.org
> Subject: [RFC] makedumpfile: exclude ZFS file cache pages
> 
> Hello,
> 
> I have for consideration a makedumpfile patch that can exclude the ZFS
> file data cache pages. This is similar in capability as the exclusion
> of the VFS page cache pages.
> 
> My question to this list is how to proceed?  Should I post a patch or
> should I first explain how the ZFS pages are being marked and see if
> others here can recommend a better approach?

Hi Don,

Either is OK, but I don't have any knowledge about ZFS implementation
so the explanation of that and what your patch is going to do will be
helpful at least for me to review/discuss it. And as you say, someone
here may be able to help.

Thanks,
Kazu

> 
> Long term, ZFS should participate in the VFS page cache. However, that
> is a significant change and won't be available in the foreseeable
> future. There is an immediate benefit from excluding ZFS file cache
> pages that will benefit crash dump users that use ZFS.
> 
> Thanks in advance for your help!
> -Don
> 
> --
> DON BRADY
> Staff Engineer
> Delphix.com
> 
> ___
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[ANNOUNCE] makedumpfile 1.6.6 is released

2019-06-27 Thread Kazuhito Hagio
Hello,

I'm pleased to announce the release of makedumpfile 1.6.6.
Your comments/patches are welcome.

Main new features:
o Support for AMD Secure Memory Encryption
o Exclude pages that are logically offline
o Support new kernels
  - The supported kernel is updated to 5.1.9 in this version.

Changelog since v1.6.5:
0a8b504102db [v1.6.6] Update version (Kazuhito Hagio)
8c21fc7e7c52 [PATCH] Support newer kernels up to v5.1 (Kazuhito Hagio)
3222d4ad04c6 [PATCH] x86_64: fix get_kaslr_offset_x86_64() to return 
kaslr_offset correctly (Kazuhito Hagio)
d222b01e516b [PATCH] x86_64: Add support for AMD Secure Memory Encryption 
(Lianbo Jiang)
1743c7370868 [PATCH] exclude pages that are logically offline (David 
Hildenbrand)
feee755900e0 [PATCH] ppc64: fix a typo for checking the file pointer for null 
(Nisha Parrakat)
2f007b48c581 [PATCH v2] honor the CFLAGS from environment variables (Kairui 
Song)
b9da17259ef5 [PATCH] Some improvements of debugging messages (Kazuhito Hagio)
f349b51f6211 [PATCH] ppc64: increase MAX_PHYSMEM_BITS to 2PB (Hari Bathini)

Explanation of makedumpfile:
  To shorten the size of the dumpfile and the time of creating the
  dumpfile, makedumpfile copies only the necessary pages for analysis
  to the dumpfile from /proc/vmcore. You can specify the kind of
  unnecessary pages with dump_level. If you want to shorten the size
  further, enable the compression of the page data.

Download:
  The latest makedumpfile can be downloaded from the following URL.
  https://sourceforge.net/projects/makedumpfile/

Thanks,
Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[ANNOUNCE] makedumpfile 1.6.6 preparation

2019-06-13 Thread Kazuhito Hagio
Hi,

I am planning to release makedumpfile v1.6.6 in the next two weeks.
It will support newer kernels up to v5.1.

If you have any patches that you would like included in v1.6.6,
please send them within one week.

For reference the patches since v1.6.5 are as follows:

8c21fc7e7c52 [PATCH] Support newer kernels up to v5.1
3222d4ad04c6 [PATCH] x86_64: fix get_kaslr_offset_x86_64() to return 
kaslr_offset correctly
d222b01e516b [PATCH] x86_64: Add support for AMD Secure Memory Encryption
1743c7370868 [PATCH] exclude pages that are logically offline
feee755900e0 [PATCH] ppc64: fix a typo for checking the file pointer for null
2f007b48c581 [PATCH v2] honor the CFLAGS from environment variables
b9da17259ef5 [PATCH] Some improvements of debugging messages
f349b51f6211 [PATCH] ppc64: increase MAX_PHYSMEM_BITS to 2PB

and "[PATCH] makedumpfile/arm64: fix get_kaslr_offset_arm64() to return
kaslr_offset correctly" I posted, if no objection to it.

Thanks,
Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH] makedumpfile/arm64: fix get_kaslr_offset_arm64() to return kaslr_offset correctly

2019-06-04 Thread Kazuhito Hagio
Currently, the get_kaslr_offset_arm64() function has the following
condition to return info->kaslr_offset, but kernel text mapping is
placed in another range on arm64 by default, so it returns 0 for
kernel text addresses.

if (vaddr >= __START_KERNEL_map &&
vaddr < __START_KERNEL_map + info->kaslr_offset)

Consequently, kernel text symbols in erase config are resolved wrongly
with KASLR enabled vmcore, and makedumpfile erases unintended data.

Since the return value of get_kaslr_offset_arm64() is used in
resolve_config_entry() only, and in that case, we must have a vmlinux,
so get the addresses of _text and _end from vmlinux and use them.

Signed-off-by: Kazuhito Hagio 
---
 arch/arm64.c   | 24 ++--
 makedumpfile.h |  1 -
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/arm64.c b/arch/arm64.c
index 5fcf59d..a61d96f 100644
--- a/arch/arm64.c
+++ b/arch/arm64.c
@@ -215,6 +215,8 @@ get_kaslr_offset_arm64(unsigned long vaddr)
 {
unsigned int i;
char buf[BUFSIZE_FGETS], *endp;
+   static unsigned long _text = NOT_FOUND_SYMBOL;
+   static unsigned long _end = NOT_FOUND_SYMBOL;
 
if (!info->kaslr_offset && info->file_vmcoreinfo) {
if (fseek(info->file_vmcoreinfo, 0, SEEK_SET) < 0) {
@@ -237,9 +239,27 @@ get_kaslr_offset_arm64(unsigned long vaddr)
}
}
}
+   if (!info->kaslr_offset)
+   return 0;
+
+   if (_text == NOT_FOUND_SYMBOL) {
+   /*
+* Currently, the return value of this function is used in
+* resolve_config_entry() only, and in that case, we must
+* have a vmlinux.
+*/
+   if (info->name_vmlinux) {
+   _text = get_symbol_addr("_text");
+   _end = get_symbol_addr("_end");
+   }
+   DEBUG_MSG("_text: %lx, _end: %lx\n", _text, _end);
+   if (_text == NOT_FOUND_SYMBOL || _end == NOT_FOUND_SYMBOL) {
+   ERRMSG("Cannot determine _text and _end address\n");
+   return FALSE;
+   }
+   }
 
-   if (vaddr >= __START_KERNEL_map &&
-   vaddr < __START_KERNEL_map + info->kaslr_offset) {
+   if (_text <= vaddr && vaddr <= _end) {
DEBUG_MSG("info->kaslr_offset: %lx\n", info->kaslr_offset);
return info->kaslr_offset;
} else {
diff --git a/makedumpfile.h b/makedumpfile.h
index b1176b7..bd60acc 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -542,7 +542,6 @@ do { \
 #ifdef __aarch64__
 unsigned long get_kvbase_arm64(void);
 #define KVBASE get_kvbase_arm64()
-#define __START_KERNEL_map (0x8000UL)
 
 #endif /* aarch64 */
 
-- 
2.18.1



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH v3 1/3] arm64, vmcoreinfo : Append 'PTRS_PER_PGD' to vmcoreinfo

2019-04-05 Thread Kazuhito Hagio
Hi James,

Thank you for your comment.

-Original Message-
> Hi Kazu,
> 
> On 27/03/2019 16:07, Kazuhito Hagio wrote:
> > On 3/26/2019 12:36 PM, James Morse wrote:
> >> On 20/03/2019 05:09, Bhupesh Sharma wrote:
> >>> With ARMv8.2-LVA architecture extension availability, arm64 hardware
> >>> which supports this extension can support a virtual address-space upto
> >>> 52-bits.
> >>>
> >>> Since at the moment we enable the support of this extension in kernel
> >>> via CONFIG flags, e.g.
> >>>  - User-space 52-bit LVA via CONFIG_ARM64_USER_VA_BITS_52
> >>>
> >>> so, there is no clear mechanism in the user-space right now to
> >>> determine these CONFIG flag values and hence determine the maximum
> >>> virtual address space supported by the underlying kernel.
> >>>
> >>> User-space tools like 'makedumpfile' therefore are broken currently
> >>> as they have no proper method to calculate the 'PTRS_PER_PGD' value
> >>> which is required to perform a page table walk to determine the
> >>> physical address of a corresponding virtual address found in
> >>> kcore/vmcoreinfo.
> >>>
> >>> If one appends 'PTRS_PER_PGD' number to vmcoreinfo for arm64,
> >>> it can be used in user-space to determine the maximum virtual address
> >>> supported by underlying kernel.
> >>
> >> I don't think this really solves the problem, it feels fragile.
> >>
> >> I can see how vmcoreinfo tells you VA_BITS==48, PAGE_SIZE==64K and 
> >> PTRS_PER_PGD=1024.
> >> You can use this to work out that the top level page table size isn't 
> >> consistent with a
> >> 48bit VA, so 52bit VA must be in use...
> >>
> >> But wasn't your problem walking the kernel page tables? In particular the 
> >> offset that we
> >> apply because the tables were based on a 48bit VA shifted up in 
> >> swapper_pg_dir.
> >>
> >> Where does the TTBR1_EL1 offset come from with this property? I assume 
> >> makedumpfile
> >> hard-codes it when it sees 52bit is in use ... somewhere.
> 
> > My understanding is that the TTBR1_EL1 offset comes from a kernel
> > virtual address with the exported PTRS_PER_PGD.
> >
> > With T1SZ is 48bit and T0SZ is 52bit,
> 
> (PTRS_PER_PGD doesn't tell you this, PTRS_PER_PGD lets you spot something odd 
> is
> happening, and this just happens to be the only odd combination today.)

I didn't intend to guess other things from PTRS_PER_PGD.

> > kva = 0xffff000000000000  <--- start of kernel virtual address
> 
> Does makedumpfile have this value? If the kernel were using 52bit VA for 
> TTBR1 this value
> would be different.

This was an example address to show that pgd_index() automatically returns
a value including the offset for any kernel virtual address by the exported
PTRS_PER_PGD. In this case, even for the first virtual address, it returns
the non-zero value, which is the offset. (sorry for the poor explanation..)

So makedumpfile doesn't need the start address specifically to walk the page
tables, and I was thinking that exporting PTRS_PER_PGD may be stable unless
pgd_index() doesn't change.

> > pgd_index(kva) = (kva >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)
> >                = (0xffff000000000000 >> 42) & (1024 - 1)
> >                = 0x003fffc0 & 0x3ff
> >                = 0x3c0  <--- the offset (0x3c0) is included
> >
> > This is what kernel does now, so makedumpfile also wants to do.
> 
> Sure, and it would work today. I'm worried about tomorrow, where we support 
> something new,
> and need to bundle new information out through vmcoreinfo. This ends up being 
> used to
> fingerprint the kernel support, instead of as the value it was intended to be.

Yes, more stable and reasonable way is preferable.

> >> We haven't solved the problem!
> >>
> >> Today __cpu_setup() sets T0SZ and T1SZ differently for 52bit VA, but in 
> >> the future it
> >> could set them the same, or different the other-way-round.
> >>
> >> Will makedumpfile using this value keep working once T1SZ is 52bit VA too? 
> >> In this case
> >> there would be no ttbr offset.
> >
> > If T1SZ is 52bit, probably kernel virtual address starts from 
> > 0xfff0000000000000,
> 
> I didn't think this 'bottom of the ttbr1 mapping range' value was exposed 
> anywhere.
> Where can user-space get this from? (I can't see it in the vmcoreinfo list)
> 
> 
> > then the offset becomes 

RE: [PATCH v3 1/3] arm64, vmcoreinfo : Append 'PTRS_PER_PGD' to vmcoreinfo

2019-03-27 Thread Kazuhito Hagio
On 3/26/2019 12:36 PM, James Morse wrote:
> Hi Bhupesh,
> 
> On 20/03/2019 05:09, Bhupesh Sharma wrote:
> > With ARMv8.2-LVA architecture extension availability, arm64 hardware
> > which supports this extension can support a virtual address-space upto
> > 52-bits.
> >
> > Since at the moment we enable the support of this extension in kernel
> > via CONFIG flags, e.g.
> >  - User-space 52-bit LVA via CONFIG_ARM64_USER_VA_BITS_52
> >
> > so, there is no clear mechanism in the user-space right now to
> > determine these CONFIG flag values and hence determine the maximum
> > virtual address space supported by the underlying kernel.
> >
> > User-space tools like 'makedumpfile' therefore are broken currently
> > as they have no proper method to calculate the 'PTRS_PER_PGD' value
> > which is required to perform a page table walk to determine the
> > physical address of a corresponding virtual address found in
> > kcore/vmcoreinfo.
> >
> > If one appends 'PTRS_PER_PGD' number to vmcoreinfo for arm64,
> > it can be used in user-space to determine the maximum virtual address
> > supported by underlying kernel.
> 
> I don't think this really solves the problem, it feels fragile.
> 
> I can see how vmcoreinfo tells you VA_BITS==48, PAGE_SIZE==64K and 
> PTRS_PER_PGD=1024.
> You can use this to work out that the top level page table size isn't 
> consistent with a
> 48bit VA, so 52bit VA must be in use...
> 
> But wasn't your problem walking the kernel page tables? In particular the 
> offset that we
> apply because the tables were based on a 48bit VA shifted up in 
> swapper_pg_dir.
> 
> Where does the TTBR1_EL1 offset come from with this property? I assume 
> makedumpfile
> hard-codes it when it sees 52bit is in use ... somewhere.

My understanding is that the TTBR1_EL1 offset comes from a kernel
virtual address with the exported PTRS_PER_PGD.

With T1SZ is 48bit and T0SZ is 52bit,

kva = 0xffff000000000000  <--- start of kernel virtual address
pgd_index(kva) = (kva >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)
   = (0xffff000000000000 >> 42) & (1024 - 1)
   = 0x003fffc0 & 0x3ff
   = 0x3c0  <--- the offset (0x3c0) is included

This is what kernel does now, so makedumpfile also wants to do.

> We haven't solved the problem!
> 
> Today __cpu_setup() sets T0SZ and T1SZ differently for 52bit VA, but in the 
> future it
> could set them the same, or different the other-way-round.
> 
> Will makedumpfile using this value keep working once T1SZ is 52bit VA too? In 
> this case
> there would be no ttbr offset.

If T1SZ is 52bit, probably kernel virtual address starts from 
0xfff0000000000000,
then the offset becomes 0 with the pgd_index() above.
I think makedumpfile will keep working with that.

Thanks,
Kazu

> 
> If you need another vmcoreinfo flag once that happens, we've done something 
> wrong here.
> 
> (Not to mention what happens if the TTBR1_EL1 uses 52bit va, but TTBR0_EL1 
> doesn't)
> 
> 
> > Suggested-by: Steve Capper 
> 
> (CC: +Steve)
> 
> 
> Thanks,
> 
> James

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH v2 2/2] crash_core, vmcoreinfo: Append 'MAX_PHYSMEM_BITS' to vmcoreinfo

2019-03-13 Thread Kazuhito Hagio
Hi Bhupesh,

-Original Message-
> Right now user-space tools like 'makedumpfile' and 'crash' need to rely
> on a best-guess method of determining value of 'MAX_PHYSMEM_BITS'
> supported by underlying kernel.
> 
> This value is used in user-space code to calculate the bit-space
> required to store a section for SPARESMEM (similar to the existing
> calculation method used in the kernel implementation):
> 
>   #define SECTIONS_SHIFT(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
> 
> Now, regressions have been reported in user-space utilities
> like 'makedumpfile' and 'crash' on arm64, with the recently added
> kernel support for 52-bit physical address space, as there is
> no clear method of determining this value in user-space
> (other than reading kernel CONFIG flags).
> 
> As per suggestion from makedumpfile maintainer (Kazu), it makes more
> sense to append 'MAX_PHYSMEM_BITS' to vmcoreinfo in the core code itself
> rather than in arch-specific code, so that the user-space code for other
> archs can also benefit from this addition to the vmcoreinfo and use it
> as a standard way of determining 'SECTIONS_SHIFT' value in user-land.
> 
> A reference 'makedumpfile' implementation which reads the
> 'MAX_PHYSMEM_BITS' value from vmcoreinfo in a arch-independent fashion
> is available here:
> 
> [0]. 
> https://github.com/bhupesh-sharma/makedumpfile/blob/remove-max-phys-mem-bit-v1/arch/ppc64.c#L471
> 
> Cc: Boris Petkov 
> Cc: Ingo Molnar 
> Cc: Thomas Gleixner 
> Cc: James Morse 
> Cc: Will Deacon 
> Cc: Michael Ellerman 
> Cc: Paul Mackerras 
> Cc: Benjamin Herrenschmidt 
> Cc: Dave Anderson 
> Cc: Kazuhito Hagio 
> Cc: x...@kernel.org
> Cc: linuxppc-...@lists.ozlabs.org
> Cc: linux-arm-ker...@lists.infradead.org
> Cc: linux-ker...@vger.kernel.org
> Cc: kexec@lists.infradead.org
> Signed-off-by: Bhupesh Sharma 
> ---
>  kernel/crash_core.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 093c9f917ed0..44b90368e183 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -467,6 +467,7 @@ static int __init crash_save_vmcoreinfo_init(void)
>  #define PAGE_OFFLINE_MAPCOUNT_VALUE  (~PG_offline)
>   VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
>  #endif
> + VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);

Some architectures define MAX_PHYSMEM_BITS only with CONFIG_SPARSEMEM,
so we need to move this to the #ifdef section that exports some
mem_section things.

Thanks!
Kazu

> 
>   arch_crash_save_vmcoreinfo();
>   update_vmcoreinfo_note();
> --
> 2.7.4
> 



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH v3] Remove the memory encryption mask to obtain the true physical address

2019-03-11 Thread Kazuhito Hagio
-Original Message-
> >> [PATCH v3] Remove the memory encryption mask to obtain the true physical 
> >> address
> >
> > I forgot to comment on the subject and the commit log..
> > I'll change this to
> >
> >   x86_64: Add support for AMD Secure Memory Encryption
> >
> > On 1/29/2019 9:48 PM, Lianbo Jiang wrote:
> >> For AMD machine with SME feature, if SME is enabled in the first
> >> kernel, the crashed kernel's page table(pgd/pud/pmd/pte) contains
> >> the memory encryption mask, so makedumpfile needs to remove the
> >> memory encryption mask to obtain the true physical address.
> >
> > I added a few official words from some documents:
> > ---
> > On AMD machine with Secure Memory Encryption (SME) feature, if SME is
> > enabled, page tables contain a specific attribute bit (C-bit) in their
> > entries to indicate whether a page is encrypted or unencrypted.
> >
> > So get NUMBER(sme_mask) from vmcoreinfo, which stores the value of
> > the C-bit position, and drop it to obtain the true physical address.
> > ---
> >
> > If these are OK, I'll modify them when merging, so you don't need
> > to repost.
> >
> 
> It's fine to me. Thank you, Kazu.
> 
> Regards,
> Lianbo
> 
> > And, I'm thinking to merge this after the kernel patch gets merged
> > into the mainline.

Hi Lianbo,

I found your patch upstream. Applied to the devel branch.

Thank you!
Kazu


> >
> > Thanks for your work.
> > Kazu
> >
> >>
> >> Signed-off-by: Lianbo Jiang 
> >> ---
> >> Changes since v1:
> >> 1. Merge them into a patch.
> >> 2. The sme_mask is not an enum number, remove it.
> >> 3. Sanity check whether the sme_mask is in vmcoreinfo.
> >> 4. Deal with the huge pages case.
> >> 5. Cover the 5-level path.
> >>
> >> Changes since v2:
> >> 1. Change the sme_me_mask to entry_mask.
> >> 2. No need to remove the mask when makedumpfile prints out the
> >>value of the entry.
> >> 3. Remove the sme mask from the pte at the end of the __vtop4_x86_64().
> >> 4. Also need to remove the sme mask from page table entry in
> >>find_vmemmap_x86_64()
> >>
> >>  arch/x86_64.c  | 30 +++---
> >>  makedumpfile.c |  4 
> >>  makedumpfile.h |  1 +
> >>  3 files changed, 24 insertions(+), 11 deletions(-)
> >>
> >> diff --git a/arch/x86_64.c b/arch/x86_64.c
> >> index 537fb78..9977466 100644
> >> --- a/arch/x86_64.c
> >> +++ b/arch/x86_64.c
> >> @@ -291,6 +291,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> >> pagetable)
> >>unsigned long page_dir, pgd, pud_paddr, pud_pte, pmd_paddr, pmd_pte;
> >>unsigned long pte_paddr, pte;
> >>unsigned long p4d_paddr, p4d_pte;
> >> +  unsigned long entry_mask = ENTRY_MASK;
> >>
> >>/*
> >> * Get PGD.
> >> @@ -302,6 +303,9 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> >> pagetable)
> >>return NOT_PADDR;
> >>}
> >>
> >> +  if (NUMBER(sme_mask) != NOT_FOUND_NUMBER)
> >> +  entry_mask &= ~(NUMBER(sme_mask));
> >> +
> >>if (check_5level_paging()) {
> >>page_dir += pgd5_index(vaddr) * sizeof(unsigned long);
> >>if (!readmem(PADDR, page_dir, , sizeof pgd)) {
> >> @@ -318,7 +322,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> >> pagetable)
> >>/*
> >> * Get P4D.
> >> */
> >> -  p4d_paddr  = pgd & ENTRY_MASK;
> >> +  p4d_paddr  = pgd & entry_mask;
> >>p4d_paddr += p4d_index(vaddr) * sizeof(unsigned long);
> >>if (!readmem(PADDR, p4d_paddr, _pte, sizeof p4d_pte)) {
> >>ERRMSG("Can't get p4d_pte (p4d_paddr:%lx).\n", 
> >> p4d_paddr);
> >> @@ -331,7 +335,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> >> pagetable)
> >>ERRMSG("Can't get a valid p4d_pte.\n");
> >>return NOT_PADDR;
> >>}
> >> -  pud_paddr  = p4d_pte & ENTRY_MASK;
> >> +  pud_paddr  = p4d_pte & entry_mask;
> >>}else {
> >>page_dir += pgd_index(vaddr) * sizeof(unsigned long);
> >>if (!readmem(PADDR, page_dir, , sizeof pgd)) {
> >> @@ -345,7 +349,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> >> pagetable)
> >>ERRMSG("Can't get a valid pgd.\n");
> >>return NOT_PADDR;
> >>}
> >> -  pud_paddr  = pgd & ENTRY_MASK;
> >> +  pud_paddr  = pgd & entry_mask;
> >>}
> >>
> >>/*
> >> @@ -364,13 +368,13 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> >> pagetable)
> >>return NOT_PADDR;
> >>}
> >>if (pud_pte & _PAGE_PSE)/* 1GB pages */
> >> -  return (pud_pte & ENTRY_MASK & PUD_MASK) +
> >> +  return (pud_pte & entry_mask & PUD_MASK) +
> >>(vaddr & ~PUD_MASK);
> >>
> >>/*
> >> * Get PMD.
> >> */
> >> -  pmd_paddr  = pud_pte & ENTRY_MASK;
> >> +  pmd_paddr  = pud_pte & entry_mask;
> >>pmd_paddr += pmd_index(vaddr) * sizeof(unsigned long);
> >>if (!readmem(PADDR, 

RE: [PATCH v2] makedumpfile: exclude pages that are logically offline

2019-03-11 Thread Kazuhito Hagio
-Original Message-
> On 27.11.18 17:32, Kazuhito Hagio wrote:
> >> Linux marks pages that are logically offline via a page flag (map count).
> >> Such pages e.g. include pages infated as part of a balloon driver or
> >> pages that were not actually onlined when onlining the whole section.
> >>
> >> While the hypervisor usually allows to read such inflated memory, we
> >> basically read and dump data that is completely irrelevant. Also, this
> >> might result in quite some overhead in the hypervisor. In addition,
> >> we saw some problems under Hyper-V, whereby we can crash the kernel by
> >> dumping, when reading memory of a partially onlined memory segment
> >> (for memory added by the Hyper-V balloon driver).
> >>
> >> Therefore, don't read and dump pages that are marked as being logically
> >> offline.
> >>
> >> Signed-off-by: David Hildenbrand 
> >
> > Thanks for the v2 update.
> > I'm going to merge this patch after the kernel patches are merged
> > and it tests fine with the kernel.
> >
> > Kazu
> 
> Hi Kazu,
> 
> the patches are now upstream. Thanks!

Tested OK at my end, too. Applied to the devel branch.

Offline pages   : 0x2400

Thank you!
Kazu

> 
> --
> 
> Thanks,
> 
> David / dhildenb

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile: fixing possible memory leaks seen in static analysis

2019-02-28 Thread Kazuhito Hagio
Hi Nisha,

-Original Message-
> From: Nisha Parrakat 
> 
> Description:
> Fixed memory leaks found based on cppcheck and codesonar run.
> 
> Running makedumpfile with valgrind didn't show any memory leaks but static
> analysis tools were used to check for possible memory leaks.
> 
> Signed-off-by: Nisha Parrakat 

Sorry for the delay.
The patch was low priority because it doesn't fix any actual bugs, and
the changes are executed in case of error and makedumpfile will exit
instantly after that, so it doesn't have much effect of saving memory.

And some parts of the patch look unnecessary..

> 
> diff -Naur a/dwarf_info.c b/dwarf_info.c
> --- a/dwarf_info.c2018-07-03 20:52:46.0 +0200
> +++ b/dwarf_info.c2019-01-29 21:17:45.949544759 +0100
> @@ -332,6 +332,8 @@
>   dwarf_info.dwfl = dwfl;
>   return TRUE;
>  err_out:
> + if( dwfl_fd != -1)
> + close(dwfl_fd);

This is unnecessary, because the dwfl_report_offline() function closes
it on success. (and also this will double-close it on failure.)

>   if (dwfl)
>   dwfl_end(dwfl);
> 
> diff -Naur a/erase_info.c b/erase_info.c
> --- a/erase_info.c2018-07-03 20:52:46.0 +0200
> +++ b/erase_info.c2019-01-29 21:17:45.949544759 +0100
> @@ -599,6 +599,8 @@
>   return ce;
> 
>  err_out:
> + if(ptr && ptr->name) /*free strdup malloced memory*/
> + free( ptr->name );

Did you check the free_config_entry() function below?

>   if (ce)
>   free_config_entry(ce);
>   if (ptr)
> @@ -1030,6 +1032,8 @@
>   }
>   return config;
>  err_out:
> + if (config && config->module_name) /*free strdup memory*/
> + free ( config->module_name );

Did you check the free_config() function below?

>   if (config)
>   free_config(config);
>   return NULL;
> 
> diff --git a/makedumpfile.c b/makedumpfile.c
> index 8923538..0f7b90a 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -2720,16 +2720,19 @@ copy_vmcoreinfo(off_t offset, unsigned long size)
>   if (lseek(info->fd_memory, offset, SEEK_SET) == failed) {
>   ERRMSG("Can't seek the dump memory(%s). %s\n",
>   info->name_memory, strerror(errno));
> +close(fd);
>   return FALSE;
>   }
>   if (read(info->fd_memory, , size) != size) {
>   ERRMSG("Can't read the dump memory(%s). %s\n",
>   info->name_memory, strerror(errno));
> +close(fd);
>   return FALSE;
>   }
>   if (write(fd, , size) != size) {
>   ERRMSG("Can't write the vmcoreinfo file(%s). %s\n",
>   info->name_vmcoreinfo, strerror(errno));
> +close(fd);
>   return FALSE;
>   }
>   if (close(fd) < 0) {
> @@ -3639,6 +3642,7 @@ initialize_bitmap_memory(void)
>   if (bmp->buf == NULL) {
>   ERRMSG("Can't allocate memory for the bitmap buffer. %s\n",
>   strerror(errno));
> + free(bmp);
>   return FALSE;
>   }
>   bmp->fd= info->fd_memory;
> @@ -8486,7 +8490,7 @@ write_eraseinfo(struct cache_data *cd_page, unsigned 
> long *size_out)
>   int i, j, obuf_size = 0, ei_size = 0;
>   int ret = FALSE;
>   unsigned long size_eraseinfo = 0;
> - char *obuf = NULL;
> + char *obuf = NULL; char *obuf_new = NULL;

We don't use this style normally. Instead,

  char *obuf = NULL, *obuf_new = NULL;

>   char size_str[MAX_SIZE_STR_LEN];
> 
>   for (i = 1; i < num_erase_info; i++) {
> @@ -8511,12 +8515,14 @@ write_eraseinfo(struct cache_data *cd_page, unsigned 
> long *size_out)
>*/
>   if (ei_size > obuf_size) {
>   obuf_size = ei_size;
> - obuf = realloc(obuf, obuf_size);
> - if (!obuf) {
> + obuf_new = realloc(obuf, obuf_size);
> + if (!obuf_new) {
>   ERRMSG("Can't allocate memory for"
>   " output buffer\n");
> +free(obuf);
>   return FALSE;

"goto out" is better than "free & return" in this function.

> - }
> + }else
> + obuf = obuf_new;
>   }
>   sprintf(obuf, "erase %s %s", erase_info[i].symbol_expr,
>   size_str);
> @@ -9074,12 +9080,14 @@ init_xen_crash_info(void)
>   if (lseek(info->fd_memory, offset_xen_crash_info, SEEK_SET) < 0) {
>   ERRMSG("Can't seek the dump memory(%s). %s\n",
>  info->name_memory, strerror(errno));
> + free(buf);
>   return FALSE;
>   }
>

RE: [PATCH v2 1/2] makedumpfile/arm64: Add support for ARMv8.2-LPA (52-bit PA support)

2019-02-22 Thread Kazuhito Hagio
Hi Bhupesh,

-Original Message-
> Hi Kazu,
> 
> Thanks for the review.
> 
> On 02/21/2019 09:05 PM, Kazuhito Hagio wrote:
> > Hi Bhupesh,
> >
> > -Original Message-
> >> ARMv8.2-LPA architecture extension (if available on underlying hardware)
> >> can support 52-bit physical addresses, while the kernel virtual
> >> addresses remain 48-bit.
> >>
> >> This patch is in accordance with ARMv8 Architecture Reference Manual
> >> version D.a
> >>
> >> Make sure that we read the 52-bit PA address capability from
> >> 'MAX_PHYSMEM_BITS' variable (if available in vmcoreinfo) and
> >> accordingly change the pte_to_phy() mask values and also traverse
> >> the page-table walk accordingly.
> >>
> >> Also make sure that it works well for the existing 48-bit PA address
> >> platforms and also on environments which use newer kernels with 52-bit
> >> PA support but hardware which is not ARM8.2-LPA compliant.
> >>
> >> I have sent a kernel patch upstream to add 'MAX_PHYSMEM_BITS' to
> >> vmcoreinfo for arm64 (see [0]).
> >>
> >> [0]. http://lists.infradead.org/pipermail/kexec/2019-February/022411.html
> >>
> >> Signed-off-by: Bhupesh Sharma 
> >
> > This patch looks good to me.
> > For two slight things below, I will remove them when merging.
> >
> >> +/*
> >> + * Size mapped by an entry at level n ( 0 <= n <= 3)
> >> + * We map (PAGE_SHIFT - 3) at all translation levels and PAGE_SHIFT bits
> >> + * in the final page. The maximum number of translation levels supported 
> >> by
> >> + * the architecture is 4. Hence, starting at at level n, we have further
> >> + * ((4 - n) - 1) levels of translation excluding the offset within the 
> >> page.
> >> + * So, the total number of bits mapped by an entry at level n is :
> >> + *
> >> + *  ((4 - n) - 1) * (PAGE_SHIFT - 3) + PAGE_SHIFT
> >> + *
> >> + * Rearranging it a bit we get :
> >> + *   (4 - n) * (PAGE_SHIFT - 3) + 3
> >> + */
> >
> > Will remove this comment.
> 
> Ok.
> 
> >> +#define pmd_offset_pgtbl_lvl_2(dir, vaddr) ((pmd_t *)dir)
> >> +#define pmd_offset_pgtbl_lvl_3(dir, vaddr) (pud_page_paddr((*(dir))) + 
> >> pmd_index(vaddr) *
> sizeof(pmd_t))
> >
> > Will remove these two macros not in use.
> 
> Ok.
> 
> >
> > And, as I said on another thread, I'm thinking to merge the following
> > patch after your patch 1/2, it tested OK with 48-bit and 52-bit PA
> > without NUMBER(MAX_PHYSMEM_BITS) in vmcoreinfo.
> > Do you think of any case that this will not work well?
> >
> > diff --git a/arch/arm64.c b/arch/arm64.c
> > index 29247a7..c7e60e0 100644
> > --- a/arch/arm64.c
> > +++ b/arch/arm64.c
> > @@ -127,6 +127,9 @@ typedef unsigned long pgdval_t;
> >*/
> >   #define SECTIONS_SIZE_BITS30
> >
> > +#define _MAX_PHYSMEM_BITS_48   48
> > +#define _MAX_PHYSMEM_BITS_52   52
> > +
> >   /*
> >* Hardware page table definitions.
> >*
> > @@ -402,17 +405,27 @@ get_stext_symbol(void)
> > return(found ? kallsym : FALSE);
> >   }
> >
> > +static int
> > +set_max_physmem_bits_arm64(void)
> > +{
> > +   long array_len = ARRAY_LENGTH(mem_section);
> > +
> > +   info->max_physmem_bits = _MAX_PHYSMEM_BITS_48;
> > +   if ((array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT_EXTREME()))
> > +   || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT(
> > +   return TRUE;
> > +
> > +   info->max_physmem_bits = _MAX_PHYSMEM_BITS_52;
> > +   if ((array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT_EXTREME()))
> > +   || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT(
> > +   return TRUE;
> > +
> > +   return FALSE;
> > +}
> > +
> >   int
> >   get_machdep_info_arm64(void)
> >   {
> > -   /* Determine if the PA address range is 52-bits: ARMv8.2-LPA */
> > -   if (NUMBER(MAX_PHYSMEM_BITS) != NOT_FOUND_NUMBER) {
> > -   info->max_physmem_bits = NUMBER(MAX_PHYSMEM_BITS);
> > -   if (info->max_physmem_bits == 52)
> > -   lpa_52_bit_support_available = 1;
> > -   } else
> > -   info->max_physmem_bits = 48;
> > -
> > /* Check if va_bits is still not initialized. If still 0, call
> >  * get_versiondep_info() to initialize the same.
> >  */
> > @@ 

RE: [PATCH] arm64, vmcoreinfo : Append 'MAX_USER_VA_BITS' and 'MAX_PHYSMEM_BITS' to vmcoreinfo

2019-02-21 Thread Kazuhito Hagio
-Original Message-
> - Original Message -
> > Hi Kazu,
> >
> > On 02/20/2019 02:17 AM, Kazuhito Hagio wrote:
> > > Hi Bhupesh,
> > >
> > > -Original Message-
> > >> I am not sure you got a chance to look at the two regression cases I
> > >> reported here:
> > >> <http://lists.infradead.org/pipermail/kexec/2019-February/022449.html>
> > >>
> > >> Unfortunately the above suggestion doesn't provide any fix for
> > >> ARMv8.2-LPA regression (see text under heading '
> > >> (1). Regression Case 1 (ARMv8.2-LPA enabled kernel)')
> > >
> > > As for MAX_PHYSMEM_BITS, I realized that ppc64 makedumpfile can detect
> > > it because there is only one SECTION_SIZE_BITS for ppc64. I think we
> > > can use the same way as set_ppc64_max_physmem_bits() does also for
> > > arm64 for now. I'm going to write it for kernels not having
> > > NUMBER(MAX_PHYSMEM_BITS) in vmcoreinfo.
> >
> > I see two drawbacks with the above approach:
> >
> > a). This means that other user-space tools like crash-utility would
> > still be broken and would probably need to find MAX_PHYSMEM_BITS for
> > arm64 via a similar (hack'ish ?) approach.
> >
> > b). I am looking at the makedumpfile code for 'MAX_PHYSMEM_BITS'
> > determination for two archs as an example:
> >
> > ppc
> > ---
> >
> > int
> > set_ppc64_max_physmem_bits(void)
> > {
> >  long array_len = ARRAY_LENGTH(mem_section);
> >  /*
> >   * The older ppc64 kernels uses _MAX_PHYSMEM_BITS as 42 and the
> >   * newer kernels 3.7 onwards uses 46 bits.
> >   */
> >
> >  info->max_physmem_bits  = _MAX_PHYSMEM_BITS_ORIG ;
> >  if ((array_len == (NR_MEM_SECTIONS() / 
> > _SECTIONS_PER_ROOT_EXTREME()))
> >  || (array_len == (NR_MEM_SECTIONS() / 
> > _SECTIONS_PER_ROOT(
> >  return TRUE;
> >
> >  info->max_physmem_bits  = _MAX_PHYSMEM_BITS_3_7;
> >  if ((array_len == (NR_MEM_SECTIONS() / 
> > _SECTIONS_PER_ROOT_EXTREME()))
> >  || (array_len == (NR_MEM_SECTIONS() / 
> > _SECTIONS_PER_ROOT(
> >  return TRUE;
> >
> >  info->max_physmem_bits  = _MAX_PHYSMEM_BITS_4_19;
> >  if ((array_len == (NR_MEM_SECTIONS() / 
> > _SECTIONS_PER_ROOT_EXTREME()))
> >  || (array_len == (NR_MEM_SECTIONS() / 
> > _SECTIONS_PER_ROOT(
> >  return TRUE;
> >
> >  info->max_physmem_bits  = _MAX_PHYSMEM_BITS_4_20;
> >  if ((array_len == (NR_MEM_SECTIONS() / 
> > _SECTIONS_PER_ROOT_EXTREME()))
> >  || (array_len == (NR_MEM_SECTIONS() /  
> > _SECTIONS_PER_ROOT(
> >  return TRUE;
> >
> >  return FALSE;
> > }
> >
> > x86_64:
> > --
> >
> > int
> > get_versiondep_info_x86_64(void)
> > {
> >  /*
> >   * On linux-2.6.26, MAX_PHYSMEM_BITS is changed to 44 from 40.
> >   */
> >  if (info->kernel_version < KERNEL_VERSION(2, 6, 26))
> >  info->max_physmem_bits  = _MAX_PHYSMEM_BITS_ORIG;
> >  else if (info->kernel_version < KERNEL_VERSION(2, 6, 31))
> >  info->max_physmem_bits  = _MAX_PHYSMEM_BITS_2_6_26;
> >  else if(check_5level_paging())
> >  info->max_physmem_bits  = _MAX_PHYSMEM_BITS_5LEVEL;
> >  else
> >  info->max_physmem_bits  = _MAX_PHYSMEM_BITS_2_6_31;
> >
> >  ...
> > }
> >
> > Looking at the above, two questions come to my mind:
> >
> > - Do we really need all the above complexity in user-space code, to hoop
> > across various kernel versions and perform allocations for something
> > that can be so easily exported via vmcoreinfo? Also we need to see how
> > portable is the above code for a new kernel version - IMO, it will need
> > another fix patch when we update to a new kernel version in near future.
> 
> I agree -- not to mention that the "kernel version" way of determining things
> does not account for distribution-specific backports.
> 
> >
> > - Also do we need to replicate the above implementations across
> > user-space tools when they can also utilize the vmcoreinfo information
> > to determine the PA_BITS range without any additional arch/kernel
> > version specific details as the single point 

RE: [PATCH v2 1/2] makedumpfile/arm64: Add support for ARMv8.2-LPA (52-bit PA support)

2019-02-21 Thread Kazuhito Hagio
Hi Bhupesh,

-Original Message-
> ARMv8.2-LPA architecture extension (if available on underlying hardware)
> can support 52-bit physical addresses, while the kernel virtual
> addresses remain 48-bit.
> 
> This patch is in accordance with ARMv8 Architecture Reference Manual
> version D.a
> 
> Make sure that we read the 52-bit PA address capability from
> 'MAX_PHYSMEM_BITS' variable (if available in vmcoreinfo) and
> accordingly change the pte_to_phy() mask values and also traverse
> the page-table walk accordingly.
> 
> Also make sure that it works well for the existing 48-bit PA address
> platforms and also on environments which use newer kernels with 52-bit
> PA support but hardware which is not ARM8.2-LPA compliant.
> 
> I have sent a kernel patch upstream to add 'MAX_PHYSMEM_BITS' to
> vmcoreinfo for arm64 (see [0]).
> 
> [0]. http://lists.infradead.org/pipermail/kexec/2019-February/022411.html
> 
> Signed-off-by: Bhupesh Sharma 

This patch looks good to me.
For two slight things below, I will remove them when merging.

> +/*
> + * Size mapped by an entry at level n ( 0 <= n <= 3)
> + * We map (PAGE_SHIFT - 3) at all translation levels and PAGE_SHIFT bits
> + * in the final page. The maximum number of translation levels supported by
> + * the architecture is 4. Hence, starting at at level n, we have further
> + * ((4 - n) - 1) levels of translation excluding the offset within the page.
> + * So, the total number of bits mapped by an entry at level n is :
> + *
> + *  ((4 - n) - 1) * (PAGE_SHIFT - 3) + PAGE_SHIFT
> + *
> + * Rearranging it a bit we get :
> + *   (4 - n) * (PAGE_SHIFT - 3) + 3
> + */

Will remove this comment.

> +#define pmd_offset_pgtbl_lvl_2(dir, vaddr) ((pmd_t *)dir)
> +#define pmd_offset_pgtbl_lvl_3(dir, vaddr) (pud_page_paddr((*(dir))) + 
> pmd_index(vaddr) * sizeof(pmd_t))

Will remove these two macros not in use.


And, as I said on another thread, I'm thinking to merge the following
patch after your patch 1/2, it tested OK with 48-bit and 52-bit PA
without NUMBER(MAX_PHYSMEM_BITS) in vmcoreinfo.
Do you think of any case that this will not work well?

diff --git a/arch/arm64.c b/arch/arm64.c
index 29247a7..c7e60e0 100644
--- a/arch/arm64.c
+++ b/arch/arm64.c
@@ -127,6 +127,9 @@ typedef unsigned long pgdval_t;
  */
 #define SECTIONS_SIZE_BITS 30
 
+#define _MAX_PHYSMEM_BITS_48   48
+#define _MAX_PHYSMEM_BITS_52   52
+
 /*
  * Hardware page table definitions.
  *
@@ -402,17 +405,27 @@ get_stext_symbol(void)
return(found ? kallsym : FALSE);
 }
 
+static int
+set_max_physmem_bits_arm64(void)
+{
+   long array_len = ARRAY_LENGTH(mem_section);
+
+   info->max_physmem_bits = _MAX_PHYSMEM_BITS_48;
+   if ((array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT_EXTREME()))
+   || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT(
+   return TRUE;
+
+   info->max_physmem_bits = _MAX_PHYSMEM_BITS_52;
+   if ((array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT_EXTREME()))
+   || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT(
+   return TRUE;
+
+   return FALSE;
+}
+
 int
 get_machdep_info_arm64(void)
 {
-   /* Determine if the PA address range is 52-bits: ARMv8.2-LPA */
-   if (NUMBER(MAX_PHYSMEM_BITS) != NOT_FOUND_NUMBER) {
-   info->max_physmem_bits = NUMBER(MAX_PHYSMEM_BITS);
-   if (info->max_physmem_bits == 52)
-   lpa_52_bit_support_available = 1;
-   } else
-   info->max_physmem_bits = 48;
-
/* Check if va_bits is still not initialized. If still 0, call
 * get_versiondep_info() to initialize the same.
 */
@@ -428,9 +441,24 @@ get_machdep_info_arm64(void)
info->section_size_bits = SECTIONS_SIZE_BITS;
 
DEBUG_MSG("kimage_voffset   : %lx\n", kimage_voffset);
-   DEBUG_MSG("max_physmem_bits : %ld\n", info->max_physmem_bits);
DEBUG_MSG("section_size_bits: %ld\n", info->section_size_bits);
 
+   /* Determine if the PA address range is 52-bits: ARMv8.2-LPA */
+   if (NUMBER(MAX_PHYSMEM_BITS) != NOT_FOUND_NUMBER) {
+   info->max_physmem_bits = NUMBER(MAX_PHYSMEM_BITS);
+   DEBUG_MSG("max_physmem_bits : %ld (vmcoreinfo)\n",
+   info->max_physmem_bits);
+   } else if (set_max_physmem_bits_arm64()) {
+   DEBUG_MSG("max_physmem_bits : %ld (detected)\n",
+   info->max_physmem_bits);
+   } else {
+   ERRMSG("Can't determine max_physmem_bits value\n");
+   return FALSE;
+   }
+
+   if (info->max_physmem_bits == 52)
+   lpa_52_bit_support_available = 1;
+
return TRUE;
 }


Thanks,
Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] arm64, vmcoreinfo : Append 'MAX_USER_VA_BITS' and 'MAX_PHYSMEM_BITS' to vmcoreinfo

2019-02-19 Thread Kazuhito Hagio
Hi Bhupesh,

-Original Message-
> I am not sure you got a chance to look at the two regression cases I
> reported here:
> 
> 
> Unfortunately the above suggestion doesn't provide any fix for
> ARMv8.2-LPA regression (see text under heading '
> (1). Regression Case 1 (ARMv8.2-LPA enabled kernel)')

As for MAX_PHYSMEM_BITS, I realized that ppc64 makedumpfile can detect
it because there is only one SECTION_SIZE_BITS for ppc64. I think we
can use the same way as set_ppc64_max_physmem_bits() does also for
arm64 for now. I'm going to write it for kernels not having
NUMBER(MAX_PHYSMEM_BITS) in vmcoreinfo.

Thanks,
Kazu

> 
> After going through the regression reports, I think exporting
> 'MAX_USER_VA_BITS' and 'MAX_PHYSMEM_BITS' to vmcoreinfo is sufficient
> for the above regressions (without over-complicating the stuff) as
> ARM64_TCR.T1SZ and friends seem to arch specific as compared to
> VA_BITS + 'MAX_USER_VA_BITS' .
> 
> Thanks,
> Bhupesh
___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile: fix a typo for checking the file pointer for null.

2019-02-14 Thread Kazuhito Hagio
-Original Message-
> From: Nisha Parrakat 
> 
> Description:
> Static code analysis of makedumpfile code shows a mistake in checking
> the validity of a file descripter just attempted to open.
> 
> arch/ppc64.c: fixed the typo that missed checking fpb that was last attempted 
> to open.
> 
> Found during cppcheck on the code.
> 
> Signed-off-by: Nisha Parrakat 
> 
> --- a/arch/ppc64.c2019-01-29 23:08:27.099027763 +0100
> +++ b/arch/ppc64.c2019-01-29 23:08:58.567379337 +0100
> @@ -623,7 +623,7 @@
>   return FALSE;
>   }
>   fpb = fopen(f_crashbase, "r");
> - if (!fp) {
> + if (!fpb) {
>   ERRMSG("Cannot open %s\n", f_crashbase);
>   fclose(fp);
>   return FALSE;
> 

Thanks, applied to the devel branch.

Kazu




___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH] makedumpfile: fixing possible memory leaks seen in static analysis

2019-02-14 Thread Kazuhito Hagio
From: Nisha Parrakat 

Description:
Fixed memory leaks found based on cppcheck and codesonar run.

Running makedumpfile with valgrind didn't show any memory leaks but static
analysis tools were used to check for possible memory leaks.

Signed-off-by: Nisha Parrakat 

diff -Naur a/dwarf_info.c b/dwarf_info.c
--- a/dwarf_info.c  2018-07-03 20:52:46.0 +0200
+++ b/dwarf_info.c  2019-01-29 21:17:45.949544759 +0100
@@ -332,6 +332,8 @@
dwarf_info.dwfl = dwfl;
return TRUE;
 err_out:
+   if( dwfl_fd != -1)
+   close(dwfl_fd);
if (dwfl)
dwfl_end(dwfl);
 
diff -Naur a/erase_info.c b/erase_info.c
--- a/erase_info.c  2018-07-03 20:52:46.0 +0200
+++ b/erase_info.c  2019-01-29 21:17:45.949544759 +0100
@@ -599,6 +599,8 @@
return ce;
 
 err_out:
+   if(ptr && ptr->name) /*free strdup malloced memory*/
+   free( ptr->name );
if (ce)
free_config_entry(ce);
if (ptr)
@@ -1030,6 +1032,8 @@
}
return config;
 err_out:
+   if (config && config->module_name) /*free strdup memory*/
+   free ( config->module_name );
if (config)
free_config(config);
return NULL;

diff --git a/makedumpfile.c b/makedumpfile.c
index 8923538..0f7b90a 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -2720,16 +2720,19 @@ copy_vmcoreinfo(off_t offset, unsigned long size)
if (lseek(info->fd_memory, offset, SEEK_SET) == failed) {
ERRMSG("Can't seek the dump memory(%s). %s\n",
info->name_memory, strerror(errno));
+close(fd);
return FALSE;
}
if (read(info->fd_memory, , size) != size) {
ERRMSG("Can't read the dump memory(%s). %s\n",
info->name_memory, strerror(errno));
+close(fd);
return FALSE;
}
if (write(fd, , size) != size) {
ERRMSG("Can't write the vmcoreinfo file(%s). %s\n",
info->name_vmcoreinfo, strerror(errno));
+close(fd);
return FALSE;
}
if (close(fd) < 0) {
@@ -3639,6 +3642,7 @@ initialize_bitmap_memory(void)
if (bmp->buf == NULL) {
ERRMSG("Can't allocate memory for the bitmap buffer. %s\n",
strerror(errno));
+   free(bmp);
return FALSE;
}
bmp->fd= info->fd_memory;
@@ -8486,7 +8490,7 @@ write_eraseinfo(struct cache_data *cd_page, unsigned long 
*size_out)
int i, j, obuf_size = 0, ei_size = 0;
int ret = FALSE;
unsigned long size_eraseinfo = 0;
-   char *obuf = NULL;
+   char *obuf = NULL; char *obuf_new = NULL;
char size_str[MAX_SIZE_STR_LEN];
 
for (i = 1; i < num_erase_info; i++) {
@@ -8511,12 +8515,14 @@ write_eraseinfo(struct cache_data *cd_page, unsigned 
long *size_out)
 */
if (ei_size > obuf_size) {
obuf_size = ei_size;
-   obuf = realloc(obuf, obuf_size);
-   if (!obuf) {
+   obuf_new = realloc(obuf, obuf_size);
+   if (!obuf_new) {
ERRMSG("Can't allocate memory for"
" output buffer\n");
+free(obuf);
return FALSE;
-   }
+   }else
+   obuf = obuf_new;
}
sprintf(obuf, "erase %s %s", erase_info[i].symbol_expr,
size_str);
@@ -9074,12 +9080,14 @@ init_xen_crash_info(void)
if (lseek(info->fd_memory, offset_xen_crash_info, SEEK_SET) < 0) {
ERRMSG("Can't seek the dump memory(%s). %s\n",
   info->name_memory, strerror(errno));
+   free(buf);
return FALSE;
}
if (read(info->fd_memory, buf, size_xen_crash_info)
!= size_xen_crash_info) {
ERRMSG("Can't read the dump memory(%s). %s\n",
   info->name_memory, strerror(errno));
+   free(buf);
return FALSE;
}
 
@@ -10533,7 +10541,7 @@ reassemble_kdump_pages(void)
struct page_desc pd, pd_zero;
struct cache_data cd_pd, cd_data;
struct timespec ts_start;
-   char *data = NULL;
+   char *data = NULL; char* data_new = NULL;
unsigned long data_buf_size = info->page_size;
 
if (!prepare_bitmap2_buffer())
@@ -10663,11 +10671,13 @@ reassemble_kdump_pages(void)
 
if (SPLITTING_SIZE_EI(i) > data_buf_size) {
data_buf_size 

[PATCH] makedumpfile: fix a typo for checking the file pointer for null.

2019-02-14 Thread Kazuhito Hagio
From: Nisha Parrakat  

Description: 
Static code analysis of makedumpfile code shows a mistake in checking
the validity of a file descriptor just attempted to open.

arch/ppc64.c: fixed the typo that missed checking fpb that was last attempted 
to open.

Found during cppcheck on the code.

Signed-off-by: Nisha Parrakat 

--- a/arch/ppc64.c  2019-01-29 23:08:27.099027763 +0100
+++ b/arch/ppc64.c  2019-01-29 23:08:58.567379337 +0100
@@ -623,7 +623,7 @@
return FALSE;
}
fpb = fopen(f_crashbase, "r");
-   if (!fp) {
+   if (!fpb) {
ERRMSG("Cannot open %s\n", f_crashbase);
fclose(fp);
return FALSE;


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile/arm64: Add support for ARMv8.2-LPA (52-bit PA support)

2019-02-14 Thread Kazuhito Hagio
Hi Bhupesh,

-Original Message-
> >> Yes, the ARMv8 simulator can support both ARMv8.2 LPA and LVA features
> >> and I tested the above suggestion earlier also on the same and landed
> >> into incorrect paddress calculation issues.
> >>
> >> Since the simulator is slow and its time taking to insert rebuilt
> >> user-space components in the Linaro initramfs being used on the same, I
> >> would suggest that we go ahead with the above code for now and later
> >> when I have more results from the simulator/real hardware, I will send
> >> out a follow-up patch (if needed) to fix the paddress calculation.
> >
> > Hmm, it theoretically needs __pte_to_phys(), right?
> > So if you have an issue with it, there may be a bug somewhere
> > and need to debug it. Do you have the detailed information?
> 
> Its not very easy to get the detailed UART console logs from the
> simulator, so it is hard to get all the debug logs from makedumpfile, so
> I am trying debugging the issue via 'gdb' by adding it to the initramfs.

I don't know what environment the simulater is, but you cannot
capture a vmcore with cp or something? then debug makedumpfile
with it on a real arm64 machine. I usually do like this.

  # cp --sparse=always /proc/vmcore vmcore

I'm willing to debug it if you would send me the vmcore and vmlinux.

> 
> However till then to fix the regression reported with upstream
> makedumpfile on arm64 platforms which don't support ARMv8.2-LPA
> extensions (e.g. Cortex-A57) and run a newer kernel with PA=52-bit
> configuration, we can apply this patch for now.
> 
> I have tested this on non-ARMv8.2-LPA platforms like apm-osprey and
> huwaei-taishan and the makedumpfile can work fine.
> 
> I will come back with a follow-up patch (if needed) after some checks on
> the ARMv8 Simulator for the __pte_to_phys() part.
> 
> Thanks,
> Bhupesh
> 
> 
> 

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] arm64, vmcoreinfo : Append 'MAX_USER_VA_BITS' and 'MAX_PHYSMEM_BITS' to vmcoreinfo

2019-02-13 Thread Kazuhito Hagio
On 2/13/2019 1:22 PM, James Morse wrote:
> Hi guys,
> 
> On 13/02/2019 11:15, Dave Young wrote:
> > On 02/12/19 at 11:03pm, Kazuhito Hagio wrote:
> >> On 2/12/2019 2:59 PM, Bhupesh Sharma wrote:
> >>> BTW, in the makedumpfile enablement patch thread for ARMv8.2 LVA
> >>> (which I sent out for 52-bit User space VA enablement) (see [0]), Kazu
> >>> mentioned that the changes look necessary.
> >>>
> >>> [0]. http://lists.infradead.org/pipermail/kexec/2019-February/022431.html
> >>
> >>>>> The increased 'PTRS_PER_PGD' value for such cases needs to be then
> >>>>> calculated as is done by the underlying kernel
> 
> Aha! Nothing to do with which-bits-are-pfn in the tables...
> 
> You need to know if the top level PGD is 512bytes or bigger. As we use a
> kmem-cache the adjacent data could be some else's page tables.
> 
> Is this really a problem though? You can't pull the user-space pgd pointers 
> out
> of no-where, you must have walked some task_struct and struct_mm's to find 
> them.
> In which case you would have the VMAs on hand to tell you if its in the mapped
> user range.
> 
> It would be good to avoid putting something arch-specific in here if we can at
> all help it.
> 
> 
> >>>>> (see
> >>>>> 'arch/arm64/include/asm/pgtable-hwdef.h' for details):
> >>>>>
> >>>>> #define PTRS_PER_PGD  (1 << (MAX_USER_VA_BITS - PGDIR_SHIFT))
> >>
> >> Yes, this is the reason why makedumpfile needs the MAX_USER_VA_BITS.
> >> It is used for pgd_index() also in makedumpfile to walk page tables.
> >>
> >> /* to find an entry in a page-table-directory */
> >> #define pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 
> >> 1))
> >
> > Since Dave mentioned crash tool does not need it, but crash should also
> > travel the pg tables.

The crash utility is always invoked with vmlinux, so it can read the
vabits_user variable directly from vmcore, but makedumpfile can not.

> > If this is really necessary it would be good to describe what will
> > happen without the patch, eg. some user visible error from an actual test 
> > etc.
> 
> Yes please, it would really help if there was a specific example we could 
> discuss.

With 52-bit user space and 48-bit kernel space configuration,
makedumpfile will not be able to convert a virtual kernel address
to a physical address, and fail to capture a dumpfile, because the
pgd_index() will return a wrong index.

But I don't have any suitable test system on hand, so have not tried
the kernel configuration actually. If found, I'll try.

Bhupesh, do you have any test result?

Thanks,
Kazu

> 
> 
> Thanks,
> 
> James

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] arm64, vmcoreinfo : Append 'MAX_USER_VA_BITS' and 'MAX_PHYSMEM_BITS' to vmcoreinfo

2019-02-12 Thread Kazuhito Hagio


On 2/12/2019 2:59 PM, Bhupesh Sharma wrote:
> BTW, in the makedumpfile enablement patch thread for ARMv8.2 LVA
> (which I sent out for 52-bit User space VA enablement) (see [0]), Kazu
> mentioned that the changes look necessary.
> 
> [0]. http://lists.infradead.org/pipermail/kexec/2019-February/022431.html

> > > The increased 'PTRS_PER_PGD' value for such cases needs to be then
> > > calculated as is done by the underlying kernel (see
> > > 'arch/arm64/include/asm/pgtable-hwdef.h' for details):
> > >
> > > #define PTRS_PER_PGD  (1 << (MAX_USER_VA_BITS - PGDIR_SHIFT))

Yes, this is the reason why makedumpfile needs the MAX_USER_VA_BITS.
It is used for pgd_index() also in makedumpfile to walk page tables.

/* to find an entry in a page-table-directory */
#define pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))

Thanks,
Kazu

> > >
> > > Also, note that 'arch/arm64/include/asm/memory.h' defines 
> > > 'MAX_USER_VA_BITS'
> > > as 'VA_BITS' in case 'CONFIG_ARM64_USER_VA_BITS_52' is set to 'n':
> > >
> > > #ifdef CONFIG_ARM64_USER_VA_BITS_52
> > > #define MAX_USER_VA_BITS  52
> > > #else
> > > #define MAX_USER_VA_BITS  VA_BITS
> > > #endif
> > >
> > > So, makedumpfile will need this symbol exported in vmcore to make the 
> > > above
> > > determination.
> > >
> > > [0]. http://lists.infradead.org/pipermail/kexec/2019-February/022425.html
> > >
> > > Thanks,
> > > Bhupesh
> >
> > Thanks
> > Dave

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile/arm64: Add support for ARMv8.2-LPA (52-bit PA support)

2019-02-12 Thread Kazuhito Hagio
On 2/12/2019 2:22 PM, Bhupesh Sharma wrote:
> > +#define ARM64_HW_PGTABLE_LEVELS(va_bits) (((va_bits) - 4) / 
> > (PAGE_SHIFT - 3))

> > I agree to sync these macros with the kernel, but currently this one
> > is not used in your patches, and you wrote the path of the source file
> > (pgtable-hwdef.h) above for reference, so we don't need to import this
> > one for now. I'd like to import it when it is needed.
> 
> Ok, now I understand. You mean 'ARM64_HW_PGTABLE_LEVELS' macro here only
> and not 'ARM64_HW_PGTABLE_LEVEL_SHIFT' macro also, right? If yes, I
> agree to removing the former. Will fix this in v2.

Yes, it's the ARM64_HW_PGTABLE_LEVELS macro only.

> > My understanding is that with 64k page, we can convert a page table
> > entry to a physical address without awareness of 52-bit.
> >
> > According to this patch, the top 4 bits of a 52-bit physical address
> > are positioned at bits 12..15 of a page table entry.
> >
> > commit 75387b92635e7dca410c1ef92cfe510019248f76
> > Author: Kristina Martsenko 
> > Date:   Wed Dec 13 17:07:21 2017 +
> >
> >  arm64: handle 52-bit physical addresses in page table entries
> >
> >  The top 4 bits of a 52-bit physical address are positioned at bits
> >  12..15 of a page table entry. Introduce macros to convert between a
> >  physical address and its placement in a table entry, and change all
> >  macros/functions that access PTEs to use them.
> >
> > With 64k page and non-52-bit kernel, it looks like the bits 12..15
> > are zero, so we can move the zeros to bits 49..51 because the zeros
> > don't affect the PA, for example:
> >
> >52-bit  non-52-bit (48-bit)
> >PTE 0x123456789000  0x12345678
> > v^  v^   __pte_to_phys() w/52-bit 
> > support
> >PA  0x000912345678  0x12345678
> >
> > I think that this was what the upstream maintainers said..
> > But "if (lpa_52_bit_support_available)" is also fine for me.
> 
> Well from my experience on arm32 and arm64 hardware enablement, assuming
> values of implementation defined fields for arm hardware can be risky :)
> 
> Lets see what the ARMv8 architecture reference manual says about the
> Bits [15:12] for a 64KB page size:
> 
> "Bits[15:12] of each valid translation table descriptor hold bits[51:48]
> of the output address, or of the address of the translation table to be
> used for the initial lookup at the next level of translation. If the
> implementation does not support 52-bit physical addresses, then it is
> IMPLEMENTATION DEFINED whether non-zero values for these bits generate
> an Address size fault."
> 
> So, it is unsafe to assume that for a 48-bit physical address
> implementation the Bits[15:12] cannot have non-zero values on certain
> hardware implementations. So assuming that these bits are always 0 and
> can be easily moved to Bits[51:48] for a 64K page and non-52bit kernel,
> can lead to IMPLEMENTATION DEFINED behavior.
> 
> Its better instead to have a predictable behavior in such cases and by
> explicitly calculating the paddress values using the right PTE High and
> Low masks in these cases we can minimize any hardware IMPLEMENTATION
> specific details (just like the handling done in kernel space).

I understood. This is the information I needed, thanks.

> > @@ -425,14 +628,13 @@ vaddr_to_paddr_arm64(unsigned long vaddr)
> >   ERRMSG("Can't get a valid pte.\n");
> >   return NOT_PADDR;
> >   } else {
> > -
> > -   paddr = (PAGEBASE(pte_val(ptev)) & PHYS_MASK)
> > +   paddr = (PAGEBASE(pte_val(ptev)) & 
> > PTE_ADDR_MASK)
> >   + (vaddr & (PAGESIZE() - 1));
> >>>
> >>> I think __pte_to_phys() is needed also here, not PTE_ADDR_MASK.
> >>
> >> I had some issues with the same on ARMv8 simulator, so lets stick with the 
> >> tested 'PTE_ADDR_MASK' usage
> for now.
> >
> > Did you test this PTE_ADDR_MASK line on a system actually using
> > 52-bit PA? If a 52-bit physical address is actually used, this will
> > return a wrong address, for example:
> >
> >PTE_ADDR_MASK  0xf000
> >PTE0x123456789000
> >PAGEBASE'd 0x12345678
> > v
> >paddr  0x12345678 + 64k offset // incorrect
> >
> > With 52-bit PA, the PTE_ADDR_MASK is used for __phys_to_pte_val(),
> > not __pte_to_phys().
> >
> > If I understand correctly, we should need __pte_to_phys() also here.
> >
> >paddr = __pte_to_phys(ptev)
> >+ (vaddr & (PAGESIZE() - 1));
> >
> > Could you try this?
> 
> Yes, the ARMv8 simulator can support both ARMv8.2 LPA and LVA features
> and I tested the above suggestion earlier on the same, and ran
> into incorrect physical address calculation issues.
> 
> Since the simulator is slow and its time taking to insert rebuilt
> user-space 

RE: [PATCH] makedumpfile/arm64: Add support for ARMv8.2-LPA (52-bit PA support)

2019-02-12 Thread Kazuhito Hagio
Hi Bhupesh,

On 2/12/2019 3:44 AM, Bhupesh Sharma wrote:
 +/* See 'arch/arm64/include/asm/pgtable-hwdef.h' for definitions below */
 +
 +/*
 + * Number of page-table levels required to address 'va_bits' wide
 + * address, without section mapping. We resolve the top (va_bits - 
 PAGE_SHIFT)
 + * bits with (PAGE_SHIFT - 3) bits at each page table level. Hence:
 + *
 + *  levels = DIV_ROUND_UP((va_bits - PAGE_SHIFT), (PAGE_SHIFT - 3))
 + *
 + * where DIV_ROUND_UP(n, d) => (((n) + (d) - 1) / (d))
 + *
 + * We cannot include linux/kernel.h which defines DIV_ROUND_UP here
 + * due to build issues. So we open code DIV_ROUND_UP here:
 + *
 + * va_bits) - PAGE_SHIFT) + (PAGE_SHIFT - 3) - 1) / (PAGE_SHIFT - 
 3))
 + *
 + * which gets simplified as :
 + */
 +#define ARM64_HW_PGTABLE_LEVELS(va_bits) (((va_bits) - 4) / (PAGE_SHIFT - 
 3))
>>
>> Is this needed?
> 
> Yes, it is needed for both the LPA and LVA patches (the LVA patch was sent 
> out separately), since we need to calculate values like PMD_SHIFT on basis of 
> the page-table levels.
> 
> Also this makes the makedumpfile code conform more closely to the recent kernel 
> definitions as otherwise one needs to read the makedumpfile code and dig 
> kernel change history to understand the calculation of these macros.
> 
> In future also, I plan to keep these values in sync with the kernel and send 
> patches accordingly.

I agree to sync these macros with the kernel, but currently this one
is not used in your patches, and you wrote the path of the source file
(pgtable-hwdef.h) above for reference, so we don't need to import this
one for now. I'd like to import it when it is needed.

 +/* Highest possible physical address supported */
 +static inline int
 +get_phy_mask_shift_arm64(void)
 +{
 +   int pa_bits = 48 /* default: 48-bit PA */;
 +
 +   if (NUMBER(MAX_PHYSMEM_BITS) != NOT_FOUND_NUMBER)
 +   pa_bits = NUMBER(MAX_PHYSMEM_BITS);
 +
 +   return pa_bits;
 +}
>>
>> Is this needed to be an inline function?
> 
> IMO, its better to keep this as inline.

Once you modify the part of setting info->max_physmem_bits below,
we can remove this function, because we don't use PHYS_MASK_SHIFT
anymore.

 -#define PMD_TYPE_MASK  3
 -#define PMD_TYPE_SECT  1
 -#define PMD_TYPE_TABLE 3
 +#define PHYS_MASK_SHIFTget_phy_mask_shift_arm64()

 -#define PUD_TYPE_MASK  3
 -#define PUD_TYPE_SECT  1
 -#define PUD_TYPE_TABLE 3
 +/* Helper API to convert between a physical address and its placement
 + * in a page table entry, taking care of 52-bit addresses.
 + */
 +static inline unsigned long
 +__pte_to_phys(pte_t pte)
 +{
 +   if (lpa_52_bit_support_available)
>>
>> OK.
>>
>> According to the related emails, it looks like "PAGESIZE() == SZ_64K"
>> is also usable here, but this is the same condition as kernel's one
>> and easy to understand.
> 
> No, like I clarified to the upstream maintainers, distributions like Fedora 
> already support a default page size of 64K, but the PA address space can be 
> 48 or 52, depending on the kernel version and kernel CONFIG flags used.

My understanding is that with 64k page, we can convert a page table
entry to a physical address without awareness of 52-bit.

According to this patch, the top 4 bits of a 52-bit physical address
are positioned at bits 12..15 of a page table entry.

commit 75387b92635e7dca410c1ef92cfe510019248f76
Author: Kristina Martsenko 
Date:   Wed Dec 13 17:07:21 2017 +

arm64: handle 52-bit physical addresses in page table entries

The top 4 bits of a 52-bit physical address are positioned at bits
12..15 of a page table entry. Introduce macros to convert between a
physical address and its placement in a table entry, and change all
macros/functions that access PTEs to use them.

With 64k page and non-52-bit kernel, it looks like the bits 12..15
are zero, so we can move the zeros to bits 49..51 because the zeros
don't affect the PA, for example:

  52-bit  non-52-bit (48-bit)
  PTE 0x123456789000  0x12345678
   v^  v^   __pte_to_phys() w/52-bit support
  PA  0x000912345678  0x12345678

I think that this was what the upstream maintainers said..
But "if (lpa_52_bit_support_available)" is also fine for me.

 @@ -287,6 +481,15 @@ get_stext_symbol(void)
   int
   get_machdep_info_arm64(void)
   {
 +   int pa_bits;
 +
 +   /* Determine if the PA address range is 52-bits: ARMv8.2-LPA */
 +   pa_bits = get_phy_mask_shift_arm64();
 +   DEBUG_MSG("pa_bits: %d\n", pa_bits);
 +
 +   if (pa_bits == 52)
 +   lpa_52_bit_support_available = 1;
 +
>>
>> This looks a bit redundant, so 

RE: [PATCH] makedumpfile/arm64: Add support for ARMv8.2-LVA (52-bit user-space VA support)

2019-02-11 Thread Kazuhito Hagio
Hi Bhupesh,

On 2/7/2019 2:52 PM, Bhupesh Sharma wrote:
> With ARMv8.2-LVA architecture extension availability, arm64 hardware
> which supports this extension can support up to 52-bit virtual
> addresses. It is specially useful for having a 52-bit user-space virtual
> address space while the kernel can still retain 48-bit virtual
> addresses.
> 
> Since at the moment we enable the support of this extensions in the
> kernel via a CONFIG flags (CONFIG_ARM64_USER_VA_BITS_52), so there are
> no clear mechanisms in user-space to determine this CONFIG
> flag value and use it to determine the PA address range values.
> 
> 'makedumpfile' can instead use 'MAX_USER_VA_BITS' value to
> determine the maximum virtual address supported by user-space.
> If 'MAX_USER_VA_BITS' value is greater than 'VA_BITS' than we might be
> running a use-case where user-space is 52-bit and underlying kernel is
> still 48-bit. The increased 'PTRS_PER_PGD' value for such cases can then
> be calculated as is done by the underlying kernel (see kernel file
> 'arch/arm64/include/asm/pgtable-hwdef.h' for details):

Thank you for the patch.
I can't test the config yet, but this looks necessary.

> 
> I have sent a kernel patch upstream to add 'MAX_USER_VA_BITS' to
> vmcoreinfo for arm64 (see [0]).
> 
> This patch is in accordance with ARMv8 Architecture Reference Manual
> version D.a
> 
> [0].
> http://lists.infradead.org/pipermail/kexec/2019-February/022411.html
> 
> Signed-off-by: Bhupesh Sharma 
> ---
>  arch/arm64.c   | 105 
> +
>  makedumpfile.c |   2 ++
>  makedumpfile.h |   1 +
>  3 files changed, 79 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/arm64.c b/arch/arm64.c
> index a1db7dc63107..b522e93f4e80 100644
> --- a/arch/arm64.c
> +++ b/arch/arm64.c
> @@ -47,6 +47,7 @@ typedef struct {
>  static int lpa_52_bit_support_available;
>  static int pgtable_level;
>  static int va_bits;
> +static int max_user_va_bits;
>  static unsigned long kimage_voffset;
> 
>  #define SZ_4K4096
> @@ -145,7 +146,7 @@ get_page_table_level_arm64(void)
>  #define PGDIR_SHIFT  ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - 
> (get_page_table_level_arm64()))
>  #define PGDIR_SIZE   (_AC(1, UL) << PGDIR_SHIFT)
>  #define PGDIR_MASK   (~(PGDIR_SIZE-1))
> -#define PTRS_PER_PGD (1 << ((va_bits) - PGDIR_SHIFT))
> +#define PTRS_PER_PGD (1 << ((max_user_va_bits) - PGDIR_SHIFT))
> 
>  /*
>   * Section address mask and size definitions.
> @@ -478,6 +479,46 @@ get_stext_symbol(void)
>   return(found ? kallsym : FALSE);
>  }
> 
> +static int
> +get_va_bits_from_stext_arm64(void)
> +{
> + ulong _stext;
> +
> + _stext = get_stext_symbol();
> + if (!_stext) {
> + ERRMSG("Can't get the symbol of _stext.\n");
> + return FALSE;
> + }
> +
> + /* Derive va_bits as per arch/arm64/Kconfig */
> + if ((_stext & PAGE_OFFSET_36) == PAGE_OFFSET_36) {
> + va_bits = 36;
> + } else if ((_stext & PAGE_OFFSET_39) == PAGE_OFFSET_39) {
> + va_bits = 39;
> + } else if ((_stext & PAGE_OFFSET_42) == PAGE_OFFSET_42) {
> + va_bits = 42;
> + } else if ((_stext & PAGE_OFFSET_47) == PAGE_OFFSET_47) {
> + va_bits = 47;
> + } else if ((_stext & PAGE_OFFSET_48) == PAGE_OFFSET_48) {
> + va_bits = 48;
> + } else {
> + ERRMSG("Cannot find a proper _stext for calculating VA_BITS\n");
> + return FALSE;
> + }
> +
> + DEBUG_MSG("va_bits  : %d\n", va_bits);

Please add "(_stext)" to the end of line for debugging.

> +
> + return TRUE;
> +}
> +
> +static void
> +get_page_offset_arm64(void)
> +{
> + info->page_offset = (0xffffffffffffffffUL) << (va_bits - 1);
> +
> + DEBUG_MSG("page_offset  : %lx\n", info->page_offset);
> +}
> +
>  int
>  get_machdep_info_arm64(void)
>  {
> @@ -493,8 +534,37 @@ get_machdep_info_arm64(void)
>   /* Check if va_bits is still not initialized. If still 0, call
>* get_versiondep_info() to initialize the same.
>*/
> + if (NUMBER(VA_BITS) != NOT_FOUND_NUMBER) {
> + va_bits = NUMBER(VA_BITS);
> + DEBUG_MSG("va_bits: %d (vmcoreinfo)\n",
> + va_bits);
> + }
> +
> + /* Check if va_bits is still not initialized. If still 0, call
> +  * get_versiondep_info() to initialize the same from _stext
> +  * symbol.
> +  */
>   if (!va_bits)
> - get_versiondep_info_arm64();
> + if (get_va_bits_from_stext_arm64() == ERROR)
> + return ERROR;

These ERRORs should be FALSE.

> +
> + get_page_offset_arm64();
> +
> + if (NUMBER(MAX_USER_VA_BITS) != NOT_FOUND_NUMBER) {
> + max_user_va_bits = NUMBER(MAX_USER_VA_BITS);
> + DEBUG_MSG("max_user_va_bits : %d (vmcoreinfo)\n",
> + max_user_va_bits);
> + }
> +
> +  

RE: [PATCH] makedumpfile/arm64: Add support for ARMv8.2-LPA (52-bit PA support)

2019-02-08 Thread Kazuhito Hagio
Hi Bhupesh,

On 2/6/2019 3:34 PM, Bhupesh Sharma wrote:
> Add kexec@lists.infradead.org.
> 
> On Thu, Feb 7, 2019 at 1:43 AM Bhupesh Sharma  wrote:
>>
>> With ARMv8.2-LPA architecture extension availability, arm64 hardware
>> which supports this extension can support up to 52-bit physical
>> addresses.
>>
>> Since at the moment we enable the support of this extensions in the
>> kernel via a CONFIG flag (CONFIG_ARM64_PA_BITS_52), so there are no
>> clear mechanisms in user-space to determine this CONFIG
>> flag value and use it to determine the PA address range values.
>>
>> 'makedumpfile' can instead use 'MAX_PHYSMEM_BITS' values to
>> determine the maximum physical address supported by underlying kernel.
>>
>> I have sent a kernel patch upstream to add 'MAX_PHYSMEM_BITS' to
>> vmcoreinfo for arm64.
>>
>> This patch is in accordance with ARMv8 Architecture Reference Manual
>> version D.a and also works well for the existing
>> lower PA address values (e.g. 48-bit PA addresses).
>>
>> [0].
>> http://lists.infradead.org/pipermail/kexec/2019-February/022411.html
>>
>> Signed-off-by: Bhupesh Sharma 
>> ---
>>  arch/arm64.c | 332 
>> +++
>>  1 file changed, 267 insertions(+), 65 deletions(-)
>>
>> diff --git a/arch/arm64.c b/arch/arm64.c
>> index 053519359cbc..a1db7dc63107 100644
>> --- a/arch/arm64.c
>> +++ b/arch/arm64.c
>> @@ -39,72 +39,276 @@ typedef struct {
>> unsigned long pte;
>>  } pte_t;
>>
>> +#define __pte(x)   ((pte_t) { (x) } )
>> +#define __pmd(x)   ((pmd_t) { (x) } )
>> +#define __pud(x)   ((pud_t) { (x) } )
>> +#define __pgd(x)   ((pgd_t) { (x) } )
>> +
>> +static int lpa_52_bit_support_available;
>>  static int pgtable_level;
>>  static int va_bits;
>>  static unsigned long kimage_voffset;
>>
>> -#define SZ_4K  (4 * 1024)
>> -#define SZ_16K (16 * 1024)
>> -#define SZ_64K (64 * 1024)
>> -#define SZ_128M(128 * 1024 * 1024)
>> +#define SZ_4K  4096
>> +#define SZ_16K 16384
>> +#define SZ_64K 65536
>>
>> -#define PAGE_OFFSET_36 ((0xffffffffffffffffUL) << 36)
>> -#define PAGE_OFFSET_39 ((0xffffffffffffffffUL) << 39)
>> -#define PAGE_OFFSET_42 ((0xffffffffffffffffUL) << 42)
>> -#define PAGE_OFFSET_47 ((0xffffffffffffffffUL) << 47)
>> -#define PAGE_OFFSET_48 ((0xffffffffffffffffUL) << 48)
>> +#define PAGE_OFFSET_36 ((0xffffffffffffffffUL) << 36)
>> +#define PAGE_OFFSET_39 ((0xffffffffffffffffUL) << 39)
>> +#define PAGE_OFFSET_42 ((0xffffffffffffffffUL) << 42)
>> +#define PAGE_OFFSET_47 ((0xffffffffffffffffUL) << 47)
>> +#define PAGE_OFFSET_48 ((0xffffffffffffffffUL) << 48)
>> +#define PAGE_OFFSET_52 ((0xffffffffffffffffUL) << 52)
>>
>>  #define pgd_val(x) ((x).pgd)
>>  #define pud_val(x) (pgd_val((x).pgd))
>>  #define pmd_val(x) (pud_val((x).pud))
>>  #define pte_val(x) ((x).pte)
>>
>> -#define PAGE_MASK  (~(PAGESIZE() - 1))
>> -#define PGDIR_SHIFT((PAGESHIFT() - 3) * pgtable_level + 3)
>> -#define PTRS_PER_PGD   (1 << (va_bits - PGDIR_SHIFT))
>> -#define PUD_SHIFT  get_pud_shift_arm64()
>> -#define PUD_SIZE   (1UL << PUD_SHIFT)
>> -#define PUD_MASK   (~(PUD_SIZE - 1))
>> -#define PTRS_PER_PTE   (1 << (PAGESHIFT() - 3))
>> -#define PTRS_PER_PUD   PTRS_PER_PTE
>> -#define PMD_SHIFT  ((PAGESHIFT() - 3) * 2 + 3)
>> -#define PMD_SIZE   (1UL << PMD_SHIFT)
>> -#define PMD_MASK   (~(PMD_SIZE - 1))
>> +/* See 'include/uapi/linux/const.h' for definitions below */
>> +#define __AC(X,Y)  (X##Y)
>> +#define _AC(X,Y)   __AC(X,Y)
>> +#define _AT(T,X)   ((T)(X))
>> +
>> +/* See 'include/asm/pgtable-types.h' for definitions below */
>> +typedef unsigned long pteval_t;
>> +typedef unsigned long pmdval_t;
>> +typedef unsigned long pudval_t;
>> +typedef unsigned long pgdval_t;
>> +
>> +#define PAGE_SIZE  get_pagesize_arm64()

Is this needed?

>> +#define PAGE_SHIFT PAGESHIFT()
>> +
>> +/* See 'arch/arm64/include/asm/pgtable-hwdef.h' for definitions below */
>> +
>> +/*
>> + * Number of page-table levels required to address 'va_bits' wide
>> + * address, without section mapping. We resolve the top (va_bits - 
>> PAGE_SHIFT)
>> + * bits with (PAGE_SHIFT - 3) bits at each page table level. Hence:
>> + *
>> + *  levels = DIV_ROUND_UP((va_bits - PAGE_SHIFT), (PAGE_SHIFT - 3))
>> + *
>> + * where DIV_ROUND_UP(n, d) => (((n) + (d) - 1) / (d))
>> + *
>> + * We cannot include linux/kernel.h which defines DIV_ROUND_UP here
>> + * due to build issues. So we open code DIV_ROUND_UP here:
>> + *
>> + * va_bits) - PAGE_SHIFT) + (PAGE_SHIFT - 3) - 1) / (PAGE_SHIFT - 
>> 3))
>> + *
>> + * which gets simplified as :
>> + */
>> +#define ARM64_HW_PGTABLE_LEVELS(va_bits) (((va_bits) - 4) / (PAGE_SHIFT - 

RE: [PATCH v3] Remove the memory encryption mask to obtain the true physical address

2019-02-04 Thread Kazuhito Hagio
> [PATCH v3] Remove the memory encryption mask to obtain the true physical 
> address

I forgot to comment on the subject and the commit log..
I'll change this to

  x86_64: Add support for AMD Secure Memory Encryption

On 1/29/2019 9:48 PM, Lianbo Jiang wrote:
> For AMD machine with SME feature, if SME is enabled in the first
> kernel, the crashed kernel's page table(pgd/pud/pmd/pte) contains
> the memory encryption mask, so makedumpfile needs to remove the
> memory encryption mask to obtain the true physical address.

I added a few official words from some documents:
---
On AMD machine with Secure Memory Encryption (SME) feature, if SME is
enabled, page tables contain a specific attribute bit (C-bit) in their
entries to indicate whether a page is encrypted or unencrypted.

So get NUMBER(sme_mask) from vmcoreinfo, which stores the value of
the C-bit position, and drop it to obtain the true physical address.
---

If these are OK, I'll modify them when merging, so you don't need
to repost.

And, I'm thinking to merge this after the kernel patch gets merged
into the mainline.

Thanks for your work.
Kazu

> 
> Signed-off-by: Lianbo Jiang 
> ---
> Changes since v1:
> 1. Merge them into a patch.
> 2. The sme_mask is not an enum number, remove it.
> 3. Sanity check whether the sme_mask is in vmcoreinfo.
> 4. Deal with the huge pages case.
> 5. Cover the 5-level path.
> 
> Changes since v2:
> 1. Change the sme_me_mask to entry_mask.
> 2. No need to remove the mask when makedumpfile prints out the
>value of the entry.
> 3. Remove the sme mask from the pte at the end of the __vtop4_x86_64().
> 4. Also need to remove the sme mask from page table entry in
>find_vmemmap_x86_64()
> 
>  arch/x86_64.c  | 30 +++---
>  makedumpfile.c |  4 
>  makedumpfile.h |  1 +
>  3 files changed, 24 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86_64.c b/arch/x86_64.c
> index 537fb78..9977466 100644
> --- a/arch/x86_64.c
> +++ b/arch/x86_64.c
> @@ -291,6 +291,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> pagetable)
>   unsigned long page_dir, pgd, pud_paddr, pud_pte, pmd_paddr, pmd_pte;
>   unsigned long pte_paddr, pte;
>   unsigned long p4d_paddr, p4d_pte;
> + unsigned long entry_mask = ENTRY_MASK;
> 
>   /*
>* Get PGD.
> @@ -302,6 +303,9 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> pagetable)
>   return NOT_PADDR;
>   }
> 
> + if (NUMBER(sme_mask) != NOT_FOUND_NUMBER)
> + entry_mask &= ~(NUMBER(sme_mask));
> +
>   if (check_5level_paging()) {
>   page_dir += pgd5_index(vaddr) * sizeof(unsigned long);
>   if (!readmem(PADDR, page_dir, , sizeof pgd)) {
> @@ -318,7 +322,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> pagetable)
>   /*
>* Get P4D.
>*/
> - p4d_paddr  = pgd & ENTRY_MASK;
> + p4d_paddr  = pgd & entry_mask;
>   p4d_paddr += p4d_index(vaddr) * sizeof(unsigned long);
>   if (!readmem(PADDR, p4d_paddr, _pte, sizeof p4d_pte)) {
>   ERRMSG("Can't get p4d_pte (p4d_paddr:%lx).\n", 
> p4d_paddr);
> @@ -331,7 +335,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> pagetable)
>   ERRMSG("Can't get a valid p4d_pte.\n");
>   return NOT_PADDR;
>   }
> - pud_paddr  = p4d_pte & ENTRY_MASK;
> + pud_paddr  = p4d_pte & entry_mask;
>   }else {
>   page_dir += pgd_index(vaddr) * sizeof(unsigned long);
>   if (!readmem(PADDR, page_dir, , sizeof pgd)) {
> @@ -345,7 +349,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> pagetable)
>   ERRMSG("Can't get a valid pgd.\n");
>   return NOT_PADDR;
>   }
> - pud_paddr  = pgd & ENTRY_MASK;
> + pud_paddr  = pgd & entry_mask;
>   }
> 
>   /*
> @@ -364,13 +368,13 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> pagetable)
>   return NOT_PADDR;
>   }
>   if (pud_pte & _PAGE_PSE)/* 1GB pages */
> - return (pud_pte & ENTRY_MASK & PUD_MASK) +
> + return (pud_pte & entry_mask & PUD_MASK) +
>   (vaddr & ~PUD_MASK);
> 
>   /*
>* Get PMD.
>*/
> - pmd_paddr  = pud_pte & ENTRY_MASK;
> + pmd_paddr  = pud_pte & entry_mask;
>   pmd_paddr += pmd_index(vaddr) * sizeof(unsigned long);
>   if (!readmem(PADDR, pmd_paddr, _pte, sizeof pmd_pte)) {
>   ERRMSG("Can't get pmd_pte (pmd_paddr:%lx).\n", pmd_paddr);
> @@ -384,13 +388,13 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> pagetable)
>   return NOT_PADDR;
>   }
>   if (pmd_pte & _PAGE_PSE)/* 2MB pages */
> - return (pmd_pte & ENTRY_MASK & PMD_MASK) +
> + return (pmd_pte & entry_mask & PMD_MASK) +
> 

RE: [PATCH] arm64, vmcoreinfo : Append 'MAX_USER_VA_BITS' and 'MAX_PHYSMEM_BITS' to vmcoreinfo

2019-02-04 Thread Kazuhito Hagio
On 1/30/2019 8:48 PM, Dave Young wrote:
> + more people
> On 01/30/19 at 05:53pm, Bhupesh Sharma wrote:
> > With ARMv8.2-LVA and LPA architecture extensions, arm64 hardware which
> > supports these extensions can support upto 52-bit virtual and 52-bit
> > physical addresses respectively.
> >
> > Since at the moment we enable the support of these extensions via CONFIG
> > flags, e.g.
> >  - LPA via CONFIG_ARM64_PA_BITS_52
> >
> > there are no clear mechanisms in user-space right now to
> > deteremine these CONFIG flag values and also determine the PARange and
> > VARange address values.
> >
> > User-space tools like 'makedumpfile' and 'crash-utility' can instead
> > use the 'MAX_USER_VA_BITS' and 'MAX_PHYSMEM_BITS' values to determine
> > the maximum virtual address and physical address (respectively)
> > supported by underlying kernel.
> >
> > A reference 'makedumpfile' implementation which uses this approach to
> > determining the maximum physical address is available in [0].
> >
> > [0].
> https://github.com/bhupesh-sharma/makedumpfile/blob/52-bit-pa-support-via-vmcore-v1/arch/arm64.c#L490
> 
> I'm not objecting the patch, just want to make sure to make clear about
> things and make sure these issues are aware by people, and leave arm
> people to review the arm bits.
> 
> 1. MAX_PHYSMEM_BITS
> As we previously found, back to 2014 makedumpfile took a patch to read the
> value from vmcore but the kernel patch was not accepted.
> So we should first make clear if this is really needed, why other arches
> do not need this in makedumpfile.
> 
> If we really need it then should it be arm64 only?
> 
> If it is arm64 only then the makedumpfile code should read this number
> only for arm64.

Sorry for the delay.

According to the kernel patch, some of arm32 platforms may need it
http://lists.infradead.org/pipermail/kexec/2014-May/011909.html
but except for them (and arm64), makedumpfile can manage with kernel
version and some switches to determine this value so far.

> 
> Also Lianbo added the vmcoreinfo documents, I believe it stays in -tip
> tree,  need to make sure to document this as well.
> 
> 2. MAX_USER_VA_BITS
> Does makedumpfile care about userspace VA bits?  I do not see other code
> doing this,  Kazu and Dave A should be able to comment.

The mapping makedumpfile uses on arm64 is swapper_pg_dir only, so
unless the config affects its structure or something, makedumpfile
will not need this value.

Thanks,
Kazu

> 
> I tend to doubt about this.
> 
> >
> > Cc: AKASHI Takahiro 
> > Cc: Mark Rutland 
> > Cc: Will Deacon 
> > Cc: James Morse 
> > Signed-off-by: Bhupesh Sharma 
> > ---
> >  arch/arm64/kernel/crash_core.c | 2 ++
> >  1 file changed, 2 insertions(+)
> >
> > diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c
> > index ca4c3e12d8c5..ad231be5c0d8 100644
> > --- a/arch/arm64/kernel/crash_core.c
> > +++ b/arch/arm64/kernel/crash_core.c
> > @@ -10,6 +10,8 @@
> >  void arch_crash_save_vmcoreinfo(void)
> >  {
> > VMCOREINFO_NUMBER(VA_BITS);
> > +   VMCOREINFO_NUMBER(MAX_USER_VA_BITS);
> > +   VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
> > /* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */
> > vmcoreinfo_append_str("NUMBER(kimage_voffset)=0x%llx\n",
> > kimage_voffset);
> > --
> > 2.7.4
> >
> >
> > ___
> > kexec mailing list
> > kexec@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/kexec
> 
> Thanks
> Dave



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH v2] makedumpfile: honor the CFLAGS from environment variables

2019-01-29 Thread Kazuhito Hagio
On 1/28/2019 10:14 PM, Kairui Song wrote:
> This makes it possible to pass in extra cflags, for example, hardening
> flags could be passed in with environment variable when building a
> hardened package.
> 
> Also introduce a CFLAGS_BASE to hold common CFLAGS, which simplify the
> CFLAGS definition.
> 
> Suggested-by: Kazuhito Hagio 
> Signed-off-by: Kairui Song 
> ---
> Update from V1:
>   - Use a CFLAGS_BASE to simplify CFLAGS definition
>   - Use immediate set rather than lazy set to avoid unexpected
> flag duplication
> 
>  Makefile | 9 -
>  1 file changed, 4 insertions(+), 5 deletions(-)
> 
> diff --git a/Makefile b/Makefile
> index 612b9d0..bca9984 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -8,11 +8,10 @@ ifeq ($(strip $CC),)
>  CC   = gcc
>  endif
> 
> -CFLAGS = -g -O2 -Wall -D_FILE_OFFSET_BITS=64 \
> -   -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE \
> -   -DVERSION='"$(VERSION)"' -DRELEASE_DATE='"$(DATE)"'
> -CFLAGS_ARCH  = -g -O2 -Wall -D_FILE_OFFSET_BITS=64 \
> - -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
> +CFLAGS_BASE := $(CFLAGS) -g -O2 -Wall -D_FILE_OFFSET_BITS=64 \
> +-D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
> +CFLAGS  := $(CFLAGS_BASE) -DVERSION='"$(VERSION)"' 
> -DRELEASE_DATE='"$(DATE)"'
> +CFLAGS_ARCH := $(CFLAGS_BASE)
>  # LDFLAGS = -L/usr/local/lib -I/usr/local/include
> 
>  HOST_ARCH := $(shell uname -m)
> --
> 2.20.1
> 

Thank you Kairui, applied.

Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile: honor the CFLAGS from environment variables

2019-01-28 Thread Kazuhito Hagio
On 1/28/2019 4:51 PM, Kazuhito Hagio wrote:
> On 1/28/2019 5:59 AM, Simon Horman wrote:
> > On Mon, Jan 28, 2019 at 06:50:45PM +0800, Kairui Song wrote:
> > > This makes it easier to pass extra cflags; for example, hardening
> > > flags could be passed in with an environment variable.
> > >
> > > Signed-off-by: Kairui Song 
> >
> > Thanks, I like this a lot.
> >
> > I would like to wake a little to see if there is review from
> > others before applying this.
> 
> I like this, too, and sorry to steal this... :-)
> 
> >
> > > ---
> > >  Makefile | 10 +-
> > >  1 file changed, 5 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/Makefile b/Makefile
> > > index 612b9d0..b511a78 100644
> > > --- a/Makefile
> > > +++ b/Makefile
> > > @@ -8,11 +8,11 @@ ifeq ($(strip $CC),)
> > >  CC   = gcc
> > >  endif
> > >
> > > -CFLAGS = -g -O2 -Wall -D_FILE_OFFSET_BITS=64 \
> > > -   -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE \
> > > -   -DVERSION='"$(VERSION)"' -DRELEASE_DATE='"$(DATE)"'
> > > -CFLAGS_ARCH  = -g -O2 -Wall -D_FILE_OFFSET_BITS=64 \
> > > - -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
> > > +CFLAGS_ARCH = $(CFLAGS) -g -O2 -Wall -D_FILE_OFFSET_BITS=64 \
> > > +   -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
> 
> This expands the whole CFLAGS in advance and duplicates some flags,
> so we can use ":=" instead of "=" for CFLAGS_ARCH ?
> If this is fine, I'll fix it when merging.

or it might be good to remove some redundant flags like this
at this opportunity.

CFLAGS_BASE := $(CFLAGS) -g -O2 -Wall -D_FILE_OFFSET_BITS=64 \
-D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
CFLAGS  := $(CFLAGS_BASE) -DVERSION='"$(VERSION)"' 
-DRELEASE_DATE='"$(DATE)"'
CFLAGS_ARCH := $(CFLAGS_BASE)

> 
> Thanks,
> Kazu
> 
> > > +CFLAGS += -g -O2 -Wall -D_FILE_OFFSET_BITS=64 \
> > > +  -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE \
> > > +  -DVERSION='"$(VERSION)"' -DRELEASE_DATE='"$(DATE)"'
> > >  # LDFLAGS = -L/usr/local/lib -I/usr/local/include
> > >
> > >  HOST_ARCH := $(shell uname -m)
> > > --
> > > 2.20.1
> > >
> > >
> > > ___
> > > kexec mailing list
> > > kexec@lists.infradead.org
> > > http://lists.infradead.org/mailman/listinfo/kexec
> > >
> 
> 
> 
> ___
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile: honor the CFLAGS from environment variables

2019-01-28 Thread Kazuhito Hagio
On 1/28/2019 5:59 AM, Simon Horman wrote:
> On Mon, Jan 28, 2019 at 06:50:45PM +0800, Kairui Song wrote:
> > This makes it easier to pass extra cflags; for example, hardening
> > flags could be passed in with an environment variable.
> >
> > Signed-off-by: Kairui Song 
> 
> Thanks, I like this a lot.
> 
> I would like to wake a little to see if there is review from
> others before applying this.

I like this, too, and sorry to steal this... :-)

> 
> > ---
> >  Makefile | 10 +-
> >  1 file changed, 5 insertions(+), 5 deletions(-)
> >
> > diff --git a/Makefile b/Makefile
> > index 612b9d0..b511a78 100644
> > --- a/Makefile
> > +++ b/Makefile
> > @@ -8,11 +8,11 @@ ifeq ($(strip $CC),)
> >  CC = gcc
> >  endif
> >
> > -CFLAGS = -g -O2 -Wall -D_FILE_OFFSET_BITS=64 \
> > - -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE \
> > - -DVERSION='"$(VERSION)"' -DRELEASE_DATE='"$(DATE)"'
> > -CFLAGS_ARCH= -g -O2 -Wall -D_FILE_OFFSET_BITS=64 \
> > -   -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
> > +CFLAGS_ARCH = $(CFLAGS) -g -O2 -Wall -D_FILE_OFFSET_BITS=64 \
> > + -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE

This expands the whole CFLAGS in advance and duplicates some flags,
so we can use ":=" instead of "=" for CFLAGS_ARCH ?
If this is fine, I'll fix it when merging.

Thanks,
Kazu

> > +CFLAGS += -g -O2 -Wall -D_FILE_OFFSET_BITS=64 \
> > +-D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE \
> > +-DVERSION='"$(VERSION)"' -DRELEASE_DATE='"$(DATE)"'
> >  # LDFLAGS = -L/usr/local/lib -I/usr/local/include
> >
> >  HOST_ARCH := $(shell uname -m)
> > --
> > 2.20.1
> >
> >
> > ___
> > kexec mailing list
> > kexec@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/kexec
> >



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH v2] Remove the memory encryption mask to obtain the true physical address

2019-01-28 Thread Kazuhito Hagio
On 1/28/2019 9:24 AM, Lendacky, Thomas wrote:
> On 1/27/19 11:46 PM, Lianbo Jiang wrote:
> > For AMD machine with SME feature, if SME is enabled in the first
> > kernel, the crashed kernel's page table(pgd/pud/pmd/pte) contains
> > the memory encryption mask, so makedumpfile needs to remove the
> > memory encryption mask to obtain the true physical address.
> >
> > Signed-off-by: Lianbo Jiang 
> > ---
> > Changes since v1:
> > 1. Merge them into a patch.
> > 2. The sme_mask is not an enum number, remove it.
> > 3. Sanity check whether the sme_mask is in vmcoreinfo.
> > 4. Deal with the huge pages case.
> > 5. Cover the 5-level path.
> >
> >  arch/x86_64.c  | 30 +-
> >  makedumpfile.c |  4 
> >  makedumpfile.h |  1 +
> >  3 files changed, 22 insertions(+), 13 deletions(-)
> >
> > diff --git a/arch/x86_64.c b/arch/x86_64.c
> > index 537fb78..7b3ed10 100644
> > --- a/arch/x86_64.c
> > +++ b/arch/x86_64.c
> > @@ -291,6 +291,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> > pagetable)
> > unsigned long page_dir, pgd, pud_paddr, pud_pte, pmd_paddr, pmd_pte;
> > unsigned long pte_paddr, pte;
> > unsigned long p4d_paddr, p4d_pte;
> > +   unsigned long sme_me_mask = ~0UL;
> >
> > /*
> >  * Get PGD.
> > @@ -302,6 +303,9 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> > pagetable)
> > return NOT_PADDR;
> > }
> >
> > +   if (NUMBER(sme_mask) != NOT_FOUND_NUMBER)
> > +   sme_me_mask = ~(NUMBER(sme_mask));
> 
> This is a bit confusing since this isn't the sme_me_mask any more, but the
> complement. Might want to somehow rename this so that it doesn't cause any
> confusion.
> 
> > +
> > if (check_5level_paging()) {
> > page_dir += pgd5_index(vaddr) * sizeof(unsigned long);
> > if (!readmem(PADDR, page_dir, , sizeof pgd)) {
> > @@ -309,7 +313,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> > pagetable)
> > return NOT_PADDR;
> > }
> > if (info->vaddr_for_vtop == vaddr)
> > -   MSG("  PGD : %16lx => %16lx\n", page_dir, pgd);
> > +   MSG("  PGD : %16lx => %16lx\n", page_dir, (pgd & 
> > sme_me_mask));
> 
> No need to remove the mask here.  You're just printing out the value of
> the entry. It might be nice to know whether the encryption bit is set or
> not - after all, ENTRY_MASK is still part of this value.

Agreed.

> 
> >
> > if (!(pgd & _PAGE_PRESENT)) {
> > ERRMSG("Can't get a valid pgd.\n");
> > @@ -318,20 +322,20 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> > pagetable)
> > /*
> >  * Get P4D.
> >  */
> > -   p4d_paddr  = pgd & ENTRY_MASK;
> > +   p4d_paddr  = pgd & ENTRY_MASK & sme_me_mask;
> 
> This goes back to my original comment that you should just make a local
> variable that is "ENTRY_MASK & ~(NUMBER(sme_mask))" since you are
> performing this ANDing everywhere ENTRY_MASK is used - except then you
> miss the one at the very end of this routine on the return statement.

This was my idea I said to Lianbo before seeing your comment, but
yes, including ENTRY_MASK in a local variable is better than that.
Thanks for your good suggestion.

As for the variable's name, I think that "entry_mask" is good enough,
but any better name?

  unsigned long entry_mask = ENTRY_MASK;

  if (NUMBER(sme_mask) != NOT_FOUND_NUMBER)
  entry_mask &= ~(NUMBER(sme_mask));
  ...
  p4d_paddr = pgd & entry_mask;

And, I found that the find_vmemmap_x86_64() function also uses the
page table for the -e option and looks to be affected by SME.
Lianbo, would you fix the function, too?

Thanks,
Kazu

> 
> > p4d_paddr += p4d_index(vaddr) * sizeof(unsigned long);
> > if (!readmem(PADDR, p4d_paddr, _pte, sizeof p4d_pte)) {
> > ERRMSG("Can't get p4d_pte (p4d_paddr:%lx).\n", 
> > p4d_paddr);
> > return NOT_PADDR;
> > }
> > if (info->vaddr_for_vtop == vaddr)
> > -   MSG("  P4D : %16lx => %16lx\n", p4d_paddr, p4d_pte);
> > +   MSG("  P4D : %16lx => %16lx\n", p4d_paddr, (p4d_pte & 
> > sme_me_mask));
> >
> > if (!(p4d_pte & _PAGE_PRESENT)) {
> > ERRMSG("Can't get a valid p4d_pte.\n");
> > return NOT_PADDR;
> > }
> > -   pud_paddr  = p4d_pte & ENTRY_MASK;
> > +   pud_paddr  = p4d_pte & ENTRY_MASK & sme_me_mask;
> > }else {
> > page_dir += pgd_index(vaddr) * sizeof(unsigned long);
> > if (!readmem(PADDR, page_dir, , sizeof pgd)) {
> > @@ -339,13 +343,13 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> > pagetable)
> > return NOT_PADDR;
> > }
> > if (info->vaddr_for_vtop == vaddr)
> > -   MSG("  PGD : %16lx => %16lx\n", page_dir, pgd);
> > +   

RE: [PATCH 2/2] Remove the memory encryption mask to obtain the true physical address

2019-01-24 Thread Kazuhito Hagio
On 1/23/2019 5:16 PM, Kazuhito Hagio wrote:
> On 1/22/2019 3:03 AM, Lianbo Jiang wrote:
> > For AMD machine with SME feature, if SME is enabled in the first
> > kernel, the crashed kernel's page table(pgd/pud/pmd/pte) contains
> > the memory encryption mask, so makedumpfile needs to remove the
> > memory encryption mask to obtain the true physical address.
> >
> > Signed-off-by: Lianbo Jiang 
> > ---
> >  arch/x86_64.c  | 3 +++
> >  makedumpfile.c | 1 +
> >  2 files changed, 4 insertions(+)
> >
> > diff --git a/arch/x86_64.c b/arch/x86_64.c
> > index 537fb78..7651d36 100644
> > --- a/arch/x86_64.c
> > +++ b/arch/x86_64.c
> > @@ -346,6 +346,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> > pagetable)
> > return NOT_PADDR;
> > }
> > pud_paddr  = pgd & ENTRY_MASK;
> > +   pud_paddr = pud_paddr & ~(NUMBER(sme_mask));
> > }
> >
> > /*
> > @@ -371,6 +372,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> > pagetable)
> >  * Get PMD.
> >  */
> > pmd_paddr  = pud_pte & ENTRY_MASK;
> > +   pmd_paddr = pmd_paddr & ~(NUMBER(sme_mask));
> > pmd_paddr += pmd_index(vaddr) * sizeof(unsigned long);
> > if (!readmem(PADDR, pmd_paddr, _pte, sizeof pmd_pte)) {
> > ERRMSG("Can't get pmd_pte (pmd_paddr:%lx).\n", pmd_paddr);
> > @@ -391,6 +393,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> > pagetable)
> >  * Get PTE.
> >  */
> > pte_paddr  = pmd_pte & ENTRY_MASK;
> > +   pte_paddr = pte_paddr & ~(NUMBER(sme_mask));
> > pte_paddr += pte_index(vaddr) * sizeof(unsigned long);
> > if (!readmem(PADDR, pte_paddr, , sizeof pte)) {
> > ERRMSG("Can't get pte (pte_paddr:%lx).\n", pte_paddr);
> > diff --git a/makedumpfile.c b/makedumpfile.c
> > index a03aaa1..81c7bb4 100644
> > --- a/makedumpfile.c
> > +++ b/makedumpfile.c
> > @@ -977,6 +977,7 @@ next_page:
> > read_size = MIN(info->page_size - PAGEOFFSET(paddr), size);
> >
> > pgaddr = PAGEBASE(paddr);
> > +   pgaddr = pgaddr & ~(NUMBER(sme_mask));
> 
> Since NUMBER(sme_mask) is initialized with -1 (NOT_FOUND_NUMBER),
> if the sme_mask is not in vmcoreinfo, ~(NUMBER(sme_mask)) will be 0.
> So the four lines added above need
> 
>   if (NUMBER(sme_mask) != NOT_FOUND_NUMBER)
> ...

Considering hugepage and the code, it might be better to add
a local variable for the mask value to __vtop4_x86_64() function
and mask it without condition, for example

  unsigned long sme_mask = ~0UL;

  if (NUMBER(sme_mask) != NOT_FOUND_NUMBER)
  sme_mask = ~(NUMBER(sme_mask));
  ...
  pud_paddr = pgd & ENTRY_MASK & sme_mask;

to avoid adding lots of 'if' statements.

Thanks,
Kazu

> 
> and, what I'm wondering is whether it doesn't need to take hugepages
> into account such as this
> 
> 392 if (pmd_pte & _PAGE_PSE)/* 2MB pages */
> 393 return (pmd_pte & ENTRY_MASK & PMD_MASK) +
> 394 (vaddr & ~PMD_MASK);
> "arch/x86_64.c"
> 
> Thanks,
> Kazu
> 
> 
> > pgbuf = cache_search(pgaddr, read_size);
> > if (!pgbuf) {
> > ++cache_miss;
> > --
> > 2.17.1
> >
> 
> 
> 
> ___
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH 2/2] Remove the memory encryption mask to obtain the true physical address

2019-01-23 Thread Kazuhito Hagio
On 1/22/2019 3:03 AM, Lianbo Jiang wrote:
> For AMD machine with SME feature, if SME is enabled in the first
> kernel, the crashed kernel's page table(pgd/pud/pmd/pte) contains
> the memory encryption mask, so makedumpfile needs to remove the
> memory encryption mask to obtain the true physical address.
> 
> Signed-off-by: Lianbo Jiang 
> ---
>  arch/x86_64.c  | 3 +++
>  makedumpfile.c | 1 +
>  2 files changed, 4 insertions(+)
> 
> diff --git a/arch/x86_64.c b/arch/x86_64.c
> index 537fb78..7651d36 100644
> --- a/arch/x86_64.c
> +++ b/arch/x86_64.c
> @@ -346,6 +346,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> pagetable)
>   return NOT_PADDR;
>   }
>   pud_paddr  = pgd & ENTRY_MASK;
> + pud_paddr = pud_paddr & ~(NUMBER(sme_mask));
>   }
> 
>   /*
> @@ -371,6 +372,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> pagetable)
>* Get PMD.
>*/
>   pmd_paddr  = pud_pte & ENTRY_MASK;
> + pmd_paddr = pmd_paddr & ~(NUMBER(sme_mask));
>   pmd_paddr += pmd_index(vaddr) * sizeof(unsigned long);
>   if (!readmem(PADDR, pmd_paddr, _pte, sizeof pmd_pte)) {
>   ERRMSG("Can't get pmd_pte (pmd_paddr:%lx).\n", pmd_paddr);
> @@ -391,6 +393,7 @@ __vtop4_x86_64(unsigned long vaddr, unsigned long 
> pagetable)
>* Get PTE.
>*/
>   pte_paddr  = pmd_pte & ENTRY_MASK;
> + pte_paddr = pte_paddr & ~(NUMBER(sme_mask));
>   pte_paddr += pte_index(vaddr) * sizeof(unsigned long);
>   if (!readmem(PADDR, pte_paddr, , sizeof pte)) {
>   ERRMSG("Can't get pte (pte_paddr:%lx).\n", pte_paddr);
> diff --git a/makedumpfile.c b/makedumpfile.c
> index a03aaa1..81c7bb4 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -977,6 +977,7 @@ next_page:
>   read_size = MIN(info->page_size - PAGEOFFSET(paddr), size);
> 
>   pgaddr = PAGEBASE(paddr);
> + pgaddr = pgaddr & ~(NUMBER(sme_mask));

Since NUMBER(sme_mask) is initialized with -1 (NOT_FOUND_NUMBER),
if the sme_mask is not in vmcoreinfo, ~(NUMBER(sme_mask)) will be 0.
So the four lines added above need

  if (NUMBER(sme_mask) != NOT_FOUND_NUMBER)
...

and, what I'm wondering is whether it doesn't need to take hugepages
into account such as this

392 if (pmd_pte & _PAGE_PSE)/* 2MB pages */
393 return (pmd_pte & ENTRY_MASK & PMD_MASK) +
394 (vaddr & ~PMD_MASK);
"arch/x86_64.c"

Thanks,
Kazu


>   pgbuf = cache_search(pgaddr, read_size);
>   if (!pgbuf) {
>   ++cache_miss;
> --
> 2.17.1
> 



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH 1/2] Makedumpfile: add a new variable 'sme_mask' to number_table

2019-01-23 Thread Kazuhito Hagio
Hi Lianbo,

On 1/22/2019 3:03 AM, Lianbo Jiang wrote:
> It will be used to store the sme mask for crashed kernel, the
> sme_mask denotes whether the old memory is encrypted or not.
> 
> Signed-off-by: Lianbo Jiang 
> ---
>  makedumpfile.c | 3 +++
>  makedumpfile.h | 1 +
>  2 files changed, 4 insertions(+)
> 
> diff --git a/makedumpfile.c b/makedumpfile.c
> index 8923538..a03aaa1 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -1743,6 +1743,7 @@ get_structure_info(void)
>   ENUM_NUMBER_INIT(NR_FREE_PAGES, "NR_FREE_PAGES");
>   ENUM_NUMBER_INIT(N_ONLINE, "N_ONLINE");
>   ENUM_NUMBER_INIT(pgtable_l5_enabled, "pgtable_l5_enabled");
> + ENUM_NUMBER_INIT(sme_mask, "sme_mask");

This is useless because the sme_mask is not an enum number.
Please remove it.

And, dividing this patchset into the two patches doesn't make sense
to me in this case. Could you merge them into a patch?

Thanks,
Kazu

> 
>   ENUM_NUMBER_INIT(PG_lru, "PG_lru");
>   ENUM_NUMBER_INIT(PG_private, "PG_private");
> @@ -2276,6 +2277,7 @@ write_vmcoreinfo_data(void)
>   WRITE_NUMBER("NR_FREE_PAGES", NR_FREE_PAGES);
>   WRITE_NUMBER("N_ONLINE", N_ONLINE);
>   WRITE_NUMBER("pgtable_l5_enabled", pgtable_l5_enabled);
> + WRITE_NUMBER("sme_mask", sme_mask);
> 
>   WRITE_NUMBER("PG_lru", PG_lru);
>   WRITE_NUMBER("PG_private", PG_private);
> @@ -2672,6 +2674,7 @@ read_vmcoreinfo(void)
>   READ_NUMBER("NR_FREE_PAGES", NR_FREE_PAGES);
>   READ_NUMBER("N_ONLINE", N_ONLINE);
>   READ_NUMBER("pgtable_l5_enabled", pgtable_l5_enabled);
> + READ_NUMBER("sme_mask", sme_mask);
> 
>   READ_NUMBER("PG_lru", PG_lru);
>   READ_NUMBER("PG_private", PG_private);
> diff --git a/makedumpfile.h b/makedumpfile.h
> index 73813ed..e97b2e7 100644
> --- a/makedumpfile.h
> +++ b/makedumpfile.h
> @@ -1912,6 +1912,7 @@ struct number_table {
>   longNR_FREE_PAGES;
>   longN_ONLINE;
>   longpgtable_l5_enabled;
> + longsme_mask;
> 
>   /*
>   * Page flags
> --
> 2.17.1
> 



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH 1/2 v6] kdump: add the vmcoreinfo documentation

2019-01-14 Thread Kazuhito Hagio
On 1/11/2019 7:33 AM, Borislav Petkov wrote:
> On Thu, Jan 10, 2019 at 08:19:43PM +0800, Lianbo Jiang wrote:
>> +init_uts_ns.name.release
>> +
>> +
>> +The version of the Linux kernel. Used to find the corresponding source
>> +code from which the kernel has been built.
>> +
> 
> ...
> 
>> +
>> +init_uts_ns
>> +---
>> +
>> +This is the UTS namespace, which is used to isolate two specific
>> +elements of the system that relate to the uname(2) system call. The UTS
>> +namespace is named after the data structure used to store information
>> +returned by the uname(2) system call.
>> +
>> +User-space tools can get the kernel name, host name, kernel release
>> +number, kernel version, architecture name and OS type from it.
> 
> Already asked this but no reply so lemme paste my question again:
> 
> "And this document already fulfills its purpose - those two vmcoreinfo
> exports are redundant and the first one can be removed.
> 
> And now that we agreed that VMCOREINFO is not an ABI and is very tightly
> coupled to the kernel version, init_uts_ns.name.release can be removed,
> yes?
> 
> Or is there anything speaking against that?"

As for makedumpfile, it would not be impossible to remove the
init_uts_ns.name.release (OSRELEASE), but some fixes are needed.
Because historically OSRELEASE has been a kind of a mandatory entry
in vmcoreinfo from the beginning of vmcoreinfo, so makedumpfile uses
its existence to check whether a vmcoreinfo is sane.

Also, I think crash also will need to be fixed if it is removed.
So I hope it will be left as it is.

Thanks,
Kazu

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH 1/2 v4] kdump: add the vmcoreinfo documentation

2019-01-04 Thread Kazuhito Hagio
-Original Message-
> >> +===
> >> +What is the VMCOREINFO?
> >> +===
> >> +
> >> +VMCOREINFO is a special ELF note section. It contains various
> >> +information from the kernel like structure size, page size, symbol
> >> +values, field offsets, etc. These data are packed into an ELF note
> >> +section and used by user-space tools like crash and makedumpfile to
> >> +analyze a kernel's memory layout.
> >> +
> >> +To dump the VMCOREINFO contents, one can do:
> >> +
> >> +# makedumpfile -g VMCOREINFO -x vmlinux
> >
> > again, this command does not dump the VMCOREINFO in ELF note section.
> It converts the vmlinux's debug information into VMCOREINFO-like data.
> > So I don't think this command is suitable to introduce here.
> >
> 
> Thank you, Kazu.
> 
> As you mentioned, makedumpfile in 'devel' branch can print VMCOREINFO in 
> /proc/kcore,
> can i add the following command to this document?
> 
> #makedumpfile --mem-usage /proc/kcore -D

I don't know whether we can add unreleased software to the document.
Any comments on this?

If it's OK, crash also can dump it, so
---
To dump the VMCOREINFO contents, one can do:

  # crash vmlinux /proc/kcore
  ...
  > help -n
or
  # makedumpfile -D --mem-usage /proc/kcore
---

or remove that for now and update later?

> 
> >> +PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|
> >> +PG_hwpoision|PG_head_mask
> >> +-
> >> +
> >> +Page attributes. These flags are used to filter free pages.
> >
> > Some of these are not used to filter *free* pages, so
> >
> > ... used to filter various unnecessary pages.
> >
> 
> Great. I will modify it in next post.
> 
> And also merge the 'PG_buddy' and 'PG_offline'  into the PG_* flag here.

Thanks, I think your "PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline)" style is good.

Kazu

> 
> Many thanks.
> 
> Lianbo
> 
> >> +PAGE_BUDDY_MAPCOUNT_VALUE or ~PG_buddy
> >> +--
> >
> > then, this can be merged into the one above?
> >
> >> +==
> >> +x86_64
> >> +==
> > ...
> >> +PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline)
> >> +
> >
> > This looks not only for x86_64, and also can be merged into
> > the PG_* flags?
> >
> > Thank you for your effort!
> > Kazu
> >
> >



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH 1/2 v4] kdump: add the vmcoreinfo documentation

2019-01-03 Thread Kazuhito Hagio
Hi Lianbo,

-Original Message-
> +===
> +What is the VMCOREINFO?
> +===
> +
> +VMCOREINFO is a special ELF note section. It contains various
> +information from the kernel like structure size, page size, symbol
> +values, field offsets, etc. These data are packed into an ELF note
> +section and used by user-space tools like crash and makedumpfile to
> +analyze a kernel's memory layout.
> +
> +To dump the VMCOREINFO contents, one can do:
> +
> +# makedumpfile -g VMCOREINFO -x vmlinux

again, this command does not dump the VMCOREINFO in ELF note section.
It converts the vmlinux's debug information into VMCOREINFO-like data.
So I don't think this command is suitable to introduce here.

> +PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|
> +PG_hwpoision|PG_head_mask
> +-
> +
> +Page attributes. These flags are used to filter free pages.

Some of these are not used to filter *free* pages, so

... used to filter various unnecessary pages.

> +PAGE_BUDDY_MAPCOUNT_VALUE or ~PG_buddy
> +--

then, this can be merged into the one above?

> +==
> +x86_64
> +==
...
> +PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline)
> +

This looks not only for x86_64, and also can be merged into
the PG_* flags?

Thank you for your effort!
Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH 1/2 v2] kdump: add the vmcoreinfo documentation

2018-12-10 Thread Kazuhito Hagio
-Original Message-
> > makedumpfile doesn't have any switch which dumps VMCOREINFO from kcore
> > for now. (I'm thinking to have makedumpfile dump it as debug message,
> > though.)
> 
> Might be useful as people are looking into using VMCOREINFO when
> debugging a live kernel...

Now makedumpfile in 'devel' branch can print VMCOREINFO in /proc/kcore
as debug message with the following command:

# ./makedumpfile --mem-usage /proc/kcore -D
...
VMCOREINFO   :
  OSRELEASE=4.20.0-0.rc2.git0.1.fc30.x86_64
  PAGESIZE=4096
page_size: 4096
  SYMBOL(init_uts_ns)=84213540
  ...

Also, it seems that recent crash utility in git repo with kernel 4.19
or newer can dump it with 'help -n' command:

# ./crash vmlinux /proc/kcore
...
crash> help -n
...
  Elf64_Nhdr:
  n_namesz: 11 ("VMCOREINFO")
  n_descsz: 1955
n_type: 0 (unknown)

  OSRELEASE=4.20.0-0.rc2.git0.1.fc30.x86_64
  PAGESIZE=4096
  SYMBOL(init_uts_ns)=84213540
  ...

Thanks,
Kazu
___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile/ppc64: increase MAX_PHYSMEM_BITS to 2PB

2018-12-06 Thread Kazuhito Hagio
> -Original Message-
> * Required for kernel 4.20
> 
> With kernel commit 4ffe713b7587 ("powerpc/mm: Increase the max addressable
> memory to 2PB"), MAX_PHYSMEM_BITS is bumped up to 51 for SPARSEMEM_VMEMMAP
> and SPARSEMEM_EXTREME case. Make the appropriate update here.
> 
> Signed-off-by: Hari Bathini 

Applied to the devel branch.

Thanks,
Kazu

> ---
>  arch/ppc64.c   |5 +
>  makedumpfile.h |1 +
>  2 files changed, 6 insertions(+)
> 
> diff --git a/arch/ppc64.c b/arch/ppc64.c
> index 947a125..5b8231e 100644
> --- a/arch/ppc64.c
> +++ b/arch/ppc64.c
> @@ -486,6 +486,11 @@ set_ppc64_max_physmem_bits(void)
>   || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT(
>   return TRUE;
> 
> + info->max_physmem_bits  = _MAX_PHYSMEM_BITS_4_20;
> + if ((array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT_EXTREME()))
> + || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT(
> + return TRUE;
> +
>   return FALSE;
>  }
> 
> diff --git a/makedumpfile.h b/makedumpfile.h
> index 73813ed..d49f1f1 100644
> --- a/makedumpfile.h
> +++ b/makedumpfile.h
> @@ -672,6 +672,7 @@ unsigned long get_kvbase_arm64(void);
>  #define _MAX_PHYSMEM_BITS_ORIG  (44)
>  #define _MAX_PHYSMEM_BITS_3_7   (46)
>  #define _MAX_PHYSMEM_BITS_4_19  (47)
> +#define _MAX_PHYSMEM_BITS_4_20  (51)
>  #define REGION_SHIFT(60UL)
>  #define VMEMMAP_REGION_ID   (0xfUL)
> 
> 

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH 1/2 v2] kdump: add the vmcoreinfo documentation

2018-12-05 Thread Kazuhito Hagio
> >> Generating VMCOREINFO is easy in the first kernel, for example:
> >> # makedumpfile -g VMCOREINFO -x vmlinux
> >
> > I get:
> >
> > $ makedumpfile -g VMCOREINFO -x vmlinux
> > The kernel version is not supported.
> > The makedumpfile operation may be incomplete.
> >
> > The vmcoreinfo is saved to VMCOREINFO.
> >
> > makedumpfile Completed.
> >
> > But the text file looks ok AFAICT. Please add that command to the
> > documentation file.
> >

Please note that this VMCOREINFO is generated from the information in
the vmlinux only, not from the running kernel and /proc/kcore. So if
we add a command to dump it from running kernel, it's not suitable.

(This switch is intended to make a data that is used instead of vmlinux
 when vmcore didn't have VMCOREINFO and makedumpfile is in a situation
 that we don't have vmlinux. Please see the explanation of the -i and
 -g options in makedumpfile(8).)

makedumpfile doesn't have any switch which dumps VMCOREINFO from kcore
for now. (I'm thinking to have makedumpfile dump it as debug message,
though.)

Thanks,
Kazu

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[ANNOUNCE] makedumpfile 1.6.5 is released

2018-12-05 Thread Kazuhito Hagio
Hi,

I'm pleased to announce the release of makedumpfile-1.6.5.
Your comments/patches are welcome.

Main new features:
o Improve support for arm64 system with KASLR
  - The -x option and --mem-usage option are supported.

o Support new kernels
  - The supported kernel is updated to 4.19.4 in this version.

Changelog:
o New feature
  - [PATCH 1/2] Use monotonic clock to calculate ETA and stats (Petr Tesarik) 
0f4e25c
  - [PATCH 2/2] Check if clock_gettime() requires -lrt (Petr Tesarik) aec51ef
  - [PATCH] arm64: Get 'info->page_offset' from PT_LOAD segments to support 
KASLR boot cases
  (Bhupesh Sharma) 94c97db
  - [PATCH] arm64: Add runtime kaslr offset if it exists (Bhupesh Sharma) 
616c98d
  - [PATCH] ppc64: increase MAX_PHYSMEM_BITS to 128TB (Hari Bathini) 1ce0987
  - [PATCH] x86_64: fix failure of getting kcore vmcoreinfo on kernel 4.19
  (Kazuhito Hagio) 1ea989b

o Bugfix
  - [PATCH v3] when refiltering, initialize refiltered bitmap2 from the kdump 
file's bitmap2
  (Pingfan Liu) 3adf612
  - [PATCH] Fix failure of detection of SPARSEMEM EXTREME in case of -x VMLINUX
  (Hatayama, Daisuke) 208124d
  - [PATCH] sadump: fix failure of reading 640 KB backup region if at over 4GB 
location
  (Hatayama, Daisuke) d015e6d
  - [PATCH] arm64: restore info->page_offset and implement 
paddr_to_vaddr_arm64()
  (Kazuhito Hagio) bc8b3bb
  - [PATCH] x86_64: fix an unnecessary message with --mem-usage option 
(Kazuhito Hagio) 53cf783

o Cleanup
  - [PATCH] Update help text to indicate --mem-usage is supported on archs 
other than x86_64
  (Bhupesh Sharma) 29dbb59
  - [PATCH] Prepare paddr_to_vaddr() for arch-specific p2v conversion (Kazuhito 
Hagio) 9eb5a31
  - [PATCH] Fix some compilation warnings with gcc-8.1.1 (Kazuhito Hagio) 
ec85943

Explanation of makedumpfile:
  To shorten the size of the dumpfile and the time of creating the
  dumpfile, makedumpfile copies only the necessary pages for analysis
  to the dumpfile from /proc/vmcore. You can specify the kind of
  unnecessary pages with dump_level. If you want to shorten the size
  further, enable the compression of the page data.

Download:
  You can download the latest makedumpfile from the following URL.
  Details of the change are written on the git page of the following site.
  https://sourceforge.net/projects/makedumpfile/

Method of installation:
  You can compile the makedumpfile command as follows;
  1. "tar -zxvf makedumpfile-x.y.z.tar.gz"
  2. "cd makedumpfile-x.y.z"
  3. "make; make install"

Usage:
  makedumpfile [-c|-l|-p|-E] [-d dump_level] [-x vmlinux] dump_mem dump_file

Example:
  If you want to exclude pages filled by zero, cache pages, user pages
  and free pages and to enable compression, please execute the following
  command.

  # makedumpfile -l -d 31 /proc/vmcore dumpfile


Thanks,
Kazu


___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile/ppc64: increase MAX_PHYSMEM_BITS to 2PB

2018-12-04 Thread Kazuhito Hagio
> -Original Message-
> * Required for kernel 4.20
> 
> With kernel commit 4ffe713b7587 ("powerpc/mm: Increase the max addressable
> memory to 2PB"), MAX_PHYSMEM_BITS is bumped up to 51 for SPARSEMEM_VMEMMAP
> and SPARSEMEM_EXTREME case. Make the appropriate update here.
> 
> Signed-off-by: Hari Bathini 

Hi Hari,

Thank you for your patch, as always.

I'm planning to release makedumpfile v1.6.5 tomorrow, which supports
up to 4.19 kernel, so this patch will be merged into v1.6.6.

Thanks,
Kazu

> ---
>  arch/ppc64.c   |5 +
>  makedumpfile.h |1 +
>  2 files changed, 6 insertions(+)
> 
> diff --git a/arch/ppc64.c b/arch/ppc64.c
> index 947a125..5b8231e 100644
> --- a/arch/ppc64.c
> +++ b/arch/ppc64.c
> @@ -486,6 +486,11 @@ set_ppc64_max_physmem_bits(void)
>   || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT(
>   return TRUE;
> 
> + info->max_physmem_bits  = _MAX_PHYSMEM_BITS_4_20;
> + if ((array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT_EXTREME()))
> + || (array_len == (NR_MEM_SECTIONS() / _SECTIONS_PER_ROOT(
> + return TRUE;
> +
>   return FALSE;
>  }
> 
> diff --git a/makedumpfile.h b/makedumpfile.h
> index 73813ed..d49f1f1 100644
> --- a/makedumpfile.h
> +++ b/makedumpfile.h
> @@ -672,6 +672,7 @@ unsigned long get_kvbase_arm64(void);
>  #define _MAX_PHYSMEM_BITS_ORIG  (44)
>  #define _MAX_PHYSMEM_BITS_3_7   (46)
>  #define _MAX_PHYSMEM_BITS_4_19  (47)
> +#define _MAX_PHYSMEM_BITS_4_20  (51)
>  #define REGION_SHIFT(60UL)
>  #define VMEMMAP_REGION_ID   (0xfUL)
> 
> 

___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH v2] makedumpfile: exclude pages that are logically offline

2018-11-27 Thread Kazuhito Hagio
> Linux marks pages that are logically offline via a page flag (map count).
> Such pages e.g. include pages infated as part of a balloon driver or
> pages that were not actually onlined when onlining the whole section.
> 
> While the hypervisor usually allows to read such inflated memory, we
> basically read and dump data that is completely irrelevant. Also, this
> might result in quite some overhead in the hypervisor. In addition,
> we saw some problems under Hyper-V, whereby we can crash the kernel by
> dumping, when reading memory of a partially onlined memory segment
> (for memory added by the Hyper-V balloon driver).
> 
> Therefore, don't read and dump pages that are marked as being logically
> offline.
> 
> Signed-off-by: David Hildenbrand 

Thanks for the v2 update.
I'm going to merge this patch after the kernel patches are merged
and it tests fine with the kernel.

Kazu

> ---
> 
> v1 -> v2:
> - Fix PAGE_BUDDY_MAPCOUNT_VALUE vs. PAGE_OFFLINE_MAPCOUNT_VALUE
> 
>  makedumpfile.c | 34 ++
>  makedumpfile.h |  1 +
>  2 files changed, 31 insertions(+), 4 deletions(-)
> 
> diff --git a/makedumpfile.c b/makedumpfile.c
> index 8923538..a5f2ea9 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -88,6 +88,7 @@ mdf_pfn_t pfn_cache_private;
>  mdf_pfn_t pfn_user;
>  mdf_pfn_t pfn_free;
>  mdf_pfn_t pfn_hwpoison;
> +mdf_pfn_t pfn_offline;
> 
>  mdf_pfn_t num_dumped;
> 
> @@ -249,6 +250,21 @@ isHugetlb(unsigned long dtor)
>  && (SYMBOL(free_huge_page) == dtor));
>  }
> 
> +static int
> +isOffline(unsigned long flags, unsigned int _mapcount)
> +{
> + if (NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE) == NOT_FOUND_NUMBER)
> + return FALSE;
> +
> + if (flags & (1UL << NUMBER(PG_slab)))
> + return FALSE;
> +
> + if (_mapcount == (int)NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE))
> + return TRUE;
> +
> + return FALSE;
> +}
> +
>  static int
>  is_cache_page(unsigned long flags)
>  {
> @@ -2287,6 +2303,8 @@ write_vmcoreinfo_data(void)
>   WRITE_NUMBER("PG_hwpoison", PG_hwpoison);
> 
>   WRITE_NUMBER("PAGE_BUDDY_MAPCOUNT_VALUE", PAGE_BUDDY_MAPCOUNT_VALUE);
> + WRITE_NUMBER("PAGE_OFFLINE_MAPCOUNT_VALUE",
> +  PAGE_OFFLINE_MAPCOUNT_VALUE);
>   WRITE_NUMBER("phys_base", phys_base);
> 
>   WRITE_NUMBER("HUGETLB_PAGE_DTOR", HUGETLB_PAGE_DTOR);
> @@ -2687,6 +2705,7 @@ read_vmcoreinfo(void)
>   READ_SRCFILE("pud_t", pud_t);
> 
>   READ_NUMBER("PAGE_BUDDY_MAPCOUNT_VALUE", PAGE_BUDDY_MAPCOUNT_VALUE);
> + READ_NUMBER("PAGE_OFFLINE_MAPCOUNT_VALUE", PAGE_OFFLINE_MAPCOUNT_VALUE);
>   READ_NUMBER("phys_base", phys_base);
>  #ifdef __aarch64__
>   READ_NUMBER("VA_BITS", VA_BITS);
> @@ -6041,6 +6060,12 @@ __exclude_unnecessary_pages(unsigned long mem_map,
>   else if (isHWPOISON(flags)) {
>   pfn_counter = _hwpoison;
>   }
> + /*
> +  * Exclude pages that are logically offline.
> +  */
> + else if (isOffline(flags, _mapcount)) {
> + pfn_counter = _offline;
> + }
>   /*
>* Unexcludable page
>*/
> @@ -7522,7 +7547,7 @@ write_elf_pages_cyclic(struct cache_data *cd_header, 
> struct cache_data *cd_page)
>*/
>   if (info->flag_cyclic) {
>   pfn_zero = pfn_cache = pfn_cache_private = 0;
> - pfn_user = pfn_free = pfn_hwpoison = 0;
> + pfn_user = pfn_free = pfn_hwpoison = pfn_offline = 0;
>   pfn_memhole = info->max_mapnr;
>   }
> 
> @@ -8804,7 +8829,7 @@ write_kdump_pages_and_bitmap_cyclic(struct cache_data 
> *cd_header, struct cache_d
>* Reset counter for debug message.
>*/
>   pfn_zero = pfn_cache = pfn_cache_private = 0;
> - pfn_user = pfn_free = pfn_hwpoison = 0;
> + pfn_user = pfn_free = pfn_hwpoison = pfn_offline = 0;
>   pfn_memhole = info->max_mapnr;
> 
>   /*
> @@ -9749,7 +9774,7 @@ print_report(void)
>   pfn_original = info->max_mapnr - pfn_memhole;
> 
>   pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private
> - + pfn_user + pfn_free + pfn_hwpoison;
> + + pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
>   shrinking = (pfn_original - pfn_excluded) * 100;
>   shrinking = shrinking / pfn_original;
> 
> @@ -9763,6 +9788,7 @@ print_report(void)
>   REPORT_MSG("User process data pages : 0x%016llx\n", pfn_user);
>   REPORT_MSG("Free pages  : 0x%016llx\n", pfn_free);
>   REPORT_MSG("Hwpoison pages  : 0x%016llx\n", pfn_hwpoison);
> + REPORT_MSG("Offline pages   : 0x%016llx\n", pfn_offline);
>   REPORT_MSG("  Remaining pages  : 0x%016llx\n",
>   pfn_original - pfn_excluded);
>   REPORT_MSG("  (The number of pages is reduced to %lld%%.)\n",
> @@ -9790,7 +9816,7 @@ 

RE: [PATCH v1] makedumpfile: exclude pages that are logically offline

2018-11-21 Thread Kazuhito Hagio
Hi David,

> Linux marks pages that are logically offline via a page flag (map count).
> Such pages e.g. include pages inflated as part of a balloon driver or
> pages that were not actually onlined when onlining the whole section.
> 
> While the hypervisor usually allows to read such inflated memory, we
> basically read and dump data that is completely irrelevant. Also, this
> might result in quite some overhead in the hypervisor. In addition,
> we saw some problems under Hyper-V, whereby we can crash the kernel by
> dumping, when reading memory of a partially onlined memory segment
> (for memory added by the Hyper-V balloon driver).
> 
> Therefore, don't read and dump pages that are marked as being logically
> offline.
> 
> Signed-off-by: David Hildenbrand 
> ---
>  makedumpfile.c | 34 ++
>  makedumpfile.h |  1 +
>  2 files changed, 31 insertions(+), 4 deletions(-)
> 
> diff --git a/makedumpfile.c b/makedumpfile.c
> index 8923538..b8bfd4c 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -88,6 +88,7 @@ mdf_pfn_t pfn_cache_private;
>  mdf_pfn_t pfn_user;
>  mdf_pfn_t pfn_free;
>  mdf_pfn_t pfn_hwpoison;
> +mdf_pfn_t pfn_offline;
> 
>  mdf_pfn_t num_dumped;
> 
> @@ -249,6 +250,21 @@ isHugetlb(unsigned long dtor)
>  && (SYMBOL(free_huge_page) == dtor));
>  }
> 
> +static int
> +isOffline(unsigned long flags, unsigned int _mapcount)
> +{
> + if (NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE) == NOT_FOUND_NUMBER)
> + return FALSE;

This is NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE), isn't it?
If so, I will correct it when merging.

Otherwise, looks good to me.

Thanks!
Kazu

> +
> + if (flags & (1UL << NUMBER(PG_slab)))
> + return FALSE;
> +
> + if (_mapcount == (int)NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE))
> + return TRUE;
> +
> + return FALSE;
> +}
> +
>  static int
>  is_cache_page(unsigned long flags)
>  {
> @@ -2287,6 +2303,8 @@ write_vmcoreinfo_data(void)
>   WRITE_NUMBER("PG_hwpoison", PG_hwpoison);
> 
>   WRITE_NUMBER("PAGE_BUDDY_MAPCOUNT_VALUE", PAGE_BUDDY_MAPCOUNT_VALUE);
> + WRITE_NUMBER("PAGE_OFFLINE_MAPCOUNT_VALUE",
> +  PAGE_OFFLINE_MAPCOUNT_VALUE);
>   WRITE_NUMBER("phys_base", phys_base);
> 
>   WRITE_NUMBER("HUGETLB_PAGE_DTOR", HUGETLB_PAGE_DTOR);
> @@ -2687,6 +2705,7 @@ read_vmcoreinfo(void)
>   READ_SRCFILE("pud_t", pud_t);
> 
>   READ_NUMBER("PAGE_BUDDY_MAPCOUNT_VALUE", PAGE_BUDDY_MAPCOUNT_VALUE);
> + READ_NUMBER("PAGE_OFFLINE_MAPCOUNT_VALUE", PAGE_OFFLINE_MAPCOUNT_VALUE);
>   READ_NUMBER("phys_base", phys_base);
>  #ifdef __aarch64__
>   READ_NUMBER("VA_BITS", VA_BITS);
> @@ -6041,6 +6060,12 @@ __exclude_unnecessary_pages(unsigned long mem_map,
>   else if (isHWPOISON(flags)) {
>   pfn_counter = _hwpoison;
>   }
> + /*
> +  * Exclude pages that are logically offline.
> +  */
> + else if (isOffline(flags, _mapcount)) {
> + pfn_counter = _offline;
> + }
>   /*
>* Unexcludable page
>*/
> @@ -7522,7 +7547,7 @@ write_elf_pages_cyclic(struct cache_data *cd_header, 
> struct cache_data *cd_page)
>*/
>   if (info->flag_cyclic) {
>   pfn_zero = pfn_cache = pfn_cache_private = 0;
> - pfn_user = pfn_free = pfn_hwpoison = 0;
> + pfn_user = pfn_free = pfn_hwpoison = pfn_offline = 0;
>   pfn_memhole = info->max_mapnr;
>   }
> 
> @@ -8804,7 +8829,7 @@ write_kdump_pages_and_bitmap_cyclic(struct cache_data 
> *cd_header, struct cache_d
>* Reset counter for debug message.
>*/
>   pfn_zero = pfn_cache = pfn_cache_private = 0;
> - pfn_user = pfn_free = pfn_hwpoison = 0;
> + pfn_user = pfn_free = pfn_hwpoison = pfn_offline = 0;
>   pfn_memhole = info->max_mapnr;
> 
>   /*
> @@ -9749,7 +9774,7 @@ print_report(void)
>   pfn_original = info->max_mapnr - pfn_memhole;
> 
>   pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private
> - + pfn_user + pfn_free + pfn_hwpoison;
> + + pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
>   shrinking = (pfn_original - pfn_excluded) * 100;
>   shrinking = shrinking / pfn_original;
> 
> @@ -9763,6 +9788,7 @@ print_report(void)
>   REPORT_MSG("User process data pages : 0x%016llx\n", pfn_user);
>   REPORT_MSG("Free pages  : 0x%016llx\n", pfn_free);
>   REPORT_MSG("Hwpoison pages  : 0x%016llx\n", pfn_hwpoison);
> + REPORT_MSG("Offline pages   : 0x%016llx\n", pfn_offline);
>   REPORT_MSG("  Remaining pages  : 0x%016llx\n",
>   pfn_original - pfn_excluded);
>   REPORT_MSG("  (The number of pages is reduced to %lld%%.)\n",
> @@ -9790,7 +9816,7 @@ print_mem_usage(void)
>   pfn_original = info->max_mapnr - 

RE: [PATCH v2] x86_64, vmcoreinfo: Append 'page_offset_base' to vmcoreinfo

2018-11-19 Thread Kazuhito Hagio
On 11/15/2018 4:47 PM, Bhupesh Sharma wrote:
> Adding 'page_offset_base' to the vmcoreinfo can be specially useful for
> live-debugging of a running kernel via user-space utilities
> like makedumpfile (see [1]).

I agree.

> Recently, I saw an issue with the 'makedumpfile' utility (see [2] for
> details), whose live debugging feature is broken with newer kernels
> (I tested the same with 4.19-rc8+ kernel), as KCORE_REMAP segments were
> added to kcore, thus leading to an additional sections in the same, and
> makedumpfile is no longer able to determine the start of direct
> mapping of all physical memory, as it relies on traversing the PT_LOAD
> segments inside kcore and using the last PT_LOAD segment
> to determine the start of direct mapping.

Actually, the KCORE_REMAP segments were already removed from kcore by
commit bf904d2762ee ("x86/pti/64: Remove the SYSCALL64 entry trampoline")
and kcore's program headers got back to the previous ones, but this
fact shows that they are changeable.

So I think that if we have this NUMBER(page_offset_base) in vmcoreinfo
for x86_64, user-space tools (especially makedumpfile) would become
more stable against changes in kcore and vmcore, rather than depending
on their PT_LOAD values.

Thanks,
Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH v2 2/2] makedumpfile/x86_64: Add 'page_offset_base' reading capability from VMCOREINFO

2018-11-16 Thread Kazuhito Hagio
Hi Bhupesh,

On 10/29/2018 3:50 AM, Bhupesh Sharma wrote:
> makedumpfile '--mem-usage' use-case is currently broken on x86_64 with
> latest kernels (see a sample log below with 4.19 kernel):
> 
> [root@hp-dl380pgen8-02-vm-15 ~]# makedumpfile -f --mem-usage /proc/kcore
>The kernel version is not supported.
>The makedumpfile operation may be incomplete.
>
>set_kcore_vmcoreinfo: Can't get the offset of VMCOREINFO(/proc/kcore).
>Success
> 
>makedumpfile Failed.
> 
> It seems that the issue is seen because KCORE_REMAP, which was added
> to recent kernels, appears as a PT_LOAD in '/proc/kcore'.

I'm a bit late, but as for this problem:

* Since kernel 4.19 itself doesn't have NUMBER(page_offset_base) in vmcoreinfo,
we need to merge the patch I sent in order to support kernel 4.19.
http://lists.infradead.org/pipermail/kexec/2018-October/021749.html

* It looks like the following commit already removed KCORE_REMAP PT_LOAD from
/proc/kcore at 4.20-rc1, and the problem doesn't occur with mainline kernel.

commit bf904d2762ee6fc1e4acfcb0772bbfb4a27ad8a6
Author: Andy Lutomirski 
Date:   Mon Sep 3 15:59:44 2018 -0700

x86/pti/64: Remove the SYSCALL64 entry trampoline
...
-   /*
-* The cpu_entry_area alias addresses are not in the kernel binary
-* so they do not show up in /proc/kcore normally.  This adds entries
-* for them manually.
-*/
-   kclist_add_remap(_cpu(kcore_entry_trampoline, cpu),
-_entry_trampoline,
-_cpu_entry_area(cpu)->entry_trampoline, PAGE_SIZE);

But yes, kcore PT_LOADs seems changeable like this.

Thanks,
Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH v2 1/2] makedumpfile/arm64: Use VMCOREINFO inside '/proc/kcore' (if available)

2018-11-16 Thread Kazuhito Hagio
Hi Bhupesh,

On 11/15/2018 3:27 PM, Bhupesh Sharma wrote:
>>> Note that PAGE_OFFSET is not constant on KASLR enabled arm64 kernels
>>> as the start of the linear range gets randomized as well (as per the
>>> KASLR seed) which means that PAGE_OFFSET is no longer equal to the
>>> macro:
>>> #define PAGE_OFFSET (UL(0x) - \
>>>(UL(1) << (VA_BITS - 1)) + 1)
>>
>> The PAGE_OFFSET macro itself is still constant in arm64 kernel and used by
>> p2v/v2p calculation, and also used by the test whether a virtual address is
>> in linear kernel range (as VA_BITS):

> I understand, but for me (and as I understood from other arm64 users
> as well), the kernel
> virtual address layout and the comments we have in place are probably
> broken after KASLR
> was enabled on arm64. This is something I shared with arm64 maintainers as 
> well:
> [1]. https://www.spinics.net/lists/arm-kernel/msg655933.html
> 
> Lets go through some details first.
> Please note that the PAGE_OFFSET is defined as
> ('arch/arm64/include/asm/memory.h'):
> 
> /*
>  * PAGE_OFFSET - the virtual address of the start of the linear map (top
>  * (VA_BITS - 1))
> 
> So, basically this points to the virtual address which indicates the
> start of linear mapping.
> Now, the linear region takes up exactly half of the kernel virtual
> address space.
> 
> So, with a normal non-KASLR boot (I am sharing the logs below on a
> system booted with 'nokaslr' in bootargs),
> the PAGE_OFFSET macro would normally point to the half mark of the
> kernel virtual address space.
> [I needed to revert the kernel commit
> "071929dbdd865f779a89ba3f1e06ba8d17dd3743: arm64: Stop printing the
> virtual memory layout" to get these logs]
> 
> [0.00] Virtual kernel memory layout:
> [0.00] modules : 0x - 0x0800
> (   128 MB)
> [0.00] vmalloc : 0x0800 - 0x7bdf
> (126847 GB)
> [0.00]   .text : 0x(ptrval) - 0x(ptrval)
> (  8768 KB)
> [0.00] .rodata : 0x(ptrval) - 0x(ptrval)
> (  5696 KB)
> [0.00]   .init : 0x(ptrval) - 0x(ptrval)
> (  4224 KB)
> [0.00]   .data : 0x(ptrval) - 0x(ptrval)
> (  1775 KB)
> [0.00].bss : 0x(ptrval) - 0x(ptrval)
> (  9193 KB)
> [0.00] fixed   : 0x7fdffe79 - 0x7fdffec0
> (  4544 KB)
> [0.00] PCI I/O : 0x7fdffee0 - 0x7fdfffe0
> (16 MB)
> [0.00] vmemmap : 0x7fe0 - 0x8000
> (   128 GB maximum)
> [0.00]   0x7fe0 - 0x7fe00400
> (64 MB actual)
> [0.00] memory  : 0x8000 - 0x8010
> ( 65536 MB)
> 
> This is on a system with 64K page size and VA_BITS=48.
> For such a configuration, PAGE_OFFSET is also calculated as
> 0x8000, which the
> same as the start address of the memory range (i.e memory  :
> 0x8000 - 0x8010)
> which depicts the linear region from where 'kmalloc' allocations take place.
> 
> Now, lets look at the boot logs on the same system with 'nokaslr'
> removed from bootargs (i.e. a KASLR boot case):
> 
> [0.00] Virtual kernel memory layout:
> [0.00] modules : 0x - 0x0800
> (   128 MB)
> [0.00] vmalloc : 0x0800 - 0x7bdf
> (126847 GB)
> [0.00]   .text : 0x(ptrval) - 0x(ptrval)
> (  8768 KB)
> [0.00] .rodata : 0x(ptrval) - 0x(ptrval)
> (  5696 KB)
> [0.00]   .init : 0x(ptrval) - 0x(ptrval)
> (  4224 KB)
> [0.00]   .data : 0x(ptrval) - 0x(ptrval)
> (  1775 KB)
> [0.00].bss : 0x(ptrval) - 0x(ptrval)
> (  9193 KB)
> [0.00] fixed   : 0x7fdffe79 - 0x7fdffec0
> (  4544 KB)
> [0.00] PCI I/O : 0x7fdffee0 - 0x7fdfffe0
> (16 MB)
> [0.00] vmemmap : 0x7fe0 - 0x8000
> (   128 GB maximum)
> [0.00]   0x7fea5b30 - 0x7fea5f30
> (64 MB actual)
> [0.00] memory  : 0xa96cc000 - 0xa97cc000
> ( 65536 MB)
> 
> As you will note here, the start of linear range as indicated by the
> 'memory' node is 0xa96cc000
> (this would be a random value for each KASLR boot), however the
> PAGE_OFFSET macro is still stuck
> at 0x8000, which is confusing (and probably incorrect), as
> the 'kmalloc' calls return addresses in the range
> '0xa96cc000 - 0xa97cc000' and not in the
> '0x8000 - 0x8010'  range
> anymore as we saw in the non-KASLR boot case.

Thanks for the details.
Its name may be confusing, but I think that whether it is correct or not is
how it is used. If there is a misuse bug 

RE: [PATCH v2 1/2] makedumpfile/arm64: Use VMCOREINFO inside '/proc/kcore' (if available)

2018-11-02 Thread Kazuhito Hagio
Hi Bhupesh,

On 10/29/2018 3:50 AM, Bhupesh Sharma wrote:
> makedumpfile commit 94c97db3fe859ca14d7b38b0ae9ee0ffb83689d2 (arm64: Get
> 'info->page_offset' from PT_LOAD segments to support KASLR boot cases)
> added a method to determine 'info->page_offset' from PT_LOAD segments
> for arm64 platforms.
> 
> In this commit, I hardcoded the 'NOT_PADDR_ARM64' macro as
> 0x10a8UL which was a valid value on qualcomm-amberwing
> boards (which was the arm64 board available with me at the time).
> 
> However, I was testing this change on several other arm64 boards
> like apm-mustang, huawei-taishan and hp-moonshot (now that I have
> access to them) and saw that this value can vary on the basis of
> the "Kernel code" memory range placement.
> 
> To fix the same, this patchset uses a new approach. Since kernel
> version 4.19-rc5 (commit
> 23c85094fe1895caefdd19ef624ee687ec5f4507 ["proc/kcore: add vmcoreinfo
> note to /proc/kcore"]), '/proc/kcore' contains a new
> PT_NOTE which carries the VMCOREINFO information.
> 
> If the same is available, we can use it for makedumpfile
> 'show_mem_usage()' functionality. This is especially useful
> for architectures like arm64 as we can get kernel symbols like
> 'VA_BITS' and 'kimage_voffset' from the '/proc/kcore' itself and use it
> to calculate 'info->page_offset' when we make a call to
> 'get_page_offset()'.

First, in show_mem_usage(), we set info->page_offset in advance
in order to get the offset of vmcoreinfo in /proc/kcore (not in PT_NOTE)
from /sys/kernel/vmcoreinfo information.

Since now PT_NOTE has vmcoreinfo in /proc/kcore as well, we don't
need get_page_offset() here any longer, right?

So I suggest the following patch for show_mem_usage(),
in addition, we can remove the get_elf_loads() function.

diff --git a/makedumpfile.c b/makedumpfile.c
index 91c1ab4..a02665b 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -11208,17 +11208,23 @@ int show_mem_usage(void)
if (!open_files_for_creating_dumpfile())
return FALSE;
 
-   if (!get_elf_loads(info->fd_memory, info->name_memory))
+   if (!get_elf_info(info->fd_memory, info->name_memory))
return FALSE;
 
-   if (!get_page_offset())
-   return FALSE;
+   /*
+* Since kernel 4.19, /proc/kcore also has vmcoreinfo in PT_NOTE,
+* in that case we don't need /sys/kernel/vmcoreinfo.
+*/
+   if (!has_vmcoreinfo()) {
+   if (!get_page_offset())
+   return FALSE;
 
-   if (!get_sys_kernel_vmcoreinfo(_addr, _len))
-   return FALSE;
+   if (!get_sys_kernel_vmcoreinfo(_addr, 
_len))
+   return FALSE;
 
-   if (!set_kcore_vmcoreinfo(vmcoreinfo_addr, vmcoreinfo_len))
-   return FALSE;
+   if (!set_kcore_vmcoreinfo(vmcoreinfo_addr, vmcoreinfo_len))
+   return FALSE;
+   }
 
if (!initial())
return FALSE;


> 
> This VMCOREINFO note provides us a standard interface which can be
> leveraged while debugging live (or primary) kernel with makedumpfile
> (and other user-space tools), especially to derive the machine specific
> details (for e.g. VA_BITS, PHYS_OFFSET and kimage_voffset for arm64
> arch). The same has been suggested by the arm64 kernel maintainers (see
> [0]) as the standard interface exposed by kernel for sharing
> machine specific details with the user-land via vmcoreinfo.
> 
> [0]. https://www.mail-archive.com/kexec@lists.infradead.org/msg20300.html
> 
> I will send a follow-up patch to read 'kaslr_offset' for arm64 cases
> from vmcoreinfo inside '/proc/kcore' after this patchset is applied.
> 
> Here are some details of the tests I ran:
> 
> Testing:
> 
> 1. arm64 boards tested:
>huawei-taishan, apm-mustang and qualcomm-amberwing boards.
> 
> 2. Use-cases tested:
>a) Primary kernel ->
>   [] --mem-usage:
>  # makedumpfile -f --mem-usage /proc/kcore
> 
>   [] filtering use-case:
>  # makedumpfile --split -d 31 -x vmlinux --config scrub.conf vmcore 
> dumpfile_{1,2,3}
> 
>   [] dumpfile creation:
>  # makedumpfile -d 31 -x vmlinux vmcore dumpfile
> 
>b) Crash kernel ->
>   [] dumpfile creation:
>  # makedumpfile -l --message-level 31 -d 31 /proc/vmcore dump
> 
> 3. Kernel versions tested:
>a) Kernel version 4.19-rc5 and above.
>b) Kernel version 4.14.
> 
> Fixes: 94c97db3fe859ca14d7b38b0ae9ee0ffb83689d2 "arm64: Get 
> 'info->page_offset' from PT_LOAD segments to
> support KASLR boot cases"
> Cc: Baoquan He 
> Cc: Kazuhito Hagio 
> Signed-off-by: Bhupesh Sharma 
> ---
>  arch/arm64.c   | 114 
> +++

RE: [PATCH] kdump, vmcoreinfo: Export sme_me_mask value to vmcoreinfo

2018-11-01 Thread Kazuhito Hagio
On 10/31/2018 6:10 AM, Borislav Petkov wrote:
> On Wed, Oct 31, 2018 at 10:47:48AM +0800, Dave Young wrote:
> > It is a mist only a few kdump people know them, documenting them will help
> > people to understand and review. It will also be clearer instead of
> > digging into code?
> 
> Wholeheartedly agreed. Especially if people start using vmcoreinfo for
> other stuff, like live debugging:
> 
> https://lkml.kernel.org/r/1540593788-28181-1-git-send-email-bhsha...@redhat.com

I also agree. If it can help reviewers and other users to understand
vmcoreinfo and can help itself to become more standard, it would be
better to write it.

One small thing as a vmcoreinfo user (not about the documentation),
I think it might be better to export each information to each variable
separately, not OR-ing them into a variable, because of code simpleness
of both kernel and tools, if there is no limitation in kernel.

Thanks,
Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


RE: [PATCH] makedumpfile: sadump: fix failure of reading 640 KB backup region if at over 4GB location

2018-10-31 Thread Kazuhito Hagio
On 10/29/2018 10:47 PM, Hatayama, Daisuke wrote:
> Currently, in function sadump_kdump_backup_region_init(), variable mem
> holding physical memory to read as a candidate of the ELF core header
> is of type unsigned int with just 4 byte length:
> 
> for (i = 0; i < ARRAY_LENGTH(kimage.segment); ++i) {
> char e_ident[EI_NIDENT];
> unsigned mem;
> 
> 
> mem=ULONG(buf+i*SIZE(kexec_segment)+OFFSET(kexec_segment.mem));
> if (!mem)
> continue;
> 
> if (!readmem(PADDR, mem, e_ident, SELFMAG)) {
> DEBUG_MSG("sadump: failed to read elfcorehdr 
> buffer\n");
> return;
> }
> 
> Thus, if backup region for the first 640KB physical memory is located
> at over 4GB location thanks to crashkernel=size,high like:
> 
> # grep crashkernel /proc/cmdline
> BOOT_IMAGE=(hd0,gpt2)/vmlinuz-4.18 root=/dev/mapper/rhel-root ro 
> crashkernel=512M,high
> 
> # grep Crash /proc/iomem
>   0600-15ff : Crash kernel
>   107f00-109eff : Crash kernel
> 
> crash> rd -p 0x109ef5d000
> 109ef5d000:  00010102464c457f.ELF
> 
> the upper 32-bit of the physical address in mem variable is dropped
> and readmem() fails while outputting the following debug message:
> 
> # LANG=C ./makedumpfile --message-level 8 -f -l -d 31 -x ./vmlinux 
> /dev/sdc vmcore-ld31
> sadump: read dump device as single partition
> sadump: single partition configuration
> page_size: 4096
> sadump: timezone information is missing
> sadump: idtr=fe00
> sadump: cr3=86b42e000
> sadump: idtr(phys)=4c35cc000
> sadump: devide_error(vmlinux)=81a00c50
> sadump: devide_error(vmcore)=a0c00c50
> sadump: cmdline vaddr: a1bcf008
> sadump: cmdline paddr: 4c35cf008
> sadump: cmdline buf vaddr: 8ae89ffceec0
> sadump: cmdline buf paddr: 109ffceec0
> sadump: kaslr_offset=1f20
> sadump: phys_base=4a1a0
> sadump: online cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 
> 21 22 23 24 25 26 27 28 29
> 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 
> 56 57 58 59
> sadump: nr_cpus: 60
> 
> sadump: failed to read elfcorehdr buffer <--- This is the debug message 
> indicating
>   reading ELF core header 
> fails
> 
> Then, the generated vmcore has invalid data in its first 640KB part.
> 
> The variable mem needs to have type of 64-bit length.
> 
> With this patch, kdump backup region is successfully found as follows:
> 
> # LANG=C ./makedumpfile --message-level 31 -f -l -d 31 -x ./vmlinux 
> /dev/sdc vmcore-ld31
> sadump: read dump device as single partition
> sadump: single partition configuration
> page_size: 4096
> sadump: timezone information is missing
> sadump: idtr=fe00
> sadump: cr3=86b42e000
> sadump: idtr(phys)=4c35cc000
> sadump: devide_error(vmlinux)=81a00c50
> sadump: devide_error(vmcore)=a0c00c50
> sadump: cmdline vaddr: a1bcf008
> sadump: cmdline paddr: 4c35cf008
> sadump: cmdline buf vaddr: 8ae89ffceec0
> sadump: cmdline buf paddr: 109ffceec0
> sadump: kaslr_offset=1f20
> sadump: phys_base=4a1a0
> sadump: online cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 
> 21 22 23 24 25 26 27 28 29
> 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 
> 56 57 58 59
> sadump: nr_cpus: 60
> The kernel version is not supported.
> The makedumpfile operation may be incomplete.
> sadump: SRC_START: 0x001000 SRC_SIZE: 0x09f000 
> SRC_OFFSET: 0x109ef61000
> sadump: kdump backup region used
> ..
> 
> By the way, before crashkernel=size,high was introduced, there was
> limitation that ELF core header resides at under 4GB location, so
> defining it as unsigned int was not entirely wrong at that time.
> 
> Signed-off-by: HATAYAMA Daisuke 
> ---
>  sadump_info.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/sadump_info.c b/sadump_info.c
> index dd50d48..a668dca 100644
> --- a/sadump_info.c
> +++ b/sadump_info.c
> @@ -2395,7 +2395,7 @@ sadump_kdump_backup_region_init(void)
>   elfcorehdr_p = 0;
>   for (i = 0; i < ARRAY_LENGTH(kimage.segment); ++i) {
>   char e_ident[EI_NIDENT];
> - unsigned mem;
> + unsigned long mem;
> 
>   mem=ULONG(buf+i*SIZE(kexec_segment)+OFFSET(kexec_segment.mem));
>   if (!mem)
> --
> 2.18.0
> 
> 

Thanks Hatayama-san, applied to devel branch.

Kazu



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


[PATCH] makedumpfile/x86_64: Fix calculation of page_offset for kernel 4.19

2018-10-26 Thread Kazuhito Hagio
Hi Bhupesh, Baoquan,

As for x86_64, I'm going to merge the patch below for fixing the
--mem-usage issue with kernel 4.19, if there is no objection.
I think the same approach will also work on arm64 with regard to
page_offset for the time being..

Thanks,
Kazu

--
From: Kazuhito Hagio 
Date: Fri, 26 Oct 2018 14:43:22 -0400
Subject: [PATCH] x86_64: Fix calculation of page_offset for kernel 4.19

* Required for kernel 4.19

Kernel commit 6855dc41b24619c3d1de3dbd27dd0546b0e45272 ("x86: Add
entry trampolines to kcore") added program headers for PTI entry
trampoline pages to /proc/kcore.

This caused the failure of makedumpfile --mem-usage due to wrong
calculation of page_offset.

  # makedumpfile --mem-usage /proc/kcore
  [...]
  set_kcore_vmcoreinfo: Can't get the offset of VMCOREINFO(/proc/kcore). Success

  makedumpfile Failed.

Since program headers for linear maps are located after ones for
kernel text and so on in /proc/vmcore and /proc/kcore, with this
patch, we use the last valid one to set page_offset.

Also, this patch adds a few debug messages for better debugging.

Cc: Bhupesh Sharma 
Cc: Baoquan He 
Signed-off-by: Kazuhito Hagio 
---
 arch/x86_64.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/x86_64.c b/arch/x86_64.c
index 2b3c0bb..ed2a970 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -95,10 +95,17 @@ get_page_offset_x86_64(void)
 ERRMSG("Can't read page_offset_base.\n");
 return FALSE;
}
+   DEBUG_MSG("page_offset  : %lx (from page_offset_base)\n",
+   info->page_offset);
return TRUE;
}
 
if (get_num_pt_loads()) {
+   /*
+* Since program headers for linear maps are located after
+* ones for kernel text and so on in /proc/vmcore and
+* /proc/kcore, we use the last valid one to set page_offset.
+*/
for (i = 0;
get_pt_load(i, _start, NULL, _start, NULL);
i++) {
@@ -106,9 +113,13 @@ get_page_offset_x86_64(void)
&& virt_start < __START_KERNEL_map
&& phys_start != NOT_PADDR) {
info->page_offset = virt_start - phys_start;
-   return TRUE;
}
}
+   if (info->page_offset) {
+   DEBUG_MSG("page_offset  : %lx (from pt_load)\n",
+   info->page_offset);
+   return TRUE;
+   }
}
 
if (info->kernel_version < KERNEL_VERSION(2, 6, 27)) {
@@ -119,6 +130,7 @@ get_page_offset_x86_64(void)
info->page_offset = __PAGE_OFFSET_2_6_27;
}
 
+   DEBUG_MSG("page_offset  : %lx (from constant)\n", info->page_offset);
return TRUE;
 }
 
-- 
1.8.3.1



___
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec


  1   2   >