from:"Nick Kossifidis"

[PATCH v4 4/5] RISC-V: Add kdump support

2021-04-18 Thread Nick Kossifidis

This patch adds support for kdump, the kernel will reserve a
region for the crash kernel and jump there on panic. In order
for userspace tools (kexec-tools) to prepare the crash kernel
kexec image, we also need to expose some information on
/proc/iomem for the memory regions used by the kernel and for
the region reserved for crash kernel. Note that on userspace
the device tree is used to determine the system's memory
layout so the "System RAM" on /proc/iomem is ignored.

I tested this on riscv64 qemu and works as expected, you may
test it by triggering a crash through /proc/sysrq_trigger:

echo c > /proc/sysrq_trigger

v4:
 * Re-base on top of "fixes" branch
 * Use CSR_* macros and PMD_SIZE where possible

v3:
 * Move ELF_CORE_COPY_REGS to asm/elf.h instead of uapi/asm/elf.h
 * Set stvec when disabling MMU
 * Minor cleanups and re-base

v2:
 * Properly populate the ioresources tree, so that it can be
   used later on for implementing strict /dev/mem.
 * Minor cleanups and re-base

Signed-off-by: Nick Kossifidis 
---
 arch/riscv/include/asm/elf.h|  6 +++
 arch/riscv/include/asm/kexec.h  | 19 +---
 arch/riscv/kernel/Makefile  |  2 +-
 arch/riscv/kernel/crash_save_regs.S | 56 +++
 arch/riscv/kernel/kexec_relocate.S  | 68 ++-
 arch/riscv/kernel/machine_kexec.c   | 43 +
 arch/riscv/kernel/setup.c   | 11 -
 arch/riscv/mm/init.c| 71 +
 8 files changed, 249 insertions(+), 27 deletions(-)
 create mode 100644 arch/riscv/kernel/crash_save_regs.S

diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h
index 5c725e1df..f4b490cd0 100644
--- a/arch/riscv/include/asm/elf.h
+++ b/arch/riscv/include/asm/elf.h
@@ -81,4 +81,10 @@ extern int arch_setup_additional_pages(struct linux_binprm 
*bprm,
int uses_interp);
 #endif /* CONFIG_MMU */
 
+#define ELF_CORE_COPY_REGS(dest, regs) \
+do {   \
+   *(struct user_regs_struct *)&(dest) =   \
+   *(struct user_regs_struct *)regs;   \
+} while (0);
+
 #endif /* _ASM_RISCV_ELF_H */
diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h
index 86e6e4922..1e9541019 100644
--- a/arch/riscv/include/asm/kexec.h
+++ b/arch/riscv/include/asm/kexec.h
@@ -23,11 +23,16 @@
 
 #define KEXEC_ARCH KEXEC_ARCH_RISCV
 
+extern void riscv_crash_save_regs(struct pt_regs *newregs);
+
 static inline void
 crash_setup_regs(struct pt_regs *newregs,
 struct pt_regs *oldregs)
 {
-   /* Dummy implementation for now */
+   if (oldregs)
+   memcpy(newregs, oldregs, sizeof(struct pt_regs));
+   else
+   riscv_crash_save_regs(newregs);
 }
 
 
@@ -40,10 +45,12 @@ struct kimage_arch {
 const extern unsigned char riscv_kexec_relocate[];
 const extern unsigned int riscv_kexec_relocate_size;
 
-typedef void (*riscv_kexec_do_relocate)(unsigned long first_ind_entry,
-   unsigned long jump_addr,
-   unsigned long fdt_addr,
-   unsigned long hartid,
-   unsigned long va_pa_off);
+typedef void (*riscv_kexec_method)(unsigned long first_ind_entry,
+  unsigned long jump_addr,
+  unsigned long fdt_addr,
+  unsigned long hartid,
+  unsigned long va_pa_off);
+
+extern riscv_kexec_method riscv_kexec_norelocate;
 
 #endif
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 211ef87de..41a1469b2 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -59,7 +59,7 @@ obj-$(CONFIG_SMP) += cpu_ops_sbi.o
 endif
 obj-$(CONFIG_HOTPLUG_CPU)  += cpu-hotplug.o
 obj-$(CONFIG_KGDB) += kgdb.o
-obj-$(CONFIG_KEXEC)+= kexec_relocate.o machine_kexec.o
+obj-$(CONFIG_KEXEC)+= kexec_relocate.o crash_save_regs.o 
machine_kexec.o
 
 obj-$(CONFIG_JUMP_LABEL)   += jump_label.o
 
diff --git a/arch/riscv/kernel/crash_save_regs.S 
b/arch/riscv/kernel/crash_save_regs.S
new file mode 100644
index 0..7832fb763
--- /dev/null
+++ b/arch/riscv/kernel/crash_save_regs.S
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 FORTH-ICS/CARV
+ *  Nick Kossifidis 
+ */
+
+#include/* For RISCV_* and REG_* macros */
+#include/* For CSR_* macros */
+#include/* For offsets on pt_regs */
+#include  /* For SYM_* macros */
+
+.section ".text"
+SYM_CODE_START(riscv_crash_save_regs)
+   REG_S ra,  PT_RA(a0)/* x1 */
+   REG_S sp,  PT_SP(a0)/* x2 */
+   REG_S gp,  PT_GP(a0)/* x3 */
+   REG_S tp,  PT_TP(a0)/* x4 */
+   REG_S t0,  PT_T0(a0)/* x

[PATCH v4 3/5] RISC-V: Improve init_resources

2021-04-18 Thread Nick Kossifidis

* Kernel region is always present and we know where it is, no
need to look for it inside the loop, just ignore it like the
rest of the reserved regions within system's memory.

* Don't call memblock_free inside the loop, if called it'll split
the region of pre-allocated resources in two parts, messing things
up, just re-use the previous pre-allocated resource and free any
unused resources after both loops finish.

Signed-off-by: Nick Kossifidis 
---
 arch/riscv/kernel/setup.c | 91 ---
 1 file changed, 46 insertions(+), 45 deletions(-)

diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index f8f15332c..030554bab 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -60,6 +60,7 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
  * also add "System RAM" regions for compatibility with other
  * archs, and the rest of the known regions for completeness.
  */
+static struct resource kimage_res = { .name = "Kernel image", };
 static struct resource code_res = { .name = "Kernel code", };
 static struct resource data_res = { .name = "Kernel data", };
 static struct resource rodata_res = { .name = "Kernel rodata", };
@@ -80,45 +81,54 @@ static int __init add_resource(struct resource *parent,
return 1;
 }
 
-static int __init add_kernel_resources(struct resource *res)
+static int __init add_kernel_resources(void)
 {
int ret = 0;
 
/*
 * The memory region of the kernel image is continuous and
-* was reserved on setup_bootmem, find it here and register
-* it as a resource, then register the various segments of
-* the image as child nodes
+* was reserved on setup_bootmem, register it here as a
+* resource, with the various segments of the image as
+* child nodes.
 */
-   if (!(res->start <= code_res.start && res->end >= data_res.end))
-   return 0;
 
-   res->name = "Kernel image";
-   res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+   code_res.start = __pa_symbol(_text);
+   code_res.end = __pa_symbol(_etext) - 1;
+   code_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
-   /*
-* We removed a part of this region on setup_bootmem so
-* we need to expand the resource for the bss to fit in.
-*/
-   res->end = bss_res.end;
+   rodata_res.start = __pa_symbol(__start_rodata);
+   rodata_res.end = __pa_symbol(__end_rodata) - 1;
+   rodata_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+   data_res.start = __pa_symbol(_data);
+   data_res.end = __pa_symbol(_edata) - 1;
+   data_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+   bss_res.start = __pa_symbol(__bss_start);
+   bss_res.end = __pa_symbol(__bss_stop) - 1;
+   bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+   kimage_res.start = code_res.start;
+   kimage_res.end = bss_res.end;
+   kimage_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
-   ret = add_resource(&iomem_resource, res);
+   ret = add_resource(&iomem_resource, &kimage_res);
if (ret < 0)
return ret;
 
-   ret = add_resource(res, &code_res);
+   ret = add_resource(&kimage_res, &code_res);
if (ret < 0)
return ret;
 
-   ret = add_resource(res, &rodata_res);
+   ret = add_resource(&kimage_res, &rodata_res);
if (ret < 0)
return ret;
 
-   ret = add_resource(res, &data_res);
+   ret = add_resource(&kimage_res, &data_res);
if (ret < 0)
return ret;
 
-   ret = add_resource(res, &bss_res);
+   ret = add_resource(&kimage_res, &bss_res);
 
return ret;
 }
@@ -129,54 +139,42 @@ static void __init init_resources(void)
struct resource *res = NULL;
struct resource *mem_res = NULL;
size_t mem_res_sz = 0;
-   int ret = 0, i = 0;
-
-   code_res.start = __pa_symbol(_text);
-   code_res.end = __pa_symbol(_etext) - 1;
-   code_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-   rodata_res.start = __pa_symbol(__start_rodata);
-   rodata_res.end = __pa_symbol(__end_rodata) - 1;
-   rodata_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-   data_res.start = __pa_symbol(_data);
-   data_res.end = __pa_symbol(_edata) - 1;
-   data_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-   bss_res.start = __pa_symbol(__bss_start);
-   bss_res.end = __pa_symbol(__bss_stop) - 1;
-   bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+   int num_resources = 0, res_idx = 0;
+   int ret = 0;
 
/* + 1 as memblock_alloc() might increase memblock.reserved.cnt */
-   mem_res_sz = (me

[PATCH v4 2/5] RISC-V: Add kexec support

2021-04-18 Thread Nick Kossifidis

This patch adds support for kexec on RISC-V. On SMP systems it depends
on HOTPLUG_CPU in order to be able to bring up all harts after kexec.
It also needs a recent OpenSBI version that supports the HSM extension.
I tested it on riscv64 QEMU on both an smp and a non-smp system.

v6:
 * Re-based on top of "fixes" branch
 * Use PAGE_SIZE and CSR_* macros where possible
 * Small cleanups

v5:
 * For now depend on MMU, further changes needed for NOMMU support
 * Make sure stvec is aligned
 * Cleanup some unneeded fences
 * Verify control code's buffer size
 * Compile kexec_relocate.S with medany and norelax

v4:
 * No functional changes, just re-based

v3:
 * Use the new smp_shutdown_nonboot_cpus() call.
 * Move riscv_kexec_relocate to .rodata

v2:
 * Pass needed parameters as arguments to riscv_kexec_relocate
   instead of using global variables.
 * Use kimage_arch to hold the fdt address of the included fdt.
 * Use SYM_* macros on kexec_relocate.S.
 * Compatibility with STRICT_KERNEL_RWX.
 * Compatibility with HOTPLUG_CPU for SMP
 * Small cleanups

Signed-off-by: Nick Kossifidis 
---
 arch/riscv/Kconfig |  15 +++
 arch/riscv/include/asm/kexec.h |  49 
 arch/riscv/kernel/Makefile |   5 +
 arch/riscv/kernel/kexec_relocate.S | 157 
 arch/riscv/kernel/machine_kexec.c  | 186 +
 5 files changed, 412 insertions(+)
 create mode 100644 arch/riscv/include/asm/kexec.h
 create mode 100644 arch/riscv/kernel/kexec_relocate.S
 create mode 100644 arch/riscv/kernel/machine_kexec.c

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 4515a10c5..10cc19be0 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -386,6 +386,21 @@ config RISCV_SBI_V01
help
  This config allows kernel to use SBI v0.1 APIs. This will be
  deprecated in future once legacy M-mode software are no longer in use.
+
+config KEXEC
+   bool "Kexec system call"
+   select KEXEC_CORE
+   select HOTPLUG_CPU if SMP
+   depends on MMU
+   help
+ kexec is a system call that implements the ability to shutdown your
+ current kernel, and to start another kernel. It is like a reboot
+ but it is independent of the system firmware. And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similarity to the exec system call.
+
+
 endmenu
 
 menu "Boot options"
diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h
new file mode 100644
index 0..86e6e4922
--- /dev/null
+++ b/arch/riscv/include/asm/kexec.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 FORTH-ICS/CARV
+ *  Nick Kossifidis 
+ */
+
+#ifndef _RISCV_KEXEC_H
+#define _RISCV_KEXEC_H
+
+#include /* For PAGE_SIZE */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+
+/* Maximum address we can use for the control code buffer */
+#define KEXEC_CONTROL_MEMORY_LIMIT (-1UL)
+
+/* Reserve a page for the control code buffer */
+#define KEXEC_CONTROL_PAGE_SIZE PAGE_SIZE
+
+#define KEXEC_ARCH KEXEC_ARCH_RISCV
+
+static inline void
+crash_setup_regs(struct pt_regs *newregs,
+struct pt_regs *oldregs)
+{
+   /* Dummy implementation for now */
+}
+
+
+#define ARCH_HAS_KIMAGE_ARCH
+
+struct kimage_arch {
+   unsigned long fdt_addr;
+};
+
+const extern unsigned char riscv_kexec_relocate[];
+const extern unsigned int riscv_kexec_relocate_size;
+
+typedef void (*riscv_kexec_do_relocate)(unsigned long first_ind_entry,
+   unsigned long jump_addr,
+   unsigned long fdt_addr,
+   unsigned long hartid,
+   unsigned long va_pa_off);
+
+#endif
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 647a47f54..211ef87de 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -10,6 +10,10 @@ CFLAGS_REMOVE_sbi.o  = $(CC_FLAGS_FTRACE)
 endif
 CFLAGS_syscall_table.o += $(call cc-option,-Wno-override-init,)
 
+ifdef CONFIG_KEXEC
+AFLAGS_kexec_relocate.o := -mcmodel=medany -mno-relax
+endif
+
 extra-y += head.o
 extra-y += vmlinux.lds
 
@@ -55,6 +59,7 @@ obj-$(CONFIG_SMP) += cpu_ops_sbi.o
 endif
 obj-$(CONFIG_HOTPLUG_CPU)  += cpu-hotplug.o
 obj-$(CONFIG_KGDB) += kgdb.o
+obj-$(CONFIG_KEXEC)+= kexec_relocate.o machine_kexec.o
 
 obj-$(CONFIG_JUMP_LABEL)   += jump_label.o
 
diff --git a/arch/riscv/kernel/kexec_relocate.S 
b/arch/riscv/kernel/kexec_relocate.S
new file mode 100644
index 0..84101d0a2
--- /dev/null
+++ b/arch/riscv/kernel/kexec_relocate.S
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyrigh

[PATCH v4 1/5] RISC-V: Add EM_RISCV to kexec UAPI header

2021-04-18 Thread Nick Kossifidis

Add RISC-V to the list of supported kexec architectures, we need to
add the definition early-on so that later patches can use it.

EM_RISCV is 243 as per ELF psABI specification here:
https://github.com/riscv/riscv-elf-psabi-doc/blob/master/riscv-elf.md

Signed-off-by: Nick Kossifidis 
---
 include/uapi/linux/kexec.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 05669c87a..778dc191c 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -42,6 +42,7 @@
 #define KEXEC_ARCH_MIPS_LE (10 << 16)
 #define KEXEC_ARCH_MIPS( 8 << 16)
 #define KEXEC_ARCH_AARCH64 (183 << 16)
+#define KEXEC_ARCH_RISCV   (243 << 16)
 
 /* The artificial cap on the number of segments passed to kexec_load. */
 #define KEXEC_SEGMENT_MAX 16
-- 
2.26.2

[PATCH v4 0/5] RISC-V: Add kexec/kdump support

2021-04-18 Thread Nick Kossifidis

This patch series adds kexec/kdump and crash kernel
support on RISC-V. For testing the patches a patched
version of kexec-tools is needed (still a work in
progress) which can be found at:

https://riscv.ics.forth.gr/kexec-tools-patched.tar.xz

v4:
 * Rebase on top of "fixes" branch
 * Resolve Alex's comments

v3:
 * Rebase on newer kernel tree
 * Minor cleanups
 * Split UAPI changes to a separate patch
 * Improve / cleanup init_resources
 * Resolve Palmer's comments

v2:
 * Rebase on newer kernel tree
 * Minor cleanups
 * Properly populate the ioresources tre, so that it
   can be used later on for implementing strict /dev/mem
 * Use linux,usable-memory on /memory instead of a new binding
 * USe a reserved-memory node for ELF core header

Nick Kossifidis (5):
  RISC-V: Add EM_RISCV to kexec UAPI header
  RISC-V: Add kexec support
  RISC-V: Improve init_resources
  RISC-V: Add kdump support
  RISC-V: Add crash kernel support

 arch/riscv/Kconfig  |  25 
 arch/riscv/include/asm/elf.h|   6 +
 arch/riscv/include/asm/kexec.h  |  56 +++
 arch/riscv/kernel/Makefile  |   6 +
 arch/riscv/kernel/crash_dump.c  |  46 ++
 arch/riscv/kernel/crash_save_regs.S |  56 +++
 arch/riscv/kernel/kexec_relocate.S  | 223 
 arch/riscv/kernel/machine_kexec.c   | 193 
 arch/riscv/kernel/setup.c   | 114 --
 arch/riscv/mm/init.c| 104 +
 include/uapi/linux/kexec.h  |   1 +
 11 files changed, 784 insertions(+), 46 deletions(-)
 create mode 100644 arch/riscv/include/asm/kexec.h
 create mode 100644 arch/riscv/kernel/crash_dump.c
 create mode 100644 arch/riscv/kernel/crash_save_regs.S
 create mode 100644 arch/riscv/kernel/kexec_relocate.S
 create mode 100644 arch/riscv/kernel/machine_kexec.c

-- 
2.26.2

[PATCH v4 5/5] RISC-V: Add crash kernel support

2021-04-18 Thread Nick Kossifidis

This patch allows Linux to act as a crash kernel for use with
kdump. Userspace will let the crash kernel know about the
memory region it can use through linux,usable-memory property
on the /memory node (overriding its reg property), and about the
memory region where the elf core header of the previous kernel
is saved, through a reserved-memory node with a compatible string
of "linux,elfcorehdr". This approach is the least invasive and
re-uses functionality already present.

I tested this on riscv64 qemu and it works as expected, you
may test it by retrieving the dmesg of the previous kernel
through /proc/vmcore, using the vmcore-dmesg utility from
kexec-tools.

v4:
 * Rebase on top of "fixes" branch

v3:
 * Rebase

v2:
 * Use linux,usable-memory on /memory instead of a new binding
 * Use a reserved-memory node for ELF core header

Signed-off-by: Nick Kossifidis 
---
 arch/riscv/Kconfig | 10 
 arch/riscv/kernel/Makefile |  1 +
 arch/riscv/kernel/crash_dump.c | 46 ++
 arch/riscv/kernel/setup.c  | 12 +
 arch/riscv/mm/init.c   | 33 
 5 files changed, 102 insertions(+)
 create mode 100644 arch/riscv/kernel/crash_dump.c

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 10cc19be0..39aa18ef4 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -400,6 +400,16 @@ config KEXEC
 
  The name comes from the similarity to the exec system call.
 
+config CRASH_DUMP
+   bool "Build kdump crash kernel"
+   help
+ Generate crash dump after being started by kexec. This should
+ be normally only set in special crash dump kernels which are
+ loaded in the main kernel with kexec-tools into a specially
+ reserved region and then later executed after a crash by
+ kdump/kexec.
+
+ For more details see Documentation/admin-guide/kdump/kdump.rst
 
 endmenu
 
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 41a1469b2..d3081e4d9 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -60,6 +60,7 @@ endif
 obj-$(CONFIG_HOTPLUG_CPU)  += cpu-hotplug.o
 obj-$(CONFIG_KGDB) += kgdb.o
 obj-$(CONFIG_KEXEC)+= kexec_relocate.o crash_save_regs.o 
machine_kexec.o
+obj-$(CONFIG_CRASH_DUMP)   += crash_dump.o
 
 obj-$(CONFIG_JUMP_LABEL)   += jump_label.o
 
diff --git a/arch/riscv/kernel/crash_dump.c b/arch/riscv/kernel/crash_dump.c
new file mode 100644
index 0..86cc0ada5
--- /dev/null
+++ b/arch/riscv/kernel/crash_dump.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This code comes from arch/arm64/kernel/crash_dump.c
+ * Created by: AKASHI Takahiro 
+ * Copyright (C) 2017 Linaro Limited
+ */
+
+#include 
+#include 
+
+/**
+ * copy_oldmem_page() - copy one page from old kernel memory
+ * @pfn: page frame number to be copied
+ * @buf: buffer where the copied page is placed
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page
+ * @userbuf: if set, @buf is in a user address space
+ *
+ * This function copies one page from old kernel memory into buffer pointed by
+ * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes
+ * copied or negative error in case of failure.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+size_t csize, unsigned long offset,
+int userbuf)
+{
+   void *vaddr;
+
+   if (!csize)
+   return 0;
+
+   vaddr = memremap(__pfn_to_phys(pfn), PAGE_SIZE, MEMREMAP_WB);
+   if (!vaddr)
+   return -ENOMEM;
+
+   if (userbuf) {
+   if (copy_to_user((char __user *)buf, vaddr + offset, csize)) {
+   memunmap(vaddr);
+   return -EFAULT;
+   }
+   } else
+   memcpy(buf, vaddr + offset, csize);
+
+   memunmap(vaddr);
+   return csize;
+}
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index 31866dce9..ff398a3d8 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -66,6 +66,9 @@ static struct resource code_res = { .name = "Kernel code", };
 static struct resource data_res = { .name = "Kernel data", };
 static struct resource rodata_res = { .name = "Kernel rodata", };
 static struct resource bss_res = { .name = "Kernel bss", };
+#ifdef CONFIG_CRASH_DUMP
+static struct resource elfcorehdr_res = { .name = "ELF Core hdr", };
+#endif
 
 static int __init add_resource(struct resource *parent,
struct resource *res)
@@ -169,6 +172,15 @@ static void __init init_resources(void)
}
 #endif
 
+#ifdef CONFIG_CRASH_DUMP
+   if (elfcorehdr_size > 0) {
+   elfcorehdr_res.start = elfcorehdr_addr;
+   elfcorehdr_res.end = elfcorehdr_addr + elfcorehdr_size

Re: [PATCH v3 4/5] RISC-V: Add kdump support

2021-04-09 Thread Nick Kossifidis


Στις 2021-04-06 21:36, Alex Ghiti έγραψε:



+   /* Switch to physical addressing */
+   la  s4, 1f
+   sub s4, s4, s3
+   csrwstvec, s4
+   csrwsptbr, zero


satp is used everywhere instead of sptbr. And maybe you could CSR_
naming, like you did in riscv_crash_save_regs and like it's done in
head.S too.



ACK


+   crash_base = memblock_find_in_range(search_start, search_end,
+#ifdef CONFIG_64BIT
+   crash_size, SZ_2M);
+#else
+   crash_size, SZ_4M);
+#endif


You can use PMD_SIZE here and get rid of #ifdef.


+
+#ifdef CONFIG_64BIT
+   if (!IS_ALIGNED(crash_base, SZ_2M)) {
+#else
+   if (!IS_ALIGNED(crash_base, SZ_4M)) {
+#endif


Ditto here.



Will do.

Thanks a lot for your review !

Re: [PATCH v3 2/5] RISC-V: Add kexec support

2021-04-09 Thread Nick Kossifidis


Στις 2021-04-06 21:38, Alex Ghiti έγραψε:

Le 4/5/21 à 4:57 AM, Nick Kossifidis a écrit :

+
+/* Reserve a page for the control code buffer */
+#define KEXEC_CONTROL_PAGE_SIZE 4096


PAGE_SIZE instead ?



Yup, I'll change it.


+obj-${CONFIG_KEXEC}+= kexec_relocate.o machine_kexec.o


Other obj-$() use parenthesis.



ACK


+   li  s5, ((1 << PAGE_SHIFT) / RISCV_SZPTR)


1 << PAGE_SHIFT = PAGE_SIZE



ACK


+#if defined(CONFIG_HOTPLUG_CPU) && (CONFIG_SMP)


Shouldn't it be defined(CONFIG_SMP) ?



It depends on SMP anyway, I'll remove the second part.

Re: [PATCH v3 3/5] RISC-V: Improve init_resources

2021-04-09 Thread Nick Kossifidis


Στις 2021-04-06 11:22, Geert Uytterhoeven έγραψε:

Hi Nick,

On Tue, Apr 6, 2021 at 10:11 AM Nick Kossifidis  
wrote:

Hello Geert,
Στις 2021-04-06 10:19, Geert Uytterhoeven έγραψε:
> On Mon, Apr 5, 2021 at 10:57 AM Nick Kossifidis 
> wrote:
>> * Kernel region is always present and we know where it is, no
>> need to look for it inside the loop, just ignore it like the
>> rest of the reserved regions within system's memory.
>>
>> * Don't call memblock_free inside the loop, if called it'll split
>> the region of pre-allocated resources in two parts, messing things
>> up, just re-use the previous pre-allocated resource and free any
>> unused resources after both loops finish.
>>
>> * memblock_alloc may add a region when called, so increase the
>> number of pre-allocated regions by one to be on the safe side
>> (reported and patched by Geert Uytterhoeven)
>>
>> Signed-off-by: Geert Uytterhoeven 
>
> Where does this SoB come from?
>
>> Signed-off-by: Nick Kossifidis 
>
>> --- a/arch/riscv/kernel/setup.c
>> +++ b/arch/riscv/kernel/setup.c
>
>> @@ -129,53 +139,42 @@ static void __init init_resources(void)
>> struct resource *res = NULL;
>> struct resource *mem_res = NULL;
>> size_t mem_res_sz = 0;
>> -   int ret = 0, i = 0;
>> -
>> -   code_res.start = __pa_symbol(_text);
>> -   code_res.end = __pa_symbol(_etext) - 1;
>> -   code_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
>> -
>> -   rodata_res.start = __pa_symbol(__start_rodata);
>> -   rodata_res.end = __pa_symbol(__end_rodata) - 1;
>> -   rodata_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
>> -
>> -   data_res.start = __pa_symbol(_data);
>> -   data_res.end = __pa_symbol(_edata) - 1;
>> -   data_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
>> +   int num_resources = 0, res_idx = 0;
>> +   int ret = 0;
>>
>> -   bss_res.start = __pa_symbol(__bss_start);
>> -   bss_res.end = __pa_symbol(__bss_stop) - 1;
>> -   bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
>> +   /* + 1 as memblock_alloc() might increase
>> memblock.reserved.cnt */
>> +   num_resources = memblock.memory.cnt + memblock.reserved.cnt +
>> 1;
>> +   res_idx = num_resources - 1;
>>
>> -   mem_res_sz = (memblock.memory.cnt + memblock.reserved.cnt) *
>> sizeof(*mem_res);
>
> Oh, you incorporated my commit ce989f1472ae350e ("RISC-V: Fix
> out-of-bounds
> accesses in init_resources()") (from v5.12-rc4) into your patch.
> Why? This means your patch does not apply against upstream.
>

Sorry if this looks awkward, I'm under the impression that new 
features

go on for-next instead of fixes and your patch hasn't been merged on
for-next yet. I thought it would be cleaner to have one patch to merge
for init_resources instead of two, and simpler for people to test the
series. I can rebase this on top of fixes if that works better for you
or Palmer.


Ideally the fixes branch is part of the next branch.  That also helps
to avoid other people having to fix conflicts when merging both.



OK I'll re-base this on top of fixes instead.

Re: [PATCH v3 0/5] RISC-V: Add kexec/kdump support

2021-04-09 Thread Nick Kossifidis


Στις 2021-04-07 19:29, Rob Herring έγραψε:

On Mon, Apr 05, 2021 at 11:57:07AM +0300, Nick Kossifidis wrote:

This patch series adds kexec/kdump and crash kernel
support on RISC-V. For testing the patches a patched
version of kexec-tools is needed (still a work in
progress) which can be found at:

https://riscv.ics.forth.gr/kexec-tools-patched.tar.xz

v3:
 * Rebase on newer kernel tree
 * Minor cleanups
 * Split UAPI changes to a separate patch
 * Improve / cleanup init_resources
 * Resolve Palmer's comments

v2:
 * Rebase on newer kernel tree
 * Minor cleanups
 * Properly populate the ioresources tre, so that it
   can be used later on for implementing strict /dev/mem
 * Use linux,usable-memory on /memory instead of a new binding


Where? In any case, that's not going to work well with EFI support
assuming like arm64, 'memory' is passed in UEFI structures instead of
DT. That's why there's now a /chosen linux,usable-memory-ranges
property.



Here:
https://elixir.bootlin.com/linux/v5.12-rc5/source/drivers/of/fdt.c#L1001

The "linux,usable-memory" binding is already defined and is part of
early_init_dt_scan_memory() which we call on mm/init.c to determine
system's memory layout. It's simple, clean and I don't see a reason
to use another binding on /chosen and add extra code for this, when
we already handle it on early_init_dt_scan_memory() anyway. As for
EFI, even when enabled, we still use DT to determine system memory
layout, not EFI structures, plus I don't see how EFI is relevant
here, the bootloader in kexec's case is Linux, not EFI. BTW the /memory
node is mandatory in any case, it should exist on DT regardless of EFI,
/chosen node on the other hand is -in general- optional, and we can 
still

boot a riscv system without /chosen node present (we only require it for
the built-in cmdline to work).

Also a simple grep for "linux,usable-memory-ranges" on the latest kernel
sources didn't return anything, there is also nothing on chosen.txt, 
where

is that binding documented/implemented ?


Isn't the preferred kexec interface the file based interface? I'd
expect a new arch to only support that. And there's common kexec DT
handling for that pending for 5.13.



Both approaches have their pros an cons, that's why both are available, 
in no
way CONFIG_KEXEC is deprecated in favor of CONFIG_KEXEC_FILE, at least 
not as
far as I know. The main point for the file-based syscall is to support 
secure
boot, since the image is loaded by the kernel directly without any 
processing
by the userspace tools, so it can be pre-signed by the kernel's 
"vendor". On
the other hand, the kernel part is more complicated and you can't pass a 
new

device tree, the kernel needs to re-use the existing one (or modify it
in-kernel), you can only override the cmdline.

This doesn't work for our use cases in FORTH, where we use kexec not 
only to
re-boot our systems, but also to boot to a system with different hw 
layout
(e.g. FPGA prototypes or systems with FPGAs on the side), device tree 
overlays
also don't cover our use cases. To give you an idea we can 
add/remove/modify
devices, move them to another region etc and still use kexec to avoid 
going
through the full boot cycle. We just unload their drivers, perform a 
full or
partial re-programming of the FPGA from within Linux, and kexec to the 
new

system with the new device tree. The file-based syscall can't cover this
scenario, in general it's less flexible and it's only there for secure 
boot,

not for using custom-built kernels, nor custom device tree images.

Security-wise the file load syscall provides guarantees for integrity 
and

authenticity, but depending on the kernel "vendor"'s infrastructure and
signing process this may allow e.g. to load an older/vulnerable kernel 
through
kexec and get away with it, there is no check as far as I know  to make 
sure
the loaded kernel is at least as old as the running kernel, the 
assumption is
that the "vendor" will use a different signing key/cert for each kernel 
and
that you'll kexec to a kernel/crash kernel that's the same version as 
the
running one. Until we have clear guidelines on how this is meant to be 
used

and have a discussion on secure boot within RISC-V (we have something on
the TEE TG but we'll probably switch to a SIG committee for this), I 
don't
see how this feature is a priority compared to the more generic 
CONFIG_KEXEC.


Regards,
Nick

Re: [PATCH v3 3/5] RISC-V: Improve init_resources

2021-04-06 Thread Nick Kossifidis


Hello Geert,

Στις 2021-04-06 10:19, Geert Uytterhoeven έγραψε:

Hi Nick,

Thanks for your patch!

On Mon, Apr 5, 2021 at 10:57 AM Nick Kossifidis  
wrote:

* Kernel region is always present and we know where it is, no
need to look for it inside the loop, just ignore it like the
rest of the reserved regions within system's memory.

* Don't call memblock_free inside the loop, if called it'll split
the region of pre-allocated resources in two parts, messing things
up, just re-use the previous pre-allocated resource and free any
unused resources after both loops finish.

* memblock_alloc may add a region when called, so increase the
number of pre-allocated regions by one to be on the safe side
(reported and patched by Geert Uytterhoeven)

Signed-off-by: Geert Uytterhoeven 


Where does this SoB come from?


Signed-off-by: Nick Kossifidis 



--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c



@@ -129,53 +139,42 @@ static void __init init_resources(void)
struct resource *res = NULL;
struct resource *mem_res = NULL;
size_t mem_res_sz = 0;
-   int ret = 0, i = 0;
-
-   code_res.start = __pa_symbol(_text);
-   code_res.end = __pa_symbol(_etext) - 1;
-   code_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-   rodata_res.start = __pa_symbol(__start_rodata);
-   rodata_res.end = __pa_symbol(__end_rodata) - 1;
-   rodata_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-   data_res.start = __pa_symbol(_data);
-   data_res.end = __pa_symbol(_edata) - 1;
-   data_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+   int num_resources = 0, res_idx = 0;
+   int ret = 0;

-   bss_res.start = __pa_symbol(__bss_start);
-   bss_res.end = __pa_symbol(__bss_stop) - 1;
-   bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+   /* + 1 as memblock_alloc() might increase 
memblock.reserved.cnt */
+   num_resources = memblock.memory.cnt + memblock.reserved.cnt + 
1;

+   res_idx = num_resources - 1;

-   mem_res_sz = (memblock.memory.cnt + memblock.reserved.cnt) * 
sizeof(*mem_res);


Oh, you incorporated my commit ce989f1472ae350e ("RISC-V: Fix 
out-of-bounds

accesses in init_resources()") (from v5.12-rc4) into your patch.
Why? This means your patch does not apply against upstream.



Sorry if this looks awkward, I'm under the impression that new features 
go on for-next instead of fixes and your patch hasn't been merged on 
for-next yet. I thought it would be cleaner to have one patch to merge 
for init_resources instead of two, and simpler for people to test the 
series. I can rebase this on top of fixes if that works better for you 
or Palmer.


Regards,
Nick

[PATCH v3 1/5] RISC-V: Add EM_RISCV to kexec UAPI header

2021-04-05 Thread Nick Kossifidis

Add RISC-V to the list of supported kexec architecturs, we need to
add the definition early-on so that later patches can use it.

EM_RISCV is 243 as per ELF psABI specification here:
https://github.com/riscv/riscv-elf-psabi-doc/blob/master/riscv-elf.md

Signed-off-by: Nick Kossifidis 
---
 include/uapi/linux/kexec.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 05669c87a..778dc191c 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -42,6 +42,7 @@
 #define KEXEC_ARCH_MIPS_LE (10 << 16)
 #define KEXEC_ARCH_MIPS( 8 << 16)
 #define KEXEC_ARCH_AARCH64 (183 << 16)
+#define KEXEC_ARCH_RISCV   (243 << 16)
 
 /* The artificial cap on the number of segments passed to kexec_load. */
 #define KEXEC_SEGMENT_MAX 16
-- 
2.26.2

[PATCH v3 3/5] RISC-V: Improve init_resources

2021-04-05 Thread Nick Kossifidis

* Kernel region is always present and we know where it is, no
need to look for it inside the loop, just ignore it like the
rest of the reserved regions within system's memory.

* Don't call memblock_free inside the loop, if called it'll split
the region of pre-allocated resources in two parts, messing things
up, just re-use the previous pre-allocated resource and free any
unused resources after both loops finish.

* memblock_alloc may add a region when called, so increase the
number of pre-allocated regions by one to be on the safe side
(reported and patched by Geert Uytterhoeven)

Signed-off-by: Geert Uytterhoeven 
Signed-off-by: Nick Kossifidis 
---
 arch/riscv/kernel/setup.c | 90 ---
 1 file changed, 46 insertions(+), 44 deletions(-)

diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index e85bacff1..030554bab 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -60,6 +60,7 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
  * also add "System RAM" regions for compatibility with other
  * archs, and the rest of the known regions for completeness.
  */
+static struct resource kimage_res = { .name = "Kernel image", };
 static struct resource code_res = { .name = "Kernel code", };
 static struct resource data_res = { .name = "Kernel data", };
 static struct resource rodata_res = { .name = "Kernel rodata", };
@@ -80,45 +81,54 @@ static int __init add_resource(struct resource *parent,
return 1;
 }
 
-static int __init add_kernel_resources(struct resource *res)
+static int __init add_kernel_resources(void)
 {
int ret = 0;
 
/*
 * The memory region of the kernel image is continuous and
-* was reserved on setup_bootmem, find it here and register
-* it as a resource, then register the various segments of
-* the image as child nodes
+* was reserved on setup_bootmem, register it here as a
+* resource, with the various segments of the image as
+* child nodes.
 */
-   if (!(res->start <= code_res.start && res->end >= data_res.end))
-   return 0;
 
-   res->name = "Kernel image";
-   res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+   code_res.start = __pa_symbol(_text);
+   code_res.end = __pa_symbol(_etext) - 1;
+   code_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
-   /*
-* We removed a part of this region on setup_bootmem so
-* we need to expand the resource for the bss to fit in.
-*/
-   res->end = bss_res.end;
+   rodata_res.start = __pa_symbol(__start_rodata);
+   rodata_res.end = __pa_symbol(__end_rodata) - 1;
+   rodata_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
-   ret = add_resource(&iomem_resource, res);
+   data_res.start = __pa_symbol(_data);
+   data_res.end = __pa_symbol(_edata) - 1;
+   data_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+   bss_res.start = __pa_symbol(__bss_start);
+   bss_res.end = __pa_symbol(__bss_stop) - 1;
+   bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+   kimage_res.start = code_res.start;
+   kimage_res.end = bss_res.end;
+   kimage_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+   ret = add_resource(&iomem_resource, &kimage_res);
if (ret < 0)
return ret;
 
-   ret = add_resource(res, &code_res);
+   ret = add_resource(&kimage_res, &code_res);
if (ret < 0)
return ret;
 
-   ret = add_resource(res, &rodata_res);
+   ret = add_resource(&kimage_res, &rodata_res);
if (ret < 0)
return ret;
 
-   ret = add_resource(res, &data_res);
+   ret = add_resource(&kimage_res, &data_res);
if (ret < 0)
return ret;
 
-   ret = add_resource(res, &bss_res);
+   ret = add_resource(&kimage_res, &bss_res);
 
return ret;
 }
@@ -129,53 +139,42 @@ static void __init init_resources(void)
struct resource *res = NULL;
struct resource *mem_res = NULL;
size_t mem_res_sz = 0;
-   int ret = 0, i = 0;
-
-   code_res.start = __pa_symbol(_text);
-   code_res.end = __pa_symbol(_etext) - 1;
-   code_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-   rodata_res.start = __pa_symbol(__start_rodata);
-   rodata_res.end = __pa_symbol(__end_rodata) - 1;
-   rodata_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-   data_res.start = __pa_symbol(_data);
-   data_res.end = __pa_symbol(_edata) - 1;
-   data_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+   int num_resources = 0, res_idx = 0;
+   int ret = 0;
 
-   bss_res.start = __pa_symbol(__bss_start);
-   bss

[PATCH v3 5/5] RISC-V: Add crash kernel support

2021-04-05 Thread Nick Kossifidis

From: Nick Kossifidis 

This patch allows Linux to act as a crash kernel for use with
kdump. Userspace will let the crash kernel know about the
memory region it can use through linux,usable-memory property
on the /memory node (overriding its reg property), and about the
memory region where the elf core header of the previous kernel
is saved, through a reserved-memory node with a compatible string
of "linux,elfcorehdr". This approach is the least invasive and
re-uses functionality already present.

I tested this on riscv64 qemu and it works as expected, you
may test it by retrieving the dmesg of the previous kernel
through /proc/vmcore, using the vmcore-dmesg utility from
kexec-tools.

v3:
 * Rebase

v2:
 * Use linux,usable-memory on /memory instead of a new binding
 * Use a reserved-memory node for ELF core header

Signed-off-by: Nick Kossifidis 
---
 arch/riscv/Kconfig | 10 
 arch/riscv/kernel/Makefile |  1 +
 arch/riscv/kernel/crash_dump.c | 46 ++
 arch/riscv/kernel/setup.c  | 12 +
 arch/riscv/mm/init.c   | 33 
 5 files changed, 102 insertions(+)
 create mode 100644 arch/riscv/kernel/crash_dump.c

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 3716262ef..553c2dced 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -403,6 +403,16 @@ config KEXEC
 
  The name comes from the similarity to the exec system call.
 
+config CRASH_DUMP
+   bool "Build kdump crash kernel"
+   help
+ Generate crash dump after being started by kexec. This should
+ be normally only set in special crash dump kernels which are
+ loaded in the main kernel with kexec-tools into a specially
+ reserved region and then later executed after a crash by
+ kdump/kexec.
+
+ For more details see Documentation/admin-guide/kdump/kdump.rst
 
 endmenu
 
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 07f676ad3..bd66d2ce0 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -59,6 +59,7 @@ endif
 obj-$(CONFIG_HOTPLUG_CPU)  += cpu-hotplug.o
 obj-$(CONFIG_KGDB) += kgdb.o
 obj-${CONFIG_KEXEC}+= kexec_relocate.o crash_save_regs.o 
machine_kexec.o
+obj-$(CONFIG_CRASH_DUMP)   += crash_dump.o
 
 obj-$(CONFIG_JUMP_LABEL)   += jump_label.o
 
diff --git a/arch/riscv/kernel/crash_dump.c b/arch/riscv/kernel/crash_dump.c
new file mode 100644
index 0..86cc0ada5
--- /dev/null
+++ b/arch/riscv/kernel/crash_dump.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This code comes from arch/arm64/kernel/crash_dump.c
+ * Created by: AKASHI Takahiro 
+ * Copyright (C) 2017 Linaro Limited
+ */
+
+#include 
+#include 
+
+/**
+ * copy_oldmem_page() - copy one page from old kernel memory
+ * @pfn: page frame number to be copied
+ * @buf: buffer where the copied page is placed
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page
+ * @userbuf: if set, @buf is in a user address space
+ *
+ * This function copies one page from old kernel memory into buffer pointed by
+ * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes
+ * copied or negative error in case of failure.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+size_t csize, unsigned long offset,
+int userbuf)
+{
+   void *vaddr;
+
+   if (!csize)
+   return 0;
+
+   vaddr = memremap(__pfn_to_phys(pfn), PAGE_SIZE, MEMREMAP_WB);
+   if (!vaddr)
+   return -ENOMEM;
+
+   if (userbuf) {
+   if (copy_to_user((char __user *)buf, vaddr + offset, csize)) {
+   memunmap(vaddr);
+   return -EFAULT;
+   }
+   } else
+   memcpy(buf, vaddr + offset, csize);
+
+   memunmap(vaddr);
+   return csize;
+}
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index 31866dce9..ff398a3d8 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -66,6 +66,9 @@ static struct resource code_res = { .name = "Kernel code", };
 static struct resource data_res = { .name = "Kernel data", };
 static struct resource rodata_res = { .name = "Kernel rodata", };
 static struct resource bss_res = { .name = "Kernel bss", };
+#ifdef CONFIG_CRASH_DUMP
+static struct resource elfcorehdr_res = { .name = "ELF Core hdr", };
+#endif
 
 static int __init add_resource(struct resource *parent,
struct resource *res)
@@ -169,6 +172,15 @@ static void __init init_resources(void)
}
 #endif
 
+#ifdef CONFIG_CRASH_DUMP
+   if (elfcorehdr_size > 0) {
+   elfcorehdr_res.start = elfcorehdr_addr;
+   elfcorehdr_res.end = elfcorehdr_addr + elfcorehdr_size - 1;
+

[PATCH v3 2/5] RISC-V: Add kexec support

2021-04-05 Thread Nick Kossifidis

This patch adds support for kexec on RISC-V. On SMP systems it depends
on HOTPLUG_CPU in order to be able to bring up all harts after kexec.
It also needs a recent OpenSBI version that supports the HSM extension.
I tested it on riscv64 QEMU on both an smp and a non-smp system.

v5:
 * For now depend on MMU, further changes needed for NOMMU support
 * Make sure stvec is aligned
 * Cleanup some unneeded fences
 * Verify control code's buffer size
 * Compile kexec_relocate.S with medany and norelax

v4:
 * No functional changes, just re-based

v3:
 * Use the new smp_shutdown_nonboot_cpus() call.
 * Move riscv_kexec_relocate to .rodata

v2:
 * Pass needed parameters as arguments to riscv_kexec_relocate
   instead of using global variables.
 * Use kimage_arch to hold the fdt address of the included fdt.
 * Use SYM_* macros on kexec_relocate.S.
 * Compatibility with STRICT_KERNEL_RWX.
 * Compatibility with HOTPLUG_CPU for SMP
 * Small cleanups

Signed-off-by: Nick Kossifidis 
---
 arch/riscv/Kconfig |  15 +++
 arch/riscv/include/asm/kexec.h |  47 
 arch/riscv/kernel/Makefile |   5 +
 arch/riscv/kernel/kexec_relocate.S | 156 
 arch/riscv/kernel/machine_kexec.c  | 186 +
 5 files changed, 409 insertions(+)
 create mode 100644 arch/riscv/include/asm/kexec.h
 create mode 100644 arch/riscv/kernel/kexec_relocate.S
 create mode 100644 arch/riscv/kernel/machine_kexec.c

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 8ea60a0a1..3716262ef 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -389,6 +389,21 @@ config RISCV_SBI_V01
help
  This config allows kernel to use SBI v0.1 APIs. This will be
  deprecated in future once legacy M-mode software are no longer in use.
+
+config KEXEC
+   bool "Kexec system call"
+   select KEXEC_CORE
+   select HOTPLUG_CPU if SMP
+   depends on MMU
+   help
+ kexec is a system call that implements the ability to shutdown your
+ current kernel, and to start another kernel. It is like a reboot
+ but it is independent of the system firmware. And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similarity to the exec system call.
+
+
 endmenu
 
 menu "Boot options"
diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h
new file mode 100644
index 0..efc69feb4
--- /dev/null
+++ b/arch/riscv/include/asm/kexec.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 FORTH-ICS/CARV
+ *  Nick Kossifidis 
+ */
+
+#ifndef _RISCV_KEXEC_H
+#define _RISCV_KEXEC_H
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+
+/* Maximum address we can use for the control code buffer */
+#define KEXEC_CONTROL_MEMORY_LIMIT (-1UL)
+
+/* Reserve a page for the control code buffer */
+#define KEXEC_CONTROL_PAGE_SIZE 4096
+
+#define KEXEC_ARCH KEXEC_ARCH_RISCV
+
+static inline void
+crash_setup_regs(struct pt_regs *newregs,
+struct pt_regs *oldregs)
+{
+   /* Dummy implementation for now */
+}
+
+
+#define ARCH_HAS_KIMAGE_ARCH
+
+struct kimage_arch {
+   unsigned long fdt_addr;
+};
+
+const extern unsigned char riscv_kexec_relocate[];
+const extern unsigned int riscv_kexec_relocate_size;
+
+typedef void (*riscv_kexec_do_relocate)(unsigned long first_ind_entry,
+   unsigned long jump_addr,
+   unsigned long fdt_addr,
+   unsigned long hartid,
+   unsigned long va_pa_off);
+
+#endif
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 3dc0abde9..c2594018c 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -9,6 +9,10 @@ CFLAGS_REMOVE_patch.o  = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_sbi.o= $(CC_FLAGS_FTRACE)
 endif
 
+ifdef CONFIG_KEXEC
+AFLAGS_kexec_relocate.o := -mcmodel=medany -mno-relax
+endif
+
 extra-y += head.o
 extra-y += vmlinux.lds
 
@@ -54,6 +58,7 @@ obj-$(CONFIG_SMP) += cpu_ops_sbi.o
 endif
 obj-$(CONFIG_HOTPLUG_CPU)  += cpu-hotplug.o
 obj-$(CONFIG_KGDB) += kgdb.o
+obj-${CONFIG_KEXEC}+= kexec_relocate.o machine_kexec.o
 
 obj-$(CONFIG_JUMP_LABEL)   += jump_label.o
 
diff --git a/arch/riscv/kernel/kexec_relocate.S 
b/arch/riscv/kernel/kexec_relocate.S
new file mode 100644
index 0..616c20771
--- /dev/null
+++ b/arch/riscv/kernel/kexec_relocate.S
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 FORTH-ICS/CARV
+ *  Nick Kossifidis 
+ */
+
+#include/* For RISCV_* and REG_* macros */
+#include   /* For PAGE_SHIFT */
+#include  /* For SYM_* macro

[PATCH v3 4/5] RISC-V: Add kdump support

2021-04-05 Thread Nick Kossifidis

This patch adds support for kdump, the kernel will reserve a
region for the crash kernel and jump there on panic. In order
for userspace tools (kexec-tools) to prepare the crash kernel
kexec image, we also need to expose some information on
/proc/iomem for the memory regions used by the kernel and for
the region reserved for crash kernel. Note that on userspace
the device tree is used to determine the system's memory
layout so the "System RAM" on /proc/iomem is ignored.

I tested this on riscv64 qemu and works as expected, you may
test it by triggering a crash through /proc/sysrq_trigger:

echo c > /proc/sysrq_trigger

v3:
 * Move ELF_CORE_COPY_REGS to asm/elf.h instead of uapi/asm/elf.h
 * Set stvec when disabling MMU
 * Minor cleanups and re-base

v2:
 * Properly populate the ioresources tree, so that it can be
   used later on for implementing strict /dev/mem.
 * Minor cleanups and re-base

Signed-off-by: Nick Kossifidis 
---
 arch/riscv/include/asm/elf.h|  6 +++
 arch/riscv/include/asm/kexec.h  | 19 ---
 arch/riscv/kernel/Makefile  |  2 +-
 arch/riscv/kernel/crash_save_regs.S | 56 +
 arch/riscv/kernel/kexec_relocate.S  | 68 -
 arch/riscv/kernel/machine_kexec.c   | 43 +---
 arch/riscv/kernel/setup.c   | 11 -
 arch/riscv/mm/init.c| 77 +
 8 files changed, 255 insertions(+), 27 deletions(-)
 create mode 100644 arch/riscv/kernel/crash_save_regs.S

diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h
index 5c725e1df..f4b490cd0 100644
--- a/arch/riscv/include/asm/elf.h
+++ b/arch/riscv/include/asm/elf.h
@@ -81,4 +81,10 @@ extern int arch_setup_additional_pages(struct linux_binprm 
*bprm,
int uses_interp);
 #endif /* CONFIG_MMU */
 
+#define ELF_CORE_COPY_REGS(dest, regs) \
+do {   \
+   *(struct user_regs_struct *)&(dest) =   \
+   *(struct user_regs_struct *)regs;   \
+} while (0);
+
 #endif /* _ASM_RISCV_ELF_H */
diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h
index efc69feb4..4fd583acc 100644
--- a/arch/riscv/include/asm/kexec.h
+++ b/arch/riscv/include/asm/kexec.h
@@ -21,11 +21,16 @@
 
 #define KEXEC_ARCH KEXEC_ARCH_RISCV
 
+extern void riscv_crash_save_regs(struct pt_regs *newregs);
+
 static inline void
 crash_setup_regs(struct pt_regs *newregs,
 struct pt_regs *oldregs)
 {
-   /* Dummy implementation for now */
+   if (oldregs)
+   memcpy(newregs, oldregs, sizeof(struct pt_regs));
+   else
+   riscv_crash_save_regs(newregs);
 }
 
 
@@ -38,10 +43,12 @@ struct kimage_arch {
 const extern unsigned char riscv_kexec_relocate[];
 const extern unsigned int riscv_kexec_relocate_size;
 
-typedef void (*riscv_kexec_do_relocate)(unsigned long first_ind_entry,
-   unsigned long jump_addr,
-   unsigned long fdt_addr,
-   unsigned long hartid,
-   unsigned long va_pa_off);
+typedef void (*riscv_kexec_method)(unsigned long first_ind_entry,
+  unsigned long jump_addr,
+  unsigned long fdt_addr,
+  unsigned long hartid,
+  unsigned long va_pa_off);
+
+extern riscv_kexec_method riscv_kexec_norelocate;
 
 #endif
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index c2594018c..07f676ad3 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -58,7 +58,7 @@ obj-$(CONFIG_SMP) += cpu_ops_sbi.o
 endif
 obj-$(CONFIG_HOTPLUG_CPU)  += cpu-hotplug.o
 obj-$(CONFIG_KGDB) += kgdb.o
-obj-${CONFIG_KEXEC}+= kexec_relocate.o machine_kexec.o
+obj-${CONFIG_KEXEC}+= kexec_relocate.o crash_save_regs.o 
machine_kexec.o
 
 obj-$(CONFIG_JUMP_LABEL)   += jump_label.o
 
diff --git a/arch/riscv/kernel/crash_save_regs.S 
b/arch/riscv/kernel/crash_save_regs.S
new file mode 100644
index 0..7832fb763
--- /dev/null
+++ b/arch/riscv/kernel/crash_save_regs.S
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 FORTH-ICS/CARV
+ *  Nick Kossifidis 
+ */
+
+#include/* For RISCV_* and REG_* macros */
+#include/* For CSR_* macros */
+#include/* For offsets on pt_regs */
+#include  /* For SYM_* macros */
+
+.section ".text"
+SYM_CODE_START(riscv_crash_save_regs)
+   REG_S ra,  PT_RA(a0)/* x1 */
+   REG_S sp,  PT_SP(a0)/* x2 */
+   REG_S gp,  PT_GP(a0)/* x3 */
+   REG_S tp,  PT_TP(a0)/* x4 */
+   REG_S t0,  PT_T0(a0)/* x5 */
+   REG_S t1,  PT_T1(a0)/* x6 */
+   REG_S t2,  PT_T2(a0)/* x7 */
+   REG_S s0,  PT_S0(a0)

[PATCH v3 0/5] RISC-V: Add kexec/kdump support

2021-04-05 Thread Nick Kossifidis

This patch series adds kexec/kdump and crash kernel
support on RISC-V. For testing the patches a patched
version of kexec-tools is needed (still a work in
progress) which can be found at:

https://riscv.ics.forth.gr/kexec-tools-patched.tar.xz

v3:
 * Rebase on newer kernel tree
 * Minor cleanups
 * Split UAPI changes to a separate patch
 * Improve / cleanup init_resources
 * Resolve Palmer's comments

v2:
 * Rebase on newer kernel tree
 * Minor cleanups
 * Properly populate the ioresources tre, so that it
   can be used later on for implementing strict /dev/mem
 * Use linux,usable-memory on /memory instead of a new binding
 * USe a reserved-memory node for ELF core header

Nick Kossifidis (5):
  RISC-V: Add EM_RISCV to kexec UAPI header
  RISC-V: Add kexec support
  RISC-V: Improve init_resources
  RISC-V: Add kdump support
  RISC-V: Add crash kernel support

 arch/riscv/Kconfig  |  25 
 arch/riscv/include/asm/elf.h|   6 +
 arch/riscv/include/asm/kexec.h  |  54 +++
 arch/riscv/kernel/Makefile  |   6 +
 arch/riscv/kernel/crash_dump.c  |  46 ++
 arch/riscv/kernel/crash_save_regs.S |  56 +++
 arch/riscv/kernel/kexec_relocate.S  | 222 
 arch/riscv/kernel/machine_kexec.c   | 193 
 arch/riscv/kernel/setup.c   | 113 --
 arch/riscv/mm/init.c| 110 ++
 include/uapi/linux/kexec.h  |   1 +
 11 files changed, 787 insertions(+), 45 deletions(-)
 create mode 100644 arch/riscv/include/asm/kexec.h
 create mode 100644 arch/riscv/kernel/crash_dump.c
 create mode 100644 arch/riscv/kernel/crash_save_regs.S
 create mode 100644 arch/riscv/kernel/kexec_relocate.S
 create mode 100644 arch/riscv/kernel/machine_kexec.c

-- 
2.26.2

Re: [PATCH v3 11/17] riscv: Convert to GENERIC_CMDLINE

2021-03-29 Thread Nick Kossifidis


Στις 2021-03-26 17:26, Rob Herring έγραψε:

On Fri, Mar 26, 2021 at 8:20 AM Christophe Leroy
 wrote:




Le 26/03/2021 à 15:08, Andreas Schwab a écrit :
> On Mär 26 2021, Christophe Leroy wrote:
>
>> diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
>> index f8f15332caa2..e7c91ee478d1 100644
>> --- a/arch/riscv/kernel/setup.c
>> +++ b/arch/riscv/kernel/setup.c
>> @@ -20,6 +20,7 @@
>>   #include 
>>   #include 
>>   #include 
>> +#include 
>>
>>   #include 
>>   #include 
>> @@ -228,10 +229,8 @@ static void __init parse_dtb(void)
>>  }
>>
>>  pr_err("No DTB passed to the kernel\n");
>> -#ifdef CONFIG_CMDLINE_FORCE
>> -strlcpy(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
>> +cmdline_build(boot_command_line, NULL, COMMAND_LINE_SIZE);
>>  pr_info("Forcing kernel command line to: %s\n", boot_command_line);
>
> Shouldn't that message become conditional in some way?
>

You are right, I did something similar on ARM but looks like I missed 
it on RISCV.


How is this hunk even useful? Under what conditions can you boot
without a DTB? Even with a built-in DTB, the DT cmdline handling would
be called.

Rob



cced Paul who introduced this:
https://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git/commit/arch/riscv/kernel/setup.c?id=8fd6e05c7463b635e51ec7df0a1858c1b5a6e350

Re: [PATCH 1/5] lib: Add a generic version of devmem_is_allowed()

2020-07-09 Thread Nick Kossifidis

Στις 2020-07-10 08:38, Christoph Hellwig έγραψε:

On Thu, Jul 09, 2020 at 11:49:21PM +0300, Mike Rapoport wrote:

> +#ifndef CONFIG_GENERIC_DEVMEM_IS_ALLOWED
> +extern int devmem_is_allowed(unsigned long pfn);
> +#endif

Nit: no need for the extern here.

> +config GENERIC_LIB_DEVMEM_IS_ALLOWED
> +  bool
> +  select ARCH_HAS_DEVMEM_IS_ALLOWED

This seems to work the other way around from the usual Kconfig chains.
In the most cases ARCH_HAS_SOMETHING selects GENERIC_SOMETHING.

I believe nicer way would be to make

config STRICT_DEVMEM
bool "Filter access to /dev/mem"
depends on MMU && DEVMEM
	depends on ARCH_HAS_DEVMEM_IS_ALLOWED || 
GENERIC_LIB_DEVMEM_IS_ALLOWED

config GENERIC_LIB_DEVMEM_IS_ALLOWED
bool

and then s/select ARCH_HAS_DEVMEM_IS_ALLOWED/select 
GENERIC_LIB_DEVMEM_IS_ALLOWED/

in the arch Kconfigs and drop ARCH_HAS_DEVMEM_IS_ALLOWED in the end.

To take a step back:  Is there any reason to not just always
STRICT_DEVMEM? Maybe for a few architectures that don't currently
support a strict /dev/mem the generic version isn't quite correct, but
someone selecting the option and finding the issue is the best way to
figure that out..

During prototyping / testing having full access to all physical memory 
through /dev/mem is very useful. We should have it enabled by default 
but leave the config option there so that users / developers can disable 
it if needed IMHO.

Re: [PATCH 1/2] riscv: Register System RAM as iomem resources

2020-07-09 Thread Nick Kossifidis


Στις 2020-07-09 21:27, Palmer Dabbelt έγραψε:

On Tue, 16 Jun 2020 00:45:46 PDT (-0700), zong...@sifive.com wrote:

Add System RAM to /proc/iomem, various tools expect it such as kdump.
It is also needed for page_is_ram API which checks the specified 
address

whether registered as System RAM in iomem_resource list.

Signed-off-by: Zong Li 
---
 arch/riscv/mm/init.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index f4adb3684f3d..bbe816e03b2f 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -517,6 +517,27 @@ void mark_rodata_ro(void)
 }
 #endif

+void __init resource_init(void)
+{
+   struct memblock_region *region;
+
+   for_each_memblock(memory, region) {
+   struct resource *res;
+
+   res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES);
+   if (!res)
+   panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(struct resource));
+
+   res->name = "System RAM";
+		res->start = 
__pfn_to_phys(memblock_region_memory_base_pfn(region));
+		res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 
1;

+   res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;


Looks like everyone else is checking MEMBLOCK_NOMAP before registering 
memory

regions.  I've added that and put this on for-next.  Thanks!

commit 11dc632bf515874c84887727614e8044452f1f28
gpg: Signature made Thu 09 Jul 2020 11:24:08 AM PDT
gpg:using RSA key 
2B3C3747446843B24A943A7A2E1319F35FBB1889

gpg:issuer "pal...@dabbelt.com"
gpg: Good signature from "Palmer Dabbelt " 
[ultimate]
gpg: aka "Palmer Dabbelt " 
[ultimate]

Author: Zong Li 
Date:   Tue Jun 16 15:45:46 2020 +0800

   riscv: Register System RAM as iomem resources
  Add System RAM to /proc/iomem, various tools expect it such as 
kdump.
   It is also needed for page_is_ram API which checks the specified 
address

   whether registered as System RAM in iomem_resource list.
  Signed-off-by: Zong Li 
   [Palmer: check MEMBLOCK_NOMAP]
   Signed-off-by: Palmer Dabbelt 

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index f4adb3684f3d..8b78fd23713e 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -517,6 +517,32 @@ void mark_rodata_ro(void)
}
#endif

+void __init resource_init(void)
+{
+   struct memblock_region *region;
+
+   for_each_memblock(memory, region) {
+   struct resource *res;
+
+   res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES);
+   if (!res)
+   panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(struct resource));
+
+   if (memblock_is_nomap(region) {
+   res->name = "reserved";
+   res->flags = IORESOURCE_MEM;
+   } else {
+   res->name = "System RAM";
+   res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+   }
+   res->start = 
__pfn_to_phys(memblock_region_memory_base_pfn(region));
+		res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 
1;

+
+   request_resource(&iomem_resource, res);
+   }
+}
+
void __init paging_init(void)
{
setup_vm_final();
@@ -524,6 +550,7 @@ void __init paging_init(void)
sparse_init();
setup_zero_page();
zone_sizes_init();
+   resource_init();
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP



+
+   request_resource(&iomem_resource, res);
+   }
+}
+
 void __init paging_init(void)
 {
setup_vm_final();
@@ -524,6 +545,7 @@ void __init paging_init(void)
sparse_init();
setup_zero_page();
zone_sizes_init();
+   resource_init();
 }

 #ifdef CONFIG_SPARSEMEM_VMEMMAP


___
linux-riscv mailing list
linux-ri...@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv


Zong Li sent a newer version of this series without this patch, since 
I'm handling this on the kexec/kdump series as well where other sections 
needed for kdump are also registered.

Re: [PATCH v2 5/8] riscv: Implement sv48 support

2020-06-27 Thread Nick Kossifidis


Στις 2020-06-03 11:11, Alexandre Ghiti έγραψε:

By adding a new 4th level of page table, give the possibility to 64bit
kernel to address 2^48 bytes of virtual address: in practice, that 
roughly
offers ~160TB of virtual address space to userspace and allows up to 
64TB

of physical memory.

If the underlying hardware does not support sv48, we will automatically
fallback to a standard 3-level page table by folding the new PUD level 
into

PGDIR level. In order to detect HW capabilities at runtime, we
use SATP feature that ignores writes with an unsupported mode.

Signed-off-by: Alexandre Ghiti 
Reviewed-by: Anup Patel 
---
 arch/riscv/Kconfig  |   6 +-
 arch/riscv/include/asm/csr.h|   3 +-
 arch/riscv/include/asm/fixmap.h |   1 +
 arch/riscv/include/asm/page.h   |  15 +++
 arch/riscv/include/asm/pgalloc.h|  36 +++
 arch/riscv/include/asm/pgtable-64.h |  97 -
 arch/riscv/include/asm/pgtable.h|  10 +-
 arch/riscv/kernel/head.S|   3 +-
 arch/riscv/mm/context.c |   2 +-
 arch/riscv/mm/init.c| 158 +---
 10 files changed, 307 insertions(+), 24 deletions(-)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index e167f16131f4..3f73f60e9732 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -68,6 +68,7 @@ config RISCV
select ARCH_HAS_GCOV_PROFILE_ALL
select HAVE_COPY_THREAD_TLS
select HAVE_ARCH_KASAN if MMU && 64BIT
+   select RELOCATABLE if 64BIT

 config ARCH_MMAP_RND_BITS_MIN
default 18 if 64BIT
@@ -106,7 +107,7 @@ config PAGE_OFFSET
default 0xC000 if 32BIT && MAXPHYSMEM_2GB
default 0x8000 if 64BIT && !MMU
default 0x8000 if 64BIT && MAXPHYSMEM_2GB
-   default 0xffe0 if 64BIT && !MAXPHYSMEM_2GB
+   default 0xc000 if 64BIT && !MAXPHYSMEM_2GB

 config ARCH_FLATMEM_ENABLE
def_bool y
@@ -155,8 +156,11 @@ config GENERIC_HWEIGHT
 config FIX_EARLYCON_MEM
def_bool MMU

+# On a 64BIT relocatable kernel, the 4-level page table is at runtime 
folded

+# on a 3-level page table when sv48 is not supported.
 config PGTABLE_LEVELS
int
+   default 4 if 64BIT && RELOCATABLE
default 3 if 64BIT
default 2

diff --git a/arch/riscv/include/asm/csr.h 
b/arch/riscv/include/asm/csr.h

index cec462e198ce..d41536c3f8d4 100644
--- a/arch/riscv/include/asm/csr.h
+++ b/arch/riscv/include/asm/csr.h
@@ -40,11 +40,10 @@
 #ifndef CONFIG_64BIT
 #define SATP_PPN   _AC(0x003F, UL)
 #define SATP_MODE_32   _AC(0x8000, UL)
-#define SATP_MODE  SATP_MODE_32
 #else
 #define SATP_PPN   _AC(0x0FFF, UL)
 #define SATP_MODE_39   _AC(0x8000, UL)
-#define SATP_MODE  SATP_MODE_39
+#define SATP_MODE_48   _AC(0x9000, UL)
 #endif

 /* Exception cause high bit - is an interrupt if set */
diff --git a/arch/riscv/include/asm/fixmap.h 
b/arch/riscv/include/asm/fixmap.h

index 2368d49eb4ef..d891cf9c73c5 100644
--- a/arch/riscv/include/asm/fixmap.h
+++ b/arch/riscv/include/asm/fixmap.h
@@ -27,6 +27,7 @@ enum fixed_addresses {
FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
FIX_PTE,
FIX_PMD,
+   FIX_PUD,
FIX_TEXT_POKE1,
FIX_TEXT_POKE0,
FIX_EARLYCON_MEM_BASE,
diff --git a/arch/riscv/include/asm/page.h 
b/arch/riscv/include/asm/page.h

index 48bb09b6a9b7..5e77fe7f0d6d 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -31,7 +31,19 @@
  * When not using MMU this corresponds to the first free page in
  * physical memory (aligned on a page boundary).
  */
+#ifdef CONFIG_RELOCATABLE
+#define PAGE_OFFSET__page_offset
+
+#ifdef CONFIG_64BIT
+/*
+ * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address 
space so

+ * define the PAGE_OFFSET value for SV39.
+ */
+#define PAGE_OFFSET_L3 0xffe0
+#endif /* CONFIG_64BIT */
+#else
 #define PAGE_OFFSET_AC(CONFIG_PAGE_OFFSET, UL)
+#endif /* CONFIG_RELOCATABLE */

 #define KERN_VIRT_SIZE (-PAGE_OFFSET)

@@ -102,6 +114,9 @@ extern unsigned long pfn_base;
 extern unsigned long max_low_pfn;
 extern unsigned long min_low_pfn;
 extern unsigned long kernel_virt_addr;
+#ifdef CONFIG_RELOCATABLE
+extern unsigned long __page_offset;
+#endif

 #define __pa_to_va_nodebug(x)	((void *)((unsigned long) (x) + 
va_pa_offset))

 #define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset)
diff --git a/arch/riscv/include/asm/pgalloc.h 
b/arch/riscv/include/asm/pgalloc.h

index 3f601ee8233f..540eaa5a8658 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -36,6 +36,42 @@ static inline void pud_populate(struct mm_struct
*mm, pud_t *pud, pmd_t *pmd)

set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
 }
+
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, 
pud_t *pud)

+{
+   if (pgtable_

Re: [PATCH 2/2] riscv: Support CONFIG_STRICT_DEVMEM

2020-06-16 Thread Nick Kossifidis


Στις 2020-06-17 04:56, Zong Li έγραψε:
On Tue, Jun 16, 2020 at 8:27 PM Nick Kossifidis  
wrote:


Στις 2020-06-16 10:45, Zong Li έγραψε:
> Implement the 'devmem_is_allowed()' interface for RISC-V, like some of
> other architectures have done. It will be called from
> range_is_allowed()
> when userpsace attempts to access /dev/mem.
>
> Access to exclusive IOMEM and kernel RAM is denied unless
> CONFIG_STRICT_DEVMEM is set to 'n'.
>
> Test it by devmem, the result as follows:
>
>  - CONFIG_STRICT_DEVMEM=y
>   $ devmem 0x1001
>   0x
>   $ devmem 0x8020
>   0x106F
>
>  - CONFIG_STRICT_DEVMEM is not set
>   $ devmem 0x1001
>   devmem: mmap: Operation not permitted
>   $ devmem 0x8020
>   devmem: mmap: Operation not permitted
>
> Signed-off-by: Zong Li 
> ---
>  arch/riscv/Kconfig  |  1 +
>  arch/riscv/include/asm/io.h |  2 ++
>  arch/riscv/mm/init.c| 19 +++
>  3 files changed, 22 insertions(+)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 128192e14ff2..ffd7841ede4c 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -16,6 +16,7 @@ config RISCV
>   select ARCH_HAS_BINFMT_FLAT
>   select ARCH_HAS_DEBUG_VIRTUAL if MMU
>   select ARCH_HAS_DEBUG_WX
> + select ARCH_HAS_DEVMEM_IS_ALLOWED
>   select ARCH_HAS_GCOV_PROFILE_ALL
>   select ARCH_HAS_GIGANTIC_PAGE
>   select ARCH_HAS_MMIOWB
> diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h
> index 3835c3295dc5..04ac65ab93ce 100644
> --- a/arch/riscv/include/asm/io.h
> +++ b/arch/riscv/include/asm/io.h
> @@ -147,4 +147,6 @@ __io_writes_outs(outs, u64, q, __io_pbr(),
> __io_paw())
>
>  #include 
>
> +extern int devmem_is_allowed(unsigned long pfn);
> +
>  #endif /* _ASM_RISCV_IO_H */
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index bbe816e03b2f..5e7e61519acc 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -517,6 +517,25 @@ void mark_rodata_ro(void)
>  }
>  #endif
>
> +#ifdef CONFIG_STRICT_DEVMEM
> +#include 
> +/*
> + * devmem_is_allowed() checks to see if /dev/mem access to a certain
> address
> + * is valid. The argument is a physical page number.
> + *
> + * Disallow access to system RAM as well as device-exclusive MMIO
> regions.
> + * This effectively disable read()/write() on /dev/mem.
> + */
> +int devmem_is_allowed(unsigned long pfn)
> +{
> + if (iomem_is_exclusive(pfn << PAGE_SHIFT))
> + return 0;
> + if (!page_is_ram(pfn))
> + return 1;
> + return 0;
> +}
> +#endif
> +
>  void __init resource_init(void)
>  {
>   struct memblock_region *region;

This shouldn't be part of /mm/init.c, it has nothing to do with memory
initialization, I suggest we move it to another file like mmap.c on


Let me move it, thanks.

arm/arm64. Also before using iomem_is_exclusive we should probably 
also

mark any reserved regions with the no-map attribute as busy|exclusive,
reserved-memory regions are not necessarily part of the main memory so
the page_is_ram check may pass and iomem_is_exclusive won't do any 
good.


What do you think if we mark the reserved region in
kdump_resource_init, and change the kdump_resource_init to a more
generic name for initializing resources?


Sounds good to me, I'll work on this within the week. Do you agree with 
marking the no-map reserved-memory regions as exclusive ?

Re: [PATCH 2/2] riscv: Support CONFIG_STRICT_DEVMEM

2020-06-16 Thread Nick Kossifidis


Στις 2020-06-16 10:45, Zong Li έγραψε:

Implement the 'devmem_is_allowed()' interface for RISC-V, like some of
other architectures have done. It will be called from 
range_is_allowed()

when userpsace attempts to access /dev/mem.

Access to exclusive IOMEM and kernel RAM is denied unless
CONFIG_STRICT_DEVMEM is set to 'n'.

Test it by devmem, the result as follows:

 - CONFIG_STRICT_DEVMEM=y
$ devmem 0x1001
0x
$ devmem 0x8020
0x106F

 - CONFIG_STRICT_DEVMEM is not set
$ devmem 0x1001
devmem: mmap: Operation not permitted
$ devmem 0x8020
devmem: mmap: Operation not permitted

Signed-off-by: Zong Li 
---
 arch/riscv/Kconfig  |  1 +
 arch/riscv/include/asm/io.h |  2 ++
 arch/riscv/mm/init.c| 19 +++
 3 files changed, 22 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 128192e14ff2..ffd7841ede4c 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -16,6 +16,7 @@ config RISCV
select ARCH_HAS_BINFMT_FLAT
select ARCH_HAS_DEBUG_VIRTUAL if MMU
select ARCH_HAS_DEBUG_WX
+   select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_MMIOWB
diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h
index 3835c3295dc5..04ac65ab93ce 100644
--- a/arch/riscv/include/asm/io.h
+++ b/arch/riscv/include/asm/io.h
@@ -147,4 +147,6 @@ __io_writes_outs(outs, u64, q, __io_pbr(), 
__io_paw())


 #include 

+extern int devmem_is_allowed(unsigned long pfn);
+
 #endif /* _ASM_RISCV_IO_H */
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index bbe816e03b2f..5e7e61519acc 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -517,6 +517,25 @@ void mark_rodata_ro(void)
 }
 #endif

+#ifdef CONFIG_STRICT_DEVMEM
+#include 
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain 
address

+ * is valid. The argument is a physical page number.
+ *
+ * Disallow access to system RAM as well as device-exclusive MMIO 
regions.

+ * This effectively disable read()/write() on /dev/mem.
+ */
+int devmem_is_allowed(unsigned long pfn)
+{
+   if (iomem_is_exclusive(pfn << PAGE_SHIFT))
+   return 0;
+   if (!page_is_ram(pfn))
+   return 1;
+   return 0;
+}
+#endif
+
 void __init resource_init(void)
 {
struct memblock_region *region;


This shouldn't be part of /mm/init.c, it has nothing to do with memory 
initialization, I suggest we move it to another file like mmap.c on 
arm/arm64. Also before using iomem_is_exclusive we should probably also 
mark any reserved regions with the no-map attribute as busy|exclusive, 
reserved-memory regions are not necessarily part of the main memory so 
the page_is_ram check may pass and iomem_is_exclusive won't do any good.

Re: [PATCH 1/2] riscv: Register System RAM as iomem resources

2020-06-16 Thread Nick Kossifidis


Στις 2020-06-16 10:45, Zong Li έγραψε:

Add System RAM to /proc/iomem, various tools expect it such as kdump.
It is also needed for page_is_ram API which checks the specified 
address

whether registered as System RAM in iomem_resource list.

Signed-off-by: Zong Li 
---
 arch/riscv/mm/init.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index f4adb3684f3d..bbe816e03b2f 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -517,6 +517,27 @@ void mark_rodata_ro(void)
 }
 #endif

+void __init resource_init(void)
+{
+   struct memblock_region *region;
+
+   for_each_memblock(memory, region) {
+   struct resource *res;
+
+   res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES);
+   if (!res)
+   panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(struct resource));
+
+   res->name = "System RAM";
+   res->start = 
__pfn_to_phys(memblock_region_memory_base_pfn(region));
+		res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 
1;

+   res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+   request_resource(&iomem_resource, res);
+   }
+}
+
 void __init paging_init(void)
 {
setup_vm_final();
@@ -524,6 +545,7 @@ void __init paging_init(void)
sparse_init();
setup_zero_page();
zone_sizes_init();
+   resource_init();
 }

 #ifdef CONFIG_SPARSEMEM_VMEMMAP



I already have a patch for registering System RAM as an iomem resource 
on my kexec/kdump series. Since I don't care about System RAM regions 
being accurately exposed to userspace (I parse the current device tree 
instead) I just use memblock_start_of_DRAM/end_of_DRAM. This approach 
from arm64 codebase is better since it also handles the case of sparse 
memory regions but in order to be useful for kdump we need to add the 
various segments of the kernel image as child nodes to their respective 
region for kexec-tools. I'll re-spin my patchset anyway so I'll extend 
it to better handle System RAM regions.

Re: [PATCH, RFC] byteorder: sanity check toolchain vs kernel endianess

2019-04-12 Thread Nick Kossifidis


Στις 2019-04-12 17:53, Arnd Bergmann έγραψε:

On Fri, Apr 12, 2019 at 4:36 PM Christoph Hellwig  wrote:


When removing some dead big endian checks in the RISC-V code Nick
suggested that we should have some generic sanity checks.  I don't 
think

we should have thos inside the RISC-V code, but maybe it might make
sense to have these in the generic byteorder headers.  Note that these
are UAPI headers and some compilers might not actually define
__BYTE_ORDER__, so we first check that it actually exists.

Suggested-by: Nick Kossifidis 
Signed-off-by: Christoph Hellwig 


Acked-by: Arnd Bergmann 

Extra checking like this is good in general, but I'm not sure I see
exactly what kind of issue one might expect to prevent with this:

All architecture asm/byteorder.h headers either include the only
possible option, or they check the compiler defined macros:

arch/arc/include/uapi/asm/byteorder.h:#ifdef __BIG_ENDIAN__
arch/arm/include/uapi/asm/byteorder.h:#ifdef __ARMEB__
arch/arm64/include/uapi/asm/byteorder.h:#ifdef __AARCH64EB__
arch/c6x/include/uapi/asm/byteorder.h:#ifdef _BIG_ENDIAN
arch/microblaze/include/uapi/asm/byteorder.h:#ifdef __MICROBLAZEEL__
arch/mips/include/uapi/asm/byteorder.h:#if defined(__MIPSEB__)
arch/nds32/include/uapi/asm/byteorder.h:#ifdef __NDS32_EB__
arch/powerpc/include/uapi/asm/byteorder.h:#ifdef __LITTLE_ENDIAN__
arch/sh/include/uapi/asm/byteorder.h:#ifdef __LITTLE_ENDIAN__
arch/xtensa/include/uapi/asm/byteorder.h:#ifdef __XTENSA_EL__

Are you worried about toolchains that define those differently
from what these headers expect? Did you encounter such a case?

  Arnd


The following architectures just include the header file without
checking for any compiler macro:

alpha: little_endian.h
csky: little_endian.h
h8300: big_endian.h
hexagon: little_endian.h
ia64: little_endian.h
m68k: big_endian.h
nios2: little_endian.h
openrisc: big_endian.h
parisc: big_endian.h
riscv: little_endian.h
s390: big_endian.h
sparc: big_endian.h
unicore32: little_endian.h
x86: little_endian.h

Of those who do check for a compiler macro, they don't use the
generic macros (__ORDER_*_ENDIAN__) but arch-specific ones.

Only two architectures (mips and xtensa) that support both big
and little endian return an error in case the endianess can't be
determined, the rest will move on without including any
of *_endian.h files.

I think it's good to have a sanity check in-place for consistency.

Regards,
Nick

Re: [PATCH 3/3] RISC-V: Allow booting kernel from any 4KB aligned address

2019-04-10 Thread Nick Kossifidis


Στις 2019-03-13 00:08, Anup Patel έγραψε:

Currently, we have to boot RISCV64 kernel from a 2MB aligned physical
address and RISCV32 kernel from a 4MB aligned physical address. This
constraint is because initial pagetable setup (i.e. setup_vm()) maps
entire RAM using hugepages (i.e. 2MB for 3-level pagetable and 4MB for
2-level pagetable).

Further, the above booting contraint also results in memory wastage
because if we boot kernel from some  address (which is not same as
RAM start address) then RISCV kernel will map PAGE_OFFSET virtual 
address
lineraly to  physical address and memory between RAM start and 


will be reserved/unusable.

For example, RISCV64 kernel booted from 0x8020 will waste 2MB of 
RAM

and RISCV32 kernel booted from 0x8040 will waste 4MB of RAM.

This patch re-writes the initial pagetable setup code to allow booting
RISV32 and RISCV64 kernel from any 4KB (i.e. PAGE_SIZE) aligned 
address.


To achieve this:
1. We map kernel, dtb and only some amount of RAM (few MBs) using 4KB
   mappings in setup_vm() (called from head.S)
2. Once we reach paging_init() (called from setup_arch()) after
   memblock setup, we map all available memory banks using 4KB
   mappings and memblock APIs.

With this patch in-place, the booting constraint for RISCV32 and 
RISCV64

kernel is much more relaxed and we can now boot kernel very close to
RAM start thereby minimizng memory wastage.

Signed-off-by: Anup Patel 
---
 arch/riscv/include/asm/fixmap.h |   5 +
 arch/riscv/include/asm/pgtable-64.h |   5 +
 arch/riscv/include/asm/pgtable.h|   6 +-
 arch/riscv/kernel/head.S|   1 +
 arch/riscv/kernel/setup.c   |   4 +-
 arch/riscv/mm/init.c| 357 +++-
 6 files changed, 317 insertions(+), 61 deletions(-)

diff --git a/arch/riscv/include/asm/fixmap.h 
b/arch/riscv/include/asm/fixmap.h

index 57afe604b495..5cf53dd882e5 100644
--- a/arch/riscv/include/asm/fixmap.h
+++ b/arch/riscv/include/asm/fixmap.h
@@ -21,6 +21,11 @@
  */
 enum fixed_addresses {
FIX_HOLE,
+#define FIX_FDT_SIZE   SZ_1M
+   FIX_FDT_END,
+   FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
+   FIX_PTE,
+   FIX_PMD,
FIX_EARLYCON_MEM_BASE,
__end_of_fixed_addresses
 };
diff --git a/arch/riscv/include/asm/pgtable-64.h
b/arch/riscv/include/asm/pgtable-64.h
index 7aa0ea9bd8bb..56ecc3dc939d 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -78,6 +78,11 @@ static inline pmd_t pfn_pmd(unsigned long pfn, 
pgprot_t prot)

return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
 }

+static inline unsigned long _pmd_pfn(pmd_t pmd)
+{
+   return pmd_val(pmd) >> _PAGE_PFN_SHIFT;
+}
+
 #define pmd_ERROR(e) \
pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))

diff --git a/arch/riscv/include/asm/pgtable.h 
b/arch/riscv/include/asm/pgtable.h

index 1141364d990e..05fa2115e736 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -121,12 +121,16 @@ static inline void pmd_clear(pmd_t *pmdp)
set_pmd(pmdp, __pmd(0));
 }

-
 static inline pgd_t pfn_pgd(unsigned long pfn, pgprot_t prot)
 {
return __pgd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
 }

+static inline unsigned long _pgd_pfn(pgd_t pgd)
+{
+   return pgd_val(pgd) >> _PAGE_PFN_SHIFT;
+}
+
 #define pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))

 /* Locate an entry in the page global directory */
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index 7966262b4f9d..12a3ec5eb8ab 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -63,6 +63,7 @@ clear_bss_done:
/* Initialize page tables and relocate to virtual addresses */
la sp, init_thread_union + THREAD_SIZE
la a0, _start
+   mv a1, s1
call setup_vm
call relocate

diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index ecb654f6a79e..acdd0f74982b 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -30,6 +30,7 @@
 #include 
 #include 

+#include 
 #include 
 #include 
 #include 
@@ -62,7 +63,8 @@ unsigned long boot_cpu_hartid;

 void __init parse_dtb(unsigned int hartid, void *dtb)
 {
-   if (early_init_dt_scan(__va(dtb)))
+   dtb = (void *)fix_to_virt(FIX_FDT) + ((uintptr_t)dtb & ~PAGE_MASK);
+   if (early_init_dt_scan(dtb))
return;

pr_err("No DTB passed to the kernel\n");
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index f35299f2f3d5..ee55a4b90dec 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1,14 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
  * Copyright (C) 2012 Regents of the University of California
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public Licen

Re: [PATCH v2 6/6] RISC-V: Implement sparsemem

2018-12-17 Thread Nick Kossifidis


Hello Logan,

Στις 2018-10-15 20:57, Logan Gunthorpe έγραψε:

This patch implements sparsemem support for risc-v which helps pave the
way for memory hotplug and eventually P2P support.

We introduce Kconfig options for virtual and physical address bits 
which

are used to calculate the size of the vmemmap and set the
MAX_PHYSMEM_BITS.

The vmemmap is located directly before the VMALLOC region and sized
such that we can allocate enough pages to populate all the virtual
address space in the system (similar to the way it's done in arm64).

During initialization, call memblocks_present() and sparse_init(),
and provide a stub for vmemmap_populate() (all of which is similar to
arm64).

Signed-off-by: Logan Gunthorpe 
Reviewed-by: Palmer Dabbelt 
Cc: Albert Ou 
Cc: Andrew Waterman 
Cc: Olof Johansson 
Cc: Michael Clark 
Cc: Rob Herring 
Cc: Zong Li 
---
 arch/riscv/Kconfig | 23 +++
 arch/riscv/include/asm/pgtable.h   | 21 +
 arch/riscv/include/asm/sparsemem.h | 11 +++
 arch/riscv/kernel/setup.c  |  4 +++-
 arch/riscv/mm/init.c   |  8 
 5 files changed, 62 insertions(+), 5 deletions(-)
 create mode 100644 arch/riscv/include/asm/sparsemem.h

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index a344980287a5..a1b5d758a542 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -52,12 +52,32 @@ config ZONE_DMA32
bool
default y if 64BIT

+config VA_BITS
+   int
+   default 32 if 32BIT
+   default 39 if 64BIT
+
+config PA_BITS
+   int
+   default 34 if 32BIT
+   default 56 if 64BIT
+
 config PAGE_OFFSET
hex
default 0xC000 if 32BIT && MAXPHYSMEM_2GB
default 0x8000 if 64BIT && MAXPHYSMEM_2GB
default 0xffe0 if 64BIT && MAXPHYSMEM_128GB

+config ARCH_FLATMEM_ENABLE
+   def_bool y
+
+config ARCH_SPARSEMEM_ENABLE
+   def_bool y
+   select SPARSEMEM_VMEMMAP_ENABLE
+
+config ARCH_SELECT_MEMORY_MODEL
+   def_bool ARCH_SPARSEMEM_ENABLE
+
 config STACKTRACE_SUPPORT
def_bool y

@@ -92,6 +112,9 @@ config PGTABLE_LEVELS
 config HAVE_KPROBES
def_bool n

+config HAVE_ARCH_PFN_VALID
+   def_bool y
+
 menu "Platform type"

 choice
diff --git a/arch/riscv/include/asm/pgtable.h 
b/arch/riscv/include/asm/pgtable.h

index 16301966d65b..e1162336f5ea 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -89,6 +89,23 @@ extern pgd_t swapper_pg_dir[];
 #define __S110 PAGE_SHARED_EXEC
 #define __S111 PAGE_SHARED_EXEC

+#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
+#define VMALLOC_END  (PAGE_OFFSET - 1)
+#define VMALLOC_START(PAGE_OFFSET - VMALLOC_SIZE)
+
+/*
+ * Roughly size the vmemmap space to be large enough to fit enough
+ * struct pages to map half the virtual address space. Then
+ * position vmemmap directly below the VMALLOC region.
+ */
+#define VMEMMAP_SHIFT \
+   (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)
+#define VMEMMAP_SIZE   (1UL << VMEMMAP_SHIFT)
+#define VMEMMAP_END(VMALLOC_START - 1)
+#define VMEMMAP_START  (VMALLOC_START - VMEMMAP_SIZE)
+
+#define vmemmap((struct page *)VMEMMAP_START)
+
 /*
  * ZERO_PAGE is a global shared page that is always zero,
  * used for zero-mapped memory areas, etc.
@@ -411,10 +428,6 @@ static inline void pgtable_cache_init(void)
/* No page table caches to initialize */
 }

-#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
-#define VMALLOC_END  (PAGE_OFFSET - 1)
-#define VMALLOC_START(PAGE_OFFSET - VMALLOC_SIZE)
-
 /*
  * Task size is 0x400 for RV64 or 0xb80 for RV32.
  * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
diff --git a/arch/riscv/include/asm/sparsemem.h
b/arch/riscv/include/asm/sparsemem.h
new file mode 100644
index ..215530b24336
--- /dev/null
+++ b/arch/riscv/include/asm/sparsemem.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_SPARSEMEM_H
+#define __ASM_SPARSEMEM_H
+
+#ifdef CONFIG_SPARSEMEM
+#define MAX_PHYSMEM_BITS   CONFIG_PA_BITS
+#define SECTION_SIZE_BITS  30


Having memory blocks of a minimum size of 1GB doesn't make much sense. 
It makes it harder to implement hotplug on top of this since we'll only 
able to add/remove 1GB at a time. ARM used to do the same and they 
switched to 27bits (https://patchwork.kernel.org/patch/9172845/), ARM64 
still uses 1GB, x86 also uses 27bits and most archs also use something 
below 30. I believe we should go for 27bits as well or even better have 
this as a compile time option.


BTW memblocks_present is on master now (got merged 3 days ago).

Regards,
N.

Re: [PATCH] RISC-V: Make BSS section as the last section in vmlinux.lds.S

2018-12-17 Thread Nick Kossifidis


Στις 2018-12-17 11:36, Anup Patel έγραψε:
On Mon, Nov 26, 2018 at 11:42 AM Anup Patel  
wrote:


The objcopy only emits loadable sections when creating flat kernel
Image. To have minimal possible size of flat kernel Image, we should
have all non-loadable sections after loadable sections.

Currently, execption table section (loadable section) is after BSS
section (non-loadable section) in the RISC-V vmlinux.lds.S. This
is not optimal for having minimal flat kernel Image size hence this
patch makes BSS section as the last section in RISC-V vmlinux.lds.S.

In addition, we make BSS section aligned to 16byte instead of PAGE
aligned which further reduces flat kernel Image size by few KBs.

The flat kernel Image size of Linux-4.20-rc4 using GCC 8.2.0 is
8819980 bytes with current RISC-V vmlinux.lds.S and it reduces to
7991740 bytes with this patch applied using GCC 8.2.0. In summary,
this patch reduces Linux-4.20-rc4 flat kernel Image size by 809 KB.

Signed-off-by: Anup Patel 
---
 arch/riscv/kernel/vmlinux.lds.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/kernel/vmlinux.lds.S 
b/arch/riscv/kernel/vmlinux.lds.S

index 65df1dfdc303..cc99eed44931 100644
--- a/arch/riscv/kernel/vmlinux.lds.S
+++ b/arch/riscv/kernel/vmlinux.lds.S
@@ -74,8 +74,6 @@ SECTIONS
*(.sbss*)
}

-   BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0)
-
EXCEPTION_TABLE(0x10)
NOTES

@@ -83,6 +81,8 @@ SECTIONS
*(.rel.dyn*)
}

+   BSS_SECTION(0x10, 0x10, 0x10)
+
_end = .;

STABS_DEBUG
--
2.17.1



Hi All,

Any comment on this patch?

Regards,
Anup



Just a note on coding style, you should be using a macro instead of 0x10 
so that those who read the code can understand what it is and also a few 
comments since searching through the commit logs to understand why you 
used it isn't optimal.


Regards,
Nick

Re: [PATCH v2] riscv: add asm/unistd.h UAPI header

2018-11-17 Thread Nick Kossifidis


Hello David,

Στις 2018-11-08 21:02, David Abdurachmanov έγραψε:
Marcin Juszkiewicz reported issues while generating syscall table for 
riscv
using 4.20-rc1. The patch refactors our unistd.h files to match some 
other

architectures.

- Add asm/unistd.h UAPI header, which has __ARCH_WANT_NEW_STAT only for 
64-bit

- Remove asm/syscalls.h UAPI header and merge to asm/unistd.h
- Adjust kernel asm/unistd.h

So now asm/unistd.h UAPI header should show all syscalls for riscv.

Before this, Makefile simply put `#include ` into
generated asm/unistd.h UAPI header thus user didn't see:

- __NR_riscv_flush_icache
- __NR_newfstatat
- __NR_fstat

which are supported by riscv kernel.

Signed-off-by: David Abdurachmanov 
Cc: Arnd Bergmann 
Cc: Marcin Juszkiewicz 
Cc: Guenter Roeck 
Fixes: 67314ec7b025 ("RISC-V: Request newstat syscalls")
Signed-off-by: David Abdurachmanov 
---
 arch/riscv/include/asm/unistd.h|  5 ++--
 arch/riscv/include/uapi/asm/syscalls.h | 29 --
 arch/riscv/include/uapi/asm/unistd.h   | 41 ++
 3 files changed, 43 insertions(+), 32 deletions(-)
 delete mode 100644 arch/riscv/include/uapi/asm/syscalls.h
 create mode 100644 arch/riscv/include/uapi/asm/unistd.h

diff --git a/arch/riscv/include/asm/unistd.h 
b/arch/riscv/include/asm/unistd.h

index eff7aa9aa163..fef96f117b4d 100644
--- a/arch/riscv/include/asm/unistd.h
+++ b/arch/riscv/include/asm/unistd.h
@@ -13,10 +13,9 @@

 /*
  * There is explicitly no include guard here because this file is 
expected to

- * be included multiple times.  See uapi/asm/syscalls.h for more info.
+ * be included multiple times.
  */

-#define __ARCH_WANT_NEW_STAT
 #define __ARCH_WANT_SYS_CLONE
+
 #include 
-#include 
diff --git a/arch/riscv/include/uapi/asm/syscalls.h
b/arch/riscv/include/uapi/asm/syscalls.h
deleted file mode 100644
index 206dc4b0f6ea..
--- a/arch/riscv/include/uapi/asm/syscalls.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2017-2018 SiFive
- */
-
-/*
- * There is explicitly no include guard here because this file is 
expected to
- * be included multiple times in order to define the syscall macros 
via

- * __SYSCALL.
- */
-
-/*
- * Allows the instruction cache to be flushed from userspace.  Despite 
RISC-V
- * having a direct 'fence.i' instruction available to userspace (which 
we
- * can't trap!), that's not actually viable when running on Linux 
because the
- * kernel might schedule a process on another hart.  There is no way 
for
- * userspace to handle this without invoking the kernel (as it doesn't 
know the
- * thread->hart mappings), so we've defined a RISC-V specific system 
call to

- * flush the instruction cache.
- *
- * __NR_riscv_flush_icache is defined to flush the instruction cache 
over an
- * address range, with the flush applying to either all threads or 
just the
- * caller.  We don't currently do anything with the address range, 
that's just

- * in there for forwards compatibility.
- */
-#ifndef __NR_riscv_flush_icache
-#define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15)
-#endif
-__SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache)
diff --git a/arch/riscv/include/uapi/asm/unistd.h
b/arch/riscv/include/uapi/asm/unistd.h
new file mode 100644
index ..1f3bd3ebbb0d
--- /dev/null
+++ b/arch/riscv/include/uapi/asm/unistd.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2018 David Abdurachmanov 


+ *
+ * This program is free software; you can redistribute it and/or 
modify

+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see 
.

+ */
+
+#ifdef __LP64__
+#define __ARCH_WANT_NEW_STAT
+#endif /* __LP64__ */
+
+#include 
+
+/*
+ * Allows the instruction cache to be flushed from userspace.  Despite 
RISC-V
+ * having a direct 'fence.i' instruction available to userspace (which 
we
+ * can't trap!), that's not actually viable when running on Linux 
because the
+ * kernel might schedule a process on another hart.  There is no way 
for
+ * userspace to handle this without invoking the kernel (as it doesn't 
know the
+ * thread->hart mappings), so we've defined a RISC-V specific system 
call to

+ * flush the instruction cache.
+ *
+ * __NR_riscv_flush_icache is defined to flush the instruction cache 
over an
+ * address range, with the flush applying to either all threads or 
just the
+ * caller.  We don't currently do anything with the address range, 
that's just

+ * in there for forwards compati

Re: [RFC 0/2] Add RISC-V cpu topology

2018-11-08 Thread Nick Kossifidis

Στις 2018-11-08 17:54, Mark Rutland έγραψε:

On Thu, Nov 08, 2018 at 03:45:36PM +0200, Nick Kossifidis wrote:

Στις 2018-11-07 14:06, Mark Rutland έγραψε:
> On Wed, Nov 07, 2018 at 04:31:34AM +0200, Nick Kossifidis wrote:
> > Mark and Sundeep thanks a lot for your feedback, I guess you convinced
> > me that having a device tree binding for the scheduler is not a
> > correct approach. It's not a device after all and I agree that the
> > device tree shouldn't become an OS configuration file.
>
> Good to hear.
>
> > Regarding multiple levels of shared resources my point is that since
> > cpu-map doesn't contain any information of what is shared among the
> > cluster/core members it's not easy to do any further translation. Last
> > time I checked the arm code that uses cpu-map, it only defines one
> > domain for SMT, one for MC and then everything else is ignored. No
> > matter how many clusters have been defined, anything above the core
> > level is the same (and then I guess you started talking about adding
> > "packages" on the representation side).
>
> While cpu-map doesn't contain that information today, we can *add* that
> information to the cpu-map binding if necessary.
>
> > The reason I proposed to have a binding for the scheduler directly is
> > not only because it's simpler and closer to what really happens in the
> > code, it also makes more sense to me than the combination of cpu-map
> > with all the related mappings e.g.  for numa or caches or power
> > domains etc.
> >
> > However you are right we could definitely augment cpu-map to include
> > support for what I'm saying and clean things up, and since you are
> > open about improving it here is a proposal that I hope you find
> > interesting:
> >
> > At first let's get rid of the  nodes, they don't make sense:
> >
> > thread0 {
> >  cpu = <&CPU0>;
> > };
> >
> > A thread node can't have more than one cpu entry and any properties
> > should be on the cpu node itself, so it doesn't / can't add any
> > more information. We could just have an array of cpu nodes on the
> >  node, it's much cleaner this way.
> >
> > core0 {
> >  members = <&CPU0>, <&CPU1>;
> > };
>
> Hold on. Rather than reinventing things from first principles, can we
> please discuss what you want to *achieve*, i.e. what information you
> need?
>
> Having a node is not a significant cost, and there are reasons we may
> want thread nodes. For example, it means that we can always refer to any
> level of topology with a phandle, and we might want to describe
> thread-affine devices in future.

You can use the phandle of the cpu node, the thread node doesn't add
anything more than complexity to the representation.

That adds complexity elsewhere, because the phandle of a CPU node is 
not

a child of the cpu-map node, and you can't simply walk up the tree
hierarchy to find parent nodes.

I see no reason to change this part of the binding. Given it's already
out there, with existing parsing code, changing it only serves to add
complexity to any common code which has to parse this.

Can we please drop the idea of dropping the thread node?

> There are a tonne of existing bindings that are ugly, but re-inventing
> them for taste reasons alone is more costly to the ecosystem than simply
> using the existing bindings. We avoid re-inventing bindings unless there
> is a functional problem e.g. cases which they cannot possibly describe.

We are talking about using something for RISC-V and possibly common 
across

different archs and, I don't see why we should keep the ugliness of a
binding spec plus in this case the  node can't possibly 
describe

anything else than a cpu= alias, it's redundant.

While it may be ugly, removing it only serves to add complexity for
common parsing code, and removes flexibility that we may want in 
future.

Its presence does not cause any technical problem.

For better or worse we're all sharing the same codebase here. The 
common

case matters, as does the complexity of the codebase as a whole.

I realise it can be frustrating to have to use something you feel is
sub-optimal, but putting up with a few nodes which you personally feel
are redundant is of much lower cost to the ecosystem than having
incompatible bindings where we could have one. If you put up with that,
you can focus your efforts on more worthwhile technical exercises.

The only reason I mentioned the issue with the  node is because
you said that you are open for improving cpu-map. I don't think it's
such a big deal and even if we decide against it, it's not

Re: [RFC 0/2] Add RISC-V cpu topology

2018-11-08 Thread Nick Kossifidis

Στις 2018-11-08 18:48, Sudeep Holla έγραψε:

On Thu, Nov 08, 2018 at 04:52:30PM +0200, Nick Kossifidis wrote:

Στις 2018-11-07 14:28, Sudeep Holla έγραψε:
>
> I agree, but we have kernel code using it(arm64/kernel/topology.c). It's
> too late to remove it. But we can always keep to optional if we move the
> ARM64 binding as generic to start with and mandate it for only ARM64.
>

That's my point as well, if we are going to define something to be 
used
by everybody and in this case, at least for RISC-V, there is no need 
to

carry this from the ARM64 binding.

Sure, whatever you don't need in RISC-V you can see if they can be made
optional. I don't think that should be a problem.

It shouldn't be that hard to fix this
in the future for ARM64 as well, we may give the new mapping another 
name,

maybe cpu-map2 or cpu-topology to slowly move to the new one.

No, we have it and we will continue to support it. It's not broken to
fix on ARM64. Why do you think that it's broken on ARM64 ?

I never said it's broken, I just assumed that if this binding becomes 
common

across archs you'd probably like to switch to it, so it should either be
backwards compatible with what you have (or as you said "keep them 
optional")
or take steps for the transition. I'm ok either way, It's not such a 
serious
thing to have the  nodes supported or keep the distinction 
between
"cores", "clusters" or whatever, I'm sorry if it looked that way. I'm 
just
saying that I'd like to have something cleaner for RISC-V and/or a 
binding
that's common for all, we can support both and be backwards compatible, 
or we

can keep it as is and mandate the  nodes only for ARM64 as you
suggested.

Changing the
dts files shouldn't be this hard, we can provide a script for it. We 
can
even contain some compatibility code that also understands  
nodes

and e.g. merges them together on a core node.

Sure, hang on this idea of scripting, we can make a better use of it.
Details later further in the mail.

[...]

> > The same also happens with the generic numa binding on
> > Documentation/devicetree/bindings/numa.txt
> > which says we should add the nuna-node-id on each of the cpu nodes.
> >
>
> Yes, but again what's the problem ?
>

There is no problem with the above bindings, the problem is that we 
have

to put them on each cpu node which is messy. We could instead put them
(optionally) on the various groupings used on cpu-map. This would 
allow
cpu-map to be more specific of what is shared across the members of 
each

group (core/cluster/whatever).

I think Mark has already explain why/how generic bindings are useful.
If you still have concerns, take it up separately and see how you can
build *perfect* bindings for RISC-V to avoid any legacy baggage.

We have reasons why we can't assume information about cache or power
domain topology from CPU topology. I have summarised them already and
we are not discussing.

But that's what you do on arch/arm/kernel/topology.c, you assume
information on power domain and cache topology based on the CPU topology
when you define the scheduling domain topology levels.

PowerPC also does the same on arch/powerpc/kernel/smp.c and basically if
you track the usage of struct sched_domain_topology_level you'll see 
that

every arch that uses it to instruct the scheduler about the scheduling
domains, does it based on its CPU topology, which I think we both agree
is wrong.

There may not be perfect bindings but there are
already supported and nothing is broken here to fix. DT bindings are
*not* same as code to fix it with a patch to the bindings themselves.
Once agreed and merged, they need to be treated like user ABI.

The only use of the devicetree spec bindings for caches if I'm not
mistaken are used on your cacheinfo driver for exporting them to 
userspace

through sysfs (thank you for cleaning this up btw). Even if we have the
cache topology  using the generic bindings, we are not doing anything 
with

that information but export it to userspace.

As for the power domain bindings, we have drivers/base/power/domain.c 
that

handles the generic bindings and it's being used by some drivers to put
devices to power domains (something used by the pm code), but from a 
quick

search it's not used for cpu nodes and I didn't see this info being used
for providing hints to the scheduler either. Even with the generic power
domain bindings in place, for CPU idle states, on ARM we have another
binding that's basically the same with the addition of 
"wakeup-latency-us".

For having different capacity there is no generic binding but I guess we
could use ARM's.

Of the generic bindings that relate to the scheduler's behavior, only 
the
numa binding is used as expected and only by ARM64, powerpc and spa

Re: [RFC 0/2] Add RISC-V cpu topology

2018-11-08 Thread Nick Kossifidis

Στις 2018-11-07 14:28, Sudeep Holla έγραψε:

On Wed, Nov 07, 2018 at 04:31:34AM +0200, Nick Kossifidis wrote:
[...]

Mark and Sudeep thanks a lot for your feedback, I guess you convinced
me

that having a device tree binding for the scheduler is not a correct
approach.

Thanks :)

It's not a device after all and I agree that the device tree shouldn't
become an OS configuration file. Regarding multiple levels of shared
resources my point is that since cpu-map doesn't contain any
information of

what is shared among the cluster/core members it's not easy to do any
further translation. Last time I checked the arm code that uses
cpu-map, it
only defines one domain for SMT, one for MC and then everything else
is
ignored. No matter how many clusters have been defined, anything above
the
core level is the same (and then I guess you started talking about
adding

"packages" on the representation side).

Correct.

The reason I proposed to have a binding for the scheduler directly is
not
only because it's simpler and closer to what really happens in the
code, it
also makes more sense to me than the combination of cpu-map with all
the

related mappings e.g. for numa or caches or power domains etc.

Again you are just looking at it with Linux kernel perspective.

However you are right we could definitely augment cpu-map to include
support

for what I'm saying and clean things up, and since you are open about
improving it here is a proposal that I hope you find interesting: At
first

let's get rid of the nodes, they don't make sense:

thread0 {
cpu = <&CPU0>;
};

Do you have any strong reasons to do so ?
Since it's already there for some time, I believe we can't remove it
for

backward compatibility reasons.

A thread node can't have more than one cpu entry and any properties
should be on the cpu node itself, so it doesn't / can't add any
more information. We could just have an array of cpu nodes on the
node, it's much cleaner this way.

core0 {
members = <&CPU0>, <&CPU1>;
};

I agree, but we have kernel code using it(arm64/kernel/topology.c).
It's
too late to remove it. But we can always keep to optional if we move
the

ARM64 binding as generic to start with and mandate it for only ARM64.

That's my point as well, if we are going to define something to be used
by everybody and in this case, at least for RISC-V, there is no need to
carry this from the ARM64 binding. It shouldn't be that hard to fix this
in the future for ARM64 as well, we may give the new mapping another
name,
maybe cpu-map2 or cpu-topology to slowly move to the new one. Changing
the

dts files shouldn't be this hard, we can provide a script for it. We can
even contain some compatibility code that also understands
nodes

and e.g. merges them together on a core node.

Then let's allow the cluster and core nodes to accept attributes that
are
common for the cpus they contain. Right now this is considered
invalid.

Yes, we have discussed in the past and decided not to. I am fine if we
need to change it, but assuming the topology implies other information
could be wrong. On ARM platforms we have learnt it, so we kept any
information away from topology. I assume same with RISC-V, different
vendors implement in different ways, so it's better to consider those
factors.

For power domains we have a generic binding described on
Documentation/devicetree/bindings/power/power_domain.txt
which basically says that we need to put power-domains = attribute on each of the cpu nodes.

OK, but what's wrong with that. I gives full flexibility.

The same happens with the capacity binding specified for arm on
Documentation/devicetree/bindings/arm/cpu-capacity.txt
which says we should add the capacity-dmips-mhz on each of the cpu
nodes.

Ditto, we may need this for our single cluster DSU systems.

The same also happens with the generic numa binding on
Documentation/devicetree/bindings/numa.txt
which says we should add the nuna-node-id on each of the cpu nodes.

Yes, but again what's the problem ?

There is no problem with the above bindings, the problem is that we have
to put them on each cpu node which is messy. We could instead put them
(optionally) on the various groupings used on cpu-map. This would allow
cpu-map to be more specific of what is shared across the members of each
group (core/cluster/whatever).

As I wrote on my answer to Mark previously, the bindings for infering
the cache topology, numa topology, power domain topology etc are already
there, they are in the devicet tree spec and provide very specific
informations we can use. Much "stronger" hints of what's going on at
the hw level. The cpu-map doesn't provide such information, it just
provides a view of how the various harts/threads are "packed" on the
chip,

not what the

Re: [RFC 0/2] Add RISC-V cpu topology

2018-11-08 Thread Nick Kossifidis


Στις 2018-11-07 14:06, Mark Rutland έγραψε:

On Wed, Nov 07, 2018 at 04:31:34AM +0200, Nick Kossifidis wrote:

Mark and Sundeep thanks a lot for your feedback, I guess you convinced
me that having a device tree binding for the scheduler is not a
correct approach. It's not a device after all and I agree that the
device tree shouldn't become an OS configuration file.


Good to hear.


Regarding multiple levels of shared resources my point is that since
cpu-map doesn't contain any information of what is shared among the
cluster/core members it's not easy to do any further translation. Last
time I checked the arm code that uses cpu-map, it only defines one
domain for SMT, one for MC and then everything else is ignored. No
matter how many clusters have been defined, anything above the core
level is the same (and then I guess you started talking about adding
"packages" on the representation side).


While cpu-map doesn't contain that information today, we can *add* that
information to the cpu-map binding if necessary.


The reason I proposed to have a binding for the scheduler directly is
not only because it's simpler and closer to what really happens in the
code, it also makes more sense to me than the combination of cpu-map
with all the related mappings e.g.  for numa or caches or power
domains etc.

However you are right we could definitely augment cpu-map to include
support for what I'm saying and clean things up, and since you are
open about improving it here is a proposal that I hope you find
interesting:

At first let's get rid of the  nodes, they don't make sense:

thread0 {
 cpu = <&CPU0>;
};

A thread node can't have more than one cpu entry and any properties
should be on the cpu node itself, so it doesn't / can't add any
more information. We could just have an array of cpu nodes on the
 node, it's much cleaner this way.

core0 {
 members = <&CPU0>, <&CPU1>;
};


Hold on. Rather than reinventing things from first principles, can we
please discuss what you want to *achieve*, i.e. what information you
need?

Having a node is not a significant cost, and there are reasons we may
want thread nodes. For example, it means that we can always refer to 
any

level of topology with a phandle, and we might want to describe
thread-affine devices in future.



You can use the phandle of the cpu node, the thread node doesn't add
anything more than complexity to the representation.


There are a tonne of existing bindings that are ugly, but re-inventing
them for taste reasons alone is more costly to the ecosystem than 
simply
using the existing bindings. We avoid re-inventing bindings unless 
there

is a functional problem e.g. cases which they cannot possibly describe.



We are talking about using something for RISC-V and possibly common 
across

different archs and, I don't see why we should keep the ugliness of a
binding spec plus in this case the  node can't possibly describe
anything else than a cpu= alias, it's redundant.

Then let's allow the cluster and core nodes to accept attributes that 
are
common for the cpus they contain. Right now this is considered 
invalid.


For power domains we have a generic binding described on
Documentation/devicetree/bindings/power/power_domain.txt
which basically says that we need to put power-domains = 
attribute on each of the cpu nodes.


FWIW, given this is arguably topological, I'm not personally averse to
describing this in the cpu-map, if that actually gains us more than the
complexity require to support it.

Given we don't do this for device power domains, I suspect that it's
simpler to stick with the existing binding.


The same happens with the capacity binding specified for arm on
Documentation/devicetree/bindings/arm/cpu-capacity.txt
which says we should add the capacity-dmips-mhz on each of the cpu 
nodes.


The cpu-map was intended to expose topological dtails, and this isn't
really a topological property. For example, Arm DynamIQ systems can 
have

heterogeneous CPUs within clusters.

I do not think it's worth moving this, tbh.


The same also happens with the generic numa binding on
Documentation/devicetree/bindings/numa.txt
which says we should add the nuna-node-id on each of the cpu nodes.


Is there a strong gain from moving this?

[...]



Right now with the device tree spec and the above bindings we can
use the information from cpu nodes to infer the cache topology,
the memory topology, the power domain topology etc.

We have of_find_next_cache_node and of_find_last_cache_level for example
that use "next-level-cache" and are used on powerpc (where this spec 
comes

from), that we can further build on top of them (since this is now part
of the  device tree spec anyway), to e.g. populate the levels of cache,
the levels of shared cache and finally create cpu masks for the 
different

cache shari

Re: [RFC 0/2] Add RISC-V cpu topology

2018-11-06 Thread Nick Kossifidis

Στις 2018-11-06 18:20, Mark Rutland έγραψε:

On Tue, Nov 06, 2018 at 05:26:01PM +0200, Nick Kossifidis wrote:

Στις 2018-11-06 16:13, Sudeep Holla έγραψε:
> On Fri, Nov 02, 2018 at 08:58:39PM +0200, Nick Kossifidis wrote:
> > Στις 2018-11-02 01:04, Atish Patra έγραψε:
> > > This patch series adds the cpu topology for RISC-V. It contains
> > > both the DT binding and actual source code. It has been tested on
> > > QEMU & Unleashed board.
> > >
> > > The idea is based on cpu-map in ARM with changes related to how
> > > we define SMT systems. The reason for adopting a similar approach
> > > to ARM as I feel it provides a very clear way of defining the
> > > topology compared to parsing cache nodes to figure out which cpus
> > > share the same package or core.  I am open to any other idea to
> > > implement cpu-topology as well.
> >
> > I was also about to start a discussion about CPU topology on RISC-V
> > after the last swtools group meeting. The goal is to provide the
> > scheduler with hints on how to distribute tasks more efficiently
> > between harts, by populating the scheduling domain topology levels
> > (https://elixir.bootlin.com/linux/v4.19/ident/sched_domain_topology_level).
> > What we want to do is define cpu groups and assign them to
> > scheduling domains with the appropriate SD_ flags
> > 
(https://github.com/torvalds/linux/blob/master/include/linux/sched/topology.h#L16).
>
> OK are we defining a CPU topology binding for Linux scheduler ?
> NACK for all the approaches that assumes any knowledge of OS scheduler.

Is there any standard regarding CPU topology on the device tree spec ?
As far as I know there is none. We are talking about a Linux-specific
Device Tree binding so I don't see why defining a binding for the
Linux scheduler is out of scope.

Speaking as a DT binding maintainer, please avoid OS-specific DT
bindings wherever possible.

While DT bindings live in the kernel tree, they are not intended to be
Linux-specific, and other OSs (e.g. FreeBSD, zephyr) are aiming to
support the same bindings.

In general, targeting a specific OS is a bad idea, because the
implementation details of that OS change over time, or the bindings end
up being tailored to one specific use-case. Exposing details to the OS
such that the OS can make decisions at runtime is typically better.

Do you have cpu-map on other OSes as well ?

There is nothing OS-specific about cpu-map, and it may be of use to
other OSs.

> > So the cores that belong to a scheduling domain may share:
> > CPU capacity (SD_SHARE_CPUCAPACITY / SD_ASYM_CPUCAPACITY)
> > Package resources -e.g. caches, units etc- (SD_SHARE_PKG_RESOURCES)
> > Power domain (SD_SHARE_POWERDOMAIN)
> >
>
> Too Linux kernel/scheduler specific to be part of $subject

All lists on the cc list are Linux specific, again I don't see your
point here are we talking about defining a standard CPU topology
scheme for the device tree spec or a Linux-specific CPU topology
binding such as cpu-map ?

The cpu-map binding is not intended to be Linux specific, and avoids
Linux-specific terminology.

While the cpu-map binding documentation is in the Linux source tree, 
the
binding itseld is not intended to be Linux-specific, and it 
deliberately

avoids Linux implementation details.

Even on this case your point is not valid, the information of two
harts sharing a common power domain or having the same or not
capacity/max frequency (or maybe capabilities/extensions in the
future), is not Linux specific. I just used the Linux specific macros
used by the Linux scheduler to point out the code path.  Even on other
OSes we still need a way to include this information on the CPU
topology, and currently cpu-map doesn't. Also the Linux implementation
of cpu-map ignores multiple levels of shared resources, we only get
one level for SMT and one level for MC last time I checked.

Given clusters can be nested, as in the very first example, I don't see
what prevents multiple levels of shared resources.

Can you please given an example of the topology your considering? Does
that share some resources across clusters at some level?

We are certainly open to improving the cpu-map binding.

Thanks,
Mark.

Mark and Sundeep thanks a lot for your feedback, I guess you convinced 
me
that having a device tree binding for the scheduler is not a correct 
approach.
It's not a device after all and I agree that the device tree shouldn't 
become
an OS configuration file. Regarding multiple levels of shared resources 
my point
is that since cpu-map doesn't contain any information of what is shared 
among
the cluster/core members it's not easy to do any further translation. 
Last time
I checked the arm code that uses cpu-map, it only defines one domain for 
SMT, on

Re: [RFC 0/2] Add RISC-V cpu topology

2018-11-06 Thread Nick Kossifidis

Στις 2018-11-06 16:13, Sudeep Holla έγραψε:

On Fri, Nov 02, 2018 at 08:58:39PM +0200, Nick Kossifidis wrote:

Hello All,

Στις 2018-11-02 01:04, Atish Patra έγραψε:
> This patch series adds the cpu topology for RISC-V. It contains
> both the DT binding and actual source code. It has been tested on
> QEMU & Unleashed board.
>
> The idea is based on cpu-map in ARM with changes related to how
> we define SMT systems. The reason for adopting a similar approach
> to ARM as I feel it provides a very clear way of defining the
> topology compared to parsing cache nodes to figure out which cpus
> share the same package or core.  I am open to any other idea to
> implement cpu-topology as well.
>

I was also about to start a discussion about CPU topology on RISC-V
after the last swtools group meeting. The goal is to provide the
scheduler with hints on how to distribute tasks more efficiently
between harts, by populating the scheduling domain topology levels
(https://elixir.bootlin.com/linux/v4.19/ident/sched_domain_topology_level).
What we want to do is define cpu groups and assign them to
scheduling domains with the appropriate SD_ flags
(https://github.com/torvalds/linux/blob/master/include/linux/sched/topology.h#L16).

OK are we defining a CPU topology binding for Linux scheduler ?
NACK for all the approaches that assumes any knowledge of OS scheduler.

Is there any standard regarding CPU topology on the device tree spec ? 
As far as
I know there is none. We are talking about a Linux-specific Device Tree 
binding
so I don't see why defining a binding for the Linux scheduler is out of 
scope.

Do you have cpu-map on other OSes as well ?

So the cores that belong to a scheduling domain may share:
CPU capacity (SD_SHARE_CPUCAPACITY / SD_ASYM_CPUCAPACITY)
Package resources -e.g. caches, units etc- (SD_SHARE_PKG_RESOURCES)
Power domain (SD_SHARE_POWERDOMAIN)

Too Linux kernel/scheduler specific to be part of $subject

All lists on the cc list are Linux specific, again I don't see your 
point here
are we talking about defining a standard CPU topology scheme for the 
device tree

spec or a Linux-specific CPU topology binding such as cpu-map ?

Even on this case your point is not valid, the information of two harts 
sharing
a common power domain or having the same or not capacity/max frequency 
(or maybe
capabilities/extensions in the future), is not Linux specific. I just 
used the
Linux specific macros used by the Linux scheduler to point out the code 
path.
Even on other OSes we still need a way to include this information on 
the CPU
topology, and currently cpu-map doesn't. Also the Linux implementation 
of cpu-map
ignores multiple levels of shared resources, we only get one level for 
SMT and

one level for MC last time I checked.

In this context I believe using words like "core", "package",
"socket" etc can be misleading. For example the sample topology you
use on the documentation says that there are 4 cores that are part
of a package, however "package" has a different meaning to the
scheduler. Also we don't say anything in case they share a power
domain or if they have the same capacity or not. This mapping deals
only with cache hierarchy or other shared resources.

{Un,}fortunately those are terms used by hardware people.

And they are wrong, how the harts are physically packed doesn't imply 
their
actual topology. In general the "translation" is not always straight 
forward,
there are assumptions in place. We could use "cluster" of harts or 
"group" of

harts instead, they are more abstract.

Regards,
Nick

Re: [RFC 1/2] dt-bindings: topology: Add RISC-V cpu topology.

2018-11-06 Thread Nick Kossifidis


Στις 2018-11-05 21:38, Palmer Dabbelt έγραψε:

On Fri, 02 Nov 2018 06:09:39 PDT (-0700), robh...@kernel.org wrote:
On Thu, Nov 1, 2018 at 6:04 PM Atish Patra  
wrote:


Define a RISC-V cpu topology. This is based on cpu-map in ARM world.
But it doesn't need a separate thread node for defining SMT systems.
Multiple cpu phandle properties can be parsed to identify the sibling
hardware threads. Moreover, we do not have cluster concept in RISC-V.
So package is a better word choice than cluster for RISC-V.


There was a proposal to add package info for ARM recently. Not sure
what happened to that, but we don't need 2 different ways.

There's never going to be clusters for RISC-V? What prevents that?
Seems shortsighted to me.



Signed-off-by: Atish Patra 
---
 .../devicetree/bindings/riscv/topology.txt | 154 
+

 1 file changed, 154 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/riscv/topology.txt


diff --git a/Documentation/devicetree/bindings/riscv/topology.txt 
b/Documentation/devicetree/bindings/riscv/topology.txt

new file mode 100644
index ..96039ed3
--- /dev/null
+++ b/Documentation/devicetree/bindings/riscv/topology.txt
@@ -0,0 +1,154 @@
+===
+RISC-V cpu topology binding description
+===
+
+===
+1 - Introduction
+===
+
+In a RISC-V system, the hierarchy of CPUs can be defined through 
following nodes that

+are used to describe the layout of physical CPUs in the system:
+
+- packages
+- core
+
+The cpu nodes (bindings defined in [1]) represent the devices that
+correspond to physical CPUs and are to be mapped to the hierarchy 
levels.
+Simultaneous multi-threading (SMT) systems can also represent their 
topology
+by defining multiple cpu phandles inside core node. The details are 
explained

+in paragraph 3.


I don't see a reason to do this differently than ARM. That said, I
don't think the thread part is in use on ARM, so it could possibly be
changed.


+
+The remainder of this document provides the topology bindings for 
ARM, based


for ARM?


+on the Devicetree Specification, available from:
+
+https://www.devicetree.org/specifications/
+
+If not stated otherwise, whenever a reference to a cpu node phandle 
is made its
+value must point to a cpu node compliant with the cpu node bindings 
as

+documented in [1].
+A topology description containing phandles to cpu nodes that are not 
compliant

+with bindings standardized in [1] is therefore considered invalid.
+
+This cpu topology binding description is mostly based on the 
topology defined

+in ARM [2].
+===
+2 - cpu-topology node


cpu-map. Why change this?

What I would like to see is the ARM topology binding reworked to be
common or some good reasons why it doesn't work for RISC-V as-is.


I think it would be great if CPU topologies were not a RISC-V specific
thing.  We don't really do anything different than anyone else, so
it'd be great if we could all share the same spec and code.  Looking
quickly at the ARM cpu-map bindings, I don't see any reason why we
can't just use the same thing on RISC-V -- it's not quite how I'd do
it, but I don't think the differences are worth having another
implementation.  Mechanically I'm not sure how to do this: should
there just be a "Documentation/devicetree/bindings/cpu-map.txt"?

If everyone is OK with that then I vote we just go ahead and
genericise the ARM "cpu-map" stuff for CPU topology.  Sharing the
implementation looks fairly straight-forward as well.



Please check this out...
https://lkml.org/lkml/2018/11/3/99

It's also non arch-dependent and it can handle the scheduler's 
capabilities

better than cpu-map.

Re: [RFC 0/2] RISC-V: A proposal to add vendor-specific code

2018-11-05 Thread Nick Kossifidis


Hello Vincent,

Στις 2018-10-31 12:35, Vincent Chen έγραψε:

RISC-V permits each vendor to develop respective extension ISA based
on RISC-V standard ISA. This means that these vendor-specific features
may be compatible to their compiler and CPU. Therefore, each vendor may
be considered a sub-architecture of RISC-V. Currently, vendors do not
have the appropriate examples to add these specific features to the
kernel. In this RFC set, we propose an infrastructure that vendor can
easily hook their specific features into kernel. The first commit is
the main body of this infrastructure. In the second commit, we provide
a solution that allows dma_map_ops() to work without cache coherent
agent support. Cache coherent agent is unsupported for low-end CPUs in
the AndeStar RISC-V series. In order for Linux to run on these CPUs, we
need this solution to overcome the limitation of cache coherent agent
support. Hence, it also can be used as an example for the first commit.

  I am glad to discuss any ideas, so if you have any idea, please give
me some feedback.


So if I got this right, in your case you've added some CSRs 
(ucctlbeginaddr
and ucctlcommand) for marking regions as non-cacheable, and you provide 
your

own get_arch_dma_ops by overriding the default header files with your
vendor-specific ones.

Some things that are IMHO wrong with the proposed approach:

a) By directly modifying your custom CSRs, it means that we will need
compiler support in order to compile a kernel with your code in it. This
will break CI systems and will introduce various issues on testing and
reviewing your code. In general if we need custom toolchains to compile
the kernel, that may be unavailable (vendors will not always open source
their compiler support), we won't be able to maintain a decent level of
code quality in the tree. How can the maintainer push your code on the
repository if he/she can't even perform a basic compilation test ?

In contrast with ARM and others that have a standard set of possible
extensions (e.g. NEON, crc32 etc), and provide compiler support for 
those,

a similar approach is not valid for RISC-V. We could demand that vendors
add compiler support e.g. on gcc before submitting a patch with custom
assembly but I don't think this approach is feasible (one vendor's CSRs
or custom instructions may overlap with another's). I believe we should
just use SBI calls instead and let the firmware (and/or custom userspace
libraries) handle the custom CSRs/assembly instructions.

This is a concern also for lib/ and crypto/ where vendors might want to 
use
their own extensions for providing optimized assembly for their cores. 
It's
not a big deal to use SBI and handle vendor-specific stuff on firmware 
and/or
userspace, it's actually more flexible for the vendors since they can 
have
their own process for maintaining their firmware and releasing it in 
their
own terms/license etc. If we see that the SBI has too much overhead or 
is not

enough we can design it in a better way or extend it. It will still be
standard and easier to maintain than a fragmented ecosystem of mostly
unusable code, inside the mainline kernel.

In case we need to save/store registers related to a custom extension, I
guess we can also have an SBI call for saving/restoring custom registers
to/from S-managed memory and we should be ok. It should also be possible
to do any extra state handling in firmware if needed. I believe we can 
and

should avoid custom assembly on the kernel at all costs !


b) By using CONFIG_VENDOR_FOLDER_NAME you add a compile time dependency 
that
allows this kernel image to be built for a specific vendor. This is 
problematic
in different ways. At first it's not possible to have a kernel image 
that's

generic and can be used for all RISC-V systems. That's what distros want
and that's how they 've been working so far.

Also in case a vendor has many different boards with different 
implementation
details how will this approach help ? It would make more sense to have 
something
like arch/riscv/. Also have in mind that some extensions might 
be
available as IP to vendors so multiple vendors might use the same 
extension
made/licensed from another vendor. I think arch/riscv/ is 
more

appropriate. Since we can use mvendorid/marchid/mimpid to identify that
at runtime maybe we can have some wrapper code that initializes a struct 
of

function pointers early on setup_arch(), allowing vendors to populate it
according to the available extensions in their hw, based on the above 
ids.


We can also just use device-tree for that and mark the used extensions 
there.
We can even pass extension-specific parameters this way (e.g. to save 
CSRs)

in case of extensions that can be parametrized on hw design phase (I'm
thinking of IPs sold from companies with licenses for unlocking the X 
feature

or for supporting up to Y channels/slots/instances/whatever).


In any case it's an interesting subject that we definitely need to think 
about,

Re: [RFC 0/2] Add RISC-V cpu topology

2018-11-02 Thread Nick Kossifidis


Στις 2018-11-02 23:14, Atish Patra έγραψε:

On 11/2/18 11:59 AM, Nick Kossifidis wrote:

Hello All,

Στις 2018-11-02 01:04, Atish Patra έγραψε:

This patch series adds the cpu topology for RISC-V. It contains
both the DT binding and actual source code. It has been tested on
QEMU & Unleashed board.

The idea is based on cpu-map in ARM with changes related to how
we define SMT systems. The reason for adopting a similar approach
to ARM as I feel it provides a very clear way of defining the
topology compared to parsing cache nodes to figure out which cpus
share the same package or core.  I am open to any other idea to
implement cpu-topology as well.



I was also about to start a discussion about CPU topology on RISC-V
after the last swtools group meeting. The goal is to provide the
scheduler with hints on how to distribute tasks more efficiently
between harts, by populating the scheduling domain topology levels
(https://elixir.bootlin.com/linux/v4.19/ident/sched_domain_topology_level).
What we want to do is define cpu groups and assign them to
scheduling domains with the appropriate SD_ flags
(https://github.com/torvalds/linux/blob/master/include/linux/sched/topology.h#L16).



Scheduler domain topology is already getting all the hints in the 
following way.


static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};

#ifdef CONFIG_SCHED_SMT
static inline const struct cpumask *cpu_smt_mask(int cpu)
{
return topology_sibling_cpumask(cpu);
}
#endif

const struct cpumask *cpu_coregroup_mask(int cpu)
{
return &cpu_topology[cpu].core_sibling;
}




That's a static definition of two scheduling domains that only deal
with SMT and MC, the only difference between them is the
SD_SHARE_PKG_RESOURCES flag. You can't even have multiple levels
of shared resources this way, whatever you have larger than a core
is ignored (it just goes to the MC domain). There is also no handling
of SD_SHARE_POWERDOMAIN or SD_SHARE_CPUCAPACITY.


So the cores that belong to a scheduling domain may share:
CPU capacity (SD_SHARE_CPUCAPACITY / SD_ASYM_CPUCAPACITY)
Package resources -e.g. caches, units etc- (SD_SHARE_PKG_RESOURCES)
Power domain (SD_SHARE_POWERDOMAIN)

In this context I believe using words like "core", "package",
"socket" etc can be misleading. For example the sample topology you
use on the documentation says that there are 4 cores that are part
of a package, however "package" has a different meaning to the
scheduler. Also we don't say anything in case they share a power
domain or if they have the same capacity or not. This mapping deals
only with cache hierarchy or other shared resources.

How about defining a dt scheme to describe the scheduler domain
topology levels instead ? e.g:

2 sets (or clusters if you prefer) of 2 SMT cores, each set with
a different capacity and power domain:

sched_topology {
   level0 { // SMT
shared = "power", "capacity", "resources";
group0 {
 members = <&hart0>, <&hart1>;
}
group1 {
 members = <&hart2>, <&hart3>;
}
group2 {
 members = <&hart4>, <&hart5>;
}
group3 {
 members = <&hart6>, <&hart7>;
}
   }
   level1 { // MC
shared = "power", "capacity"
group0 {
 members = <&hart0>, <&hart1>, <&hart2>, <&hart3>;
}
group1 {
 members = <&hart4>, <&hart5>, <&hart6>, <&hart7>;
}
   }
   top_level { // A group with all harts in it
shared = "" // There is nothing common for ALL harts, we could 
have

capacity here
   }
}



I agree that naming could have been better in the past. But it is what
it is now. I don't see any big advantages in this approach compared to
the existing approach where DT specifies what hardware looks like and
scheduler sets up it's domain based on different cpumasks.



It is what it is on ARM, it doesn't have to be the same on RISC-V, 
anyway
the name is a minor issue. The advantage of this approach is that you 
define the
scheduling domains on the device tree without needing a "translation" of 
a
topology map to scheduling domains. It can handle any scenario the 
scheduler
can handle, using all the available flags. In your approach no matter 
what

gets put to the device tree, the only hint the scheduler will get is one
level of SMT, one level of MC and the rest of the system. No power 
domain
sharing, no asymmetric scheduling, no multiple levels possible. Many 
features
of the scheduler remain unused. This approach can also get extended more 
easily

to e.g. support NUMA nodes and associate memory regions with groups.

Regards,
Nick

Re: [RFC 0/2] Add RISC-V cpu topology

2018-11-02 Thread Nick Kossifidis


Hello All,

Στις 2018-11-02 01:04, Atish Patra έγραψε:

This patch series adds the cpu topology for RISC-V. It contains
both the DT binding and actual source code. It has been tested on
QEMU & Unleashed board.

The idea is based on cpu-map in ARM with changes related to how
we define SMT systems. The reason for adopting a similar approach
to ARM as I feel it provides a very clear way of defining the
topology compared to parsing cache nodes to figure out which cpus
share the same package or core.  I am open to any other idea to
implement cpu-topology as well.



I was also about to start a discussion about CPU topology on RISC-V
after the last swtools group meeting. The goal is to provide the
scheduler with hints on how to distribute tasks more efficiently
between harts, by populating the scheduling domain topology levels
(https://elixir.bootlin.com/linux/v4.19/ident/sched_domain_topology_level).
What we want to do is define cpu groups and assign them to
scheduling domains with the appropriate SD_ flags
(https://github.com/torvalds/linux/blob/master/include/linux/sched/topology.h#L16).

So the cores that belong to a scheduling domain may share:
CPU capacity (SD_SHARE_CPUCAPACITY / SD_ASYM_CPUCAPACITY)
Package resources -e.g. caches, units etc- (SD_SHARE_PKG_RESOURCES)
Power domain (SD_SHARE_POWERDOMAIN)

In this context I believe using words like "core", "package",
"socket" etc can be misleading. For example the sample topology you
use on the documentation says that there are 4 cores that are part
of a package, however "package" has a different meaning to the
scheduler. Also we don't say anything in case they share a power
domain or if they have the same capacity or not. This mapping deals
only with cache hierarchy or other shared resources.

How about defining a dt scheme to describe the scheduler domain
topology levels instead ? e.g:

2 sets (or clusters if you prefer) of 2 SMT cores, each set with
a different capacity and power domain:

sched_topology {
 level0 { // SMT
  shared = "power", "capacity", "resources";
  group0 {
   members = <&hart0>, <&hart1>;
  }
  group1 {
   members = <&hart2>, <&hart3>;
  }
  group2 {
   members = <&hart4>, <&hart5>;
  }
  group3 {
   members = <&hart6>, <&hart7>;
  }
 }
 level1 { // MC
  shared = "power", "capacity"
  group0 {
   members = <&hart0>, <&hart1>, <&hart2>, <&hart3>;
  }
  group1 {
   members = <&hart4>, <&hart5>, <&hart6>, <&hart7>;
  }
 }
 top_level { // A group with all harts in it
  shared = "" // There is nothing common for ALL harts, we could have 
capacity here

 }
}

Regards,
Nick

Re: [PATCH] ath5k: cleanup channel to eprom_mode function

2013-02-26 Thread Nick Kossifidis

net/wireless/ath/ath5k/reset.c
> +++ b/drivers/net/wireless/ath/ath5k/reset.c
> @@ -984,9 +984,7 @@ ath5k_hw_commit_eeprom_settings(struct ath5k_hw *ah,
>   if (ah->ah_version == AR5K_AR5210)
>   return;
>
> - ee_mode = ath5k_eeprom_mode_from_channel(channel);
> - if (WARN_ON(ee_mode < 0))
> - return;
> + ee_mode = ath5k_eeprom_mode_from_channel(ah, channel);
>
>   /* Adjust power delta for channel 14 */
>   if (channel->center_freq == 2484)

Acked-by: Nick Kossifidis 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC][PATCH] Entropy generator with 100 kB/s throughput

2013-02-22 Thread Nick Kossifidis

I'm so sorry, something went terribly wrong with gmail/thunderbird :-(

2013/2/22 Nick Kossifidis :
> Hello all,
>
> It's nice to see there is still discussion on the matter of using cpu
> timings for entropy. In general using cpu timings for gathering entropy is a
> nice idea but it's not that unpredictable. If someone can profile the system
> he/she can get enough infos to predict (to some point) the generator's
> outcome, especially during boot/reboot. You might pass the tests on a single
> run but if you try to compare the runs e.g. when booting the system multiple
> times you'll see they are correlated.
>
> Take a look at this:
>
> http://dl.dropbox.com/u/11670313/runtime-data.tar.bz2
>
> It's the output of this patch, before passing data to the entropy pool (btw
> did anyone review this patch or should I resend it ?):
>
> https://patchwork.kernel.org/patch/1759821/
>
> If you plot the datasets you'll see e.g. that across reboots you get a very
> similar distribution. It is somehow different across boots but still there
> is some correlation there too, notice the difference when fsck runs, still
> not much.
>
> The distribution is good in general, for mixing it with the rest in the
> system's entropy pool, but on its own I don't think it's enough, especially
> without crypto post-processing.



-- 
GPG ID: 0xEE878588
As you read this post global entropy rises. Have Fun ;-)
Nick
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] NET: ath5k, check ath5k_eeprom_mode_from_channel retval

2013-02-19 Thread Nick Kossifidis

On Tue Feb 19 15:36:07 2013, Jiri Slaby wrote:
> On 02/18/2013 01:47 AM, Nick Kossifidis wrote:
>> int
>> ath5k_eeprom_mode_from_channel(struct ieee80211_channel *channel)
>> {
>> switch (channel->hw_value) {
>> case AR5K_MODE_11A:
>> return AR5K_EEPROM_MODE_11A;
>> case AR5K_MODE_11G:
>> return AR5K_EEPROM_MODE_11G;
>> case AR5K_MODE_11B:
>> return AR5K_EEPROM_MODE_11B;
>> default:
>> return -1;
>> }
>> }
>>
>> I think we should just change that default to return 0 instead and add
>> an ATH5K_WARN there.
>
> Something like the attached patch? It needs ah to be propagated to
> eeprom. If you are fine with that, I'll send it as patch...
>
> thanks,

Just move the prototype on ath5k.h with the rest of them...

1523 /* EEPROM access functions */
1524 int ath5k_eeprom_init(struct ath5k_hw *ah);
1525 void ath5k_eeprom_detach(struct ath5k_hw *ah);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] NET: ath5k, check ath5k_eeprom_mode_from_channel retval

2013-02-17 Thread Nick Kossifidis

2013/2/7 Jiri Slaby :
> It can, if invalid argument given, return a negative value. In that
> case we would access arrays out-of-bounds and such. Check the value
> and yell loudly if that happened as it would be a bug in the
> implementation. (Instead of silently corrupting memory.)
>
> Signed-off-by: Jiri Slaby 
> Cc: Nick Kossifidis 
> Cc: "Luis R. Rodriguez" 
> ---
>  drivers/net/wireless/ath/ath5k/phy.c   | 4 
>  drivers/net/wireless/ath/ath5k/reset.c | 2 ++
>  2 files changed, 6 insertions(+)
>
> diff --git a/drivers/net/wireless/ath/ath5k/phy.c 
> b/drivers/net/wireless/ath/ath5k/phy.c
> index ab363f3..a78afa9 100644
> --- a/drivers/net/wireless/ath/ath5k/phy.c
> +++ b/drivers/net/wireless/ath/ath5k/phy.c
> @@ -1613,6 +1613,10 @@ ath5k_hw_update_noise_floor(struct ath5k_hw *ah)
> ah->ah_cal_mask |= AR5K_CALIBRATION_NF;
>
> ee_mode = ath5k_eeprom_mode_from_channel(ah->ah_current_channel);
> +   if (WARN_ON(ee_mode < 0)) {
> +   ah->ah_cal_mask &= ~AR5K_CALIBRATION_NF;
> +   return;
> +   }
>
> /* completed NF calibration, test threshold */
> nf = ath5k_hw_read_measured_noise_floor(ah);
> diff --git a/drivers/net/wireless/ath/ath5k/reset.c 
> b/drivers/net/wireless/ath/ath5k/reset.c
> index 4084b10..e2d8b2c 100644
> --- a/drivers/net/wireless/ath/ath5k/reset.c
> +++ b/drivers/net/wireless/ath/ath5k/reset.c
> @@ -985,6 +985,8 @@ ath5k_hw_commit_eeprom_settings(struct ath5k_hw *ah,
> return;
>
> ee_mode = ath5k_eeprom_mode_from_channel(channel);
> +   if (WARN_ON(ee_mode < 0))
> +   return;
>
> /* Adjust power delta for channel 14 */
> if (channel->center_freq == 2484)

int
ath5k_eeprom_mode_from_channel(struct ieee80211_channel *channel)
{
switch (channel->hw_value) {
case AR5K_MODE_11A:
return AR5K_EEPROM_MODE_11A;
case AR5K_MODE_11G:
return AR5K_EEPROM_MODE_11G;
case AR5K_MODE_11B:
return AR5K_EEPROM_MODE_11B;
default:
return -1;
}
}

I think we should just change that default to return 0 instead and add
an ATH5K_WARN there.



-- 
GPG ID: 0xEE878588
As you read this post global entropy rises. Have Fun ;-)
Nick
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] random: Mix cputime from each thread that exits to the pool

2012-11-17 Thread Nick Kossifidis

When a thread exits mix it's cputime (userspace + kernelspace) to the entropy 
pool.

We don't know how "random" this is, so we use add_device_randomness that 
doesn't mess
with entropy count.

Signed-off-by: Nick Kossifidis 
---
 kernel/posix-cpu-timers.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 125cb67..f07827a 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -494,6 +495,8 @@ static void cleanup_timers(struct list_head *head,
  */
 void posix_cpu_timers_exit(struct task_struct *tsk)
 {
+   add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
+   sizeof(unsigned long long));
cleanup_timers(tsk->cpu_timers,
   tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
 
-- 
1.7.8.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/3] WDEV, ath5k, don't return int from bool function

2008-02-15 Thread Nick Kossifidis

2008/2/15, Jiri Slaby <[EMAIL PROTECTED]>:
> sparse sees int -> bool cast as an error:
>  hw.c:3754:10: warning: cast truncates bits from constant value (ffea 
> becomes 0)
>  Fix it by converting the rettype to int and check appropriately.
>
>  Signed-off-by: Jiri Slaby <[EMAIL PROTECTED]>
>  Cc: Nick Kossifidis <[EMAIL PROTECTED]>
>  Cc: Luis R. Rodriguez <[EMAIL PROTECTED]>
>  ---
>   drivers/net/wireless/ath5k/ath5k.h |2 +-
>   drivers/net/wireless/ath5k/base.c  |5 -
>   drivers/net/wireless/ath5k/hw.c|8 
>   3 files changed, 9 insertions(+), 6 deletions(-)
>
>  diff --git a/drivers/net/wireless/ath5k/ath5k.h 
> b/drivers/net/wireless/ath5k/ath5k.h
>  index c79066b..69dea33 100644
>  --- a/drivers/net/wireless/ath5k/ath5k.h
>  +++ b/drivers/net/wireless/ath5k/ath5k.h
>  @@ -1035,7 +1035,7 @@ struct ath5k_hw {
> unsigned int, unsigned int, enum ath5k_pkt_type, unsigned int,
> unsigned int, unsigned int, unsigned int, unsigned int,
> unsigned int, unsigned int, unsigned int);
>  -   bool (*ah_setup_xtx_desc)(struct ath5k_hw *, struct ath5k_desc *,
>  +   int (*ah_setup_xtx_desc)(struct ath5k_hw *, struct ath5k_desc *,
> unsigned int, unsigned int, unsigned int, unsigned int,
> unsigned int, unsigned int);
> int (*ah_proc_tx_desc)(struct ath5k_hw *, struct ath5k_desc *);
>  diff --git a/drivers/net/wireless/ath5k/base.c 
> b/drivers/net/wireless/ath5k/base.c
>  index 49d38e8..59e5d56 100644
>  --- a/drivers/net/wireless/ath5k/base.c
>  +++ b/drivers/net/wireless/ath5k/base.c
>  @@ -668,7 +668,10 @@ ath5k_attach(struct pci_dev *pdev, struct ieee80211_hw 
> *hw)
>  * return false w/o doing anything.  MAC's that do
>  * support it will return true w/o doing anything.
>  */
>  -   if (ah->ah_setup_xtx_desc(ah, NULL, 0, 0, 0, 0, 0, 0))
>  +   ret = ah->ah_setup_xtx_desc(ah, NULL, 0, 0, 0, 0, 0, 0);
>  +   if (ret < 0)
>  +   goto err;
>  +   if (ret > 0)
> __set_bit(ATH_STAT_MRRETRY, sc->status);
>
> /*
>  diff --git a/drivers/net/wireless/ath5k/hw.c 
> b/drivers/net/wireless/ath5k/hw.c
>  index 93a75f2..463413a 100644
>  --- a/drivers/net/wireless/ath5k/hw.c
>  +++ b/drivers/net/wireless/ath5k/hw.c
>  @@ -45,7 +45,7 @@ static int ath5k_hw_setup_4word_tx_desc(struct ath5k_hw *, 
> struct ath5k_desc *,
> unsigned int, unsigned int, enum ath5k_pkt_type, unsigned int,
> unsigned int, unsigned int, unsigned int, unsigned int, unsigned int,
> unsigned int, unsigned int);
>  -static bool ath5k_hw_setup_xr_tx_desc(struct ath5k_hw *, struct ath5k_desc 
> *,
>  +static int ath5k_hw_setup_xr_tx_desc(struct ath5k_hw *, struct ath5k_desc *,
> unsigned int, unsigned int, unsigned int, unsigned int, unsigned int,
> unsigned int);
>   static int ath5k_hw_proc_4word_tx_status(struct ath5k_hw *, struct 
> ath5k_desc *);
>  @@ -3733,7 +3733,7 @@ static int ath5k_hw_setup_4word_tx_desc(struct 
> ath5k_hw *ah,
>   /*
>   * Initialize a 4-word multirate tx descriptor on 5212
>   */
>  -static bool
>  +static int
>   ath5k_hw_setup_xr_tx_desc(struct ath5k_hw *ah, struct ath5k_desc *desc,
> unsigned int tx_rate1, u_int tx_tries1, u_int tx_rate2, u_int 
> tx_tries2,
> unsigned int tx_rate3, u_int tx_tries3)
>  @@ -3773,10 +3773,10 @@ ath5k_hw_setup_xr_tx_desc(struct ath5k_hw *ah, 
> struct ath5k_desc *desc,
>
>   #undef _XTX_TRIES
>
>  -   return true;
>  +   return 1;
> }
>
>  -   return false;
>  +   return 0;
>   }
>
>   /*
>


Acked-by: Nick Kossifidis <[EMAIL PROTECTED]>


-- 
GPG ID: 0xD21DB2DB
As you read this post global entropy rises. Have Fun ;-)
Nick
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] WDEV: ath5k, fix lock imbalance

2008-02-15 Thread Nick Kossifidis

2008/2/15, Jiri Slaby <[EMAIL PROTECTED]>:
> Omitted lock causes sparse warning
>  drivers/net/wireless/ath5k/base.c:1682:1: warning: context imbalance in 
> 'ath5k_tasklet_rx' - different lock contexts for basic block
>
>  Add the lock to the guilty fail path.
>
>  Signed-off-by: Jiri Slaby <[EMAIL PROTECTED]>
>  Cc: Nick Kossifidis <[EMAIL PROTECTED]>
>  Cc: Luis R. Rodriguez <[EMAIL PROTECTED]>
>  ---
>   drivers/net/wireless/ath5k/base.c |1 +
>   1 files changed, 1 insertions(+), 0 deletions(-)
>
>  diff --git a/drivers/net/wireless/ath5k/base.c 
> b/drivers/net/wireless/ath5k/base.c
>  index ddc8714..49d38e8 100644
>  --- a/drivers/net/wireless/ath5k/base.c
>  +++ b/drivers/net/wireless/ath5k/base.c
>  @@ -1715,6 +1715,7 @@ ath5k_tasklet_rx(unsigned long data)
> break;
> else if (unlikely(ret)) {
> ATH5K_ERR(sc, "error in processing rx descriptor\n");
>  +   spin_unlock(&sc->rxbuflock);
> return;
> }
>
>


Acked-by: Nick Kossifidis <[EMAIL PROTECTED]>


-- 
GPG ID: 0xD21DB2DB
As you read this post global entropy rises. Have Fun ;-)
Nick
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/1] WDEV: ath5k, test single chip before reset

2008-02-15 Thread Nick Kossifidis

2008/2/15, Jiri Slaby <[EMAIL PROTECTED]>:
> Move ath5k_hw_nic_wakeup after ah_single_chip being set, because we
>  test the value in there and decides whether reset or not.
>
>  Signed-off-by: Jiri Slaby <[EMAIL PROTECTED]>
>  Cc: Nick Kossifidis <[EMAIL PROTECTED]>
>  Cc: Luis R. Rodriguez <[EMAIL PROTECTED]>
>  ---
>   drivers/net/wireless/ath5k/hw.c |   21 ++---
>   1 files changed, 10 insertions(+), 11 deletions(-)
>
>  diff --git a/drivers/net/wireless/ath5k/hw.c 
> b/drivers/net/wireless/ath5k/hw.c
>  index 3a4bf40..9cdd27f 100644
>  --- a/drivers/net/wireless/ath5k/hw.c
>  +++ b/drivers/net/wireless/ath5k/hw.c
>  @@ -181,11 +181,6 @@ struct ath5k_hw *ath5k_hw_attach(struct ath5k_softc 
> *sc, u8 mac_version)
> else if (ah->ah_version <= AR5K_AR5211)
> ah->ah_proc_rx_desc = ath5k_hw_proc_old_rx_status;
>
>  -   /* Bring device out of sleep and reset it's units */
>  -   ret = ath5k_hw_nic_wakeup(ah, AR5K_INIT_MODE, true);
>  -   if (ret)
>  -   goto err_free;
>  -
> /* Get MAC, PHY and RADIO revisions */
> srev = ath5k_hw_reg_read(ah, AR5K_SREV);
> ah->ah_mac_srev = srev;
>  @@ -210,12 +205,13 @@ struct ath5k_hw *ath5k_hw_attach(struct ath5k_softc 
> *sc, u8 mac_version)
> }
>
> /* Identify single chip solutions */
>  -   if((srev <= AR5K_SREV_VER_AR5414) &&
>  -   (srev >= AR5K_SREV_VER_AR2424)) {
>  -   ah->ah_single_chip = true;
>  -   } else {
>  -   ah->ah_single_chip = false;
>  -   }
>  +   ah->ah_single_chip = srev >= AR5K_SREV_VER_AR2424 &&
>  +   srev <= AR5K_SREV_VER_AR5414;
>  +
>  +   /* Bring device out of sleep and reset it's units */
>  +   ret = ath5k_hw_nic_wakeup(ah, AR5K_INIT_MODE, true);
>  +   if (ret)
>  +   goto err_free;
>
> /* Single chip radio */
> if (ah->ah_radio_2ghz_revision == ah->ah_radio_5ghz_revision)
>

NACK, single chip flag is going out since only needed check is for
5424/2424 during nic_wakeup, 5413/2413 behave like the rest
5211/5212-combatible cards.

Sorry for delaying the relevant patches, i'm working with the 2413
card i got from Mike to make it work and it'll take some time (not as
easy as i thought, i probably miss something)...


-- 
GPG ID: 0xD21DB2DB
As you read this post global entropy rises. Have Fun ;-)
Nick
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] wireless/ath5k: renamed to ath5k_pci_driver to fix Section mismatch warnings

2008-02-02 Thread Nick Kossifidis

2008/2/2, Denis Cheng <[EMAIL PROTECTED]>:
> the struct pci_driver refered ath5k_pci_id_table which in __devinit section,
> the sparse tool suggest this renamed to "*driver", kills mismatch warnings.
>
> Signed-off-by: Denis Cheng <[EMAIL PROTECTED]>
> ---
>  drivers/net/wireless/ath5k/base.c |6 +++---
>  1 files changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/wireless/ath5k/base.c 
> b/drivers/net/wireless/ath5k/base.c
> index d6599d2..ddc8714 100644
> --- a/drivers/net/wireless/ath5k/base.c
> +++ b/drivers/net/wireless/ath5k/base.c
> @@ -153,7 +153,7 @@ static int  ath5k_pci_resume(struct pci_dev 
> *pdev);
>  #define ath5k_pci_resume NULL
>  #endif /* CONFIG_PM */
>
> -static struct pci_driver ath5k_pci_drv_id = {
> +static struct pci_driver ath5k_pci_driver = {
> .name   = "ath5k_pci",
> .id_table   = ath5k_pci_id_table,
> .probe  = ath5k_pci_probe,
> @@ -329,7 +329,7 @@ init_ath5k_pci(void)
>
> ath5k_debug_init();
>
> -   ret = pci_register_driver(&ath5k_pci_drv_id);
> +   ret = pci_register_driver(&ath5k_pci_driver);
> if (ret) {
> printk(KERN_ERR "ath5k_pci: can't register pci driver\n");
> return ret;
> @@ -341,7 +341,7 @@ init_ath5k_pci(void)
>  static void __exit
>  exit_ath5k_pci(void)
>  {
> -   pci_unregister_driver(&ath5k_pci_drv_id);
> +   pci_unregister_driver(&ath5k_pci_driver);
>
> ath5k_debug_finish();
>  }
> --
> 1.5.3.8
>
>

Thanx ;-)

Acked-by: Nick Kossifidis <[EMAIL PROTECTED]>

-- 
GPG ID: 0xD21DB2DB
As you read this post global entropy rises. Have Fun ;-)
Nick
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 2.6.24-rc5-mm1 ath5k build issue

2007-12-14 Thread Nick Kossifidis

2007/12/14, Dave Young <[EMAIL PROTECTED]>:
> On Dec 10, 2007 1:55 AM, Nick Kossifidis <[EMAIL PROTECTED]> wrote:
> > 2007/12/7, Dave Young <[EMAIL PROTECTED]>:
> >
> > > Hi,
> > >
> > > 2.6.24-rc4-mm1 build failed at drivers/net/wireless/ath5k/base.c for some 
> > > inline functions like this:
> > > drivers/net/wireless/ath5k/base.c:292: sorry, unimplemented: inlining 
> > > failed in call to 'ath5k_extend_tsf': function body not available
> > >
> > > fix it with adjust the order of inline function body.
> > >
> > > Signed-off-by: Dave Young <[EMAIL PROTECTED]>
> > >
> > > ---
> > > drivers/net/wireless/ath5k/base.c |   67 
> > > --
> > > 1 file changed, 29 insertions(+), 38 deletions(-)
> > >
> > > diff -upr linux/drivers/net/wireless/ath5k/base.c 
> > > linux.new/drivers/net/wireless/ath5k/base.c
> > > --- linux/drivers/net/wireless/ath5k/base.c 2007-12-07 
> > > 10:01:42.0 +0800
> > > +++ linux.new/drivers/net/wireless/ath5k/base.c 2007-12-07 
> > > 10:01:49.0 +0800
> > > @@ -250,8 +250,19 @@ static int ath5k_rxbuf_setup(struct ath
> > >  static int ath5k_txbuf_setup(struct ath5k_softc *sc,
> > > struct ath5k_buf *bf,
> > > struct ieee80211_tx_control *ctl);
> > > +
> > >  static inline void ath5k_txbuf_free(struct ath5k_softc *sc,
> > > -   struct ath5k_buf *bf);
> > > +   struct ath5k_buf *bf)
> > > +{
> > > +   BUG_ON(!bf);
> > > +   if (!bf->skb)
> > > +   return;
> > > +   pci_unmap_single(sc->pdev, bf->skbaddr, bf->skb->len,
> > > +   PCI_DMA_TODEVICE);
> > > +   dev_kfree_skb(bf->skb);
> > > +   bf->skb = NULL;
> > > +}
> > > +
> > >  /* Queues setup */
> > >  static struct  ath5k_txq *ath5k_txq_setup(struct ath5k_softc *sc,
> > > int qtype, int subtype);
> > > @@ -278,14 +289,29 @@ static intath5k_beacon_setup(struct at
> > > struct ieee80211_tx_control *ctl);
> > >  static voidath5k_beacon_send(struct ath5k_softc *sc);
> > >  static voidath5k_beacon_config(struct ath5k_softc *sc);
> > > -static inline u64 ath5k_extend_tsf(struct ath5k_hw *ah, u32 rstamp);
> > > +
> > > +static inline u64 ath5k_extend_tsf(struct ath5k_hw *ah, u32 rstamp)
> > > +{
> > > +   u64 tsf = ath5k_hw_get_tsf64(ah);
> > > +
> > > +   if ((tsf & 0x7fff) < rstamp)
> > > +   tsf -= 0x8000;
> > > +
> > > +   return (tsf & ~0x7fff) | rstamp;
> > > +}
> > > +
> > >  /* Interrupt handling */
> > >  static int ath5k_init(struct ath5k_softc *sc);
> > >  static int ath5k_stop_locked(struct ath5k_softc *sc);
> > >  static int ath5k_stop_hw(struct ath5k_softc *sc);
> > >  static irqreturn_t ath5k_intr(int irq, void *dev_id);
> > >  static voidath5k_tasklet_reset(unsigned long data);
> > > -static inline void ath5k_update_txpow(struct ath5k_softc *sc);
> > > +
> > > +static inline void ath5k_update_txpow(struct ath5k_softc *sc)
> > > +{
> > > +   ath5k_hw_set_txpower_limit(sc->ah, 0);
> > > +}
> > > +
> > >  static voidath5k_calibrate(unsigned long data);
> > >  /* LED functions */
> > >  static voidath5k_led_off(unsigned long data);
> > > @@ -1341,21 +1367,6 @@ err_unmap:
> > > return ret;
> > >  }
> > >
> > > -static inline void
> > > -ath5k_txbuf_free(struct ath5k_softc *sc, struct ath5k_buf *bf)
> > > -{
> > > -   BUG_ON(!bf);
> > > -   if (!bf->skb)
> > > -   return;
> > > -   pci_unmap_single(sc->pdev, bf->skbaddr, bf->skb->len,
> > > -   PCI_DMA_TODEVICE);
> > > -   dev_kfree_skb(bf->skb);
> > > -   bf->skb = NULL;
> > > -}
> > > -
> > > -
> > > -
> > > -
> > >  /**\
> > >  * Queues setup *
> > >  \**/
> > > @@ -2046,20 +2057,6 @@ ath5k_beacon_config(struct ath5k_softc *
> > >  #undef TSF_TO_TU
> &g

Re: 2.6.24-rc4-mm1

2007-12-09 Thread Nick Kossifidis

2007/12/7, Dave Young <[EMAIL PROTECTED]>:
> Hi,
>
> 2.6.24-rc4-mm1 build failed at drivers/net/wireless/ath5k/base.c for some 
> inline functions like this:
> drivers/net/wireless/ath5k/base.c:292: sorry, unimplemented: inlining failed 
> in call to 'ath5k_extend_tsf': function body not available
>
> fix it with adjust the order of inline function body.
>
> Signed-off-by: Dave Young <[EMAIL PROTECTED]>
>
> ---
> drivers/net/wireless/ath5k/base.c |   67 
> --
> 1 file changed, 29 insertions(+), 38 deletions(-)
>
> diff -upr linux/drivers/net/wireless/ath5k/base.c 
> linux.new/drivers/net/wireless/ath5k/base.c
> --- linux/drivers/net/wireless/ath5k/base.c 2007-12-07 10:01:42.0 
> +0800
> +++ linux.new/drivers/net/wireless/ath5k/base.c 2007-12-07 10:01:49.0 
> +0800
> @@ -250,8 +250,19 @@ static int ath5k_rxbuf_setup(struct ath
>  static int ath5k_txbuf_setup(struct ath5k_softc *sc,
> struct ath5k_buf *bf,
> struct ieee80211_tx_control *ctl);
> +
>  static inline void ath5k_txbuf_free(struct ath5k_softc *sc,
> -   struct ath5k_buf *bf);
> +   struct ath5k_buf *bf)
> +{
> +   BUG_ON(!bf);
> +   if (!bf->skb)
> +   return;
> +   pci_unmap_single(sc->pdev, bf->skbaddr, bf->skb->len,
> +   PCI_DMA_TODEVICE);
> +   dev_kfree_skb(bf->skb);
> +   bf->skb = NULL;
> +}
> +
>  /* Queues setup */
>  static struct  ath5k_txq *ath5k_txq_setup(struct ath5k_softc *sc,
> int qtype, int subtype);
> @@ -278,14 +289,29 @@ static intath5k_beacon_setup(struct at
> struct ieee80211_tx_control *ctl);
>  static voidath5k_beacon_send(struct ath5k_softc *sc);
>  static voidath5k_beacon_config(struct ath5k_softc *sc);
> -static inline u64 ath5k_extend_tsf(struct ath5k_hw *ah, u32 rstamp);
> +
> +static inline u64 ath5k_extend_tsf(struct ath5k_hw *ah, u32 rstamp)
> +{
> +   u64 tsf = ath5k_hw_get_tsf64(ah);
> +
> +   if ((tsf & 0x7fff) < rstamp)
> +   tsf -= 0x8000;
> +
> +   return (tsf & ~0x7fff) | rstamp;
> +}
> +
>  /* Interrupt handling */
>  static int ath5k_init(struct ath5k_softc *sc);
>  static int ath5k_stop_locked(struct ath5k_softc *sc);
>  static int ath5k_stop_hw(struct ath5k_softc *sc);
>  static irqreturn_t ath5k_intr(int irq, void *dev_id);
>  static voidath5k_tasklet_reset(unsigned long data);
> -static inline void ath5k_update_txpow(struct ath5k_softc *sc);
> +
> +static inline void ath5k_update_txpow(struct ath5k_softc *sc)
> +{
> +   ath5k_hw_set_txpower_limit(sc->ah, 0);
> +}
> +
>  static voidath5k_calibrate(unsigned long data);
>  /* LED functions */
>  static voidath5k_led_off(unsigned long data);
> @@ -1341,21 +1367,6 @@ err_unmap:
> return ret;
>  }
>
> -static inline void
> -ath5k_txbuf_free(struct ath5k_softc *sc, struct ath5k_buf *bf)
> -{
> -   BUG_ON(!bf);
> -   if (!bf->skb)
> -   return;
> -   pci_unmap_single(sc->pdev, bf->skbaddr, bf->skb->len,
> -   PCI_DMA_TODEVICE);
> -   dev_kfree_skb(bf->skb);
> -   bf->skb = NULL;
> -}
> -
> -
> -
> -
>  /**\
>  * Queues setup *
>  \**/
> @@ -2046,20 +2057,6 @@ ath5k_beacon_config(struct ath5k_softc *
>  #undef TSF_TO_TU
>  }
>
> -static inline
> -u64 ath5k_extend_tsf(struct ath5k_hw *ah, u32 rstamp)
> -{
> -   u64 tsf = ath5k_hw_get_tsf64(ah);
> -
> -   if ((tsf & 0x7fff) < rstamp)
> -   tsf -= 0x8000;
> -
> -   return (tsf & ~0x7fff) | rstamp;
> -}
> -
> -
> -
> -
>  /\
>  * Interrupt handling *
>  \/
> @@ -2295,12 +2292,6 @@ ath5k_tasklet_reset(unsigned long data)
> ath5k_reset(sc->hw);
>  }
>
> -static inline void
> -ath5k_update_txpow(struct ath5k_softc *sc)
> -{
> -   ath5k_hw_set_txpower_limit(sc->ah, 0);
> -}
> -
>  /*
>   * Periodically recalibrate the PHY to account
>   * for temperature/environment changes.
>

We'll change their order in the code, plz keep prototype declarations
clean. I'll submit a patch asap on this.

-- 
GPG ID: 0xD21DB2DB
As you read this post global entropy rises. Have Fun ;-)
Nick
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Clarify pci_iomap() usage for MMIO-only devices

2007-09-19 Thread Nick Kossifidis

2007/9/19, Alan Cox <[EMAIL PROTECTED]>:
> > If you use ioport_map/unmap, then you really *should* access them with the
> > proper iomem accessors (ioread/iowrite). The fact that it may happen to
> > work (when using the default lib/iomap.c implementation, at least) on
> > some architectures and with the current implementation still doesn't mean
> > that you should necessarily use readb/writeb.
>
> Another reason we should enforce the use of ioread/iowrite is that some
> platforms do daft things like map mmio type devices through indirect
> register access. Sparc PCMCIA is a classic example. The only sane way to
> make these work is to require ioread/iowrite is used with iomap
>

Hmm, we 've had bug reports about driver (madwifi-old-openhal) not
working on Sparc64, can someone test if this is the case (the fact
that we used readl/writel instead of ioread/iowrite on
madwifi-old-openhal) ? It was really weird because we handle
big-endian machines ok (madwifi-old-openhal on ppc worked fine).

(ath5k was based on madwifi-old-openhal)

-- 
GPG ID: 0xD21DB2DB
As you read this post global entropy rises. Have Fun ;-)
Nick
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/5] Net: ath5k, use int as retval

2007-09-02 Thread Nick Kossifidis

2007/9/1, Jiri Slaby <[EMAIL PROTECTED]>:
> John W. Linville napsal(a):
> > On Tue, Aug 28, 2007 at 12:00:09PM -0400, Jiri Slaby wrote:
> >> ath5k, use int as retval
> >>
> >> Convert some functions to return int and proper negative return value on
> >> error as we are used to.
> >
> > Since I didn't apply 1/5, this one didn't apply either.  It seems
> > fine overall, so if you rediff I'll be happy to apply.
>
> Ok, I'll do it, thanks,

Can somebody commit my resent changes from madwifi-svn (cleanups,
kconfig, remove_hw_ from filenames etc) ? I don't have git repository
;-(


-- 
GPG ID: 0xD21DB2DB
As you read this post global entropy rises. Have Fun ;-)
Nick
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 5/5] Net: ath5k, kconfig changes

2007-08-31 Thread Nick Kossifidis

2007/8/31, Nick Kossifidis <[EMAIL PROTECTED]>:
> 2007/8/30, John W. Linville <[EMAIL PROTECTED]>:
> > On Thu, Aug 30, 2007 at 04:38:09AM +0300, Nick Kossifidis wrote:
> > > 2007/8/28, Christoph Hellwig <[EMAIL PROTECTED]>:
> >
> > > > Also this whole patch seems rather pointless.  It saves only
> > > > very little and turns the driver into a complete ifdef maze.
> >
> > > Also most
> > > people will use 5212 code only, 5211 cards are on some old laptops and
> > > 5210, well i couldn't even find  a 5210 for actual testing :P
> >
> > FWIW, I'd bet dollars to donuts that distros will enable them all
> > together.
> >
> > Is saving code space the only reason to turn these off?  How much
> > space do you save?
> >
> > Is there some way you can isolate and/or limit the number of ifdef
> > blocks further?  If so, we might consider a version of this patch
> > that depends on EMBEDDED or somesuch...?
> >
> > John
>
> O.K. as a first step i'll limit 5210 code only then, just an option
> like "support older 5210 chipsets" which is going to be off by default
> instead of 3 options. It's not just saving space, it's also saving
> some runtime checks. It's not really a gain in performance though,
> most checks are done during initialization and dfs setup, i just
> thought it would be usefull to save as much cpu as possible.
>

Well after some thought i removed them all, there is no real gain from
this in most cases (that ppl will use newer 5212 chips and
combatibles).



-- 
GPG ID: 0xD21DB2DB
As you read this post global entropy rises. Have Fun ;-)
Nick
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 5/5] Net: ath5k, kconfig changes

2007-08-30 Thread Nick Kossifidis

2007/8/30, John W. Linville <[EMAIL PROTECTED]>:
> On Thu, Aug 30, 2007 at 04:38:09AM +0300, Nick Kossifidis wrote:
> > 2007/8/28, Christoph Hellwig <[EMAIL PROTECTED]>:
>
> > > Also this whole patch seems rather pointless.  It saves only
> > > very little and turns the driver into a complete ifdef maze.
>
> > Also most
> > people will use 5212 code only, 5211 cards are on some old laptops and
> > 5210, well i couldn't even find  a 5210 for actual testing :P
>
> FWIW, I'd bet dollars to donuts that distros will enable them all
> together.
>
> Is saving code space the only reason to turn these off?  How much
> space do you save?
>
> Is there some way you can isolate and/or limit the number of ifdef
> blocks further?  If so, we might consider a version of this patch
> that depends on EMBEDDED or somesuch...?
>
> John

O.K. as a first step i'll limit 5210 code only then, just an option
like "support older 5210 chipsets" which is going to be off by default
instead of 3 options. It's not just saving space, it's also saving
some runtime checks. It's not really a gain in performance though,
most checks are done during initialization and dfs setup, i just
thought it would be usefull to save as much cpu as possible.

-- 
GPG ID: 0xD21DB2DB
As you read this post global entropy rises. Have Fun ;-)
Nick
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 5/5] Net: ath5k, kconfig changes

2007-08-29 Thread Nick Kossifidis

2007/8/28, Christoph Hellwig <[EMAIL PROTECTED]>:
> On Tue, Aug 28, 2007 at 12:01:30PM -0400, Jiri Slaby wrote:
> > +config ATH5K_AR5210
> > + bool "Support AR5210"
> > + depends on ATH5K
> > + default y
> > +
> > +config ATH5K_AR5211
> > + bool "Support AR5211"
> > + depends on ATH5K
> > + default y
> > +
> > +config ATH5K_AR5212
> > + bool "Support AR5212"
> > + depends on ATH5K
> > + default y
>
> Please don't add more default statements.
>
> Also this whole patch seems rather pointless.  It saves only
> very little and turns the driver into a complete ifdef maze.

It saves big chunks of code (not only initial register settings
arrays) and we'll extend it's use more inside ath5k_hw.c Trust me this
is a very useful step, eg. check out descriptor processing / setup or
PHY functions (calibrate/channel set etc). For example AR5210 mac chip
only comes with RF5110 phy chip so we can get rid of RF5111/RF5112
code, AR5211 comes with RF5111 so we can get rid of RF5110 and RF5112
code and AR5212 comes with RF5111 or RF5112 so we get rid of RF5110.
This thing also saves lots of checks during runtime (some of them
happen verry frequently, eg. durring descriptor processing). Also most
people will use 5212 code only, 5211 cards are on some old laptops and
5210, well i couldn't even find  a 5210 for actual testing :P

-- 
GPG ID: 0xD21DB2DB
As you read this post global entropy rises. Have Fun ;-)
Nick
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [ANNOUNCE] d80211 based driver for Intel PRO/Wireless 3945ABG

2007-02-09 Thread Nick Kossifidis


Over the past year we were able to make the necessary changes to the
microcode used with the 3945 such that we were able to remove the
regulatory daemon.


Great news !! Congratz ;-)


--
As you read this post global entropy rises. Have Fun ;-)
Nick
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [Madwifi-devel] ANNOUNCE: SFLC helps developers assess ar5k (enabling free Atheros HAL)

2006-11-16 Thread Nick Kossifidis

Just in case you want to experiment, i have a working port of ar5k
that works on madwifi-old before the BSD - HEAD merge...

well i'm no good programmer (yet), i did what i could (i'm sure those
define macros are nasty for most people :P) and hope that helps...

you can git clone from the following link

rsync://rsync.ath-driver.org/gnumonks_users/mb/openhal.git

I'm working with the following madwifi revision...
svn checkout http://svn.madwifi.org/branches/madwifi-old -r 1142 madwifi

Don't forget to remove the ending ) in line 175 at
net80211/ieee80211_radiotap.h ;-)

Also you'll need to do some modifications to make it work with newer
kernel versions (MODULE_PARM stuff in if_ath.c)...

diff -Naurp madwifi-openhal/ath/if_ath.c madwifi-openhal-fixed/ath/if_ath.c
--- madwifi-openhal/ath/if_ath.c2005-06-24 06:41:22.0 -0400
+++ madwifi-openhal-fixed/ath/if_ath.c  2006-08-04 14:49:38.0 -0400
@@ -245,13 +245,20 @@ enum {
#endif

static  int countrycode = -1;
-MODULE_PARM(countrycode, "i");
-MODULE_PARM_DESC(countrycode, "Override default country code");
static  int outdoor = -1;
-MODULE_PARM(outdoor, "i");
-MODULE_PARM_DESC(outdoor, "Enable/disable outdoor use");
static  int xchanmode = -1;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,52))
+MODULE_PARM(countrycode, "i");
+MODULE_PARM(outdoor, "i");
MODULE_PARM(xchanmode, "i");
+#else
+#include 
+module_param(countrycode, int, 0);
+module_param(outdoor, int, 0);
+module_param(xchanmode, int, 0);
+#endif
+MODULE_PARM_DESC(countrycode, "Override default country code");
+MODULE_PARM_DESC(outdoor, "Enable/disable outdoor use");
MODULE_PARM_DESC(xchanmode, "Enable/disable extended channel mode");

int

See README for more details

I haven't looked much the MadWiFi code since then, perhaps you can
help in making ar5k work with newer madwifi versions + improve it's
functionality to help both linux and OpenBSD.

(The name OpenHAL came from John Bicket's initial port)

Happy coding ;-)
Nick

2006/11/16, Michael Renzmann <[EMAIL PROTECTED]>:

Hi.

Michael Buesch wrote:
> Well, it never worked for me. But I gave up trying about
> half a year ago. But maybe it's just stupid me. ;)

Well, we have various support channels (an IRC channel besides two
mailing lists, for example) that you are welcome to make use of in case
of problems :)

Bye, Mike

-
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
___
Madwifi-devel mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/madwifi-devel

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

57 matches

Mail list logo