[PATCH v2 5/5] riscv: Make mmap allocation top-down by default

2019-04-03 Thread Alexandre Ghiti
In order to avoid wasting user address space by using bottom-up mmap
allocation scheme, prefer top-down scheme when possible.

Before:
root@qemuriscv64:~# cat /proc/self/maps
0001-00016000 r-xp  fe:00 6389   /bin/cat.coreutils
00016000-00017000 r--p 5000 fe:00 6389   /bin/cat.coreutils
00017000-00018000 rw-p 6000 fe:00 6389   /bin/cat.coreutils
00018000-00039000 rw-p  00:00 0  [heap]
156000-16d000 r-xp  fe:00 7193   /lib/ld-2.28.so
16d000-16e000 r--p 00016000 fe:00 7193   /lib/ld-2.28.so
16e000-16f000 rw-p 00017000 fe:00 7193   /lib/ld-2.28.so
16f000-17 rw-p  00:00 0
17-172000 r-xp  00:00 0  [vdso]
174000-176000 rw-p  00:00 0
176000-1555674000 r-xp  fe:00 7187   /lib/libc-2.28.so
1555674000-1555678000 r--p 000fd000 fe:00 7187   /lib/libc-2.28.so
1555678000-155567a000 rw-p 00101000 fe:00 7187   /lib/libc-2.28.so
155567a000-15556a rw-p  00:00 0
3fffb9-3fffbb1000 rw-p  00:00 0  [stack]

After:
root@qemuriscv64:~# cat /proc/self/maps
0001-00016000 r-xp  fe:00 6389   /bin/cat.coreutils
00016000-00017000 r--p 5000 fe:00 6389   /bin/cat.coreutils
00017000-00018000 rw-p 6000 fe:00 6389   /bin/cat.coreutils
00018000-00039000 rw-p  00:00 0  [heap]
3ff7eb6000-3ff7ed8000 rw-p  00:00 0
3ff7ed8000-3ff7fd6000 r-xp  fe:00 7187   /lib/libc-2.28.so
3ff7fd6000-3ff7fda000 r--p 000fd000 fe:00 7187   /lib/libc-2.28.so
3ff7fda000-3ff7fdc000 rw-p 00101000 fe:00 7187   /lib/libc-2.28.so
3ff7fdc000-3ff7fe2000 rw-p  00:00 0
3ff7fe4000-3ff7fe6000 r-xp  00:00 0  [vdso]
3ff7fe6000-3ff7ffd000 r-xp  fe:00 7193   /lib/ld-2.28.so
3ff7ffd000-3ff7ffe000 r--p 00016000 fe:00 7193   /lib/ld-2.28.so
3ff7ffe000-3ff7fff000 rw-p 00017000 fe:00 7193   /lib/ld-2.28.so
3ff7fff000-3ff800 rw-p  00:00 0
3fff888000-3fff8a9000 rw-p  00:00 0  [stack]

Signed-off-by: Alexandre Ghiti 
---
 arch/riscv/Kconfig | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index eb56c82d8aa1..fe09f38ef9a9 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -49,6 +49,17 @@ config RISCV
select GENERIC_IRQ_MULTI_HANDLER
select ARCH_HAS_PTE_SPECIAL
select HAVE_EBPF_JIT if 64BIT
+   select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+   select HAVE_ARCH_MMAP_RND_BITS
+
+config ARCH_MMAP_RND_BITS_MIN
+   default 18
+
+# max bits determined by the following formula:
+#  VA_BITS - PAGE_SHIFT - 3
+config ARCH_MMAP_RND_BITS_MAX
+   default 33 if 64BIT # SV48 based
+   default 18
 
 config MMU
def_bool y
-- 
2.20.1



[PATCH v2 4/5] mips: Use generic mmap top-down layout

2019-04-03 Thread Alexandre Ghiti
mips uses a top-down layout by default that fits the generic functions.
At the same time, this commit allows fixing a problem uncovered
and not fixed for mips here:
https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1429066.html

Signed-off-by: Alexandre Ghiti 
---
 arch/mips/Kconfig |  1 +
 arch/mips/include/asm/processor.h |  5 ---
 arch/mips/mm/mmap.c   | 57 ---
 3 files changed, 1 insertion(+), 62 deletions(-)

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 4a5f5b0ee9a9..c21aa6371eab 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -14,6 +14,7 @@ config MIPS
select ARCH_USE_CMPXCHG_LOCKREF if 64BIT
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
+   select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
select ARCH_WANT_IPC_PARSE_VERSION
select BUILDTIME_EXTABLE_SORT
select CLONE_BACKWARDS
diff --git a/arch/mips/include/asm/processor.h 
b/arch/mips/include/asm/processor.h
index aca909bd7841..fba18d4a9190 100644
--- a/arch/mips/include/asm/processor.h
+++ b/arch/mips/include/asm/processor.h
@@ -29,11 +29,6 @@
 
 extern unsigned int vced_count, vcei_count;
 
-/*
- * MIPS does have an arch_pick_mmap_layout()
- */
-#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
-
 #ifdef CONFIG_32BIT
 #ifdef CONFIG_KVM_GUEST
 /* User space process size is limited to 1GB in KVM Guest Mode */
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index 2f616ebeb7e0..61e65a69bb09 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -20,33 +20,6 @@
 unsigned long shm_align_mask = PAGE_SIZE - 1;  /* Sane caches */
 EXPORT_SYMBOL(shm_align_mask);
 
-/* gap between mmap and stack */
-#define MIN_GAP (128*1024*1024UL)
-#define MAX_GAP ((TASK_SIZE)/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
-   if (current->personality & ADDR_COMPAT_LAYOUT)
-   return 1;
-
-   if (rlim_stack->rlim_cur == RLIM_INFINITY)
-   return 1;
-
-   return sysctl_legacy_va_layout;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
-   unsigned long gap = rlim_stack->rlim_cur;
-
-   if (gap < MIN_GAP)
-   gap = MIN_GAP;
-   else if (gap > MAX_GAP)
-   gap = MAX_GAP;
-
-   return PAGE_ALIGN(TASK_SIZE - gap - rnd);
-}
-
 #define COLOUR_ALIGN(addr, pgoff)  \
addr) + shm_align_mask) & ~shm_align_mask) +\
 (((pgoff) << PAGE_SHIFT) & shm_align_mask))
@@ -144,36 +117,6 @@ unsigned long arch_get_unmapped_area_topdown(struct file 
*filp,
addr0, len, pgoff, flags, DOWN);
 }
 
-unsigned long arch_mmap_rnd(void)
-{
-   unsigned long rnd;
-
-#ifdef CONFIG_COMPAT
-   if (TASK_IS_32BIT_ADDR)
-   rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
-   else
-#endif /* CONFIG_COMPAT */
-   rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-
-   return rnd << PAGE_SHIFT;
-}
-
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
-   unsigned long random_factor = 0UL;
-
-   if (current->flags & PF_RANDOMIZE)
-   random_factor = arch_mmap_rnd();
-
-   if (mmap_is_legacy(rlim_stack)) {
-   mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
-   mm->get_unmapped_area = arch_get_unmapped_area;
-   } else {
-   mm->mmap_base = mmap_base(random_factor, rlim_stack);
-   mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-   }
-}
-
 static inline unsigned long brk_rnd(void)
 {
unsigned long rnd = get_random_long();
-- 
2.20.1



[PATCH v2 3/5] arm: Use generic mmap top-down layout

2019-04-03 Thread Alexandre Ghiti
arm uses a top-down layout by default that fits the generic functions.
At the same time, this commit allows fixing the following problems:

- one uncovered and not fixed for arm here:
https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1429066.html

- the use of TASK_SIZE instead of STACK_TOP in mmap_base which, when
  address space of a task is 26 bits, would assign mmap base way too high.

Signed-off-by: Alexandre Ghiti 
---
 arch/arm/Kconfig |  1 +
 arch/arm/include/asm/processor.h |  2 --
 arch/arm/mm/mmap.c   | 52 
 3 files changed, 1 insertion(+), 54 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 850b4805e2d1..747101a8e989 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -28,6 +28,7 @@ config ARM
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF
+   select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
select ARCH_WANT_IPC_PARSE_VERSION
select BUILDTIME_EXTABLE_SORT if MMU
select CLONE_BACKWARDS
diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h
index 57fe73ea0f72..944ef1fb1237 100644
--- a/arch/arm/include/asm/processor.h
+++ b/arch/arm/include/asm/processor.h
@@ -143,8 +143,6 @@ static inline void prefetchw(const void *ptr)
 #endif
 #endif
 
-#define HAVE_ARCH_PICK_MMAP_LAYOUT
-
 #endif
 
 #endif /* __ASM_ARM_PROCESSOR_H */
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index f866870db749..b8d912ac9e61 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -17,33 +17,6 @@
addr)+SHMLBA-1)&~(SHMLBA-1)) +  \
 (((pgoff)rlim_cur == RLIM_INFINITY)
-   return 1;
-
-   return sysctl_legacy_va_layout;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
-   unsigned long gap = rlim_stack->rlim_cur;
-
-   if (gap < MIN_GAP)
-   gap = MIN_GAP;
-   else if (gap > MAX_GAP)
-   gap = MAX_GAP;
-
-   return PAGE_ALIGN(TASK_SIZE - gap - rnd);
-}
-
 /*
  * We need to ensure that shared mappings are correctly aligned to
  * avoid aliasing issues with VIPT caches.  We need to ensure that
@@ -171,31 +144,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const 
unsigned long addr0,
return addr;
 }
 
-unsigned long arch_mmap_rnd(void)
-{
-   unsigned long rnd;
-
-   rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-
-   return rnd << PAGE_SHIFT;
-}
-
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
-   unsigned long random_factor = 0UL;
-
-   if (current->flags & PF_RANDOMIZE)
-   random_factor = arch_mmap_rnd();
-
-   if (mmap_is_legacy(rlim_stack)) {
-   mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
-   mm->get_unmapped_area = arch_get_unmapped_area;
-   } else {
-   mm->mmap_base = mmap_base(random_factor, rlim_stack);
-   mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-   }
-}
-
 /*
  * You really shouldn't be using read() or write() on /dev/mem.  This
  * might go away in the future.
-- 
2.20.1



Re: [RFC PATCH 1/9] drivers: regulator: qcom_spmi: enable linear range info

2019-04-03 Thread Mark Brown
On Thu, Apr 04, 2019 at 07:09:22AM +0200, Niklas Cassel wrote:
> From: Jorge Ramirez-Ortiz 
> 
> Signed-off-by: Jorge Ramirez-Ortiz 
> ---
>  drivers/regulator/qcom_spmi-regulator.c | 7 +++
>  1 file changed, 7 insertions(+)

This doesn't build:

  CC  drivers/regulator/qcom_spmi-regulator.o
drivers/regulator/qcom_spmi-regulator.c: In function 
‘qcom_spmi_regulator_probe’:
drivers/regulator/qcom_spmi-regulator.c:1837:29: error: 
‘SPMI_REGULATOR_LOGICAL_TYPE_HFS430’ undeclared (first use in this function); 
did you mean ‘SPMI_REGULATOR_LOGICAL_TYPE_FTSMPS’?
   if (vreg->logical_type == SPMI_REGULATOR_LOGICAL_TYPE_HFS430) {
 ^~
 SPMI_REGULATOR_LOGICAL_TYPE_FTSMPS
drivers/regulator/qcom_spmi-regulator.c:1837:29: note: each undeclared 
identifier is reported only once for each function it appears in


signature.asc
Description: PGP signature


[PATCH v2 2/5] arm64, mm: Move generic mmap layout functions to mm

2019-04-03 Thread Alexandre Ghiti
arm64 handles top-down mmap layout in a way that can be easily reused
by other architectures, so make it available in mm.

This commit also takes the opportunity to:
- make use of is_compat_task instead of specific arm64 test
  test_thread_flag(TIF_32BIT), which allows more genericity and is
  equivalent.
- fix the case where stack randomization should not be taken into
  account.

It then introduces a new config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
that can be set by other architectures to benefit from those functions.

Suggested-by: Christoph Hellwig 
Signed-off-by: Alexandre Ghiti 
---
 arch/Kconfig   |  8 
 arch/arm64/Kconfig |  1 +
 arch/arm64/include/asm/processor.h |  2 -
 arch/arm64/mm/mmap.c   | 72 
 kernel/sysctl.c|  6 ++-
 mm/util.c  | 77 +-
 6 files changed, 89 insertions(+), 77 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 3368786a..ef8d0b50cc41 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -684,6 +684,14 @@ config HAVE_ARCH_COMPAT_MMAP_BASES
  and vice-versa 32-bit applications to call 64-bit mmap().
  Required for applications doing different bitness syscalls.
 
+config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+   bool
+   help
+ This allows to use a set of generic functions to determine mmap base
+ address by giving priority to top-down scheme only if the process
+ is not in legacy mode (compat task, unlimited stack size or
+ sysctl_legacy_va_layout).
+
 config HAVE_COPY_THREAD_TLS
bool
help
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7e34b9eba5de..670719a26b45 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -66,6 +66,7 @@ config ARM64
select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 5 || CC_IS_CLANG
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
+   select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
select ARCH_WANT_FRAME_POINTERS
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARM_AMBA
diff --git a/arch/arm64/include/asm/processor.h 
b/arch/arm64/include/asm/processor.h
index 5d9ce62bdebd..4de2a2fd605a 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -274,8 +274,6 @@ static inline void spin_lock_prefetch(const void *ptr)
 "nop") : : "p" (ptr));
 }
 
-#define HAVE_ARCH_PICK_MMAP_LAYOUT
-
 #endif
 
 extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 842c8a5fcd53..c74224421216 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -31,78 +31,6 @@
 
 #include 
 
-/*
- * Leave enough space between the mmap area and the stack to honour ulimit in
- * the face of randomisation.
- */
-#define MIN_GAP (SZ_128M)
-#define MAX_GAP(STACK_TOP/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
-   if (current->personality & ADDR_COMPAT_LAYOUT)
-   return 1;
-
-   if (rlim_stack->rlim_cur == RLIM_INFINITY)
-   return 1;
-
-   return sysctl_legacy_va_layout;
-}
-
-unsigned long arch_mmap_rnd(void)
-{
-   unsigned long rnd;
-
-#ifdef CONFIG_COMPAT
-   if (test_thread_flag(TIF_32BIT))
-   rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
-   else
-#endif
-   rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-   return rnd << PAGE_SHIFT;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
-   unsigned long gap = rlim_stack->rlim_cur;
-   unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap;
-
-   /* Values close to RLIM_INFINITY can overflow. */
-   if (gap + pad > gap)
-   gap += pad;
-
-   if (gap < MIN_GAP)
-   gap = MIN_GAP;
-   else if (gap > MAX_GAP)
-   gap = MAX_GAP;
-
-   return PAGE_ALIGN(STACK_TOP - gap - rnd);
-}
-
-/*
- * This function, called very early during the creation of a new process VM
- * image, sets up which VM layout function to use:
- */
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
-   unsigned long random_factor = 0UL;
-
-   if (current->flags & PF_RANDOMIZE)
-   random_factor = arch_mmap_rnd();
-
-   /*
-* Fall back to the standard layout if the personality bit is set, or
-* if the expected stack growth is unlimited:
-*/
-   if (mmap_is_legacy(rlim_stack)) {
-   mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
-   mm->get_unmapped_area = arch_get_unmapped_area;
-   } else {
-   mm->mmap_base = mmap_base(random_factor, rlim_stack);
-   mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-   }
-}
-
 /*
  * You really 

[PATCH v2 1/5] mm, fs: Move randomize_stack_top from fs to mm

2019-04-03 Thread Alexandre Ghiti
This preparatory commit moves this function so that further introduction
of generic topdown mmap layout is contained only in mm/util.c.

Signed-off-by: Alexandre Ghiti 
---
 fs/binfmt_elf.c| 20 
 include/linux/mm.h |  2 ++
 mm/util.c  | 22 ++
 3 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7d09d125f148..045f3b29d264 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -662,26 +662,6 @@ static unsigned long load_elf_interp(struct elfhdr 
*interp_elf_ex,
  * libraries.  There is no binary dependent code anywhere else.
  */
 
-#ifndef STACK_RND_MASK
-#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))/* 8MB of VA */
-#endif
-
-static unsigned long randomize_stack_top(unsigned long stack_top)
-{
-   unsigned long random_variable = 0;
-
-   if (current->flags & PF_RANDOMIZE) {
-   random_variable = get_random_long();
-   random_variable &= STACK_RND_MASK;
-   random_variable <<= PAGE_SHIFT;
-   }
-#ifdef CONFIG_STACK_GROWSUP
-   return PAGE_ALIGN(stack_top) + random_variable;
-#else
-   return PAGE_ALIGN(stack_top) - random_variable;
-#endif
-}
-
 static int load_elf_binary(struct linux_binprm *bprm)
 {
struct file *interpreter = NULL; /* to shut gcc up */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 76769749b5a5..087824a5059f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2312,6 +2312,8 @@ extern int install_special_mapping(struct mm_struct *mm,
   unsigned long addr, unsigned long len,
   unsigned long flags, struct page **pages);
 
+unsigned long randomize_stack_top(unsigned long stack_top);
+
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned 
long, unsigned long, unsigned long);
 
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
diff --git a/mm/util.c b/mm/util.c
index d559bde497a9..a54afb9b4faa 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -14,6 +14,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 
@@ -291,6 +293,26 @@ int vma_is_stack_for_current(struct vm_area_struct *vma)
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
 }
 
+#ifndef STACK_RND_MASK
+#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
+#endif
+
+unsigned long randomize_stack_top(unsigned long stack_top)
+{
+   unsigned long random_variable = 0;
+
+   if (current->flags & PF_RANDOMIZE) {
+   random_variable = get_random_long();
+   random_variable &= STACK_RND_MASK;
+   random_variable <<= PAGE_SHIFT;
+   }
+#ifdef CONFIG_STACK_GROWSUP
+   return PAGE_ALIGN(stack_top) + random_variable;
+#else
+   return PAGE_ALIGN(stack_top) - random_variable;
+#endif
+}
+
 #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
 void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 {
-- 
2.20.1



[PATCH v2 0/5] Provide generic top-down mmap layout functions

2019-04-03 Thread Alexandre Ghiti
This series introduces generic functions to make top-down mmap layout
easily accessible to architectures, in particular riscv which was
the initial goal of this series.
The generic implementation was taken from arm64 and used successively
by arm, mips and finally riscv.

Note that in addition the series fixes 2 issues:
- stack randomization was taken into account even if not necessary.
- [1] fixed an issue with mmap base which did not take into account
  randomization but did not report it to arm and mips, so by moving
  arm64 into a generic library, this problem is now fixed for both
  architectures.

This work is an effort to factorize architecture functions to avoid
code duplication and oversights as in [1].

[1]: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1429066.html

Changes in v2 as suggested by Christoph Hellwig:
  - Preparatory patch that moves randomize_stack_top
  - Fix duplicate config in riscv
  - Align #if defined on next line => this gives rise to a checkpatch
warning. I found this pattern all around the tree, in the same proportion
as the previous pattern which was less pretty:
git grep -C 1 -n -P "^#if defined.+\|\|.*$" 

Alexandre Ghiti (5):
  mm, fs: Move randomize_stack_top from fs to mm
  arm64, mm: Move generic mmap layout functions to mm
  arm: Use generic mmap top-down layout
  mips: Use generic mmap top-down layout
  riscv: Make mmap allocation top-down by default

 arch/Kconfig   |  8 +++
 arch/arm/Kconfig   |  1 +
 arch/arm/include/asm/processor.h   |  2 -
 arch/arm/mm/mmap.c | 52 
 arch/arm64/Kconfig |  1 +
 arch/arm64/include/asm/processor.h |  2 -
 arch/arm64/mm/mmap.c   | 72 --
 arch/mips/Kconfig  |  1 +
 arch/mips/include/asm/processor.h  |  5 --
 arch/mips/mm/mmap.c| 57 -
 arch/riscv/Kconfig | 11 
 fs/binfmt_elf.c| 20 --
 include/linux/mm.h |  2 +
 kernel/sysctl.c|  6 +-
 mm/util.c  | 99 +-
 15 files changed, 126 insertions(+), 213 deletions(-)

-- 
2.20.1



Re: [PATCH 5/7] ASoC: cs42l51: change mic bias DAPM

2019-04-03 Thread Mark Brown
On Wed, Apr 03, 2019 at 03:23:35PM +0200, Olivier Moysan wrote:
> Use SND_SOC_DAPM_SUPPLY for mic bias DAPM
> instead of deprecated SND_SOC_DAPM_MICBIAS.

There are existing users in mainline, have they all been updated to be
compatible with this, or verified that they don't need updates?


signature.asc
Description: PGP signature


Respond

2019-04-03 Thread Ella Golan
My name is Ms Ella Golan, I'm the Chief Executive Officer (C.P.A) of the First 
International Bank of Israel (FIBI).
I'm getting in touch with you in regards to a very important and urgent matter.
Kindly respond back at your earliest convenience so
I can provide you the details.

Faithfully,
Ms Ella Golan


Re: [PATCH v4 2/2] PCI: iproc: Add outbound configuration for 32-bit I/O region

2019-04-03 Thread Srinath Mannam
Hi Lorenzo,

I am sorry, I took your long time. In my commit log I gave details
about purpose of feature instead of implementation.
Thanks a lot for all inputs and knowledge. I will remember and follow
these notes while writing commit log.
commit log re-written by you is very much impressive and have detailed
description of feature and implementation.

Thanks again for your patience.

Regards,
Srinath.

On Wed, Apr 3, 2019 at 5:01 PM Lorenzo Pieralisi
 wrote:
>
> On Wed, Apr 03, 2019 at 08:41:44AM +0530, Srinath Mannam wrote:
> > Hi Lorenzo,
> >
> > Please see my reply below,
> >
> > On Tue, Apr 2, 2019 at 7:08 PM Lorenzo Pieralisi
> >  wrote:
> > >
> > > On Tue, Apr 02, 2019 at 04:16:13PM +0530, Srinath Mannam wrote:
> > >
> > > [...]
> > >
> > > > > Ok - I start to understand. What does it mean in HW terms that your
> > > > > 32bit AXI address region size is 32MB ? Please explain to me in 
> > > > > details.
> > > > >
> > > > In our PCIe controller HW, AXI address from 0x4200 to 0x4400
> > > > of 32MB size and .
> > > > AXI address from 0x4 to 0x48000 of 2GB size are provided
> > > > to map ob address.
> > > > First IO region is inside 32bit address and second IO region is
> > > > outside 32bit address.
> > > > This code change is to map first IO region(0x4200 to 0x4400).
> > > >
> > > > > IIUC you are using an OARR0 of 128MB in size to map a 32MB address
> > > > > region, that's what I understand this patch does (and the lowest index
> > > > > corresponds to the smallest possible size - it is far from clear by
> > > > > looking at the patch).
> > > > Yes, lowest index corresponds to smallest possible size (128MB).
> > > > In our controller we have multiple windows like OARR0, OARR1, OARR2,
> > > > OARR3 all supports multiple sizes from 128MB to 1024MB.
> > > > These details are given at the top of this driver file, as shown
> > > > below. all windows supports 128MB size still we must use OARR0 window
> > > > to configure first IO region(0x4200 to 0x4400).
> > > >
> > > > static const struct iproc_pcie_ob_map paxb_v2_ob_map[] = {
> > > > {
> > > > /* OARR0/OMAP0 */
> > > > .window_sizes = { 128, 256 },
> > > > .nr_sizes = 2,
> > > > },
> > > > {
> > > > /* OARR1/OMAP1 */
> > > > .window_sizes = { 128, 256 },
> > > > .nr_sizes = 2,
> > > > },
> > > > {
> > > > /* OARR2/OMAP2 */
> > > > .window_sizes = { 128, 256, 512, 1024 },
> > > > .nr_sizes = 4,
> > > > },
> > > > {
> > > > /* OARR3/OMAP3 */
> > > > .window_sizes = { 128, 256, 512, 1024 },
> > > > .nr_sizes = 4,
> > > > },
> > > > };
> > >
> > > Ok so this patch allows mapping an AXI I/O window that is smaller
> > > than OARR possible sizes, why it was not done from the beginning
> > > I really do not know.
> > >
> > Same Iproc driver we use for multiple SOCs, in previous SOCs does not
> > have 32-bit AXI address region to map ob.
> > In the present SOC, 32-bit AXI address region is available so that
> > this change is added.
> >
> > > Now explain this to me please:
> > >
> > > > This patch add outbound window configuration to map below 32-bit I/O 
> > > > range
> > > > with corresponding PCI memory, which helps to access I/O region in ARM
> > > > 32-bit and one to one mapping of I/O region to PCI memory.
> > > >
> > > > Ex:
> > > > 1. ranges DT property given for current driver is,
> > > > ranges = <0x8300 0x0 0x4000 0x4 0x 0 0x4000>;
> > > > I/O region address is 0x4
> > > > 2. ranges DT property can be given after this patch,
> > > > ranges = <0x8300 0x0 0x4200 0x0 0x4200 0 0x200>;
> > > > I/O region address is 0x4200
> > >
> > > Why 1:1 AXI<->PCI address mapping is not possible in (1), how does the
> > > current code works on 32-bit systems and what's the benefit your change
> > > is bringing.
> > non-prefetchable memory range can only support 32-bit addresses, so
> > that we have taken 32-bit PCI bus address in (1).
> > current code does not work in 32-bit systems. In the present SOC with
> > this new change we can access from 32-bit CPU.
>
> Thank you. I rewrote the log and pushed patches to pci/iproc, please
> have a look (Ray/Scott please do have a look too) and report back
> if that's fine.
>
> Do you agree that the initial commit was lacking _significant_
> information ? Please remember that the commit log plays a fundamental
> part in understanding a change and this one is a very important one
> so I am being pedantic on it.
>
> Thanks,
> Lorenzo


Re: [PATCH 2/6] arm64/mm: Enable memory hot remove

2019-04-03 Thread Anshuman Khandual



On 04/03/2019 06:07 PM, Robin Murphy wrote:
> [ +Steve ]
> 
> Hi Anshuman,
> 
> On 03/04/2019 05:30, Anshuman Khandual wrote:
>> Memory removal from an arch perspective involves tearing down two different
>> kernel based mappings i.e vmemmap and linear while releasing related page
>> table pages allocated for the physical memory range to be removed.
>>
>> Define a common kernel page table tear down helper remove_pagetable() which
>> can be used to unmap given kernel virtual address range. In effect it can
>> tear down both vmemmap or kernel linear mappings. This new helper is called
>> from both vmemmap_free() and ___remove_pgd_mapping() during memory removal.
>> The argument 'direct' here identifies kernel linear mappings.
>>
>> Vmemmap mappings page table pages are allocated through sparse mem helper
>> functions like vmemmap_alloc_block() which does not cycle the pages through
>> pgtable_page_ctor() constructs. Hence while removing it skips corresponding
>> destructor construct pgtable_page_dtor().
>>
>> While here update arch_add_memory() to handle __add_pages() failures by
>> just unmapping recently added kernel linear mapping. Now enable memory hot
>> remove on arm64 platforms by default with ARCH_ENABLE_MEMORY_HOTREMOVE.
>>
>> This implementation is overall inspired from kernel page table tear down
>> procedure on X86 architecture.
> 
> A bit of a nit, but since this depends on at least patch #4 to work properly, 
> it would be good to reorder the series appropriately.

Sure will move up the generic changes forward.

>> Signed-off-by: Anshuman Khandual 
>> ---
>>   arch/arm64/Kconfig   |   3 +
>>   arch/arm64/include/asm/pgtable.h |  14 +++
>>   arch/arm64/mm/mmu.c  | 227 
>> ++-
>>   3 files changed, 241 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index a2418fb..db3e625 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -266,6 +266,9 @@ config HAVE_GENERIC_GUP
>>   config ARCH_ENABLE_MEMORY_HOTPLUG
>>   def_bool y
>>   +config ARCH_ENABLE_MEMORY_HOTREMOVE
>> +    def_bool y
>> +
>>   config ARCH_MEMORY_PROBE
>>   bool "Enable /sys/devices/system/memory/probe interface"
>>   depends on MEMORY_HOTPLUG
>> diff --git a/arch/arm64/include/asm/pgtable.h 
>> b/arch/arm64/include/asm/pgtable.h
>> index de70c1e..858098e 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -355,6 +355,18 @@ static inline int pmd_protnone(pmd_t pmd)
>>   }
>>   #endif
>>   +#if (CONFIG_PGTABLE_LEVELS > 2)
>> +#define pmd_large(pmd)    (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
>> +#else
>> +#define pmd_large(pmd) 0
>> +#endif
>> +
>> +#if (CONFIG_PGTABLE_LEVELS > 3)
>> +#define pud_large(pud)    (pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT))
>> +#else
>> +#define pud_large(pmd) 0
>> +#endif
> 
> These seem rather different from the versions that Steve is proposing in the 
> generic pagewalk series - can you reach an agreement on which implementation 
> is preferred?

Sure will take a look.

> 
>> +
>>   /*
>>    * THP definitions.
>>    */
>> @@ -555,6 +567,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
>>     #else
>>   +#define pmd_index(addr) 0
>>   #define pud_page_paddr(pud)    ({ BUILD_BUG(); 0; })
>>     /* Match pmd_offset folding in  */
>> @@ -612,6 +625,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
>>     #else
>>   +#define pud_index(adrr)    0
>>   #define pgd_page_paddr(pgd)    ({ BUILD_BUG(); 0;})
>>     /* Match pud_offset folding in  */
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index e97f018..ae0777b 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -714,6 +714,198 @@ int kern_addr_valid(unsigned long addr)
>>     return pfn_valid(pte_pfn(pte));
>>   }
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static void __meminit free_pagetable(struct page *page, int order)
> 
> Do these need to be __meminit? AFAICS it's effectively redundant with the 
> containing #ifdef, and removal feels like it's inherently a later-than-init 
> thing anyway.

I was confused here a bit but even X86 does exactly the same. All these 
functions
are still labeled __meminit and all wrapped under CONFIG_MEMORY_HOTPLUG. Is 
there
any concern to have __meminit here ?

> 
>> +{
>> +    unsigned long magic;
>> +    unsigned int nr_pages = 1 << order;
>> +
>> +    if (PageReserved(page)) {
>> +    __ClearPageReserved(page);
>> +
>> +    magic = (unsigned long)page->freelist;
>> +    if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
>> +    while (nr_pages--)
>> +    put_page_bootmem(page++);
>> +    } else
>> +    while (nr_pages--)
>> +    free_reserved_page(page++);
>> +    } else
>> +    free_pages((unsigned long)page_address(page), order);
>> +}
>> +
>> +#if (CONFIG_PGTABLE_LEVELS > 2)
>> +static void __meminit 

Re: [PATCH 1/2] ASoC: rt5677: allow multiple interrupt sources

2019-04-03 Thread Mark Brown
On Wed, Apr 03, 2019 at 03:32:04PM -0600, Fletcher Woodruff wrote:
> On Mon, Apr 1, 2019 at 11:02 PM Mark Brown  wrote:

> > This looks unrelated to the polarity of the interupt?

> Yes this is separate. If a plug/unplug happens after regmap_read and
> before regmap_write, it will not be registered, so we loop to make
> sure that it's caught in a later iteration. I can clarify this in the
> patch notes.

Please submit one patch per change, each with a clear changelog, as
covered in SubmittingPatches.  This makes it much easier to review
things since it's easier to tell if the patch does what it was intended
to do.


signature.asc
Description: PGP signature


Re: [PATCH AUTOSEL 5.0 209/262] regulator: mcp16502: Include linux/gpio/consumer.h to fix build error

2019-04-03 Thread Mark Brown
On Wed, Apr 03, 2019 at 08:46:08PM -0400, Sasha Levin wrote:
> On Wed, Mar 27, 2019 at 07:32:11PM +, Mark Brown wrote:

> > > Fix below build error:
> > > drivers/regulator/mcp16502.c: In function ‘mcp16502_gpio_set_mode’:
> > > drivers/regulator/mcp16502.c:135:3: error: implicit declaration of 
> > > function ‘gpiod_set_value’; did you mean ‘gpio_set_value’? 
> > > [-Werror=implicit-function-declaration]
> > >gpiod_set_value(mcp->lpm, 0);

> > Does this error actually manifest in v5.0?

> I couldn't reproduce the error on f3c6a1a194317f^ so I assumed it
> requires a specialized config.

I suspect you will find this is a fix for a change that was only merged
in v5.1.


signature.asc
Description: PGP signature


linux-next: Tree for Apr 4

2019-04-03 Thread Stephen Rothwell
Hi all,

Changes since 20190403:

The sound-asoc tree lost its build failure.

The mfd tree lost its build failure.

The selinux tree lost its build failure.

The ipmi tree lost its build failure.

The staging tree gained conflicts against the spi and v4l-dvb trees.

Non-merge commits (relative to Linus' tree): 4662
 4557 files changed, 143113 insertions(+), 68010 deletions(-)



I have created today's linux-next tree at
git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
(patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
are tracking the linux-next tree using git, you should not use "git pull"
to do so as that will try to merge the new linux-next release with the
old one.  You should use "git fetch" and checkout or reset to the new
master.

You can see which trees have been included by looking in the Next/Trees
file in the source.  There are also quilt-import.log and merge.log
files in the Next directory.  Between each merge, the tree was built
with a ppc64_defconfig for powerpc, an allmodconfig for x86_64, a
multi_v7_defconfig for arm and a native build of tools/perf. After
the final fixups (if any), I do an x86_64 modules_install followed by
builds for x86_64 allnoconfig, powerpc allnoconfig (32 and 64 bit),
ppc44x_defconfig, allyesconfig and pseries_le_defconfig and i386, sparc
and sparc64 defconfig. And finally, a simple boot test of the powerpc
pseries_le_defconfig kernel in qemu (with and without kvm enabled).

Below is a summary of the state of the merge.

I am currently merging 298 trees (counting Linus' and 69 trees of bug
fix patches pending for the current merge release).

Stats about the size of the tree over time can be seen at
http://neuling.org/linux-next-size.html .

Status of my local build tests will be at
http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
advice about cross compilers/configs that work, we are always open to add
more builds.

Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
Gortmaker for triage and bug fixes.

-- 
Cheers,
Stephen Rothwell

$ git checkout master
$ git reset --hard stable
Merging origin/master (8ed86627f715 Merge branch 'for-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/hid/hid)
Merging fixes/master (b352face4ca9 adfs: mark expected switch fall-throughs)
Merging kspp-gustavo/for-next/kspp (2d212a1bac7e NFC: st21nfca: Fix 
fall-through warnings)
Merging kbuild-current/fixes (79a3aaa7b82e Linux 5.1-rc3)
Merging arc-current/for-curr (831e90bd606e ARC: PAE40: don't panic and instead 
turn off hw ioc)
Merging arm-current/fixes (d410a8a49e3e ARM: 8849/1: NOMMU: Fix encodings for 
PMSAv8's PRBAR4/PRLAR4)
Merging arm64-fixes/for-next/fixes (221f6eefc354 arm64: fix wrong check of 
on_sdei_stack in nmi context)
Merging m68k-current/for-linus (28713169d879 m68k: Add -ffreestanding to CFLAGS)
Merging powerpc-fixes/fixes (6f845ebec270 powerpc/pseries/mce: Fix misleading 
print for TLB mutlihit)
Merging sparc/master (7d762d69145a afs: Fix manually set volume location server 
list)
Merging fscrypt-current/for-stable (ae64f9bd1d36 Linux 4.15-rc2)
Merging net/master (b2e54b09a3d2 ip6_tunnel: Match to ARPHRD_TUNNEL6 for dev 
type)
Merging bpf/master (a090dbf25c56 Merge branch 'bpf-flow-dissector-fixes')
Merging ipsec/master (8742dc86d0c7 xfrm4: Fix uninitialized memory read in 
_decode_session4)
Merging netfilter/master (5f543a54eec0 net: hns3: fix for not calculating tx bd 
num correctly)
Merging ipvs/master (b2e3d68d1251 netfilter: nft_compat: destroy function must 
not have side effects)
Merging wireless-drivers/master (4837696f6b54 Merge tag 
'iwlwifi-for-kalle-2019-03-22' of 
git://git.kernel.org/pub/scm/linux/kernel/git/iwlwifi/iwlwifi-fixes)
Merging mac80211/master (53bf5811ca37 cfg80211: add ratelimited variants of err 
and warn)
Merging rdma-fixes/for-rc (1abe186ed8a6 IB/mlx5: Reset access mask when looping 
inside page fault handler)
Merging sound-current/for-linus (80690a276f44 ALSA: hda/realtek - Add quirk for 
Tuxedo XC 1509)
Merging sound-asoc-fixes/for-linus (33c10c7d0413 Merge branch 'asoc-5.1' into 
asoc-linus)
Merging regmap-fixes/for-linus (34fd5ecd01f0 Merge branch 'regmap-5.1' into 
regmap-linus)
Merging regulator-fixes/for-linus (29a825a05951 Merge branch 'regulator-5.1' 
into regulator-linus)
Merging spi-fixes/for-linus (c97f6ad307e9 Merge branch 'spi-5.1' into spi-linus)
Merging pci-current/for-linus (0fa635aec9ab PCI/LINK: Deduplicate bandwidth 
reports for multi-function devices)
Merging driver-core.current/driver-core-linus (79a3aaa7b82e Linux 5.1-rc3)
Merging tty.current/tty-linus (79a3aaa7b82e Linux 5.1-rc3)
Merging usb.current/usb-linus (79a3aaa7b82e Linux 5.1-rc3)
Merging usb-gadget-fixes/fixes (072684e8c58d USB: gadget: f_hid: fix deadlock 
in f_hidg_write())
Merging usb-serial-fixes/usb-linus (79a3aaa7b82e Linux 5.1-rc3)
Merging usb-chipidea-fixes/ci-for-usb-stab

Re: [PATCH 1/6] arm64/mm: Enable sysfs based memory hot add interface

2019-04-03 Thread Anshuman Khandual



On 04/03/2019 01:50 PM, David Hildenbrand wrote:
> On 03.04.19 06:30, Anshuman Khandual wrote:
>> Sysfs memory probe interface (/sys/devices/system/memory/probe) can accept
>> starting physical address of an entire memory block to be hot added into
>> the kernel. This is in addition to the existing ACPI based interface. This
>> just enables it with the required config CONFIG_ARCH_MEMORY_PROBE.
>>
> We recently discussed that the similar interface for removal should
> rather be moved to a debug/test module.

Can we maintain such a debug/test module in mainline and enable it when required?
Or we can have both the add and remove interfaces at /sys/kernel/debug/ just for
testing purposes.

> 
> I wonder if we should try to do the same for the sysfs probing
> interface. Rather try to get rid of it than open the doors for more users.
> 

I understand your concern. Will drop this patch.


Re: [PATCH 1/6] arm64/mm: Enable sysfs based memory hot add interface

2019-04-03 Thread Anshuman Khandual



On 04/03/2019 06:42 PM, Robin Murphy wrote:
> On 03/04/2019 09:20, David Hildenbrand wrote:
>> On 03.04.19 06:30, Anshuman Khandual wrote:
>>> Sysfs memory probe interface (/sys/devices/system/memory/probe) can accept
>>> starting physical address of an entire memory block to be hot added into
>>> the kernel. This is in addition to the existing ACPI based interface. This
>>> just enables it with the required config CONFIG_ARCH_MEMORY_PROBE.
>>>
>>
>> We recently discussed that the similar interface for removal should
>> rather be moved to a debug/test module
>>
>> I wonder if we should try to do the same for the sysfs probing
>> interface. Rather try to get rid of it than open the doors for more users.
> 
> Agreed - if this option even exists in a released kernel, there's a risk that 
> distros will turn it on for the sake of it, and at that point arm64 is stuck 
> carrying the same ABI baggage as well.

True. Only if we really don't like that interface.

> 
> If users turn up in future with a desperate and unavoidable need for the 
> legacy half-an-API on arm64, we can always reconsider adding it at that 
> point. It was very much deliberate that my original hot-add support did not 
> include a patch like this one.

Sure. Will drop this one next time around.


Re: [PATCH -next] pinctrl: fsl: Make pinctrl_ipc_handle static

2019-04-03 Thread Linus Walleij
On Wed, Mar 20, 2019 at 9:15 PM Yue Haibing  wrote:

> From: YueHaibing 
>
> Fix sparse warning:
>
> drivers/pinctrl/freescale/pinctrl-scu.c:38:19: warning:
>  symbol 'pinctrl_ipc_handle' was not declared. Should it be static?
>
> Signed-off-by: YueHaibing 

Patch applied.

Yours,
Linus Walleij


[RFC PATCH 8/9] power: avs: Add support for CPR (Core Power Reduction)

2019-04-03 Thread Niklas Cassel
CPR (Core Power Reduction) is a technology that reduces core power on a
CPU or other device. It reads voltage settings in efuse from product
test process as initial settings.
Each OPP corresponds to a "corner" that has a range of valid voltages
for a particular frequency. While the device is running at a particular
frequency, CPR monitors dynamic factors such as temperature, etc. and
adjusts the voltage for that frequency accordingly to save power
and meet silicon characteristic requirements.

This driver is based on an RFC by Stephen Boyd[1], which in turn is
based on work by others on codeaurora.org[2].

[1] https://lkml.org/lkml/2015/9/18/833
[2] 
https://www.codeaurora.org/cgit/quic/la/kernel/msm-3.10/tree/drivers/regulator/cpr-regulator.c?h=msm-3.10

Co-developed-by: Jorge Ramirez-Ortiz 
Signed-off-by: Jorge Ramirez-Ortiz 
Signed-off-by: Niklas Cassel 
---
 drivers/power/avs/Kconfig|   15 +
 drivers/power/avs/Makefile   |1 +
 drivers/power/avs/qcom-cpr.c | 1777 ++
 3 files changed, 1793 insertions(+)
 create mode 100644 drivers/power/avs/qcom-cpr.c

diff --git a/drivers/power/avs/Kconfig b/drivers/power/avs/Kconfig
index a67eeace6a89..44d9f5bdc898 100644
--- a/drivers/power/avs/Kconfig
+++ b/drivers/power/avs/Kconfig
@@ -11,6 +11,21 @@ menuconfig POWER_AVS
 
  Say Y here to enable Adaptive Voltage Scaling class support.
 
+config QCOM_CPR
+   tristate "QCOM Core Power Reduction (CPR) support"
+   depends on POWER_AVS
+   select PM_OPP
+   help
+ Say Y here to enable support for the CPR hardware found on Qualcomm
+ SoCs like MSM8916.
+
+ This driver populates CPU OPPs tables and makes adjustments to the
+ tables based on feedback from the CPR hardware. If you want to do
+ CPU frequency scaling, say Y here.
+
+ To compile this driver as a module, choose M here: the module will
+ be called qcom-cpr
+
 config ROCKCHIP_IODOMAIN
 tristate "Rockchip IO domain support"
 depends on POWER_AVS && ARCH_ROCKCHIP && OF
diff --git a/drivers/power/avs/Makefile b/drivers/power/avs/Makefile
index ba4c7bc69225..88f4d5d49cba 100644
--- a/drivers/power/avs/Makefile
+++ b/drivers/power/avs/Makefile
@@ -1,2 +1,3 @@
 obj-$(CONFIG_POWER_AVS_OMAP)   += smartreflex.o
 obj-$(CONFIG_ROCKCHIP_IODOMAIN)+= rockchip-io-domain.o
+obj-$(CONFIG_QCOM_CPR) += qcom-cpr.o
diff --git a/drivers/power/avs/qcom-cpr.c b/drivers/power/avs/qcom-cpr.c
new file mode 100644
index ..33552a0274ec
--- /dev/null
+++ b/drivers/power/avs/qcom-cpr.c
@@ -0,0 +1,1777 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2013-2015, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2019, Linaro Limited
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* Register Offsets for RB-CPR and Bit Definitions */
+
+/* RBCPR Version Register */
+#define REG_RBCPR_VERSION  0
+#define RBCPR_VER_20x02
+
+/* RBCPR Gate Count and Target Registers */
+#define REG_RBCPR_GCNT_TARGET(n)   (0x60 + 4 * n)
+
+#define RBCPR_GCNT_TARGET_TARGET_SHIFT 0
+#define RBCPR_GCNT_TARGET_TARGET_MASK  GENMASK(11, 0)
+#define RBCPR_GCNT_TARGET_GCNT_SHIFT   12
+#define RBCPR_GCNT_TARGET_GCNT_MASKGENMASK(9, 0)
+
+/* RBCPR Timer Control */
+#define REG_RBCPR_TIMER_INTERVAL   0x44
+#define REG_RBIF_TIMER_ADJUST  0x4c
+
+#define RBIF_TIMER_ADJ_CONS_UP_MASKGENMASK(3, 0)
+#define RBIF_TIMER_ADJ_CONS_UP_SHIFT   0
+#define RBIF_TIMER_ADJ_CONS_DOWN_MASK  GENMASK(3, 0)
+#define RBIF_TIMER_ADJ_CONS_DOWN_SHIFT 4
+#define RBIF_TIMER_ADJ_CLAMP_INT_MASK  GENMASK(7, 0)
+#define RBIF_TIMER_ADJ_CLAMP_INT_SHIFT 8
+
+/* RBCPR Config Register */
+#define REG_RBIF_LIMIT 0x48
+#define RBIF_LIMIT_CEILING_MASKGENMASK(5, 0)
+#define RBIF_LIMIT_CEILING_SHIFT   6
+#define RBIF_LIMIT_FLOOR_BITS  6
+#define RBIF_LIMIT_FLOOR_MASK  GENMASK(5, 0)
+
+#define RBIF_LIMIT_CEILING_DEFAULT RBIF_LIMIT_CEILING_MASK
+#define RBIF_LIMIT_FLOOR_DEFAULT   0
+
+#define REG_RBIF_SW_VLEVEL 0x94
+#define RBIF_SW_VLEVEL_DEFAULT 0x20
+
+#define REG_RBCPR_STEP_QUOT0x80
+#define RBCPR_STEP_QUOT_STEPQUOT_MASK  GENMASK(7, 0)
+#define RBCPR_STEP_QUOT_IDLE_CLK_MASK  GENMASK(3, 0)
+#define RBCPR_STEP_QUOT_IDLE_CLK_SHIFT 8
+
+/* RBCPR Control Register */
+#define REG_RBCPR_CTL  0x90
+
+#define RBCPR_CTL_LOOP_EN  BIT(0)
+#define RBCPR_CTL_TIMER_EN BIT(3)
+#define RBCPR_CTL_SW_AUTO_CONT_ACK_EN  BIT(5)
+#define RBCPR_CTL_SW_AUTO_CONT_NACK_DN_EN  BIT(6)
+#define RBCPR_CTL_COUNT_MODE   BIT(10)
+#define RBCPR_CTL_UP_THRESHOLD_MASKGENMASK(3, 

[RFC PATCH 9/9] arm64: dts: qcom: qcs404: Add CPR and populate OPP tables

2019-04-03 Thread Niklas Cassel
Co-developed-by: Jorge Ramirez-Ortiz 
Signed-off-by: Jorge Ramirez-Ortiz 
Signed-off-by: Niklas Cassel 
---
 arch/arm64/boot/dts/qcom/qcs404.dtsi | 152 ++-
 1 file changed, 148 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/boot/dts/qcom/qcs404.dtsi 
b/arch/arm64/boot/dts/qcom/qcs404.dtsi
index 5747beb8d55a..3643dae09eb4 100644
--- a/arch/arm64/boot/dts/qcom/qcs404.dtsi
+++ b/arch/arm64/boot/dts/qcom/qcs404.dtsi
@@ -33,6 +33,8 @@
next-level-cache = <_0>;
clocks = <_glb>;
operating-points-v2 = <_opp_table>;
+   power-domains = <>;
+   power-domain-names = "cpr";
};
 
CPU1: cpu@101 {
@@ -43,6 +45,8 @@
next-level-cache = <_0>;
clocks = <_glb>;
operating-points-v2 = <_opp_table>;
+   power-domains = <>;
+   power-domain-names = "cpr";
};
 
CPU2: cpu@102 {
@@ -53,6 +57,8 @@
next-level-cache = <_0>;
clocks = <_glb>;
operating-points-v2 = <_opp_table>;
+   power-domains = <>;
+   power-domain-names = "cpr";
};
 
CPU3: cpu@103 {
@@ -63,6 +69,8 @@
next-level-cache = <_0>;
clocks = <_glb>;
operating-points-v2 = <_opp_table>;
+   power-domains = <>;
+   power-domain-names = "cpr";
};
 
L2_0: l2-cache {
@@ -72,17 +80,17 @@
};
 
cpu_opp_table: cpu_opp_table {
-   compatible = "operating-points-v2";
+   compatible = "operating-points-v2-qcom-cpu";
+   nvmem-cells = <_efuse_speedbin>;
opp-shared;
 
opp-109440 {
opp-hz = /bits/ 64 <109440>;
+   required-opps = <_opp1>;
};
opp-124800 {
opp-hz = /bits/ 64 <124800>;
-   };
-   opp-140160 {
-   opp-hz = /bits/ 64 <140160>;
+   required-opps = <_opp2>;
};
};
 
@@ -411,6 +419,11 @@
assigned-clock-rates = <1920>;
};
 
+   tcsr: syscon@1937000 {
+   compatible = "qcom,tcsr-qcs404", "syscon";
+   reg = <0x1937000 0x25000>;
+   };
+
tcsr_mutex_regs: syscon@1905000 {
compatible = "syscon";
reg = <0x01905000 0x2>;
@@ -812,6 +825,137 @@
status = "disabled";
};
};
+
+   qfprom: qfprom@a4000 {
+   compatible = "qcom,qfprom";
+   reg = <0xa4000 0x1000>;
+   #address-cells = <1>;
+   #size-cells = <1>;
+   cpr_efuse_speedbin: speedbin@13c {
+   reg = <0x13c 0x4>;
+   bits = <2 3>;
+   };
+   cpr_efuse_quot_offset1: qoffset1@231 {
+   reg = <0x231 0x4>;
+   bits = <4 7>;
+   };
+   cpr_efuse_quot_offset2: qoffset2@232 {
+   reg = <0x232 0x4>;
+   bits = <3 7>;
+   };
+   cpr_efuse_quot_offset3: qoffset3@233 {
+   reg = <0x233 0x4>;
+   bits = <2 7>;
+   };
+   cpr_efuse_init_voltage1: ivoltage1@229 {
+   reg = <0x229 0x4>;
+   bits = <4 6>;
+   };
+   cpr_efuse_init_voltage2: ivoltage2@22a {
+   reg = <0x22a 0x4>;
+   bits = <2 6>;
+   };
+   cpr_efuse_init_voltage3: ivoltage3@22b {
+   reg = <0x22b 0x4>;
+   bits = <0 6>;
+   };
+   cpr_efuse_quot1: quot1@22b {
+   reg = <0x22b 0x4>;
+   bits = <6 12>;
+   };
+   cpr_efuse_quot2: quot2@22d {
+   reg = <0x22d 0x4>;
+   bits = <2 12>;
+   };
+   cpr_efuse_quot3: quot3@230 {
+   reg = <0x230 0x4>;
+ 

[RFC PATCH 6/9] dt-bindings: opp: Add qcom-opp bindings with properties needed for CPR

2019-04-03 Thread Niklas Cassel
Add qcom-opp bindings with properties needed for Core Power Reduction (CPR).

CPR is included in a great variety of Qualcomm SoC, e.g. msm8916 and msm8996,
and was first introduced in msm8974.

Co-developed-by: Jorge Ramirez-Ortiz 
Signed-off-by: Jorge Ramirez-Ortiz 
Signed-off-by: Niklas Cassel 
---
 .../devicetree/bindings/opp/qcom-opp.txt  | 24 +++
 1 file changed, 24 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/opp/qcom-opp.txt

diff --git a/Documentation/devicetree/bindings/opp/qcom-opp.txt 
b/Documentation/devicetree/bindings/opp/qcom-opp.txt
new file mode 100644
index ..d24280467db7
--- /dev/null
+++ b/Documentation/devicetree/bindings/opp/qcom-opp.txt
@@ -0,0 +1,24 @@
+Qualcomm OPP bindings to describe OPP nodes
+
+The bindings are based on top of the operating-points-v2 bindings
+described in Documentation/devicetree/bindings/opp/opp.txt
+Additional properties are described below.
+
+* OPP Table Node
+
+Required properties:
+- compatible: Allow OPPs to express their compatibility. It should be:
+  "operating-points-v2-qcom-level"
+
+* OPP Node
+
+Optional properties:
+- opp-hz: Frequency in Hz, expressed as a 64-bit big-endian integer. Even
+  though a power domain doesn't need a opp-hz, there can be devices in the
+  power domain that need to know the highest supported frequency for each
+  corner/level (e.g. CPR), in order to properly initialize the hardware.
+
+- qcom,opp-fuse-level: A positive value representing the fuse corner/level
+  associated with this OPP node. Sometimes several corners/levels share
+  a certain fuse corner/level. A fuse corner/level contains e.g. ref uV,
+  min uV, and max uV.
-- 
2.20.1



[RFC PATCH 4/9] cpufreq: qcom: support qcs404 on nvmem driver

2019-04-03 Thread Niklas Cassel
From: Jorge Ramirez-Ortiz 

Signed-off-by: Jorge Ramirez-Ortiz 
Co-developed-by: Niklas Cassel 
Signed-off-by: Niklas Cassel 
---
 drivers/cpufreq/qcom-cpufreq-nvmem.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c 
b/drivers/cpufreq/qcom-cpufreq-nvmem.c
index 366c65a7132a..7fdc38218390 100644
--- a/drivers/cpufreq/qcom-cpufreq-nvmem.c
+++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -79,6 +80,13 @@ static enum _msm8996_version qcom_cpufreq_get_msm_id(void)
return version;
 }
 
+static int qcom_cpufreq_qcs404_name_version(struct device *cpu_dev,
+   struct nvmem_cell *speedbin_nvmem,
+   struct qcom_cpufreq_drv *drv)
+{
+   return 0;
+}
+
 static int qcom_cpufreq_kryo_name_version(struct device *cpu_dev,
  struct nvmem_cell *speedbin_nvmem,
  struct qcom_cpufreq_drv *drv)
@@ -191,6 +199,14 @@ static int qcom_cpufreq_probe(struct platform_device *pdev)
dev_err(cpu_dev, "Failed to set supported hardware\n");
goto free_opp;
}
+
+   ret = dev_pm_domain_attach(cpu_dev, false);
+   if (ret) {
+   if (ret == -EPROBE_DEFER)
+   goto free_opp;
+   dev_err(cpu_dev, "Could not attach to pm_domain: %d\n",
+   ret);
+   }
}
 
cpufreq_dt_pdev = platform_device_register_simple("cpufreq-dt", -1,
@@ -247,6 +263,8 @@ static const struct of_device_id qcom_cpufreq_match_list[] 
__initconst = {
  .data = qcom_cpufreq_kryo_name_version },
{ .compatible = "qcom,msm8996",
  .data = qcom_cpufreq_kryo_name_version },
+   { .compatible = "qcom,qcs404",
+ .data = qcom_cpufreq_qcs404_name_version },
{},
 };
 
-- 
2.20.1



[RFC PATCH 5/9] cpufreq: Add qcs404 to cpufreq-dt-platdev blacklist

2019-04-03 Thread Niklas Cassel
From: Jorge Ramirez-Ortiz 

Signed-off-by: Jorge Ramirez-Ortiz 
Co-developed-by: Niklas Cassel 
Signed-off-by: Niklas Cassel 
---
 drivers/cpufreq/cpufreq-dt-platdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c 
b/drivers/cpufreq/cpufreq-dt-platdev.c
index 47729a22c159..6c7e42558a38 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -123,6 +123,7 @@ static const struct of_device_id blacklist[] __initconst = {
 
{ .compatible = "qcom,apq8096", },
{ .compatible = "qcom,msm8996", },
+   { .compatible = "qcom,qcs404", },
 
{ .compatible = "st,stih407", },
{ .compatible = "st,stih410", },
-- 
2.20.1



[RFC PATCH 7/9] dt-bindings: power: avs: Add support for CPR (Core Power Reduction)

2019-04-03 Thread Niklas Cassel
Add DT bindings to describe the CPR HW found on certain Qualcomm SoCs.

Co-developed-by: Jorge Ramirez-Ortiz 
Signed-off-by: Jorge Ramirez-Ortiz 
Signed-off-by: Niklas Cassel 
---
 .../bindings/power/avs/qcom,cpr.txt   | 119 ++
 1 file changed, 119 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/power/avs/qcom,cpr.txt

diff --git a/Documentation/devicetree/bindings/power/avs/qcom,cpr.txt 
b/Documentation/devicetree/bindings/power/avs/qcom,cpr.txt
new file mode 100644
index ..541c9b31cd3b
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/avs/qcom,cpr.txt
@@ -0,0 +1,119 @@
+QCOM CPR (Core Power Reduction)
+
+CPR (Core Power Reduction) is a technology to reduce core power on a CPU
+or other device. Each OPP of a device corresponds to a "corner" that has
+a range of valid voltages for a particular frequency. While the device is
+running at a particular frequency, CPR monitors dynamic factors such as
+temperature, etc. and suggests adjustments to the voltage to save power
+and meet silicon characteristic requirements.
+
+- compatible:
+   Usage: required
+   Value type: 
+   Definition: must be "qcom,cpr"
+
+- reg:
+   Usage: required
+   Value type: 
+   Definition: base address and size of the rbcpr register region
+
+- interrupts:
+   Usage: required
+   Value type: 
+   Definition: list of three interrupts in order of irq0, irq1, irq2
+
+- acc-syscon:
+   Usage: optional
+   Value type: 
+   Definition: phandle to syscon for writing ACC settings
+
+- nvmem:
+   Usage: required
+   Value type: 
+   Definition: phandle to nvmem provider containing efuse settings
+
+- nvmem-names:
+   Usage: required
+   Value type: 
+   Definition: must be "qfprom"
+
+- vdd-mx-supply:
+   Usage: required
+   Value type: 
+   Definition: phandle to the vdd-mx supply regulator
+
+- qcom,cpr-ref-clk:
+   Usage: required
+   Value type: 
+   Definition: rate of reference clock in kHz
+
+- qcom,cpr-timer-delay-us:
+   Usage: required
+   Value type: 
+   Definition: delay in uS for the timer interval
+
+- qcom,cpr-timer-cons-up:
+   Usage: required
+   Value type: 
+   Definition: Consecutive number of timer intervals, or units of
+   qcom,cpr-timer-delay-us, that occur before issuing an up
+   interrupt
+
+- qcom,cpr-timer-cons-down:
+   Usage: required
+   Value type: 
+   Definition: Consecutive number of timer intervals, or units of
+   qcom,cpr-timer-delay-us, that occur before issuing a down
+   interrupt
+
+- qcom,cpr-up-threshold:
+   Usage: optional
+   Value type: 
+   Definition: The threshold for CPR to issue interrupt when error_steps
+   is greater than it when stepping up
+
+- qcom,cpr-down-threshold:
+   Usage: optional
+   Value type: 
+   Definition: The threshold for CPR to issue interrupt when error_steps
+   is greater than it when stepping down
+
+- qcom,cpr-idle-clocks:
+   Usage: optional
+   Value type: 
+   Definition: Idle clock cycles ring oscillator can be in
+
+- qcom,cpr-gcnt-us:
+   Usage: required
+   Value type: 
+   Definition: The time for gate count in uS
+
+- qcom,vdd-apc-step-up-limit:
+   Usage: required
+   Value type: 
+   Definition: Limit of vdd-apc-supply steps for scaling up
+
+- qcom,vdd-apc-step-down-limit:
+   Usage: required
+   Value type: 
+   Definition: Limit of vdd-apc-supply steps for scaling down
+
+Example:
+
+   avs@b018000 {
+   compatible = "qcom,cpr";
+   reg = <0xb018000 0x1000>;
+   interrupts = <0 15 1>, <0 16 1>, <0 17 1>;
+   vdd-mx-supply = <_l3>;
+   acc-syscon = <>;
+   nvmem = <>;
+   nvmem-names = "qfprom";
+
+   qcom,cpr-ref-clk = <19200>;
+   qcom,cpr-timer-delay-us = <5000>;
+   qcom,cpr-timer-cons-up = <0>;
+   qcom,cpr-timer-cons-down = <2>;
+   qcom,cpr-up-threshold = <0>;
+   qcom,cpr-down-threshold = <2>;
+   qcom,cpr-idle-clocks = <15>;
+   qcom,cpr-gcnt-us = <1>;
+   qcom,vdd-apc-step-up-limit = <1>;
+   qcom,vdd-apc-step-down-limit = <1>;
+   };
-- 
2.20.1



[RFC PATCH 3/9] cpufreq: qcom: create a driver struct

2019-04-03 Thread Niklas Cassel
create a driver struct to make it easier to free up all common
resources, and only call dev_pm_opp_set_supported_hw() if the
implementation has dynamically allocated versions.

Co-developed-by: Jorge Ramirez-Ortiz 
Signed-off-by: Jorge Ramirez-Ortiz 
Signed-off-by: Niklas Cassel 
---
 drivers/cpufreq/qcom-cpufreq-nvmem.c | 69 ++--
 1 file changed, 46 insertions(+), 23 deletions(-)

diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c 
b/drivers/cpufreq/qcom-cpufreq-nvmem.c
index 652a1de2a5d4..366c65a7132a 100644
--- a/drivers/cpufreq/qcom-cpufreq-nvmem.c
+++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c
@@ -43,6 +43,11 @@ enum _msm8996_version {
NUM_OF_MSM8996_VERSIONS,
 };
 
+struct qcom_cpufreq_drv {
+   struct opp_table **opp_tables;
+   u32 *versions;
+};
+
 static struct platform_device *cpufreq_dt_pdev, *cpufreq_pdev;
 
 static enum _msm8996_version qcom_cpufreq_get_msm_id(void)
@@ -76,12 +81,16 @@ static enum _msm8996_version qcom_cpufreq_get_msm_id(void)
 
 static int qcom_cpufreq_kryo_name_version(struct device *cpu_dev,
  struct nvmem_cell *speedbin_nvmem,
- u32 *versions)
+ struct qcom_cpufreq_drv *drv)
 {
size_t len;
u8 *speedbin;
enum _msm8996_version msm8996_version;
 
+   drv->versions = kzalloc(sizeof(*drv->versions), GFP_KERNEL);
+   if (!drv->versions)
+   return -ENOMEM;
+
msm8996_version = qcom_cpufreq_get_msm_id();
if (NUM_OF_MSM8996_VERSIONS == msm8996_version) {
dev_err(cpu_dev, "Not Snapdragon 820/821!");
@@ -94,10 +103,10 @@ static int qcom_cpufreq_kryo_name_version(struct device 
*cpu_dev,
 
switch (msm8996_version) {
case MSM8996_V3:
-   *versions = 1 << (unsigned int)(*speedbin);
+   *drv->versions = 1 << (unsigned int)(*speedbin);
break;
case MSM8996_SG:
-   *versions = 1 << ((unsigned int)(*speedbin) + 4);
+   *drv->versions = 1 << ((unsigned int)(*speedbin) + 4);
break;
default:
BUG();
@@ -110,15 +119,14 @@ static int qcom_cpufreq_kryo_name_version(struct device 
*cpu_dev,
 
 static int qcom_cpufreq_probe(struct platform_device *pdev)
 {
-   struct opp_table **opp_tables;
+   struct qcom_cpufreq_drv *drv;
int (*get_version)(struct device *cpu_dev,
   struct nvmem_cell *speedbin_nvmem,
-  u32 *versions);
+  struct qcom_cpufreq_drv *drv);
struct nvmem_cell *speedbin_nvmem;
struct device_node *np;
struct device *cpu_dev;
unsigned cpu;
-   u32 versions;
const struct of_device_id *match;
int ret;
 
@@ -141,23 +149,31 @@ static int qcom_cpufreq_probe(struct platform_device 
*pdev)
return -ENOENT;
}
 
+   drv = kzalloc(sizeof(*drv), GFP_KERNEL);
+   if (!drv)
+   return -ENOMEM;
+
speedbin_nvmem = of_nvmem_cell_get(np, NULL);
of_node_put(np);
if (IS_ERR(speedbin_nvmem)) {
if (PTR_ERR(speedbin_nvmem) != -EPROBE_DEFER)
dev_err(cpu_dev, "Could not get nvmem cell: %ld\n",
PTR_ERR(speedbin_nvmem));
-   return PTR_ERR(speedbin_nvmem);
+   ret = PTR_ERR(speedbin_nvmem);
+   goto free_drv;
}
 
-   ret = get_version(cpu_dev, speedbin_nvmem, );
+   ret = get_version(cpu_dev, speedbin_nvmem, drv);
nvmem_cell_put(speedbin_nvmem);
if (ret)
-   return ret;
+   goto free_drv;
 
-   opp_tables = kcalloc(num_possible_cpus(), sizeof(*opp_tables), 
GFP_KERNEL);
-   if (!opp_tables)
-   return -ENOMEM;
+   drv->opp_tables = kcalloc(num_possible_cpus(), sizeof(*drv->opp_tables),
+ GFP_KERNEL);
+   if (!drv->opp_tables) {
+   ret = -ENOMEM;
+   goto free_drv;
+   }
 
for_each_possible_cpu(cpu) {
cpu_dev = get_cpu_device(cpu);
@@ -166,10 +182,12 @@ static int qcom_cpufreq_probe(struct platform_device 
*pdev)
goto free_opp;
}
 
-   opp_tables[cpu] = dev_pm_opp_set_supported_hw(cpu_dev,
- , 1);
-   if (IS_ERR(opp_tables[cpu])) {
-   ret = PTR_ERR(opp_tables[cpu]);
+   if (drv->versions)
+   drv->opp_tables[cpu] =
+   dev_pm_opp_set_supported_hw(cpu_dev,
+   drv->versions, 1);
+   if (IS_ERR(drv->opp_tables[cpu])) {
+   ret = PTR_ERR(drv->opp_tables[cpu]);
dev_err(cpu_dev, 

[RFC PATCH 2/9] cpufreq: qcom: Re-organise kryo cpufreq to use it for other nvmem based qcom socs

2019-04-03 Thread Niklas Cassel
From: Sricharan R 

The kryo cpufreq driver reads the nvmem cell and uses that data to
populate the opps. There are other qcom cpufreq socs like krait which
does similar thing. Except for the interpretation of the read data,
rest of the driver is same for both the cases. So pull the common things
out for reuse.

Signed-off-by: Sricharan R 
Signed-off-by: Niklas Cassel 
---
 ...ryo-cpufreq.txt => qcom-nvmem-cpufreq.txt} |  16 +--
 drivers/cpufreq/Kconfig.arm   |   4 +-
 drivers/cpufreq/Makefile  |   2 +-
 ...om-cpufreq-kryo.c => qcom-cpufreq-nvmem.c} | 124 +++---
 4 files changed, 85 insertions(+), 61 deletions(-)
 rename Documentation/devicetree/bindings/opp/{kryo-cpufreq.txt => 
qcom-nvmem-cpufreq.txt} (97%)
 rename drivers/cpufreq/{qcom-cpufreq-kryo.c => qcom-cpufreq-nvmem.c} (69%)

diff --git a/Documentation/devicetree/bindings/opp/kryo-cpufreq.txt 
b/Documentation/devicetree/bindings/opp/qcom-nvmem-cpufreq.txt
similarity index 97%
rename from Documentation/devicetree/bindings/opp/kryo-cpufreq.txt
rename to Documentation/devicetree/bindings/opp/qcom-nvmem-cpufreq.txt
index c2127b96805a..f4a7123730c3 100644
--- a/Documentation/devicetree/bindings/opp/kryo-cpufreq.txt
+++ b/Documentation/devicetree/bindings/opp/qcom-nvmem-cpufreq.txt
@@ -1,13 +1,13 @@
-Qualcomm Technologies, Inc. KRYO CPUFreq and OPP bindings
+Qualcomm Technologies, Inc. NVMEM CPUFreq and OPP bindings
 ===
 
-In Certain Qualcomm Technologies, Inc. SoCs like apq8096 and msm8996
-that have KRYO processors, the CPU ferequencies subset and voltage value
-of each OPP varies based on the silicon variant in use.
+In Certain Qualcomm Technologies, Inc. SoCs like apq8096 and msm8996,
+the CPU frequencies subset and voltage value of each OPP varies based on
+the silicon variant in use.
 Qualcomm Technologies, Inc. Process Voltage Scaling Tables
 defines the voltage and frequency value based on the msm-id in SMEM
 and speedbin blown in the efuse combination.
-The qcom-cpufreq-kryo driver reads the msm-id and efuse value from the SoC
+The qcom-cpufreq-nvmem driver reads the msm-id and efuse value from the SoC
 to provide the OPP framework with required information (existing HW bitmap).
 This is used to determine the voltage and frequency value for each OPP of
 operating-points-v2 table when it is parsed by the OPP framework.
@@ -19,7 +19,7 @@ In 'cpus' nodes:
 
 In 'operating-points-v2' table:
 - compatible: Should be
-   - 'operating-points-v2-kryo-cpu' for apq8096 and msm8996.
+   - 'operating-points-v2-qcom-cpu' for apq8096 and msm8996.
 - nvmem-cells: A phandle pointing to a nvmem-cells node representing the
efuse registers that has information about the
speedbin that is used to select the right frequency/voltage
@@ -127,7 +127,7 @@ Example 1:
};
 
cluster0_opp: opp_table0 {
-   compatible = "operating-points-v2-kryo-cpu";
+   compatible = "operating-points-v2-qcom-cpu";
nvmem-cells = <_efuse>;
opp-shared;
 
@@ -338,7 +338,7 @@ Example 1:
};
 
cluster1_opp: opp_table1 {
-   compatible = "operating-points-v2-kryo-cpu";
+   compatible = "operating-points-v2-qcom-cpu";
nvmem-cells = <_efuse>;
opp-shared;
 
diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index 179a1d302f48..2e4aefa0f34d 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -110,8 +110,8 @@ config ARM_OMAP2PLUS_CPUFREQ
depends on ARCH_OMAP2PLUS
default ARCH_OMAP2PLUS
 
-config ARM_QCOM_CPUFREQ_KRYO
-   tristate "Qualcomm Kryo based CPUFreq"
+config ARM_QCOM_CPUFREQ_NVMEM
+   tristate "Qualcomm nvmem based CPUFreq"
depends on ARM64
depends on QCOM_QFPROM
depends on QCOM_SMEM
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 689b26c6f949..8e83fd73bd2d 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -63,7 +63,7 @@ obj-$(CONFIG_ARM_OMAP2PLUS_CPUFREQ)   += omap-cpufreq.o
 obj-$(CONFIG_ARM_PXA2xx_CPUFREQ)   += pxa2xx-cpufreq.o
 obj-$(CONFIG_PXA3xx)   += pxa3xx-cpufreq.o
 obj-$(CONFIG_ARM_QCOM_CPUFREQ_HW)  += qcom-cpufreq-hw.o
-obj-$(CONFIG_ARM_QCOM_CPUFREQ_KRYO)+= qcom-cpufreq-kryo.o
+obj-$(CONFIG_ARM_QCOM_CPUFREQ_NVMEM)   += qcom-cpufreq-nvmem.o
 obj-$(CONFIG_ARM_S3C2410_CPUFREQ)  += s3c2410-cpufreq.o
 obj-$(CONFIG_ARM_S3C2412_CPUFREQ)  += s3c2412-cpufreq.o
 obj-$(CONFIG_ARM_S3C2416_CPUFREQ)  += s3c2416-cpufreq.o
diff --git a/drivers/cpufreq/qcom-cpufreq-kryo.c 
b/drivers/cpufreq/qcom-cpufreq-nvmem.c
similarity index 69%
rename from drivers/cpufreq/qcom-cpufreq-kryo.c
rename to drivers/cpufreq/qcom-cpufreq-nvmem.c
index dd64dcf89c74..652a1de2a5d4 100644
--- a/drivers/cpufreq/qcom-cpufreq-kryo.c
+++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c
@@ -9,7 +9,7 @@
 

[RFC PATCH 0/9] Add support for QCOM Core Power Reduction

2019-04-03 Thread Niklas Cassel
This is a first RFC for Core Power Reduction (CPR), a form of
Adaptive Voltage Scaling (AVS), found on certain Qualcomm SoCs.

Since this is simply an RFC, things like MAINTAINERS haven't
been updated yet.

CPR is a technology that reduces core power on a CPU or on other device.
It reads voltage settings from efuses (that have been written in production),
it uses these voltage settings as initial values, for each OPP.

After moving to a certain OPP, CPR monitors dynamic factors such as
temperature, etc. and adjusts the voltage for that frequency accordingly
to save power and meet silicon characteristic requirements.

This driver is based on an RFC by Stephen Boyd[1], which in turn is
based on work by others on codeaurora.org[2].

[1] https://lkml.org/lkml/2015/9/18/833
[2] 
https://www.codeaurora.org/cgit/quic/la/kernel/msm-3.10/tree/drivers/regulator/cpr-regulator.c?h=msm-3.10


Jorge Ramirez-Ortiz (3):
  drivers: regulator: qcom_spmi: enable linear range info
  cpufreq: qcom: support qcs404 on nvmem driver
  cpufreq: Add qcs404 to cpufreq-dt-platdev blacklist

Niklas Cassel (5):
  cpufreq: qcom: create a driver struct
  dt-bindings: opp: Add qcom-opp bindings with properties needed for CPR
  dt-bindings: power: avs: Add support for CPR (Core Power Reduction)
  power: avs: Add support for CPR (Core Power Reduction)
  arm64: dts: qcom: qcs404: Add CPR and populate OPP tables

Sricharan R (1):
  cpufreq: qcom: Re-organise kryo cpufreq to use it for other nvmem
based qcom socs

 ...ryo-cpufreq.txt => qcom-nvmem-cpufreq.txt} |   16 +-
 .../devicetree/bindings/opp/qcom-opp.txt  |   24 +
 .../bindings/power/avs/qcom,cpr.txt   |  119 ++
 arch/arm64/boot/dts/qcom/qcs404.dtsi  |  152 +-
 drivers/cpufreq/Kconfig.arm   |4 +-
 drivers/cpufreq/Makefile  |2 +-
 drivers/cpufreq/cpufreq-dt-platdev.c  |1 +
 ...om-cpufreq-kryo.c => qcom-cpufreq-nvmem.c} |  197 +-
 drivers/power/avs/Kconfig |   15 +
 drivers/power/avs/Makefile|1 +
 drivers/power/avs/qcom-cpr.c  | 1777 +
 drivers/regulator/qcom_spmi-regulator.c   |7 +
 12 files changed, 2234 insertions(+), 81 deletions(-)
 rename Documentation/devicetree/bindings/opp/{kryo-cpufreq.txt => 
qcom-nvmem-cpufreq.txt} (97%)
 create mode 100644 Documentation/devicetree/bindings/opp/qcom-opp.txt
 create mode 100644 Documentation/devicetree/bindings/power/avs/qcom,cpr.txt
 rename drivers/cpufreq/{qcom-cpufreq-kryo.c => qcom-cpufreq-nvmem.c} (52%)
 create mode 100644 drivers/power/avs/qcom-cpr.c

-- 
2.20.1



[RFC PATCH 1/9] drivers: regulator: qcom_spmi: enable linear range info

2019-04-03 Thread Niklas Cassel
From: Jorge Ramirez-Ortiz 

Signed-off-by: Jorge Ramirez-Ortiz 
---
 drivers/regulator/qcom_spmi-regulator.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/regulator/qcom_spmi-regulator.c 
b/drivers/regulator/qcom_spmi-regulator.c
index 3193506eac6f..f2edf510b0df 100644
--- a/drivers/regulator/qcom_spmi-regulator.c
+++ b/drivers/regulator/qcom_spmi-regulator.c
@@ -1907,6 +1907,7 @@ MODULE_DEVICE_TABLE(of, qcom_spmi_regulator_match);
 static int qcom_spmi_regulator_probe(struct platform_device *pdev)
 {
const struct spmi_regulator_data *reg;
+   const struct spmi_voltage_range *range;
const struct of_device_id *match;
struct regulator_config config = { };
struct regulator_dev *rdev;
@@ -1996,6 +1997,12 @@ static int qcom_spmi_regulator_probe(struct 
platform_device *pdev)
}
}
 
+   if (vreg->logical_type == SPMI_REGULATOR_LOGICAL_TYPE_HFS430) {
+   /* since there is only one range */
+   range = spmi_regulator_find_range(vreg);
+   vreg->desc.uV_step = range->step_uV;
+   }
+
config.dev = dev;
config.driver_data = vreg;
config.regmap = regmap;
-- 
2.20.1



Re: [PATCH v2 3/5] locking/qspinlock: Introduce CNA into the slow path of qspinlock

2019-04-03 Thread Juergen Gross
On 03/04/2019 18:01, Peter Zijlstra wrote:
> On Wed, Apr 03, 2019 at 11:39:09AM -0400, Alex Kogan wrote:
> 
 The patch that I am looking for is to have a separate
 numa_queued_spinlock_slowpath() that coexists with
 native_queued_spinlock_slowpath() and
 paravirt_queued_spinlock_slowpath(). At boot time, we select the most
 appropriate one for the system at hand.
>> Is this how this selection works today for paravirt?
>> I see a PARAVIRT_SPINLOCKS config option, but IIUC you are talking about a 
>> different mechanism here.
>> Can you, please, elaborate or give me a link to a page that explains that?
> 
> Oh man, you ask us to explain how paravirt patching works... that's
> magic :-)
> 
> Basically, the compiler will emit a bunch of indirect calls to the
> various pv_ops.*.* functions.
> 
> Then, at alternative_instructions() <- apply_paravirt() it will rewrite
> all these indirect calls to direct calls to the function pointers that
> are in the pv_ops structure at that time (+- more magic).
> 
> So we initialize the pv_ops.lock.* methods to the normal
> native_queued_spin*() stuff, if KVM/Xen/whatever setup detectors pv
> spnlock support changes the methods to the paravirt_queued_*() stuff.
> 
> If you wnt more details, you'll just have to read
> arch/x86/include/asm/paravirt*.h and arch/x86/kernel/paravirt*.c, I
> don't think there's a coherent writeup of all that.
> 
>>> Agreed; and until we have static_call, I think we can abuse the paravirt
>>> stuff for this.
>>>
>>> By the time we patch the paravirt stuff:
>>>
>>>  check_bugs()
>>>alternative_instructions()
>>>  apply_paravirt()
>>>
>>> we should already have enumerated the NODE topology and so nr_node_ids()
>>> should be set.
>>>
>>> So if we frob pv_ops.lock.queued_spin_lock_slowpath to
>>> numa_queued_spin_lock_slowpath before that, it should all get patched
>>> just right.
>>>
>>> That of course means the whole NUMA_AWARE_SPINLOCKS thing depends on
>>> PARAVIRT_SPINLOCK, which is a bit awkward…
> 
>> Just to mention here, the patch so far does not address paravirt, but
>> our goal is to add this support once we address all the concerns for
>> the native version.  So we will end up with four variants for the
>> queued_spinlock_slowpath() — one for each combination of
>> native/paravirt and NUMA/non-NUMA.  Or perhaps we do not need a
>> NUMA/paravirt variant?
> 
> I wouldn't bother with a pv version of the numa aware code at all. If
> you have overcommitted guests, topology is likely irrelevant anyway. If
> you have 1:1 pinned guests, they'll not use pv spinlocks anyway.
> 
> So keep it to tertiary choice:
> 
>  - native
>  - native/numa
>  - paravirt

Just for the records: the paravirt variant could easily choose whether
it wants to include a numa version just by using the existing hooks.
With PARAVIRT_SPINLOCK configured I guess even the native case would
need to use the paravirt hooks for selection of native or native/numa.

Without PARAVIRT_SPINLOCK this would be just an alternative() then?

Maybe the resulting code would be much more readable if we'd just
make PARAVIRT_SPINLOCK usable without the other PARAVIRT hooks? So
splitting up PARAVIRT into PARAVIRT_GUEST (timer hooks et al) and
the patching infrastructure, with PARAVIRT_GUEST and PARAVIRT_SPINLOCK
selecting PARAVIRT, and PARAVIRT_XXL selecting PARAVIRT_GUEST.


Juergen


Re: [PATCH 6/6] arm64/mm: Enable ZONE_DEVICE

2019-04-03 Thread Dan Williams
On Wed, Apr 3, 2019 at 9:42 PM Anshuman Khandual
 wrote:
>
>
>
> On 04/03/2019 07:28 PM, Robin Murphy wrote:
> > [ +Dan, Jerome ]
> >
> > On 03/04/2019 05:30, Anshuman Khandual wrote:
> >> Arch implementation for functions which create or destroy vmemmap mapping
> >> (vmemmap_populate, vmemmap_free) can comprehend and allocate from inside
> >> device memory range through driver provided vmem_altmap structure which
> >> fulfils all requirements to enable ZONE_DEVICE on the platform. Hence just
> >
> > ZONE_DEVICE is about more than just altmap support, no?
>
> Hot plugging the memory into a dev->numa_node's ZONE_DEVICE and initializing 
> the
> struct pages for it has stand alone and self contained use case. The driver 
> could
> just want to manage the memory itself but with struct pages either in the RAM 
> or
> in the device memory range through struct vmem_altmap. The driver may not 
> choose
> to opt for HMM, FS DAX, P2PDMA (use cases of ZONE_DEVICE) where it may have to
> map these pages into any user pagetable which would necessitate support for
> pte|pmd|pud_devmap.

What's left for ZONE_DEVICE if none of the above cases are used?

> Though I am still working towards getting HMM, FS DAX, P2PDMA enabled on 
> arm64,
> IMHO ZONE_DEVICE is self contained and can be evaluated in itself.

I'm not convinced. What's the specific use case.

>
> >
> >> enable ZONE_DEVICE by subscribing to ARCH_HAS_ZONE_DEVICE. But this is only
> >> applicable for ARM64_4K_PAGES (ARM64_SWAPPER_USES_SECTION_MAPS) only which
> >> creates vmemmap section mappings and utilize vmem_altmap structure.
> >
> > What prevents it from working with other page sizes? One of the foremost 
> > use-cases for our 52-bit VA/PA support is to enable mapping large 
> > quantities of persistent memory, so we really do need this for 64K pages 
> > too. FWIW, it appears not to be an issue for PowerPC.
>
>
> On !ARM64_4K_PAGES vmemmap_populate() calls vmemmap_populate_basepages() which
> does not support struct vmem_altmap right now. Originally was planning to send
> the vmemmap_populate_basepages() enablement patches separately but will post 
> it
> here for review.
>
> >
> >> Signed-off-by: Anshuman Khandual 
> >> ---
> >>   arch/arm64/Kconfig | 1 +
> >>   1 file changed, 1 insertion(+)
> >>
> >> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> >> index db3e625..b5d8cf5 100644
> >> --- a/arch/arm64/Kconfig
> >> +++ b/arch/arm64/Kconfig
> >> @@ -31,6 +31,7 @@ config ARM64
> >>   select ARCH_HAS_SYSCALL_WRAPPER
> >>   select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT
> >>   select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
> >> +select ARCH_HAS_ZONE_DEVICE if ARM64_4K_PAGES
> >
> > IIRC certain configurations (HMM?) don't even build if you just turn this 
> > on alone (although of course things may have changed elsewhere in the 
> > meantime) - crucially, though, from previous discussions[1] it seems 
> > fundamentally unsafe, since I don't think we can guarantee that nobody will 
> > touch the corners of ZONE_DEVICE that also require pte_devmap in order not 
> > to go subtly wrong. I did get as far as cooking up some patches to sort 
> > that out [2][3] which I never got round to posting for their own sake, so 
> > please consider picking those up as part of this series.
>
> In the previous discussion mentioned here [1] it sort of indicates that we
> cannot have a viable (ARCH_HAS_ZONE_DEVICE=y but !__HAVE_ARCH_PTE_DEVMAP). I
> dont understand why !

Because ZONE_DEVICE was specifically invented to solve get_user_pages() for DAX.

> The driver can just hotplug the range into ZONE_DEVICE,
> manage the memory itself without mapping them to user page table ever.

Then why do you even need 'struct page' objects?

> IIUC
> ZONE_DEVICE must not need user mapped device PFN support.

No, you don't understand correctly, or I don't understand how you plan
to use ZONE_DEVICE outside it's intended use case.

> All the corner case
> problems discussed previously come in once these new 'device PFN' memory which
> is now in ZONE_DEVICE get mapped into user page table.
>
> >
> > Robin.
> >
> >>   select ARCH_HAVE_NMI_SAFE_CMPXCHG
> >>   select ARCH_INLINE_READ_LOCK if !PREEMPT
> >>   select ARCH_INLINE_READ_LOCK_BH if !PREEMPT
> >>
> >
> >
> > [1] 
> > https://lore.kernel.org/linux-mm/caa9_cmfa9gs+1m1asyv1ty5jky3iho3cerhnraruwjw3pfm...@mail.gmail.com/#t
> > [2] 
> > http://linux-arm.org/git?p=linux-rm.git;a=commitdiff;h=61816b833afdb56b49c2e58f5289ae18809e5d67
> > [3] 
> > http://linux-arm.org/git?p=linux-rm.git;a=commitdiff;h=a5a16560eb1becf9a1d4cc0d03d6b5e76da4f4e1
> > (apologies to anyone if the linux-arm.org server is being flaky as usual 
> > and requires a few tries to respond properly)
>
> I have not evaluated pte_devmap(). Will consider [3] when enabling it. But
> I still dont understand why ZONE_DEVICE can not be enabled and used from a
> driver which never requires user mapping or pte|pmd|pud_devmap() 

Question on a lockdep test case about mixed read-write ABBA

2019-04-03 Thread Yuyang Du
Hi Peter,

I observed this test case you wrote in Commit: e9149858974606
("locking/lockdep/selftests: Add mixed read-write ABBA").

static void rwsem_ABBA2(void)
{
   RSL(X1);
   ML(Y1);
   MU(Y1);
   RSU(X1);

   ML(Y1);
   RSL(X1);
   RSU(X1);
   MU(Y1); // should fail
}

Why should it fail? This is not a deadlock, right? The dependencies
would be built by lockdep though; that results in a false positive.

Thanks,
Yuyang


Re: [PATCH 6/6] arm64/mm: Enable ZONE_DEVICE

2019-04-03 Thread Anshuman Khandual



On 04/03/2019 09:37 PM, Jerome Glisse wrote:
> On Wed, Apr 03, 2019 at 02:58:28PM +0100, Robin Murphy wrote:
>> [ +Dan, Jerome ]
>>
>> On 03/04/2019 05:30, Anshuman Khandual wrote:
>>> Arch implementation for functions which create or destroy vmemmap mapping
>>> (vmemmap_populate, vmemmap_free) can comprehend and allocate from inside
>>> device memory range through driver provided vmem_altmap structure which
>>> fulfils all requirements to enable ZONE_DEVICE on the platform. Hence just
>>
>> ZONE_DEVICE is about more than just altmap support, no?
>>
>>> enable ZONE_DEVICE by subscribing to ARCH_HAS_ZONE_DEVICE. But this is only
>>> applicable for ARM64_4K_PAGES (ARM64_SWAPPER_USES_SECTION_MAPS) only which
>>> creates vmemmap section mappings and utilize vmem_altmap structure.
>>
>> What prevents it from working with other page sizes? One of the foremost
>> use-cases for our 52-bit VA/PA support is to enable mapping large quantities
>> of persistent memory, so we really do need this for 64K pages too. FWIW, it
>> appears not to be an issue for PowerPC.
>>
>>> Signed-off-by: Anshuman Khandual 
>>> ---
>>>   arch/arm64/Kconfig | 1 +
>>>   1 file changed, 1 insertion(+)
>>>
>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>> index db3e625..b5d8cf5 100644
>>> --- a/arch/arm64/Kconfig
>>> +++ b/arch/arm64/Kconfig
>>> @@ -31,6 +31,7 @@ config ARM64
>>> select ARCH_HAS_SYSCALL_WRAPPER
>>> select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT
>>> select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
>>> +   select ARCH_HAS_ZONE_DEVICE if ARM64_4K_PAGES
>>
>> IIRC certain configurations (HMM?) don't even build if you just turn this on
>> alone (although of course things may have changed elsewhere in the meantime)
>> - crucially, though, from previous discussions[1] it seems fundamentally
>> unsafe, since I don't think we can guarantee that nobody will touch the
>> corners of ZONE_DEVICE that also require pte_devmap in order not to go
>> subtly wrong. I did get as far as cooking up some patches to sort that out
>> [2][3] which I never got round to posting for their own sake, so please
>> consider picking those up as part of this series.
> 
> Correct _do not_ enable ZONE_DEVICE without support for pte_devmap detection.

Driver managed ZONE_DEVICE memory which never maps into user page table is not
a valid use case for ZONE_DEVICE ? Also what about MEMORY_DEVICE_PRIVATE ? That
can never be mapped into user page table. A driver can still manage these non
coherent memory through it's struct pages (which will be allocated inside RAM)

> If you want some feature of ZONE_DEVICE. Like HMM as while DAX does require
> pte_devmap, HMM device private does not. So you would first have to split
> ZONE_DEVICE into more sub-features kconfig option.

CONFIG_ZONE_DEVICE does not do that already ! All it says is that a device
memory range can be plugged into ZONE_DEVICE either as PRIVATE (non-coherent)
or PUBLIC/PCI_P2PDMA (coherent) memory without mandating anything about how
these memory will be further used.

> 
> What is the end use case you are looking for ? Persistent memory ?

Persistent memory is one of the primary use cases.


Re: [PATCH v3 04/20] coresight: etm4x: Configure tracers to emit timestamps

2019-04-03 Thread Mike Leach
Hi,

On Thu, 4 Apr 2019 at 04:36, Mathieu Poirier  wrote:
>
> Configure timestamps to be emitted at regular intervals in the trace
> stream to temporally correlate instructions executed on different CPUs.
>
> Signed-off-by: Mathieu Poirier 
> ---
>  drivers/hwtracing/coresight/coresight-etm4x.c | 101 +-
>  1 file changed, 100 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c 
> b/drivers/hwtracing/coresight/coresight-etm4x.c
> index d64192c29860..46d337fd8442 100644
> --- a/drivers/hwtracing/coresight/coresight-etm4x.c
> +++ b/drivers/hwtracing/coresight/coresight-etm4x.c
> @@ -204,6 +204,90 @@ static void etm4_enable_hw_smp_call(void *info)
> arg->rc = etm4_enable_hw(arg->drvdata);
>  }
>
> +/*
> + * The goal of function etm4_config_timestamp_event() is to configure a
> + * counter that will tell the tracer to emit a timestamp packet when it
> + * reaches zero.  This is done in order to get a more fine grained idea
> + * of when instructions are executed so that they can be correlated
> + * with execution on other CPUs.
> + *
> + * To do this the counter itself is configured to self reload and
> + * TRCRSCTLR1 (always true) used to get the counter to decrement.  From
> + * there a resource selector is configured with the counter and the
> + * timestamp control register to use the resource selector to trigger the
> + * event that will insert a timestamp packet in the stream.
> + */
> +static int etm4_config_timestamp_event(struct etmv4_drvdata *drvdata)
> +{
> +   int ctridx, ret = -EINVAL;
> +   int counter, rselector;
> +   u32 val = 0;
> +   struct etmv4_config *config = >config;
> +
> +   /* No point in trying if we don't have at least one counter */
> +   if (!drvdata->nr_cntr)
> +   goto out;
> +
> +   /* Find a counter that hasn't been initialised */
> +   for (ctridx = 0; ctridx < drvdata->nr_cntr; ctridx++)
> +   if (config->cntr_val[ctridx] == 0)
> +   break;
> +
> +   /* All the counters have been configured already, bail out */
> +   if (ctridx == drvdata->nr_cntr) {
> +   pr_debug("%s: no available counter found\n", __func__);
> +   ret = -ENOSPC;
> +   goto out;
> +   }
> +
> +   /*
> +* Searching for an available resource selector to use, starting at
> +* '2' since every implementation has at least 2 resource selector.
> +* ETMIDR4 gives the number of resource selector _pairs_,
> +* hence multiply by 2.
> +*/
> +   for (rselector = 2; rselector < drvdata->nr_resource * 2; rselector++)
> +   if (!config->res_ctrl[rselector])
> +   break;
> +
> +   if (rselector == drvdata->nr_resource * 2) {
> +   pr_debug("%s: no available resource selector found\n", 
> __func__);
> +   ret = -ENOSPC;
> +   goto out;
> +   }
> +
> +   /* Remember what counter we used */
> +   counter = 1 << ctridx;
> +
> +   /*
> +* Initialise original and reload counter value to the smallest
> +* possible value in order to get as much precision as we can.
> +*/
> +   config->cntr_val[ctridx] = 1;
> +   config->cntrldvr[ctridx] = 1;
> +
> +   /* Set the trace counter control register */
> +   val =  0x1 << 16|  /* Bit 16, reload counter automatically */
> +  0x0 << 7 |  /* Select single resource selector */
> +  0x1;/* Resource selector 1, i.e always true */
> +
> +   config->cntr_ctrl[ctridx] = val;
> +
> +   val = 0x2 << 16 | /* Group 0b0010 - Counter and sequencers */
> + counter << 0;   /* Counter to use */
> +
> +   config->res_ctrl[rselector] = val;
> +
> +   val = 0x0 << 7  | /* Select single resource selector */
> + rselector;  /* Resource selector */
> +
> +   config->ts_ctrl = val;
> +
> +   ret = 0;
> +out:
> +   return ret;
> +}
> +
>  static int etm4_parse_event_config(struct etmv4_drvdata *drvdata,
>struct perf_event *event)
>  {
> @@ -239,9 +323,24 @@ static int etm4_parse_event_config(struct etmv4_drvdata 
> *drvdata,
> /* TRM: Must program this for cycacc to work */
> config->ccctlr = ETM_CYC_THRESHOLD_DEFAULT;
> }
> -   if (attr->config & BIT(ETM_OPT_TS))
> +   if (attr->config & BIT(ETM_OPT_TS)) {
> +   /*
> +* Configure timestamps to be emitted at regular intervals in
> +* order to correlate instructions executed on different CPUs
> +* (CPU-wide trace scenarios).
> +*/
> +   ret = etm4_config_timestamp_event(drvdata);
> +
> +   /*
> +* No need to go further if timestamp intervals can't
> +* be 

Re: [PATCH 6/6] arm64/mm: Enable ZONE_DEVICE

2019-04-03 Thread Anshuman Khandual



On 04/03/2019 07:28 PM, Robin Murphy wrote:
> [ +Dan, Jerome ]
> 
> On 03/04/2019 05:30, Anshuman Khandual wrote:
>> Arch implementation for functions which create or destroy vmemmap mapping
>> (vmemmap_populate, vmemmap_free) can comprehend and allocate from inside
>> device memory range through driver provided vmem_altmap structure which
>> fulfils all requirements to enable ZONE_DEVICE on the platform. Hence just
> 
> ZONE_DEVICE is about more than just altmap support, no?

Hot plugging the memory into a dev->numa_node's ZONE_DEVICE and initializing the
struct pages for it has stand alone and self contained use case. The driver 
could
just want to manage the memory itself but with struct pages either in the RAM or
in the device memory range through struct vmem_altmap. The driver may not choose
to opt for HMM, FS DAX, P2PDMA (use cases of ZONE_DEVICE) where it may have to
map these pages into any user pagetable which would necessitate support for
pte|pmd|pud_devmap.

Though I am still working towards getting HMM, FS DAX, P2PDMA enabled on arm64,
IMHO ZONE_DEVICE is self contained and can be evaluated in itself.

> 
>> enable ZONE_DEVICE by subscribing to ARCH_HAS_ZONE_DEVICE. But this is only
>> applicable for ARM64_4K_PAGES (ARM64_SWAPPER_USES_SECTION_MAPS) only which
>> creates vmemmap section mappings and utilize vmem_altmap structure.
> 
> What prevents it from working with other page sizes? One of the foremost 
> use-cases for our 52-bit VA/PA support is to enable mapping large quantities 
> of persistent memory, so we really do need this for 64K pages too. FWIW, it 
> appears not to be an issue for PowerPC.


On !ARM64_4K_PAGES vmemmap_populate() calls vmemmap_populate_basepages() which
does not support struct vmem_altmap right now. Originally was planning to send
the vmemmap_populate_basepages() enablement patches separately but will post it
here for review.

> 
>> Signed-off-by: Anshuman Khandual 
>> ---
>>   arch/arm64/Kconfig | 1 +
>>   1 file changed, 1 insertion(+)
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index db3e625..b5d8cf5 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -31,6 +31,7 @@ config ARM64
>>   select ARCH_HAS_SYSCALL_WRAPPER
>>   select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT
>>   select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
>> +    select ARCH_HAS_ZONE_DEVICE if ARM64_4K_PAGES
> 
> IIRC certain configurations (HMM?) don't even build if you just turn this on 
> alone (although of course things may have changed elsewhere in the meantime) 
> - crucially, though, from previous discussions[1] it seems fundamentally 
> unsafe, since I don't think we can guarantee that nobody will touch the 
> corners of ZONE_DEVICE that also require pte_devmap in order not to go subtly 
> wrong. I did get as far as cooking up some patches to sort that out [2][3] 
> which I never got round to posting for their own sake, so please consider 
> picking those up as part of this series.

In the previous discussion mentioned here [1] it sort of indicates that we
cannot have a viable (ARCH_HAS_ZONE_DEVICE=y but !__HAVE_ARCH_PTE_DEVMAP). I
dont understand why ! The driver can just hotplug the range into ZONE_DEVICE,
manage the memory itself without mapping them to user page table ever. IIUC
ZONE_DEVICE must not need user mapped device PFN support. All the corner case
problems discussed previously come in once these new 'device PFN' memory which
is now in ZONE_DEVICE get mapped into user page table.

> 
> Robin.
> 
>>   select ARCH_HAVE_NMI_SAFE_CMPXCHG
>>   select ARCH_INLINE_READ_LOCK if !PREEMPT
>>   select ARCH_INLINE_READ_LOCK_BH if !PREEMPT
>>
> 
> 
> [1] 
> https://lore.kernel.org/linux-mm/caa9_cmfa9gs+1m1asyv1ty5jky3iho3cerhnraruwjw3pfm...@mail.gmail.com/#t
> [2] 
> http://linux-arm.org/git?p=linux-rm.git;a=commitdiff;h=61816b833afdb56b49c2e58f5289ae18809e5d67
> [3] 
> http://linux-arm.org/git?p=linux-rm.git;a=commitdiff;h=a5a16560eb1becf9a1d4cc0d03d6b5e76da4f4e1
> (apologies to anyone if the linux-arm.org server is being flaky as usual and 
> requires a few tries to respond properly)

I have not evaluated pte_devmap(). Will consider [3] when enabling it. But
I still dont understand why ZONE_DEVICE can not be enabled and used from a
driver which never requires user mapping or pte|pmd|pud_devmap() support.


Re: 15c8410c67 ("mm/slob.c: respect list_head abstraction layer"): WARNING: CPU: 0 PID: 1 at lib/list_debug.c:28 __list_add_valid

2019-04-03 Thread Tobin C. Harding
On Wed, Apr 03, 2019 at 03:54:17PM +1100, Tobin C. Harding wrote:
> On Wed, Apr 03, 2019 at 10:00:38AM +0800, kernel test robot wrote:
> > Greetings,
> > 
> > 0day kernel testing robot got the below dmesg and the first bad commit is
> > 
> > https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
> > 
> > commit 15c8410c67adefd26ea0df1f1b86e1836051784b
> > Author: Tobin C. Harding 
> > AuthorDate: Fri Mar 29 10:01:23 2019 +1100
> > Commit: Stephen Rothwell 
> > CommitDate: Sat Mar 30 16:09:41 2019 +1100
> > 
> > mm/slob.c: respect list_head abstraction layer
> > 
> > Currently we reach inside the list_head.  This is a violation of the 
> > layer
> > of abstraction provided by the list_head.  It makes the code fragile.
> > More importantly it makes the code wicked hard to understand.
> > 
> > The code logic is based on the page in which an allocation was made, we
> > want to modify the slob_list we are working on to have this page at the
> > front.  We already have a function to check if an entry is at the front 
> > of
> > the list.  Recently a function was added to list.h to do the list
> > rotation.  We can use these two functions to reduce line count, reduce
> > code fragility, and reduce cognitive load required to read the code.
> > 
> > Use list_head functions to interact with lists thereby maintaining the
> > abstraction provided by the list_head structure.
> > 
> > Link: http://lkml.kernel.org/r/20190318000234.22049-3-to...@kernel.org
> > Signed-off-by: Tobin C. Harding 
> > Cc: Christoph Lameter 
> > Cc: David Rientjes 
> > Cc: Joonsoo Kim 
> > Cc: Pekka Enberg 
> > Cc: Roman Gushchin 
> > Signed-off-by: Andrew Morton 
> > Signed-off-by: Stephen Rothwell 
> > 
> > 2e1f88301e  include/linux/list.h: add list_rotate_to_front()
> > 15c8410c67  mm/slob.c: respect list_head abstraction layer
> > 05d08e2995  Add linux-next specific files for 20190402
> > +---+++---+
> > |   | 2e1f88301e | 
> > 15c8410c67 | next-20190402 |
> > +---+++---+
> > | boot_successes| 1009   | 198  
> >   | 299   |
> > | boot_failures | 0  | 2
> >   | 44|
> > | WARNING:at_lib/list_debug.c:#__list_add_valid | 0  | 2
> >   | 44|
> > | RIP:__list_add_valid  | 0  | 2
> >   | 44|
> > | WARNING:at_lib/list_debug.c:#__list_del_entry_valid   | 0  | 2
> >   | 25|
> > | RIP:__list_del_entry_valid| 0  | 2
> >   | 25|
> > | WARNING:possible_circular_locking_dependency_detected | 0  | 2
> >   | 44|
> > | RIP:_raw_spin_unlock_irqrestore   | 0  | 2
> >   | 2 |
> > | BUG:kernel_hang_in_test_stage | 0  | 0
> >   | 6 |
> > | BUG:unable_to_handle_kernel   | 0  | 0
> >   | 1 |
> > | Oops:#[##]| 0  | 0
> >   | 1 |
> > | RIP:slob_page_alloc   | 0  | 0
> >   | 1 |
> > | Kernel_panic-not_syncing:Fatal_exception  | 0  | 0
> >   | 1 |
> > | RIP:delay_tsc | 0  | 0
> >   | 2 |
> > +---+++---+
> > 
> > [2.618737] db_root: cannot open: /etc/target
> > [2.620114] mtdoops: mtd device (mtddev=name/number) must be supplied
> > [2.620967] slram: not enough parameters.
> > [2.621614] [ cut here ]
> > [2.622254] list_add corruption. prev->next should be next 
> > (aeeb71b0), but was cee1406d3f70. (prev=cee140422508).
> 
> Is this perhaps a false positive because we hackishly move the list_head
> 'head' and insert it back into the list.  Perhaps this is confusing the
> validation functions?

This has got me stumped.  I cannot create a test case where manipulating
a list with list_rotate_to_front() causes the list validation functions
to emit an error.  Also I cannot come up with a way on paper that it can
happen either.

I don't really know how to go forwards from here.  I'll sleep on it and
see if something comes to me, any ideas to look into please?

thanks,
Tobin.


I will be waiting for your urgent answer

2019-04-03 Thread Jucai LI



Beautiful day,
My name is Mr. Jucai Li, Chief Executive Officer of the Bank of China
I am looking for a business partner who will work with me in a joint venture.
Contact me in my private email for more details.
Email (jucailil...@gmail.com)
I am waiting to hear from you.
Many thanks,
Mr. Jucai Li




Re: [PATCH v1] ARM: dts: exynos: Add proper regulator states for suspend-to-mem for odroid-u3

2019-04-03 Thread Anand Moon
hi Krzysztof,

On Tue, 26 Mar 2019 at 16:28, Krzysztof Kozlowski  wrote:
>
> On Tue, 26 Mar 2019 at 11:35, Anand Moon  wrote:
>
> (...)
>
> > > This is third or fourth submission but you marked it as v1. This makes
> > > it very difficult to discuss and reference previous versions.
> > >
> > > The commit message did not change since beginning (first version). I
> > > asked twice that you need to explain exactly why you put the the
> > > regulator to off or on state in suspend. Why?
> > > Because:
> > > 1. This change looks without justification - once you put on, then you
> > > put off, now again on,
> > > 2. Anyone reading the code later must know the rationale why this was 
> > > done,
> > > 3. I am not quite sure whether this is good setting so I would be
> > > happy to be convinced.
> > >
> >
> > Like I mention in the patch summary that this.
> >
> > Current changes are based on
> > [0] 
> > https://www.kernel.org/doc/Documentation/devicetree/bindings/regulator/max77686.txt
> >
> >   Regulators which can be turned off during system suspend:
> > -LDOn : 2, 6-8, 10-12, 14-16,
> > -BUCKn : 1-4.
> >   Use standard regulator bindings for it ('regulator-off-in-suspend').
>
> I do not see how this is related to my questions.
>
> > > How to provide such explanation? The best in commit message. Sometimes
> > > in the comment in the code, depends.
> >
> > Ok I have been testing with following regulator debug prints to catch error.
> > [0] max77686-regulator.patch
> >
> > below is the console logs during suspend and resume.
> > ---
> > Last login: Sat Mar 23 18:22:46 on ttySAC1
> > [root@archl-u3e ~]# echo no > /sys/module/printk/parameters/console_suspend
> > [root@archl-u3e ~]# rtcwake -d /dev/rtc0 -m mem -s 10
> > rtcwake: wakeup from "mem" using /dev/rtc0 at Sat Mar 23 19:56:17 2019
> > [   38.595854] PM: suspend entry (deep)
> > [   38.596603] PM: Syncing filesystems ... done.
> > [   38.629351] Freezing user space processes ... (elapsed 0.002 seconds) 
> > done.
> > [   38.633192] OOM killer disabled.
> > [   38.636035] Freezing remaining freezable tasks ... (elapsed 0.001
> > seconds) done.
> > [   38.675059] smsc95xx 1-2:1.0 eth0: entering SUSPEND2 mode
> > [   38.753120] dwc2 1248.hsotg: suspending usb gadget g_ether
> > [   38.754007] dwc2 1248.hsotg: new device is full-speed
> > [   38.758960] dwc2 1248.hsotg: dwc2_hsotg_ep_disable: called for ep0
> > [   38.765507] dwc2 1248.hsotg: dwc2_hsotg_ep_disable: called for ep0
> > [   38.774050] wake enabled for irq 119
> > [   38.775761] BUCK9: No configuration
> > [   38.779191] BUCK8_P3V3: No configuration
> > [   38.782852] BUCK7_2.0V: No configuration
> > [   38.786851] BUCK6_1.35V: No configuration
> > [   38.790752] VDDQ_CKEM1_2_1.2V: No configuration
> > [   38.796220] BUCK4: regulator suspend disable supported
> > [   38.800769] BUCK3: regulator suspend disable supported
> > [   38.806002] BUCK1: regulator suspend disable supported
> > [   38.810644] LDO26: No configuration
> > [   38.814169] VDDQ_LCD_1.8V: No configuration
> > [   38.818267] LDO24: No configuration
> > [   38.821732] LDO23: No configuration
> > [   38.825262] LDO22_VDDQ_MMC4_2.8V: No configuration
> > [   38.829992] TFLASH_2.8V: No configuration
> > [   38.834040] LDO20_1.8V: No configuration
> > [   38.837883] LDO19: No configuration
> > [   38.841349] LDO18: No configuration
> > [   38.844878] LDO17: No configuration
> > [   38.848667] LDO16: regulator suspend disable supported
> > [   38.853889] LDO15: regulator suspend disable supported
> > [   38.858931] LDO14: regulator suspend disable supported
> > [   38.863771] VDDQ_C2C_W_1.8V: No configuration
> > [   38.868378] LDO12: regulator suspend disable supported
> > [   38.873508] LDO11: regulator suspend disable supported
> > [   38.878545] LDO10: regulator suspend disable supported
> > [   38.883384] LDO9: No configuration
> > [   38.887190] LDO8: regulator suspend disable supported
> > [   38.892168] LDO7: regulator suspend disable supported
> > [   38.897279] LDO6: regulator suspend disable supported
> > [   38.901872] VDDQ_MMC1_3_1.8V: No configuration
> > [   38.906363] VDDQ_MMC2_2.8V: No configuration
> > [   38.910541] VDDQ_EXT_1.8V: No configuration
> > [   38.915134] LDO2: regulator suspend disable supported
> > [   38.919753] VDD_ALIVE_1.0V: No configuration
> > [   38.935229] usb3503 0-0008: switched to STANDBY mode
> > [   38.935981] wake enabled for irq 123
> > [   38.955192] samsung-pinctrl 1100.pinctrl: Setting external
> > wakeup interrupt mask: 0xfbfff7ff
> > [   38.975448] Disabling non-boot CPUs ...
> > [   39.029279] s3c2410-wdt 1006.watchdog: watchdog disabled
> > [   39.029576] wake disabled for irq 123
> > [   39.044319] usb3503 0-0008: switched to HUB mode
> > [   39.144089] wake disabled for irq 119
> > [   39.144812] dwc2 1248.hsotg: resuming usb gadget g_ether
> > [   39.422626] usb 1-2: reset 

RE: [PATCH] usb: uas: fix usb subsystem hang after power off hub port

2019-04-03 Thread Kento.A.Kobayashi
Hi,

>> Root Cause
>> - A block layer timeout happens after powering off a UAS USB device that is 
>> being accessed, as in the reproduction steps. During timeout error handling, the 
>> scsi host state becomes SHOST_CANCEL_RECOVERY, which causes IO to hang and the lock 
>> cannot be released. Finally, the usb subsystem hangs.
>> Follow is function call:
>> blk_mq_timeout_work 
>>   …->scsi_times_out  (… means some functions are not listed before this 
>> function.)
>> …-> scsi_eh_scmd_add(scsi_host_set_state to SHOST_RECOVERY) 
>>   … -> scsi_error_handler
>> …-> uas_eh_device_reset_handler
>> -> usb_lock_device_for_reset  <- take lock
>>   -> usb_reset_device
>> …-> rebind = uas_post_reset (return 1 since ENODEV) 
>> …-> usb_unbind_and_rebind_marked_interfaces (rebind=1)
>>…-> uas_disconnect  (scsi_host_set_state to 
>> SHOST_CANCEL_RECOVERY)
>> … -> scsi_queue_rq
>
>How does scsi_queue_rq get called here?  As far as I can see, this shouldn't 
>happen.

We confirmed the function call path on linux 4.9 when this problem occurred 
since we are working on it. In linux 4.9, the last function is scsi_request_fn 
instead of scsi_queue_rq. In staging.git, we think scsi_queue_rq is called 
by the following path.
uas_disconnect
|- scsi_remove_host
 |- scsi_forget_host
  |- __scsi_remove_device
   |- device_del
|- bus_remove_device
 |- device_release_driver
  |- device_release_driver_internal
   |- __device_release_driver
|- drv->remove(dev) (sd_remove)  
 |- sd_shutdown
  |- sd_sync_cache
   |- scsi_execute
|- __scsi_execute
 |- blk_execute_rq
  |- blk_execute_rq_nowait
   |- blk_mq_sched_insert_request
|- blk_mq_run_hw_queue
 |- __blk_mq_delay_run_hw_queue
  |- __blk_mq_run_hw_queue
   |- blk_mq_sched_dispatch_requests
|- blk_mq_dispatch_rq_list
 |- q->mq_ops->queue_rq (scsi_queue_rq)

>> Countermeasure
>> - Make uas_post_reset not return 1 when ENODEV is returned from 
>> uas_configure_endpoints, since usb_unbind_and_rebind_marked_interfaces 
>> doesn’t need to do unbind/rebind operations in this situation.
>> blk_mq_timeout_work
>>   …->scsi_times_out  (… means some functions are not listed before this 
>> function.)
>> …-> scsi_eh_scmd_add(scsi_host_set_state to SHOST_RECOVERY) 
>>   … -> scsi_error_handler
>>…-> uas_eh_device_reset_handler (*1)
>>-> usb_lock_device_for_reset  <- take lock
>>  -> usb_reset_device
>>-> usb_reset_and_verify_device (return ENODEV and FAILED will 
>> be reported to *1)
>>-> uas_post_reset returns 0 when ENODEV => rebind=0 
>>-> usb_unbind_and_rebind_marked_interfaces (rebind=0)
>
>The difference is that uas_disconnect wasn't called here.  But that routine 
>should not cause any problems -- you're always supposed to be able to unbind a 
>driver from a device.  So it looks like this is not the right way to solve the 
>problem.

We confirmed that usb_driver_release_interface will call usb_unbind_interface when 
this problem occurs.
So usb_unbind_interface will call the driver disconnect callback.

Regards,
Kento Kobayashi


[PATCH v1 12/12] ASoC: rockchip: pdm: Correct PDM_CTRL0 reg value

2019-04-03 Thread Sugar Zhang
This patch fixes the wrong reg value for rk322x/rk322xh,
because there is no STORE JUSTIFIED MODE on it.

on rk322x/rk322xh, the same bit means PDM_MODE/RESERVED,
if the bit is set to RESERVED, the controller will not work.

Signed-off-by: Sugar Zhang 
---

 sound/soc/rockchip/rockchip_pdm.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sound/soc/rockchip/rockchip_pdm.c 
b/sound/soc/rockchip/rockchip_pdm.c
index e89beeb..6c0f242 100644
--- a/sound/soc/rockchip/rockchip_pdm.c
+++ b/sound/soc/rockchip/rockchip_pdm.c
@@ -210,7 +210,9 @@ static int rockchip_pdm_hw_params(struct snd_pcm_substream 
*substream,
regmap_update_bits(pdm->regmap, PDM_HPF_CTRL,
   PDM_HPF_LE | PDM_HPF_RE, PDM_HPF_LE | PDM_HPF_RE);
regmap_update_bits(pdm->regmap, PDM_CLK_CTRL, PDM_CLK_EN, PDM_CLK_EN);
-   regmap_update_bits(pdm->regmap, PDM_CTRL0, PDM_MODE_MSK, PDM_MODE_LJ);
+   if (pdm->version != RK_PDM_RK3229)
+   regmap_update_bits(pdm->regmap, PDM_CTRL0,
+  PDM_MODE_MSK, PDM_MODE_LJ);
 
val = 0;
switch (params_format(params)) {
@@ -468,7 +470,8 @@ static const struct regmap_config 
rockchip_pdm_regmap_config = {
 };
 
 static const struct of_device_id rockchip_pdm_match[] = {
-   { .compatible = "rockchip,pdm", },
+   { .compatible = "rockchip,pdm",
+ .data = (void *)RK_PDM_RK3229 },
{ .compatible = "rockchip,px30-pdm",
  .data = (void *)RK_PDM_RK3308 },
{ .compatible = "rockchip,rk1808-pdm",
-- 
2.7.4





[PATCH v1 11/12] ASoC: rockchip: pdm: Mark RXFIFO_DATA as volatile and precious

2019-04-03 Thread Sugar Zhang
This patch marks RXFIFO_DATA as volatile and precious to avoid it being read
outside a call from the driver, such as via regmap debugfs.

Signed-off-by: Sugar Zhang 
---

 sound/soc/rockchip/rockchip_pdm.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/sound/soc/rockchip/rockchip_pdm.c 
b/sound/soc/rockchip/rockchip_pdm.c
index 955cdc2..e89beeb 100644
--- a/sound/soc/rockchip/rockchip_pdm.c
+++ b/sound/soc/rockchip/rockchip_pdm.c
@@ -415,6 +415,7 @@ static bool rockchip_pdm_rd_reg(struct device *dev, 
unsigned int reg)
case PDM_INT_CLR:
case PDM_INT_ST:
case PDM_DATA_VALID:
+   case PDM_RXFIFO_DATA:
case PDM_VERSION:
return true;
default:
@@ -429,6 +430,17 @@ static bool rockchip_pdm_volatile_reg(struct device *dev, 
unsigned int reg)
case PDM_FIFO_CTRL:
case PDM_INT_CLR:
case PDM_INT_ST:
+   case PDM_RXFIFO_DATA:
+   return true;
+   default:
+   return false;
+   }
+}
+
+static bool rockchip_pdm_precious_reg(struct device *dev, unsigned int reg)
+{
+   switch (reg) {
+   case PDM_RXFIFO_DATA:
return true;
default:
return false;
@@ -451,6 +463,7 @@ static const struct regmap_config 
rockchip_pdm_regmap_config = {
.writeable_reg = rockchip_pdm_wr_reg,
.readable_reg = rockchip_pdm_rd_reg,
.volatile_reg = rockchip_pdm_volatile_reg,
+   .precious_reg = rockchip_pdm_precious_reg,
.cache_type = REGCACHE_FLAT,
 };
 
-- 
2.7.4





[PATCH v1 10/12] ASoC: rockchip: pdm: adjust waterlevel in frame unit

2019-04-03 Thread Sugar Zhang
This patch makes the waterlevel more reasonable, because the pdm
controller shares a single FIFO (128 entries) among all channels.
Adjust the waterlevel in frame units to meet the vad or dma frame requests.

Signed-off-by: Sugar Zhang 
---

 sound/soc/rockchip/rockchip_pdm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sound/soc/rockchip/rockchip_pdm.c 
b/sound/soc/rockchip/rockchip_pdm.c
index 3e1c5fd..955cdc2 100644
--- a/sound/soc/rockchip/rockchip_pdm.c
+++ b/sound/soc/rockchip/rockchip_pdm.c
@@ -255,8 +255,9 @@ static int rockchip_pdm_hw_params(struct snd_pcm_substream 
*substream,
regmap_update_bits(pdm->regmap, PDM_CTRL0,
   PDM_PATH_MSK | PDM_VDW_MSK,
   val);
+   /* all channels share the single FIFO */
regmap_update_bits(pdm->regmap, PDM_DMA_CTRL, PDM_DMA_RDL_MSK,
-  PDM_DMA_RDL(16));
+  PDM_DMA_RDL(8 * params_channels(params)));
 
return 0;
 }
-- 
2.7.4





[PATCH v1 09/12] dt-bindings: sound: add compatible for rk1808

2019-04-03 Thread Sugar Zhang
This patch adds bindings for rk1808 soc.

Signed-off-by: Sugar Zhang 
---

 Documentation/devicetree/bindings/sound/rockchip,pdm.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/sound/rockchip,pdm.txt 
b/Documentation/devicetree/bindings/sound/rockchip,pdm.txt
index 3e0eb73..98572a2 100644
--- a/Documentation/devicetree/bindings/sound/rockchip,pdm.txt
+++ b/Documentation/devicetree/bindings/sound/rockchip,pdm.txt
@@ -4,6 +4,7 @@ Required properties:
 
 - compatible: "rockchip,pdm"
   - "rockchip,px30-pdm"
+  - "rockchip,rk1808-pdm"
   - "rockchip,rk3308-pdm"
 - reg: physical base address of the controller and length of memory mapped
   region.
-- 
2.7.4





[PATCH v1 08/12] ASoC: rockchip: pdm: add compatible for rk1808

2019-04-03 Thread Sugar Zhang
This patch adds support for rk1808, the pdm controller
is the same as rk3308.

Signed-off-by: Sugar Zhang 
---

 sound/soc/rockchip/rockchip_pdm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sound/soc/rockchip/rockchip_pdm.c 
b/sound/soc/rockchip/rockchip_pdm.c
index 4f93a74..3e1c5fd 100644
--- a/sound/soc/rockchip/rockchip_pdm.c
+++ b/sound/soc/rockchip/rockchip_pdm.c
@@ -457,6 +457,8 @@ static const struct of_device_id rockchip_pdm_match[] = {
{ .compatible = "rockchip,pdm", },
{ .compatible = "rockchip,px30-pdm",
  .data = (void *)RK_PDM_RK3308 },
+   { .compatible = "rockchip,rk1808-pdm",
+ .data = (void *)RK_PDM_RK3308 },
{ .compatible = "rockchip,rk3308-pdm",
  .data = (void *)RK_PDM_RK3308 },
{},
-- 
2.7.4





[PATCH v1 07/12] dt-bindings: sound: rockchip: add compatible for rk3308/px30

2019-04-03 Thread Sugar Zhang
This patch adds bindings for rk3308/px30.

Signed-off-by: Sugar Zhang 
---

 Documentation/devicetree/bindings/sound/rockchip,pdm.txt | 4 
 1 file changed, 4 insertions(+)

diff --git a/Documentation/devicetree/bindings/sound/rockchip,pdm.txt 
b/Documentation/devicetree/bindings/sound/rockchip,pdm.txt
index 47f164f..3e0eb73 100644
--- a/Documentation/devicetree/bindings/sound/rockchip,pdm.txt
+++ b/Documentation/devicetree/bindings/sound/rockchip,pdm.txt
@@ -3,6 +3,8 @@
 Required properties:
 
 - compatible: "rockchip,pdm"
+  - "rockchip,px30-pdm"
+  - "rockchip,rk3308-pdm"
 - reg: physical base address of the controller and length of memory mapped
   region.
 - dmas: DMA specifiers for rx dma. See the DMA client binding,
@@ -12,6 +14,8 @@ Required properties:
 - clock-names: should contain following:
- "pdm_hclk": clock for PDM BUS
- "pdm_clk" : clock for PDM controller
+- resets: a list of phandle + reset-specifier pairs, one for each entry in 
reset-names.
+- reset-names: reset names, should include "pdm-m".
 - pinctrl-names: Must contain a "default" entry.
 - pinctrl-N: One property must exist for each entry in
 pinctrl-names. See ../pinctrl/pinctrl-bindings.txt
-- 
2.7.4





[PATCH v1 06/12] ASoC: rockchip: pdm: fixup pdm fractional div

2019-04-03 Thread Sugar Zhang
This patch adds support for fractional div for rk3308.

Signed-off-by: Sugar Zhang 
---

 sound/soc/rockchip/rockchip_pdm.c | 172 --
 sound/soc/rockchip/rockchip_pdm.h |   9 ++
 2 files changed, 139 insertions(+), 42 deletions(-)

diff --git a/sound/soc/rockchip/rockchip_pdm.c 
b/sound/soc/rockchip/rockchip_pdm.c
index c50494b..4f93a74 100644
--- a/sound/soc/rockchip/rockchip_pdm.c
+++ b/sound/soc/rockchip/rockchip_pdm.c
@@ -17,14 +17,23 @@
 #include 
 #include 
 #include 
+#include 
 #include 
+#include 
 #include 
+#include 
 #include 
 #include 
 
 #include "rockchip_pdm.h"
 
 #define PDM_DMA_BURST_SIZE (8) /* size * width: 8*4 = 32 bytes */
+#define PDM_SIGNOFF_CLK_RATE   (1)
+
+enum rk_pdm_version {
+   RK_PDM_RK3229,
+   RK_PDM_RK3308,
+};
 
 struct rk_pdm_dev {
struct device *dev;
@@ -32,22 +41,51 @@ struct rk_pdm_dev {
struct clk *hclk;
struct regmap *regmap;
struct snd_dmaengine_dai_dma_data capture_dma_data;
+   struct reset_control *reset;
+   enum rk_pdm_version version;
 };
 
 struct rk_pdm_clkref {
unsigned int sr;
unsigned int clk;
+   unsigned int clk_out;
+};
+
+struct rk_pdm_ds_ratio {
+   unsigned int ratio;
+   unsigned int sr;
 };
 
 static struct rk_pdm_clkref clkref[] = {
-   { 8000, 4096 },
-   { 11025, 56448000 },
-   { 12000, 6144 },
+   { 8000, 4096, 2048000 },
+   { 11025, 56448000, 2822400 },
+   { 12000, 6144, 3072000 },
+   { 8000, 98304000, 2048000 },
+   { 12000, 98304000, 3072000 },
+};
+
+static struct rk_pdm_ds_ratio ds_ratio[] = {
+   { 0, 192000 },
+   { 0, 176400 },
+   { 0, 128000 },
+   { 1, 96000 },
+   { 1, 88200 },
+   { 1, 64000 },
+   { 2, 48000 },
+   { 2, 44100 },
+   { 2, 32000 },
+   { 3, 24000 },
+   { 3, 22050 },
+   { 3, 16000 },
+   { 4, 12000 },
+   { 4, 11025 },
+   { 4, 8000 },
 };
 
-static unsigned int get_pdm_clk(unsigned int sr)
+static unsigned int get_pdm_clk(struct rk_pdm_dev *pdm, unsigned int sr,
+   unsigned int *clk_src, unsigned int *clk_out)
 {
-   unsigned int i, count, clk, div;
+   unsigned int i, count, clk, div, rate;
 
clk = 0;
if (!sr)
@@ -59,14 +97,39 @@ static unsigned int get_pdm_clk(unsigned int sr)
continue;
div = sr / clkref[i].sr;
if ((div & (div - 1)) == 0) {
+   *clk_out = clkref[i].clk_out;
+   rate = clk_round_rate(pdm->clk, clkref[i].clk);
+   if (rate != clkref[i].clk)
+   continue;
clk = clkref[i].clk;
+   *clk_src = clkref[i].clk;
break;
}
}
 
+   if (!clk) {
+   clk = clk_round_rate(pdm->clk, PDM_SIGNOFF_CLK_RATE);
+   *clk_src = clk;
+   }
return clk;
 }
 
+static unsigned int get_pdm_ds_ratio(unsigned int sr)
+{
+   unsigned int i, count, ratio;
+
+   ratio = 0;
+   if (!sr)
+   return ratio;
+
+   count = ARRAY_SIZE(ds_ratio);
+   for (i = 0; i < count; i++) {
+   if (sr == ds_ratio[i].sr)
+   ratio = ds_ratio[i].ratio;
+   }
+   return ratio;
+}
+
 static inline struct rk_pdm_dev *to_info(struct snd_soc_dai *dai)
 {
return snd_soc_dai_get_drvdata(dai);
@@ -95,40 +158,52 @@ static int rockchip_pdm_hw_params(struct snd_pcm_substream 
*substream,
struct rk_pdm_dev *pdm = to_info(dai);
unsigned int val = 0;
unsigned int clk_rate, clk_div, samplerate;
+   unsigned int clk_src, clk_out;
+   unsigned long m, n;
+   bool change;
int ret;
 
+   if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK)
+   return 0;
+
samplerate = params_rate(params);
-   clk_rate = get_pdm_clk(samplerate);
+   clk_rate = get_pdm_clk(pdm, samplerate, _src, _out);
if (!clk_rate)
return -EINVAL;
 
-   ret = clk_set_rate(pdm->clk, clk_rate);
+   ret = clk_set_rate(pdm->clk, clk_src);
if (ret)
return -EINVAL;
 
-   clk_div = DIV_ROUND_CLOSEST(clk_rate, samplerate);
-
-   switch (clk_div) {
-   case 320:
-   val = PDM_CLK_320FS;
-   break;
-   case 640:
-   val = PDM_CLK_640FS;
-   break;
-   case 1280:
-   val = PDM_CLK_1280FS;
-   break;
-   case 2560:
-   val = PDM_CLK_2560FS;
-   break;
-   case 5120:
-   val = PDM_CLK_5120FS;
-   break;
-   default:
-   dev_err(pdm->dev, "unsupported div: %d\n", clk_div);
-   return -EINVAL;
+   if (pdm->version == RK_PDM_RK3308) {
+   

CAN I TRUST YOU I NEED YOUR ASSISTANCE AND TRUST

2019-04-03 Thread info



INVESTMENT  FOR  PARTNERSHIP.  Can I Trust You  



























My Dear Friend  Can I Trust You.docx
Description: MS-Word 2007 document


My Dear Friend  Can I Trust You.docx
Description: MS-Word 2007 document


[PATCH v1 05/12] ASoC: rockchip: pdm: change dma burst to 8

2019-04-03 Thread Sugar Zhang
This patch decreases the transfer bursts to avoid the fifo overrun.

Signed-off-by: Sugar Zhang 
---

 sound/soc/rockchip/rockchip_pdm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/soc/rockchip/rockchip_pdm.c 
b/sound/soc/rockchip/rockchip_pdm.c
index abbb6d7..c50494b 100644
--- a/sound/soc/rockchip/rockchip_pdm.c
+++ b/sound/soc/rockchip/rockchip_pdm.c
@@ -24,7 +24,7 @@
 
 #include "rockchip_pdm.h"
 
-#define PDM_DMA_BURST_SIZE (16) /* size * width: 16*4 = 64 bytes */
+#define PDM_DMA_BURST_SIZE (8) /* size * width: 8*4 = 32 bytes */
 
 struct rk_pdm_dev {
struct device *dev;
-- 
2.7.4





Re: [RESEND PATCH v6 07/11] power: supply: max77650: add support for battery charger

2019-04-03 Thread Linus Walleij
On Tue, Mar 19, 2019 at 12:42 AM Bartosz Golaszewski  wrote:

> From: Bartosz Golaszewski 
>
> Add basic support for the battery charger for max77650 PMIC.
>
> Signed-off-by: Bartosz Golaszewski 

This looks like a clean and good driver to me.
Reviewed-by: Linus Walleij 

Yours,
Linus Walleij


Re: [PATCH 1/2] dt-bindings: pinctrl: meson: Add drive-strength property

2019-04-03 Thread Linus Walleij
On Sun, Mar 31, 2019 at 2:04 PM Jerome Brunet  wrote:
> On Sun, 2019-03-31 at 01:40 -0500, Rob Herring wrote:
> > On Thu, Mar 14, 2019 at 05:37:24PM +0100, Jerome Brunet wrote:
> > > From: Guillaume La Roque 

> > > +Optional properties :
> > > + - drive-strength: Drive strength for the specified pins in uA.
> >
> > The standard definition says this is in mA.
>
> Yes, the problem we have and the solutions we are thinking about are explained
> in detail in the cover letter of this patchset. Could you share your opinion
> on it ?

I like your idea with drive-strength-uA but I need the second opinion from the
device tree people.

Yours,
Linus Walleij


Re: [PATCH] lib: Fix possible incorrect result from rational fractions helper

2019-04-03 Thread tpiepho
On Mon, Apr 1, 2019 at 10:22 PM Andrew Morton <
a...@linux-foundation.org> wrote:

> On Sat, 30 Mar 2019 13:58:55 -0700 Trent Piepho 
> wrote:
> > In some cases the previous algorithm would not return the closest
> > approximation.  This would happen when a semi-convergent was the
> > closest, as the previous algorithm would only consider convergents.
> > 
> > As an example, consider an initial value of 5/4, and trying to find
> > the
> > closest approximation with a maximum of 4 for numerator and
> > denominator.
> > The previous algorithm would return 1/1 as the closest
> > approximation,
> > while this version will return the correct answer of 4/3.
> 
> What are the userspace-visible runtime effects of this change?

Ok, I looked into this in some detail.

This routine is used in two places in the video4linux code, but in
those cases it is only used to reduce a fraction to lowest terms, which
the existing code will do correctly.  This could be done more
efficiently with a different library routine but it would still be the
Euclidean algorithm at its heart.  So no change.

The remain users are places where a fractional PLL divider is
programmed.  What would happen is something asked for a clock of X MHz
but instead gets Y MHz, where Y is close to X but not exactly due to
the hardware limitations.  After this change they might, in some cases,
get Y' MHz, where Y' is a little closer to X then Y was.

Users like this are: Three UARTs, in 8250_mid, 8250_lpss, and
imx.  One GPU in vp4_hdmi.  And three clock drivers, clk-cdce706, clk-si5351, 
and clk-fractional-divider.  The last is a generic clock driver and so would 
have more users referenced via device tree entries.

I think there's a bug in that one, it's limiting an N bit field that is
offset-by-1 to the range 0 .. (1<

Re: [PATCH 0/2] pinctrl: meson: add g12a drive strength support

2019-04-03 Thread Linus Walleij
On Thu, Mar 14, 2019 at 11:37 PM Jerome Brunet  wrote:

> Now the slightly annoying part :(
> The value achievable by the SoC are 0.5mA, 2.5mA, 3mA and 4mA and the DT 
> property
> 'drive-strength' is expressed in mA.
>
> 1) Rounding down the value, we could be requesting a 0mA drive strength.
>That would look weird.
> 2) Rounding up, we can't distinguish between 2.5mA and 3mA
>
> To solve this issue in this in this v1, we chose to document that, on Amlogic,
> drive-strength is expressed in uA instead of mA.
> It works well and there is no impact on the other platforms but I'm not sure 
> this
> is really OK with the DT rules ?

I want the DT people to say what they think about this.

> Linus, if this is not OK with you, here are 2 other options we are
> considering. We would be very interested to get your opinion on the matter:
>
> 1) instead the generic 'drive-strength' property, we could add an amlogic
> specific property, 'amlogic,drive-strength'. It would be expressed in uA
> and parsed in amlogic specific code.
> I think this option is kind of overkill. Expressing drive strength in uA is
> not really amlogic specific so it does not make much sense, but it would
> work ...
>
> 2) Add another generic property "drive-strength-uA". The change to do so
> would be minimal and could be benefit to other platforms later on.

I would go for 2).

But we really need input from bindings people on this.

Yours,
Linus Walleij


[PATCH v3 14/20] coresight: tmc-etr: Introduce the notion of process ID to ETR devices

2019-04-03 Thread Mathieu Poirier
In preparation to support CPU-wide trace scenarios, introduce the notion
of process ID to ETR devices.  That way events monitoring the same process
can use the same etr_buf, allowing multiple CPUs to use the same sink.

Signed-off-by: Mathieu Poirier 
---
 drivers/hwtracing/coresight/coresight-tmc-etr.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c 
b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index ac1efdfc0d07..e1774d4bb5f3 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "coresight-catu.h"
 #include "coresight-etm-perf.h"
@@ -25,6 +26,7 @@ struct etr_flat_buf {
 /*
  * etr_perf_buffer - Perf buffer used for ETR
  * @etr_buf- Actual buffer used by the ETR
+ * @pid- The PID this etr_perf_buffer belongs to.
  * @snaphost   - Perf session mode
  * @head   - handle->head at the beginning of the session.
  * @nr_pages   - Number of pages in the ring buffer.
@@ -32,6 +34,7 @@ struct etr_flat_buf {
  */
 struct etr_perf_buffer {
struct etr_buf  *etr_buf;
+   pid_t   pid;
boolsnapshot;
unsigned long   head;
int nr_pages;
@@ -1276,6 +1279,7 @@ static void *tmc_alloc_etr_buffer(struct coresight_device 
*csdev,
return NULL;
}
 
+   etr_perf->pid = task_pid_nr(event->owner);
etr_perf->snapshot = snapshot;
etr_perf->nr_pages = nr_pages;
etr_perf->pages = pages;
-- 
2.17.1



[PATCH v3 20/20] coresight: etb10: Add support for CPU-wide trace scenarios

2019-04-03 Thread Mathieu Poirier
This patch adds support for CPU-wide trace scenarios by making sure that
only the sources monitoring the same process have access to a common sink.
Because the sink is shared between sources, the first source to use the
sink switches it on while the last one does the cleanup.  Any attempt to
modify the HW is overlooked for as long as more than one source is using
a sink.

Signed-off-by: Mathieu Poirier 
---
 drivers/hwtracing/coresight/coresight-etb10.c | 43 +--
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-etb10.c 
b/drivers/hwtracing/coresight/coresight-etb10.c
index 7d64c41cd8ac..a2379c00d635 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -72,6 +72,8 @@
  * @miscdev:   specifics to handle "/dev/xyz.etb" entry.
  * @spinlock:  only one at a time pls.
  * @reading:   synchronise user space access to etb buffer.
+ * @pid:   Process ID of the process being monitored by the session
+ * that is using this component.
  * @buf:   area of memory where ETB buffer content gets sent.
  * @mode:  this ETB is being used.
  * @buffer_depth: size of @buf.
@@ -85,6 +87,7 @@ struct etb_drvdata {
struct miscdevice   miscdev;
spinlock_t  spinlock;
local_t reading;
+   pid_t   pid;
u8  *buf;
u32 mode;
u32 buffer_depth;
@@ -177,28 +180,49 @@ static int etb_enable_sysfs(struct coresight_device 
*csdev)
 static int etb_enable_perf(struct coresight_device *csdev, void *data)
 {
int ret = 0;
+   pid_t pid;
unsigned long flags;
struct etb_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+   struct perf_output_handle *handle = data;
 
spin_lock_irqsave(>spinlock, flags);
 
-   /* No need to continue if the component is already in use. */
-   if (drvdata->mode != CS_MODE_DISABLED) {
+   /* No need to continue if the component is already in used by sysFS. */
+   if (drvdata->mode == CS_MODE_SYSFS) {
+   ret = -EBUSY;
+   goto out;
+   }
+
+   /* Get a handle on the pid of the process to monitor */
+   pid = task_pid_nr(handle->event->owner);
+
+   if (drvdata->pid != -1 && drvdata->pid != pid) {
ret = -EBUSY;
goto out;
}
 
+   /*
+* No HW configuration is needed if the sink is already in
+* use for this session.
+*/
+   if (drvdata->pid == pid) {
+   atomic_inc(csdev->refcnt);
+   goto out;
+   }
+
/*
 * We don't have an internal state to clean up if we fail to setup
 * the perf buffer. So we can perform the step before we turn the
 * ETB on and leave without cleaning up.
 */
-   ret = etb_set_buffer(csdev, (struct perf_output_handle *)data);
+   ret = etb_set_buffer(csdev, handle);
if (ret)
goto out;
 
ret = etb_enable_hw(drvdata);
if (!ret) {
+   /* Associate with monitored process. */
+   drvdata->pid = pid;
drvdata->mode = CS_MODE_PERF;
atomic_inc(csdev->refcnt);
}
@@ -344,6 +368,8 @@ static int etb_disable(struct coresight_device *csdev)
/* Complain if we (somehow) got out of sync */
WARN_ON_ONCE(drvdata->mode == CS_MODE_DISABLED);
etb_disable_hw(drvdata);
+   /* Dissociate from monitored process. */
+   drvdata->pid = -1;
drvdata->mode = CS_MODE_DISABLED;
spin_unlock_irqrestore(>spinlock, flags);
 
@@ -414,7 +440,7 @@ static unsigned long etb_update_buffer(struct 
coresight_device *csdev,
const u32 *barrier;
u32 read_ptr, write_ptr, capacity;
u32 status, read_data;
-   unsigned long offset, to_read, flags;
+   unsigned long offset, to_read = 0, flags;
struct cs_buffers *buf = sink_config;
struct etb_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
 
@@ -424,6 +450,11 @@ static unsigned long etb_update_buffer(struct 
coresight_device *csdev,
capacity = drvdata->buffer_depth * ETB_FRAME_SIZE_WORDS;
 
spin_lock_irqsave(>spinlock, flags);
+
+   /* Don't do anything if another tracer is using this sink */
+   if (atomic_read(csdev->refcnt) != 1)
+   goto out;
+
__etb_disable_hw(drvdata);
CS_UNLOCK(drvdata->base);
 
@@ -534,6 +565,7 @@ static unsigned long etb_update_buffer(struct 
coresight_device *csdev,
}
__etb_enable_hw(drvdata);
CS_LOCK(drvdata->base);
+out:
spin_unlock_irqrestore(>spinlock, flags);
 
return to_read;
@@ -742,6 +774,9 @@ static int etb_probe(struct amba_device *adev, const struct 
amba_id *id)
if (!drvdata->buf)
return -ENOMEM;
 
+   

[PATCH v3 08/20] coresight: Properly address concurrency in sink::update() functions

2019-04-03 Thread Mathieu Poirier
When operating in CPU-wide trace scenarios and working with an N:1
source/sink HW topology, update() functions need to be made atomic
in order to avoid racing with start and stop operations.

Signed-off-by: Mathieu Poirier 
Reviewed-by: Suzuki K Poulose 
---
 drivers/hwtracing/coresight/coresight-etb10.c   | 4 +++-
 drivers/hwtracing/coresight/coresight-tmc-etf.c | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-etb10.c 
b/drivers/hwtracing/coresight/coresight-etb10.c
index 52b7d95ab498..6b50e781dc57 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -413,7 +413,7 @@ static unsigned long etb_update_buffer(struct 
coresight_device *csdev,
const u32 *barrier;
u32 read_ptr, write_ptr, capacity;
u32 status, read_data;
-   unsigned long offset, to_read;
+   unsigned long offset, to_read, flags;
struct cs_buffers *buf = sink_config;
struct etb_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
 
@@ -422,6 +422,7 @@ static unsigned long etb_update_buffer(struct 
coresight_device *csdev,
 
capacity = drvdata->buffer_depth * ETB_FRAME_SIZE_WORDS;
 
+   spin_lock_irqsave(>spinlock, flags);
__etb_disable_hw(drvdata);
CS_UNLOCK(drvdata->base);
 
@@ -532,6 +533,7 @@ static unsigned long etb_update_buffer(struct 
coresight_device *csdev,
}
__etb_enable_hw(drvdata);
CS_LOCK(drvdata->base);
+   spin_unlock_irqrestore(>spinlock, flags);
 
return to_read;
 }
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c 
b/drivers/hwtracing/coresight/coresight-tmc-etf.c
index 30f868676540..a38ad2b0d95a 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etf.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c
@@ -413,7 +413,7 @@ static unsigned long tmc_update_etf_buffer(struct 
coresight_device *csdev,
u32 *buf_ptr;
u64 read_ptr, write_ptr;
u32 status;
-   unsigned long offset, to_read;
+   unsigned long offset, to_read, flags;
struct cs_buffers *buf = sink_config;
struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
 
@@ -424,6 +424,7 @@ static unsigned long tmc_update_etf_buffer(struct 
coresight_device *csdev,
if (WARN_ON_ONCE(drvdata->mode != CS_MODE_PERF))
return 0;
 
+   spin_lock_irqsave(>spinlock, flags);
CS_UNLOCK(drvdata->base);
 
tmc_flush_and_stop(drvdata);
@@ -517,6 +518,7 @@ static unsigned long tmc_update_etf_buffer(struct 
coresight_device *csdev,
to_read = buf->nr_pages << PAGE_SHIFT;
}
CS_LOCK(drvdata->base);
+   spin_unlock_irqrestore(>spinlock, flags);
 
return to_read;
 }
-- 
2.17.1



[PATCH v3 18/20] coresight: tmc-etr: Add support for CPU-wide trace scenarios

2019-04-03 Thread Mathieu Poirier
This patch adds support for CPU-wide trace scenarios by making sure that
only the sources monitoring the same process have access to a common sink.
Because the sink is shared between sources, the first source to use the
sink switches it on while the last one does the cleanup.  Any attempt to
modify the HW is overlooked for as long as more than one source is using
a sink.

Signed-off-by: Mathieu Poirier 
---
 .../hwtracing/coresight/coresight-tmc-etr.c   | 38 ---
 drivers/hwtracing/coresight/coresight-tmc.c   |  2 +
 drivers/hwtracing/coresight/coresight-tmc.h   |  3 ++
 3 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c 
b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 61110ef41d00..a91c1bc17e2d 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -1473,6 +1473,13 @@ tmc_update_etr_buffer(struct coresight_device *csdev,
struct etr_buf *etr_buf = etr_perf->etr_buf;
 
spin_lock_irqsave(>spinlock, flags);
+
+   /* Don't do anything if another tracer is using this sink */
+   if (atomic_read(csdev->refcnt) != 1) {
+   spin_unlock_irqrestore(>spinlock, flags);
+   goto out;
+   }
+
if (WARN_ON(drvdata->perf_data != etr_perf)) {
lost = true;
spin_unlock_irqrestore(>spinlock, flags);
@@ -1512,17 +1519,15 @@ tmc_update_etr_buffer(struct coresight_device *csdev,
 static int tmc_enable_etr_sink_perf(struct coresight_device *csdev, void *data)
 {
int rc = 0;
+   pid_t pid;
unsigned long flags;
struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
struct perf_output_handle *handle = data;
struct etr_perf_buffer *etr_perf = etm_perf_sink_config(handle);
 
spin_lock_irqsave(&drvdata->spinlock, flags);
-   /*
-* There can be only one writer per sink in perf mode. If the sink
-* is already open in SYSFS mode, we can't use it.
-*/
-   if (drvdata->mode != CS_MODE_DISABLED || WARN_ON(drvdata->perf_data)) {
+   /* Don't use this sink if it is already claimed by sysFS */
+   if (drvdata->mode == CS_MODE_SYSFS) {
rc = -EBUSY;
goto unlock_out;
}
@@ -1532,10 +1537,31 @@ static int tmc_enable_etr_sink_perf(struct 
coresight_device *csdev, void *data)
goto unlock_out;
}
 
+   /* Get a handle on the pid of the process to monitor */
+   pid = etr_perf->pid;
+
+   /* Do not proceed if this device is associated with another session */
+   if (drvdata->pid != -1 && drvdata->pid != pid) {
+   rc = -EBUSY;
+   goto unlock_out;
+   }
+
etr_perf->head = PERF_IDX2OFF(handle->head, etr_perf);
drvdata->perf_data = etr_perf;
+
+   /*
+* No HW configuration is needed if the sink is already in
+* use for this session.
+*/
+   if (drvdata->pid == pid) {
+   atomic_inc(csdev->refcnt);
+   goto unlock_out;
+   }
+
rc = tmc_etr_enable_hw(drvdata, etr_perf->etr_buf);
if (!rc) {
+   /* Associate with monitored process. */
+   drvdata->pid = pid;
drvdata->mode = CS_MODE_PERF;
atomic_inc(csdev->refcnt);
}
@@ -1579,6 +1605,8 @@ static int tmc_disable_etr_sink(struct coresight_device 
*csdev)
/* Complain if we (somehow) got out of sync */
WARN_ON_ONCE(drvdata->mode == CS_MODE_DISABLED);
tmc_etr_disable_hw(drvdata);
+   /* Dissociate from monitored process. */
+   drvdata->pid = -1;
drvdata->mode = CS_MODE_DISABLED;
 
spin_unlock_irqrestore(&drvdata->spinlock, flags);
diff --git a/drivers/hwtracing/coresight/coresight-tmc.c 
b/drivers/hwtracing/coresight/coresight-tmc.c
index 71c86cffc021..fd8267fd8e6b 100644
--- a/drivers/hwtracing/coresight/coresight-tmc.c
+++ b/drivers/hwtracing/coresight/coresight-tmc.c
@@ -417,6 +417,8 @@ static int tmc_probe(struct amba_device *adev, const struct 
amba_id *id)
devid = readl_relaxed(drvdata->base + CORESIGHT_DEVID);
drvdata->config_type = BMVAL(devid, 6, 7);
drvdata->memwidth = tmc_get_memwidth(devid);
+   /* This device is not associated with a session */
+   drvdata->pid = -1;
 
if (drvdata->config_type == TMC_CONFIG_TYPE_ETR) {
if (np)
diff --git a/drivers/hwtracing/coresight/coresight-tmc.h 
b/drivers/hwtracing/coresight/coresight-tmc.h
index c1b1700b2df7..503f1b3a3741 100644
--- a/drivers/hwtracing/coresight/coresight-tmc.h
+++ b/drivers/hwtracing/coresight/coresight-tmc.h
@@ -165,6 +165,8 @@ struct etr_buf {
  * @csdev: component vitals needed by the framework.
  * @miscdev:   specifics to handle "/dev/xyz.tmc" entry.
  * @spinlock:  only one at a time pls.
+ * @pid:   Process ID of the process being monitored by the 

[PATCH v3 19/20] coresight: tmc-etf: Add support for CPU-wide trace scenarios

2019-04-03 Thread Mathieu Poirier
This patch adds support for CPU-wide trace scenarios by making sure that
only the sources monitoring the same process have access to a common sink.
Because the sink is shared between sources, the first source to use the
sink switches it on while the last one does the cleanup.  Any attempt to
modify the HW is overlooked for as long as more than one source is using
a sink.

Signed-off-by: Mathieu Poirier 
---
 .../hwtracing/coresight/coresight-tmc-etf.c   | 40 ---
 1 file changed, 35 insertions(+), 5 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c 
b/drivers/hwtracing/coresight/coresight-tmc-etf.c
index 1df1f8fade71..2527b5d3b65e 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etf.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c
@@ -223,6 +223,7 @@ static int tmc_enable_etf_sink_sysfs(struct 
coresight_device *csdev)
 static int tmc_enable_etf_sink_perf(struct coresight_device *csdev, void *data)
 {
int ret = 0;
+   pid_t pid;
unsigned long flags;
struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
struct perf_output_handle *handle = data;
@@ -233,18 +234,39 @@ static int tmc_enable_etf_sink_perf(struct 
coresight_device *csdev, void *data)
if (drvdata->reading)
break;
/*
-* In Perf mode there can be only one writer per sink.  There
-* is also no need to continue if the ETB/ETF is already
-* operated from sysFS.
+* No need to continue if the ETB/ETF is already operated
+* from sysFS.
 */
-   if (drvdata->mode != CS_MODE_DISABLED)
+   if (drvdata->mode == CS_MODE_SYSFS) {
+   ret = -EBUSY;
break;
+   }
+
+   /* Get a handle on the pid of the process to monitor */
+   pid = task_pid_nr(handle->event->owner);
+
+   if (drvdata->pid != -1 && drvdata->pid != pid) {
+   ret = -EBUSY;
+   break;
+   }
 
ret = tmc_set_etf_buffer(csdev, handle);
if (ret)
break;
+
+   /*
+* No HW configuration is needed if the sink is already in
+* use for this session.
+*/
+   if (drvdata->pid == pid) {
+   atomic_inc(csdev->refcnt);
+   break;
+   }
+
ret  = tmc_etb_enable_hw(drvdata);
if (!ret) {
+   /* Associate with monitored process. */
+   drvdata->pid = pid;
drvdata->mode = CS_MODE_PERF;
atomic_inc(csdev->refcnt);
}
@@ -300,6 +322,8 @@ static int tmc_disable_etf_sink(struct coresight_device 
*csdev)
/* Complain if we (somehow) got out of sync */
WARN_ON_ONCE(drvdata->mode == CS_MODE_DISABLED);
tmc_etb_disable_hw(drvdata);
+   /* Dissociate from monitored process. */
+   drvdata->pid = -1;
drvdata->mode = CS_MODE_DISABLED;
 
spin_unlock_irqrestore(&drvdata->spinlock, flags);
@@ -414,7 +438,7 @@ static unsigned long tmc_update_etf_buffer(struct 
coresight_device *csdev,
u32 *buf_ptr;
u64 read_ptr, write_ptr;
u32 status;
-   unsigned long offset, to_read, flags;
+   unsigned long offset, to_read = 0, flags;
struct cs_buffers *buf = sink_config;
struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
 
@@ -426,6 +450,11 @@ static unsigned long tmc_update_etf_buffer(struct 
coresight_device *csdev,
return 0;
 
spin_lock_irqsave(&drvdata->spinlock, flags);
+
+   /* Don't do anything if another tracer is using this sink */
+   if (atomic_read(csdev->refcnt) != 1)
+   goto out;
+
CS_UNLOCK(drvdata->base);
 
tmc_flush_and_stop(drvdata);
@@ -519,6 +548,7 @@ static unsigned long tmc_update_etf_buffer(struct 
coresight_device *csdev,
to_read = buf->nr_pages << PAGE_SHIFT;
}
CS_LOCK(drvdata->base);
+out:
spin_unlock_irqrestore(&drvdata->spinlock, flags);
 
return to_read;
-- 
2.17.1



[PATCH v3 17/20] coresight: tmc-etr: Allocate and free ETR memory buffers for CPU-wide scenarios

2019-04-03 Thread Mathieu Poirier
This patch uses the PID of the process being traced to allocate and free
ETR memory buffers for CPU-wide scenarios.  The implementation is tailored
to handle both N:1 and 1:1 source/sink HW topologies.

Signed-off-by: Mathieu Poirier 
---
 .../hwtracing/coresight/coresight-tmc-etr.c   | 107 +-
 1 file changed, 104 insertions(+), 3 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c 
b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 1346474ac019..61110ef41d00 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -8,6 +8,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -26,6 +28,7 @@ struct etr_flat_buf {
 
 /*
  * etr_perf_buffer - Perf buffer used for ETR
+ * @drvdata- The ETR drvdata this buffer has been allocated for.
  * @etr_buf- Actual buffer used by the ETR
  * @pid- The PID this etr_perf_buffer belongs to.
  * @snaphost   - Perf session mode
@@ -34,6 +37,7 @@ struct etr_flat_buf {
  * @pages  - Array of Pages in the ring buffer.
  */
 struct etr_perf_buffer {
+   struct tmc_drvdata  *drvdata;
struct etr_buf  *etr_buf;
pid_t   pid;
boolsnapshot;
@@ -1210,6 +1214,72 @@ alloc_etr_buf(struct tmc_drvdata *drvdata, struct 
perf_event *event,
return etr_buf;
 }
 
+static struct etr_buf *
+get_perf_etr_buf_cpu_wide(struct tmc_drvdata *drvdata,
+ struct perf_event *event, int nr_pages,
+ void **pages, bool snapshot)
+{
+   int ret;
+   pid_t pid = task_pid_nr(event->owner);
+   struct etr_buf *etr_buf;
+
+retry:
+   /*
+* An etr_perf_buffer is associated with an event and holds a reference
+* to the AUX ring buffer that was created for that event.  In CPU-wide
+* N:1 mode multiple events (one per CPU), each with its own AUX ring
+* buffer, share a sink.  As such an etr_perf_buffer is created for each
+* event but a single etr_buf associated with the ETR is shared between
+* them.  The last event in a trace session will copy the content of the
+* etr_buf to its AUX ring buffer.  Ring buffers associated with other
+* events are simply not used and freed as events are destroyed.  We still
+* need to allocate a ring buffer for each event since we don't know
+* which event will be last.
+*/
+
+   /*
+* The first thing to do here is check if an etr_buf has already been
+* allocated for this session.  If so it is shared with this event,
+* otherwise it is created.
+*/
+   mutex_lock(&drvdata->idr_mutex);
+   etr_buf = idr_find(&drvdata->idr, pid);
+   if (etr_buf) {
+   refcount_inc(&etr_buf->refcount);
+   mutex_unlock(&drvdata->idr_mutex);
+   return etr_buf;
+   }
+
+   /* If we made it here no buffer has been allocated, do so now. */
+   mutex_unlock(&drvdata->idr_mutex);
+
+   etr_buf = alloc_etr_buf(drvdata, event, nr_pages, pages, snapshot);
+   if (IS_ERR(etr_buf))
+   return etr_buf;
+
+   refcount_set(&etr_buf->refcount, 1);
+
+   /* Now that we have a buffer, add it to the IDR. */
+   mutex_lock(&drvdata->idr_mutex);
+   ret = idr_alloc(&drvdata->idr, etr_buf, pid, pid + 1, GFP_KERNEL);
+   mutex_unlock(&drvdata->idr_mutex);
+
+   /* Another event with this session ID has allocated this buffer. */
+   if (ret == -ENOSPC) {
+   tmc_free_etr_buf(etr_buf);
+   goto retry;
+   }
+
+   /* The IDR can't allocate room for a new session, abandon ship. */
+   if (ret == -ENOMEM) {
+   tmc_free_etr_buf(etr_buf);
+   return ERR_PTR(ret);
+   }
+
+
+   return etr_buf;
+}
+
 static struct etr_buf *
 get_perf_etr_buf_per_thread(struct tmc_drvdata *drvdata,
struct perf_event *event, int nr_pages,
@@ -1238,7 +1308,8 @@ get_perf_etr_buf(struct tmc_drvdata *drvdata, struct 
perf_event *event,
return get_perf_etr_buf_per_thread(drvdata, event, nr_pages,
   pages, snapshot);
 
-   return ERR_PTR(-ENOENT);
+   return get_perf_etr_buf_cpu_wide(drvdata, event, nr_pages,
+pages, snapshot);
 }
 
 static struct etr_perf_buffer *
@@ -1265,7 +1336,13 @@ tmc_etr_setup_perf_buf(struct tmc_drvdata *drvdata, 
struct perf_event *event,
return ERR_PTR(-ENOMEM);
 
 done:
+   /*
+* Keep a reference to the ETR this buffer has been allocated for
+* in order to have access to the IDR in tmc_free_etr_buffer().
+*/
+   etr_perf->drvdata = drvdata;
etr_perf->etr_buf = etr_buf;
+
return etr_perf;
 }
 
@@ -1295,9 +1372,33 @@ static void *tmc_alloc_etr_buffer(struct 

[PATCH v3 13/20] coresight: tmc-etr: Create per-thread buffer allocation function

2019-04-03 Thread Mathieu Poirier
Buffer allocation is different when dealing with per-thread and
CPU-wide sessions.  In preparation to support CPU-wide trace scenarios
simplify things by keeping allocation functions for both type separate.

Signed-off-by: Mathieu Poirier 
---
 .../hwtracing/coresight/coresight-tmc-etr.c   | 29 ++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c 
b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index e9c77009188a..ac1efdfc0d07 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -1206,6 +1206,33 @@ alloc_etr_buf(struct tmc_drvdata *drvdata, struct 
perf_event *event,
return etr_buf;
 }
 
+static struct etr_buf *
+get_perf_etr_buf_per_thread(struct tmc_drvdata *drvdata,
+   struct perf_event *event, int nr_pages,
+   void **pages, bool snapshot)
+{
+   struct etr_buf *etr_buf;
+
+   /*
+* In per-thread mode the etr_buf isn't shared, so just go ahead
+* with memory allocation.
+*/
+   etr_buf = alloc_etr_buf(drvdata, event, nr_pages, pages, snapshot);
+
+   return etr_buf;
+}
+
+static struct etr_buf *
+get_perf_etr_buf(struct tmc_drvdata *drvdata, struct perf_event *event,
+int nr_pages, void **pages, bool snapshot)
+{
+   if (event->cpu == -1)
+   return get_perf_etr_buf_per_thread(drvdata, event, nr_pages,
+  pages, snapshot);
+
+   return ERR_PTR(-ENOENT);
+}
+
 static struct etr_perf_buffer *
 tmc_etr_setup_perf_buf(struct tmc_drvdata *drvdata, struct perf_event *event,
   int nr_pages, void **pages, bool snapshot)
@@ -1222,7 +1249,7 @@ tmc_etr_setup_perf_buf(struct tmc_drvdata *drvdata, 
struct perf_event *event,
if (!etr_perf)
return ERR_PTR(-ENOMEM);
 
-   etr_buf = alloc_etr_buf(drvdata, event, nr_pages, pages, snapshot);
+   etr_buf = get_perf_etr_buf(drvdata, event, nr_pages, pages, snapshot);
if (!IS_ERR(etr_buf))
goto done;
 
-- 
2.17.1



[PATCH v3 06/20] coresight: Move reference counting inside sink drivers

2019-04-03 Thread Mathieu Poirier
When operating in CPU-wide mode with an N:1 source/sink HW topology,
multiple CPUs can access a sink concurrently.  As such reference counting
needs to happen when the device's spinlock is held to avoid racing with
other operations (start(), update(), stop()), such as:

session A   Session B
-   ---

enable_sink
atomic_inc(refcount)  = 1

...

atomic_dec(refcount) = 0enable_sink
if (refcount == 0) disable_sink
atomic_inc()

Signed-off-by: Mathieu Poirier 
Reviewed-by: Suzuki K Poulose 
---
 drivers/hwtracing/coresight/coresight-etb10.c | 21 ++
 .../hwtracing/coresight/coresight-tmc-etf.c   | 21 +++---
 .../hwtracing/coresight/coresight-tmc-etr.c   | 19 +++--
 drivers/hwtracing/coresight/coresight-tpiu.c  |  6 +++-
 drivers/hwtracing/coresight/coresight.c   | 28 +--
 5 files changed, 66 insertions(+), 29 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-etb10.c 
b/drivers/hwtracing/coresight/coresight-etb10.c
index 71c2a3cdb866..5af50a852e87 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -5,6 +5,7 @@
  * Description: CoreSight Embedded Trace Buffer driver
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -159,14 +160,15 @@ static int etb_enable_sysfs(struct coresight_device 
*csdev)
goto out;
}
 
-   /* Nothing to do, the tracer is already enabled. */
-   if (drvdata->mode == CS_MODE_SYSFS)
-   goto out;
+   if (drvdata->mode == CS_MODE_DISABLED) {
+   ret = etb_enable_hw(drvdata);
+   if (ret)
+   goto out;
 
-   ret = etb_enable_hw(drvdata);
-   if (!ret)
drvdata->mode = CS_MODE_SYSFS;
+   }
 
+   atomic_inc(csdev->refcnt);
 out:
spin_unlock_irqrestore(&drvdata->spinlock, flags);
return ret;
@@ -196,8 +198,10 @@ static int etb_enable_perf(struct coresight_device *csdev, 
void *data)
goto out;
 
ret = etb_enable_hw(drvdata);
-   if (!ret)
+   if (!ret) {
drvdata->mode = CS_MODE_PERF;
+   atomic_inc(csdev->refcnt);
+   }
 
 out:
spin_unlock_irqrestore(&drvdata->spinlock, flags);
@@ -332,6 +336,11 @@ static int etb_disable(struct coresight_device *csdev)
 
spin_lock_irqsave(&drvdata->spinlock, flags);
 
+   if (atomic_dec_return(csdev->refcnt)) {
+   spin_unlock_irqrestore(&drvdata->spinlock, flags);
+   return -EBUSY;
+   }
+
/* Disable the ETB only if it needs to */
if (drvdata->mode != CS_MODE_DISABLED) {
etb_disable_hw(drvdata);
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c 
b/drivers/hwtracing/coresight/coresight-tmc-etf.c
index d4213e7c2c45..d50a608a60f1 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etf.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c
@@ -4,6 +4,7 @@
  * Author: Mathieu Poirier 
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -180,8 +181,10 @@ static int tmc_enable_etf_sink_sysfs(struct 
coresight_device *csdev)
 * sink is already enabled no memory is needed and the HW need not be
 * touched.
 */
-   if (drvdata->mode == CS_MODE_SYSFS)
+   if (drvdata->mode == CS_MODE_SYSFS) {
+   atomic_inc(csdev->refcnt);
goto out;
+   }
 
/*
 * If drvdata::buf isn't NULL, memory was allocated for a previous
@@ -200,11 +203,13 @@ static int tmc_enable_etf_sink_sysfs(struct 
coresight_device *csdev)
}
 
ret = tmc_etb_enable_hw(drvdata);
-   if (!ret)
+   if (!ret) {
drvdata->mode = CS_MODE_SYSFS;
-   else
+   atomic_inc(csdev->refcnt);
+   } else {
/* Free up the buffer if we failed to enable */
used = false;
+   }
 out:
spin_unlock_irqrestore(&drvdata->spinlock, flags);
 
@@ -239,8 +244,10 @@ static int tmc_enable_etf_sink_perf(struct 
coresight_device *csdev, void *data)
if (ret)
break;
ret  = tmc_etb_enable_hw(drvdata);
-   if (!ret)
+   if (!ret) {
drvdata->mode = CS_MODE_PERF;
+   atomic_inc(csdev->refcnt);
+   }
} while (0);
spin_unlock_irqrestore(&drvdata->spinlock, flags);
 
@@ -279,11 +286,17 @@ static int tmc_disable_etf_sink(struct coresight_device 
*csdev)
struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
 
spin_lock_irqsave(&drvdata->spinlock, flags);
+
if (drvdata->reading) {
spin_unlock_irqrestore(&drvdata->spinlock, flags);
return -EBUSY;
}
 
+   if (atomic_dec_return(csdev->refcnt)) {
+   spin_unlock_irqrestore(&drvdata->spinlock, flags);
+   return -EBUSY;
+  

[PATCH v3 15/20] coresight: tmc-etr: Introduce the notion of reference counting to ETR devices

2019-04-03 Thread Mathieu Poirier
This patch adds reference counting to struct etr_buf so that, in CPU-wide
trace scenarios, shared buffers can be disposed of when no longer used.

Signed-off-by: Mathieu Poirier 
---
 drivers/hwtracing/coresight/coresight-tmc-etr.c | 5 +
 drivers/hwtracing/coresight/coresight-tmc.h | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c 
b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index e1774d4bb5f3..1346474ac019 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1221,7 +1222,11 @@ get_perf_etr_buf_per_thread(struct tmc_drvdata *drvdata,
 * with memory allocation.
 */
etr_buf = alloc_etr_buf(drvdata, event, nr_pages, pages, snapshot);
+   if (IS_ERR(etr_buf))
+   goto out;
 
+   refcount_set(&etr_buf->refcount, 1);
+out:
return etr_buf;
 }
 
diff --git a/drivers/hwtracing/coresight/coresight-tmc.h 
b/drivers/hwtracing/coresight/coresight-tmc.h
index 487c53701e9c..ee44906dffe8 100644
--- a/drivers/hwtracing/coresight/coresight-tmc.h
+++ b/drivers/hwtracing/coresight/coresight-tmc.h
@@ -9,6 +9,7 @@
 
 #include 
 #include 
+#include 
 
 #define TMC_RSZ0x004
 #define TMC_STS0x00c
@@ -133,6 +134,7 @@ struct etr_buf_operations;
 
 /**
  * struct etr_buf - Details of the buffer used by ETR
+ * @refcount ; Number of sources currently using this etr_buf.
  * @mode   : Mode of the ETR buffer, contiguous, Scatter Gather etc.
  * @full   : Trace data overflow
  * @size   : Size of the buffer.
@@ -143,6 +145,7 @@ struct etr_buf_operations;
  * @private: Backend specific information for the buf
  */
 struct etr_buf {
+   refcount_t  refcount;
enum etr_mode   mode;
boolfull;
ssize_t size;
-- 
2.17.1



[PATCH v3 09/20] coresight: perf: Clean up function etm_setup_aux()

2019-04-03 Thread Mathieu Poirier
There is no point in allocating sink memory for a trace session if
there is not a way to free it once it is no longer needed.  As such make
sure the sink API function to allocate and free memory have been
implemented before moving ahead with the establishment of a trace
session.

Signed-off-by: Mathieu Poirier 
Reviewed-by: Suzuki K Poulose 
---
 drivers/hwtracing/coresight/coresight-etm-perf.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c 
b/drivers/hwtracing/coresight/coresight-etm-perf.c
index bbfed70b3402..b8ca3800b56b 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -134,8 +134,7 @@ static void free_event_data(struct work_struct *work)
if (event_data->snk_config && !WARN_ON(cpumask_empty(mask))) {
cpu = cpumask_first(mask);
sink = coresight_get_sink(etm_event_cpu_path(event_data, cpu));
-   if (sink_ops(sink)->free_buffer)
-   sink_ops(sink)->free_buffer(event_data->snk_config);
+   sink_ops(sink)->free_buffer(event_data->snk_config);
}
 
for_each_cpu(cpu, mask) {
@@ -215,7 +214,7 @@ static void *etm_setup_aux(struct perf_event *event, void 
**pages,
sink = coresight_get_enabled_sink(true);
}
 
-   if (!sink || !sink_ops(sink)->alloc_buffer)
+   if (!sink)
goto err;
 
mask = &event_data->mask;
@@ -261,6 +260,9 @@ static void *etm_setup_aux(struct perf_event *event, void 
**pages,
if (cpu >= nr_cpu_ids)
goto err;
 
+   if (!sink_ops(sink)->alloc_buffer || !sink_ops(sink)->free_buffer)
+   goto err;
+
/* Allocate the sink buffer for this session */
event_data->snk_config =
sink_ops(sink)->alloc_buffer(sink, cpu, pages,
-- 
2.17.1



[PATCH v3 16/20] coresight: tmc-etr: Introduce the notion of IDR to ETR devices

2019-04-03 Thread Mathieu Poirier
In CPU-wide scenarios with an N:1 source/sink topology, sources share
the same sink.  In order to reuse the same sink for all sources an
IDR is needed to archive events that have already been accounted for.

Signed-off-by: Mathieu Poirier 
---
 drivers/hwtracing/coresight/coresight-tmc.c | 4 
 drivers/hwtracing/coresight/coresight-tmc.h | 6 ++
 2 files changed, 10 insertions(+)

diff --git a/drivers/hwtracing/coresight/coresight-tmc.c 
b/drivers/hwtracing/coresight/coresight-tmc.c
index 2a02da3d630f..71c86cffc021 100644
--- a/drivers/hwtracing/coresight/coresight-tmc.c
+++ b/drivers/hwtracing/coresight/coresight-tmc.c
@@ -8,10 +8,12 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -447,6 +449,8 @@ static int tmc_probe(struct amba_device *adev, const struct 
amba_id *id)
 coresight_get_uci_data(id));
if (ret)
goto out;
+   idr_init(&drvdata->idr);
+   mutex_init(&drvdata->idr_mutex);
break;
case TMC_CONFIG_TYPE_ETF:
desc.type = CORESIGHT_DEV_TYPE_LINKSINK;
diff --git a/drivers/hwtracing/coresight/coresight-tmc.h 
b/drivers/hwtracing/coresight/coresight-tmc.h
index ee44906dffe8..c1b1700b2df7 100644
--- a/drivers/hwtracing/coresight/coresight-tmc.h
+++ b/drivers/hwtracing/coresight/coresight-tmc.h
@@ -8,7 +8,9 @@
 #define _CORESIGHT_TMC_H
 
 #include 
+#include 
 #include 
+#include 
 #include 
 
 #define TMC_RSZ0x004
@@ -173,6 +175,8 @@ struct etr_buf {
  * @trigger_cntr: amount of words to store after a trigger.
  * @etr_caps:  Bitmask of capabilities of the TMC ETR, inferred from the
  * device configuration register (DEVID)
+ * @idr:   Holds etr_bufs allocated for this ETR.
+ * @idr_mutex: Access serialisation for idr.
  * @perf_data: PERF buffer for ETR.
  * @sysfs_data:SYSFS buffer for ETR.
  */
@@ -194,6 +198,8 @@ struct tmc_drvdata {
enum tmc_mem_intf_width memwidth;
u32 trigger_cntr;
u32 etr_caps;
+   struct idr  idr;
+   struct mutexidr_mutex;
struct etr_buf  *sysfs_buf;
void*perf_data;
 };
-- 
2.17.1



[PATCH v3 12/20] coresight: tmc-etr: Refactor function tmc_etr_setup_perf_buf()

2019-04-03 Thread Mathieu Poirier
Refactoring function tmc_etr_setup_perf_buf() so that it only deals
with the high level etr_perf_buffer, leaving the allocation of the
backend buffer (i.e etr_buf) to another function.

That way the backend buffer allocation function can decide if it wants
to reuse an existing buffer (CPU-wide trace scenarios) or simply create
a new one.

Signed-off-by: Mathieu Poirier 
---
 .../hwtracing/coresight/coresight-tmc-etr.c   | 39 ++-
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c 
b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 00db6a6ce23f..e9c77009188a 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -1159,29 +1159,24 @@ static int tmc_enable_etr_sink_sysfs(struct 
coresight_device *csdev)
 }
 
 /*
- * tmc_etr_setup_perf_buf: Allocate ETR buffer for use by perf.
+ * alloc_etr_buf: Allocate ETR buffer for use by perf.
  * The size of the hardware buffer is dependent on the size configured
  * via sysfs and the perf ring buffer size. We prefer to allocate the
  * largest possible size, scaling down the size by half until it
  * reaches a minimum limit (1M), beyond which we give up.
  */
-static struct etr_perf_buffer *
-tmc_etr_setup_perf_buf(struct tmc_drvdata *drvdata, struct perf_event *event,
-  int nr_pages, void **pages, bool snapshot)
+static struct etr_buf *
+alloc_etr_buf(struct tmc_drvdata *drvdata, struct perf_event *event,
+ int nr_pages, void **pages, bool snapshot)
 {
int node, cpu = event->cpu;
struct etr_buf *etr_buf;
-   struct etr_perf_buffer *etr_perf;
unsigned long size;
 
if (cpu == -1)
cpu = smp_processor_id();
node = cpu_to_node(cpu);
 
-   etr_perf = kzalloc_node(sizeof(*etr_perf), GFP_KERNEL, node);
-   if (!etr_perf)
-   return ERR_PTR(-ENOMEM);
-
/*
 * Try to match the perf ring buffer size if it is larger
 * than the size requested via sysfs.
@@ -1205,6 +1200,32 @@ tmc_etr_setup_perf_buf(struct tmc_drvdata *drvdata, 
struct perf_event *event,
size /= 2;
} while (size >= TMC_ETR_PERF_MIN_BUF_SIZE);
 
+   return ERR_PTR(-ENOMEM);
+
+done:
+   return etr_buf;
+}
+
+static struct etr_perf_buffer *
+tmc_etr_setup_perf_buf(struct tmc_drvdata *drvdata, struct perf_event *event,
+  int nr_pages, void **pages, bool snapshot)
+{
+   int node, cpu = event->cpu;
+   struct etr_buf *etr_buf;
+   struct etr_perf_buffer *etr_perf;
+
+   if (cpu == -1)
+   cpu = smp_processor_id();
+   node = cpu_to_node(cpu);
+
+   etr_perf = kzalloc_node(sizeof(*etr_perf), GFP_KERNEL, node);
+   if (!etr_perf)
+   return ERR_PTR(-ENOMEM);
+
+   etr_buf = alloc_etr_buf(drvdata, event, nr_pages, pages, snapshot);
+   if (!IS_ERR(etr_buf))
+   goto done;
+
kfree(etr_perf);
return ERR_PTR(-ENOMEM);
 
-- 
2.17.1



[PATCH v3 05/20] coresight: Adding return code to sink::disable() operation

2019-04-03 Thread Mathieu Poirier
In preparation to handle device reference counting inside of the sink
drivers, add a return code to the sink::disable() operation so that
proper action can be taken if a sink has not been disabled.

Signed-off-by: Mathieu Poirier 
Reviewed-by: Suzuki K Poulose 
---
 drivers/hwtracing/coresight/coresight-etb10.c   | 3 ++-
 drivers/hwtracing/coresight/coresight-tmc-etf.c | 5 +++--
 drivers/hwtracing/coresight/coresight-tmc-etr.c | 5 +++--
 drivers/hwtracing/coresight/coresight-tpiu.c| 3 ++-
 drivers/hwtracing/coresight/coresight.c | 6 +-
 include/linux/coresight.h   | 2 +-
 6 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-etb10.c 
b/drivers/hwtracing/coresight/coresight-etb10.c
index 105782ea64c7..71c2a3cdb866 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -325,7 +325,7 @@ static void etb_disable_hw(struct etb_drvdata *drvdata)
coresight_disclaim_device(drvdata->base);
 }
 
-static void etb_disable(struct coresight_device *csdev)
+static int etb_disable(struct coresight_device *csdev)
 {
struct etb_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
unsigned long flags;
@@ -340,6 +340,7 @@ static void etb_disable(struct coresight_device *csdev)
spin_unlock_irqrestore(&drvdata->spinlock, flags);
 
dev_dbg(drvdata->dev, "ETB disabled\n");
+   return 0;
 }
 
 static void *etb_alloc_buffer(struct coresight_device *csdev, int cpu,
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c 
b/drivers/hwtracing/coresight/coresight-tmc-etf.c
index a5f053f2db2c..d4213e7c2c45 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etf.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c
@@ -273,7 +273,7 @@ static int tmc_enable_etf_sink(struct coresight_device 
*csdev,
return 0;
 }
 
-static void tmc_disable_etf_sink(struct coresight_device *csdev)
+static int tmc_disable_etf_sink(struct coresight_device *csdev)
 {
unsigned long flags;
struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
@@ -281,7 +281,7 @@ static void tmc_disable_etf_sink(struct coresight_device 
*csdev)
spin_lock_irqsave(&drvdata->spinlock, flags);
if (drvdata->reading) {
spin_unlock_irqrestore(&drvdata->spinlock, flags);
-   return;
+   return -EBUSY;
}
 
/* Disable the TMC only if it needs to */
@@ -293,6 +293,7 @@ static void tmc_disable_etf_sink(struct coresight_device 
*csdev)
spin_unlock_irqrestore(>spinlock, flags);
 
dev_dbg(drvdata->dev, "TMC-ETB/ETF disabled\n");
+   return 0;
 }
 
 static int tmc_enable_etf_link(struct coresight_device *csdev,
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c 
b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index f684283890d3..33501777038a 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -1392,7 +1392,7 @@ static int tmc_enable_etr_sink(struct coresight_device 
*csdev,
return -EINVAL;
 }
 
-static void tmc_disable_etr_sink(struct coresight_device *csdev)
+static int tmc_disable_etr_sink(struct coresight_device *csdev)
 {
unsigned long flags;
struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
@@ -1400,7 +1400,7 @@ static void tmc_disable_etr_sink(struct coresight_device 
*csdev)
spin_lock_irqsave(&drvdata->spinlock, flags);
if (drvdata->reading) {
spin_unlock_irqrestore(&drvdata->spinlock, flags);
-   return;
+   return -EBUSY;
}
 
/* Disable the TMC only if it needs to */
@@ -1412,6 +1412,7 @@ static void tmc_disable_etr_sink(struct coresight_device 
*csdev)
spin_unlock_irqrestore(&drvdata->spinlock, flags);
 
dev_dbg(drvdata->dev, "TMC-ETR disabled\n");
+   return 0;
 }
 
 static const struct coresight_ops_sink tmc_etr_sink_ops = {
diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c 
b/drivers/hwtracing/coresight/coresight-tpiu.c
index b2f72a1fa402..0d13da1b9df1 100644
--- a/drivers/hwtracing/coresight/coresight-tpiu.c
+++ b/drivers/hwtracing/coresight/coresight-tpiu.c
@@ -94,13 +94,14 @@ static void tpiu_disable_hw(struct tpiu_drvdata *drvdata)
CS_LOCK(drvdata->base);
 }
 
-static void tpiu_disable(struct coresight_device *csdev)
+static int tpiu_disable(struct coresight_device *csdev)
 {
struct tpiu_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
 
tpiu_disable_hw(drvdata);
 
dev_dbg(drvdata->dev, "TPIU disabled\n");
+   return 0;
 }
 
 static const struct coresight_ops_sink tpiu_sink_ops = {
diff --git a/drivers/hwtracing/coresight/coresight.c 
b/drivers/hwtracing/coresight/coresight.c
index 29cef898afba..13eda4693f81 100644
--- a/drivers/hwtracing/coresight/coresight.c
+++ b/drivers/hwtracing/coresight/coresight.c
@@ -239,9 +239,13 @@ static int 

[PATCH v3 02/20] coresight: etm4x: Add kernel configuration for CONTEXTID

2019-04-03 Thread Mathieu Poirier
Set the proper bit in the configuration register when contextID tracing
has been requested by user space.  That way PE_CONTEXT elements are
generated by the tracers when a process is installed on a CPU.

Signed-off-by: Mathieu Poirier 
---
 drivers/hwtracing/coresight/Kconfig  | 1 +
 drivers/hwtracing/coresight/coresight-etm-perf.c | 2 ++
 drivers/hwtracing/coresight/coresight-etm4x.c| 5 +
 include/linux/coresight-pmu.h| 2 ++
 tools/include/linux/coresight-pmu.h  | 2 ++
 5 files changed, 12 insertions(+)

diff --git a/drivers/hwtracing/coresight/Kconfig 
b/drivers/hwtracing/coresight/Kconfig
index ad34380cac49..44d1650f398e 100644
--- a/drivers/hwtracing/coresight/Kconfig
+++ b/drivers/hwtracing/coresight/Kconfig
@@ -75,6 +75,7 @@ config CORESIGHT_SOURCE_ETM4X
bool "CoreSight Embedded Trace Macrocell 4.x driver"
depends on ARM64
select CORESIGHT_LINKS_AND_SINKS
+   select PID_IN_CONTEXTIDR
help
  This driver provides support for the ETM4.x tracer module, tracing the
  instructions that a processor is executing. This is primarily useful
diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c 
b/drivers/hwtracing/coresight/coresight-etm-perf.c
index 25ae56e924bb..bbfed70b3402 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -29,6 +29,7 @@ static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
 
 /* ETMv3.5/PTM's ETMCR is 'config' */
 PMU_FORMAT_ATTR(cycacc,"config:" __stringify(ETM_OPT_CYCACC));
+PMU_FORMAT_ATTR(contextid, "config:" __stringify(ETM_OPT_CTXTID));
 PMU_FORMAT_ATTR(timestamp, "config:" __stringify(ETM_OPT_TS));
 PMU_FORMAT_ATTR(retstack,  "config:" __stringify(ETM_OPT_RETSTK));
 /* Sink ID - same for all ETMs */
@@ -36,6 +37,7 @@ PMU_FORMAT_ATTR(sinkid,   "config2:0-31");
 
 static struct attribute *etm_config_formats_attr[] = {
_attr_cycacc.attr,
+   _attr_contextid.attr,
_attr_timestamp.attr,
_attr_retstack.attr,
_attr_sinkid.attr,
diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c 
b/drivers/hwtracing/coresight/coresight-etm4x.c
index 08ce37c9475d..732ae12fca9b 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x.c
@@ -239,6 +239,11 @@ static int etm4_parse_event_config(struct etmv4_drvdata 
*drvdata,
if (attr->config & BIT(ETM_OPT_TS))
/* bit[11], Global timestamp tracing bit */
config->cfg |= BIT(11);
+
+   if (attr->config & BIT(ETM_OPT_CTXTID))
+   /* bit[6], Context ID tracing bit */
+   config->cfg |= BIT(ETM4_CFG_BIT_CTXTID);
+
/* return stack - enable if selected and supported */
if ((attr->config & BIT(ETM_OPT_RETSTK)) && drvdata->retstack)
/* bit[12], Return stack enable bit */
diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h
index a1a959ba24ff..b0e35eec6499 100644
--- a/include/linux/coresight-pmu.h
+++ b/include/linux/coresight-pmu.h
@@ -12,11 +12,13 @@
 
 /* ETMv3.5/PTM's ETMCR config bit */
 #define ETM_OPT_CYCACC  12
+#define ETM_OPT_CTXTID 14
 #define ETM_OPT_TS  28
 #define ETM_OPT_RETSTK 29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
 #define ETM4_CFG_BIT_CYCACC4
+#define ETM4_CFG_BIT_CTXTID6
 #define ETM4_CFG_BIT_TS11
 #define ETM4_CFG_BIT_RETSTK12
 
diff --git a/tools/include/linux/coresight-pmu.h 
b/tools/include/linux/coresight-pmu.h
index a1a959ba24ff..b0e35eec6499 100644
--- a/tools/include/linux/coresight-pmu.h
+++ b/tools/include/linux/coresight-pmu.h
@@ -12,11 +12,13 @@
 
 /* ETMv3.5/PTM's ETMCR config bit */
 #define ETM_OPT_CYCACC  12
+#define ETM_OPT_CTXTID 14
 #define ETM_OPT_TS  28
 #define ETM_OPT_RETSTK 29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
 #define ETM4_CFG_BIT_CYCACC4
+#define ETM4_CFG_BIT_CTXTID6
 #define ETM4_CFG_BIT_TS11
 #define ETM4_CFG_BIT_RETSTK12
 
-- 
2.17.1



[PATCH v3 07/20] coresight: Properly address errors in sink::disable() functions

2019-04-03 Thread Mathieu Poirier
When disabling a sink the reference counter ensures the operation goes
through if nobody else is using it.  As such if drvdata::mode is already
set to CS_MODE_DISABLED, it is an error and should be reported as such.

Signed-off-by: Mathieu Poirier 
Reviewed-by: Suzuki K Poulose 
---
 drivers/hwtracing/coresight/coresight-etb10.c   | 9 -
 drivers/hwtracing/coresight/coresight-tmc-etf.c | 9 -
 drivers/hwtracing/coresight/coresight-tmc-etr.c | 9 -
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-etb10.c 
b/drivers/hwtracing/coresight/coresight-etb10.c
index 5af50a852e87..52b7d95ab498 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -341,11 +341,10 @@ static int etb_disable(struct coresight_device *csdev)
return -EBUSY;
}
 
-   /* Disable the ETB only if it needs to */
-   if (drvdata->mode != CS_MODE_DISABLED) {
-   etb_disable_hw(drvdata);
-   drvdata->mode = CS_MODE_DISABLED;
-   }
+   /* Complain if we (somehow) got out of sync */
+   WARN_ON_ONCE(drvdata->mode == CS_MODE_DISABLED);
+   etb_disable_hw(drvdata);
+   drvdata->mode = CS_MODE_DISABLED;
spin_unlock_irqrestore(>spinlock, flags);
 
dev_dbg(drvdata->dev, "ETB disabled\n");
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c 
b/drivers/hwtracing/coresight/coresight-tmc-etf.c
index d50a608a60f1..30f868676540 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etf.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c
@@ -297,11 +297,10 @@ static int tmc_disable_etf_sink(struct coresight_device 
*csdev)
return -EBUSY;
}
 
-   /* Disable the TMC only if it needs to */
-   if (drvdata->mode != CS_MODE_DISABLED) {
-   tmc_etb_disable_hw(drvdata);
-   drvdata->mode = CS_MODE_DISABLED;
-   }
+   /* Complain if we (somehow) got out of sync */
+   WARN_ON_ONCE(drvdata->mode == CS_MODE_DISABLED);
+   tmc_etb_disable_hw(drvdata);
+   drvdata->mode = CS_MODE_DISABLED;
 
spin_unlock_irqrestore(>spinlock, flags);
 
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c 
b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index f90bca971367..86e748d09dc3 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -1416,11 +1416,10 @@ static int tmc_disable_etr_sink(struct coresight_device 
*csdev)
return -EBUSY;
}
 
-   /* Disable the TMC only if it needs to */
-   if (drvdata->mode != CS_MODE_DISABLED) {
-   tmc_etr_disable_hw(drvdata);
-   drvdata->mode = CS_MODE_DISABLED;
-   }
+   /* Complain if we (somehow) got out of sync */
+   WARN_ON_ONCE(drvdata->mode == CS_MODE_DISABLED);
+   tmc_etr_disable_hw(drvdata);
+   drvdata->mode = CS_MODE_DISABLED;
 
spin_unlock_irqrestore(>spinlock, flags);
 
-- 
2.17.1



[PATCH v3 01/20] coresight: pmu: Adding ITRACE property to cs_etm PMU

2019-04-03 Thread Mathieu Poirier
Add to the capabilities the ITRACE property so that ITRACE START events
are generated when the PMU is switched on by the core.

Signed-off-by: Mathieu Poirier 
---
 drivers/hwtracing/coresight/coresight-etm-perf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c 
b/drivers/hwtracing/coresight/coresight-etm-perf.c
index 4d5a2b9f9d6a..25ae56e924bb 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -566,7 +566,8 @@ static int __init etm_perf_init(void)
 {
int ret;
 
-   etm_pmu.capabilities= PERF_PMU_CAP_EXCLUSIVE;
+   etm_pmu.capabilities= (PERF_PMU_CAP_EXCLUSIVE |
+  PERF_PMU_CAP_ITRACE);
 
etm_pmu.attr_groups = etm_pmu_attr_groups;
etm_pmu.task_ctx_nr = perf_sw_context;
-- 
2.17.1



[PATCH v3 03/20] coresight: etm4x: Skip selector pair 0

2019-04-03 Thread Mathieu Poirier
Resource selector pair 0 is always implemented and reserved.  As such
it should not be explicitly programmed.

Signed-off-by: Mathieu Poirier 
---
 drivers/hwtracing/coresight/coresight-etm4x.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c 
b/drivers/hwtracing/coresight/coresight-etm4x.c
index 732ae12fca9b..d64192c29860 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x.c
@@ -138,8 +138,11 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata)
   drvdata->base + TRCCNTVRn(i));
}
 
-   /* Resource selector pair 0 is always implemented and reserved */
-   for (i = 0; i < drvdata->nr_resource * 2; i++)
+   /*
+* Resource selector pair 0 is always implemented and reserved.  As
+* such start at 2.
+*/
+   for (i = 2; i < drvdata->nr_resource * 2; i++)
writel_relaxed(config->res_ctrl[i],
   drvdata->base + TRCRSCTLRn(i));
 
-- 
2.17.1



[PATCH v3 11/20] coresight: Communicate perf event to sink buffer allocation functions

2019-04-03 Thread Mathieu Poirier
Make struct perf_event available to sink buffer allocation functions in
order to use the pid they carry to allocate and free buffer memory along
with regimenting access to what source a sink can collect data for.

Signed-off-by: Mathieu Poirier 
---
 drivers/hwtracing/coresight/coresight-etb10.c  |  7 ---
 .../hwtracing/coresight/coresight-etm-perf.c   |  2 +-
 .../hwtracing/coresight/coresight-tmc-etf.c|  7 ---
 .../hwtracing/coresight/coresight-tmc-etr.c| 18 ++
 include/linux/coresight.h  |  5 +++--
 5 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-etb10.c 
b/drivers/hwtracing/coresight/coresight-etb10.c
index 6b50e781dc57..7d64c41cd8ac 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -351,10 +351,11 @@ static int etb_disable(struct coresight_device *csdev)
return 0;
 }
 
-static void *etb_alloc_buffer(struct coresight_device *csdev, int cpu,
- void **pages, int nr_pages, bool overwrite)
+static void *etb_alloc_buffer(struct coresight_device *csdev,
+ struct perf_event *event, void **pages,
+ int nr_pages, bool overwrite)
 {
-   int node;
+   int node, cpu = event->cpu;
struct cs_buffers *buf;
 
if (cpu == -1)
diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c 
b/drivers/hwtracing/coresight/coresight-etm-perf.c
index 806b3dd5872d..3c6294432748 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -277,7 +277,7 @@ static void *etm_setup_aux(struct perf_event *event, void 
**pages,
 
/* Allocate the sink buffer for this session */
event_data->snk_config =
-   sink_ops(sink)->alloc_buffer(sink, cpu, pages,
+   sink_ops(sink)->alloc_buffer(sink, event, pages,
 nr_pages, overwrite);
if (!event_data->snk_config)
goto err;
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c 
b/drivers/hwtracing/coresight/coresight-tmc-etf.c
index a38ad2b0d95a..1df1f8fade71 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etf.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c
@@ -350,10 +350,11 @@ static void tmc_disable_etf_link(struct coresight_device 
*csdev,
dev_dbg(drvdata->dev, "TMC-ETF disabled\n");
 }
 
-static void *tmc_alloc_etf_buffer(struct coresight_device *csdev, int cpu,
- void **pages, int nr_pages, bool overwrite)
+static void *tmc_alloc_etf_buffer(struct coresight_device *csdev,
+ struct perf_event *event, void **pages,
+ int nr_pages, bool overwrite)
 {
-   int node;
+   int node, cpu = event->cpu;
struct cs_buffers *buf;
 
if (cpu == -1)
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c 
b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 86e748d09dc3..00db6a6ce23f 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -1166,13 +1166,18 @@ static int tmc_enable_etr_sink_sysfs(struct 
coresight_device *csdev)
  * reaches a minimum limit (1M), beyond which we give up.
  */
 static struct etr_perf_buffer *
-tmc_etr_setup_perf_buf(struct tmc_drvdata *drvdata, int node, int nr_pages,
-  void **pages, bool snapshot)
+tmc_etr_setup_perf_buf(struct tmc_drvdata *drvdata, struct perf_event *event,
+  int nr_pages, void **pages, bool snapshot)
 {
+   int node, cpu = event->cpu;
struct etr_buf *etr_buf;
struct etr_perf_buffer *etr_perf;
unsigned long size;
 
+   if (cpu == -1)
+   cpu = smp_processor_id();
+   node = cpu_to_node(cpu);
+
etr_perf = kzalloc_node(sizeof(*etr_perf), GFP_KERNEL, node);
if (!etr_perf)
return ERR_PTR(-ENOMEM);
@@ -1210,16 +1215,13 @@ tmc_etr_setup_perf_buf(struct tmc_drvdata *drvdata, int 
node, int nr_pages,
 
 
 static void *tmc_alloc_etr_buffer(struct coresight_device *csdev,
- int cpu, void **pages, int nr_pages,
- bool snapshot)
+ struct perf_event *event, void **pages,
+ int nr_pages, bool snapshot)
 {
struct etr_perf_buffer *etr_perf;
struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
 
-   if (cpu == -1)
-   cpu = smp_processor_id();
-
-   etr_perf = tmc_etr_setup_perf_buf(drvdata, cpu_to_node(cpu),
+   etr_perf = tmc_etr_setup_perf_buf(drvdata, event,
  nr_pages, pages, snapshot);
if (IS_ERR(etr_perf)) {

[PATCH v3 04/20] coresight: etm4x: Configure tracers to emit timestamps

2019-04-03 Thread Mathieu Poirier
Configure timestamps to be emitted at regular intervals in the trace
stream to temporally correlate instructions executed on different CPUs.

Signed-off-by: Mathieu Poirier 
---
 drivers/hwtracing/coresight/coresight-etm4x.c | 101 +-
 1 file changed, 100 insertions(+), 1 deletion(-)

diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c 
b/drivers/hwtracing/coresight/coresight-etm4x.c
index d64192c29860..46d337fd8442 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x.c
@@ -204,6 +204,90 @@ static void etm4_enable_hw_smp_call(void *info)
arg->rc = etm4_enable_hw(arg->drvdata);
 }
 
+/*
+ * The goal of function etm4_config_timestamp_event() is to configure a
+ * counter that will tell the tracer to emit a timestamp packet when it
+ * reaches zero.  This is done in order to get a more fine grained idea
+ * of when instructions are executed so that they can be correlated
+ * with execution on other CPUs.
+ *
+ * To do this the counter itself is configured to self reload and
+ * TRCRSCTLR1 (always true) used to get the counter to decrement.  From
+ * there a resource selector is configured with the counter and the
+ * timestamp control register to use the resource selector to trigger the
+ * event that will insert a timestamp packet in the stream.
+ */
+static int etm4_config_timestamp_event(struct etmv4_drvdata *drvdata)
+{
+   int ctridx, ret = -EINVAL;
+   int counter, rselector;
+   u32 val = 0;
+   struct etmv4_config *config = >config;
+
+   /* No point in trying if we don't have at least one counter */
+   if (!drvdata->nr_cntr)
+   goto out;
+
+   /* Find a counter that hasn't been initialised */
+   for (ctridx = 0; ctridx < drvdata->nr_cntr; ctridx++)
+   if (config->cntr_val[ctridx] == 0)
+   break;
+
+   /* All the counters have been configured already, bail out */
+   if (ctridx == drvdata->nr_cntr) {
+   pr_debug("%s: no available counter found\n", __func__);
+   ret = -ENOSPC;
+   goto out;
+   }
+
+   /*
+* Searching for an available resource selector to use, starting at
+* '2' since every implementation has at least 2 resource selector.
+* ETMIDR4 gives the number of resource selector _pairs_,
+* hence multiply by 2.
+*/
+   for (rselector = 2; rselector < drvdata->nr_resource * 2; rselector++)
+   if (!config->res_ctrl[rselector])
+   break;
+
+   if (rselector == drvdata->nr_resource * 2) {
+   pr_debug("%s: no available resource selector found\n", 
__func__);
+   ret = -ENOSPC;
+   goto out;
+   }
+
+   /* Remember what counter we used */
+   counter = 1 << ctridx;
+
+   /*
+* Initialise original and reload counter value to the smallest
+* possible value in order to get as much precision as we can.
+*/
+   config->cntr_val[ctridx] = 1;
+   config->cntrldvr[ctridx] = 1;
+
+   /* Set the trace counter control register */
+   val =  0x1 << 16|  /* Bit 16, reload counter automatically */
+  0x0 << 7 |  /* Select single resource selector */
+  0x1;/* Resource selector 1, i.e always true */
+
+   config->cntr_ctrl[ctridx] = val;
+
+   val = 0x2 << 16 | /* Group 0b0010 - Counter and sequencers */
+ counter << 0;   /* Counter to use */
+
+   config->res_ctrl[rselector] = val;
+
+   val = 0x0 << 7  | /* Select single resource selector */
+ rselector;  /* Resource selector */
+
+   config->ts_ctrl = val;
+
+   ret = 0;
+out:
+   return ret;
+}
+
 static int etm4_parse_event_config(struct etmv4_drvdata *drvdata,
   struct perf_event *event)
 {
@@ -239,9 +323,24 @@ static int etm4_parse_event_config(struct etmv4_drvdata 
*drvdata,
/* TRM: Must program this for cycacc to work */
config->ccctlr = ETM_CYC_THRESHOLD_DEFAULT;
}
-   if (attr->config & BIT(ETM_OPT_TS))
+   if (attr->config & BIT(ETM_OPT_TS)) {
+   /*
+* Configure timestamps to be emitted at regular intervals in
+* order to correlate instructions executed on different CPUs
+* (CPU-wide trace scenarios).
+*/
+   ret = etm4_config_timestamp_event(drvdata);
+
+   /*
+* No need to go further if timestamp intervals can't
+* be configured.
+*/
+   if (ret)
+   goto out;
+
/* bit[11], Global timestamp tracing bit */
config->cfg |= BIT(11);
+   }
 
if (attr->config & BIT(ETM_OPT_CTXTID))
/* bit[6], Context ID tracing bit */

[PATCH v3 10/20] coresight: perf: Refactor function free_event_data()

2019-04-03 Thread Mathieu Poirier
Function free_event_data() is already busy and is bound to become
worse with the addition of CPU-wide trace scenarios.  As such spin
off a new function to strictly take care of the sink buffers.

Signed-off-by: Mathieu Poirier 
Reviewed-by: Suzuki K Poulose 
---
 .../hwtracing/coresight/coresight-etm-perf.c  | 24 ++-
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c 
b/drivers/hwtracing/coresight/coresight-etm-perf.c
index b8ca3800b56b..806b3dd5872d 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -120,22 +120,34 @@ static int etm_event_init(struct perf_event *event)
return ret;
 }
 
+static void free_sink_buffer(struct etm_event_data *event_data)
+{
+   int cpu;
+   cpumask_t *mask = _data->mask;
+   struct coresight_device *sink;
+
+   if (WARN_ON(cpumask_empty(mask)))
+   return;
+
+   if (!event_data->snk_config)
+   return;
+
+   cpu = cpumask_first(mask);
+   sink = coresight_get_sink(etm_event_cpu_path(event_data, cpu));
+   sink_ops(sink)->free_buffer(event_data->snk_config);
+}
+
 static void free_event_data(struct work_struct *work)
 {
int cpu;
cpumask_t *mask;
struct etm_event_data *event_data;
-   struct coresight_device *sink;
 
event_data = container_of(work, struct etm_event_data, work);
mask = _data->mask;
 
/* Free the sink buffers, if there are any */
-   if (event_data->snk_config && !WARN_ON(cpumask_empty(mask))) {
-   cpu = cpumask_first(mask);
-   sink = coresight_get_sink(etm_event_cpu_path(event_data, cpu));
-   sink_ops(sink)->free_buffer(event_data->snk_config);
-   }
+   free_sink_buffer(event_data);
 
for_each_cpu(cpu, mask) {
struct list_head **ppath;
-- 
2.17.1



[PATCH v3 00/20] coresight: Add support for CPU-wide trace scenarios

2019-04-03 Thread Mathieu Poirier
This is the third revision of a patchset that adds support for CPU-wide
trace scenarios and as such, it is now possible to issue the following
commands:

# perf record -e cs_etm/@2007.etr/ -C 2,3 $COMMAND
# perf record -e cs_etm/@2007.etr/ -a $COMMAND

The solution is designed to work for both 1:1 and N:1 source/sink
topologies, though the former hasn't been tested for lack of access to HW.

Most of the changes revolve around allowing more than one event to use
a sink when operated from perf.  More specifically the first event to
use a sink switches it on while the last one is tasked to aggregate traces
and switching off the device.

This is the kernel part of the solution, with the user space portion to be
released in a later set.  All patches (user and kernel) have been rebased
on v5.1-rc3 and are hosted here[1].  Everything has been tested on Juno, the
410c dragonboard, and hikey620 platforms.

Regards,
Mathieu

[1]. https://git.linaro.org/people/mathieu.poirier/coresight.git 
(5.1-rc3-cpu-wide-v3) 

== Changes for v3 ==
* Added review-by tags (some were dropped due to patch refactoring).
* Split IDR and reference counting patches.
* Moved IDR to struct tmc_drvdata to support 1:1 source/sink topologies.
* Enhanced code comments related to design choices.
* Renamed ETR buffer allocation functions to have a stronger perf semantic.
* Rebased to v5.1-rc3.

== Changes for V2 ==
* Using define ETM4_CFG_BIT_CTXTID rather than hard coded value (Suzuki).
* Moved pid out of struct etr_buf and into struct etr_perf_buffer (Suzuki).
* Removed code related to forcing double buffering (Suzuki).
* Fixed function reallocarray() for older distributions (Mike).
* Fixed counter configuration when dealing with errors(Leo).
* Automatically selecting PID_IN_CONTEXTIDR with ETMv4 driver.
* Rebased to v5.1-rc2.

Mathieu Poirier (20):
  coresight: pmu: Adding ITRACE property to cs_etm PMU
  coresight: etm4x: Add kernel configuration for CONTEXTID
  coresight: etm4x: Skip selector pair 0
  coresight: etm4x: Configure tracers to emit timestamps
  coresight: Adding return code to sink::disable() operation
  coresight: Move reference counting inside sink drivers
  coresight: Properly address errors in sink::disable() functions
  coresight: Properly address concurrency in sink::update() functions
  coresight: perf: Clean up function etm_setup_aux()
  coresight: perf: Refactor function free_event_data()
  coresight: Communicate perf event to sink buffer allocation functions
  coresight: tmc-etr: Refactor function tmc_etr_setup_perf_buf()
  coresight: tmc-etr: Create per-thread buffer allocation function
  coresight: tmc-etr: Introduce the notion of process ID to ETR devices
  coresight: tmc-etr: Introduce the notion of reference counting to ETR
devices
  coresight: tmc-etr: Introduce the notion of IDR to ETR devices
  coresight: tmc-etr: Allocate and free ETR memory buffers for CPU-wide
scenarios
  coresight: tmc-etr: Add support for CPU-wide trace scenarios
  coresight: tmc-etf: Add support for CPU-wide trace scenarios
  coresight: etb10: Add support for CPU-wide trace scenarios

 drivers/hwtracing/coresight/Kconfig   |   1 +
 drivers/hwtracing/coresight/coresight-etb10.c |  83 --
 .../hwtracing/coresight/coresight-etm-perf.c  |  37 ++-
 drivers/hwtracing/coresight/coresight-etm4x.c | 113 +++-
 .../hwtracing/coresight/coresight-tmc-etf.c   |  82 --
 .../hwtracing/coresight/coresight-tmc-etr.c   | 261 --
 drivers/hwtracing/coresight/coresight-tmc.c   |   6 +
 drivers/hwtracing/coresight/coresight-tmc.h   |  12 +
 drivers/hwtracing/coresight/coresight-tpiu.c  |   9 +-
 drivers/hwtracing/coresight/coresight.c   |  28 +-
 include/linux/coresight-pmu.h |   2 +
 include/linux/coresight.h |   7 +-
 tools/include/linux/coresight-pmu.h   |   2 +
 13 files changed, 546 insertions(+), 97 deletions(-)

-- 
2.17.1



[PATCH] mm:workingset use real time to judge activity of the file page

2019-04-03 Thread Zhaoyang Huang
From: Zhaoyang Huang 

In the previous implementation, the number of refaulted pages is used
to judge the refault period of each page, which is not precise because
the eviction of other files heavily affects the current cache.
We introduce a timestamp into the workingset's entry and a refault ratio
to measure the file page's activity. This helps to decrease the influence
of other files (the average refault ratio can reflect the state of the
whole system's memory).
The patch was tested on an Android system; the test can be described as
comparing the launch time of an application under huge memory
consumption. The result is that the launch time decreased by 50% and the
page faults during the test decreased by 80%.

Signed-off-by: Zhaoyang Huang 
---
 include/linux/mmzone.h |  2 ++
 mm/workingset.c| 24 +---
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 32699b2..c38ba0a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -240,6 +240,8 @@ struct lruvec {
atomic_long_t   inactive_age;
/* Refaults at the time of last reclaim cycle */
unsigned long   refaults;
+   atomic_long_t   refaults_ratio;
+   atomic_long_t   prev_fault;
 #ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
 #endif
diff --git a/mm/workingset.c b/mm/workingset.c
index 40ee02c..6361853 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -159,7 +159,7 @@
 NODES_SHIFT +  \
 MEM_CGROUP_ID_SHIFT)
 #define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
-
+#define EVICTION_JIFFIES (BITS_PER_LONG >> 3)
 /*
  * Eviction timestamps need to be able to cover the full range of
  * actionable refaults. However, bits are tight in the radix tree
@@ -175,18 +175,22 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, 
unsigned long eviction)
eviction >>= bucket_order;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
+   eviction = (eviction << EVICTION_JIFFIES) | (jiffies >> 
EVICTION_JIFFIES);
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
 
return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
 }
 
 static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
- unsigned long *evictionp)
+ unsigned long *evictionp, unsigned long *prev_jiffp)
 {
unsigned long entry = (unsigned long)shadow;
int memcgid, nid;
+   unsigned long prev_jiff;
 
entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
+   entry >>= EVICTION_JIFFIES;
+   prev_jiff = (entry & ((1UL << EVICTION_JIFFIES) - 1)) << 
EVICTION_JIFFIES;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
@@ -195,6 +199,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, 
pg_data_t **pgdat,
*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
*evictionp = entry << bucket_order;
+   *prev_jiffp = prev_jiff;
 }
 
 /**
@@ -242,8 +247,12 @@ bool workingset_refault(void *shadow)
unsigned long refault;
struct pglist_data *pgdat;
int memcgid;
+   unsigned long refault_ratio;
+   unsigned long prev_jiff;
+   unsigned long avg_refault_time;
+   unsigned long refault_time;
 
-   unpack_shadow(shadow, , , );
+   unpack_shadow(shadow, , , , _jiff);
 
rcu_read_lock();
/*
@@ -288,10 +297,11 @@ bool workingset_refault(void *shadow)
 * list is not a problem.
 */
refault_distance = (refault - eviction) & EVICTION_MASK;
-
inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
-
-   if (refault_distance <= active_file) {
+   lruvec->refaults_ratio = atomic_long_read(>inactive_age) / 
jiffies;
+   refault_time = jiffies - prev_jiff;
+   avg_refault_time = refault_distance / lruvec->refaults_ratio;
+   if (refault_time <= avg_refault_time) {
inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
rcu_read_unlock();
return true;
@@ -521,7 +531,7 @@ static int __init workingset_init(void)
 * some more pages at runtime, so keep working with up to
 * double the initial memory by using totalram_pages as-is.
 */
-   timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+   timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT - EVICTION_JIFFIES;
max_order = fls_long(totalram_pages - 1);
if (max_order > timestamp_bits)
bucket_order = max_order - timestamp_bits;
-- 
1.9.1



Re: [PATCH v3 04/13] thermal: qoriq: Add local struct qoriq_sensor pointer

2019-04-03 Thread Daniel Lezcano
On 01/04/2019 06:14, Andrey Smirnov wrote:
> Add local struct qoriq_sensor pointer in qoriq_tmu_register_tmu_zone()
> for brevity.
> 
> Signed-off-by: Andrey Smirnov 
> Cc: Chris Healy 
> Cc: Lucas Stach 
> Cc: Zhang Rui 
> Cc: Eduardo Valentin 
> Cc: Daniel Lezcano 
> Cc: Angus Ainslie (Purism) 
> Cc: linux-...@nxp.com
> Cc: linux...@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> ---
>  drivers/thermal/qoriq_thermal.c | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c
> index 6d40b9788266..e281bdcfa11f 100644
> --- a/drivers/thermal/qoriq_thermal.c
> +++ b/drivers/thermal/qoriq_thermal.c
> @@ -114,18 +114,18 @@ static int qoriq_tmu_register_tmu_zone(struct 
> platform_device *pdev)
>  
>   for (id = 0; id < SITES_MAX; id++) {
>   struct thermal_zone_device *tzd;
> + struct qoriq_sensor *s;
>  
> - qdata->sensor[id] = devm_kzalloc(>dev,
> + s = qdata->sensor[id] = devm_kzalloc(>dev,
>   sizeof(struct qoriq_sensor), GFP_KERNEL);

I would not recommend this, especially if you use a variable helper for
clarity. Keep using the 's' variable and then assign qdata->sensor[id] =
s at the end when everything is ok. May be rename it 'sensor'?

>   if (!qdata->sensor[id])
>   return -ENOMEM;
>  
> - qdata->sensor[id]->id = id;
> - qdata->sensor[id]->qdata = qdata;
> + s->id = id;
> + s->qdata = qdata;
>  
>   tzd = devm_thermal_zone_of_sensor_register(>dev, id,
> -qdata->sensor[id],
> -_tz_ops);
> +s, _tz_ops);
>   if (IS_ERR(tzd)) {
>   if (PTR_ERR(tzd) == -ENODEV)
>   continue;
> 


-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog



Re: [PATCH v3 03/13] thermal: qoriq: Don't store struct thermal_zone_device reference

2019-04-03 Thread Daniel Lezcano
On 01/04/2019 06:14, Andrey Smirnov wrote:
> Struct thermal_zone_device reference stored as sensor's private data
> isn't really used anywhere in the code. Drop it.
> 
> Signed-off-by: Andrey Smirnov 
> Cc: Chris Healy 
> Cc: Lucas Stach 
> Cc: Zhang Rui 
> Cc: Eduardo Valentin 
> Cc: Daniel Lezcano 
> Cc: Angus Ainslie (Purism) 
> Cc: linux-...@nxp.com
> Cc: linux...@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org

Acked-by: Daniel Lezcano 

> ---
>  drivers/thermal/qoriq_thermal.c | 15 +--
>  1 file changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c
> index 91f9f49d2776..6d40b9788266 100644
> --- a/drivers/thermal/qoriq_thermal.c
> +++ b/drivers/thermal/qoriq_thermal.c
> @@ -65,7 +65,6 @@ struct qoriq_tmu_data;
>   * Thermal zone data
>   */
>  struct qoriq_sensor {
> - struct thermal_zone_device  *tzd;
>   struct qoriq_tmu_data   *qdata;
>   int id;
>  };
> @@ -114,6 +113,8 @@ static int qoriq_tmu_register_tmu_zone(struct 
> platform_device *pdev)
>   int id, sites = 0;
>  
>   for (id = 0; id < SITES_MAX; id++) {
> + struct thermal_zone_device *tzd;
> +
>   qdata->sensor[id] = devm_kzalloc(>dev,
>   sizeof(struct qoriq_sensor), GFP_KERNEL);
>   if (!qdata->sensor[id])
> @@ -121,13 +122,15 @@ static int qoriq_tmu_register_tmu_zone(struct 
> platform_device *pdev)
>  
>   qdata->sensor[id]->id = id;
>   qdata->sensor[id]->qdata = qdata;
> - qdata->sensor[id]->tzd = devm_thermal_zone_of_sensor_register(
> - >dev, id, qdata->sensor[id], _tz_ops);
> - if (IS_ERR(qdata->sensor[id]->tzd)) {
> - if (PTR_ERR(qdata->sensor[id]->tzd) == -ENODEV)
> +
> + tzd = devm_thermal_zone_of_sensor_register(>dev, id,
> +qdata->sensor[id],
> +_tz_ops);
> + if (IS_ERR(tzd)) {
> + if (PTR_ERR(tzd) == -ENODEV)
>   continue;
>   else
> - return PTR_ERR(qdata->sensor[id]->tzd);
> + return PTR_ERR(tzd);
>   }
>  
>   sites |= 0x1 << (15 - id);
> 


-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog



Re: [PATCH v3 01/13] thermal: qoriq: Remove unnecessary DT node is NULL check

2019-04-03 Thread Daniel Lezcano
On 01/04/2019 06:14, Andrey Smirnov wrote:
> This driver is meant to be used with Device Tree and there's no
> use-case where device's DT node is going to be NULL. Remove code
> protecting against that.

May be elaborate why is never going to be NULL?

> Signed-off-by: Andrey Smirnov 
> Cc: Chris Healy 
> Cc: Lucas Stach 
> Cc: Zhang Rui 
> Cc: Eduardo Valentin 
> Cc: Daniel Lezcano 
> Cc: Angus Ainslie (Purism) 
> Cc: linux-...@nxp.com
> Cc: linux...@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org

Acked-by: Daniel Lezcano 

> ---
>  drivers/thermal/qoriq_thermal.c | 5 -
>  1 file changed, 5 deletions(-)
> 
> diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c
> index 3b5f5b3fb1bc..7b364933bfb1 100644
> --- a/drivers/thermal/qoriq_thermal.c
> +++ b/drivers/thermal/qoriq_thermal.c
> @@ -193,11 +193,6 @@ static int qoriq_tmu_probe(struct platform_device *pdev)
>   struct qoriq_tmu_data *data;
>   struct device_node *np = pdev->dev.of_node;
>  
> - if (!np) {
> - dev_err(>dev, "Device OF-Node is NULL");
> - return -ENODEV;
> - }
> -
>   data = devm_kzalloc(>dev, sizeof(struct qoriq_tmu_data),
>   GFP_KERNEL);
>   if (!data)
> 


-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog



Re: [PATCH v2 3/5] locking/qspinlock: Introduce CNA into the slow path of qspinlock

2019-04-03 Thread Alex Kogan
Hi, Hanjun. 

> On Apr 3, 2019, at 10:02 PM, Hanjun Guo  wrote:
> 
> Hi Alex,
> 
> On 2019/3/29 23:20, Alex Kogan wrote:
>> +
>> +static __always_inline void cna_init_node(struct mcs_spinlock *node, int 
>> cpuid,
>> +  u32 tail)
>> +{
>> +if (decode_numa_node(node->node_and_count) == -1)
>> +store_numa_node(node, numa_cpu_node(cpuid));
> 
> How about using cpu_to_node() here and #include  in this
> file, then the code can be reused for other architectures such as ARM64?
Good point. Thanks!

— Alex

> 
> Thanks
> Hanjun
> 



Re: [PATCH v3 02/13] thermal: qoriq: Add local struct device pointer

2019-04-03 Thread Daniel Lezcano
On 01/04/2019 06:14, Andrey Smirnov wrote:
> Use a local "struct device *dev" for brevity. No functional change
> intended.
> 
> Signed-off-by: Andrey Smirnov 
> Cc: Chris Healy 
> Cc: Lucas Stach 
> Cc: Zhang Rui 
> Cc: Eduardo Valentin 
> Cc: Daniel Lezcano 
> Cc: Angus Ainslie (Purism) 
> Cc: linux-...@nxp.com
> Cc: linux...@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> ---

Acked-by: Daniel Lezcano 

>  drivers/thermal/qoriq_thermal.c | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c
> index 7b364933bfb1..91f9f49d2776 100644
> --- a/drivers/thermal/qoriq_thermal.c
> +++ b/drivers/thermal/qoriq_thermal.c
> @@ -192,8 +192,9 @@ static int qoriq_tmu_probe(struct platform_device *pdev)
>   int ret;
>   struct qoriq_tmu_data *data;
>   struct device_node *np = pdev->dev.of_node;
> + struct device *dev = &pdev->dev;
>  
> - data = devm_kzalloc(&pdev->dev, sizeof(struct qoriq_tmu_data),
> + data = devm_kzalloc(dev, sizeof(struct qoriq_tmu_data),
>   GFP_KERNEL);
>   if (!data)
>   return -ENOMEM;
> @@ -204,7 +205,7 @@ static int qoriq_tmu_probe(struct platform_device *pdev)
>  
>   data->regs = of_iomap(np, 0);
>   if (!data->regs) {
> - dev_err(>dev, "Failed to get memory region\n");
> + dev_err(dev, "Failed to get memory region\n");
>   ret = -ENODEV;
>   goto err_iomap;
>   }
> @@ -217,7 +218,7 @@ static int qoriq_tmu_probe(struct platform_device *pdev)
>  
>   ret = qoriq_tmu_register_tmu_zone(pdev);
>   if (ret < 0) {
> - dev_err(>dev, "Failed to register sensors\n");
> + dev_err(dev, "Failed to register sensors\n");
>   ret = -ENODEV;
>   goto err_iomap;
>   }
> 


-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog



[PATCH] tools/power: turbostat: make output buffer extensible (Re: [PATCH v1] tools/power: turbostat: fix buffer overrun)

2019-04-03 Thread Naoya Horiguchi
Hi Prarit,

On Wed, Apr 03, 2019 at 07:42:45AM -0400, Prarit Bhargava wrote:
> 
> 
> On 4/3/19 3:02 AM, Naoya Horiguchi wrote:
> > turbostat could be terminated by general protection fault on some latest
> > hardwares which (for example) support 9 levels of C-states and show 18
> > "tADDED" lines. That bloats the total output and finally causes buffer
> > overrun.  So let's extend the buffer to avoid this.
> > 
> > This patch also removes duplicated "pc10:" line to reduce buffer usage.
> > 
> > Signed-off-by: Naoya Horiguchi 
> > ---
> >  tools/power/x86/turbostat/turbostat.c | 3 +--
> >  1 file changed, 1 insertion(+), 2 deletions(-)
> > 
> > diff --git 
> > v5.1-rc3-mmotm-2019-04-02-17-16/tools/power/x86/turbostat/turbostat.c 
> > v5.1-rc3-mmotm-2019-04-02-17-16_patched/tools/power/x86/turbostat/turbostat.c
> > index c7727be..17b1f544 100644
> > --- v5.1-rc3-mmotm-2019-04-02-17-16/tools/power/x86/turbostat/turbostat.c
> > +++ 
> > v5.1-rc3-mmotm-2019-04-02-17-16_patched/tools/power/x86/turbostat/turbostat.c
> > @@ -861,7 +861,6 @@ int dump_counters(struct thread_data *t, struct 
> > core_data *c,
> > outp += sprintf(outp, "pc8: %016llX\n", p->pc8);
> > outp += sprintf(outp, "pc9: %016llX\n", p->pc9);
> > outp += sprintf(outp, "pc10: %016llX\n", p->pc10);
> > -   outp += sprintf(outp, "pc10: %016llX\n", p->pc10);
> > outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi);
> > outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi);
> > outp += sprintf(outp, "Joules PKG: %0X\n", p->energy_pkg);
> > @@ -5135,7 +5134,7 @@ int initialize_counters(int cpu_id)
> >  
> >  void allocate_output_buffer()
> >  {
> > -   output_buffer = calloc(1, (1 + topo.num_cpus) * 1024);
> > +   output_buffer = calloc(1, (1 + topo.num_cpus) * 2048);
> 
> Is there a better way to calculate the size of that buffer other than a magic
> number?

Straightforward way to calculate it is to define the list of printing items
and set needed buffer size for each one, then sum them up in initialization.
But that might make code hard to maintain because we already have many small
items and they are not in common format.

Another approach independent of magic number or fixed-sized buffer is to
extend the buffer with realloc() when we are approaching the end.
I hope the following patch might help.

# This patch is relatively large (~400 lines) but most are simple replacement
# of "sprintf(outp, ...)" with "append_to_output_buffer()".

Thanks,
Naoya Horiguchi
--
From: Naoya Horiguchi 
Date: Thu, 4 Apr 2019 11:54:28 +0900
Subject: [PATCH] tools/power: turbostat: make output buffer extensible

"turbostat --Dump" could be terminated by general protection fault on
some latest hardwares which (for example) support 9 levels of C-states
and show 18 "tADDED" lines. That bloats the total output and finally
causes buffer overrun.  So this patch suggests extending the output
buffer when reaching the end.

This patch also removes duplicated "pc10:" line to reduce buffer usage.

Signed-off-by: Naoya Horiguchi 
---
 tools/power/x86/turbostat/turbostat.c | 397 ++
 1 file changed, 210 insertions(+), 187 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c 
b/tools/power/x86/turbostat/turbostat.c
index c7727be9719f..41d41c532a3e 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -84,6 +84,7 @@ double tsc_tweak = 1.0;
 unsigned int show_pkg_only;
 unsigned int show_core_only;
 char *output_buffer, *outp;
+ssize_t outbuf_size;
 unsigned int do_rapl;
 unsigned int do_dts;
 unsigned int do_ptm;
@@ -625,6 +626,28 @@ unsigned long long bic_lookup(char *name_list, enum 
show_hide_mode mode)
return retval;
 }
 
+static void *append_to_output_buffer(const char *fmt, ...)
+{
+   va_list args;
+
+   va_start(args, fmt);
+   outp += vsprintf(outp, fmt, args);
+
+   /* Approaching the buffer end, so extend it. */
+   if (outp - output_buffer >= (outbuf_size - 256)) {
+   int output_size = outp - output_buffer;
+
+   outbuf_size += 1024;
+   output_buffer = realloc(output_buffer, outbuf_size);
+   if (output_buffer == NULL)
+   err(-1, "realloc output buffer");
+   if (debug)
+   printf("Output buffer was extended.\n");
+   outp = output_buffer + output_size;
+   }
+   va_end(args);
+   return outp;
+}
 
 void print_header(char *delim)
 {
@@ -632,173 +655,173 @@ void print_header(char *delim)
int printed = 0;
 
if (DO_BIC(BIC_USEC))
-   outp += sprintf(outp, "%susec", (printed++ ? delim : ""));
+   outp = append_to_output_buffer("%susec", (printed++ ? delim : 
""));
if (DO_BIC(BIC_TOD))
-   outp += sprintf(outp, "%sTime_Of_Day_Seconds", 

Re: [PATCH v1 3/3] thermal: rockchip: Support the PX30 SoC in thermal driver

2019-04-03 Thread Daniel Lezcano
On 01/04/2019 08:43, Elaine Zhang wrote:
> PX30 SOC has two Temperature Sensors for CPU and GPU.
> 
> Signed-off-by: Elaine Zhang 
> ---
>  drivers/thermal/rockchip_thermal.c | 39 
> ++
>  1 file changed, 39 insertions(+)
> 
> diff --git a/drivers/thermal/rockchip_thermal.c 
> b/drivers/thermal/rockchip_thermal.c
> index faa6c7792155..d5c161e63361 100644
> --- a/drivers/thermal/rockchip_thermal.c
> +++ b/drivers/thermal/rockchip_thermal.c
> @@ -225,11 +225,15 @@ struct rockchip_thermal_data {
>  #define GRF_TSADC_TESTBIT_L  0x0e648
>  #define GRF_TSADC_TESTBIT_H  0x0e64c
>  
> +#define PX30_GRF_SOC_CON20x0408
> +
>  #define GRF_SARADC_TESTBIT_ON(0x10001 << 2)
>  #define GRF_TSADC_TESTBIT_H_ON   (0x10001 << 2)
>  #define GRF_TSADC_VCM_EN_L   (0x10001 << 7)
>  #define GRF_TSADC_VCM_EN_H   (0x10001 << 7)
>  
> +#define GRF_CON_TSADC_CH_INV (0x10001 << 1)
> +
>  /**
>   * struct tsadc_table - code to temperature conversion table
>   * @code: the value of adc channel
> @@ -692,6 +696,14 @@ static void rk_tsadcv3_initialize(struct regmap *grf, 
> void __iomem *regs,
>  regs + TSADCV2_AUTO_CON);
>  }
>  
> +static void rk_tsadcv4_initialize(struct regmap *grf, void __iomem *regs,
> +   enum tshut_polarity tshut_polarity)
> +{
> + rk_tsadcv2_initialize(grf, regs, tshut_polarity);
> + if (!IS_ERR(grf))

Why this test ? grf is not modified by the 'rk_tsadcv2_initialize' function.

> + regmap_write(grf, PX30_GRF_SOC_CON2, GRF_CON_TSADC_CH_INV);
> +}
> +
>  static void rk_tsadcv2_irq_ack(void __iomem *regs)
>  {
>   u32 val;
> @@ -821,6 +833,30 @@ static void rk_tsadcv2_tshut_mode(int chn, void __iomem 
> *regs,
>   writel_relaxed(val, regs + TSADCV2_INT_EN);
>  }
>  
> +static const struct rockchip_tsadc_chip px30_tsadc_data = {
> + .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */
> + .chn_id[SENSOR_GPU] = 1, /* gpu sensor is channel 1 */
> + .chn_num = 2, /* 2 channels for tsadc */
> +
> + .tshut_mode = TSHUT_MODE_CRU, /* default TSHUT via CRU */
> + .tshut_temp = 95000,
> +
> + .initialize = rk_tsadcv4_initialize,
> + .irq_ack = rk_tsadcv3_irq_ack,
> + .control = rk_tsadcv3_control,
> + .get_temp = rk_tsadcv2_get_temp,
> + .set_alarm_temp = rk_tsadcv2_alarm_temp,
> + .set_tshut_temp = rk_tsadcv2_tshut_temp,
> + .set_tshut_mode = rk_tsadcv2_tshut_mode,
> +
> + .table = {
> + .id = rk3328_code_table,
> + .length = ARRAY_SIZE(rk3328_code_table),
> + .data_mask = TSADCV2_DATA_MASK,
> + .mode = ADC_INCREMENT,
> + },
> +};
> +
>  static const struct rockchip_tsadc_chip rv1108_tsadc_data = {
>   .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */
>   .chn_num = 1, /* one channel for tsadc */
> @@ -993,6 +1029,9 @@ static void rk_tsadcv2_tshut_mode(int chn, void __iomem 
> *regs,
>  };
>  
>  static const struct of_device_id of_rockchip_thermal_match[] = {
> + {   .compatible = "rockchip,px30-tsadc",
> + .data = (void *)&px30_tsadc_data,
> + },
>   {
>   .compatible = "rockchip,rv1108-tsadc",
>   .data = (void *)&rv1108_tsadc_data,
> 


-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog



Re: [PATCH v1 2/3] dt-bindings: rockchip-thermal: Support the PX30 SoC compatible

2019-04-03 Thread Daniel Lezcano
On 01/04/2019 08:43, Elaine Zhang wrote:
> Add a new compatible for thermal found on PX30 SoCs.
> 
> Signed-off-by: Elaine Zhang 
> ---
>  Documentation/devicetree/bindings/thermal/rockchip-thermal.txt | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt 
> b/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt
> index 43d744e5305e..c6aac9bcacf1 100644
> --- a/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt
> +++ b/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt
> @@ -2,6 +2,7 @@
>  
>  Required properties:
>  - compatible : should be "rockchip,-tsadc"
> +   "rockchip,px30-tsadc":   found on PX30 SoCs
> "rockchip,rv1108-tsadc": found on RV1108 SoCs
> "rockchip,rk3228-tsadc": found on RK3228 SoCs
> "rockchip,rk3288-tsadc": found on RK3288 SoCs

Acked-by: Daniel Lezcano 


-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog



Re: [PATCH v1 1/3] thermal: rockchip: add pinctrl control

2019-04-03 Thread Daniel Lezcano
On 01/04/2019 08:43, Elaine Zhang wrote:
> Based on the TSADC Tshut mode to select pinctrl,
> instead of setting pinctrl based on architecture
> (Not depends on pinctrl setting by "init" or "default").
> And it requires setting the tshut polarity before select pinctrl.

I'm not sure to fully read the description. Can you rephrase/elaborate
the changelog?

> Signed-off-by: Elaine Zhang 
> ---
>  drivers/thermal/rockchip_thermal.c | 61 
> +++---
>  1 file changed, 50 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/thermal/rockchip_thermal.c 
> b/drivers/thermal/rockchip_thermal.c
> index 9c7643d62ed7..faa6c7792155 100644
> --- a/drivers/thermal/rockchip_thermal.c
> +++ b/drivers/thermal/rockchip_thermal.c
> @@ -34,7 +34,7 @@
>   */
>  enum tshut_mode {
>   TSHUT_MODE_CRU = 0,
> - TSHUT_MODE_GPIO,
> + TSHUT_MODE_OTP,

Why do you change the enum name? The impact on the patch is much higher,
no ?

>  };
>  
>  /**
> @@ -172,6 +172,9 @@ struct rockchip_thermal_data {
>   int tshut_temp;
>   enum tshut_mode tshut_mode;
>   enum tshut_polarity tshut_polarity;
> + struct pinctrl *pinctrl;
> + struct pinctrl_state *gpio_state;
> + struct pinctrl_state *otp_state;
>  };
>  
>  /**
> @@ -807,7 +810,7 @@ static void rk_tsadcv2_tshut_mode(int chn, void __iomem 
> *regs,
>   u32 val;
>  
>   val = readl_relaxed(regs + TSADCV2_INT_EN);
> - if (mode == TSHUT_MODE_GPIO) {
> + if (mode == TSHUT_MODE_OTP) {
>   val &= ~TSADCV2_SHUT_2CRU_SRC_EN(chn);
>   val |= TSADCV2_SHUT_2GPIO_SRC_EN(chn);
>   } else {
> @@ -822,7 +825,7 @@ static void rk_tsadcv2_tshut_mode(int chn, void __iomem 
> *regs,
>   .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */
>   .chn_num = 1, /* one channel for tsadc */
>  
> - .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */
> + .tshut_mode = TSHUT_MODE_OTP, /* default TSHUT via GPIO give PMIC */
>   .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */
>   .tshut_temp = 95000,
>  
> @@ -846,7 +849,7 @@ static void rk_tsadcv2_tshut_mode(int chn, void __iomem 
> *regs,
>   .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */
>   .chn_num = 1, /* one channel for tsadc */
>  
> - .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */
> + .tshut_mode = TSHUT_MODE_OTP, /* default TSHUT via GPIO give PMIC */
>   .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */
>   .tshut_temp = 95000,
>  
> @@ -871,7 +874,7 @@ static void rk_tsadcv2_tshut_mode(int chn, void __iomem 
> *regs,
>   .chn_id[SENSOR_GPU] = 2, /* gpu sensor is channel 2 */
>   .chn_num = 2, /* two channels for tsadc */
>  
> - .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */
> + .tshut_mode = TSHUT_MODE_OTP, /* default TSHUT via GPIO give PMIC */
>   .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */
>   .tshut_temp = 95000,
>  
> @@ -919,7 +922,7 @@ static void rk_tsadcv2_tshut_mode(int chn, void __iomem 
> *regs,
>   .chn_id[SENSOR_GPU] = 1, /* gpu sensor is channel 1 */
>   .chn_num = 2, /* two channels for tsadc */
>  
> - .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */
> + .tshut_mode = TSHUT_MODE_OTP, /* default TSHUT via GPIO give PMIC */
>   .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */
>   .tshut_temp = 95000,
>  
> @@ -944,7 +947,7 @@ static void rk_tsadcv2_tshut_mode(int chn, void __iomem 
> *regs,
>   .chn_id[SENSOR_GPU] = 1, /* gpu sensor is channel 1 */
>   .chn_num = 2, /* two channels for tsadc */
>  
> - .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */
> + .tshut_mode = TSHUT_MODE_OTP, /* default TSHUT via GPIO give PMIC */
>   .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */
>   .tshut_temp = 95000,
>  
> @@ -969,7 +972,7 @@ static void rk_tsadcv2_tshut_mode(int chn, void __iomem 
> *regs,
>   .chn_id[SENSOR_GPU] = 1, /* gpu sensor is channel 1 */
>   .chn_num = 2, /* two channels for tsadc */
>  
> - .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */
> + .tshut_mode = TSHUT_MODE_OTP, /* default TSHUT via GPIO give PMIC */
>   .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */
>   .tshut_temp = 95000,
>  
> @@ -1080,6 +1083,20 @@ static int rockchip_thermal_get_temp(void *_sensor, 
> int *out_temp)
>   .set_trips = rockchip_thermal_set_trips,
>  };
>  
> +static void thermal_pinctrl_select_otp(struct rockchip_thermal_data *thermal)
> +{
> + if (!IS_ERR(thermal->pinctrl) && !IS_ERR_OR_NULL(thermal->otp_state))
> + pinctrl_select_state(thermal->pinctrl,
> +  thermal->otp_state);
> +}
> +
> +static void thermal_pinctrl_select_gpio(struct rockchip_thermal_data 
> *thermal)
> +{
> + if 

Re: [PATCH v11 2/8] mfd: bd70528: Support ROHM bd70528 PMIC - core

2019-04-03 Thread Lee Jones
On Wed, 03 Apr 2019, Vaittinen, Matti wrote:

> On Wed, 2019-04-03 at 12:25 +0100, Lee Jones wrote:
> > On Wed, 03 Apr 2019, Matti Vaittinen wrote:
> > 
> > > On Wed, Apr 03, 2019 at 10:30:15AM +0100, Lee Jones wrote:
> > > > On Wed, 03 Apr 2019, Matti Vaittinen wrote:
> > > > 
> > > > > Hello Lee,
> > > > > 
> > > > > Thanks for taking a look on this again =) I agree with most of
> > > > > the
> > > > > comments and correct them at next version.
> > > > > 
> > > > > On Wed, Apr 03, 2019 at 08:31:52AM +0100, Lee Jones wrote:
> > > > > > On Mon, 25 Mar 2019, Matti Vaittinen wrote:
> > > > > > 
> > > > > > > ROHM BD70528MWV is an ultra-low quiescent current general
> > > > > > > purpose single-chip power management IC for battery-powered
> > > > > > > portable devices.
> > > > > > > 
> > > > > > > Add MFD core which enables chip access for following
> > > > > > > subdevices:
> > > > > > >   - regulators/LED drivers
> > > > > > >   - battery-charger
> > > > > > >   - gpios
> > > > > > >   - 32.768kHz clk
> > > > > > >   - RTC
> > > > > > >   - watchdog
> > > > > > > 
> > > > > > > Signed-off-by: Matti Vaittinen <
> > > > > > > matti.vaitti...@fi.rohmeurope.com>
> > > > > > > + * Mapping of main IRQ register bits to sub irq register
> > > > > > > offsets so
> > > > > > 
> > > > > > "sub-IRQ"
> > > > > > 
> > > > > > > + * that we can access corect sub IRQ registers based on
> > > > > > > bits that
> > > > > > 
> > > > > > "sub IRQ" is also fine, but please standardise.
> > > > > > 
> > > > > > I do prefer "sub-IRQ" though.
> > > > > 
> > > > > I'll go with "sub-IRQ" then
> > > > > 
> > > > > > > +
> > > > > > > +#define WD_CTRL_MAGIC1 0x55
> > > > > > > +#define WD_CTRL_MAGIC2 0xAA
> > > > > > > +/**
> > > > > > > + * bd70528_wdt_set - arm or disarm watchdog timer
> > > > > > > + *
> > > > > > > + * @data:device data for the PMIC instance we want to
> > > > > > > operate on
> > > > > > > + * @enable:  new state of WDT. zero to disable, non
> > > > > > > zero to enable
> > > > > > > + * @old_state:   previous state of WDT will be filled
> > > > > > > here
> > > > > > > + *
> > > > > > > + * Arm or disarm WDT on BD70528 PMIC. Expected to be
> > > > > > > called only by
> > > > > > > + * BD70528 RTC and BD70528 WDT drivers. The rtc_timer_lock
> > > > > > > must be taken
> > > > > > > + * by calling bd70528_wdt_lock before calling
> > > > > > > bd70528_wdt_set.
> > > > > > > + */
> > > > > > > +int bd70528_wdt_set(struct rohm_regmap_dev *data, int
> > > > > > > enable, int *old_state)
> > > > > > 
> > > > > > Why doesn't this reside in the watchdog driver?
> > > > > 
> > > > > If my memory serves me right we shortly discussed this already
> > > > > during v8
> > > > > review ;) Cant blame you though as I have seen some of the mail
> > > > > traffic
> > > > > going through your inbox :D
> > > > > 
> > > > > The motivation to have the functions exported from MFD is to
> > > > > not create
> > > > > sirect dependency between RTC and WDT. There may be cases where
> > > > > we want
> > > > > to leave either RTC or WDT out of compilation. MFD is always
> > > > > needed so
> > > > > the dependency from MFD to RTC/WDT does not harm.
> > > > > 
> > > > > (Here's some discussion necromancy if you are interested in re-
> > > > > reading
> > > > > how we did end up with this implementation:
> > > > > https://lore.kernel.org/lkml/20190212091723.GZ20638@dell/)
> > > > > 
> > > > > I hope you are still Ok with having the WDT control functions
> > > > > in MFD.
> > > > 
> > > > OOI, why does the RTC need to control the WDT?
> > > 
> > > I thought I had a comment about this somewhere in code... O_o Must
> > > have
> > > been in some development branch I had :/
> > > 
> > > Anyways, setting the RTC counter may cause watchdog to trigger. It
> > > is not
> > > further explained why but I would guess watchdog uses RTC counter
> > > to check
> > > if it should've been pinged already. So RTC needs to disable watch
> > > dog for
> > > the duration of hwclock setting and enable it again after the new
> > > time is
> > > set. I can add a comment about this to MFD driver if it helps :)
> > 
> > How does the user select between using the RTC and the WDT?
> > 
> > Or are the generally both enabled at the same time?
> > 
> 
> Both RTC and WDT can be enabled at the same time. But they are not
> required to be used. When WDT is enabled, it uses current RTC time as
> 'base' (and RTC time is running no matter if we have the RTC driver
> here or not) - and time-out gets scheduled to specified amount of time
> into future. (Same setting timeout into the future happens when WDT is
> pinged).
> 
> When we set RTC, we disable WDT (if it was enabled), set clock and re-
> enable WDT. This causes the previously used time-out value to be set to
> WDT again. This works Ok because BD70528 does not support 'short ping
> detection'. Only side-effect will be one 'prolonged' WDT feeding period
> when RTC is set. (absolute time when RTC was set minus 

linux-next: manual merge of the staging tree with the spi tree

2019-04-03 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the staging tree got conflicts in:

  drivers/staging/mt7621-spi/Kconfig
  drivers/staging/mt7621-spi/Makefile

between commit:

  cbd66c626e16 ("spi: mt7621: Move SPI driver out of staging")

from the spi tree and commits:

  99b75a4e3275 ("staging: add missing SPDX lines to Kconfig files")
  97ed8eab2a00 ("staging: add missing SPDX lines to Makefile files")

from the staging tree.

I fixed it up (I just removed the files) and can carry the fix as
necessary. This is now fixed as far as linux-next is concerned, but any
non trivial conflicts should be mentioned to your upstream maintainer
when your tree is submitted for merging. You may also want to consider
cooperating with the maintainer of the conflicting tree to minimise any
particularly complex conflicts.

-- 
Cheers,
Stephen Rothwell


pgpi9WtVDE9ZU.pgp
Description: OpenPGP digital signature


Re: [PATCH 3/4] mfd: ti-lmu: Remove LM3532 backlight driver references

2019-04-03 Thread Lee Jones
On Wed, 03 Apr 2019, Jacek Anaszewski wrote:

> On 4/3/19 9:57 AM, Lee Jones wrote:
> > On Mon, 25 Mar 2019, Jacek Anaszewski wrote:
> > 
> > > On 3/25/19 8:53 AM, Lee Jones wrote:
> > > > On Sat, 23 Mar 2019, Jacek Anaszewski wrote:
> > > > 
> > > > > Hi Lee,
> > > > > 
> > > > > Can we have your ack for this going via LED tree, please?
> > > > 
> > > > Patch looks okay.
> > > > 
> > > > You can take it through the LED, but if you do I will need you to send
> > > > me a pull-request to a minimised immutable branch please.
> > > > 
> > > > If you cannot do this, I can apply the set and provide the same to
> > > > you.
> > > > 
> > > > If you choose the former:
> > > > 
> > > > Acked-for-MFD-by: Lee Jones 
> > > > 
> > > > Please let me know what you decide
> > > > 
> > > 
> > > I've been exposing integration branches in the past, but after Linus'
> > > message [0] I have my doubts now. I wonder if it wouldn't make more
> > > sense if I just took the patches, and you'd cherry-pick them only in
> > > case such a need occurs. This way we would avoid this whole merge
> > > noise, which in an optimistic and very plausible case will not be needed
> > > at all.
> > > 
> > > [0] https://lkml.org/lkml/2017/4/19/1104
> > 
> > That email is 2 years old, and does not seem relevant to what we're
> > trying to achieve.  I've only ever had issues when *not* creating
> > immutable branches for these, cross subsystem scenarios.  The
> > shared branches I create are always minimalist and never change.
> > 
> > I'm happy to take the patches and create a suitable pull-request for
> > you if you are uncomfortable with the process.  I just need your Ack
> > to do so.  Up to you.
> 
> I don't have any problem with the process. The clear gain of cherry
> picking is more linear history. And the branch can be always created
> when such a need occurs in linux-next.
> 
> That being said, I will send you a pull request once we sort out
> the problem with obtaining a reference to the backlight node.

Sounds good.  Thanks Jacek.

-- 
Lee Jones [李琼斯]
Linaro Services Technical Lead
Linaro.org │ Open source software for ARM SoCs
Follow Linaro: Facebook | Twitter | Blog


linux-next: manual merge of the staging tree with the v4l-dvb tree

2019-04-03 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the staging tree got conflicts in:

  drivers/staging/media/mt9t031/Kconfig
  drivers/staging/media/mt9t031/Makefile

between commit:

  dfe571ca8daa ("media: soc_camera: Remove leftover files, add TODO")

from the v4l-dvb tree and commits:

  99b75a4e3275 ("staging: add missing SPDX lines to Kconfig files")
  97ed8eab2a00 ("staging: add missing SPDX lines to Makefile files")

from the staging tree.

I fixed it up (I just removed the files) and can carry the fix as
necessary. This is now fixed as far as linux-next is concerned, but any
non trivial conflicts should be mentioned to your upstream maintainer
when your tree is submitted for merging.  You may also want to consider
cooperating with the maintainer of the conflicting tree to minimise any
particularly complex conflicts.

-- 
Cheers,
Stephen Rothwell


pgp6EX9_KpB17.pgp
Description: OpenPGP digital signature


Re: [PATCH v4] arm64: dts: ls1088a: add one more thermal zone node

2019-04-03 Thread Daniel Lezcano
On 01/04/2019 05:29, Yuantian Tang wrote:
> Ls1088a has 2 thermal sensors, core cluster and SoC platform. Core cluster
> sensor is used to monitor the temperature of core and SoC platform is for
> platform. The current dts only support the first sensor.
> This patch adds the second sensor node to dts to enable it.
> 
> Signed-off-by: Yuantian Tang 
> ---
> v4:
>   - use hyphen instead of underscore in node name
> v3:
>   - use more descriptive name for each zone
> v2:
>   - Add more information about sensors to description
> PS: In order to keep consistency to the first thermal-zone node, there will
> be "WARNING: line over 80 characters" warnings.
> 
>  arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi |   43 +--
>  1 files changed, 39 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi 
> b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
> index 661137f..d6d4ff2 100644
> --- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
> +++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
> @@ -129,19 +129,19 @@
>   };
>  
>   thermal-zones {
> - cpu_thermal: cpu-thermal {
> + core-cluster-thermal {
>   polling-delay-passive = <1000>;
>   polling-delay = <5000>;
>   thermal-sensors = <&tmu 0>;
>  
>   trips {
> - cpu_alert: cpu-alert {
> + core_cluster_alert: core-cluster-alert {
>   temperature = <85000>;
>   hysteresis = <2000>;
>   type = "passive";
>   };
>  
> - cpu_crit: cpu-crit {
> + core_cluster_crit: core-cluster-crit {
>   temperature = <95000>;
>   hysteresis = <2000>;
>   type = "critical";
> @@ -150,7 +150,42 @@
>  
>   cooling-maps {
>   map0 {
> - trip = <&cpu_alert>;
> + trip = <&core_cluster_alert>;
> + cooling-device =
> + < THERMAL_NO_LIMIT 
> THERMAL_NO_LIMIT>,
> + < THERMAL_NO_LIMIT 
> THERMAL_NO_LIMIT>,
> + < THERMAL_NO_LIMIT 
> THERMAL_NO_LIMIT>,
> + < THERMAL_NO_LIMIT 
> THERMAL_NO_LIMIT>,
> + < THERMAL_NO_LIMIT 
> THERMAL_NO_LIMIT>,
> + < THERMAL_NO_LIMIT 
> THERMAL_NO_LIMIT>,
> + < THERMAL_NO_LIMIT 
> THERMAL_NO_LIMIT>,
> + < THERMAL_NO_LIMIT 
> THERMAL_NO_LIMIT>;
> + };

Does it make sense to assign the same cooling devices to two different
thermal zones running different instances of governor ?

> + };
> + };
> +
> + platform-thermal {
> + polling-delay-passive = <1000>;
> + polling-delay = <5000>;
> + thermal-sensors = <&tmu 1>;
> +
> + trips {
> + platform_alert: platform-alert {
> + temperature = <85000>;
> + hysteresis = <2000>;
> + type = "passive";
> + };
> +
> + platform_crit: platform-crit {
> + temperature = <95000>;
> + hysteresis = <2000>;
> + type = "critical";
> + };
> + };
> +
> + cooling-maps {
> + map0 {
> + trip = <&platform_alert>;
>   cooling-device =
>   < THERMAL_NO_LIMIT 
> THERMAL_NO_LIMIT>,
>   < THERMAL_NO_LIMIT 
> THERMAL_NO_LIMIT>,
> 


-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog



linux-next: manual merge of the staging tree with the spi tree

2019-04-03 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the staging tree got a conflict in:

  drivers/spi/spi-mt7621.c

between commit:

  cbd66c626e16 ("spi: mt7621: Move SPI driver out of staging")

from the spi tree and commit:

  18f0e249da67 ("staging: mt7621-spi: Remove parentheses")

from the staging tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc drivers/spi/spi-mt7621.c
index ae836114ee3d,0e0e67280b00..
--- a/drivers/spi/spi-mt7621.c
+++ b/drivers/spi/spi-mt7621.c
@@@ -305,8 -301,8 +303,8 @@@ static int mt7621_spi_setup(struct spi_
struct mt7621_spi *rs = spidev_to_mt7621_spi(spi);
  
if ((spi->max_speed_hz == 0) ||
 -  (spi->max_speed_hz > (rs->sys_freq / 2)))
 +  (spi->max_speed_hz > (rs->sys_freq / 2)))
-   spi->max_speed_hz = (rs->sys_freq / 2);
+   spi->max_speed_hz = rs->sys_freq / 2;
  
if (spi->max_speed_hz < (rs->sys_freq / 4097)) {
dev_err(>dev, "setup: requested speed is too low %d Hz\n",


pgpmMHGcRcfJC.pgp
Description: OpenPGP digital signature


[PATCH] of: Documentation: Correct return value from of_overlay_fdt_apply

2019-04-03 Thread Chris Packham
The return from of_overlay_fdt_apply() just indicates success or fail.
The cookie is returned via reference.

Signed-off-by: Chris Packham 
---
 Documentation/devicetree/overlay-notes.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/overlay-notes.txt 
b/Documentation/devicetree/overlay-notes.txt
index 725fb8d255c1..62f2003d6205 100644
--- a/Documentation/devicetree/overlay-notes.txt
+++ b/Documentation/devicetree/overlay-notes.txt
@@ -88,7 +88,8 @@ Overlay in-kernel API
 The API is quite easy to use.
 
 1. Call of_overlay_fdt_apply() to create and apply an overlay changeset. The
-return value is an error or a cookie identifying this overlay.
+return indicates success or failure. A a cookie identifying this overlay is
+returned via reference on success.
 
 2. Call of_overlay_remove() to remove and cleanup the overlay changeset
 previously created via the call to of_overlay_fdt_apply(). Removal of an
-- 
2.21.0



Re: [PATCH v3 0/3] Add restrictions for kexec/kdump jumping between 5-level and 4-level kernel

2019-04-03 Thread Baoquan He
PING

Any comment for this patch, Or consider to merge?

On 03/12/19 at 06:30pm, Baoquan He wrote:
> This is v3 post.
> 
> The original v1 post can be found here:
> http://lkml.kernel.org/r/20180829141624.13985-1-...@redhat.com
> 
> Later a v1 RESEND version:
> http://lkml.kernel.org/r/20190125022817.29506-1-...@redhat.com
> 
> V2 post is here:
> http://lkml.kernel.org/r/20190312005004.19182-1-...@redhat.com
> 
> This patchset is trying to fix several issues for kexec/kdump when
> dynamic switching of paging mode is enabled in x86_64. The current
> kernel supports 5-level paging mode, and supports dynamically choosing
> paging mode during bootup according to kernel image, hardware and
> kernel parameter setting. This flexibility brings several issues for
> kexec/kdump:
> 
> Issues:
> 1)
> Dynamic switching between paging mode requires code change in target
> kernel. So we can't kexec jump from 5-level kernel to old 4-level
> kernel which lacks the code change.
> 
> 2)
> Switching from 5-level paging to 4-level paging kernel would fail, if
> kexec() put kernel image above 64TiB of memory.
> 
> 3)
> Kdump jumping has similar issue as 2). This require us to only
> reserve crashkernel below 64TB, otherwise jumping from 5-level to
> 4-level kernel will fail.
> 
> Note:
> Since we have two interfaces kexec_load() and kexec_file_load() to load
> kexec/kdump kernel, handling for them is a little different. For
> kexec_load(), most of the loading job is done in user space utility
> kexec_tools. However, for kexec_file_load(), most of the loading codes
> have moved into kernel because of kernel image verification.
> 
> Fixes:
> a) For issue 1), we need check if XLF_5LEVEL is set, otherwise error out
>a message. 
>   -This need be done in both kernel and kexec_tools utility.
>   -Patch 2/3 is the handling of kernel part.
>   -Will post user space patch to kexec mailing list later.
> 
> b) For issue 2), we need check if both XLF_5LEVEL and XLF_5LEVEL_ENABLED
>are set, otherwise error out a message.
>   -This only need be done in kexec_tools utility. Because for
>kexec_file_load(), the current code searches area to put kernel from
>bottom to up in system RAM, we usually can always find an area below
>4 GB, no need to worry about 5-level kernel jumping to 4-level
>kernel. While for kexec_load(), it's top down searching area for kernel
>loading, and implemented in user space. We need make sure that
>5-level kernel find an area under 64 TB for a kexec-ed kernel of
>4-level.
>   -Will post user space patch to kexec mailing list later.
> 
> c) For issues 3), just limit kernel to reserve crashkernel below 64 TB.
>   -This only need be done in kernel.
>   -It doesn't need to check bit XLF_5LEVEL or XLF_5LEVEL_ENABLED, we
>just simply limit it below 64 TB which should be enough. Because
>crashkernel is reserved during the 1st kernel's bootup, we don't know
>what kernel will be loaded for kdump usage.
>   -Patch 3/3 handles this.
> 
> Concerns from reviewing comments:
> 1)
> In v1, hpa raised concern that why the paging mode checking is not done
> before kexec jumping, the discussion can be found here:
> 
> http://lkml.kernel.org/r/alpine.deb.2.21.1809051002020.1...@nanos.tec.linutronix.de
> 
> As tglx said, it might be not doable for kdump since kdump kernel's
> reserved crashkernel region only owns a portion of memory, may
> be above 4G; and might be not safer to do paging mode checking and
> switching thing after crash.
> 
> 2)
> In v1 RESEND post, tglx asked why only bit XLF_5LEVEL is checked, even
> though two bits XLF_5LEVEL or XLF_5LEVEL_ENABLED added. So add more
> words to explain it in *Fixes* b).
> 
> Changelog:
> v2->v3:
>   Change the constant to match the notation for the rest of defines as
>   Kirill suggested;
> v1->v2:
>   Correct the subject of patch 1 according to tglx's comment;
>   Add more information to cover-letter to address reviewers' concerns;
> 
> Baoquan He (3):
>   x86/boot: Add xloadflags bits for 5-level kernel checking
>   x86/kexec/64: Error out if try to jump to old 4-level kernel from
> 5-level kernel
>   x86/kdump/64: Change the upper limit of crashkernel reservation
> 
>  arch/x86/boot/header.S| 12 +++-
>  arch/x86/include/uapi/asm/bootparam.h |  2 ++
>  arch/x86/kernel/kexec-bzimage64.c |  5 +
>  arch/x86/kernel/setup.c   | 18 ++
>  4 files changed, 32 insertions(+), 5 deletions(-)
> 
> -- 
> 2.17.2
> 


Re: [PATCH v3 0/2] x86/mm/KASLR: Change the granularity of randomization to PUD size in 5-level

2019-04-03 Thread Baoquan He
PING

Is there any comment for this patchset, or could we consider to merge
them?

On 03/08/19 at 10:56am, Baoquan He wrote:
> This is v3 post, v2 post is here:
> http://lkml.kernel.org/r/20190228003522.9957-1-...@redhat.com
> v1 can be found here:
> http://lkml.kernel.org/r/20190224132231.4878-1-...@redhat.com
> 
> Background:
> ***
> Earlier, during a series of KASLR patch reviewing, Ingo got the current
> memory region KASLR only has granularity of randomization in PUD size in
> 4-level paging mode, and P4D size in 5-level paging mode, He suggested
> me to try to change both of them to be PMD size at granularity:
> 
>   http://lkml.kernel.org/r/20180912100135.gb3...@gmail.com
> 
> Later, I changed code to support PMD level of randomization for both
> 4-level and 5-level.
> 
>   https://github.com/baoquan-he/linux/commits/mm-kaslr-2m-aligned
> 
> The test passed on my KVM guest with 1 GB RAM, but failed when I
> increased the RAM to 4 GB, and failed either on larger RAM.
> 
> After analyzing, it's because that 1 GB page mapping need be mapped at 1
> GB aligned physical address for intel CPU. The 2 MB level of randomization
> will break it and cause error. Please check below table in intel IA32 manual.
> 
>   Table 4-15. Format of an IA-32e Page-Directory-Pointer-Table Entry (PDPTE) 
> that Maps a 1-GByte Page
> 
> So PMD level of randomization for mm KASLR is not doable.
> 
> However, during investigation and testing above code, it turns out that the
> current code is misleading to build identity mapping for the real mode
> trampoline in case KASLR enabled. From code, only a small area (which is
> smaller than 1 MB) need be identity mapped. Please check below patch which
> is from above mm-kaslr-2m-aligned patch series. it only builds up 2 MB
> identity maping for real mode trampoline, and test passed on machines
> with 32 GB RAM of 4-level and on KVM guest of 5-level.
> 
> https://github.com/baoquan-he/linux/commit/e120e67fbf9a5aa818d20084d8dea5b4a27ecf97
> 
> Result:
> Make a patchset to:
>   1)change code to only build 1 GB of area for real mode trampoline,
> namely only copy one PUD entry where physical address 0 resides;
> 
>   2)improve the randomization granularity of 5-level from P4D size to PUD 
> size.
> 
> Changelog:
> v2->v3:
>   Improve patch 1/2 according to Kirill's comments:
> *) Adjust code change of 1/2;
> *) Add code comment to explain the two kinds of mapping thing for
>real mode;
>   
> v1->v2:
>   Improve patch according to Kirill's suggestions:
> *)Add more information to code comment for better understanding;
> *)Improve code to save one low memory page in 4-level;
> 
> Baoquan He (2):
>   x86/mm/KASLR: Only build one PUD entry of area for real mode
> trampoline
>   x86/mm/KASLR: Change the granularity of randomization to PUD size in
> 5-level
> 
>  arch/x86/mm/kaslr.c | 98 -
>  1 file changed, 43 insertions(+), 55 deletions(-)
> 
> -- 
> 2.17.2
> 


[RFC PATCH 06/25] mm: migrate: Make the number of copy threads adjustable via sysctl.

2019-04-03 Thread Zi Yan
From: Zi Yan 

Signed-off-by: Zi Yan 
---
 kernel/sysctl.c | 9 +
 mm/copy_page.c  | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3d8490e..0eae0b8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -102,6 +102,7 @@
 #if defined(CONFIG_SYSCTL)
 
 extern int accel_page_copy;
+extern unsigned int limit_mt_num;
 
 /* External variables not in a header file. */
 extern int suid_dumpable;
@@ -1441,6 +1442,14 @@ static struct ctl_table vm_table[] = {
.extra1 = ,
.extra2 = ,
},
+   {
+   .procname   = "limit_mt_num",
+   .data   = _mt_num,
+   .maxlen = sizeof(limit_mt_num),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec,
+   .extra1 = ,
+   },
 {
.procname   = "hugetlb_shm_group",
.data   = _hugetlb_shm_group,
diff --git a/mm/copy_page.c b/mm/copy_page.c
index 9cf849c..6665e3d 100644
--- a/mm/copy_page.c
+++ b/mm/copy_page.c
@@ -23,7 +23,7 @@
 #include 
 
 
-const unsigned int limit_mt_num = 4;
+unsigned int limit_mt_num = 4;
 
 /*  multi-threaded copy page  
*/
 
-- 
2.7.4



[RFC PATCH 10/25] mm: migrate: copy_page_lists_mt() to copy a page list using multi-threads.

2019-04-03 Thread Zi Yan
From: Zi Yan 

This prepares the support for migrate_page_concur(), which migrates
multiple pages at the same time.

Signed-off-by: Zi Yan 
---
 mm/copy_page.c | 123 +
 mm/internal.h  |   2 +
 2 files changed, 125 insertions(+)

diff --git a/mm/copy_page.c b/mm/copy_page.c
index 84f1c02..d2fd67e 100644
--- a/mm/copy_page.c
+++ b/mm/copy_page.c
@@ -126,6 +126,129 @@ int copy_page_multithread(struct page *to, struct page 
*from, int nr_pages)
 
return err;
 }
+
+int copy_page_lists_mt(struct page **to, struct page **from, int nr_items)
+{
+   int err = 0;
+   unsigned int total_mt_num = limit_mt_num;
+   int to_node = page_to_nid(*to);
+   int i;
+   struct copy_page_info *work_items[NR_CPUS] = {0};
+   const struct cpumask *per_node_cpumask = cpumask_of_node(to_node);
+   int cpu_id_list[NR_CPUS] = {0};
+   int cpu;
+   int max_items_per_thread;
+   int item_idx;
+
+   total_mt_num = min_t(unsigned int, total_mt_num,
+
cpumask_weight(per_node_cpumask));
+
+
+   if (total_mt_num > num_online_cpus())
+   return -ENODEV;
+
+   /* Each threads get part of each page, if nr_items < totla_mt_num */
+   if (nr_items < total_mt_num)
+   max_items_per_thread = nr_items;
+   else
+   max_items_per_thread = (nr_items / total_mt_num) +
+   ((nr_items % total_mt_num)?1:0);
+
+
+   for (cpu = 0; cpu < total_mt_num; ++cpu) {
+   work_items[cpu] = kzalloc(sizeof(struct copy_page_info) +
+   sizeof(struct 
copy_item)*max_items_per_thread, GFP_KERNEL);
+   if (!work_items[cpu]) {
+   err = -ENOMEM;
+   goto free_work_items;
+   }
+   }
+
+   i = 0;
+   for_each_cpu(cpu, per_node_cpumask) {
+   if (i >= total_mt_num)
+   break;
+   cpu_id_list[i] = cpu;
+   ++i;
+   }
+
+   if (nr_items < total_mt_num) {
+   for (cpu = 0; cpu < total_mt_num; ++cpu) {
+   INIT_WORK((struct work_struct *)work_items[cpu],
+ copy_page_work_queue_thread);
+   work_items[cpu]->num_items = max_items_per_thread;
+   }
+
+   for (item_idx = 0; item_idx < nr_items; ++item_idx) {
+   unsigned long chunk_size = PAGE_SIZE * 
hpage_nr_pages(from[item_idx]) / total_mt_num;
+   char *vfrom = kmap(from[item_idx]);
+   char *vto = kmap(to[item_idx]);
+   VM_BUG_ON(PAGE_SIZE * hpage_nr_pages(from[item_idx]) % 
total_mt_num);
+   BUG_ON(hpage_nr_pages(to[item_idx]) !=
+  hpage_nr_pages(from[item_idx]));
+
+   for (cpu = 0; cpu < total_mt_num; ++cpu) {
+   work_items[cpu]->item_list[item_idx].to = vto + 
chunk_size * cpu;
+   work_items[cpu]->item_list[item_idx].from = 
vfrom + chunk_size * cpu;
+   work_items[cpu]->item_list[item_idx].chunk_size 
=
+   chunk_size;
+   }
+   }
+
+   for (cpu = 0; cpu < total_mt_num; ++cpu)
+   queue_work_on(cpu_id_list[cpu],
+ system_highpri_wq,
+ (struct work_struct 
*)work_items[cpu]);
+   } else {
+   item_idx = 0;
+   for (cpu = 0; cpu < total_mt_num; ++cpu) {
+   int num_xfer_per_thread = nr_items / total_mt_num;
+   int per_cpu_item_idx;
+
+   if (cpu < (nr_items % total_mt_num))
+   num_xfer_per_thread += 1;
+
+   INIT_WORK((struct work_struct *)work_items[cpu],
+ copy_page_work_queue_thread);
+
+   work_items[cpu]->num_items = num_xfer_per_thread;
+   for (per_cpu_item_idx = 0; per_cpu_item_idx < 
work_items[cpu]->num_items;
+++per_cpu_item_idx, ++item_idx) {
+   work_items[cpu]->item_list[per_cpu_item_idx].to 
= kmap(to[item_idx]);
+   
work_items[cpu]->item_list[per_cpu_item_idx].from =
+   kmap(from[item_idx]);
+   
work_items[cpu]->item_list[per_cpu_item_idx].chunk_size =
+   PAGE_SIZE * 
hpage_nr_pages(from[item_idx]);
+
+   BUG_ON(hpage_nr_pages(to[item_idx]) !=
+  hpage_nr_pages(from[item_idx]));

[RFC PATCH 23/25] memory manage: page migration based page manipulation between NUMA nodes.

2019-04-03 Thread Zi Yan
From: Zi Yan 

Users are expected to set memcg max size to reflect their memory
resource allocation policy. The syscall simply migrates pages belonging
to the application's memcg between from_node and to_node, where
from_node is considered fast memory and to_node is considered slow
memory. In common cases, active(hot) pages are migrated from to_node
to from_node and inactive(cold) pages are migrated from from_node to
to_node.

Separate migration for base pages and huge pages to achieve high
throughput.

1. They are migrated via different calls.
2. 4KB base pages are not transferred via multi-threaded.
3. All pages are migrated together if no optimization is used.

Signed-off-by: Zi Yan 
---
 mm/memory_manage.c | 275 +
 1 file changed, 275 insertions(+)

diff --git a/mm/memory_manage.c b/mm/memory_manage.c
index e8dddbf..d63ad25 100644
--- a/mm/memory_manage.c
+++ b/mm/memory_manage.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -15,6 +16,11 @@
 
 #include "internal.h"
 
+enum isolate_action {
+   ISOLATE_COLD_PAGES = 1,
+   ISOLATE_HOT_PAGES,
+   ISOLATE_HOT_AND_COLD_PAGES,
+};
 
 static unsigned long shrink_lists_node_memcg(pg_data_t *pgdat,
struct mem_cgroup *memcg, unsigned long nr_to_scan)
@@ -78,6 +84,272 @@ static int shrink_lists(struct task_struct *p, struct 
mm_struct *mm,
return err;
 }
 
+static unsigned long isolate_pages_from_lru_list(pg_data_t *pgdat,
+   struct mem_cgroup *memcg, unsigned long nr_pages,
+   struct list_head *base_page_list,
+   struct list_head *huge_page_list,
+   unsigned long *nr_taken_base_page,
+   unsigned long *nr_taken_huge_page,
+   enum isolate_action action)
+{
+   struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
+   enum lru_list lru;
+   unsigned long nr_all_taken = 0;
+
+   if (nr_pages == ULONG_MAX)
+   nr_pages = memcg_size_node(memcg, pgdat->node_id);
+
+   lru_add_drain_all();
+
+   for_each_evictable_lru(lru) {
+   unsigned long nr_scanned, nr_taken;
+   int file = is_file_lru(lru);
+   struct scan_control sc = {.may_unmap = 1};
+
+   if (action == ISOLATE_COLD_PAGES && is_active_lru(lru))
+   continue;
+   if (action == ISOLATE_HOT_PAGES && !is_active_lru(lru))
+   continue;
+
+   spin_lock_irq(>lru_lock);
+
+   /* Isolate base pages */
+   sc.isolate_only_base_page = 1;
+   nr_taken = isolate_lru_pages(nr_pages, lruvec, base_page_list,
+   _scanned, , lru);
+   /* Isolate huge pages */
+   sc.isolate_only_base_page = 0;
+   sc.isolate_only_huge_page = 1;
+   nr_taken += isolate_lru_pages(nr_pages - nr_scanned, lruvec,
+   huge_page_list, _scanned, , lru);
+
+   __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
+
+   spin_unlock_irq(>lru_lock);
+
+   nr_all_taken += nr_taken;
+
+   if (nr_all_taken > nr_pages)
+   break;
+   }
+
+   return nr_all_taken;
+}
+
+static int migrate_to_node(struct list_head *page_list, int nid,
+   enum migrate_mode mode)
+{
+   bool migrate_concur = mode & MIGRATE_CONCUR;
+   int num = 0;
+   int from_nid;
+   int err;
+
+   if (list_empty(page_list))
+   return num;
+
+   from_nid = page_to_nid(list_first_entry(page_list, struct page, lru));
+
+   if (migrate_concur)
+   err = migrate_pages_concur(page_list, alloc_new_node_page,
+   NULL, nid, mode, MR_SYSCALL);
+   else
+   err = migrate_pages(page_list, alloc_new_node_page,
+   NULL, nid, mode, MR_SYSCALL);
+
+   if (err) {
+   struct page *page;
+
+   list_for_each_entry(page, page_list, lru)
+   num += hpage_nr_pages(page);
+   pr_debug("%d pages failed to migrate from %d to %d\n",
+   num, from_nid, nid);
+
+   putback_movable_pages(page_list);
+   }
+   return num;
+}
+
+static inline int _putback_overflow_pages(unsigned long max_nr_pages,
+   struct list_head *page_list, unsigned long *nr_remaining_pages)
+{
+   struct page *page;
+   LIST_HEAD(putback_list);
+
+   if (list_empty(page_list))
+   return max_nr_pages;
+
+   *nr_remaining_pages = 0;
+   /* in case we need to drop the whole list */
+   page = list_first_entry(page_list, struct page, lru);
+   if (max_nr_pages <= (2 * hpage_nr_pages(page))) {
+   max_nr_pages = 0;
+   putback_movable_pages(page_list);
+   goto 

[RFC PATCH 08/25] mm: migrate: Add copy_page_dma into migrate_page_copy.

2019-04-03 Thread Zi Yan
From: Zi Yan 

Fall back to copy_highpage when it fails.

Signed-off-by: Zi Yan 
---
 include/linux/migrate_mode.h   |  1 +
 include/uapi/linux/mempolicy.h |  1 +
 mm/migrate.c   | 31 +--
 3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
index 5bc8a77..4f7f5557 100644
--- a/include/linux/migrate_mode.h
+++ b/include/linux/migrate_mode.h
@@ -23,6 +23,7 @@ enum migrate_mode {
MIGRATE_MODE_MASK = 3,
MIGRATE_SINGLETHREAD= 0,
MIGRATE_MT  = 1<<4,
+   MIGRATE_DMA = 1<<5,
 };
 
 #endif /* MIGRATE_MODE_H_INCLUDED */
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 890269b..49573a6 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -48,6 +48,7 @@ enum {
 #define MPOL_MF_LAZY(1<<3) /* Modifies '_MOVE:  lazy migrate on fault */
 #define MPOL_MF_INTERNAL (1<<4)/* Internal flags start here */
 
+#define MPOL_MF_MOVE_DMA (1<<5)/* Use DMA page copy routine */
 #define MPOL_MF_MOVE_MT  (1<<6)/* Use multi-threaded page copy routine 
*/
 
 #define MPOL_MF_VALID  (MPOL_MF_STRICT   | \
diff --git a/mm/migrate.c b/mm/migrate.c
index 8a344e2..09114d3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -553,15 +553,21 @@ int migrate_huge_page_move_mapping(struct address_space 
*mapping,
  * specialized.
  */
 static void __copy_gigantic_page(struct page *dst, struct page *src,
-   int nr_pages)
+   int nr_pages, enum migrate_mode mode)
 {
int i;
struct page *dst_base = dst;
struct page *src_base = src;
+   int rc = -EFAULT;
 
for (i = 0; i < nr_pages; ) {
cond_resched();
-   copy_highpage(dst, src);
+
+   if (mode & MIGRATE_DMA)
+   rc = copy_page_dma(dst, src, 1);
+
+   if (rc)
+   copy_highpage(dst, src);
 
i++;
dst = mem_map_next(dst, dst_base, i);
@@ -582,7 +588,7 @@ static void copy_huge_page(struct page *dst, struct page 
*src,
nr_pages = pages_per_huge_page(h);
 
if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
-   __copy_gigantic_page(dst, src, nr_pages);
+   __copy_gigantic_page(dst, src, nr_pages, mode);
return;
}
} else {
@@ -597,6 +603,8 @@ static void copy_huge_page(struct page *dst, struct page 
*src,
 
if (mode & MIGRATE_MT)
rc = copy_page_multithread(dst, src, nr_pages);
+   else if (mode & MIGRATE_DMA)
+   rc = copy_page_dma(dst, src, nr_pages);
 
if (rc)
for (i = 0; i < nr_pages; i++) {
@@ -674,8 +682,9 @@ void migrate_page_copy(struct page *newpage, struct page 
*page,
 {
if (PageHuge(page) || PageTransHuge(page))
copy_huge_page(newpage, page, mode);
-   else
+   else {
copy_highpage(newpage, page);
+   }
 
migrate_page_states(newpage, page);
 }
@@ -1511,7 +1520,8 @@ static int store_status(int __user *status, int start, 
int value, int nr)
 }
 
 static int do_move_pages_to_node(struct mm_struct *mm,
-   struct list_head *pagelist, int node, bool migrate_mt)
+   struct list_head *pagelist, int node,
+   bool migrate_mt, bool migrate_dma)
 {
int err;
 
@@ -1519,7 +1529,8 @@ static int do_move_pages_to_node(struct mm_struct *mm,
return 0;
 
err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
-   MIGRATE_SYNC | (migrate_mt ? MIGRATE_MT : 
MIGRATE_SINGLETHREAD),
+   MIGRATE_SYNC | (migrate_mt ? MIGRATE_MT : 
MIGRATE_SINGLETHREAD) |
+   (migrate_dma ? MIGRATE_DMA : MIGRATE_SINGLETHREAD),
MR_SYSCALL);
if (err)
putback_movable_pages(pagelist);
@@ -1642,7 +1653,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t 
task_nodes,
start = i;
} else if (node != current_node) {
err = do_move_pages_to_node(mm, , current_node,
-   flags & MPOL_MF_MOVE_MT);
+   flags & MPOL_MF_MOVE_MT, flags & 
MPOL_MF_MOVE_DMA);
if (err)
goto out;
err = store_status(status, start, current_node, i - 
start);
@@ -1666,7 +1677,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t 
task_nodes,
goto out_flush;
 
err = do_move_pages_to_node(mm, , current_node,
-   flags & MPOL_MF_MOVE_MT);
+   flags & 

[RFC PATCH 24/25] memory manage: limit migration batch size.

2019-04-03 Thread Zi Yan
From: Zi Yan 

Make migration batch size adjustable to avoid excessive migration
overheads when a lot of pages are under migration.

Signed-off-by: Zi Yan 
---
 kernel/sysctl.c|  8 
 mm/memory_manage.c | 60 --
 2 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b8712eb..b92e2da9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -105,6 +105,7 @@ extern int accel_page_copy;
 extern unsigned int limit_mt_num;
 extern int use_all_dma_chans;
 extern int limit_dma_chans;
+extern int migration_batch_size;
 
 /* External variables not in a header file. */
 extern int suid_dumpable;
@@ -1470,6 +1471,13 @@ static struct ctl_table vm_table[] = {
.extra1 = ,
 },
 {
+   .procname   = "migration_batch_size",
+   .data   = _batch_size,
+   .maxlen = sizeof(migration_batch_size),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec,
+},
+{
.procname   = "hugetlb_shm_group",
.data   = _hugetlb_shm_group,
.maxlen = sizeof(gid_t),
diff --git a/mm/memory_manage.c b/mm/memory_manage.c
index d63ad25..8b76fcf 100644
--- a/mm/memory_manage.c
+++ b/mm/memory_manage.c
@@ -16,6 +16,8 @@
 
 #include "internal.h"
 
+int migration_batch_size = 16;
+
 enum isolate_action {
ISOLATE_COLD_PAGES = 1,
ISOLATE_HOT_PAGES,
@@ -137,35 +139,49 @@ static unsigned long 
isolate_pages_from_lru_list(pg_data_t *pgdat,
 }
 
 static int migrate_to_node(struct list_head *page_list, int nid,
-   enum migrate_mode mode)
+   enum migrate_mode mode, int batch_size)
 {
bool migrate_concur = mode & MIGRATE_CONCUR;
+   bool unlimited_batch_size = (batch_size <=0 || !migrate_concur);
int num = 0;
-   int from_nid;
+   int from_nid = -1;
int err;
 
if (list_empty(page_list))
return num;
 
-   from_nid = page_to_nid(list_first_entry(page_list, struct page, lru));
+   while (!list_empty(page_list)) {
+   LIST_HEAD(batch_page_list);
+   int i;
 
-   if (migrate_concur)
-   err = migrate_pages_concur(page_list, alloc_new_node_page,
-   NULL, nid, mode, MR_SYSCALL);
-   else
-   err = migrate_pages(page_list, alloc_new_node_page,
-   NULL, nid, mode, MR_SYSCALL);
+   /* it should move all pages to batch_page_list if 
!migrate_concur */
+   for (i = 0; i < batch_size || unlimited_batch_size; i++) {
+   struct page *item = list_first_entry_or_null(page_list, 
struct page, lru);
+   if (!item)
+   break;
+   list_move(>lru, _page_list);
+   }
 
-   if (err) {
-   struct page *page;
+   from_nid = page_to_nid(list_first_entry(_page_list, 
struct page, lru));
 
-   list_for_each_entry(page, page_list, lru)
-   num += hpage_nr_pages(page);
-   pr_debug("%d pages failed to migrate from %d to %d\n",
-   num, from_nid, nid);
+   if (migrate_concur)
+   err = migrate_pages_concur(_page_list, 
alloc_new_node_page,
+   NULL, nid, mode, MR_SYSCALL);
+   else
+   err = migrate_pages(_page_list, 
alloc_new_node_page,
+   NULL, nid, mode, MR_SYSCALL);
 
-   putback_movable_pages(page_list);
+   if (err) {
+   struct page *page;
+
+   list_for_each_entry(page, _page_list, lru)
+   num += hpage_nr_pages(page);
+
+   putback_movable_pages(_page_list);
+   }
}
+   pr_debug("%d pages failed to migrate from %d to %d\n",
+   num, from_nid, nid);
return num;
 }
 
@@ -325,10 +341,12 @@ static int do_mm_manage(struct task_struct *p, struct 
mm_struct *mm,
/* Migrate pages to slow node */
/* No multi-threaded migration for base pages */
nr_isolated_fast_base_pages -=
-   migrate_to_node(_base_page_list, slow_nid, mode & 
~MIGRATE_MT);
+   migrate_to_node(_base_page_list, slow_nid,
+   mode & ~MIGRATE_MT, migration_batch_size);
 
nr_isolated_fast_huge_pages -=
-   migrate_to_node(_huge_page_list, slow_nid, mode);
+   migrate_to_node(_huge_page_list, slow_nid, mode,
+   migration_batch_size);
}
 
if (nr_isolated_fast_base_pages != ULONG_MAX &&
@@ -342,10 +360,12 @@ 

[RFC PATCH 05/25] mm: migrate: Add vm.accel_page_copy in sysfs to control page copy acceleration.

2019-04-03 Thread Zi Yan
From: Zi Yan 

Since base page migration did not gain any speedup from
multi-threaded methods, we only accelerate the huge page case.

Signed-off-by: Zi Yan 
---
 kernel/sysctl.c | 11 +++
 mm/migrate.c|  6 ++
 2 files changed, 17 insertions(+)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e5da394..3d8490e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -101,6 +101,8 @@
 
 #if defined(CONFIG_SYSCTL)
 
+extern int accel_page_copy;
+
 /* External variables not in a header file. */
 extern int suid_dumpable;
 #ifdef CONFIG_COREDUMP
@@ -1430,6 +1432,15 @@ static struct ctl_table vm_table[] = {
.extra2 = ,
},
 #endif
+   {
+   .procname   = "accel_page_copy",
+   .data   = _page_copy,
+   .maxlen = sizeof(accel_page_copy),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec,
+   .extra1 = ,
+   .extra2 = ,
+   },
 {
.procname   = "hugetlb_shm_group",
.data   = _hugetlb_shm_group,
diff --git a/mm/migrate.c b/mm/migrate.c
index dd6ccbe..8a344e2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -55,6 +55,8 @@
 
 #include "internal.h"
 
+int accel_page_copy = 1;
+
 /*
  * migrate_prep() needs to be called before we start compiling a list of pages
  * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
@@ -589,6 +591,10 @@ static void copy_huge_page(struct page *dst, struct page 
*src,
nr_pages = hpage_nr_pages(src);
}
 
+   /* Try to accelerate page migration if it is not specified in mode  */
+   if (accel_page_copy)
+   mode |= MIGRATE_MT;
+
if (mode & MIGRATE_MT)
rc = copy_page_multithread(dst, src, nr_pages);
 
-- 
2.7.4



[RFC PATCH 12/25] exchange pages: new page migration mechanism: exchange_pages()

2019-04-03 Thread Zi Yan
From: Zi Yan 

It exchanges two pages by unmapping both first, then exchanging the
data of the pages using a u64 register, and finally remapping both
pages.

It saves the overheads of allocating two new pages in two
back-to-back migrate_pages().

Signed-off-by: Zi Yan 
---
 include/linux/exchange.h |  23 ++
 include/linux/ksm.h  |   4 +
 mm/Makefile  |   1 +
 mm/exchange.c| 597 +++
 mm/ksm.c |  35 +++
 5 files changed, 660 insertions(+)
 create mode 100644 include/linux/exchange.h
 create mode 100644 mm/exchange.c

diff --git a/include/linux/exchange.h b/include/linux/exchange.h
new file mode 100644
index 000..778068e
--- /dev/null
+++ b/include/linux/exchange.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_EXCHANGE_H
+#define _LINUX_EXCHANGE_H
+
+#include 
+
+struct exchange_page_info {
+   struct page *from_page;
+   struct page *to_page;
+
+   struct anon_vma *from_anon_vma;
+   struct anon_vma *to_anon_vma;
+
+   int from_page_was_mapped;
+   int to_page_was_mapped;
+
+   struct list_head list;
+};
+
+int exchange_pages(struct list_head *exchange_list,
+   enum migrate_mode mode,
+   int reason);
+#endif /* _LINUX_EXCHANGE_H */
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index e48b1e4..170312d 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -55,6 +55,7 @@ void rmap_walk_ksm(struct page *page, struct 
rmap_walk_control *rwc);
 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
 bool reuse_ksm_page(struct page *page,
struct vm_area_struct *vma, unsigned long address);
+void ksm_exchange_page(struct page *to_page, struct page *from_page);
 
 #else  /* !CONFIG_KSM */
 
@@ -92,6 +93,9 @@ static inline bool reuse_ksm_page(struct page *page,
struct vm_area_struct *vma, unsigned long address)
 {
return false;
+static inline void ksm_exchange_page(struct page *to_page,
+   struct page *from_page)
+{
 }
 #endif /* CONFIG_MMU */
 #endif /* !CONFIG_KSM */
diff --git a/mm/Makefile b/mm/Makefile
index fa02a9f..5e6c591 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -45,6 +45,7 @@ obj-y += init-mm.o
 obj-y += memblock.o
 
 obj-y += copy_page.o
+obj-y += exchange.o
 
 ifdef CONFIG_MMU
obj-$(CONFIG_ADVISE_SYSCALLS)   += madvise.o
diff --git a/mm/exchange.c b/mm/exchange.c
new file mode 100644
index 000..626bbea
--- /dev/null
+++ b/mm/exchange.c
@@ -0,0 +1,597 @@
+/*
+ * Exchange two in-use pages. Page flags and page->mapping are exchanged
+ * as well. Only anonymous pages are supported.
+ *
+ * Copyright (C) 2016 NVIDIA, Zi Yan 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+#include "internal.h"
+
+/*
+ * Move a list of individual pages
+ */
+struct pages_to_node {
+   unsigned long from_addr;
+   int from_status;
+
+   unsigned long to_addr;
+   int to_status;
+};
+
+struct page_flags {
+   unsigned int page_error :1;
+   unsigned int page_referenced:1;
+   unsigned int page_uptodate:1;
+   unsigned int page_active:1;
+   unsigned int page_unevictable:1;
+   unsigned int page_checked:1;
+   unsigned int page_mappedtodisk:1;
+   unsigned int page_dirty:1;
+   unsigned int page_is_young:1;
+   unsigned int page_is_idle:1;
+   unsigned int page_swapcache:1;
+   unsigned int page_writeback:1;
+   unsigned int page_private:1;
+   unsigned int __pad:3;
+};
+
+
+static void exchange_page(char *to, char *from)
+{
+   u64 tmp;
+   int i;
+
+   for (i = 0; i < PAGE_SIZE; i += sizeof(tmp)) {
+   tmp = *((u64*)(from + i));
+   *((u64*)(from + i)) = *((u64*)(to + i));
+   *((u64*)(to + i)) = tmp;
+   }
+}
+
+static inline void exchange_highpage(struct page *to, struct page *from)
+{
+   char *vfrom, *vto;
+
+   vfrom = kmap_atomic(from);
+   vto = kmap_atomic(to);
+   exchange_page(vto, vfrom);
+   kunmap_atomic(vto);
+   kunmap_atomic(vfrom);
+}
+
+static void __exchange_gigantic_page(struct page *dst, struct page *src,
+   int nr_pages)
+{
+   int i;
+   struct page *dst_base = dst;
+   struct page *src_base = src;
+
+   for (i = 0; i < nr_pages; ) {
+   cond_resched();
+   exchange_highpage(dst, src);
+
+   i++;
+   dst = mem_map_next(dst, dst_base, i);
+   src = mem_map_next(src, src_base, i);
+   }
+}
+
+static void exchange_huge_page(struct page *dst, struct page *src)
+{
+   int i;
+   int nr_pages;
+
+   if (PageHuge(src)) {
+   /* hugetlbfs page */
+   

[RFC PATCH 11/25] mm: migrate: Add concurrent page migration into move_pages syscall.

2019-04-03 Thread Zi Yan
From: Zi Yan 

Concurrent page migration unmaps all pages in a list, copy all pages
in one function (copy_page_list*), finally remaps all new pages.
This is different from existing page migration process which migrate
one page at a time.

Only anonymous pages are supported. All file-backed pages are still
migrated sequentially. Because locking becomes more complicated when
a list of file-backed pages belong to different files, which might
cause deadlocks if locks on each file are not done properly.

Signed-off-by: Zi Yan 
---
 include/linux/migrate.h|   6 +
 include/linux/migrate_mode.h   |   1 +
 include/uapi/linux/mempolicy.h |   1 +
 mm/migrate.c   | 543 -
 4 files changed, 542 insertions(+), 9 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 5218a07..1001a1c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -67,6 +67,8 @@ extern int migrate_page(struct address_space *mapping,
enum migrate_mode mode);
 extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
unsigned long private, enum migrate_mode mode, int reason);
+extern int migrate_pages_concur(struct list_head *l, new_page_t new, 
free_page_t free,
+   unsigned long private, enum migrate_mode mode, int reason);
 extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
 extern void putback_movable_page(struct page *page);
 
@@ -87,6 +89,10 @@ static inline int migrate_pages(struct list_head *l, 
new_page_t new,
free_page_t free, unsigned long private, enum migrate_mode mode,
int reason)
{ return -ENOSYS; }
+static inline int migrate_pages_concur(struct list_head *l, new_page_t new,
+   free_page_t free, unsigned long private, enum migrate_mode mode,
+   int reason)
+   { return -ENOSYS; }
 static inline int isolate_movable_page(struct page *page, isolate_mode_t mode)
{ return -EBUSY; }
 
diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
index 4f7f5557..68263da 100644
--- a/include/linux/migrate_mode.h
+++ b/include/linux/migrate_mode.h
@@ -24,6 +24,7 @@ enum migrate_mode {
MIGRATE_SINGLETHREAD= 0,
MIGRATE_MT  = 1<<4,
MIGRATE_DMA = 1<<5,
+   MIGRATE_CONCUR  = 1<<6,
 };
 
 #endif /* MIGRATE_MODE_H_INCLUDED */
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 49573a6..eb6560e 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -50,6 +50,7 @@ enum {
 
 #define MPOL_MF_MOVE_DMA (1<<5)/* Use DMA page copy routine */
 #define MPOL_MF_MOVE_MT  (1<<6)/* Use multi-threaded page copy routine 
*/
+#define MPOL_MF_MOVE_CONCUR  (1<<7)/* Move pages in a batch */
 
 #define MPOL_MF_VALID  (MPOL_MF_STRICT   | \
 MPOL_MF_MOVE | \
diff --git a/mm/migrate.c b/mm/migrate.c
index 09114d3..ad02797 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -57,6 +57,15 @@
 
 int accel_page_copy = 1;
 
+
+struct page_migration_work_item {
+   struct list_head list;
+   struct page *old_page;
+   struct page *new_page;
+   struct anon_vma *anon_vma;
+   int page_was_mapped;
+};
+
 /*
  * migrate_prep() needs to be called before we start compiling a list of pages
  * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
@@ -1396,6 +1405,509 @@ static int unmap_and_move_huge_page(new_page_t 
get_new_page,
return rc;
 }
 
+static int __unmap_page_concur(struct page *page, struct page *newpage,
+   struct anon_vma **anon_vma,
+   int *page_was_mapped,
+   int force, enum migrate_mode mode)
+{
+   int rc = -EAGAIN;
+   bool is_lru = !__PageMovable(page);
+
+   *anon_vma = NULL;
+   *page_was_mapped = 0;
+
+   if (!trylock_page(page)) {
+   if (!force || ((mode & MIGRATE_MODE_MASK) == MIGRATE_ASYNC))
+   goto out;
+
+   /*
+* It's not safe for direct compaction to call lock_page.
+* For example, during page readahead pages are added locked
+* to the LRU. Later, when the IO completes the pages are
+* marked uptodate and unlocked. However, the queueing
+* could be merging multiple pages for one bio (e.g.
+* mpage_readpages). If an allocation happens for the
+* second or third page, the process can end up locking
+* the same page twice and deadlocking. Rather than
+* trying to be clever about what pages can be locked,
+* avoid the use of lock_page for direct compaction
+* altogether.
+  

[RFC PATCH 01/25] mm: migrate: Change migrate_mode to support combination migration modes.

2019-04-03 Thread Zi Yan
From: Zi Yan 

No functionality is changed. This prepares for the following patches,
which add parallel and concurrent page migration modes in conjunction
with the existing modes.

Signed-off-by: Zi Yan 
---
 fs/aio.c | 10 +-
 fs/f2fs/data.c   |  4 ++--
 fs/hugetlbfs/inode.c |  2 +-
 fs/iomap.c   |  2 +-
 fs/ubifs/file.c  |  2 +-
 include/linux/migrate_mode.h |  2 ++
 mm/balloon_compaction.c  |  2 +-
 mm/compaction.c  | 22 +++---
 mm/migrate.c | 18 +-
 mm/zsmalloc.c|  2 +-
 10 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 38b741a..0a88dfd 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -389,7 +389,7 @@ static int aio_migratepage(struct address_space *mapping, 
struct page *new,
 * happen under the ctx->completion_lock. That does not work with the
 * migration workflow of MIGRATE_SYNC_NO_COPY.
 */
-   if (mode == MIGRATE_SYNC_NO_COPY)
+   if ((mode & MIGRATE_MODE_MASK) == MIGRATE_SYNC_NO_COPY)
return -EINVAL;
 
rc = 0;
@@ -1300,10 +1300,10 @@ static long read_events(struct kioctx *ctx, long 
min_nr, long nr,
  * Create an aio_context capable of receiving at least nr_events.
  * ctxp must not point to an aio_context that already exists, and
  * must be initialized to 0 prior to the call.  On successful
- * creation of the aio_context, *ctxp is filled in with the resulting 
+ * creation of the aio_context, *ctxp is filled in with the resulting
  * handle.  May fail with -EINVAL if *ctxp is not initialized,
- * if the specified nr_events exceeds internal limits.  May fail 
- * with -EAGAIN if the specified nr_events exceeds the user's limit 
+ * if the specified nr_events exceeds internal limits.  May fail
+ * with -EAGAIN if the specified nr_events exceeds the user's limit
  * of available events.  May fail with -ENOMEM if insufficient kernel
  * resources are available.  May fail with -EFAULT if an invalid
  * pointer is passed for ctxp.  Will fail with -ENOSYS if not
@@ -1373,7 +1373,7 @@ COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 
__user *, ctx32p)
 #endif
 
 /* sys_io_destroy:
- * Destroy the aio_context specified.  May cancel any outstanding 
+ * Destroy the aio_context specified.  May cancel any outstanding
  * AIOs and block on completion.  Will fail with -ENOSYS if not
  * implemented.  May fail with -EINVAL if the context pointed to
  * is invalid.
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 97279441..e7f0e3a 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2792,7 +2792,7 @@ int f2fs_migrate_page(struct address_space *mapping,
 
/* migrating an atomic written page is safe with the inmem_lock hold */
if (atomic_written) {
-   if (mode != MIGRATE_SYNC)
+   if ((mode & MIGRATE_MODE_MASK) != MIGRATE_SYNC)
return -EBUSY;
if (!mutex_trylock(>inmem_lock))
return -EAGAIN;
@@ -2825,7 +2825,7 @@ int f2fs_migrate_page(struct address_space *mapping,
f2fs_clear_page_private(page);
}
 
-   if (mode != MIGRATE_SYNC_NO_COPY)
+   if ((mode & MIGRATE_MODE_MASK) != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
migrate_page_states(newpage, page);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ec32fec..04ba8bb 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -885,7 +885,7 @@ static int hugetlbfs_migrate_page(struct address_space 
*mapping,
set_page_private(page, 0);
}
 
-   if (mode != MIGRATE_SYNC_NO_COPY)
+   if ((mode & MIGRATE_MODE_MASK) != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
migrate_page_states(newpage, page);
diff --git a/fs/iomap.c b/fs/iomap.c
index abdd18e..8ee3f9f 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -584,7 +584,7 @@ iomap_migrate_page(struct address_space *mapping, struct 
page *newpage,
SetPagePrivate(newpage);
}
 
-   if (mode != MIGRATE_SYNC_NO_COPY)
+   if ((mode & MIGRATE_MODE_MASK) != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
migrate_page_states(newpage, page);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5d2ffb1..2bb8788 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1490,7 +1490,7 @@ static int ubifs_migrate_page(struct address_space 
*mapping,
SetPagePrivate(newpage);
}
 
-   if (mode != MIGRATE_SYNC_NO_COPY)
+   if ((mode & MIGRATE_MODE_MASK) != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
migrate_page_states(newpage, page);
diff --git 

[RFC PATCH 07/25] mm: migrate: Add copy_page_dma to use DMA Engine to copy pages.

2019-04-03 Thread Zi Yan
From: Zi Yan 

vm.use_all_dma_chans will grab all usable DMA channels.
vm.limit_dma_chans will limit how many DMA channels are in use.

Signed-off-by: Zi Yan 
---
 include/linux/highmem.h  |   1 +
 include/linux/sched/sysctl.h |   3 +
 kernel/sysctl.c  |  19 +++
 mm/copy_page.c   | 291 +++
 4 files changed, 314 insertions(+)

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 0f50dc5..119bb39 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -277,5 +277,6 @@ static inline void copy_highpage(struct page *to, struct 
page *from)
 #endif
 
 int copy_page_multithread(struct page *to, struct page *from, int nr_pages);
+int copy_page_dma(struct page *to, struct page *from, int nr_pages);
 
 #endif /* _LINUX_HIGHMEM_H */
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 99ce6d7..ce11241 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -90,4 +90,7 @@ extern int sched_energy_aware_handler(struct ctl_table 
*table, int write,
 loff_t *ppos);
 #endif
 
+extern int sysctl_dma_page_migration(struct ctl_table *table, int write,
+void __user *buffer, size_t *lenp,
+loff_t *ppos);
 #endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0eae0b8..b8712eb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -103,6 +103,8 @@
 
 extern int accel_page_copy;
 extern unsigned int limit_mt_num;
+extern int use_all_dma_chans;
+extern int limit_dma_chans;
 
 /* External variables not in a header file. */
 extern int suid_dumpable;
@@ -1451,6 +1453,23 @@ static struct ctl_table vm_table[] = {
.extra1 = ,
},
 {
+   .procname   = "use_all_dma_chans",
+   .data   = _all_dma_chans,
+   .maxlen = sizeof(use_all_dma_chans),
+   .mode   = 0644,
+   .proc_handler   = sysctl_dma_page_migration,
+   .extra1 = ,
+   .extra2 = ,
+},
+{
+   .procname   = "limit_dma_chans",
+   .data   = _dma_chans,
+   .maxlen = sizeof(limit_dma_chans),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec,
+   .extra1 = ,
+},
+{
.procname   = "hugetlb_shm_group",
.data   = _hugetlb_shm_group,
.maxlen = sizeof(gid_t),
diff --git a/mm/copy_page.c b/mm/copy_page.c
index 6665e3d..5e7a797 100644
--- a/mm/copy_page.c
+++ b/mm/copy_page.c
@@ -126,3 +126,294 @@ int copy_page_multithread(struct page *to, struct page 
*from, int nr_pages)
 
return err;
 }
+/*  DMA copy page  */
+#include 
+#include 
+
+#define NUM_AVAIL_DMA_CHAN 16
+
+
+int use_all_dma_chans = 0;
+int limit_dma_chans = NUM_AVAIL_DMA_CHAN;
+
+
+struct dma_chan *copy_chan[NUM_AVAIL_DMA_CHAN] = {0};
+struct dma_device *copy_dev[NUM_AVAIL_DMA_CHAN] = {0};
+
+
+
+#ifdef CONFIG_PROC_SYSCTL
+extern int proc_dointvec_minmax(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+int sysctl_dma_page_migration(struct ctl_table *table, int write,
+void __user *buffer, size_t *lenp,
+loff_t *ppos)
+{
+   int err = 0;
+   int use_all_dma_chans_prior_val = use_all_dma_chans;
+   dma_cap_mask_t copy_mask;
+
+   if (write && !capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+   if (err < 0)
+   return err;
+   if (write) {
+   /* Grab all DMA channels  */
+   if (use_all_dma_chans_prior_val == 0 && use_all_dma_chans == 1) 
{
+   int i;
+
+   dma_cap_zero(copy_mask);
+   dma_cap_set(DMA_MEMCPY, copy_mask);
+
+   dmaengine_get();
+   for (i = 0; i < NUM_AVAIL_DMA_CHAN; ++i) {
+   if (!copy_chan[i]) {
+   copy_chan[i] = 
dma_request_channel(copy_mask, NULL, NULL);
+   }
+   if (!copy_chan[i]) {
+   pr_err("%s: cannot grab channel: %d\n", 
__func__, i);
+   continue;
+   }
+
+   copy_dev[i] = copy_chan[i]->device;
+
+   if (!copy_dev[i]) {
+   pr_err("%s: no device: %d\n", __func__, 
i);
+   continue;
+ 

  1   2   3   4   5   6   7   8   9   10   >