[PATCH bpf 3/3] selftests: bpf: add tests for shifts by zero

2019-06-28 Thread Luke Nelson
There are currently no tests for ALU64 shift operations when the shift
amount is 0. This adds 6 new tests to make sure they are equivalent
to a no-op. The x32 JIT had such bugs that could have been caught by
these tests.

Cc: Xi Wang 
Signed-off-by: Luke Nelson 
---
 .../selftests/bpf/verifier/basic_instr.c  | 85 +++
 1 file changed, 85 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/basic_instr.c 
b/tools/testing/selftests/bpf/verifier/basic_instr.c
index ed91a7b9a456..071dbc889e8c 100644
--- a/tools/testing/selftests/bpf/verifier/basic_instr.c
+++ b/tools/testing/selftests/bpf/verifier/basic_instr.c
@@ -90,6 +90,91 @@
},
.result = ACCEPT,
 },
+{
+   "lsh64 by 0 imm",
+   .insns = {
+   BPF_LD_IMM64(BPF_REG_0, 1),
+   BPF_LD_IMM64(BPF_REG_1, 1),
+   BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 0),
+   BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 1, 1),
+   BPF_MOV64_IMM(BPF_REG_0, 2),
+   BPF_EXIT_INSN(),
+   },
+   .result = ACCEPT,
+   .retval = 1,
+},
+{
+   "rsh64 by 0 imm",
+   .insns = {
+   BPF_LD_IMM64(BPF_REG_0, 1),
+   BPF_LD_IMM64(BPF_REG_1, 0x1LL),
+   BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1),
+   BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 0),
+   BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1),
+   BPF_MOV64_IMM(BPF_REG_0, 2),
+   BPF_EXIT_INSN(),
+   },
+   .result = ACCEPT,
+   .retval = 1,
+},
+{
+   "arsh64 by 0 imm",
+   .insns = {
+   BPF_LD_IMM64(BPF_REG_0, 1),
+   BPF_LD_IMM64(BPF_REG_1, 0x1LL),
+   BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1),
+   BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 0),
+   BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1),
+   BPF_MOV64_IMM(BPF_REG_0, 2),
+   BPF_EXIT_INSN(),
+   },
+   .result = ACCEPT,
+   .retval = 1,
+},
+{
+   "lsh64 by 0 reg",
+   .insns = {
+   BPF_LD_IMM64(BPF_REG_0, 1),
+   BPF_LD_IMM64(BPF_REG_1, 1),
+   BPF_LD_IMM64(BPF_REG_2, 0),
+   BPF_ALU64_REG(BPF_LSH, BPF_REG_1, BPF_REG_2),
+   BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 1, 1),
+   BPF_MOV64_IMM(BPF_REG_0, 2),
+   BPF_EXIT_INSN(),
+   },
+   .result = ACCEPT,
+   .retval = 1,
+},
+{
+   "rsh64 by 0 reg",
+   .insns = {
+   BPF_LD_IMM64(BPF_REG_0, 1),
+   BPF_LD_IMM64(BPF_REG_1, 0x1LL),
+   BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1),
+   BPF_LD_IMM64(BPF_REG_3, 0),
+   BPF_ALU64_REG(BPF_RSH, BPF_REG_1, BPF_REG_3),
+   BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1),
+   BPF_MOV64_IMM(BPF_REG_0, 2),
+   BPF_EXIT_INSN(),
+   },
+   .result = ACCEPT,
+   .retval = 1,
+},
+{
+   "arsh64 by 0 reg",
+   .insns = {
+   BPF_LD_IMM64(BPF_REG_0, 1),
+   BPF_LD_IMM64(BPF_REG_1, 0x1LL),
+   BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1),
+   BPF_LD_IMM64(BPF_REG_3, 0),
+   BPF_ALU64_REG(BPF_ARSH, BPF_REG_1, BPF_REG_3),
+   BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1),
+   BPF_MOV64_IMM(BPF_REG_0, 2),
+   BPF_EXIT_INSN(),
+   },
+   .result = ACCEPT,
+   .retval = 1,
+},
 {
"invalid 64-bit BPF_END",
.insns = {
-- 
2.20.1



[PATCH bpf 2/3] bpf, x32: Fix bug with ALU64 {LSH,RSH,ARSH} BPF_K shift by 0

2019-06-28 Thread Luke Nelson
The current x32 BPF JIT does not correctly compile shift operations when
the immediate shift amount is 0. The expected behavior is for this to
be a no-op.

The following program demonstrates the bug. The expected result is 1,
but the current JITed code returns 2.

  r0 = 1
  r1 = 1
  r1 <<= 0
  if r1 == 1 goto end
  r0 = 2
end:
  exit

This patch simplifies the code and fixes the bug.

Fixes: 03f5781be2c7 ("bpf, x86_32: add eBPF JIT compiler for ia32")
Co-developed-by: Xi Wang 
Signed-off-by: Xi Wang 
Signed-off-by: Luke Nelson 
---
 arch/x86/net/bpf_jit_comp32.c | 63 ---
 1 file changed, 6 insertions(+), 57 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index f34ef513f4f9..1d12d2174085 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -894,27 +894,10 @@ static inline void emit_ia32_lsh_i64(const u8 dst[], 
const u32 val,
}
/* Do LSH operation */
if (val < 32) {
-   /* shl dreg_hi,imm8 */
-   EMIT3(0xC1, add_1reg(0xE0, dreg_hi), val);
-   /* mov ebx,dreg_lo */
-   EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX));
+   /* shld dreg_hi,dreg_lo,imm8 */
+   EMIT4(0x0F, 0xA4, add_2reg(0xC0, dreg_hi, dreg_lo), val);
/* shl dreg_lo,imm8 */
EMIT3(0xC1, add_1reg(0xE0, dreg_lo), val);
-
-   /* IA32_ECX = 32 - val */
-   /* mov ecx,val */
-   EMIT2(0xB1, val);
-   /* movzx ecx,ecx */
-   EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
-   /* neg ecx */
-   EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
-   /* add ecx,32 */
-   EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
-
-   /* shr ebx,cl */
-   EMIT2(0xD3, add_1reg(0xE8, IA32_EBX));
-   /* or dreg_hi,ebx */
-   EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX));
} else if (val >= 32 && val < 64) {
u32 value = val - 32;
 
@@ -960,27 +943,10 @@ static inline void emit_ia32_rsh_i64(const u8 dst[], 
const u32 val,
 
/* Do RSH operation */
if (val < 32) {
-   /* shr dreg_lo,imm8 */
-   EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val);
-   /* mov ebx,dreg_hi */
-   EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+   /* shrd dreg_lo,dreg_hi,imm8 */
+   EMIT4(0x0F, 0xAC, add_2reg(0xC0, dreg_lo, dreg_hi), val);
/* shr dreg_hi,imm8 */
EMIT3(0xC1, add_1reg(0xE8, dreg_hi), val);
-
-   /* IA32_ECX = 32 - val */
-   /* mov ecx,val */
-   EMIT2(0xB1, val);
-   /* movzx ecx,ecx */
-   EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
-   /* neg ecx */
-   EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
-   /* add ecx,32 */
-   EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
-
-   /* shl ebx,cl */
-   EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
-   /* or dreg_lo,ebx */
-   EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
} else if (val >= 32 && val < 64) {
u32 value = val - 32;
 
@@ -1025,27 +991,10 @@ static inline void emit_ia32_arsh_i64(const u8 dst[], 
const u32 val,
}
/* Do RSH operation */
if (val < 32) {
-   /* shr dreg_lo,imm8 */
-   EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val);
-   /* mov ebx,dreg_hi */
-   EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+   /* shrd dreg_lo,dreg_hi,imm8 */
+   EMIT4(0x0F, 0xAC, add_2reg(0xC0, dreg_lo, dreg_hi), val);
/* ashr dreg_hi,imm8 */
EMIT3(0xC1, add_1reg(0xF8, dreg_hi), val);
-
-   /* IA32_ECX = 32 - val */
-   /* mov ecx,val */
-   EMIT2(0xB1, val);
-   /* movzx ecx,ecx */
-   EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
-   /* neg ecx */
-   EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
-   /* add ecx,32 */
-   EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
-
-   /* shl ebx,cl */
-   EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
-   /* or dreg_lo,ebx */
-   EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
} else if (val >= 32 && val < 64) {
u32 value = val - 32;
 
-- 
2.20.1



[PATCH bpf 1/3] bpf, x32: Fix bug with ALU64 {LSH,RSH,ARSH} BPF_X shift by 0

2019-06-28 Thread Luke Nelson
The current x32 BPF JIT for shift operations is not correct when the
shift amount in a register is 0. The expected behavior is a no-op, whereas
the current implementation changes bits in the destination register.

The following example demonstrates the bug. The expected result of this
program is 1, but the current JITed code returns 2.

  r0 = 1
  r1 = 1
  r2 = 0
  r1 <<= r2
  if r1 == 1 goto end
  r0 = 2
end:
  exit

The bug is caused by an incorrect assumption by the JIT that a shift by
32 clears the register. On x32, however, shifts use only the lower 5 bits
of the source, making a shift by 32 equivalent to a shift by 0.

This patch fixes the bug using double-precision shifts, which also
simplifies the code.
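
To make the failure mode concrete, the following C sketch (illustrative
only, not the JIT code itself; the function names and the split into
uint32_t halves are assumptions made for this example) shows how a 64-bit
left shift is assembled from 32-bit halves. The removed scheme filled the
high half with lo >> (32 - val); for val == 0 that becomes a shift by 32,
which x86 truncates to a shift by 0, so the low half gets OR'ed into the
high half. The double-precision shift has no such edge case.

#include <stdint.h>

/* Mimics the removed code path; "& 31" models x86 masking the count. */
static uint64_t lsh64_old(uint32_t lo, uint32_t hi, uint32_t val)
{
	uint32_t new_hi = (hi << val) | (lo >> ((32 - val) & 31));
	uint32_t new_lo = lo << val;

	return ((uint64_t)new_hi << 32) | new_lo;	/* wrong for val == 0 */
}

/* Models the SHLD-based replacement: val == 0 leaves both halves intact. */
static uint64_t lsh64_shld(uint32_t lo, uint32_t hi, uint32_t val)
{
	uint32_t new_hi = val ? (hi << val) | (lo >> (32 - val)) : hi;
	uint32_t new_lo = lo << val;

	return ((uint64_t)new_hi << 32) | new_lo;
}

With lo = 1, hi = 0 and val = 0 (the program above), lsh64_old() returns
0x100000001 instead of 1, which is exactly why the JEQ fails and r0 ends
up as 2.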

Fixes: 03f5781be2c7 ("bpf, x86_32: add eBPF JIT compiler for ia32")
Co-developed-by: Xi Wang 
Signed-off-by: Xi Wang 
Signed-off-by: Luke Nelson 
---
 arch/x86/net/bpf_jit_comp32.c | 221 --
 1 file changed, 23 insertions(+), 198 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index b29e82f190c7..f34ef513f4f9 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -724,9 +724,6 @@ static inline void emit_ia32_lsh_r64(const u8 dst[], const 
u8 src[],
 {
u8 *prog = *pprog;
int cnt = 0;
-   static int jmp_label1 = -1;
-   static int jmp_label2 = -1;
-   static int jmp_label3 = -1;
u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
 
@@ -745,79 +742,23 @@ static inline void emit_ia32_lsh_r64(const u8 dst[], 
const u8 src[],
/* mov ecx,src_lo */
EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
 
-   /* cmp ecx,32 */
-   EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
-   /* Jumps when >= 32 */
-   if (is_imm8(jmp_label(jmp_label1, 2)))
-   EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
-   else
-   EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
-
-   /* < 32 */
-   /* shl dreg_hi,cl */
-   EMIT2(0xD3, add_1reg(0xE0, dreg_hi));
-   /* mov ebx,dreg_lo */
-   EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX));
+   /* shld dreg_hi,dreg_lo,cl */
+   EMIT3(0x0F, 0xA5, add_2reg(0xC0, dreg_hi, dreg_lo));
/* shl dreg_lo,cl */
EMIT2(0xD3, add_1reg(0xE0, dreg_lo));
 
-   /* IA32_ECX = -IA32_ECX + 32 */
-   /* neg ecx */
-   EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
-   /* add ecx,32 */
-   EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
-
-   /* shr ebx,cl */
-   EMIT2(0xD3, add_1reg(0xE8, IA32_EBX));
-   /* or dreg_hi,ebx */
-   EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX));
-
-   /* goto out; */
-   if (is_imm8(jmp_label(jmp_label3, 2)))
-   EMIT2(0xEB, jmp_label(jmp_label3, 2));
-   else
-   EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
-
-   /* >= 32 */
-   if (jmp_label1 == -1)
-   jmp_label1 = cnt;
+   /* if ecx >= 32, mov dreg_lo into dreg_hi and clear dreg_lo */
 
-   /* cmp ecx,64 */
-   EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
-   /* Jumps when >= 64 */
-   if (is_imm8(jmp_label(jmp_label2, 2)))
-   EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
-   else
-   EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+   /* cmp ecx,32 */
+   EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+   /* skip the next two instructions (4 bytes) when < 32 */
+   EMIT2(IA32_JB, 4);
 
-   /* >= 32 && < 64 */
-   /* sub ecx,32 */
-   EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
-   /* shl dreg_lo,cl */
-   EMIT2(0xD3, add_1reg(0xE0, dreg_lo));
/* mov dreg_hi,dreg_lo */
EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
-
/* xor dreg_lo,dreg_lo */
EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
 
-   /* goto out; */
-   if (is_imm8(jmp_label(jmp_label3, 2)))
-   EMIT2(0xEB, jmp_label(jmp_label3, 2));
-   else
-   EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
-
-   /* >= 64 */
-   if (jmp_label2 == -1)
-   jmp_label2 = cnt;
-   /* xor dreg_lo,dreg_lo */
-   EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
-   /* xor dreg_hi,dreg_hi */
-   EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
-
-   if (jmp_label3 == -1)
-   jmp_label3 = cnt;
-
if (dstk) {
/* mov dword ptr [ebp+off],dreg_lo */
EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
@@ -836,9 +777,6 @@ static inline void emit_ia32_arsh_r64(const u8 dst[], const 
u8 src[],
 {
u8 *prog = *pprog;
int cnt = 0;
-   static int jmp_label1 = -1;
-   static int jmp_label2 = -1;
-   static int jmp_label3 = -1;
u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
 
@@ -857,78 +795,22 @@ static inline void emit_ia32_arsh_r64(const 

[PATCH] csky: Improve abiv1 mem ops performance with glibc codes

2019-06-28 Thread guoren
From: Guo Ren 

This code is copied from the glibc string/ directory; it is the generic
implementation of these string operations. We may further optimize it with
assembly code in the future.

In fact this code isn't tested much in the kernel yet, but it has been
tested in glibc and it seems good. We just trust it :)

Signed-off-by: Guo Ren 
Cc: Arnd Bergmann 
---
 arch/csky/abiv1/Makefile |   6 +
 arch/csky/abiv1/inc/abi/string.h |  15 ++
 arch/csky/abiv1/memcmp.c | 310 ++
 arch/csky/abiv1/memcopy.h| 179 ++
 arch/csky/abiv1/memmove.c|  93 +
 arch/csky/abiv1/memset.c |  71 +++
 arch/csky/abiv1/strcpy.c |  17 ++
 arch/csky/abiv1/strksyms.c   |   5 +
 arch/csky/abiv1/strlen.c |  89 +
 arch/csky/abiv1/wordcopy.c   | 397 +++
 10 files changed, 1182 insertions(+)
 create mode 100644 arch/csky/abiv1/memcmp.c
 create mode 100644 arch/csky/abiv1/memcopy.h
 create mode 100644 arch/csky/abiv1/memmove.c
 create mode 100644 arch/csky/abiv1/memset.c
 create mode 100644 arch/csky/abiv1/strcpy.c
 create mode 100644 arch/csky/abiv1/strlen.c
 create mode 100644 arch/csky/abiv1/wordcopy.c

diff --git a/arch/csky/abiv1/Makefile b/arch/csky/abiv1/Makefile
index b8a7c2a..60b60fe 100644
--- a/arch/csky/abiv1/Makefile
+++ b/arch/csky/abiv1/Makefile
@@ -5,3 +5,9 @@ obj-y   += cacheflush.o
 obj-y  += mmap.o
 obj-y  += memcpy.o
 obj-y  += strksyms.o
+obj-y  += memcmp.o
+obj-y  += memset.o
+obj-y  += memmove.o
+obj-y  += strcpy.o
+obj-y  += strlen.o
+obj-y  += wordcopy.o
diff --git a/arch/csky/abiv1/inc/abi/string.h b/arch/csky/abiv1/inc/abi/string.h
index 0cd4338..a62d7a0 100644
--- a/arch/csky/abiv1/inc/abi/string.h
+++ b/arch/csky/abiv1/inc/abi/string.h
@@ -4,7 +4,22 @@
 #ifndef __ABI_CSKY_STRING_H
 #define __ABI_CSKY_STRING_H
 
+#define __HAVE_ARCH_MEMCMP
+extern int memcmp(const void *, const void *, __kernel_size_t);
+
 #define __HAVE_ARCH_MEMCPY
 extern void *memcpy(void *, const void *, __kernel_size_t);
 
+#define __HAVE_ARCH_MEMMOVE
+extern void *memmove(void *, const void *, __kernel_size_t);
+
+#define __HAVE_ARCH_MEMSET
+extern void *memset(void *, int,  __kernel_size_t);
+
+#define __HAVE_ARCH_STRCPY
+extern char *strcpy(char *, const char *);
+
+#define __HAVE_ARCH_STRLEN
+extern __kernel_size_t strlen(const char *);
+
 #endif /* __ABI_CSKY_STRING_H */
diff --git a/arch/csky/abiv1/memcmp.c b/arch/csky/abiv1/memcmp.c
new file mode 100644
index 000..766c5f5
--- /dev/null
+++ b/arch/csky/abiv1/memcmp.c
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 1991-2018 Free Software Foundation, Inc.
+
+#include "memcopy.h"
+
+#undef memcmp
+
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+
+# define __THROW
+
+# ifndef WORDS_BIGENDIAN
+#  define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2)))
+# else
+#  define MERGE(w0, sh_1, w1, sh_2) (((w0) << (sh_1)) | ((w1) >> (sh_2)))
+# endif
+
+#ifdef WORDS_BIGENDIAN
+# define CMP_LT_OR_GT(a, b) ((a) > (b) ? 1 : -1)
+#else
+# define CMP_LT_OR_GT(a, b) memcmp_bytes ((a), (b))
+#endif
+
+/* BE VERY CAREFUL IF YOU CHANGE THIS CODE!  */
+
+/* The strategy of this memcmp is:
+
+   1. Compare bytes until one of the block pointers is aligned.
+
+   2. Compare using memcmp_common_alignment or
+  memcmp_not_common_alignment, regarding the alignment of the other
+  block after the initial byte operations.  The maximum number of
+  full words (of type op_t) are compared in this way.
+
+   3. Compare the few remaining bytes.  */
+
+#ifndef WORDS_BIGENDIAN
+/* memcmp_bytes -- Compare A and B bytewise in the byte order of the machine.
+   A and B are known to be different.
+   This is needed only on little-endian machines.  */
+
+static int memcmp_bytes (op_t, op_t) __THROW;
+
+static int
+memcmp_bytes (op_t a, op_t b)
+{
+  long int srcp1 = (long int) &a;
+  long int srcp2 = (long int) &b;
+  op_t a0, b0;
+
+  do
+{
+  a0 = ((byte *) srcp1)[0];
+  b0 = ((byte *) srcp2)[0];
+  srcp1 += 1;
+  srcp2 += 1;
+}
+  while (a0 == b0);
+  return a0 - b0;
+}
+#endif
+
+static int memcmp_common_alignment (long, long, size_t) __THROW;
+
+/* memcmp_common_alignment -- Compare blocks at SRCP1 and SRCP2 with LEN `op_t'
+   objects (not LEN bytes!).  Both SRCP1 and SRCP2 should be aligned for
+   memory operations on `op_t's.  */
+static int
+memcmp_common_alignment (long int srcp1, long int srcp2, size_t len)
+{
+  op_t a0, a1;
+  op_t b0, b1;
+
+  switch (len % 4)
+{
+default: /* Avoid warning about uninitialized local variables.  */
+case 2:
+  a0 = 

[PATCH] perf: Remove duplicate headers

2019-06-28 Thread Souptick Joarder
Removed duplicate headers which are included twice.

Signed-off-by: Souptick Joarder 
Reviewed-by: Mukesh Ojha 
---
 tools/perf/util/data.c | 1 -
 tools/perf/util/get_current_dir_name.c | 1 -
 tools/perf/util/stat-display.c | 1 -
 3 files changed, 3 deletions(-)

diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
index 6a64f71..509a41e 100644
--- a/tools/perf/util/data.c
+++ b/tools/perf/util/data.c
@@ -8,7 +8,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 #include "data.h"
diff --git a/tools/perf/util/get_current_dir_name.c 
b/tools/perf/util/get_current_dir_name.c
index 267aa60..ebb80cd 100644
--- a/tools/perf/util/get_current_dir_name.c
+++ b/tools/perf/util/get_current_dir_name.c
@@ -5,7 +5,6 @@
 #include "util.h"
 #include 
 #include 
-#include 
 
 /* Android's 'bionic' library, for one, doesn't have this */
 
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 6d043c7..7b3a16c 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -12,7 +12,6 @@
 #include "string2.h"
 #include "sane_ctype.h"
 #include "cgroup.h"
-#include 
 #include 
 
 #define CNTR_NOT_SUPPORTED ""
-- 
1.9.1



shrink_dentry_list() logics change (was Re: [RFC PATCH v3 14/15] dcache: Implement partial shrink via Slab Movable Objects)

2019-06-28 Thread Al Viro
On Sat, Jun 29, 2019 at 05:08:44AM +0100, Al Viro wrote:
> > The reason we don't hit that problem with regular memory shrinker is
> > this:
> > unregister_shrinker(&s->s_shrink);
> > fs->kill_sb(s);
> > in deactivate_locked_super().  IOW, shrinker for this fs is gone
> > before we get around to shutdown.  And so are all normal sources
> > of dentry eviction for that fs.
> > 
> > Your earlier variants all suffer the same problem - picking a page
> > shared by dentries from several superblocks can run into trouble
> > if it overlaps with umount of one of those.

PS: the problem is not gone in the next iteration of the patchset in
question.  The patch I'm proposing (including dput_to_list() and _ONLY_
compile-tested) follows.  Comments?

diff --git a/fs/dcache.c b/fs/dcache.c
index 8136bda27a1f..dfe21a649c96 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -860,6 +860,32 @@ void dput(struct dentry *dentry)
 }
 EXPORT_SYMBOL(dput);
 
+static void __dput_to_list(struct dentry *dentry, struct list_head *list)
+__must_hold(&dentry->d_lock)
+{
+   if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+   /* let the owner of the list it's on deal with it */
+   --dentry->d_lockref.count;
+   } else {
+   if (dentry->d_flags & DCACHE_LRU_LIST)
+   d_lru_del(dentry);
+   if (!--dentry->d_lockref.count)
+   d_shrink_add(dentry, list);
+   }
+}
+
+void dput_to_list(struct dentry *dentry, struct list_head *list)
+{
+   rcu_read_lock();
+   if (likely(fast_dput(dentry))) {
+   rcu_read_unlock();
+   return;
+   }
+   rcu_read_unlock();
+   if (!retain_dentry(dentry))
+   __dput_to_list(dentry, list);
+   spin_unlock(&dentry->d_lock);
+}
 
 /* This must be called with d_lock held */
 static inline void __dget_dlock(struct dentry *dentry)
@@ -1088,18 +1114,9 @@ static void shrink_dentry_list(struct list_head *list)
rcu_read_unlock();
d_shrink_del(dentry);
parent = dentry->d_parent;
+   if (parent != dentry)
+   __dput_to_list(parent, list);
__dentry_kill(dentry);
-   if (parent == dentry)
-   continue;
-   /*
-* We need to prune ancestors too. This is necessary to prevent
-* quadratic behavior of shrink_dcache_parent(), but is also
-* expected to be beneficial in reducing dentry cache
-* fragmentation.
-*/
-   dentry = parent;
-   while (dentry && !lockref_put_or_lock(&dentry->d_lockref))
-   dentry = dentry_kill(dentry);
}
 }
 


Re: [RFC PATCH v3 14/15] dcache: Implement partial shrink via Slab Movable Objects

2019-06-28 Thread Al Viro
On Thu, Apr 11, 2019 at 10:02:00PM +0100, Al Viro wrote:

> Aaaarrgghhh...  No, we can't.  Look: we get one candidate dentry in isolate
> phase.  We put it into shrink list.  umount(2) comes and calls
> shrink_dcache_for_umount(), which calls shrink_dcache_parent(root).
> In the meanwhile, shrink_dentry_list() is run and does __dentry_kill() on
> that one dentry.  Fine, it's gone - before shrink_dcache_parent() even
> sees it.  Now shrink_dentry_list() holds a reference to its parent and
> is about to drop it in
> dentry = parent;
> while (dentry && !lockref_put_or_lock(&dentry->d_lockref))
> dentry = dentry_kill(dentry);
> And dropped it will be, but... shrink_dcache_parent() has finished the
> scan, without finding *anything* with zero refcount - the thing that used
> to be on the shrink list was already gone before shrink_dcache_parent()
> has gotten there and the reference to parent was not dropped yet.  So
> shrink_dcache_for_umount() plows past shrink_dcache_parent(), walks the
> tree and complains loudly about "busy" dentries (that parent we hadn't
> finished dropping), and then we proceed with filesystem shutdown.
> In the meanwhile, dentry_kill() finally gets to killing dentry and
> triggers an unexpected late call of ->d_iput() on a filesystem that
> has already been far enough into shutdown - far enough to destroy the
> data structures needed for that sucker.
> 
> The reason we don't hit that problem with regular memory shrinker is
> this:
> > unregister_shrinker(&s->s_shrink);
> fs->kill_sb(s);
> in deactivate_locked_super().  IOW, shrinker for this fs is gone
> before we get around to shutdown.  And so are all normal sources
> of dentry eviction for that fs.
> 
> Your earlier variants all suffer the same problem - picking a page
> shared by dentries from several superblocks can run into trouble
> if it overlaps with umount of one of those.

FWIW, I think I see a kinda-sorta sane solution.  Namely, add

static void __dput_to_list(struct dentry *dentry, struct list_head *list)
{
if (dentry->d_flags & DCACHE_SHRINK_LIST) {
/* let the owner of the list it's on deal with it */
--dentry->d_lockref.count;
} else {
if (dentry->d_flags & DCACHE_LRU_LIST)
d_lru_del(dentry);
if (!--dentry->d_lockref.count)
d_shrink_add(parent, list);
}
}

and have
shrink_dentry_list() do this in the end of loop:
d_shrink_del(dentry);
parent = dentry->d_parent;
/* both dentry and parent are locked at that point */
if (parent != dentry) {
/*
 * We need to prune ancestors too. This is necessary to
 * prevent quadratic behavior of shrink_dcache_parent(),
 * but is also expected to be beneficial in reducing
 * dentry cache fragmentation.
 */
__dput_to_list(parent, list);
}
__dentry_kill(dentry);
}

instead of
d_shrink_del(dentry);
parent = dentry->d_parent;
__dentry_kill(dentry);
if (parent == dentry)
continue;
/*
 * We need to prune ancestors too. This is necessary to prevent
 * quadratic behavior of shrink_dcache_parent(), but is also
 * expected to be beneficial in reducing dentry cache
 * fragmentation.
 */
dentry = parent;
while (dentry && !lockref_put_or_lock(&dentry->d_lockref))
dentry = dentry_kill(dentry);
}
we have there now.  Linus, do you see any problems with that change?  AFAICS,
that should avoid the problem described above.  Moreover, it seems to allow
a fun API addition:

void dput_to_list(struct dentry *dentry, struct list_head *list)
{
rcu_read_lock();
if (likely(fast_dput(dentry))) {
rcu_read_unlock();
return;
}
rcu_read_unlock();
if (!retain_dentry(dentry))
__dput_to_list(dentry, list);
spin_unlock(&dentry->d_lock);
}

allowing to take an empty list, do a bunch of dput_to_list() (under spinlocks,
etc.), then, once we are in better locking conditions, shrink_dentry_list()
to take them all out.  I can see applications for that in e.g. fs/namespace.c -
quite a bit of kludges with ->mnt_ex_mountpoint would be killable that way,
and there would be a chance to transfer the contribution to ->d_count of
mountpoint from struct mount to struct mountpoint (i.e. make any number of
mounts on the same mountpoint dentry contribute only 1 to its ->d_count,
not the number of such mounts).
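
Roughly, the intended usage pattern would look like this (illustrative
only; the dentry names are placeholders, not code from the patch):

	LIST_HEAD(victims);

	/* while we still hold whatever spinlocks forced us to defer */
	dput_to_list(old_root, &victims);
	dput_to_list(old_pwd, &victims);

	/* later, once the locking conditions are better */
	shrink_dentry_list(&victims);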


Re: [PATCH v8 3/7] cpu-topology: Move cpu topology code to common code.

2019-06-28 Thread Hanjun Guo
On 2019/6/28 3:52, Atish Patra wrote:
> Both RISC-V & ARM64 are using cpu-map device tree to describe
> their cpu topology. It's better to move the relevant code to
> a common place instead of duplicate code.
> 
> To: Will Deacon 
> To: Catalin Marinas 

Using Cc: is better.

> Signed-off-by: Atish Patra 
> [Tested on QDF2400]
> Tested-by: Jeffrey Hugo 
> [Tested on Juno and other embedded platforms.]
> Tested-by: Sudeep Holla 
> Reviewed-by: Sudeep Holla 
> Acked-by: Will Deacon 
> Acked-by: Greg Kroah-Hartman 
> ---
>  arch/arm64/include/asm/topology.h |  23 ---
>  arch/arm64/kernel/topology.c  | 303 +-
>  drivers/base/arch_topology.c  | 296 +
>  include/linux/arch_topology.h |  28 +++
>  include/linux/topology.h  |   1 +
>  5 files changed, 329 insertions(+), 322 deletions(-)

Tested on Kunpeng920 ARM64 server, works good,

# lscpu
Architecture:aarch64
Byte Order:  Little Endian
CPU(s):  96
On-line CPU(s) list: 0-95
Thread(s) per core:  1
Core(s) per socket:  48
Socket(s):   2
NUMA node(s):4
Vendor ID:   0x48
Model:   0
Stepping:0x1
CPU max MHz: 2600.
CPU min MHz: 260.
BogoMIPS:200.00
L1d cache:   64K
L1i cache:   64K
L2 cache:512K
L3 cache:32768K
NUMA node0 CPU(s):   0-23
NUMA node1 CPU(s):   24-47
NUMA node2 CPU(s):   48-71
NUMA node3 CPU(s):   72-95
Flags:   fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp 
asimdhp cpuid asimdrdm jscvt fcma dcpop asimddp asimdfhm

Tested-by: Hanjun Guo 

For the ACPI code,

Acked-by: Hanjun Guo 

Thanks
Hanjun



Re: [PATCH v7 0/3] media: v4l2-subdev: Verify arguments in v4l2_subdev_call()

2019-06-28 Thread Niklas Söderlund
Hi,

This patch breaks rcar-vin. I'm sorry I did not find out before it was 
merged as a8fa55078a7784a9 ("media: v4l2-subdev: Verify arguments in 
v4l2_subdev_call()").

The problem is that rcar-vin calls enum_mbus_code in its bound callback.
At that point call_enum_mbus_code() is invoked, which then calls
check_pad(). Since sd->entity.graph_obj.mdev is not yet set, the
check if (pad > 0) fails and the binding of the subdevice in rcar-vin
fails.

I'm not sure how to best solve this, suggestions are appreciated. I see 
two options, move the call to enum_mbus_code from the bound to the 
complete callback or make sure the mdev is associated with the subdev 
before the bound callback is invoked. I don't like the former as I think 
the complete callback should be removed ;-)

On 2019-05-20 23:27:44 +0200, Janusz Krzysztofik wrote:
> Correctness of format type (try or active) and pad ID parameters passed
> to subdevice operation callbacks is now verified only for IOCTL calls.
> However, those callbacks are also used by drivers, e.g., V4L2 host
> interfaces.
> 
> Since both subdev_do_ioctl() and drivers are using v4l2_subdev_call()
> macro while calling subdevice operations, move those parameter checks
> from subdev_do_ioctl() to v4l2_subdev_call().  Also, add check for
> non-NULL pointers, including pad config if V4L2_SUBDEV_FORMAT_TRY is
> requested.
> 
> Having that done, we can avoid taking care of those checks inside
> drivers.
> 
> Janusz Krzysztofik (3):
>   media: v4l2-subdev: Verify arguments in v4l2_subdev_call()
>   media: v4l2-subdev: Verify v4l2_subdev_call() pointer arguments
>   media: v4l2-subdev: Verify v4l2_subdev_call() pad config argument
> 
>  drivers/media/v4l2-core/v4l2-subdev.c | 268 +-
>  include/media/v4l2-subdev.h   |   6 +
>  2 files changed, 188 insertions(+), 86 deletions(-)
> 
> Changelog:
> v6->v7:
> Changes suggested by Sakari - thanks!
> - never succeed pad check on media entities with pad_num == 0,
> - allow pad 0 on subdevies not registered as media entities.
> 
> v5->v6:
> - rename wrappers to call_something() as suggested by Sakari - thanks!
> - make check_ functions inline - also on Sakari's suggestion, thanks!
> - drop patch 2/4 and remove WARN_ONs from remaining patches to avoid
>   kernel WARNs on non-kernel bugs - thanks Hans for pointing this out!
> 
> v4->v5:
> - a few coding style and code formatting changes,
> - require CONFIG_MEDIA_CONTROLLER, not CONFIG_VIDEO_V4L2_SUBDEV_API,
>   for a valid pad ID check,
> - perform pad ID check only if at least one pad is configured so
>   drivers which don't configure pads are not affected if built with
>   CONFIG_MEDIA_CONTROLLER defined,
> - issue kernel warnings on invalid parameters (new patch - 2/4),
> - validate pointers before using them (new patch - 3/4).
> 
> v3->v4:
> - fix 'struct' keyword missing from patch 2/2,
> - fix checkpatch reported style issue in patch 2/2
> Sorry for that.
> 
> v2->v3:
> - add patch 2/2 with pad config check,
> - adjust continuation line alignments in patch 1/2 to match those
>   used in 2/2.
> 
> v1->v2:
> - replace the horrible macro with a structure of wrapper functions;
>   inspired by Hans' and Sakari's comments - thanks!
> 
> -- 
> 2.21.0
> 


[RFC PATCH v2 3/3] arm64: Add CPU hotplug support

2019-06-28 Thread Xiongfeng Wang
To support CPU hotplug, we need to implement 'acpi_(un)map_cpu()' and
'arch_(un)register_cpu()' for ARM64. These functions are called in
'acpi_processor_hotadd_init()/acpi_processor_remove()' when the CPU is hot
added into or hot removed from the system.

Signed-off-by: Xiongfeng Wang 
---
 arch/arm64/kernel/acpi.c  | 22 ++
 arch/arm64/kernel/setup.c | 17 +
 2 files changed, 39 insertions(+)

diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index 2804330..57835fa 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -25,6 +25,7 @@
 #include 
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -284,3 +285,24 @@ int apei_claim_sea(struct pt_regs *regs)
 
return err;
 }
+
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
+int *pcpu)
+{
+   int cpu;
+
+   cpu = acpi_map_cpuid(physid, acpi_id);
+   *pcpu = cpu;
+   set_cpu_present(cpu, true);
+
+   return 0;
+}
+EXPORT_SYMBOL(acpi_map_cpu);
+
+int acpi_unmap_cpu(int cpu)
+{
+   set_cpu_present(cpu, false);
+
+   return 0;
+}
+EXPORT_SYMBOL(acpi_unmap_cpu);
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 7f4d12a..f2a881e 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -398,3 +398,20 @@ static int __init register_kernel_offset_dumper(void)
return 0;
 }
 __initcall(register_kernel_offset_dumper);
+
+int arch_register_cpu(int num)
+{
+   struct cpu *cpu = &per_cpu(cpu_data.cpu, num);
+
+   cpu->hotpluggable = 1;
+   return register_cpu(cpu, num);
+}
+EXPORT_SYMBOL(arch_register_cpu);
+
+void arch_unregister_cpu(int num)
+{
+   struct cpu *cpu = &per_cpu(cpu_data.cpu, num);
+
+   unregister_cpu(cpu);
+}
+EXPORT_SYMBOL(arch_unregister_cpu);
-- 
1.7.12.4



[RFC PATCH v2 1/3] ACPI / scan: evaluate _STA for processors declared via ASL Device statement

2019-06-28 Thread Xiongfeng Wang
When we scan all the ACPI namespace nodes in
acpi_scan_init()->acpi_bus_scan(), we evaluate the '_STA' method for
processor-type nodes to determine whether the device is present. But
processors can also be declared via an ASL Device statement. The ACPI 6.3
spec specifically says that the Processor statement is deprecated and a
Device statement should be used for processors. In that case,
acpi_object_type is ACPI_TYPE_DEVICE rather than ACPI_TYPE_PROCESSOR.

The current code doesn't evaluate '_STA' for nodes with ACPI_TYPE_DEVICE,
and the device status is set to 'present' by default. This patch gets the
device status from the '_STA' method for processors declared via an ASL
Device statement if they do have a '_STA' method.

Signed-off-by: Xiongfeng Wang 

---
I am not sure if I should set 'type' as ACPI_BUS_TYPE_PROCESSOR rather
than ACPI_BUS_TYPE_DEVICE for processors declared via ASL Device
statement.
---
 drivers/acpi/scan.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 0e28270..cec43f6 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 
+#include 
 #include 
 
 #include "internal.h"
@@ -1687,6 +1688,7 @@ static int acpi_bus_type_and_status(acpi_handle handle, 
int *type,
 {
acpi_status status;
acpi_object_type acpi_type;
+   struct acpi_device_info *info;
 
	status = acpi_get_type(handle, &acpi_type);
if (ACPI_FAILURE(status))
@@ -1699,6 +1701,16 @@ static int acpi_bus_type_and_status(acpi_handle handle, 
int *type,
return -ENODEV;
 
*type = ACPI_BUS_TYPE_DEVICE;
+
+   status = acpi_get_object_info(handle, &info);
+   if (ACPI_SUCCESS(status) && info->valid & ACPI_VALID_HID &&
+   !strcmp(info->hardware_id.string,
+   ACPI_PROCESSOR_DEVICE_HID)) {
+   status = acpi_bus_get_status_handle(handle, sta);
+   if (ACPI_SUCCESS(status))
+   break;
+   }
+
/*
 * acpi_add_single_object updates this once we've an acpi_device
 * so that acpi_bus_get_status' quirk handling can be used.
-- 
1.7.12.4



[RFC PATCH v2 2/3] arm64: mark all the GICC nodes in MADT as possible cpu

2019-06-28 Thread Xiongfeng Wang
We set 'cpu_possible_mask' based on the enabled GICC nodes in MADT. If
a GICC node is disabled, we skip initializing the kernel data
structures for that CPU.

To support CPU hotplug, we need to initialize some CPU-related data
structures in advance. This patch marks all the GICC nodes as possible
CPUs and only the enabled GICC nodes as present CPUs.

Signed-off-by: Xiongfeng Wang 
---
 arch/arm64/kernel/setup.c |  2 +-
 arch/arm64/kernel/smp.c   | 11 +--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 7e541f9..7f4d12a 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -359,7 +359,7 @@ static int __init topology_init(void)
for_each_online_node(i)
register_one_node(i);
 
-   for_each_possible_cpu(i) {
+   for_each_online_cpu(i) {
		struct cpu *cpu = &per_cpu(cpu_data.cpu, i);
cpu->hotpluggable = 1;
register_cpu(cpu, i);
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 6dcf960..6d9983c 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -525,16 +525,14 @@ struct acpi_madt_generic_interrupt 
*acpi_cpu_get_madt_gicc(int cpu)
 {
u64 hwid = processor->arm_mpidr;
 
-   if (!(processor->flags & ACPI_MADT_ENABLED)) {
-   pr_debug("skipping disabled CPU entry with 0x%llx MPIDR\n", 
hwid);
-   return;
-   }
-
if (hwid & ~MPIDR_HWID_BITMASK || hwid == INVALID_HWID) {
pr_err("skipping CPU entry with invalid MPIDR 0x%llx\n", hwid);
return;
}
 
+   if (!(processor->flags & ACPI_MADT_ENABLED))
+   pr_debug("disabled CPU entry with 0x%llx MPIDR\n", hwid);
+
if (is_mpidr_duplicate(cpu_count, hwid)) {
pr_err("duplicate CPU MPIDR 0x%llx in MADT\n", hwid);
return;
@@ -755,7 +753,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
if (err)
continue;
 
-   set_cpu_present(cpu, true);
+   if ((cpu_madt_gicc[cpu].flags & ACPI_MADT_ENABLED))
+   set_cpu_present(cpu, true);
numa_store_cpu_info(cpu);
}
 }
-- 
1.7.12.4



[RFC PATCH v2 0/3] Support CPU hotplug for ARM64

2019-06-28 Thread Xiongfeng Wang
This patchset marks all the GICC nodes in MADT as possible CPUs even if they
are disabled. Only the enabled GICC nodes are marked as present CPUs, so the
kernel will initialize some CPU-related data structures in advance, before the
CPU is actually hot-added into the system. This patchset also implements
'acpi_(un)map_cpu()' and 'arch_(un)register_cpu()' for ARM64. These functions
are needed to enable CPU hotplug.

To support CPU hotplug, we need to add all the possible GICC nodes to MADT,
including those CPUs that are not present but may be hot-added later. Those
CPUs are marked as disabled in their GICC nodes.

Changelog:

v1 -> v2:
rebase the third patch onto the latest kernel

Xiongfeng Wang (3):
  ACPI / scan: evaluate _STA for processors declared via ASL Device
statement
  arm64: mark all the GICC nodes in MADT as possible cpu
  arm64: Add CPU hotplug support

 arch/arm64/kernel/acpi.c  | 22 ++
 arch/arm64/kernel/setup.c | 19 ++-
 arch/arm64/kernel/smp.c   | 11 +--
 drivers/acpi/scan.c   | 12 
 4 files changed, 57 insertions(+), 7 deletions(-)

-- 
1.7.12.4



[PATCH -next] ASoC: madera: Remove duplicated include from cs47l35.c

2019-06-28 Thread YueHaibing
Remove duplicated include.

Signed-off-by: YueHaibing 
---
 sound/soc/codecs/cs47l35.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sound/soc/codecs/cs47l35.c b/sound/soc/codecs/cs47l35.c
index 511d0d6fa962..e3585c1dab3d 100644
--- a/sound/soc/codecs/cs47l35.c
+++ b/sound/soc/codecs/cs47l35.c
@@ -19,7 +19,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #include 
 #include 







Re: [PATCH v4] net: netfilter: Fix rpfilter dropping vrf packets by mistake

2019-06-28 Thread linmiaohe
On 6/29/19 1:05 AM, David Ahern wrote:
> On 6/28/19 3:06 AM, Miaohe Lin wrote:
> > diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c 
> > b/net/ipv6/netfilter/ip6t_rpfilter.c
> > index 6bcaf7357183..3c4a1772c15f 100644
> > --- a/net/ipv6/netfilter/ip6t_rpfilter.c
> > +++ b/net/ipv6/netfilter/ip6t_rpfilter.c
> > @@ -55,6 +55,10 @@ static bool rpfilter_lookup_reverse6(struct net *net, 
> > const struct sk_buff *skb,
> > if (rpfilter_addr_linklocal(&iph->saddr)) {
> > lookup_flags |= RT6_LOOKUP_F_IFACE;
> > fl6.flowi6_oif = dev->ifindex;
> > +   /* Set flowi6_oif for vrf devices to lookup route in l3mdev domain. */
> > +   } else if (netif_is_l3_master(dev) || netif_is_l3_slave(dev)) {
> > +   lookup_flags |= FLOWI_FLAG_SKIP_NH_OIF;
>
> you don't need to set that flag here. It is done by the fib_rules code as 
> needed.
>
You're right. The fib rules code would set the FLOWI_FLAG_SKIP_NH_OIF flag. But
I set it here to distinguish this branch from the flags & XT_RPFILTER_LOOSE
branch. Without it, the two branches do the same work and should perhaps be
combined; I would rather not do that, as it makes the code confusing.
Is the code snippet below OK? If so, I will delete the flag setting.

	} else if (netif_is_l3_master(dev) || netif_is_l3_slave(dev)) {
		fl6.flowi6_oif = dev->ifindex;
	} else if ((flags & XT_RPFILTER_LOOSE) == 0)
		fl6.flowi6_oif = dev->ifindex;


Re: BUG: unable to handle kernel paging request in __do_softirq

2019-06-28 Thread syzbot

syzbot has bisected this bug to:

commit e9db4ef6bf4ca9894bb324c76e01b8f1a16b2650
Author: John Fastabend 
Date:   Sat Jun 30 13:17:47 2018 +

bpf: sockhash fix omitted bucket lock in sock_close

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=16aac819a0
start commit:   29f785ff Merge branch 'fixes' of git://git.kernel.org/pub/..
git tree:   upstream
final crash:https://syzkaller.appspot.com/x/report.txt?x=15aac819a0
console output: https://syzkaller.appspot.com/x/log.txt?x=11aac819a0
kernel config:  https://syzkaller.appspot.com/x/.config?x=e5c77f8090a3b96b
dashboard link: https://syzkaller.appspot.com/bug?extid=0b224895cb9454584de1
syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=1076d132a0

Reported-by: syzbot+0b224895cb9454584...@syzkaller.appspotmail.com
Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close")

For information about bisection process see: https://goo.gl/tpsmEJ#bisection


Re: [PATCH RFC 0/3] Support CPU hotplug for ARM64

2019-06-28 Thread Xiongfeng Wang
Sorry, the third patch can't be applied to the latest kernel. I will send
another version and attach the method to test this patchset.

On 2019/6/28 19:13, Xiongfeng Wang wrote:
> This patchset mark all the GICC node in MADT as possible CPUs even though it
> is disabled. But only those enabled GICC node are marked as present CPUs.
> So that kernel will initialize some CPU related data structure in advance 
> before
> the CPU is actually hot added into the system. This patchset also implement 
> 'acpi_(un)map_cpu()' and 'arch_(un)register_cpu()' for ARM64. These functions 
> are
> needed to enable CPU hotplug.
> 
> To support CPU hotplug, we need to add all the possible GICC node in MADT
> including those CPUs that are not present but may be hot added later. Those
> CPUs are marked as disabled in GICC nodes.
> 
> Xiongfeng Wang (3):
>   ACPI / scan: evaluate _STA for processors declared via ASL Device
> statement
>   arm64: mark all the GICC nodes in MADT as possible cpu
>   arm64: Add CPU hotplug support
> 
>  arch/arm64/kernel/acpi.c  | 22 ++
>  arch/arm64/kernel/setup.c | 19 ++-
>  arch/arm64/kernel/smp.c   | 11 +--
>  drivers/acpi/scan.c   | 12 
>  4 files changed, 57 insertions(+), 7 deletions(-)
> 



Re: net: check before dereferencing netdev_ops during busy poll

2019-06-28 Thread Josh Elsasser
On Jun 28, 2019, at 3:55 PM, Sasha Levin  wrote:

> What's the upstream commit id?

The commit wasn't needed upstream, as I only sent the original patch after
79e7fff47b7b ("net: remove support for per driver ndo_busy_poll()") had
made the fix unnecessary in Linus' tree.

May've gotten lost in the shuffle due to my poor Fixes tags. The patch in
question applied only on top of the 4.9 stable release at the time, but the
actual NPE had been around in some form since 3.11 / 0602129286705 ("net: add
low latency socket poll").

Josh


Re: [PATCH RT v2] Fix a lockup in wait_for_completion() and friends

2019-06-28 Thread Steven Rostedt
On Fri, 10 May 2019 12:33:18 +0200
Sebastian Andrzej Siewior  wrote:

> On 2019-05-09 14:33:20 [-0500], miny...@acm.org wrote:
> > From: Corey Minyard 
> > 
> > The function call do_wait_for_common() has a race condition that
> > can result in lockups waiting for completions.  Adding the thread
> > to (and removing the thread from) the wait queue for the completion
> > is done outside the do loop in that function.  However, if the thread
> > is woken up, the swake_up_locked() function will delete the entry
> > from the wait queue.  If that happens and another thread sneaks
> > in and decrements the done count in the completion to zero, the
> > loop will go around again, but the thread will no longer be in the
> > wait queue, so there is no way to wake it up.  
> 
> applied, thank you.
> 

When I applied this patch to 4.19-rt, I get the following lock up:

watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [sh:745]
Modules linked in: floppy i915 drm_kms_helper drm fb_sys_fops sysimgblt 
sysfillrect syscopyarea iosf_mbi i2c_algo_bit video
CPU: 2 PID: 745 Comm: sh Not tainted 4.19.56-test-rt23+ #16
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by 
O.E.M., BIOS SDBLI944.86P 05/08/2007
RIP: 0010:_raw_spin_unlock_irq+0x17/0x4d
Code: 48 8b 12 0f ba e2 12 73 07 e8 f1 4a 92 ff 31 c0 5b 5d c3 66 66 66 66 90 
55 48 89 e5 c6 07 00 e8 de 3d a3 ff fb bf 01 00 00 00  a7 27 9a ff 65 8b 05 
c8 7f 93 7e 85 c0 74 1f a9 ff ff
 ff 7f 75
RSP: 0018:c9c8bbb8 EFLAGS: 0246 ORIG_RAX: ff13
RAX:  RBX: c9c8bd58 RCX: 0003
RDX:  RSI: 8108ffab RDI: 0001
RBP: c9c8bbb8 R08: 816dcd76 R09: 00020600
R10: 0400 R11: 001c0eef1808 R12: c9c8bbc8
R13: c9f13ca0 R14: 888074b2d7d8 R15: 8880789efe10
FS:  () GS:88807b30() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 0030662001b8 CR3: 376ac000 CR4: 06e0
Call Trace:
 swake_up_all+0xa6/0xde
 __d_lookup_done+0x7c/0xc7
 __d_add+0x44/0xf7
 d_splice_alias+0x208/0x218
 ext4_lookup+0x1a6/0x1c5
 path_openat+0x63a/0xb15
 ? preempt_latency_stop+0x25/0x27
 do_filp_open+0x51/0xae
 ? trace_preempt_on+0xde/0xe7
 ? rt_spin_unlock+0x13/0x24
 ? __alloc_fd+0x145/0x155
 do_sys_open+0x81/0x125
 __x64_sys_open+0x21/0x23
 do_syscall_64+0x5c/0x6e
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

I haven't really looked too much into it though. I ran out of time :-/

-- Steve


[PATCH] mtd: rawnand: ingenic: Fix ingenic_ecc dependency

2019-06-28 Thread Paul Cercueil
If MTD_NAND_JZ4780 is y and MTD_NAND_JZ4780_BCH is m,
which selects CONFIG_MTD_NAND_INGENIC_ECC as m, the build fails:

drivers/mtd/nand/raw/ingenic/ingenic_nand.o: In function `ingenic_nand_remove':
ingenic_nand.c:(.text+0x177): undefined reference to `ingenic_ecc_release'
drivers/mtd/nand/raw/ingenic/ingenic_nand.o: In function 
`ingenic_nand_ecc_correct':
ingenic_nand.c:(.text+0x2ee): undefined reference to `ingenic_ecc_correct'

To fix that, the ingenic_nand and ingenic_ecc modules have been fused
into a single module.
- The ingenic_ecc.c code is now compiled in only if
  $(CONFIG_MTD_NAND_INGENIC_ECC) is set. This option is now a boolean
  instead of a tristate.
- To avoid changing the module name, the ingenic_nand.c file is moved to
  ingenic_nand_drv.c, so the module name is still ingenic_nand.
- Since ingenic_ecc.c is no longer a module, the module-specific macros
  have been dropped, and the functions are no longer exported for use by
  the ingenic_nand driver.

Fixes: 15de8c6efd0e ("mtd: rawnand: ingenic: Separate top-level and SoC 
specific code")
Signed-off-by: Paul Cercueil 
Reported-by: Arnd Bergmann 
Reported-by: Hulk Robot 
Cc: YueHaibing 
Cc: sta...@vger.kernel.org
---
 drivers/mtd/nand/raw/ingenic/Kconfig | 2 +-
 drivers/mtd/nand/raw/ingenic/Makefile| 4 +++-
 drivers/mtd/nand/raw/ingenic/ingenic_ecc.c   | 9 -
 .../raw/ingenic/{ingenic_nand.c => ingenic_nand_drv.c}   | 0
 4 files changed, 4 insertions(+), 11 deletions(-)
 rename drivers/mtd/nand/raw/ingenic/{ingenic_nand.c => ingenic_nand_drv.c} 
(100%)

diff --git a/drivers/mtd/nand/raw/ingenic/Kconfig 
b/drivers/mtd/nand/raw/ingenic/Kconfig
index 19a96ce515c1..66b7cffdb0c2 100644
--- a/drivers/mtd/nand/raw/ingenic/Kconfig
+++ b/drivers/mtd/nand/raw/ingenic/Kconfig
@@ -16,7 +16,7 @@ config MTD_NAND_JZ4780
 if MTD_NAND_JZ4780
 
 config MTD_NAND_INGENIC_ECC
-   tristate
+   bool
 
 config MTD_NAND_JZ4740_ECC
tristate "Hardware BCH support for JZ4740 SoC"
diff --git a/drivers/mtd/nand/raw/ingenic/Makefile 
b/drivers/mtd/nand/raw/ingenic/Makefile
index 1ac4f455baea..b63d36889263 100644
--- a/drivers/mtd/nand/raw/ingenic/Makefile
+++ b/drivers/mtd/nand/raw/ingenic/Makefile
@@ -2,7 +2,9 @@
 obj-$(CONFIG_MTD_NAND_JZ4740) += jz4740_nand.o
 obj-$(CONFIG_MTD_NAND_JZ4780) += ingenic_nand.o
 
-obj-$(CONFIG_MTD_NAND_INGENIC_ECC) += ingenic_ecc.o
+ingenic_nand-y += ingenic_nand_drv.o
+ingenic_nand-$(CONFIG_MTD_NAND_INGENIC_ECC) += ingenic_ecc.o
+
 obj-$(CONFIG_MTD_NAND_JZ4740_ECC) += jz4740_ecc.o
 obj-$(CONFIG_MTD_NAND_JZ4725B_BCH) += jz4725b_bch.o
 obj-$(CONFIG_MTD_NAND_JZ4780_BCH) += jz4780_bch.o
diff --git a/drivers/mtd/nand/raw/ingenic/ingenic_ecc.c 
b/drivers/mtd/nand/raw/ingenic/ingenic_ecc.c
index d3e085c5685a..c954189606f6 100644
--- a/drivers/mtd/nand/raw/ingenic/ingenic_ecc.c
+++ b/drivers/mtd/nand/raw/ingenic/ingenic_ecc.c
@@ -30,7 +30,6 @@ int ingenic_ecc_calculate(struct ingenic_ecc *ecc,
 {
return ecc->ops->calculate(ecc, params, buf, ecc_code);
 }
-EXPORT_SYMBOL(ingenic_ecc_calculate);
 
 /**
  * ingenic_ecc_correct() - detect and correct bit errors
@@ -51,7 +50,6 @@ int ingenic_ecc_correct(struct ingenic_ecc *ecc,
 {
return ecc->ops->correct(ecc, params, buf, ecc_code);
 }
-EXPORT_SYMBOL(ingenic_ecc_correct);
 
 /**
  * ingenic_ecc_get() - get the ECC controller device
@@ -111,7 +109,6 @@ struct ingenic_ecc *of_ingenic_ecc_get(struct device_node 
*of_node)
}
return ecc;
 }
-EXPORT_SYMBOL(of_ingenic_ecc_get);
 
 /**
  * ingenic_ecc_release() - release the ECC controller device
@@ -122,7 +119,6 @@ void ingenic_ecc_release(struct ingenic_ecc *ecc)
clk_disable_unprepare(ecc->clk);
put_device(ecc->dev);
 }
-EXPORT_SYMBOL(ingenic_ecc_release);
 
 int ingenic_ecc_probe(struct platform_device *pdev)
 {
@@ -159,8 +155,3 @@ int ingenic_ecc_probe(struct platform_device *pdev)
return 0;
 }
 EXPORT_SYMBOL(ingenic_ecc_probe);
-
-MODULE_AUTHOR("Alex Smith ");
-MODULE_AUTHOR("Harvey Hunt ");
-MODULE_DESCRIPTION("Ingenic ECC common driver");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/mtd/nand/raw/ingenic/ingenic_nand.c 
b/drivers/mtd/nand/raw/ingenic/ingenic_nand_drv.c
similarity index 100%
rename from drivers/mtd/nand/raw/ingenic/ingenic_nand.c
rename to drivers/mtd/nand/raw/ingenic/ingenic_nand_drv.c
-- 
2.21.0.593.g511ec345e18



Re: [PATCH V3 2/2] sched/fair: Fallback to sched-idle CPU if idle CPU isn't found

2019-06-28 Thread Subhra Mazumdar



On 6/25/19 10:06 PM, Viresh Kumar wrote:

We try to find an idle CPU to run the next task, but in case we don't
find an idle CPU it is better to pick a CPU which will run the task the
soonest, for performance reason.

A CPU which isn't idle but has only SCHED_IDLE activity queued on it
should be a good target based on this criteria as any normal fair task
will most likely preempt the currently running SCHED_IDLE task
immediately. In fact, choosing a SCHED_IDLE CPU over a fully idle one
shall give better results as it should be able to run the task sooner
than an idle CPU (which requires to be woken up from an idle state).

This patch updates both fast and slow paths with this optimization.

Signed-off-by: Viresh Kumar 
---
  kernel/sched/fair.c | 43 +--
  1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1277adc3e7ed..2e0527fd468c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5376,6 +5376,15 @@ static struct {
  
  #endif /* CONFIG_NO_HZ_COMMON */
  
+/* CPU only has SCHED_IDLE tasks enqueued */

+static int sched_idle_cpu(int cpu)
+{
+   struct rq *rq = cpu_rq(cpu);
+
+   return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+   rq->nr_running);
+}
+

Shouldn't this check if rq->curr is also sched idle? And why not drop the
rq->nr_running non zero check?
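
E.g. something like this, just to make the first question concrete (an
untested sketch, assuming the existing task_has_idle_policy() helper; the
nr_running check is kept here, dropping it is the second question):

static int sched_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	/* non-empty runqueue, everything queued is SCHED_IDLE,
	 * and the currently running task is SCHED_IDLE too
	 */
	return unlikely(rq->nr_running &&
			rq->nr_running == rq->cfs.idle_h_nr_running &&
			task_has_idle_policy(rq->curr));
}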


Re: [PATCH net-next 02/12] net: hns3: enable DCB when TC num is one and pfc_en is non-zero

2019-06-28 Thread Yunsheng Lin
On 2019/6/29 2:47, Willem de Bruijn wrote:
> On Fri, Jun 28, 2019 at 7:53 AM Huazhong Tan  wrote:
>>
>> From: Yunsheng Lin 
>>
>> Currently when TC num is one, the DCB will be disabled no matter if
>> pfc_en is non-zero or not.
>>
>> This patch enables the DCB if pfc_en is non-zero, even when TC num
>> is one.
>>
>> Signed-off-by: Yunsheng Lin 
>> Signed-off-by: Peng Li 
>> Signed-off-by: Huazhong Tan 
> 
>> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c 
>> b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
>> index 9edae5f..cb2fb5a 100644
>> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
>> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
>> @@ -597,8 +597,10 @@ static void hclge_tm_tc_info_init(struct hclge_dev 
>> *hdev)
>> hdev->tm_info.prio_tc[i] =
>> (i >= hdev->tm_info.num_tc) ? 0 : i;
>>
>> -   /* DCB is enabled if we have more than 1 TC */
>> -   if (hdev->tm_info.num_tc > 1)
>> +   /* DCB is enabled if we have more than 1 TC or pfc_en is
>> +* non-zero.
>> +*/
>> +   if (hdev->tm_info.num_tc > 1 || hdev->tm_info.pfc_en)
> 
> small nit: comments that just repeat the condition are not very informative.
> 
> More helpful might be to explain why the DCB should be enabled in both
> these cases. Though such detailed comments, if useful, are better left
> to the commit message usually.

Very helpful suggestion. thanks.
Will keep that in mind next time.

> 
>> hdev->flag |= HCLGE_FLAG_DCB_ENABLE;
>> else
>> hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE;
>> @@ -1388,6 +1390,19 @@ void hclge_tm_schd_info_update(struct hclge_dev 
>> *hdev, u8 num_tc)
>> hclge_tm_schd_info_init(hdev);
>>  }
>>
>> +void hclge_tm_pfc_info_update(struct hclge_dev *hdev)
>> +{
>> +   /* DCB is enabled if we have more than 1 TC or pfc_en is
>> +* non-zero.
>> +*/
>> +   if (hdev->tm_info.num_tc > 1 || hdev->tm_info.pfc_en)
>> +   hdev->flag |= HCLGE_FLAG_DCB_ENABLE;
>> +   else
>> +   hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE;
>> +
>> +   hclge_pfc_info_init(hdev);
>> +}
> 
> Avoid introducing this code duplication by defining a helper?
> 

Will send out a new patch to remove the code duplication by defining a helper.

> .
> 



Re: [PATCH v2 2/3] of/platform: Add functional dependency link from DT bindings

2019-06-28 Thread David Collins
Hello Saravana,

On 6/27/19 7:22 PM, Saravana Kannan wrote:
> diff --git a/drivers/of/platform.c b/drivers/of/platform.c
> index 04ad312fd85b..8d690fa0f47c 100644
> --- a/drivers/of/platform.c
> +++ b/drivers/of/platform.c
> @@ -61,6 +61,72 @@ struct platform_device *of_find_device_by_node(struct 
> device_node *np)
>  EXPORT_SYMBOL(of_find_device_by_node);
>  
>  #ifdef CONFIG_OF_ADDRESS
> +static int of_link_binding(struct device *dev, char *binding, char *cell)
> +{
> + struct of_phandle_args sup_args;
> + struct platform_device *sup_dev;
> + unsigned int i = 0, links = 0;
> + u32 dl_flags = DL_FLAG_AUTOPROBE_CONSUMER;
> +
> + while (!of_parse_phandle_with_args(dev->of_node, binding, cell, i,
> +   &sup_args)) {
> + i++;
> + sup_dev = of_find_device_by_node(sup_args.np);
> + if (!sup_dev)
> + continue;

This check means that a required dependency link between a consumer and
supplier will not be added in the case that the consumer device is created
before the supplier device.  If the supplier device is created and
immediately bound to its driver after late_initcall_sync(), then it is
possible for the sync_state() callback of the supplier to be called before
the consumer gets a chance to probe since its link was never captured.

of_platform_default_populate() below will only create devices for the
first level DT nodes directly under "/".  Suppliers DT nodes can exist as
second level nodes under a first level bus node (e.g. I2C, SPMI, RPMh,
etc).  Thus, it is quite likely that not all supplier devices will have
been created when device_link_check_waiting_consumers() is called.

As far as I can tell, this effectively breaks the sync_state()
functionality (and thus proxy un-voting built on top of it) when using
kernel modules for both the supplier and consumer drivers which are probed
after late_initcall_sync().  I'm not sure how this can be avoided given
that the linking is done between devices in the process of sequentially
adding devices.  Perhaps linking between device nodes instead of devices
might be able to overcome this issue.


> + if (device_link_add(dev, &sup_dev->dev, dl_flags))
> + links++;
> + put_device(&sup_dev->dev);
> + }
> + if (links < i)
> + return -ENODEV;
> + return 0;
> +}
> +
> +/*
> + * List of bindings and their cell names (use NULL if no cell names) from 
> which
> + * device links need to be created.
> + */
> +static char *link_bindings[] = {
> +#ifdef CONFIG_OF_DEVLINKS
> + "clocks", "#clock-cells",
> + "interconnects", "#interconnect-cells",
> +#endif
> +};

This list and helper function above are missing support for regulator
*-supply properties.  We require this support on
QTI boards in order to handle regulator proxy un-voting when booting with
kernel modules.  Are you planning to add this support in a follow-on
version of this patch or in an additional patch?

Note that handling regulator supply properties will be very challenging
for at least these reasons:

1. There is not a consistent DT property name used for regulator supplies.

2. The device node referenced in a regulator supply phandle is usually not
the device node which corresponds to the device pointer for the supplier.
This is because a single regulator supplier device node (which will have
an associated device pointer) typically has a subnode for each of the
regulators it supports.  Consumers then use phandles for the subnodes.

3. The specification of parent supplies for regulators frequently results
in *-supply properties in a node pointing to child subnodes of that node.
 See [1] for an example.  Special care would need to be taken to avoid
trying to mark a regulator supplier as a supplier to itself as well as to
avoid blocking its own probing due to an unlinked supply dependency.

4. Not all DT properties of the form "*-supply" are regulator supplies.
(Note, this case has been discussed, but I was not able to locate an
example of it.)


Clocks also have a problem.  A recent patch [2] allows clock provider
parent clocks to be specified via DT.  This could lead to cases of
circular "clocks" property dependencies where there are two clock supplier
devices A and B with A having some clocks with B clock parents along with
B having some clocks with A clock parents.  If "clocks" properties are
followed, then neither device would ever be able to probe.

This does not present a problem without this patch series because the
clock framework supports late binding of parents specifically to avoid
issues with clocks not registering in perfectly topological order of
parent dependencies.


> +
> +static int of_link_to_suppliers(struct device *dev)
> +{
> + unsigned int i = 0;
> + bool done = true;
> +
> + if (unlikely(!dev->of_node))
> + return 0;
> +
> + for (i = 0; i < ARRAY_SIZE(link_bindings) / 2; i++)
> + if 

[PATCH 0/2] make RB_DECLARE_CALLBACKS more generic

2019-06-28 Thread Michel Lespinasse
These changes are intended to make the RB_DECLARE_CALLBACKS macro
more generic (allowing the augmented subtree information to be a struct
instead of a scalar) and tweak the macro arguments to be more similar
to INTERVAL_TREE_DEFINE().

Michel Lespinasse (2):
  augmented rbtree: add comments for RB_DECLARE_CALLBACKS macro
  augmented rbtree: rework the RB_DECLARE_CALLBACKS macro definition

 arch/x86/mm/pat_rbtree.c   | 11 --
 drivers/block/drbd/drbd_interval.c | 13 ---
 include/linux/interval_tree_generic.h  | 13 +--
 include/linux/rbtree_augmented.h   | 51 +++---
 lib/rbtree_test.c  | 11 --
 mm/mmap.c  | 26 -
 tools/include/linux/rbtree_augmented.h | 51 +++---
 7 files changed, 107 insertions(+), 69 deletions(-)

-- 
2.22.0.410.gd8fdbe21b5-goog


[PATCH 2/2] augmented rbtree: rework the RB_DECLARE_CALLBACKS macro definition

2019-06-28 Thread Michel Lespinasse
- Change the definition of the RBCOMPUTE function. The propagate
  callback repeatedly calls RBCOMPUTE as it moves from leaf to root.
  It wants to stop recomputing once the augmented subtree information
  doesn't change. This was previously checked using the == operator,
  but that only works when the augmented subtree information is a
  scalar field. This commit modifies the RBCOMPUTE function so that
  it now sets the augmented subtree information instead of returning it,
  and returns a boolean value indicating if the propagate callback
  should stop.

- Reorder the RB_DECLARE_CALLBACKS macro arguments, following the
  style of the INTERVAL_TREE_DEFINE macro, so that RBSTATIC and RBNAME
  are passed last.

The generated code should not change when the RBCOMPUTE function is inlined,
which is the typical / intended case.

The motivation for this change is that I want to introduce augmented rbtree
uses where the augmented data for the subtree is a struct instead of a scalar.
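
As a concrete (hypothetical) illustration of what this enables, here is a
sketch of an augmented rbtree whose subtree summary is a two-field struct,
written against the reworked macro in this patch.  All foo_* names are made
up; only the RBCOMPUTE convention (update in place, return true to stop
propagation) and the new argument order come from the patch.

#include <linux/rbtree_augmented.h>

struct foo_summary {
	u64 max_end;	/* largest interval end in the subtree */
	u64 total;	/* sum of node weights in the subtree */
};

struct foo_node {
	struct rb_node rb;
	u64 start, end, weight;
	struct foo_summary summary;	/* augmented subtree data */
};

static bool foo_compute_summary(struct foo_node *node, bool exit)
{
	struct foo_summary s = { .max_end = node->end, .total = node->weight };
	struct foo_node *child;

	if (node->rb.rb_left) {
		child = rb_entry(node->rb.rb_left, struct foo_node, rb);
		if (child->summary.max_end > s.max_end)
			s.max_end = child->summary.max_end;
		s.total += child->summary.total;
	}
	if (node->rb.rb_right) {
		child = rb_entry(node->rb.rb_right, struct foo_node, rb);
		if (child->summary.max_end > s.max_end)
			s.max_end = child->summary.max_end;
		s.total += child->summary.total;
	}
	/* Propagation can stop once the summary no longer changes. */
	if (exit && node->summary.max_end == s.max_end &&
	    node->summary.total == s.total)
		return true;
	node->summary = s;
	return false;
}

RB_DECLARE_CALLBACKS(struct foo_node, rb, summary,
		     foo_compute_summary, static, foo_augment_cb);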

Signed-off-by: Michel Lespinasse 
---
 arch/x86/mm/pat_rbtree.c   | 11 --
 drivers/block/drbd/drbd_interval.c | 13 ---
 include/linux/interval_tree_generic.h  | 13 +--
 include/linux/rbtree_augmented.h   | 17 -
 lib/rbtree_test.c  | 11 --
 mm/mmap.c  | 26 -
 tools/include/linux/rbtree_augmented.h | 51 +++---
 7 files changed, 84 insertions(+), 58 deletions(-)

diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index fa16036fa592..f1701f6e3c49 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -54,7 +54,7 @@ static u64 get_subtree_max_end(struct rb_node *node)
return ret;
 }
 
-static u64 compute_subtree_max_end(struct memtype *data)
+static inline bool compute_subtree_max_end(struct memtype *data, bool exit)
 {
u64 max_end = data->end, child_max_end;
 
@@ -66,11 +66,14 @@ static u64 compute_subtree_max_end(struct memtype *data)
if (child_max_end > max_end)
max_end = child_max_end;
 
-   return max_end;
+   if (exit && data->subtree_max_end == max_end)
+   return true;
+   data->subtree_max_end = max_end;
+   return false;
 }
 
-RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb,
-u64, subtree_max_end, compute_subtree_max_end)
+RB_DECLARE_CALLBACKS(struct memtype, rb, subtree_max_end,
+compute_subtree_max_end, static, memtype_rb_augment_cb)
 
 /* Find the first (lowest start addr) overlapping range from rb tree */
 static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
diff --git a/drivers/block/drbd/drbd_interval.c 
b/drivers/block/drbd/drbd_interval.c
index c58986556161..6decee82a797 100644
--- a/drivers/block/drbd/drbd_interval.c
+++ b/drivers/block/drbd/drbd_interval.c
@@ -20,8 +20,8 @@ sector_t interval_end(struct rb_node *node)
  * node and of its children.  Called for @node and its parents whenever the end
  * may have changed.
  */
-static inline sector_t
-compute_subtree_last(struct drbd_interval *node)
+static inline bool
+compute_subtree_last(struct drbd_interval *node, bool exit)
 {
sector_t max = node->sector + (node->size >> 9);
 
@@ -35,11 +35,14 @@ compute_subtree_last(struct drbd_interval *node)
if (right > max)
max = right;
}
-   return max;
+   if (exit && node->end == max)
+   return true;
+   node->end = max;
+   return false;
 }
 
-RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb,
-sector_t, end, compute_subtree_last);
+RB_DECLARE_CALLBACKS(struct drbd_interval, rb, end, compute_subtree_last,
+static, augment_callbacks);
 
 /**
  * drbd_insert_interval  -  insert a new interval into a tree
diff --git a/include/linux/interval_tree_generic.h 
b/include/linux/interval_tree_generic.h
index 1f97ce26..c54ce9ea152d 100644
--- a/include/linux/interval_tree_generic.h
+++ b/include/linux/interval_tree_generic.h
@@ -42,7 +42,8 @@
  \
 /* Callbacks for augmented rbtree insert and remove */   \
  \
-static inline ITTYPE ITPREFIX ## _compute_subtree_last(ITSTRUCT *node)   \
+static inline bool ITPREFIX ## _compute_subtree_last(ITSTRUCT *node, \
+bool exit)   \
 {\
ITTYPE max = ITLAST(node), subtree_last;  \
if (node->ITRB.rb_left) { \
@@ -57,11 +58,15 @@ static inline ITTYPE ITPREFIX ## 
_compute_subtree_last(ITSTRUCT *node)\
if (max < subtree_last)  

[PATCH 1/2] augmented rbtree: add comments for RB_DECLARE_CALLBACKS macro

2019-06-28 Thread Michel Lespinasse
Add a short comment summarizing the arguments to RB_DECLARE_CALLBACKS.
The arguments are also now capitalized. This copies the style of the
INTERVAL_TREE_DEFINE macro.

No functional changes in this commit, only comments and capitalization.

Signed-off-by: Michel Lespinasse 
---
 include/linux/rbtree_augmented.h | 54 +++-
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index f1ed3fc80bbb..5923495276e0 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -72,39 +72,51 @@ rb_insert_augmented_cached(struct rb_node *node,
rb_insert_augmented(node, &root->rb_root, augment);
 }
 
-#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield,  \
-rbtype, rbaugmented, rbcompute)\
+/*
+ * Template for declaring augmented rbtree callbacks
+ *
+ * RBSTATIC:'static' or empty
+ * RBNAME:  name of the rb_augment_callbacks structure
+ * RBSTRUCT:struct type of the tree nodes
+ * RBFIELD: name of struct rb_node field within RBSTRUCT
+ * RBTYPE:  type of the RBAUGMENTED field
+ * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
+ * RBCOMPUTE:   name of function that recomputes the RBAUGMENTED data
+ */
+
+#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD,  \
+RBTYPE, RBAUGMENTED, RBCOMPUTE)\
 static inline void \
-rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \
+RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \
 {  \
while (rb != stop) {\
-   rbstruct *node = rb_entry(rb, rbstruct, rbfield);   \
-   rbtype augmented = rbcompute(node); \
-   if (node->rbaugmented == augmented) \
+   RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD);   \
+   RBTYPE augmented = RBCOMPUTE(node); \
+   if (node->RBAUGMENTED == augmented) \
break;  \
-   node->rbaugmented = augmented;  \
-   rb = rb_parent(&node->rbfield); \
+   node->RBAUGMENTED = augmented;  \
+   rb = rb_parent(&node->RBFIELD); \
}   \
 }  \
 static inline void \
-rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)
\
+RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)
\
 {  \
-   rbstruct *old = rb_entry(rb_old, rbstruct, rbfield);\
-   rbstruct *new = rb_entry(rb_new, rbstruct, rbfield);\
-   new->rbaugmented = old->rbaugmented;\
+   RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);\
+   RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);\
+   new->RBAUGMENTED = old->RBAUGMENTED;\
 }  \
 static void\
-rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)  \
+RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)  \
 {  \
-   rbstruct *old = rb_entry(rb_old, rbstruct, rbfield);\
-   rbstruct *new = rb_entry(rb_new, rbstruct, rbfield);\
-   new->rbaugmented = old->rbaugmented;\
-   old->rbaugmented = rbcompute(old);  \
+   RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);\
+   RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);\
+   new->RBAUGMENTED = old->RBAUGMENTED;\
+   old->RBAUGMENTED = RBCOMPUTE(old);  \
 }  \
-rbstatic const struct rb_augment_callbacks rbname = {  \
-   .propagate = rbname ## _propagate,  \
-   .copy = rbname ## _copy,\
-   .rotate = rbname ## _rotate \
+RBSTATIC const struct rb_augment_callbacks RBNAME = {  \
+   

Re: [PATCH v4 3/3] fpga: dfl: fme: add power management support

2019-06-28 Thread Wu Hao
On Fri, Jun 28, 2019 at 10:55:14AM -0700, Guenter Roeck wrote:
> On Thu, Jun 27, 2019 at 12:53:38PM +0800, Wu Hao wrote:
> > This patch adds support for the power management private feature under
> > the FPGA Management Engine (FME). This private feature driver registers
> > a hwmon for power (power1_input), threshold information, e.g.
> > (power1_max / crit / max_alarm / crit_alarm), and also read-only sysfs
> > interfaces for other power management information. For configuration,
> > users can write threshold values via the above power1_max / crit sysfs
> > interfaces under hwmon too.
> > 
> > Signed-off-by: Luwei Kang 
> > Signed-off-by: Xu Yilun 
> > Signed-off-by: Wu Hao 
> > ---
> > v2: create a dfl_fme_power hwmon to expose power sysfs interfaces.
> > move all sysfs interfaces under hwmon
> > consumed  --> hwmon power1_input
> > threshold1--> hwmon power1_cap
> > threshold2--> hwmon power1_crit
> > threshold1_status --> hwmon power1_cap_status
> > threshold2_status --> hwmon power1_crit_status
> > xeon_limit--> hwmon power1_xeon_limit
> > fpga_limit--> hwmon power1_fpga_limit
> 
> How do those limits differ from the other limits ?
> We do have powerX_cap and powerX_cap_max, and from the context
> it appears that you could possibly at least use power1_cap_max
> (and power1_cap instead of power1_max) instead of
> power1_fpga_limit.

Thanks a lot for the review and comments.

Actually, xeon/fpga_limit are introduced for a different purpose. They show
the power limit of the CPU and the FPGA, which may be useful in some integrated
solution, e.g. where the CPU and FPGA share power. We should never use these
interfaces as throttling thresholds.

> 
> > ltr   --> hwmon power1_ltr
> > v3: rename some hwmon sysfs interfaces to follow hwmon ABI.
> > power1_cap --> power1_max
> > power1_cap_status  --> power1_max_alarm
> > power1_crit_status --> power1_crit_alarm
> 
> power1_cap is standard ABI, and since the value is enforced by HW,
> it should be usable.

As you see, in thermal management, threshold1 and threshold2 are
mapped to temp1_max_alarm and temp1_crit_alarm. So we feel it will be
friendlier to users if we keep using max_alarm and crit_alarm
in power management for threshold1 and threshold2 too.

Do you think we can keep this, or would it be better to switch back to
power1_cap?
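
For reference, a minimal sketch (the identifier name is made up; this is not
code from the patch) of the HWMON_CHANNEL_INFO declaration the v4 changelog
refers to, using the max/crit and alarm attributes discussed here:

static const struct hwmon_channel_info *fme_power_hwmon_info[] = {
	HWMON_CHANNEL_INFO(power, HWMON_P_INPUT | HWMON_P_MAX |
				  HWMON_P_MAX_ALARM | HWMON_P_CRIT |
				  HWMON_P_CRIT_ALARM),
	NULL
};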


> 
> > update sysfs doc for above sysfs interface changes.
> > replace scnprintf with sprintf in sysfs interface.
> > v4: use HWMON_CHANNEL_INFO.
> > update date in sysfs doc.
> > ---
> >  Documentation/ABI/testing/sysfs-platform-dfl-fme |  67 +++
> >  drivers/fpga/dfl-fme-main.c  | 221 
> > +++
> >  2 files changed, 288 insertions(+)
> > 
> > diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme 
> > b/Documentation/ABI/testing/sysfs-platform-dfl-fme
> > index 2cd17dc..a669548 100644
> > --- a/Documentation/ABI/testing/sysfs-platform-dfl-fme
> > +++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme
> > @@ -127,6 +127,7 @@ Contact:Wu Hao 
> >  Description:   Read-Only. Read this file to get the name of hwmon 
> > device, it
> > supports values:
> > 'dfl_fme_thermal' - thermal hwmon device name
> > +   'dfl_fme_power'   - power hwmon device name
> >  
> >  What:  
> > /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/temp1_input
> >  Date:  June 2019
> > @@ -183,3 +184,69 @@ Description:   Read-Only. Read this file to get the 
> > policy of hardware threshold1
> > (see 'temp1_max'). It only supports two values (policies):
> > 0 - AP2 state (90% throttling)
> > 1 - AP1 state (50% throttling)
> > +
> > +What:  
> > /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_input
> > +Date:  June 2019
> > +KernelVersion: 5.3
> > +Contact:   Wu Hao 
> > +Description:   Read-Only. It returns current FPGA power consumption in 
> > uW.
> > +
> > +What:  
> > /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_max
> > +Date:  June 2019
> > +KernelVersion: 5.3
> > +Contact:   Wu Hao 
> > +Description:   Read-Write. Read this file to get current hardware power
> > +   threshold1 in uW. If power consumption rises at or above
> > +   this threshold, hardware starts 50% throttling.
> > +   Write this file to set current hardware power threshold1 in uW.
> > +   As hardware only accepts values in Watts, the input value will
> > +   be rounded down to whole Watts (the < 1 Watt part is discarded).
> > +   Write fails with -EINVAL if input parsing fails or input isn't
> > +   in the valid range (0 - 12700 uW).
> > +
> > +What:  
> > /sys/bus/platform/devices/dfl-fme.0/hwmon/hwmonX/power1_crit
> > +Date:  June 2019
> > 

Re: [PATCH v2] phy: rockchip-inno-usb2: allow to force the B-Device Session Valid bit.

2019-06-28 Thread Ezequiel Garcia
Hi Heiko, Kishon,

I'll try to pick up this patch.
Some comments below, just for self-reference.

On Wed, 2019-06-26 at 12:32 -0300, Ezequiel Garcia wrote:
> On Wed, 2019-05-15 at 18:20 -0400, Gaël PORTAY wrote:
> > From: Enric Balletbo i Serra 
> > 
> > The OTG disconnection event is generated after the presence/absence of
> > an ID connection, but some platforms don't have the ID pin connected, so
> > the event is not generated. In such a case, for detecting the
> > disconnection event, we can get the cable state from an extcon driver.
> > We need, though, to force to set the B-Device Session Valid bit on the
> > PHY to have the device respond to the setup address. Otherwise, the
> > following error is shown:
> > 
> > usb 2-2: Device not responding to setup address.
> > usb 2-2: device not accepting address 14, error -71
> > usb usb2-port2: unable to enumerate USB device
> > 
> > The patch tells the PHY to force the B-Device Session Valid bit when the
> > OTG role is device and clear that bit if the OTG role is host, when an
> > extcon is available.
> > 
> > Signed-off-by: Enric Balletbo i Serra 
> > Signed-off-by: Gaël PORTAY 
> > ---
> > 
> > Hi all,
> > 
> > The main purpose of this patch is have the Type-C port on the Samsung
> > Chromebook Plus work as a device or in OTG mode.
> > 
> > That patch was originally a part of that patchset[1]; all other patches
> > were merged recently in master.
> > 
> > The patch was tested on a Samsung Chromebook Plus by configuring one
> > port to work as a device, configuring a CDC Ethernet gadget, and
> > communicating between my workstation and the Chromebook over the
> > Ethernet gadget through a USB-A to Type-C cable.
> > 
> > Best regards,
> > Gaël
> > 
> > [1]: https://lkml.org/lkml/2018/8/15/141
> > 

We still need the above devicetree changes.

> > Changes since v1:
> >  - [PATCH 3/4] Remove introduction of dt property "rockchip,force-bvalid"
> >and replace cable state using extcon instead (if set).
> > 
> >  drivers/phy/rockchip/phy-rockchip-inno-usb2.c | 51 +++
> >  1 file changed, 51 insertions(+)
> > 
> > diff --git a/drivers/phy/rockchip/phy-rockchip-inno-usb2.c 
> > b/drivers/phy/rockchip/phy-rockchip-inno-usb2.c
> > index ba07121c3eff..5e9d50b5ae16 100644
> > --- a/drivers/phy/rockchip/phy-rockchip-inno-usb2.c
> > +++ b/drivers/phy/rockchip/phy-rockchip-inno-usb2.c
> > @@ -125,6 +125,7 @@ struct rockchip_chg_det_reg {
> >   * @bvalid_det_en: vbus valid rise detection enable register.
> >   * @bvalid_det_st: vbus valid rise detection status register.
> >   * @bvalid_det_clr: vbus valid rise detection clear register.
> > + * @bvalid_session: force B-device session valid register.
> >   * @ls_det_en: linestate detection enable register.
> >   * @ls_det_st: linestate detection state register.
> >   * @ls_det_clr: linestate detection clear register.
> > @@ -138,6 +139,7 @@ struct rockchip_usb2phy_port_cfg {
> > struct usb2phy_reg  bvalid_det_en;
> > struct usb2phy_reg  bvalid_det_st;
> > struct usb2phy_reg  bvalid_det_clr;
> > +   struct usb2phy_reg  bvalid_session;
> > struct usb2phy_reg  ls_det_en;
> > struct usb2phy_reg  ls_det_st;
> > struct usb2phy_reg  ls_det_clr;
> > @@ -169,6 +171,7 @@ struct rockchip_usb2phy_cfg {
> >   * @port_id: flag for otg port or host port.
> >   * @suspended: phy suspended flag.
> >   * @vbus_attached: otg device vbus status.
> > + * @force_bvalid: force the control of the B-device session valid bit.
> >   * @bvalid_irq: IRQ number assigned for vbus valid rise detection.
> >   * @ls_irq: IRQ number assigned for linestate detection.
> >   * @otg_mux_irq: IRQ number which multiplex otg-id/otg-bvalid/linestate
> > @@ -187,6 +190,7 @@ struct rockchip_usb2phy_port {
> > unsigned intport_id;
> > boolsuspended;
> > boolvbus_attached;
> > +   boolforce_bvalid;
> > int bvalid_irq;
> > int ls_irq;
> > int otg_mux_irq;
> > @@ -553,6 +557,13 @@ static void rockchip_usb2phy_otg_sm_work(struct 
> > work_struct *work)
> > switch (rport->state) {
> > case OTG_STATE_UNDEFINED:
> > rport->state = OTG_STATE_B_IDLE;
> > +   if (rport->force_bvalid) {
> > +   property_enable(rphy->grf,
> > +   &rport->port_cfg->bvalid_session,
> > +   true);
> > +   dev_dbg(&rport->phy->dev,
> > +   "set the B-Device Session Valid\n");
> > +   }
> > if (!vbus_attach)
> > rockchip_usb2phy_power_off(rport->phy);
> > /* fall through */
> > @@ -560,6 +571,14 @@ static void rockchip_usb2phy_otg_sm_work(struct 
> > work_struct *work)
> > if (extcon_get_state(rphy->edev, EXTCON_USB_HOST) > 0) {
> > dev_dbg(&rport->phy->dev, "usb otg host connect\n");
> > rport->state = 

Re: linux-next: build failure after merge of the battery tree

2019-06-28 Thread Stephen Rothwell
Hi Enric,

On Fri, 28 Jun 2019 18:56:56 +0200 Enric Balletbo i Serra 
 wrote:
>
> Hmm, I just applied this patch on top of linux-next and it builds with
> allmodconfig on x86_64
> 
> I am wondering if the build was done without this commit, which is in
> chrome-platform for-next branch. Could be this the problem?
> 
> commit 0c0b7ea23aed0b55ef2f9803f13ddaae1943713d
> Author: Nick Crews 
> Date:   Wed Apr 24 10:56:50 2019 -0600
> 
> platform/chrome: wilco_ec: Add property helper library

Exactly since I merge the battery tree before the chrome-platform
tree ... Cross tree dependencies are a pain.

> Anyway, I think the proper way to do it is to create an immutable branch for
> Sebastian as the patch he picked depends on the above commit. I'll create one,
> sorry about that missing dependency.

Thanks.

-- 
Cheers,
Stephen Rothwell


pgpD4BsCXfkVf.pgp
Description: OpenPGP digital signature


[PATCH] MAINTAINERS: Relieve Zubair Lutfullah Kakakhel from his duties

2019-06-28 Thread Paul Cercueil
His e-mail address @imgtec.com is no more, and when I contacted him
on his private email about it, he told me to drop him from MAINTAINERS.

Signed-off-by: Paul Cercueil 
---
 MAINTAINERS | 5 -
 1 file changed, 5 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 3ee871404aba..755d5e5941e0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7791,11 +7791,6 @@ F:   include/trace/events/ib_umad.h
 F: samples/bpf/ibumad_kern.c
 F: samples/bpf/ibumad_user.c
 
-INGENIC JZ4780 DMA Driver
-M: Zubair Lutfullah Kakakhel 
-S: Maintained
-F: drivers/dma/dma-jz4780.c
-
 INGENIC JZ47xx SoCs
 M: Paul Cercueil 
 S: Maintained
-- 
2.21.0.593.g511ec345e18



Re: [PATCH 1/2] usbip: Skip DMA mapping and unmapping for urb at vhci

2019-06-28 Thread shuah

Hi Suwan,

On 6/21/19 11:45 AM, Suwan Kim wrote:

vhci doesn't do dma for the remote device. Actually, the real dma
operation is done by the network card driver. So, vhci doesn't use or
need the dma address of the urb's transfer buffer.

When vhci supports SG, it is useful to use the native SG list instead
of the mapped SG list because the dma mapping function can adjust the
number of SG list entries, i.e. urb->num_mapped_sgs.

But hcd provides dma mapping and unmapping function by defualt.


Typo "defualt"


Moreover, it causes unnecessary dma mapping and unmapping, which
will be done again by the NIC driver, and it wastes CPU cycles.
So, implement map_urb_for_dma and unmap_urb_for_dma functions for
vhci in order to skip the dma mapping and unmapping procedure.
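
A minimal sketch of the overrides described above (this is not the submitted
patch; only the map_urb_for_dma/unmap_urb_for_dma callback names come from
struct hc_driver in include/linux/usb/hcd.h):

#include <linux/usb/hcd.h>

/* vhci never hands the buffer to a device, so DMA mapping is skipped. */
static int vhci_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb,
				gfp_t mem_flags)
{
	return 0;
}

static void vhci_unmap_urb_for_dma(struct usb_hcd *hcd, struct urb *urb)
{
	/* Nothing was mapped, so there is nothing to unmap. */
}

static const struct hc_driver vhci_hc_driver_sketch = {
	/* ... vhci's existing hc_driver fields ... */
	.map_urb_for_dma	= vhci_map_urb_for_dma,
	.unmap_urb_for_dma	= vhci_unmap_urb_for_dma,
};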



How did you verify that unnecessary dma map/unmap are happening?
How many CPU cycles did you manage to reduce with this change?

thanks,
-- Shuah


[GIT PULL] ARC fixes for 5.2-rc7

2019-06-28 Thread Vineet Gupta
Hi Linus,

Please pull some fixes for ARC.

Thx,
-Vineet

->
The following changes since commit d1fdb6d8f6a4109a4263176c84b899076a5f8008:

  Linux 5.2-rc4 (2019-06-08 20:24:46 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git/ tags/arc-5.2-rc7

for you to fetch changes up to ec9b4feb1e41587c15d43d237844193318389dc3:

  ARC: [plat-hsdk]: unify memory apertures configuration (2019-06-11 11:48:34 
-0700)


ARC fixes for 5.2-rc7

 - hsdk platform unifying apertures

 - build system CROSS_COMPILE prefix


Alexey Brodkin (1):
  ARC: build: Try to guess CROSS_COMPILE with cc-cross-prefix

Eugeniy Paltsev (1):
  ARC: [plat-hsdk]: unify memory apertures configuration

 arch/arc/Makefile |   4 ++
 arch/arc/plat-hsdk/platform.c | 161 +++---
 2 files changed, 157 insertions(+), 8 deletions(-)


Re: [RFC] Deadlock via recursive wakeup via RCU with threadirqs

2019-06-28 Thread Paul E. McKenney
On Fri, Jun 28, 2019 at 07:12:41PM -0400, Joel Fernandes wrote:
> On Fri, Jun 28, 2019 at 03:25:47PM -0700, Paul E. McKenney wrote:
> > On Fri, Jun 28, 2019 at 05:40:18PM -0400, Joel Fernandes wrote:
> > > Hi Paul,
> > > 
> > > On Fri, Jun 28, 2019 at 01:04:23PM -0700, Paul E. McKenney wrote:
> > > [snip]
> > > > > > > Commit
> > > > > > > - 23634ebc1d946 ("rcu: Check for wakeup-safe conditions in
> > > > > > >rcu_read_unlock_special()") does not trigger the bug within 94
> > > > > > >attempts.
> > > > > > > 
> > > > > > > - 48d07c04b4cc1 ("rcu: Enable elimination of Tree-RCU softirq
> > > > > > >   processing") needed 12 attempts to trigger the bug.
> > > > > > 
> > > > > > That matches my belief that 23634ebc1d946 ("rcu: Check for 
> > > > > > wakeup-safe
> > > > > > conditions in rcu_read_unlock_special()") will at least greatly 
> > > > > > decrease
> > > > > > the probability of this bug occurring.
> > > > > 
> > > > > I was just typing a reply that I can't reproduce it with:
> > > > >   rcu: Check for wakeup-safe conditions in rcu_read_unlock_special()
> > > > > 
> > > > > I am trying to revert enough of this patch to see what would break 
> > > > > things,
> > > > > however I think a better exercise might be to understand more what 
> > > > > the patch
> > > > > does and why it fixes things in the first place ;-) It is probably the
> > > > > deferred_qs thing.
> > > > 
> > > > The deferred_qs flag is part of it!  Looking forward to hearing what
> > > > you come up with as being the critical piece of this commit.
> > > 
> > > The new deferred_qs flag indeed saves the machine from the dead-lock.
> > > 
> > > If we don't want the deferred_qs, then the below patch also fixes the 
> > > issue.
> > > However, I am more sure than not that it does not handle all cases (such 
> > > as
> > > what if we previously had an expedited grace period IPI in a previous 
> > > reader
> > > section and had to defer processing. Then it seems a similar deadlock
> > > would present. But anyway, the below patch does fix it for me! It is 
> > > based on
> > > your -rcu tree commit 23634ebc1d946f19eb112d4455c1d84948875e31 (rcu: Check
> > > for wakeup-safe conditions in rcu_read_unlock_special()).
> > 
> > The point here being that you rely on .b.blocked rather than
> > .b.deferred_qs.  Hmmm...  There are a number of places that check all
> > the bits via the .s leg of the rcu_special union.  The .s check in
> > rcu_preempt_need_deferred_qs() should be OK because it is conditioned
> > on t->rcu_read_lock_nesting of zero or negative.
> > Do rest of those also work out OK?
> > 
> > It would be nice to remove the flag, but doing so clearly needs careful
> > review and testing.
> 
> Agreed. I am planning to do an audit of this code within the next couple of
> weeks so I will be on the look out for any optimization opportunities related
> to this. Will let you know if this can work. For now I like your patch better
> because it is more conservative and doesn't cause any space overhead.

Fixing the bug in a maintainable manner is the priority, to be sure.
However, simplifications, assuming that they work, are very much worth
considering as well.

And Murphy says that there are still a number of bugs and optimization
opportunities.  ;-)

> If you'd like, please feel free to include my Tested-by on it:
> 
> Tested-by: Joel Fernandes (Google) 

Will do, thank you!

> If you had a chance, could you also point me to any tests that show
> performance improvement with the irqwork patch, on the expedited GP usecase?
> I'd like to try it out as well. I guess rcuperf should have some?

As a first thing to try, I suggest running rcuperf with both readers and
writers, with only expedited grace periods, and with most (or maybe even
all) CPUs having nohz_full enabled.

Thanx, Paul



[GIT PULL] SCSI fixes for 5.2-rc6

2019-06-28 Thread James Bottomley
One simple fix for a driver use after free.

The patch is available here:

git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git scsi-fixes

The short changelog is:

Jan Kara (1):
  scsi: vmw_pscsi: Fix use-after-free in pvscsi_queue_lck()

And the diffstat:

 drivers/scsi/vmw_pvscsi.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

With full diff below.

James

---

diff --git a/drivers/scsi/vmw_pvscsi.c b/drivers/scsi/vmw_pvscsi.c
index ecee4b3ff073..377b07b2feeb 100644
--- a/drivers/scsi/vmw_pvscsi.c
+++ b/drivers/scsi/vmw_pvscsi.c
@@ -763,6 +763,7 @@ static int pvscsi_queue_lck(struct scsi_cmnd *cmd, void 
(*done)(struct scsi_cmnd
struct pvscsi_adapter *adapter = shost_priv(host);
struct pvscsi_ctx *ctx;
unsigned long flags;
+   unsigned char op;
 
spin_lock_irqsave(&adapter->hw_lock, flags);
 
@@ -775,13 +776,14 @@ static int pvscsi_queue_lck(struct scsi_cmnd *cmd, void 
(*done)(struct scsi_cmnd
}
 
cmd->scsi_done = done;
+   op = cmd->cmnd[0];
 
dev_dbg(&cmd->device->sdev_gendev,
-   "queued cmd %p, ctx %p, op=%x\n", cmd, ctx, cmd->cmnd[0]);
+   "queued cmd %p, ctx %p, op=%x\n", cmd, ctx, op);
 
spin_unlock_irqrestore(&adapter->hw_lock, flags);
 
-   pvscsi_kick_io(adapter, cmd->cmnd[0]);
+   pvscsi_kick_io(adapter, op);
 
return 0;
 }


Re: linux-next: Tree for Jun 28 (power/reset/reboot-mode)

2019-06-28 Thread Randy Dunlap
On 6/28/19 3:38 AM, Stephen Rothwell wrote:
> Hi all,
> 
> Changes since 20190627:
> 

on x86_64:

when CONFIG_OF is not set/enabled, but
CONFIG_NVMEM_REBOOT_MODE=m selects REBOOT_MODE:

  CC [M]  drivers/power/reset/reboot-mode.o
../drivers/power/reset/reboot-mode.c: In function ‘reboot_mode_register’:
../drivers/power/reset/reboot-mode.c:72:2: error: implicit declaration of 
function ‘for_each_property_of_node’ [-Werror=implicit-function-declaration]
  for_each_property_of_node(np, prop) {
  ^
../drivers/power/reset/reboot-mode.c:72:38: error: expected ‘;’ before ‘{’ token
  for_each_property_of_node(np, prop) {
  ^
../drivers/power/reset/reboot-mode.c:109:1: warning: label ‘error’ defined but 
not used [-Wunused-label]
 error:
 ^
../drivers/power/reset/reboot-mode.c:67:9: warning: unused variable ‘len’ 
[-Wunused-variable]
  size_t len = strlen(PREFIX);
 ^
../drivers/power/reset/reboot-mode.c: At top level:
../drivers/power/reset/reboot-mode.c:42:12: warning: ‘reboot_mode_notify’ 
defined but not used [-Wunused-function]
 static int reboot_mode_notify(struct notifier_block *this,
^


-- 
~Randy


Re: [PATCH v2] mm: vmscan: fix not scanning anonymous pages when detecting file refaults

2019-06-28 Thread Minchan Kim
On Fri, Jun 28, 2019 at 10:32:01AM -0400, Johannes Weiner wrote:
> On Fri, Jun 28, 2019 at 07:16:27PM +0800, Kuo-Hsin Yang wrote:
> > When file refaults are detected and there are many inactive file pages,
> > the system never reclaims anonymous pages; the file pages are dropped
> > aggressively when there are still a lot of cold anonymous pages and
> > the system thrashes.  This issue impacts the performance of applications
> > with large executables, e.g. chrome.
> 
> This is good.
> 
> > Commit 2a2e48854d70 ("mm: vmscan: fix IO/refault regression in cache
> > workingset transition") introduced actual_reclaim parameter.  When file
> > refaults are detected, inactive_list_is_low() may return different
> > values depends on the actual_reclaim parameter.  Vmscan would only scan
> > active/inactive file lists at file thrashing state when the following 2
> > conditions are satisfied.
> > 
> > 1) inactive_list_is_low() returns false in get_scan_count() to trigger
> >scanning file lists only.
> > 2) inactive_list_is_low() returns true in shrink_list() to allow
> >scanning active file list.
> > 
> > This patch makes the return value of inactive_list_is_low() independent
> > of actual_reclaim and rename the parameter back to trace.
> 
> This is not. The root cause for the problem you describe isn't the
> patch you point to. The root cause is our decision to force-scan the
> file LRU based on relative inactive:active size alone, without taking
> file thrashing into account at all. This is a much older problem.
> 
> After the referenced patch, we're taking thrashing into account when
> deciding whether to deactivate active file pages or not. To solve the
> problem pointed out here, we can extend that same principle to the
> decision whether to force-scan files and skip the anon LRUs.
> 
> The patch you're pointing to isn't the culprit. On the contrary, it
> provides the infrastructure to solve a much older problem.
> 
> > The problem can be reproduced by the following test program.
> > 
> > ---8<---
> > void fallocate_file(const char *filename, off_t size)
> > {
> > struct stat st;
> > int fd;
> > 
> > if (!stat(filename, &st) && st.st_size >= size)
> > return;
> > 
> > fd = open(filename, O_WRONLY | O_CREAT, 0600);
> > if (fd < 0) {
> > perror("create file");
> > exit(1);
> > }
> > if (posix_fallocate(fd, 0, size)) {
> > perror("fallocate");
> > exit(1);
> > }
> > close(fd);
> > }
> > 
> > long *alloc_anon(long size)
> > {
> > long *start = malloc(size);
> > memset(start, 1, size);
> > return start;
> > }
> > 
> > long access_file(const char *filename, long size, long rounds)
> > {
> > int fd, i;
> > volatile char *start1, *end1, *start2;
> > const int page_size = getpagesize();
> > long sum = 0;
> > 
> > fd = open(filename, O_RDONLY);
> > if (fd == -1) {
> > perror("open");
> > exit(1);
> > }
> > 
> > /*
> >  * Some applications, e.g. chrome, use a lot of executable file
> >  * pages, map some of the pages with PROT_EXEC flag to simulate
> >  * the behavior.
> >  */
> > start1 = mmap(NULL, size / 2, PROT_READ | PROT_EXEC, MAP_SHARED,
> >   fd, 0);
> > if (start1 == MAP_FAILED) {
> > perror("mmap");
> > exit(1);
> > }
> > end1 = start1 + size / 2;
> > 
> > start2 = mmap(NULL, size / 2, PROT_READ, MAP_SHARED, fd, size / 2);
> > if (start2 == MAP_FAILED) {
> > perror("mmap");
> > exit(1);
> > }
> > 
> > for (i = 0; i < rounds; ++i) {
> > struct timeval before, after;
> > volatile char *ptr1 = start1, *ptr2 = start2;
> > gettimeofday(&before, NULL);
> > for (; ptr1 < end1; ptr1 += page_size, ptr2 += page_size)
> > sum += *ptr1 + *ptr2;
> > gettimeofday(&after, NULL);
> > printf("File access time, round %d: %f (sec)\n", i,
> >(after.tv_sec - before.tv_sec) +
> >(after.tv_usec - before.tv_usec) / 1000000.0);
> > }
> > return sum;
> > }
> > 
> > int main(int argc, char *argv[])
> > {
> > const long MB = 1024 * 1024;
> > long anon_mb, file_mb, file_rounds;
> > const char filename[] = "large";
> > long *ret1;
> > long ret2;
> > 
> > if (argc != 4) {
> > printf("usage: thrash ANON_MB FILE_MB FILE_ROUNDS\n");
> > exit(0);
> > }
> > anon_mb = atoi(argv[1]);
> > file_mb = atoi(argv[2]);
> > file_rounds = atoi(argv[3]);
> > 
> > fallocate_file(filename, file_mb * MB);
> > printf("Allocate %ld MB anonymous pages\n", anon_mb);
> > ret1 = alloc_anon(anon_mb * MB);
> > printf("Access %ld MB file pages\n", file_mb);
> > ret2 = access_file(filename, file_mb * MB, file_rounds);
> > printf("Print result to prevent optimization: %ld\n",
> >

Re: [PATCH] mm: vmscan: fix not scanning anonymous pages when detecting file refaults

2019-06-28 Thread Minchan Kim
On Fri, Jun 28, 2019 at 10:22:52AM -0400, Johannes Weiner wrote:
> Hi Minchan,
> 
> On Fri, Jun 28, 2019 at 03:51:38PM +0900, Minchan Kim wrote:
> > On Thu, Jun 27, 2019 at 02:41:23PM -0400, Johannes Weiner wrote:
> > > On Wed, Jun 19, 2019 at 04:08:35PM +0800, Kuo-Hsin Yang wrote:
> > > > Fixes: 2a2e48854d70 ("mm: vmscan: fix IO/refault regression in cache 
> > > > workingset transition")
> > > > Signed-off-by: Kuo-Hsin Yang 
> > > 
> > > Acked-by: Johannes Weiner 
> > > 
> > > Your change makes sense - we should indeed not force cache trimming
> > > only while the page cache is experiencing refaults.
> > > 
> > > I can't say I fully understand the changelog, though. The problem of
> > 
> > I guess the point of the patch is that the "actual_reclaim" parameter made a
> > divergence in balancing file vs. anon LRU in get_scan_count. Thus, it ends up
> > scanning the file LRU active/inactive lists in the file-thrashing state.
> 
> Look at the patch again. The parameter was only added to retain
> existing behavior. We *always* did file-only reclaim while thrashing -
> all the way back to the two commits I mentioned below.

Yeah, I know that we did force file reclaim if we have enough file LRU.
What confused me in the description was the "actual_reclaim" part.
Thanks for pointing that out, Johannes. I confirmed it kept the old
behavior in get_scan_count.

> 
> > So, Fixes: 2a2e48854d70 ("mm: vmscan: fix IO/refault regression in cache 
> > workingset transition")
> > would make sense to me since it introduces the parameter.
> 
> What is the observable behavior problem that this patch introduced?
> 
> > > forcing cache trimming while there is enough page cache is older than
> > > the commit you refer to. It could be argued that this commit is
> > > incomplete - it could have added refault detection not just to
> > > inactive:active file balancing, but also the file:anon balancing; but
> > > it didn't *cause* this problem.
> > > 
> > > Shouldn't this be
> > > 
> > > Fixes: e9868505987a ("mm,vmscan: only evict file pages when we have 
> > > plenty")
> > > Fixes: 7c5bd705d8f9 ("mm: memcg: only evict file pages when we have 
> > > plenty")
> > 
> > That would be affected, too, but it would be trouble to have a stable backport
> > since we don't have the refault machinery in there.
> 
> Hm? The problematic behavior is that we force-scan file while file is
> thrashing. We can obviously only solve this in kernels that can
> actually detect thrashing.

What I meant is that I thought it was -stable material, but we don't have
the refault machinery there in v3.8.
I agree this patch fixes the two commits you mentioned above, so we should use it.


Re: [PATCH v1] arm: dts: mediatek: add basic support for MT7629 SoC

2019-06-28 Thread Kevin Hilman
 writes:

> From: Ryder Lee 
>
> This adds basic support for MT7629 reference board.
>
> Signed-off-by: Ryder Lee 

Just noticing this is not upstream yet.

I did a basic boot test to ramdisk on the mt7629-rfb board donated for
kernelCI (thanks MediaTek!) and it boots just fine.

Tested-by: Kevin Hilman 

Kevin


Re: [RFC] Deadlock via recursive wakeup via RCU with threadirqs

2019-06-28 Thread Joel Fernandes
On Fri, Jun 28, 2019 at 03:25:47PM -0700, Paul E. McKenney wrote:
> On Fri, Jun 28, 2019 at 05:40:18PM -0400, Joel Fernandes wrote:
> > Hi Paul,
> > 
> > On Fri, Jun 28, 2019 at 01:04:23PM -0700, Paul E. McKenney wrote:
> > [snip]
> > > > > > Commit
> > > > > > - 23634ebc1d946 ("rcu: Check for wakeup-safe conditions in
> > > > > >rcu_read_unlock_special()") does not trigger the bug within 94
> > > > > >attempts.
> > > > > > 
> > > > > > - 48d07c04b4cc1 ("rcu: Enable elimination of Tree-RCU softirq
> > > > > >   processing") needed 12 attempts to trigger the bug.
> > > > > 
> > > > > That matches my belief that 23634ebc1d946 ("rcu: Check for wakeup-safe
> > > > > conditions in rcu_read_unlock_special()") will at least greatly 
> > > > > decrease
> > > > > the probability of this bug occurring.
> > > > 
> > > > I was just typing a reply that I can't reproduce it with:
> > > >   rcu: Check for wakeup-safe conditions in rcu_read_unlock_special()
> > > > 
> > > > I am trying to revert enough of this patch to see what would break 
> > > > things,
> > > > however I think a better exercise might be to understand more what the 
> > > > patch
> > > > does and why it fixes things in the first place ;-) It is probably the
> > > > deferred_qs thing.
> > > 
> > > The deferred_qs flag is part of it!  Looking forward to hearing what
> > > you come up with as being the critical piece of this commit.
> > 
> > The new deferred_qs flag indeed saves the machine from the dead-lock.
> > 
> > If we don't want the deferred_qs, then the below patch also fixes the issue.
> > However, I am more sure than not that it does not handle all cases (such as
> > what if we previously had an expedited grace period IPI in a previous reader
> > section and had to defer processing. Then it seems a similar deadlock
> > would present. But anyway, the below patch does fix it for me! It is based 
> > on
> > your -rcu tree commit 23634ebc1d946f19eb112d4455c1d84948875e31 (rcu: Check
> > for wakeup-safe conditions in rcu_read_unlock_special()).
> 
> The point here being that you rely on .b.blocked rather than
> .b.deferred_qs.  Hmmm...  There are a number of places that check all
> the bits via the .s leg of the rcu_special union.  The .s check in
> rcu_preempt_need_deferred_qs() should be OK because it is conditioned
> on t->rcu_read_lock_nesting of zero or negative.
> Do rest of those also work out OK?
> 
> It would be nice to remove the flag, but doing so clearly needs careful
> review and testing.

Agreed. I am planning to do an audit of this code within the next couple of
weeks so I will be on the look out for any optimization opportunities related
to this. Will let you know if this can work. For now I like your patch better
because it is more conservative and doesn't cause any space overhead.

If you'd like, please feel free to include my Tested-by on it:

Tested-by: Joel Fernandes (Google) 

If you had a chance, could you also point me to any tests that show
performance improvement with the irqwork patch, on the expedited GP usecase?
I'd like to try it out as well. I guess rcuperf should have some?

thanks!

 - Joel
 


[PATCH v1] perf/x86: Consider pinned events for group validation

2019-06-28 Thread Andi Kleen
From: Andi Kleen 

perf stat -M metrics relies on weak groups to reject unschedulable groups
and run them as non-groups.

This uses the group validation code in the kernel. Unfortunately
that code doesn't take pinned events, such as the NMI watchdog, into
account. So some groups can pass validation, but then later still
never schedule.

This patch is an attempt to track pinned events in the group
validation too. We track a pinned mask, and use the mask from
either the CPU the event is pinned to, or the current CPU
if floating.

Then use this mask as a starting point for the scheduler.

I *think* it is mostly conservative, as in rejecting nothing
that would schedule, except locking is a bit weaker than a real
schedule, so it might be slightly behind.

It won't catch all possible cases that cannot schedule, such
as events pinned differently on different CPUs, or complicated
constraints. For the case of the NMI watchdog interacting
with the current perf metrics it is strong enough.

Reported-by: Stephane Eranian 
Signed-off-by: Andi Kleen 
---
 arch/x86/events/core.c | 44 +++---
 arch/x86/events/intel/p4.c |  3 ++-
 arch/x86/events/intel/uncore.c |  2 +-
 arch/x86/events/perf_event.h   | 10 +---
 4 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index f0e4804515d8..9459b1f83aa4 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -713,7 +713,7 @@ struct perf_sched {
  * Initialize interator that runs through all events and counters.
  */
 static void perf_sched_init(struct perf_sched *sched, struct event_constraint 
**constraints,
-   int num, int wmin, int wmax, int gpmax)
+   int num, int wmin, int wmax, int gpmax, unsigned 
long *pinned)
 {
int idx;
 
@@ -731,6 +731,8 @@ static void perf_sched_init(struct perf_sched *sched, 
struct event_constraint **
sched->state.event  = idx;  /* start with min weight */
sched->state.weight = wmin;
sched->state.unassigned = num;
+   if (pinned)
+   bitmap_copy(sched->state.used, pinned, X86_PMC_IDX_MAX);
 }
 
 static void perf_sched_save_state(struct perf_sched *sched)
@@ -846,11 +848,12 @@ static bool perf_sched_next_event(struct perf_sched 
*sched)
  * Assign a counter for each event.
  */
 int perf_assign_events(struct event_constraint **constraints, int n,
-   int wmin, int wmax, int gpmax, int *assign)
+  int wmin, int wmax, int gpmax, int *assign,
+  unsigned long *pinned)
 {
struct perf_sched sched;
 
-   perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
+   perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax, pinned);
 
do {
if (!perf_sched_find_counter())
@@ -863,7 +866,8 @@ int perf_assign_events(struct event_constraint 
**constraints, int n,
 }
 EXPORT_SYMBOL_GPL(perf_assign_events);
 
-int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign,
+   unsigned long *pinned)
 {
struct event_constraint *c;
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
@@ -871,7 +875,10 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, 
int *assign)
int n0, i, wmin, wmax, unsched = 0;
struct hw_perf_event *hwc;
 
-   bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+   if (pinned)
+   bitmap_copy(used_mask, pinned, X86_PMC_IDX_MAX);
+   else
+   bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 
/*
 * Compute the number of events already present; see x86_pmu_add(),
@@ -953,7 +960,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, 
int *assign)
gpmax /= 2;
 
unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
-wmax, gpmax, assign);
+wmax, gpmax, assign, pinned);
}
 
/*
@@ -1267,7 +1274,7 @@ static int x86_pmu_add(struct perf_event *event, int 
flags)
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
goto done_collect;
 
-   ret = x86_pmu.schedule_events(cpuc, n, assign);
+   ret = x86_pmu.schedule_events(cpuc, n, assign, NULL);
if (ret)
goto out;
/*
@@ -1321,6 +1328,8 @@ static void x86_pmu_start(struct perf_event *event, int 
flags)
__set_bit(idx, cpuc->running);
x86_pmu.enable(event);
perf_event_update_userpage(event);
+   if (event->attr.pinned)
+   __set_bit(idx, cpuc->pinned);
 }
 
 void perf_event_print_debug(void)
@@ -1388,12 +1397,16 @@ void x86_pmu_stop(struct perf_event *event, int flags)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = >hw;
 

Re: [PATCH V5 02/18] pinctrl: tegra: Add suspend and resume support

2019-06-28 Thread Sowjanya Komatineni



On 6/28/19 5:05 AM, Dmitry Osipenko wrote:

28.06.2019 14:56, Dmitry Osipenko пишет:

28.06.2019 5:12, Sowjanya Komatineni пишет:

This patch adds support for Tegra pinctrl driver suspend and resume.

During suspend, the context of all pinctrl registers is stored, and
on resume it is restored to re-establish the pinmux and pad
configuration for normal operation.
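
For readers, a rough sketch (not the actual patch body; the function names,
register stride, and backup scheme are assumptions) of what such
suspend/resume callbacks typically do, i.e. snapshot every pinmux register
and write the values back on resume:

#include <linux/io.h>
#include <linux/slab.h>

static u32 *pinctrl_reg_backup;

static int pinctrl_ctx_save(void __iomem *base, unsigned int nregs)
{
	unsigned int i;

	pinctrl_reg_backup = kcalloc(nregs, sizeof(*pinctrl_reg_backup),
				     GFP_KERNEL);
	if (!pinctrl_reg_backup)
		return -ENOMEM;
	for (i = 0; i < nregs; i++)
		pinctrl_reg_backup[i] = readl(base + i * 4);
	return 0;
}

static void pinctrl_ctx_restore(void __iomem *base, unsigned int nregs)
{
	unsigned int i;

	for (i = 0; i < nregs; i++)
		writel(pinctrl_reg_backup[i], base + i * 4);
	kfree(pinctrl_reg_backup);
}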

Acked-by: Thierry Reding 
Signed-off-by: Sowjanya Komatineni 
---
  int tegra_pinctrl_probe(struct platform_device *pdev,
const struct tegra_pinctrl_soc_data *soc_data);
  #endif
diff --git a/drivers/pinctrl/tegra/pinctrl-tegra210.c 
b/drivers/pinctrl/tegra/pinctrl-tegra210.c
index 0b56ad5c9c1c..edd3f4606cdb 100644
--- a/drivers/pinctrl/tegra/pinctrl-tegra210.c
+++ b/drivers/pinctrl/tegra/pinctrl-tegra210.c
@@ -1571,6 +1571,7 @@ static struct platform_driver tegra210_pinctrl_driver = {
.driver = {
.name = "tegra210-pinctrl",
.of_match_table = tegra210_pinctrl_of_match,
+   .pm = &tegra_pinctrl_pm,
},
.probe = tegra210_pinctrl_probe,
  };


Could you please address my comments in the next revision if there will be one?


Also, what about adding ".pm' for other Tegras? I'm sure Jon could test them 
for you.


This series is for Tegra210 SC7 entry/exit, along with the clock and pinctrl
suspend/resume support needed for basic Tegra210 SC7 entry and exit.


This includes the pinctrl, PMC, and clock-tegra210 driver changes, all of
which are specific to Tegra210 platforms.


Suspend/resume support for other Tegras will be in separate patch series.


thanks

Sowjanya



Re: [PATCH] ARC: ARCv2: jump label: implement jump label patching

2019-06-28 Thread Vineet Gupta
On 6/18/19 9:16 AM, Vineet Gupta wrote:
> On 6/14/19 9:41 AM, Eugeniy Paltsev wrote:
>> Implement jump label patching for ARC. Jump labels provide
>> an interface to generate dynamic branches using
>> self-modifying code.
>>
>> This allows us to implement conditional branches where
>> changing branch direction is expensive but branch selection
>> is basically 'free'.
>>
>> This implementation uses 32-bit NOP and BRANCH instructions
>> which are forced to be aligned to 4 bytes to guarantee that they don't
>> cross an L1 cache line and can be updated atomically.
>>
>> Signed-off-by: Eugeniy Paltsev 
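
For readers unfamiliar with the interface being implemented here, a minimal
sketch of a generic jump-label consumer (the key and function names are made
up; the static-key API itself is the one from include/linux/jump_label.h):

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(my_feature_key);

static void hot_path(void)
{
	/* Compiles to a NOP until the key is enabled, then to a branch. */
	if (static_branch_unlikely(&my_feature_key)) {
		/* rarely-enabled slow path */
	}
	/* fast path runs without a conditional branch */
}

static void enable_feature(void)
{
	static_branch_enable(&my_feature_key);	/* triggers code patching */
}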
> 
> LGTM overall - nits below.

@Peter can we have your reviewed/ack or some such assuming you don't have any
objections !

Thx,
-Vineet


Re: [PATCH AUTOSEL 4.19 14/60] mwifiex: Abort at too short BSS descriptor element

2019-06-28 Thread Brian Norris
On Wed, Jun 26, 2019 at 5:49 PM Sasha Levin  wrote:
>
> From: Takashi Iwai 
>
> [ Upstream commit 685c9b7750bfacd6fc1db50d86579980593b7869 ]
>
> Currently mwifiex_update_bss_desc_with_ie() implicitly assumes that
> the source descriptor entries contain the enough size for each type
> and performs copying without checking the source size.  This may lead
> to read over boundary.
>
> Fix this by putting the source size check in appropriate places.
>
> Signed-off-by: Takashi Iwai 
> Signed-off-by: Kalle Valo 
> Signed-off-by: Sasha Levin 

For the record, this fixup is still aiming for 5.2, correcting some
potential mistakes in this patch:

63d7ef36103d mwifiex: Don't abort on small, spec-compliant vendor IEs

So you might want to hold off a bit, and grab them both.

Brian


Re: net: check before dereferencing netdev_ops during busy poll

2019-06-28 Thread Sasha Levin

On Fri, Jun 28, 2019 at 06:34:58PM +0200, Matteo Croce wrote:

Hi,

Is there any reason for this panic fix not being applied in stable?

https://lore.kernel.org/netdev/20180313053248.13654-1-jelsas...@appneta.com/T/


What's the upstream commit id?

--
Thanks,
Sasha


[GIT PULL] RISC-V patches for v5.2-rc7

2019-06-28 Thread Paul Walmsley
Linus,

The following changes since commit 4b972a01a7da614b4796475f933094751a295a2f:

  Linux 5.2-rc6 (2019-06-22 16:01:36 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git 
tags/riscv-for-v5.2/fixes-rc7

for you to fetch changes up to 0db7f5cd4aeba4cc63d0068598b3350eba8bb4cd:

  riscv: mm: Fix code comment (2019-06-26 15:10:30 -0700)


Minor RISC-V fixes and one defconfig update for the v5.2-rc series.

The fixes have no functional impact:

- Fix some comment text in the memory management vmalloc_fault path.

- Fix some warnings from the DT compiler in our newly-added DT files.

- Change the newly-added DT bindings such that SoC IP blocks with
  external I/O are marked as "disabled" by default, then enable them
  explicitly in board DT files when the devices are used on the board.
  This aligns the bindings with existing upstream practice.

- Add the MIT license as an option for a minor header file, at the
  request of one of the U-Boot maintainers.

The RISC-V defconfig update builds the SiFive SPI driver and the
MMC-SPI driver by default.  The intention here is to make v5.2 more
usable for testers and users with RISC-V hardware.


Atish Patra (1):
  RISC-V: defconfig: enable MMC & SPI for RISC-V

Paul Walmsley (2):
  dt-bindings: riscv: resolve 'make dt_binding_check' warnings
  dt-bindings: clock: sifive: add MIT license as an option for the header 
file

ShihPo Hung (1):
  riscv: mm: Fix code comment

Yash Shah (1):
  riscv: dts: Re-organize the DT nodes

 Documentation/devicetree/bindings/riscv/cpus.yaml  | 26 --
 arch/riscv/boot/dts/sifive/fu540-c000.dtsi |  6 +
 .../riscv/boot/dts/sifive/hifive-unleashed-a00.dts | 13 +++
 arch/riscv/configs/defconfig   |  5 +
 arch/riscv/mm/fault.c  |  3 ---
 include/dt-bindings/clock/sifive-fu540-prci.h  |  2 +-
 6 files changed, 39 insertions(+), 16 deletions(-)


Re: [PATCH v3 3/7] sched: rotate the cpu search window for better spread

2019-06-28 Thread Subhra Mazumdar



On 6/28/19 4:54 AM, Srikar Dronamraju wrote:

* subhra mazumdar  [2019-06-26 18:29:15]:


Rotate the cpu search window for better spread of threads. This will ensure
an idle cpu will quickly be found if one exists.

While rotating the cpu search window is good, I am not sure if this can find an
idle cpu quickly. The probability of finding an idle cpu should still remain
the same. No?


Signed-off-by: subhra mazumdar 
---
  kernel/sched/fair.c | 10 --
  1 file changed, 8 insertions(+), 2 deletions(-)

@@ -6219,9 +6219,15 @@ static int select_idle_cpu(struct task_struct *p, struct 
sched_domain *sd, int t
}
}
  
+	if (per_cpu(next_cpu, target) != -1)

+   target_tmp = per_cpu(next_cpu, target);
+   else
+   target_tmp = target;
+
time = local_clock();
  
-	for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {

+   for_each_cpu_wrap(cpu, sched_domain_span(sd), target_tmp) {
+   per_cpu(next_cpu, target) = cpu;

Shouldn't this assignment be outside the for loop?
With the current code,
1. We keep reassigning multiple times.
2. The last assignment happens for an idle cpu, and sometimes the
assignment is for a non-idle cpu.

We want the last assignment irrespective of whether it was an idle cpu or not,
since in both cases we want to track the boundary of the search.
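
To make the boundary-tracking intent concrete, here is a toy userspace C
sketch (not kernel code; the names and numbers are made up) of a search
window that resumes where the previous scan stopped:

#include <stdio.h>

#define NCPUS 8

static int next_start;	/* plays the role of per_cpu(next_cpu, target) */

static int find_idle(const int idle[NCPUS], int budget)
{
	int base = next_start;
	int i, cpu, found = -1;

	for (i = 0; i < budget; i++) {
		cpu = (base + i) % NCPUS;
		next_start = (cpu + 1) % NCPUS;	/* remember the search boundary */
		if (idle[cpu]) {
			found = cpu;
			break;
		}
	}
	return found;
}

int main(void)
{
	int idle[NCPUS] = { 0, 0, 0, 0, 1, 0, 1, 0 };

	printf("first:  %d\n", find_idle(idle, 4));	/* scans 0..3, none idle */
	printf("second: %d\n", find_idle(idle, 4));	/* resumes at 4, finds 4 */
	return 0;
}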

Thanks,
Subhra


Re: [PATCH v3 5/7] sched: SIS_CORE to disable idle core search

2019-06-28 Thread Subhra Mazumdar



On 6/28/19 12:01 PM, Parth Shah wrote:


On 6/27/19 6:59 AM, subhra mazumdar wrote:

Use SIS_CORE to disable idle core search. For some workloads
select_idle_core becomes a scalability bottleneck, removing it improves
throughput. Also there are workloads where disabling it can hurt latency,
so need to have an option.

Signed-off-by: subhra mazumdar 
---
  kernel/sched/fair.c | 8 +---
  1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c1ca88e..6a74808 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6280,9 +6280,11 @@ static int select_idle_sibling(struct task_struct *p, 
int prev, int target)
if (!sd)
return target;

-   i = select_idle_core(p, sd, target);
-   if ((unsigned)i < nr_cpumask_bits)
-   return i;
+   if (sched_feat(SIS_CORE)) {
+   i = select_idle_core(p, sd, target);
+   if ((unsigned)i < nr_cpumask_bits)
+   return i;
+   }

This can cause a significant performance loss if disabled. select_idle_core
spreads workloads quickly across the cores, hence disabling it leaves much
of the work to be offloaded to the load balancer to move tasks across the
cores. Latency-sensitive and long-running multi-threaded workloads should
see a regression under these conditions.

Yes in case of SPARC SMT8 I did notice that (see cover letter). That's why
it is a feature that is ON by default, but can be turned OFF for specific
workloads on x86 SMT2 that can benefit from it.

Also, systems like POWER9 have sd_llc as a pair of cores only. So they
won't benefit from the limits, and hence hiding your code in select_idle_cpu
behind static keys would be much preferred.

If it doesn't hurt then I don't see the point.

Thanks,
Subhra



Re: [RFC PATCH 03/12] powerpc/prom_init: Add the ESM call to prom_init

2019-06-28 Thread Thiago Jung Bauermann


Hello Alexey,

Thanks for reviewing this patch!

Alexey Kardashevskiy  writes:

> On 21/05/2019 14:49, Thiago Jung Bauermann wrote:
>> @@ -1707,6 +1723,43 @@ static void __init prom_close_stdin(void)
>>  }
>>  }
>>  
>> +#ifdef CONFIG_PPC_SVM
>> +static int prom_rtas_os_term_hcall(uint64_t args)
>
>
> This is just an rtas hcall, nothing special about "os-term".

Sorry, unfortunately I don't understand how we're treating os-term
specially. Do you mean that we should inline this function directly
into prom_rtas_os_term()?

>> +{
>> +register uint64_t arg1 asm("r3") = 0xf000;
>> +register uint64_t arg2 asm("r4") = args;
>> +
>> +asm volatile("sc 1\n" : "=r" (arg1) :
>> +"r" (arg1),
>> +"r" (arg2) :);
>> +return arg1;
>> +}
>> +
>> +static struct rtas_args __prombss os_term_args;
>> +
>> +static void __init prom_rtas_os_term(char *str)
>> +{
>> +phandle rtas_node;
>> +__be32 val;
>> +u32 token;
>> +
>> +prom_printf("%s: start...\n", __func__);
>> +rtas_node = call_prom("finddevice", 1, 1, ADDR("/rtas"));
>> +prom_printf("rtas_node: %x\n", rtas_node);
>> +if (!PHANDLE_VALID(rtas_node))
>> +return;
>> +
>> +val = 0;
>> +prom_getprop(rtas_node, "ibm,os-term", &val, sizeof(val));
>> +token = be32_to_cpu(val);
>> +prom_printf("ibm,os-term: %x\n", token);
>> +if (token == 0)
>> +prom_panic("Could not get token for ibm,os-term\n");
>> +os_term_args.token = cpu_to_be32(token);
>> +prom_rtas_os_term_hcall((uint64_t)&os_term_args);
>> +}
>> +#endif /* CONFIG_PPC_SVM */
>> +
>>  /*
>>   * Allocate room for and instantiate RTAS
>>   */

-- 
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH 11/12] iomap: move the xfs writeback code to iomap.c

2019-06-28 Thread Luis Chamberlain
On Fri, Jun 28, 2019 at 10:45:42AM +1000, Dave Chinner wrote:
> On Tue, Jun 25, 2019 at 12:10:20PM +0200, Christoph Hellwig wrote:
> > On Tue, Jun 25, 2019 at 09:43:04AM +1000, Dave Chinner wrote:
> > > I'm a little concerned this is going to limit what we can do
> > > with the XFS IO path because now we can't change this code without
> > > considering the direct impact on other filesystems. The QA burden of
> > > changing the XFS writeback code goes through the roof with this
> > > change (i.e. we can break multiple filesystems, not just XFS).
> > 
> > Going through the roof is a little exaggerated.
> 
> You've already mentioned two new users you want to add. I don't even
> have zone capable hardware here to test one of the users you are
> indicating will use this code, and I suspect that very few people
> do.  That's a non-trivial increase in testing requirements for
> filesystem developers and distro QA departments who will want to
> change and/or validate this code path.

A side topic here:

Looking towards future prospects here with regard to helping QA
and developers with more confidence in API changes (kunit is one
prospect we're evaluating)...

If... we could somehow... codify what XFS *requires* from the API
precisely...  would that help alleviate concerns or bring confidence in
the prospect of sharing code?

Or is it simply an *impossibility* to address these concerns in question by
codifying tests for the promised API?

Ie, are the concerns something which could be addressed with strict
testing on adherence to an API, or are the concerns *unknown* side
dependencies which could not possibly be codified?

As an example of the extent to which an API promise can be codified (although
I believe it was unintentional at first), see:

http://lkml.kernel.org/r/20190626021744.gu19...@42.do-not-panic.com

  Luis


Re: [PATCH v3 1/7] sched: limit cpu search in select_idle_cpu

2019-06-28 Thread Subhra Mazumdar



On 6/28/19 11:47 AM, Parth Shah wrote:


On 6/27/19 6:59 AM, subhra mazumdar wrote:

Put upper and lower limit on cpu search of select_idle_cpu. The lower limit
is amount of cpus in a core while upper limit is twice that. This ensures
for any architecture we will usually search beyond a core. The upper limit
also helps in keeping the search cost low and constant.

Signed-off-by: subhra mazumdar 
---
  kernel/sched/fair.c | 15 +++
  1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f35930f..b58f08f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6188,7 +6188,7 @@ static int select_idle_cpu(struct task_struct *p, struct 
sched_domain *sd, int t
u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
-   int cpu, nr = INT_MAX;
+   int cpu, limit, floor, nr = INT_MAX;

this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -6206,10 +6206,17 @@ static int select_idle_cpu(struct task_struct *p, 
struct sched_domain *sd, int t

if (sched_feat(SIS_PROP)) {
u64 span_avg = sd->span_weight * avg_idle;
-   if (span_avg > 4*avg_cost)
+   floor = cpumask_weight(topology_sibling_cpumask(target));
+   if (floor < 2)
+   floor = 2;
+   limit = floor << 1;

Is the upper limit an experimental value only, or does it have any arch-specific
significance?
Because, AFAIU, systems like POWER9 might benefit from searching 4 cores
due to their different cache model. So it could be tuned for arch-specific builds
then.

The lower bound and upper bound are 1 core and 2 cores respectively. That
is done so as to search beyond one core while at the same time not searching
too much. It is a heuristic that seemed to work well on all archs, coupled
with the moving window mechanism. Does 4 vs 2 make any difference on your
POWER9? AFAIR it didn't on SPARC SMT8.
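
As a worked example (numbers purely illustrative): on an SMT-4 part floor = 4
and limit = 8, so the window below scans at least 4 and at most 8 CPUs, with
nr = span_avg / avg_cost picking the point in between.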


Also variable names can be changed for better readability.
floor -> weight_clamp_min
limit -> weight_clamp_max
or something similar

OK.

Thanks,
Subhra




+   if (span_avg > floor*avg_cost) {
nr = div_u64(span_avg, avg_cost);
-   else
-   nr = 4;
+   if (nr > limit)
+   nr = limit;
+   } else {
+   nr = floor;
+   }
}

time = local_clock();



Best,
Parth



Re: [RFC] Deadlock via recursive wakeup via RCU with threadirqs

2019-06-28 Thread Paul E. McKenney
On Fri, Jun 28, 2019 at 05:40:18PM -0400, Joel Fernandes wrote:
> Hi Paul,
> 
> On Fri, Jun 28, 2019 at 01:04:23PM -0700, Paul E. McKenney wrote:
> [snip]
> > > > > Commit
> > > > > - 23634ebc1d946 ("rcu: Check for wakeup-safe conditions in
> > > > >rcu_read_unlock_special()") does not trigger the bug within 94
> > > > >attempts.
> > > > > 
> > > > > - 48d07c04b4cc1 ("rcu: Enable elimination of Tree-RCU softirq
> > > > >   processing") needed 12 attempts to trigger the bug.
> > > > 
> > > > That matches my belief that 23634ebc1d946 ("rcu: Check for wakeup-safe
> > > > conditions in rcu_read_unlock_special()") will at least greatly decrease
> > > > the probability of this bug occurring.
> > > 
> > > I was just typing a reply that I can't reproduce it with:
> > >   rcu: Check for wakeup-safe conditions in rcu_read_unlock_special()
> > > 
> > > I am trying to revert enough of this patch to see what would break things,
> > > however I think a better exercise might be to understand more what the 
> > > patch
> > > does and why it fixes things in the first place ;-) It is probably the
> > > deferred_qs thing.
> > 
> > The deferred_qs flag is part of it!  Looking forward to hearing what
> > you come up with as being the critical piece of this commit.
> 
> The new deferred_qs flag indeed saves the machine from the dead-lock.
> 
> If we don't want the deferred_qs, then the below patch also fixes the issue.
> However, I am more sure than not that it does not handle all cases (such as
> what if we previously had an expedited grace period IPI in a previous reader
> section and had to defer processing; then it seems a similar deadlock
> would present itself). But anyway, the below patch does fix it for me! It is based on
> your -rcu tree commit 23634ebc1d946f19eb112d4455c1d84948875e31 (rcu: Check
> for wakeup-safe conditions in rcu_read_unlock_special()).

The point here being that you rely on .b.blocked rather than
.b.deferred_qs.  Hmmm...  There are a number of places that check all
the bits via the .s leg of the rcu_special union.  The .s check in
rcu_preempt_need_deferred_qs() should be OK because it is conditioned
on t->rcu_read_lock_nesting of zero or negative.

Do the rest of those also work out OK?

It would be nice to remove the flag, but doing so clearly needs careful
review and testing.

Thanx, Paul

> ---8<---
> 
> From: "Joel Fernandes (Google)" 
> Subject: [PATCH] Fix RCU recursive deadlock
> 
> Signed-off-by: Joel Fernandes (Google) 
> ---
>  include/linux/sched.h|  2 +-
>  kernel/rcu/tree_plugin.h | 17 +
>  2 files changed, 14 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 942a44c1b8eb..347e6dfcc91b 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -565,7 +565,7 @@ union rcu_special {
>   u8  blocked;
>   u8  need_qs;
>   u8  exp_hint; /* Hint for performance. */
> - u8  deferred_qs;
> + u8  pad;
>   } b; /* Bits. */
>   u32 s; /* Set of bits. */
>  };
> diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> index 75110ea75d01..5b9b12c1ba5c 100644
> --- a/kernel/rcu/tree_plugin.h
> +++ b/kernel/rcu/tree_plugin.h
> @@ -455,7 +455,6 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, 
> unsigned long flags)
>   local_irq_restore(flags);
>   return;
>   }
> - t->rcu_read_unlock_special.b.deferred_qs = false;
>   if (special.b.need_qs) {
>   rcu_qs();
>   t->rcu_read_unlock_special.b.need_qs = false;
> @@ -608,13 +607,24 @@ static void rcu_read_unlock_special(struct task_struct 
> *t)
>   if (preempt_bh_were_disabled || irqs_were_disabled) {
>   t->rcu_read_unlock_special.b.exp_hint = false;
>   // Need to defer quiescent state until everything is enabled.
> +
> + /* If unlock_special was called in the current reader section
> +  * just because we were blocked in a previous reader section,
> +  * then raising softirqs can deadlock. This is because the
> +  * scheduler executes RCU sections with preemption disabled,
> +  * however it may have previously blocked in a previous
> +  * non-scheduler reader section and .blocked got set.  It is
> +  * never safe to call unlock_special from the scheduler path
> +  * due to recursive wake ups (unless we are in_irq()), so
> +  * prevent this by checking if we were previously blocked.
> +  */
>   if (irqs_were_disabled && use_softirq &&
> - (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) {
> + (!t->rcu_read_unlock_special.b.blocked || in_irq())) {
>  

Re: [PATCH v3 3/7] sched: rotate the cpu search window for better spread

2019-06-28 Thread Subhra Mazumdar



On 6/28/19 11:36 AM, Parth Shah wrote:

Hi Subhra,

I ran your patch series on IBM POWER systems and this is what I have observed.

On 6/27/19 6:59 AM, subhra mazumdar wrote:

Rotate the cpu search window for better spread of threads. This will ensure
an idle cpu will quickly be found if one exists.

Signed-off-by: subhra mazumdar 
---
  kernel/sched/fair.c | 10 --
  1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b58f08f..c1ca88e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6188,7 +6188,7 @@ static int select_idle_cpu(struct task_struct *p, struct 
sched_domain *sd, int t
u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
-   int cpu, limit, floor, nr = INT_MAX;
+   int cpu, limit, floor, target_tmp, nr = INT_MAX;

	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -6219,9 +6219,15 @@ static int select_idle_cpu(struct task_struct *p, struct 
sched_domain *sd, int t
}
}

+   if (per_cpu(next_cpu, target) != -1)
+   target_tmp = per_cpu(next_cpu, target);
+   else
+   target_tmp = target;
+
time = local_clock();

-   for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+   for_each_cpu_wrap(cpu, sched_domain_span(sd), target_tmp) {
+   per_cpu(next_cpu, target) = cpu;

This leads to a problem of cache hotness.
AFAIU, in most cases `target = prev_cpu` of the task being woken up, and
selecting an idle CPU nearest to the prev_cpu is favorable.
But since this doesn't keep track of the last idle cpu per task, it fails to
find the nearest possible idle CPU in cases when the task is woken up after
another task has been scheduled.


I had tested hackbench on SPARC SMT8 (see numbers in the cover letter) and
it showed improvement with this. Firstly, it's a tradeoff between cache effects
vs time spent searching for an idle CPU, and both the x86 and SPARC numbers
showed the tradeoff is worth it. Secondly, there is a lot of cache affinity
logic at the beginning of select_idle_sibling. If select_idle_cpu is still
called, that means we are past that and want any idle cpu. I don't think waking
up close to the prev cpu is the intention for starting the search from there;
rather it is to spread threads across all cpus so that no cpu gets victimized,
as there is no atomicity. Prev cpu just acts as a good seed to do the
spreading.

Thanks,
Subhra


[PATCH] perf tools: Fix typos / broken sentences

2019-06-28 Thread Andi Kleen
From: Andi Kleen 

- Fix a typo in the man page
- Fix a tip that doesn't make any sense.

Signed-off-by: Andi Kleen 
---
 tools/perf/Documentation/perf-report.txt | 2 +-
 tools/perf/Documentation/tips.txt| 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-report.txt 
b/tools/perf/Documentation/perf-report.txt
index 8c4372819e11..987261d158d4 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -89,7 +89,7 @@ OPTIONS
- socket: processor socket number the task ran at the time of sample
- srcline: filename and line number executed at the time of sample.  The
DWARF debugging info must be provided.
-   - srcfile: file name of the source file of the same. Requires dwarf
+   - srcfile: file name of the source file of the samples. Requires dwarf
information.
- weight: Event specific weight, e.g. memory latency or transaction
abort cost. This is the global weight.
diff --git a/tools/perf/Documentation/tips.txt 
b/tools/perf/Documentation/tips.txt
index 869965d629ce..825745a645c1 100644
--- a/tools/perf/Documentation/tips.txt
+++ b/tools/perf/Documentation/tips.txt
@@ -38,6 +38,6 @@ To report cacheline events from previous recording: perf c2c 
report
 To browse sample contexts use perf report --sample 10 and select in context 
menu
 To separate samples by time use perf report --sort time,overhead,sym
 To set sample time separation other than 100ms with --sort time use 
--time-quantum
-Add -I to perf report to sample register values visible in perf report context.
+Add -I to perf record to sample register values, which will be visible in perf 
report sample context.
 To show IPC for sampling periods use perf record -e '{cycles,instructions}:S' 
and then browse context
 To show context switches in perf report sample context add --switch-events to 
perf record.
-- 
2.20.1



[PATCH 3/3] perf tools metric: Don't include duration_time in group

2019-06-28 Thread Andi Kleen
From: Andi Kleen 

The Memory_BW metric generates groups including duration_time,
which maps to a software event.

For some reason this makes the group never count.

Always put duration_time outside a group when generating metrics.
It's always the same time, so no need to group it.
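
For example (event names purely illustrative): instead of emitting

  {unc_m_cas_count_rd,unc_m_cas_count_wr,duration_time}:W

the generated string now looks roughly like

  {unc_m_cas_count_rd,unc_m_cas_count_wr}:W,duration_time

so the software event no longer sits inside the hardware group.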

Signed-off-by: Andi Kleen 
---
 tools/perf/util/metricgroup.c | 19 +--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 1c85c9624c58..17ec05e17e7b 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -410,6 +410,7 @@ static int metricgroup__add_metric(const char *metric, 
struct strbuf *events,
const char **ids;
int idnum;
struct egroup *eg;
+   bool no_group = false;
 
pr_debug("metric expr %s for %s\n", pe->metric_expr, 
pe->metric_name);
 
@@ -420,11 +421,25 @@ static int metricgroup__add_metric(const char *metric, 
struct strbuf *events,
strbuf_addf(events, ",");
for (j = 0; j < idnum; j++) {
pr_debug("found event %s\n", ids[j]);
+   /*
+* Duration time maps to a software event and 
can make
+* groups not count. Always use it outside a
+* group.
+*/
+   if (!strcmp(ids[j], "duration_time")) {
+   if (j > 0)
+   strbuf_addf(events, "}:W,");
+   strbuf_addf(events, "duration_time");
+   no_group = true;
+   continue;
+   }
strbuf_addf(events, "%s%s",
-   j == 0 ? "{" : ",",
+   j == 0 || no_group ? "{" : ",",
ids[j]);
+   no_group = false;
}
-   strbuf_addf(events, "}:W");
+   if (!no_group)
+   strbuf_addf(events, "}:W");
 
eg = malloc(sizeof(struct egroup));
if (!eg) {
-- 
2.20.1



[PATCH 2/3] perf list: Avoid extra : for --raw metrics

2019-06-28 Thread Andi Kleen
From: Andi Kleen 

When printing the metrics raw, don't print ':' after the metric group names.
This helps command-line completion complete those too.
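
For example (names illustrative), the raw dump previously printed

  Memory_BW: DRAM_BW_Use DRAM_Parallel_Reads

and now prints

  Memory_BW DRAM_BW_Use DRAM_Parallel_Reads

so completion scripts can split the output on whitespace directly.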

Signed-off-by: Andi Kleen 
---
 tools/perf/util/metricgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index fabdb6dde88e..1c85c9624c58 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -376,7 +376,7 @@ void metricgroup__print(bool metrics, bool metricgroups, 
char *filter,
struct mep *me = container_of(node, struct mep, nd);
 
if (metricgroups)
-   printf("%s%s%s", me->name, metrics ? ":" : "", raw ? " 
" : "\n");
+   printf("%s%s%s", me->name, metrics && !raw ? ":" : "", 
raw ? " " : "\n");
if (metrics)
metricgroup__print_strlist(me->metrics, raw);
next = rb_next(node);
-- 
2.20.1



[PATCH 1/3] perf vendor events intel: Metric fixes for SKX/CLX

2019-06-28 Thread Andi Kleen
From: Andi Kleen 

- Add a missing filter for the DRAM_Latency / DRAM_Parallel_Reads metrics
- Remove the useless PMM_* metrics from Skylake

Signed-off-by: Andi Kleen 
---
 .../arch/x86/cascadelakex/clx-metrics.json|  4 ++--
 .../arch/x86/skylakex/skx-metrics.json| 22 ++-
 2 files changed, 4 insertions(+), 22 deletions(-)

diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json 
b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
index 1a1a3501180a..a382b115633d 100644
--- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
@@ -314,13 +314,13 @@
 "MetricName": "DRAM_BW_Use"
 },
 {
-"MetricExpr": "10 * ( cha@event\\=0x36\\,umask\\=0x21@ / 
cha@event\\=0x35\\,umask\\=0x21@ ) / ( cha_0@event\\=0x0@ / duration_time 
)",
+   "MetricExpr": "10 * ( 
cha@event\\=0x36\\,umask\\=0x21\\,config\\=0x40433@ / 
cha@event\\=0x35\\,umask\\=0x21\\,config\\=0x40433@ ) / ( 
cha_0@event\\=0x0@ / duration_time )",
 "BriefDescription": "Average latency of data read request to external 
memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches",
 "MetricGroup": "Memory_Lat",
 "MetricName": "DRAM_Read_Latency"
 },
 {
-"MetricExpr": "cha@event\\=0x36\\,umask\\=0x21@ / 
cha@event\\=0x36\\,umask\\=0x21\\,thresh\\=1@",
+   "MetricExpr": 
"cha@event\\=0x36\\,umask\\=0x21\\,config\\=0x40433@ / 
cha@event\\=0x36\\,umask\\=0x21\\,thresh\\=1\\,config\\=0x40433@",
 "BriefDescription": "Average number of parallel data read requests to 
external memory. Accounts for demand loads and L1/L2 prefetches",
 "MetricGroup": "Memory_BW",
 "MetricName": "DRAM_Parallel_Reads"
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json 
b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json
index 56e03ba771f4..35b255fa6a79 100644
--- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json
@@ -314,35 +314,17 @@
 "MetricName": "DRAM_BW_Use"
 },
 {
-"MetricExpr": "10 * ( cha@event\\=0x36\\,umask\\=0x21@ / 
cha@event\\=0x35\\,umask\\=0x21@ ) / ( cha_0@event\\=0x0@ / duration_time 
)",
+   "MetricExpr": "10 * ( 
cha@event\\=0x36\\,umask\\=0x21\\,config\\=0x40433@ / 
cha@event\\=0x35\\,umask\\=0x21\\,config\\=0x40433@ ) / ( 
cha_0@event\\=0x0@ / duration_time )",
 "BriefDescription": "Average latency of data read request to external 
memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches",
 "MetricGroup": "Memory_Lat",
 "MetricName": "DRAM_Read_Latency"
 },
 {
-"MetricExpr": "cha@event\\=0x36\\,umask\\=0x21@ / 
cha@event\\=0x36\\,umask\\=0x21\\,thresh\\=1@",
+   "MetricExpr": 
"cha@event\\=0x36\\,umask\\=0x21\\,config\\=0x40433@ / 
cha@event\\=0x36\\,umask\\=0x21\\,thresh\\=1\\,config\\=0x40433@",
 "BriefDescription": "Average number of parallel data read requests to 
external memory. Accounts for demand loads and L1/L2 prefetches",
 "MetricGroup": "Memory_BW",
 "MetricName": "DRAM_Parallel_Reads"
 },
-{
-"MetricExpr": "( 10 * ( imc@event\\=0xe0\\,umask\\=0x1@ / 
imc@event\\=0xe3@ ) / imc_0@event\\=0x0@ ) if 1 if 0 == 1 else 0 else 0",
-"BriefDescription": "Average latency of data read request to external 
3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 
data-read prefetches",
-"MetricGroup": "Memory_Lat",
-"MetricName": "MEM_PMM_Read_Latency"
-},
-{
-"MetricExpr": "( ( 64 * imc@event\\=0xe3@ / 10 ) / 
duration_time ) if 1 if 0 == 1 else 0 else 0",
-"BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / 
sec]",
-"MetricGroup": "Memory_BW",
-"MetricName": "PMM_Read_BW"
-},
-{
-"MetricExpr": "( ( 64 * imc@event\\=0xe7@ / 10 ) / 
duration_time ) if 1 if 0 == 1 else 0 else 0",
-"BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB 
/ sec]",
-"MetricGroup": "Memory_BW",
-"MetricName": "PMM_Write_BW"
-},
 {
 "MetricExpr": "cha_0@event\\=0x0@",
 "BriefDescription": "Socket actual clocks when any core is active on 
that socket",
-- 
2.20.1



Re: [PATCH] rbtree: avoid generating code twice for the cached versions

2019-06-28 Thread Michel Lespinasse
Hi Davidlohr,

On Fri, Jun 28, 2019 at 9:56 AM Davidlohr Bueso  wrote:
> I think this makes sense, and is more along the lines of the augmented
> cached doing the static inline instead of separate instantiations of the
> calls.

Thanks for the review.

> >Change-Id: I0cb62be774fc0138b81188e6ae81d5f1da64578d
> what is this?

Gerrit code review insists on having this footer on every commit. I
forgot to remove it before submitting. Please ignore it :)

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.


Re: [PATCH] clk: qoriq: Fix -Wunused-const-variable

2019-06-28 Thread Scott Wood
On Thu, 2019-06-27 at 15:06 -0700, Nathan Huckleberry wrote:
> drivers/clk/clk-qoriq.c:138:38: warning: unused variable
> 'p5020_cmux_grp1' [-Wunused-const-variable] static const struct
> clockgen_muxinfo p5020_cmux_grp1
> 
> drivers/clk/clk-qoriq.c:146:38: warning: unused variable
> 'p5020_cmux_grp2' [-Wunused-const-variable] static const struct
> clockgen_muxinfo p5020_cmux_grp2
> 
> In the definition of the p5020 chip, the p2041 chip's info was used
> instead.  The p5020 and p2041 chips have different info. This is most
> likely a typo.
> 
> Link: https://github.com/ClangBuiltLinux/linux/issues/525
> Cc: clang-built-li...@googlegroups.com
> Signed-off-by: Nathan Huckleberry 

Acked-by: Scott Wood 

-Scott




[PATCH 2/2 RESEND3] perf/x86/amd/uncore: set the thread mask for F17h L3 PMCs

2019-06-28 Thread Phillips, Kim
From: Kim Phillips 

Fill in the L3 performance event select register ThreadMask
bitfield, to enable per hardware thread accounting.
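
As a worked example (CPU numbers purely illustrative): with SMT enabled, a CPU
whose cpu_core_id is 5 and whose APIC ID is odd yields thread = 2 * (5 % 4) + 1
= 3, so only the bit at (AMD64_L3_THREAD_SHIFT + 3) ends up set in ThreadMask.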

Signed-off-by: Kim Phillips 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Gleixner 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Martin Liska 
Cc: Suravee Suthikulpanit 
Cc: Janakarajan Natarajan 
Cc: Gary Hook 
Cc: Pu Wen 
Cc: Stephane Eranian 
Cc: Vince Weaver 
Cc: x...@kernel.org
---
RESEND3: file sent with header:

Content-Type: text/plain; charset="us-ascii"

to work around a bug in the Microsoft Outlook SMTP servers.

 arch/x86/events/amd/uncore.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index c2c4ae5fbbfc..a6ea07f2aa84 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -202,15 +202,22 @@ static int amd_uncore_event_init(struct perf_event *event)
hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
hwc->idx = -1;
 
+   if (event->cpu < 0)
+   return -EINVAL;
+
/*
 * SliceMask and ThreadMask need to be set for certain L3 events in
 * Family 17h. For other events, the two fields do not affect the count.
 */
-   if (l3_mask && is_llc_event(event))
-   hwc->config |= (AMD64_L3_SLICE_MASK | AMD64_L3_THREAD_MASK);
+   if (l3_mask && is_llc_event(event)) {
+   int thread = 2 * (cpu_data(event->cpu).cpu_core_id % 4);
 
-   if (event->cpu < 0)
-   return -EINVAL;
+   if (smp_num_siblings > 1)
+   thread += cpu_data(event->cpu).apicid & 1;
+
+   hwc->config |= (1ULL << (AMD64_L3_THREAD_SHIFT + thread) &
+   AMD64_L3_THREAD_MASK) | AMD64_L3_SLICE_MASK;
+   }
 
uncore = event_to_amd_uncore(event);
if (!uncore)
-- 
2.22.0



[PATCH 1/2 RESEND3] perf/x86/amd/uncore: Do not set ThreadMask and SliceMask for non-L3 PMCs

2019-06-28 Thread Phillips, Kim
From: Kim Phillips 

Commit d7cbbe49a930 ("perf/x86/amd/uncore: Set ThreadMask and SliceMask
for L3 Cache perf events") enables L3 PMC events for all threads and
slices by writing 1s in ChL3PmcCfg (L3 PMC PERF_CTL) register fields.

Those bitfields overlap with high order event select bits in the Data
Fabric PMC control register, however.

So when a user requests raw Data Fabric events (-e amd_df/event=0xYYY/),
the two highest order bits get inadvertently set, changing the counter
select to events that don't exist, and for which no counts are read.

This patch changes the logic to write the L3 masks only when dealing
with L3 PMC counters.

AMD Family 16h and below Northbridge (NB) counters were not affected.

Signed-off-by: Kim Phillips 
Cc:  # v4.19+
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Arnaldo Carvalho de Melo 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Thomas Gleixner 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Martin Liska 
Cc: Suravee Suthikulpanit 
Cc: Janakarajan Natarajan 
Cc: Gary Hook 
Cc: Pu Wen 
Cc: Stephane Eranian 
Cc: Vince Weaver 
Cc: x...@kernel.org
Fixes: d7cbbe49a930 ("perf/x86/amd/uncore: Set ThreadMask and SliceMask for L3 
Cache perf events")
---
RESEND3: file sent with header:

Content-Type: text/plain; charset="us-ascii"

to work around a bug in the Microsoft Outlook SMTP servers.

 arch/x86/events/amd/uncore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 85e6984c560b..c2c4ae5fbbfc 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -206,7 +206,7 @@ static int amd_uncore_event_init(struct perf_event *event)
 * SliceMask and ThreadMask need to be set for certain L3 events in
 * Family 17h. For other events, the two fields do not affect the count.
 */
-   if (l3_mask)
+   if (l3_mask && is_llc_event(event))
hwc->config |= (AMD64_L3_SLICE_MASK | AMD64_L3_THREAD_MASK);
 
if (event->cpu < 0)
-- 
2.22.0



Re: [PATCH net v2] net: mvpp2: prs: Don't override the sign bit in SRAM parser shift

2019-06-28 Thread David Miller
From: Maxime Chevallier 
Date: Tue, 25 Jun 2019 14:04:12 +0200

> I see that this patch was set as "Accepted" on patchwork, but hasn't
> made it to -net, I was wondering if this patch slipped through the
> cracks :)
> 
> https://patchwork.ozlabs.org/patch/1119311/

It should really be there now.

I don't know how that happened, honestly ;)


Re: [RFC] Deadlock via recursive wakeup via RCU with threadirqs

2019-06-28 Thread Joel Fernandes
Hi Paul,

On Fri, Jun 28, 2019 at 01:04:23PM -0700, Paul E. McKenney wrote:
[snip]
> > > > Commit
> > > > - 23634ebc1d946 ("rcu: Check for wakeup-safe conditions in
> > > >rcu_read_unlock_special()") does not trigger the bug within 94
> > > >attempts.
> > > > 
> > > > - 48d07c04b4cc1 ("rcu: Enable elimination of Tree-RCU softirq
> > > >   processing") needed 12 attempts to trigger the bug.
> > > 
> > > That matches my belief that 23634ebc1d946 ("rcu: Check for wakeup-safe
> > > conditions in rcu_read_unlock_special()") will at least greatly decrease
> > > the probability of this bug occurring.
> > 
> > I was just typing a reply that I can't reproduce it with:
> >   rcu: Check for wakeup-safe conditions in rcu_read_unlock_special()
> > 
> > I am trying to revert enough of this patch to see what would break things,
> > however I think a better exercise might be to understand more what the patch
> > does and why it fixes things in the first place ;-) It is probably the
> > deferred_qs thing.
> 
> The deferred_qs flag is part of it!  Looking forward to hearing what
> you come up with as being the critical piece of this commit.

The new deferred_qs flag indeed saves the machine from the dead-lock.

If we don't want the deferred_qs, then the below patch also fixes the issue.
However, I am more sure than not that it does not handle all cases (such as
what if we previously had an expedited grace period IPI in a previous reader
section and had to defer processing; then it seems a similar deadlock
would present itself). But anyway, the below patch does fix it for me! It is based on
your -rcu tree commit 23634ebc1d946f19eb112d4455c1d84948875e31 (rcu: Check
for wakeup-safe conditions in rcu_read_unlock_special()).

---8<---

From: "Joel Fernandes (Google)" 
Subject: [PATCH] Fix RCU recursive deadlock

Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/sched.h|  2 +-
 kernel/rcu/tree_plugin.h | 17 +
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 942a44c1b8eb..347e6dfcc91b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -565,7 +565,7 @@ union rcu_special {
u8  blocked;
u8  need_qs;
u8  exp_hint; /* Hint for performance. */
-   u8  deferred_qs;
+   u8  pad;
} b; /* Bits. */
u32 s; /* Set of bits. */
 };
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 75110ea75d01..5b9b12c1ba5c 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -455,7 +455,6 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, 
unsigned long flags)
local_irq_restore(flags);
return;
}
-   t->rcu_read_unlock_special.b.deferred_qs = false;
if (special.b.need_qs) {
rcu_qs();
t->rcu_read_unlock_special.b.need_qs = false;
@@ -608,13 +607,24 @@ static void rcu_read_unlock_special(struct task_struct *t)
if (preempt_bh_were_disabled || irqs_were_disabled) {
t->rcu_read_unlock_special.b.exp_hint = false;
// Need to defer quiescent state until everything is enabled.
+
+   /* If unlock_special was called in the current reader section
+* just because we were blocked in a previous reader section,
+* then raising softirqs can deadlock. This is because the
+* scheduler executes RCU sections with preemption disabled,
+* however it may have previously blocked in a previous
+* non-scheduler reader section and .blocked got set.  It is
+* never safe to call unlock_special from the scheduler path
+* due to recursive wake ups (unless we are in_irq()), so
+* prevent this by checking if we were previously blocked.
+*/
if (irqs_were_disabled && use_softirq &&
-   (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) {
+   (!t->rcu_read_unlock_special.b.blocked || in_irq())) {
// Using softirq, safe to awaken, and we get
// no help from enabling irqs, unlike bh/preempt.
raise_softirq_irqoff(RCU_SOFTIRQ);
} else if (irqs_were_disabled && !use_softirq &&
-  !t->rcu_read_unlock_special.b.deferred_qs) {
+  !t->rcu_read_unlock_special.b.blocked) {
// Safe to awaken and we get no help from enabling
// irqs, unlike bh/preempt.
invoke_rcu_core();
@@ -623,7 +633,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
set_tsk_need_resched(current);

Re: [PATCH v4 2/4] usb: xhci: Use register defined and field names

2019-06-28 Thread Mathias Nyman

On 26.6.2019 10.55, Vinod Koul wrote:

Instead of using register values and fields lets define them and
use in the driver.

Signed-off-by: Vinod Koul 
Cc: Yoshihiro Shimoda 
Cc: Christian Lamparter 
Tested-by: Christian Lamparter 
---
  drivers/usb/host/xhci-pci.c | 60 ++---
  1 file changed, 43 insertions(+), 17 deletions(-)

diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index 237df5c47fca..0f2574b42cb1 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -57,6 +57,27 @@
  #define PCI_DEVICE_ID_AMD_PROMONTORYA_1   0x43bc
  #define PCI_DEVICE_ID_ASMEDIA_1042A_XHCI  0x1142
  
+#define RENESAS_FW_VERSION		0x6C

+#define RENESAS_ROM_CONFIG 0xF0
+#define RENESAS_FW_STATUS  0xF4
+#define RENESAS_FW_STATUS_MSB  0xF5
+#define RENESAS_ROM_STATUS 0xF6
+#define RENESAS_ROM_STATUS_MSB 0xF7
+#define RENESAS_DATA0  0xF8
+#define RENESAS_DATA1  0xFC
+
+#define RENESAS_FW_VERSION_FIELD   GENMASK(23, 7)
+#define RENESAS_FW_VERSION_OFFSET  8
+
+#define RENESAS_FW_STATUS_DOWNLOAD_ENABLE  BIT(0)
+#define RENESAS_FW_STATUS_LOCK BIT(1)
+#define RENESAS_FW_STATUS_RESULT   GENMASK(6, 4)
+  #define RENESAS_FW_STATUS_INVALID	0
+  #define RENESAS_FW_STATUS_SUCCESS	BIT(4)
+  #define RENESAS_FW_STATUS_ERROR  BIT(5)
+#define RENESAS_FW_STATUS_SET_DATA0BIT(8)
+#define RENESAS_FW_STATUS_SET_DATA1BIT(9)
+
  #define RENESAS_RETRY 1
  #define RENESAS_DELAY 10
  
@@ -347,7 +368,8 @@ static int renesas_fw_download_image(struct pci_dev *dev,
  
  	/* step+1. Read "Set DATAX" and confirm it is cleared. */

for (i = 0; i < RENESAS_RETRY; i++) {
-   err = pci_read_config_byte(dev, 0xF5, &fw_status);
+   err = pci_read_config_byte(dev, RENESAS_FW_STATUS_MSB,
+  &fw_status);
if (err)
return pcibios_err_to_errno(err);
if (!(fw_status & BIT(data0_or_data1)))
@@ -362,7 +384,8 @@ static int renesas_fw_download_image(struct pci_dev *dev,
 * step+2. Write FW data to "DATAX".
 * "LSB is left" => force little endian
 */
-   err = pci_write_config_dword(dev, data0_or_data1 ? 0xFC : 0xF8,
+   err = pci_write_config_dword(dev, data0_or_data1 ?
+RENESAS_DATA1 : RENESAS_DATA0,
 (__force u32)cpu_to_le32(fw[step]));
if (err)
return pcibios_err_to_errno(err);
@@ -370,7 +393,8 @@ static int renesas_fw_download_image(struct pci_dev *dev,
udelay(100);
  
  	/* step+3. Set "Set DATAX". */

-   err = pci_write_config_byte(dev, 0xF5, BIT(data0_or_data1));
+   err = pci_write_config_byte(dev, RENESAS_FW_STATUS_MSB,
+   BIT(data0_or_data1));
if (err)
return pcibios_err_to_errno(err);
  
@@ -440,7 +464,7 @@ static int renesas_fw_check_running(struct pci_dev *pdev)

 * BIOSes will initialize the device for us. If the device is
 * initialized.
 */
-   err = pci_read_config_byte(pdev, 0xF4, &fw_state);
+   err = pci_read_config_byte(pdev, RENESAS_FW_STATUS, &fw_state);
if (err)
return pcibios_err_to_errno(err);
  
@@ -449,10 +473,10 @@ static int renesas_fw_check_running(struct pci_dev *pdev)

 * ready we can simply continue. If the FW is not ready, we have
 * to give up.
 */
-   if (fw_state & BIT(1)) {
+   if (fw_state & RENESAS_FW_STATUS_LOCK) {
		dev_dbg(&pdev->dev, "FW Download Lock is engaged.");
  
-		if (fw_state & BIT(4))

+   if (fw_state & RENESAS_FW_STATUS_SUCCESS)
return 0;
  
		dev_err(&pdev->dev,

@@ -465,33 +489,33 @@ static int renesas_fw_check_running(struct pci_dev *pdev)
 * with it and it can't be reset, we have to give up too... and
 * ask for forgiveness and a reboot.
 */
-   if (fw_state & BIT(0)) {
+   if (fw_state & RENESAS_FW_STATUS_DOWNLOAD_ENABLE) {
		dev_err(&pdev->dev,
"FW Download Enable is stale. Giving Up (poweroff/reboot 
needed).");
return -EIO;
}
  
  	/* Otherwise, Check the "Result Code" Bits (6:4) and act accordingly */

-   switch ((fw_state & 0x70)) {
+   switch (fw_state & RENESAS_FW_STATUS_RESULT) {
case 0: /* No result yet */
		dev_dbg(&pdev->dev, "FW is not ready/loaded yet.");
  
  		/* tell the caller, that this device needs the firmware. */

return 1;
  
-	

Re: [PATCH v4 1/4] usb: xhci: add firmware loader for uPD720201 and uPD720202 w/o ROM

2019-06-28 Thread Mathias Nyman

On 26.6.2019 10.55, Vinod Koul wrote:

From: Christian Lamparter 

This patch adds a firmware loader for the uPD720201K8-711-BAC-A
and uPD720202K8-711-BAA-A variant. Both of these chips are listed
in Renesas' R19UH0078EJ0500 Rev.5.00 "User's Manual: Hardware" as
devices which need the firmware loader on page 2 in order to
work as they "do not support the External ROM".

The "Firmware Download Sequence" is described in chapter
"7.1 FW Download Interface" R19UH0078EJ0500 Rev.5.00 page 131.

The firmware "K2013080.mem" is available from a USB3.0 Host to
PCIe Adapter (PP2U-E card) "Firmware download" archive. An
alternative version can be sourced from Netgear's WNDR4700 GPL
archives.

The release notes of the PP2U-E's "Firmware Download" ver 2.0.1.3
(2012-06-15) state that the firmware is for the following devices:
  - uPD720201 ES 2.0 sample whose revision ID is 2.
  - uPD720201 ES 2.1 sample & CS sample & Mass product, ID is 3.
  - uPD720202 ES 2.0 sample & CS sample & Mass product, ID is 2.

Cc: Yoshihiro Shimoda 
Signed-off-by: Christian Lamparter 
Signed-off-by: Bjorn Andersson 
[vkoul: fixed comments:
used macros for timeout count and delay
removed renesas_fw_alive_check
cleaned renesas_fw_callback
removed recurion for renesas_fw_download
added MODULE_FIRMWARE]
Tested-by: Christian Lamparter 
Signed-off-by: Vinod Koul 
---
  drivers/usb/host/xhci-pci.c | 454 
  1 file changed, 454 insertions(+)

diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index c2fe218e051f..237df5c47fca 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -12,6 +12,8 @@
  #include 
  #include 
  #include 
+#include 
+#include 
  
  #include "xhci.h"

  #include "xhci-trace.h"
@@ -55,6 +57,9 @@
  #define PCI_DEVICE_ID_AMD_PROMONTORYA_1   0x43bc
  #define PCI_DEVICE_ID_ASMEDIA_1042A_XHCI  0x1142
  
+#define RENESAS_RETRY	1

+#define RENESAS_DELAY  10
+
  static const char hcd_name[] = "xhci_hcd";
  
  static struct hc_driver __read_mostly xhci_pci_hc_driver;

@@ -279,6 +284,429 @@ static void xhci_pme_acpi_rtd3_enable(struct pci_dev *dev)
  static void xhci_pme_acpi_rtd3_enable(struct pci_dev *dev) { }
  #endif /* CONFIG_ACPI */
  
+static const struct renesas_fw_entry {

+   const char *firmware_name;
+   u16 device;
+   u8 revision;
+   u16 expected_version;
+} renesas_fw_table[] = {
+   /*
+* Only the uPD720201K8-711-BAC-A or uPD720202K8-711-BAA-A
+* are listed in R19UH0078EJ0500 Rev.5.00 as devices which
+* need the software loader.
+*
+* PP2U/ReleaseNote_USB3-201-202-FW.txt:
+*
+* Note: This firmware is for the following devices.
+*  - uPD720201 ES 2.0 sample whose revision ID is 2.
+*  - uPD720201 ES 2.1 sample & CS sample & Mass product, ID is 3.
+*  - uPD720202 ES 2.0 sample & CS sample & Mass product, ID is 2.
+*/
+   { "K2013080.mem", 0x0014, 0x02, 0x2013 },
+   { "K2013080.mem", 0x0014, 0x03, 0x2013 },
+   { "K2013080.mem", 0x0015, 0x02, 0x2013 },
+};
+
+MODULE_FIRMWARE("K2013080.mem");
+
+static const struct renesas_fw_entry *renesas_needs_fw_dl(struct pci_dev *dev)
+{
+   const struct renesas_fw_entry *entry;
+   size_t i;
+
+   /* This loader will only work with a RENESAS device. */
+   if (!(dev->vendor == PCI_VENDOR_ID_RENESAS))
+   return NULL;
+
+   for (i = 0; i < ARRAY_SIZE(renesas_fw_table); i++) {
+   entry = &renesas_fw_table[i];
+   if (entry->device == dev->device &&
+   entry->revision == dev->revision)
+   return entry;
+   }
+
+   return NULL;
+}
+
+static int renesas_fw_download_image(struct pci_dev *dev,
+const u32 *fw,
+size_t step)
+{
+   size_t i;
+   int err;
+   u8 fw_status;
+   bool data0_or_data1;
+
+   /*
+* The hardware does alternate between two 32-bit pages.
+* (This is because each row of the firmware is 8 bytes).
+*
+* for even steps we use DATA0, for odd steps DATA1.
+*/
+   data0_or_data1 = (step & 1) == 1;
+
+   /* step+1. Read "Set DATAX" and confirm it is cleared. */
+   for (i = 0; i < RENESAS_RETRY; i++) {
+   err = pci_read_config_byte(dev, 0xF5, &fw_status);
+   if (err)
+   return pcibios_err_to_errno(err);
+   if (!(fw_status & BIT(data0_or_data1)))
+   break;
+
+   udelay(RENESAS_DELAY);
+   }
+   if (i == RENESAS_RETRY)
+   return -ETIMEDOUT;
+
+   /*
+* step+2. Write FW data to "DATAX".
+* "LSB is left" => force little endian
+*/
+   err = pci_write_config_dword(dev, data0_or_data1 ? 0xFC : 0xF8,
+

Re: [PATCH net-next] ipv4: enable route flushing in network namespaces

2019-06-28 Thread David Miller
From: Christian Brauner 
Date: Mon, 24 Jun 2019 15:29:23 +0200

> Tools such as vpnc try to flush routes when run inside network
> namespaces by writing 1 into /proc/sys/net/ipv4/route/flush. This
> currently does not work because flush is not enabled in non-initial
> network namespaces.
> Since routes are per network namespace it is safe to enable
> /proc/sys/net/ipv4/route/flush in there.
> 
> Link: https://github.com/lxc/lxd/issues/4257
> Signed-off-by: Christian Brauner 

Applied.


Re: [PATCH][next] regulator: lp87565: fix missing break in switch statement

2019-06-28 Thread Colin Ian King
On 28/06/2019 15:36, Mark Brown wrote:
> On Thu, Jun 27, 2019 at 02:16:39PM +0100, Colin King wrote:
>> From: Colin Ian King 
>>
>> Currently the LP87565_DEVICE_TYPE_LP87561_Q1 case does not have a
>> break statement, causing it to fall through to a dev_err message.
>> Fix this by adding in the missing break statement.
> 
> This doesn't apply against current code, please check and resend.
> 
So it applies cleanly against linux-next, I think the original code
landed in mfd/for-mfd-next - c.f. https://lkml.org/lkml/2019/5/28/550

Colin






perf: Build with make -C

2019-06-28 Thread Andy Shevchenko
My setup includes a Linux kernel repository and Buildroot.
I build the Linux kernel with make O= and then,
when I try to build perf by running

make -j1 V=1 JOBS=1 \
-C  \
CROSS_COMPILE=".../i586-buildroot-linux-uclibc-" \
DESTDIR="..." tools/perf_install

where  is a path to the output folder of the built Linux kernel,
I got a wrong path in perf during the build: instead of tools/perf it becomes
tools/perf/tools/perf.

Note that it fulfills my purposes with minimal features supported,
which is why not every library got "fixed".

Below is the patch which helped me to achieve the above.

I'm pretty sure it's not the best solution. Anyway, I would like to hear any
ideas on how to do this better.

diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 4d46ca6d7e20..c56d4c0fd29b 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -698,23 +698,24 @@ $(LIBPERF_A): $(LIBPERF_IN)
 LIBTRACEEVENT_FLAGS += plugin_dir=$(plugindir_SQ) 
'EXTRA_CFLAGS=$(EXTRA_CFLAGS)' 'LDFLAGS=$(LDFLAGS)'
 
 $(LIBTRACEEVENT): FORCE
-   $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) 
$(OUTPUT)libtraceevent.a
+   $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) 
subdir= $(OUTPUT)libtraceevent.a
 
 libtraceevent_plugins: FORCE
-   $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) 
plugins
+   $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) 
subdir= plugins
 
 $(LIBTRACEEVENT_DYNAMIC_LIST): libtraceevent_plugins
-   $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) 
$(OUTPUT)libtraceevent-dynamic-list
+   $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) 
subdir= $(OUTPUT)libtraceevent-dynamic-list
 
 $(LIBTRACEEVENT)-clean:
$(call QUIET_CLEAN, libtraceevent)
$(Q)$(MAKE) -C $(TRACE_EVENT_DIR) O=$(OUTPUT) clean >/dev/null
 
 install-traceevent-plugins: libtraceevent_plugins
-   $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) 
install_plugins
+   $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) 
subdir= install_plugins
 
 $(LIBAPI): FORCE
-   $(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) $(OUTPUT)libapi.a
+   $(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) subdir= $(OUTPUT)libapi.a
+   mkdir -p $(API_PATH) && cp $(OUTPUT)libapi.a $(LIBAPI)
 
 $(LIBAPI)-clean:
$(call QUIET_CLEAN, libapi)
@@ -728,7 +729,7 @@ $(LIBBPF)-clean:
$(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) clean >/dev/null
 
 $(LIBSUBCMD): FORCE
-   $(Q)$(MAKE) -C $(SUBCMD_DIR) O=$(OUTPUT) $(OUTPUT)libsubcmd.a
+   $(Q)$(MAKE) -C $(SUBCMD_DIR) O=$(OUTPUT) subdir= $(OUTPUT)libsubcmd.a
 
 $(LIBSUBCMD)-clean:
$(call QUIET_CLEAN, libsubcmd)
-- 
2.20.1




Re: [PATCH v5 1/2] RISC-V: Fix memory reservation in setup_bootmem()

2019-06-28 Thread Paul Walmsley
On Fri, 7 Jun 2019, Anup Patel wrote:

> Currently, the setup_bootmem() reserves memory from RAM start to the
> kernel end. This prevents us from exploring ways to use the RAM below
> (or before) the kernel start hence this patch updates setup_bootmem()
> to only reserve memory from the kernel start to the kernel end.
> 
> Suggested-by: Mike Rapoport 
> Signed-off-by: Anup Patel 
> Reviewed-by: Christoph Hellwig 

Thanks, queued for v5.3.


- Paul


Re: linux-next: Tree for Jun 28 (kernel/bpf/cgroup.c)

2019-06-28 Thread Randy Dunlap
On 6/28/19 3:38 AM, Stephen Rothwell wrote:
> Hi all,
> 
> Changes since 20190627:
> 
> New tree: arm-soc-fixes
> 
> The net-next tree lost its build failure.  It also gained a conflict
> against the net tree.
> 
> The battery tree gained conflicts against the mfd and pci trees.  It also
> gained a build failure for which I reverted a commit.
> 
> The devicetree tree gained a conflict against the net-next tree.
> 
> The gpio tree gained a conflict against the mfd tree.
> 
> Non-merge commits (relative to Linus' tree): 9592
>  9480 files changed, 782342 insertions(+), 283272 deletions(-)
> 
> 
> 
> I have created today's linux-next tree at
> git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
> (patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
> are tracking the linux-next tree using git, you should not use "git pull"
> to do so as that will try to merge the new linux-next release with the
> old one.  You should use "git fetch" and checkout or reset to the new
> master.
> 
> You can see which trees have been included by looking in the Next/Trees
> file in the source.  There are also quilt-import.log and merge.log
> files in the Next directory.  Between each merge, the tree was built
> with a ppc64_defconfig for powerpc, an allmodconfig for x86_64, a
> multi_v7_defconfig for arm and a native build of tools/perf. After
> the final fixups (if any), I do an x86_64 modules_install followed by
> builds for x86_64 allnoconfig, powerpc allnoconfig (32 and 64 bit),
> ppc44x_defconfig, allyesconfig and pseries_le_defconfig and i386, sparc
> and sparc64 defconfig. And finally, a simple boot test of the powerpc
> pseries_le_defconfig kernel in qemu (with and without kvm enabled).
> 
> Below is a summary of the state of the merge.
> 
> I am currently merging 297 trees (counting Linus' and 72 trees of bug
> fix patches pending for the current merge release).
> 
> Stats about the size of the tree over time can be seen at
> http://neuling.org/linux-next-size.html .
> 
> Status of my local build tests will be at
> http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
> advice about cross compilers/configs that work, we are always open to add
> more builds.
> 
> Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
> Gortmaker for triage and bug fixes.
> 

on i386:

ld: kernel/bpf/cgroup.o: in function `cg_sockopt_func_proto':
cgroup.c:(.text+0x2906): undefined reference to `bpf_sk_storage_delete_proto'
ld: cgroup.c:(.text+0x2939): undefined reference to `bpf_sk_storage_get_proto'
ld: kernel/bpf/cgroup.o: in function `__cgroup_bpf_run_filter_setsockopt':
cgroup.c:(.text+0x85e4): undefined reference to `lock_sock_nested'
ld: cgroup.c:(.text+0x8af2): undefined reference to `release_sock'
ld: kernel/bpf/cgroup.o: in function `__cgroup_bpf_run_filter_getsockopt':
cgroup.c:(.text+0x8fd6): undefined reference to `lock_sock_nested'
ld: cgroup.c:(.text+0x94e4): undefined reference to `release_sock'


Full randconfig file is attached.

-- 
~Randy
#
# Automatically generated file; DO NOT EDIT.
# Linux/i386 5.2.0-rc6 Kernel Configuration
#

#
# Compiler: gcc (SUSE Linux) 4.8.5
#
CONFIG_CC_IS_GCC=y
CONFIG_GCC_VERSION=40805
CONFIG_CLANG_VERSION=0
CONFIG_CC_HAS_ASM_GOTO=y
CONFIG_CC_HAS_WARN_MAYBE_UNINITIALIZED=y
CONFIG_CC_DISABLE_WARN_MAYBE_UNINITIALIZED=y
CONFIG_CONSTRUCTORS=y
CONFIG_IRQ_WORK=y
CONFIG_BUILDTIME_EXTABLE_SORT=y
CONFIG_THREAD_INFO_IN_TASK=y

#
# General setup
#
CONFIG_INIT_ENV_ARG_LIMIT=32
# CONFIG_COMPILE_TEST is not set
CONFIG_HEADER_TEST=y
CONFIG_LOCALVERSION=""
CONFIG_LOCALVERSION_AUTO=y
CONFIG_BUILD_SALT=""
CONFIG_HAVE_KERNEL_GZIP=y
CONFIG_HAVE_KERNEL_BZIP2=y
CONFIG_HAVE_KERNEL_LZMA=y
CONFIG_HAVE_KERNEL_XZ=y
CONFIG_HAVE_KERNEL_LZO=y
CONFIG_HAVE_KERNEL_LZ4=y
# CONFIG_KERNEL_GZIP is not set
# CONFIG_KERNEL_BZIP2 is not set
# CONFIG_KERNEL_LZMA is not set
# CONFIG_KERNEL_XZ is not set
# CONFIG_KERNEL_LZO is not set
CONFIG_KERNEL_LZ4=y
CONFIG_DEFAULT_HOSTNAME="(none)"
CONFIG_SYSVIPC=y
# CONFIG_CROSS_MEMORY_ATTACH is not set
# CONFIG_USELIB is not set
CONFIG_HAVE_ARCH_AUDITSYSCALL=y

#
# IRQ subsystem
#
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_GENERIC_IRQ_SHOW=y
CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y
CONFIG_GENERIC_PENDING_IRQ=y
CONFIG_GENERIC_IRQ_MIGRATION=y
CONFIG_GENERIC_IRQ_CHIP=y
CONFIG_IRQ_DOMAIN=y
CONFIG_IRQ_SIM=y
CONFIG_IRQ_DOMAIN_HIERARCHY=y
CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y
CONFIG_GENERIC_IRQ_RESERVATION_MODE=y
CONFIG_IRQ_FORCED_THREADING=y
CONFIG_SPARSE_IRQ=y
# CONFIG_GENERIC_IRQ_DEBUGFS is not set
# end of IRQ subsystem

CONFIG_CLOCKSOURCE_WATCHDOG=y
CONFIG_ARCH_CLOCKSOURCE_DATA=y
CONFIG_ARCH_CLOCKSOURCE_INIT=y
CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y
CONFIG_GENERIC_TIME_VSYSCALL=y
CONFIG_GENERIC_CLOCKEVENTS=y
CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y
CONFIG_GENERIC_CMOS_UPDATE=y

#
# Timers subsystem
#
CONFIG_TICK_ONESHOT=y

[PATCH 01/10] sched: introduce task_se_h_load helper

2019-06-28 Thread Rik van Riel
Sometimes the hierarchical load of a sched_entity needs to be calculated.
Rename task_h_load to task_se_h_load, and directly pass a sched_entity to
that function.

Move the function declaration up above where it will be used later.

No functional changes.

Signed-off-by: Rik van Riel 
---
 kernel/sched/fair.c | 28 ++--
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f35930f5e528..eadf9a96b3e1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -242,6 +242,7 @@ static u64 __calc_delta(u64 delta_exec, unsigned long 
weight, struct load_weight
 
 
 const struct sched_class fair_sched_class;
+static unsigned long task_se_h_load(struct sched_entity *se);
 
 /**
  * CFS operations on generic schedulable entities:
@@ -706,7 +707,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 #ifdef CONFIG_SMP
 
 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
-static unsigned long task_h_load(struct task_struct *p);
 static unsigned long capacity_of(int cpu);
 
 /* Give new sched_entity start runnable values to heavy its load in infant 
time */
@@ -1668,7 +1668,7 @@ static void task_numa_compare(struct task_numa_env *env,
/*
 * In the overloaded case, try and keep the load balanced.
 */
-   load = task_h_load(env->p) - task_h_load(cur);
+   load = task_se_h_load(&env->p->se) - task_se_h_load(&cur->se);
if (!load)
goto assign;
 
@@ -1706,7 +1706,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
bool maymove = false;
int cpu;
 
-   load = task_h_load(env->p);
+   load = task_se_h_load(&env->p->se);
dst_load = env->dst_stats.load + load;
src_load = env->src_stats.load - load;
 
@@ -3389,7 +3389,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq 
*cfs_rq, long runnable_sum
  * avg. The immediate corollary is that all (fair) tasks must be attached, see
  * post_init_entity_util_avg().
  *
- * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ * cfs_rq->avg is used for task_se_h_load() and update_cfs_share() for example.
  *
  * Returns true if the load decayed or we removed load.
  *
@@ -3522,7 +3522,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, 
struct sched_entity *s
 
/*
 * Track task load average for carrying it to new CPU after migrated, 
and
-* track group sched_entity load average for task_h_load calc in 
migration
+* track group sched_entity load average for task_se_h_load calc in 
migration
 */
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cfs_rq, se);
@@ -3751,7 +3751,7 @@ static inline void update_misfit_status(struct 
task_struct *p, struct rq *rq)
return;
}
 
-   rq->misfit_task_load = task_h_load(p);
+   rq->misfit_task_load = task_se_h_load(&p->se);
 }
 
 #else /* CONFIG_SMP */
@@ -5739,7 +5739,7 @@ wake_affine_weight(struct sched_domain *sd, struct 
task_struct *p,
this_eff_load = target_load(this_cpu, sd->wake_idx);
 
if (sync) {
-   unsigned long current_load = task_h_load(current);
+   unsigned long current_load = task_se_h_load(&current->se);
 
if (current_load > this_eff_load)
return this_cpu;
@@ -5747,7 +5747,7 @@ wake_affine_weight(struct sched_domain *sd, struct 
task_struct *p,
this_eff_load -= current_load;
}
 
-   task_load = task_h_load(p);
+   task_load = task_se_h_load(&p->se);
 
this_eff_load += task_load;
if (sched_feat(WA_BIAS))
@@ -7600,7 +7600,7 @@ static int detach_tasks(struct lb_env *env)
if (!can_migrate_task(p, env))
goto next;
 
-   load = task_h_load(p);
+   load = task_se_h_load(&p->se);
 
if (sched_feat(LB_MIN) && load < 16 && 
!env->sd->nr_balance_failed)
goto next;
@@ -7833,12 +7833,12 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
}
 }
 
-static unsigned long task_h_load(struct task_struct *p)
+static unsigned long task_se_h_load(struct sched_entity *se)
 {
-   struct cfs_rq *cfs_rq = task_cfs_rq(p);
+   struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
update_cfs_rq_h_load(cfs_rq);
-   return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
+   return div64_ul(se->avg.load_avg * cfs_rq->h_load,
cfs_rq_load_avg(cfs_rq) + 1);
 }
 #else
@@ -7865,9 +7865,9 @@ static inline void update_blocked_averages(int cpu)
	rq_unlock_irqrestore(rq, &rf);
 }
 
-static unsigned long task_h_load(struct task_struct *p)
+static unsigned long task_se_h_load(struct sched_entity *se)
 {
-   return p->se.avg.load_avg;
+   return se->avg.load_avg;

[PATCH 07/10] sched,cfs: fix zero length timeslice calculation

2019-06-28 Thread Rik van Riel
The way the time slice length is currently calculated, not only do high
priority tasks get longer time slices than low priority tasks, but due
to fixed point math, low priority tasks could end up with a zero length
time slice. This can lead to cache thrashing and other inefficiencies.
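
As a rough illustration (all numbers made up): with a 6 ms period, a weight-15
(nice +19) task competing against roughly 100000 units of runqueue weight gets
6,000,000 ns * 15 / 100,000 = 900 ns; if that entity also sits in a cgroup whose
share works out to about 2/1024 of its parent, the next level of __calc_delta
scaling leaves about 1 ns, and one more level of fixed point truncation rounds
the slice down to 0.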

Simplify the logic a little bit, and cap the minimum time slice length
to sysctl_sched_min_granularity.

Tasks that end up getting a time slice length too long for their relative
priority will simply end up having their vruntime advanced much faster than
other tasks, resulting in them receiving time slices less frequently.

Signed-off-by: Rik van Riel 
---
 kernel/sched/fair.c | 25 -
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d48bff5118fc..8da2823401ca 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -671,22 +671,6 @@ static inline u64 calc_delta_fair(u64 delta, struct 
sched_entity *se)
return delta;
 }
 
-/*
- * The idea is to set a period in which each task runs once.
- *
- * When there are too many tasks (sched_nr_latency) we have to stretch
- * this period because otherwise the slices get too small.
- *
- * p = (nr <= nl) ? l : l*nr/nl
- */
-static u64 __sched_period(unsigned long nr_running)
-{
-   if (unlikely(nr_running > sched_nr_latency))
-   return nr_running * sysctl_sched_min_granularity;
-   else
-   return sysctl_sched_latency;
-}
-
 /*
  * We calculate the wall-time slice from the period by taking a part
  * proportional to the weight.
@@ -695,7 +679,7 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-   u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
+   u64 slice = sysctl_sched_latency;
 
for_each_sched_entity(se) {
struct load_weight *load;
@@ -712,6 +696,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
}
slice = __calc_delta(slice, se->load.weight, load);
}
+
+   /*
+* To avoid cache thrashing, run at least sysctl_sched_min_granularity.
+* The vruntime of a low priority task advances faster; those tasks
+* will simply get time slices less frequently.
+*/
+   slice = max_t(u64, slice, sysctl_sched_min_granularity);
return slice;
 }
 
-- 
2.20.1



[PATCH 05/10] sched,fair: remove cfs rqs from leaf_cfs_rq_list bottom up

2019-06-28 Thread Rik van Riel
Reducing the overhead of the CPU controller is achieved by not walking
all the sched_entities every time a task is enqueued or dequeued.

One of the things being checked every single time is whether the cfs_rq
is on the rq->leaf_cfs_rq_list.

By only removing a cfs_rq from the list once it no longer has children
on the list, we can avoid walking the sched_entity hierarchy if the bottom
cfs_rq is on the list, once the runqueues have been flattened.
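
Roughly: with a hierarchy root/A/B where only B ever has runnable tasks, B goes
on the leaf list and bumps A's children_on_list, so even after A's own load has
fully decayed A is not taken off the list until B has been removed and A's
children_on_list has dropped back to 0.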

Signed-off-by: Rik van Riel 
---
 kernel/sched/fair.c  | 17 +
 kernel/sched/sched.h |  1 +
 2 files changed, 18 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 63cb40253b26..e41feacc45d9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -286,6 +286,13 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq 
*cfs_rq)
 
cfs_rq->on_list = 1;
 
+   /*
+* If the tmp_alone_branch cursor was moved, it means a child cfs_rq
+* is already on the list ahead of us.
+*/
+   if (rq->tmp_alone_branch != &rq->leaf_cfs_rq_list)
+   cfs_rq->children_on_list++;
+
/*
 * Ensure we either appear before our parent (if already
 * enqueued) or force our parent to appear after us when it is
@@ -311,6 +318,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq 
*cfs_rq)
 * list.
 */
	rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+   cfs_rq->tg->parent->cfs_rq[cpu]->children_on_list++;
return true;
}
 
@@ -359,6 +367,11 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq 
*cfs_rq)
	if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
 
+   if (cfs_rq->tg->parent) {
+   int cpu = cpu_of(rq);
+   cfs_rq->tg->parent->cfs_rq[cpu]->children_on_list--;
+   }
+
	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
cfs_rq->on_list = 0;
}
@@ -7687,6 +7700,10 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq 
*cfs_rq)
if (cfs_rq->avg.util_sum)
return false;
 
+   /* Remove decayed parents once their decayed children are gone. */
+   if (cfs_rq->children_on_list)
+   return false;
+
return true;
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 32978a8de8ce..4f8acbab0fb2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -557,6 +557,7 @@ struct cfs_rq {
 * This list is used during load balance.
 */
int on_list;
+   int children_on_list;
struct list_headleaf_cfs_rq_list;
struct task_group   *tg;/* group that "owns" this runqueue */
 
-- 
2.20.1



[PATCH 02/10] sched: change /proc/sched_debug fields

2019-06-28 Thread Rik van Riel
Remove some fields from /proc/sched_debug that are removed from
sched_entity in a subsequent patch, and add h_load, which comes in
very handy to debug CPU controller weight distribution.

Signed-off-by: Rik van Riel 
---
 kernel/sched/debug.c | 11 ++-
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 14c6a8716ba1..f6beaca97a09 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -416,11 +416,9 @@ static void print_cfs_group_stats(struct seq_file *m, int 
cpu, struct task_group
}
 
P(se->load.weight);
-   P(se->runnable_weight);
 #ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
-   P(se->avg.runnable_load_avg);
 #endif
 
 #undef PN_SCHEDSTAT
@@ -538,7 +536,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct 
cfs_rq *cfs_rq)
SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_SMP
-   SEQ_printf(m, "  .%-30s: %ld\n", "runnable_weight", 
cfs_rq->runnable_weight);
SEQ_printf(m, "  .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
SEQ_printf(m, "  .%-30s: %lu\n", "runnable_load_avg",
@@ -547,17 +544,15 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct 
cfs_rq *cfs_rq)
cfs_rq->avg.util_avg);
SEQ_printf(m, "  .%-30s: %u\n", "util_est_enqueued",
cfs_rq->avg.util_est.enqueued);
-   SEQ_printf(m, "  .%-30s: %ld\n", "removed.load_avg",
-   cfs_rq->removed.load_avg);
SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",
cfs_rq->removed.util_avg);
-   SEQ_printf(m, "  .%-30s: %ld\n", "removed.runnable_sum",
-   cfs_rq->removed.runnable_sum);
 #ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, "  .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_avg",
	atomic_long_read(&cfs_rq->tg->load_avg));
+   SEQ_printf(m, "  .%-30s: %lu\n", "h_load",
+   cfs_rq->h_load);
 #endif
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
@@ -961,10 +956,8 @@ void proc_sched_show_task(struct task_struct *p, struct 
pid_namespace *ns,
   "nr_involuntary_switches", (long long)p->nivcsw);
 
P(se.load.weight);
-   P(se.runnable_weight);
 #ifdef CONFIG_SMP
P(se.avg.load_sum);
-   P(se.avg.runnable_load_sum);
P(se.avg.util_sum);
P(se.avg.load_avg);
P(se.avg.runnable_load_avg);
-- 
2.20.1



[PATCH 08/10] sched,fair: refactor enqueue/dequeue_entity

2019-06-28 Thread Rik van Riel
Refactor enqueue_entity, dequeue_entity, and update_load_avg, in order
to split the things we still want to happen at every level in the
cgroup hierarchy, even with a flat runqueue, from the things that only
need to happen once.

No functional changes.

Signed-off-by: Rik van Riel 
---
 kernel/sched/fair.c | 64 +
 1 file changed, 42 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8da2823401ca..a751e7a9b228 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3480,7 +3480,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, 
struct sched_entity *s
 #define DO_ATTACH  0x4
 
 /* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int flags)
+static inline bool update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int flags)
 {
u64 now = cfs_rq_clock_pelt(cfs_rq);
int decayed;
@@ -3509,6 +3509,8 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, 
struct sched_entity *s
 
} else if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
+
+   return decayed;
 }
 
 #ifndef CONFIG_64BIT
@@ -3725,9 +3727,10 @@ static inline void update_misfit_status(struct 
task_struct *p, struct rq *rq)
 #define SKIP_AGE_LOAD  0x0
 #define DO_ATTACH  0x0
 
-static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int not_used1)
+static inline bool update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int not_used1)
 {
cfs_rq_util_change(cfs_rq, 0);
+   return false;
 }
 
 static inline void remove_entity_load_avg(struct sched_entity *se) {}
@@ -3850,6 +3853,24 @@ static inline void check_schedstat_required(void)
  * CPU and an up-to-date min_vruntime on the destination CPU.
  */
 
+static bool
+enqueue_entity_groups(struct cfs_rq *cfs_rq, struct sched_entity *se, int 
flags)
+{
+   /*
+* When enqueuing a sched_entity, we must:
+*   - Update loads to have both entity and cfs_rq synced with now.
+*   - Add its load to cfs_rq->runnable_avg
+*   - For group_entity, update its weight to reflect the new share of
+* its group cfs_rq
+*   - Add its new weight to cfs_rq->load.weight
+*/
+   if (!update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH))
+   return false;
+
+   update_cfs_group(se);
+   return true;
+}
+
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -3874,16 +3895,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *se, int flags)
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
 
-   /*
-* When enqueuing a sched_entity, we must:
-*   - Update loads to have both entity and cfs_rq synced with now.
-*   - Add its load to cfs_rq->runnable_avg
-*   - For group_entity, update its weight to reflect the new share of
-* its group cfs_rq
-*   - Add its new weight to cfs_rq->load.weight
-*/
-   update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
-   update_cfs_group(se);
enqueue_runnable_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
 
@@ -3950,14 +3961,9 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 
-static void
-dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+static bool
+dequeue_entity_groups(struct cfs_rq *cfs_rq, struct sched_entity *se, int 
flags)
 {
-   /*
-* Update run-time statistics of the 'current'.
-*/
-   update_curr(cfs_rq);
-
/*
 * When dequeuing a sched_entity, we must:
 *   - Update loads to have both entity and cfs_rq synced with now.
@@ -3966,7 +3972,21 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *se, int flags)
 *   - For group entity, update its weight to reflect the new share
 * of its group cfs_rq.
 */
-   update_load_avg(cfs_rq, se, UPDATE_TG);
+   if (!update_load_avg(cfs_rq, se, UPDATE_TG))
+   return false;
+   update_cfs_group(se);
+
+   return true;
+}
+
+static void
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+   /*
+* Update run-time statistics of the 'current'.
+*/
+   update_curr(cfs_rq);
+
dequeue_runnable_load_avg(cfs_rq, se);
 
update_stats_dequeue(cfs_rq, se, flags);
@@ -3990,8 +4010,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int flags)
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
 
-   update_cfs_group(se);
-
/*
 * Now advance min_vruntime if @se was the entity holding it back,
 * except when: 

[PATCH 06/10] sched,cfs: use explicit cfs_rq of parent se helper

2019-06-28 Thread Rik van Riel
Use an explicit "cfs_rq of parent sched_entity" helper in a few
strategic places, where cfs_rq_of(se) may no longer point at the
right runqueue once we flatten the hierarchical cgroup runqueues.

No functional change.

Signed-off-by: Rik van Riel 
---
 kernel/sched/fair.c | 17 +
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e41feacc45d9..d48bff5118fc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -276,6 +276,15 @@ static inline struct cfs_rq *group_cfs_rq(struct 
sched_entity *grp)
return grp->my_q;
 }
 
+/* runqueue owned by the parent entity; the root cfs_rq for a top level se */
+static inline struct cfs_rq *group_cfs_rq_of_parent(struct sched_entity *se)
+{
+   if (se->parent)
+   return group_cfs_rq(se->parent);
+
+   return cfs_rq_of(se);
+}
+
 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
struct rq *rq = rq_of(cfs_rq);
@@ -3297,7 +3306,7 @@ static inline int propagate_entity_load_avg(struct 
sched_entity *se)
 
gcfs_rq->propagate = 0;
 
-   cfs_rq = cfs_rq_of(se);
+   cfs_rq = group_cfs_rq_of_parent(se);
 
add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
 
@@ -7778,7 +7787,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 
WRITE_ONCE(cfs_rq->h_load_next, NULL);
for_each_sched_entity(se) {
-   cfs_rq = cfs_rq_of(se);
+   cfs_rq = group_cfs_rq_of_parent(se);
WRITE_ONCE(cfs_rq->h_load_next, se);
if (cfs_rq->last_h_load_update == now)
break;
@@ -7801,7 +7810,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 
 static unsigned long task_se_h_load(struct sched_entity *se)
 {
-   struct cfs_rq *cfs_rq = cfs_rq_of(se);
+   struct cfs_rq *cfs_rq = group_cfs_rq_of_parent(se);
 
update_cfs_rq_h_load(cfs_rq);
return div64_ul(se->avg.load_avg * cfs_rq->h_load,
@@ -10148,7 +10157,7 @@ static void task_tick_fair(struct rq *rq, struct 
task_struct *curr, int queued)
struct sched_entity *se = &curr->se;
 
for_each_sched_entity(se) {
-   cfs_rq = cfs_rq_of(se);
+   cfs_rq = group_cfs_rq_of_parent(se);
entity_tick(cfs_rq, se, queued);
}
 
-- 
2.20.1



[PATCH 09/10] sched,fair: add helper functions for flattened runqueue

2019-06-28 Thread Rik van Riel
Add helper functions to make the flattened runqueue patch a little smaller.

The task_se_h_weight function is similar to task_se_h_load, but scales the
task weight by the group weight, without taking the task's duty cycle into
account.

The task_se_in_cgroup helper is functionally identical to parent_entity,
but directly calling a function with that name obscures what the other
code is trying to use it for, and would make the code harder to understand.

Signed-off-by: Rik van Riel 
---
 kernel/sched/fair.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a751e7a9b228..6fea8849cc12 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -243,6 +243,7 @@ static u64 __calc_delta(u64 delta_exec, unsigned long 
weight, struct load_weight
 
 const struct sched_class fair_sched_class;
 static unsigned long task_se_h_load(struct sched_entity *se);
+static unsigned long task_se_h_weight(struct sched_entity *se);
 
 /**
  * CFS operations on generic schedulable entities:
@@ -411,6 +412,12 @@ static inline struct sched_entity *parent_entity(struct 
sched_entity *se)
return se->parent;
 }
 
+/* Is this (task) sched_entity in a non-root cgroup? */
+static inline bool task_se_in_cgroup(struct sched_entity *se)
+{
+   return parent_entity(se);
+}
+
 static void
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
@@ -7819,6 +7826,20 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
}
 }
 
+static unsigned long task_se_h_weight(struct sched_entity *se)
+{
+   struct cfs_rq *cfs_rq;
+
+   if (!task_se_in_cgroup(se))
+   return se->load.weight;
+
+   cfs_rq = group_cfs_rq_of_parent(se);
+   update_cfs_rq_h_load(cfs_rq);
+
+   /* Reduce the load.weight by the h_load of the group the task is in. */
+   return (cfs_rq->h_load * se->load.weight) >> SCHED_FIXEDPOINT_SHIFT;
+}
+
 static unsigned long task_se_h_load(struct sched_entity *se)
 {
struct cfs_rq *cfs_rq = group_cfs_rq_of_parent(se);
@@ -7855,6 +7876,11 @@ static unsigned long task_se_h_load(struct sched_entity 
*se)
 {
return se->avg.load_avg;
 }
+
+static unsigned long task_se_h_weight(struct sched_entity *se)
+{
+   return se->load.weight;
+}
 #endif
 
 /** Helpers for find_busiest_group /
-- 
2.20.1



[PATCH RFC v2 0/10] sched,fair: flatten CPU controller runqueues

2019-06-28 Thread Rik van Riel
The current implementation of the CPU controller uses hierarchical
runqueues, where on wakeup a task is enqueued on its group's runqueue,
the group is enqueued on the runqueue of the group above it, etc.

This adds a fairly large amount of overhead for workloads that
do a lot of wakeups per second, especially given that the default systemd
hierarchy is 2 or 3 levels deep.

This patch series is an attempt at reducing that overhead, by placing
all the tasks on the same runqueue, and scaling the task priority by
the priority of the group, which is calculated periodically.
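
As a rough illustration of what "scaling the task priority by the priority
of the group" means, here is a minimal userspace sketch. This is not code
from the series; the structures, numbers and fixed-point shift are made up,
and the series itself derives the value through its h_load/task_se_h_weight
machinery instead:

#include <stdio.h>

#define WEIGHT_SHIFT 10			/* illustrative fixed-point precision */

struct group {
	unsigned long weight;		/* this group's weight on its parent */
	unsigned long parent_total;	/* total weight queued on the parent */
	struct group *parent;
};

/* Scale a task's weight by the relative weight of every ancestor group. */
static unsigned long task_effective_weight(unsigned long task_weight,
					   struct group *g)
{
	unsigned long w = task_weight;

	for (; g; g = g->parent)
		w = (w * ((g->weight << WEIGHT_SHIFT) / g->parent_total))
							>> WEIGHT_SHIFT;
	return w;
}

int main(void)
{
	struct group mid  = { .weight = 1024, .parent_total = 2048 };
	struct group leaf = { .weight = 512,  .parent_total = 1024, .parent = &mid };

	/* A nice-0 task (weight 1024) two levels deep ends up with weight 256. */
	printf("%lu\n", task_effective_weight(1024, &leaf));
	return 0;
}

The flat runqueue then only ever queues tasks with such pre-scaled weights,
instead of queueing groups on groups.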

This patch series still has a number of TODO items:
- Clean up the code, and fix compilation without CONFIG_FAIR_GROUP_SCHED.
- Remove some more now unused code.
- Figure out a performance regression with our web server workload.
  I have fixed the schbench issue, now I need to find the less obvious
  stuff causing an increased number of involuntary preemptions...
- Reimplement CONFIG_CFS_BANDWIDTH.

Plan for the CONFIG_CFS_BANDWIDTH reimplementation:
- When a cgroup gets throttled, mark the cgroup and its children
  as throttled.
- When pick_next_entity finds a task that is on a throttled cgroup,
  stash it on the cgroup runqueue (which is not used for runnable
  tasks any more). Leave the vruntime unchanged, and adjust that
  runqueue's vruntime to be that of the left-most task.
- When a cgroup gets unthrottled, and has tasks on it, place it on
  a vruntime ordered heap separate from the main runqueue.
- Have pick_next_task_fair grab one task off that heap every time it
  is called, and the min vruntime of that heap is lower than the
  vruntime of the CPU's cfs_rq (or the CPU has no other runnable tasks).
- Place that selected task on the CPU's cfs_rq, renormalizing its
  vruntime with the GENTLE_FAIR_SLEEPERS logic. That should help
  interleave the already runnable tasks with the recently unthrottled
  group, and prevent thundering herd issues.
- If the group gets throttled again before all of its tasks have had a chance
  to run, vruntime sorting ensures all the tasks in the throttled cgroup
  get a chance to run over time.

Changes from v1:
- use task_se_h_weight instead of task_se_h_load in calc_delta_fair
  and sched_slice, this seems to improve performance a little, but
  I still have some remaining regression to chase with our web server
  workload
- implement a number of the changes suggested by Dietmar Eggemann
  (still holding out for a better name for group_cfs_rq_of_parent)

This series applies on top of 5.2-rc6.

 include/linux/sched.h |6 
 kernel/sched/core.c   |2 
 kernel/sched/debug.c  |   15 -
 kernel/sched/fair.c   |  746 +-
 kernel/sched/pelt.c   |   53 +--
 kernel/sched/pelt.h   |2 
 kernel/sched/sched.h  |   10 
 7 files changed, 346 insertions(+), 488 deletions(-)




[PATCH 03/10] sched,fair: redefine runnable_load_avg as the sum of task_h_load

2019-06-28 Thread Rik van Riel
The runnable_load magic is used to quickly propagate information about
runnable tasks up the hierarchy of runqueues. The runnable_load_avg is
mostly used for the load balancing code, which only examines the value at
the root cfs_rq.

Redefine the root cfs_rq runnable_load_avg to be the sum of task_h_loads
of the runnable tasks. This works because the hierarchical runnable_load of
a task is already equal to the task_se_h_load today. This provides enough
information to the load balancer.

The runnable_load_avg of the cgroup cfs_rqs does not appear to be
used for anything, so don't bother calculating those.

This removes one of the things that the code currently traverses the
cgroup hierarchy for, and getting rid of it brings us one step closer
to a flat runqueue for the CPU controller.
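
Sketched out, the accounting amounts to the following (the two wrapper names
here are invented; the fields and helpers they use are the ones this series
introduces):

/* Root cfs_rq tracks the sum of task_se_h_load() over runnable tasks. */
static void root_account_enqueue(struct cfs_rq *root, struct sched_entity *se)
{
	se->enqueued_h_load = task_se_h_load(se);
	root->avg.runnable_load_avg += se->enqueued_h_load;
}

static void root_account_dequeue(struct cfs_rq *root, struct sched_entity *se)
{
	sub_positive(&root->avg.runnable_load_avg, se->enqueued_h_load);
}

se->enqueued_h_load remembers the value that was added, so the dequeue side
can subtract exactly that amount even if the task's h_load has changed in
the meantime.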

Signed-off-by: Rik van Riel 
---
 include/linux/sched.h |   3 +-
 kernel/sched/core.c   |   2 -
 kernel/sched/debug.c  |   1 +
 kernel/sched/fair.c   | 125 +-
 kernel/sched/pelt.c   |  49 ++---
 kernel/sched/sched.h  |   6 --
 6 files changed, 55 insertions(+), 131 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11837410690f..f5bb6948e40c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -391,7 +391,6 @@ struct util_est {
 struct sched_avg {
u64 last_update_time;
u64 load_sum;
-   u64 runnable_load_sum;
u32 util_sum;
u32 period_contrib;
unsigned long   load_avg;
@@ -439,7 +438,6 @@ struct sched_statistics {
 struct sched_entity {
/* For load-balancing: */
struct load_weight  load;
-   unsigned long   runnable_weight;
struct rb_node  run_node;
struct list_headgroup_node;
unsigned inton_rq;
@@ -455,6 +453,7 @@ struct sched_entity {
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
int depth;
+   unsigned long   enqueued_h_load;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq   *cfs_rq;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 874c427742a9..fbd96900f715 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -744,7 +744,6 @@ static void set_load_weight(struct task_struct *p, bool 
update_load)
if (task_has_idle_policy(p)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
-   p->se.runnable_weight = load->weight;
return;
}
 
@@ -757,7 +756,6 @@ static void set_load_weight(struct task_struct *p, bool 
update_load)
} else {
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
-   p->se.runnable_weight = load->weight;
}
 }
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f6beaca97a09..cefc1b171c0b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -962,6 +962,7 @@ void proc_sched_show_task(struct task_struct *p, struct 
pid_namespace *ns,
P(se.avg.load_avg);
P(se.avg.runnable_load_avg);
P(se.avg.util_avg);
+   P(se.enqueued_h_load);
P(se.avg.last_update_time);
P(se.avg.util_est.ewma);
P(se.avg.util_est.enqueued);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eadf9a96b3e1..860708b687a7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -723,9 +723,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 * nothing has been attached to the task group yet.
 */
if (entity_is_task(se))
-   sa->runnable_load_avg = sa->load_avg = 
scale_load_down(se->load.weight);
-
-   se->runnable_weight = se->load.weight;
+   sa->load_avg = scale_load_down(se->load.weight);
 
/* when this task enqueue'ed, it will contribute to its cfs_rq's 
load_avg */
 }
@@ -2766,20 +2764,39 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 static inline void
 enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-   cfs_rq->runnable_weight += se->runnable_weight;
+   if (entity_is_task(se)) {
+   struct cfs_rq *root_cfs_rq = &cfs_rq->rq->cfs;
+   se->enqueued_h_load = task_se_h_load(se);
 
-   cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
-   cfs_rq->avg.runnable_load_sum += se_runnable(se) * 
se->avg.runnable_load_sum;
+   root_cfs_rq->avg.runnable_load_avg += se->enqueued_h_load;
+   }
 }
 
 static inline void
 dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-   

[PATCH 04/10] sched,fair: move runnable_load_avg to cfs_rq

2019-06-28 Thread Rik van Riel
Since only the root cfs_rq runnable_load_avg field is used any more,
we can move the field from struct sched_avg, which every sched_entity
has one of, directly into the struct cfs_rq, of which we have way fewer.

No functional changes.

Suggested-by: Dietmar Eggemann 
Signed-off-by: Rik van Riel 
---
 include/linux/sched.h | 1 -
 kernel/sched/debug.c  | 3 +--
 kernel/sched/fair.c   | 8 
 kernel/sched/sched.h  | 1 +
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f5bb6948e40c..84a6cc6f5c47 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -394,7 +394,6 @@ struct sched_avg {
u32 util_sum;
u32 period_contrib;
unsigned long   load_avg;
-   unsigned long   runnable_load_avg;
unsigned long   util_avg;
struct util_est util_est;
 } cacheline_aligned;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index cefc1b171c0b..6e7c8ff210a8 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -539,7 +539,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct 
cfs_rq *cfs_rq)
SEQ_printf(m, "  .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
SEQ_printf(m, "  .%-30s: %lu\n", "runnable_load_avg",
-   cfs_rq->avg.runnable_load_avg);
+   cfs_rq->runnable_load_avg);
SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
SEQ_printf(m, "  .%-30s: %u\n", "util_est_enqueued",
@@ -960,7 +960,6 @@ void proc_sched_show_task(struct task_struct *p, struct 
pid_namespace *ns,
P(se.avg.load_sum);
P(se.avg.util_sum);
P(se.avg.load_avg);
-   P(se.avg.runnable_load_avg);
P(se.avg.util_avg);
P(se.enqueued_h_load);
P(se.avg.last_update_time);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 860708b687a7..63cb40253b26 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2768,7 +2768,7 @@ enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
struct cfs_rq *root_cfs_rq = &cfs_rq->rq->cfs;
se->enqueued_h_load = task_se_h_load(se);
 
-   root_cfs_rq->avg.runnable_load_avg += se->enqueued_h_load;
+   root_cfs_rq->runnable_load_avg += se->enqueued_h_load;
}
 }
 
@@ -2777,7 +2777,7 @@ dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 {
if (entity_is_task(se)) {
struct cfs_rq *root_cfs_rq = &cfs_rq->rq->cfs;
-   sub_positive(&root_cfs_rq->avg.runnable_load_avg,
+   sub_positive(&root_cfs_rq->runnable_load_avg,
se->enqueued_h_load);
}
 }
@@ -2795,7 +2795,7 @@ update_runnable_load_avg(struct sched_entity *se)
 
new_h_load = task_se_h_load(se);
delta = new_h_load - se->enqueued_h_load;
-   root_cfs_rq->avg.runnable_load_avg += delta;
+   root_cfs_rq->runnable_load_avg += delta;
se->enqueued_h_load = new_h_load;
 }
 
@@ -3559,7 +3559,7 @@ static void remove_entity_load_avg(struct sched_entity 
*se)
 
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
 {
-   return cfs_rq->avg.runnable_load_avg;
+   return cfs_rq->runnable_load_avg;
 }
 
 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5be14cee61f9..32978a8de8ce 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -516,6 +516,7 @@ struct cfs_rq {
 * CFS load tracking
 */
struct sched_avgavg;
+   unsigned long   runnable_load_avg;
 #ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
 #endif
-- 
2.20.1



[PATCH 10/10] sched,fair: flatten hierarchical runqueues

2019-06-28 Thread Rik van Riel
Flatten the hierarchical runqueues into just the per CPU rq.cfs runqueue.

Iteration of the sched_entity hierarchy is rate limited to once per jiffy
per sched_entity, which is a smaller change than it seems, because load
average adjustments were already rate limited to once per jiffy before this
patch series.
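
A minimal sketch of the shape of such a rate limit (the caller would keep
the timestamp per sched_entity; this is illustrative only, not a quote of
the patch):

/* Return true at most once per jiffy for a given timestamp. */
static bool hierarchy_update_due(unsigned long *last_update)
{
	if (*last_update == jiffies)
		return false;

	*last_update = jiffies;
	return true;
}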

This patch breaks CONFIG_CFS_BANDWIDTH. The plan for that is to park tasks
from throttled cgroups onto their cgroup runqueues, and slowly (using the
GENTLE_FAIR_SLEEPERS) wake them back up, in vruntime order, once the cgroup
gets unthrottled, to prevent thundering herd issues.

Signed-off-by: Rik van Riel 
---
 include/linux/sched.h |   2 +
 kernel/sched/fair.c   | 452 +++---
 kernel/sched/pelt.c   |   6 +-
 kernel/sched/pelt.h   |   2 +-
 kernel/sched/sched.h  |   2 +-
 5 files changed, 171 insertions(+), 293 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 84a6cc6f5c47..901c710363e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -453,6 +453,8 @@ struct sched_entity {
 #ifdef CONFIG_FAIR_GROUP_SCHED
int depth;
unsigned long   enqueued_h_load;
+   unsigned long   enqueued_h_weight;
+   struct load_weight  h_load;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq   *cfs_rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6fea8849cc12..c31d3da081fb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -450,6 +450,19 @@ find_matching_se(struct sched_entity **se, struct 
sched_entity **pse)
}
 }
 
+/* Add the cgroup cfs_rqs to the list, for update_blocked_averages */
+static void enqueue_entity_cfs_rqs(struct sched_entity *se)
+{
+   SCHED_WARN_ON(!entity_is_task(se));
+
+   for_each_sched_entity(se) {
+   struct cfs_rq *cfs_rq = group_cfs_rq_of_parent(se);
+
+   if (list_add_leaf_cfs_rq(cfs_rq))
+   break;
+   }
+}
+
 #else  /* !CONFIG_FAIR_GROUP_SCHED */
 
 static inline struct task_struct *task_of(struct sched_entity *se)
@@ -672,8 +685,14 @@ int sched_proc_update_handler(struct ctl_table *table, int 
write,
  */
 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 {
-   if (unlikely(se->load.weight != NICE_0_LOAD))
+   if (task_se_in_cgroup(se)) {
+   unsigned long h_weight = task_se_h_weight(se);
+   if (h_weight != se->h_load.weight)
+   update_load_set(>h_load, h_weight);
+   delta = __calc_delta(delta, NICE_0_LOAD, &se->h_load);
+   } else if (unlikely(se->load.weight != NICE_0_LOAD)) {
delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+   }
 
return delta;
 }
@@ -687,22 +706,16 @@ static inline u64 calc_delta_fair(u64 delta, struct 
sched_entity *se)
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
u64 slice = sysctl_sched_latency;
+   struct load_weight *load = &cfs_rq->load;
+   struct load_weight lw;
 
-   for_each_sched_entity(se) {
-   struct load_weight *load;
-   struct load_weight lw;
-
-   cfs_rq = cfs_rq_of(se);
-   load = &cfs_rq->load;
+   if (unlikely(!se->on_rq)) {
+   lw = cfs_rq->load;
 
-   if (unlikely(!se->on_rq)) {
-   lw = cfs_rq->load;
-
-   update_load_add(&lw, se->load.weight);
-   load = &lw;
-   }
-   slice = __calc_delta(slice, se->load.weight, load);
+   update_load_add(&lw, task_se_h_weight(se));
+   load = &lw;
}
+   slice = __calc_delta(slice, task_se_h_weight(se), load);
 
/*
 * To avoid cache thrashing, run at least sysctl_sched_min_granularity.
@@ -2703,16 +2716,28 @@ static inline void update_scan_period(struct 
task_struct *p, int new_cpu)
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-   update_load_add(&cfs_rq->load, se->load.weight);
-   if (!parent_entity(se))
+   struct rq *rq;
+
+   if (task_se_in_cgroup(se)) {
+   struct cfs_rq *cgroup_rq = group_cfs_rq_of_parent(se);
+   unsigned long h_weight;
+
+   update_load_add(&cgroup_rq->load, se->load.weight);
+   cgroup_rq->nr_running++;
+
+   /* Add the hierarchical weight to the CPU rq */
+   h_weight = task_se_h_weight(se);
+   se->enqueued_h_weight = h_weight;
+   update_load_add(&rq_of(cfs_rq)->load, h_weight);
+   } else {
+   update_load_add(&cfs_rq->load, se->load.weight);
update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
+   }
 #ifdef CONFIG_SMP
-   if (entity_is_task(se)) {
-   struct rq *rq = rq_of(cfs_rq);
+   rq 

Re: [PATCH] perf: Fix race between close() and fork()

2019-06-28 Thread Peter Zijlstra
On Fri, Jun 28, 2019 at 05:50:03PM +0100, Mark Rutland wrote:
> > +   /*
> > +* Wake any perf_event_free_task() waiting for this event to be
> > +* freed.
> > +*/
> > +   smp_mb(); /* pairs with wait_var_event() */
> > +   wake_up_var(var);
> 
> Huh, so wake_up_var() doesn't imply a RELEASE?
> 
> As an aside, doesn't that mean all callers of wake_up_var() have to do
> likewise to ensure it isn't re-ordered with whatever prior stuff they're
> trying to notify waiters about? Several do an smp_store_release() then a
> wake_up_var(), but IIUC the wake_up_var() could get pulled before that
> release...

Yah,...

  
https://lkml.kernel.org/r/20190624165012.gh3...@hirez.programming.kicks-ass.net

I need to get back to that.


Re: [PATCH v2] arm64: Kconfig.platforms: Enable GPIO_DAVINCI for ARCH_K3

2019-06-28 Thread Nishanth Menon
On 09:08-20190628, Keerthy wrote:
[..]
> > > + select GPIO_SYSFS
> > > + select GPIO_DAVINCI
> > 
> > 
> > Could you help explain the logic of doing this? commit message is
> > basically the diff in English. To me, this does NOT make sense.
> > 
> > I understand GPIO_DAVINCI is the driver compatible, but we cant do this for
> > every single SoC driver that is NOT absolutely mandatory for basic
> > functionality.
> 
> In case of ARM64 could you help me find the right place to enable
> such SoC specific configs?

Isn't that what defconfig is supposed to be all about?

arch/arm64/configs/defconfig

> 
> > 
> > Also keep in mind the impact to arm64/configs/defconfig -> every single
> > SoC in the arm64 world will be now rebuild with GPIO_SYSFS.. why force
> > that?
> 
> This was the practice in arm32 SoC-specific configs like
> omap2plus_defconfig. GPIO_SYSFS was the only way to validate. Now I totally
> understand your concern about every single SoC rebuilding, but now where do
> we need to enable the bare minimal GPIO_DAVINCI config?

Well, for SYSFS, I cannot agree with testing as the rationale in
Kconfig.platforms. And, looking at [1], I see the majority being mandatory
components for SoC bootup. However, most of the "optional" drivers
go into arm64 as defconfig (preferably as a module?) and if you find a
rationale for recommending DEBUG_GPIO, you could propose that to the
community as well.

Now, Thinking about this, I'd even challenge the current list of configs as
being "select". I'd rather do an "imply"[2] - yes, you need this for the
default dtb to boot, however a carefully carved dtb could boot with
lesser driver set to get a smaller (and less functional) kernel.

> 
> For v1 I received feedback from Tero to enable it in Kconfig.platforms. Hence I
> shifted to this approach.

I noticed that you were posting a v2; for future reference, please use the
diffstat section to include a lore/patchwork link to v1 (I did notice you
mentioned you had an update, thanks - the link will help catch up on older
discussions). This helps a later revision reviewer like me get context.

Tero, would you be able to help with a better rationale as to where the
boundaries are to be in your mind, rather than risk every single
peripheral driver getting into ARCH_K3?

As of right now, I'd rather we do not explode the current list out of
bounds. NAK unless we can find a better rationale.


[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/Kconfig.platforms
[2] https://www.kernel.org/doc/Documentation/kbuild/kconfig-language.txt

-- 
Regards,
Nishanth Menon


Re: [PATCH] dt-bindings: arm: Limit cpus schema to only check Arm 'cpu' nodes

2019-06-28 Thread Rob Herring
On Wed, Jun 26, 2019 at 7:02 PM Paul Walmsley  wrote:
>
> On Wed, 26 Jun 2019, Rob Herring wrote:
>
> > Matching on the 'cpus' node was a bad choice because the schema is
> > incorrectly applied to non-Arm cpus nodes. As we now have a common cpus
> > schema which checks the general structure, it is also redundant to do so
> > in the Arm CPU schema.
> >
> > The downside is one could conceivably mix different architecture's cpu
> > nodes or have typos in the compatible string. The latter problem pretty
> > much exists for every schema.
>
> The RISC-V patch applies cleanly, but this one doesn't apply here on
> either master or next-20190626.  Is there a different base commit?

Ugg, sorry. I had another commit colliding with this in my tree. I've
fixed it now and it's in my dt/next branch.

Rob


Re: [PATCH v1 0/3] Add required-opps support to devfreq passive gov

2019-06-28 Thread Saravana Kannan
On Thu, Jun 27, 2019 at 11:49 PM Viresh Kumar  wrote:
>
> On 26-06-19, 11:10, Saravana Kannan wrote:
> > On Tue, Jun 25, 2019 at 11:32 PM Viresh Kumar  
> > wrote:
>
> > > So, when a CPU changes frequency, we must change the performance state
> > > of PM domain and change frequency/bw of the cache synchronously.
> >
> > I mean, it's going to be changed when we get the CPUfreq transition
> > notifiers. From a correctness point of view, setting it inside the OPP
> > framework is not any better than doing it when we get the notifiers.
>
> That's what the problem is. All maintainers nowadays ask people to
> stay away from notifiers, and we are making that the base of another new
> thing we are starting.

In that case we can just add direct calls in cpufreq.c to let devfreq
know about the frequency changes. But then again, CPU is just one
example for this use case. I'm just using that because people are more
familiar with that.

> On top of that, with many cpufreq drivers we have fast switching enabled
> and notifiers disabled. How will they make these things work? We
> still want to scale L3 in those cases as well.

Nothing is preventing them from using the xlate OPP API I added to
figure out all the CPU to L3 frequency mapping and then set the L3
frequency directly from the CPUfreq driver.

Also, whether we use OPP framework or devfreq to set the L3 frequency,
it's going to block fast switching because both these frameworks have
APIs that can sleep.

But really, most mobile use cases don't want to permanently tie L3
freq to CPU freq. Having it go through devfreq and being able to
switch governors is a very important need for mobile products.

Keep in mind that nothing in this series does any of the cpufreq stuff
yet. That'll need a few more changes. I was just using CPUfreq as an
example.

> > I see this as "required for good performance". So I don't see it as
> > redefining required-opps. If someone wants good performance/power
> > balance they follow the "required-opps". Technically even the PM
> > pstates are required for good power. Otherwise, the system could leave
> > the voltage at max and stuff would still work.
> >
> > Also, the slave device might need to get input from multiple master
> > devices and aggregate the request before setting the slave device
> > frequency. So I don't think OPP  framework would be the right place to
> > deal with those things. For example, L3 might (will) have different
> > mappings for big vs little cores. So that needs to be aggregated and
> > set properly by the slave device driver. Also, GPU might have a
> > mapping for L3 too. In which case the L3 slave driver needs to take
> > input from even more masters before it decides its frequency. But most
> > importantly, we still need the ability to change governors for L3.
> > Again these are just examples with L3 and it can get more complicated
> > based on the situation.
> >
> > Most importantly, instead of always going by mapping, one might decide
> > to scale the L3 based on some other governor (that looks at some HW
> > counter). Or just set it to performance governor for a use case for
> > which performance is more important. All of this comes for free with
> > devfreq and if we always set it from OPP framework we don't give this
> > required control to userspace.
> >
> > I think going through devfreq is the right approach for this. And we
> > can always rewrite the software if we find problems in the future. But
> > as it stands today, this will help cases like exynos without the need
> > for a lot of changes. Hope I've convinced you.
>
> I understand the aggregation thing and fully support that the
> aggregation can't happen in OPP core and must be done somewhere else.
> But the input can go from OPP core while the frequency is changing,
> isn't it ?

I'm not opposed to OPP sending input to devfreq to let it know that a
master device frequency change is happening. But I think this is kinda
orthogonal to this patch series.

Today the passive governor looks at the master device's devfreq
frequency changes to trigger the frequency change of the slave
devfreq. It neither supports tracking OPP frequency change nor CPUfreq
frequency change. If that's something we want to add, we can look into
that separately as passive governor (or a new governor) changes.

But then not all devices (CPUfreq or otherwise) use OPP to set the
frequencies. So it's beneficial to have all of these frameworks as
inputs for devfreq passive (like) governor. CPUfreq is actually a bit
more tricky because we'll also have to track hotplug, etc. So direct
calls from CPUfreq to devfreq (similar to cpufreq stats tracking)
would be good.

-Saravana


Re: [PATCH v3] clk: Add Si5341/Si5340 driver

2019-06-28 Thread Stephen Boyd
Quoting Mike Looijmans (2019-06-27 23:42:03)
> On 27-06-19 23:06, Stephen Boyd wrote:
> > Quoting Mike Looijmans (2019-05-17 06:23:52)
> >> Adds a driver for the Si5341 and Si5340 chips. The driver does not fully
> >> support all features of these chips, but allows the chip to be used
> >> without any support from the "clockbuilder pro" software.
> >>
> >> If the chip is preprogrammed, that is, you bought one with some defaults
> >> burned in, or you programmed the NVM in some way, the driver will just
> >> take over the current settings and only change them on demand. Otherwise
> >> the input must be a fixed XTAL in its most basic configuration (no
> >> predividers, no feedback, etc.).
> >>
> >> The driver supports dynamic changes of multisynth, output dividers and
> >> enabling or powering down outputs and multisynths.
> >>
> >> Signed-off-by: Mike Looijmans 
> >> ---
> > 
> > Applied to clk-next + some fixes. I'm not super thrilled about the kHz
> > thing but we don't have a solution for it right now so might as well
> > come back to it later.
> 
> Thanks for the fixes. And I'm not exactly proud of that kHz part either.
> 
> While thinking about a solution, I've also had a use case for less than 1Hz 
> frequency adjustment (a video clock to "follow" another one). These clock 
> generators allow for ridiculous ranges and accuracy, you can request it to 
> generate a 2.0005 Hz clock.
> 

Right. We need to make a plan to replace unsigned long with u64 in the
clk framework and then figure out how to support whatever use-cases we
can with the extra 32-bits we get on the 32-bit unsigned long platforms.
I had a patch lying around that started to plumb u64 through the core
clock framework code, but I didn't pursue it because it didn't seem
necessary. I've seen some code for display PLLs that need to support
10GHz frequencies for display port too, so you're not alone here. 

Some questions to get the discussion going:

 1. Do we need to use the clk framework to set these frequencies or can
 it be done via other means in whatever subsystem wants to program these
 frequencies, like a broadcast TV tuner subsystem or the IIO subsystem?

 2. If clk framework must handle these frequencies, does it need to be
 set through the clk consumer APIs or can we manage to set the rates on
 these clks via child dividers, muxes, etc. that have
 CLK_SET_RATE_PARENT flag? This might avoid changing the consumer API
 and be simpler to implement.

 3. What's the maximum frequency and the highest resolution we need to
 support? Maybe we just need to support GHz and not THz (10^12) and have
 a resolution of uHz (micro-Hertz)?

 4. Not really a question, but a goal. We should try to avoid a
 performance hit due to an increase in 64-bit math. If possible we can
 do things differently on different CPU architectures to achieve this or
 we can have the clk providers use different clk ops/flags to indicate
 the max range and precision they require.

Anyway, I'm not going to be working on this topic anytime soon but these
are my rough thoughts. I'm sure others on the list have thought about
this topic too so if you want to work on this then it would be good to
float an RFC that answers these questions.
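
For what it's worth, on question 3: a plain u64 counted in micro-Hertz
already brackets the ranges mentioned in this thread. A quick userspace
check of the arithmetic (assumption: rates stored as uHz in a u64):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t video_clk = 2000500;				/* 2.0005 Hz in uHz */
	uint64_t dp_clk    = 10000000000ULL * 1000000;		/* 10 GHz in uHz */
	uint64_t max_hz    = UINT64_MAX / 1000000;		/* highest representable rate */

	printf("10 GHz in uHz:    %llu\n", (unsigned long long)dp_clk);
	printf("2.0005 Hz in uHz: %llu\n", (unsigned long long)video_clk);
	printf("ceiling in Hz:    %llu (~18.4 THz)\n", (unsigned long long)max_hz);
	return 0;
}

So uHz resolution with u64 storage covers both the 10 GHz display case and
sub-Hz video tracking, but tops out well short of THz; that is exactly the
range/precision trade-off question 3 asks about.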



Re: [RFC PATCH RT 4/4] rcutorture: Avoid problematic critical section nesting

2019-06-28 Thread Paul E. McKenney
On Fri, Jun 28, 2019 at 02:37:24PM -0500, Scott Wood wrote:
> On Thu, 2019-06-27 at 17:52 -0700, Paul E. McKenney wrote:
> > On Thu, Jun 27, 2019 at 05:46:27PM -0500, Scott Wood wrote:
> > > On Thu, 2019-06-27 at 13:50 -0700, Paul E. McKenney wrote:
> > > > If by IPI-to-self you mean the IRQ work trick, that isn't implemented
> > > > across all architectures yet, is it?
> > > 
> > > Right... smp_send_reschedule() has wider coverage, but even then there's
> > > some hardware that just can't do it reasonably (e.g. pre-APIC x86).
> > 
> > Except that smp_send_reschedule() won't do anything unless the scheduler
> > thinks something needs to be done, such as its wake list being non-empty.
> > Which might explain why Peter Zijlstra didn't suggest it.
> 
> The wake list stuff is separate from the original purpose of the IPI, which
> is to hit the need_resched check on IRQ exit.  When that happens, the
> scheduler will call into RCU, even if it doesn't change threads.  

Got it, thank you!

> > > So I guess the options are:
> > > 
> > > 1. Accept that such hardware might experience delayed grace period
> > > completion in certain configurations,
> > > 2. Have such hardware check for need_resched in local_irq_enable() (not
> > > nice
> > > if sharing a kernel build with hardware that doesn't need it), or
> > > 3. Forbid the sequence (enforced by debug checks).  Again, this would
> > > only
> > > prohibit rcu_read_lock()/local_irq_disable()/rcu_read_unlock()/
> > > local_irq_enable() *without* preempt disabling around the IRQ-disabled
> > > region.
> > 
> > 4. If further testing continues to show it to be reliable, continue
> > using the scheme in -rcu.
> 
> If the testing isn't done on machines that can't do the IPI then it's
> basically option #1.  FWIW I don't think option #1 is unreasonable given
> that we're talking about very old and/or specialized hardware, and we're
> only talking about delays, not a crash (maybe limit the ability to use
> nohz_full on such hardware?).  Of course if it turns out people are actually
> trying to run (modern versions of) RT on such hardware, that might be
> different. :-)

Having tried and failed to remove DEC Alpha support several times, I
know which way to bet.  Though DEC Alpha support is no longer much of a
burden on the non-Alpha portions of Linux, so no longer much motivation
for removing its support.

> > 5. Use a short-duration hrtimer to get a clean environment in short
> > order.  Yes, the timer might fire while preemption and/or softirqs
> > are disabled, but then the code can rely on the following
> > preempt_enable(), local_bh_enable(), or whatever.  This condition
> > should be sufficiently rare to avoid issues with hrtimer overhead.
> 
> Yeah, I considered that but was hesitant due to overhead -- at least in the
> case of the example I gave (pre-APIC x86), arming a oneshot timer is pretty
> slow.  Plus, some hardware might entirely lack one-shot timer capability.

The overhead is incurred in a rare case, and on systems lacking oneshot
timer it is always possible to fall back on normal timers, albeit with
fixed-time added delays.  But yes, this does add a bit of complexity.

Alternatively, assuming this case is rare, normal timers might suffice
without the need for hrtimers.

> > 6. Use smp_call_function_single() to IPI some other poor slob of a
> > CPU, which then does the same back.  Non-waiting version in both
> > cases, of course.
> 
> I was assuming any hardware that can't do smp_send_reschedule() is not SMP.

I have no idea either way.

> > Probably others as well.
> > 
> > > > Why not simply make rcutorture cyheck whether it is running in a
> > > > PREEMPT_RT_FULL environment and avoid the PREEMPT_RT_FULL-unfriendly
> > > > testing only in that case?
> > > > 
> > > > And should we later get to a place where the PREEMPT_RT_FULL-
> > > > unfriendly
> > > > scenarios are prohibited across all kernel configurations, then the
> > > > module
> > > > parameter can be removed.  Again, until we know (as opposed to
> > > > suspect)
> > > > that these scenarios really don't happen, mainline rcutorture must
> > > > continue testing them.
> > > 
> > > Yes, I already acknowledged that debug checks detecting the sequences
> > > should
> > > come before the test removal
> > 
> > OK, good to hear.  As you may have noticed, I was getting the impression
> > that you might have changed your mind on this point.  ;-)
> > 
> > >  (including this patch as an RFC at this
> > > point
> > > was mainly meant as a demonstration of what's needed to get rcutorture
> > > to
> > > pass), but it'd be nice to have some idea of whether there would be
> > > opposition to the concept before coding up the checks.  I'd rather not
> > > continue the state of "these sequences can blow up on RT and we don't
> > > know
> > > if they exist or not" any longer than necessary.  Plus, only one of the
> > > sequences is exclusively an RT issue (though it's the one with the worst
> > 

Re: [bpf/tools] cd17d77705: kernel_selftests.bpf.test_sock_addr.sh.fail

2019-06-28 Thread Stanislav Fomichev
On 06/28, Andrii Nakryiko wrote:
> On Thu, Jun 27, 2019 at 7:38 PM Stanislav Fomichev  wrote:
> >
> > On 06/27, Andrii Nakryiko wrote:
> > > On Thu, Jun 27, 2019 at 10:29 AM Stanislav Fomichev  
> > > wrote:
> > > >
> > > > On 06/27, Stanislav Fomichev wrote:
> > > > > On 06/27, kernel test robot wrote:
> > > > > > FYI, we noticed the following commit (built with gcc-7):
> > > > > >
> > > > > > commit: cd17d77705780e2270937fb3cbd2b985adab3edc ("bpf/tools: sync 
> > > > > > bpf.h")
> > > > > > https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git 
> > > > > > master
> > > > > >
> > > > > > in testcase: kernel_selftests
> > > > > > with following parameters:
> > > > > >
> > > > > > group: kselftests-00
> > > > > >
> > > > > > test-description: The kernel contains a set of "self tests" under 
> > > > > > the tools/testing/selftests/ directory. These are intended to be 
> > > > > > small unit tests to exercise individual code paths in the kernel.
> > > > > > test-url: https://www.kernel.org/doc/Documentation/kselftest.txt
> > > > > >
> > > > > >
> > > > > > on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge 
> > > > > > -smp 2 -m 8G
> > > > > >
> > > > > > caused below changes (please refer to attached dmesg/kmsg for 
> > > > > > entire log/backtrace):
> > > > > >
> > > > > > # 55: (18) r1 = 0x100
> > > > > > # ; ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_2);
> > > > > > # 57: (7b) *(u64 *)(r6 +16) = r1
> > > > > > # invalid bpf_context access off=16 size=8
> > > > > This looks like clang doing single u64 write for user_ip6[2] and
> > > > > user_ip6[3] instead of two u32. I don't think we allow that.
> > > > >
> > > > > I've seen this a couple of times myself while playing with some
> > > > > progs, but not sure what's the right way to 'fix' it.
> > > > >
> > > > Any thoughts about the patch below? Another way to "fix" it
> > >
> > > I'll give it a more thorough look a bit later, but see my comments below.
> > >
> > > > would be to mark context accesses 'volatile' in bpf progs, but that 
> > > > sounds
> > > > a bit gross.
> > > >
> > > > diff --git a/include/linux/filter.h b/include/linux/filter.h
> > > > index 43b45d6db36d..34a14c950e60 100644
> > > > --- a/include/linux/filter.h
> > > > +++ b/include/linux/filter.h
> > > > @@ -746,6 +746,20 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 
> > > > size_default)
> > > > return size <= size_default && (size & (size - 1)) == 0;
> > > >  }
> > > >
> > > > +static inline bool __bpf_ctx_wide_store_ok(u32 off, u32 size)
> > >
> > > It seems like bpf_ctx_wide_store_ok and __bpf_ctx_wide_store_ok are
> > > used only inside net/core/filter.c, why declaring them in header file?
> > I wanted it to be next to bpf_ctx_narrow_access_ok which does the
> > reverse operation for reads.
> 
> Ah, ok, I see that bpf_ctx_narrow_access_ok is used in
> kernel/bpf/cgroup.c as well and bpf_ctx_wide_store_ok might be useful
> in some other contexts as well, let's keep it here.
> 
> >
> > > > +{
> > > > +   /* u64 access is aligned and fits into the field size */
> > > > +   return off % sizeof(__u64) == 0 && off + sizeof(__u64) <= size;
> > > > +}
> > > > +
> > > > +#define bpf_ctx_wide_store_ok(off, size, type, field) \
> > > > +   (size == sizeof(__u64) && \
> > > > +off >= offsetof(type, field) && \
> > > > +off < offsetofend(type, field) ? \
> > > > +   __bpf_ctx_wide_store_ok(off - offsetof(type, field), \
> > > > +FIELD_SIZEOF(type, field)) : 0)
> 
> This would be sufficient, right?
Thanks, that looks much better and is actually more correct than my
implementation. We should really look at the off alignment, not
the off-offsetof(type, field) as I did.

> #define bpf_ctx_wide_store_ok(off, size, type, field) \
> size == sizeof(__u64) &&  \
> off >= offsetof(type, field) &&   \
> off + size <= offsetofend(type, field) && \
> off % sizeof(__u64) == 0
> 
> > >
> > > Why do you need ternary operator instead of just a chain of &?
> > Good point. I didn't spend too much time on the patch tbh :-)
> > If it looks good in general, I can add proper tests and do a
> > proper submission; this patch is just to get the discussion started.
> 
> Consider it started. :) Talking with Yonghong about preventing this
> from happening in the first place in Clang, it seems like that would
> be harder and more cumbersome than supporting it in the BPF verifier. So
> please go ahead and submit a proper patch.
>
> >
> > > It also seems like you can avoid the macro and use a plain function if
> > > instead of providing (type, field) you provide values of offsetof and
> > > offsetofend (offsetofend - offsetof should equal FIELD_SIZEOF(type,
> > > field), shouldn't it?).
> > But then I'd have to copy-paste the args of offsetof/offsetofend at
> > the caller, right? I wanted the caller to be clean and simple.
> 
> Yeah, that's a bit verbose, 

Re: [RFC] Deadlock via recursive wakeup via RCU with threadirqs

2019-06-28 Thread Paul E. McKenney
On Fri, Jun 28, 2019 at 03:29:23PM -0400, Joel Fernandes wrote:
> On Fri, Jun 28, 2019 at 11:22:16AM -0700, Paul E. McKenney wrote:
> > On Fri, Jun 28, 2019 at 07:45:45PM +0200, Sebastian Andrzej Siewior wrote:
> > > On 2019-06-28 10:30:11 [-0700], Paul E. McKenney wrote:
> > > > > I believe the .blocked field remains set even though we are not any 
> > > > > more in a
> > > > > reader section because of deferred processing of the blocked lists 
> > > > > that you
> > > > > mentioned yesterday.
> > > > 
> > > > That can indeed happen.  However, in current -rcu, that would mean
> > > > that .deferred_qs is also set, which (if in_irq()) would prevent
> > > > the raise_softirq_irqoff() from being invoked.  Which was why I was
> > > > asking the questions about whether in_irq() returns true within threaded
> > > > interrupts yesterday.  If it does, I need to find if there is some way
> > > > of determining whether rcu_read_unlock_special() is being called from
> > > > a threaded interrupt in order to suppress the call to raise_softirq()
> > > > in that case.
> > > 
> > > Please not that:
> > > | void irq_exit(void)
> > > | {
> > > |…
> > > in_irq() returns true
> > > | preempt_count_sub(HARDIRQ_OFFSET);
> > > in_irq() returns false
> > > | if (!in_interrupt() && local_softirq_pending())
> > > | invoke_softirq();
> > > 
> > > -> invoke_softirq() does
> > > |if (!force_irqthreads) {
> > > | __do_softirq();
> > > | } else {
> > > | wakeup_softirqd();
> > > | }
> > > 
> > > so for `force_irqthreads' rcu_read_unlock_special() within
> > > wakeup_softirqd() will see false.
> > 
> > OK, fair point.  How about the following instead, again on -rcu?
> > 
> > Here is the rationale for the new version of the "if" statement:
> > 
> > 1.  irqs_were_disabled:  If interrupts are enabled, we should
> > instead let the upcoming irq_enable()/local_bh_enable()
> > do the rescheduling for us.
> > 2.  use_softirq: If we aren't using softirq, then
> > raise_softirq_irqoff() will be unhelpful.
> > 3a. in_interrupt(): If this returns true, the subsequent
> > call to raise_softirq_irqoff() is guaranteed not to
> > do a wakeup, so that call will be both very cheap and
> > quite safe.
> > 3b. Otherwise, if !in_interrupt(), if exp (an expedited RCU grace
> > period is being blocked), then incurring wakeup overhead
> > is worthwhile, and if also !.deferred_qs then scheduler locks
> > cannot be held so the wakeup will be safe.
> > 
> > Does that make more sense?
> 
> This makes a lot of sense. It would be nice to stick these comments on top of
> rcu_read_unlock_special() for future reference.

I do have an expanded version in the commit log.  I hope to get a more
high-level description in comments.

Thanx, Paul

> thanks,
> 
>  - Joel
> 
> 
> > 
> > Thanx, Paul
> > 
> > 
> > 
> > diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> > index 82c925df1d92..8cfe8707 100644
> > --- a/kernel/rcu/tree_plugin.h
> > +++ b/kernel/rcu/tree_plugin.h
> > @@ -624,8 +624,9 @@ static void rcu_read_unlock_special(struct task_struct 
> > *t)
> >   (rdp->grpmask & rnp->expmask) ||
> >   tick_nohz_full_cpu(rdp->cpu);
> > // Need to defer quiescent state until everything is enabled.
> > -   if ((exp || in_irq()) && irqs_were_disabled && use_softirq &&
> > -   (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) {
> > +   if (irqs_were_disabled && use_softirq &&
> > +   (in_interrupt() ||
> > +(exp && !t->rcu_read_unlock_special.b.deferred_qs))) {
> > // Using softirq, safe to awaken, and we get
> > // no help from enabling irqs, unlike bh/preempt.
> > raise_softirq_irqoff(RCU_SOFTIRQ);
> > 
> 


Re: [RFC] Deadlock via recursive wakeup via RCU with threadirqs

2019-06-28 Thread Paul E. McKenney
On Fri, Jun 28, 2019 at 03:24:07PM -0400, Joel Fernandes wrote:
> On Fri, Jun 28, 2019 at 11:52:19AM -0700, Paul E. McKenney wrote:
> > On Fri, Jun 28, 2019 at 08:40:26PM +0200, Sebastian Andrzej Siewior wrote:
> > > On 2019-06-28 08:30:50 [-0700], Paul E. McKenney wrote:
> > > > On Fri, Jun 28, 2019 at 03:54:33PM +0200, Peter Zijlstra wrote:
> > > > > On Thu, Jun 27, 2019 at 11:41:07AM -0700, Paul E. McKenney wrote:
> > > > > > Or just don't do the wakeup at all, if it comes to that.  I don't 
> > > > > > know
> > > > > > of any way to determine whether rcu_read_unlock() is being called 
> > > > > > from
> > > > > > the scheduler, but it has been some time since I asked Peter 
> > > > > > Zijlstra
> > > > > > about that.
> > > > > 
> > > > > There (still) is no 'in-scheduler' state.
> > > > 
> > > > Well, my TREE03 + threadirqs rcutorture test ran for ten hours last
> > > > night with no problems, so we just might be OK.
> > > > 
> > > > The apparent fix is below, though my approach would be to do backports
> > > > for the full set of related changes.
> > > > 
> > > > Joel, Sebastian, how goes any testing from your end?  Any reason
> > > > to believe that this does not represent a fix?  (Me, I am still
> > > > concerned about doing raise_softirq() from within a threaded
> > > > interrupt, but am not seeing failures.)
> 
> Are you concerned also about a regular process context executing in the
> scheduler and using RCU, having this issue?
> (not anything with threaded or not threaded IRQs, but just a path in the
> scheduler that uses RCU).
> 
> I don't think Sebastian's lock up has to do with the fact that an interrupt
> is threaded or not, except that ksoftirqd is awakened in the case where
> threadirqs is passed.

In current -rcu, the checks should suffice in the absence of threaded
interrupts.  They might also suffice for threaded interrupts, but a more
direct approach would be better, hence the in_interrupt() patch.

> > > For some reason it does not trigger as good as it did yesterday.
> > 
> > I swear that I wasn't watching!!!  ;-)
> > 
> > But I do know that feeling.
> 
> :-)
> 
> > > Commit
> > > - 23634ebc1d946 ("rcu: Check for wakeup-safe conditions in
> > >rcu_read_unlock_special()") does not trigger the bug within 94
> > >attempts.
> > > 
> > > - 48d07c04b4cc1 ("rcu: Enable elimination of Tree-RCU softirq
> > >   processing") needed 12 attempts to trigger the bug.
> > 
> > That matches my belief that 23634ebc1d946 ("rcu: Check for wakeup-safe
> > conditions in rcu_read_unlock_special()") will at least greatly decrease
> > the probability of this bug occurring.
> 
> I was just typing a reply that I can't reproduce it with:
>   rcu: Check for wakeup-safe conditions in rcu_read_unlock_special()
> 
> I am trying to revert enough of this patch to see what would break things,
> however I think a better exercise might be to understand more what the patch
> does and why it fixes things in the first place ;-) It is probably the
> deferred_qs thing.

The deferred_qs flag is part of it!  Looking forward to hearing what
you come up with as being the critical piece of this commit.

Thanx, Paul



Re: [RFC] Deadlock via recursive wakeup via RCU with threadirqs

2019-06-28 Thread Scott Wood
On Fri, 2019-06-28 at 16:15 +0200, Peter Zijlstra wrote:
> On Thu, Jun 27, 2019 at 01:36:12PM -0700, Paul E. McKenney wrote:
> > On Thu, Jun 27, 2019 at 03:17:27PM -0500, Scott Wood wrote:
> > > On Thu, 2019-06-27 at 11:41 -0700, Paul E. McKenney wrote:
> > > > Of course, unconditionally refusing to do the wakeup might not be
> > > > happy
> > > > thing for NO_HZ_FULL kernels that don't implement IRQ work.
> > > 
> > > Couldn't smp_send_reschedule() be used instead?
> > 
> > Good point.  If current -rcu doesn't fix things for Sebastian's case,
> > that would be well worth looking at.  But there must be some reason
> > why Peter Zijlstra didn't suggest it when he instead suggested using
> > the IRQ work approach.
> > 
> > Peter, thoughts?
> 
> I've not exactly kept up with the thread; but irq_work allows you to run
> some actual code on the remote CPU which is often useful and it is only
> a little more expensive than smp_send_reschedule().
> 
> Also, just smp_send_reschedule() doesn't really do anything without
> first poking TIF_NEED_RESCHED (or other scheduler state) and if you want
> to do both, there's other helpers you should use, like resched_cpu().

resched_cpu() will not send an IPI to the current CPU[1].  Plus, the RCU
code needs to set need_resched even in cases where it doesn't need to send
the IPI.  And worst of all, resched_cpu() takes the rq lock which is the
deadlock scenario we're trying to avoid.

-Scott

[1] Which makes me nervous about latency if there are any wakeups with irqs
disabled, without a preempt_enable() after irqs are enabled again, and not
inside an interrupt.



Re: [PATCH v1 2/6] mm: Move set/get_pcppage_migratetype to mmzone.h

2019-06-28 Thread Alexander Duyck
On Tue, Jun 25, 2019 at 11:28 AM Dave Hansen  wrote:
>
> On 6/19/19 3:33 PM, Alexander Duyck wrote:
> > In order to support page aeration it will be necessary to store and
> > retrieve the migratetype of a page. To enable that I am moving the set and
> > get operations for pcppage_migratetype into the mmzone header so that they
> > can be used when adding or removing pages from the free lists.
> ...
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 4c07af2cfc2f..6f8fd5c1a286 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
>
> Not mm/internal.h?

Yeah, I can probably move those there. I just need to pull the call
to set_pcpage_migratetype out of set_page_aerated and place it in
aerator_add_to_boundary.


Re: [PATCH] mtd: rawnand: ingenic: fix ingenic_ecc dependency

2019-06-28 Thread Paul Cercueil




On Thu, 27 Jun 2019 at 18:40, Miquel Raynal  wrote:

Hi Paul,

Miquel Raynal  wrote on Mon, 17 Jun 2019
14:16:59 +0200:


 Hello,

 Arnd Bergmann  wrote on Mon, 17 Jun 2019 14:12:48 
+0200:


 > On Mon, Jun 17, 2019 at 1:24 PM Paul Cercueil 
 wrote:

 >
 > > I think there's a better way to fix it, only in Kconfig.
 > >
 > > * Add a bool symbol MTD_NAND_INGENIC_USE_HW_ECC
 > > * Have the three ECC/BCH drivers select this symbol instead of
 > >   MTD_NAND_INGENIC_ECC
 > > * Add the following to the MTD_NAND_JZ4780 config option:
 > >   "select MTD_NAND_INGENIC_ECC if MTD_NAND_INGENIC_USE_HW_ECC"
 >
 > I don't see much difference to my approach here, but if you want
 > to submit that version with 'Reported-by: Arnd Bergmann ',
 > please do so.
 >
 > Yet another option would be to use Makefile code to link both
 > files into one module, and remove the EXPORT_SYMBOL statements:
 >
 > obj-$(CONFIG_MTD_NAND_JZ4780) += jz4780_nand.o
 > jz4780_nand-y += ingenic_nand.o
 > jz4780_nand-$(CONFIG_MTD_NAND_INGENIC_ECC) += ingenic_ecc.o
 >

 I personally have a preference for this one.


Would you mind sending the above change? I forgot about it but I would
like to queue it for the next release. Preferably the last version Arnd
proposed.


It does change the module name from 'ingenic_nand' to 'jz4780_nand', 
though.

That's not really ideal...



Thanks,
Miquèl





Re: [PATCH 3/4] staging: rtl8712: reduce stack usage, again

2019-06-28 Thread Willem de Bruijn
On Fri, Jun 28, 2019 at 8:41 AM Arnd Bergmann  wrote:
>
> An earlier patch I sent reduced the stack usage enough to get
> below the warning limit, and I could show this was safe, but with
> GCC_PLUGIN_STRUCTLEAK_BYREF_ALL, it gets worse again because large stack
> variables in the same function no longer overlap:
>
> drivers/staging/rtl8712/rtl871x_ioctl_linux.c: In function 
> 'translate_scan.isra.2':
> drivers/staging/rtl8712/rtl871x_ioctl_linux.c:322:1: error: the frame size of 
> 1200 bytes is larger than 1024 bytes [-Werror=frame-larger-than=]
>
> Split out the largest two blocks in the affected function into two
> separate functions and mark those noinline_for_stack.
>
> Fixes: 8c5af16f7953 ("staging: rtl8712: reduce stack usage")
> Fixes: 81a56f6dcd20 ("gcc-plugins: structleak: Generalize to all variable 
> types")
> Signed-off-by: Arnd Bergmann 

Reviewed-by: Willem de Bruijn 
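
As a generic illustration of the split described above (hypothetical struct
and helper, not the actual rtl8712 code), moving the large temporary into a
noinline_for_stack helper keeps it out of the caller's frame:

#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/string.h>

struct foo {
	int id;
};

/* The 512-byte scratch buffer now lives only in this helper's frame, so
 * it no longer adds up with the caller's other temporaries once
 * STRUCTLEAK_BYREF_ALL stops the compiler from overlapping them. */
static noinline_for_stack void format_part(char *out, size_t len,
					   const struct foo *f)
{
	char scratch[512];

	snprintf(scratch, sizeof(scratch), "id=%d", f->id);
	strscpy(out, scratch, len);
}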


Re: [PATCH 4/4] ipvs: reduce kernel stack usage

2019-06-28 Thread Willem de Bruijn
On Fri, Jun 28, 2019 at 8:40 AM Arnd Bergmann  wrote:
>
> With the new CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL option, the stack
> usage in the ipvs debug output grows because each instance of
> IP_VS_DBG_BUF() now has its own buffer of 160 bytes that add up
> rather than reusing the stack slots:
>
> net/netfilter/ipvs/ip_vs_core.c: In function 'ip_vs_sched_persist':
> net/netfilter/ipvs/ip_vs_core.c:427:1: error: the frame size of 1052 bytes is 
> larger than 1024 bytes [-Werror=frame-larger-than=]
> net/netfilter/ipvs/ip_vs_core.c: In function 'ip_vs_new_conn_out':
> net/netfilter/ipvs/ip_vs_core.c:1231:1: error: the frame size of 1048 bytes 
> is larger than 1024 bytes [-Werror=frame-larger-than=]
> net/netfilter/ipvs/ip_vs_ftp.c: In function 'ip_vs_ftp_out':
> net/netfilter/ipvs/ip_vs_ftp.c:397:1: error: the frame size of 1104 bytes is 
> larger than 1024 bytes [-Werror=frame-larger-than=]
> net/netfilter/ipvs/ip_vs_ftp.c: In function 'ip_vs_ftp_in':
> net/netfilter/ipvs/ip_vs_ftp.c:555:1: error: the frame size of 1200 bytes is 
> larger than 1024 bytes [-Werror=frame-larger-than=]
>
> Since printk() already has a way to print IPv4/IPv6 addresses using
> the %pIS format string, use that instead,

since these are sockaddr_in and sockaddr_in6, should that have the 'n'
specifier to denote network byteorder?

> combined with a macro that
> creates a local sockaddr structure on the stack. These will still
> add up, but the stack frames are now under 200 bytes.

would it make sense to just define a helper function that takes const
char * level and msg strings and up to three struct nf_inet_addr* and
do the conversion in there? No need for macros and no state on the
stack outside error paths at all.

>
> Signed-off-by: Arnd Bergmann 
> ---
> I'm not sure this actually does what I think it does. Someone
> needs to verify that we correctly print the addresses here.
> I've also only added three files that caused the warning messages
> to be reported. There are still a lot of other instances of
> IP_VS_DBG_BUF() that could be converted the same way after the
> basic idea is confirmed.
> ---
>  include/net/ip_vs.h | 71 +++--
>  net/netfilter/ipvs/ip_vs_core.c | 44 ++--
>  net/netfilter/ipvs/ip_vs_ftp.c  | 20 +-
>  3 files changed, 72 insertions(+), 63 deletions(-)
>
> diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> index 3759167f91f5..3dfbeef67be6 100644
> --- a/include/net/ip_vs.h
> +++ b/include/net/ip_vs.h
> @@ -227,6 +227,16 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len,
>sizeof(ip_vs_dbg_buf), addr, \
>&ip_vs_dbg_idx)
>
> +#define IP_VS_DBG_SOCKADDR4(fam, addr, port)   \
> +   (struct sockaddr*)&(struct sockaddr_in) \
> +   { .sin_family = (fam), .sin_addr = (addr)->in, .sin_port = (port) }

might as well set .sin_family = AF_INET here and AF_INET6 below?

> +#define IP_VS_DBG_SOCKADDR6(fam, addr, port)   \
> +   (struct sockaddr*)&(struct sockaddr_in6) \
> +   { .sin6_family = (fam), .sin6_addr = (addr)->in6, .sin6_port = (port) }
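
A rough sketch of the helper-function alternative suggested above (name and
signature are hypothetical, level/port handling is omitted): build the
sockaddr on the helper's own stack and let %pIS format it, so callers keep
no buffer at all:

#include <linux/in.h>
#include <linux/in6.h>
#include <linux/netfilter.h>
#include <linux/printk.h>

static void ip_vs_dbg_print_addr(const char *msg, int af,
				 const union nf_inet_addr *addr)
{
	if (af == AF_INET6) {
		struct sockaddr_in6 sin6 = {
			.sin6_family = AF_INET6,
			.sin6_addr   = addr->in6,
		};

		pr_debug("%s %pISc\n", msg, &sin6);
	} else {
		struct sockaddr_in sin = {
			.sin_family = AF_INET,
			.sin_addr   = addr->in,
		};

		pr_debug("%s %pIS\n", msg, &sin);
	}
}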


[PATCH v2] staging: iio: ad7192: create of_device_id array

2019-06-28 Thread Bárbara Fernandes
Create list of compatible device ids to be matched with those stated in
the device tree.

Signed-off-by: Bárbara Fernandes 
Signed-off-by: Wilson Sales 
Co-developed-by: Wilson Sales 
---
 drivers/staging/iio/adc/ad7192.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/drivers/staging/iio/adc/ad7192.c b/drivers/staging/iio/adc/ad7192.c
index 3d74da9d37e7..70118db98d94 100644
--- a/drivers/staging/iio/adc/ad7192.c
+++ b/drivers/staging/iio/adc/ad7192.c
@@ -810,11 +810,23 @@ static const struct spi_device_id ad7192_id[] = {
{"ad7195", ID_AD7195},
{}
 };
+
 MODULE_DEVICE_TABLE(spi, ad7192_id);
 
+static const struct of_device_id ad7192_of_match[] = {
+   { .compatible = "adi,ad7190" },
+   { .compatible = "adi,ad7192" },
+   { .compatible = "adi,ad7193" },
+   { .compatible = "adi,ad7195" },
+   {}
+};
+
+MODULE_DEVICE_TABLE(of, ad7192_of_match);
+
 static struct spi_driver ad7192_driver = {
.driver = {
.name   = "ad7192",
+   .of_match_table = ad7192_of_match,
},
.probe  = ad7192_probe,
.remove = ad7192_remove,
-- 
2.17.1



Re: [PATCH v1 1/6] mm: Adjust shuffle code to allow for future coalescing

2019-06-28 Thread Alexander Duyck
On Tue, Jun 25, 2019 at 12:56 AM David Hildenbrand  wrote:
>
> On 20.06.19 00:33, Alexander Duyck wrote:
> > From: Alexander Duyck 
> >
> > This patch is meant to move the head/tail adding logic out of the shuffle
> > code and into the __free_one_page function since ultimately that is where
> > it is really needed anyway. By doing this we should be able to reduce the
> > overhead and can consolidate all of the list addition bits in one spot.
> >
> > Signed-off-by: Alexander Duyck 
> > ---
> >  include/linux/mmzone.h |   12 
> >  mm/page_alloc.c|   70 
> > +++-
> >  mm/shuffle.c   |   24 
> >  mm/shuffle.h   |   35 
> >  4 files changed, 74 insertions(+), 67 deletions(-)
> >
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 427b79c39b3c..4c07af2cfc2f 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -116,18 +116,6 @@ static inline void add_to_free_area_tail(struct page 
> > *page, struct free_area *ar
> >   area->nr_free++;
> >  }
> >
> > -#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
> > -/* Used to preserve page allocation order entropy */
> > -void add_to_free_area_random(struct page *page, struct free_area *area,
> > - int migratetype);
> > -#else
> > -static inline void add_to_free_area_random(struct page *page,
> > - struct free_area *area, int migratetype)
> > -{
> > - add_to_free_area(page, area, migratetype);
> > -}
> > -#endif
> > -
> >  /* Used for pages which are on another list */
> >  static inline void move_to_free_area(struct page *page, struct free_area 
> > *area,
> >int migratetype)
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index f4651a09948c..ec344ce46587 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -830,6 +830,36 @@ static inline struct capture_control *task_capc(struct 
> > zone *zone)
> >  #endif /* CONFIG_COMPACTION */
> >
> >  /*
> > + * If this is not the largest possible page, check if the buddy
> > + * of the next-highest order is free. If it is, it's possible
> > + * that pages are being freed that will coalesce soon. In case,
> > + * that is happening, add the free page to the tail of the list
> > + * so it's less likely to be used soon and more likely to be merged
> > + * as a higher order page
> > + */
> > +static inline bool
> > +buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
> > +struct page *page, unsigned int order)
> > +{
> > + struct page *higher_page, *higher_buddy;
> > + unsigned long combined_pfn;
> > +
> > + if (is_shuffle_order(order) || order >= (MAX_ORDER - 2))
>
> My intuition tells me you can drop the () around "MAX_ORDER - 2"

I dunno, I always kind of prefer to use the parentheses in these cases
for readability. I suppose I can drop them though.

> > + return false;
>
> Guess the "is_shuffle_order(order)" check should rather be performed by
> the caller, before calling this function.

I could do that, however I am not sure it adds much. I am pretty sure
the resultant code would be the same. Where things would be a bit more
complicated is that I would then have to probably look at adding a
variable to trap the output of is_shuffle_tail_page or
buddy_merge_likely.

> > +
> > + if (!pfn_valid_within(buddy_pfn))
> > + return false;
> > +
> > + combined_pfn = buddy_pfn & pfn;
> > + higher_page = page + (combined_pfn - pfn);
> > + buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
> > + higher_buddy = higher_page + (buddy_pfn - combined_pfn);
> > +
> > + return pfn_valid_within(buddy_pfn) &&
> > +page_is_buddy(higher_page, higher_buddy, order + 1);
> > +}
> > +
> > +/*
> >   * Freeing function for a buddy system allocator.
> >   *
> >   * The concept of a buddy system is to maintain direct-mapped table
> > @@ -858,11 +888,12 @@ static inline void __free_one_page(struct page *page,
> >   struct zone *zone, unsigned int order,
> >   int migratetype)
> >  {
> > - unsigned long combined_pfn;
> > + struct capture_control *capc = task_capc(zone);
> >   unsigned long uninitialized_var(buddy_pfn);
> > - struct page *buddy;
> > + unsigned long combined_pfn;
> > + struct free_area *area;
> >   unsigned int max_order;
> > - struct capture_control *capc = task_capc(zone);
> > + struct page *buddy;
> >
> >   max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
> >
> > @@ -931,35 +962,12 @@ static inline void __free_one_page(struct page *page,
> >  done_merging:
> >   set_page_order(page, order);
> >
> > - /*
> > -  * If this is not the largest possible page, check if the buddy
> > -  * of the next-highest order is free. If it is, it's possible
> > -  * that pages are being freed that will coalesce soon. In case,
> > -  * 

Re: [PATCH v2] Convert struct pid count to refcount_t

2019-06-28 Thread Kees Cook
On Fri, Jun 28, 2019 at 03:34:42PM -0400, Joel Fernandes (Google) wrote:
> struct pid's count is an atomic_t field used as a refcount. Use
> refcount_t for it which is basically atomic_t but does additional
> checking to prevent use-after-free bugs.
> 
> For memory ordering, the only change is with the following:
>  -if ((atomic_read(&pid->count) == 1) ||
>  - atomic_dec_and_test(&pid->count)) {
>  +if (refcount_dec_and_test(&pid->count)) {
>   kmem_cache_free(ns->pid_cachep, pid);
> 
> Here the change is from:
> Fully ordered --> RELEASE + ACQUIRE (as per refcount-vs-atomic.rst)
> This ACQUIRE should take care of making sure the free happens after the
> refcount_dec_and_test().
> 
> The above hunk also removes atomic_read() since it is not needed for the
> code to work and it is unclear how beneficial it is. The removal lets
> refcount_dec_and_test() check for cases where get_pid() happened before
> the object was freed.
> 
> Cc: mathieu.desnoy...@efficios.com
> Cc: wi...@infradead.org
> Cc: pet...@infradead.org
> Cc: will.dea...@arm.com
> Cc: paul...@linux.vnet.ibm.com
> Cc: elena.reshet...@intel.com
> Cc: keesc...@chromium.org
> Cc: kernel-t...@android.com
> Cc: kernel-harden...@lists.openwall.com
> Signed-off-by: Joel Fernandes (Google) 

Reviewed-by: Kees Cook 

-Kees

> 
> ---
> Only change from v1->v2 is to get rid of the atomic_read().
> 
>  include/linux/pid.h | 5 +++--
>  kernel/pid.c| 7 +++
>  2 files changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 14a9a39da9c7..8cb86d377ff5 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -3,6 +3,7 @@
>  #define _LINUX_PID_H
>  
>  #include 
> +#include 
>  
>  enum pid_type
>  {
> @@ -56,7 +57,7 @@ struct upid {
>  
>  struct pid
>  {
> - atomic_t count;
> + refcount_t count;
>   unsigned int level;
>   /* lists of tasks that use this pid */
>   struct hlist_head tasks[PIDTYPE_MAX];
> @@ -69,7 +70,7 @@ extern struct pid init_struct_pid;
>  static inline struct pid *get_pid(struct pid *pid)
>  {
>   if (pid)
> - atomic_inc(&pid->count);
> + refcount_inc(&pid->count);
>   return pid;
>  }
>  
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 20881598bdfa..89c4849fab5d 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -37,7 +37,7 @@
>  #include 
>  #include 
>  #include 
> -#include 
> +#include 
>  #include 
>  #include 
>  
> @@ -106,8 +106,7 @@ void put_pid(struct pid *pid)
>   return;
>  
>   ns = pid->numbers[pid->level].ns;
> - if ((atomic_read(&pid->count) == 1) ||
> -  atomic_dec_and_test(&pid->count)) {
> + if (refcount_dec_and_test(&pid->count)) {
>   kmem_cache_free(ns->pid_cachep, pid);
>   put_pid_ns(ns);
>   }
> @@ -210,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>   }
>  
>   get_pid_ns(ns);
> - atomic_set(&pid->count, 1);
> + refcount_set(&pid->count, 1);
>   for (type = 0; type < PIDTYPE_MAX; ++type)
>   INIT_HLIST_HEAD(&pid->tasks[type]);
>  
> -- 
> 2.22.0.410.gd8fdbe21b5-goog

-- 
Kees Cook
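
As a generic illustration of the get/put pattern the patch converts
struct pid to (struct foo, foo_get() and foo_put() are made-up names, not
from the patch):

#include <linux/refcount.h>
#include <linux/slab.h>

struct foo {
	refcount_t count;
	/* payload */
};

static inline struct foo *foo_get(struct foo *f)
{
	if (f)
		refcount_inc(&f->count);	/* WARNs if count is already 0 */
	return f;
}

static inline void foo_put(struct foo *f)
{
	if (!f)
		return;
	/* RELEASE ordering on the decrement, ACQUIRE when it hits zero, so
	 * the free below cannot be reordered before the final put. */
	if (refcount_dec_and_test(&f->count))
		kfree(f);
}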


Re: BUG: unable to handle kernel paging request in hrtimer_interrupt

2019-06-28 Thread syzbot

syzbot has bisected this bug to:

commit e9db4ef6bf4ca9894bb324c76e01b8f1a16b2650
Author: John Fastabend 
Date:   Sat Jun 30 13:17:47 2018 +

bpf: sockhash fix omitted bucket lock in sock_close

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=14436833a0
start commit:   29f785ff Merge branch 'fixes' of git://git.kernel.org/pub/..
git tree:   upstream
final crash:https://syzkaller.appspot.com/x/report.txt?x=16436833a0
console output: https://syzkaller.appspot.com/x/log.txt?x=12436833a0
kernel config:  https://syzkaller.appspot.com/x/.config?x=e5c77f8090a3b96b
dashboard link: https://syzkaller.appspot.com/bug?extid=037e18398ba8c655a652
syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=16da8cc9a0

Reported-by: syzbot+037e18398ba8c655a...@syzkaller.appspotmail.com
Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close")

For information about bisection process see: https://goo.gl/tpsmEJ#bisection


Re: [RFC PATCH RT 4/4] rcutorture: Avoid problematic critical section nesting

2019-06-28 Thread Scott Wood
On Thu, 2019-06-27 at 17:52 -0700, Paul E. McKenney wrote:
> On Thu, Jun 27, 2019 at 05:46:27PM -0500, Scott Wood wrote:
> > On Thu, 2019-06-27 at 13:50 -0700, Paul E. McKenney wrote:
> > > If by IPI-to-self you mean the IRQ work trick, that isn't implemented
> > > across all architectures yet, is it?
> > 
> > Right... smp_send_reschedule() has wider coverage, but even then there's
> > some hardware that just can't do it reasonably (e.g. pre-APIC x86).
> 
> Except that smp_send_reschedule() won't do anything unless the scheduler
> thinks something needs to be done, as in its wake list being non-empty.
> Which might explain why Peter Zijlstra didn't suggest it.

The wake list stuff is separate from the original purpose of the IPI, which
is to hit the need_resched check on IRQ exit.  When that happens, the
scheduler will call into RCU, even if it doesn't change threads.  

> > So I guess the options are:
> > 
> > 1. Accept that such hardware might experience delayed grace period
> > completion in certain configurations,
> > 2. Have such hardware check for need_resched in local_irq_enable() (not
> > nice
> > if sharing a kernel build with hardware that doesn't need it), or
> > 3. Forbid the sequence (enforced by debug checks).  Again, this would
> > only
> > prohibit rcu_read_lock()/local_irq_disable()/rcu_read_unlock()/
> > local_irq_enable() *without* preempt disabling around the IRQ-disabled
> > region.
> 
> 4. If further testing continues to show it to be reliable, continue
> using the scheme in -rcu.

If the testing isn't done on machines that can't do the IPI then it's
basically option #1.  FWIW I don't think option #1 is unreasonable given
that we're talking about very old and/or specialized hardware, and we're
only talking about delays, not a crash (maybe limit the ability to use
nohz_full on such hardware?).  Of course if it turns out people are actually
trying to run (modern versions of) RT on such hardware, that might be
different. :-)

> 5. Use a short-duration hrtimer to get a clean environment in short
> order.  Yes, the timer might fire while preemption and/or softirqs
> are disabled, but then the code can rely on the following
> preempt_enable(), local_bh_enable(), or whatever.  This condition
> should be sufficiently rare to avoid issues with hrtimer overhead.

Yeah, I considered that but was hesitant due to overhead -- at least in the
case of the example I gave (pre-APIC x86), arming a oneshot timer is pretty
slow.  Plus, some hardware might entirely lack one-shot timer capability.

> 6. Use smp_call_function_single() to IPI some other poor slob of a
> CPU, which then does the same back.  Non-waiting version in both
> cases, of course.

I was assuming any hardware that can't do smp_send_reschedule() is not SMP.

> 
> Probably others as well.
> 
> > > Why not simply make rcutorture check whether it is running in a
> > > PREEMPT_RT_FULL environment and avoid the PREEMPT_RT_FULL-unfriendly
> > > testing only in that case?
> > > 
> > > And should we later get to a place where the PREEMPT_RT_FULL-
> > > unfriendly
> > > scenarios are prohibited across all kernel configurations, then the
> > > module
> > > parameter can be removed.  Again, until we know (as opposed to
> > > suspect)
> > > that these scenarios really don't happen, mainline rcutorture must
> > > continue testing them.
> > 
> > Yes, I already acknowledged that debug checks detecting the sequences
> > should
> > come before the test removal
> 
> OK, good to hear.  As you may have noticed, I was getting the impression
> that you might have changed your mind on this point.  ;-)
> 
> >  (including this patch as an RFC at this
> > point
> > was mainly meant as a demonstration of what's needed to get rcutorture
> > to
> > pass), but it'd be nice to have some idea of whether there would be
> > opposition to the concept before coding up the checks.  I'd rather not
> > continue the state of "these sequences can blow up on RT and we don't
> > know
> > if they exist or not" any longer than necessary.  Plus, only one of the
> > sequences is exclusively an RT issue (though it's the one with the worst
> > consequences).
> 
> Steve Rostedt's point about enlisting the aid of lockdep seems worth
> looking into.

Sure.  I was just concerned by the "Linus was against enforcing this in the
past" comment and was hoping for more details.

-Scott
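
For reference, a minimal sketch of the short-duration hrtimer idea
(option 5) discussed above; all names and the 10us delay are made up, and
per-CPU/locking details are glossed over:

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/sched.h>

static struct hrtimer rcu_defer_timer;

/* Fires in hard-IRQ context shortly after being armed; by then the
 * irq/preempt-disabled region that deferred the work has typically
 * ended, so requesting a reschedule here is safe. */
static enum hrtimer_restart rcu_defer_fn(struct hrtimer *t)
{
	set_tsk_need_resched(current);
	set_preempt_need_resched();
	return HRTIMER_NORESTART;
}

static void rcu_defer_init(void)
{
	hrtimer_init(&rcu_defer_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rcu_defer_timer.function = rcu_defer_fn;
}

static void rcu_defer_poke(void)
{
	hrtimer_start(&rcu_defer_timer, ns_to_ktime(10 * NSEC_PER_USEC),
		      HRTIMER_MODE_REL);
}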



