date:20170824

[PATCH v2 09/10] powerpc: Handle opposite-endian processes in emulation code

2017-08-24 Thread Paul Mackerras

This adds code to the load and store emulation code to byte-swap
the data appropriately when the process being emulated is set to
the opposite endianness to that of the kernel.

This also enables the emulation for the multiple-register loads
and stores (lmw, stmw, lswi, stswi, lswx, stswx) to work for
little-endian.  In little-endian mode, the partial word at the
end of a transfer for lsw*/stsw* (when the byte count is not a
multiple of 4) is loaded/stored at the least-significant end of
the register.  Additionally, this fixes a bug in the previous
code in that it could call read_mem/write_mem with a byte count
that was not 1, 2, 4 or 8.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/sstep.h |   4 +-
 arch/powerpc/lib/sstep.c | 202 ++-
 2 files changed, 135 insertions(+), 71 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 0e5dd23..5a3d3d4 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -149,6 +149,6 @@ void emulate_update_regs(struct pt_regs *reg, struct 
instruction_op *op);
 extern int emulate_step(struct pt_regs *regs, unsigned int instr);
 
 extern void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg,
-const void *mem);
+const void *mem, bool cross_endian);
 extern void emulate_vsx_store(struct instruction_op *op, const union vsx_reg 
*reg,
- void *mem);
+ void *mem, bool cross_endian);
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 4773055..7afb8ef 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -210,6 +210,33 @@ static nokprobe_inline unsigned long byterev_8(unsigned 
long x)
 }
 #endif
 
+static nokprobe_inline void do_byte_reverse(void *ptr, int nb)
+{
+   switch (nb) {
+   case 2:
+   *(u16 *)ptr = byterev_2(*(u16 *)ptr);
+   break;
+   case 4:
+   *(u32 *)ptr = byterev_4(*(u32 *)ptr);
+   break;
+#ifdef __powerpc64__
+   case 8:
+   *(unsigned long *)ptr = byterev_8(*(unsigned long *)ptr);
+   break;
+   case 16: {
+   unsigned long *up = (unsigned long *)ptr;
+   unsigned long tmp;
+   tmp = byterev_8(up[0]);
+   up[0] = byterev_8(up[1]);
+   up[1] = tmp;
+   break;
+   }
+#endif
+   default:
+   WARN_ON_ONCE(1);
+   }
+}
+
 static nokprobe_inline int read_mem_aligned(unsigned long *dest,
unsigned long ea, int nb)
 {
@@ -409,7 +436,8 @@ NOKPROBE_SYMBOL(write_mem);
  * These access either the real FP register or the image in the
  * thread_struct, depending on regs->msr & MSR_FP.
  */
-static int do_fp_load(int rn, unsigned long ea, int nb, struct pt_regs *regs)
+static int do_fp_load(int rn, unsigned long ea, int nb, struct pt_regs *regs,
+ bool cross_endian)
 {
int err;
union {
@@ -424,6 +452,11 @@ static int do_fp_load(int rn, unsigned long ea, int nb, 
struct pt_regs *regs)
err = copy_mem_in(u.b, ea, nb);
if (err)
return err;
+   if (unlikely(cross_endian)) {
+   do_byte_reverse(u.b, min(nb, 8));
+   if (nb == 16)
+   do_byte_reverse(&u.b[8], 8);
+   }
preempt_disable();
if (nb == 4)
conv_sp_to_dp(&u.f, &u.d[0]);
@@ -444,7 +477,8 @@ static int do_fp_load(int rn, unsigned long ea, int nb, 
struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(do_fp_load);
 
-static int do_fp_store(int rn, unsigned long ea, int nb, struct pt_regs *regs)
+static int do_fp_store(int rn, unsigned long ea, int nb, struct pt_regs *regs,
+  bool cross_endian)
 {
union {
float f;
@@ -470,6 +504,11 @@ static int do_fp_store(int rn, unsigned long ea, int nb, 
struct pt_regs *regs)
u.l[1] = current->thread.TS_FPR(rn);
}
preempt_enable();
+   if (unlikely(cross_endian)) {
+   do_byte_reverse(u.b, min(nb, 8));
+   if (nb == 16)
+   do_byte_reverse(&u.b[8], 8);
+   }
return copy_mem_out(u.b, ea, nb);
 }
 NOKPROBE_SYMBOL(do_fp_store);
@@ -478,7 +517,8 @@ NOKPROBE_SYMBOL(do_fp_store);
 #ifdef CONFIG_ALTIVEC
 /* For Altivec/VMX, no need to worry about alignment */
 static nokprobe_inline int do_vec_load(int rn, unsigned long ea,
-  int size, struct pt_regs *regs)
+  int size, struct pt_regs *regs,
+  bool cross_endian)
 {
int err;
union {
@@ -493,7 +533,8 @@ static nokprobe_inline int do_vec_load(int rn, unsigned 
long ea,
err = copy_mem_in(&u.b[ea & 0xf], ea, size);

[PATCH v2 10/10] powerpc/64: Fix update forms of loads and stores to write 64-bit EA

2017-08-24 Thread Paul Mackerras

When a 64-bit processor is executing in 32-bit mode, the update forms
of load and store instructions are required by the architecture to
write the full 64-bit effective address into the RA register, though
only the bottom 32 bits are used to address memory.  Currently,
the instruction emulation code writes the truncated address to the
RA register.  This fixes it by keeping the full 64-bit EA in the
instruction_op structure, truncating the address in emulate_step()
where it is used to address memory, rather than in the address
computations in analyse_instr().

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/sstep.h |  4 +-
 arch/powerpc/lib/sstep.c | 99 +---
 2 files changed, 54 insertions(+), 49 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 5a3d3d4..9bf44e2 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -25,7 +25,7 @@ struct pt_regs;
 
 enum instruction_type {
COMPUTE,/* arith/logical/CR op, etc. */
-   LOAD,
+   LOAD,   /* load and store types need to be contiguous */
LOAD_MULTI,
LOAD_FP,
LOAD_VMX,
@@ -52,6 +52,8 @@ enum instruction_type {
 
 #define INSTR_TYPE_MASK0x1f
 
+#define OP_IS_LOAD_STORE(type) (LOAD <= (type) && (type) <= STCX)
+
 /* Compute flags, ORed in with type */
 #define SETREG 0x20
 #define SETCC  0x40
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 7afb8ef..b8d1d46 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -125,7 +125,7 @@ static nokprobe_inline unsigned long dform_ea(unsigned int 
instr,
if (ra)
ea += regs->gpr[ra];
 
-   return truncate_if_32bit(regs->msr, ea);
+   return ea;
 }
 
 #ifdef __powerpc64__
@@ -143,7 +143,7 @@ static nokprobe_inline unsigned long dsform_ea(unsigned int 
instr,
if (ra)
ea += regs->gpr[ra];
 
-   return truncate_if_32bit(regs->msr, ea);
+   return ea;
 }
 
 /*
@@ -160,7 +160,7 @@ static nokprobe_inline unsigned long dqform_ea(unsigned int 
instr,
if (ra)
ea += regs->gpr[ra];
 
-   return truncate_if_32bit(regs->msr, ea);
+   return ea;
 }
 #endif /* __powerpc64 */
 
@@ -179,7 +179,7 @@ static nokprobe_inline unsigned long xform_ea(unsigned int 
instr,
if (ra)
ea += regs->gpr[ra];
 
-   return truncate_if_32bit(regs->msr, ea);
+   return ea;
 }
 
 /*
@@ -2007,10 +2007,7 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
if (rb == 0)
rb = 32;/* # bytes to load */
op->type = MKOP(LOAD_MULTI, 0, rb);
-   op->ea = 0;
-   if (ra)
-   op->ea = truncate_if_32bit(regs->msr,
-  regs->gpr[ra]);
+   op->ea = ra ? regs->gpr[ra] : 0;
break;
 
 #ifdef CONFIG_PPC_FPU
@@ -2077,10 +2074,7 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
if (rb == 0)
rb = 32;/* # bytes to store */
op->type = MKOP(STORE_MULTI, 0, rb);
-   op->ea = 0;
-   if (ra)
-   op->ea = truncate_if_32bit(regs->msr,
-  regs->gpr[ra]);
+   op->ea = ra ? regs->gpr[ra] : 0;
break;
 
case 790:   /* lhbrx */
@@ -2787,10 +2781,11 @@ void emulate_update_regs(struct pt_regs *regs, struct 
instruction_op *op)
 int emulate_step(struct pt_regs *regs, unsigned int instr)
 {
struct instruction_op op;
-   int r, err, size;
+   int r, err, size, type;
unsigned long val;
unsigned int cr;
int i, rd, nb;
+   unsigned long ea;
bool cross_endian;
 
r = analyse_instr(&op, regs, instr);
@@ -2803,28 +2798,36 @@ int emulate_step(struct pt_regs *regs, unsigned int 
instr)
 
err = 0;
size = GETSIZE(op.type);
+   type = op.type & INSTR_TYPE_MASK;
cross_endian = (regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE);
-   switch (op.type & INSTR_TYPE_MASK) {
+
+   ea = op.ea;
+#ifdef __powerpc64__
+   if (OP_IS_LOAD_STORE(type) || type == CACHEOP)
+   ea = truncate_if_32bit(regs->msr, op.ea);
+#endif
+
+   switch (type) {
case CACHEOP:
-   if (!address_ok(regs, op.ea, 8))
+   if (!address_ok(regs, ea, 8))
return 0;
switch (op.type & CACHEOP_MASK) {
case DCBST:
-   __cacheop_user_asmx(op.ea, err, "dcbst");
+

[PATCH v2 08/10] powerpc: Emulate load/store floating double pair instructions

2017-08-24 Thread Paul Mackerras

This adds lfdp[x] and stfdp[x] to the set of instructions that
analyse_instr() and emulate_step() understand.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/lib/sstep.c | 76 ++--
 1 file changed, 60 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 82b1e69..4773055 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -414,9 +414,9 @@ static int do_fp_load(int rn, unsigned long ea, int nb, 
struct pt_regs *regs)
int err;
union {
float f;
-   double d;
-   unsigned long l;
-   u8 b[sizeof(double)];
+   double d[2];
+   unsigned long l[2];
+   u8 b[2 * sizeof(double)];
} u;
 
if (!address_ok(regs, ea, nb))
@@ -426,11 +426,19 @@ static int do_fp_load(int rn, unsigned long ea, int nb, 
struct pt_regs *regs)
return err;
preempt_disable();
if (nb == 4)
-   conv_sp_to_dp(&u.f, &u.d);
+   conv_sp_to_dp(&u.f, &u.d[0]);
if (regs->msr & MSR_FP)
-   put_fpr(rn, &u.d);
+   put_fpr(rn, &u.d[0]);
else
-   current->thread.TS_FPR(rn) = u.l;
+   current->thread.TS_FPR(rn) = u.l[0];
+   if (nb == 16) {
+   /* lfdp */
+   rn |= 1;
+   if (regs->msr & MSR_FP)
+   put_fpr(rn, &u.d[1]);
+   else
+   current->thread.TS_FPR(rn) = u.l[1];
+   }
preempt_enable();
return 0;
 }
@@ -440,20 +448,27 @@ static int do_fp_store(int rn, unsigned long ea, int nb, 
struct pt_regs *regs)
 {
union {
float f;
-   double d;
-   unsigned long l;
-   u8 b[sizeof(double)];
+   double d[2];
+   unsigned long l[2];
+   u8 b[2 * sizeof(double)];
} u;
 
if (!address_ok(regs, ea, nb))
return -EFAULT;
preempt_disable();
if (regs->msr & MSR_FP)
-   get_fpr(rn, &u.d);
+   get_fpr(rn, &u.d[0]);
else
-   u.l = current->thread.TS_FPR(rn);
+   u.l[0] = current->thread.TS_FPR(rn);
if (nb == 4)
-   conv_dp_to_sp(&u.d, &u.f);
+   conv_dp_to_sp(&u.d[0], &u.f);
+   if (nb == 16) {
+   rn |= 1;
+   if (regs->msr & MSR_FP)
+   get_fpr(rn, &u.d[1]);
+   else
+   u.l[1] = current->thread.TS_FPR(rn);
+   }
preempt_enable();
return copy_mem_out(u.b, ea, nb);
 }
@@ -1966,7 +1981,21 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
goto fpunavail;
op->type = MKOP(STORE_FP, u, 8);
break;
-#endif
+
+#ifdef __powerpc64__
+   case 791:   /* lfdpx */
+   if (!(regs->msr & MSR_FP))
+   goto fpunavail;
+   op->type = MKOP(LOAD_FP, 0, 16);
+   break;
+
+   case 919:   /* stfdpx */
+   if (!(regs->msr & MSR_FP))
+   goto fpunavail;
+   op->type = MKOP(STORE_FP, 0, 16);
+   break;
+#endif /* __powerpc64 */
+#endif /* CONFIG_PPC_FPU */
 
 #ifdef __powerpc64__
case 660:   /* stdbrx */
@@ -1984,7 +2013,7 @@ int analyse_instr(struct instruction_op *op, const struct 
pt_regs *regs,
op->val = byterev_4(regs->gpr[rd]);
break;
 
-   case 725:
+   case 725:   /* stswi */
if (rb == 0)
rb = 32;/* # bytes to store */
op->type = MKOP(STORE_MULTI, 0, rb);
@@ -2368,9 +2397,16 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
 #endif
 
 #ifdef CONFIG_VSX
-   case 57:/* lxsd, lxssp */
+   case 57:/* lfdp, lxsd, lxssp */
op->ea = dsform_ea(instr, regs);
switch (instr & 3) {
+   case 0: /* lfdp */
+   if (!(regs->msr & MSR_FP))
+   goto fpunavail;
+   if (rd & 1)
+   break;  /* reg must be even */
+   op->type = MKOP(LOAD_FP, 0, 16);
+   break;
case 2: /* lxsd */
if (!(regs->msr & MSR_VSX))
goto vsxunavail;
@@ -2408,8 +2444,16 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
 #endif
 
 #ifdef CONFIG_VSX
-   case 61:/* lxv, stxsd, stxssp, stxv */
+   case 61:

[PATCH v2 07/10] powerpc: Handle vector element load/stores in emulation code

2017-08-24 Thread Paul Mackerras

This adds code to analyse_instr() and emulate_step() to handle the
vector element loads and stores:

lvebx, lvehx, lvewx, stvebx, stvehx, stvewx.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/lib/sstep.c | 50 ++--
 1 file changed, 48 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 0b295fb..82b1e69 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -475,7 +475,7 @@ static nokprobe_inline int do_vec_load(int rn, unsigned 
long ea,
return -EFAULT;
/* align to multiple of size */
ea &= ~(size - 1);
-   err = copy_mem_in(u.b, ea, size);
+   err = copy_mem_in(&u.b[ea & 0xf], ea, size);
if (err)
return err;
 
@@ -507,7 +507,7 @@ static nokprobe_inline int do_vec_store(int rn, unsigned 
long ea,
else
u.v = current->thread.vr_state.vr[rn];
preempt_enable();
-   return copy_mem_out(u.b, ea, size);
+   return copy_mem_out(&u.b[ea & 0xf], ea, size);
 }
 #endif /* CONFIG_ALTIVEC */
 
@@ -1808,6 +1808,31 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
break;
 
 #ifdef CONFIG_ALTIVEC
+   /*
+* Note: for the load/store vector element instructions,
+* bits of the EA say which field of the VMX register to use.
+*/
+   case 7: /* lvebx */
+   if (!(regs->msr & MSR_VEC))
+   goto vecunavail;
+   op->type = MKOP(LOAD_VMX, 0, 1);
+   op->element_size = 1;
+   break;
+
+   case 39:/* lvehx */
+   if (!(regs->msr & MSR_VEC))
+   goto vecunavail;
+   op->type = MKOP(LOAD_VMX, 0, 2);
+   op->element_size = 2;
+   break;
+
+   case 71:/* lvewx */
+   if (!(regs->msr & MSR_VEC))
+   goto vecunavail;
+   op->type = MKOP(LOAD_VMX, 0, 4);
+   op->element_size = 4;
+   break;
+
case 103:   /* lvx */
case 359:   /* lvxl */
if (!(regs->msr & MSR_VEC))
@@ -1816,6 +1841,27 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
op->element_size = 16;
break;
 
+   case 135:   /* stvebx */
+   if (!(regs->msr & MSR_VEC))
+   goto vecunavail;
+   op->type = MKOP(STORE_VMX, 0, 1);
+   op->element_size = 1;
+   break;
+
+   case 167:   /* stvehx */
+   if (!(regs->msr & MSR_VEC))
+   goto vecunavail;
+   op->type = MKOP(STORE_VMX, 0, 2);
+   op->element_size = 2;
+   break;
+
+   case 199:   /* stvewx */
+   if (!(regs->msr & MSR_VEC))
+   goto vecunavail;
+   op->type = MKOP(STORE_VMX, 0, 4);
+   op->element_size = 4;
+   break;
+
case 231:   /* stvx */
case 487:   /* stvxl */
if (!(regs->msr & MSR_VEC))
-- 
2.7.4

[PATCH v2 06/10] powerpc: Emulate FP/vector/VSX loads/stores correctly when regs not live

2017-08-24 Thread Paul Mackerras

At present, the analyse_instr/emulate_step code checks for the
relevant MSR_FP/VEC/VSX bit being set when a FP/VMX/VSX load
or store is decoded, but doesn't recheck the bit before reading or
writing the relevant FP/VMX/VSX register in emulate_step().

Since we don't have preemption disabled, it is possible that we get
preempted between checking the MSR bit and doing the register access.
If that happened, then the registers would have been saved to the
thread_struct for the current process.  Accesses to the CPU registers
would then potentially read stale values, or write values that would
never be seen by the user process.

Another way that the registers can become non-live is if a page
fault occurs when accessing user memory, and the page fault code
calls a copy routine that wants to use the VMX or VSX registers.

To fix this, the code for all the FP/VMX/VSX loads gets restructured
so that it forms an image in a local variable of the desired register
contents, then disables preemption, checks the MSR bit and either
sets the CPU register or writes the value to the thread struct.
Similarly, the code for stores checks the MSR bit, copies either the
CPU register or the thread struct to a local variable, then reenables
preemption and then copies the register image to memory.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/sstep.h |   1 +
 arch/powerpc/lib/ldstfp.S| 241 +++
 arch/powerpc/lib/sstep.c | 218 ---
 3 files changed, 193 insertions(+), 267 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 5cdcbc4..0e5dd23 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -116,6 +116,7 @@ union vsx_reg {
unsigned long d[2];
float   fp[4];
double  dp[2];
+   __vector128 v;
 };
 
 /*
diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S
index 6840911..7b5cf5e 100644
--- a/arch/powerpc/lib/ldstfp.S
+++ b/arch/powerpc/lib/ldstfp.S
@@ -21,27 +21,19 @@
 
 #define STKFRM (PPC_MIN_STKFRM + 16)
 
-   .macro  inst32  op
-reg = 0
-   .rept   32
-20:\op reg,0,r4
-   b   3f
-   EX_TABLE(20b,99f)
-reg = reg + 1
-   .endr
-   .endm
-
-/* Get the contents of frN into fr0; N is in r3. */
+/* Get the contents of frN into *p; N is in r3 and p is in r4. */
 _GLOBAL(get_fpr)
mflrr0
+   mfmsr   r6
+   ori r7, r6, MSR_FP
+   MTMSRD(r7)
+   isync
rlwinm  r3,r3,3,0xf8
bcl 20,31,1f
-   blr /* fr0 is already in fr0 */
-   nop
-reg = 1
-   .rept   31
-   fmr fr0,reg
-   blr
+reg = 0
+   .rept   32
+   stfdreg, 0(r4)
+   b   2f
 reg = reg + 1
.endr
 1: mflrr5
@@ -49,18 +41,23 @@ reg = reg + 1
mtctr   r5
mtlrr0
bctr
+2: MTMSRD(r6)
+   isync
+   blr
 
-/* Put the contents of fr0 into frN; N is in r3. */
+/* Put the contents of *p into frN; N is in r3 and p is in r4. */
 _GLOBAL(put_fpr)
mflrr0
+   mfmsr   r6
+   ori r7, r6, MSR_FP
+   MTMSRD(r7)
+   isync
rlwinm  r3,r3,3,0xf8
bcl 20,31,1f
-   blr /* fr0 is already in fr0 */
-   nop
-reg = 1
-   .rept   31
-   fmr reg,fr0
-   blr
+reg = 0
+   .rept   32
+   lfd reg, 0(r4)
+   b   2f
 reg = reg + 1
.endr
 1: mflrr5
@@ -68,127 +65,24 @@ reg = reg + 1
mtctr   r5
mtlrr0
bctr
-
-/* Load FP reg N from float at *p.  N is in r3, p in r4. */
-_GLOBAL(do_lfs)
-   PPC_STLU r1,-STKFRM(r1)
-   mflrr0
-   PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1)
-   mfmsr   r6
-   ori r7,r6,MSR_FP
-   cmpwi   cr7,r3,0
-   MTMSRD(r7)
-   isync
-   beq cr7,1f
-   stfdfr0,STKFRM-16(r1)
-1: li  r9,-EFAULT
-2: lfs fr0,0(r4)
-   li  r9,0
-3: bl  put_fpr
-   beq cr7,4f
-   lfd fr0,STKFRM-16(r1)
-4: PPC_LL  r0,STKFRM+PPC_LR_STKOFF(r1)
-   mtlrr0
-   MTMSRD(r6)
-   isync
-   mr  r3,r9
-   addir1,r1,STKFRM
-   blr
-   EX_TABLE(2b,3b)
-
-/* Load FP reg N from double at *p.  N is in r3, p in r4. */
-_GLOBAL(do_lfd)
-   PPC_STLU r1,-STKFRM(r1)
-   mflrr0
-   PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1)
-   mfmsr   r6
-   ori r7,r6,MSR_FP
-   cmpwi   cr7,r3,0
-   MTMSRD(r7)
-   isync
-   beq cr7,1f
-   stfdfr0,STKFRM-16(r1)
-1: li  r9,-EFAULT
-2: lfd fr0,0(r4)
-   li  r9,0
-3: beq cr7,4f
-   bl  put_fpr
-   lfd fr0,STKFRM-16(r1)
-4: PPC_LL  r0,STKFRM+PPC_LR_STKOFF(r1)
-   mtlrr0
-   MTMSRD(r6)
-   isync
-   mr  r3,r9
-   addir1,r1,STKFRM
-   blr
-   EX_TABLE(2b,3b)
-
-/* Store FP reg

[PATCH v2 05/10] powerpc: Make load/store emulation use larger memory accesses

2017-08-24 Thread Paul Mackerras

At the moment, emulation of loads and stores of up to 8 bytes to
unaligned addresses on a little-endian system uses a sequence of
single-byte loads or stores to memory.  This is rather inefficient,
and the code is hard to follow because it has many ifdefs.
In addition, the Power ISA has requirements on how unaligned accesses
are performed, which are not met by doing all accesses as
sequences of single-byte accesses.

Emulation of VSX loads and stores uses __copy_{to,from}_user,
which means the emulation code has no control on the size of
accesses.

To simplify this, we add new copy_mem_in() and copy_mem_out()
functions for accessing memory.  These use a sequence of the largest
possible aligned accesses, up to 8 bytes (or 4 on 32-bit systems),
to copy memory between a local buffer and user memory.  We then
rewrite {read,write}_mem_unaligned and the VSX load/store
emulation using these new functions.

These new function also simplify the code in do_fp_load() and
do_fp_store() for the unaligned cases.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/lib/sstep.c | 237 +--
 1 file changed, 106 insertions(+), 131 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index d9b3b63..861654e 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -194,7 +194,6 @@ static nokprobe_inline unsigned long max_align(unsigned 
long x)
return x & -x;  /* isolates rightmost bit */
 }
 
-
 static nokprobe_inline unsigned long byterev_2(unsigned long x)
 {
return ((x >> 8) & 0xff) | ((x & 0xff) << 8);
@@ -240,56 +239,68 @@ static nokprobe_inline int read_mem_aligned(unsigned long 
*dest,
return err;
 }
 
-static nokprobe_inline int read_mem_unaligned(unsigned long *dest,
-   unsigned long ea, int nb, struct pt_regs *regs)
+/*
+ * Copy from userspace to a buffer, using the largest possible
+ * aligned accesses, up to sizeof(long).
+ */
+static int nokprobe_inline copy_mem_in(u8 *dest, unsigned long ea, int nb)
 {
-   int err;
-   unsigned long x, b, c;
-#ifdef __LITTLE_ENDIAN__
-   int len = nb; /* save a copy of the length for byte reversal */
-#endif
+   int err = 0;
+   int c;
 
-   /* unaligned, do this in pieces */
-   x = 0;
for (; nb > 0; nb -= c) {
-#ifdef __LITTLE_ENDIAN__
-   c = 1;
-#endif
-#ifdef __BIG_ENDIAN__
c = max_align(ea);
-#endif
if (c > nb)
c = max_align(nb);
-   err = read_mem_aligned(&b, ea, c);
+   switch (c) {
+   case 1:
+   err = __get_user(*dest, (unsigned char __user *) ea);
+   break;
+   case 2:
+   err = __get_user(*(u16 *)dest,
+(unsigned short __user *) ea);
+   break;
+   case 4:
+   err = __get_user(*(u32 *)dest,
+(unsigned int __user *) ea);
+   break;
+#ifdef __powerpc64__
+   case 8:
+   err = __get_user(*(unsigned long *)dest,
+(unsigned long __user *) ea);
+   break;
+#endif
+   }
if (err)
return err;
-   x = (x << (8 * c)) + b;
+   dest += c;
ea += c;
}
-#ifdef __LITTLE_ENDIAN__
-   switch (len) {
-   case 2:
-   *dest = byterev_2(x);
-   break;
-   case 4:
-   *dest = byterev_4(x);
-   break;
-#ifdef __powerpc64__
-   case 8:
-   *dest = byterev_8(x);
-   break;
-#endif
-   }
-#endif
-#ifdef __BIG_ENDIAN__
-   *dest = x;
-#endif
return 0;
 }
 
+static nokprobe_inline int read_mem_unaligned(unsigned long *dest,
+ unsigned long ea, int nb)
+{
+   union {
+   unsigned long ul;
+   u8 b[sizeof(unsigned long)];
+   } u;
+   int i;
+   int err;
+
+   u.ul = 0;
+   i = IS_BE ? sizeof(unsigned long) - nb : 0;
+   err = copy_mem_in(&u.b[i], ea, nb);
+   if (!err)
+   *dest = u.ul;
+   return err;
+}
+
 /*
  * Read memory at address ea for nb bytes, return 0 for success
- * or -EFAULT if an error occurred.
+ * or -EFAULT if an error occurred.  N.B. nb must be 1, 2, 4 or 8.
+ * If nb < sizeof(long), the result is right-justified on BE systems.
  */
 static int read_mem(unsigned long *dest, unsigned long ea, int nb,
  struct pt_regs *regs)
@@ -298,7 +309,7 @@ static int read_mem(unsigned long *dest, unsigned long ea, 
int nb,
return -EFAULT;
if ((ea & (nb - 1)) == 0)
return read_mem_aligned(dest, ea, nb);
-   return read_m

[PATCH v2 04/10] powerpc: Add emulation for the addpcis instruction

2017-08-24 Thread Paul Mackerras

The addpcis instruction puts the sum of the next instruction address
plus a constant into a register.  Since the result depends on the
address of the instruction, it will give an incorrect result if it
is single-stepped out of line, which is what the *probes subsystem
will currently do if a probe is placed on an addpcis instruction.
This fixes the problem by adding emulation of it to analyse_instr().

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/lib/sstep.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 7921b2a..d9b3b63 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1024,9 +1024,6 @@ int analyse_instr(struct instruction_op *op, const struct 
pt_regs *regs,
op->ccval = (regs->ccr & ~(1UL << (31 - rd))) |
(val << (31 - rd));
return 1;
-   default:
-   op->type = UNKNOWN;
-   return 0;
}
break;
case 31:
@@ -1126,6 +1123,17 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
op->val = imm;
goto compute_done;
 
+   case 19:
+   if (((instr >> 1) & 0x1f) == 2) {
+   /* addpcis */
+   imm = (short) (instr & 0xffc1); /* d0 + d2 fields */
+   imm |= (instr >> 15) & 0x3e;/* d1 field */
+   op->val = regs->nip + (imm << 16) + 4;
+   goto compute_done;
+   }
+   op->type = UNKNOWN;
+   return 0;
+
case 20:/* rlwimi */
mb = (instr >> 6) & 0x1f;
me = (instr >> 1) & 0x1f;
-- 
2.7.4

[PATCH v2 03/10] powerpc: Fix emulation of the isel instruction

2017-08-24 Thread Paul Mackerras

The case added for the isel instruction was added inside a switch
statement which uses the 10-bit minor opcode field in the 0x7fe
bits of the instruction word.  However, for the isel instruction,
the minor opcode field is only the 0x3e bits, and the 0x7c0 bits
are used for the "BC" field, which indicates which CR bit to use
to select the result.

Therefore, for the isel emulation to work correctly when BC != 0,
we need to match on ((instr >> 1) & 0x1f) == 15).  To do this, we
pull the isel case out of the switch statement and put it in an
if statement of its own.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/lib/sstep.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index f9c973c..7921b2a 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1219,6 +1219,16 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
return 0;
 
case 31:
+   /* isel occupies 32 minor opcodes */
+   if (((instr >> 1) & 0x1f) == 15) {
+   mb = (instr >> 6) & 0x1f; /* bc field */
+   val = (regs->ccr >> (31 - mb)) & 1;
+   val2 = (ra) ? regs->gpr[ra] : 0;
+
+   op->val = (val) ? val2 : regs->gpr[rb];
+   goto compute_done;
+   }
+
switch ((instr >> 1) & 0x3ff) {
case 4: /* tw */
if (rd == 0x1f ||
@@ -1444,14 +1454,6 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
 /*
  * Logical instructions
  */
-   case 15:/* isel */
-   mb = (instr >> 6) & 0x1f; /* bc */
-   val = (regs->ccr >> (31 - mb)) & 1;
-   val2 = (ra) ? regs->gpr[ra] : 0;
-
-   op->val = (val) ? val2 : regs->gpr[rb];
-   goto compute_done;
-
case 26:/* cntlzw */
op->val = __builtin_clz((unsigned int) regs->gpr[rd]);
goto logical_done;
-- 
2.7.4

[PATCH v2 02/10] powerpc: Change analyse_instr so it doesn't modify *regs

2017-08-24 Thread Paul Mackerras

The analyse_instr function currently doesn't just work out what an
instruction does, it also executes those instructions whose effect
is only to update CPU registers that are stored in struct pt_regs.
This is undesirable because optprobes uses analyse_instr to work out
if an instruction could be successfully emulated in future.

This changes analyse_instr so it doesn't modify *regs; instead it
stores information in the instruction_op structure to indicate what
registers (GPRs, CR, XER, LR) would be set and what value they would
be set to.  A companion function called emulate_update_regs() can
then use that information to update a pt_regs struct appropriately.

As a minor cleanup, this replaces inline asm using the cntlzw and
cntlzd instructions with calls to __builtin_clz() and __builtin_clzl().

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/sstep.h |  52 +++-
 arch/powerpc/lib/sstep.c | 607 +++
 2 files changed, 400 insertions(+), 259 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 863e1e4..5cdcbc4 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -23,9 +23,6 @@ struct pt_regs;
 #define IS_RFID(instr) (((instr) & 0xfc0007fe) == 0x4c24)
 #define IS_RFI(instr)  (((instr) & 0xfc0007fe) == 0x4c64)
 
-/* Emulate instructions that cause a transfer of control. */
-extern int emulate_step(struct pt_regs *regs, unsigned int instr);
-
 enum instruction_type {
COMPUTE,/* arith/logical/CR op, etc. */
LOAD,
@@ -55,11 +52,29 @@ enum instruction_type {
 
 #define INSTR_TYPE_MASK0x1f
 
+/* Compute flags, ORed in with type */
+#define SETREG 0x20
+#define SETCC  0x40
+#define SETXER 0x80
+
+/* Branch flags, ORed in with type */
+#define SETLK  0x20
+#define BRTAKEN0x40
+#define DECCTR 0x80
+
 /* Load/store flags, ORed in with type */
 #define SIGNEXT0x20
 #define UPDATE 0x40/* matches bit in opcode 31 instructions */
 #define BYTEREV0x80
 
+/* Barrier type field, ORed in with type */
+#define BARRIER_MASK   0xe0
+#define BARRIER_SYNC   0x00
+#define BARRIER_ISYNC  0x20
+#define BARRIER_EIEIO  0x40
+#define BARRIER_LWSYNC 0x60
+#define BARRIER_PTESYNC0x80
+
 /* Cacheop values, ORed in with type */
 #define CACHEOP_MASK   0x700
 #define DCBST  0
@@ -90,6 +105,8 @@ struct instruction_op {
int spr;
u8 element_size;/* for VSX/VMX loads/stores */
u8 vsx_flags;
+   u32 ccval;
+   u32 xerval;
 };
 
 union vsx_reg {
@@ -101,8 +118,35 @@ union vsx_reg {
double  dp[2];
 };
 
-extern int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
+/*
+ * Decode an instruction, and return information about it in *op
+ * without changing *regs.
+ *
+ * Return value is 1 if the instruction can be emulated just by
+ * updating *regs with the information in *op, -1 if we need the
+ * GPRs but *regs doesn't contain the full register set, or 0
+ * otherwise.
+ */
+extern int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 unsigned int instr);
+
+/*
+ * Emulate an instruction that can be executed just by updating
+ * fields in *regs.
+ */
+void emulate_update_regs(struct pt_regs *reg, struct instruction_op *op);
+
+/*
+ * Emulate instructions that cause a transfer of control,
+ * arithmetic/logical instructions, loads and stores,
+ * cache operations and barriers.
+ *
+ * Returns 1 if the instruction was emulated successfully,
+ * 0 if it could not be emulated, or -1 for an instruction that
+ * should not be emulated (rfid, mtmsrd clearing MSR_RI, etc.).
+ */
+extern int emulate_step(struct pt_regs *regs, unsigned int instr);
+
 extern void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg,
 const void *mem);
 extern void emulate_vsx_store(struct instruction_op *op, const union vsx_reg 
*reg,
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 6aa0ba6..f9c973c 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -83,15 +83,17 @@ static nokprobe_inline unsigned long 
truncate_if_32bit(unsigned long msr,
 /*
  * Determine whether a conditional branch instruction would branch.
  */
-static nokprobe_inline int branch_taken(unsigned int instr, struct pt_regs 
*regs)
+static nokprobe_inline int branch_taken(unsigned int instr,
+   const struct pt_regs *regs,
+   struct instruction_op *op)
 {
unsigned int bo = (instr >> 21) & 0x1f;
unsigned int bi;
 
if ((bo & 4) == 0) {
/* decrement counter */
-   --regs->ctr;
-   if (((bo >> 1) & 1) ^ (regs->ctr == 0))
+   op->type |= DECCTR;
+   if (((bo >> 1

[PATCH v2 01/10] powerpc: Handle most loads and stores in instruction emulation code

2017-08-24 Thread Paul Mackerras

This extends the instruction emulation infrastructure in sstep.c to
handle all the load and store instructions defined in the Power ISA
v3.0, except for the atomic memory operations, ldmx (which was never
implemented), lfdp/stfdp, and the vector element load/stores.

The instructions added are:

Integer loads and stores: lbarx, lharx, lqarx, stbcx., sthcx., stqcx.,
lq, stq.

VSX loads and stores: lxsiwzx, lxsiwax, stxsiwx, lxvx, lxvl, lxvll,
lxvdsx, lxvwsx, stxvx, stxvl, stxvll, lxsspx, lxsdx, stxsspx, stxsdx,
lxvw4x, lxsibzx, lxvh8x, lxsihzx, lxvb16x, stxvw4x, stxsibx, stxvh8x,
stxsihx, stxvb16x, lxsd, lxssp, lxv, stxsd, stxssp, stxv.

These instructions are handled both in the analyse_instr phase and in
the emulate_step phase.

The code for lxvd2ux and stxvd2ux has been taken out, as those
instructions were never implemented in any processor and have been
taken out of the architecture, and their opcodes have been reused for
other instructions in POWER9 (lxvb16x and stxvb16x).

The emulation for the VSX loads and stores uses helper functions
which don't access registers or memory directly, which can hopefully
be reused by KVM later.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/sstep.h |  20 ++
 arch/powerpc/lib/Makefile|   2 +-
 arch/powerpc/lib/ldstfp.S|  70 ++--
 arch/powerpc/lib/quad.S  |  62 
 arch/powerpc/lib/sstep.c | 688 ---
 5 files changed, 781 insertions(+), 61 deletions(-)
 create mode 100644 arch/powerpc/lib/quad.S

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index d3a42cc..863e1e4 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -68,6 +68,11 @@ enum instruction_type {
 #define DCBT   0x300
 #define ICBI   0x400
 
+/* VSX flags values */
+#define VSX_FPCONV 1   /* do floating point SP/DP conversion */
+#define VSX_SPLAT  2   /* store loaded value into all elements */
+#define VSX_LDLEFT 4   /* load VSX register from left */
+
 /* Size field in type word */
 #define SIZE(n)((n) << 8)
 #define GETSIZE(w) ((w) >> 8)
@@ -83,7 +88,22 @@ struct instruction_op {
int update_reg;
/* For MFSPR */
int spr;
+   u8 element_size;/* for VSX/VMX loads/stores */
+   u8 vsx_flags;
+};
+
+union vsx_reg {
+   u8  b[16];
+   u16 h[8];
+   u32 w[4];
+   unsigned long d[2];
+   float   fp[4];
+   double  dp[2];
 };
 
 extern int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
 unsigned int instr);
+extern void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg,
+const void *mem);
+extern void emulate_vsx_store(struct instruction_op *op, const union vsx_reg 
*reg,
+ void *mem);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 3c3146b..7921fed 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -31,7 +31,7 @@ obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o
 
 obj-y  += checksum_$(BITS).o checksum_wrappers.o
 
-obj-$(CONFIG_PPC_EMULATE_SSTEP)+= sstep.o ldstfp.o
+obj-$(CONFIG_PPC_EMULATE_SSTEP)+= sstep.o ldstfp.o quad.o
 
 obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o
 
diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S
index a58777c..6840911 100644
--- a/arch/powerpc/lib/ldstfp.S
+++ b/arch/powerpc/lib/ldstfp.S
@@ -178,10 +178,10 @@ _GLOBAL(do_stfd)
EX_TABLE(2b,3b)
 
 #ifdef CONFIG_ALTIVEC
-/* Get the contents of vrN into v0; N is in r3. */
+/* Get the contents of vrN into v0; N is in r3. Doesn't touch r3 or r4. */
 _GLOBAL(get_vr)
mflrr0
-   rlwinm  r3,r3,3,0xf8
+   rlwinm  r6,r3,3,0xf8
bcl 20,31,1f
blr /* v0 is already in v0 */
nop
@@ -192,15 +192,15 @@ reg = 1
 reg = reg + 1
.endr
 1: mflrr5
-   add r5,r3,r5
+   add r5,r6,r5
mtctr   r5
mtlrr0
bctr
 
-/* Put the contents of v0 into vrN; N is in r3. */
+/* Put the contents of v0 into vrN; N is in r3. Doesn't touch r3 or r4. */
 _GLOBAL(put_vr)
mflrr0
-   rlwinm  r3,r3,3,0xf8
+   rlwinm  r6,r3,3,0xf8
bcl 20,31,1f
blr /* v0 is already in v0 */
nop
@@ -211,7 +211,7 @@ reg = 1
 reg = reg + 1
.endr
 1: mflrr5
-   add r5,r3,r5
+   add r5,r6,r5
mtctr   r5
mtlrr0
bctr
@@ -313,7 +313,7 @@ reg = reg + 1
bctr
 
 /* Load VSX reg N from vector doubleword *p.  N is in r3, p in r4. */
-_GLOBAL(do_lxvd2x)
+_GLOBAL(load_vsrn)
PPC_STLU r1,-STKFRM(r1)
mflrr0
PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1)
@@ -325,41 +325,38 @@ _GLOBAL(do_lxvd2x)
isync
beq cr7,1f
STXVD2X(0,R1,R8)
-

[PATCH v2 0/10] powerpc: Beef up single-stepping/instruction emulation infrastructure

2017-08-24 Thread Paul Mackerras

This patch series extends the code in arch/powerpc/lib/sstep.c so that
it handles almost all load and store instructions -- all except the
atomic memory operations (lwat, stwat, etc.).  It also makes sure that
we use the largest possible aligned accesses to access memory and that
we don't access the CPU FP/VMX/VSX registers when they don't contain
user data.  With this, it should be possible to replace the body of
the alignment interrupt handler with a call to emulate_step() or
something quite similar.

This version is based on the powerpc tree next branch as of a day or
two ago, and includes code to emulate addpcis, a fix for the isel
emulation, code to handle the multi-register loads and stores in
little-endian mode, and a fix for the wrong behaviour in updating RA
for load/store with update instructions in 32-bit mode.

Paul.

 arch/powerpc/include/asm/sstep.h |   77 +-
 arch/powerpc/lib/Makefile|2 +-
 arch/powerpc/lib/ldstfp.S|  307 ++
 arch/powerpc/lib/quad.S  |   62 ++
 arch/powerpc/lib/sstep.c | 1929 --
 5 files changed, 1654 insertions(+), 723 deletions(-)

RE: [PATCH v3 0/7] Add RSS to DPAA 1.x Ethernet driver

2017-08-24 Thread Madalin-cristian Bucur

> -Original Message-
> From: David Miller [mailto:da...@davemloft.net]
> Subject: Re: [PATCH v3 0/7] Add RSS to DPAA 1.x Ethernet driver
> 
> From: David Miller 
> Date: Thu, 24 Aug 2017 09:42:20 -0700 (PDT)
> 
> > From: Madalin Bucur 
> > Date: Thu, 24 Aug 2017 10:28:21 +0300
> >
> >> This patch set introduces Receive Side Scaling for the DPAA Ethernet
> >> driver. Documentation is updated with details related to the new
> >> feature and limitations that apply.
> >> Added also a small fix.
> >>
> >> v2: removed a C++ style comment
> >> v3: move struct fman to header file to avoid exporting a function
> >
> > Series applied, thanks.
> 
> Actually I'm reverting, this doesn't even compile.

Hi,

Sorry for this blunder, I've only tested on PPC, where it works.
Will come back with a proper patch set.

Madalin

Re: [PATCH 5/6] powerpc/mm: Optimize detection of thread local mm's

2017-08-24 Thread Frederic Barrat




Le 24/08/2017 à 20:47, Benjamin Herrenschmidt a écrit :

On Thu, 2017-08-24 at 18:40 +0200, Frederic Barrat wrote:


The decrementing part is giving me troubles, and I think it makes sense:
if I decrement the counter when detaching the context from the capi
card, then the next TLBIs for the memory context may be back to local.


Yes, you need to flush the CAPI TLB first.


So when the process exits, the NPU wouldn't get the associated TLBIs,
which spells trouble the next time the same memory context ID is reused.
I believe this the cause of the problem I'm seeing. As soon as I keep
the TLBIs global, even after I detach from the capi adapter, everything
is fine.

Does it sound right?

So to keep the checks minimal in mm_is_thread_local(), to just checking
the active_cpus count, I'm thinking of introducing a "copro enabled" bit
on the context, so that we can increment active_cpus only once. And
never decrement it.


You can decrement if you flush. Don't you have MMIOs to do directed
flushes ?


That's for the nMMU. Last I heard, we don't have MMIOs to flush anything 
on the nMMU.


Side note: for the PSL, we do have MMIOs to flush, but they were 
perceived as useful only for debug and we don't rely on them, precisely 
because the nMMU would fall out of sync, so we have to rely on broadcast.


  Fred

[PATCH v3 4/4] powerpc/64s: idle ESL=0 stop can avoid MSR and save/restore overhead

2017-08-24 Thread Nicholas Piggin

When stop is executed with EC=ESL=0, it appears to execute like a
normal instruction (resuming from NIP when woken by interrupt).
So all the save/restore handling can be avoided completely. In
particular NV GPRs do not have to be saved, and MSR does not have
to be switched back to kernel MSR.

So move the test for "lite" sleep states out to power9_idle_stop.

Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/idle_book3s.S | 35 ---
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/idle_book3s.S 
b/arch/powerpc/kernel/idle_book3s.S
index 32d65ee323a0..fa56120bd0bc 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -315,9 +315,6 @@ enter_winkle:
 
ARCH207_IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
 
-/*
- * r3 - PSSCR value corresponding to the requested stop state.
- */
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 power_enter_stop_kvm_rm:
/*
@@ -330,14 +327,11 @@ power_enter_stop_kvm_rm:
li  r4,KVM_HWTHREAD_IN_IDLE
/* DO THIS IN REAL MODE!  See comment above. */
stb r4,HSTATE_HWTHREAD_STATE(r13)
-#endif
-power_enter_stop:
 /*
  * Check if we are executing the lite variant with ESL=EC=0
  */
-   andis.   r4,r3,PSSCR_EC_ESL_MASK_SHIFTED
-   clrldi   r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */
-   bne  .Lhandle_esl_ec_set
+   andis.  r4,r3,PSSCR_EC_ESL_MASK_SHIFTED
+   bne power_enter_stop_esl
PPC_STOP
li  r3,0  /* Since we didn't lose state, return 0 */
 
@@ -354,8 +348,13 @@ power_enter_stop:
 */
li  r12, 0
b   pnv_wakeup_noloss
+#endif
 
-.Lhandle_esl_ec_set:
+/*
+ * r3 - PSSCR value corresponding to the requested stop state.
+ */
+power_enter_stop_esl:
+   clrldi   r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */
/*
 * POWER9 DD2 can incorrectly set PMAO when waking up after a
 * state-loss idle. Saving and restoring MMCR0 over idle is a
@@ -428,9 +427,23 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);   
\
  * r3 contains desired PSSCR register value.
  */
 _GLOBAL(power9_idle_stop)
-   std r3, PACA_REQ_PSSCR(r13)
mtspr   SPRN_PSSCR,r3
-   LOAD_REG_ADDR(r4,power_enter_stop)
+
+   /*
+* Check if we are executing the lite variant with ESL=EC=0
+* This case resumes execution after the stop instruction without
+* losing any state, so nothing has to be saved. The following
+* instructions up to the blr must be skipped if we want to
+* use power_enter_stop_kvm_rm.
+*/
+   andis.  r4,r3,PSSCR_EC_ESL_MASK_SHIFTED
+   bne 1f
+   PPC_STOP
+   li  r3,0  /* Since we didn't lose state, return 0 */
+   blr
+1: /* state-loss idle */
+   std r3, PACA_REQ_PSSCR(r13)
+   LOAD_REG_ADDR(r4,power_enter_stop_esl)
b   pnv_powersave_common
/* No return */
 
-- 
2.13.3

[PATCH v3 3/4] powerpc/64s: idle POWER9 can execute stop in virtual mode

2017-08-24 Thread Nicholas Piggin

The hardware can execute stop in any context, and KVM does not
require real mode because siblings do not share MMU state. This
saves a switch to real-mode when going idle.

Acked-by: Gautham R. Shenoy 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/idle_book3s.S | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/kernel/idle_book3s.S 
b/arch/powerpc/kernel/idle_book3s.S
index 14e97f442167..32d65ee323a0 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -195,7 +195,16 @@ pnv_powersave_common:
std r5,_CCR(r1)
std r1,PACAR1(r13)
 
+BEGIN_FTR_SECTION
+   /*
+* POWER9 does not require real mode to stop, and presently does not
+* set hwthread_state for KVM (threads don't share MMU context), so
+* we can remain in virtual mode for this.
+*/
+   bctr
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
/*
+* POWER8
 * Go to real mode to do the nap, as required by the architecture.
 * Also, we need to be in real mode before setting hwthread_state,
 * because as soon as we do that, another thread can switch
-- 
2.13.3

[PATCH v3 2/4] powerpc/64s: idle POWER9 can execute stop without a sync sequence

2017-08-24 Thread Nicholas Piggin

Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/cpuidle.h | 16 
 arch/powerpc/kernel/idle_book3s.S  | 26 --
 2 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index 8a174cba5567..eb43b5c3a7b5 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -101,20 +101,4 @@ static inline void report_invalid_psscr_val(u64 psscr_val, 
int err)
 
 #endif
 
-/* Idle state entry routines */
-#ifdef CONFIG_PPC_P7_NAP
-#define IDLE_STATE_ENTER_SEQ(IDLE_INST) \
-   /* Magic NAP/SLEEP/WINKLE mode enter sequence */\
-   std r0,0(r1);   \
-   ptesync;\
-   ld  r0,0(r1);   \
-236:   cmpdcr0,r0,r0;  \
-   bne 236b;   \
-   IDLE_INST;  \
-
-#defineIDLE_STATE_ENTER_SEQ_NORET(IDLE_INST)   \
-   IDLE_STATE_ENTER_SEQ(IDLE_INST) \
-   b   .
-#endif /* CONFIG_PPC_P7_NAP */
-
 #endif
diff --git a/arch/powerpc/kernel/idle_book3s.S 
b/arch/powerpc/kernel/idle_book3s.S
index 4924647d964d..14e97f442167 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -205,6 +205,19 @@ pnv_powersave_common:
mtmsrd  r7,0
bctr
 
+/*
+ * This is the sequence required to execute idle instructions, as
+ * specified in ISA v2.07. MSR[IR] and MSR[DR] must be 0.
+ */
+#define ARCH207_IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST)  \
+   /* Magic NAP/SLEEP/WINKLE mode enter sequence */\
+   std r0,0(r1);   \
+   ptesync;\
+   ld  r0,0(r1);   \
+236:   cmpdcr0,r0,r0;  \
+   bne 236b;   \
+   IDLE_INST;
+
.globl pnv_enter_arch207_idle_mode
 pnv_enter_arch207_idle_mode:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -230,7 +243,7 @@ pnv_enter_arch207_idle_mode:
stb r3,PACA_THREAD_IDLE_STATE(r13)
cmpwi   cr3,r3,PNV_THREAD_SLEEP
bge cr3,2f
-   IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
+   ARCH207_IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
/* No return */
 2:
/* Sleep or winkle */
@@ -269,7 +282,7 @@ pnv_fastsleep_workaround_at_entry:
 
 common_enter: /* common code for all the threads entering sleep or winkle */
bgt cr3,enter_winkle
-   IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
+   ARCH207_IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
 
 fastsleep_workaround_at_entry:
orisr15,r15,PNV_CORE_IDLE_LOCK_BIT@h
@@ -291,7 +304,7 @@ fastsleep_workaround_at_entry:
 enter_winkle:
bl  save_sprs_to_stack
 
-   IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
+   ARCH207_IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
 
 /*
  * r3 - PSSCR value corresponding to the requested stop state.
@@ -316,7 +329,7 @@ power_enter_stop:
andis.   r4,r3,PSSCR_EC_ESL_MASK_SHIFTED
clrldi   r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */
bne  .Lhandle_esl_ec_set
-   IDLE_STATE_ENTER_SEQ(PPC_STOP)
+   PPC_STOP
li  r3,0  /* Since we didn't lose state, return 0 */
 
/*
@@ -349,7 +362,8 @@ power_enter_stop:
ld  r4,ADDROFF(pnv_first_deep_stop_state)(r5)
cmpdr3,r4
bge .Lhandle_deep_stop
-   IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP)
+   PPC_STOP/* Does not return (system reset interrupt) */
+
 .Lhandle_deep_stop:
 /*
  * Entering deep idle state.
@@ -371,7 +385,7 @@ lwarx_loop_stop:
 
bl  save_sprs_to_stack
 
-   IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP)
+   PPC_STOP/* Does not return (system reset interrupt) */
 
 /*
  * Entered with MSR[EE]=0 and no soft-masked interrupts pending.
-- 
2.13.3

[PATCH v3 1/4] KVM: PPC: Book3S HV: POWER9 does not require secondary thread management

2017-08-24 Thread Nicholas Piggin

POWER9 CPUs have independent MMU contexts per thread, so KVM does not
need to quiesce secondary threads, so the hwthread_req/hwthread_state
protocol does not have to be used. So patch it away on POWER9, and patch
away the branch from the Linux idle wakeup to kvm_start_guest that is
never used.

Add a warning and error out of kvmppc_grab_hwthread in case it is ever
called on POWER9.

This avoids a hwsync in the idle wakeup path on POWER9.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/kvm_book3s_asm.h |  4 
 arch/powerpc/kernel/idle_book3s.S | 35 +--
 arch/powerpc/kvm/book3s_hv.c  | 14 -
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  8 +++
 4 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h 
b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 7cea76f11c26..83596f32f50b 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -104,6 +104,10 @@ struct kvmppc_host_state {
u8 napping;
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+   /*
+* hwthread_req/hwthread_state pair is used to pull sibling threads
+* out of guest on pre-ISAv3.0B CPUs where threads share MMU.
+*/
u8 hwthread_req;
u8 hwthread_state;
u8 host_ipi;
diff --git a/arch/powerpc/kernel/idle_book3s.S 
b/arch/powerpc/kernel/idle_book3s.S
index bfbf0976fc09..4924647d964d 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -296,13 +296,20 @@ enter_winkle:
 /*
  * r3 - PSSCR value corresponding to the requested stop state.
  */
-power_enter_stop:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-   /* Tell KVM we're entering idle */
+power_enter_stop_kvm_rm:
+   /*
+* This is currently unused because POWER9 KVM does not have to
+* gather secondary threads into sibling mode, but the code is
+* here in case that function is required.
+*
+* Tell KVM we're entering idle.
+*/
li  r4,KVM_HWTHREAD_IN_IDLE
/* DO THIS IN REAL MODE!  See comment above. */
stb r4,HSTATE_HWTHREAD_STATE(r13)
 #endif
+power_enter_stop:
 /*
  * Check if we are executing the lite variant with ESL=EC=0
  */
@@ -465,6 +472,18 @@ pnv_powersave_wakeup_mce:
 
b   pnv_powersave_wakeup
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+kvm_start_guest_check:
+   li  r0,KVM_HWTHREAD_IN_KERNEL
+   stb r0,HSTATE_HWTHREAD_STATE(r13)
+   /* Order setting hwthread_state vs. testing hwthread_req */
+   sync
+   lbz r0,HSTATE_HWTHREAD_REQ(r13)
+   cmpwi   r0,0
+   beqlr
+   b   kvm_start_guest
+#endif
+
 /*
  * Called from reset vector for powersave wakeups.
  * cr3 - set to gt if waking up with partial/complete hypervisor state loss
@@ -489,15 +508,9 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
mr  r3,r12
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-   li  r0,KVM_HWTHREAD_IN_KERNEL
-   stb r0,HSTATE_HWTHREAD_STATE(r13)
-   /* Order setting hwthread_state vs. testing hwthread_req */
-   sync
-   lbz r0,HSTATE_HWTHREAD_REQ(r13)
-   cmpwi   r0,0
-   beq 1f
-   b   kvm_start_guest
-1:
+BEGIN_FTR_SECTION
+   bl  kvm_start_guest_check
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 #endif
 
/* Return SRR1 from power7_nap() */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 359c79cdf0cc..e34cd6fb947b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2111,6 +2111,16 @@ static int kvmppc_grab_hwthread(int cpu)
struct paca_struct *tpaca;
long timeout = 1;
 
+   /*
+* ISA v3.0 idle routines do not set hwthread_state or test
+* hwthread_req, so they can not grab idle threads.
+*/
+   if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+   WARN_ON(1);
+   pr_err("KVM: can not control sibling threads\n");
+   return -EBUSY;
+   }
+
tpaca = &paca[cpu];
 
/* Ensure the thread won't go into the kernel if it wakes */
@@ -2145,10 +2155,12 @@ static void kvmppc_release_hwthread(int cpu)
struct paca_struct *tpaca;
 
tpaca = &paca[cpu];
-   tpaca->kvm_hstate.hwthread_req = 0;
tpaca->kvm_hstate.kvm_vcpu = NULL;
tpaca->kvm_hstate.kvm_vcore = NULL;
tpaca->kvm_hstate.kvm_split_mode = NULL;
+   if (!cpu_has_feature(CPU_FTR_ARCH_300))
+   tpaca->kvm_hstate.hwthread_req = 0;
+
 }
 
 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index c52184a8efdf..3e024fd71fe8 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -149,9 +149,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

[PATCH v3 0/4] powerpc/64s: idle POWER9 stop improvements

2017-08-24 Thread Nicholas Piggin

These are rebased patches leftover from the unmerged bit of
the idle series.

Based on feedback, I dropped one of the KVM patches, and reworked
the code a bit so it is easier to restore the ability for KVM to
grab secondaries into real mode.

I did a bit more benchmarking, and all up these patches improve 2
CPU ping-pong context switch benchmark on a POWER9 by around 4-6%
(depending on what CPUs and idle states are used).

Nicholas Piggin (4):
  KVM: PPC: Book3S HV: POWER9 does not require secondary thread
management
  powerpc/64s: idle POWER9 can execute stop without a sync sequence
  powerpc/64s: idle POWER9 can execute stop in virtual mode
  powerpc/64s: idle ESL=0 stop can avoid MSR and save/restore overhead

 arch/powerpc/include/asm/cpuidle.h|  16 -
 arch/powerpc/include/asm/kvm_book3s_asm.h |   4 ++
 arch/powerpc/kernel/idle_book3s.S | 103 ++
 arch/powerpc/kvm/book3s_hv.c  |  14 +++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |   8 +++
 5 files changed, 101 insertions(+), 44 deletions(-)

-- 
2.13.3

Re: [PATCH v7 04/12] powerpc/vas: Define helpers to access MMIO regions

2017-08-24 Thread Michael Ellerman

Hi Suka,

Comments inline.

Sukadev Bhattiprolu  writes:
> diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
> b/arch/powerpc/platforms/powernv/vas-window.c
> index 6156fbe..a3a705a 100644
> --- a/arch/powerpc/platforms/powernv/vas-window.c
> +++ b/arch/powerpc/platforms/powernv/vas-window.c
> @@ -9,9 +9,182 @@
>  
>  #include 
>  #include 
> +#include 
> +#include 
>  
>  #include "vas.h"
>  
> +/*
> + * Compute the paste address region for the window @window using the
> + * ->paste_base_addr and ->paste_win_id_shift we got from device tree.
> + */
> +void compute_paste_address(struct vas_window *window, uint64_t *addr, int 
> *len)
> +{
> + uint64_t base, shift;

Please use the kernel types, so u64 here.

> + int winid;
> +
> + base = window->vinst->paste_base_addr;
> + shift = window->vinst->paste_win_id_shift;
> + winid = window->winid;
> +
> + *addr  = base + (winid << shift);
> + if (len)
> + *len = PAGE_SIZE;

Having multiple output parameters makes for a pretty awkward API. Is it
really necesssary given len is a constant PAGE_SIZE anyway.

If you didn't return len, then you could just make the function return
the addr, and you wouldn't need any output parameters.

One of the callers that passes len is unmap_paste_region(), but that
is a bit odd. It would be more natural I think if once a window is
mapped it knows its size. Or if the mapping will always just be one page
then we can just know that.

> +
> + pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr);
> +}
> +
> +static inline void get_hvwc_mmio_bar(struct vas_window *window,
> + uint64_t *start, int *len)
> +{
> + uint64_t pbaddr;
> +
> + pbaddr = window->vinst->hvwc_bar_start;
> + *start = pbaddr + window->winid * VAS_HVWC_SIZE;
> + *len = VAS_HVWC_SIZE;

This is:

#define VAS_HVWC_SIZE   512

But then we map it, which will round up to a page anyway. So again I
don't see the point of having the len returned form this helper.

> +}
> +
> +static inline void get_uwc_mmio_bar(struct vas_window *window,
> + uint64_t *start, int *len)
> +{
> + uint64_t pbaddr;
> +
> + pbaddr = window->vinst->uwc_bar_start;
> + *start = pbaddr + window->winid * VAS_UWC_SIZE;
> + *len = VAS_UWC_SIZE;
> +}
> +
> +/*
> + * Map the paste bus address of the given send window into kernel address
> + * space. Unlike MMIO regions (map_mmio_region() below), paste region must
> + * be mapped cache-able and is only applicable to send windows.
> + */
> +void *map_paste_region(struct vas_window *txwin)
> +{
> + int rc, len;
> + void *map;
> + char *name;
> + uint64_t start;
> +
> + rc = -ENOMEM;

You don't need that.

> + name = kasprintf(GFP_KERNEL, "window-v%d-w%d", txwin->vinst->vas_id,
> + txwin->winid);
> + if (!name)
> + return ERR_PTR(rc);

That can goto free_name;

> +
> + txwin->paste_addr_name = name;
> + compute_paste_address(txwin, &start, &len);
> +
> + if (!request_mem_region(start, len, name)) {
> + pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n",
> + __func__, start, len);
> + goto free_name;
> + }
> +
> + map = ioremap_cache(start, len);
> + if (!map) {
> + pr_devel("%s(): ioremap_cache(0x%llx, %d) failed\n", __func__,
> + start, len);
> + goto free_name;
> + }
> +
> + pr_devel("VAS: mapped paste addr 0x%llx to kaddr 0x%p\n", start, map);
> + return map;
> +
> +free_name:
> + kfree(name);

Because kfree(NULL) is fine.

> + return ERR_PTR(rc);

And that can just return ERR_PTR(-ENOMEM);

> +}

cheers

RE: [PATCH really v2] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()

2017-08-24 Thread Nixiaoming

On 24.08.2017 11:14, Paul Mackerras wrote:
> Nixiaoming pointed out that there is a memory leak in
> kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd() 
> fails; the memory allocated for the kvmppc_spapr_tce_table struct is 
> not freed, and nor are the pages allocated for the iommu tables.  In 
> addition, we have already incremented the process's count of locked 
> memory pages, and this doesn't get restored on error.
> 
> David Hildenbrand pointed out that there is a race in that the 
> function checks early on that there is not already an entry in the
> stt->iommu_tables list with the same LIOBN, but an entry with the
> same LIOBN could get added between then and when the new entry is 
> added to the list.
> 
> This fixes all three problems.  To simplify things, we now call
> anon_inode_getfd() before placing the new entry in the list.  The 
> check for an existing entry is done while holding the kvm->lock mutex, 
> immediately before adding the new entry to the list.
> Finally, on failure we now call kvmppc_account_memlimit to decrement 
> the process's count of locked memory pages.
> 
> Reported-by: Nixiaoming 
> Reported-by: David Hildenbrand 
> Signed-off-by: Paul Mackerras 
> ---
> v2: Don't overwrite stt in loop over spapr_tce_tables
> 

Reviewed-by: nixiaoming

Re: [PATCH] powerpc: powernv: Fix build error on const discarding

2017-08-24 Thread Michael Ellerman

Corentin Labbe  writes:

> On Thu, Aug 17, 2017 at 10:52:11PM +1000, Michael Ellerman wrote:
>> Corentin Labbe  writes:
>> 
>> > When building a random powerpc kernel I hit this build error:
>> >   CC  arch/powerpc/platforms/powernv/opal-imc.o
>> > arch/powerpc/platforms/powernv/opal-imc.c: In function « 
>> > disable_nest_pmu_counters »:
>> > arch/powerpc/platforms/powernv/opal-imc.c:130:13: error : assignment 
>> > discards « const » qualifier from pointer target type 
>> > [-Werror=discarded-qualifiers]
>> >l_cpumask = cpumask_of_node(nid);
>> >  ^
>> > This patch simply add const to l_cpumask to fix this issue.
>> 
>> Thanks. I'm not sure why we haven't seen that.
>> 
>> Do you mind attaching your .config ?
>> 
>> cheers
>
> Yes

Thanks.

So the key is:

> CONFIG_PPC_POWERNV=y
...
> # CONFIG_NUMA is not set


Which none of our configs have.

I'll add a test build of that.

Thanks.

cheers

Re: [PATCH 4/4] powerpc/32: remove a NOP from memset()

2017-08-24 Thread Michael Ellerman

Christophe LEROY  writes:

> Le 24/08/2017 à 12:51, Michael Ellerman a écrit :
>> Christophe Leroy  writes:
>> 
>>> memset() is patched after initialisation to activate the
>>> optimised part which uses cache instructions.
>>>
>>> Today we have a 'b 2f' to skip the optimised patch, which then gets
>>> replaced by a NOP, implying a useless cycle consumption.
>>> As we have a 'bne 2f' just before, we could use that instruction
>>> for the live patching, hence removing the need to have a
>>> dedicated 'b 2f' to be replaced by a NOP.
>>>
>>> This patch changes the 'bne 2f' by a 'b 2f'. During init, that
>>> 'b 2f' is then replaced by 'bne 2f'
>> 
>> I'm not sure what the sequence is during boot for the 32-bit code, but
>> can you use an ALT_FTR section for this? Possibly that doesn't get done
>> at the right time though.
>
> Unfortunately, as we discussed in 2015 
> (https://lkml.org/lkml/2015/9/10/608),

Haha, you expect me to remember things I said then! ;)

> the ALT_FTR does things too early, while the cache is not enabled yet.

OK. Ben did do some reworks to the early init since then, but I don't
think he changed that.

I notice we do setup_feature_keys() in machine_init(), which is the jump
label equivalent of apply_feature_fixups(). So I wonder if we could
actually move apply_feature_fixups() to there. But it would need some
serious review.

cheers

Re: [PATCH v2 2/5] powerpc: pseries: vio: match parent nodes with of_find_node_by_path

2017-08-24 Thread Michael Ellerman

Rob Herring  writes:

> On Tue, Aug 22, 2017 at 12:12 AM, Michael Ellerman  
> wrote:
>> Rob Herring  writes:
>>
>>> In preparation to remove the full path from device_node.full_name, use
>>> of_find_node_by_path instead of open coding with strcmp.
>>>
>>> Signed-off-by: Rob Herring 
>>> Cc: Benjamin Herrenschmidt 
>>> Cc: Paul Mackerras 
>>> Cc: Michael Ellerman 
>>> Cc: linuxppc-dev@lists.ozlabs.org
>>> ---
>>> v2:
>>> - rebased to linux-next and removed spurious change fro patch 1.
>>>
>>>  arch/powerpc/platforms/pseries/vio.c | 4 ++--
>>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/powerpc/platforms/pseries/vio.c 
>>> b/arch/powerpc/platforms/pseries/vio.c
>>> index aa5ca74316fa..5754572deb23 100644
>>> --- a/arch/powerpc/platforms/pseries/vio.c
>>> +++ b/arch/powerpc/platforms/pseries/vio.c
>>> @@ -1357,9 +1357,9 @@ struct vio_dev *vio_register_device_node(struct 
>>> device_node *of_node)
>>>*/
>>>   parent_node = of_get_parent(of_node);
>>>   if (parent_node) {
>>> - if (!strcmp(parent_node->full_name, 
>>> "/ibm,platform-facilities"))
>>> + if (parent_node == 
>>> of_find_node_by_path("/ibm,platform-facilities"))
>>>   family = PFO;
>>> - else if (!strcmp(parent_node->full_name, "/vdevice"))
>>> + else if (parent_node == of_find_node_by_path("/vdevice"))
>>>   family = VDEVICE;
>>
>> This leaks references to the looked up nodes.
>>
>> Both these nodes are defined in PAPR (our hypervisor spec), and both of
>> them must have a device_type, either "ibm,platform-facilities" or
>> "vdevice".
>>
>> Looking at the commit that added the code I don't see any particular
>> reason it used the comparison against full_name, rather than using the
>> device_type.
>>
>> So I'm inclined to do this instead:
>>
>> diff --git a/arch/powerpc/platforms/pseries/vio.c 
>> b/arch/powerpc/platforms/pseries/vio.c
>> index 8a47f168476b..f26f906e6021 100644
>> --- a/arch/powerpc/platforms/pseries/vio.c
>> +++ b/arch/powerpc/platforms/pseries/vio.c
>> @@ -1357,9 +1357,9 @@ struct vio_dev *vio_register_device_node(struct 
>> device_node *of_node)
>>  */
>> parent_node = of_get_parent(of_node);
>> if (parent_node) {
>> -   if (!strcmp(parent_node->full_name, 
>> "/ibm,platform-facilities"))
>> +   if (!strcmp(parent_node->type, "ibm,platform-facilities"))
>> family = PFO;
>> -   else if (!strcmp(parent_node->full_name, "/vdevice"))
>> +   else if (!strcmp(parent_node->type, "vdevice"))
>> family = VDEVICE;
>> else {
>> pr_warn("%s: parent(%s) of %s not recognized.\n",
>>
>>
>> I've checked both Qemu and kvmtool add the device_type, and I'm fairly
>> confident that PowerVM does too. Anyway I'll test it on all the machines
>> I can find.
>
> Okay, do you want me to respin the patch or will you update it with

I merged it. Should be in next today.

cheers

[PATCH V10 2/2] powerpc/nodes: Ensure enough nodes avail for operations

2017-08-24 Thread Michael Bringmann

From: Michael Bringmann 
To: linuxppc-dev@lists.ozlabs.org
To: linux-ker...@vger.kernel.org
Cc: Michael Ellerman 
Cc: Michael Bringmann 
Cc: John Allen 
Cc: Nathan Fontenot 
Subject: [PATCH V10 2/2] powerpc/nodes: Ensure enough nodes avail for operations

powerpc/nodes: On systems like PowerPC which allow 'hot-add' of CPU
or memory resources, it may occur that the new resources are to be
inserted into nodes that were not used for these resources at bootup.
In the kernel, any node that is used must be defined and initialized
at boot.

This patch extracts the value of the 'min_common_depth' element from
the "rtas" device tree property "ibm,max-associativity-domains" to use
as the maximum number of nodes to setup as possibly available in the
system.  [The 'min_common_depth' element is calculated from memory
associations found while loading all of the configured memory into
the system data structures at boot.]  This new setting will override
the instruction,

nodes_and(node_possible_map, node_possible_map, node_online_map);

presently seen in the function arch/powerpc/mm/numa.c:initmem_init().

If the property is not present at boot, no operation will be performed
to define or enable additional nodes.

Signed-off-by: Michael Bringmann 
---
Changes in V10:
  -- Try to use 'min_common_depth' from NUMA initialization to select
 domain level to use for maximum nodes.
---
 arch/powerpc/mm/numa.c |   44 
 1 file changed, 44 insertions(+)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 73427e290..841d3b6 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -896,6 +896,48 @@ static void __init setup_node_data(int nid, u64 start_pfn, 
u64 end_pfn)
NODE_DATA(nid)->node_spanned_pages = spanned_pages;
 }
 
+static void __init node_associativity_setup(void)
+{
+   struct device_node *rtas;
+
+   rtas = of_find_node_by_path("/rtas");
+   if (rtas) {
+   const __be32 *prop;
+   u32 len, entries, numnodes, i;
+
+   prop = of_get_property(rtas, "ibm,max-associativity-domains", 
&len);
+   if (!prop || len < sizeof(unsigned int))
+   goto endit;
+
+   entries = of_read_number(prop++, 1);
+
+   if (len < (entries * sizeof(unsigned int)))
+   goto endit;
+
+   dbg("numa: Debug: Entries = %d MCD = %d\n", entries, 
min_common_depth);
+
+   if ((0 <= min_common_depth) && (min_common_depth <= 
(entries-1)))
+   entries = min_common_depth;
+   else
+   entries -= 1;
+
+   numnodes = of_read_number(&prop[entries], 1);
+
+   printk(KERN_INFO "numa: Nodes = %d\n", numnodes);
+
+   for (i = 0; i < numnodes; i++) {
+   if (!node_possible(i)) {
+   setup_node_data(i, 0, 0);
+   node_set(i, node_possible_map);
+   }
+   }
+   }
+
+endit:
+   if (rtas)
+   of_node_put(rtas);
+}
+
 void __init initmem_init(void)
 {
int nid, cpu;
@@ -915,6 +957,8 @@ void __init initmem_init(void)
 */
nodes_and(node_possible_map, node_possible_map, node_online_map);
 
+   node_associativity_setup();
+
for_each_online_node(nid) {
unsigned long start_pfn, end_pfn;

[PATCH V10 1/2] powerpc/numa: Update CPU topology when VPHN enabled

2017-08-24 Thread Michael Bringmann


powerpc/numa: Correct the currently broken capability to set the
topology for shared CPUs in LPARs.  At boot time for shared CPU
lpars, the topology for each shared CPU is set to node zero, however,
this is now updated correctly using the Virtual Processor Home Node
(VPHN) capabilities information provided by the pHyp.

Also, update initialization checks for device-tree attributes to
independently recognize PRRN or VPHN usage.

Finally, try to distinguish the VPHN code from the NUMA code better,
and move relevant functions to another file.

Signed-off-by: Michael Bringmann 
---
Changes in V10:
  -- Reorganize VPHN code to distinguish it from NUMA processing
---
 arch/powerpc/include/asm/topology.h  |8 
 arch/powerpc/mm/numa.c   |  503 --
 arch/powerpc/mm/vphn.c   |  586 ++
 arch/powerpc/mm/vphn.h   |4 
 arch/powerpc/platforms/pseries/hotplug-cpu.c |2 
 5 files changed, 609 insertions(+), 494 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index dc4e159..600e1c6 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -98,6 +98,14 @@ static inline int prrn_is_enabled(void)
 }
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_NEED_MULTIPLE_NODES)
+#if defined(CONFIG_PPC_SPLPAR)
+extern int timed_topology_update(int nsecs);
+#else
+#definetimed_topology_update(nsecs)0
+#endif /* CONFIG_PPC_SPLPAR */
+#endif /* CONFIG_HOTPLUG_CPU || CONFIG_NEED_MULTIPLE_NODES */
+
 #include 
 
 #ifdef CONFIG_SMP
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b95c584..73427e290 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -41,8 +42,12 @@
 #include 
 #include 
 
+#include "vphn.h"
+
 static int numa_enabled = 1;
 
+bool topology_updates_enabled = true;
+
 static char *cmdline __initdata;
 
 static int numa_debug;
@@ -60,8 +65,7 @@
 static int n_mem_addr_cells, n_mem_size_cells;
 static int form1_affinity;
 
-#define MAX_DISTANCE_REF_POINTS 4
-static int distance_ref_points_depth;
+int distance_ref_points_depth;
 static const __be32 *distance_ref_points;
 static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
 
@@ -142,12 +146,12 @@ static void reset_numa_cpu_lookup_table(void)
numa_cpu_lookup_table[cpu] = -1;
 }
 
-static void update_numa_cpu_lookup_table(unsigned int cpu, int node)
+void update_numa_cpu_lookup_table(unsigned int cpu, int node)
 {
numa_cpu_lookup_table[cpu] = node;
 }
 
-static void map_cpu_to_node(int cpu, int node)
+void map_cpu_to_node(int cpu, int node)
 {
update_numa_cpu_lookup_table(cpu, node);
 
@@ -158,7 +162,7 @@ static void map_cpu_to_node(int cpu, int node)
 }
 
 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
-static void unmap_cpu_from_node(unsigned long cpu)
+void unmap_cpu_from_node(unsigned long cpu)
 {
int node = numa_cpu_lookup_table[cpu];
 
@@ -233,7 +237,7 @@ static void initialize_distance_lookup_table(int nid,
 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
  * info is found.
  */
-static int associativity_to_nid(const __be32 *associativity)
+int associativity_to_nid(const __be32 *associativity)
 {
int nid = -1;
 
@@ -957,8 +961,6 @@ static int __init early_numa(char *p)
 }
 early_param("numa", early_numa);
 
-static bool topology_updates_enabled = true;
-
 static int __init early_topology_updates(char *p)
 {
if (!p)
@@ -1135,488 +1137,3 @@ u64 memory_hotplug_max(void)
 return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
-
-/* Virtual Processor Home Node (VPHN) support */
-#ifdef CONFIG_PPC_SPLPAR
-
-#include "vphn.h"
-
-struct topology_update_data {
-   struct topology_update_data *next;
-   unsigned int cpu;
-   int old_nid;
-   int new_nid;
-};
-
-static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
-static cpumask_t cpu_associativity_changes_mask;
-static int vphn_enabled;
-static int prrn_enabled;
-static void reset_topology_timer(void);
-
-/*
- * Store the current values of the associativity change counters in the
- * hypervisor.
- */
-static void setup_cpu_associativity_change_counters(void)
-{
-   int cpu;
-
-   /* The VPHN feature supports a maximum of 8 reference points */
-   BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);
-
-   for_each_possible_cpu(cpu) {
-   int i;
-   u8 *counts = vphn_cpu_change_counts[cpu];
-   volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
-
-   for (i = 0; i < distance_ref_points_depth; i++)
-   counts[i] = hypervisor_counts[i];
-   }
-}
-
-/*
- *

[PATCH V10 0/2] powerpc/dlpar: Correct display of hot-add/hot-remove CPUs and memory

2017-08-24 Thread Michael Bringmann

From: m...@linux.vnet.ibm.com
To: linuxppc-dev@lists.ozlabs.org, linux-ker...@vger.kernel.org
Cc: nf...@linux.vnet.ibm.com
Cc: m...@linux.vnet.ibm.com
Subject: [PATCH V10 0/2] powerpc/dlpar: Correct display of hot-add/hot-remove 
CPUs and memory

On Power systems with shared configurations of CPUs and memory, there
are some issues with association of additional CPUs and memory to nodes
when hot-adding resources.  These patches address some of those problems.

powerpc/numa: Correct the currently broken capability to set the
topology for shared CPUs in LPARs.  At boot time for shared CPU
lpars, the topology for each shared CPU is set to node zero, however,
this is now updated correctly using the Virtual Processor Home Node
(VPHN) capabilities information provided by the pHyp. The VPHN handling
in Linux is disabled, if PRRN handling is present.

Also, update initialization checks for device-tree attributes to
independently recognize PRRN or VPHN usage.

Finally, try to distinguish the VPHN code from the NUMA code better,
and move relevant functions to another file.

powerpc/nodes: On systems like PowerPC which allow 'hot-add' of CPU
or memory resources, it may occur that the new resources are to be
inserted into nodes that were not used for these resources at bootup.
In the kernel, any node that is used must be defined and initialized
at boot.

This patch extracts the value of the 'min_common_depth' element from
the "rtas" device tree property "ibm,max-associativity-domains" to use
as the maximum number of nodes to setup as possibly available in the
system.  [The 'min_common_depth' element is calculated from memory
associations found while loading all of the configured memory into
the system data structures at boot.]  This new setting will override
the instruction,

nodes_and(node_possible_map, node_possible_map, node_online_map);

presently seen in the function arch/powerpc/mm/numa.c:initmem_init().

If the property is not present at boot, no operation will be performed
to define or enable additional nodes.

Signed-off-by: Michael Bringmann 

Michael Bringmann (2):
  powerpc/numa: Update CPU topology when VPHN enabled
  powerpc/nodes: Ensure enough nodes avail for operations
---
Changes in V10:
  -- Reorganize VPHN code
  -- Revise index used with property "ibm,max-associativity-domains"

Re: [PATCH v7 03/12] powerpc/vas: Define vas_init() and vas_exit()

2017-08-24 Thread Sukadev Bhattiprolu

Michael Ellerman [m...@ellerman.id.au] wrote:
> Hi Suka,
> 
> Comments inline ...
> 
> Sukadev Bhattiprolu  writes:
> > diff --git a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt 
> > b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt
> > new file mode 100644
> > index 000..0e3111d
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt
> > @@ -0,0 +1,24 @@
> > +* IBM Powerpc Virtual Accelerator Switchboard (VAS)
> > +
> > +VAS is a hardware mechanism that allows kernel subsystems and user 
> > processes
> > +to directly submit compression and other requests to Nest accelerators (NX)
> > +or other coprocessors functions.
> > +
> > +Required properties:
> > +- compatible : should be "ibm,vas" or "ibm,power9-vas"
> 
> The driver doesn't look for the latter.

Ok. I have removed it from this list of required properties

> 
> > +- ibm,vas-id : A unique identifier for each instance of VAS in the system
> 
> What is this?

Like the ibm,chip-id, but in the future, there could be more than one instance
of VAS per chip, so firmware assigns a unique id to each instance of VAS.
> 
> > +- reg : Should contain 4 pairs of 64-bit fields specifying the Hypervisor
> > +  window context start and length, OS/User window context start and length,
> > +  "Paste address" start and length, "Paste window id" start bit and number
> > +  of bits)
> > +- name : "vas"
> 
> I don't think the name is normally included in the binding, and in fact
> there's no reason why the name is important, so I'd be inclined to drop that.

Ok. I dropped it.

> 
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index 3c41902..abc235f 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -6425,6 +6425,14 @@ F:   drivers/crypto/nx/nx.*
> >  F: drivers/crypto/nx/nx_csbcpb.h
> >  F: drivers/crypto/nx/nx_debugfs.h
> >  
> > +IBM Power Virtual Accelerator Switchboard
> > +M: Sukadev Bhattiprolu
> > +L: linuxppc-dev@lists.ozlabs.org
> > +S: Supported
> > +F: arch/powerpc/platforms/powernv/vas*
> > +F: arch/powerpc/include/asm/vas.h
> > +F: arch/powerpc/include/uapi/asm/vas.h
> 
> That's not in the right place, the file is sorted alphabetically.

ah, fixed.
> 
> V comes after L.
> 
> > diff --git a/arch/powerpc/platforms/powernv/Kconfig 
> > b/arch/powerpc/platforms/powernv/Kconfig
> > index 6a6f4ef..f565454 100644
> > --- a/arch/powerpc/platforms/powernv/Kconfig
> > +++ b/arch/powerpc/platforms/powernv/Kconfig
> > @@ -30,3 +30,17 @@ config OPAL_PRD
> > help
> >   This enables the opal-prd driver, a facility to run processor
> >   recovery diagnostics on OpenPower machines
> > +
> > +config PPC_VAS
> > +   bool "IBM Virtual Accelerator Switchboard (VAS)"
> 
> ^ bool, so never a module.

yes, it should be built in.

> 
> > +   depends on PPC_POWERNV && PPC_64K_PAGES
> > +   default n
> 
> It should be default y.
> 
> I know the usual advice is to make things 'default n', but this has
> fairly tight depends already, so y is OK IMO.

Ok.

> 
> > diff --git a/arch/powerpc/platforms/powernv/vas.c 
> > b/arch/powerpc/platforms/powernv/vas.c
> > new file mode 100644
> > index 000..556156b
> > --- /dev/null
> > +++ b/arch/powerpc/platforms/powernv/vas.c
> > @@ -0,0 +1,183 @@
> > +/*
> > + * Copyright 2016 IBM Corp.
> 
> 2016-2017.

Ok.

> 
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public License
> > + * as published by the Free Software Foundation; either version
> > + * 2 of the License, or (at your option) any later version.
> > + */
> 
> #define pr_fmt(fmt) "vas: " fmt

Ok
> 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +
> > +#include "vas.h"
> > +
> > +static bool init_done;
> > +LIST_HEAD(vas_instances);
> 
> Can be static.

Yes

> 
> > +
> > +static int init_vas_instance(struct platform_device *pdev)
> > +{
> > +   int rc, vasid;
> > +   struct vas_instance *vinst;
> > +   struct device_node *dn = pdev->dev.of_node;
> > +   struct resource *res;
> 
>   struct device_node *dn = pdev->dev.of_node;
>   struct vas_instance *vinst;
>   struct resource *res;
>   int rc, vasid;
> 
> Petty I know, but much prettier :)

I usually go the opposite way (shortest first) so I have done that here also.
For newer files I will invert the tree.

> 
> > +
> > +   rc = of_property_read_u32(dn, "ibm,vas-id", &vasid);
> > +   if (rc) {
> > +   pr_err("VAS: No ibm,vas-id property for %s?\n", pdev->name);
> 
> With the pr_fmt() above you don't need VAS: on the front of all these.

Ok

> 
> > +   return -ENODEV;
> > +   }
> > +
> > +   if (pdev->num_resources != 4) {
> > +   pr_err("VAS: Unexpected DT configuration for [%s, %d]\n",
> > +   pdev->name, vasid);
> > +   return -ENODEV;
> > +   }
> > +
> > +   vinst = kcalloc(1, sizeof(*vinst), GFP_KERNEL);
> 
> kzalloc() would be

[v4 11/11] fsl/soc/qbman: Enable FSL_LAYERSCAPE config on ARM

2017-08-24 Thread Roy Pledge

From: Madalin Bucur 

Signed-off-by: Madalin Bucur 
Signed-off-by: Claudiu Manoil 
[Stuart: changed to use ARCH_LAYERSCAPE]
Signed-off-by: Stuart Yoder 
Signed-off-by: Roy Pledge 
---
 drivers/soc/fsl/qbman/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/soc/fsl/qbman/Kconfig b/drivers/soc/fsl/qbman/Kconfig
index 757033c..fb4e6bf 100644
--- a/drivers/soc/fsl/qbman/Kconfig
+++ b/drivers/soc/fsl/qbman/Kconfig
@@ -1,6 +1,6 @@
 menuconfig FSL_DPAA
bool "Freescale DPAA 1.x support"
-   depends on FSL_SOC_BOOKE
+   depends on (FSL_SOC_BOOKE || ARCH_LAYERSCAPE)
select GENERIC_ALLOCATOR
help
  The Freescale Data Path Acceleration Architecture (DPAA) is a set of
-- 
2.7.4

[v4 10/11] soc/fsl/qbman: Add missing headers on ARM

2017-08-24 Thread Roy Pledge

From: Claudiu Manoil 

Unlike PPC builds, ARM builds need following headers
explicitly:
+#include   for ioread32be()
+#includefor udelay()

Signed-off-by: Claudiu Manoil 
Signed-off-by: Roy Pledge 
---
 drivers/soc/fsl/qbman/dpaa_sys.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/soc/fsl/qbman/dpaa_sys.h b/drivers/soc/fsl/qbman/dpaa_sys.h
index 0a1d573..8ec6a78 100644
--- a/drivers/soc/fsl/qbman/dpaa_sys.h
+++ b/drivers/soc/fsl/qbman/dpaa_sys.h
@@ -44,6 +44,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 /* For 2-element tables related to cache-inhibited and cache-enabled mappings 
*/
 #define DPAA_PORTAL_CE 0
-- 
2.7.4

[v4 09/11] soc/fsl/qbman: different register offsets on ARM

2017-08-24 Thread Roy Pledge

From: Madalin Bucur 

Signed-off-by: Madalin Bucur 
Signed-off-by: Claudiu Manoil 
Signed-off-by: Roy Pledge 
---
 drivers/soc/fsl/qbman/bman.c | 22 ++
 drivers/soc/fsl/qbman/qman.c | 38 ++
 2 files changed, 60 insertions(+)

diff --git a/drivers/soc/fsl/qbman/bman.c b/drivers/soc/fsl/qbman/bman.c
index e31c843..265048d 100644
--- a/drivers/soc/fsl/qbman/bman.c
+++ b/drivers/soc/fsl/qbman/bman.c
@@ -35,6 +35,27 @@
 
 /* Portal register assists */
 
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+/* Cache-inhibited register offsets */
+#define BM_REG_RCR_PI_CINH 0x3000
+#define BM_REG_RCR_CI_CINH 0x3100
+#define BM_REG_RCR_ITR 0x3200
+#define BM_REG_CFG 0x3300
+#define BM_REG_SCN(n)  (0x3400 + ((n) << 6))
+#define BM_REG_ISR 0x3e00
+#define BM_REG_IER 0x3e40
+#define BM_REG_ISDR0x3e80
+#define BM_REG_IIR 0x3ec0
+
+/* Cache-enabled register offsets */
+#define BM_CL_CR   0x
+#define BM_CL_RR0  0x0100
+#define BM_CL_RR1  0x0140
+#define BM_CL_RCR  0x1000
+#define BM_CL_RCR_PI_CENA  0x3000
+#define BM_CL_RCR_CI_CENA  0x3100
+
+#else
 /* Cache-inhibited register offsets */
 #define BM_REG_RCR_PI_CINH 0x
 #define BM_REG_RCR_CI_CINH 0x0004
@@ -53,6 +74,7 @@
 #define BM_CL_RCR  0x1000
 #define BM_CL_RCR_PI_CENA  0x3000
 #define BM_CL_RCR_CI_CENA  0x3100
+#endif
 
 /*
  * Portal modes.
diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c
index 668fab1..fdd4c65 100644
--- a/drivers/soc/fsl/qbman/qman.c
+++ b/drivers/soc/fsl/qbman/qman.c
@@ -41,6 +41,43 @@
 
 /* Portal register assists */
 
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+/* Cache-inhibited register offsets */
+#define QM_REG_EQCR_PI_CINH0x3000
+#define QM_REG_EQCR_CI_CINH0x3040
+#define QM_REG_EQCR_ITR0x3080
+#define QM_REG_DQRR_PI_CINH0x3100
+#define QM_REG_DQRR_CI_CINH0x3140
+#define QM_REG_DQRR_ITR0x3180
+#define QM_REG_DQRR_DCAP   0x31C0
+#define QM_REG_DQRR_SDQCR  0x3200
+#define QM_REG_DQRR_VDQCR  0x3240
+#define QM_REG_DQRR_PDQCR  0x3280
+#define QM_REG_MR_PI_CINH  0x3300
+#define QM_REG_MR_CI_CINH  0x3340
+#define QM_REG_MR_ITR  0x3380
+#define QM_REG_CFG 0x3500
+#define QM_REG_ISR 0x3600
+#define QM_REG_IER 0x3640
+#define QM_REG_ISDR0x3680
+#define QM_REG_IIR 0x36C0
+#define QM_REG_ITPR0x3740
+
+/* Cache-enabled register offsets */
+#define QM_CL_EQCR 0x
+#define QM_CL_DQRR 0x1000
+#define QM_CL_MR   0x2000
+#define QM_CL_EQCR_PI_CENA 0x3000
+#define QM_CL_EQCR_CI_CENA 0x3040
+#define QM_CL_DQRR_PI_CENA 0x3100
+#define QM_CL_DQRR_CI_CENA 0x3140
+#define QM_CL_MR_PI_CENA   0x3300
+#define QM_CL_MR_CI_CENA   0x3340
+#define QM_CL_CR   0x3800
+#define QM_CL_RR0  0x3900
+#define QM_CL_RR1  0x3940
+
+#else
 /* Cache-inhibited register offsets */
 #define QM_REG_EQCR_PI_CINH0x
 #define QM_REG_EQCR_CI_CINH0x0004
@@ -75,6 +112,7 @@
 #define QM_CL_CR   0x3800
 #define QM_CL_RR0  0x3900
 #define QM_CL_RR1  0x3940
+#endif
 
 /*
  * BTW, the drivers (and h/w programming model) already obtain the required
-- 
2.7.4

[v4 07/11] soc/fsl/qbman: Rework portal mapping calls for ARM/PPC

2017-08-24 Thread Roy Pledge

Rework portal mapping for PPC and ARM. The PPC devices require a
cacheable coherent mapping while ARM will work with a non-cachable/write
combine mapping. This also eliminates the need for manual cache
flushes on ARM

Signed-off-by: Roy Pledge 
---
 drivers/soc/fsl/qbman/bman.c|  6 +++---
 drivers/soc/fsl/qbman/bman_portal.c | 36 +++-
 drivers/soc/fsl/qbman/bman_priv.h   |  8 +++-
 drivers/soc/fsl/qbman/dpaa_sys.h|  8 
 drivers/soc/fsl/qbman/qman.c|  6 +++---
 drivers/soc/fsl/qbman/qman_portal.c | 36 +++-
 drivers/soc/fsl/qbman/qman_priv.h   |  8 +++-
 7 files changed, 62 insertions(+), 46 deletions(-)

diff --git a/drivers/soc/fsl/qbman/bman.c b/drivers/soc/fsl/qbman/bman.c
index ff8998f..e31c843 100644
--- a/drivers/soc/fsl/qbman/bman.c
+++ b/drivers/soc/fsl/qbman/bman.c
@@ -154,7 +154,7 @@ struct bm_mc {
 };
 
 struct bm_addr {
-   void __iomem *ce;   /* cache-enabled */
+   void *ce;   /* cache-enabled */
void __iomem *ci;   /* cache-inhibited */
 };
 
@@ -512,8 +512,8 @@ static int bman_create_portal(struct bman_portal *portal,
 * config, everything that follows depends on it and "config" is more
 * for (de)reference...
 */
-   p->addr.ce = c->addr_virt[DPAA_PORTAL_CE];
-   p->addr.ci = c->addr_virt[DPAA_PORTAL_CI];
+   p->addr.ce = c->addr_virt_ce;
+   p->addr.ci = c->addr_virt_ci;
if (bm_rcr_init(p, bm_rcr_pvb, bm_rcr_cce)) {
dev_err(c->dev, "RCR initialisation failed\n");
goto fail_rcr;
diff --git a/drivers/soc/fsl/qbman/bman_portal.c 
b/drivers/soc/fsl/qbman/bman_portal.c
index 39b39c8..bb03503 100644
--- a/drivers/soc/fsl/qbman/bman_portal.c
+++ b/drivers/soc/fsl/qbman/bman_portal.c
@@ -91,7 +91,6 @@ static int bman_portal_probe(struct platform_device *pdev)
struct device_node *node = dev->of_node;
struct bm_portal_config *pcfg;
struct resource *addr_phys[2];
-   void __iomem *va;
int irq, cpu;
 
pcfg = devm_kmalloc(dev, sizeof(*pcfg), GFP_KERNEL);
@@ -123,23 +122,34 @@ static int bman_portal_probe(struct platform_device *pdev)
}
pcfg->irq = irq;
 
-   va = ioremap_prot(addr_phys[0]->start, resource_size(addr_phys[0]), 0);
-   if (!va) {
-   dev_err(dev, "ioremap::CE failed\n");
+   /*
+* TODO: Ultimately we would like to use a cacheable/non-shareable
+* (coherent) mapping for the portal on both architectures but that
+* isn't currently available in the kernel.  Because of HW differences
+* PPC needs to be mapped cacheable while ARM SoCs will work with non
+* cacheable mappings
+*/
+#ifdef CONFIG_PPC
+   /* PPC requires a cacheable/non-coherent mapping of the portal */
+   pcfg->addr_virt_ce = memremap(addr_phys[0]->start,
+   resource_size(addr_phys[0]), MEMREMAP_WB);
+#else
+   /* ARM can use a write combine mapping. */
+   pcfg->addr_virt_ce = memremap(addr_phys[0]->start,
+   resource_size(addr_phys[0]), MEMREMAP_WC);
+#endif
+   if (!pcfg->addr_virt_ce) {
+   dev_err(dev, "memremap::CE failed\n");
goto err_ioremap1;
}
 
-   pcfg->addr_virt[DPAA_PORTAL_CE] = va;
-
-   va = ioremap_prot(addr_phys[1]->start, resource_size(addr_phys[1]),
- _PAGE_GUARDED | _PAGE_NO_CACHE);
-   if (!va) {
+   pcfg->addr_virt_ci = ioremap(addr_phys[1]->start,
+   resource_size(addr_phys[1]));
+   if (!pcfg->addr_virt_ci) {
dev_err(dev, "ioremap::CI failed\n");
goto err_ioremap2;
}
 
-   pcfg->addr_virt[DPAA_PORTAL_CI] = va;
-
spin_lock(&bman_lock);
cpu = cpumask_next_zero(-1, &portal_cpus);
if (cpu >= nr_cpu_ids) {
@@ -164,9 +174,9 @@ static int bman_portal_probe(struct platform_device *pdev)
return 0;
 
 err_portal_init:
-   iounmap(pcfg->addr_virt[DPAA_PORTAL_CI]);
+   iounmap(pcfg->addr_virt_ci);
 err_ioremap2:
-   iounmap(pcfg->addr_virt[DPAA_PORTAL_CE]);
+   memunmap(pcfg->addr_virt_ce);
 err_ioremap1:
return -ENXIO;
 }
diff --git a/drivers/soc/fsl/qbman/bman_priv.h 
b/drivers/soc/fsl/qbman/bman_priv.h
index 765a4bf..c48e6eb 100644
--- a/drivers/soc/fsl/qbman/bman_priv.h
+++ b/drivers/soc/fsl/qbman/bman_priv.h
@@ -49,11 +49,9 @@ extern u16 bman_ip_rev;  /* 0 if uninitialised, 
otherwise BMAN_REVx */
 extern struct gen_pool *bm_bpalloc;
 
 struct bm_portal_config {
-   /*
-* Corenet portal addresses;
-* [0]==cache-enabled, [1]==cache-inhibited.
-*/
-   void __iomem *addr_virt[2];
+   /* Portal addresses */
+   void  *addr_virt_ce;
+   void __iomem *addr_virt_ci;
/* Allow these to be joined in lists */
struct list_head li

[v4 08/11] soc/fsl/qbman: add QMAN_REV32

2017-08-24 Thread Roy Pledge

From: Madalin Bucur 

Add revision 3.2 of the QBMan block.  This is the version
for LS1043A and LS1046A SoCs.

Signed-off-by: Madalin Bucur 
Signed-off-by: Roy Pledge 
---
 drivers/soc/fsl/qbman/qman_ccsr.c | 2 ++
 drivers/soc/fsl/qbman/qman_priv.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/drivers/soc/fsl/qbman/qman_ccsr.c 
b/drivers/soc/fsl/qbman/qman_ccsr.c
index 20a1ebd..bbe3975 100644
--- a/drivers/soc/fsl/qbman/qman_ccsr.c
+++ b/drivers/soc/fsl/qbman/qman_ccsr.c
@@ -720,6 +720,8 @@ static int fsl_qman_probe(struct platform_device *pdev)
qman_ip_rev = QMAN_REV30;
else if (major == 3 && minor == 1)
qman_ip_rev = QMAN_REV31;
+   else if (major == 3 && minor == 2)
+   qman_ip_rev = QMAN_REV32;
else {
dev_err(dev, "Unknown QMan version\n");
return -ENODEV;
diff --git a/drivers/soc/fsl/qbman/qman_priv.h 
b/drivers/soc/fsl/qbman/qman_priv.h
index bab7f15..8f715fa 100644
--- a/drivers/soc/fsl/qbman/qman_priv.h
+++ b/drivers/soc/fsl/qbman/qman_priv.h
@@ -185,6 +185,7 @@ struct qm_portal_config {
 #define QMAN_REV20 0x0200
 #define QMAN_REV30 0x0300
 #define QMAN_REV31 0x0301
+#define QMAN_REV32 0x0302
 extern u16 qman_ip_rev; /* 0 if uninitialised, otherwise QMAN_REVx */
 
 #define QM_FQID_RANGE_START 1 /* FQID 0 reserved for internal use */
-- 
2.7.4

[v4 06/11] soc/fsl/qbman: Fix ARM32 typo

2017-08-24 Thread Roy Pledge

From: Valentin Rothberg 

The Kconfig symbol for 32bit ARM is 'ARM', not 'ARM32'.

Signed-off-by: Valentin Rothberg 
Signed-off-by: Claudiu Manoil 
Signed-off-by: Roy Pledge 
---
 drivers/soc/fsl/qbman/dpaa_sys.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/soc/fsl/qbman/dpaa_sys.h b/drivers/soc/fsl/qbman/dpaa_sys.h
index f85c319..81a9a5e 100644
--- a/drivers/soc/fsl/qbman/dpaa_sys.h
+++ b/drivers/soc/fsl/qbman/dpaa_sys.h
@@ -53,7 +53,7 @@ static inline void dpaa_flush(void *p)
 {
 #ifdef CONFIG_PPC
flush_dcache_range((unsigned long)p, (unsigned long)p+64);
-#elif defined(CONFIG_ARM32)
+#elif defined(CONFIG_ARM)
__cpuc_flush_dcache_area(p, 64);
 #elif defined(CONFIG_ARM64)
__flush_dcache_area(p, 64);
-- 
2.7.4

[v4 05/11] soc/fsl/qbman: Drop L1_CACHE_BYTES compile time check

2017-08-24 Thread Roy Pledge

From: Claudiu Manoil 

Not relevant and arch dependent. Overkill for PPC.

Signed-off-by: Claudiu Manoil 
Signed-off-by: Roy Pledge 
---
 drivers/soc/fsl/qbman/dpaa_sys.h | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/soc/fsl/qbman/dpaa_sys.h b/drivers/soc/fsl/qbman/dpaa_sys.h
index 2ce394a..f85c319 100644
--- a/drivers/soc/fsl/qbman/dpaa_sys.h
+++ b/drivers/soc/fsl/qbman/dpaa_sys.h
@@ -49,10 +49,6 @@
 #define DPAA_PORTAL_CE 0
 #define DPAA_PORTAL_CI 1
 
-#if (L1_CACHE_BYTES != 32) && (L1_CACHE_BYTES != 64)
-#error "Unsupported Cacheline Size"
-#endif
-
 static inline void dpaa_flush(void *p)
 {
 #ifdef CONFIG_PPC
-- 
2.7.4

[v4 04/11] soc/fsl/qbman: Drop set/clear_bits usage

2017-08-24 Thread Roy Pledge

From: Madalin Bucur 

Replace PPC specific set/clear_bits API with standard
bit twiddling so driver is portalable outside PPC.

Signed-off-by: Madalin Bucur 
Signed-off-by: Claudiu Manoil 
Signed-off-by: Roy Pledge 
---
 drivers/soc/fsl/qbman/bman.c | 2 +-
 drivers/soc/fsl/qbman/qman.c | 8 
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/soc/fsl/qbman/bman.c b/drivers/soc/fsl/qbman/bman.c
index 604e45c..ff8998f 100644
--- a/drivers/soc/fsl/qbman/bman.c
+++ b/drivers/soc/fsl/qbman/bman.c
@@ -616,7 +616,7 @@ int bman_p_irqsource_add(struct bman_portal *p, u32 bits)
unsigned long irqflags;
 
local_irq_save(irqflags);
-   set_bits(bits & BM_PIRQ_VISIBLE, &p->irq_sources);
+   p->irq_sources |= bits & BM_PIRQ_VISIBLE;
bm_out(&p->p, BM_REG_IER, p->irq_sources);
local_irq_restore(irqflags);
return 0;
diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c
index 1bcfc51..25419e1 100644
--- a/drivers/soc/fsl/qbman/qman.c
+++ b/drivers/soc/fsl/qbman/qman.c
@@ -908,12 +908,12 @@ static inline int qm_mc_result_timeout(struct qm_portal 
*portal,
 
 static inline void fq_set(struct qman_fq *fq, u32 mask)
 {
-   set_bits(mask, &fq->flags);
+   fq->flags |= mask;
 }
 
 static inline void fq_clear(struct qman_fq *fq, u32 mask)
 {
-   clear_bits(mask, &fq->flags);
+   fq->flags &= ~mask;
 }
 
 static inline int fq_isset(struct qman_fq *fq, u32 mask)
@@ -1574,7 +1574,7 @@ void qman_p_irqsource_add(struct qman_portal *p, u32 bits)
unsigned long irqflags;
 
local_irq_save(irqflags);
-   set_bits(bits & QM_PIRQ_VISIBLE, &p->irq_sources);
+   p->irq_sources |= bits & QM_PIRQ_VISIBLE;
qm_out(&p->p, QM_REG_IER, p->irq_sources);
local_irq_restore(irqflags);
 }
@@ -1597,7 +1597,7 @@ void qman_p_irqsource_remove(struct qman_portal *p, u32 
bits)
 */
local_irq_save(irqflags);
bits &= QM_PIRQ_VISIBLE;
-   clear_bits(bits, &p->irq_sources);
+   p->irq_sources &= ~bits;
qm_out(&p->p, QM_REG_IER, p->irq_sources);
ier = qm_in(&p->p, QM_REG_IER);
/*
-- 
2.7.4

[v4 03/11] dt-bindings: soc/fsl: Update reserved memory binding for QBMan

2017-08-24 Thread Roy Pledge

Updates the QMan and BMan device tree bindings for reserved memory
nodes. This makes the reserved memory allocation compatible with
the shared-dma-pool usage.

Signed-off-by: Roy Pledge 
---
 Documentation/devicetree/bindings/soc/fsl/bman.txt | 12 +-
 Documentation/devicetree/bindings/soc/fsl/qman.txt | 26 --
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/Documentation/devicetree/bindings/soc/fsl/bman.txt 
b/Documentation/devicetree/bindings/soc/fsl/bman.txt
index 47ac834..48eed14 100644
--- a/Documentation/devicetree/bindings/soc/fsl/bman.txt
+++ b/Documentation/devicetree/bindings/soc/fsl/bman.txt
@@ -65,8 +65,8 @@ to the respective BMan instance
 BMan Private Memory Node
 
 BMan requires a contiguous range of physical memory used for the backing store
-for BMan Free Buffer Proxy Records (FBPR). This memory is reserved/allocated 
as a
-node under the /reserved-memory node
+for BMan Free Buffer Proxy Records (FBPR). This memory is reserved/allocated as
+a node under the /reserved-memory node.
 
 The BMan FBPR memory node must be named "bman-fbpr"
 
@@ -75,7 +75,9 @@ PROPERTIES
 - compatible
Usage:  required
Value type: 
-   Definition: Must inclide "fsl,bman-fbpr"
+   Definition: PPC platforms: Must include "fsl,bman-fbpr"
+   ARM platforms: Must include "shared-dma-pool"
+  as well as the "no-map" property
 
 The following constraints are relevant to the FBPR private memory:
- The size must be 2^(size + 1), with size = 11..33. That is 4 KiB to
@@ -100,10 +102,10 @@ The example below shows a BMan FBPR dynamic allocation 
memory node
ranges;
 
bman_fbpr: bman-fbpr {
-   compatible = "fsl,bman-fbpr";
-   alloc-ranges = <0 0 0x10 0>;
+   compatible = "shared-mem-pool";
size = <0 0x100>;
alignment = <0 0x100>;
+   no-map;
};
};
 
diff --git a/Documentation/devicetree/bindings/soc/fsl/qman.txt 
b/Documentation/devicetree/bindings/soc/fsl/qman.txt
index 556ebb8..ee96afd 100644
--- a/Documentation/devicetree/bindings/soc/fsl/qman.txt
+++ b/Documentation/devicetree/bindings/soc/fsl/qman.txt
@@ -60,6 +60,12 @@ are located at offsets 0xbf8 and 0xbfc
Value type: 
Definition: Reference input clock. Its frequency is half of the
platform clock
+- memory-regions
+   Usage:  Required for ARM
+   Value type: 
+   Definition: List of phandles referencing the QMan private memory
+   nodes (described below). The qman-fqd node must be
+   first followed by qman-pfdr node. Only used on ARM
 
 Devices connected to a QMan instance via Direct Connect Portals (DCP) must link
 to the respective QMan instance
@@ -74,7 +80,9 @@ QMan Private Memory Nodes
 
 QMan requires two contiguous range of physical memory used for the backing 
store
 for QMan Frame Queue Descriptor (FQD) and Packed Frame Descriptor Record 
(PFDR).
-This memory is reserved/allocated as a nodes under the /reserved-memory node
+This memory is reserved/allocated as a node under the /reserved-memory node.
+
+For additional details about reserved memory regions see reserved-memory.txt
 
 The QMan FQD memory node must be named "qman-fqd"
 
@@ -83,7 +91,9 @@ PROPERTIES
 - compatible
Usage:  required
Value type: 
-   Definition: Must inclide "fsl,qman-fqd"
+   Definition: PPC platforms: Must include "fsl,qman-fqd"
+   ARM platforms: Must include "shared-dma-pool"
+  as well as the "no-map" property
 
 The QMan PFDR memory node must be named "qman-pfdr"
 
@@ -92,7 +102,9 @@ PROPERTIES
 - compatible
Usage:  required
Value type: 
-   Definition: Must inclide "fsl,qman-pfdr"
+   Definition: PPC platforms: Must include "fsl,qman-pfdr"
+   ARM platforms: Must include "shared-dma-pool"
+  as well as the "no-map" property
 
 The following constraints are relevant to the FQD and PFDR private memory:
- The size must be 2^(size + 1), with size = 11..29. That is 4 KiB to
@@ -117,16 +129,16 @@ The example below shows a QMan FQD and a PFDR dynamic 
allocation memory nodes
ranges;
 
qman_fqd: qman-fqd {
-   compatible = "fsl,qman-fqd";
-   alloc-ranges = <0 0 0x10 0>;
+   compatible = "shared-dma-pool";
size = <0 0x40>;
alignment = <0 0x40>;
+   no-map;
};
qman_pfdr: qman-pfdr {
-   compatible = "fsl,qman-pfdr";

[v4 02/11] soc/fsl/qbman: Use shared-dma-pool for QMan private memory allocations

2017-08-24 Thread Roy Pledge

Use the shared-memory-pool mechanism for frame queue descriptor and
packed frame descriptor record area allocations.

Signed-off-by: Roy Pledge 
---
 drivers/soc/fsl/qbman/qman_ccsr.c | 138 +-
 drivers/soc/fsl/qbman/qman_priv.h |   4 +-
 drivers/soc/fsl/qbman/qman_test.h |   2 -
 3 files changed, 109 insertions(+), 35 deletions(-)

diff --git a/drivers/soc/fsl/qbman/qman_ccsr.c 
b/drivers/soc/fsl/qbman/qman_ccsr.c
index 835ce94..20a1ebd 100644
--- a/drivers/soc/fsl/qbman/qman_ccsr.c
+++ b/drivers/soc/fsl/qbman/qman_ccsr.c
@@ -401,21 +401,42 @@ static int qm_init_pfdr(struct device *dev, u32 
pfdr_start, u32 num)
 }
 
 /*
- * Ideally we would use the DMA API to turn rmem->base into a DMA address
- * (especially if iommu translations ever get involved).  Unfortunately, the
- * DMA API currently does not allow mapping anything that is not backed with
- * a struct page.
+ * QMan needs two global memory areas initialized at boot time:
+ *  1) FQD: Frame Queue Descriptors used to manage frame queues
+ *  2) PFDR: Packed Frame Queue Descriptor Records used to store frames
+ * Both areas are reserved using the device tree reserved memory framework
+ * and the addresses and sizes are initialized when the QMan device is probed
  */
 static dma_addr_t fqd_a, pfdr_a;
 static size_t fqd_sz, pfdr_sz;
 
+#ifdef CONFIG_PPC
+/*
+ * Support for PPC Device Tree backward compatibility when compatible
+ * string is set to fsl-qman-fqd and fsl-qman-pfdr
+ */
+static int zero_priv_mem(phys_addr_t addr, size_t sz)
+{
+   /* map as cacheable, non-guarded */
+   void __iomem *tmpp = ioremap_prot(addr, sz, 0);
+
+   if (!tmpp)
+   return -ENOMEM;
+
+   memset_io(tmpp, 0, sz);
+   flush_dcache_range((unsigned long)tmpp,
+  (unsigned long)tmpp + sz);
+   iounmap(tmpp);
+
+   return 0;
+}
+
 static int qman_fqd(struct reserved_mem *rmem)
 {
fqd_a = rmem->base;
fqd_sz = rmem->size;
 
WARN_ON(!(fqd_a && fqd_sz));
-
return 0;
 }
 RESERVEDMEM_OF_DECLARE(qman_fqd, "fsl,qman-fqd", qman_fqd);
@@ -431,32 +452,13 @@ static int qman_pfdr(struct reserved_mem *rmem)
 }
 RESERVEDMEM_OF_DECLARE(qman_pfdr, "fsl,qman-pfdr", qman_pfdr);
 
+#endif
+
 static unsigned int qm_get_fqid_maxcnt(void)
 {
return fqd_sz / 64;
 }
 
-/*
- * Flush this memory range from data cache so that QMAN originated
- * transactions for this memory region could be marked non-coherent.
- */
-static int zero_priv_mem(struct device *dev, struct device_node *node,
-phys_addr_t addr, size_t sz)
-{
-   /* map as cacheable, non-guarded */
-   void __iomem *tmpp = ioremap_prot(addr, sz, 0);
-
-   if (!tmpp)
-   return -ENOMEM;
-
-   memset_io(tmpp, 0, sz);
-   flush_dcache_range((unsigned long)tmpp,
-  (unsigned long)tmpp + sz);
-   iounmap(tmpp);
-
-   return 0;
-}
-
 static void log_edata_bits(struct device *dev, u32 bit_count)
 {
u32 i, j, mask = 0x;
@@ -687,11 +689,12 @@ static int qman_resource_init(struct device *dev)
 static int fsl_qman_probe(struct platform_device *pdev)
 {
struct device *dev = &pdev->dev;
-   struct device_node *node = dev->of_node;
+   struct device_node *mem_node, *node = dev->of_node;
struct resource *res;
int ret, err_irq;
u16 id;
u8 major, minor;
+   u64 size;
 
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
if (!res) {
@@ -727,10 +730,83 @@ static int fsl_qman_probe(struct platform_device *pdev)
qm_channel_caam = QMAN_CHANNEL_CAAM_REV3;
}
 
-   ret = zero_priv_mem(dev, node, fqd_a, fqd_sz);
-   WARN_ON(ret);
-   if (ret)
-   return -ENODEV;
+   if (fqd_a) {
+#ifdef CONFIG_PPC
+   /*
+* For PPC backward DT compatibility
+* FQD memory MUST be zero'd by software
+*/
+   zero_priv_mem(fqd_a, fqd_sz);
+#else
+   WARN(1, "Unexpected archiceture using non shared-dma-mem 
reservations");
+#endif
+   } else {
+   /*
+* Order of memory regions is assumed as FQD followed by PFDR
+* in order to ensure allocations from the correct regions the
+* driver initializes then allocates each piece in order
+*/
+   ret = of_reserved_mem_device_init_by_idx(dev, dev->of_node, 0);
+   if (ret) {
+   dev_err(dev, "of_reserved_mem_device_init_by_idx(0) 
failed 0x%x\n",
+   ret);
+   return -ENODEV;
+   }
+   mem_node = of_parse_phandle(dev->of_node, "memory-region", 0);
+   if (mem_node) {
+   ret = of_property_read_u64(mem_node, "size", &size);
+   if (ret) {
+

[v4 01/11] soc/fsl/qbman: Use shared-dma-pool for BMan private memory allocations

2017-08-24 Thread Roy Pledge

Use the shared-memory-pool mechanism for free buffer proxy record
area allocation.

Signed-off-by: Roy Pledge 
---
 drivers/soc/fsl/qbman/bman_ccsr.c | 35 ++-
 drivers/soc/fsl/qbman/bman_priv.h |  3 +++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/soc/fsl/qbman/bman_ccsr.c 
b/drivers/soc/fsl/qbman/bman_ccsr.c
index eaa9585..2182236 100644
--- a/drivers/soc/fsl/qbman/bman_ccsr.c
+++ b/drivers/soc/fsl/qbman/bman_ccsr.c
@@ -170,10 +170,11 @@ static int fsl_bman_probe(struct platform_device *pdev)
 {
int ret, err_irq;
struct device *dev = &pdev->dev;
-   struct device_node *node = dev->of_node;
+   struct device_node *mem_node, *node = dev->of_node;
struct resource *res;
u16 id, bm_pool_cnt;
u8 major, minor;
+   u64 size;
 
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
if (!res) {
@@ -201,6 +202,38 @@ static int fsl_bman_probe(struct platform_device *pdev)
return -ENODEV;
}
 
+   /*
+* If FBPR memory wasn't defined using the qbman compatible string
+* try using the of_reserved_mem_device method
+*/
+   if (!fbpr_a) {
+   ret = of_reserved_mem_device_init(dev);
+   if (ret) {
+   dev_err(dev, "of_reserved_mem_device_init() failed 
0x%x\n",
+   ret);
+   return -ENODEV;
+   }
+   mem_node = of_parse_phandle(dev->of_node, "memory-region", 0);
+   if (mem_node) {
+   ret = of_property_read_u64(mem_node, "size", &size);
+   if (ret) {
+   dev_err(dev, "FBPR: of_address_to_resource 
fails 0x%x\n",
+   ret);
+   return -ENODEV;
+   }
+   fbpr_sz = size;
+   } else {
+   dev_err(dev, "No memory-region found for FBPR\n");
+   return -ENODEV;
+   }
+   if (!dma_zalloc_coherent(dev, fbpr_sz, &fbpr_a, 0)) {
+   dev_err(dev, "Alloc FBPR memory failed\n");
+   return -ENODEV;
+   }
+   }
+
+   dev_dbg(dev, "Allocated FBPR 0x%llx 0x%zx\n", fbpr_a, fbpr_sz);
+
bm_set_memory(fbpr_a, fbpr_sz);
 
err_irq = platform_get_irq(pdev, 0);
diff --git a/drivers/soc/fsl/qbman/bman_priv.h 
b/drivers/soc/fsl/qbman/bman_priv.h
index f6896a2..765a4bf 100644
--- a/drivers/soc/fsl/qbman/bman_priv.h
+++ b/drivers/soc/fsl/qbman/bman_priv.h
@@ -33,6 +33,9 @@
 #include "dpaa_sys.h"
 
 #include 
+#include 
+#include 
+#include 
 
 /* Portal processing (interrupt) sources */
 #define BM_PIRQ_RCRI   0x0002  /* RCR Ring (below threshold) */
-- 
2.7.4

[v4 00/11] soc/fsl/qbman: Enable QBMan on ARM Platforms

2017-08-24 Thread Roy Pledge

This patch series enables DPAA1 QBMan devices for ARM and
ARM64 architectures. This allows the LS1043A and LS1046A to use
QBMan functionality which allows access to ethernet and cyptographic
devices for example.

Changes since v3:
- Use memremap() instead of ioremap() for non iomem QBMan portal regions
- Ensured the __iomem attribute is respected when accessing iomem mapped regions
- Removed calls to flush/invalidate/prefetch for ARM/ARM64 since mapping is 
done as write combine

Changes since v2:
- Fixed some misspellings
- Added 'no-map' constraint to device tree bindings
- Described ordering contraint on regions in the device tree
- Removed confusing comment regarding non-shareable mappings
- Added warning if old reserved-memory technique is used on ARM

Changes since v1:
- Reworked private memory allocations to use shared-dma-pool on ARM platforms


Claudiu Manoil (2):
  soc/fsl/qbman: Drop L1_CACHE_BYTES compile time check
  soc/fsl/qbman: Add missing headers on ARM

Madalin Bucur (4):
  soc/fsl/qbman: Drop set/clear_bits usage
  soc/fsl/qbman: add QMAN_REV32
  soc/fsl/qbman: different register offsets on ARM
  fsl/soc/qbman: Enable FSL_LAYERSCAPE config on ARM

Roy Pledge (4):
  soc/fsl/qbman: Use shared-dma-pool for BMan private memory allocations
  soc/fsl/qbman: Use shared-dma-pool for QMan private memory allocations
  dt-bindings: soc/fsl: Update reserved memory binding for QBMan
  soc/fsl/qbman: Rework portal mapping calls for ARM/PPC

Valentin Rothberg (1):
  soc/fsl/qbman: Fix ARM32 typo

 Documentation/devicetree/bindings/soc/fsl/bman.txt |  12 +-
 Documentation/devicetree/bindings/soc/fsl/qman.txt |  26 ++--
 drivers/soc/fsl/qbman/Kconfig  |   2 +-
 drivers/soc/fsl/qbman/bman.c   |  30 -
 drivers/soc/fsl/qbman/bman_ccsr.c  |  35 +-
 drivers/soc/fsl/qbman/bman_portal.c|  36 --
 drivers/soc/fsl/qbman/bman_priv.h  |  11 +-
 drivers/soc/fsl/qbman/dpaa_sys.h   |  14 +--
 drivers/soc/fsl/qbman/qman.c   |  52 ++--
 drivers/soc/fsl/qbman/qman_ccsr.c  | 140 -
 drivers/soc/fsl/qbman/qman_portal.c|  36 --
 drivers/soc/fsl/qbman/qman_priv.h  |  13 +-
 drivers/soc/fsl/qbman/qman_test.h  |   2 -
 13 files changed, 305 insertions(+), 104 deletions(-)

--
2.7.4

Re: [RFC Part1 PATCH v3 14/17] x86/boot: Add early boot support when running with SEV active

2017-08-24 Thread Tom Lendacky


On 8/23/2017 10:30 AM, Borislav Petkov wrote:

On Mon, Jul 24, 2017 at 02:07:54PM -0500, Brijesh Singh wrote:

From: Tom Lendacky 

Early in the boot process, add checks to determine if the kernel is
running with Secure Encrypted Virtualization (SEV) active.

Checking for SEV requires checking that the kernel is running under a
hypervisor (CPUID 0x0001, bit 31), that the SEV feature is available
(CPUID 0x801f, bit 1) and then check a non-interceptable SEV MSR
(0xc0010131, bit 0).

This check is required so that during early compressed kernel booting the
pagetables (both the boot pagetables and KASLR pagetables (if enabled) are
updated to include the encryption mask so that when the kernel is
decompressed into encrypted memory.


, it can boot properly.

:)



Yup, kinda didn't complete that sentence.


After the kernel is decompressed and continues booting the same logic is
used to check if SEV is active and set a flag indicating so.  This allows
us to distinguish between SME and SEV, each of which have unique
differences in how certain things are handled: e.g. DMA (always bounce
buffered with SEV) or EFI tables (always access decrypted with SME).

Signed-off-by: Tom Lendacky 
Signed-off-by: Brijesh Singh 
---
  arch/x86/boot/compressed/Makefile  |   2 +
  arch/x86/boot/compressed/head_64.S |  16 +
  arch/x86/boot/compressed/mem_encrypt.S | 103 +
  arch/x86/boot/compressed/misc.h|   2 +
  arch/x86/boot/compressed/pagetable.c   |   8 ++-
  arch/x86/include/asm/mem_encrypt.h |   3 +
  arch/x86/include/asm/msr-index.h   |   3 +
  arch/x86/include/uapi/asm/kvm_para.h   |   1 -
  arch/x86/mm/mem_encrypt.c  |  42 +++---
  9 files changed, 169 insertions(+), 11 deletions(-)
  create mode 100644 arch/x86/boot/compressed/mem_encrypt.S

diff --git a/arch/x86/boot/compressed/Makefile 
b/arch/x86/boot/compressed/Makefile
index 2c860ad..d2fe901 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -72,6 +72,8 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o 
$(obj)/misc.o \
$(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \
$(obj)/piggy.o $(obj)/cpuflags.o
  
+vmlinux-objs-$(CONFIG_X86_64) += $(obj)/mem_encrypt.o


There's a

ifdef CONFIG_X86_64

a couple of lines below. Just put it there.


Will do.



...


+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -0,0 +1,103 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2017 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include 
+
+#include 
+#include 
+#include 
+
+   .text
+   .code32
+ENTRY(get_sev_encryption_bit)
+   xor %eax, %eax
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+   push%ebx
+   push%ecx
+   push%edx
+
+   /* Check if running under a hypervisor */
+   movl$1, %eax
+   cpuid
+   bt  $31, %ecx   /* Check the hypervisor bit */
+   jnc .Lno_sev
+
+   movl$0x8000, %eax   /* CPUID to check the highest leaf */
+   cpuid
+   cmpl$0x801f, %eax   /* See if 0x801f is available */
+   jb  .Lno_sev
+
+   /*
+* Check for the SEV feature:
+*   CPUID Fn8000_001F[EAX] - Bit 1
+*   CPUID Fn8000_001F[EBX] - Bits 5:0
+* Pagetable bit position used to indicate encryption
+*/
+   movl$0x801f, %eax
+   cpuid
+   bt  $1, %eax/* Check if SEV is available */
+   jnc .Lno_sev
+
+   movl$MSR_F17H_SEV, %ecx /* Read the SEV MSR */
+   rdmsr
+   bt  $MSR_F17H_SEV_ENABLED_BIT, %eax /* Check if SEV is active */
+   jnc .Lno_sev
+
+   /*
+* Get memory encryption information:
+*/


The side-comment is enough. This one above can go.


Done.




+   movl%ebx, %eax
+   andl$0x3f, %eax /* Return the encryption bit location */
+   jmp .Lsev_exit
+
+.Lno_sev:
+   xor %eax, %eax
+
+.Lsev_exit:
+   pop %edx
+   pop %ecx
+   pop %ebx
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+   ret
+ENDPROC(get_sev_encryption_bit)
+
+   .code64
+ENTRY(get_sev_encryption_mask)
+   xor %rax, %rax
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+   push%rbp
+   push%rdx
+
+   movq%rsp, %rbp  /* Save current stack pointer */
+
+   callget_sev_encryption_bit  /* Get the encryption bit position */


So we get to call get_sev_encryption_bit() here again and noodle through
the CPUID discovery and MSR access. We should instead cache that info
and return the encryption bit directly on a second call. (And initialize
it to -1 to denote that g

Re: [PATCH 5/6] powerpc/mm: Optimize detection of thread local mm's

2017-08-24 Thread Benjamin Herrenschmidt

On Thu, 2017-08-24 at 18:40 +0200, Frederic Barrat wrote:
> 
> The decrementing part is giving me troubles, and I think it makes sense: 
> if I decrement the counter when detaching the context from the capi 
> card, then the next TLBIs for the memory context may be back to local. 

Yes, you need to flush the CAPI TLB first.

> So when the process exits, the NPU wouldn't get the associated TLBIs, 
> which spells trouble the next time the same memory context ID is reused. 
> I believe this the cause of the problem I'm seeing. As soon as I keep 
> the TLBIs global, even after I detach from the capi adapter, everything 
> is fine.
> 
> Does it sound right?
> 
> So to keep the checks minimal in mm_is_thread_local(), to just checking 
> the active_cpus count, I'm thinking of introducing a "copro enabled" bit 
> on the context, so that we can increment active_cpus only once. And 
> never decrement it.

You can decrement if you flush. Don't you have MMIOs to do directed
flushes ?

Cheers,
Ben.

Re: [PATCH v3 0/7] Add RSS to DPAA 1.x Ethernet driver

2017-08-24 Thread David Miller

From: David Miller 
Date: Thu, 24 Aug 2017 09:42:20 -0700 (PDT)

> From: Madalin Bucur 
> Date: Thu, 24 Aug 2017 10:28:21 +0300
> 
>> This patch set introduces Receive Side Scaling for the DPAA Ethernet
>> driver. Documentation is updated with details related to the new
>> feature and limitations that apply.
>> Added also a small fix.
>> 
>> v2: removed a C++ style comment
>> v3: move struct fman to header file to avoid exporting a function
> 
> Series applied, thanks.

Actually I'm reverting, this doesn't even compile.

[davem@localhost net-next]$ make -s -j8
In file included from drivers/net/ethernet/freescale/fman/fman.c:35:0:
drivers/net/ethernet/freescale/fman/fman.h:286:9: error: type defaults to ‘int’ 
in declaration of ‘irqreturn_t’ [-Werror=implicit-int]
 typedef irqreturn_t (fman_exceptions_cb)(struct fman *fman,
 ^~~
drivers/net/ethernet/freescale/fman/fman.h:286:9: error: ‘irqreturn_t’ declared 
as function returning a function
drivers/net/ethernet/freescale/fman/fman.h:287:12: warning: parameter names 
(without types) in function declaration
   enum fman_exceptions exception);
^~~
drivers/net/ethernet/freescale/fman/fman.h:300:22: error: ‘fman_bus_error_cb’ 
declared as function returning a function
 typedef irqreturn_t (fman_bus_error_cb)(struct fman *fman, u8 port_id,
  ^
drivers/net/ethernet/freescale/fman/fman.h:316:18: error: field ‘muram_res’ has 
incomplete type
  struct resource muram_res;  /* MURAM resource */
  ^
drivers/net/ethernet/freescale/fman/fman.h:330:2: error: unknown type name 
‘fman_exceptions_cb’
  fman_exceptions_cb *exception_cb;
  ^~
drivers/net/ethernet/freescale/fman/fman.h:333:2: error: unknown type name 
‘spinlock_t’
  spinlock_t spinlock;
  ^~
In file included from ./include/linux/irq.h:19:0,
 from ./include/linux/of_irq.h:6,
 from drivers/net/ethernet/freescale/fman/fman.c:46:
./include/linux/irqreturn.h:16:24: error: conflicting types for ‘irqreturn_t’
 typedef enum irqreturn irqreturn_t;
^~~
In file included from drivers/net/ethernet/freescale/fman/fman.c:35:0:
drivers/net/ethernet/freescale/fman/fman.h:286:9: note: previous declaration of 
‘irqreturn_t’ was here
 typedef irqreturn_t (fman_exceptions_cb)(struct fman *fman,
 ^~~
drivers/net/ethernet/freescale/fman/fman.c: In function ‘bmi_err_event’:
drivers/net/ethernet/freescale/fman/fman.c:1237:9: error: called object is not 
a function or function pointer
   ret = fman->exception_cb(fman, FMAN_EX_BMI_STORAGE_PROFILE_ECC);
 ^~~~
drivers/net/ethernet/freescale/fman/fman.c:1239:9: error: called object is not 
a function or function pointer
   ret = fman->exception_cb(fman, FMAN_EX_BMI_LIST_RAM_ECC);
 ^~~~
drivers/net/ethernet/freescale/fman/fman.c:1241:9: error: called object is not 
a function or function pointer
   ret = fman->exception_cb(fman, FMAN_EX_BMI_STATISTICS_RAM_ECC);
 ^~~~
drivers/net/ethernet/freescale/fman/fman.c:1243:9: error: called object is not 
a function or function pointer
   ret = fman->exception_cb(fman, FMAN_EX_BMI_DISPATCH_RAM_ECC);
 ^~~~
drivers/net/ethernet/freescale/fman/fman.c: In function ‘qmi_err_event’:
drivers/net/ethernet/freescale/fman/fman.c:1266:9: error: called object is not 
a function or function pointer
   ret = fman->exception_cb(fman, FMAN_EX_QMI_DOUBLE_ECC);
 ^~~~
drivers/net/ethernet/freescale/fman/fman.c:1268:9: error: called object is not 
a function or function pointer
   ret = fman->exception_cb(fman,
 ^~~~
drivers/net/ethernet/freescale/fman/fman.c: In function ‘dma_err_event’:
drivers/net/ethernet/freescale/fman/fman.c:1317:9: error: called object is not 
a function or function pointer
   ret = fman->exception_cb(fman, FMAN_EX_DMA_SINGLE_PORT_ECC);
 ^~~~
drivers/net/ethernet/freescale/fman/fman.c:1319:9: error: called object is not 
a function or function pointer
   ret = fman->exception_cb(fman, FMAN_EX_DMA_READ_ECC);
 ^~~~
drivers/net/ethernet/freescale/fman/fman.c:1321:9: error: called object is not 
a function or function pointer
   ret = fman->exception_cb(fman, FMAN_EX_DMA_SYSTEM_WRITE_ECC);
 ^~~~
drivers/net/ethernet/freescale/fman/fman.c:1323:9: error: called object is not 
a function or function pointer
   ret = fman->exception_cb(fman, FMAN_EX_DMA_FM_WRITE_ECC);
 ^~~~
drivers/net/ethernet/freescale/fman/fman.c: In function ‘fpm_err_event’:
drivers/net/ethernet/freescale/fman/fman.c:1340:9: error: called object is not 
a function or function pointer
   ret = fman->exception_cb(fman, FMAN_EX_FPM_DOUBLE_ECC);
 ^~~~
drivers/net/ethernet/freescale/fman/fman.c:1342:9: error: called object is not 
a function or function pointer
   ret = fman->exception_cb(fman, FMAN_EX_FPM_STALL_ON_TASKS);
 ^~~~
drivers/net/ethernet/freescale/fman/fman.c:1345:9: e

Re: [PATCH v2 2/5] powerpc: pseries: vio: match parent nodes with of_find_node_by_path

2017-08-24 Thread Rob Herring

On Tue, Aug 22, 2017 at 12:12 AM, Michael Ellerman  wrote:
> Rob Herring  writes:
>
>> In preparation to remove the full path from device_node.full_name, use
>> of_find_node_by_path instead of open coding with strcmp.
>>
>> Signed-off-by: Rob Herring 
>> Cc: Benjamin Herrenschmidt 
>> Cc: Paul Mackerras 
>> Cc: Michael Ellerman 
>> Cc: linuxppc-dev@lists.ozlabs.org
>> ---
>> v2:
>> - rebased to linux-next and removed spurious change fro patch 1.
>>
>>  arch/powerpc/platforms/pseries/vio.c | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/vio.c 
>> b/arch/powerpc/platforms/pseries/vio.c
>> index aa5ca74316fa..5754572deb23 100644
>> --- a/arch/powerpc/platforms/pseries/vio.c
>> +++ b/arch/powerpc/platforms/pseries/vio.c
>> @@ -1357,9 +1357,9 @@ struct vio_dev *vio_register_device_node(struct 
>> device_node *of_node)
>>*/
>>   parent_node = of_get_parent(of_node);
>>   if (parent_node) {
>> - if (!strcmp(parent_node->full_name, 
>> "/ibm,platform-facilities"))
>> + if (parent_node == 
>> of_find_node_by_path("/ibm,platform-facilities"))
>>   family = PFO;
>> - else if (!strcmp(parent_node->full_name, "/vdevice"))
>> + else if (parent_node == of_find_node_by_path("/vdevice"))
>>   family = VDEVICE;
>
> This leaks references to the looked up nodes.
>
> Both these nodes are defined in PAPR (our hypervisor spec), and both of
> them must have a device_type, either "ibm,platform-facilities" or
> "vdevice".
>
> Looking at the commit that added the code I don't see any particular
> reason it used the comparison against full_name, rather than using the
> device_type.
>
> So I'm inclined to do this instead:
>
> diff --git a/arch/powerpc/platforms/pseries/vio.c 
> b/arch/powerpc/platforms/pseries/vio.c
> index 8a47f168476b..f26f906e6021 100644
> --- a/arch/powerpc/platforms/pseries/vio.c
> +++ b/arch/powerpc/platforms/pseries/vio.c
> @@ -1357,9 +1357,9 @@ struct vio_dev *vio_register_device_node(struct 
> device_node *of_node)
>  */
> parent_node = of_get_parent(of_node);
> if (parent_node) {
> -   if (!strcmp(parent_node->full_name, 
> "/ibm,platform-facilities"))
> +   if (!strcmp(parent_node->type, "ibm,platform-facilities"))
> family = PFO;
> -   else if (!strcmp(parent_node->full_name, "/vdevice"))
> +   else if (!strcmp(parent_node->type, "vdevice"))
> family = VDEVICE;
> else {
> pr_warn("%s: parent(%s) of %s not recognized.\n",
>
>
> I've checked both Qemu and kvmtool add the device_type, and I'm fairly
> confident that PowerVM does too. Anyway I'll test it on all the machines
> I can find.

Okay, do you want me to respin the patch or will you update it with
the above change?

Rob

Re: [PATCH really v2] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()

2017-08-24 Thread David Hildenbrand

On 24.08.2017 11:14, Paul Mackerras wrote:
> Nixiaoming pointed out that there is a memory leak in
> kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd()
> fails; the memory allocated for the kvmppc_spapr_tce_table struct
> is not freed, and nor are the pages allocated for the iommu
> tables.  In addition, we have already incremented the process's
> count of locked memory pages, and this doesn't get restored on
> error.
> 
> David Hildenbrand pointed out that there is a race in that the
> function checks early on that there is not already an entry in the
> stt->iommu_tables list with the same LIOBN, but an entry with the
> same LIOBN could get added between then and when the new entry is
> added to the list.
> 
> This fixes all three problems.  To simplify things, we now call
> anon_inode_getfd() before placing the new entry in the list.  The
> check for an existing entry is done while holding the kvm->lock
> mutex, immediately before adding the new entry to the list.
> Finally, on failure we now call kvmppc_account_memlimit to
> decrement the process's count of locked memory pages.
> 
> Reported-by: Nixiaoming 
> Reported-by: David Hildenbrand 
> Signed-off-by: Paul Mackerras 
> ---
> v2: Don't overwrite stt in loop over spapr_tce_tables
> 

Reviewed-by: David Hildenbrand 


-- 

Thanks,

David

Re: [PATCH v3 0/7] Add RSS to DPAA 1.x Ethernet driver

2017-08-24 Thread David Miller

From: Madalin Bucur 
Date: Thu, 24 Aug 2017 10:28:21 +0300

> This patch set introduces Receive Side Scaling for the DPAA Ethernet
> driver. Documentation is updated with details related to the new
> feature and limitations that apply.
> Added also a small fix.
> 
> v2: removed a C++ style comment
> v3: move struct fman to header file to avoid exporting a function

Series applied, thanks.

Re: [PATCH 5/6] powerpc/mm: Optimize detection of thread local mm's

2017-08-24 Thread Frederic Barrat




Le 21/08/2017 à 19:35, Benjamin Herrenschmidt a écrit :

On Mon, 2017-08-21 at 19:27 +0200, Frederic Barrat wrote:

Hi Ben,

Le 24/07/2017 à 06:28, Benjamin Herrenschmidt a écrit :

Instead of comparing the whole CPU mask every time, let's
keep a counter of how many bits are set in the mask. Thus
testing for a local mm only requires testing if that counter
is 1 and the current CPU bit is set in the mask.



I'm trying to see if we could merge this patch with what I'm trying to
do to mark a context as requiring global TLBIs.
In http://patchwork.ozlabs.org/patch/796775/
I'm introducing a 'flags' per memory context, using one bit to say if
the context needs global TLBIs.
The 2 could co-exist, just checking... Do you think about using the
actual active_cpus count down the road, or is it just a matter of
knowing if there are more than one active cpus?


Or you could just incrementer my counter. Just make sure you increment
it at most once per CXL context and decrement when the context is gone.


The decrementing part is giving me troubles, and I think it makes sense: 
if I decrement the counter when detaching the context from the capi 
card, then the next TLBIs for the memory context may be back to local. 
So when the process exits, the NPU wouldn't get the associated TLBIs, 
which spells trouble the next time the same memory context ID is reused. 
I believe this the cause of the problem I'm seeing. As soon as I keep 
the TLBIs global, even after I detach from the capi adapter, everything 
is fine.


Does it sound right?

So to keep the checks minimal in mm_is_thread_local(), to just checking 
the active_cpus count, I'm thinking of introducing a "copro enabled" bit 
on the context, so that we can increment active_cpus only once. And 
never decrement it.


  Fred

Re: [PATCH] powerpc/pseries: Don't attempt to acquire drc during memory hot add for assigned lmbs

2017-08-24 Thread John Allen

On 08/24/2017 05:33 AM, Michael Ellerman wrote:
> John Allen  writes:
> 
>> Check if an LMB is assigned before attempting to call dlpar_acquire_drc in
>> order to avoid any unnecessary rtas calls. This substantially reduces the
>> running time of memory hot add on lpars with large amounts of memory.
>>
>> Signed-off-by: John Allen 
> 
> I'll add:
> 
>   Fixes: c21f515c7436 ("powerpc/pseries: Make the acquire/release of the drc 
> for memory a seperate step")
> 
> ?

Yes, thanks.

> 
> How bad is the slow down, do we need to backport to stable/distros?

On an lpar with 16 TB of memory assigned, I observed that adding 1 GB of
memory took several minutes without this fix and improved to several
seconds with this fix. Yep, this will need to be backported. Memory hotplug
performance is a hot issue for our team right now and we'll want to have
solid performance improvement to give to customers relatively soon.

> 
> cheers
> 
>

Re: [PATCH 4/4] powerpc/32: remove a NOP from memset()

2017-08-24 Thread Christophe LEROY




Le 24/08/2017 à 12:51, Michael Ellerman a écrit :

Christophe Leroy  writes:


memset() is patched after initialisation to activate the
optimised part which uses cache instructions.

Today we have a 'b 2f' to skip the optimised patch, which then gets
replaced by a NOP, implying a useless cycle consumption.
As we have a 'bne 2f' just before, we could use that instruction
for the live patching, hence removing the need to have a
dedicated 'b 2f' to be replaced by a NOP.

This patch changes the 'bne 2f' by a 'b 2f'. During init, that
'b 2f' is then replaced by 'bne 2f'


I'm not sure what the sequence is during boot for the 32-bit code, but
can you use an ALT_FTR section for this? Possibly that doesn't get done
at the right time though.


Unfortunately, as we discussed in 2015 
(https://lkml.org/lkml/2015/9/10/608), the ALT_FTR does things too 
early, while the cache is not enabled yet.


Christophe

Re: [PATCH v2 1/1] Split VGA default device handler out of VGA arbiter

2017-08-24 Thread Bjorn Helgaas

On Thu, Aug 24, 2017 at 10:57:26AM +1000, Dave Airlie wrote:
> > Yeah, maybe it's time to disconnect the "default display device" idea
> > from the VGA arbiter.  I have no idea what (if any) dependencies X has
> > on the legacy VGA resources.  I assume X works fine on power, where it
> > sounds like those resources are rarely or never available.
> 
> The question on non-x86 archs, is what is the correct device to default to.
> 
> On x86 we use the legacy VGA resources as a pointer, as this is the device
> the BIOS appeared on at boot so hopefully should be one you can see stuff on.
> 
> On non-x86 I've no idea how to decide if there are multiple devices, maybe the
> firmware needs to tag something for the kernel if there are. Otherwise
> you'd just
> be picking something in probe order.
> 
> I think the idea of these patches is to separate default display
> device from the arbiter.
> 
> X uses the arbiter on x86 if required (it's horrible, and it's rare we
> have to nowadays),
> but for finding the default device it justs uses the sysfs boot_vga flag.

The sysfs boot_vga thing comes from PCI.  The name suggests that it's
a VGA device and can use the legacy VGA resources.

If we want to indicate a general default display device that need not
be "VGA", it'd be really nice if we could pick a name that did not
include "vga".

Even if we could only do it inside the kernel, I think it would reduce
confusion if we could separate out the "VGA"-specific stuff like the
arbiter and names like "vga_set_default_device()" so that systems with
a non-legacy VGA default display device didn't have to use "VGA"
interfaces that don't make sense for them.

Bjorn

Re: [PATCH 1/2] powerpc/workqueue: update list of possible CPUs

2017-08-24 Thread Tejun Heo

Hello, Laurent.

On Thu, Aug 24, 2017 at 02:10:31PM +0200, Laurent Vivier wrote:
> > Yeah, it just needs to match up new cpus to the cpu ids assigned to
> > the right node.
> 
> We are not able to assign the cpu ids to the right node before the CPU
> is present, because firmware doesn't provide CPU mapping <-> node id
> before that.

What I meant was to assign the matching CPU ID when the CPU becomes
present - ie. have CPU IDs available for different nodes and allocate
them to the new CPU according to its node mapping when it actually
comes up.  Please note that I'm not saying this is the way to go, just
that it is a solvable problem from the arch code.

> > The node mapping for that cpu id changes *dynamically* while the
> > system is running and that can race with node-affinity sensitive
> > operations such as memory allocations.
> 
> Memory is mapped to the node through its own firmware entry, so I don't
> think cpu id change can affect memory affinity, and before we know the
> node id of the CPU, the CPU is not present and thus it can't use memory.

The latter part isn't true.  For example, percpu memory gets alloacted
for all possible CPUs according to their node affinity, so the memory
node association change which happens when the CPU comes up for the
first time can race against such allocations.  I don't know whether
that's actually problematic but we don't have *any* synchronization
around it.  If you think it's safe to have such races, please explain
why that is.

> > Please take a step back and think through the problem again.  You
> > can't bandaid it this way.
> 
> Could you give some ideas, proposals?
> As the firmware doesn't provide the information before the CPU is really
> plugged, I really don't know how to manage this problem.

There are two possible approaches, I think.

1. Make physical cpu -> logical cpu mapping indirect so that the
   kernel's cpu ID assignment is always on the right numa node.  This
   may mean that the kernel might have to keep more possible CPUs
   around than necessary but it does have the benefit that all memory
   allocations are affine to the right node.

2. Make cpu <-> node mapping properly dynamic.  Identify what sort of
   synchronization we'd need around the mapping changing dynamically.
   Note that we might not need much but it'll most likely need some.
   Build synchronization and notification infrastructure around it.

Thanks.

-- 
tejun

Re: [PATCH] powerpc/powernv/idle: Round up latency and residency values

2017-08-24 Thread Vaidyanathan Srinivasan

* Michael Ellerman  [2017-08-24 20:28:19]:

> Vaidyanathan Srinivasan  writes:
> 
> > On PowerNV platforms, firmware provides exit latency and
> > target residency for each of the idle states in nano
> > seconds.  Cpuidle framework expects the values in micro
> > seconds.  Round up to nearest micro seconds to avoid errors
> > in cases where the values are defined as fractional micro
> > seconds.
> >
> > Default idle state of 'snooze' has exit latency of zero.  If
> > other states have fractional micro second exit latency, they
> > would get rounded down to zero micro second and make cpuidle
> > framework choose deeper idle state when snooze loop is the
> > right choice.
> >
> > Reported-by: Anton Blanchard 
> > Signed-off-by: Vaidyanathan Srinivasan 
> 
> This sounds like a fairly bad bug, does it need a Fixes / Cc stable tag?

Yes, we will need this on stable kernel that runs on POWER9.  On older
platforms the latencies are larger and hence no impact :)

I will post to stable after this fix hits your -next tree.

--Vaidy

Re: [PATCH v2 12/14] KVM: PPC: Book3S HV: POWER9 can execute stop without a sync sequence

2017-08-24 Thread Nicholas Piggin

On Thu, 24 Aug 2017 20:27:35 +1000
Paul Mackerras  wrote:

> On Sat, Aug 12, 2017 at 02:39:10AM +1000, Nicholas Piggin wrote:
> > Reviewed-by: Gautham R. Shenoy 
> > Signed-off-by: Nicholas Piggin 
> > ---
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 24 
> >  1 file changed, 12 insertions(+), 12 deletions(-)
> > 
> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > index 3e024fd71fe8..edb47738a686 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > @@ -2527,7 +2527,17 @@ BEGIN_FTR_SECTION
> >  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> >  
> >  kvm_nap_sequence:  /* desired LPCR value in r5 */
> > -BEGIN_FTR_SECTION
> > +BEGIN_FTR_SECTION  /* nap sequence */
> > +   mtspr   SPRN_LPCR,r5
> > +   isync
> > +   li  r0, 0
> > +   std r0, HSTATE_SCRATCH0(r13)
> > +   ptesync
> > +   ld  r0, HSTATE_SCRATCH0(r13)
> > +1: cmpdr0, r0
> > +   bne 1b
> > +   nap
> > +FTR_SECTION_ELSE   /* stop sequence */
> > /*
> >  * PSSCR bits:  exit criterion = 1 (wakeup based on LPCR at sreset)
> >  *  enable state loss = 1 (allow SMT mode switch)
> > @@ -2539,18 +2549,8 @@ BEGIN_FTR_SECTION
> > li  r4, LPCR_PECE_HVEE@higher
> > sldir4, r4, 32
> > or  r5, r5, r4
> > -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
> > mtspr   SPRN_LPCR,r5
> > -   isync
> > -   li  r0, 0
> > -   std r0, HSTATE_SCRATCH0(r13)
> > -   ptesync
> > -   ld  r0, HSTATE_SCRATCH0(r13)
> > -1: cmpdr0, r0
> > -   bne 1b
> > -BEGIN_FTR_SECTION
> > -   nap
> > -FTR_SECTION_ELSE
> > +
> > PPC_STOP
> >  ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
> > b   .
> > -- 
> > 2.13.3  
> 
> Currently we never get to kvm_nap_sequence on POWER9 because we are
> always running one vcpu per vcore, so I haven't worried about this
> code too much.  In future we might need this for running HPT guests on
> a radix host, though.

Just trying to keep in in synch with Linux, but the rest of the
series does not depend on this one so it can be left out if you'd
rather.

Thanks,
Nick

Re: [PATCH v2 10/14] KVM: PPC: Book3S HV: POWER9 does not require secondary thread management

2017-08-24 Thread Nicholas Piggin

On Thu, 24 Aug 2017 19:34:35 +1000
Paul Mackerras  wrote:

> On Sat, Aug 12, 2017 at 02:39:08AM +1000, Nicholas Piggin wrote:
> > POWER9 CPUs have independent MMU contexts per thread, so KVM does not
> > need to quiesce secondary threads, so the hwthread_req/hwthread_state
> > protocol does not have to be used. So patch it away on POWER9, and patch
> > away the branch from the Linux idle wakeup to kvm_start_guest that is
> > never used.  
> 
> If/when we add support for running HPT guests on a radix host, we will
> have to run the host in single-threaded mode (since POWER9 doesn't
> support having some threads of a core using HPT and some using radix
> simultaneously).  We'll then need some sort of thing like
> kvmppc_grab_hwthread to coordinate with the threads so that guests can
> use the secondary threads.
>
> So I think most of this code should stay.  We will still need to have
> a way to make sure that the secondaries are in real mode and not in a
> guest, because all threads will need to be in real mode when switching
> the core between radix and HPT mode.  Maybe we can optimize it a bit
> at present given that we don't yet support running HPT guests on a
> radix host, but I don't want to make it harder to do that in future.

Okay, most of the code's still there, just noped out with ARCH_300.
But yes this and then the subsequent patches do make it more
difficult to restore the KVM real mode functionality.

I'll see about restructuring them to keep that ability and make it
selectable with a minimal branchs or alt patches.

Thanks,
Nick

Re: powerpc/vio: Use device_type to detect family

2017-08-24 Thread Michael Ellerman

On Wed, 2017-08-23 at 05:47:13 UTC, Michael Ellerman wrote:
> Currently in the vio.c code we use a comparision against the parent
> device node's full path to decide if the device is a PFO or VIO family
> device.
> 
> Both the ibm,platform-facilities and vdevice nodes are defined by PAPR,
> and must have a matching device_type. So instead of using the path we
> can instead compare the device_type.
> 
> I've checked Qemu and kvmtool both do this correctly, and all the
> PowerVM systems I have access to do also. So it seems to be safe.
> 
> This removes the dependency on full_name, which is being removed
> upstream.
> 
> Signed-off-by: Michael Ellerman 

Applied to powerpc next.

https://git.kernel.org/powerpc/c/bcf21e3a97a1247178338793df9ae3

cheers

Re: powerpc/64s: Fix replay interrupt return label name

2017-08-24 Thread Michael Ellerman

On Tue, 2017-08-22 at 01:51:37 UTC, Michael Ellerman wrote:
> In __replay_interrupt() we take the address of a local label so we can
> return to it later. However the assembler turns the local label into a
> symbol with a name like ".L1^B42" - where "^B" is literally "\002".
> This does not make for pleasant stack traces. Fix it by giving the
> label a sensible name.
> 
> Signed-off-by: Michael Ellerman 

Applied to powerpc next.

https://git.kernel.org/powerpc/c/3e23a12bcaf18b3587088807722cd2

cheers

Re: [v2, 3/5] powerpc: pseries: remove dlpar_attach_node dependency on full path

2017-08-24 Thread Michael Ellerman

On Mon, 2017-08-21 at 15:16:49 UTC, Rob Herring wrote:
> In preparation to stop storing the full node path in full_name, remove the
> dependency on full_name from dlpar_attach_node(). Callers of
> dlpar_attach_node() already have the parent device_node, so just pass the
> parent node into dlpar_attach_node instead of the path. This avoids doing
> a lookup of the parent node by the path.
> 
> Signed-off-by: Rob Herring 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: linuxppc-dev@lists.ozlabs.org

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/215ee763f8cb9a2912d411f96f6f67

cheers

Re: [v2,1/5] powerpc: Convert to using %pOF instead of full_name

2017-08-24 Thread Michael Ellerman

On Mon, 2017-08-21 at 15:16:47 UTC, Rob Herring wrote:
> Now that we have a custom printf format specifier, convert users of
> full_name to use %pOF instead. This is preparation to remove storing
> of the full path string for each node.
> 
> Signed-off-by: Rob Herring 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: Anatolij Gustschin 
> Cc: Scott Wood 
> Cc: Kumar Gala 
> Cc: Arnd Bergmann 
> Cc: linuxppc-dev@lists.ozlabs.org
> Reviewed-by: Tyrel Datwyler 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/b7c670d673d1186e9a6aafaad36aac


I forgot to add in the change log I took the chance to reformat some of the
affected printks, hopefully that doesn't muck you up when you're
merging/rebasing on top of this.

cheers

Re: [v2,01/14] powerpc/64s: masked interrupt avoid branch

2017-08-24 Thread Michael Ellerman

On Fri, 2017-08-11 at 16:38:59 UTC, Nicholas Piggin wrote:
> Interrupts which do not require EE to be cleared can all
> be tested with a single bitwise test.
> 
> Signed-off-by: Nicholas Piggin 

Patches 1-9 applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/e0c827c09c0d04d77616a4506a71b3

cheers

Re: [1/6] powerpc/mm: Move pgdir setting into a helper

2017-08-24 Thread Michael Ellerman

On Mon, 2017-07-24 at 04:27:58 UTC, Benjamin Herrenschmidt wrote:
> Makes switch_mm_irqs_off() a bit more readable
> 
> Signed-off-by: Benjamin Herrenschmidt 

Patches 1-2, 4-6 applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/43ed84a891b70165a621a5c9219694

cheers

Re: [PATCH really v2] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()

2017-08-24 Thread David Gibson

On Thu, Aug 24, 2017 at 07:14:47PM +1000, Paul Mackerras wrote:
> Nixiaoming pointed out that there is a memory leak in
> kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd()
> fails; the memory allocated for the kvmppc_spapr_tce_table struct
> is not freed, and nor are the pages allocated for the iommu
> tables.  In addition, we have already incremented the process's
> count of locked memory pages, and this doesn't get restored on
> error.
> 
> David Hildenbrand pointed out that there is a race in that the
> function checks early on that there is not already an entry in the
> stt->iommu_tables list with the same LIOBN, but an entry with the
> same LIOBN could get added between then and when the new entry is
> added to the list.
> 
> This fixes all three problems.  To simplify things, we now call
> anon_inode_getfd() before placing the new entry in the list.  The
> check for an existing entry is done while holding the kvm->lock
> mutex, immediately before adding the new entry to the list.
> Finally, on failure we now call kvmppc_account_memlimit to
> decrement the process's count of locked memory pages.
> 
> Reported-by: Nixiaoming 
> Reported-by: David Hildenbrand 
> Signed-off-by: Paul Mackerras 

Reviewed-by: David Gibson 

> ---
> v2: Don't overwrite stt in loop over spapr_tce_tables
> 
>  arch/powerpc/kvm/book3s_64_vio.c | 56 
> 
>  1 file changed, 34 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c 
> b/arch/powerpc/kvm/book3s_64_vio.c
> index a160c14..53766e2 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -294,32 +294,26 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  struct kvm_create_spapr_tce_64 *args)
>  {
>   struct kvmppc_spapr_tce_table *stt = NULL;
> + struct kvmppc_spapr_tce_table *siter;
>   unsigned long npages, size;
>   int ret = -ENOMEM;
>   int i;
> + int fd = -1;
>  
>   if (!args->size)
>   return -EINVAL;
>  
> - /* Check this LIOBN hasn't been previously allocated */
> - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
> - if (stt->liobn == args->liobn)
> - return -EBUSY;
> - }
> -
>   size = _ALIGN_UP(args->size, PAGE_SIZE >> 3);
>   npages = kvmppc_tce_pages(size);
>   ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
> - if (ret) {
> - stt = NULL;
> - goto fail;
> - }
> + if (ret)
> + return ret;
>  
>   ret = -ENOMEM;
>   stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
> GFP_KERNEL);
>   if (!stt)
> - goto fail;
> + goto fail_acct;
>  
>   stt->liobn = args->liobn;
>   stt->page_shift = args->page_shift;
> @@ -334,24 +328,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>   goto fail;
>   }
>  
> - kvm_get_kvm(kvm);
> + ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
> + stt, O_RDWR | O_CLOEXEC);
> + if (ret < 0)
> + goto fail;
>  
>   mutex_lock(&kvm->lock);
> - list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
> +
> + /* Check this LIOBN hasn't been previously allocated */
> + ret = 0;
> + list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) {
> + if (siter->liobn == args->liobn) {
> + ret = -EBUSY;
> + break;
> + }
> + }
> +
> + if (!ret) {
> + list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
> + kvm_get_kvm(kvm);
> + }
>  
>   mutex_unlock(&kvm->lock);
>  
> - return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
> - stt, O_RDWR | O_CLOEXEC);
> + if (!ret)
> + return fd;
>  
> -fail:
> - if (stt) {
> - for (i = 0; i < npages; i++)
> - if (stt->pages[i])
> - __free_page(stt->pages[i]);
> + put_unused_fd(fd);
>  
> - kfree(stt);
> - }
> + fail:
> + for (i = 0; i < npages; i++)
> + if (stt->pages[i])
> + __free_page(stt->pages[i]);
> +
> + kfree(stt);
> + fail_acct:
> + kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
>   return ret;
>  }
>  

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [PATCH v7 4/4] boot/param: add pointer to next argument to unknown parameter callback

2017-08-24 Thread msuchanek

On Thu, 24 Aug 2017 21:04:51 +1000
Michael Ellerman  wrote:

> Michal Suchanek  writes:
> 
> > The fadump parameter processing re-does the logic of next_arg quote
> > stripping to determine where the argument ends. Pass pointer to the
> > next argument instead to make this more robust.
> >
> > Signed-off-by: Michal Suchanek 
> > ---
> >  arch/powerpc/kernel/fadump.c  | 13 +
> >  arch/powerpc/mm/hugetlbpage.c |  4 ++--
> >  include/linux/moduleparam.h   |  2 +-
> >  init/main.c   | 12 ++--
> >  kernel/module.c   |  4 ++--
> >  kernel/params.c   | 19 +++
> >  lib/dynamic_debug.c   |  2 +-
> >  7 files changed, 28 insertions(+), 28 deletions(-)  
> 
> Can you split out a patch that adds the next argument and updates the
> callers. And then a patch for the fadump to use the new arg.
> 
> cheers

Yes, that makes sense.

Thanks

Michal

Re: [PATCH v7 3/4] lib/cmdline.c Remove quotes symmetrically.

2017-08-24 Thread msuchanek

On Thu, 24 Aug 2017 21:02:47 +1000
Michael Ellerman  wrote:

> Michal Suchanek  writes:
> 
> > Remove quotes from argument value only if there is qoute on both
> > sides.
> >
> > Signed-off-by: Michal Suchanek 
> > ---
> >  arch/powerpc/kernel/fadump.c | 6 ++
> >  lib/cmdline.c| 7 ++-  
> 
> Can you split that into two patches?

Not really. There is logic in lib/cmdline.c which is duplicated in
arch/powerpc/kernel/fadump.c and so the two places should be updated in
sync. 

Thanks

Michal

> 
> cheers
> 
> >  2 files changed, 4 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/powerpc/kernel/fadump.c
> > b/arch/powerpc/kernel/fadump.c index a1614d9b8a21..d7da4ce9f7ae
> > 100644 --- a/arch/powerpc/kernel/fadump.c
> > +++ b/arch/powerpc/kernel/fadump.c
> > @@ -489,10 +489,8 @@ static void __init fadump_update_params(struct
> > param_info *param_info, *tgt++ = ' ';
> >  
> > /* next_arg removes one leading and one trailing '"' */
> > -   if (*tgt == '"')
> > -   shortening += 1;
> > -   if (*(tgt + vallen + shortening) == '"')
> > -   shortening += 1;
> > +   if ((*tgt == '"') && (*(tgt + vallen + shortening) == '"'))
> > +   shortening += 2;
> >  
> > /* remove one leading and one trailing quote if both are
> > present */ if ((val[0] == '"') && (val[vallen - 1] == '"')) {
> > diff --git a/lib/cmdline.c b/lib/cmdline.c
> > index 4c0888c4a68d..01e701b2afe8 100644
> > --- a/lib/cmdline.c
> > +++ b/lib/cmdline.c
> > @@ -227,14 +227,11 @@ char *next_arg(char *args, char **param, char
> > **val) *val = args + equals + 1;
> >  
> > /* Don't include quotes in value. */
> > -   if (**val == '"') {
> > +   if ((**val == '"') && (args[i-1] == '"')) {
> > (*val)++;
> > -   if (args[i-1] == '"')
> > -   args[i-1] = '\0';
> > +   args[i-1] = '\0';
> > }
> > }
> > -   if (quoted && args[i-1] == '"')
> > -   args[i-1] = '\0';
> >  
> > if (args[i]) {
> > args[i] = '\0';
> > -- 
> > 2.10.2

[PATCH] rapidio: remove global irq spinlocks from the subsystem

2017-08-24 Thread Ioan Nicu


Locking of config and doorbell operations should be done
only if the underlying hardware requires it.

This patch removes the global spinlocks from the rapidio
subsystem and moves them to the mport drivers (fsl_rio and tsi721),
only to the necessary places. For example, local config space
read and write operations (lcread/lcwrite) are atomic in all
existing drivers, so there should be no need for locking, while
the cread/cwrite operations which generate maintenance transactions
need to be synchronized with a lock.

Later, each driver could chose to use a per-port lock instead
of a global one, or even more granular locking.

Signed-off-by: Ioan Nicu 
Signed-off-by: Frank Kunz 
---
 arch/powerpc/sysdev/fsl_rio.c| 17 +++--
 arch/powerpc/sysdev/fsl_rmu.c|  8 
 drivers/rapidio/devices/tsi721.c |  7 +++
 drivers/rapidio/rio-access.c | 40 +---
 4 files changed, 35 insertions(+), 37 deletions(-)

diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c
index 1c41c51..e9f3bc9 100644
--- a/arch/powerpc/sysdev/fsl_rio.c
+++ b/arch/powerpc/sysdev/fsl_rio.c
@@ -71,6 +71,8 @@
 #define RIWAR_WRTYP_ALLOC  0x6000
 #define RIWAR_SIZE_MASK0x003F
 
+static DEFINE_SPINLOCK(fsl_rio_config_lock);
+
 #define __fsl_read_rio_config(x, addr, err, op)\
__asm__ __volatile__(   \
"1: "op" %1,0(%2)\n"\
@@ -184,6 +186,7 @@ static int fsl_local_config_write(struct rio_mport *mport,
u8 hopcount, u32 offset, int len, u32 *val)
 {
struct rio_priv *priv = mport->priv;
+   unsigned long flags;
u8 *data;
u32 rval, err = 0;
 
@@ -197,6 +200,8 @@ static int fsl_local_config_write(struct rio_mport *mport,
if (offset > (0x100 - len) || !IS_ALIGNED(offset, len))
return -EINVAL;
 
+   spin_lock_irqsave(&fsl_rio_config_lock, flags);
+
out_be32(&priv->maint_atmu_regs->rowtar,
 (destid << 22) | (hopcount << 12) | (offset >> 12));
out_be32(&priv->maint_atmu_regs->rowtear, (destid >> 10));
@@ -213,6 +218,7 @@ static int fsl_local_config_write(struct rio_mport *mport,
__fsl_read_rio_config(rval, data, err, "lwz");
break;
default:
+   spin_unlock_irqrestore(&fsl_rio_config_lock, flags);
return -EINVAL;
}
 
@@ -221,6 +227,7 @@ static int fsl_local_config_write(struct rio_mport *mport,
 err, destid, hopcount, offset);
}
 
+   spin_unlock_irqrestore(&fsl_rio_config_lock, flags);
*val = rval;
 
return err;
@@ -244,7 +251,10 @@ static int fsl_local_config_write(struct rio_mport *mport,
u8 hopcount, u32 offset, int len, u32 val)
 {
struct rio_priv *priv = mport->priv;
+   unsigned long flags;
u8 *data;
+   int ret = 0;
+
pr_debug
("fsl_rio_config_write:"
" index %d destid %d hopcount %d offset %8.8x len %d val 
%8.8x\n",
@@ -255,6 +265,8 @@ static int fsl_local_config_write(struct rio_mport *mport,
if (offset > (0x100 - len) || !IS_ALIGNED(offset, len))
return -EINVAL;
 
+   spin_lock_irqsave(&fsl_rio_config_lock, flags);
+
out_be32(&priv->maint_atmu_regs->rowtar,
 (destid << 22) | (hopcount << 12) | (offset >> 12));
out_be32(&priv->maint_atmu_regs->rowtear, (destid >> 10));
@@ -271,10 +283,11 @@ static int fsl_local_config_write(struct rio_mport *mport,
out_be32((u32 *) data, val);
break;
default:
-   return -EINVAL;
+   ret = -EINVAL;
}
+   spin_unlock_irqrestore(&fsl_rio_config_lock, flags);
 
-   return 0;
+   return ret;
 }
 
 static void fsl_rio_inbound_mem_init(struct rio_priv *priv)
diff --git a/arch/powerpc/sysdev/fsl_rmu.c b/arch/powerpc/sysdev/fsl_rmu.c
index c1826de..c15a17a 100644
--- a/arch/powerpc/sysdev/fsl_rmu.c
+++ b/arch/powerpc/sysdev/fsl_rmu.c
@@ -104,6 +104,8 @@
 
 #define DOORBELL_MESSAGE_SIZE  0x08
 
+static DEFINE_SPINLOCK(fsl_rio_doorbell_lock);
+
 struct rio_msg_regs {
u32 omr;
u32 osr;
@@ -626,9 +628,13 @@ int fsl_rio_port_write_init(struct fsl_rio_pw *pw)
 int fsl_rio_doorbell_send(struct rio_mport *mport,
int index, u16 destid, u16 data)
 {
+   unsigned long flags;
+
pr_debug("fsl_doorbell_send: index %d destid %4.4x data %4.4x\n",
 index, destid, data);
 
+   spin_lock_irqsave(&fsl_rio_doorbell_lock, flags);
+
/* In the serial version silicons, such as MPC8548, MPC8641,
 * below operations is must be.
 */
@@ -638,6 +644,8 @@ int fsl_rio_doorbell_send(struct rio_mport *mport,
out_be32(&dbell->dbell_regs->oddatr, (index << 20) | data);
out_be32(&d

Re: [PATCH 1/2] powerpc/workqueue: update list of possible CPUs

2017-08-24 Thread Laurent Vivier

On 23/08/2017 15:26, Tejun Heo wrote:
> Hello, Michael.
> 
> On Wed, Aug 23, 2017 at 09:00:39PM +1000, Michael Ellerman wrote:
>>> I don't think that's true.  The CPU id used in kernel doesn't have to
>>> match the physical one and arch code should be able to pre-map CPU IDs
>>> to nodes and use the matching one when hotplugging CPUs.  I'm not
>>> saying that's the best way to solve the problem tho.
>>
>> We already virtualise the CPU numbers, but not the node IDs. And it's
>> the node IDs that are really the problem.
> 
> Yeah, it just needs to match up new cpus to the cpu ids assigned to
> the right node.

We are not able to assign the cpu ids to the right node before the CPU
is present, because firmware doesn't provide CPU mapping <-> node id
before that.

>>> It could be that the best way forward is making cpu <-> node mapping
>>> dynamic and properly synchronized.
>>
>> We don't need it to be dynamic (at least for this bug).
> 
> The node mapping for that cpu id changes *dynamically* while the
> system is running and that can race with node-affinity sensitive
> operations such as memory allocations.

Memory is mapped to the node through its own firmware entry, so I don't
think cpu id change can affect memory affinity, and before we know the
node id of the CPU, the CPU is not present and thus it can't use memory.

>> Laurent is booting Qemu with a fixed CPU <-> Node mapping, it's just
>> that because some CPUs aren't present at boot we don't know what the
>> node mapping is. (Correct me if I'm wrong Laurent).
>>
>> So all we need is:
>>  - the workqueue code to cope with CPUs that are possible but not online
>>having NUMA_NO_NODE to begin with.
>>  - a way to update the workqueue cpumask when the CPU comes online.
>>
>> Which seems reasonable to me?
> 
> Please take a step back and think through the problem again.  You
> can't bandaid it this way.

Could you give some ideas, proposals?
As the firmware doesn't provide the information before the CPU is really
plugged, I really don't know how to manage this problem.

Thanks,
Laurent

Re: [PATCH v7 03/12] powerpc/vas: Define vas_init() and vas_exit()

2017-08-24 Thread Michael Ellerman

Hi Suka,

Comments inline ...

Sukadev Bhattiprolu  writes:
> diff --git a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt 
> b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt
> new file mode 100644
> index 000..0e3111d
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt
> @@ -0,0 +1,24 @@
> +* IBM Powerpc Virtual Accelerator Switchboard (VAS)
> +
> +VAS is a hardware mechanism that allows kernel subsystems and user processes
> +to directly submit compression and other requests to Nest accelerators (NX)
> +or other coprocessors functions.
> +
> +Required properties:
> +- compatible : should be "ibm,vas" or "ibm,power9-vas"

The driver doesn't look for the latter.

> +- ibm,vas-id : A unique identifier for each instance of VAS in the system

What is this?

> +- reg : Should contain 4 pairs of 64-bit fields specifying the Hypervisor
> +  window context start and length, OS/User window context start and length,
> +  "Paste address" start and length, "Paste window id" start bit and number
> +  of bits)
> +- name : "vas"

I don't think the name is normally included in the binding, and in fact
there's no reason why the name is important, so I'd be inclined to drop that.

> diff --git a/MAINTAINERS b/MAINTAINERS
> index 3c41902..abc235f 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -6425,6 +6425,14 @@ F: drivers/crypto/nx/nx.*
>  F:   drivers/crypto/nx/nx_csbcpb.h
>  F:   drivers/crypto/nx/nx_debugfs.h
>  
> +IBM Power Virtual Accelerator Switchboard
> +M:   Sukadev Bhattiprolu
> +L:   linuxppc-dev@lists.ozlabs.org
> +S:   Supported
> +F:   arch/powerpc/platforms/powernv/vas*
> +F:   arch/powerpc/include/asm/vas.h
> +F:   arch/powerpc/include/uapi/asm/vas.h

That's not in the right place, the file is sorted alphabetically.

V comes after L.

> diff --git a/arch/powerpc/platforms/powernv/Kconfig 
> b/arch/powerpc/platforms/powernv/Kconfig
> index 6a6f4ef..f565454 100644
> --- a/arch/powerpc/platforms/powernv/Kconfig
> +++ b/arch/powerpc/platforms/powernv/Kconfig
> @@ -30,3 +30,17 @@ config OPAL_PRD
>   help
> This enables the opal-prd driver, a facility to run processor
> recovery diagnostics on OpenPower machines
> +
> +config PPC_VAS
> + bool "IBM Virtual Accelerator Switchboard (VAS)"

^ bool, so never a module.

> + depends on PPC_POWERNV && PPC_64K_PAGES
> + default n

It should be default y.

I know the usual advice is to make things 'default n', but this has
fairly tight depends already, so y is OK IMO.

> diff --git a/arch/powerpc/platforms/powernv/vas.c 
> b/arch/powerpc/platforms/powernv/vas.c
> new file mode 100644
> index 000..556156b
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/vas.c
> @@ -0,0 +1,183 @@
> +/*
> + * Copyright 2016 IBM Corp.

2016-2017.

> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + */

#define pr_fmt(fmt) "vas: " fmt

> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "vas.h"
> +
> +static bool init_done;
> +LIST_HEAD(vas_instances);

Can be static.

> +
> +static int init_vas_instance(struct platform_device *pdev)
> +{
> + int rc, vasid;
> + struct vas_instance *vinst;
> + struct device_node *dn = pdev->dev.of_node;
> + struct resource *res;

struct device_node *dn = pdev->dev.of_node;
struct vas_instance *vinst;
struct resource *res;
int rc, vasid;

Petty I know, but much prettier :)

> +
> + rc = of_property_read_u32(dn, "ibm,vas-id", &vasid);
> + if (rc) {
> + pr_err("VAS: No ibm,vas-id property for %s?\n", pdev->name);

With the pr_fmt() above you don't need VAS: on the front of all these.

> + return -ENODEV;
> + }
> +
> + if (pdev->num_resources != 4) {
> + pr_err("VAS: Unexpected DT configuration for [%s, %d]\n",
> + pdev->name, vasid);
> + return -ENODEV;
> + }
> +
> + vinst = kcalloc(1, sizeof(*vinst), GFP_KERNEL);

kzalloc() would be more normal given there's only 1.

> + if (!vinst)
> + return -ENOMEM;
> +
> + INIT_LIST_HEAD(&vinst->node);
> + ida_init(&vinst->ida);
> + mutex_init(&vinst->mutex);
> + vinst->vas_id = vasid;
> + vinst->pdev = pdev;
> +
> + res = &pdev->resource[0];
> + vinst->hvwc_bar_start = res->start;
> + vinst->hvwc_bar_len = res->end - res->start + 1;
> +
> + res = &pdev->resource[1];
> + vinst->uwc_bar_start = res->start;
> + vinst->uwc_bar_len = res->end - res->start + 1;

You have vinst->pdev, why do you need to copy all those?

I don't see the lens even used.

> + res = &pdev->resource[2];
> + vinst->paste_base_addr = res->start;
> +
> +

Re: [PATCH v7 4/4] boot/param: add pointer to next argument to unknown parameter callback

2017-08-24 Thread Michael Ellerman

Michal Suchanek  writes:

> The fadump parameter processing re-does the logic of next_arg quote
> stripping to determine where the argument ends. Pass pointer to the
> next argument instead to make this more robust.
>
> Signed-off-by: Michal Suchanek 
> ---
>  arch/powerpc/kernel/fadump.c  | 13 +
>  arch/powerpc/mm/hugetlbpage.c |  4 ++--
>  include/linux/moduleparam.h   |  2 +-
>  init/main.c   | 12 ++--
>  kernel/module.c   |  4 ++--
>  kernel/params.c   | 19 +++
>  lib/dynamic_debug.c   |  2 +-
>  7 files changed, 28 insertions(+), 28 deletions(-)

Can you split out a patch that adds the next argument and updates the
callers. And then a patch for the fadump to use the new arg.

cheers

> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
> index d7da4ce9f7ae..6ef96711ee9a 100644
> --- a/arch/powerpc/kernel/fadump.c
> +++ b/arch/powerpc/kernel/fadump.c
> @@ -474,13 +474,14 @@ struct param_info {
>  };
>  
>  static void __init fadump_update_params(struct param_info *param_info,
> - char *param, char *val)
> + char *param, char *val, char *next)
>  {
>   ptrdiff_t param_offset = param - param_info->tmp_cmdline;
>   size_t vallen = val ? strlen(val) : 0;
>   char *tgt = param_info->cmdline + param_offset +
>   FADUMP_EXTRA_ARGS_LEN - param_info->shortening;
> - int shortening = 0;
> + int shortening = ((next - 1) - (param))
> + - (FADUMP_EXTRA_ARGS_LEN + 1 + vallen);
>  
>   if (!val)
>   return;
> @@ -488,10 +489,6 @@ static void __init fadump_update_params(struct 
> param_info *param_info,
>   /* remove '=' */
>   *tgt++ = ' ';
>  
> - /* next_arg removes one leading and one trailing '"' */
> - if ((*tgt == '"') && (*(tgt + vallen + shortening) == '"'))
> - shortening += 2;
> -
>   /* remove one leading and one trailing quote if both are present */
>   if ((val[0] == '"') && (val[vallen - 1] == '"')) {
>   shortening += 2;
> @@ -517,7 +514,7 @@ static void __init fadump_update_params(struct param_info 
> *param_info,
>   * to enforce the parameters passed through it
>   */
>  static int __init fadump_rework_cmdline_params(char *param, char *val,
> -const char *unused, void *arg)
> + char *next, const char *unused, void *arg)
>  {
>   struct param_info *param_info = (struct param_info *)arg;
>  
> @@ -525,7 +522,7 @@ static int __init fadump_rework_cmdline_params(char 
> *param, char *val,
>strlen(FADUMP_EXTRA_ARGS_PARAM) - 1))
>   return 0;
>  
> - fadump_update_params(param_info, param, val);
> + fadump_update_params(param_info, param, val, next);
>  
>   return 0;
>  }
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index e1bf5ca397fe..3a4cce552906 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -268,8 +268,8 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
>  
>  unsigned long gpage_npages[MMU_PAGE_COUNT];
>  
> -static int __init do_gpage_early_setup(char *param, char *val,
> -const char *unused, void *arg)
> +static int __init do_gpage_early_setup(char *param, char *val, char *unused1,
> +const char *unused2, void *arg)
>  {
>   static phys_addr_t size;
>   unsigned long npages;
> diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
> index 1ee7b30dafec..fec05a186c08 100644
> --- a/include/linux/moduleparam.h
> +++ b/include/linux/moduleparam.h
> @@ -326,7 +326,7 @@ extern char *parse_args(const char *name,
> s16 level_min,
> s16 level_max,
> void *arg,
> -   int (*unknown)(char *param, char *val,
> +   int (*unknown)(char *param, char *val, char *next,
>const char *doing, void *arg));
>  
>  /* Called by module remove. */
> diff --git a/init/main.c b/init/main.c
> index 052481fbe363..920c3564b2f0 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -239,7 +239,7 @@ static int __init loglevel(char *str)
>  early_param("loglevel", loglevel);
>  
>  /* Change NUL term back to "=", to make "param" the whole string. */
> -static int __init repair_env_string(char *param, char *val,
> +static int __init repair_env_string(char *param, char *val, char *unused2,
>   const char *unused, void *arg)
>  {
>   if (val) {
> @@ -257,7 +257,7 @@ static int __init repair_env_string(char *param, char 
> *val,
>  }
>  
>  /* Anything after -- gets handed straight to init. */
> -static int __init set_init_arg(char *param, char *val,
> +static int __init set_init_arg(char *param, char *val, c

Re: [PATCH v7 3/4] lib/cmdline.c Remove quotes symmetrically.

2017-08-24 Thread Michael Ellerman

Michal Suchanek  writes:

> Remove quotes from argument value only if there is qoute on both sides.
>
> Signed-off-by: Michal Suchanek 
> ---
>  arch/powerpc/kernel/fadump.c | 6 ++
>  lib/cmdline.c| 7 ++-

Can you split that into two patches?

cheers

>  2 files changed, 4 insertions(+), 9 deletions(-)
>
> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
> index a1614d9b8a21..d7da4ce9f7ae 100644
> --- a/arch/powerpc/kernel/fadump.c
> +++ b/arch/powerpc/kernel/fadump.c
> @@ -489,10 +489,8 @@ static void __init fadump_update_params(struct 
> param_info *param_info,
>   *tgt++ = ' ';
>  
>   /* next_arg removes one leading and one trailing '"' */
> - if (*tgt == '"')
> - shortening += 1;
> - if (*(tgt + vallen + shortening) == '"')
> - shortening += 1;
> + if ((*tgt == '"') && (*(tgt + vallen + shortening) == '"'))
> + shortening += 2;
>  
>   /* remove one leading and one trailing quote if both are present */
>   if ((val[0] == '"') && (val[vallen - 1] == '"')) {
> diff --git a/lib/cmdline.c b/lib/cmdline.c
> index 4c0888c4a68d..01e701b2afe8 100644
> --- a/lib/cmdline.c
> +++ b/lib/cmdline.c
> @@ -227,14 +227,11 @@ char *next_arg(char *args, char **param, char **val)
>   *val = args + equals + 1;
>  
>   /* Don't include quotes in value. */
> - if (**val == '"') {
> + if ((**val == '"') && (args[i-1] == '"')) {
>   (*val)++;
> - if (args[i-1] == '"')
> - args[i-1] = '\0';
> + args[i-1] = '\0';
>   }
>   }
> - if (quoted && args[i-1] == '"')
> - args[i-1] = '\0';
>  
>   if (args[i]) {
>   args[i] = '\0';
> -- 
> 2.10.2

Re: [PATCH V9 1/2] powerpc/numa: Update CPU topology when VPHN enabled

2017-08-24 Thread Michael Ellerman

Nathan Fontenot  writes:

> On 08/23/2017 06:41 AM, Michael Ellerman wrote:
>> Michael Bringmann  writes:
>> 
>>> powerpc/numa: Correct the currently broken capability to set the
>>> topology for shared CPUs in LPARs.  At boot time for shared CPU
>>> lpars, the topology for each shared CPU is set to node zero, however,
>>> this is now updated correctly using the Virtual Processor Home Node
>>> (VPHN) capabilities information provided by the pHyp.
>>>
>>> Also, update initialization checks for device-tree attributes to
>>> independently recognize PRRN or VPHN usage.
>> 
>> Did you ever do anything to address Nathan's comments on v4 ?
>> 
>>   http://patchwork.ozlabs.org/patch/767587/
>
> Looking at this patch I do not see that VPHN is always enabled.

You mean *this* patch? Or v4?

I think you mean this patch, in which case I agree.

>> Also your change log doesn't describe anything about what the patch does
>> and why it is the correct fix for the problem.
>> 
>> When a DLPAR happens you modify the VPHN timer to run in 1 nsec, but you
>> don't wait for it. Why would we not just run the logic synchronously?
>> 
>> It also seems to make VPHN and PRRN no longer exclusive, which looking
>> at PAPR seems like it might be correct, but is also a major change so
>> please justify it in detail.
>
> This is correct, they are not exclusive. When we first added PRRN support
> we mistakenly thought they were exclusive which is why the code currently
> only starts PRRN, or VPHN if PRRN is not present.

OK.

So we need a patch that does that and only that, and clearly explains
why we're doing that and why it's the correct thing to do.

Then a 2nd patch can fiddle with the timer, if we must.

...
>>> +static int topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;
>>> +static int topology_inited;
>>> +static int topology_update_needed;
>> 
>> None of this code should be in numa.c. Which is not your fault but I'm
>> inclined to move it before we make it worse.
>
> Agreed. Perhaps this should all be in mm/vphn.c

Actually I was thinking platforms/pseries/vphn.c

cheers

Re: [PATCH 4/4] powerpc/32: remove a NOP from memset()

2017-08-24 Thread Michael Ellerman

Christophe Leroy  writes:

> memset() is patched after initialisation to activate the
> optimised part which uses cache instructions.
>
> Today we have a 'b 2f' to skip the optimised patch, which then gets
> replaced by a NOP, implying a useless cycle consumption.
> As we have a 'bne 2f' just before, we could use that instruction
> for the live patching, hence removing the need to have a
> dedicated 'b 2f' to be replaced by a NOP.
>
> This patch changes the 'bne 2f' by a 'b 2f'. During init, that
> 'b 2f' is then replaced by 'bne 2f'

I'm not sure what the sequence is during boot for the 32-bit code, but
can you use an ALT_FTR section for this? Possibly that doesn't get done
at the right time though.

cheers

[PATCH] powerpc: Fix DAR reporting when alignment handler faults

2017-08-24 Thread Michael Ellerman

Anton noticed that if we fault part way through emulating an unaligned
instruction, we don't update the DAR to reflect that.

The DAR value is eventually reported back to userspace as the address
in the SEGV signal, and if userspace is using that value to demand
fault then it can be confused by us not setting the value correctly.

This patch is ugly as hell, but is intended to be the minimal fix and
back ports easily.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/kernel/align.c | 119 +++-
 1 file changed, 74 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index ec7a8b099dd9..fd3c1fcc73eb 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -235,6 +235,28 @@ static int emulate_dcbz(struct pt_regs *regs, unsigned 
char __user *addr)
 
 #define SWIZ_PTR(p)((unsigned char __user *)((p) ^ swiz))
 
+#define __get_user_or_set_dar(_regs, _dest, _addr) \
+   ({  \
+   int rc = 0; \
+   typeof(_addr) __addr = (_addr); \
+   if (__get_user_inatomic(_dest, __addr)) {   \
+   _regs->dar = (unsigned long)__addr; \
+   rc = -EFAULT;   \
+   }   \
+   rc; \
+   })
+
+#define __put_user_or_set_dar(_regs, _src, _addr)  \
+   ({  \
+   int rc = 0; \
+   typeof(_addr) __addr = (_addr); \
+   if (__put_user_inatomic(_src, __addr)) {\
+   _regs->dar = (unsigned long)__addr; \
+   rc = -EFAULT;   \
+   }   \
+   rc; \
+   })
+
 static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr,
unsigned int reg, unsigned int nb,
unsigned int flags, unsigned int instr,
@@ -263,9 +285,10 @@ static int emulate_multiple(struct pt_regs *regs, unsigned 
char __user *addr,
} else {
unsigned long pc = regs->nip ^ (swiz & 4);
 
-   if (__get_user_inatomic(instr,
-   (unsigned int __user *)pc))
+   if (__get_user_or_set_dar(regs, instr,
+ (unsigned int __user *)pc))
return -EFAULT;
+
if (swiz == 0 && (flags & SW))
instr = cpu_to_le32(instr);
nb = (instr >> 11) & 0x1f;
@@ -309,31 +332,31 @@ static int emulate_multiple(struct pt_regs *regs, 
unsigned char __user *addr,
   ((nb0 + 3) / 4) * sizeof(unsigned long));
 
for (i = 0; i < nb; ++i, ++p)
-   if (__get_user_inatomic(REG_BYTE(rptr, i ^ bswiz),
-   SWIZ_PTR(p)))
+   if (__get_user_or_set_dar(regs, REG_BYTE(rptr, i ^ 
bswiz),
+ SWIZ_PTR(p)))
return -EFAULT;
if (nb0 > 0) {
rptr = ®s->gpr[0];
addr += nb;
for (i = 0; i < nb0; ++i, ++p)
-   if (__get_user_inatomic(REG_BYTE(rptr,
-i ^ bswiz),
-   SWIZ_PTR(p)))
+   if (__get_user_or_set_dar(regs,
+ REG_BYTE(rptr, i ^ 
bswiz),
+ SWIZ_PTR(p)))
return -EFAULT;
}
 
} else {
for (i = 0; i < nb; ++i, ++p)
-   if (__put_user_inatomic(REG_BYTE(rptr, i ^ bswiz),
-   SWIZ_PTR(p)))
+   if (__put_user_or_set_dar(regs, REG_BYTE(rptr, i ^ 
bswiz),
+ SWIZ_PTR(p)))
return -EFAULT;
if (nb0 > 0) {
rptr = ®s->gpr[0];
addr += nb;
for (i = 0; i < nb0; ++i, ++p)
-   if (__put_user_inatomic(REG_BYTE(rptr,
-i ^ bswiz),
-

Re: [PATCH] powerpc/pseries: Don't attempt to acquire drc during memory hot add for assigned lmbs

2017-08-24 Thread Michael Ellerman

John Allen  writes:

> Check if an LMB is assigned before attempting to call dlpar_acquire_drc in
> order to avoid any unnecessary rtas calls. This substantially reduces the
> running time of memory hot add on lpars with large amounts of memory.
>
> Signed-off-by: John Allen 

I'll add:

  Fixes: c21f515c7436 ("powerpc/pseries: Make the acquire/release of the drc 
for memory a seperate step")

?

How bad is the slow down, do we need to backport to stable/distros?

cheers

Re: [PATCH] powerpc/powernv/idle: Round up latency and residency values

2017-08-24 Thread Michael Ellerman

Vaidyanathan Srinivasan  writes:

> On PowerNV platforms, firmware provides exit latency and
> target residency for each of the idle states in nano
> seconds.  Cpuidle framework expects the values in micro
> seconds.  Round up to nearest micro seconds to avoid errors
> in cases where the values are defined as fractional micro
> seconds.
>
> Default idle state of 'snooze' has exit latency of zero.  If
> other states have fractional micro second exit latency, they
> would get rounded down to zero micro second and make cpuidle
> framework choose deeper idle state when snooze loop is the
> right choice.
>
> Reported-by: Anton Blanchard 
> Signed-off-by: Vaidyanathan Srinivasan 

This sounds like a fairly bad bug, does it need a Fixes / Cc stable tag?

cheers

Re: [PATCH v2 12/14] KVM: PPC: Book3S HV: POWER9 can execute stop without a sync sequence

2017-08-24 Thread Paul Mackerras

On Sat, Aug 12, 2017 at 02:39:10AM +1000, Nicholas Piggin wrote:
> Reviewed-by: Gautham R. Shenoy 
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 24 
>  1 file changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 3e024fd71fe8..edb47738a686 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -2527,7 +2527,17 @@ BEGIN_FTR_SECTION
>  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
>  
>  kvm_nap_sequence:/* desired LPCR value in r5 */
> -BEGIN_FTR_SECTION
> +BEGIN_FTR_SECTION/* nap sequence */
> + mtspr   SPRN_LPCR,r5
> + isync
> + li  r0, 0
> + std r0, HSTATE_SCRATCH0(r13)
> + ptesync
> + ld  r0, HSTATE_SCRATCH0(r13)
> +1:   cmpdr0, r0
> + bne 1b
> + nap
> +FTR_SECTION_ELSE /* stop sequence */
>   /*
>* PSSCR bits:  exit criterion = 1 (wakeup based on LPCR at sreset)
>*  enable state loss = 1 (allow SMT mode switch)
> @@ -2539,18 +2549,8 @@ BEGIN_FTR_SECTION
>   li  r4, LPCR_PECE_HVEE@higher
>   sldir4, r4, 32
>   or  r5, r5, r4
> -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
>   mtspr   SPRN_LPCR,r5
> - isync
> - li  r0, 0
> - std r0, HSTATE_SCRATCH0(r13)
> - ptesync
> - ld  r0, HSTATE_SCRATCH0(r13)
> -1:   cmpdr0, r0
> - bne 1b
> -BEGIN_FTR_SECTION
> - nap
> -FTR_SECTION_ELSE
> +
>   PPC_STOP
>  ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
>   b   .
> -- 
> 2.13.3

Currently we never get to kvm_nap_sequence on POWER9 because we are
always running one vcpu per vcore, so I haven't worried about this
code too much.  In future we might need this for running HPT guests on
a radix host, though.

Paul.

Re: [PATCH v2 10/14] KVM: PPC: Book3S HV: POWER9 does not require secondary thread management

2017-08-24 Thread Paul Mackerras

On Sat, Aug 12, 2017 at 02:39:08AM +1000, Nicholas Piggin wrote:
> POWER9 CPUs have independent MMU contexts per thread, so KVM does not
> need to quiesce secondary threads, so the hwthread_req/hwthread_state
> protocol does not have to be used. So patch it away on POWER9, and patch
> away the branch from the Linux idle wakeup to kvm_start_guest that is
> never used.

If/when we add support for running HPT guests on a radix host, we will
have to run the host in single-threaded mode (since POWER9 doesn't
support having some threads of a core using HPT and some using radix
simultaneously).  We'll then need some sort of thing like
kvmppc_grab_hwthread to coordinate with the threads so that guests can
use the secondary threads.

So I think most of this code should stay.  We will still need to have
a way to make sure that the secondaries are in real mode and not in a
guest, because all threads will need to be in real mode when switching
the core between radix and HPT mode.  Maybe we can optimize it a bit
at present given that we don't yet support running HPT guests on a
radix host, but I don't want to make it harder to do that in future.

Paul.

Re: [PATCH v2 1/1] futex: remove duplicated code and fix UB

2017-08-24 Thread Will Deacon

Hi Jiri,

On Thu, Aug 24, 2017 at 09:31:05AM +0200, Jiri Slaby wrote:
> There is code duplicated over all architecture's headers for
> futex_atomic_op_inuser. Namely op decoding, access_ok check for uaddr,
> and comparison of the result.
> 
> Remove this duplication and leave up to the arches only the needed
> assembly which is now in arch_futex_atomic_op_inuser.
> 
> This effectively distributes the Will Deacon's arm64 fix for undefined
> behaviour reported by UBSAN to all architectures. The fix was done in
> commit 5f16a046f8e1 (arm64: futex: Fix undefined behaviour with
> FUTEX_OP_OPARG_SHIFT usage). Look there for an example dump.
> 
> And as suggested by Thomas, check for negative oparg too, because it was
> also reported to cause undefined behaviour report.
> 
> Note that s390 removed access_ok check in d12a29703 ("s390/uaccess:
> remove pointless access_ok() checks") as access_ok there returns true.
> We introduce it back to the helper for the sake of simplicity (it gets
> optimized away anyway).

For arm64 and the core code:

Reviewed-by: Will Deacon 

Although one minor thing on the core part:

> diff --git a/kernel/futex.c b/kernel/futex.c
> index 0939255fc750..3d38eaf05492 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -1551,6 +1551,45 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int 
> nr_wake, u32 bitset)
>   return ret;
>  }
>  
> +static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
> +{
> + unsigned int op = (encoded_op & 0x7000) >> 28;
> + unsigned int cmp =(encoded_op & 0x0f00) >> 24;
> + int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12);
> + int cmparg = sign_extend32(encoded_op & 0x0fff, 12);
> + int oldval, ret;
> +
> + if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
> + if (oparg < 0 || oparg > 31)
> + return -EINVAL;
> + oparg = 1 << oparg;
> + }
> +
> + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
> + return -EFAULT;
> +
> + ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
> + if (ret)
> + return ret;

We could move the pagefault_{disable,enable} calls here, and then remove
them from the futex_atomic_op_inuser callsites elsewhere in futex.c

Will

Re: [PATCH v2 1/1] Split VGA default device handler out of VGA arbiter

2017-08-24 Thread Ard Biesheuvel

On 24 August 2017 at 01:57, Dave Airlie  wrote:
>> Yeah, maybe it's time to disconnect the "default display device" idea
>> from the VGA arbiter.  I have no idea what (if any) dependencies X has
>> on the legacy VGA resources.  I assume X works fine on power, where it
>> sounds like those resources are rarely or never available.
>
> The question on non-x86 archs, is what is the correct device to default to.
>
> On x86 we use the legacy VGA resources as a pointer, as this is the device
> the BIOS appeared on at boot so hopefully should be one you can see stuff on.
>
> On non-x86 I've no idea how to decide if there are multiple devices, maybe the
> firmware needs to tag something for the kernel if there are. Otherwise
> you'd just
> be picking something in probe order.
>
> I think the idea of these patches is to separate default display
> device from the arbiter.
>
> X uses the arbiter on x86 if required (it's horrible, and it's rare we
> have to nowadays),
> but for finding the default device it justs uses the sysfs boot_vga flag.
>

Part of the problem is that X refuses to start if there is only one
display device to begin with in case it hasn't taken ownership of the
VGA legacy resources.

Re: [PATCH v2] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()

2017-08-24 Thread Paul Mackerras

On Thu, Aug 24, 2017 at 07:11:38PM +1000, Paul Mackerras wrote:

Ignore this.  My apologies.

Paul.

[PATCH really v2] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()

2017-08-24 Thread Paul Mackerras

Nixiaoming pointed out that there is a memory leak in
kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd()
fails; the memory allocated for the kvmppc_spapr_tce_table struct
is not freed, and nor are the pages allocated for the iommu
tables.  In addition, we have already incremented the process's
count of locked memory pages, and this doesn't get restored on
error.

David Hildenbrand pointed out that there is a race in that the
function checks early on that there is not already an entry in the
stt->iommu_tables list with the same LIOBN, but an entry with the
same LIOBN could get added between then and when the new entry is
added to the list.

This fixes all three problems.  To simplify things, we now call
anon_inode_getfd() before placing the new entry in the list.  The
check for an existing entry is done while holding the kvm->lock
mutex, immediately before adding the new entry to the list.
Finally, on failure we now call kvmppc_account_memlimit to
decrement the process's count of locked memory pages.

Reported-by: Nixiaoming 
Reported-by: David Hildenbrand 
Signed-off-by: Paul Mackerras 
---
v2: Don't overwrite stt in loop over spapr_tce_tables

 arch/powerpc/kvm/book3s_64_vio.c | 56 
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index a160c14..53766e2 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -294,32 +294,26 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
   struct kvm_create_spapr_tce_64 *args)
 {
struct kvmppc_spapr_tce_table *stt = NULL;
+   struct kvmppc_spapr_tce_table *siter;
unsigned long npages, size;
int ret = -ENOMEM;
int i;
+   int fd = -1;
 
if (!args->size)
return -EINVAL;
 
-   /* Check this LIOBN hasn't been previously allocated */
-   list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-   if (stt->liobn == args->liobn)
-   return -EBUSY;
-   }
-
size = _ALIGN_UP(args->size, PAGE_SIZE >> 3);
npages = kvmppc_tce_pages(size);
ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
-   if (ret) {
-   stt = NULL;
-   goto fail;
-   }
+   if (ret)
+   return ret;
 
ret = -ENOMEM;
stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
  GFP_KERNEL);
if (!stt)
-   goto fail;
+   goto fail_acct;
 
stt->liobn = args->liobn;
stt->page_shift = args->page_shift;
@@ -334,24 +328,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
goto fail;
}
 
-   kvm_get_kvm(kvm);
+   ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+   stt, O_RDWR | O_CLOEXEC);
+   if (ret < 0)
+   goto fail;
 
mutex_lock(&kvm->lock);
-   list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
+
+   /* Check this LIOBN hasn't been previously allocated */
+   ret = 0;
+   list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) {
+   if (siter->liobn == args->liobn) {
+   ret = -EBUSY;
+   break;
+   }
+   }
+
+   if (!ret) {
+   list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
+   kvm_get_kvm(kvm);
+   }
 
mutex_unlock(&kvm->lock);
 
-   return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
-   stt, O_RDWR | O_CLOEXEC);
+   if (!ret)
+   return fd;
 
-fail:
-   if (stt) {
-   for (i = 0; i < npages; i++)
-   if (stt->pages[i])
-   __free_page(stt->pages[i]);
+   put_unused_fd(fd);
 
-   kfree(stt);
-   }
+ fail:
+   for (i = 0; i < npages; i++)
+   if (stt->pages[i])
+   __free_page(stt->pages[i]);
+
+   kfree(stt);
+ fail_acct:
+   kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
return ret;
 }
 
-- 
2.7.4

[PATCH v2] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()

2017-08-24 Thread Paul Mackerras

Nixiaoming pointed out that there is a memory leak in
kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd()
fails; the memory allocated for the kvmppc_spapr_tce_table struct
is not freed, and nor are the pages allocated for the iommu
tables.  In addition, we have already incremented the process's
count of locked memory pages, and this doesn't get restored on
error.

David Hildenbrand pointed out that there is a race in that the
function checks early on that there is not already an entry in the
stt->iommu_tables list with the same LIOBN, but an entry with the
same LIOBN could get added between then and when the new entry is
added to the list.

This fixes all three problems.  To simplify things, we now call
anon_inode_getfd() before placing the new entry in the list.  The
check for an existing entry is done while holding the kvm->lock
mutex, immediately before adding the new entry to the list.
Finally, on failure we now call kvmppc_account_memlimit to
decrement the process's count of locked memory pages.

Reported-by: Nixiaoming 
Reported-by: David Hildenbrand 
Signed-off-by: Paul Mackerras 
---
v2: Don't overwrite stt in loop over kvm->arch.spapr_tce_tables

 arch/powerpc/kvm/book3s_64_vio.c | 55 
 1 file changed, 33 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index a160c14..d463c1c 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -297,29 +297,22 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
unsigned long npages, size;
int ret = -ENOMEM;
int i;
+   int fd = -1;
 
if (!args->size)
return -EINVAL;
 
-   /* Check this LIOBN hasn't been previously allocated */
-   list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-   if (stt->liobn == args->liobn)
-   return -EBUSY;
-   }
-
size = _ALIGN_UP(args->size, PAGE_SIZE >> 3);
npages = kvmppc_tce_pages(size);
ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
-   if (ret) {
-   stt = NULL;
-   goto fail;
-   }
+   if (ret)
+   return ret;
 
ret = -ENOMEM;
stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
  GFP_KERNEL);
if (!stt)
-   goto fail;
+   goto fail_acct;
 
stt->liobn = args->liobn;
stt->page_shift = args->page_shift;
@@ -334,24 +327,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
goto fail;
}
 
-   kvm_get_kvm(kvm);
+   ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+   stt, O_RDWR | O_CLOEXEC);
+   if (ret < 0)
+   goto fail;
 
mutex_lock(&kvm->lock);
-   list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
+
+   /* Check this LIOBN hasn't been previously allocated */
+   ret = 0;
+   list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+   if (stt->liobn == args->liobn) {
+   ret = -EBUSY;
+   break;
+   }
+   }
+
+   if (!ret) {
+   list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
+   kvm_get_kvm(kvm);
+   }
 
mutex_unlock(&kvm->lock);
 
-   return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
-   stt, O_RDWR | O_CLOEXEC);
+   if (!ret)
+   return fd;
 
-fail:
-   if (stt) {
-   for (i = 0; i < npages; i++)
-   if (stt->pages[i])
-   __free_page(stt->pages[i]);
+   put_unused_fd(fd);
 
-   kfree(stt);
-   }
+ fail:
+   for (i = 0; i < npages; i++)
+   if (stt->pages[i])
+   __free_page(stt->pages[i]);
+
+   kfree(stt);
+ fail_acct:
+   kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
return ret;
 }
 
-- 
2.7.4

Re: [PATCH] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()

2017-08-24 Thread Paul Mackerras

On Thu, Aug 24, 2017 at 06:43:22AM +, Nixiaoming wrote:
> >From: Paul Mackerras [mailto:pau...@ozlabs.org]  Thursday, August 24, 2017 
> >11:40 AM
> >
> >Nixiaoming pointed out that there is a memory leak in
> >kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd() fails; the 
> >memory allocated for the kvmppc_spapr_tce_table struct is not freed, and nor 
> >are the pages allocated for the iommu tables.  In addition, we have already 
> >incremented the process's count of locked memory pages, and this doesn't get 
> >restored on error.
> >
> >David Hildenbrand pointed out that there is a race in that the function 
> >checks early on that there is not already an entry in the
> >stt->iommu_tables list with the same LIOBN, but an entry with the
> >same LIOBN could get added between then and when the new entry is added to 
> >the list.
> >
> >This fixes all three problems.  To simplify things, we now call
> >anon_inode_getfd() before placing the new entry in the list.  The check for 
> >an existing entry is done while holding the kvm->lock mutex, immediately 
> >before adding the new entry to the list.
> >Finally, on failure we now call kvmppc_account_memlimit to decrement the 
> >process's count of locked memory pages.
> >
> >Reported-by: Nixiaoming 
> >Reported-by: David Hildenbrand 
> >Signed-off-by: Paul Mackerras 
> >---
> > arch/powerpc/kvm/book3s_64_vio.c | 55 
> > 
> > 1 file changed, 33 insertions(+), 22 deletions(-)
> >
> >diff --git a/arch/powerpc/kvm/book3s_64_vio.c 
> >b/arch/powerpc/kvm/book3s_64_vio.c
> >index a160c14304eb..d463c1cd0d8d 100644
> >--- a/arch/powerpc/kvm/book3s_64_vio.c
> >+++ b/arch/powerpc/kvm/book3s_64_vio.c
> >@@ -297,29 +297,22 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
> > unsigned long npages, size;
> > int ret = -ENOMEM;
> > int i;
> >+int fd = -1;
> > 
> > if (!args->size)
> > return -EINVAL;
> > 
> >-/* Check this LIOBN hasn't been previously allocated */
> >-list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
> >-if (stt->liobn == args->liobn)
> >-return -EBUSY;
> >-}
> >-
> > size = _ALIGN_UP(args->size, PAGE_SIZE >> 3);
> > npages = kvmppc_tce_pages(size);
> > ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
> >-if (ret) {
> >-stt = NULL;
> >-goto fail;
> >-}
> >+if (ret)
> >+return ret;
> > 
> > ret = -ENOMEM;
> > stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
> >   GFP_KERNEL);
> > if (!stt)
> >-goto fail;
> >+goto fail_acct;
> > 
> > stt->liobn = args->liobn;
> > stt->page_shift = args->page_shift;
> >@@ -334,24 +327,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
> > goto fail;
> > }
> > 
> >-kvm_get_kvm(kvm);
> >+ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
> >+stt, O_RDWR | O_CLOEXEC);
> >+if (ret < 0)
> >+goto fail;
> > 
> > mutex_lock(&kvm->lock);
> >-list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
> >+
> >+/* Check this LIOBN hasn't been previously allocated */
> >+ret = 0;
> >+list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
> 
> I think stt can not be used here
> need a new value for list_for_each_entry

Yes.  Good point.  New version coming.

Paul.

Re: [PATCH 10/10] powerpc/xive: fix the size of the cpumask used in xive_find_target_in_mask()

2017-08-24 Thread Cédric Le Goater

On 08/24/2017 07:49 AM, Michael Ellerman wrote:
> Michael Ellerman  writes:
> 
>> Cédric Le Goater  writes:
>>> When called from xive_irq_startup(), the size of the cpumask can be
>>> larger than nr_cpu_ids. Most of time, its value is NR_CPUS (2048).
> ...
>>
>> I guess this patch is a good fix, I'll expand the change log a bit.
> 
> Actually this got lost, because it was part of the larger series, and
> then you sent a v2 of the series and so v1 was marked superseeded :/
> 
> Anyway I've pulled this out of the series and will merge it.

I just saw your resend. Thanks for doing so,

C.

[PATCH v2 1/1] futex: remove duplicated code and fix UB

2017-08-24 Thread Jiri Slaby

There is code duplicated over all architecture's headers for
futex_atomic_op_inuser. Namely op decoding, access_ok check for uaddr,
and comparison of the result.

Remove this duplication and leave up to the arches only the needed
assembly which is now in arch_futex_atomic_op_inuser.

This effectively distributes the Will Deacon's arm64 fix for undefined
behaviour reported by UBSAN to all architectures. The fix was done in
commit 5f16a046f8e1 (arm64: futex: Fix undefined behaviour with
FUTEX_OP_OPARG_SHIFT usage). Look there for an example dump.

And as suggested by Thomas, check for negative oparg too, because it was
also reported to cause undefined behaviour report.

Note that s390 removed access_ok check in d12a29703 ("s390/uaccess:
remove pointless access_ok() checks") as access_ok there returns true.
We introduce it back to the helper for the sake of simplicity (it gets
optimized away anyway).

[v2]
- check also for negative values
- wait for Will's fix to be in upstream

Signed-off-by: Jiri Slaby 
Cc: Richard Henderson 
Cc: Ivan Kokshaysky 
Cc: Matt Turner 
Cc: Vineet Gupta 
Acked-by: Russell King 
Cc: Catalin Marinas 
Cc: Will Deacon 
Reviewed-by: Darren Hart (VMware) 
Cc: Richard Kuo 
Cc: Tony Luck 
Cc: Fenghua Yu 
Cc: Michal Simek 
Cc: Ralf Baechle 
Cc: Jonas Bonn 
Cc: Stefan Kristiansson 
Cc: Stafford Horne 
Cc: "James E.J. Bottomley" 
Cc: Helge Deller 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Acked-by: Michael Ellerman  (powerpc)
Cc: Martin Schwidefsky 
Acked-by: Heiko Carstens  [s390]
Cc: Yoshinori Sato 
Cc: Rich Felker 
Cc: "David S. Miller" 
Acked-by: Chris Metcalf  [for tile]
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Chris Zankel 
Cc: Max Filippov 
Cc: Arnd Bergmann 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
---
 arch/alpha/include/asm/futex.h  | 26 ---
 arch/arc/include/asm/futex.h| 40 -
 arch/arm/include/asm/futex.h| 26 +++
 arch/arm64/include/asm/futex.h  | 26 +++
 arch/frv/include/asm/futex.h|  3 ++-
 arch/frv/kernel/futex.c | 27 +++-
 arch/hexagon/include/asm/futex.h| 38 +++-
 arch/ia64/include/asm/futex.h   | 25 +++
 arch/microblaze/include/asm/futex.h | 38 +++-
 arch/mips/include/asm/futex.h   | 25 +++
 arch/openrisc/include/asm/futex.h   | 39 +++--
 arch/parisc/include/asm/futex.h | 26 +++
 arch/powerpc/include/asm/futex.h| 26 ---
 arch/s390/include/asm/futex.h   | 23 -
 arch/sh/include/asm/futex.h | 26 +++
 arch/sparc/include/asm/futex_64.h   | 26 ---
 arch/tile/include/asm/futex.h   | 40 -
 arch/x86/include/asm/futex.h| 40 -
 arch/xtensa/include/asm/futex.h | 27 
 include/asm-generic/futex.h | 50 +++--
 kernel/futex.c  | 39 +
 21 files changed, 130 insertions(+), 506 deletions(-)

diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h
index fb01dfb760c2..05a70edd57b6 100644
--- a/arch/alpha/include/asm/futex.h
+++ b/arch/alpha/include/asm/futex.h
@@ -25,18 +25,10 @@
:   "r" (uaddr), "r"(oparg) \
:   "memory")
 
-static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+   u32 __user *uaddr)
 {
-   int op = (encoded_op >> 28) & 7;
-   int cmp = (encoded_op >> 24) & 15;
-   int oparg = (encoded_op << 8) >> 20;
-   int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret;
-   if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-   oparg = 1 << oparg;
-
-   if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-   return -EFAULT;
 
pagefault_disable();
 
@@ -62,17 +54,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, 
u32 __user *uaddr)
 
pagefault_enable();
 
-   if (!ret) {
-   switch (cmp) {
-   case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-   case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-   case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-   case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-   case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-   case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-   default: ret = -ENOSYS;
-   }
-   }
+   if (!ret)
+   *oval = oldval;
+
return ret;
 }
 
diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h
index 1

[PATCH v3 7/7] dpaa_eth: check allocation result

2017-08-24 Thread Madalin Bucur

Signed-off-by: Madalin Bucur 
---
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index 73ca8d7..4225806 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -2561,6 +2561,9 @@ static struct dpaa_bp *dpaa_bp_alloc(struct device *dev)
 
dpaa_bp->bpid = FSL_DPAA_BPID_INV;
dpaa_bp->percpu_count = devm_alloc_percpu(dev, *dpaa_bp->percpu_count);
+   if (!dpaa_bp->percpu_count)
+   return ERR_PTR(-ENOMEM);
+
dpaa_bp->config_count = FSL_DPAA_ETH_MAX_BUF_COUNT;
 
dpaa_bp->seed_cb = dpaa_bp_seed;
-- 
2.1.0

[PATCH v3 6/7] Documentation: networking: add RSS information

2017-08-24 Thread Madalin Bucur

Signed-off-by: Madalin Bucur 
---
 Documentation/networking/dpaa.txt | 68 ++-
 1 file changed, 67 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/dpaa.txt 
b/Documentation/networking/dpaa.txt
index 76e016d..f88194f 100644
--- a/Documentation/networking/dpaa.txt
+++ b/Documentation/networking/dpaa.txt
@@ -13,6 +13,7 @@ Contents
- Configuring DPAA Ethernet in your kernel
- DPAA Ethernet Frame Processing
- DPAA Ethernet Features
+   - DPAA IRQ Affinity and Receive Side Scaling
- Debugging
 
 DPAA Ethernet Overview
@@ -147,7 +148,10 @@ gradually.
 
 The driver has Rx and Tx checksum offloading for UDP and TCP. Currently the Rx
 checksum offload feature is enabled by default and cannot be controlled through
-ethtool.
+ethtool. Also, rx-flow-hash and rx-hashing was added. The addition of RSS
+provides a big performance boost for the forwarding scenarios, allowing
+different traffic flows received by one interface to be processed by different
+CPUs in parallel.
 
 The driver has support for multiple prioritized Tx traffic classes. Priorities
 range from 0 (lowest) to 3 (highest). These are mapped to HW workqueues with
@@ -166,6 +170,68 @@ classes as follows:
 tc qdisc add dev  root handle 1: \
 mqprio num_tc 4 map 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 hw 1
 
+DPAA IRQ Affinity and Receive Side Scaling
+==
+
+Traffic coming on the DPAA Rx queues or on the DPAA Tx confirmation
+queues is seen by the CPU as ingress traffic on a certain portal.
+The DPAA QMan portal interrupts are affined each to a certain CPU.
+The same portal interrupt services all the QMan portal consumers.
+
+By default the DPAA Ethernet driver enables RSS, making use of the
+DPAA FMan Parser and Keygen blocks to distribute traffic on 128
+hardware frame queues using a hash on IP v4/v6 source and destination
+and L4 source and destination ports, in present in the received frame.
+When RSS is disabled, all traffic received by a certain interface is
+received on the default Rx frame queue. The default DPAA Rx frame
+queues are configured to put the received traffic into a pool channel
+that allows any available CPU portal to dequeue the ingress traffic.
+The default frame queues have the HOLDACTIVE option set, ensuring that
+traffic bursts from a certain queue are serviced by the same CPU.
+This ensures a very low rate of frame reordering. A drawback of this
+is that only one CPU at a time can service the traffic received by a
+certain interface when RSS is not enabled.
+
+To implement RSS, the DPAA Ethernet driver allocates an extra set of
+128 Rx frame queues that are configured to dedicated channels, in a
+round-robin manner. The mapping of the frame queues to CPUs is now
+hardcoded, there is no indirection table to move traffic for a certain
+FQ (hash result) to another CPU. The ingress traffic arriving on one
+of these frame queues will arrive at the same portal and will always
+be processed by the same CPU. This ensures intra-flow order preservation
+and workload distribution for multiple traffic flows.
+
+RSS can be turned off for a certain interface using ethtool, i.e.
+
+   # ethtool -N fm1-mac9 rx-flow-hash tcp4 ""
+
+To turn it back on, one needs to set rx-flow-hash for tcp4/6 or udp4/6:
+
+   # ethtool -N fm1-mac9 rx-flow-hash udp4 sfdn
+
+There is no independent control for individual protocols, any command
+run for one of tcp4|udp4|ah4|esp4|sctp4|tcp6|udp6|ah6|esp6|sctp6 is
+going to control the rx-flow-hashing for all protocols on that interface.
+
+Besides using the FMan Keygen computed hash for spreading traffic on the
+128 Rx FQs, the DPAA Ethernet driver also sets the skb hash value when
+the NETIF_F_RXHASH feature is on (active by default). This can be turned
+on or off through ethtool, i.e.:
+
+   # ethtool -K fm1-mac9 rx-hashing off
+   # ethtool -k fm1-mac9 | grep hash
+   receive-hashing: off
+   # ethtool -K fm1-mac9 rx-hashing on
+   Actual changes:
+   receive-hashing: on
+   # ethtool -k fm1-mac9 | grep hash
+   receive-hashing: on
+
+Please note that Rx hashing depends upon the rx-flow-hashing being on
+for that interface - turning off rx-flow-hashing will also disable the
+rx-hashing (without ethtool reporting it as off as that depends on the
+NETIF_F_RXHASH feature flag).
+
 Debugging
 =
 
-- 
2.1.0

[PATCH v3 5/7] dpaa_eth: add NETIF_F_RXHASH

2017-08-24 Thread Madalin Bucur

Set the skb hash when then FMan Keygen hash result is available.

Signed-off-by: Madalin Bucur 
---
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 23 +++---
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.h |  1 +
 drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c |  9 +++--
 drivers/net/ethernet/freescale/fman/fman_port.c| 11 +++
 drivers/net/ethernet/freescale/fman/fman_port.h|  2 ++
 5 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index 6d89e74..73ca8d7 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -236,7 +236,7 @@ static int dpaa_netdev_init(struct net_device *net_dev,
net_dev->max_mtu = dpaa_get_max_mtu();
 
net_dev->hw_features |= (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
-NETIF_F_LLTX);
+NETIF_F_LLTX | NETIF_F_RXHASH);
 
net_dev->hw_features |= NETIF_F_SG | NETIF_F_HIGHDMA;
/* The kernels enables GSO automatically, if we declare NETIF_F_SG.
@@ -2237,12 +2237,13 @@ static enum qman_cb_dqrr_result rx_default_dqrr(struct 
qman_portal *portal,
dma_addr_t addr = qm_fd_addr(fd);
enum qm_fd_format fd_format;
struct net_device *net_dev;
-   u32 fd_status;
+   u32 fd_status, hash_offset;
struct dpaa_bp *dpaa_bp;
struct dpaa_priv *priv;
unsigned int skb_len;
struct sk_buff *skb;
int *count_ptr;
+   void *vaddr;
 
fd_status = be32_to_cpu(fd->status);
fd_format = qm_fd_get_format(fd);
@@ -2288,7 +2289,8 @@ static enum qman_cb_dqrr_result rx_default_dqrr(struct 
qman_portal *portal,
dma_unmap_single(dpaa_bp->dev, addr, dpaa_bp->size, DMA_FROM_DEVICE);
 
/* prefetch the first 64 bytes of the frame or the SGT start */
-   prefetch(phys_to_virt(addr) + qm_fd_get_offset(fd));
+   vaddr = phys_to_virt(addr);
+   prefetch(vaddr + qm_fd_get_offset(fd));
 
fd_format = qm_fd_get_format(fd);
/* The only FD types that we may receive are contig and S/G */
@@ -2309,6 +2311,18 @@ static enum qman_cb_dqrr_result rx_default_dqrr(struct 
qman_portal *portal,
 
skb->protocol = eth_type_trans(skb, net_dev);
 
+   if (net_dev->features & NETIF_F_RXHASH && priv->keygen_in_use &&
+   !fman_port_get_hash_result_offset(priv->mac_dev->port[RX],
+ &hash_offset)) {
+   enum pkt_hash_types type;
+
+   /* if L4 exists, it was used in the hash generation */
+   type = be32_to_cpu(fd->status) & FM_FD_STAT_L4CV ?
+   PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3;
+   skb_set_hash(skb, be32_to_cpu(*(u32 *)(vaddr + hash_offset)),
+type);
+   }
+
skb_len = skb->len;
 
if (unlikely(netif_receive_skb(skb) == NET_RX_DROP))
@@ -2774,6 +2788,9 @@ static int dpaa_eth_probe(struct platform_device *pdev)
if (err)
goto init_ports_failed;
 
+   /* Rx traffic distribution based on keygen hashing defaults to on */
+   priv->keygen_in_use = true;
+
priv->percpu_priv = devm_alloc_percpu(dev, *priv->percpu_priv);
if (!priv->percpu_priv) {
dev_err(dev, "devm_alloc_percpu() failed\n");
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
index 496a12c..bd94220 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
@@ -159,6 +159,7 @@ struct dpaa_priv {
struct list_head dpaa_fq_list;
 
u8 num_tc;
+   bool keygen_in_use;
u32 msg_enable; /* net_device message level */
 
struct {
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
index 965f652..faea674 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
@@ -402,6 +402,8 @@ static void dpaa_get_strings(struct net_device *net_dev, 
u32 stringset,
 static int dpaa_get_hash_opts(struct net_device *dev,
  struct ethtool_rxnfc *cmd)
 {
+   struct dpaa_priv *priv = netdev_priv(dev);
+
cmd->data = 0;
 
switch (cmd->flow_type) {
@@ -409,7 +411,8 @@ static int dpaa_get_hash_opts(struct net_device *dev,
case TCP_V6_FLOW:
case UDP_V4_FLOW:
case UDP_V6_FLOW:
-   cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+   if (priv->keygen_in_use)
+   cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
/* Fall through */
case IPV4_FLOW:
case IPV6_FLOW:
@@ -421,7 +424,8 @@ static int dpaa_get_hash_opts(struct net_device *dev,
case AH_V6_FLOW:

[PATCH v3 4/7] dpaa_eth: enable Rx hashing control

2017-08-24 Thread Madalin Bucur

Allow ethtool control of the Rx flow hashing. By default RSS is
enabled, this allows to turn it off by bypassing the FMan Keygen
block and sending all traffic on the default Rx frame queue.

Signed-off-by: Madalin Bucur 
---
 drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 113 +
 1 file changed, 113 insertions(+)

diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
index aad825088..965f652 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
@@ -399,6 +399,117 @@ static void dpaa_get_strings(struct net_device *net_dev, 
u32 stringset,
memcpy(strings, dpaa_stats_global, size);
 }
 
+static int dpaa_get_hash_opts(struct net_device *dev,
+ struct ethtool_rxnfc *cmd)
+{
+   cmd->data = 0;
+
+   switch (cmd->flow_type) {
+   case TCP_V4_FLOW:
+   case TCP_V6_FLOW:
+   case UDP_V4_FLOW:
+   case UDP_V6_FLOW:
+   cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+   /* Fall through */
+   case IPV4_FLOW:
+   case IPV6_FLOW:
+   case SCTP_V4_FLOW:
+   case SCTP_V6_FLOW:
+   case AH_ESP_V4_FLOW:
+   case AH_ESP_V6_FLOW:
+   case AH_V4_FLOW:
+   case AH_V6_FLOW:
+   case ESP_V4_FLOW:
+   case ESP_V6_FLOW:
+   cmd->data |= RXH_IP_SRC | RXH_IP_DST;
+   break;
+   default:
+   cmd->data = 0;
+   break;
+   }
+
+   return 0;
+}
+
+static int dpaa_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd,
+ u32 *unused)
+{
+   int ret = -EOPNOTSUPP;
+
+   switch (cmd->cmd) {
+   case ETHTOOL_GRXFH:
+   ret = dpaa_get_hash_opts(dev, cmd);
+   break;
+   default:
+   break;
+   }
+
+   return ret;
+}
+
+static void dpaa_set_hash(struct net_device *net_dev, bool enable)
+{
+   struct mac_device *mac_dev;
+   struct fman_port *rxport;
+   struct dpaa_priv *priv;
+
+   priv = netdev_priv(net_dev);
+   mac_dev = priv->mac_dev;
+   rxport = mac_dev->port[0];
+
+   fman_port_use_kg_hash(rxport, enable);
+}
+
+static int dpaa_set_hash_opts(struct net_device *dev,
+ struct ethtool_rxnfc *nfc)
+{
+   int ret = -EINVAL;
+
+   /* we support hashing on IPv4/v6 src/dest IP and L4 src/dest port */
+   if (nfc->data &
+   ~(RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3))
+   return -EINVAL;
+
+   switch (nfc->flow_type) {
+   case TCP_V4_FLOW:
+   case TCP_V6_FLOW:
+   case UDP_V4_FLOW:
+   case UDP_V6_FLOW:
+   case IPV4_FLOW:
+   case IPV6_FLOW:
+   case SCTP_V4_FLOW:
+   case SCTP_V6_FLOW:
+   case AH_ESP_V4_FLOW:
+   case AH_ESP_V6_FLOW:
+   case AH_V4_FLOW:
+   case AH_V6_FLOW:
+   case ESP_V4_FLOW:
+   case ESP_V6_FLOW:
+   dpaa_set_hash(dev, !!nfc->data);
+   ret = 0;
+   break;
+   default:
+   break;
+   }
+
+   return ret;
+}
+
+static int dpaa_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
+{
+   int ret = -EOPNOTSUPP;
+
+   switch (cmd->cmd) {
+   case ETHTOOL_SRXFH:
+   ret = dpaa_set_hash_opts(dev, cmd);
+   break;
+   default:
+   break;
+   }
+
+   return ret;
+}
+
 const struct ethtool_ops dpaa_ethtool_ops = {
.get_drvinfo = dpaa_get_drvinfo,
.get_msglevel = dpaa_get_msglevel,
@@ -412,4 +523,6 @@ const struct ethtool_ops dpaa_ethtool_ops = {
.get_strings = dpaa_get_strings,
.get_link_ksettings = dpaa_get_link_ksettings,
.set_link_ksettings = dpaa_set_link_ksettings,
+   .get_rxnfc = dpaa_get_rxnfc,
+   .set_rxnfc = dpaa_set_rxnfc,
 };
-- 
2.1.0

[PATCH v3 2/7] fsl/fman: enable FMan Keygen

2017-08-24 Thread Madalin Bucur

From: Iordache Florinel-R70177 

Add support for the FMan Keygen with a hardcoded scheme to spread
incoming traffic on a FQ range based on source and destination IPs
and ports.

Signed-off-by: Iordache Florinel 
Signed-off-by: Madalin Bucur 
---
 drivers/net/ethernet/freescale/fman/Makefile  |   2 +-
 drivers/net/ethernet/freescale/fman/fman.c|   8 +
 drivers/net/ethernet/freescale/fman/fman.h|   2 +
 drivers/net/ethernet/freescale/fman/fman_keygen.c | 783 ++
 drivers/net/ethernet/freescale/fman/fman_keygen.h |  46 ++
 drivers/net/ethernet/freescale/fman/fman_port.c   |  40 +-
 drivers/net/ethernet/freescale/fman/fman_port.h   |   5 +
 7 files changed, 884 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_keygen.c
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_keygen.h

diff --git a/drivers/net/ethernet/freescale/fman/Makefile 
b/drivers/net/ethernet/freescale/fman/Makefile
index 6049177..2c38119 100644
--- a/drivers/net/ethernet/freescale/fman/Makefile
+++ b/drivers/net/ethernet/freescale/fman/Makefile
@@ -4,6 +4,6 @@ obj-$(CONFIG_FSL_FMAN) += fsl_fman.o
 obj-$(CONFIG_FSL_FMAN) += fsl_fman_port.o
 obj-$(CONFIG_FSL_FMAN) += fsl_mac.o
 
-fsl_fman-objs  := fman_muram.o fman.o fman_sp.o
+fsl_fman-objs  := fman_muram.o fman.o fman_sp.o fman_keygen.o
 fsl_fman_port-objs := fman_port.o
 fsl_mac-objs:= mac.o fman_dtsec.o fman_memac.o fman_tgec.o
diff --git a/drivers/net/ethernet/freescale/fman/fman.c 
b/drivers/net/ethernet/freescale/fman/fman.c
index 6ed383f..9383b99 100644
--- a/drivers/net/ethernet/freescale/fman/fman.c
+++ b/drivers/net/ethernet/freescale/fman/fman.c
@@ -34,6 +34,7 @@
 
 #include "fman.h"
 #include "fman_muram.h"
+#include "fman_keygen.h"
 
 #include 
 #include 
@@ -56,6 +57,7 @@
 /* Modules registers offsets */
 #define BMI_OFFSET 0x0008
 #define QMI_OFFSET 0x00080400
+#define KG_OFFSET  0x000C1000
 #define DMA_OFFSET 0x000C2000
 #define FPM_OFFSET 0x000C3000
 #define IMEM_OFFSET0x000C4000
@@ -1737,6 +1739,7 @@ static int fman_config(struct fman *fman)
fman->qmi_regs = base_addr + QMI_OFFSET;
fman->dma_regs = base_addr + DMA_OFFSET;
fman->hwp_regs = base_addr + HWP_OFFSET;
+   fman->kg_regs = base_addr + KG_OFFSET;
fman->base_addr = base_addr;
 
spin_lock_init(&fman->spinlock);
@@ -2009,6 +2012,11 @@ static int fman_init(struct fman *fman)
/* Init HW Parser */
hwp_init(fman->hwp_regs);
 
+   /* Init KeyGen */
+   fman->keygen = keygen_init(fman->kg_regs);
+   if (!fman->keygen)
+   return -EINVAL;
+
err = enable(fman, cfg);
if (err != 0)
return err;
diff --git a/drivers/net/ethernet/freescale/fman/fman.h 
b/drivers/net/ethernet/freescale/fman/fman.h
index 6745065..a4b1633 100644
--- a/drivers/net/ethernet/freescale/fman/fman.h
+++ b/drivers/net/ethernet/freescale/fman/fman.h
@@ -326,6 +326,7 @@ struct fman {
struct fman_qmi_regs __iomem *qmi_regs;
struct fman_dma_regs __iomem *dma_regs;
struct fman_hwp_regs __iomem *hwp_regs;
+   struct fman_kg_regs __iomem *kg_regs;
fman_exceptions_cb *exception_cb;
fman_bus_error_cb *bus_error_cb;
/* Spinlock for FMan use */
@@ -334,6 +335,7 @@ struct fman {
 
struct fman_cfg *cfg;
struct muram_info *muram;
+   struct fman_keygen *keygen;
/* cam section in muram */
unsigned long cam_offset;
size_t cam_size;
diff --git a/drivers/net/ethernet/freescale/fman/fman_keygen.c 
b/drivers/net/ethernet/freescale/fman/fman_keygen.c
new file mode 100644
index 000..f54da3c
--- /dev/null
+++ b/drivers/net/ethernet/freescale/fman/fman_keygen.c
@@ -0,0 +1,783 @@
+/*
+ * Copyright 2017 NXP
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * * Neither the name of NXP nor the
+ *   names of its contributors may be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NXP ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WA

[PATCH v3 3/7] dpaa_eth: use multiple Rx frame queues

2017-08-24 Thread Madalin Bucur

Add a block of 128 Rx frame queues per port. The FMan hardware will
send traffic on one of these queues based on the FMan port Parse
Classify Distribute setup. The hash computed by the FMan Keygen
block will select the Rx FQ.

Signed-off-by: Madalin Bucur 
---
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 50 +++---
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.h |  1 +
 .../net/ethernet/freescale/dpaa/dpaa_eth_sysfs.c   |  3 ++
 3 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index c7fa285..6d89e74 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -158,7 +158,7 @@ MODULE_PARM_DESC(tx_timeout, "The Tx timeout in ms");
 #define DPAA_RX_PRIV_DATA_SIZE (u16)(DPAA_TX_PRIV_DATA_SIZE + \
dpaa_rx_extra_headroom)
 
-#define DPAA_ETH_RX_QUEUES 128
+#define DPAA_ETH_PCD_RXQ_NUM   128
 
 #define DPAA_ENQUEUE_RETRIES   10
 
@@ -169,6 +169,7 @@ struct fm_port_fqs {
struct dpaa_fq *tx_errq;
struct dpaa_fq *rx_defq;
struct dpaa_fq *rx_errq;
+   struct dpaa_fq *rx_pcdq;
 };
 
 /* All the dpa bps in use at any moment */
@@ -628,6 +629,7 @@ static inline void dpaa_assign_wq(struct dpaa_fq *fq, int 
idx)
fq->wq = 5;
break;
case FQ_TYPE_RX_DEFAULT:
+   case FQ_TYPE_RX_PCD:
fq->wq = 6;
break;
case FQ_TYPE_TX:
@@ -688,6 +690,7 @@ static int dpaa_alloc_all_fqs(struct device *dev, struct 
list_head *list,
  struct fm_port_fqs *port_fqs)
 {
struct dpaa_fq *dpaa_fq;
+   u32 fq_base, fq_base_aligned, i;
 
dpaa_fq = dpaa_fq_alloc(dev, 0, 1, list, FQ_TYPE_RX_ERROR);
if (!dpaa_fq)
@@ -701,6 +704,26 @@ static int dpaa_alloc_all_fqs(struct device *dev, struct 
list_head *list,
 
port_fqs->rx_defq = &dpaa_fq[0];
 
+   /* the PCD FQIDs range needs to be aligned for correct operation */
+   if (qman_alloc_fqid_range(&fq_base, 2 * DPAA_ETH_PCD_RXQ_NUM))
+   goto fq_alloc_failed;
+
+   fq_base_aligned = ALIGN(fq_base, DPAA_ETH_PCD_RXQ_NUM);
+
+   for (i = fq_base; i < fq_base_aligned; i++)
+   qman_release_fqid(i);
+
+   for (i = fq_base_aligned + DPAA_ETH_PCD_RXQ_NUM;
+i < (fq_base + 2 * DPAA_ETH_PCD_RXQ_NUM); i++)
+   qman_release_fqid(i);
+
+   dpaa_fq = dpaa_fq_alloc(dev, fq_base_aligned, DPAA_ETH_PCD_RXQ_NUM,
+   list, FQ_TYPE_RX_PCD);
+   if (!dpaa_fq)
+   goto fq_alloc_failed;
+
+   port_fqs->rx_pcdq = &dpaa_fq[0];
+
if (!dpaa_fq_alloc(dev, 0, DPAA_ETH_TXQ_NUM, list, FQ_TYPE_TX_CONF_MQ))
goto fq_alloc_failed;
 
@@ -870,13 +893,14 @@ static void dpaa_fq_setup(struct dpaa_priv *priv,
  const struct dpaa_fq_cbs *fq_cbs,
  struct fman_port *tx_port)
 {
-   int egress_cnt = 0, conf_cnt = 0, num_portals = 0, cpu;
+   int egress_cnt = 0, conf_cnt = 0, num_portals = 0, portal_cnt = 0, cpu;
const cpumask_t *affine_cpus = qman_affine_cpus();
-   u16 portals[NR_CPUS];
+   u16 channels[NR_CPUS];
struct dpaa_fq *fq;
 
for_each_cpu(cpu, affine_cpus)
-   portals[num_portals++] = qman_affine_channel(cpu);
+   channels[num_portals++] = qman_affine_channel(cpu);
+
if (num_portals == 0)
dev_err(priv->net_dev->dev.parent,
"No Qman software (affine) channels found");
@@ -890,6 +914,12 @@ static void dpaa_fq_setup(struct dpaa_priv *priv,
case FQ_TYPE_RX_ERROR:
dpaa_setup_ingress(priv, fq, &fq_cbs->rx_errq);
break;
+   case FQ_TYPE_RX_PCD:
+   if (!num_portals)
+   continue;
+   dpaa_setup_ingress(priv, fq, &fq_cbs->rx_defq);
+   fq->channel = channels[portal_cnt++ % num_portals];
+   break;
case FQ_TYPE_TX:
dpaa_setup_egress(priv, fq, tx_port,
  &fq_cbs->egress_ern);
@@ -1039,7 +1069,8 @@ static int dpaa_fq_init(struct dpaa_fq *dpaa_fq, bool 
td_enable)
/* Put all the ingress queues in our "ingress CGR". */
if (priv->use_ingress_cgr &&
(dpaa_fq->fq_type == FQ_TYPE_RX_DEFAULT ||
-dpaa_fq->fq_type == FQ_TYPE_RX_ERROR)) {
+dpaa_fq->fq_type == FQ_TYPE_RX_ERROR ||
+dpaa_fq->fq_type == FQ_TYPE_RX_PCD)) {
initfq.we_mask |= cpu_to_be16(QM_INITFQ_WE_CGID);
initfq.fqd.fq_ctrl |= cpu_to_be16(QM_FQCTRL_CGE);
i

[PATCH v3 1/7] fsl/fman: move struct fman to header file

2017-08-24 Thread Madalin Bucur

Signed-off-by: Madalin Bucur 
---
 drivers/net/ethernet/freescale/fman/fman.c | 74 --
 drivers/net/ethernet/freescale/fman/fman.h | 73 +
 2 files changed, 73 insertions(+), 74 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fman/fman.c 
b/drivers/net/ethernet/freescale/fman/fman.c
index e714b8f..6ed383f 100644
--- a/drivers/net/ethernet/freescale/fman/fman.c
+++ b/drivers/net/ethernet/freescale/fman/fman.c
@@ -564,80 +564,6 @@ struct fman_cfg {
u32 qmi_def_tnums_thresh;
 };
 
-/* Structure that holds information received from device tree */
-struct fman_dts_params {
-   void __iomem *base_addr;/* FMan virtual address */
-   struct resource *res;   /* FMan memory resource */
-   u8 id;  /* FMan ID */
-
-   int err_irq;/* FMan Error IRQ */
-
-   u16 clk_freq;   /* FMan clock freq (In Mhz) */
-
-   u32 qman_channel_base;  /* QMan channels base */
-   u32 num_of_qman_channels;   /* Number of QMan channels */
-
-   struct resource muram_res;  /* MURAM resource */
-};
-
-/** fman_exceptions_cb
- * fman- Pointer to FMan
- * exception   - The exception.
- *
- * Exceptions user callback routine, will be called upon an exception
- * passing the exception identification.
- *
- * Return: irq status
- */
-typedef irqreturn_t (fman_exceptions_cb)(struct fman *fman,
-enum fman_exceptions exception);
-
-/** fman_bus_error_cb
- * fman- Pointer to FMan
- * port_id - Port id
- * addr- Address that caused the error
- * tnum- Owner of error
- * liodn   - Logical IO device number
- *
- * Bus error user callback routine, will be called upon bus error,
- * passing parameters describing the errors and the owner.
- *
- * Return: IRQ status
- */
-typedef irqreturn_t (fman_bus_error_cb)(struct fman *fman, u8 port_id,
-   u64 addr, u8 tnum, u16 liodn);
-
-struct fman {
-   struct device *dev;
-   void __iomem *base_addr;
-   struct fman_intr_src intr_mng[FMAN_EV_CNT];
-
-   struct fman_fpm_regs __iomem *fpm_regs;
-   struct fman_bmi_regs __iomem *bmi_regs;
-   struct fman_qmi_regs __iomem *qmi_regs;
-   struct fman_dma_regs __iomem *dma_regs;
-   struct fman_hwp_regs __iomem *hwp_regs;
-   fman_exceptions_cb *exception_cb;
-   fman_bus_error_cb *bus_error_cb;
-   /* Spinlock for FMan use */
-   spinlock_t spinlock;
-   struct fman_state_struct *state;
-
-   struct fman_cfg *cfg;
-   struct muram_info *muram;
-   /* cam section in muram */
-   unsigned long cam_offset;
-   size_t cam_size;
-   /* Fifo in MURAM */
-   unsigned long fifo_offset;
-   size_t fifo_size;
-
-   u32 liodn_base[64];
-   u32 liodn_offset[64];
-
-   struct fman_dts_params dts_params;
-};
-
 static irqreturn_t fman_exceptions(struct fman *fman,
   enum fman_exceptions exception)
 {
diff --git a/drivers/net/ethernet/freescale/fman/fman.h 
b/drivers/net/ethernet/freescale/fman/fman.h
index f53e147..6745065 100644
--- a/drivers/net/ethernet/freescale/fman/fman.h
+++ b/drivers/net/ethernet/freescale/fman/fman.h
@@ -274,6 +274,79 @@ struct fman_intr_src {
void *src_handle;
 };
 
+/** fman_exceptions_cb
+ * fman - Pointer to FMan
+ * exception- The exception.
+ *
+ * Exceptions user callback routine, will be called upon an exception
+ * passing the exception identification.
+ *
+ * Return: irq status
+ */
+typedef irqreturn_t (fman_exceptions_cb)(struct fman *fman,
+enum fman_exceptions exception);
+/** fman_bus_error_cb
+ * fman - Pointer to FMan
+ * port_id  - Port id
+ * addr - Address that caused the error
+ * tnum - Owner of error
+ * liodn- Logical IO device number
+ *
+ * Bus error user callback routine, will be called upon bus error,
+ * passing parameters describing the errors and the owner.
+ *
+ * Return: IRQ status
+ */
+typedef irqreturn_t (fman_bus_error_cb)(struct fman *fman, u8 port_id,
+   u64 addr, u8 tnum, u16 liodn);
+
+/* Structure that holds information received from device tree */
+struct fman_dts_params {
+   void __iomem *base_addr;/* FMan virtual address */
+   struct resource *res;   /* FMan memory resource */
+   u8 id;  /* FMan ID */
+
+   int err_irq;/* FMan Error IRQ */
+
+   u16 clk_freq;   /* FMan clock freq (In Mhz) */
+
+   u32 qman_channel_base;  /* QMan channels base */
+   u32 num_of_qman_channels;   /* Num

[PATCH v3 0/7] Add RSS to DPAA 1.x Ethernet driver

2017-08-24 Thread Madalin Bucur

This patch set introduces Receive Side Scaling for the DPAA Ethernet
driver. Documentation is updated with details related to the new
feature and limitations that apply.
Added also a small fix.

v2: removed a C++ style comment
v3: move struct fman to header file to avoid exporting a function

Iordache Florinel-R70177 (1):
  fsl/fman: enable FMan Keygen

Madalin Bucur (6):
  fsl/fman: move struct fman to header file
  dpaa_eth: use multiple Rx frame queues
  dpaa_eth: enable Rx hashing control
  dpaa_eth: add NETIF_F_RXHASH
  Documentation: networking: add RSS information
  dpaa_eth: check allocation result

 Documentation/networking/dpaa.txt  |  68 +-
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c |  76 +-
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.h |   2 +
 .../net/ethernet/freescale/dpaa/dpaa_eth_sysfs.c   |   3 +
 drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 118 
 drivers/net/ethernet/freescale/fman/Makefile   |   2 +-
 drivers/net/ethernet/freescale/fman/fman.c |  82 +--
 drivers/net/ethernet/freescale/fman/fman.h |  75 ++
 drivers/net/ethernet/freescale/fman/fman_keygen.c  | 783 +
 drivers/net/ethernet/freescale/fman/fman_keygen.h  |  46 ++
 drivers/net/ethernet/freescale/fman/fman_port.c|  51 +-
 drivers/net/ethernet/freescale/fman/fman_port.h|   7 +
 12 files changed, 1226 insertions(+), 87 deletions(-)
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_keygen.c
 create mode 100644 drivers/net/ethernet/freescale/fman/fman_keygen.h

-- 
2.1.0

Re: [PATCH] cxl: Add support for POWER9 DD2

2017-08-24 Thread christophe lombard


Le 24/08/2017 à 07:24, Andrew Donnellan a écrit :

On 24/08/17 00:58, Christophe Lombard wrote:

The PSL initialization sequence has been updated to DD2.
This patch adapts to the changes, retaining compatibility with DD1.

Tests performed on some of the new hardware.


If we're retaining compatibility with DD1 I assume it's been tested on 
some of the old hardware too?


right, it's been tested on boston machine with dd1



It seems this includes some changes to DD1 fix-ups as well.


correct





Signed-off-by: Christophe Lombard  > ---
  drivers/misc/cxl/cxl.h |  2 ++
  drivers/misc/cxl/pci.c | 57 
+++---

  2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index b1afecc..0167df8 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -100,6 +100,8 @@ static const cxl_p1_reg_t CXL_XSL_FEC = {0x0158};
  static const cxl_p1_reg_t CXL_XSL_DSNCTL= {0x0168};
  /* PSL registers - CAIA 2 */
  static const cxl_p1_reg_t CXL_PSL9_CONTROL  = {0x0020};
+static const cxl_p1_reg_t CXL_XSL9_INV  = {0x0110};
+static const cxl_p1_reg_t CXL_XSL9_DEF  = {0x0140};
  static const cxl_p1_reg_t CXL_XSL9_DSNCTL   = {0x0168};
  static const cxl_p1_reg_t CXL_PSL9_FIR1 = {0x0300};
  static const cxl_p1_reg_t CXL_PSL9_FIR2 = {0x0308};
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index d18b3d9..a981c65 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -475,37 +475,52 @@ static int 
init_implementation_adapter_regs_psl9(struct cxl *adapter,

  psl_fircntl |= 0x1ULL; /* ce_thresh */
  cxl_p1_write(adapter, CXL_PSL9_FIR_CNTL, psl_fircntl);
  -/* vccredits=0x1  pcklat=0x4 */
-cxl_p1_write(adapter, CXL_PSL9_DSNDCTL, 0x1810ULL);
-
-/*
- * For debugging with trace arrays.
- * Configure RX trace 0 segmented mode.
- * Configure CT trace 0 segmented mode.
- * Configure LA0 trace 0 segmented mode.
- * Configure LA1 trace 0 segmented mode.
+/* Setup the PSL to transmit packets on the PCIe before the
+ * CAPP is enabled
   */
-cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x804080008000ULL);
-cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x804080008003ULL);
-cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x804080008005ULL);
-cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x804080008006ULL);
+cxl_p1_write(adapter, CXL_PSL9_DSNDCTL, 0x000100102A10ULL);
+
+/* For debugging with trace arrays */
+/* Configure RX trace 0 segmented mode */
+cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8200ULL);
+/* Configure RX trace 1 segmented mode */
+cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0xAA01ULL);
+/* Configure CT trace 0 segmented mode */
+cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0xA2B80003ULL);
+/* Configure LA0 trace 0 segmented mode */
+cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x83FFC005ULL);
+/* Configure JM0 trace 0 segmented mode */
+cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8207ULL);
+/* Configure DMA trace 0 segmented mode */
+cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8208ULL);
+/* Configure DMA trace 1 segmented mode */
+cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8209ULL);
/*
   * A response to an ASB_Notify request is returned by the
   * system as an MMIO write to the address defined in
   * the PSL_TNR_ADDR register
   */
-/* PSL_TNR_ADDR */
+/* keep the Reset Value: 0x0002E000 */


I was confused by this comment for a while - maybe keep PSL_TNR_ADDR 
at the beginning of the comment, it's not completely clear from the 
previous block alone.


okay, will do.




+
+/* Enable XSL rty limit */
+cxl_p1_write(adapter, CXL_XSL9_DEF, 0x51F80005ULL);
  -/* NORST */
-cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0x8000ULL);
+/* Change XSL_INV dummy readtheshold */


read threshold?


+cxl_p1_write(adapter, CXL_XSL9_INV, 0x040007FFC200ULL);
  -/* allocate the apc machines */
-cxl_p1_write(adapter, CXL_PSL9_APCDEDTYPE, 0x4003ULL);
+if (phb_index == 3) {
+/* disable machines 31-47 and 20-27 for DMA */
+cxl_p1_write(adapter, CXL_PSL9_APCDEDTYPE, 
0x4FF3ULL);

+}
+
+/* Snoop machines */
+cxl_p1_write(adapter, CXL_PSL9_APCDEDALLOC, 0x800F0002ULL);
  -/* Disable vc dd1 fix */
-if (cxl_is_power9_dd1())
-cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0401ULL);
+if (cxl_is_power9_dd1()) {
+/* Disabling deadlock counter CAR */
+cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0021ULL);
+}
return 0;
  }

Re: [PATCH] cxl: Add support for POWER9 DD2

2017-08-24 Thread christophe lombard


Le 24/08/2017 à 09:09, Vaibhav Jain a écrit :

Hi Christophe,

Christophe Lombard  writes:

+   /* For debugging with trace arrays */
+   /* Configure RX trace 0 segmented mode */
+   cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8200ULL);
+   /* Configure RX trace 1 segmented mode */
+   cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0xAA01ULL);
+   /* Configure CT trace 0 segmented mode */
+   cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0xA2B80003ULL);
+   /* Configure LA0 trace 0 segmented mode */
+   cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x83FFC005ULL);
+   /* Configure JM0 trace 0 segmented mode */
+   cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8207ULL);
+   /* Configure DMA trace 0 segmented mode */
+   cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8208ULL);
+   /* Configure DMA trace 1 segmented mode */
+   cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8209ULL);

Please wrap this block that configures the trace arrays in #ifdef
DEBUG. Or it will be better if we remove it from here as the register is
already accessible via debugfs.

---
Vaibhav Jain 
Linux Technology Center, IBM India Pvt. Ltd.


okay, I will remove this block

Thanks

Re: [PATCH] cxl: Add support for POWER9 DD2

2017-08-24 Thread Vaibhav Jain

Hi Christophe,

Christophe Lombard  writes:
> + /* For debugging with trace arrays */
> + /* Configure RX trace 0 segmented mode */
> + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8200ULL);
> + /* Configure RX trace 1 segmented mode */
> + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0xAA01ULL);
> + /* Configure CT trace 0 segmented mode */
> + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0xA2B80003ULL);
> + /* Configure LA0 trace 0 segmented mode */
> + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x83FFC005ULL);
> + /* Configure JM0 trace 0 segmented mode */
> + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8207ULL);
> + /* Configure DMA trace 0 segmented mode */
> + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8208ULL);
> + /* Configure DMA trace 1 segmented mode */
> + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8209ULL);
Please wrap this block that configures the trace arrays in #ifdef
DEBUG. Or it will be better if we remove it from here as the register is
already accessible via debugfs.

---
Vaibhav Jain 
Linux Technology Center, IBM India Pvt. Ltd.

RE: [PATCH] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()

2017-08-24 Thread Nixiaoming

>From: Paul Mackerras [mailto:pau...@ozlabs.org]  Thursday, August 24, 2017 
>11:40 AM
>
>Nixiaoming pointed out that there is a memory leak in
>kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd() fails; the 
>memory allocated for the kvmppc_spapr_tce_table struct is not freed, and nor 
>are the pages allocated for the iommu tables.  In addition, we have already 
>incremented the process's count of locked memory pages, and this doesn't get 
>restored on error.
>
>David Hildenbrand pointed out that there is a race in that the function checks 
>early on that there is not already an entry in the
>stt->iommu_tables list with the same LIOBN, but an entry with the
>same LIOBN could get added between then and when the new entry is added to the 
>list.
>
>This fixes all three problems.  To simplify things, we now call
>anon_inode_getfd() before placing the new entry in the list.  The check for an 
>existing entry is done while holding the kvm->lock mutex, immediately before 
>adding the new entry to the list.
>Finally, on failure we now call kvmppc_account_memlimit to decrement the 
>process's count of locked memory pages.
>
>Reported-by: Nixiaoming 
>Reported-by: David Hildenbrand 
>Signed-off-by: Paul Mackerras 
>---
> arch/powerpc/kvm/book3s_64_vio.c | 55 
> 1 file changed, 33 insertions(+), 22 deletions(-)
>
>diff --git a/arch/powerpc/kvm/book3s_64_vio.c 
>b/arch/powerpc/kvm/book3s_64_vio.c
>index a160c14304eb..d463c1cd0d8d 100644
>--- a/arch/powerpc/kvm/book3s_64_vio.c
>+++ b/arch/powerpc/kvm/book3s_64_vio.c
>@@ -297,29 +297,22 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>   unsigned long npages, size;
>   int ret = -ENOMEM;
>   int i;
>+  int fd = -1;
> 
>   if (!args->size)
>   return -EINVAL;
> 
>-  /* Check this LIOBN hasn't been previously allocated */
>-  list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
>-  if (stt->liobn == args->liobn)
>-  return -EBUSY;
>-  }
>-
>   size = _ALIGN_UP(args->size, PAGE_SIZE >> 3);
>   npages = kvmppc_tce_pages(size);
>   ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
>-  if (ret) {
>-  stt = NULL;
>-  goto fail;
>-  }
>+  if (ret)
>+  return ret;
> 
>   ret = -ENOMEM;
>   stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
> GFP_KERNEL);
>   if (!stt)
>-  goto fail;
>+  goto fail_acct;
> 
>   stt->liobn = args->liobn;
>   stt->page_shift = args->page_shift;
>@@ -334,24 +327,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>   goto fail;
>   }
> 
>-  kvm_get_kvm(kvm);
>+  ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
>+  stt, O_RDWR | O_CLOEXEC);
>+  if (ret < 0)
>+  goto fail;
> 
>   mutex_lock(&kvm->lock);
>-  list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
>+
>+  /* Check this LIOBN hasn't been previously allocated */
>+  ret = 0;
>+  list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {

I think stt can not be used here
need a new value for list_for_each_entry


>+  if (stt->liobn == args->liobn) {
>+  ret = -EBUSY;
>+  break;
>+  }
>+  }
>+
>+  if (!ret) {
>+  list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
>+  kvm_get_kvm(kvm);
>+  }
> 
>   mutex_unlock(&kvm->lock);
> 
>-  return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
>-  stt, O_RDWR | O_CLOEXEC);
>+  if (!ret)
>+  return fd;
> 
>-fail:
>-  if (stt) {
>-  for (i = 0; i < npages; i++)
>-  if (stt->pages[i])
>-  __free_page(stt->pages[i]);
>+  put_unused_fd(fd);
> 
>-  kfree(stt);
>-  }
>+ fail:
>+  for (i = 0; i < npages; i++)
>+  if (stt->pages[i])
>+  __free_page(stt->pages[i]);
>+
>+  kfree(stt);
>+ fail_acct:
>+  kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
>   return ret;
> }
> 
>--
>2.11.0


Thanks

94 matches

Mail list logo