[PATCH v2 09/10] powerpc: Handle opposite-endian processes in emulation code
This adds code to the load and store emulation code to byte-swap the data appropriately when the process being emulated is set to the opposite endianness to that of the kernel. This also enables the emulation for the multiple-register loads and stores (lmw, stmw, lswi, stswi, lswx, stswx) to work for little-endian. In little-endian mode, the partial word at the end of a transfer for lsw*/stsw* (when the byte count is not a multiple of 4) is loaded/stored at the least-significant end of the register. Additionally, this fixes a bug in the previous code in that it could call read_mem/write_mem with a byte count that was not 1, 2, 4 or 8. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/sstep.h | 4 +- arch/powerpc/lib/sstep.c | 202 ++- 2 files changed, 135 insertions(+), 71 deletions(-) diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h index 0e5dd23..5a3d3d4 100644 --- a/arch/powerpc/include/asm/sstep.h +++ b/arch/powerpc/include/asm/sstep.h @@ -149,6 +149,6 @@ void emulate_update_regs(struct pt_regs *reg, struct instruction_op *op); extern int emulate_step(struct pt_regs *regs, unsigned int instr); extern void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg, -const void *mem); +const void *mem, bool cross_endian); extern void emulate_vsx_store(struct instruction_op *op, const union vsx_reg *reg, - void *mem); + void *mem, bool cross_endian); diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 4773055..7afb8ef 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -210,6 +210,33 @@ static nokprobe_inline unsigned long byterev_8(unsigned long x) } #endif +static nokprobe_inline void do_byte_reverse(void *ptr, int nb) +{ + switch (nb) { + case 2: + *(u16 *)ptr = byterev_2(*(u16 *)ptr); + break; + case 4: + *(u32 *)ptr = byterev_4(*(u32 *)ptr); + break; +#ifdef __powerpc64__ + case 8: + *(unsigned long *)ptr = byterev_8(*(unsigned long *)ptr); + break; + case 16: { + unsigned long 
*up = (unsigned long *)ptr; + unsigned long tmp; + tmp = byterev_8(up[0]); + up[0] = byterev_8(up[1]); + up[1] = tmp; + break; + } +#endif + default: + WARN_ON_ONCE(1); + } +} + static nokprobe_inline int read_mem_aligned(unsigned long *dest, unsigned long ea, int nb) { @@ -409,7 +436,8 @@ NOKPROBE_SYMBOL(write_mem); * These access either the real FP register or the image in the * thread_struct, depending on regs->msr & MSR_FP. */ -static int do_fp_load(int rn, unsigned long ea, int nb, struct pt_regs *regs) +static int do_fp_load(int rn, unsigned long ea, int nb, struct pt_regs *regs, + bool cross_endian) { int err; union { @@ -424,6 +452,11 @@ static int do_fp_load(int rn, unsigned long ea, int nb, struct pt_regs *regs) err = copy_mem_in(u.b, ea, nb); if (err) return err; + if (unlikely(cross_endian)) { + do_byte_reverse(u.b, min(nb, 8)); + if (nb == 16) + do_byte_reverse(&u.b[8], 8); + } preempt_disable(); if (nb == 4) conv_sp_to_dp(&u.f, &u.d[0]); @@ -444,7 +477,8 @@ static int do_fp_load(int rn, unsigned long ea, int nb, struct pt_regs *regs) } NOKPROBE_SYMBOL(do_fp_load); -static int do_fp_store(int rn, unsigned long ea, int nb, struct pt_regs *regs) +static int do_fp_store(int rn, unsigned long ea, int nb, struct pt_regs *regs, + bool cross_endian) { union { float f; @@ -470,6 +504,11 @@ static int do_fp_store(int rn, unsigned long ea, int nb, struct pt_regs *regs) u.l[1] = current->thread.TS_FPR(rn); } preempt_enable(); + if (unlikely(cross_endian)) { + do_byte_reverse(u.b, min(nb, 8)); + if (nb == 16) + do_byte_reverse(&u.b[8], 8); + } return copy_mem_out(u.b, ea, nb); } NOKPROBE_SYMBOL(do_fp_store); @@ -478,7 +517,8 @@ NOKPROBE_SYMBOL(do_fp_store); #ifdef CONFIG_ALTIVEC /* For Altivec/VMX, no need to worry about alignment */ static nokprobe_inline int do_vec_load(int rn, unsigned long ea, - int size, struct pt_regs *regs) + int size, struct pt_regs *regs, + bool cross_endian) { int err; union { @@ -493,7 +533,8 @@ static nokprobe_inline int 
do_vec_load(int rn, unsigned long ea, err = copy_mem_in(&u.b[ea & 0xf], ea, size);
[PATCH v2 10/10] powerpc/64: Fix update forms of loads and stores to write 64-bit EA
When a 64-bit processor is executing in 32-bit mode, the update forms of load and store instructions are required by the architecture to write the full 64-bit effective address into the RA register, though only the bottom 32 bits are used to address memory. Currently, the instruction emulation code writes the truncated address to the RA register. This fixes it by keeping the full 64-bit EA in the instruction_op structure, truncating the address in emulate_step() where it is used to address memory, rather than in the address computations in analyse_instr(). Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/sstep.h | 4 +- arch/powerpc/lib/sstep.c | 99 +--- 2 files changed, 54 insertions(+), 49 deletions(-) diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h index 5a3d3d4..9bf44e2 100644 --- a/arch/powerpc/include/asm/sstep.h +++ b/arch/powerpc/include/asm/sstep.h @@ -25,7 +25,7 @@ struct pt_regs; enum instruction_type { COMPUTE,/* arith/logical/CR op, etc. 
*/ - LOAD, + LOAD, /* load and store types need to be contiguous */ LOAD_MULTI, LOAD_FP, LOAD_VMX, @@ -52,6 +52,8 @@ enum instruction_type { #define INSTR_TYPE_MASK0x1f +#define OP_IS_LOAD_STORE(type) (LOAD <= (type) && (type) <= STCX) + /* Compute flags, ORed in with type */ #define SETREG 0x20 #define SETCC 0x40 diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 7afb8ef..b8d1d46 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -125,7 +125,7 @@ static nokprobe_inline unsigned long dform_ea(unsigned int instr, if (ra) ea += regs->gpr[ra]; - return truncate_if_32bit(regs->msr, ea); + return ea; } #ifdef __powerpc64__ @@ -143,7 +143,7 @@ static nokprobe_inline unsigned long dsform_ea(unsigned int instr, if (ra) ea += regs->gpr[ra]; - return truncate_if_32bit(regs->msr, ea); + return ea; } /* @@ -160,7 +160,7 @@ static nokprobe_inline unsigned long dqform_ea(unsigned int instr, if (ra) ea += regs->gpr[ra]; - return truncate_if_32bit(regs->msr, ea); + return ea; } #endif /* __powerpc64 */ @@ -179,7 +179,7 @@ static nokprobe_inline unsigned long xform_ea(unsigned int instr, if (ra) ea += regs->gpr[ra]; - return truncate_if_32bit(regs->msr, ea); + return ea; } /* @@ -2007,10 +2007,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, if (rb == 0) rb = 32;/* # bytes to load */ op->type = MKOP(LOAD_MULTI, 0, rb); - op->ea = 0; - if (ra) - op->ea = truncate_if_32bit(regs->msr, - regs->gpr[ra]); + op->ea = ra ? regs->gpr[ra] : 0; break; #ifdef CONFIG_PPC_FPU @@ -2077,10 +2074,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, if (rb == 0) rb = 32;/* # bytes to store */ op->type = MKOP(STORE_MULTI, 0, rb); - op->ea = 0; - if (ra) - op->ea = truncate_if_32bit(regs->msr, - regs->gpr[ra]); + op->ea = ra ? 
regs->gpr[ra] : 0; break; case 790: /* lhbrx */ @@ -2787,10 +2781,11 @@ void emulate_update_regs(struct pt_regs *regs, struct instruction_op *op) int emulate_step(struct pt_regs *regs, unsigned int instr) { struct instruction_op op; - int r, err, size; + int r, err, size, type; unsigned long val; unsigned int cr; int i, rd, nb; + unsigned long ea; bool cross_endian; r = analyse_instr(&op, regs, instr); @@ -2803,28 +2798,36 @@ int emulate_step(struct pt_regs *regs, unsigned int instr) err = 0; size = GETSIZE(op.type); + type = op.type & INSTR_TYPE_MASK; cross_endian = (regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE); - switch (op.type & INSTR_TYPE_MASK) { + + ea = op.ea; +#ifdef __powerpc64__ + if (OP_IS_LOAD_STORE(type) || type == CACHEOP) + ea = truncate_if_32bit(regs->msr, op.ea); +#endif + + switch (type) { case CACHEOP: - if (!address_ok(regs, op.ea, 8)) + if (!address_ok(regs, ea, 8)) return 0; switch (op.type & CACHEOP_MASK) { case DCBST: - __cacheop_user_asmx(op.ea, err, "dcbst"); +
[PATCH v2 08/10] powerpc: Emulate load/store floating double pair instructions
This adds lfdp[x] and stfdp[x] to the set of instructions that analyse_instr() and emulate_step() understand. Signed-off-by: Paul Mackerras --- arch/powerpc/lib/sstep.c | 76 ++-- 1 file changed, 60 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 82b1e69..4773055 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -414,9 +414,9 @@ static int do_fp_load(int rn, unsigned long ea, int nb, struct pt_regs *regs) int err; union { float f; - double d; - unsigned long l; - u8 b[sizeof(double)]; + double d[2]; + unsigned long l[2]; + u8 b[2 * sizeof(double)]; } u; if (!address_ok(regs, ea, nb)) @@ -426,11 +426,19 @@ static int do_fp_load(int rn, unsigned long ea, int nb, struct pt_regs *regs) return err; preempt_disable(); if (nb == 4) - conv_sp_to_dp(&u.f, &u.d); + conv_sp_to_dp(&u.f, &u.d[0]); if (regs->msr & MSR_FP) - put_fpr(rn, &u.d); + put_fpr(rn, &u.d[0]); else - current->thread.TS_FPR(rn) = u.l; + current->thread.TS_FPR(rn) = u.l[0]; + if (nb == 16) { + /* lfdp */ + rn |= 1; + if (regs->msr & MSR_FP) + put_fpr(rn, &u.d[1]); + else + current->thread.TS_FPR(rn) = u.l[1]; + } preempt_enable(); return 0; } @@ -440,20 +448,27 @@ static int do_fp_store(int rn, unsigned long ea, int nb, struct pt_regs *regs) { union { float f; - double d; - unsigned long l; - u8 b[sizeof(double)]; + double d[2]; + unsigned long l[2]; + u8 b[2 * sizeof(double)]; } u; if (!address_ok(regs, ea, nb)) return -EFAULT; preempt_disable(); if (regs->msr & MSR_FP) - get_fpr(rn, &u.d); + get_fpr(rn, &u.d[0]); else - u.l = current->thread.TS_FPR(rn); + u.l[0] = current->thread.TS_FPR(rn); if (nb == 4) - conv_dp_to_sp(&u.d, &u.f); + conv_dp_to_sp(&u.d[0], &u.f); + if (nb == 16) { + rn |= 1; + if (regs->msr & MSR_FP) + get_fpr(rn, &u.d[1]); + else + u.l[1] = current->thread.TS_FPR(rn); + } preempt_enable(); return copy_mem_out(u.b, ea, nb); } @@ -1966,7 +1981,21 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs 
*regs, goto fpunavail; op->type = MKOP(STORE_FP, u, 8); break; -#endif + +#ifdef __powerpc64__ + case 791: /* lfdpx */ + if (!(regs->msr & MSR_FP)) + goto fpunavail; + op->type = MKOP(LOAD_FP, 0, 16); + break; + + case 919: /* stfdpx */ + if (!(regs->msr & MSR_FP)) + goto fpunavail; + op->type = MKOP(STORE_FP, 0, 16); + break; +#endif /* __powerpc64 */ +#endif /* CONFIG_PPC_FPU */ #ifdef __powerpc64__ case 660: /* stdbrx */ @@ -1984,7 +2013,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->val = byterev_4(regs->gpr[rd]); break; - case 725: + case 725: /* stswi */ if (rb == 0) rb = 32;/* # bytes to store */ op->type = MKOP(STORE_MULTI, 0, rb); @@ -2368,9 +2397,16 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, #endif #ifdef CONFIG_VSX - case 57:/* lxsd, lxssp */ + case 57:/* lfdp, lxsd, lxssp */ op->ea = dsform_ea(instr, regs); switch (instr & 3) { + case 0: /* lfdp */ + if (!(regs->msr & MSR_FP)) + goto fpunavail; + if (rd & 1) + break; /* reg must be even */ + op->type = MKOP(LOAD_FP, 0, 16); + break; case 2: /* lxsd */ if (!(regs->msr & MSR_VSX)) goto vsxunavail; @@ -2408,8 +2444,16 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, #endif #ifdef CONFIG_VSX - case 61:/* lxv, stxsd, stxssp, stxv */ + case 61:
[PATCH v2 07/10] powerpc: Handle vector element load/stores in emulation code
This adds code to analyse_instr() and emulate_step() to handle the vector element loads and stores: lvebx, lvehx, lvewx, stvebx, stvehx, stvewx. Signed-off-by: Paul Mackerras --- arch/powerpc/lib/sstep.c | 50 ++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 0b295fb..82b1e69 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -475,7 +475,7 @@ static nokprobe_inline int do_vec_load(int rn, unsigned long ea, return -EFAULT; /* align to multiple of size */ ea &= ~(size - 1); - err = copy_mem_in(u.b, ea, size); + err = copy_mem_in(&u.b[ea & 0xf], ea, size); if (err) return err; @@ -507,7 +507,7 @@ static nokprobe_inline int do_vec_store(int rn, unsigned long ea, else u.v = current->thread.vr_state.vr[rn]; preempt_enable(); - return copy_mem_out(u.b, ea, size); + return copy_mem_out(&u.b[ea & 0xf], ea, size); } #endif /* CONFIG_ALTIVEC */ @@ -1808,6 +1808,31 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, break; #ifdef CONFIG_ALTIVEC + /* +* Note: for the load/store vector element instructions, +* bits of the EA say which field of the VMX register to use. 
+*/ + case 7: /* lvebx */ + if (!(regs->msr & MSR_VEC)) + goto vecunavail; + op->type = MKOP(LOAD_VMX, 0, 1); + op->element_size = 1; + break; + + case 39:/* lvehx */ + if (!(regs->msr & MSR_VEC)) + goto vecunavail; + op->type = MKOP(LOAD_VMX, 0, 2); + op->element_size = 2; + break; + + case 71:/* lvewx */ + if (!(regs->msr & MSR_VEC)) + goto vecunavail; + op->type = MKOP(LOAD_VMX, 0, 4); + op->element_size = 4; + break; + case 103: /* lvx */ case 359: /* lvxl */ if (!(regs->msr & MSR_VEC)) @@ -1816,6 +1841,27 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->element_size = 16; break; + case 135: /* stvebx */ + if (!(regs->msr & MSR_VEC)) + goto vecunavail; + op->type = MKOP(STORE_VMX, 0, 1); + op->element_size = 1; + break; + + case 167: /* stvehx */ + if (!(regs->msr & MSR_VEC)) + goto vecunavail; + op->type = MKOP(STORE_VMX, 0, 2); + op->element_size = 2; + break; + + case 199: /* stvewx */ + if (!(regs->msr & MSR_VEC)) + goto vecunavail; + op->type = MKOP(STORE_VMX, 0, 4); + op->element_size = 4; + break; + case 231: /* stvx */ case 487: /* stvxl */ if (!(regs->msr & MSR_VEC)) -- 2.7.4
[PATCH v2 06/10] powerpc: Emulate FP/vector/VSX loads/stores correctly when regs not live
At present, the analyse_instr/emulate_step code checks for the relevant MSR_FP/VEC/VSX bit being set when a FP/VMX/VSX load or store is decoded, but doesn't recheck the bit before reading or writing the relevant FP/VMX/VSX register in emulate_step(). Since we don't have preemption disabled, it is possible that we get preempted between checking the MSR bit and doing the register access. If that happened, then the registers would have been saved to the thread_struct for the current process. Accesses to the CPU registers would then potentially read stale values, or write values that would never be seen by the user process. Another way that the registers can become non-live is if a page fault occurs when accessing user memory, and the page fault code calls a copy routine that wants to use the VMX or VSX registers. To fix this, the code for all the FP/VMX/VSX loads gets restructured so that it forms an image in a local variable of the desired register contents, then disables preemption, checks the MSR bit and either sets the CPU register or writes the value to the thread struct. Similarly, the code for stores checks the MSR bit, copies either the CPU register or the thread struct to a local variable, then reenables preemption and then copies the register image to memory. 
Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/sstep.h | 1 + arch/powerpc/lib/ldstfp.S| 241 +++ arch/powerpc/lib/sstep.c | 218 --- 3 files changed, 193 insertions(+), 267 deletions(-) diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h index 5cdcbc4..0e5dd23 100644 --- a/arch/powerpc/include/asm/sstep.h +++ b/arch/powerpc/include/asm/sstep.h @@ -116,6 +116,7 @@ union vsx_reg { unsigned long d[2]; float fp[4]; double dp[2]; + __vector128 v; }; /* diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S index 6840911..7b5cf5e 100644 --- a/arch/powerpc/lib/ldstfp.S +++ b/arch/powerpc/lib/ldstfp.S @@ -21,27 +21,19 @@ #define STKFRM (PPC_MIN_STKFRM + 16) - .macro inst32 op -reg = 0 - .rept 32 -20:\op reg,0,r4 - b 3f - EX_TABLE(20b,99f) -reg = reg + 1 - .endr - .endm - -/* Get the contents of frN into fr0; N is in r3. */ +/* Get the contents of frN into *p; N is in r3 and p is in r4. */ _GLOBAL(get_fpr) mflrr0 + mfmsr r6 + ori r7, r6, MSR_FP + MTMSRD(r7) + isync rlwinm r3,r3,3,0xf8 bcl 20,31,1f - blr /* fr0 is already in fr0 */ - nop -reg = 1 - .rept 31 - fmr fr0,reg - blr +reg = 0 + .rept 32 + stfdreg, 0(r4) + b 2f reg = reg + 1 .endr 1: mflrr5 @@ -49,18 +41,23 @@ reg = reg + 1 mtctr r5 mtlrr0 bctr +2: MTMSRD(r6) + isync + blr -/* Put the contents of fr0 into frN; N is in r3. */ +/* Put the contents of *p into frN; N is in r3 and p is in r4. */ _GLOBAL(put_fpr) mflrr0 + mfmsr r6 + ori r7, r6, MSR_FP + MTMSRD(r7) + isync rlwinm r3,r3,3,0xf8 bcl 20,31,1f - blr /* fr0 is already in fr0 */ - nop -reg = 1 - .rept 31 - fmr reg,fr0 - blr +reg = 0 + .rept 32 + lfd reg, 0(r4) + b 2f reg = reg + 1 .endr 1: mflrr5 @@ -68,127 +65,24 @@ reg = reg + 1 mtctr r5 mtlrr0 bctr - -/* Load FP reg N from float at *p. N is in r3, p in r4. 
*/ -_GLOBAL(do_lfs) - PPC_STLU r1,-STKFRM(r1) - mflrr0 - PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) - mfmsr r6 - ori r7,r6,MSR_FP - cmpwi cr7,r3,0 - MTMSRD(r7) - isync - beq cr7,1f - stfdfr0,STKFRM-16(r1) -1: li r9,-EFAULT -2: lfs fr0,0(r4) - li r9,0 -3: bl put_fpr - beq cr7,4f - lfd fr0,STKFRM-16(r1) -4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) - mtlrr0 - MTMSRD(r6) - isync - mr r3,r9 - addir1,r1,STKFRM - blr - EX_TABLE(2b,3b) - -/* Load FP reg N from double at *p. N is in r3, p in r4. */ -_GLOBAL(do_lfd) - PPC_STLU r1,-STKFRM(r1) - mflrr0 - PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) - mfmsr r6 - ori r7,r6,MSR_FP - cmpwi cr7,r3,0 - MTMSRD(r7) - isync - beq cr7,1f - stfdfr0,STKFRM-16(r1) -1: li r9,-EFAULT -2: lfd fr0,0(r4) - li r9,0 -3: beq cr7,4f - bl put_fpr - lfd fr0,STKFRM-16(r1) -4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) - mtlrr0 - MTMSRD(r6) - isync - mr r3,r9 - addir1,r1,STKFRM - blr - EX_TABLE(2b,3b) - -/* Store FP reg
[PATCH v2 05/10] powerpc: Make load/store emulation use larger memory accesses
At the moment, emulation of loads and stores of up to 8 bytes to unaligned addresses on a little-endian system uses a sequence of single-byte loads or stores to memory. This is rather inefficient, and the code is hard to follow because it has many ifdefs. In addition, the Power ISA has requirements on how unaligned accesses are performed, which are not met by doing all accesses as sequences of single-byte accesses. Emulation of VSX loads and stores uses __copy_{to,from}_user, which means the emulation code has no control on the size of accesses. To simplify this, we add new copy_mem_in() and copy_mem_out() functions for accessing memory. These use a sequence of the largest possible aligned accesses, up to 8 bytes (or 4 on 32-bit systems), to copy memory between a local buffer and user memory. We then rewrite {read,write}_mem_unaligned and the VSX load/store emulation using these new functions. These new function also simplify the code in do_fp_load() and do_fp_store() for the unaligned cases. Signed-off-by: Paul Mackerras --- arch/powerpc/lib/sstep.c | 237 +-- 1 file changed, 106 insertions(+), 131 deletions(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index d9b3b63..861654e 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -194,7 +194,6 @@ static nokprobe_inline unsigned long max_align(unsigned long x) return x & -x; /* isolates rightmost bit */ } - static nokprobe_inline unsigned long byterev_2(unsigned long x) { return ((x >> 8) & 0xff) | ((x & 0xff) << 8); @@ -240,56 +239,68 @@ static nokprobe_inline int read_mem_aligned(unsigned long *dest, return err; } -static nokprobe_inline int read_mem_unaligned(unsigned long *dest, - unsigned long ea, int nb, struct pt_regs *regs) +/* + * Copy from userspace to a buffer, using the largest possible + * aligned accesses, up to sizeof(long). 
+ */ +static int nokprobe_inline copy_mem_in(u8 *dest, unsigned long ea, int nb) { - int err; - unsigned long x, b, c; -#ifdef __LITTLE_ENDIAN__ - int len = nb; /* save a copy of the length for byte reversal */ -#endif + int err = 0; + int c; - /* unaligned, do this in pieces */ - x = 0; for (; nb > 0; nb -= c) { -#ifdef __LITTLE_ENDIAN__ - c = 1; -#endif -#ifdef __BIG_ENDIAN__ c = max_align(ea); -#endif if (c > nb) c = max_align(nb); - err = read_mem_aligned(&b, ea, c); + switch (c) { + case 1: + err = __get_user(*dest, (unsigned char __user *) ea); + break; + case 2: + err = __get_user(*(u16 *)dest, +(unsigned short __user *) ea); + break; + case 4: + err = __get_user(*(u32 *)dest, +(unsigned int __user *) ea); + break; +#ifdef __powerpc64__ + case 8: + err = __get_user(*(unsigned long *)dest, +(unsigned long __user *) ea); + break; +#endif + } if (err) return err; - x = (x << (8 * c)) + b; + dest += c; ea += c; } -#ifdef __LITTLE_ENDIAN__ - switch (len) { - case 2: - *dest = byterev_2(x); - break; - case 4: - *dest = byterev_4(x); - break; -#ifdef __powerpc64__ - case 8: - *dest = byterev_8(x); - break; -#endif - } -#endif -#ifdef __BIG_ENDIAN__ - *dest = x; -#endif return 0; } +static nokprobe_inline int read_mem_unaligned(unsigned long *dest, + unsigned long ea, int nb) +{ + union { + unsigned long ul; + u8 b[sizeof(unsigned long)]; + } u; + int i; + int err; + + u.ul = 0; + i = IS_BE ? sizeof(unsigned long) - nb : 0; + err = copy_mem_in(&u.b[i], ea, nb); + if (!err) + *dest = u.ul; + return err; +} + /* * Read memory at address ea for nb bytes, return 0 for success - * or -EFAULT if an error occurred. + * or -EFAULT if an error occurred. N.B. nb must be 1, 2, 4 or 8. + * If nb < sizeof(long), the result is right-justified on BE systems. 
*/ static int read_mem(unsigned long *dest, unsigned long ea, int nb, struct pt_regs *regs) @@ -298,7 +309,7 @@ static int read_mem(unsigned long *dest, unsigned long ea, int nb, return -EFAULT; if ((ea & (nb - 1)) == 0) return read_mem_aligned(dest, ea, nb); - return read_m
[PATCH v2 04/10] powerpc: Add emulation for the addpcis instruction
The addpcis instruction puts the sum of the next instruction address plus a constant into a register. Since the result depends on the address of the instruction, it will give an incorrect result if it is single-stepped out of line, which is what the *probes subsystem will currently do if a probe is placed on an addpcis instruction. This fixes the problem by adding emulation of it to analyse_instr(). Signed-off-by: Paul Mackerras --- arch/powerpc/lib/sstep.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 7921b2a..d9b3b63 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1024,9 +1024,6 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->ccval = (regs->ccr & ~(1UL << (31 - rd))) | (val << (31 - rd)); return 1; - default: - op->type = UNKNOWN; - return 0; } break; case 31: @@ -1126,6 +1123,17 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->val = imm; goto compute_done; + case 19: + if (((instr >> 1) & 0x1f) == 2) { + /* addpcis */ + imm = (short) (instr & 0xffc1); /* d0 + d2 fields */ + imm |= (instr >> 15) & 0x3e;/* d1 field */ + op->val = regs->nip + (imm << 16) + 4; + goto compute_done; + } + op->type = UNKNOWN; + return 0; + case 20:/* rlwimi */ mb = (instr >> 6) & 0x1f; me = (instr >> 1) & 0x1f; -- 2.7.4
[PATCH v2 03/10] powerpc: Fix emulation of the isel instruction
The case added for the isel instruction was added inside a switch statement which uses the 10-bit minor opcode field in the 0x7fe bits of the instruction word. However, for the isel instruction, the minor opcode field is only the 0x3e bits, and the 0x7c0 bits are used for the "BC" field, which indicates which CR bit to use to select the result. Therefore, for the isel emulation to work correctly when BC != 0, we need to match on ((instr >> 1) & 0x1f) == 15). To do this, we pull the isel case out of the switch statement and put it in an if statement of its own. Signed-off-by: Paul Mackerras --- arch/powerpc/lib/sstep.c | 18 ++ 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index f9c973c..7921b2a 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1219,6 +1219,16 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, return 0; case 31: + /* isel occupies 32 minor opcodes */ + if (((instr >> 1) & 0x1f) == 15) { + mb = (instr >> 6) & 0x1f; /* bc field */ + val = (regs->ccr >> (31 - mb)) & 1; + val2 = (ra) ? regs->gpr[ra] : 0; + + op->val = (val) ? val2 : regs->gpr[rb]; + goto compute_done; + } + switch ((instr >> 1) & 0x3ff) { case 4: /* tw */ if (rd == 0x1f || @@ -1444,14 +1454,6 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, /* * Logical instructions */ - case 15:/* isel */ - mb = (instr >> 6) & 0x1f; /* bc */ - val = (regs->ccr >> (31 - mb)) & 1; - val2 = (ra) ? regs->gpr[ra] : 0; - - op->val = (val) ? val2 : regs->gpr[rb]; - goto compute_done; - case 26:/* cntlzw */ op->val = __builtin_clz((unsigned int) regs->gpr[rd]); goto logical_done; -- 2.7.4
[PATCH v2 02/10] powerpc: Change analyse_instr so it doesn't modify *regs
The analyse_instr function currently doesn't just work out what an instruction does, it also executes those instructions whose effect is only to update CPU registers that are stored in struct pt_regs. This is undesirable because optprobes uses analyse_instr to work out if an instruction could be successfully emulated in future. This changes analyse_instr so it doesn't modify *regs; instead it stores information in the instruction_op structure to indicate what registers (GPRs, CR, XER, LR) would be set and what value they would be set to. A companion function called emulate_update_regs() can then use that information to update a pt_regs struct appropriately. As a minor cleanup, this replaces inline asm using the cntlzw and cntlzd instructions with calls to __builtin_clz() and __builtin_clzl(). Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/sstep.h | 52 +++- arch/powerpc/lib/sstep.c | 607 +++ 2 files changed, 400 insertions(+), 259 deletions(-) diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h index 863e1e4..5cdcbc4 100644 --- a/arch/powerpc/include/asm/sstep.h +++ b/arch/powerpc/include/asm/sstep.h @@ -23,9 +23,6 @@ struct pt_regs; #define IS_RFID(instr) (((instr) & 0xfc0007fe) == 0x4c24) #define IS_RFI(instr) (((instr) & 0xfc0007fe) == 0x4c64) -/* Emulate instructions that cause a transfer of control. */ -extern int emulate_step(struct pt_regs *regs, unsigned int instr); - enum instruction_type { COMPUTE,/* arith/logical/CR op, etc. 
*/ LOAD, @@ -55,11 +52,29 @@ enum instruction_type { #define INSTR_TYPE_MASK0x1f +/* Compute flags, ORed in with type */ +#define SETREG 0x20 +#define SETCC 0x40 +#define SETXER 0x80 + +/* Branch flags, ORed in with type */ +#define SETLK 0x20 +#define BRTAKEN0x40 +#define DECCTR 0x80 + /* Load/store flags, ORed in with type */ #define SIGNEXT0x20 #define UPDATE 0x40/* matches bit in opcode 31 instructions */ #define BYTEREV0x80 +/* Barrier type field, ORed in with type */ +#define BARRIER_MASK 0xe0 +#define BARRIER_SYNC 0x00 +#define BARRIER_ISYNC 0x20 +#define BARRIER_EIEIO 0x40 +#define BARRIER_LWSYNC 0x60 +#define BARRIER_PTESYNC0x80 + /* Cacheop values, ORed in with type */ #define CACHEOP_MASK 0x700 #define DCBST 0 @@ -90,6 +105,8 @@ struct instruction_op { int spr; u8 element_size;/* for VSX/VMX loads/stores */ u8 vsx_flags; + u32 ccval; + u32 xerval; }; union vsx_reg { @@ -101,8 +118,35 @@ union vsx_reg { double dp[2]; }; -extern int analyse_instr(struct instruction_op *op, struct pt_regs *regs, +/* + * Decode an instruction, and return information about it in *op + * without changing *regs. + * + * Return value is 1 if the instruction can be emulated just by + * updating *regs with the information in *op, -1 if we need the + * GPRs but *regs doesn't contain the full register set, or 0 + * otherwise. + */ +extern int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, unsigned int instr); + +/* + * Emulate an instruction that can be executed just by updating + * fields in *regs. + */ +void emulate_update_regs(struct pt_regs *reg, struct instruction_op *op); + +/* + * Emulate instructions that cause a transfer of control, + * arithmetic/logical instructions, loads and stores, + * cache operations and barriers. + * + * Returns 1 if the instruction was emulated successfully, + * 0 if it could not be emulated, or -1 for an instruction that + * should not be emulated (rfid, mtmsrd clearing MSR_RI, etc.). 
+ */ +extern int emulate_step(struct pt_regs *regs, unsigned int instr); + extern void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg, const void *mem); extern void emulate_vsx_store(struct instruction_op *op, const union vsx_reg *reg, diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 6aa0ba6..f9c973c 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -83,15 +83,17 @@ static nokprobe_inline unsigned long truncate_if_32bit(unsigned long msr, /* * Determine whether a conditional branch instruction would branch. */ -static nokprobe_inline int branch_taken(unsigned int instr, struct pt_regs *regs) +static nokprobe_inline int branch_taken(unsigned int instr, + const struct pt_regs *regs, + struct instruction_op *op) { unsigned int bo = (instr >> 21) & 0x1f; unsigned int bi; if ((bo & 4) == 0) { /* decrement counter */ - --regs->ctr; - if (((bo >> 1) & 1) ^ (regs->ctr == 0)) + op->type |= DECCTR; + if (((bo >> 1
[PATCH v2 01/10] powerpc: Handle most loads and stores in instruction emulation code
This extends the instruction emulation infrastructure in sstep.c to handle all the load and store instructions defined in the Power ISA v3.0, except for the atomic memory operations, ldmx (which was never implemented), lfdp/stfdp, and the vector element load/stores. The instructions added are: Integer loads and stores: lbarx, lharx, lqarx, stbcx., sthcx., stqcx., lq, stq. VSX loads and stores: lxsiwzx, lxsiwax, stxsiwx, lxvx, lxvl, lxvll, lxvdsx, lxvwsx, stxvx, stxvl, stxvll, lxsspx, lxsdx, stxsspx, stxsdx, lxvw4x, lxsibzx, lxvh8x, lxsihzx, lxvb16x, stxvw4x, stxsibx, stxvh8x, stxsihx, stxvb16x, lxsd, lxssp, lxv, stxsd, stxssp, stxv. These instructions are handled both in the analyse_instr phase and in the emulate_step phase. The code for lxvd2ux and stxvd2ux has been taken out, as those instructions were never implemented in any processor and have been taken out of the architecture, and their opcodes have been reused for other instructions in POWER9 (lxvb16x and stxvb16x). The emulation for the VSX loads and stores uses helper functions which don't access registers or memory directly, which can hopefully be reused by KVM later. 
Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/sstep.h | 20 ++ arch/powerpc/lib/Makefile| 2 +- arch/powerpc/lib/ldstfp.S| 70 ++-- arch/powerpc/lib/quad.S | 62 arch/powerpc/lib/sstep.c | 688 --- 5 files changed, 781 insertions(+), 61 deletions(-) create mode 100644 arch/powerpc/lib/quad.S diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h index d3a42cc..863e1e4 100644 --- a/arch/powerpc/include/asm/sstep.h +++ b/arch/powerpc/include/asm/sstep.h @@ -68,6 +68,11 @@ enum instruction_type { #define DCBT 0x300 #define ICBI 0x400 +/* VSX flags values */ +#define VSX_FPCONV 1 /* do floating point SP/DP conversion */ +#define VSX_SPLAT 2 /* store loaded value into all elements */ +#define VSX_LDLEFT 4 /* load VSX register from left */ + /* Size field in type word */ #define SIZE(n)((n) << 8) #define GETSIZE(w) ((w) >> 8) @@ -83,7 +88,22 @@ struct instruction_op { int update_reg; /* For MFSPR */ int spr; + u8 element_size;/* for VSX/VMX loads/stores */ + u8 vsx_flags; +}; + +union vsx_reg { + u8 b[16]; + u16 h[8]; + u32 w[4]; + unsigned long d[2]; + float fp[4]; + double dp[2]; }; extern int analyse_instr(struct instruction_op *op, struct pt_regs *regs, unsigned int instr); +extern void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg, +const void *mem); +extern void emulate_vsx_store(struct instruction_op *op, const union vsx_reg *reg, + void *mem); diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 3c3146b..7921fed 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -31,7 +31,7 @@ obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o obj-y += checksum_$(BITS).o checksum_wrappers.o -obj-$(CONFIG_PPC_EMULATE_SSTEP)+= sstep.o ldstfp.o +obj-$(CONFIG_PPC_EMULATE_SSTEP)+= sstep.o ldstfp.o quad.o obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S index a58777c..6840911 100644 --- a/arch/powerpc/lib/ldstfp.S +++ 
b/arch/powerpc/lib/ldstfp.S @@ -178,10 +178,10 @@ _GLOBAL(do_stfd) EX_TABLE(2b,3b) #ifdef CONFIG_ALTIVEC -/* Get the contents of vrN into v0; N is in r3. */ +/* Get the contents of vrN into v0; N is in r3. Doesn't touch r3 or r4. */ _GLOBAL(get_vr) mflrr0 - rlwinm r3,r3,3,0xf8 + rlwinm r6,r3,3,0xf8 bcl 20,31,1f blr /* v0 is already in v0 */ nop @@ -192,15 +192,15 @@ reg = 1 reg = reg + 1 .endr 1: mflrr5 - add r5,r3,r5 + add r5,r6,r5 mtctr r5 mtlrr0 bctr -/* Put the contents of v0 into vrN; N is in r3. */ +/* Put the contents of v0 into vrN; N is in r3. Doesn't touch r3 or r4. */ _GLOBAL(put_vr) mflrr0 - rlwinm r3,r3,3,0xf8 + rlwinm r6,r3,3,0xf8 bcl 20,31,1f blr /* v0 is already in v0 */ nop @@ -211,7 +211,7 @@ reg = 1 reg = reg + 1 .endr 1: mflrr5 - add r5,r3,r5 + add r5,r6,r5 mtctr r5 mtlrr0 bctr @@ -313,7 +313,7 @@ reg = reg + 1 bctr /* Load VSX reg N from vector doubleword *p. N is in r3, p in r4. */ -_GLOBAL(do_lxvd2x) +_GLOBAL(load_vsrn) PPC_STLU r1,-STKFRM(r1) mflrr0 PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) @@ -325,41 +325,38 @@ _GLOBAL(do_lxvd2x) isync beq cr7,1f STXVD2X(0,R1,R8) -
[PATCH v2 0/10] powerpc: Beef up single-stepping/instruction emulation infrastructure
This patch series extends the code in arch/powerpc/lib/sstep.c so that it handles almost all load and store instructions -- all except the atomic memory operations (lwat, stwat, etc.). It also makes sure that we use the largest possible aligned accesses to access memory and that we don't access the CPU FP/VMX/VSX registers when they don't contain user data. With this, it should be possible to replace the body of the alignment interrupt handler with a call to emulate_step() or something quite similar. This version is based on the powerpc tree next branch as of a day or two ago, and includes code to emulate addpcis, a fix for the isel emulation, code to handle the multi-register loads and stores in little-endian mode, and a fix for the wrong behaviour in updating RA for load/store with update instructions in 32-bit mode. Paul. arch/powerpc/include/asm/sstep.h | 77 +- arch/powerpc/lib/Makefile|2 +- arch/powerpc/lib/ldstfp.S| 307 ++ arch/powerpc/lib/quad.S | 62 ++ arch/powerpc/lib/sstep.c | 1929 -- 5 files changed, 1654 insertions(+), 723 deletions(-)
RE: [PATCH v3 0/7] Add RSS to DPAA 1.x Ethernet driver
> -----Original Message----- > From: David Miller [mailto:da...@davemloft.net] > Subject: Re: [PATCH v3 0/7] Add RSS to DPAA 1.x Ethernet driver > > From: David Miller > Date: Thu, 24 Aug 2017 09:42:20 -0700 (PDT) > > > From: Madalin Bucur > > Date: Thu, 24 Aug 2017 10:28:21 +0300 > > > >> This patch set introduces Receive Side Scaling for the DPAA Ethernet > >> driver. Documentation is updated with details related to the new > >> feature and limitations that apply. > >> Also added a small fix. > >> > >> v2: removed a C++ style comment > >> v3: move struct fman to header file to avoid exporting a function > > > > Series applied, thanks. > > Actually I'm reverting, this doesn't even compile. Hi, Sorry for this blunder, I've only tested on PPC, where it works. Will come back with a proper patch set. Madalin
Re: [PATCH 5/6] powerpc/mm: Optimize detection of thread local mm's
Le 24/08/2017 à 20:47, Benjamin Herrenschmidt a écrit : On Thu, 2017-08-24 at 18:40 +0200, Frederic Barrat wrote: The decrementing part is giving me trouble, and I think it makes sense: if I decrement the counter when detaching the context from the capi card, then the next TLBIs for the memory context may be back to local. Yes, you need to flush the CAPI TLB first. So when the process exits, the NPU wouldn't get the associated TLBIs, which spells trouble the next time the same memory context ID is reused. I believe this is the cause of the problem I'm seeing. As soon as I keep the TLBIs global, even after I detach from the capi adapter, everything is fine. Does it sound right? So to keep the checks minimal in mm_is_thread_local(), by just checking the active_cpus count, I'm thinking of introducing a "copro enabled" bit on the context, so that we can increment active_cpus only once. And never decrement it. You can decrement if you flush. Don't you have MMIOs to do directed flushes ? That's for the nMMU. Last I heard, we don't have MMIOs to flush anything on the nMMU. Side note: for the PSL, we do have MMIOs to flush, but they were perceived as useful only for debug and we don't rely on them, precisely because the nMMU would fall out of sync, so we have to rely on broadcast. Fred
[PATCH v3 4/4] powerpc/64s: idle ESL=0 stop can avoid MSR and save/restore overhead
When stop is executed with EC=ESL=0, it appears to execute like a normal instruction (resuming from NIP when woken by interrupt). So all the save/restore handling can be avoided completely. In particular NV GPRs do not have to be saved, and MSR does not have to be switched back to kernel MSR. So move the test for "lite" sleep states out to power9_idle_stop. Reviewed-by: Gautham R. Shenoy Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/idle_book3s.S | 35 --- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 32d65ee323a0..fa56120bd0bc 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -315,9 +315,6 @@ enter_winkle: ARCH207_IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE) -/* - * r3 - PSSCR value corresponding to the requested stop state. - */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE power_enter_stop_kvm_rm: /* @@ -330,14 +327,11 @@ power_enter_stop_kvm_rm: li r4,KVM_HWTHREAD_IN_IDLE /* DO THIS IN REAL MODE! See comment above. */ stb r4,HSTATE_HWTHREAD_STATE(r13) -#endif -power_enter_stop: /* * Check if we are executing the lite variant with ESL=EC=0 */ - andis. r4,r3,PSSCR_EC_ESL_MASK_SHIFTED - clrldi r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */ - bne .Lhandle_esl_ec_set + andis. r4,r3,PSSCR_EC_ESL_MASK_SHIFTED + bne power_enter_stop_esl PPC_STOP li r3,0 /* Since we didn't lose state, return 0 */ @@ -354,8 +348,13 @@ power_enter_stop: */ li r12, 0 b pnv_wakeup_noloss +#endif -.Lhandle_esl_ec_set: +/* + * r3 - PSSCR value corresponding to the requested stop state. + */ +power_enter_stop_esl: + clrldi r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */ /* * POWER9 DD2 can incorrectly set PMAO when waking up after a * state-loss idle. Saving and restoring MMCR0 over idle is a @@ -428,9 +427,23 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ * r3 contains desired PSSCR register value. 
*/ _GLOBAL(power9_idle_stop) - std r3, PACA_REQ_PSSCR(r13) mtspr SPRN_PSSCR,r3 - LOAD_REG_ADDR(r4,power_enter_stop) + + /* +* Check if we are executing the lite variant with ESL=EC=0 +* This case resumes execution after the stop instruction without +* losing any state, so nothing has to be saved. The following +* instructions up to the blr must be skipped if we want to +* use power_enter_stop_kvm_rm. +*/ + andis. r4,r3,PSSCR_EC_ESL_MASK_SHIFTED + bne 1f + PPC_STOP + li r3,0 /* Since we didn't lose state, return 0 */ + blr +1: /* state-loss idle */ + std r3, PACA_REQ_PSSCR(r13) + LOAD_REG_ADDR(r4,power_enter_stop_esl) b pnv_powersave_common /* No return */ -- 2.13.3
[PATCH v3 3/4] powerpc/64s: idle POWER9 can execute stop in virtual mode
The hardware can execute stop in any context, and KVM does not require real mode because siblings do not share MMU state. This saves a switch to real-mode when going idle. Acked-by: Gautham R. Shenoy Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/idle_book3s.S | 9 + 1 file changed, 9 insertions(+) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 14e97f442167..32d65ee323a0 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -195,7 +195,16 @@ pnv_powersave_common: std r5,_CCR(r1) std r1,PACAR1(r13) +BEGIN_FTR_SECTION + /* +* POWER9 does not require real mode to stop, and presently does not +* set hwthread_state for KVM (threads don't share MMU context), so +* we can remain in virtual mode for this. +*/ + bctr +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) /* +* POWER8 * Go to real mode to do the nap, as required by the architecture. * Also, we need to be in real mode before setting hwthread_state, * because as soon as we do that, another thread can switch -- 2.13.3
[PATCH v3 2/4] powerpc/64s: idle POWER9 can execute stop without a sync sequence
Reviewed-by: Gautham R. Shenoy Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/cpuidle.h | 16 arch/powerpc/kernel/idle_book3s.S | 26 -- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 8a174cba5567..eb43b5c3a7b5 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -101,20 +101,4 @@ static inline void report_invalid_psscr_val(u64 psscr_val, int err) #endif -/* Idle state entry routines */ -#ifdef CONFIG_PPC_P7_NAP -#define IDLE_STATE_ENTER_SEQ(IDLE_INST) \ - /* Magic NAP/SLEEP/WINKLE mode enter sequence */\ - std r0,0(r1); \ - ptesync;\ - ld r0,0(r1); \ -236: cmpdcr0,r0,r0; \ - bne 236b; \ - IDLE_INST; \ - -#defineIDLE_STATE_ENTER_SEQ_NORET(IDLE_INST) \ - IDLE_STATE_ENTER_SEQ(IDLE_INST) \ - b . -#endif /* CONFIG_PPC_P7_NAP */ - #endif diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 4924647d964d..14e97f442167 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -205,6 +205,19 @@ pnv_powersave_common: mtmsrd r7,0 bctr +/* + * This is the sequence required to execute idle instructions, as + * specified in ISA v2.07. MSR[IR] and MSR[DR] must be 0. 
+ */ +#define ARCH207_IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST) \ + /* Magic NAP/SLEEP/WINKLE mode enter sequence */\ + std r0,0(r1); \ + ptesync;\ + ld r0,0(r1); \ +236: cmpdcr0,r0,r0; \ + bne 236b; \ + IDLE_INST; + .globl pnv_enter_arch207_idle_mode pnv_enter_arch207_idle_mode: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -230,7 +243,7 @@ pnv_enter_arch207_idle_mode: stb r3,PACA_THREAD_IDLE_STATE(r13) cmpwi cr3,r3,PNV_THREAD_SLEEP bge cr3,2f - IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP) + ARCH207_IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP) /* No return */ 2: /* Sleep or winkle */ @@ -269,7 +282,7 @@ pnv_fastsleep_workaround_at_entry: common_enter: /* common code for all the threads entering sleep or winkle */ bgt cr3,enter_winkle - IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP) + ARCH207_IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP) fastsleep_workaround_at_entry: orisr15,r15,PNV_CORE_IDLE_LOCK_BIT@h @@ -291,7 +304,7 @@ fastsleep_workaround_at_entry: enter_winkle: bl save_sprs_to_stack - IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE) + ARCH207_IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE) /* * r3 - PSSCR value corresponding to the requested stop state. @@ -316,7 +329,7 @@ power_enter_stop: andis. r4,r3,PSSCR_EC_ESL_MASK_SHIFTED clrldi r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */ bne .Lhandle_esl_ec_set - IDLE_STATE_ENTER_SEQ(PPC_STOP) + PPC_STOP li r3,0 /* Since we didn't lose state, return 0 */ /* @@ -349,7 +362,8 @@ power_enter_stop: ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) cmpdr3,r4 bge .Lhandle_deep_stop - IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP) + PPC_STOP/* Does not return (system reset interrupt) */ + .Lhandle_deep_stop: /* * Entering deep idle state. @@ -371,7 +385,7 @@ lwarx_loop_stop: bl save_sprs_to_stack - IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP) + PPC_STOP/* Does not return (system reset interrupt) */ /* * Entered with MSR[EE]=0 and no soft-masked interrupts pending. -- 2.13.3
[PATCH v3 1/4] KVM: PPC: Book3S HV: POWER9 does not require secondary thread management
POWER9 CPUs have independent MMU contexts per thread, so KVM does not need to quiesce secondary threads, so the hwthread_req/hwthread_state protocol does not have to be used. So patch it away on POWER9, and patch away the branch from the Linux idle wakeup to kvm_start_guest that is never used. Add a warning and error out of kvmppc_grab_hwthread in case it is ever called on POWER9. This avoids a hwsync in the idle wakeup path on POWER9. Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/kvm_book3s_asm.h | 4 arch/powerpc/kernel/idle_book3s.S | 35 +-- arch/powerpc/kvm/book3s_hv.c | 14 - arch/powerpc/kvm/book3s_hv_rmhandlers.S | 8 +++ 4 files changed, 49 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 7cea76f11c26..83596f32f50b 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -104,6 +104,10 @@ struct kvmppc_host_state { u8 napping; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* +* hwthread_req/hwthread_state pair is used to pull sibling threads +* out of guest on pre-ISAv3.0B CPUs where threads share MMU. +*/ u8 hwthread_req; u8 hwthread_state; u8 host_ipi; diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index bfbf0976fc09..4924647d964d 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -296,13 +296,20 @@ enter_winkle: /* * r3 - PSSCR value corresponding to the requested stop state. */ -power_enter_stop: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - /* Tell KVM we're entering idle */ +power_enter_stop_kvm_rm: + /* +* This is currently unused because POWER9 KVM does not have to +* gather secondary threads into sibling mode, but the code is +* here in case that function is required. +* +* Tell KVM we're entering idle. +*/ li r4,KVM_HWTHREAD_IN_IDLE /* DO THIS IN REAL MODE! See comment above. 
*/ stb r4,HSTATE_HWTHREAD_STATE(r13) #endif +power_enter_stop: /* * Check if we are executing the lite variant with ESL=EC=0 */ @@ -465,6 +472,18 @@ pnv_powersave_wakeup_mce: b pnv_powersave_wakeup +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +kvm_start_guest_check: + li r0,KVM_HWTHREAD_IN_KERNEL + stb r0,HSTATE_HWTHREAD_STATE(r13) + /* Order setting hwthread_state vs. testing hwthread_req */ + sync + lbz r0,HSTATE_HWTHREAD_REQ(r13) + cmpwi r0,0 + beqlr + b kvm_start_guest +#endif + /* * Called from reset vector for powersave wakeups. * cr3 - set to gt if waking up with partial/complete hypervisor state loss @@ -489,15 +508,9 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) mr r3,r12 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - li r0,KVM_HWTHREAD_IN_KERNEL - stb r0,HSTATE_HWTHREAD_STATE(r13) - /* Order setting hwthread_state vs. testing hwthread_req */ - sync - lbz r0,HSTATE_HWTHREAD_REQ(r13) - cmpwi r0,0 - beq 1f - b kvm_start_guest -1: +BEGIN_FTR_SECTION + bl kvm_start_guest_check +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #endif /* Return SRR1 from power7_nap() */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 359c79cdf0cc..e34cd6fb947b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2111,6 +2111,16 @@ static int kvmppc_grab_hwthread(int cpu) struct paca_struct *tpaca; long timeout = 1; + /* +* ISA v3.0 idle routines do not set hwthread_state or test +* hwthread_req, so they can not grab idle threads. 
+*/ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + WARN_ON(1); + pr_err("KVM: can not control sibling threads\n"); + return -EBUSY; + } + tpaca = &paca[cpu]; /* Ensure the thread won't go into the kernel if it wakes */ @@ -2145,10 +2155,12 @@ static void kvmppc_release_hwthread(int cpu) struct paca_struct *tpaca; tpaca = &paca[cpu]; - tpaca->kvm_hstate.hwthread_req = 0; tpaca->kvm_hstate.kvm_vcpu = NULL; tpaca->kvm_hstate.kvm_vcore = NULL; tpaca->kvm_hstate.kvm_split_mode = NULL; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + tpaca->kvm_hstate.hwthread_req = 0; + } static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index c52184a8efdf..3e024fd71fe8 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -149,9 +149,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
[PATCH v3 0/4] powerpc/64s: idle POWER9 stop improvements
These are rebased patches leftover from the unmerged bit of the idle series. Based on feedback, I dropped one of the KVM patches, and reworked the code a bit so it is easier to restore the ability for KVM to grab secondaries into real mode. I did a bit more benchmarking, and all up these patches improve 2 CPU ping-pong context switch benchmark on a POWER9 by around 4-6% (depending on what CPUs and idle states are used). Nicholas Piggin (4): KVM: PPC: Book3S HV: POWER9 does not require secondary thread management powerpc/64s: idle POWER9 can execute stop without a sync sequence powerpc/64s: idle POWER9 can execute stop in virtual mode powerpc/64s: idle ESL=0 stop can avoid MSR and save/restore overhead arch/powerpc/include/asm/cpuidle.h| 16 - arch/powerpc/include/asm/kvm_book3s_asm.h | 4 ++ arch/powerpc/kernel/idle_book3s.S | 103 ++ arch/powerpc/kvm/book3s_hv.c | 14 +++- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 8 +++ 5 files changed, 101 insertions(+), 44 deletions(-) -- 2.13.3
Re: [PATCH v7 04/12] powerpc/vas: Define helpers to access MMIO regions
Hi Suka, Comments inline. Sukadev Bhattiprolu writes: > diff --git a/arch/powerpc/platforms/powernv/vas-window.c > b/arch/powerpc/platforms/powernv/vas-window.c > index 6156fbe..a3a705a 100644 > --- a/arch/powerpc/platforms/powernv/vas-window.c > +++ b/arch/powerpc/platforms/powernv/vas-window.c > @@ -9,9 +9,182 @@ > > #include > #include > +#include > +#include > > #include "vas.h" > > +/* > + * Compute the paste address region for the window @window using the > + * ->paste_base_addr and ->paste_win_id_shift we got from device tree. > + */ > +void compute_paste_address(struct vas_window *window, uint64_t *addr, int > *len) > +{ > + uint64_t base, shift; Please use the kernel types, so u64 here. > + int winid; > + > + base = window->vinst->paste_base_addr; > + shift = window->vinst->paste_win_id_shift; > + winid = window->winid; > + > + *addr = base + (winid << shift); > + if (len) > + *len = PAGE_SIZE; Having multiple output parameters makes for a pretty awkward API. Is it really necessary given len is a constant PAGE_SIZE anyway? If you didn't return len, then you could just make the function return the addr, and you wouldn't need any output parameters. One of the callers that passes len is unmap_paste_region(), but that is a bit odd. It would be more natural I think if once a window is mapped it knows its size. Or if the mapping will always just be one page then we can just know that. > + > + pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr); > +} > + > +static inline void get_hvwc_mmio_bar(struct vas_window *window, > + uint64_t *start, int *len) > +{ > + uint64_t pbaddr; > + > + pbaddr = window->vinst->hvwc_bar_start; > + *start = pbaddr + window->winid * VAS_HVWC_SIZE; > + *len = VAS_HVWC_SIZE; This is: #define VAS_HVWC_SIZE 512 But then we map it, which will round up to a page anyway. So again I don't see the point of having the len returned from this helper.
> +} > + > +static inline void get_uwc_mmio_bar(struct vas_window *window, > + uint64_t *start, int *len) > +{ > + uint64_t pbaddr; > + > + pbaddr = window->vinst->uwc_bar_start; > + *start = pbaddr + window->winid * VAS_UWC_SIZE; > + *len = VAS_UWC_SIZE; > +} > + > +/* > + * Map the paste bus address of the given send window into kernel address > + * space. Unlike MMIO regions (map_mmio_region() below), paste region must > + * be mapped cache-able and is only applicable to send windows. > + */ > +void *map_paste_region(struct vas_window *txwin) > +{ > + int rc, len; > + void *map; > + char *name; > + uint64_t start; > + > + rc = -ENOMEM; You don't need that. > + name = kasprintf(GFP_KERNEL, "window-v%d-w%d", txwin->vinst->vas_id, > + txwin->winid); > + if (!name) > + return ERR_PTR(rc); That can goto free_name; > + > + txwin->paste_addr_name = name; > + compute_paste_address(txwin, &start, &len); > + > + if (!request_mem_region(start, len, name)) { > + pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n", > + __func__, start, len); > + goto free_name; > + } > + > + map = ioremap_cache(start, len); > + if (!map) { > + pr_devel("%s(): ioremap_cache(0x%llx, %d) failed\n", __func__, > + start, len); > + goto free_name; > + } > + > + pr_devel("VAS: mapped paste addr 0x%llx to kaddr 0x%p\n", start, map); > + return map; > + > +free_name: > + kfree(name); Because kfree(NULL) is fine. > + return ERR_PTR(rc); And that can just return ERR_PTR(-ENOMEM); > +} cheers
RE: [PATCH really v2] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()
On 24.08.2017 11:14, Paul Mackerras wrote: > Nixiaoming pointed out that there is a memory leak in > kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd() > fails; the memory allocated for the kvmppc_spapr_tce_table struct is > not freed, and nor are the pages allocated for the iommu tables. In > addition, we have already incremented the process's count of locked > memory pages, and this doesn't get restored on error. > > David Hildenbrand pointed out that there is a race in that the > function checks early on that there is not already an entry in the > stt->iommu_tables list with the same LIOBN, but an entry with the > same LIOBN could get added between then and when the new entry is > added to the list. > > This fixes all three problems. To simplify things, we now call > anon_inode_getfd() before placing the new entry in the list. The > check for an existing entry is done while holding the kvm->lock mutex, > immediately before adding the new entry to the list. > Finally, on failure we now call kvmppc_account_memlimit to decrement > the process's count of locked memory pages. > > Reported-by: Nixiaoming > Reported-by: David Hildenbrand > Signed-off-by: Paul Mackerras > --- > v2: Don't overwrite stt in loop over spapr_tce_tables > Reviewed-by: nixiaoming
Re: [PATCH] powerpc: powernv: Fix build error on const discarding
Corentin Labbe writes: > On Thu, Aug 17, 2017 at 10:52:11PM +1000, Michael Ellerman wrote: >> Corentin Labbe writes: >> >> > When building a random powerpc kernel I hit this build error: >> > CC arch/powerpc/platforms/powernv/opal-imc.o >> > arch/powerpc/platforms/powernv/opal-imc.c: In function « >> > disable_nest_pmu_counters »: >> > arch/powerpc/platforms/powernv/opal-imc.c:130:13: error : assignment >> > discards « const » qualifier from pointer target type >> > [-Werror=discarded-qualifiers] >> >l_cpumask = cpumask_of_node(nid); >> > ^ >> > This patch simply adds const to l_cpumask to fix this issue. >> >> Thanks. I'm not sure why we haven't seen that. >> >> Do you mind attaching your .config ? >> >> cheers > > Yes Thanks. So the key is: > CONFIG_PPC_POWERNV=y ... > # CONFIG_NUMA is not set Which none of our configs have. I'll add a test build of that. Thanks. cheers
Re: [PATCH 4/4] powerpc/32: remove a NOP from memset()
Christophe LEROY writes: > Le 24/08/2017 à 12:51, Michael Ellerman a écrit : >> Christophe Leroy writes: >> >>> memset() is patched after initialisation to activate the >>> optimised part which uses cache instructions. >>> >>> Today we have a 'b 2f' to skip the optimised patch, which then gets >>> replaced by a NOP, implying a useless cycle consumption. >>> As we have a 'bne 2f' just before, we could use that instruction >>> for the live patching, hence removing the need to have a >>> dedicated 'b 2f' to be replaced by a NOP. >>> >>> This patch changes the 'bne 2f' by a 'b 2f'. During init, that >>> 'b 2f' is then replaced by 'bne 2f' >> >> I'm not sure what the sequence is during boot for the 32-bit code, but >> can you use an ALT_FTR section for this? Possibly that doesn't get done >> at the right time though. > > Unfortunately, as we discussed in 2015 > (https://lkml.org/lkml/2015/9/10/608), Haha, you expect me to remember things I said then! ;) > the ALT_FTR does things too early, while the cache is not enabled yet. OK. Ben did do some reworks to the early init since then, but I don't think he changed that. I notice we do setup_feature_keys() in machine_init(), which is the jump label equivalent of apply_feature_fixups(). So I wonder if we could actually move apply_feature_fixups() to there. But it would need some serious review. cheers
Re: [PATCH v2 2/5] powerpc: pseries: vio: match parent nodes with of_find_node_by_path
Rob Herring writes: > On Tue, Aug 22, 2017 at 12:12 AM, Michael Ellerman > wrote: >> Rob Herring writes: >> >>> In preparation to remove the full path from device_node.full_name, use >>> of_find_node_by_path instead of open coding with strcmp. >>> >>> Signed-off-by: Rob Herring >>> Cc: Benjamin Herrenschmidt >>> Cc: Paul Mackerras >>> Cc: Michael Ellerman >>> Cc: linuxppc-dev@lists.ozlabs.org >>> --- >>> v2: >>> - rebased to linux-next and removed spurious change fro patch 1. >>> >>> arch/powerpc/platforms/pseries/vio.c | 4 ++-- >>> 1 file changed, 2 insertions(+), 2 deletions(-) >>> >>> diff --git a/arch/powerpc/platforms/pseries/vio.c >>> b/arch/powerpc/platforms/pseries/vio.c >>> index aa5ca74316fa..5754572deb23 100644 >>> --- a/arch/powerpc/platforms/pseries/vio.c >>> +++ b/arch/powerpc/platforms/pseries/vio.c >>> @@ -1357,9 +1357,9 @@ struct vio_dev *vio_register_device_node(struct >>> device_node *of_node) >>>*/ >>> parent_node = of_get_parent(of_node); >>> if (parent_node) { >>> - if (!strcmp(parent_node->full_name, >>> "/ibm,platform-facilities")) >>> + if (parent_node == >>> of_find_node_by_path("/ibm,platform-facilities")) >>> family = PFO; >>> - else if (!strcmp(parent_node->full_name, "/vdevice")) >>> + else if (parent_node == of_find_node_by_path("/vdevice")) >>> family = VDEVICE; >> >> This leaks references to the looked up nodes. >> >> Both these nodes are defined in PAPR (our hypervisor spec), and both of >> them must have a device_type, either "ibm,platform-facilities" or >> "vdevice". >> >> Looking at the commit that added the code I don't see any particular >> reason it used the comparison against full_name, rather than using the >> device_type. 
>> >> So I'm inclined to do this instead: >> >> diff --git a/arch/powerpc/platforms/pseries/vio.c >> b/arch/powerpc/platforms/pseries/vio.c >> index 8a47f168476b..f26f906e6021 100644 >> --- a/arch/powerpc/platforms/pseries/vio.c >> +++ b/arch/powerpc/platforms/pseries/vio.c >> @@ -1357,9 +1357,9 @@ struct vio_dev *vio_register_device_node(struct >> device_node *of_node) >> */ >> parent_node = of_get_parent(of_node); >> if (parent_node) { >> - if (!strcmp(parent_node->full_name, >> "/ibm,platform-facilities")) >> + if (!strcmp(parent_node->type, "ibm,platform-facilities")) >> family = PFO; >> - else if (!strcmp(parent_node->full_name, "/vdevice")) >> + else if (!strcmp(parent_node->type, "vdevice")) >> family = VDEVICE; >> else { >> pr_warn("%s: parent(%s) of %s not recognized.\n", >> >> >> I've checked both Qemu and kvmtool add the device_type, and I'm fairly >> confident that PowerVM does too. Anyway I'll test it on all the machines >> I can find. > > Okay, do you want me to respin the patch or will you update it with I merged it. Should be in next today. cheers
[PATCH V10 2/2] powerpc/nodes: Ensure enough nodes avail for operations
From: Michael Bringmann To: linuxppc-dev@lists.ozlabs.org To: linux-ker...@vger.kernel.org Cc: Michael Ellerman Cc: Michael Bringmann Cc: John Allen Cc: Nathan Fontenot Subject: [PATCH V10 2/2] powerpc/nodes: Ensure enough nodes avail for operations powerpc/nodes: On systems like PowerPC which allow 'hot-add' of CPU or memory resources, it may occur that the new resources are to be inserted into nodes that were not used for these resources at bootup. In the kernel, any node that is used must be defined and initialized at boot. This patch extracts the value of the 'min_common_depth' element from the "rtas" device tree property "ibm,max-associativity-domains" to use as the maximum number of nodes to setup as possibly available in the system. [The 'min_common_depth' element is calculated from memory associations found while loading all of the configured memory into the system data structures at boot.] This new setting will override the instruction, nodes_and(node_possible_map, node_possible_map, node_online_map); presently seen in the function arch/powerpc/mm/numa.c:initmem_init(). If the property is not present at boot, no operation will be performed to define or enable additional nodes. Signed-off-by: Michael Bringmann --- Changes in V10: -- Try to use 'min_common_depth' from NUMA initialization to select domain level to use for maximum nodes. 
--- arch/powerpc/mm/numa.c | 44 1 file changed, 44 insertions(+) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 73427e290..841d3b6 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -896,6 +896,48 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) NODE_DATA(nid)->node_spanned_pages = spanned_pages; } +static void __init node_associativity_setup(void) +{ + struct device_node *rtas; + + rtas = of_find_node_by_path("/rtas"); + if (rtas) { + const __be32 *prop; + u32 len, entries, numnodes, i; + + prop = of_get_property(rtas, "ibm,max-associativity-domains", &len); + if (!prop || len < sizeof(unsigned int)) + goto endit; + + entries = of_read_number(prop++, 1); + + if (len < (entries * sizeof(unsigned int))) + goto endit; + + dbg("numa: Debug: Entries = %d MCD = %d\n", entries, min_common_depth); + + if ((0 <= min_common_depth) && (min_common_depth <= (entries-1))) + entries = min_common_depth; + else + entries -= 1; + + numnodes = of_read_number(&prop[entries], 1); + + printk(KERN_INFO "numa: Nodes = %d\n", numnodes); + + for (i = 0; i < numnodes; i++) { + if (!node_possible(i)) { + setup_node_data(i, 0, 0); + node_set(i, node_possible_map); + } + } + } + +endit: + if (rtas) + of_node_put(rtas); +} + void __init initmem_init(void) { int nid, cpu; @@ -915,6 +957,8 @@ void __init initmem_init(void) */ nodes_and(node_possible_map, node_possible_map, node_online_map); + node_associativity_setup(); + for_each_online_node(nid) { unsigned long start_pfn, end_pfn;
[PATCH V10 1/2] powerpc/numa: Update CPU topology when VPHN enabled
powerpc/numa: Correct the currently broken capability to set the topology for shared CPUs in LPARs. At boot time for shared CPU lpars, the topology for each shared CPU is set to node zero, however, this is now updated correctly using the Virtual Processor Home Node (VPHN) capabilities information provided by the pHyp. Also, update initialization checks for device-tree attributes to independently recognize PRRN or VPHN usage. Finally, try to distinguish the VPHN code from the NUMA code better, and move relevant functions to another file. Signed-off-by: Michael Bringmann --- Changes in V10: -- Reorganize VPHN code to distinguish it from NUMA processing --- arch/powerpc/include/asm/topology.h |8 arch/powerpc/mm/numa.c | 503 -- arch/powerpc/mm/vphn.c | 586 ++ arch/powerpc/mm/vphn.h |4 arch/powerpc/platforms/pseries/hotplug-cpu.c |2 5 files changed, 609 insertions(+), 494 deletions(-) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index dc4e159..600e1c6 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -98,6 +98,14 @@ static inline int prrn_is_enabled(void) } #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */ +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_NEED_MULTIPLE_NODES) +#if defined(CONFIG_PPC_SPLPAR) +extern int timed_topology_update(int nsecs); +#else +#definetimed_topology_update(nsecs)0 +#endif /* CONFIG_PPC_SPLPAR */ +#endif /* CONFIG_HOTPLUG_CPU || CONFIG_NEED_MULTIPLE_NODES */ + #include #ifdef CONFIG_SMP diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index b95c584..73427e290 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -41,8 +42,12 @@ #include #include +#include "vphn.h" + static int numa_enabled = 1; +bool topology_updates_enabled = true; + static char *cmdline __initdata; static int numa_debug; @@ -60,8 +65,7 @@ static int n_mem_addr_cells, 
n_mem_size_cells; static int form1_affinity; -#define MAX_DISTANCE_REF_POINTS 4 -static int distance_ref_points_depth; +int distance_ref_points_depth; static const __be32 *distance_ref_points; static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; @@ -142,12 +146,12 @@ static void reset_numa_cpu_lookup_table(void) numa_cpu_lookup_table[cpu] = -1; } -static void update_numa_cpu_lookup_table(unsigned int cpu, int node) +void update_numa_cpu_lookup_table(unsigned int cpu, int node) { numa_cpu_lookup_table[cpu] = node; } -static void map_cpu_to_node(int cpu, int node) +void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); @@ -158,7 +162,7 @@ static void map_cpu_to_node(int cpu, int node) } #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) -static void unmap_cpu_from_node(unsigned long cpu) +void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; @@ -233,7 +237,7 @@ static void initialize_distance_lookup_table(int nid, /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa * info is found. 
*/ -static int associativity_to_nid(const __be32 *associativity) +int associativity_to_nid(const __be32 *associativity) { int nid = -1; @@ -957,8 +961,6 @@ static int __init early_numa(char *p) } early_param("numa", early_numa); -static bool topology_updates_enabled = true; - static int __init early_topology_updates(char *p) { if (!p) @@ -1135,488 +1137,3 @@ u64 memory_hotplug_max(void) return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM()); } #endif /* CONFIG_MEMORY_HOTPLUG */ - -/* Virtual Processor Home Node (VPHN) support */ -#ifdef CONFIG_PPC_SPLPAR - -#include "vphn.h" - -struct topology_update_data { - struct topology_update_data *next; - unsigned int cpu; - int old_nid; - int new_nid; -}; - -static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS]; -static cpumask_t cpu_associativity_changes_mask; -static int vphn_enabled; -static int prrn_enabled; -static void reset_topology_timer(void); - -/* - * Store the current values of the associativity change counters in the - * hypervisor. - */ -static void setup_cpu_associativity_change_counters(void) -{ - int cpu; - - /* The VPHN feature supports a maximum of 8 reference points */ - BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8); - - for_each_possible_cpu(cpu) { - int i; - u8 *counts = vphn_cpu_change_counts[cpu]; - volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; - - for (i = 0; i < distance_ref_points_depth; i++) - counts[i] = hypervisor_counts[i]; - } -} - -/* - *
[PATCH V10 0/2] powerpc/dlpar: Correct display of hot-add/hot-remove CPUs and memory
From: m...@linux.vnet.ibm.com To: linuxppc-dev@lists.ozlabs.org, linux-ker...@vger.kernel.org Cc: nf...@linux.vnet.ibm.com Cc: m...@linux.vnet.ibm.com Subject: [PATCH V10 0/2] powerpc/dlpar: Correct display of hot-add/hot-remove CPUs and memory On Power systems with shared configurations of CPUs and memory, there are some issues with association of additional CPUs and memory to nodes when hot-adding resources. These patches address some of those problems. powerpc/numa: Correct the currently broken capability to set the topology for shared CPUs in LPARs. At boot time for shared CPU lpars, the topology for each shared CPU is set to node zero, however, this is now updated correctly using the Virtual Processor Home Node (VPHN) capabilities information provided by the pHyp. The VPHN handling in Linux is disabled, if PRRN handling is present. Also, update initialization checks for device-tree attributes to independently recognize PRRN or VPHN usage. Finally, try to distinguish the VPHN code from the NUMA code better, and move relevant functions to another file. powerpc/nodes: On systems like PowerPC which allow 'hot-add' of CPU or memory resources, it may occur that the new resources are to be inserted into nodes that were not used for these resources at bootup. In the kernel, any node that is used must be defined and initialized at boot. This patch extracts the value of the 'min_common_depth' element from the "rtas" device tree property "ibm,max-associativity-domains" to use as the maximum number of nodes to setup as possibly available in the system. [The 'min_common_depth' element is calculated from memory associations found while loading all of the configured memory into the system data structures at boot.] This new setting will override the instruction, nodes_and(node_possible_map, node_possible_map, node_online_map); presently seen in the function arch/powerpc/mm/numa.c:initmem_init(). 
If the property is not present at boot, no operation will be performed to define or enable additional nodes. Signed-off-by: Michael Bringmann Michael Bringmann (2): powerpc/numa: Update CPU topology when VPHN enabled powerpc/nodes: Ensure enough nodes avail for operations --- Changes in V10: -- Reorganize VPHN code -- Revise index used with property "ibm,max-associativity-domains"
Re: [PATCH v7 03/12] powerpc/vas: Define vas_init() and vas_exit()
Michael Ellerman [m...@ellerman.id.au] wrote: > Hi Suka, > > Comments inline ... > > Sukadev Bhattiprolu writes: > > diff --git a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt > > b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt > > new file mode 100644 > > index 000..0e3111d > > --- /dev/null > > +++ b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt > > @@ -0,0 +1,24 @@ > > +* IBM Powerpc Virtual Accelerator Switchboard (VAS) > > + > > +VAS is a hardware mechanism that allows kernel subsystems and user > > processes > > +to directly submit compression and other requests to Nest accelerators (NX) > > +or other coprocessors functions. > > + > > +Required properties: > > +- compatible : should be "ibm,vas" or "ibm,power9-vas" > > The driver doesn't look for the latter. Ok. I have removed it from this list of required properties > > > +- ibm,vas-id : A unique identifier for each instance of VAS in the system > > What is this? Like the ibm,chip-id, but in the future, there could be more than one instance of VAS per chip, so firmware assigns a unique id to each instance of VAS. > > > +- reg : Should contain 4 pairs of 64-bit fields specifying the Hypervisor > > + window context start and length, OS/User window context start and length, > > + "Paste address" start and length, "Paste window id" start bit and number > > + of bits) > > +- name : "vas" > > I don't think the name is normally included in the binding, and in fact > there's no reason why the name is important, so I'd be inclined to drop that. Ok. I dropped it. 
> > > diff --git a/MAINTAINERS b/MAINTAINERS > > index 3c41902..abc235f 100644 > > --- a/MAINTAINERS > > +++ b/MAINTAINERS > > @@ -6425,6 +6425,14 @@ F: drivers/crypto/nx/nx.* > > F: drivers/crypto/nx/nx_csbcpb.h > > F: drivers/crypto/nx/nx_debugfs.h > > > > +IBM Power Virtual Accelerator Switchboard > > +M: Sukadev Bhattiprolu > > +L: linuxppc-dev@lists.ozlabs.org > > +S: Supported > > +F: arch/powerpc/platforms/powernv/vas* > > +F: arch/powerpc/include/asm/vas.h > > +F: arch/powerpc/include/uapi/asm/vas.h > > That's not in the right place, the file is sorted alphabetically. ah, fixed. > > V comes after L. > > > diff --git a/arch/powerpc/platforms/powernv/Kconfig > > b/arch/powerpc/platforms/powernv/Kconfig > > index 6a6f4ef..f565454 100644 > > --- a/arch/powerpc/platforms/powernv/Kconfig > > +++ b/arch/powerpc/platforms/powernv/Kconfig > > @@ -30,3 +30,17 @@ config OPAL_PRD > > help > > This enables the opal-prd driver, a facility to run processor > > recovery diagnostics on OpenPower machines > > + > > +config PPC_VAS > > + bool "IBM Virtual Accelerator Switchboard (VAS)" > > ^ bool, so never a module. yes, it should be built in. > > > + depends on PPC_POWERNV && PPC_64K_PAGES > > + default n > > It should be default y. > > I know the usual advice is to make things 'default n', but this has > fairly tight depends already, so y is OK IMO. Ok. > > > diff --git a/arch/powerpc/platforms/powernv/vas.c > > b/arch/powerpc/platforms/powernv/vas.c > > new file mode 100644 > > index 000..556156b > > --- /dev/null > > +++ b/arch/powerpc/platforms/powernv/vas.c > > @@ -0,0 +1,183 @@ > > +/* > > + * Copyright 2016 IBM Corp. > > 2016-2017. Ok. > > > + * > > + * This program is free software; you can redistribute it and/or > > + * modify it under the terms of the GNU General Public License > > + * as published by the Free Software Foundation; either version > > + * 2 of the License, or (at your option) any later version. 
> > + */ > > #define pr_fmt(fmt) "vas: " fmt Ok > > > +#include > > +#include > > +#include > > +#include > > +#include > > +#include > > +#include > > +#include > > +#include > > + > > +#include "vas.h" > > + > > +static bool init_done; > > +LIST_HEAD(vas_instances); > > Can be static. Yes > > > + > > +static int init_vas_instance(struct platform_device *pdev) > > +{ > > + int rc, vasid; > > + struct vas_instance *vinst; > > + struct device_node *dn = pdev->dev.of_node; > > + struct resource *res; > > struct device_node *dn = pdev->dev.of_node; > struct vas_instance *vinst; > struct resource *res; > int rc, vasid; > > Petty I know, but much prettier :) I usually go the opposite way (shortest first) so I have done that here also. For newer files I will invert the tree. > > > + > > + rc = of_property_read_u32(dn, "ibm,vas-id", &vasid); > > + if (rc) { > > + pr_err("VAS: No ibm,vas-id property for %s?\n", pdev->name); > > With the pr_fmt() above you don't need VAS: on the front of all these. Ok > > > + return -ENODEV; > > + } > > + > > + if (pdev->num_resources != 4) { > > + pr_err("VAS: Unexpected DT configuration for [%s, %d]\n", > > + pdev->name, vasid); > > + return -ENODEV; > > + } > > + > > + vinst = kcalloc(1, sizeof(*vinst), GFP_KERNEL); > > kzalloc() would be
[v4 11/11] fsl/soc/qbman: Enable FSL_LAYERSCAPE config on ARM
From: Madalin Bucur Signed-off-by: Madalin Bucur Signed-off-by: Claudiu Manoil [Stuart: changed to use ARCH_LAYERSCAPE] Signed-off-by: Stuart Yoder Signed-off-by: Roy Pledge --- drivers/soc/fsl/qbman/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/fsl/qbman/Kconfig b/drivers/soc/fsl/qbman/Kconfig index 757033c..fb4e6bf 100644 --- a/drivers/soc/fsl/qbman/Kconfig +++ b/drivers/soc/fsl/qbman/Kconfig @@ -1,6 +1,6 @@ menuconfig FSL_DPAA bool "Freescale DPAA 1.x support" - depends on FSL_SOC_BOOKE + depends on (FSL_SOC_BOOKE || ARCH_LAYERSCAPE) select GENERIC_ALLOCATOR help The Freescale Data Path Acceleration Architecture (DPAA) is a set of -- 2.7.4
[v4 10/11] soc/fsl/qbman: Add missing headers on ARM
From: Claudiu Manoil Unlike PPC builds, ARM builds need the following headers explicitly: +#include <linux/io.h> for ioread32be() +#include <linux/delay.h> for udelay() Signed-off-by: Claudiu Manoil Signed-off-by: Roy Pledge --- drivers/soc/fsl/qbman/dpaa_sys.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/soc/fsl/qbman/dpaa_sys.h b/drivers/soc/fsl/qbman/dpaa_sys.h index 0a1d573..8ec6a78 100644 --- a/drivers/soc/fsl/qbman/dpaa_sys.h +++ b/drivers/soc/fsl/qbman/dpaa_sys.h @@ -44,6 +44,8 @@ #include #include #include +#include +#include /* For 2-element tables related to cache-inhibited and cache-enabled mappings */ #define DPAA_PORTAL_CE 0 -- 2.7.4
[v4 09/11] soc/fsl/qbman: different register offsets on ARM
From: Madalin Bucur Signed-off-by: Madalin Bucur Signed-off-by: Claudiu Manoil Signed-off-by: Roy Pledge --- drivers/soc/fsl/qbman/bman.c | 22 ++ drivers/soc/fsl/qbman/qman.c | 38 ++ 2 files changed, 60 insertions(+) diff --git a/drivers/soc/fsl/qbman/bman.c b/drivers/soc/fsl/qbman/bman.c index e31c843..265048d 100644 --- a/drivers/soc/fsl/qbman/bman.c +++ b/drivers/soc/fsl/qbman/bman.c @@ -35,6 +35,27 @@ /* Portal register assists */ +#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) +/* Cache-inhibited register offsets */ +#define BM_REG_RCR_PI_CINH 0x3000 +#define BM_REG_RCR_CI_CINH 0x3100 +#define BM_REG_RCR_ITR 0x3200 +#define BM_REG_CFG 0x3300 +#define BM_REG_SCN(n) (0x3400 + ((n) << 6)) +#define BM_REG_ISR 0x3e00 +#define BM_REG_IER 0x3e40 +#define BM_REG_ISDR0x3e80 +#define BM_REG_IIR 0x3ec0 + +/* Cache-enabled register offsets */ +#define BM_CL_CR 0x +#define BM_CL_RR0 0x0100 +#define BM_CL_RR1 0x0140 +#define BM_CL_RCR 0x1000 +#define BM_CL_RCR_PI_CENA 0x3000 +#define BM_CL_RCR_CI_CENA 0x3100 + +#else /* Cache-inhibited register offsets */ #define BM_REG_RCR_PI_CINH 0x #define BM_REG_RCR_CI_CINH 0x0004 @@ -53,6 +74,7 @@ #define BM_CL_RCR 0x1000 #define BM_CL_RCR_PI_CENA 0x3000 #define BM_CL_RCR_CI_CENA 0x3100 +#endif /* * Portal modes. 
diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c index 668fab1..fdd4c65 100644 --- a/drivers/soc/fsl/qbman/qman.c +++ b/drivers/soc/fsl/qbman/qman.c @@ -41,6 +41,43 @@ /* Portal register assists */ +#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) +/* Cache-inhibited register offsets */ +#define QM_REG_EQCR_PI_CINH0x3000 +#define QM_REG_EQCR_CI_CINH0x3040 +#define QM_REG_EQCR_ITR0x3080 +#define QM_REG_DQRR_PI_CINH0x3100 +#define QM_REG_DQRR_CI_CINH0x3140 +#define QM_REG_DQRR_ITR0x3180 +#define QM_REG_DQRR_DCAP 0x31C0 +#define QM_REG_DQRR_SDQCR 0x3200 +#define QM_REG_DQRR_VDQCR 0x3240 +#define QM_REG_DQRR_PDQCR 0x3280 +#define QM_REG_MR_PI_CINH 0x3300 +#define QM_REG_MR_CI_CINH 0x3340 +#define QM_REG_MR_ITR 0x3380 +#define QM_REG_CFG 0x3500 +#define QM_REG_ISR 0x3600 +#define QM_REG_IER 0x3640 +#define QM_REG_ISDR0x3680 +#define QM_REG_IIR 0x36C0 +#define QM_REG_ITPR0x3740 + +/* Cache-enabled register offsets */ +#define QM_CL_EQCR 0x +#define QM_CL_DQRR 0x1000 +#define QM_CL_MR 0x2000 +#define QM_CL_EQCR_PI_CENA 0x3000 +#define QM_CL_EQCR_CI_CENA 0x3040 +#define QM_CL_DQRR_PI_CENA 0x3100 +#define QM_CL_DQRR_CI_CENA 0x3140 +#define QM_CL_MR_PI_CENA 0x3300 +#define QM_CL_MR_CI_CENA 0x3340 +#define QM_CL_CR 0x3800 +#define QM_CL_RR0 0x3900 +#define QM_CL_RR1 0x3940 + +#else /* Cache-inhibited register offsets */ #define QM_REG_EQCR_PI_CINH0x #define QM_REG_EQCR_CI_CINH0x0004 @@ -75,6 +112,7 @@ #define QM_CL_CR 0x3800 #define QM_CL_RR0 0x3900 #define QM_CL_RR1 0x3940 +#endif /* * BTW, the drivers (and h/w programming model) already obtain the required -- 2.7.4
[v4 07/11] soc/fsl/qbman: Rework portal mapping calls for ARM/PPC
Rework portal mapping for PPC and ARM. The PPC devices require a cacheable coherent mapping while ARM will work with a non-cacheable/write combine mapping. This also eliminates the need for manual cache flushes on ARM. Signed-off-by: Roy Pledge --- drivers/soc/fsl/qbman/bman.c| 6 +++--- drivers/soc/fsl/qbman/bman_portal.c | 36 +++- drivers/soc/fsl/qbman/bman_priv.h | 8 +++1 drivers/soc/fsl/qbman/dpaa_sys.h| 8 drivers/soc/fsl/qbman/qman.c| 6 +++--- drivers/soc/fsl/qbman/qman_portal.c | 36 +++- drivers/soc/fsl/qbman/qman_priv.h | 8 +++1 7 files changed, 62 insertions(+), 46 deletions(-) diff --git a/drivers/soc/fsl/qbman/bman.c b/drivers/soc/fsl/qbman/bman.c index ff8998f..e31c843 100644 --- a/drivers/soc/fsl/qbman/bman.c +++ b/drivers/soc/fsl/qbman/bman.c @@ -154,7 +154,7 @@ struct bm_mc { }; struct bm_addr { - void __iomem *ce; /* cache-enabled */ + void *ce; /* cache-enabled */ void __iomem *ci; /* cache-inhibited */ }; @@ -512,8 +512,8 @@ static int bman_create_portal(struct bman_portal *portal, * config, everything that follows depends on it and "config" is more * for (de)reference... 
*/ - p->addr.ce = c->addr_virt[DPAA_PORTAL_CE]; - p->addr.ci = c->addr_virt[DPAA_PORTAL_CI]; + p->addr.ce = c->addr_virt_ce; + p->addr.ci = c->addr_virt_ci; if (bm_rcr_init(p, bm_rcr_pvb, bm_rcr_cce)) { dev_err(c->dev, "RCR initialisation failed\n"); goto fail_rcr; diff --git a/drivers/soc/fsl/qbman/bman_portal.c b/drivers/soc/fsl/qbman/bman_portal.c index 39b39c8..bb03503 100644 --- a/drivers/soc/fsl/qbman/bman_portal.c +++ b/drivers/soc/fsl/qbman/bman_portal.c @@ -91,7 +91,6 @@ static int bman_portal_probe(struct platform_device *pdev) struct device_node *node = dev->of_node; struct bm_portal_config *pcfg; struct resource *addr_phys[2]; - void __iomem *va; int irq, cpu; pcfg = devm_kmalloc(dev, sizeof(*pcfg), GFP_KERNEL); @@ -123,23 +122,34 @@ static int bman_portal_probe(struct platform_device *pdev) } pcfg->irq = irq; - va = ioremap_prot(addr_phys[0]->start, resource_size(addr_phys[0]), 0); - if (!va) { - dev_err(dev, "ioremap::CE failed\n"); + /* +* TODO: Ultimately we would like to use a cacheable/non-shareable +* (coherent) mapping for the portal on both architectures but that +* isn't currently available in the kernel. Because of HW differences +* PPC needs to be mapped cacheable while ARM SoCs will work with non +* cacheable mappings +*/ +#ifdef CONFIG_PPC + /* PPC requires a cacheable/non-coherent mapping of the portal */ + pcfg->addr_virt_ce = memremap(addr_phys[0]->start, + resource_size(addr_phys[0]), MEMREMAP_WB); +#else + /* ARM can use a write combine mapping. 
*/ + pcfg->addr_virt_ce = memremap(addr_phys[0]->start, + resource_size(addr_phys[0]), MEMREMAP_WC); +#endif + if (!pcfg->addr_virt_ce) { + dev_err(dev, "memremap::CE failed\n"); goto err_ioremap1; } - pcfg->addr_virt[DPAA_PORTAL_CE] = va; - - va = ioremap_prot(addr_phys[1]->start, resource_size(addr_phys[1]), - _PAGE_GUARDED | _PAGE_NO_CACHE); - if (!va) { + pcfg->addr_virt_ci = ioremap(addr_phys[1]->start, + resource_size(addr_phys[1])); + if (!pcfg->addr_virt_ci) { dev_err(dev, "ioremap::CI failed\n"); goto err_ioremap2; } - pcfg->addr_virt[DPAA_PORTAL_CI] = va; - spin_lock(&bman_lock); cpu = cpumask_next_zero(-1, &portal_cpus); if (cpu >= nr_cpu_ids) { @@ -164,9 +174,9 @@ static int bman_portal_probe(struct platform_device *pdev) return 0; err_portal_init: - iounmap(pcfg->addr_virt[DPAA_PORTAL_CI]); + iounmap(pcfg->addr_virt_ci); err_ioremap2: - iounmap(pcfg->addr_virt[DPAA_PORTAL_CE]); + memunmap(pcfg->addr_virt_ce); err_ioremap1: return -ENXIO; } diff --git a/drivers/soc/fsl/qbman/bman_priv.h b/drivers/soc/fsl/qbman/bman_priv.h index 765a4bf..c48e6eb 100644 --- a/drivers/soc/fsl/qbman/bman_priv.h +++ b/drivers/soc/fsl/qbman/bman_priv.h @@ -49,11 +49,9 @@ extern u16 bman_ip_rev; /* 0 if uninitialised, otherwise BMAN_REVx */ extern struct gen_pool *bm_bpalloc; struct bm_portal_config { - /* -* Corenet portal addresses; -* [0]==cache-enabled, [1]==cache-inhibited. -*/ - void __iomem *addr_virt[2]; + /* Portal addresses */ + void *addr_virt_ce; + void __iomem *addr_virt_ci; /* Allow these to be joined in lists */ struct list_head li
[v4 08/11] soc/fsl/qbman: add QMAN_REV32
From: Madalin Bucur Add revision 3.2 of the QBMan block. This is the version for LS1043A and LS1046A SoCs. Signed-off-by: Madalin Bucur Signed-off-by: Roy Pledge --- drivers/soc/fsl/qbman/qman_ccsr.c | 2 ++ drivers/soc/fsl/qbman/qman_priv.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/soc/fsl/qbman/qman_ccsr.c b/drivers/soc/fsl/qbman/qman_ccsr.c index 20a1ebd..bbe3975 100644 --- a/drivers/soc/fsl/qbman/qman_ccsr.c +++ b/drivers/soc/fsl/qbman/qman_ccsr.c @@ -720,6 +720,8 @@ static int fsl_qman_probe(struct platform_device *pdev) qman_ip_rev = QMAN_REV30; else if (major == 3 && minor == 1) qman_ip_rev = QMAN_REV31; + else if (major == 3 && minor == 2) + qman_ip_rev = QMAN_REV32; else { dev_err(dev, "Unknown QMan version\n"); return -ENODEV; diff --git a/drivers/soc/fsl/qbman/qman_priv.h b/drivers/soc/fsl/qbman/qman_priv.h index bab7f15..8f715fa 100644 --- a/drivers/soc/fsl/qbman/qman_priv.h +++ b/drivers/soc/fsl/qbman/qman_priv.h @@ -185,6 +185,7 @@ struct qm_portal_config { #define QMAN_REV20 0x0200 #define QMAN_REV30 0x0300 #define QMAN_REV31 0x0301 +#define QMAN_REV32 0x0302 extern u16 qman_ip_rev; /* 0 if uninitialised, otherwise QMAN_REVx */ #define QM_FQID_RANGE_START 1 /* FQID 0 reserved for internal use */ -- 2.7.4
[v4 06/11] soc/fsl/qbman: Fix ARM32 typo
From: Valentin Rothberg The Kconfig symbol for 32bit ARM is 'ARM', not 'ARM32'. Signed-off-by: Valentin Rothberg Signed-off-by: Claudiu Manoil Signed-off-by: Roy Pledge --- drivers/soc/fsl/qbman/dpaa_sys.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/fsl/qbman/dpaa_sys.h b/drivers/soc/fsl/qbman/dpaa_sys.h index f85c319..81a9a5e 100644 --- a/drivers/soc/fsl/qbman/dpaa_sys.h +++ b/drivers/soc/fsl/qbman/dpaa_sys.h @@ -53,7 +53,7 @@ static inline void dpaa_flush(void *p) { #ifdef CONFIG_PPC flush_dcache_range((unsigned long)p, (unsigned long)p+64); -#elif defined(CONFIG_ARM32) +#elif defined(CONFIG_ARM) __cpuc_flush_dcache_area(p, 64); #elif defined(CONFIG_ARM64) __flush_dcache_area(p, 64); -- 2.7.4
[v4 05/11] soc/fsl/qbman: Drop L1_CACHE_BYTES compile time check
From: Claudiu Manoil Not relevant and arch dependent. Overkill for PPC. Signed-off-by: Claudiu Manoil Signed-off-by: Roy Pledge --- drivers/soc/fsl/qbman/dpaa_sys.h | 4 1 file changed, 4 deletions(-) diff --git a/drivers/soc/fsl/qbman/dpaa_sys.h b/drivers/soc/fsl/qbman/dpaa_sys.h index 2ce394a..f85c319 100644 --- a/drivers/soc/fsl/qbman/dpaa_sys.h +++ b/drivers/soc/fsl/qbman/dpaa_sys.h @@ -49,10 +49,6 @@ #define DPAA_PORTAL_CE 0 #define DPAA_PORTAL_CI 1 -#if (L1_CACHE_BYTES != 32) && (L1_CACHE_BYTES != 64) -#error "Unsupported Cacheline Size" -#endif - static inline void dpaa_flush(void *p) { #ifdef CONFIG_PPC -- 2.7.4
[v4 04/11] soc/fsl/qbman: Drop set/clear_bits usage
From: Madalin Bucur Replace PPC specific set/clear_bits API with standard bit twiddling so driver is portable outside PPC. Signed-off-by: Madalin Bucur Signed-off-by: Claudiu Manoil Signed-off-by: Roy Pledge --- drivers/soc/fsl/qbman/bman.c | 2 +- drivers/soc/fsl/qbman/qman.c | 8 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/soc/fsl/qbman/bman.c b/drivers/soc/fsl/qbman/bman.c index 604e45c..ff8998f 100644 --- a/drivers/soc/fsl/qbman/bman.c +++ b/drivers/soc/fsl/qbman/bman.c @@ -616,7 +616,7 @@ int bman_p_irqsource_add(struct bman_portal *p, u32 bits) unsigned long irqflags; local_irq_save(irqflags); - set_bits(bits & BM_PIRQ_VISIBLE, &p->irq_sources); + p->irq_sources |= bits & BM_PIRQ_VISIBLE; bm_out(&p->p, BM_REG_IER, p->irq_sources); local_irq_restore(irqflags); return 0; } diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c index 1bcfc51..25419e1 100644 --- a/drivers/soc/fsl/qbman/qman.c +++ b/drivers/soc/fsl/qbman/qman.c @@ -908,12 +908,12 @@ static inline int qm_mc_result_timeout(struct qm_portal *portal, static inline void fq_set(struct qman_fq *fq, u32 mask) { - set_bits(mask, &fq->flags); + fq->flags |= mask; } static inline void fq_clear(struct qman_fq *fq, u32 mask) { - clear_bits(mask, &fq->flags); + fq->flags &= ~mask; } static inline int fq_isset(struct qman_fq *fq, u32 mask) @@ -1574,7 +1574,7 @@ void qman_p_irqsource_add(struct qman_portal *p, u32 bits) unsigned long irqflags; local_irq_save(irqflags); - set_bits(bits & QM_PIRQ_VISIBLE, &p->irq_sources); + p->irq_sources |= bits & QM_PIRQ_VISIBLE; qm_out(&p->p, QM_REG_IER, p->irq_sources); local_irq_restore(irqflags); } @@ -1597,7 +1597,7 @@ void qman_p_irqsource_remove(struct qman_portal *p, u32 bits) */ local_irq_save(irqflags); bits &= QM_PIRQ_VISIBLE; - clear_bits(bits, &p->irq_sources); + p->irq_sources &= ~bits; qm_out(&p->p, QM_REG_IER, p->irq_sources); ier = qm_in(&p->p, QM_REG_IER); /* -- 2.7.4
[v4 03/11] dt-bindings: soc/fsl: Update reserved memory binding for QBMan
Updates the QMan and BMan device tree bindings for reserved memory nodes. This makes the reserved memory allocation compatible with the shared-dma-pool usage. Signed-off-by: Roy Pledge --- Documentation/devicetree/bindings/soc/fsl/bman.txt | 12 +- Documentation/devicetree/bindings/soc/fsl/qman.txt | 26 -- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/Documentation/devicetree/bindings/soc/fsl/bman.txt b/Documentation/devicetree/bindings/soc/fsl/bman.txt index 47ac834..48eed14 100644 --- a/Documentation/devicetree/bindings/soc/fsl/bman.txt +++ b/Documentation/devicetree/bindings/soc/fsl/bman.txt @@ -65,8 +65,8 @@ to the respective BMan instance BMan Private Memory Node BMan requires a contiguous range of physical memory used for the backing store -for BMan Free Buffer Proxy Records (FBPR). This memory is reserved/allocated as a -node under the /reserved-memory node +for BMan Free Buffer Proxy Records (FBPR). This memory is reserved/allocated as +a node under the /reserved-memory node. The BMan FBPR memory node must be named "bman-fbpr" @@ -75,7 +75,9 @@ PROPERTIES - compatible Usage: required Value type: - Definition: Must inclide "fsl,bman-fbpr" + Definition: PPC platforms: Must include "fsl,bman-fbpr" + ARM platforms: Must include "shared-dma-pool" + as well as the "no-map" property The following constraints are relevant to the FBPR private memory: - The size must be 2^(size + 1), with size = 11..33. 
That is 4 KiB to @@ -100,10 +102,10 @@ The example below shows a BMan FBPR dynamic allocation memory node ranges; bman_fbpr: bman-fbpr { - compatible = "fsl,bman-fbpr"; - alloc-ranges = <0 0 0x10 0>; + compatible = "shared-dma-pool"; size = <0 0x100>; alignment = <0 0x100>; + no-map; }; }; diff --git a/Documentation/devicetree/bindings/soc/fsl/qman.txt b/Documentation/devicetree/bindings/soc/fsl/qman.txt index 556ebb8..ee96afd 100644 --- a/Documentation/devicetree/bindings/soc/fsl/qman.txt +++ b/Documentation/devicetree/bindings/soc/fsl/qman.txt @@ -60,6 +60,12 @@ are located at offsets 0xbf8 and 0xbfc Value type: Definition: Reference input clock. Its frequency is half of the platform clock +- memory-regions + Usage: Required for ARM + Value type: + Definition: List of phandles referencing the QMan private memory + nodes (described below). The qman-fqd node must be + first followed by qman-pfdr node. Only used on ARM Devices connected to a QMan instance via Direct Connect Portals (DCP) must link to the respective QMan instance @@ -74,7 +80,9 @@ QMan Private Memory Nodes QMan requires two contiguous range of physical memory used for the backing store for QMan Frame Queue Descriptor (FQD) and Packed Frame Descriptor Record (PFDR). -This memory is reserved/allocated as a nodes under the /reserved-memory node +This memory is reserved/allocated as a node under the /reserved-memory node. 
+ +For additional details about reserved memory regions see reserved-memory.txt The QMan FQD memory node must be named "qman-fqd" @@ -83,7 +91,9 @@ PROPERTIES - compatible Usage: required Value type: - Definition: Must inclide "fsl,qman-fqd" + Definition: PPC platforms: Must include "fsl,qman-fqd" + ARM platforms: Must include "shared-dma-pool" + as well as the "no-map" property The QMan PFDR memory node must be named "qman-pfdr" @@ -92,7 +102,9 @@ PROPERTIES - compatible Usage: required Value type: - Definition: Must inclide "fsl,qman-pfdr" + Definition: PPC platforms: Must include "fsl,qman-pfdr" + ARM platforms: Must include "shared-dma-pool" + as well as the "no-map" property The following constraints are relevant to the FQD and PFDR private memory: - The size must be 2^(size + 1), with size = 11..29. That is 4 KiB to @@ -117,16 +129,16 @@ The example below shows a QMan FQD and a PFDR dynamic allocation memory nodes ranges; qman_fqd: qman-fqd { - compatible = "fsl,qman-fqd"; - alloc-ranges = <0 0 0x10 0>; + compatible = "shared-dma-pool"; size = <0 0x40>; alignment = <0 0x40>; + no-map; }; qman_pfdr: qman-pfdr { - compatible = "fsl,qman-pfdr";
[v4 02/11] soc/fsl/qbman: Use shared-dma-pool for QMan private memory allocations
Use the shared-memory-pool mechanism for frame queue descriptor and packed frame descriptor record area allocations. Signed-off-by: Roy Pledge --- drivers/soc/fsl/qbman/qman_ccsr.c | 138 +- drivers/soc/fsl/qbman/qman_priv.h | 4 +- drivers/soc/fsl/qbman/qman_test.h | 2 - 3 files changed, 109 insertions(+), 35 deletions(-) diff --git a/drivers/soc/fsl/qbman/qman_ccsr.c b/drivers/soc/fsl/qbman/qman_ccsr.c index 835ce94..20a1ebd 100644 --- a/drivers/soc/fsl/qbman/qman_ccsr.c +++ b/drivers/soc/fsl/qbman/qman_ccsr.c @@ -401,21 +401,42 @@ static int qm_init_pfdr(struct device *dev, u32 pfdr_start, u32 num) } /* - * Ideally we would use the DMA API to turn rmem->base into a DMA address - * (especially if iommu translations ever get involved). Unfortunately, the - * DMA API currently does not allow mapping anything that is not backed with - * a struct page. + * QMan needs two global memory areas initialized at boot time: + * 1) FQD: Frame Queue Descriptors used to manage frame queues + * 2) PFDR: Packed Frame Queue Descriptor Records used to store frames + * Both areas are reserved using the device tree reserved memory framework + * and the addresses and sizes are initialized when the QMan device is probed */ static dma_addr_t fqd_a, pfdr_a; static size_t fqd_sz, pfdr_sz; +#ifdef CONFIG_PPC +/* + * Support for PPC Device Tree backward compatibility when compatible + * string is set to fsl-qman-fqd and fsl-qman-pfdr + */ +static int zero_priv_mem(phys_addr_t addr, size_t sz) +{ + /* map as cacheable, non-guarded */ + void __iomem *tmpp = ioremap_prot(addr, sz, 0); + + if (!tmpp) + return -ENOMEM; + + memset_io(tmpp, 0, sz); + flush_dcache_range((unsigned long)tmpp, + (unsigned long)tmpp + sz); + iounmap(tmpp); + + return 0; +} + static int qman_fqd(struct reserved_mem *rmem) { fqd_a = rmem->base; fqd_sz = rmem->size; WARN_ON(!(fqd_a && fqd_sz)); - return 0; } RESERVEDMEM_OF_DECLARE(qman_fqd, "fsl,qman-fqd", qman_fqd); @@ -431,32 +452,13 @@ static int qman_pfdr(struct 
reserved_mem *rmem) } RESERVEDMEM_OF_DECLARE(qman_pfdr, "fsl,qman-pfdr", qman_pfdr); +#endif + static unsigned int qm_get_fqid_maxcnt(void) { return fqd_sz / 64; } -/* - * Flush this memory range from data cache so that QMAN originated - * transactions for this memory region could be marked non-coherent. - */ -static int zero_priv_mem(struct device *dev, struct device_node *node, -phys_addr_t addr, size_t sz) -{ - /* map as cacheable, non-guarded */ - void __iomem *tmpp = ioremap_prot(addr, sz, 0); - - if (!tmpp) - return -ENOMEM; - - memset_io(tmpp, 0, sz); - flush_dcache_range((unsigned long)tmpp, - (unsigned long)tmpp + sz); - iounmap(tmpp); - - return 0; -} - static void log_edata_bits(struct device *dev, u32 bit_count) { u32 i, j, mask = 0x; @@ -687,11 +689,12 @@ static int qman_resource_init(struct device *dev) static int fsl_qman_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct device_node *node = dev->of_node; + struct device_node *mem_node, *node = dev->of_node; struct resource *res; int ret, err_irq; u16 id; u8 major, minor; + u64 size; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) { @@ -727,10 +730,83 @@ static int fsl_qman_probe(struct platform_device *pdev) qm_channel_caam = QMAN_CHANNEL_CAAM_REV3; } - ret = zero_priv_mem(dev, node, fqd_a, fqd_sz); - WARN_ON(ret); - if (ret) - return -ENODEV; + if (fqd_a) { +#ifdef CONFIG_PPC + /* +* For PPC backward DT compatibility +* FQD memory MUST be zero'd by software +*/ + zero_priv_mem(fqd_a, fqd_sz); +#else + WARN(1, "Unexpected archiceture using non shared-dma-mem reservations"); +#endif + } else { + /* +* Order of memory regions is assumed as FQD followed by PFDR +* in order to ensure allocations from the correct regions the +* driver initializes then allocates each piece in order +*/ + ret = of_reserved_mem_device_init_by_idx(dev, dev->of_node, 0); + if (ret) { + dev_err(dev, "of_reserved_mem_device_init_by_idx(0) failed 0x%x\n", + ret); + return -ENODEV; + } 
+ mem_node = of_parse_phandle(dev->of_node, "memory-region", 0); + if (mem_node) { + ret = of_property_read_u64(mem_node, "size", &size); + if (ret) { +
[v4 01/11] soc/fsl/qbman: Use shared-dma-pool for BMan private memory allocations
Use the shared-memory-pool mechanism for free buffer proxy record area allocation. Signed-off-by: Roy Pledge --- drivers/soc/fsl/qbman/bman_ccsr.c | 35 ++- drivers/soc/fsl/qbman/bman_priv.h | 3 +++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/drivers/soc/fsl/qbman/bman_ccsr.c b/drivers/soc/fsl/qbman/bman_ccsr.c index eaa9585..2182236 100644 --- a/drivers/soc/fsl/qbman/bman_ccsr.c +++ b/drivers/soc/fsl/qbman/bman_ccsr.c @@ -170,10 +170,11 @@ static int fsl_bman_probe(struct platform_device *pdev) { int ret, err_irq; struct device *dev = &pdev->dev; - struct device_node *node = dev->of_node; + struct device_node *mem_node, *node = dev->of_node; struct resource *res; u16 id, bm_pool_cnt; u8 major, minor; + u64 size; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) { @@ -201,6 +202,38 @@ static int fsl_bman_probe(struct platform_device *pdev) return -ENODEV; } + /* +* If FBPR memory wasn't defined using the qbman compatible string +* try using the of_reserved_mem_device method +*/ + if (!fbpr_a) { + ret = of_reserved_mem_device_init(dev); + if (ret) { + dev_err(dev, "of_reserved_mem_device_init() failed 0x%x\n", + ret); + return -ENODEV; + } + mem_node = of_parse_phandle(dev->of_node, "memory-region", 0); + if (mem_node) { + ret = of_property_read_u64(mem_node, "size", &size); + if (ret) { + dev_err(dev, "FBPR: of_address_to_resource fails 0x%x\n", + ret); + return -ENODEV; + } + fbpr_sz = size; + } else { + dev_err(dev, "No memory-region found for FBPR\n"); + return -ENODEV; + } + if (!dma_zalloc_coherent(dev, fbpr_sz, &fbpr_a, 0)) { + dev_err(dev, "Alloc FBPR memory failed\n"); + return -ENODEV; + } + } + + dev_dbg(dev, "Allocated FBPR 0x%llx 0x%zx\n", fbpr_a, fbpr_sz); + bm_set_memory(fbpr_a, fbpr_sz); err_irq = platform_get_irq(pdev, 0); diff --git a/drivers/soc/fsl/qbman/bman_priv.h b/drivers/soc/fsl/qbman/bman_priv.h index f6896a2..765a4bf 100644 --- a/drivers/soc/fsl/qbman/bman_priv.h +++ b/drivers/soc/fsl/qbman/bman_priv.h @@ 
-33,6 +33,9 @@ #include "dpaa_sys.h" #include +#include +#include +#include /* Portal processing (interrupt) sources */ #define BM_PIRQ_RCRI 0x0002 /* RCR Ring (below threshold) */ -- 2.7.4
[v4 00/11] soc/fsl/qbman: Enable QBMan on ARM Platforms
This patch series enables DPAA1 QBMan devices for ARM and ARM64 architectures. This allows the LS1043A and LS1046A to use QBMan functionality which allows access to ethernet and cryptographic devices for example. Changes since v3: - Use memremap() instead of ioremap() for non iomem QBMan portal regions - Ensured the __iomem attribute is respected when accessing iomem mapped regions - Removed calls to flush/invalidate/prefetch for ARM/ARM64 since mapping is done as write combine Changes since v2: - Fixed some misspellings - Added 'no-map' constraint to device tree bindings - Described ordering constraint on regions in the device tree - Removed confusing comment regarding non-shareable mappings - Added warning if old reserved-memory technique is used on ARM Changes since v1: - Reworked private memory allocations to use shared-dma-pool on ARM platforms Claudiu Manoil (2): soc/fsl/qbman: Drop L1_CACHE_BYTES compile time check soc/fsl/qbman: Add missing headers on ARM Madalin Bucur (4): soc/fsl/qbman: Drop set/clear_bits usage soc/fsl/qbman: add QMAN_REV32 soc/fsl/qbman: different register offsets on ARM fsl/soc/qbman: Enable FSL_LAYERSCAPE config on ARM Roy Pledge (4): soc/fsl/qbman: Use shared-dma-pool for BMan private memory allocations soc/fsl/qbman: Use shared-dma-pool for QMan private memory allocations dt-bindings: soc/fsl: Update reserved memory binding for QBMan soc/fsl/qbman: Rework portal mapping calls for ARM/PPC Valentin Rothberg (1): soc/fsl/qbman: Fix ARM32 typo Documentation/devicetree/bindings/soc/fsl/bman.txt | 12 +- Documentation/devicetree/bindings/soc/fsl/qman.txt | 26 ++-- drivers/soc/fsl/qbman/Kconfig | 2 +- drivers/soc/fsl/qbman/bman.c | 30 - drivers/soc/fsl/qbman/bman_ccsr.c | 35 +- drivers/soc/fsl/qbman/bman_portal.c| 36 -- drivers/soc/fsl/qbman/bman_priv.h | 11 +- drivers/soc/fsl/qbman/dpaa_sys.h | 14 +-- drivers/soc/fsl/qbman/qman.c | 52 ++-- drivers/soc/fsl/qbman/qman_ccsr.c | 140 - drivers/soc/fsl/qbman/qman_portal.c| 36 -- 
drivers/soc/fsl/qbman/qman_priv.h | 13 +- drivers/soc/fsl/qbman/qman_test.h | 2 - 13 files changed, 305 insertions(+), 104 deletions(-) -- 2.7.4
Re: [RFC Part1 PATCH v3 14/17] x86/boot: Add early boot support when running with SEV active
On 8/23/2017 10:30 AM, Borislav Petkov wrote: On Mon, Jul 24, 2017 at 02:07:54PM -0500, Brijesh Singh wrote: From: Tom Lendacky Early in the boot process, add checks to determine if the kernel is running with Secure Encrypted Virtualization (SEV) active. Checking for SEV requires checking that the kernel is running under a hypervisor (CPUID 0x0001, bit 31), that the SEV feature is available (CPUID 0x801f, bit 1) and then check a non-interceptable SEV MSR (0xc0010131, bit 0). This check is required so that during early compressed kernel booting the pagetables (both the boot pagetables and KASLR pagetables (if enabled) are updated to include the encryption mask so that when the kernel is decompressed into encrypted memory. , it can boot properly. :) Yup, kinda didn't complete that sentence. After the kernel is decompressed and continues booting the same logic is used to check if SEV is active and set a flag indicating so. This allows us to distinguish between SME and SEV, each of which have unique differences in how certain things are handled: e.g. DMA (always bounce buffered with SEV) or EFI tables (always access decrypted with SME). 
Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh --- arch/x86/boot/compressed/Makefile | 2 + arch/x86/boot/compressed/head_64.S | 16 + arch/x86/boot/compressed/mem_encrypt.S | 103 + arch/x86/boot/compressed/misc.h| 2 + arch/x86/boot/compressed/pagetable.c | 8 ++- arch/x86/include/asm/mem_encrypt.h | 3 + arch/x86/include/asm/msr-index.h | 3 + arch/x86/include/uapi/asm/kvm_para.h | 1 - arch/x86/mm/mem_encrypt.c | 42 +++--- 9 files changed, 169 insertions(+), 11 deletions(-) create mode 100644 arch/x86/boot/compressed/mem_encrypt.S diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 2c860ad..d2fe901 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -72,6 +72,8 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \ $(obj)/piggy.o $(obj)/cpuflags.o +vmlinux-objs-$(CONFIG_X86_64) += $(obj)/mem_encrypt.o There's a ifdef CONFIG_X86_64 a couple of lines below. Just put it there. Will do. ... +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -0,0 +1,103 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2017 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include + +#include +#include +#include + + .text + .code32 +ENTRY(get_sev_encryption_bit) + xor %eax, %eax + +#ifdef CONFIG_AMD_MEM_ENCRYPT + push%ebx + push%ecx + push%edx + + /* Check if running under a hypervisor */ + movl$1, %eax + cpuid + bt $31, %ecx /* Check the hypervisor bit */ + jnc .Lno_sev + + movl$0x8000, %eax /* CPUID to check the highest leaf */ + cpuid + cmpl$0x801f, %eax /* See if 0x801f is available */ + jb .Lno_sev + + /* +* Check for the SEV feature: +* CPUID Fn8000_001F[EAX] - Bit 1 +* CPUID Fn8000_001F[EBX] - Bits 5:0 +* Pagetable bit position used to indicate encryption +*/ + movl$0x801f, %eax + cpuid + bt $1, %eax/* Check if SEV is available */ + jnc .Lno_sev + + movl$MSR_F17H_SEV, %ecx /* Read the SEV MSR */ + rdmsr + bt $MSR_F17H_SEV_ENABLED_BIT, %eax /* Check if SEV is active */ + jnc .Lno_sev + + /* +* Get memory encryption information: +*/ The side-comment is enough. This one above can go. Done. + movl%ebx, %eax + andl$0x3f, %eax /* Return the encryption bit location */ + jmp .Lsev_exit + +.Lno_sev: + xor %eax, %eax + +.Lsev_exit: + pop %edx + pop %ecx + pop %ebx + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + + ret +ENDPROC(get_sev_encryption_bit) + + .code64 +ENTRY(get_sev_encryption_mask) + xor %rax, %rax + +#ifdef CONFIG_AMD_MEM_ENCRYPT + push%rbp + push%rdx + + movq%rsp, %rbp /* Save current stack pointer */ + + callget_sev_encryption_bit /* Get the encryption bit position */ So we get to call get_sev_encryption_bit() here again and noodle through the CPUID discovery and MSR access. We should instead cache that info and return the encryption bit directly on a second call. (And initialize it to -1 to denote that g
Re: [PATCH 5/6] powerpc/mm: Optimize detection of thread local mm's
On Thu, 2017-08-24 at 18:40 +0200, Frederic Barrat wrote: > > The decrementing part is giving me troubles, and I think it makes sense: > if I decrement the counter when detaching the context from the capi > card, then the next TLBIs for the memory context may be back to local. Yes, you need to flush the CAPI TLB first. > So when the process exits, the NPU wouldn't get the associated TLBIs, > which spells trouble the next time the same memory context ID is reused. > I believe this is the cause of the problem I'm seeing. As soon as I keep > the TLBIs global, even after I detach from the capi adapter, everything > is fine. > > Does it sound right? > > So to keep the checks minimal in mm_is_thread_local(), to just checking > the active_cpus count, I'm thinking of introducing a "copro enabled" bit > on the context, so that we can increment active_cpus only once. And > never decrement it. You can decrement if you flush. Don't you have MMIOs to do directed flushes ? Cheers, Ben.
Re: [PATCH v3 0/7] Add RSS to DPAA 1.x Ethernet driver
From: David Miller Date: Thu, 24 Aug 2017 09:42:20 -0700 (PDT) > From: Madalin Bucur > Date: Thu, 24 Aug 2017 10:28:21 +0300 > >> This patch set introduces Receive Side Scaling for the DPAA Ethernet >> driver. Documentation is updated with details related to the new >> feature and limitations that apply. >> Added also a small fix. >> >> v2: removed a C++ style comment >> v3: move struct fman to header file to avoid exporting a function > > Series applied, thanks. Actually I'm reverting, this doesn't even compile. [davem@localhost net-next]$ make -s -j8 In file included from drivers/net/ethernet/freescale/fman/fman.c:35:0: drivers/net/ethernet/freescale/fman/fman.h:286:9: error: type defaults to ‘int’ in declaration of ‘irqreturn_t’ [-Werror=implicit-int] typedef irqreturn_t (fman_exceptions_cb)(struct fman *fman, ^~~ drivers/net/ethernet/freescale/fman/fman.h:286:9: error: ‘irqreturn_t’ declared as function returning a function drivers/net/ethernet/freescale/fman/fman.h:287:12: warning: parameter names (without types) in function declaration enum fman_exceptions exception); ^~~ drivers/net/ethernet/freescale/fman/fman.h:300:22: error: ‘fman_bus_error_cb’ declared as function returning a function typedef irqreturn_t (fman_bus_error_cb)(struct fman *fman, u8 port_id, ^ drivers/net/ethernet/freescale/fman/fman.h:316:18: error: field ‘muram_res’ has incomplete type struct resource muram_res; /* MURAM resource */ ^ drivers/net/ethernet/freescale/fman/fman.h:330:2: error: unknown type name ‘fman_exceptions_cb’ fman_exceptions_cb *exception_cb; ^~ drivers/net/ethernet/freescale/fman/fman.h:333:2: error: unknown type name ‘spinlock_t’ spinlock_t spinlock; ^~ In file included from ./include/linux/irq.h:19:0, from ./include/linux/of_irq.h:6, from drivers/net/ethernet/freescale/fman/fman.c:46: ./include/linux/irqreturn.h:16:24: error: conflicting types for ‘irqreturn_t’ typedef enum irqreturn irqreturn_t; ^~~ In file included from 
drivers/net/ethernet/freescale/fman/fman.c:35:0: drivers/net/ethernet/freescale/fman/fman.h:286:9: note: previous declaration of ‘irqreturn_t’ was here typedef irqreturn_t (fman_exceptions_cb)(struct fman *fman, ^~~ drivers/net/ethernet/freescale/fman/fman.c: In function ‘bmi_err_event’: drivers/net/ethernet/freescale/fman/fman.c:1237:9: error: called object is not a function or function pointer ret = fman->exception_cb(fman, FMAN_EX_BMI_STORAGE_PROFILE_ECC); ^~~~ drivers/net/ethernet/freescale/fman/fman.c:1239:9: error: called object is not a function or function pointer ret = fman->exception_cb(fman, FMAN_EX_BMI_LIST_RAM_ECC); ^~~~ drivers/net/ethernet/freescale/fman/fman.c:1241:9: error: called object is not a function or function pointer ret = fman->exception_cb(fman, FMAN_EX_BMI_STATISTICS_RAM_ECC); ^~~~ drivers/net/ethernet/freescale/fman/fman.c:1243:9: error: called object is not a function or function pointer ret = fman->exception_cb(fman, FMAN_EX_BMI_DISPATCH_RAM_ECC); ^~~~ drivers/net/ethernet/freescale/fman/fman.c: In function ‘qmi_err_event’: drivers/net/ethernet/freescale/fman/fman.c:1266:9: error: called object is not a function or function pointer ret = fman->exception_cb(fman, FMAN_EX_QMI_DOUBLE_ECC); ^~~~ drivers/net/ethernet/freescale/fman/fman.c:1268:9: error: called object is not a function or function pointer ret = fman->exception_cb(fman, ^~~~ drivers/net/ethernet/freescale/fman/fman.c: In function ‘dma_err_event’: drivers/net/ethernet/freescale/fman/fman.c:1317:9: error: called object is not a function or function pointer ret = fman->exception_cb(fman, FMAN_EX_DMA_SINGLE_PORT_ECC); ^~~~ drivers/net/ethernet/freescale/fman/fman.c:1319:9: error: called object is not a function or function pointer ret = fman->exception_cb(fman, FMAN_EX_DMA_READ_ECC); ^~~~ drivers/net/ethernet/freescale/fman/fman.c:1321:9: error: called object is not a function or function pointer ret = fman->exception_cb(fman, FMAN_EX_DMA_SYSTEM_WRITE_ECC); ^~~~ 
drivers/net/ethernet/freescale/fman/fman.c:1323:9: error: called object is not a function or function pointer ret = fman->exception_cb(fman, FMAN_EX_DMA_FM_WRITE_ECC); ^~~~ drivers/net/ethernet/freescale/fman/fman.c: In function ‘fpm_err_event’: drivers/net/ethernet/freescale/fman/fman.c:1340:9: error: called object is not a function or function pointer ret = fman->exception_cb(fman, FMAN_EX_FPM_DOUBLE_ECC); ^~~~ drivers/net/ethernet/freescale/fman/fman.c:1342:9: error: called object is not a function or function pointer ret = fman->exception_cb(fman, FMAN_EX_FPM_STALL_ON_TASKS); ^~~~ drivers/net/ethernet/freescale/fman/fman.c:1345:9: e
Re: [PATCH v2 2/5] powerpc: pseries: vio: match parent nodes with of_find_node_by_path
On Tue, Aug 22, 2017 at 12:12 AM, Michael Ellerman wrote: > Rob Herring writes: > >> In preparation to remove the full path from device_node.full_name, use >> of_find_node_by_path instead of open coding with strcmp. >> >> Signed-off-by: Rob Herring >> Cc: Benjamin Herrenschmidt >> Cc: Paul Mackerras >> Cc: Michael Ellerman >> Cc: linuxppc-dev@lists.ozlabs.org >> --- >> v2: >> - rebased to linux-next and removed spurious change fro patch 1. >> >> arch/powerpc/platforms/pseries/vio.c | 4 ++-- >> 1 file changed, 2 insertions(+), 2 deletions(-) >> >> diff --git a/arch/powerpc/platforms/pseries/vio.c >> b/arch/powerpc/platforms/pseries/vio.c >> index aa5ca74316fa..5754572deb23 100644 >> --- a/arch/powerpc/platforms/pseries/vio.c >> +++ b/arch/powerpc/platforms/pseries/vio.c >> @@ -1357,9 +1357,9 @@ struct vio_dev *vio_register_device_node(struct >> device_node *of_node) >>*/ >> parent_node = of_get_parent(of_node); >> if (parent_node) { >> - if (!strcmp(parent_node->full_name, >> "/ibm,platform-facilities")) >> + if (parent_node == >> of_find_node_by_path("/ibm,platform-facilities")) >> family = PFO; >> - else if (!strcmp(parent_node->full_name, "/vdevice")) >> + else if (parent_node == of_find_node_by_path("/vdevice")) >> family = VDEVICE; > > This leaks references to the looked up nodes. > > Both these nodes are defined in PAPR (our hypervisor spec), and both of > them must have a device_type, either "ibm,platform-facilities" or > "vdevice". > > Looking at the commit that added the code I don't see any particular > reason it used the comparison against full_name, rather than using the > device_type. 
> > So I'm inclined to do this instead: > > diff --git a/arch/powerpc/platforms/pseries/vio.c > b/arch/powerpc/platforms/pseries/vio.c > index 8a47f168476b..f26f906e6021 100644 > --- a/arch/powerpc/platforms/pseries/vio.c > +++ b/arch/powerpc/platforms/pseries/vio.c > @@ -1357,9 +1357,9 @@ struct vio_dev *vio_register_device_node(struct > device_node *of_node) > */ > parent_node = of_get_parent(of_node); > if (parent_node) { > - if (!strcmp(parent_node->full_name, > "/ibm,platform-facilities")) > + if (!strcmp(parent_node->type, "ibm,platform-facilities")) > family = PFO; > - else if (!strcmp(parent_node->full_name, "/vdevice")) > + else if (!strcmp(parent_node->type, "vdevice")) > family = VDEVICE; > else { > pr_warn("%s: parent(%s) of %s not recognized.\n", > > > I've checked both Qemu and kvmtool add the device_type, and I'm fairly > confident that PowerVM does too. Anyway I'll test it on all the machines > I can find. Okay, do you want me to respin the patch or will you update it with the above change? Rob
Re: [PATCH really v2] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()
On 24.08.2017 11:14, Paul Mackerras wrote: > Nixiaoming pointed out that there is a memory leak in > kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd() > fails; the memory allocated for the kvmppc_spapr_tce_table struct > is not freed, and nor are the pages allocated for the iommu > tables. In addition, we have already incremented the process's > count of locked memory pages, and this doesn't get restored on > error. > > David Hildenbrand pointed out that there is a race in that the > function checks early on that there is not already an entry in the > stt->iommu_tables list with the same LIOBN, but an entry with the > same LIOBN could get added between then and when the new entry is > added to the list. > > This fixes all three problems. To simplify things, we now call > anon_inode_getfd() before placing the new entry in the list. The > check for an existing entry is done while holding the kvm->lock > mutex, immediately before adding the new entry to the list. > Finally, on failure we now call kvmppc_account_memlimit to > decrement the process's count of locked memory pages. > > Reported-by: Nixiaoming > Reported-by: David Hildenbrand > Signed-off-by: Paul Mackerras > --- > v2: Don't overwrite stt in loop over spapr_tce_tables > Reviewed-by: David Hildenbrand -- Thanks, David
Re: [PATCH v3 0/7] Add RSS to DPAA 1.x Ethernet driver
From: Madalin Bucur Date: Thu, 24 Aug 2017 10:28:21 +0300 > This patch set introduces Receive Side Scaling for the DPAA Ethernet > driver. Documentation is updated with details related to the new > feature and limitations that apply. > Added also a small fix. > > v2: removed a C++ style comment > v3: move struct fman to header file to avoid exporting a function Series applied, thanks.
Re: [PATCH 5/6] powerpc/mm: Optimize detection of thread local mm's
Le 21/08/2017 à 19:35, Benjamin Herrenschmidt a écrit : On Mon, 2017-08-21 at 19:27 +0200, Frederic Barrat wrote: Hi Ben, Le 24/07/2017 à 06:28, Benjamin Herrenschmidt a écrit : Instead of comparing the whole CPU mask every time, let's keep a counter of how many bits are set in the mask. Thus testing for a local mm only requires testing if that counter is 1 and the current CPU bit is set in the mask. I'm trying to see if we could merge this patch with what I'm trying to do to mark a context as requiring global TLBIs. In http://patchwork.ozlabs.org/patch/796775/ I'm introducing a 'flags' per memory context, using one bit to say if the context needs global TLBIs. The 2 could co-exist, just checking... Do you think about using the actual active_cpus count down the road, or is it just a matter of knowing if there are more than one active cpus? Or you could just incrementer my counter. Just make sure you increment it at most once per CXL context and decrement when the context is gone. The decrementing part is giving me troubles, and I think it makes sense: if I decrement the counter when detaching the context from the capi card, then the next TLBIs for the memory context may be back to local. So when the process exits, the NPU wouldn't get the associated TLBIs, which spells trouble the next time the same memory context ID is reused. I believe this the cause of the problem I'm seeing. As soon as I keep the TLBIs global, even after I detach from the capi adapter, everything is fine. Does it sound right? So to keep the checks minimal in mm_is_thread_local(), to just checking the active_cpus count, I'm thinking of introducing a "copro enabled" bit on the context, so that we can increment active_cpus only once. And never decrement it. Fred
Re: [PATCH] powerpc/pseries: Don't attempt to acquire drc during memory hot add for assigned lmbs
On 08/24/2017 05:33 AM, Michael Ellerman wrote: > John Allen writes: > >> Check if an LMB is assigned before attempting to call dlpar_acquire_drc in >> order to avoid any unnecessary rtas calls. This substantially reduces the >> running time of memory hot add on lpars with large amounts of memory. >> >> Signed-off-by: John Allen > > I'll add: > > Fixes: c21f515c7436 ("powerpc/pseries: Make the acquire/release of the drc > for memory a seperate step") > > ? Yes, thanks. > > How bad is the slow down, do we need to backport to stable/distros? On an lpar with 16 TB of memory assigned, I observed that adding 1 GB of memory took several minutes without this fix and improved to several seconds with this fix. Yep, this will need to be backported. Memory hotplug performance is a hot issue for our team right now and we'll want to have solid performance improvement to give to customers relatively soon. > > cheers > >
Re: [PATCH 4/4] powerpc/32: remove a NOP from memset()
Le 24/08/2017 à 12:51, Michael Ellerman a écrit : Christophe Leroy writes: memset() is patched after initialisation to activate the optimised part which uses cache instructions. Today we have a 'b 2f' to skip the optimised patch, which then gets replaced by a NOP, implying a useless cycle consumption. As we have a 'bne 2f' just before, we could use that instruction for the live patching, hence removing the need to have a dedicated 'b 2f' to be replaced by a NOP. This patch changes the 'bne 2f' by a 'b 2f'. During init, that 'b 2f' is then replaced by 'bne 2f' I'm not sure what the sequence is during boot for the 32-bit code, but can you use an ALT_FTR section for this? Possibly that doesn't get done at the right time though. Unfortunately, as we discussed in 2015 (https://lkml.org/lkml/2015/9/10/608), the ALT_FTR does things too early, while the cache is not enabled yet. Christophe
Re: [PATCH v2 1/1] Split VGA default device handler out of VGA arbiter
On Thu, Aug 24, 2017 at 10:57:26AM +1000, Dave Airlie wrote: > > Yeah, maybe it's time to disconnect the "default display device" idea > > from the VGA arbiter. I have no idea what (if any) dependencies X has > > on the legacy VGA resources. I assume X works fine on power, where it > > sounds like those resources are rarely or never available. > > The question on non-x86 archs, is what is the correct device to default to. > > On x86 we use the legacy VGA resources as a pointer, as this is the device > the BIOS appeared on at boot so hopefully should be one you can see stuff on. > > On non-x86 I've no idea how to decide if there are multiple devices, maybe the > firmware needs to tag something for the kernel if there are. Otherwise > you'd just > be picking something in probe order. > > I think the idea of these patches is to separate default display > device from the arbiter. > > X uses the arbiter on x86 if required (it's horrible, and it's rare we > have to nowadays), > but for finding the default device it just uses the sysfs boot_vga flag. The sysfs boot_vga thing comes from PCI. The name suggests that it's a VGA device and can use the legacy VGA resources. If we want to indicate a general default display device that need not be "VGA", it'd be really nice if we could pick a name that did not include "vga". Even if we could only do it inside the kernel, I think it would reduce confusion if we could separate out the "VGA"-specific stuff like the arbiter and names like "vga_set_default_device()" so that systems with a non-legacy VGA default display device didn't have to use "VGA" interfaces that don't make sense for them. Bjorn
Re: [PATCH 1/2] powerpc/workqueue: update list of possible CPUs
Hello, Laurent. On Thu, Aug 24, 2017 at 02:10:31PM +0200, Laurent Vivier wrote: > > Yeah, it just needs to match up new cpus to the cpu ids assigned to > > the right node. > > We are not able to assign the cpu ids to the right node before the CPU > is present, because firmware doesn't provide CPU mapping <-> node id > before that. What I meant was to assign the matching CPU ID when the CPU becomes present - ie. have CPU IDs available for different nodes and allocate them to the new CPU according to its node mapping when it actually comes up. Please note that I'm not saying this is the way to go, just that it is a solvable problem from the arch code. > > The node mapping for that cpu id changes *dynamically* while the > > system is running and that can race with node-affinity sensitive > > operations such as memory allocations. > > Memory is mapped to the node through its own firmware entry, so I don't > think cpu id change can affect memory affinity, and before we know the > node id of the CPU, the CPU is not present and thus it can't use memory. The latter part isn't true. For example, percpu memory gets allocated for all possible CPUs according to their node affinity, so the memory node association change which happens when the CPU comes up for the first time can race against such allocations. I don't know whether that's actually problematic but we don't have *any* synchronization around it. If you think it's safe to have such races, please explain why that is. > > Please take a step back and think through the problem again. You > > can't bandaid it this way. > > Could you give some ideas, proposals? > As the firmware doesn't provide the information before the CPU is really > plugged, I really don't know how to manage this problem. There are two possible approaches, I think. 1. Make physical cpu -> logical cpu mapping indirect so that the kernel's cpu ID assignment is always on the right numa node. 
This may mean that the kernel might have to keep more possible CPUs around than necessary but it does have the benefit that all memory allocations are affine to the right node. 2. Make cpu <-> node mapping properly dynamic. Identify what sort of synchronization we'd need around the mapping changing dynamically. Note that we might not need much but it'll most likely need some. Build synchronization and notification infrastructure around it. Thanks. -- tejun
Re: [PATCH] powerpc/powernv/idle: Round up latency and residency values
* Michael Ellerman [2017-08-24 20:28:19]: > Vaidyanathan Srinivasan writes: > > > On PowerNV platforms, firmware provides exit latency and > > target residency for each of the idle states in nano > > seconds. Cpuidle framework expects the values in micro > > seconds. Round up to nearest micro seconds to avoid errors > > in cases where the values are defined as fractional micro > > seconds. > > > > Default idle state of 'snooze' has exit latency of zero. If > > other states have fractional micro second exit latency, they > > would get rounded down to zero micro second and make cpuidle > > framework choose deeper idle state when snooze loop is the > > right choice. > > > > Reported-by: Anton Blanchard > > Signed-off-by: Vaidyanathan Srinivasan > > This sounds like a fairly bad bug, does it need a Fixes / Cc stable tag? Yes, we will need this on stable kernel that runs on POWER9. On older platforms the latencies are larger and hence no impact :) I will post to stable after this fix hits your -next tree. --Vaidy
Re: [PATCH v2 12/14] KVM: PPC: Book3S HV: POWER9 can execute stop without a sync sequence
On Thu, 24 Aug 2017 20:27:35 +1000 Paul Mackerras wrote: > On Sat, Aug 12, 2017 at 02:39:10AM +1000, Nicholas Piggin wrote: > > Reviewed-by: Gautham R. Shenoy > > Signed-off-by: Nicholas Piggin > > --- > > arch/powerpc/kvm/book3s_hv_rmhandlers.S | 24 > > 1 file changed, 12 insertions(+), 12 deletions(-) > > > > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S > > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S > > index 3e024fd71fe8..edb47738a686 100644 > > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S > > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S > > @@ -2527,7 +2527,17 @@ BEGIN_FTR_SECTION > > END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) > > > > kvm_nap_sequence: /* desired LPCR value in r5 */ > > -BEGIN_FTR_SECTION > > +BEGIN_FTR_SECTION /* nap sequence */ > > + mtspr SPRN_LPCR,r5 > > + isync > > + li r0, 0 > > + std r0, HSTATE_SCRATCH0(r13) > > + ptesync > > + ld r0, HSTATE_SCRATCH0(r13) > > +1: cmpdr0, r0 > > + bne 1b > > + nap > > +FTR_SECTION_ELSE /* stop sequence */ > > /* > > * PSSCR bits: exit criterion = 1 (wakeup based on LPCR at sreset) > > * enable state loss = 1 (allow SMT mode switch) > > @@ -2539,18 +2549,8 @@ BEGIN_FTR_SECTION > > li r4, LPCR_PECE_HVEE@higher > > sldir4, r4, 32 > > or r5, r5, r4 > > -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) > > mtspr SPRN_LPCR,r5 > > - isync > > - li r0, 0 > > - std r0, HSTATE_SCRATCH0(r13) > > - ptesync > > - ld r0, HSTATE_SCRATCH0(r13) > > -1: cmpdr0, r0 > > - bne 1b > > -BEGIN_FTR_SECTION > > - nap > > -FTR_SECTION_ELSE > > + > > PPC_STOP > > ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) > > b . > > -- > > 2.13.3 > > Currently we never get to kvm_nap_sequence on POWER9 because we are > always running one vcpu per vcore, so I haven't worried about this > code too much. In future we might need this for running HPT guests on > a radix host, though. Just trying to keep in in synch with Linux, but the rest of the series does not depend on this one so it can be left out if you'd rather. Thanks, Nick
Re: [PATCH v2 10/14] KVM: PPC: Book3S HV: POWER9 does not require secondary thread management
On Thu, 24 Aug 2017 19:34:35 +1000 Paul Mackerras wrote: > On Sat, Aug 12, 2017 at 02:39:08AM +1000, Nicholas Piggin wrote: > > POWER9 CPUs have independent MMU contexts per thread, so KVM does not > > need to quiesce secondary threads, so the hwthread_req/hwthread_state > > protocol does not have to be used. So patch it away on POWER9, and patch > > away the branch from the Linux idle wakeup to kvm_start_guest that is > > never used. > > If/when we add support for running HPT guests on a radix host, we will > have to run the host in single-threaded mode (since POWER9 doesn't > support having some threads of a core using HPT and some using radix > simultaneously). We'll then need some sort of thing like > kvmppc_grab_hwthread to coordinate with the threads so that guests can > use the secondary threads. > > So I think most of this code should stay. We will still need to have > a way to make sure that the secondaries are in real mode and not in a > guest, because all threads will need to be in real mode when switching > the core between radix and HPT mode. Maybe we can optimize it a bit > at present given that we don't yet support running HPT guests on a > radix host, but I don't want to make it harder to do that in future. Okay, most of the code's still there, just noped out with ARCH_300. But yes this and then the subsequent patches do make it more difficult to restore the KVM real mode functionality. I'll see about restructuring them to keep that ability and make it selectable with a minimal branchs or alt patches. Thanks, Nick
Re: powerpc/vio: Use device_type to detect family
On Wed, 2017-08-23 at 05:47:13 UTC, Michael Ellerman wrote: > Currently in the vio.c code we use a comparision against the parent > device node's full path to decide if the device is a PFO or VIO family > device. > > Both the ibm,platform-facilities and vdevice nodes are defined by PAPR, > and must have a matching device_type. So instead of using the path we > can instead compare the device_type. > > I've checked Qemu and kvmtool both do this correctly, and all the > PowerVM systems I have access to do also. So it seems to be safe. > > This removes the dependency on full_name, which is being removed > upstream. > > Signed-off-by: Michael Ellerman Applied to powerpc next. https://git.kernel.org/powerpc/c/bcf21e3a97a1247178338793df9ae3 cheers
Re: powerpc/64s: Fix replay interrupt return label name
On Tue, 2017-08-22 at 01:51:37 UTC, Michael Ellerman wrote: > In __replay_interrupt() we take the address of a local label so we can > return to it later. However the assembler turns the local label into a > symbol with a name like ".L1^B42" - where "^B" is literally "\002". > This does not make for pleasant stack traces. Fix it by giving the > label a sensible name. > > Signed-off-by: Michael Ellerman Applied to powerpc next. https://git.kernel.org/powerpc/c/3e23a12bcaf18b3587088807722cd2 cheers
Re: [v2, 3/5] powerpc: pseries: remove dlpar_attach_node dependency on full path
On Mon, 2017-08-21 at 15:16:49 UTC, Rob Herring wrote: > In preparation to stop storing the full node path in full_name, remove the > dependency on full_name from dlpar_attach_node(). Callers of > dlpar_attach_node() already have the parent device_node, so just pass the > parent node into dlpar_attach_node instead of the path. This avoids doing > a lookup of the parent node by the path. > > Signed-off-by: Rob Herring > Cc: Benjamin Herrenschmidt > Cc: Paul Mackerras > Cc: Michael Ellerman > Cc: linuxppc-dev@lists.ozlabs.org Applied to powerpc next, thanks. https://git.kernel.org/powerpc/c/215ee763f8cb9a2912d411f96f6f67 cheers
Re: [v2,1/5] powerpc: Convert to using %pOF instead of full_name
On Mon, 2017-08-21 at 15:16:47 UTC, Rob Herring wrote: > Now that we have a custom printf format specifier, convert users of > full_name to use %pOF instead. This is preparation to remove storing > of the full path string for each node. > > Signed-off-by: Rob Herring > Cc: Benjamin Herrenschmidt > Cc: Paul Mackerras > Cc: Michael Ellerman > Cc: Anatolij Gustschin > Cc: Scott Wood > Cc: Kumar Gala > Cc: Arnd Bergmann > Cc: linuxppc-dev@lists.ozlabs.org > Reviewed-by: Tyrel Datwyler Applied to powerpc next, thanks. https://git.kernel.org/powerpc/c/b7c670d673d1186e9a6aafaad36aac I forgot to add in the change log I took the chance to reformat some of the affected printks, hopefully that doesn't muck you up when you're merging/rebasing on top of this. cheers
Re: [v2,01/14] powerpc/64s: masked interrupt avoid branch
On Fri, 2017-08-11 at 16:38:59 UTC, Nicholas Piggin wrote: > Interrupts which do not require EE to be cleared can all > be tested with a single bitwise test. > > Signed-off-by: Nicholas Piggin Patches 1-9 applied to powerpc next, thanks. https://git.kernel.org/powerpc/c/e0c827c09c0d04d77616a4506a71b3 cheers
Re: [1/6] powerpc/mm: Move pgdir setting into a helper
On Mon, 2017-07-24 at 04:27:58 UTC, Benjamin Herrenschmidt wrote: > Makes switch_mm_irqs_off() a bit more readable > > Signed-off-by: Benjamin Herrenschmidt Patches 1-2, 4-6 applied to powerpc next, thanks. https://git.kernel.org/powerpc/c/43ed84a891b70165a621a5c9219694 cheers
Re: [PATCH really v2] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()
On Thu, Aug 24, 2017 at 07:14:47PM +1000, Paul Mackerras wrote: > Nixiaoming pointed out that there is a memory leak in > kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd() > fails; the memory allocated for the kvmppc_spapr_tce_table struct > is not freed, and nor are the pages allocated for the iommu > tables. In addition, we have already incremented the process's > count of locked memory pages, and this doesn't get restored on > error. > > David Hildenbrand pointed out that there is a race in that the > function checks early on that there is not already an entry in the > stt->iommu_tables list with the same LIOBN, but an entry with the > same LIOBN could get added between then and when the new entry is > added to the list. > > This fixes all three problems. To simplify things, we now call > anon_inode_getfd() before placing the new entry in the list. The > check for an existing entry is done while holding the kvm->lock > mutex, immediately before adding the new entry to the list. > Finally, on failure we now call kvmppc_account_memlimit to > decrement the process's count of locked memory pages. 
> > Reported-by: Nixiaoming > Reported-by: David Hildenbrand > Signed-off-by: Paul Mackerras Reviewed-by: David Gibson > --- > v2: Don't overwrite stt in loop over spapr_tce_tables > > arch/powerpc/kvm/book3s_64_vio.c | 56 > > 1 file changed, 34 insertions(+), 22 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_64_vio.c > b/arch/powerpc/kvm/book3s_64_vio.c > index a160c14..53766e2 100644 > --- a/arch/powerpc/kvm/book3s_64_vio.c > +++ b/arch/powerpc/kvm/book3s_64_vio.c > @@ -294,32 +294,26 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, > struct kvm_create_spapr_tce_64 *args) > { > struct kvmppc_spapr_tce_table *stt = NULL; > + struct kvmppc_spapr_tce_table *siter; > unsigned long npages, size; > int ret = -ENOMEM; > int i; > + int fd = -1; > > if (!args->size) > return -EINVAL; > > - /* Check this LIOBN hasn't been previously allocated */ > - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { > - if (stt->liobn == args->liobn) > - return -EBUSY; > - } > - > size = _ALIGN_UP(args->size, PAGE_SIZE >> 3); > npages = kvmppc_tce_pages(size); > ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); > - if (ret) { > - stt = NULL; > - goto fail; > - } > + if (ret) > + return ret; > > ret = -ENOMEM; > stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), > GFP_KERNEL); > if (!stt) > - goto fail; > + goto fail_acct; > > stt->liobn = args->liobn; > stt->page_shift = args->page_shift; > @@ -334,24 +328,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, > goto fail; > } > > - kvm_get_kvm(kvm); > + ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, > + stt, O_RDWR | O_CLOEXEC); > + if (ret < 0) > + goto fail; > > mutex_lock(&kvm->lock); > - list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); > + > + /* Check this LIOBN hasn't been previously allocated */ > + ret = 0; > + list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) { > + if (siter->liobn == args->liobn) { > + ret = -EBUSY; > + break; > + } > + } > + 
> + if (!ret) { > + list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); > + kvm_get_kvm(kvm); > + } > > mutex_unlock(&kvm->lock); > > - return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, > - stt, O_RDWR | O_CLOEXEC); > + if (!ret) > + return fd; > > -fail: > - if (stt) { > - for (i = 0; i < npages; i++) > - if (stt->pages[i]) > - __free_page(stt->pages[i]); > + put_unused_fd(fd); > > - kfree(stt); > - } > + fail: > + for (i = 0; i < npages; i++) > + if (stt->pages[i]) > + __free_page(stt->pages[i]); > + > + kfree(stt); > + fail_acct: > + kvmppc_account_memlimit(kvmppc_stt_pages(npages), false); > return ret; > } > -- David Gibson| I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson signature.asc Description: PGP signature
Re: [PATCH v7 4/4] boot/param: add pointer to next argument to unknown parameter callback
On Thu, 24 Aug 2017 21:04:51 +1000 Michael Ellerman wrote: > Michal Suchanek writes: > > > The fadump parameter processing re-does the logic of next_arg quote > > stripping to determine where the argument ends. Pass pointer to the > > next argument instead to make this more robust. > > > > Signed-off-by: Michal Suchanek > > --- > > arch/powerpc/kernel/fadump.c | 13 + > > arch/powerpc/mm/hugetlbpage.c | 4 ++-- > > include/linux/moduleparam.h | 2 +- > > init/main.c | 12 ++-- > > kernel/module.c | 4 ++-- > > kernel/params.c | 19 +++ > > lib/dynamic_debug.c | 2 +- > > 7 files changed, 28 insertions(+), 28 deletions(-) > > Can you split out a patch that adds the next argument and updates the > callers. And then a patch for the fadump to use the new arg. > > cheers Yes, that makes sense. Thanks Michal
Re: [PATCH v7 3/4] lib/cmdline.c Remove quotes symmetrically.
On Thu, 24 Aug 2017 21:02:47 +1000 Michael Ellerman wrote: > Michal Suchanek writes: > > > Remove quotes from argument value only if there is qoute on both > > sides. > > > > Signed-off-by: Michal Suchanek > > --- > > arch/powerpc/kernel/fadump.c | 6 ++ > > lib/cmdline.c| 7 ++- > > Can you split that into two patches? Not really. There is logic in lib/cmdline.c which is duplicated in arch/powerpc/kernel/fadump.c and so the two places should be updated in sync. Thanks Michal > > cheers > > > 2 files changed, 4 insertions(+), 9 deletions(-) > > > > diff --git a/arch/powerpc/kernel/fadump.c > > b/arch/powerpc/kernel/fadump.c index a1614d9b8a21..d7da4ce9f7ae > > 100644 --- a/arch/powerpc/kernel/fadump.c > > +++ b/arch/powerpc/kernel/fadump.c > > @@ -489,10 +489,8 @@ static void __init fadump_update_params(struct > > param_info *param_info, *tgt++ = ' '; > > > > /* next_arg removes one leading and one trailing '"' */ > > - if (*tgt == '"') > > - shortening += 1; > > - if (*(tgt + vallen + shortening) == '"') > > - shortening += 1; > > + if ((*tgt == '"') && (*(tgt + vallen + shortening) == '"')) > > + shortening += 2; > > > > /* remove one leading and one trailing quote if both are > > present */ if ((val[0] == '"') && (val[vallen - 1] == '"')) { > > diff --git a/lib/cmdline.c b/lib/cmdline.c > > index 4c0888c4a68d..01e701b2afe8 100644 > > --- a/lib/cmdline.c > > +++ b/lib/cmdline.c > > @@ -227,14 +227,11 @@ char *next_arg(char *args, char **param, char > > **val) *val = args + equals + 1; > > > > /* Don't include quotes in value. */ > > - if (**val == '"') { > > + if ((**val == '"') && (args[i-1] == '"')) { > > (*val)++; > > - if (args[i-1] == '"') > > - args[i-1] = '\0'; > > + args[i-1] = '\0'; > > } > > } > > - if (quoted && args[i-1] == '"') > > - args[i-1] = '\0'; > > > > if (args[i]) { > > args[i] = '\0'; > > -- > > 2.10.2
[PATCH] rapidio: remove global irq spinlocks from the subsystem
Locking of config and doorbell operations should be done only if the underlying hardware requires it. This patch removes the global spinlocks from the rapidio subsystem and moves them to the mport drivers (fsl_rio and tsi721), only to the necessary places. For example, local config space read and write operations (lcread/lcwrite) are atomic in all existing drivers, so there should be no need for locking, while the cread/cwrite operations which generate maintenance transactions need to be synchronized with a lock. Later, each driver could chose to use a per-port lock instead of a global one, or even more granular locking. Signed-off-by: Ioan Nicu Signed-off-by: Frank Kunz --- arch/powerpc/sysdev/fsl_rio.c| 17 +++-- arch/powerpc/sysdev/fsl_rmu.c| 8 drivers/rapidio/devices/tsi721.c | 7 +++ drivers/rapidio/rio-access.c | 40 +--- 4 files changed, 35 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 1c41c51..e9f3bc9 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -71,6 +71,8 @@ #define RIWAR_WRTYP_ALLOC 0x6000 #define RIWAR_SIZE_MASK0x003F +static DEFINE_SPINLOCK(fsl_rio_config_lock); + #define __fsl_read_rio_config(x, addr, err, op)\ __asm__ __volatile__( \ "1: "op" %1,0(%2)\n"\ @@ -184,6 +186,7 @@ static int fsl_local_config_write(struct rio_mport *mport, u8 hopcount, u32 offset, int len, u32 *val) { struct rio_priv *priv = mport->priv; + unsigned long flags; u8 *data; u32 rval, err = 0; @@ -197,6 +200,8 @@ static int fsl_local_config_write(struct rio_mport *mport, if (offset > (0x100 - len) || !IS_ALIGNED(offset, len)) return -EINVAL; + spin_lock_irqsave(&fsl_rio_config_lock, flags); + out_be32(&priv->maint_atmu_regs->rowtar, (destid << 22) | (hopcount << 12) | (offset >> 12)); out_be32(&priv->maint_atmu_regs->rowtear, (destid >> 10)); @@ -213,6 +218,7 @@ static int fsl_local_config_write(struct rio_mport *mport, __fsl_read_rio_config(rval, data, err, "lwz"); break; 
default: + spin_unlock_irqrestore(&fsl_rio_config_lock, flags); return -EINVAL; } @@ -221,6 +227,7 @@ static int fsl_local_config_write(struct rio_mport *mport, err, destid, hopcount, offset); } + spin_unlock_irqrestore(&fsl_rio_config_lock, flags); *val = rval; return err; @@ -244,7 +251,10 @@ static int fsl_local_config_write(struct rio_mport *mport, u8 hopcount, u32 offset, int len, u32 val) { struct rio_priv *priv = mport->priv; + unsigned long flags; u8 *data; + int ret = 0; + pr_debug ("fsl_rio_config_write:" " index %d destid %d hopcount %d offset %8.8x len %d val %8.8x\n", @@ -255,6 +265,8 @@ static int fsl_local_config_write(struct rio_mport *mport, if (offset > (0x100 - len) || !IS_ALIGNED(offset, len)) return -EINVAL; + spin_lock_irqsave(&fsl_rio_config_lock, flags); + out_be32(&priv->maint_atmu_regs->rowtar, (destid << 22) | (hopcount << 12) | (offset >> 12)); out_be32(&priv->maint_atmu_regs->rowtear, (destid >> 10)); @@ -271,10 +283,11 @@ static int fsl_local_config_write(struct rio_mport *mport, out_be32((u32 *) data, val); break; default: - return -EINVAL; + ret = -EINVAL; } + spin_unlock_irqrestore(&fsl_rio_config_lock, flags); - return 0; + return ret; } static void fsl_rio_inbound_mem_init(struct rio_priv *priv) diff --git a/arch/powerpc/sysdev/fsl_rmu.c b/arch/powerpc/sysdev/fsl_rmu.c index c1826de..c15a17a 100644 --- a/arch/powerpc/sysdev/fsl_rmu.c +++ b/arch/powerpc/sysdev/fsl_rmu.c @@ -104,6 +104,8 @@ #define DOORBELL_MESSAGE_SIZE 0x08 +static DEFINE_SPINLOCK(fsl_rio_doorbell_lock); + struct rio_msg_regs { u32 omr; u32 osr; @@ -626,9 +628,13 @@ int fsl_rio_port_write_init(struct fsl_rio_pw *pw) int fsl_rio_doorbell_send(struct rio_mport *mport, int index, u16 destid, u16 data) { + unsigned long flags; + pr_debug("fsl_doorbell_send: index %d destid %4.4x data %4.4x\n", index, destid, data); + spin_lock_irqsave(&fsl_rio_doorbell_lock, flags); + /* In the serial version silicons, such as MPC8548, MPC8641, * below operations is must be. 
*/ @@ -638,6 +644,8 @@ int fsl_rio_doorbell_send(struct rio_mport *mport, out_be32(&dbell->dbell_regs->oddatr, (index << 20) | data); out_be32(&d
Re: [PATCH 1/2] powerpc/workqueue: update list of possible CPUs
On 23/08/2017 15:26, Tejun Heo wrote: > Hello, Michael. > > On Wed, Aug 23, 2017 at 09:00:39PM +1000, Michael Ellerman wrote: >>> I don't think that's true. The CPU id used in kernel doesn't have to >>> match the physical one and arch code should be able to pre-map CPU IDs >>> to nodes and use the matching one when hotplugging CPUs. I'm not >>> saying that's the best way to solve the problem tho. >> >> We already virtualise the CPU numbers, but not the node IDs. And it's >> the node IDs that are really the problem. > > Yeah, it just needs to match up new cpus to the cpu ids assigned to > the right node. We are not able to assign the cpu ids to the right node before the CPU is present, because firmware doesn't provide CPU mapping <-> node id before that. >>> It could be that the best way forward is making cpu <-> node mapping >>> dynamic and properly synchronized. >> >> We don't need it to be dynamic (at least for this bug). > > The node mapping for that cpu id changes *dynamically* while the > system is running and that can race with node-affinity sensitive > operations such as memory allocations. Memory is mapped to the node through its own firmware entry, so I don't think cpu id change can affect memory affinity, and before we know the node id of the CPU, the CPU is not present and thus it can't use memory. >> Laurent is booting Qemu with a fixed CPU <-> Node mapping, it's just >> that because some CPUs aren't present at boot we don't know what the >> node mapping is. (Correct me if I'm wrong Laurent). >> >> So all we need is: >> - the workqueue code to cope with CPUs that are possible but not online >>having NUMA_NO_NODE to begin with. >> - a way to update the workqueue cpumask when the CPU comes online. >> >> Which seems reasonable to me? > > Please take a step back and think through the problem again. You > can't bandaid it this way. Could you give some ideas, proposals? 
As the firmware doesn't provide the information before the CPU is actually plugged in, I really don't know how to manage this problem. Thanks, Laurent
Re: [PATCH v7 03/12] powerpc/vas: Define vas_init() and vas_exit()
Hi Suka, Comments inline ... Sukadev Bhattiprolu writes: > diff --git a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt > b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt > new file mode 100644 > index 000..0e3111d > --- /dev/null > +++ b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt > @@ -0,0 +1,24 @@ > +* IBM Powerpc Virtual Accelerator Switchboard (VAS) > + > +VAS is a hardware mechanism that allows kernel subsystems and user processes > +to directly submit compression and other requests to Nest accelerators (NX) > +or other coprocessors functions. > + > +Required properties: > +- compatible : should be "ibm,vas" or "ibm,power9-vas" The driver doesn't look for the latter. > +- ibm,vas-id : A unique identifier for each instance of VAS in the system What is this? > +- reg : Should contain 4 pairs of 64-bit fields specifying the Hypervisor > + window context start and length, OS/User window context start and length, > + "Paste address" start and length, "Paste window id" start bit and number > + of bits) > +- name : "vas" I don't think the name is normally included in the binding, and in fact there's no reason why the name is important, so I'd be inclined to drop that. > diff --git a/MAINTAINERS b/MAINTAINERS > index 3c41902..abc235f 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -6425,6 +6425,14 @@ F: drivers/crypto/nx/nx.* > F: drivers/crypto/nx/nx_csbcpb.h > F: drivers/crypto/nx/nx_debugfs.h > > +IBM Power Virtual Accelerator Switchboard > +M: Sukadev Bhattiprolu > +L: linuxppc-dev@lists.ozlabs.org > +S: Supported > +F: arch/powerpc/platforms/powernv/vas* > +F: arch/powerpc/include/asm/vas.h > +F: arch/powerpc/include/uapi/asm/vas.h That's not in the right place, the file is sorted alphabetically. V comes after L. 
> diff --git a/arch/powerpc/platforms/powernv/Kconfig > b/arch/powerpc/platforms/powernv/Kconfig > index 6a6f4ef..f565454 100644 > --- a/arch/powerpc/platforms/powernv/Kconfig > +++ b/arch/powerpc/platforms/powernv/Kconfig > @@ -30,3 +30,17 @@ config OPAL_PRD > help > This enables the opal-prd driver, a facility to run processor > recovery diagnostics on OpenPower machines > + > +config PPC_VAS > + bool "IBM Virtual Accelerator Switchboard (VAS)" ^ bool, so never a module. > + depends on PPC_POWERNV && PPC_64K_PAGES > + default n It should be default y. I know the usual advice is to make things 'default n', but this has fairly tight depends already, so y is OK IMO. > diff --git a/arch/powerpc/platforms/powernv/vas.c > b/arch/powerpc/platforms/powernv/vas.c > new file mode 100644 > index 000..556156b > --- /dev/null > +++ b/arch/powerpc/platforms/powernv/vas.c > @@ -0,0 +1,183 @@ > +/* > + * Copyright 2016 IBM Corp. 2016-2017. > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License > + * as published by the Free Software Foundation; either version > + * 2 of the License, or (at your option) any later version. > + */ #define pr_fmt(fmt) "vas: " fmt > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#include "vas.h" > + > +static bool init_done; > +LIST_HEAD(vas_instances); Can be static. 
> + > +static int init_vas_instance(struct platform_device *pdev) > +{ > + int rc, vasid; > + struct vas_instance *vinst; > + struct device_node *dn = pdev->dev.of_node; > + struct resource *res; struct device_node *dn = pdev->dev.of_node; struct vas_instance *vinst; struct resource *res; int rc, vasid; Petty I know, but much prettier :) > + > + rc = of_property_read_u32(dn, "ibm,vas-id", &vasid); > + if (rc) { > + pr_err("VAS: No ibm,vas-id property for %s?\n", pdev->name); With the pr_fmt() above you don't need VAS: on the front of all these. > + return -ENODEV; > + } > + > + if (pdev->num_resources != 4) { > + pr_err("VAS: Unexpected DT configuration for [%s, %d]\n", > + pdev->name, vasid); > + return -ENODEV; > + } > + > + vinst = kcalloc(1, sizeof(*vinst), GFP_KERNEL); kzalloc() would be more normal given there's only 1. > + if (!vinst) > + return -ENOMEM; > + > + INIT_LIST_HEAD(&vinst->node); > + ida_init(&vinst->ida); > + mutex_init(&vinst->mutex); > + vinst->vas_id = vasid; > + vinst->pdev = pdev; > + > + res = &pdev->resource[0]; > + vinst->hvwc_bar_start = res->start; > + vinst->hvwc_bar_len = res->end - res->start + 1; > + > + res = &pdev->resource[1]; > + vinst->uwc_bar_start = res->start; > + vinst->uwc_bar_len = res->end - res->start + 1; You have vinst->pdev, why do you need to copy all those? I don't see the lens even used. > + res = &pdev->resource[2]; > + vinst->paste_base_addr = res->start; > + > +
Re: [PATCH v7 4/4] boot/param: add pointer to next argument to unknown parameter callback
Michal Suchanek writes: > The fadump parameter processing re-does the logic of next_arg quote > stripping to determine where the argument ends. Pass pointer to the > next argument instead to make this more robust. > > Signed-off-by: Michal Suchanek > --- > arch/powerpc/kernel/fadump.c | 13 + > arch/powerpc/mm/hugetlbpage.c | 4 ++-- > include/linux/moduleparam.h | 2 +- > init/main.c | 12 ++-- > kernel/module.c | 4 ++-- > kernel/params.c | 19 +++ > lib/dynamic_debug.c | 2 +- > 7 files changed, 28 insertions(+), 28 deletions(-) Can you split out a patch that adds the next argument and updates the callers. And then a patch for the fadump to use the new arg. cheers > diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c > index d7da4ce9f7ae..6ef96711ee9a 100644 > --- a/arch/powerpc/kernel/fadump.c > +++ b/arch/powerpc/kernel/fadump.c > @@ -474,13 +474,14 @@ struct param_info { > }; > > static void __init fadump_update_params(struct param_info *param_info, > - char *param, char *val) > + char *param, char *val, char *next) > { > ptrdiff_t param_offset = param - param_info->tmp_cmdline; > size_t vallen = val ? 
strlen(val) : 0; > char *tgt = param_info->cmdline + param_offset + > FADUMP_EXTRA_ARGS_LEN - param_info->shortening; > - int shortening = 0; > + int shortening = ((next - 1) - (param)) > + - (FADUMP_EXTRA_ARGS_LEN + 1 + vallen); > > if (!val) > return; > @@ -488,10 +489,6 @@ static void __init fadump_update_params(struct > param_info *param_info, > /* remove '=' */ > *tgt++ = ' '; > > - /* next_arg removes one leading and one trailing '"' */ > - if ((*tgt == '"') && (*(tgt + vallen + shortening) == '"')) > - shortening += 2; > - > /* remove one leading and one trailing quote if both are present */ > if ((val[0] == '"') && (val[vallen - 1] == '"')) { > shortening += 2; > @@ -517,7 +514,7 @@ static void __init fadump_update_params(struct param_info > *param_info, > * to enforce the parameters passed through it > */ > static int __init fadump_rework_cmdline_params(char *param, char *val, > -const char *unused, void *arg) > + char *next, const char *unused, void *arg) > { > struct param_info *param_info = (struct param_info *)arg; > > @@ -525,7 +522,7 @@ static int __init fadump_rework_cmdline_params(char > *param, char *val, >strlen(FADUMP_EXTRA_ARGS_PARAM) - 1)) > return 0; > > - fadump_update_params(param_info, param, val); > + fadump_update_params(param_info, param, val, next); > > return 0; > } > diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c > index e1bf5ca397fe..3a4cce552906 100644 > --- a/arch/powerpc/mm/hugetlbpage.c > +++ b/arch/powerpc/mm/hugetlbpage.c > @@ -268,8 +268,8 @@ int alloc_bootmem_huge_page(struct hstate *hstate) > > unsigned long gpage_npages[MMU_PAGE_COUNT]; > > -static int __init do_gpage_early_setup(char *param, char *val, > -const char *unused, void *arg) > +static int __init do_gpage_early_setup(char *param, char *val, char *unused1, > +const char *unused2, void *arg) > { > static phys_addr_t size; > unsigned long npages; > diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h > index 
1ee7b30dafec..fec05a186c08 100644 > --- a/include/linux/moduleparam.h > +++ b/include/linux/moduleparam.h > @@ -326,7 +326,7 @@ extern char *parse_args(const char *name, > s16 level_min, > s16 level_max, > void *arg, > - int (*unknown)(char *param, char *val, > + int (*unknown)(char *param, char *val, char *next, >const char *doing, void *arg)); > > /* Called by module remove. */ > diff --git a/init/main.c b/init/main.c > index 052481fbe363..920c3564b2f0 100644 > --- a/init/main.c > +++ b/init/main.c > @@ -239,7 +239,7 @@ static int __init loglevel(char *str) > early_param("loglevel", loglevel); > > /* Change NUL term back to "=", to make "param" the whole string. */ > -static int __init repair_env_string(char *param, char *val, > +static int __init repair_env_string(char *param, char *val, char *unused2, > const char *unused, void *arg) > { > if (val) { > @@ -257,7 +257,7 @@ static int __init repair_env_string(char *param, char > *val, > } > > /* Anything after -- gets handed straight to init. */ > -static int __init set_init_arg(char *param, char *val, > +static int __init set_init_arg(char *param, char *val, c
Re: [PATCH v7 3/4] lib/cmdline.c Remove quotes symmetrically.
Michal Suchanek writes: > Remove quotes from argument value only if there is qoute on both sides. > > Signed-off-by: Michal Suchanek > --- > arch/powerpc/kernel/fadump.c | 6 ++ > lib/cmdline.c| 7 ++- Can you split that into two patches? cheers > 2 files changed, 4 insertions(+), 9 deletions(-) > > diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c > index a1614d9b8a21..d7da4ce9f7ae 100644 > --- a/arch/powerpc/kernel/fadump.c > +++ b/arch/powerpc/kernel/fadump.c > @@ -489,10 +489,8 @@ static void __init fadump_update_params(struct > param_info *param_info, > *tgt++ = ' '; > > /* next_arg removes one leading and one trailing '"' */ > - if (*tgt == '"') > - shortening += 1; > - if (*(tgt + vallen + shortening) == '"') > - shortening += 1; > + if ((*tgt == '"') && (*(tgt + vallen + shortening) == '"')) > + shortening += 2; > > /* remove one leading and one trailing quote if both are present */ > if ((val[0] == '"') && (val[vallen - 1] == '"')) { > diff --git a/lib/cmdline.c b/lib/cmdline.c > index 4c0888c4a68d..01e701b2afe8 100644 > --- a/lib/cmdline.c > +++ b/lib/cmdline.c > @@ -227,14 +227,11 @@ char *next_arg(char *args, char **param, char **val) > *val = args + equals + 1; > > /* Don't include quotes in value. */ > - if (**val == '"') { > + if ((**val == '"') && (args[i-1] == '"')) { > (*val)++; > - if (args[i-1] == '"') > - args[i-1] = '\0'; > + args[i-1] = '\0'; > } > } > - if (quoted && args[i-1] == '"') > - args[i-1] = '\0'; > > if (args[i]) { > args[i] = '\0'; > -- > 2.10.2
Re: [PATCH V9 1/2] powerpc/numa: Update CPU topology when VPHN enabled
Nathan Fontenot writes: > On 08/23/2017 06:41 AM, Michael Ellerman wrote: >> Michael Bringmann writes: >> >>> powerpc/numa: Correct the currently broken capability to set the >>> topology for shared CPUs in LPARs. At boot time for shared CPU >>> lpars, the topology for each shared CPU is set to node zero, however, >>> this is now updated correctly using the Virtual Processor Home Node >>> (VPHN) capabilities information provided by the pHyp. >>> >>> Also, update initialization checks for device-tree attributes to >>> independently recognize PRRN or VPHN usage. >> >> Did you ever do anything to address Nathan's comments on v4 ? >> >> http://patchwork.ozlabs.org/patch/767587/ > > Looking at this patch I do not see that VPHN is always enabled. You mean *this* patch? Or v4? I think you mean this patch, in which case I agree. >> Also your change log doesn't describe anything about what the patch does >> and why it is the correct fix for the problem. >> >> When a DLPAR happens you modify the VPHN timer to run in 1 nsec, but you >> don't wait for it. Why would we not just run the logic synchronously? >> >> It also seems to make VPHN and PRRN no longer exclusive, which looking >> at PAPR seems like it might be correct, but is also a major change so >> please justify it in detail. > > This is correct, they are not exclusive. When we first added PRRN support > we mistakenly thought they were exclusive which is why the code currently > only starts PRRN, or VPHN if PRRN is not present. OK. So we need a patch that does that and only that, and clearly explains why we're doing that and why it's the correct thing to do. Then a 2nd patch can fiddle with the timer, if we must. ... >>> +static int topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS; >>> +static int topology_inited; >>> +static int topology_update_needed; >> >> None of this code should be in numa.c. Which is not your fault but I'm >> inclined to move it before we make it worse. > > Agreed. 
> Perhaps this should all be in mm/vphn.c Actually I was thinking platforms/pseries/vphn.c. cheers
Re: [PATCH 4/4] powerpc/32: remove a NOP from memset()
Christophe Leroy writes: > memset() is patched after initialisation to activate the > optimised part which uses cache instructions. > > Today we have a 'b 2f' to skip the optimised patch, which then gets > replaced by a NOP, implying a useless cycle consumption. > As we have a 'bne 2f' just before, we could use that instruction > for the live patching, hence removing the need to have a > dedicated 'b 2f' to be replaced by a NOP. > > This patch changes the 'bne 2f' by a 'b 2f'. During init, that > 'b 2f' is then replaced by 'bne 2f' I'm not sure what the sequence is during boot for the 32-bit code, but can you use an ALT_FTR section for this? Possibly that doesn't get done at the right time though. cheers
[PATCH] powerpc: Fix DAR reporting when alignment handler faults
Anton noticed that if we fault part way through emulating an unaligned instruction, we don't update the DAR to reflect that. The DAR value is eventually reported back to userspace as the address in the SEGV signal, and if userspace is using that value to demand fault then it can be confused by us not setting the value correctly. This patch is ugly as hell, but is intended to be the minimal fix and back ports easily. Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/align.c | 119 +++- 1 file changed, 74 insertions(+), 45 deletions(-) diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index ec7a8b099dd9..fd3c1fcc73eb 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -235,6 +235,28 @@ static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr) #define SWIZ_PTR(p)((unsigned char __user *)((p) ^ swiz)) +#define __get_user_or_set_dar(_regs, _dest, _addr) \ + ({ \ + int rc = 0; \ + typeof(_addr) __addr = (_addr); \ + if (__get_user_inatomic(_dest, __addr)) { \ + _regs->dar = (unsigned long)__addr; \ + rc = -EFAULT; \ + } \ + rc; \ + }) + +#define __put_user_or_set_dar(_regs, _src, _addr) \ + ({ \ + int rc = 0; \ + typeof(_addr) __addr = (_addr); \ + if (__put_user_inatomic(_src, __addr)) {\ + _regs->dar = (unsigned long)__addr; \ + rc = -EFAULT; \ + } \ + rc; \ + }) + static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, unsigned int reg, unsigned int nb, unsigned int flags, unsigned int instr, @@ -263,9 +285,10 @@ static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, } else { unsigned long pc = regs->nip ^ (swiz & 4); - if (__get_user_inatomic(instr, - (unsigned int __user *)pc)) + if (__get_user_or_set_dar(regs, instr, + (unsigned int __user *)pc)) return -EFAULT; + if (swiz == 0 && (flags & SW)) instr = cpu_to_le32(instr); nb = (instr >> 11) & 0x1f; @@ -309,31 +332,31 @@ static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, ((nb0 + 
3) / 4) * sizeof(unsigned long)); for (i = 0; i < nb; ++i, ++p) - if (__get_user_inatomic(REG_BYTE(rptr, i ^ bswiz), - SWIZ_PTR(p))) + if (__get_user_or_set_dar(regs, REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; if (nb0 > 0) { rptr = ®s->gpr[0]; addr += nb; for (i = 0; i < nb0; ++i, ++p) - if (__get_user_inatomic(REG_BYTE(rptr, -i ^ bswiz), - SWIZ_PTR(p))) + if (__get_user_or_set_dar(regs, + REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; } } else { for (i = 0; i < nb; ++i, ++p) - if (__put_user_inatomic(REG_BYTE(rptr, i ^ bswiz), - SWIZ_PTR(p))) + if (__put_user_or_set_dar(regs, REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; if (nb0 > 0) { rptr = ®s->gpr[0]; addr += nb; for (i = 0; i < nb0; ++i, ++p) - if (__put_user_inatomic(REG_BYTE(rptr, -i ^ bswiz), -
Re: [PATCH] powerpc/pseries: Don't attempt to acquire drc during memory hot add for assigned lmbs
John Allen writes: > Check if an LMB is assigned before attempting to call dlpar_acquire_drc in > order to avoid any unnecessary rtas calls. This substantially reduces the > running time of memory hot add on lpars with large amounts of memory. > > Signed-off-by: John Allen I'll add: Fixes: c21f515c7436 ("powerpc/pseries: Make the acquire/release of the drc for memory a seperate step") ? How bad is the slow down, do we need to backport to stable/distros? cheers
Re: [PATCH] powerpc/powernv/idle: Round up latency and residency values
Vaidyanathan Srinivasan writes: > On PowerNV platforms, firmware provides exit latency and > target residency for each of the idle states in nano > seconds. Cpuidle framework expects the values in micro > seconds. Round up to nearest micro seconds to avoid errors > in cases where the values are defined as fractional micro > seconds. > > Default idle state of 'snooze' has exit latency of zero. If > other states have fractional micro second exit latency, they > would get rounded down to zero micro second and make cpuidle > framework choose deeper idle state when snooze loop is the > right choice. > > Reported-by: Anton Blanchard > Signed-off-by: Vaidyanathan Srinivasan This sounds like a fairly bad bug, does it need a Fixes / Cc stable tag? cheers
Re: [PATCH v2 12/14] KVM: PPC: Book3S HV: POWER9 can execute stop without a sync sequence
On Sat, Aug 12, 2017 at 02:39:10AM +1000, Nicholas Piggin wrote: > Reviewed-by: Gautham R. Shenoy > Signed-off-by: Nicholas Piggin > --- > arch/powerpc/kvm/book3s_hv_rmhandlers.S | 24 > 1 file changed, 12 insertions(+), 12 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S > index 3e024fd71fe8..edb47738a686 100644 > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S > @@ -2527,7 +2527,17 @@ BEGIN_FTR_SECTION > END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) > > kvm_nap_sequence:/* desired LPCR value in r5 */ > -BEGIN_FTR_SECTION > +BEGIN_FTR_SECTION/* nap sequence */ > + mtspr SPRN_LPCR,r5 > + isync > + li r0, 0 > + std r0, HSTATE_SCRATCH0(r13) > + ptesync > + ld r0, HSTATE_SCRATCH0(r13) > +1: cmpdr0, r0 > + bne 1b > + nap > +FTR_SECTION_ELSE /* stop sequence */ > /* >* PSSCR bits: exit criterion = 1 (wakeup based on LPCR at sreset) >* enable state loss = 1 (allow SMT mode switch) > @@ -2539,18 +2549,8 @@ BEGIN_FTR_SECTION > li r4, LPCR_PECE_HVEE@higher > sldir4, r4, 32 > or r5, r5, r4 > -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) > mtspr SPRN_LPCR,r5 > - isync > - li r0, 0 > - std r0, HSTATE_SCRATCH0(r13) > - ptesync > - ld r0, HSTATE_SCRATCH0(r13) > -1: cmpdr0, r0 > - bne 1b > -BEGIN_FTR_SECTION > - nap > -FTR_SECTION_ELSE > + > PPC_STOP > ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) > b . > -- > 2.13.3 Currently we never get to kvm_nap_sequence on POWER9 because we are always running one vcpu per vcore, so I haven't worried about this code too much. In future we might need this for running HPT guests on a radix host, though. Paul.
Re: [PATCH v2 10/14] KVM: PPC: Book3S HV: POWER9 does not require secondary thread management
On Sat, Aug 12, 2017 at 02:39:08AM +1000, Nicholas Piggin wrote: > POWER9 CPUs have independent MMU contexts per thread, so KVM does not > need to quiesce secondary threads, so the hwthread_req/hwthread_state > protocol does not have to be used. So patch it away on POWER9, and patch > away the branch from the Linux idle wakeup to kvm_start_guest that is > never used. If/when we add support for running HPT guests on a radix host, we will have to run the host in single-threaded mode (since POWER9 doesn't support having some threads of a core using HPT and some using radix simultaneously). We'll then need some sort of thing like kvmppc_grab_hwthread to coordinate with the threads so that guests can use the secondary threads. So I think most of this code should stay. We will still need to have a way to make sure that the secondaries are in real mode and not in a guest, because all threads will need to be in real mode when switching the core between radix and HPT mode. Maybe we can optimize it a bit at present given that we don't yet support running HPT guests on a radix host, but I don't want to make it harder to do that in future. Paul.
Re: [PATCH v2 1/1] futex: remove duplicated code and fix UB
Hi Jiri, On Thu, Aug 24, 2017 at 09:31:05AM +0200, Jiri Slaby wrote: > There is code duplicated over all architecture's headers for > futex_atomic_op_inuser. Namely op decoding, access_ok check for uaddr, > and comparison of the result. > > Remove this duplication and leave up to the arches only the needed > assembly which is now in arch_futex_atomic_op_inuser. > > This effectively distributes the Will Deacon's arm64 fix for undefined > behaviour reported by UBSAN to all architectures. The fix was done in > commit 5f16a046f8e1 (arm64: futex: Fix undefined behaviour with > FUTEX_OP_OPARG_SHIFT usage). Look there for an example dump. > > And as suggested by Thomas, check for negative oparg too, because it was > also reported to cause undefined behaviour report. > > Note that s390 removed access_ok check in d12a29703 ("s390/uaccess: > remove pointless access_ok() checks") as access_ok there returns true. > We introduce it back to the helper for the sake of simplicity (it gets > optimized away anyway). 
For arm64 and the core code: Reviewed-by: Will Deacon Although one minor thing on the core part: > diff --git a/kernel/futex.c b/kernel/futex.c > index 0939255fc750..3d38eaf05492 100644 > --- a/kernel/futex.c > +++ b/kernel/futex.c > @@ -1551,6 +1551,45 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int > nr_wake, u32 bitset) > return ret; > } > > +static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) > +{ > + unsigned int op = (encoded_op & 0x7000) >> 28; > + unsigned int cmp =(encoded_op & 0x0f00) >> 24; > + int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12); > + int cmparg = sign_extend32(encoded_op & 0x0fff, 12); > + int oldval, ret; > + > + if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { > + if (oparg < 0 || oparg > 31) > + return -EINVAL; > + oparg = 1 << oparg; > + } > + > + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) > + return -EFAULT; > + > + ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); > + if (ret) > + return ret; We could move the pagefault_{disable,enable} calls here, and then remove them from the futex_atomic_op_inuser callsites elsewhere in futex.c Will
Re: [PATCH v2 1/1] Split VGA default device handler out of VGA arbiter
On 24 August 2017 at 01:57, Dave Airlie wrote: >> Yeah, maybe it's time to disconnect the "default display device" idea >> from the VGA arbiter. I have no idea what (if any) dependencies X has >> on the legacy VGA resources. I assume X works fine on power, where it >> sounds like those resources are rarely or never available. > > The question on non-x86 archs, is what is the correct device to default to. > > On x86 we use the legacy VGA resources as a pointer, as this is the device > the BIOS appeared on at boot so hopefully should be one you can see stuff on. > > On non-x86 I've no idea how to decide if there are multiple devices, maybe the > firmware needs to tag something for the kernel if there are. Otherwise > you'd just > be picking something in probe order. > > I think the idea of these patches is to separate default display > device from the arbiter. > > X uses the arbiter on x86 if required (it's horrible, and it's rare we > have to nowadays), > but for finding the default device it just uses the sysfs boot_vga flag. > Part of the problem is that X refuses to start if there is only one display device to begin with in case it hasn't taken ownership of the VGA legacy resources.
Re: [PATCH v2] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()
On Thu, Aug 24, 2017 at 07:11:38PM +1000, Paul Mackerras wrote: Ignore this. My apologies. Paul.
[PATCH really v2] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()
Nixiaoming pointed out that there is a memory leak in kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd() fails; the memory allocated for the kvmppc_spapr_tce_table struct is not freed, and nor are the pages allocated for the iommu tables. In addition, we have already incremented the process's count of locked memory pages, and this doesn't get restored on error. David Hildenbrand pointed out that there is a race in that the function checks early on that there is not already an entry in the stt->iommu_tables list with the same LIOBN, but an entry with the same LIOBN could get added between then and when the new entry is added to the list. This fixes all three problems. To simplify things, we now call anon_inode_getfd() before placing the new entry in the list. The check for an existing entry is done while holding the kvm->lock mutex, immediately before adding the new entry to the list. Finally, on failure we now call kvmppc_account_memlimit to decrement the process's count of locked memory pages. 
Reported-by: Nixiaoming Reported-by: David Hildenbrand Signed-off-by: Paul Mackerras --- v2: Don't overwrite stt in loop over spapr_tce_tables arch/powerpc/kvm/book3s_64_vio.c | 56 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index a160c14..53766e2 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -294,32 +294,26 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args) { struct kvmppc_spapr_tce_table *stt = NULL; + struct kvmppc_spapr_tce_table *siter; unsigned long npages, size; int ret = -ENOMEM; int i; + int fd = -1; if (!args->size) return -EINVAL; - /* Check this LIOBN hasn't been previously allocated */ - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == args->liobn) - return -EBUSY; - } - size = _ALIGN_UP(args->size, PAGE_SIZE >> 3); npages = kvmppc_tce_pages(size); ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); - if (ret) { - stt = NULL; - goto fail; - } + if (ret) + return ret; ret = -ENOMEM; stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); if (!stt) - goto fail; + goto fail_acct; stt->liobn = args->liobn; stt->page_shift = args->page_shift; @@ -334,24 +328,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; } - kvm_get_kvm(kvm); + ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, + stt, O_RDWR | O_CLOEXEC); + if (ret < 0) + goto fail; mutex_lock(&kvm->lock); - list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); + + /* Check this LIOBN hasn't been previously allocated */ + ret = 0; + list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) { + if (siter->liobn == args->liobn) { + ret = -EBUSY; + break; + } + } + + if (!ret) { + list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); + kvm_get_kvm(kvm); + } mutex_unlock(&kvm->lock); - return anon_inode_getfd("kvm-spapr-tce", 
&kvm_spapr_tce_fops, - stt, O_RDWR | O_CLOEXEC); + if (!ret) + return fd; -fail: - if (stt) { - for (i = 0; i < npages; i++) - if (stt->pages[i]) - __free_page(stt->pages[i]); + put_unused_fd(fd); - kfree(stt); - } + fail: + for (i = 0; i < npages; i++) + if (stt->pages[i]) + __free_page(stt->pages[i]); + + kfree(stt); + fail_acct: + kvmppc_account_memlimit(kvmppc_stt_pages(npages), false); return ret; } -- 2.7.4
[PATCH v2] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()
Nixiaoming pointed out that there is a memory leak in kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd() fails; the memory allocated for the kvmppc_spapr_tce_table struct is not freed, and nor are the pages allocated for the iommu tables. In addition, we have already incremented the process's count of locked memory pages, and this doesn't get restored on error. David Hildenbrand pointed out that there is a race in that the function checks early on that there is not already an entry in the stt->iommu_tables list with the same LIOBN, but an entry with the same LIOBN could get added between then and when the new entry is added to the list. This fixes all three problems. To simplify things, we now call anon_inode_getfd() before placing the new entry in the list. The check for an existing entry is done while holding the kvm->lock mutex, immediately before adding the new entry to the list. Finally, on failure we now call kvmppc_account_memlimit to decrement the process's count of locked memory pages. 
Reported-by: Nixiaoming Reported-by: David Hildenbrand Signed-off-by: Paul Mackerras --- v2: Don't overwrite stt in loop over kvm->arch.spapr_tce_tables arch/powerpc/kvm/book3s_64_vio.c | 55 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index a160c14..d463c1c 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -297,29 +297,22 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, unsigned long npages, size; int ret = -ENOMEM; int i; + int fd = -1; if (!args->size) return -EINVAL; - /* Check this LIOBN hasn't been previously allocated */ - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == args->liobn) - return -EBUSY; - } - size = _ALIGN_UP(args->size, PAGE_SIZE >> 3); npages = kvmppc_tce_pages(size); ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); - if (ret) { - stt = NULL; - goto fail; - } + if (ret) + return ret; ret = -ENOMEM; stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); if (!stt) - goto fail; + goto fail_acct; stt->liobn = args->liobn; stt->page_shift = args->page_shift; @@ -334,24 +327,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; } - kvm_get_kvm(kvm); + ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, + stt, O_RDWR | O_CLOEXEC); + if (ret < 0) + goto fail; mutex_lock(&kvm->lock); - list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); + + /* Check this LIOBN hasn't been previously allocated */ + ret = 0; + list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { + if (stt->liobn == args->liobn) { + ret = -EBUSY; + break; + } + } + + if (!ret) { + list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); + kvm_get_kvm(kvm); + } mutex_unlock(&kvm->lock); - return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, - stt, O_RDWR | O_CLOEXEC); + if (!ret) + return fd; -fail: - if (stt) { - for (i = 0; i < npages; i++) - if 
(stt->pages[i]) - __free_page(stt->pages[i]); + put_unused_fd(fd); - kfree(stt); - } + fail: + for (i = 0; i < npages; i++) + if (stt->pages[i]) + __free_page(stt->pages[i]); + + kfree(stt); + fail_acct: + kvmppc_account_memlimit(kvmppc_stt_pages(npages), false); return ret; } -- 2.7.4
Re: [PATCH] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()
On Thu, Aug 24, 2017 at 06:43:22AM +, Nixiaoming wrote: > >From: Paul Mackerras [mailto:pau...@ozlabs.org] Thursday, August 24, 2017 > >11:40 AM > > > >Nixiaoming pointed out that there is a memory leak in > >kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd() fails; the > >memory allocated for the kvmppc_spapr_tce_table struct is not freed, and nor > >are the pages allocated for the iommu tables. In addition, we have already > >incremented the process's count of locked memory pages, and this doesn't get > >restored on error. > > > >David Hildenbrand pointed out that there is a race in that the function > >checks early on that there is not already an entry in the > >stt->iommu_tables list with the same LIOBN, but an entry with the > >same LIOBN could get added between then and when the new entry is added to > >the list. > > > >This fixes all three problems. To simplify things, we now call > >anon_inode_getfd() before placing the new entry in the list. The check for > >an existing entry is done while holding the kvm->lock mutex, immediately > >before adding the new entry to the list. > >Finally, on failure we now call kvmppc_account_memlimit to decrement the > >process's count of locked memory pages. 
> > > >Reported-by: Nixiaoming > >Reported-by: David Hildenbrand > >Signed-off-by: Paul Mackerras > >--- > > arch/powerpc/kvm/book3s_64_vio.c | 55 > > > > 1 file changed, 33 insertions(+), 22 deletions(-) > > > >diff --git a/arch/powerpc/kvm/book3s_64_vio.c > >b/arch/powerpc/kvm/book3s_64_vio.c > >index a160c14304eb..d463c1cd0d8d 100644 > >--- a/arch/powerpc/kvm/book3s_64_vio.c > >+++ b/arch/powerpc/kvm/book3s_64_vio.c > >@@ -297,29 +297,22 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, > > unsigned long npages, size; > > int ret = -ENOMEM; > > int i; > >+int fd = -1; > > > > if (!args->size) > > return -EINVAL; > > > >-/* Check this LIOBN hasn't been previously allocated */ > >-list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { > >-if (stt->liobn == args->liobn) > >-return -EBUSY; > >-} > >- > > size = _ALIGN_UP(args->size, PAGE_SIZE >> 3); > > npages = kvmppc_tce_pages(size); > > ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); > >-if (ret) { > >-stt = NULL; > >-goto fail; > >-} > >+if (ret) > >+return ret; > > > > ret = -ENOMEM; > > stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), > > GFP_KERNEL); > > if (!stt) > >-goto fail; > >+goto fail_acct; > > > > stt->liobn = args->liobn; > > stt->page_shift = args->page_shift; > >@@ -334,24 +327,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, > > goto fail; > > } > > > >-kvm_get_kvm(kvm); > >+ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, > >+stt, O_RDWR | O_CLOEXEC); > >+if (ret < 0) > >+goto fail; > > > > mutex_lock(&kvm->lock); > >-list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); > >+ > >+/* Check this LIOBN hasn't been previously allocated */ > >+ret = 0; > >+list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { > > I think stt can not be used here > need a new value for list_for_each_entry Yes. Good point. New version coming. Paul.
Re: [PATCH 10/10] powerpc/xive: fix the size of the cpumask used in xive_find_target_in_mask()
On 08/24/2017 07:49 AM, Michael Ellerman wrote: > Michael Ellerman writes: > >> Cédric Le Goater writes: >>> When called from xive_irq_startup(), the size of the cpumask can be >>> larger than nr_cpu_ids. Most of the time, its value is NR_CPUS (2048). > ... >> >> I guess this patch is a good fix, I'll expand the change log a bit. > > Actually this got lost, because it was part of the larger series, and > then you sent a v2 of the series and so v1 was marked superseded :/ > > Anyway I've pulled this out of the series and will merge it. I just saw your resend. Thanks for doing so, C.
[PATCH v2 1/1] futex: remove duplicated code and fix UB
There is code duplicated over all architecture's headers for futex_atomic_op_inuser. Namely op decoding, access_ok check for uaddr, and comparison of the result. Remove this duplication and leave up to the arches only the needed assembly which is now in arch_futex_atomic_op_inuser. This effectively distributes the Will Deacon's arm64 fix for undefined behaviour reported by UBSAN to all architectures. The fix was done in commit 5f16a046f8e1 (arm64: futex: Fix undefined behaviour with FUTEX_OP_OPARG_SHIFT usage). Look there for an example dump. And as suggested by Thomas, check for negative oparg too, because it was also reported to cause undefined behaviour report. Note that s390 removed access_ok check in d12a29703 ("s390/uaccess: remove pointless access_ok() checks") as access_ok there returns true. We introduce it back to the helper for the sake of simplicity (it gets optimized away anyway). [v2] - check also for negative values - wait for Will's fix to be in upstream Signed-off-by: Jiri Slaby Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Vineet Gupta Acked-by: Russell King Cc: Catalin Marinas Cc: Will Deacon Reviewed-by: Darren Hart (VMware) Cc: Richard Kuo Cc: Tony Luck Cc: Fenghua Yu Cc: Michal Simek Cc: Ralf Baechle Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Acked-by: Michael Ellerman (powerpc) Cc: Martin Schwidefsky Acked-by: Heiko Carstens [s390] Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Acked-by: Chris Metcalf [for tile] Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Chris Zankel Cc: Max Filippov Cc: Arnd Bergmann Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: --- arch/alpha/include/asm/futex.h | 26 --- arch/arc/include/asm/futex.h| 40 - arch/arm/include/asm/futex.h| 26 +++ arch/arm64/include/asm/futex.h | 26 +++ arch/frv/include/asm/futex.h| 3 ++- arch/frv/kernel/futex.c | 27 +++- arch/hexagon/include/asm/futex.h| 38 +++- arch/ia64/include/asm/futex.h | 25 +++ arch/microblaze/include/asm/futex.h | 38 +++- arch/mips/include/asm/futex.h | 25 +++ arch/openrisc/include/asm/futex.h | 39 +++-- arch/parisc/include/asm/futex.h | 26 +++ arch/powerpc/include/asm/futex.h| 26 --- arch/s390/include/asm/futex.h | 23 - arch/sh/include/asm/futex.h | 26 +++ arch/sparc/include/asm/futex_64.h | 26 --- arch/tile/include/asm/futex.h | 40 - arch/x86/include/asm/futex.h| 40 - arch/xtensa/include/asm/futex.h | 27 include/asm-generic/futex.h | 50 +++-- kernel/futex.c | 39 + 21 files changed, 130 insertions(+), 506 deletions(-) diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h index fb01dfb760c2..05a70edd57b6 100644 --- a/arch/alpha/include/asm/futex.h +++ b/arch/alpha/include/asm/futex.h @@ -25,18 +25,10 @@ : "r" (uaddr), "r"(oparg) \ : "memory") -static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -62,17 +54,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != 
cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 1
[PATCH v3 7/7] dpaa_eth: check allocation result
Signed-off-by: Madalin Bucur --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 73ca8d7..4225806 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2561,6 +2561,9 @@ static struct dpaa_bp *dpaa_bp_alloc(struct device *dev) dpaa_bp->bpid = FSL_DPAA_BPID_INV; dpaa_bp->percpu_count = devm_alloc_percpu(dev, *dpaa_bp->percpu_count); + if (!dpaa_bp->percpu_count) + return ERR_PTR(-ENOMEM); + dpaa_bp->config_count = FSL_DPAA_ETH_MAX_BUF_COUNT; dpaa_bp->seed_cb = dpaa_bp_seed; -- 2.1.0
[PATCH v3 6/7] Documentation: networking: add RSS information
Signed-off-by: Madalin Bucur --- Documentation/networking/dpaa.txt | 68 ++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/dpaa.txt b/Documentation/networking/dpaa.txt index 76e016d..f88194f 100644 --- a/Documentation/networking/dpaa.txt +++ b/Documentation/networking/dpaa.txt @@ -13,6 +13,7 @@ Contents - Configuring DPAA Ethernet in your kernel - DPAA Ethernet Frame Processing - DPAA Ethernet Features + - DPAA IRQ Affinity and Receive Side Scaling - Debugging DPAA Ethernet Overview @@ -147,7 +148,10 @@ gradually. The driver has Rx and Tx checksum offloading for UDP and TCP. Currently the Rx checksum offload feature is enabled by default and cannot be controlled through -ethtool. +ethtool. Also, rx-flow-hash and rx-hashing was added. The addition of RSS +provides a big performance boost for the forwarding scenarios, allowing +different traffic flows received by one interface to be processed by different +CPUs in parallel. The driver has support for multiple prioritized Tx traffic classes. Priorities range from 0 (lowest) to 3 (highest). These are mapped to HW workqueues with @@ -166,6 +170,68 @@ classes as follows: tc qdisc add dev root handle 1: \ mqprio num_tc 4 map 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 hw 1 +DPAA IRQ Affinity and Receive Side Scaling +== + +Traffic coming on the DPAA Rx queues or on the DPAA Tx confirmation +queues is seen by the CPU as ingress traffic on a certain portal. +The DPAA QMan portal interrupts are affined each to a certain CPU. +The same portal interrupt services all the QMan portal consumers. + +By default the DPAA Ethernet driver enables RSS, making use of the +DPAA FMan Parser and Keygen blocks to distribute traffic on 128 +hardware frame queues using a hash on IP v4/v6 source and destination +and L4 source and destination ports, in present in the received frame. +When RSS is disabled, all traffic received by a certain interface is +received on the default Rx frame queue. 
The default DPAA Rx frame +queues are configured to put the received traffic into a pool channel +that allows any available CPU portal to dequeue the ingress traffic. +The default frame queues have the HOLDACTIVE option set, ensuring that +traffic bursts from a certain queue are serviced by the same CPU. +This ensures a very low rate of frame reordering. A drawback of this +is that only one CPU at a time can service the traffic received by a +certain interface when RSS is not enabled. + +To implement RSS, the DPAA Ethernet driver allocates an extra set of +128 Rx frame queues that are configured to dedicated channels, in a +round-robin manner. The mapping of the frame queues to CPUs is now +hardcoded, there is no indirection table to move traffic for a certain +FQ (hash result) to another CPU. The ingress traffic arriving on one +of these frame queues will arrive at the same portal and will always +be processed by the same CPU. This ensures intra-flow order preservation +and workload distribution for multiple traffic flows. + +RSS can be turned off for a certain interface using ethtool, i.e. + + # ethtool -N fm1-mac9 rx-flow-hash tcp4 "" + +To turn it back on, one needs to set rx-flow-hash for tcp4/6 or udp4/6: + + # ethtool -N fm1-mac9 rx-flow-hash udp4 sfdn + +There is no independent control for individual protocols, any command +run for one of tcp4|udp4|ah4|esp4|sctp4|tcp6|udp6|ah6|esp6|sctp6 is +going to control the rx-flow-hashing for all protocols on that interface. + +Besides using the FMan Keygen computed hash for spreading traffic on the +128 Rx FQs, the DPAA Ethernet driver also sets the skb hash value when +the NETIF_F_RXHASH feature is on (active by default). 
This can be turned +on or off through ethtool, i.e.: + + # ethtool -K fm1-mac9 rx-hashing off + # ethtool -k fm1-mac9 | grep hash + receive-hashing: off + # ethtool -K fm1-mac9 rx-hashing on + Actual changes: + receive-hashing: on + # ethtool -k fm1-mac9 | grep hash + receive-hashing: on + +Please note that Rx hashing depends upon the rx-flow-hashing being on +for that interface - turning off rx-flow-hashing will also disable the +rx-hashing (without ethtool reporting it as off as that depends on the +NETIF_F_RXHASH feature flag). + Debugging = -- 2.1.0
[PATCH v3 5/7] dpaa_eth: add NETIF_F_RXHASH
Set the skb hash when then FMan Keygen hash result is available. Signed-off-by: Madalin Bucur --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 23 +++--- drivers/net/ethernet/freescale/dpaa/dpaa_eth.h | 1 + drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 9 +++-- drivers/net/ethernet/freescale/fman/fman_port.c| 11 +++ drivers/net/ethernet/freescale/fman/fman_port.h| 2 ++ 5 files changed, 41 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 6d89e74..73ca8d7 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -236,7 +236,7 @@ static int dpaa_netdev_init(struct net_device *net_dev, net_dev->max_mtu = dpaa_get_max_mtu(); net_dev->hw_features |= (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | -NETIF_F_LLTX); +NETIF_F_LLTX | NETIF_F_RXHASH); net_dev->hw_features |= NETIF_F_SG | NETIF_F_HIGHDMA; /* The kernels enables GSO automatically, if we declare NETIF_F_SG. 
@@ -2237,12 +2237,13 @@ static enum qman_cb_dqrr_result rx_default_dqrr(struct qman_portal *portal, dma_addr_t addr = qm_fd_addr(fd); enum qm_fd_format fd_format; struct net_device *net_dev; - u32 fd_status; + u32 fd_status, hash_offset; struct dpaa_bp *dpaa_bp; struct dpaa_priv *priv; unsigned int skb_len; struct sk_buff *skb; int *count_ptr; + void *vaddr; fd_status = be32_to_cpu(fd->status); fd_format = qm_fd_get_format(fd); @@ -2288,7 +2289,8 @@ static enum qman_cb_dqrr_result rx_default_dqrr(struct qman_portal *portal, dma_unmap_single(dpaa_bp->dev, addr, dpaa_bp->size, DMA_FROM_DEVICE); /* prefetch the first 64 bytes of the frame or the SGT start */ - prefetch(phys_to_virt(addr) + qm_fd_get_offset(fd)); + vaddr = phys_to_virt(addr); + prefetch(vaddr + qm_fd_get_offset(fd)); fd_format = qm_fd_get_format(fd); /* The only FD types that we may receive are contig and S/G */ @@ -2309,6 +2311,18 @@ static enum qman_cb_dqrr_result rx_default_dqrr(struct qman_portal *portal, skb->protocol = eth_type_trans(skb, net_dev); + if (net_dev->features & NETIF_F_RXHASH && priv->keygen_in_use && + !fman_port_get_hash_result_offset(priv->mac_dev->port[RX], + &hash_offset)) { + enum pkt_hash_types type; + + /* if L4 exists, it was used in the hash generation */ + type = be32_to_cpu(fd->status) & FM_FD_STAT_L4CV ? 
+ PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3; + skb_set_hash(skb, be32_to_cpu(*(u32 *)(vaddr + hash_offset)), +type); + } + skb_len = skb->len; if (unlikely(netif_receive_skb(skb) == NET_RX_DROP)) @@ -2774,6 +2788,9 @@ static int dpaa_eth_probe(struct platform_device *pdev) if (err) goto init_ports_failed; + /* Rx traffic distribution based on keygen hashing defaults to on */ + priv->keygen_in_use = true; + priv->percpu_priv = devm_alloc_percpu(dev, *priv->percpu_priv); if (!priv->percpu_priv) { dev_err(dev, "devm_alloc_percpu() failed\n"); diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h index 496a12c..bd94220 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h @@ -159,6 +159,7 @@ struct dpaa_priv { struct list_head dpaa_fq_list; u8 num_tc; + bool keygen_in_use; u32 msg_enable; /* net_device message level */ struct { diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c index 965f652..faea674 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c @@ -402,6 +402,8 @@ static void dpaa_get_strings(struct net_device *net_dev, u32 stringset, static int dpaa_get_hash_opts(struct net_device *dev, struct ethtool_rxnfc *cmd) { + struct dpaa_priv *priv = netdev_priv(dev); + cmd->data = 0; switch (cmd->flow_type) { @@ -409,7 +411,8 @@ static int dpaa_get_hash_opts(struct net_device *dev, case TCP_V6_FLOW: case UDP_V4_FLOW: case UDP_V6_FLOW: - cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + if (priv->keygen_in_use) + cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; /* Fall through */ case IPV4_FLOW: case IPV6_FLOW: @@ -421,7 +424,8 @@ static int dpaa_get_hash_opts(struct net_device *dev, case AH_V6_FLOW:
[PATCH v3 4/7] dpaa_eth: enable Rx hashing control
Allow ethtool control of the Rx flow hashing. By default RSS is enabled, this allows to turn it off by bypassing the FMan Keygen block and sending all traffic on the default Rx frame queue. Signed-off-by: Madalin Bucur --- drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 113 + 1 file changed, 113 insertions(+) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c index aad825088..965f652 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c @@ -399,6 +399,117 @@ static void dpaa_get_strings(struct net_device *net_dev, u32 stringset, memcpy(strings, dpaa_stats_global, size); } +static int dpaa_get_hash_opts(struct net_device *dev, + struct ethtool_rxnfc *cmd) +{ + cmd->data = 0; + + switch (cmd->flow_type) { + case TCP_V4_FLOW: + case TCP_V6_FLOW: + case UDP_V4_FLOW: + case UDP_V6_FLOW: + cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + /* Fall through */ + case IPV4_FLOW: + case IPV6_FLOW: + case SCTP_V4_FLOW: + case SCTP_V6_FLOW: + case AH_ESP_V4_FLOW: + case AH_ESP_V6_FLOW: + case AH_V4_FLOW: + case AH_V6_FLOW: + case ESP_V4_FLOW: + case ESP_V6_FLOW: + cmd->data |= RXH_IP_SRC | RXH_IP_DST; + break; + default: + cmd->data = 0; + break; + } + + return 0; +} + +static int dpaa_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, + u32 *unused) +{ + int ret = -EOPNOTSUPP; + + switch (cmd->cmd) { + case ETHTOOL_GRXFH: + ret = dpaa_get_hash_opts(dev, cmd); + break; + default: + break; + } + + return ret; +} + +static void dpaa_set_hash(struct net_device *net_dev, bool enable) +{ + struct mac_device *mac_dev; + struct fman_port *rxport; + struct dpaa_priv *priv; + + priv = netdev_priv(net_dev); + mac_dev = priv->mac_dev; + rxport = mac_dev->port[0]; + + fman_port_use_kg_hash(rxport, enable); +} + +static int dpaa_set_hash_opts(struct net_device *dev, + struct ethtool_rxnfc *nfc) +{ + int ret = -EINVAL; + + /* we support hashing on 
IPv4/v6 src/dest IP and L4 src/dest port */ + if (nfc->data & + ~(RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3)) + return -EINVAL; + + switch (nfc->flow_type) { + case TCP_V4_FLOW: + case TCP_V6_FLOW: + case UDP_V4_FLOW: + case UDP_V6_FLOW: + case IPV4_FLOW: + case IPV6_FLOW: + case SCTP_V4_FLOW: + case SCTP_V6_FLOW: + case AH_ESP_V4_FLOW: + case AH_ESP_V6_FLOW: + case AH_V4_FLOW: + case AH_V6_FLOW: + case ESP_V4_FLOW: + case ESP_V6_FLOW: + dpaa_set_hash(dev, !!nfc->data); + ret = 0; + break; + default: + break; + } + + return ret; +} + +static int dpaa_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) +{ + int ret = -EOPNOTSUPP; + + switch (cmd->cmd) { + case ETHTOOL_SRXFH: + ret = dpaa_set_hash_opts(dev, cmd); + break; + default: + break; + } + + return ret; +} + const struct ethtool_ops dpaa_ethtool_ops = { .get_drvinfo = dpaa_get_drvinfo, .get_msglevel = dpaa_get_msglevel, @@ -412,4 +523,6 @@ const struct ethtool_ops dpaa_ethtool_ops = { .get_strings = dpaa_get_strings, .get_link_ksettings = dpaa_get_link_ksettings, .set_link_ksettings = dpaa_set_link_ksettings, + .get_rxnfc = dpaa_get_rxnfc, + .set_rxnfc = dpaa_set_rxnfc, }; -- 2.1.0
[PATCH v3 2/7] fsl/fman: enable FMan Keygen
From: Iordache Florinel-R70177 Add support for the FMan Keygen with a hardcoded scheme to spread incoming traffic on a FQ range based on source and destination IPs and ports. Signed-off-by: Iordache Florinel Signed-off-by: Madalin Bucur --- drivers/net/ethernet/freescale/fman/Makefile | 2 +- drivers/net/ethernet/freescale/fman/fman.c| 8 + drivers/net/ethernet/freescale/fman/fman.h| 2 + drivers/net/ethernet/freescale/fman/fman_keygen.c | 783 ++ drivers/net/ethernet/freescale/fman/fman_keygen.h | 46 ++ drivers/net/ethernet/freescale/fman/fman_port.c | 40 +- drivers/net/ethernet/freescale/fman/fman_port.h | 5 + 7 files changed, 884 insertions(+), 2 deletions(-) create mode 100644 drivers/net/ethernet/freescale/fman/fman_keygen.c create mode 100644 drivers/net/ethernet/freescale/fman/fman_keygen.h diff --git a/drivers/net/ethernet/freescale/fman/Makefile b/drivers/net/ethernet/freescale/fman/Makefile index 6049177..2c38119 100644 --- a/drivers/net/ethernet/freescale/fman/Makefile +++ b/drivers/net/ethernet/freescale/fman/Makefile @@ -4,6 +4,6 @@ obj-$(CONFIG_FSL_FMAN) += fsl_fman.o obj-$(CONFIG_FSL_FMAN) += fsl_fman_port.o obj-$(CONFIG_FSL_FMAN) += fsl_mac.o -fsl_fman-objs := fman_muram.o fman.o fman_sp.o +fsl_fman-objs := fman_muram.o fman.o fman_sp.o fman_keygen.o fsl_fman_port-objs := fman_port.o fsl_mac-objs:= mac.o fman_dtsec.o fman_memac.o fman_tgec.o diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c index 6ed383f..9383b99 100644 --- a/drivers/net/ethernet/freescale/fman/fman.c +++ b/drivers/net/ethernet/freescale/fman/fman.c @@ -34,6 +34,7 @@ #include "fman.h" #include "fman_muram.h" +#include "fman_keygen.h" #include #include @@ -56,6 +57,7 @@ /* Modules registers offsets */ #define BMI_OFFSET 0x0008 #define QMI_OFFSET 0x00080400 +#define KG_OFFSET 0x000C1000 #define DMA_OFFSET 0x000C2000 #define FPM_OFFSET 0x000C3000 #define IMEM_OFFSET0x000C4000 @@ -1737,6 +1739,7 @@ static int fman_config(struct fman *fman) 
fman->qmi_regs = base_addr + QMI_OFFSET; fman->dma_regs = base_addr + DMA_OFFSET; fman->hwp_regs = base_addr + HWP_OFFSET; + fman->kg_regs = base_addr + KG_OFFSET; fman->base_addr = base_addr; spin_lock_init(&fman->spinlock); @@ -2009,6 +2012,11 @@ static int fman_init(struct fman *fman) /* Init HW Parser */ hwp_init(fman->hwp_regs); + /* Init KeyGen */ + fman->keygen = keygen_init(fman->kg_regs); + if (!fman->keygen) + return -EINVAL; + err = enable(fman, cfg); if (err != 0) return err; diff --git a/drivers/net/ethernet/freescale/fman/fman.h b/drivers/net/ethernet/freescale/fman/fman.h index 6745065..a4b1633 100644 --- a/drivers/net/ethernet/freescale/fman/fman.h +++ b/drivers/net/ethernet/freescale/fman/fman.h @@ -326,6 +326,7 @@ struct fman { struct fman_qmi_regs __iomem *qmi_regs; struct fman_dma_regs __iomem *dma_regs; struct fman_hwp_regs __iomem *hwp_regs; + struct fman_kg_regs __iomem *kg_regs; fman_exceptions_cb *exception_cb; fman_bus_error_cb *bus_error_cb; /* Spinlock for FMan use */ @@ -334,6 +335,7 @@ struct fman { struct fman_cfg *cfg; struct muram_info *muram; + struct fman_keygen *keygen; /* cam section in muram */ unsigned long cam_offset; size_t cam_size; diff --git a/drivers/net/ethernet/freescale/fman/fman_keygen.c b/drivers/net/ethernet/freescale/fman/fman_keygen.c new file mode 100644 index 000..f54da3c --- /dev/null +++ b/drivers/net/ethernet/freescale/fman/fman_keygen.c @@ -0,0 +1,783 @@ +/* + * Copyright 2017 NXP + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of NXP nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY NXP ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WA
[PATCH v3 3/7] dpaa_eth: use multiple Rx frame queues
Add a block of 128 Rx frame queues per port. The FMan hardware will send traffic on one of these queues based on the FMan port Parse Classify Distribute setup. The hash computed by the FMan Keygen block will select the Rx FQ. Signed-off-by: Madalin Bucur --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 50 +++--- drivers/net/ethernet/freescale/dpaa/dpaa_eth.h | 1 + .../net/ethernet/freescale/dpaa/dpaa_eth_sysfs.c | 3 ++ 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index c7fa285..6d89e74 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -158,7 +158,7 @@ MODULE_PARM_DESC(tx_timeout, "The Tx timeout in ms"); #define DPAA_RX_PRIV_DATA_SIZE (u16)(DPAA_TX_PRIV_DATA_SIZE + \ dpaa_rx_extra_headroom) -#define DPAA_ETH_RX_QUEUES 128 +#define DPAA_ETH_PCD_RXQ_NUM 128 #define DPAA_ENQUEUE_RETRIES 10 @@ -169,6 +169,7 @@ struct fm_port_fqs { struct dpaa_fq *tx_errq; struct dpaa_fq *rx_defq; struct dpaa_fq *rx_errq; + struct dpaa_fq *rx_pcdq; }; /* All the dpa bps in use at any moment */ @@ -628,6 +629,7 @@ static inline void dpaa_assign_wq(struct dpaa_fq *fq, int idx) fq->wq = 5; break; case FQ_TYPE_RX_DEFAULT: + case FQ_TYPE_RX_PCD: fq->wq = 6; break; case FQ_TYPE_TX: @@ -688,6 +690,7 @@ static int dpaa_alloc_all_fqs(struct device *dev, struct list_head *list, struct fm_port_fqs *port_fqs) { struct dpaa_fq *dpaa_fq; + u32 fq_base, fq_base_aligned, i; dpaa_fq = dpaa_fq_alloc(dev, 0, 1, list, FQ_TYPE_RX_ERROR); if (!dpaa_fq) @@ -701,6 +704,26 @@ static int dpaa_alloc_all_fqs(struct device *dev, struct list_head *list, port_fqs->rx_defq = &dpaa_fq[0]; + /* the PCD FQIDs range needs to be aligned for correct operation */ + if (qman_alloc_fqid_range(&fq_base, 2 * DPAA_ETH_PCD_RXQ_NUM)) + goto fq_alloc_failed; + + fq_base_aligned = ALIGN(fq_base, DPAA_ETH_PCD_RXQ_NUM); + + for (i = fq_base; i < 
fq_base_aligned; i++) + qman_release_fqid(i); + + for (i = fq_base_aligned + DPAA_ETH_PCD_RXQ_NUM; +i < (fq_base + 2 * DPAA_ETH_PCD_RXQ_NUM); i++) + qman_release_fqid(i); + + dpaa_fq = dpaa_fq_alloc(dev, fq_base_aligned, DPAA_ETH_PCD_RXQ_NUM, + list, FQ_TYPE_RX_PCD); + if (!dpaa_fq) + goto fq_alloc_failed; + + port_fqs->rx_pcdq = &dpaa_fq[0]; + if (!dpaa_fq_alloc(dev, 0, DPAA_ETH_TXQ_NUM, list, FQ_TYPE_TX_CONF_MQ)) goto fq_alloc_failed; @@ -870,13 +893,14 @@ static void dpaa_fq_setup(struct dpaa_priv *priv, const struct dpaa_fq_cbs *fq_cbs, struct fman_port *tx_port) { - int egress_cnt = 0, conf_cnt = 0, num_portals = 0, cpu; + int egress_cnt = 0, conf_cnt = 0, num_portals = 0, portal_cnt = 0, cpu; const cpumask_t *affine_cpus = qman_affine_cpus(); - u16 portals[NR_CPUS]; + u16 channels[NR_CPUS]; struct dpaa_fq *fq; for_each_cpu(cpu, affine_cpus) - portals[num_portals++] = qman_affine_channel(cpu); + channels[num_portals++] = qman_affine_channel(cpu); + if (num_portals == 0) dev_err(priv->net_dev->dev.parent, "No Qman software (affine) channels found"); @@ -890,6 +914,12 @@ static void dpaa_fq_setup(struct dpaa_priv *priv, case FQ_TYPE_RX_ERROR: dpaa_setup_ingress(priv, fq, &fq_cbs->rx_errq); break; + case FQ_TYPE_RX_PCD: + if (!num_portals) + continue; + dpaa_setup_ingress(priv, fq, &fq_cbs->rx_defq); + fq->channel = channels[portal_cnt++ % num_portals]; + break; case FQ_TYPE_TX: dpaa_setup_egress(priv, fq, tx_port, &fq_cbs->egress_ern); @@ -1039,7 +1069,8 @@ static int dpaa_fq_init(struct dpaa_fq *dpaa_fq, bool td_enable) /* Put all the ingress queues in our "ingress CGR". */ if (priv->use_ingress_cgr && (dpaa_fq->fq_type == FQ_TYPE_RX_DEFAULT || -dpaa_fq->fq_type == FQ_TYPE_RX_ERROR)) { +dpaa_fq->fq_type == FQ_TYPE_RX_ERROR || +dpaa_fq->fq_type == FQ_TYPE_RX_PCD)) { initfq.we_mask |= cpu_to_be16(QM_INITFQ_WE_CGID); initfq.fqd.fq_ctrl |= cpu_to_be16(QM_FQCTRL_CGE); i
[PATCH v3 1/7] fsl/fman: move struct fman to header file
Signed-off-by: Madalin Bucur --- drivers/net/ethernet/freescale/fman/fman.c | 74 -- drivers/net/ethernet/freescale/fman/fman.h | 73 + 2 files changed, 73 insertions(+), 74 deletions(-) diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c index e714b8f..6ed383f 100644 --- a/drivers/net/ethernet/freescale/fman/fman.c +++ b/drivers/net/ethernet/freescale/fman/fman.c @@ -564,80 +564,6 @@ struct fman_cfg { u32 qmi_def_tnums_thresh; }; -/* Structure that holds information received from device tree */ -struct fman_dts_params { - void __iomem *base_addr;/* FMan virtual address */ - struct resource *res; /* FMan memory resource */ - u8 id; /* FMan ID */ - - int err_irq;/* FMan Error IRQ */ - - u16 clk_freq; /* FMan clock freq (In Mhz) */ - - u32 qman_channel_base; /* QMan channels base */ - u32 num_of_qman_channels; /* Number of QMan channels */ - - struct resource muram_res; /* MURAM resource */ -}; - -/** fman_exceptions_cb - * fman- Pointer to FMan - * exception - The exception. - * - * Exceptions user callback routine, will be called upon an exception - * passing the exception identification. - * - * Return: irq status - */ -typedef irqreturn_t (fman_exceptions_cb)(struct fman *fman, -enum fman_exceptions exception); - -/** fman_bus_error_cb - * fman- Pointer to FMan - * port_id - Port id - * addr- Address that caused the error - * tnum- Owner of error - * liodn - Logical IO device number - * - * Bus error user callback routine, will be called upon bus error, - * passing parameters describing the errors and the owner. 
- * - * Return: IRQ status - */ -typedef irqreturn_t (fman_bus_error_cb)(struct fman *fman, u8 port_id, - u64 addr, u8 tnum, u16 liodn); - -struct fman { - struct device *dev; - void __iomem *base_addr; - struct fman_intr_src intr_mng[FMAN_EV_CNT]; - - struct fman_fpm_regs __iomem *fpm_regs; - struct fman_bmi_regs __iomem *bmi_regs; - struct fman_qmi_regs __iomem *qmi_regs; - struct fman_dma_regs __iomem *dma_regs; - struct fman_hwp_regs __iomem *hwp_regs; - fman_exceptions_cb *exception_cb; - fman_bus_error_cb *bus_error_cb; - /* Spinlock for FMan use */ - spinlock_t spinlock; - struct fman_state_struct *state; - - struct fman_cfg *cfg; - struct muram_info *muram; - /* cam section in muram */ - unsigned long cam_offset; - size_t cam_size; - /* Fifo in MURAM */ - unsigned long fifo_offset; - size_t fifo_size; - - u32 liodn_base[64]; - u32 liodn_offset[64]; - - struct fman_dts_params dts_params; -}; - static irqreturn_t fman_exceptions(struct fman *fman, enum fman_exceptions exception) { diff --git a/drivers/net/ethernet/freescale/fman/fman.h b/drivers/net/ethernet/freescale/fman/fman.h index f53e147..6745065 100644 --- a/drivers/net/ethernet/freescale/fman/fman.h +++ b/drivers/net/ethernet/freescale/fman/fman.h @@ -274,6 +274,79 @@ struct fman_intr_src { void *src_handle; }; +/** fman_exceptions_cb + * fman - Pointer to FMan + * exception- The exception. + * + * Exceptions user callback routine, will be called upon an exception + * passing the exception identification. + * + * Return: irq status + */ +typedef irqreturn_t (fman_exceptions_cb)(struct fman *fman, +enum fman_exceptions exception); +/** fman_bus_error_cb + * fman - Pointer to FMan + * port_id - Port id + * addr - Address that caused the error + * tnum - Owner of error + * liodn- Logical IO device number + * + * Bus error user callback routine, will be called upon bus error, + * passing parameters describing the errors and the owner. 
+ * + * Return: IRQ status + */ +typedef irqreturn_t (fman_bus_error_cb)(struct fman *fman, u8 port_id, + u64 addr, u8 tnum, u16 liodn); + +/* Structure that holds information received from device tree */ +struct fman_dts_params { + void __iomem *base_addr;/* FMan virtual address */ + struct resource *res; /* FMan memory resource */ + u8 id; /* FMan ID */ + + int err_irq;/* FMan Error IRQ */ + + u16 clk_freq; /* FMan clock freq (In Mhz) */ + + u32 qman_channel_base; /* QMan channels base */ + u32 num_of_qman_channels; /* Num
[PATCH v3 0/7] Add RSS to DPAA 1.x Ethernet driver
This patch set introduces Receive Side Scaling for the DPAA Ethernet driver. Documentation is updated with details related to the new feature and limitations that apply. Added also a small fix. v2: removed a C++ style comment v3: move struct fman to header file to avoid exporting a function Iordache Florinel-R70177 (1): fsl/fman: enable FMan Keygen Madalin Bucur (6): fsl/fman: move struct fman to header file dpaa_eth: use multiple Rx frame queues dpaa_eth: enable Rx hashing control dpaa_eth: add NETIF_F_RXHASH Documentation: networking: add RSS information dpaa_eth: check allocation result Documentation/networking/dpaa.txt | 68 +- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 76 +- drivers/net/ethernet/freescale/dpaa/dpaa_eth.h | 2 + .../net/ethernet/freescale/dpaa/dpaa_eth_sysfs.c | 3 + drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 118 drivers/net/ethernet/freescale/fman/Makefile | 2 +- drivers/net/ethernet/freescale/fman/fman.c | 82 +-- drivers/net/ethernet/freescale/fman/fman.h | 75 ++ drivers/net/ethernet/freescale/fman/fman_keygen.c | 783 + drivers/net/ethernet/freescale/fman/fman_keygen.h | 46 ++ drivers/net/ethernet/freescale/fman/fman_port.c| 51 +- drivers/net/ethernet/freescale/fman/fman_port.h| 7 + 12 files changed, 1226 insertions(+), 87 deletions(-) create mode 100644 drivers/net/ethernet/freescale/fman/fman_keygen.c create mode 100644 drivers/net/ethernet/freescale/fman/fman_keygen.h -- 2.1.0
Re: [PATCH] cxl: Add support for POWER9 DD2
Le 24/08/2017 à 07:24, Andrew Donnellan a écrit : On 24/08/17 00:58, Christophe Lombard wrote: The PSL initialization sequence has been updated to DD2. This patch adapts to the changes, retaining compatibility with DD1. Tests performed on some of the new hardware. If we're retaining compatibility with DD1 I assume it's been tested on some of the old hardware too? right, it's been tested on boston machine with dd1 It seems this includes some changes to DD1 fix-ups as well. correct Signed-off-by: Christophe Lombard > --- drivers/misc/cxl/cxl.h | 2 ++ drivers/misc/cxl/pci.c | 57 +++--- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index b1afecc..0167df8 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -100,6 +100,8 @@ static const cxl_p1_reg_t CXL_XSL_FEC = {0x0158}; static const cxl_p1_reg_t CXL_XSL_DSNCTL= {0x0168}; /* PSL registers - CAIA 2 */ static const cxl_p1_reg_t CXL_PSL9_CONTROL = {0x0020}; +static const cxl_p1_reg_t CXL_XSL9_INV = {0x0110}; +static const cxl_p1_reg_t CXL_XSL9_DEF = {0x0140}; static const cxl_p1_reg_t CXL_XSL9_DSNCTL = {0x0168}; static const cxl_p1_reg_t CXL_PSL9_FIR1 = {0x0300}; static const cxl_p1_reg_t CXL_PSL9_FIR2 = {0x0308}; diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index d18b3d9..a981c65 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -475,37 +475,52 @@ static int init_implementation_adapter_regs_psl9(struct cxl *adapter, psl_fircntl |= 0x1ULL; /* ce_thresh */ cxl_p1_write(adapter, CXL_PSL9_FIR_CNTL, psl_fircntl); -/* vccredits=0x1 pcklat=0x4 */ -cxl_p1_write(adapter, CXL_PSL9_DSNDCTL, 0x1810ULL); - -/* - * For debugging with trace arrays. - * Configure RX trace 0 segmented mode. - * Configure CT trace 0 segmented mode. - * Configure LA0 trace 0 segmented mode. - * Configure LA1 trace 0 segmented mode. 
+/* Setup the PSL to transmit packets on the PCIe before the + * CAPP is enabled */ -cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x804080008000ULL); -cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x804080008003ULL); -cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x804080008005ULL); -cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x804080008006ULL); +cxl_p1_write(adapter, CXL_PSL9_DSNDCTL, 0x000100102A10ULL); + +/* For debugging with trace arrays */ +/* Configure RX trace 0 segmented mode */ +cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8200ULL); +/* Configure RX trace 1 segmented mode */ +cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0xAA01ULL); +/* Configure CT trace 0 segmented mode */ +cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0xA2B80003ULL); +/* Configure LA0 trace 0 segmented mode */ +cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x83FFC005ULL); +/* Configure JM0 trace 0 segmented mode */ +cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8207ULL); +/* Configure DMA trace 0 segmented mode */ +cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8208ULL); +/* Configure DMA trace 1 segmented mode */ +cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8209ULL); /* * A response to an ASB_Notify request is returned by the * system as an MMIO write to the address defined in * the PSL_TNR_ADDR register */ -/* PSL_TNR_ADDR */ +/* keep the Reset Value: 0x0002E000 */ I was confused by this comment for a while - maybe keep PSL_TNR_ADDR at the beginning of the comment, it's not completely clear from the previous block alone. okay, will do. + +/* Enable XSL rty limit */ +cxl_p1_write(adapter, CXL_XSL9_DEF, 0x51F80005ULL); -/* NORST */ -cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0x8000ULL); +/* Change XSL_INV dummy readtheshold */ read threshold? 
+cxl_p1_write(adapter, CXL_XSL9_INV, 0x040007FFC200ULL); -/* allocate the apc machines */ -cxl_p1_write(adapter, CXL_PSL9_APCDEDTYPE, 0x4003ULL); +if (phb_index == 3) { +/* disable machines 31-47 and 20-27 for DMA */ +cxl_p1_write(adapter, CXL_PSL9_APCDEDTYPE, 0x4FF3ULL); +} + +/* Snoop machines */ +cxl_p1_write(adapter, CXL_PSL9_APCDEDALLOC, 0x800F0002ULL); -/* Disable vc dd1 fix */ -if (cxl_is_power9_dd1()) -cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0401ULL); +if (cxl_is_power9_dd1()) { +/* Disabling deadlock counter CAR */ +cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0021ULL); +} return 0; }
Re: [PATCH] cxl: Add support for POWER9 DD2
Le 24/08/2017 à 09:09, Vaibhav Jain a écrit : Hi Christophe, Christophe Lombard writes: + /* For debugging with trace arrays */ + /* Configure RX trace 0 segmented mode */ + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8200ULL); + /* Configure RX trace 1 segmented mode */ + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0xAA01ULL); + /* Configure CT trace 0 segmented mode */ + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0xA2B80003ULL); + /* Configure LA0 trace 0 segmented mode */ + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x83FFC005ULL); + /* Configure JM0 trace 0 segmented mode */ + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8207ULL); + /* Configure DMA trace 0 segmented mode */ + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8208ULL); + /* Configure DMA trace 1 segmented mode */ + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8209ULL); Please wrap this block that configures the trace arrays in #ifdef DEBUG. Or it will be better if we remove it from here as the register is already accessible via debugfs. --- Vaibhav Jain Linux Technology Center, IBM India Pvt. Ltd. okay, I will remove this block Thanks
Re: [PATCH] cxl: Add support for POWER9 DD2
Hi Christophe, Christophe Lombard writes: > + /* For debugging with trace arrays */ > + /* Configure RX trace 0 segmented mode */ > + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8200ULL); > + /* Configure RX trace 1 segmented mode */ > + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0xAA01ULL); > + /* Configure CT trace 0 segmented mode */ > + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0xA2B80003ULL); > + /* Configure LA0 trace 0 segmented mode */ > + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x83FFC005ULL); > + /* Configure JM0 trace 0 segmented mode */ > + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8207ULL); > + /* Configure DMA trace 0 segmented mode */ > + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8208ULL); > + /* Configure DMA trace 1 segmented mode */ > + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8209ULL); Please wrap this block that configures the trace arrays in #ifdef DEBUG. Or it will be better if we remove it from here as the register is already accessible via debugfs. --- Vaibhav Jain Linux Technology Center, IBM India Pvt. Ltd.
RE: [PATCH] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce()
>From: Paul Mackerras [mailto:pau...@ozlabs.org] Thursday, August 24, 2017 >11:40 AM > >Nixiaoming pointed out that there is a memory leak in >kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd() fails; the >memory allocated for the kvmppc_spapr_tce_table struct is not freed, and nor >are the pages allocated for the iommu tables. In addition, we have already >incremented the process's count of locked memory pages, and this doesn't get >restored on error. > >David Hildenbrand pointed out that there is a race in that the function checks >early on that there is not already an entry in the >stt->iommu_tables list with the same LIOBN, but an entry with the >same LIOBN could get added between then and when the new entry is added to the >list. > >This fixes all three problems. To simplify things, we now call >anon_inode_getfd() before placing the new entry in the list. The check for an >existing entry is done while holding the kvm->lock mutex, immediately before >adding the new entry to the list. >Finally, on failure we now call kvmppc_account_memlimit to decrement the >process's count of locked memory pages. 
> >Reported-by: Nixiaoming >Reported-by: David Hildenbrand >Signed-off-by: Paul Mackerras >--- > arch/powerpc/kvm/book3s_64_vio.c | 55 > 1 file changed, 33 insertions(+), 22 deletions(-) > >diff --git a/arch/powerpc/kvm/book3s_64_vio.c >b/arch/powerpc/kvm/book3s_64_vio.c >index a160c14304eb..d463c1cd0d8d 100644 >--- a/arch/powerpc/kvm/book3s_64_vio.c >+++ b/arch/powerpc/kvm/book3s_64_vio.c >@@ -297,29 +297,22 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, > unsigned long npages, size; > int ret = -ENOMEM; > int i; >+ int fd = -1; > > if (!args->size) > return -EINVAL; > >- /* Check this LIOBN hasn't been previously allocated */ >- list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { >- if (stt->liobn == args->liobn) >- return -EBUSY; >- } >- > size = _ALIGN_UP(args->size, PAGE_SIZE >> 3); > npages = kvmppc_tce_pages(size); > ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); >- if (ret) { >- stt = NULL; >- goto fail; >- } >+ if (ret) >+ return ret; > > ret = -ENOMEM; > stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), > GFP_KERNEL); > if (!stt) >- goto fail; >+ goto fail_acct; > > stt->liobn = args->liobn; > stt->page_shift = args->page_shift; >@@ -334,24 +327,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, > goto fail; > } > >- kvm_get_kvm(kvm); >+ ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, >+ stt, O_RDWR | O_CLOEXEC); >+ if (ret < 0) >+ goto fail; > > mutex_lock(&kvm->lock); >- list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); >+ >+ /* Check this LIOBN hasn't been previously allocated */ >+ ret = 0; >+ list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { I think stt can not be used here need a new value for list_for_each_entry >+ if (stt->liobn == args->liobn) { >+ ret = -EBUSY; >+ break; >+ } >+ } >+ >+ if (!ret) { >+ list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); >+ kvm_get_kvm(kvm); >+ } > > mutex_unlock(&kvm->lock); > >- return anon_inode_getfd("kvm-spapr-tce", 
&kvm_spapr_tce_fops, >- stt, O_RDWR | O_CLOEXEC); >+ if (!ret) >+ return fd; > >-fail: >- if (stt) { >- for (i = 0; i < npages; i++) >- if (stt->pages[i]) >- __free_page(stt->pages[i]); >+ put_unused_fd(fd); > >- kfree(stt); >- } >+ fail: >+ for (i = 0; i < npages; i++) >+ if (stt->pages[i]) >+ __free_page(stt->pages[i]); >+ >+ kfree(stt); >+ fail_acct: >+ kvmppc_account_memlimit(kvmppc_stt_pages(npages), false); > return ret; > } > >-- >2.11.0 Thanks