[PATCH v2 18/18] powerpc/vas: Add support for user receive window

2017-10-06 Thread Sukadev Bhattiprolu
Add support for user space receive window (for the Fast thread-wakeup
coprocessor type)

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 59 +
 1 file changed, 52 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 1d08b64..99642ec 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -16,7 +16,8 @@
 #include 
 #include 
 #include 
-
+#include <asm/switch_to.h>
+#include <asm/ppc-opcode.h>
 #include "vas.h"
 #include "copy-paste.h"
 
@@ -602,6 +603,32 @@ static void put_rx_win(struct vas_window *rxwin)
 }
 
 /*
+ * Find the user space receive window given the @pswid.
+ *  - We must have a valid vasid and it must belong to this instance.
+ *(so both send and receive windows are on the same VAS instance)
+ *  - The window must refer to an OPEN, FTW, RECEIVE window.
+ *
+ * NOTE: We access ->windows[] table and assume that vinst->mutex is held.
+ */
+static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid)
+{
+   int vasid, winid;
+   struct vas_window *rxwin;
+
+   decode_pswid(pswid, &vasid, &winid);
+
+   if (vinst->vas_id != vasid)
+   return ERR_PTR(-EINVAL);
+
+   rxwin = vinst->windows[winid];
+
+   if (!rxwin || rxwin->tx_win || rxwin->cop != VAS_COP_TYPE_FTW)
+   return ERR_PTR(-EINVAL);
+
+   return rxwin;
+}
+
+/*
  * Get the VAS receive window associated with NX engine identified
  * by @cop and if applicable, @pswid.
  *
@@ -614,10 +641,10 @@ static struct vas_window *get_vinst_rxwin(struct 
vas_instance *vinst,
 
	mutex_lock(&vinst->mutex);
 
-   if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI)
-   rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL);
+   if (cop == VAS_COP_TYPE_FTW)
+   rxwin = get_user_rxwin(vinst, pswid);
else
-   rxwin = ERR_PTR(-EINVAL);
+   rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL);
 
if (!IS_ERR(rxwin))
		atomic_inc(&rxwin->num_txwins);
@@ -941,10 +968,9 @@ static void init_winctx_for_txwin(struct vas_window *txwin,
winctx->tx_word_mode = txattr->tx_win_ord_mode;
winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count;
 
-   if (winctx->nx_win) {
+   winctx->intr_disable = true;
+   if (winctx->nx_win)
winctx->data_stamp = true;
-   winctx->intr_disable = true;
-   }
 
winctx->lpid = txattr->lpid;
winctx->pidr = txattr->pidr;
@@ -989,6 +1015,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum 
vas_cop_type cop,
if (!tx_win_args_valid(cop, attr))
return ERR_PTR(-EINVAL);
 
+   /*
+* If caller did not specify a vasid but specified the PSWID of a
+* receive window (applicable only to FTW windows), use the vasid
+* from that receive window.
+*/
+   if (vasid == -1 && attr->pswid)
+   decode_pswid(attr->pswid, &vasid, NULL);
+
vinst = find_vas_instance(vasid);
if (!vinst) {
pr_devel("vasid %d not found!\n", vasid);
@@ -1037,6 +1071,17 @@ struct vas_window *vas_tx_win_open(int vasid, enum 
vas_cop_type cop,
 
set_vinst_win(vinst, txwin);
 
+   set_thread_used_vas();
+
+   /*
+* Even a process that has no foreign real address mapping can use
+* an unpaired COPY instruction (to no real effect). Issue CP_ABORT
+* to clear any pending COPY and prevent a covert channel.
+*
+* __switch_to() will issue CP_ABORT on future context switches.
+*/
+   asm volatile(PPC_CP_ABORT);
+
return txwin;
 
 free_window:
-- 
2.7.4



[PATCH v2 17/18] powerpc/vas: Define vas_win_id()

2017-10-06 Thread Sukadev Bhattiprolu
Define an interface to return a system-wide unique id for a given VAS
window.

The vas_win_id() will be used in a follow-on patch to generate an unique
handle for a user space receive window. Applications can use this handle
to pair send and receive windows for fast thread-wakeup.

The hardware refers to this system-wide unique id as a Partition Send
Window ID which is expected to be used during fault handling. Hence the
"pswid" in the function names.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/include/asm/vas.h  |  5 +
 arch/powerpc/platforms/powernv/vas-window.c |  9 +
 arch/powerpc/platforms/powernv/vas.h| 28 
 3 files changed, 42 insertions(+)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index f98ade8..7714562 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -168,6 +168,11 @@ int vas_copy_crb(void *crb, int offset);
 int vas_paste_crb(struct vas_window *win, int offset, bool re);
 
 /*
+ * Return a system-wide unique id for the VAS window @win.
+ */
+extern u32 vas_win_id(struct vas_window *win);
+
+/*
  * Return the power bus paste address associated with @win so the caller
  * can map that address into their address space.
  */
diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index e4a9c7b..1d08b64 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1239,3 +1239,12 @@ int vas_win_close(struct vas_window *window)
return 0;
 }
 EXPORT_SYMBOL_GPL(vas_win_close);
+
+/*
+ * Return a system-wide unique window id for the window @win.
+ */
+u32 vas_win_id(struct vas_window *win)
+{
+   return encode_pswid(win->vinst->vas_id, win->winid);
+}
+EXPORT_SYMBOL_GPL(vas_win_id);
diff --git a/arch/powerpc/platforms/powernv/vas.h 
b/arch/powerpc/platforms/powernv/vas.h
index 145749a..78a8926 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -447,4 +447,32 @@ static inline u64 read_hvwc_reg(struct vas_window *win,
return in_be64(win->hvwc_map+reg);
 }
 
+/*
+ * Encode/decode the Partition Send Window ID (PSWID) for a window in
+ * a way that we can uniquely identify any window in the system. i.e.
+ * we should be able to locate the 'struct vas_window' given the PSWID.
+ *
+ * BitsUsage
+ * 0:7 VAS id (8 bits)
+ * 8:15    Unused, 0 (8 bits)
+ * 16:31   Window id (16 bits)
+ */
+static inline u32 encode_pswid(int vasid, int winid)
+{
+   u32 pswid = 0;
+
+   pswid |= vasid << (31 - 7);
+   pswid |= winid;
+
+   return pswid;
+}
+
+static inline void decode_pswid(u32 pswid, int *vasid, int *winid)
+{
+   if (vasid)
+   *vasid = pswid >> (31 - 7) & 0xFF;
+
+   if (winid)
+   *winid = pswid & 0xFFFF;
+}
 #endif /* _VAS_H */
-- 
2.7.4



[PATCH v2 16/18] powerpc/vas: Define vas_win_paste_addr()

2017-10-06 Thread Sukadev Bhattiprolu
Define an interface that the NX drivers can use to find the physical
paste address of a send window. This interface is expected to be used
with the mmap() operation of the NX driver's device. i.e the user space
process can use driver's mmap() operation to map the send window's paste
address into their address space and then use copy and paste instructions
to submit the CRBs to the NX engine.

Note that kernel drivers will use vas_paste_crb() directly and don't need
this interface.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/include/asm/vas.h  |  7 +++
 arch/powerpc/platforms/powernv/vas-window.c | 10 ++
 2 files changed, 17 insertions(+)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index 044748f..f98ade8 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -10,6 +10,8 @@
 #ifndef _ASM_POWERPC_VAS_H
 #define _ASM_POWERPC_VAS_H
 
+struct vas_window;
+
 /*
  * Min and max FIFO sizes are based on Version 1.05 Section 3.1.4.25
  * (Local FIFO Size Register) of the VAS workbook.
@@ -165,4 +167,9 @@ int vas_copy_crb(void *crb, int offset);
  */
 int vas_paste_crb(struct vas_window *win, int offset, bool re);
 
+/*
+ * Return the power bus paste address associated with @win so the caller
+ * can map that address into their address space.
+ */
+extern u64 vas_win_paste_addr(struct vas_window *win);
 #endif /* __ASM_POWERPC_VAS_H */
diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 088ce56..e4a9c7b 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -40,6 +40,16 @@ static void compute_paste_address(struct vas_window *window, 
u64 *addr, int *len
pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr);
 }
 
+u64 vas_win_paste_addr(struct vas_window *win)
+{
+   u64 addr;
+
+   compute_paste_address(win, &addr, NULL);
+
+   return addr;
+}
+EXPORT_SYMBOL(vas_win_paste_addr);
+
 static inline void get_hvwc_mmio_bar(struct vas_window *window,
u64 *start, int *len)
 {
-- 
2.7.4



[PATCH v2 15/18] powerpc: Emulate paste instruction

2017-10-06 Thread Sukadev Bhattiprolu
From: Michael Neuling 

On POWER9 DD2.1 and below there are issues when the paste instruction
generates an error. If an error occurs when thread reconfiguration
happens (ie another thread in the core goes into/out of powersave) the
core may hang.

To avoid this a special sequence is required which stops thread
configuration so that the paste can be safely executed.

This patch assumes paste executed in userspace are trapped into the
illegal instruction exception at 0xe40.

Here we re-execute the paste instruction but with the required
sequence to ensure thread reconfiguration doesn't occur.

Signed-off-by: Michael Neuling 
Signed-off-by: Sukadev Bhattiprolu 
---

Edit by Sukadev: Use PPC_PASTE() rather than the paste instruction since
in older versions the instruction required a third parameter.
---
 arch/powerpc/include/asm/emulated_ops.h |  1 +
 arch/powerpc/include/asm/ppc-opcode.h   |  1 +
 arch/powerpc/include/asm/reg.h  |  2 ++
 arch/powerpc/kernel/traps.c | 64 +
 4 files changed, 68 insertions(+)

diff --git a/arch/powerpc/include/asm/emulated_ops.h 
b/arch/powerpc/include/asm/emulated_ops.h
index f00e10e..9247af9 100644
--- a/arch/powerpc/include/asm/emulated_ops.h
+++ b/arch/powerpc/include/asm/emulated_ops.h
@@ -55,6 +55,7 @@ extern struct ppc_emulated {
struct ppc_emulated_entry mfdscr;
struct ppc_emulated_entry mtdscr;
struct ppc_emulated_entry lq_stq;
+   struct ppc_emulated_entry paste;
 #endif
 } ppc_emulated;
 
diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index ce0930d..a55d2ef 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -229,6 +229,7 @@
 #define PPC_INST_MTTMR 0x7c0003dc
 #define PPC_INST_NOP   0x6000
 #define PPC_INST_PASTE 0x7c20070d
+#define PPC_INST_PASTE_MASK0xfc2007ff
 #define PPC_INST_POPCNTB   0x7cf4
 #define PPC_INST_POPCNTB_MASK  0xfc0007fe
 #define PPC_INST_POPCNTD   0x7c0003f4
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index f92eaf7..5cde1c4 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -468,6 +468,8 @@
 #define SPRN_DBAT7U0x23E   /* Data BAT 7 Upper Register */
 #define SPRN_PPR   0x380   /* SMT Thread status Register */
 #define SPRN_TSCR  0x399   /* Thread Switch Control Register */
+#define SPRN_TRIG1 0x371   /* WAT Trigger 1 */
+#define SPRN_TRIG2 0x372   /* WAT Trigger 2 */
 
 #define SPRN_DEC   0x016   /* Decrement Register */
 #define SPRN_DER   0x095   /* Debug Enable Register */
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 13c9dcd..7e6b1fe 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -956,6 +956,65 @@ static inline bool tm_abort_check(struct pt_regs *regs, 
int reason)
 }
 #endif
 
+static DEFINE_SPINLOCK(paste_emulation_lock);
+
+static inline int paste(void *i)
+{
+   int cr;
+   long retval = 0;
+
+   /* Need per core lock to ensure trig1/2 writes don't race */
+   spin_lock(&paste_emulation_lock);
+   mtspr(SPRN_TRIG1, 0); /* data doesn't matter */
+   mtspr(SPRN_TRIG1, 0); /* HW says do this twice */
+   asm volatile(
+   "1: " PPC_PASTE(0, %2) "\n"
+   "2: mfcr %1\n"
+   ".section .fixup,\"ax\"\n"
+   "3: li %0,%3\n"
+   "   li %2,0\n"
+   "   b 2b\n"
+   ".previous\n"
+   EX_TABLE(1b, 3b)
+   : "=r" (retval), "=r" (cr)
+   : "b" (i), "i" (-EFAULT), "0" (retval));
+   mtspr(SPRN_TRIG2, 0);
+   spin_unlock(&paste_emulation_lock);
+   return cr;
+}
+
+static int emulate_paste(struct pt_regs *regs, u32 instword)
+{
+   const void __user *addr;
+   unsigned long ea;
+   u8 ra, rb;
+
+   if (!cpu_has_feature(CPU_FTR_ARCH_300))
+   return -EINVAL;
+
+   ra = (instword >> 16) & 0x1f;
+   rb = (instword >> 11) & 0x1f;
+
+   ea = regs->gpr[rb] + (ra ? regs->gpr[ra] : 0ul);
+   if (is_32bit_task())
+   ea &= 0xFFFFFFFFul;
+   addr = (__force const void __user *)ea;
+
+   if (!access_ok(VERIFY_WRITE, addr, 128)) // cacheline size == 128
+   return -EFAULT;
+
+   hard_irq_disable(); /* FIXME: could we just soft disable ?? */
+   pagefault_disable();
+
+   PPC_WARN_EMULATED(paste, regs);
+   regs->ccr = paste((void *)addr);
+
+   pagefault_enable();
+   may_hard_irq_enable();
+
+   return 0;
+}
+
 static int emulate_instruction(struct pt_regs *regs)
 {
u32 instword;
@@ -968,6 +1027,10 @@ static int emulate_instruction(struct pt_regs *regs)
if (get_user(instword, (u32 __user 

[PATCH v2 14/18] powerpc: Define set_thread_used_vas()

2017-10-06 Thread Sukadev Bhattiprolu
A CP_ABORT instruction is required in processes that have mapped a VAS
"paste address" with the intention of using COPY/PASTE instructions.
But since CP_ABORT is expensive, we want to restrict it to only processes
that use/intend to use COPY/PASTE.

Define an interface, set_thread_used_vas(), that VAS can use to indicate
that the current process opened a send window. During context switch,
issue CP_ABORT only for processes that have the flag set.

Thanks for input from Nick Piggin, Michael Ellerman.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/include/asm/processor.h |  2 ++
 arch/powerpc/include/asm/switch_to.h |  2 ++
 arch/powerpc/kernel/process.c| 32 ++--
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index 58cc212..bdab3b74 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -341,7 +341,9 @@ struct thread_struct {
unsigned long   sier;
unsigned long   mmcr2;
unsignedmmcr0;
+
unsignedused_ebb;
+   unsigned intused_vas;
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/switch_to.h 
b/arch/powerpc/include/asm/switch_to.h
index f5da32f..aeb305b 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -91,6 +91,8 @@ static inline void clear_task_ebb(struct task_struct *t)
 #endif
 }
 
+extern int set_thread_used_vas(void);
+
 extern int set_thread_tidr(struct task_struct *t);
 extern void clear_thread_tidr(struct task_struct *t);
 
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index d861fcd..cb5f108 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1234,17 +1234,17 @@ struct task_struct *__switch_to(struct task_struct 
*prev,
 * The copy-paste buffer can only store into foreign real
 * addresses, so unprivileged processes can not see the
 * data or use it in any way unless they have foreign real
-* mappings. We don't have a VAS driver that allocates those
-* yet, so no cpabort is required.
+* mappings. If the new process has the foreign real address
+* mappings, we must issue a cp_abort to clear any state and
+* prevent a covert channel being setup.
+*
+* DD1 allows paste into normal system memory so we do an
+* unpaired copy, rather than cp_abort, to clear the buffer,
+* since cp_abort is quite expensive.
 */
-   if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
-   /*
-* DD1 allows paste into normal system memory, so we
-* do an unpaired copy here to clear the buffer and
-* prevent a covert channel being set up.
-*
-* cpabort is not used because it is quite expensive.
-*/
+   if (new_thread->used_vas) {
+   asm volatile(PPC_CP_ABORT);
+   } else if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
asm volatile(PPC_COPY(%0, %1)
: : "r"(dummy_copy_buffer), "r"(0));
}
@@ -1445,6 +1445,18 @@ void flush_thread(void)
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
 }
 
+int set_thread_used_vas(void)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (!cpu_has_feature(CPU_FTR_ARCH_300))
+   return -EINVAL;
+
+   current->thread.used_vas = 1;
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+   return 0;
+}
+
 #ifdef CONFIG_PPC64
 static DEFINE_SPINLOCK(vas_thread_id_lock);
 static DEFINE_IDA(vas_thread_ida);
-- 
2.7.4



[PATCH v2 13/18] powerpc: Add support for setting SPRN_TIDR

2017-10-06 Thread Sukadev Bhattiprolu
We need the SPRN_TIDR to be set for use with fast thread-wakeup (core-
to-core wakeup) and also with CAPI.

Each thread in a process needs to have a unique id within the process.
But as explained below, for now, we assign globally unique thread ids
to all threads in the system.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Philippe Bergheaud 
Signed-off-by: Christophe Lombard 
---
Changelog[v3]
- Merge changes with and address comments to Christophe's patch.
  (i.e drop CONFIG_PPC_VAS; use CONFIG_PPC64; check CPU_ARCH_300
  before setting TIDR). Defer following to separate patches:
- emulation parts of Christophe's patch,
- setting TIDR for tasks other than 'current'
- setting feature bit in AT_HWCAP2

Changelog[v2]
- Michael Ellerman: Use an interface to assign TIDR so it is
assigned to only threads that need it; move assignment to
restore_sprs(). Drop lint from rebase;
---
 arch/powerpc/include/asm/processor.h |   1 +
 arch/powerpc/include/asm/switch_to.h |   3 +
 arch/powerpc/kernel/process.c| 122 +++
 3 files changed, 126 insertions(+)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index fab7ff8..58cc212 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -329,6 +329,7 @@ struct thread_struct {
 */
int dscr_inherit;
unsigned long   ppr;/* used to save/restore SMT priority */
+   unsigned long   tidr;
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
unsigned long   tar;
diff --git a/arch/powerpc/include/asm/switch_to.h 
b/arch/powerpc/include/asm/switch_to.h
index 17c8380..f5da32f 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -91,4 +91,7 @@ static inline void clear_task_ebb(struct task_struct *t)
 #endif
 }
 
+extern int set_thread_tidr(struct task_struct *t);
+extern void clear_thread_tidr(struct task_struct *t);
+
 #endif /* _ASM_POWERPC_SWITCH_TO_H */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 37ed60b..d861fcd 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1120,6 +1120,13 @@ static inline void restore_sprs(struct thread_struct 
*old_thread,
mtspr(SPRN_TAR, new_thread->tar);
}
 #endif
+#ifdef CONFIG_PPC64
+   if (old_thread->tidr != new_thread->tidr) {
+   /* TIDR should be non-zero only with ISA3.0. */
+   WARN_ON_ONCE(!cpu_has_feature(CPU_FTR_ARCH_300));
+   mtspr(SPRN_TIDR, new_thread->tidr);
+   }
+#endif
 }
 
 #ifdef CONFIG_PPC_BOOK3S_64
@@ -1438,9 +1445,117 @@ void flush_thread(void)
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
 }
 
+#ifdef CONFIG_PPC64
+static DEFINE_SPINLOCK(vas_thread_id_lock);
+static DEFINE_IDA(vas_thread_ida);
+
+/*
+ * We need to assign a unique thread id to each thread in a process.
+ *
+ * This thread id, referred to as TIDR, and separate from the Linux's tgid,
+ * is intended to be used to direct an ASB_Notify from the hardware to the
+ * thread, when a suitable event occurs in the system.
+ *
+ * One such event is a "paste" instruction in the context of Fast Thread
+ * Wakeup (aka Core-to-core wake up in the Virtual Accelerator Switchboard
+ * (VAS) in POWER9.
+ *
+ * To get a unique TIDR per process we could simply reuse task_pid_nr() but
+ * the problem is that task_pid_nr() is not yet available copy_thread() is
+ * called. Fixing that would require changing more intrusive arch-neutral
+ * code in code path in copy_process()?.
+ *
+ * Further, to assign unique TIDRs within each process, we need an atomic
+ * field (or an IDR) in task_struct, which again intrudes into the arch-
+ * neutral code. So try to assign globally unique TIDRs for now.
+ *
+ * NOTE: TIDR 0 indicates that the thread does not need a TIDR value.
+ *  For now, only threads that expect to be notified by the VAS
+ *  hardware need a TIDR value and we assign values > 0 for those.
+ */
+#define MAX_THREAD_CONTEXT ((1 << 16) - 1)
+static int assign_thread_tidr(void)
+{
+   int index;
+   int err;
+
+again:
+   if (!ida_pre_get(&vas_thread_ida, GFP_KERNEL))
+   return -ENOMEM;
+
+   spin_lock(&vas_thread_id_lock);
+   err = ida_get_new_above(&vas_thread_ida, 1, &index);
+   spin_unlock(&vas_thread_id_lock);
+
+   if (err == -EAGAIN)
+   goto again;
+   else if (err)
+   return err;
+
+   if (index > MAX_THREAD_CONTEXT) {
+   spin_lock(&vas_thread_id_lock);
+   ida_remove(&vas_thread_ida, index);
+   spin_unlock(&vas_thread_id_lock);
+   return -ENOMEM;
+   }
+
+   return index;
+}
+
+static void free_thread_tidr(int id)
+{
+   

[PATCH v2 12/18] powerpc: have copy depend on CONFIG_BOOK3S_64

2017-10-06 Thread Sukadev Bhattiprolu
Have the COPY/PASTE instructions depend on CONFIG_BOOK3S_64 rather than
CONFIG_PPC_STD_MMU_64.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/kernel/process.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index a0c74bb..37ed60b 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1215,10 +1215,14 @@ struct task_struct *__switch_to(struct task_struct 
*prev,
	batch = this_cpu_ptr(&ppc64_tlb_batch);
batch->active = 1;
}
+#endif /* CONFIG_PPC_STD_MMU_64 */
 
if (current_thread_info()->task->thread.regs) {
+#ifdef CONFIG_PPC_STD_MMU_64
restore_math(current_thread_info()->task->thread.regs);
+#endif /* CONFIG_PPC_STD_MMU_64 */
 
+#ifdef CONFIG_PPC_BOOK3S_64
/*
 * The copy-paste buffer can only store into foreign real
 * addresses, so unprivileged processes can not see the
@@ -1237,8 +1241,8 @@ struct task_struct *__switch_to(struct task_struct *prev,
asm volatile(PPC_COPY(%0, %1)
: : "r"(dummy_copy_buffer), "r"(0));
}
+#endif /* CONFIG_PPC_BOOK3S_64 */
}
-#endif /* CONFIG_PPC_STD_MMU_64 */
 
return last;
 }
-- 
2.7.4



[PATCH v2 11/18] powerpc/vas: Export HVWC to debugfs

2017-10-06 Thread Sukadev Bhattiprolu
Export the VAS Window context information to debugfs.

We need to hold a mutex when closing the window to prevent a race
with the debugfs read(). Rather than introduce a per-instance mutex,
we use the global vas_mutex for now, since it is not heavily contended.

The window->cop field is only relevant to a receive window so we were
not setting it for a send window (which is is paired to a receive window
anyway). But to simplify reporting in debugfs, set the 'cop' field for the
send window also.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/Makefile |   3 +-
 arch/powerpc/platforms/powernv/vas-debug.c  | 209 
 arch/powerpc/platforms/powernv/vas-window.c |  34 -
 arch/powerpc/platforms/powernv/vas.c|   6 +-
 arch/powerpc/platforms/powernv/vas.h|  14 ++
 5 files changed, 259 insertions(+), 7 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/vas-debug.c

diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index 37d60f7..17921c4 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -14,4 +14,5 @@ obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o
 obj-$(CONFIG_OPAL_PRD) += opal-prd.o
 obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
 obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
-obj-$(CONFIG_PPC_VAS)  += vas.o vas-window.o
+obj-$(CONFIG_PPC_VAS)  += vas.o vas-window.o vas-debug.o
+obj-$(CONFIG_PPC_FTW)  += nx-ftw.o
diff --git a/arch/powerpc/platforms/powernv/vas-debug.c 
b/arch/powerpc/platforms/powernv/vas-debug.c
new file mode 100644
index 000..ca22f1e
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-debug.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2016-17 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include "vas.h"
+
+static struct dentry *vas_debugfs;
+
+static char *cop_to_str(int cop)
+{
+   switch (cop) {
+   case VAS_COP_TYPE_FAULT:return "Fault";
+   case VAS_COP_TYPE_842:  return "NX-842 Normal Priority";
+   case VAS_COP_TYPE_842_HIPRI:return "NX-842 High Priority";
+   case VAS_COP_TYPE_GZIP: return "NX-GZIP Normal Priority";
+   case VAS_COP_TYPE_GZIP_HIPRI:   return "NX-GZIP High Priority";
+   case VAS_COP_TYPE_FTW:  return "Fast Thread-wakeup";
+   default:return "Unknown";
+   }
+}
+
+static int info_dbg_show(struct seq_file *s, void *private)
+{
+   struct vas_window *window = s->private;
+
+   mutex_lock(&vas_mutex);
+
+   /* ensure window is not unmapped */
+   if (!window->hvwc_map)
+   goto unlock;
+
+   seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop),
+   window->tx_win ? "Send" : "Receive");
+   seq_printf(s, "Pid : %d\n", window->pid);
+
+unlock:
+   mutex_unlock(&vas_mutex);
+   return 0;
+}
+
+static int info_dbg_open(struct inode *inode, struct file *file)
+{
+   return single_open(file, info_dbg_show, inode->i_private);
+}
+
+static const struct file_operations info_fops = {
+   .open   = info_dbg_open,
+   .read   = seq_read,
+   .llseek = seq_lseek,
+   .release= single_release,
+};
+
+static inline void print_reg(struct seq_file *s, struct vas_window *win,
+   char *name, u32 reg)
+{
+   seq_printf(s, "0x%016llx %s\n", read_hvwc_reg(win, name, reg), name);
+}
+
+static int hvwc_dbg_show(struct seq_file *s, void *private)
+{
+   struct vas_window *window = s->private;
+
+   mutex_lock(&vas_mutex);
+
+   /* ensure window is not unmapped */
+   if (!window->hvwc_map)
+   goto unlock;
+
+   print_reg(s, window, VREG(LPID));
+   print_reg(s, window, VREG(PID));
+   print_reg(s, window, VREG(XLATE_MSR));
+   print_reg(s, window, VREG(XLATE_LPCR));
+   print_reg(s, window, VREG(XLATE_CTL));
+   print_reg(s, window, VREG(AMR));
+   print_reg(s, window, VREG(SEIDR));
+   print_reg(s, window, VREG(FAULT_TX_WIN));
+   print_reg(s, window, VREG(OSU_INTR_SRC_RA));
+   print_reg(s, window, VREG(HV_INTR_SRC_RA));
+   print_reg(s, window, VREG(PSWID));
+   print_reg(s, window, VREG(LFIFO_BAR));
+   print_reg(s, window, VREG(LDATA_STAMP_CTL));
+   print_reg(s, window, VREG(LDMA_CACHE_CTL));
+   print_reg(s, window, VREG(LRFIFO_PUSH));
+   print_reg(s, window, VREG(CURR_MSG_COUNT));
+   print_reg(s, window, VREG(LNOTIFY_AFTER_COUNT));
+   print_reg(s, window, VREG(LRX_WCRED));
+   print_reg(s, window, VREG(LRX_WCRED_ADDER));
+   

[PATCH v2 10/18] powerpc/vas, nx-842: Define and use chip_to_vas_id()

2017-10-06 Thread Sukadev Bhattiprolu
Define a helper, chip_to_vas_id() to map a given chip id to corresponding
vas id.

Normally, callers of vas_rx_win_open() and vas_tx_win_open() want the VAS
window to be on the same chip where the calling thread is executing. These
callers can pass in -1 for the VAS id.

This interface will be useful if a thread running on one chip wants to open
a window on another chip (like the NX-842 driver does during start up).

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/include/asm/vas.h   |  9 +
 arch/powerpc/platforms/powernv/vas.c | 11 +++
 drivers/crypto/nx/nx-842-powernv.c   | 18 +++---
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index fd5963a..044748f 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -104,6 +104,15 @@ struct vas_tx_win_attr {
 };
 
 /*
+ * Helper to map a chip id to VAS id.
+ * For POWER9, this is a 1:1 mapping. In the future this maybe a 1:N
+ * mapping in which case, we will need to update this helper.
+ *
+ * Return the VAS id or -1 if no matching vasid is found.
+ */
+int chip_to_vas_id(int chipid);
+
+/*
  * Helper to initialize receive window attributes to defaults for an
  * NX window.
  */
diff --git a/arch/powerpc/platforms/powernv/vas.c 
b/arch/powerpc/platforms/powernv/vas.c
index abb7090..cd9a733 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -123,6 +123,17 @@ struct vas_instance *find_vas_instance(int vasid)
return NULL;
 }
 
+int chip_to_vas_id(int chipid)
+{
+   int cpu;
+
+   for_each_possible_cpu(cpu) {
+   if (cpu_to_chip_id(cpu) == chipid)
+   return per_cpu(cpu_vas_id, cpu);
+   }
+   return -1;
+}
+
 static int vas_probe(struct platform_device *pdev)
 {
return init_vas_instance(pdev);
diff --git a/drivers/crypto/nx/nx-842-powernv.c 
b/drivers/crypto/nx/nx-842-powernv.c
index 874ddf5..eb221ed 100644
--- a/drivers/crypto/nx/nx-842-powernv.c
+++ b/drivers/crypto/nx/nx-842-powernv.c
@@ -847,24 +847,12 @@ static int __init nx842_powernv_probe_vas(struct 
device_node *pn)
return -EINVAL;
}
 
-   for_each_compatible_node(dn, NULL, "ibm,power9-vas-x") {
-   if (of_get_ibm_chip_id(dn) == chip_id)
-   break;
-   }
-
-   if (!dn) {
-   pr_err("Missing VAS device node\n");
+   vasid = chip_to_vas_id(chip_id);
+   if (vasid < 0) {
+   pr_err("Unable to map chip_id %d to vasid\n", chip_id);
return -EINVAL;
}
 
-   if (of_property_read_u32(dn, "ibm,vas-id", &vasid)) {
-   pr_err("Missing ibm,vas-id device property\n");
-   of_node_put(dn);
-   return -EINVAL;
-   }
-
-   of_node_put(dn);
-
for_each_child_of_node(pn, dn) {
if (of_device_is_compatible(dn, "ibm,p9-nx-842")) {
ret = vas_cfg_coproc_info(dn, chip_id, vasid);
-- 
2.7.4



[PATCH v2 09/18] powerpc/vas: Create cpu to vas id mapping

2017-10-06 Thread Sukadev Bhattiprolu
Create a cpu to vasid mapping so callers can specify -1 instead of
trying to find a VAS id.

Changelog[v2]
[Michael Ellerman] Use per-cpu variables to simplify code.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/vas.c 
b/arch/powerpc/platforms/powernv/vas.c
index 565a487..abb7090 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -18,15 +18,18 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "vas.h"
 
 static DEFINE_MUTEX(vas_mutex);
 static LIST_HEAD(vas_instances);
 
+static DEFINE_PER_CPU(int, cpu_vas_id);
+
 static int init_vas_instance(struct platform_device *pdev)
 {
-   int rc, vasid;
+   int rc, cpu, vasid;
struct resource *res;
struct vas_instance *vinst;
struct device_node *dn = pdev->dev.of_node;
@@ -74,6 +77,11 @@ static int init_vas_instance(struct platform_device *pdev)
"paste_win_id_shift 0x%llx\n", pdev->name, vasid,
vinst->paste_base_addr, vinst->paste_win_id_shift);
 
+   for_each_possible_cpu(cpu) {
+   if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn))
+   per_cpu(cpu_vas_id, cpu) = vasid;
+   }
+
	mutex_lock(&vas_mutex);
	list_add(&vinst->node, &vas_instances);
	mutex_unlock(&vas_mutex);
@@ -98,6 +106,10 @@ struct vas_instance *find_vas_instance(int vasid)
struct vas_instance *vinst;
 
	mutex_lock(&vas_mutex);
+
+   if (vasid == -1)
+   vasid = per_cpu(cpu_vas_id, smp_processor_id());
+
	list_for_each(ent, &vas_instances) {
vinst = list_entry(ent, struct vas_instance, node);
if (vinst->vas_id == vasid) {
-- 
2.7.4



[PATCH v2 08/18] powerpc/vas: poll for return of window credits

2017-10-06 Thread Sukadev Bhattiprolu
Normally, the NX driver waits for the CRBs to be processed before closing
the window. But it is better to ensure that the credits are returned before
the window gets reassigned later.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 45 +
 1 file changed, 45 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index a59a187..23c13a7 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1063,6 +1063,49 @@ int vas_paste_crb(struct vas_window *txwin, int offset, 
bool re)
 EXPORT_SYMBOL_GPL(vas_paste_crb);
 
 /*
+ * If credit checking is enabled for this window, poll for the return
+ * of window credits (i.e for NX engines to process any outstanding CRBs).
+ * Since NX-842 waits for the CRBs to be processed before closing the
+ * window, we should not have to wait for too long.
+ *
+ * TODO: We retry in 10ms intervals now. We could/should probably peek at
+ * the VAS_LRFIFO_PUSH_OFFSET register to get an estimate of pending
+ * CRBs on the FIFO and compute the delay dynamically on each retry.
+ * But that is not really needed until we support NX-GZIP access from
+ * user space. (NX-842 driver waits for CSB and Fast thread-wakeup
+ * doesn't use credit checking).
+ */
+static void poll_window_credits(struct vas_window *window)
+{
+   u64 val;
+   int creds, mode;
+
+   val = read_hvwc_reg(window, VREG(WINCTL));
+   if (window->tx_win)
+   mode = GET_FIELD(VAS_WINCTL_TX_WCRED_MODE, val);
+   else
+   mode = GET_FIELD(VAS_WINCTL_RX_WCRED_MODE, val);
+
+   if (!mode)
+   return;
+retry:
+   if (window->tx_win) {
+   val = read_hvwc_reg(window, VREG(TX_WCRED));
+   creds = GET_FIELD(VAS_TX_WCRED, val);
+   } else {
+   val = read_hvwc_reg(window, VREG(LRX_WCRED));
+   creds = GET_FIELD(VAS_LRX_WCRED, val);
+   }
+
+   if (creds < window->wcreds_max) {
+   val = 0;
+   set_current_state(TASK_UNINTERRUPTIBLE);
+   schedule_timeout(msecs_to_jiffies(10));
+   goto retry;
+   }
+}
+
+/*
  * Wait for the window to go to "not-busy" state. It should only take a
  * short time to queue a CRB, so window should not be busy for too long.
  * Trying 5ms intervals.
@@ -1149,6 +1192,8 @@ int vas_win_close(struct vas_window *window)
 
unpin_close_window(window);
 
+   poll_window_credits(window);
+
poll_window_castout(window);
 
/* if send window, drop reference to matching receive window */
-- 
2.7.4



[PATCH v2 07/18] powerpc/vas: Save configured window credits

2017-10-06 Thread Sukadev Bhattiprolu
Save the configured max window credits for a window in the vas_window
structure. We will need this when polling for return of window credits.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 6 --
 arch/powerpc/platforms/powernv/vas.h| 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 1422cdd..a59a187 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -674,7 +674,7 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
 
winctx->rx_fifo = rxattr->rx_fifo;
winctx->rx_fifo_size = rxattr->rx_fifo_size;
-   winctx->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
+   winctx->wcreds_max = rxwin->wcreds_max;
winctx->pin_win = rxattr->pin_win;
 
winctx->nx_win = rxattr->nx_win;
@@ -844,6 +844,7 @@ struct vas_window *vas_rx_win_open(int vasid, enum 
vas_cop_type cop,
rxwin->nx_win = rxattr->nx_win;
rxwin->user_win = rxattr->user_win;
rxwin->cop = cop;
+   rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
if (rxattr->user_win)
rxwin->pid = task_pid_vnr(current);
 
@@ -893,7 +894,7 @@ static void init_winctx_for_txwin(struct vas_window *txwin,
 */
memset(winctx, 0, sizeof(struct vas_winctx));
 
-   winctx->wcreds_max = txattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
+   winctx->wcreds_max = txwin->wcreds_max;
 
winctx->user_win = txattr->user_win;
winctx->nx_win = txwin->rxwin->nx_win;
@@ -978,6 +979,7 @@ struct vas_window *vas_tx_win_open(int vasid, enum 
vas_cop_type cop,
txwin->nx_win = txwin->rxwin->nx_win;
txwin->pid = attr->pid;
txwin->user_win = attr->user_win;
+   txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT;
 
init_winctx_for_txwin(txwin, attr, );
 
diff --git a/arch/powerpc/platforms/powernv/vas.h 
b/arch/powerpc/platforms/powernv/vas.h
index 15d2dfa..ad906f6 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -332,6 +332,7 @@ struct vas_window {
void *hvwc_map; /* HV window context */
void *uwc_map;  /* OS/User window context */
pid_t pid;  /* Linux process id of owner */
+   int wcreds_max; /* Window credits */
 
/* Fields applicable only to send windows */
void *paste_kaddr;
-- 
2.7.4



[PATCH v2 06/18] powerpc/vas: Reduce polling interval for busy state

2017-10-06 Thread Sukadev Bhattiprolu
A VAS window is normally in "busy" state for only a short duration.
Reduce the time we wait for the window to go to "not-busy" state to
speed-up vas_win_close() a bit.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 95622a9..1422cdd 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1060,21 +1060,23 @@ int vas_paste_crb(struct vas_window *txwin, int offset, 
bool re)
 }
 EXPORT_SYMBOL_GPL(vas_paste_crb);
 
+/*
+ * Wait for the window to go to "not-busy" state. It should only take a
+ * short time to queue a CRB, so window should not be busy for too long.
+ * Trying 5ms intervals.
+ */
 static void poll_window_busy_state(struct vas_window *window)
 {
int busy;
u64 val;
 
 retry:
-   /*
-* Poll Window Busy flag
-*/
val = read_hvwc_reg(window, VREG(WIN_STATUS));
busy = GET_FIELD(VAS_WIN_BUSY, val);
if (busy) {
val = 0;
set_current_state(TASK_UNINTERRUPTIBLE);
-   schedule_timeout(HZ);
+   schedule_timeout(msecs_to_jiffies(5));
goto retry;
}
 }
-- 
2.7.4



[PATCH v2 05/18] powerpc/vas: Use helper to unpin/close window

2017-10-06 Thread Sukadev Bhattiprolu
Use a helper to have the hardware unpin and mark a window closed.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 22 +++---
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 8ab8a82..95622a9 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1101,6 +1101,20 @@ static void poll_window_castout(struct vas_window 
*window)
 }
 
 /*
+ * Unpin and close a window so no new requests are accepted and the
+ * hardware can evict this window from cache if necessary.
+ */
+static void unpin_close_window(struct vas_window *window)
+{
+   u64 val;
+
+   val = read_hvwc_reg(window, VREG(WINCTL));
+   val = SET_FIELD(VAS_WINCTL_PIN, val, 0);
+   val = SET_FIELD(VAS_WINCTL_OPEN, val, 0);
+   write_hvwc_reg(window, VREG(WINCTL), val);
+}
+
+/*
  * Close a window.
  *
  * See Section 1.12.1 of VAS workbook v1.05 for details on closing window:
@@ -1114,8 +1128,6 @@ static void poll_window_castout(struct vas_window *window)
  */
 int vas_win_close(struct vas_window *window)
 {
-   u64 val;
-
if (!window)
return 0;
 
@@ -1131,11 +1143,7 @@ int vas_win_close(struct vas_window *window)
 
poll_window_busy_state(window);
 
-   /* Unpin window from cache and close it */
-   val = read_hvwc_reg(window, VREG(WINCTL));
-   val = SET_FIELD(VAS_WINCTL_PIN, val, 0);
-   val = SET_FIELD(VAS_WINCTL_OPEN, val, 0);
-   write_hvwc_reg(window, VREG(WINCTL), val);
+   unpin_close_window(window);
 
poll_window_castout(window);
 
-- 
2.7.4



[PATCH v2 04/18] powerpc/vas: Drop poll_window_cast_out().

2017-10-06 Thread Sukadev Bhattiprolu
Polling for window cast out is listed in the spec, but it turns out that
it is not strictly necessary and slows down window close. Make it a
stub for now.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 34 ++---
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 67ffc5d..8ab8a82 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1079,25 +1079,25 @@ static void poll_window_busy_state(struct vas_window 
*window)
}
 }
 
+/*
+ * Have the hardware cast a window out of cache and wait for it to
+ * be completed.
+ *
+ * NOTE: It can take a relatively long time to cast the window context
+ * out of the cache. It is not strictly necessary to cast out if:
+ *
+ * - we clear the "Pin Window" bit (so hardware is free to evict)
+ *
+ * - we re-initialize the window context when it is reassigned.
+ *
+ * We do the former in vas_win_close() and latter in vas_win_open().
+ * So, ignoring the cast-out for now. We can add it as needed. If
+ * casting out becomes necessary we should consider offloading the
+ * job to a worker thread, so the window close can proceed quickly.
+ */
 static void poll_window_castout(struct vas_window *window)
 {
-   int cached;
-   u64 val;
-
-   /* Cast window context out of the cache */
-retry:
-   val = read_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL));
-   cached = GET_FIELD(VAS_WIN_CACHE_STATUS, val);
-   if (cached) {
-   val = 0ULL;
-   val = SET_FIELD(VAS_CASTOUT_REQ, val, 1);
-   val = SET_FIELD(VAS_PUSH_TO_MEM, val, 0);
-   write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val);
-
-   set_current_state(TASK_UNINTERRUPTIBLE);
-   schedule_timeout(HZ);
-   goto retry;
-   }
+   /* stub for now */
 }
 
 /*
-- 
2.7.4



[PATCH v2 03/18] powerpc/vas: Cleanup some debug code

2017-10-06 Thread Sukadev Bhattiprolu
Clean up vas.h and the debug code that was guarded by "#ifdef vas_debug".

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c |  8 +++--
 arch/powerpc/platforms/powernv/vas.h| 56 +++--
 2 files changed, 18 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index a2fe120..67ffc5d 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -726,7 +726,10 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
 static bool rx_win_args_valid(enum vas_cop_type cop,
struct vas_rx_win_attr *attr)
 {
-   dump_rx_win_attr(attr);
+   pr_debug("Rxattr: fault %d, notify %d, intr %d, early %d, fifo %d\n",
+   attr->fault_win, attr->notify_disable,
+   attr->intr_disable, attr->notify_early,
+   attr->rx_fifo_size);
 
if (cop >= VAS_COP_TYPE_MAX)
return false;
@@ -1050,7 +1053,8 @@ int vas_paste_crb(struct vas_window *txwin, int offset, 
bool re)
else
rc = -EINVAL;
 
-   print_fifo_msg_count(txwin);
+   pr_debug("Txwin #%d: Msg count %llu\n", txwin->winid,
+   read_hvwc_reg(txwin, VREG(LRFIFO_PUSH)));
 
return rc;
 }
diff --git a/arch/powerpc/platforms/powernv/vas.h 
b/arch/powerpc/platforms/powernv/vas.h
index fea0de4..15d2dfa 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -259,6 +259,16 @@
 #define VAS_NX_UTIL_ADDER  PPC_BITMASK(32, 63)
 
 /*
+ * VREG(x):
+ * Expand a register's short name (eg: LPID) into two parameters:
+ * - the register's short name in string form ("LPID"), and
+ * - the name of the macro (eg: VAS_LPID_OFFSET), defining the
+ *   register's offset in the window context
+ */
+#define VREG_SFX(n, s) __stringify(n), VAS_##n##s
+#define VREG(r)VREG_SFX(r, _OFFSET)
+
+/*
  * Local Notify Scope Control Register. (Receive windows only).
  */
 enum vas_notify_scope {
@@ -385,43 +395,15 @@ struct vas_winctx {
 
 extern struct vas_instance *find_vas_instance(int vasid);
 
-/*
- * VREG(x):
- * Expand a register's short name (eg: LPID) into two parameters:
- * - the register's short name in string form ("LPID"), and
- * - the name of the macro (eg: VAS_LPID_OFFSET), defining the
- *   register's offset in the window context
- */
-#define VREG_SFX(n, s) __stringify(n), VAS_##n##s
-#define VREG(r)VREG_SFX(r, _OFFSET)
-
-#ifdef vas_debug
-static inline void dump_rx_win_attr(struct vas_rx_win_attr *attr)
-{
-   pr_err("fault %d, notify %d, intr %d early %d\n",
-   attr->fault_win, attr->notify_disable,
-   attr->intr_disable, attr->notify_early);
-
-   pr_err("rx_fifo_size %d, max value %d\n",
-   attr->rx_fifo_size, VAS_RX_FIFO_SIZE_MAX);
-}
-
 static inline void vas_log_write(struct vas_window *win, char *name,
void *regptr, u64 val)
 {
-   if (val)
-   pr_err("%swin #%d: %s reg %p, val 0x%016llx\n",
+   if (!val)
+   pr_debug("%swin #%d: %s reg %p, val 0x%016llx\n",
win->tx_win ? "Tx" : "Rx", win->winid, name,
regptr, val);
 }
 
-#else  /* vas_debug */
-
-#define vas_log_write(win, name, reg, val)
-#define dump_rx_win_attr(attr)
-
-#endif /* vas_debug */
-
 static inline void write_uwc_reg(struct vas_window *win, char *name,
s32 reg, u64 val)
 {
@@ -450,18 +432,4 @@ static inline u64 read_hvwc_reg(struct vas_window *win,
return in_be64(win->hvwc_map+reg);
 }
 
-#ifdef vas_debug
-
-static void print_fifo_msg_count(struct vas_window *txwin)
-{
-   uint64_t read_hvwc_reg(struct vas_window *w, char *n, uint64_t o);
-   pr_devel("Winid %d, Msg count %llu\n", txwin->winid,
-   (uint64_t)read_hvwc_reg(txwin, VREG(LRFIFO_PUSH)));
-}
-#else  /* vas_debug */
-
-#define print_fifo_msg_count(window)
-
-#endif /* vas_debug */
-
 #endif /* _VAS_H */
-- 
2.7.4



[PATCH v2 02/18] powerpc/vas: Validate window credits

2017-10-06 Thread Sukadev Bhattiprolu
NX-842, the only user of VAS, sets the window credits to default values
but VAS should check the credits against the possible max values.

The VAS_WCREDS_MIN is not needed and can be dropped.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 6 ++
 arch/powerpc/platforms/powernv/vas.h| 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index cec7ab7..a2fe120 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -738,6 +738,9 @@ static bool rx_win_args_valid(enum vas_cop_type cop,
if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX)
return false;
 
+   if (attr->wcreds_max > VAS_RX_WCREDS_MAX)
+   return false;
+
if (attr->nx_win) {
/* cannot be fault or user window if it is nx */
if (attr->fault_win || attr->user_win)
@@ -927,6 +930,9 @@ static bool tx_win_args_valid(enum vas_cop_type cop,
if (cop > VAS_COP_TYPE_MAX)
return false;
 
+   if (attr->wcreds_max > VAS_TX_WCREDS_MAX)
+   return false;
+
if (attr->user_win &&
(cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count))
return false;
diff --git a/arch/powerpc/platforms/powernv/vas.h 
b/arch/powerpc/platforms/powernv/vas.h
index 38dee5d..fea0de4 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -106,8 +106,8 @@
  *
  * TODO: Needs tuning for per-process credits
  */
-#define VAS_WCREDS_MIN 16
-#define VAS_WCREDS_MAX ((64 << 10) - 1)
+#define VAS_RX_WCREDS_MAX  ((64 << 10) - 1)
+#define VAS_TX_WCREDS_MAX  ((4 << 10) - 1)
 #define VAS_WCREDS_DEFAULT (1 << 10)
 
 /*
-- 
2.7.4



[PATCH v2 01/18] powerpc/vas: init missing fields from [rt]xattr

2017-10-06 Thread Sukadev Bhattiprolu
Initialize a few missing window context fields from the window attributes
specified by the caller. These fields are currently set to their default
values by the caller (NX-842), but it is good practice to apply them in
VAS anyway.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 5aae845..cec7ab7 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -679,10 +679,13 @@ static void init_winctx_for_rxwin(struct vas_window 
*rxwin,
 
winctx->nx_win = rxattr->nx_win;
winctx->fault_win = rxattr->fault_win;
+   winctx->user_win = rxattr->user_win;
+   winctx->rej_no_credit = rxattr->rej_no_credit;
winctx->rx_word_mode = rxattr->rx_win_ord_mode;
winctx->tx_word_mode = rxattr->tx_win_ord_mode;
winctx->rx_wcred_mode = rxattr->rx_wcred_mode;
winctx->tx_wcred_mode = rxattr->tx_wcred_mode;
+   winctx->notify_early = rxattr->notify_early;
 
if (winctx->nx_win) {
winctx->data_stamp = true;
@@ -889,11 +892,14 @@ static void init_winctx_for_txwin(struct vas_window 
*txwin,
winctx->user_win = txattr->user_win;
winctx->nx_win = txwin->rxwin->nx_win;
winctx->pin_win = txattr->pin_win;
+   winctx->rej_no_credit = txattr->rej_no_credit;
+   winctx->rsvd_txbuf_enable = txattr->rsvd_txbuf_enable;
 
winctx->rx_wcred_mode = txattr->rx_wcred_mode;
winctx->tx_wcred_mode = txattr->tx_wcred_mode;
winctx->rx_word_mode = txattr->rx_win_ord_mode;
winctx->tx_word_mode = txattr->tx_win_ord_mode;
+   winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count;
 
if (winctx->nx_win) {
winctx->data_stamp = true;
-- 
2.7.4



[PATCH v2 00/18] powerpc/vas: Add support for FTW

2017-10-06 Thread Sukadev Bhattiprolu
The first 10 patches in this set were posted earlier[1] and don't have
any significant changes since then. This set sanitizes cpu/chip id to
VAS id mapping, improves vas_win_close() performance and adds a check
for return of credits and cleans up some code.

Patch 11 adds debugfs support for the VAS window contexts.

Patches 11-18 add support for user space aka Fast thread-wakeup windows
in VAS. These include a patch from Michael Neuling to support emulating
the paste instruction.

Michael Neuling (1):
  powerpc: Emulate paste instruction

Sukadev Bhattiprolu (17):
  powerpc/vas: init missing fields from [rt]xattr
  powerpc/vas: Validate window credits
  powerpc/vas: Cleanup some debug code
  powerpc/vas: Drop poll_window_cast_out().
  powerpc/vas: Use helper to unpin/close window
  powerpc/vas: Reduce polling interval for busy state
  powerpc/vas: Save configured window credits
  powerpc/vas: poll for return of window credits
  powerpc/vas: Create cpu to vas id mapping
  powerpc/vas, nx-842: Define and use chip_to_vas_id()
  powerpc/vas: Export HVWC to debugfs
  powerpc: have copy depend on CONFIG_BOOK3S_64
  powerpc: Add support for setting SPRN_TIDR
  powerpc: Define set_thread_used_vas()
  powerpc/vas: Define vas_win_paste_addr()
  powerpc/vas: Define vas_win_id()
  powerpc/vas: Add support for user receive window

 arch/powerpc/include/asm/emulated_ops.h |   1 +
 arch/powerpc/include/asm/ppc-opcode.h   |   1 +
 arch/powerpc/include/asm/processor.h|   3 +
 arch/powerpc/include/asm/reg.h  |   2 +
 arch/powerpc/include/asm/switch_to.h|   5 +
 arch/powerpc/include/asm/vas.h  |  21 +++
 arch/powerpc/kernel/process.c   | 160 --
 arch/powerpc/kernel/traps.c |  64 
 arch/powerpc/platforms/powernv/Makefile |   3 +-
 arch/powerpc/platforms/powernv/vas-debug.c  | 209 
 arch/powerpc/platforms/powernv/vas-window.c | 245 +++-
 arch/powerpc/platforms/powernv/vas.c|  31 +++-
 arch/powerpc/platforms/powernv/vas.h|  95 ++-
 drivers/crypto/nx/nx-842-powernv.c  |  18 +-
 14 files changed, 745 insertions(+), 113 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/vas-debug.c

-- 
2.7.4



[PATCH 2/2] vgaarb: Factor out EFI and fallback default device selection

2017-10-06 Thread Bjorn Helgaas
From: Bjorn Helgaas 

The default VGA device is normally set in vga_arbiter_add_pci_device() when
we call it for the first enabled device that can be accessed with the
legacy VGA resources ([mem 0xa-0xb], etc.)

That default device can be overridden by an EFI device that owns the boot
framebuffer.  As a fallback, we can also select a VGA device that can't be
accessed via legacy VGA resources, or a VGA device that isn't even enabled.

Factor out this EFI and fallback selection from vga_arb_device_init() into
a separate vga_arb_select_default_device() function.  This doesn't change
any behavior, but it untangles the "bridge control possible" checking and
messages from the default device selection.

Signed-off-by: Bjorn Helgaas 
---
 drivers/gpu/vga/vgaarb.c |   57 --
 1 file changed, 35 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c
index aeb41f793ed4..7803e35a3702 100644
--- a/drivers/gpu/vga/vgaarb.c
+++ b/drivers/gpu/vga/vgaarb.c
@@ -1402,29 +1402,14 @@ static struct miscdevice vga_arb_device = {
MISC_DYNAMIC_MINOR, "vga_arbiter", _arb_device_fops
 };
 
-static int __init vga_arb_device_init(void)
+static void __init vga_arb_select_default_device(void)
 {
-   int rc;
struct pci_dev *pdev;
struct vga_device *vgadev;
 
-   rc = misc_register(_arb_device);
-   if (rc < 0)
-   pr_err("error %d registering device\n", rc);
-
-   bus_register_notifier(_bus_type, _notifier);
-
-   /* We add all pci devices satisfying vga class in the arbiter by
-* default */
-   pdev = NULL;
-   while ((pdev =
-   pci_get_subsys(PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
-  PCI_ANY_ID, pdev)) != NULL)
-   vga_arbiter_add_pci_device(pdev);
-
+#if defined(CONFIG_X86) || defined(CONFIG_IA64)
list_for_each_entry(vgadev, _list, list) {
struct device *dev = >pdev->dev;
-#if defined(CONFIG_X86) || defined(CONFIG_IA64)
/*
 * Override vga_arbiter_add_pci_device()'s I/O based detection
 * as it may take the wrong device (e.g. on Apple system under
@@ -1461,12 +1446,8 @@ static int __init vga_arb_device_init(void)
vgaarb_info(dev, "overriding boot device\n");
vga_set_default_device(vgadev->pdev);
}
-#endif
-   if (vgadev->bridge_has_one_vga)
-   vgaarb_info(dev, "bridge control possible\n");
-   else
-   vgaarb_info(dev, "no bridge control possible\n");
}
+#endif
 
if (!vga_default_device()) {
list_for_each_entry(vgadev, _list, list) {
@@ -1492,6 +1473,38 @@ static int __init vga_arb_device_init(void)
vga_set_default_device(pdev);
}
}
+}
+
+static int __init vga_arb_device_init(void)
+{
+   int rc;
+   struct pci_dev *pdev;
+   struct vga_device *vgadev;
+
+   rc = misc_register(_arb_device);
+   if (rc < 0)
+   pr_err("error %d registering device\n", rc);
+
+   bus_register_notifier(_bus_type, _notifier);
+
+   /* We add all PCI devices satisfying VGA class in the arbiter by
+* default */
+   pdev = NULL;
+   while ((pdev =
+   pci_get_subsys(PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
+  PCI_ANY_ID, pdev)) != NULL)
+   vga_arbiter_add_pci_device(pdev);
+
+   list_for_each_entry(vgadev, _list, list) {
+   struct device *dev = >pdev->dev;
+
+   if (vgadev->bridge_has_one_vga)
+   vgaarb_info(dev, "bridge control possible\n");
+   else
+   vgaarb_info(dev, "no bridge control possible\n");
+   }
+
+   vga_arb_select_default_device();
 
pr_info("loaded\n");
return rc;



[PATCH 1/2] vgaarb: Select a default VGA device even if there's no legacy VGA

2017-10-06 Thread Bjorn Helgaas
From: Bjorn Helgaas 

Daniel Axtens reported that on the HiSilicon D05 board, the VGA device is
behind a bridge that doesn't support PCI_BRIDGE_CTL_VGA, so the VGA arbiter
never selects it as the default, which means Xorg auto-detection doesn't
work.

VGA is a legacy PCI feature: a VGA device can respond to addresses, e.g.,
[mem 0xa-0xb], [io 0x3b0-0x3bb], [io 0x3c0-0x3df], etc., that are
not configurable by BARs.  Consequently, multiple VGA devices can conflict
with each other.  The VGA arbiter avoids conflicts by ensuring that those
legacy resources are only routed to one VGA device at a time.

The arbiter identifies the "default VGA" device, i.e., a legacy VGA device
that was used by boot firmware.  It selects the first device that:

  - is of PCI_CLASS_DISPLAY_VGA,
  - has both PCI_COMMAND_IO and PCI_COMMAND_MEMORY enabled, and
  - has PCI_BRIDGE_CTL_VGA set in all upstream bridges.

Some systems don't have such a device.  For example, if a host bridge
doesn't support I/O space, PCI_COMMAND_IO probably won't be enabled for any
devices below it.  Or, as on the HiSilicon D05, the VGA device may be
behind a bridge that doesn't support PCI_BRIDGE_CTL_VGA, so accesses to the
legacy VGA resources will never reach the device.

This patch extends the arbiter so that if it doesn't find a device that
meets all the above criteria, it selects the first device that:

  - is of PCI_CLASS_DISPLAY_VGA and
  - has PCI_COMMAND_IO or PCI_COMMAND_MEMORY enabled

If it doesn't find even that, it selects the first device that:

  - is of class PCI_CLASS_DISPLAY_VGA.

Such a device may not be able to use the legacy VGA resources, but most
drivers can operate the device without those.  Setting it as the default
device means its "boot_vga" sysfs file will contain "1", which Xorg (via
libpciaccess) uses to help select its default output device.

This fixes Xorg auto-detection on some arm64 systems (HiSilicon D05 in
particular; see the link below).

It also replaces the powerpc fixup_vga() quirk, albeit with slightly
different semantics: the quirk selected the first VGA device we found, and
overrode that selection with any enabled VGA device we found.  If there
were several enabled VGA devices, the *last* one we found would become the
default.

The code here instead selects the *first* enabled VGA device we find, and
if none are enabled, the first VGA device we find.

Link: http://lkml.kernel.org/r/20170901072744.2409-1-...@axtens.net
Tested-by: Daniel Axtens # arm64, ppc64-qemu-tcg
Signed-off-by: Bjorn Helgaas 
---
 arch/powerpc/kernel/pci-common.c |   12 
 drivers/gpu/vga/vgaarb.c |   25 +
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 02831a396419..0ac7aa346c69 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1740,15 +1740,3 @@ static void fixup_hide_host_resource_fsl(struct pci_dev 
*dev)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MOTOROLA, PCI_ANY_ID, 
fixup_hide_host_resource_fsl);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, PCI_ANY_ID, 
fixup_hide_host_resource_fsl);
-
-static void fixup_vga(struct pci_dev *pdev)
-{
-   u16 cmd;
-
-   pci_read_config_word(pdev, PCI_COMMAND, );
-   if ((cmd & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) || 
!vga_default_device())
-   vga_set_default_device(pdev);
-
-}
-DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
- PCI_CLASS_DISPLAY_VGA, 8, fixup_vga);
diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c
index 76875f6299b8..aeb41f793ed4 100644
--- a/drivers/gpu/vga/vgaarb.c
+++ b/drivers/gpu/vga/vgaarb.c
@@ -1468,6 +1468,31 @@ static int __init vga_arb_device_init(void)
vgaarb_info(dev, "no bridge control possible\n");
}
 
+   if (!vga_default_device()) {
+   list_for_each_entry(vgadev, _list, list) {
+   struct device *dev = >pdev->dev;
+   u16 cmd;
+
+   pdev = vgadev->pdev;
+   pci_read_config_word(pdev, PCI_COMMAND, );
+   if (cmd & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
+   vgaarb_info(dev, "setting as boot device (VGA 
legacy resources not available)\n");
+   vga_set_default_device(pdev);
+   break;
+   }
+   }
+   }
+
+   if (!vga_default_device()) {
+   vgadev = list_first_entry_or_null(_list,
+ struct vga_device, list);
+   if (vgadev) {
+   struct device *dev = >pdev->dev;
+   vgaarb_info(dev, "setting as boot device (VGA legacy 
resources not available)\n");
+   

[PATCH 0/2] vgaarb: Select fallback default VGA device

2017-10-06 Thread Bjorn Helgaas
These patches are supposed to fix a problem Daniel Axtens found on the
HiSilicon D05 board.  The VGA device there is behind a bridge that doesn't
support PCI_BRIDGE_CTL_VGA, so the arbiter never selects the device as the
default.

The first patch extends the arbiter so that if it can't find an enabled VGA
device with legacy resources, it selects the first enabled device *without*
legacy resources (this is what fixes the D05).  If that fails, it selects
the first device that isn't enabled.  The combination of both changes
should make the current powerpc fixup_vga() quirk unnecessary.

N.B. It changes the powerpc behavior: if there are several enabled VGA
devices, the current quirk selects the last one, while this patch selects
the first one.  If this is a problem, I can drop that part of the patch and
keep the quirk.

The second patch pulls out this fallback device detection (and the EFI
override) from vga_arb_device_init() to make it easier to read.

---

Bjorn Helgaas (2):
  vgaarb: Select a default VGA device even if there's no legacy VGA
  vgaarb: Factor out EFI and fallback default device selection


 arch/powerpc/kernel/pci-common.c |   12 --
 drivers/gpu/vga/vgaarb.c |   72 +-
 2 files changed, 55 insertions(+), 29 deletions(-)


Re: [PATCH 0/8] Support for ibm,dynamic-memory-v2 device tree property

2017-10-06 Thread Michael Ellerman
Nathan Fontenot  writes:
> This patch set provides a set of updates to de-couple the LMB information
> provided in the ibm,dynamic-memory device tree property from the device
> tree property format. The goal is to provide a data set of LMB information
> so that consumers of this data do not need to understand and provide
> multiple parsing routines for the supported device tree property formats.

Couple of build problems in this series.

The non-pseries build dies with:

  arch/powerpc/kernel/prom.c:541:4: error: implicit declaration of function 
'early_init_dt_scan_drmem_lmbs' [-Werror=implicit-function-declaration]

And pseries dies with:

  arch/powerpc/platforms/pseries/hotplug-memory.c:860:21: error: 
'lmbs_available' may be used uninitialized in this function 
[-Werror=maybe-uninitialized]

cheers


Re: [RFC PATCH for 4.14 1/2] membarrier: Remove unused code for architectures without membarrier hooks

2017-10-06 Thread Mathieu Desnoyers

- On Oct 6, 2017, at 5:08 PM, Paul E. McKenney paul...@linux.vnet.ibm.com 
wrote:

> On Thu, Oct 05, 2017 at 06:33:26PM -0400, Mathieu Desnoyers wrote:
>> Architectures without membarrier hooks don't need to emit the
>> empty membarrier_arch_switch_mm() static inline when
>> CONFIG_MEMBARRIER=y.
>> 
>> Adapt the CONFIG_MEMBARRIER=n counterpart to only emit the empty
>> membarrier_arch_switch_mm() for architectures with membarrier hooks.
>> 
>> Reported-by: Nicholas Piggin 
>> Signed-off-by: Mathieu Desnoyers 
> 
> Queued for further review and testing, targeting v4.15.  Please let me
> know if you need it sooner.

Hi Paul,

Given that the following patch in this patchset (2/2)
"Fix: membarrier: Handle CLONE_VM + !CLONE_THREAD correctly on powerpc"
is based on this patch, and that the fix needs to go into 4.14,
I would recommend queuing both patches for 4.14 is possible.

Otherwise I could try swapping the order of the two patches if
needed, but since both are touching nearby code areas, rebasing
is not straightforward.

Thanks,

Mathieu

> 
>   Thanx, Paul
> 
>> CC: Peter Zijlstra 
>> CC: Paul E. McKenney 
>> CC: Boqun Feng 
>> CC: Andrew Hunter 
>> CC: Maged Michael 
>> CC: gro...@google.com
>> CC: Avi Kivity 
>> CC: Benjamin Herrenschmidt 
>> CC: Paul Mackerras 
>> CC: Michael Ellerman 
>> CC: Dave Watson 
>> CC: Alan Stern 
>> CC: Will Deacon 
>> CC: Andy Lutomirski 
>> CC: Ingo Molnar 
>> CC: Alexander Viro 
>> CC: linuxppc-dev@lists.ozlabs.org
>> CC: linux-a...@vger.kernel.org
>> ---
>>  include/linux/sched/mm.h | 6 ++
>>  1 file changed, 2 insertions(+), 4 deletions(-)
>> 
>> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
>> index d5a9ab8f3836..b2767ecb21a8 100644
>> --- a/include/linux/sched/mm.h
>> +++ b/include/linux/sched/mm.h
>> @@ -215,10 +215,6 @@ static inline void memalloc_noreclaim_restore(unsigned 
>> int
>> flags)
>>  #ifdef CONFIG_ARCH_HAS_MEMBARRIER_HOOKS
>>  #include 
>>  #else
>> -static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
>> -struct mm_struct *next, struct task_struct *tsk)
>> -{
>> -}
>>  static inline void membarrier_arch_fork(struct task_struct *t,
>>  unsigned long clone_flags)
>>  {
>> @@ -247,10 +243,12 @@ static inline void membarrier_execve(struct task_struct
>> *t)
>>  membarrier_arch_execve(t);
>>  }
>>  #else
>> +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_HOOKS
>>  static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
>>  struct mm_struct *next, struct task_struct *tsk)
>>  {
>>  }
>> +#endif
>>  static inline void membarrier_fork(struct task_struct *t,
>>  unsigned long clone_flags)
>>  {
>> --
>> 2.11.0

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


Re: [RFC PATCH for 4.14 1/2] membarrier: Remove unused code for architectures without membarrier hooks

2017-10-06 Thread Paul E. McKenney
On Thu, Oct 05, 2017 at 06:33:26PM -0400, Mathieu Desnoyers wrote:
> Architectures without membarrier hooks don't need to emit the
> empty membarrier_arch_switch_mm() static inline when
> CONFIG_MEMBARRIER=y.
> 
> Adapt the CONFIG_MEMBARRIER=n counterpart to only emit the empty
> membarrier_arch_switch_mm() for architectures with membarrier hooks.
> 
> Reported-by: Nicholas Piggin 
> Signed-off-by: Mathieu Desnoyers 

Queued for further review and testing, targeting v4.15.  Please let me
know if you need it sooner.

Thanx, Paul

> CC: Peter Zijlstra 
> CC: Paul E. McKenney 
> CC: Boqun Feng 
> CC: Andrew Hunter 
> CC: Maged Michael 
> CC: gro...@google.com
> CC: Avi Kivity 
> CC: Benjamin Herrenschmidt 
> CC: Paul Mackerras 
> CC: Michael Ellerman 
> CC: Dave Watson 
> CC: Alan Stern 
> CC: Will Deacon 
> CC: Andy Lutomirski 
> CC: Ingo Molnar 
> CC: Alexander Viro 
> CC: linuxppc-dev@lists.ozlabs.org
> CC: linux-a...@vger.kernel.org
> ---
>  include/linux/sched/mm.h | 6 ++
>  1 file changed, 2 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index d5a9ab8f3836..b2767ecb21a8 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -215,10 +215,6 @@ static inline void memalloc_noreclaim_restore(unsigned 
> int flags)
>  #ifdef CONFIG_ARCH_HAS_MEMBARRIER_HOOKS
>  #include 
>  #else
> -static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
> - struct mm_struct *next, struct task_struct *tsk)
> -{
> -}
>  static inline void membarrier_arch_fork(struct task_struct *t,
>   unsigned long clone_flags)
>  {
> @@ -247,10 +243,12 @@ static inline void membarrier_execve(struct task_struct 
> *t)
>   membarrier_arch_execve(t);
>  }
>  #else
> +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_HOOKS
>  static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
>   struct mm_struct *next, struct task_struct *tsk)
>  {
>  }
> +#endif
>  static inline void membarrier_fork(struct task_struct *t,
>   unsigned long clone_flags)
>  {
> -- 
> 2.11.0
> 



Re: [PATCH v3 0/3] Split default display handling out from VGA arbiter

2017-10-06 Thread Bjorn Helgaas
On Wed, Sep 27, 2017 at 01:52:55PM +1000, Daniel Axtens wrote:
> Hi Bjorn,
> 
> Yes, this works:
> 
> Tested-by: Daniel Axtens  # arm64, ppc64-qemu-tcg

I guess I was assuming you'd pick this up, but that doesn't really
make sense because I didn't give you a signed-off-by or anything.
I'll post this with a changelog and signed-off-by so it doesn't get
lost.

I also noticed that I didn't correctly handle the powerpc quirk case
where it doesn't require the device to be enabled at all.  I'll try to
fix that up, too.

Bjorn

> > On Fri, Sep 01, 2017 at 05:27:41PM +1000, Daniel Axtens wrote:
> >> This patch set:
> >> 
> >>  - splits the default display handling out from VGA arbiter, into its
> >>own file and behind its own Kconfig option (and gives the functions
> >>better names).
> >> 
> >>  - adds extra detection of default devices. To be nominated, the vga
> >>arbiter and platform hooks must not have nominated a default. A
> >>card will then only be nominated if it has a driver attached and
> >>has IO or memory decoding enabled.
> >> 
> >>  - adds relevant documentation.
> >> 
> >> The practical impact of this is improved X autoconfiguration on some
> >> arm64 systems.
> >
> > I think I gave you bad advice about trying to separate the "default
> > device" idea from the VGA arbiter.
> >
> > It is true that the "VGA arbiter" per se is related to routing the
> > legacy VGA resources, and the arbiter currently only selects a default
> > device if it finds a device to which those resources are routed.
> >
> > We have some cases where we want to select a default device that may
> > not support the legacy VGA resources, or where they might not be
> > routed to the device:
> >
> >   - systems where we match the EFI framebuffer address with a BAR, and
> > select that device as default,
> >
> >   - powerpc systems where there may be no host bridge window that maps
> > to the legacy VGA resources,
> >
> >   - your ARM64 systems where the default device may be behind a bridge
> > that doesn't support legacy VGA routing (PCI_BRIDGE_CTL_VGA)
> >
> > But I think trying to split the "default device" part out from the VGA
> > arbiter ends up being overkill and making things more complicated
> > instead of simpler.
> >
> > Would something like the following work for you as well as the powerpc
> > case?  On powerpc, we already use vga_set_default_device() to select a
> > device that doesn't use legacy VGA resources, so maybe we can just do
> > the same on ARM64?
> >
> > I suppose there might be wrinkles in how the arbiter deals with
> > multiple graphics devices on those systems, since I don't think it
> > identifies these devices that don't use the legacy resources, but it
> > seems like we live with whatever those are on powerpc and probably can
> > on ARM64 as well.
> >
> >
> > diff --git a/arch/powerpc/kernel/pci-common.c 
> > b/arch/powerpc/kernel/pci-common.c
> > index 02831a396419..0ac7aa346c69 100644
> > --- a/arch/powerpc/kernel/pci-common.c
> > +++ b/arch/powerpc/kernel/pci-common.c
> > @@ -1740,15 +1740,3 @@ static void fixup_hide_host_resource_fsl(struct 
> > pci_dev *dev)
> >  }
> >  DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MOTOROLA, PCI_ANY_ID, 
> > fixup_hide_host_resource_fsl);
> >  DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, PCI_ANY_ID, 
> > fixup_hide_host_resource_fsl);
> > -
> > -static void fixup_vga(struct pci_dev *pdev)
> > -{
> > -   u16 cmd;
> > -
> > -   pci_read_config_word(pdev, PCI_COMMAND, );
> > -   if ((cmd & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) || 
> > !vga_default_device())
> > -   vga_set_default_device(pdev);
> > -
> > -}
> > -DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
> > - PCI_CLASS_DISPLAY_VGA, 8, fixup_vga);
> > diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c
> > index 76875f6299b8..9df4802c5f04 100644
> > --- a/drivers/gpu/vga/vgaarb.c
> > +++ b/drivers/gpu/vga/vgaarb.c
> > @@ -1468,6 +1468,21 @@ static int __init vga_arb_device_init(void)
> > vgaarb_info(dev, "no bridge control possible\n");
> > }
> >  
> > +   if (!vga_default_device()) {
> > +   list_for_each_entry(vgadev, _list, list) {
> > +   struct device *dev = >pdev->dev;
> > +   u16 cmd;
> > +
> > +   pdev = vgadev->pdev;
> > +   pci_read_config_word(pdev, PCI_COMMAND, );
> > +   if (cmd & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
> > +   vgaarb_info(dev, "setting as boot device\n");
> > +   vga_set_default_device(pdev);
> > +   break;
> > +   }
> > +   }
> > +   }
> > +
> > pr_info("loaded\n");
> > return rc;
> >  }
> 
> 


linux-4.14-rc3/arch/powerpc/perf/imc-pmu.c:599: pointless test ?

2017-10-06 Thread David Binderman
Hello there,

linux-4.14-rc3/arch/powerpc/perf/imc-pmu.c:599]: (style) Unsigned variable 
'ncpu' can't be negative so it is unnecessary to test it.

Source code is

if (ncpu >= 0 && ncpu < nr_cpu_ids) {

but

unsigned int ncpu, core_id;

Suggest remove test.

Regards

David Binderman

Re: [PATCH v10 05/10] mm: zero reserved and unavailable struct pages

2017-10-06 Thread Pasha Tatashin

Hi Michal,



As I've said in other reply this should go in only if the scenario you
describe is real. I am somehow suspicious to be honest. I simply do not
see how those weird struct pages would be in a valid pfn range of any
zone.



There are examples of both when unavailable memory is not part of any 
zone, and where it is part of zones.


I run Linux in kvm with these arguments:

qemu-system-x86_64
-enable-kvm
-cpu kvm64
-kernel $kernel
-initrd $initrd
-m 512
-smp 2
-device e1000,netdev=net0
-netdev user,id=net0
-boot order=nc
-no-reboot
-watchdog i6300esb
-watchdog-action debug
-rtc base=localtime
-serial stdio
-display none
-monitor null

This patch reports that there are 98 unavailable pages.

They are: pfn 0 and pfns in range [159, 255].

Note, trim_low_memory_range() reserves only pfns in range [0, 15], it 
does not reserve [159, 255] ones.


e820__memblock_setup() reports linux that the following physical ranges 
are available:

[1 , 158]
[256, 130783]

Notice, that exactly unavailable pfns are missing!

Now, lets check what we have in zone 0: [1, 131039]

pfn 0, is not part of the zone, but pfns [1, 158], are.

However, the bigger problem we have if we do not initialize these struct 
pages is with memory hotplug. Because, that path operates at 2M 
boundaries (section_nr). And checks if 2M range of pages is hot 
removable. It starts with first pfn from zone, rounds it down to 2M 
boundary (struct pages are allocated at 2M boundaries when vmemmap is 
created), and checks if that section is hot removable. In this case 
start with pfn 1 and convert it down to pfn 0.


Later pfn is converted to struct page, and some fields are checked. Now, 
if we do not zero struct pages, we get unpredictable results. In fact 
when CONFIG_VM_DEBUG is enabled, and we explicitly set all vmemmap 
memory to ones, I am getting the following panic with kernel test 
without this patch applied:


[   23.277793] BUG: unable to handle kernel NULL pointer dereference at 
 (null)

[   23.278863] IP: is_pageblock_removable_nolock+0x35/0x90
[   23.279619] PGD 0 P4D 0
[   23.280031] Oops:  [#1] PREEMPT
[   23.280527] CPU: 0 PID: 249 Comm: udevd Not tainted 
4.14.0-rc3_pt_memset10-00335-g5e2c7478bed5-dirty #8

[   23.281735] task: 88001f4e2900 task.stack: c9314000
[   23.282532] RIP: 0010:is_pageblock_removable_nolock+0x35/0x90
[   23.283275] RSP: 0018:c9317d60 EFLAGS: 00010202
[   23.283948] RAX:  RBX: 88001d92b000 RCX: 

[   23.284862] RDX:  RSI: 0020 RDI: 
88001d92b000
[   23.285771] RBP: c9317d80 R08: 10c8 R09: 

[   23.286542] R10:  R11:  R12: 
88001db2b000
[   23.287264] R13: 81af6d00 R14: 88001f7d5000 R15: 
82a1b6c0
[   23.287971] FS:  7f4eb857f7c0() GS:81c27000() 
knlGS:

[   23.288775] CS:  0010 DS:  ES:  CR0: 80050033
[   23.289355] CR2:  CR3: 1f4e6000 CR4: 
06b0

[   23.290066] Call Trace:
[   23.290323]  ? is_mem_section_removable+0x5a/0xd0
[   23.290798]  show_mem_removable+0x6b/0xa0
[   23.291204]  dev_attr_show+0x1b/0x50
[   23.291565]  sysfs_kf_seq_show+0xa1/0x100
[   23.291967]  kernfs_seq_show+0x22/0x30
[   23.292349]  seq_read+0x1ac/0x3a0
[   23.292687]  kernfs_fop_read+0x36/0x190
[   23.293074]  ? security_file_permission+0x90/0xb0
[   23.293547]  __vfs_read+0x16/0x30
[   23.293884]  vfs_read+0x81/0x130
[   23.294214]  SyS_read+0x44/0xa0
[   23.294537]  entry_SYSCALL_64_fastpath+0x1f/0xbd
[   23.295003] RIP: 0033:0x7f4eb7c660a0
[   23.295364] RSP: 002b:7ffda6cffe28 EFLAGS: 0246 ORIG_RAX: 

[   23.296152] RAX: ffda RBX: 03de RCX: 
7f4eb7c660a0
[   23.296934] RDX: 1000 RSI: 7ffda6cffec8 RDI: 
0005
[   23.297963] RBP: 7ffda6cffde8 R08: 7379732f73656369 R09: 
6f6d656d2f6d6574
[   23.299198] R10: 726f6d656d2f7972 R11: 0246 R12: 
0022
[   23.300400] R13: 561d68ea7710 R14:  R15: 
7ffda6d05c78
[   23.301591] Code: c1 ea 35 49 c1 e8 2b 48 8b 14 d5 c0 b6 a1 82 41 83 
e0 03 48 85 d2 74 0c 48 c1 e8 29 25 f0 0f 00 00 48 01 c2 4d 69 c0 98 
05 00 00 <48> 8b 02 48 89 fa 48 83 e0 f8 49 8b 88 28 b5 d3 81 48 29 c2 49
[   23.304739] RIP: is_pageblock_removable_nolock+0x35/0x90 RSP: 
c9317d60

[   23.305940] CR2: 


Re: [PATCH] mm: deferred_init_memmap improvements

2017-10-06 Thread Pasha Tatashin

Hi Anshuman,

Thank you very much for looking at this. My reply below::

On 10/06/2017 02:48 AM, Anshuman Khandual wrote:

On 10/04/2017 08:59 PM, Pavel Tatashin wrote:

This patch fixes another existing issue on systems that have holes in
zones i.e CONFIG_HOLES_IN_ZONE is defined.

In for_each_mem_pfn_range() we have code like this:

if (!pfn_valid_within(pfn))
goto free_range;

Note: 'page' is not set to NULL and is not incremented but 'pfn' advances.


page is initialized to NULL at the beginning of the function.


Yes, it is initialized to NULL but at the beginning of 
for_each_mem_pfn_range() loop



PFN advances but we dont proceed unless pfn_valid_within(pfn)
holds true which basically should have checked with arch call
back if the PFN is valid in presence of memory holes as well.
Is not this correct ?


Correct, if pfn_valid_within() is false we jump to the "goto 
free_range;", which is at the end of for (; pfn < end_pfn; pfn++) loop, 
so we are not jumping outside of this loop.





This means if deferred struct pages are enabled on systems with these kind
of holes, linux would get memory corruptions. I have fixed this issue by
defining a new macro that performs all the necessary operations when we
free the current set of pages.


If we bail out in case PFN is not valid, then how corruption
can happen ?



We are not bailing out. We continue next iteration with next pfn, but 
page is not incremented.


Please let me know if I am missing something.

Thank you,
Pasha


[PATCH 18/18] crypto: talitos - avoid useless copy

2017-10-06 Thread Christophe Leroy
This patch avoids copy of buffered data to hash from bufnext to buf

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 36 ++--
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 5c4499a85611..5bd8191405d8 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -842,8 +842,8 @@ struct talitos_ctx {
 struct talitos_ahash_req_ctx {
u32 hw_context[TALITOS_MDEU_MAX_CONTEXT_SIZE / sizeof(u32)];
unsigned int hw_context_size;
-   u8 buf[HASH_MAX_BLOCK_SIZE];
-   u8 bufnext[HASH_MAX_BLOCK_SIZE];
+   u8 buf[2][HASH_MAX_BLOCK_SIZE];
+   int buf_idx;
unsigned int swinit;
unsigned int first;
unsigned int last;
@@ -1709,7 +1709,7 @@ static void ahash_done(struct device *dev,
 
if (!req_ctx->last && req_ctx->to_hash_later) {
/* Position any partial block for next update/final/finup */
-   memcpy(req_ctx->buf, req_ctx->bufnext, req_ctx->to_hash_later);
+   req_ctx->buf_idx = (req_ctx->buf_idx + 1) & 1;
req_ctx->nbuf = req_ctx->to_hash_later;
}
common_nonsnoop_hash_unmap(dev, edesc, areq);
@@ -1789,8 +1789,10 @@ static int common_nonsnoop_hash(struct talitos_edesc 
*edesc,
 * data in
 */
if (is_sec1 && req_ctx->nbuf) {
-   to_talitos_ptr(>ptr[3], ctx->dma_buf, req_ctx->nbuf,
-  is_sec1);
+   dma_addr_t dma_buf = ctx->dma_buf + req_ctx->buf_idx *
+   HASH_MAX_BLOCK_SIZE;
+
+   to_talitos_ptr(>ptr[3], dma_buf, req_ctx->nbuf, is_sec1);
} else {
sg_count = talitos_sg_map(dev, req_ctx->psrc, length, edesc,
  >ptr[3], sg_count, offset, 0);
@@ -1883,6 +1885,7 @@ static int ahash_init(struct ahash_request *areq)
bool is_sec1 = has_ftr_sec1(priv);
 
/* Initialize the context */
+   req_ctx->buf_idx = 0;
req_ctx->nbuf = 0;
req_ctx->first = 1; /* first indicates h/w must init its context */
req_ctx->swinit = 0; /* assume h/w init of context */
@@ -1955,6 +1958,7 @@ static int ahash_process_req(struct ahash_request *areq, 
unsigned int nbytes)
struct talitos_private *priv = dev_get_drvdata(dev);
bool is_sec1 = has_ftr_sec1(priv);
int offset = 0;
+   u8 *ctx_buf = req_ctx->buf[req_ctx->buf_idx];
 
if (!req_ctx->last && (nbytes + req_ctx->nbuf <= blocksize)) {
/* Buffer up to one whole block */
@@ -1964,7 +1968,7 @@ static int ahash_process_req(struct ahash_request *areq, 
unsigned int nbytes)
return nents;
}
sg_copy_to_buffer(areq->src, nents,
- req_ctx->buf + req_ctx->nbuf, nbytes);
+ ctx_buf + req_ctx->nbuf, nbytes);
req_ctx->nbuf += nbytes;
return 0;
}
@@ -1988,7 +1992,7 @@ static int ahash_process_req(struct ahash_request *areq, 
unsigned int nbytes)
if (!is_sec1 && req_ctx->nbuf) {
nsg = (req_ctx->nbuf < nbytes_to_hash) ? 2 : 1;
sg_init_table(req_ctx->bufsl, nsg);
-   sg_set_buf(req_ctx->bufsl, req_ctx->buf, req_ctx->nbuf);
+   sg_set_buf(req_ctx->bufsl, ctx_buf, req_ctx->nbuf);
if (nsg > 1)
sg_chain(req_ctx->bufsl, 2, areq->src);
req_ctx->psrc = req_ctx->bufsl;
@@ -2003,7 +2007,7 @@ static int ahash_process_req(struct ahash_request *areq, 
unsigned int nbytes)
return nents;
}
sg_copy_to_buffer(areq->src, nents,
- req_ctx->buf + req_ctx->nbuf, offset);
+ ctx_buf + req_ctx->nbuf, offset);
req_ctx->nbuf += offset;
req_ctx->psrc = areq->src;
} else
@@ -2016,7 +2020,7 @@ static int ahash_process_req(struct ahash_request *areq, 
unsigned int nbytes)
return nents;
}
sg_pcopy_to_buffer(areq->src, nents,
- req_ctx->bufnext,
+  req_ctx->buf[(req_ctx->buf_idx + 1) & 1],
  to_hash_later,
  nbytes - to_hash_later);
}
@@ -2038,9 +2042,13 @@ static int ahash_process_req(struct ahash_request *areq, 
unsigned int nbytes)
/* request SEC to INIT hash. */
if (req_ctx->first && !req_ctx->swinit)
edesc->desc.hdr |= DESC_HDR_MODE0_MDEU_INIT;
-   if (is_sec1)
-   dma_sync_single_for_device(dev, ctx->dma_buf,
+   if (is_sec1) {
+   dma_addr_t dma_buf = ctx->dma_buf + 

[PATCH 17/18] crypto: talitos - chain in buffered data for ahash on SEC1

2017-10-06 Thread Christophe Leroy
SEC1 doesn't support S/G in descriptors so for hash operations,
the CPU has to build a buffer containing the buffered block and
the incoming data. This generates a lot of memory copies, which
represents more than 50% of CPU time of a md5sum operation as
shown below with a 'perf record'.

|--86.24%-- kcapi_md_digest
|  |
|  |--86.18%-- _kcapi_common_vmsplice_chunk_fd
|  |  |
|  |  |--83.68%-- splice
|  |  |  |
|  |  |  |--83.59%-- ret_from_syscall
|  |  |  |  |
|  |  |  |  |--83.52%-- sys_splice
|  |  |  |  |  |
|  |  |  |  |  |--83.49%-- 
splice_from_pipe
|  |  |  |  |  |  |
|  |  |  |  |  |  |--83.04%-- 
__splice_from_pipe
|  |  |  |  |  |  |  |
|  |  |  |  |  |  |  
|--80.67%-- pipe_to_sendpage
|  |  |  |  |  |  |  |  
|
|  |  |  |  |  |  |  |  
|--78.25%-- hash_sendpage
|  |  |  |  |  |  |  |  
|  |
|  |  |  |  |  |  |  |  
|  |--60.08%-- ahash_process_req
|  |  |  |  |  |  |  |  
|  |  |
|  |  |  |  |  |  |  |  
|  |  |--56.36%-- sg_copy_buffer
|  |  |  |  |  |  |  |  
|  |  |  |
|  |  |  |  |  |  |  |  
|  |  |  |--55.29%-- memcpy
|  |  |  |  |  |  |  |  
|  |  |  |

However, unlike SEC2+, SEC1 offers the possibility to chain
descriptors. It is therefore possible to build a first descriptor
pointing to the buffered data and a second descriptor pointing to
the incoming data, hence avoiding the memory copy to a single
buffer.

With this patch, the time necessary for a md5sum on a 90Mbytes file
is approximately 3 seconds. Without the patch it takes 6 seconds.

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 139 ++-
 drivers/crypto/talitos.h |   1 +
 2 files changed, 127 insertions(+), 13 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index d495649d5267..5c4499a85611 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -160,6 +160,10 @@ static int reset_channel(struct device *dev, int ch)
/* set 36-bit addressing, done writeback enable and done IRQ enable */
setbits32(priv->chan[ch].reg + TALITOS_CCCR_LO, TALITOS_CCCR_LO_EAE |
  TALITOS_CCCR_LO_CDWE | TALITOS_CCCR_LO_CDIE);
+   /* enable chaining descriptors */
+   if (is_sec1)
+   setbits32(priv->chan[ch].reg + TALITOS_CCCR_LO,
+ TALITOS_CCCR_LO_NE);
 
/* and ICCR writeback, if available */
if (priv->features & TALITOS_FTR_HW_AUTH_CHECK)
@@ -333,7 +337,12 @@ static void flush_channel(struct device *dev, int ch, int 
error, int reset_ch)
 
/* descriptors with their done bits set don't get the error */
rmb();
-   hdr = is_sec1 ? request->desc->hdr1 : request->desc->hdr;
+   if (!is_sec1)
+   hdr = request->desc->hdr;
+   else if (request->desc->next_desc)
+   hdr = (request->desc + 1)->hdr1;
+   else
+   hdr = request->desc->hdr1;
 
if ((hdr & DESC_HDR_DONE) == DESC_HDR_DONE)
status = 0;
@@ -454,7 +463,8 @@ static u32 current_desc_hdr(struct device *dev, int ch)
tail = priv->chan[ch].tail;
 
iter = tail;
-   while (priv->chan[ch].fifo[iter].dma_desc != cur_desc) {
+   while (priv->chan[ch].fifo[iter].dma_desc != cur_desc &&
+  priv->chan[ch].fifo[iter].desc->next_desc != cur_desc) {
iter = (iter + 1) & (priv->fifo_len - 1);
if (iter == tail) {
dev_err(dev, "couldn't locate current descriptor\n");
@@ -462,6 +472,9 @@ static u32 current_desc_hdr(struct device *dev, int ch)
}
}
 
+   if (priv->chan[ch].fifo[iter].desc->next_desc == cur_desc)
+   return (priv->chan[ch].fifo[iter].desc + 1)->hdr;
+

[PATCH 16/18] crypto: talitos - do hw_context DMA mapping outside the requests

2017-10-06 Thread Christophe Leroy
At every request, we map and unmap the same hash hw_context.

This patch moves the dma mapping/unmapping in functions ahash_init()
and ahash_import().

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 80 ++--
 1 file changed, 57 insertions(+), 23 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index ebfd6d982ed6..d495649d5267 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -819,6 +819,7 @@ struct talitos_ctx {
unsigned int keylen;
unsigned int enckeylen;
unsigned int authkeylen;
+   dma_addr_t dma_hw_context;
 };
 
 #define HASH_MAX_BLOCK_SIZESHA512_BLOCK_SIZE
@@ -1663,18 +1664,9 @@ static void common_nonsnoop_hash_unmap(struct device 
*dev,
   struct ahash_request *areq)
 {
struct talitos_ahash_req_ctx *req_ctx = ahash_request_ctx(areq);
-   struct talitos_private *priv = dev_get_drvdata(dev);
-   bool is_sec1 = has_ftr_sec1(priv);
-
-   unmap_single_talitos_ptr(dev, >desc.ptr[5], DMA_FROM_DEVICE);
 
talitos_sg_unmap(dev, edesc, req_ctx->psrc, NULL, 0, 0);
 
-   /* When using hashctx-in, must unmap it. */
-   if (from_talitos_ptr_len(>desc.ptr[1], is_sec1))
-   unmap_single_talitos_ptr(dev, >desc.ptr[1],
-DMA_TO_DEVICE);
-
if (edesc->dma_len)
dma_unmap_single(dev, edesc->dma_link_tbl, edesc->dma_len,
 DMA_BIDIRECTIONAL);
@@ -1744,10 +1736,8 @@ static int common_nonsnoop_hash(struct talitos_edesc 
*edesc,
 
/* hash context in */
if (!req_ctx->first || req_ctx->swinit) {
-   map_single_talitos_ptr(dev, >ptr[1],
-  req_ctx->hw_context_size,
-  (char *)req_ctx->hw_context,
-  DMA_TO_DEVICE);
+   to_talitos_ptr(>ptr[1], ctx->dma_hw_context,
+  req_ctx->hw_context_size, is_sec1);
req_ctx->swinit = 0;
}
/* Indicate next op is not the first. */
@@ -1780,9 +1770,8 @@ static int common_nonsnoop_hash(struct talitos_edesc 
*edesc,
   crypto_ahash_digestsize(tfm),
   areq->result, DMA_FROM_DEVICE);
else
-   map_single_talitos_ptr(dev, >ptr[5],
-  req_ctx->hw_context_size,
-  req_ctx->hw_context, DMA_FROM_DEVICE);
+   to_talitos_ptr(>ptr[5], ctx->dma_hw_context,
+  req_ctx->hw_context_size, is_sec1);
 
/* last DWORD empty */
 
@@ -1815,17 +1804,25 @@ static struct talitos_edesc *ahash_edesc_alloc(struct 
ahash_request *areq,
 static int ahash_init(struct ahash_request *areq)
 {
struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq);
+   struct talitos_ctx *ctx = crypto_ahash_ctx(tfm);
+   struct device *dev = ctx->dev;
struct talitos_ahash_req_ctx *req_ctx = ahash_request_ctx(areq);
+   unsigned int size;
 
/* Initialize the context */
req_ctx->nbuf = 0;
req_ctx->first = 1; /* first indicates h/w must init its context */
req_ctx->swinit = 0; /* assume h/w init of context */
-   req_ctx->hw_context_size =
-   (crypto_ahash_digestsize(tfm) <= SHA256_DIGEST_SIZE)
+   size =  (crypto_ahash_digestsize(tfm) <= SHA256_DIGEST_SIZE)
? TALITOS_MDEU_CONTEXT_SIZE_MD5_SHA1_SHA256
: TALITOS_MDEU_CONTEXT_SIZE_SHA384_SHA512;
+   req_ctx->hw_context_size = size;
 
+   if (ctx->dma_hw_context)
+   dma_unmap_single(dev, ctx->dma_hw_context, size,
+DMA_BIDIRECTIONAL);
+   ctx->dma_hw_context = dma_map_single(dev, req_ctx->hw_context, size,
+DMA_BIDIRECTIONAL);
return 0;
 }
 
@@ -1836,6 +1833,9 @@ static int ahash_init(struct ahash_request *areq)
 static int ahash_init_sha224_swinit(struct ahash_request *areq)
 {
struct talitos_ahash_req_ctx *req_ctx = ahash_request_ctx(areq);
+   struct crypto_ahash *tfm = crypto_ahash_reqtfm(areq);
+   struct talitos_ctx *ctx = crypto_ahash_ctx(tfm);
+   struct device *dev = ctx->dev;
 
ahash_init(areq);
req_ctx->swinit = 1;/* prevent h/w initting context with sha256 values*/
@@ -1853,6 +1853,9 @@ static int ahash_init_sha224_swinit(struct ahash_request 
*areq)
req_ctx->hw_context[8] = 0;
req_ctx->hw_context[9] = 0;
 
+   dma_sync_single_for_device(dev, ctx->dma_hw_context,
+  req_ctx->hw_context_size, DMA_TO_DEVICE);
+
return 0;
 }
 
@@ -1990,7 +1993,12 @@ static int ahash_export(struct ahash_request *areq, void 

[PATCH 15/18] crypto: talitos - DMA map key in setkey()

2017-10-06 Thread Christophe Leroy
dma_map_single() is an heavy operation which doesn't need to
be done at each request as the key doesn't change.

Instead of DMA mapping the key at every request, this patch maps it
once in setkey()

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 56 +---
 1 file changed, 39 insertions(+), 17 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 307d534a0f2f..ebfd6d982ed6 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -815,6 +815,7 @@ struct talitos_ctx {
__be32 desc_hdr_template;
u8 key[TALITOS_MAX_KEY_SIZE];
u8 iv[TALITOS_MAX_IV_LENGTH];
+   dma_addr_t dma_key;
unsigned int keylen;
unsigned int enckeylen;
unsigned int authkeylen;
@@ -851,6 +852,7 @@ static int aead_setkey(struct crypto_aead *authenc,
   const u8 *key, unsigned int keylen)
 {
struct talitos_ctx *ctx = crypto_aead_ctx(authenc);
+   struct device *dev = ctx->dev;
struct crypto_authenc_keys keys;
 
if (crypto_authenc_extractkeys(, key, keylen) != 0)
@@ -859,12 +861,17 @@ static int aead_setkey(struct crypto_aead *authenc,
if (keys.authkeylen + keys.enckeylen > TALITOS_MAX_KEY_SIZE)
goto badkey;
 
+   if (ctx->keylen)
+   dma_unmap_single(dev, ctx->dma_key, ctx->keylen, DMA_TO_DEVICE);
+
memcpy(ctx->key, keys.authkey, keys.authkeylen);
memcpy(>key[keys.authkeylen], keys.enckey, keys.enckeylen);
 
ctx->keylen = keys.authkeylen + keys.enckeylen;
ctx->enckeylen = keys.enckeylen;
ctx->authkeylen = keys.authkeylen;
+   ctx->dma_key = dma_map_single(dev, ctx->key, ctx->keylen,
+ DMA_TO_DEVICE);
 
return 0;
 
@@ -940,14 +947,11 @@ static void ipsec_esp_unmap(struct device *dev,
unsigned int ivsize = crypto_aead_ivsize(aead);
bool is_ipsec_esp = edesc->desc.hdr & DESC_HDR_TYPE_IPSEC_ESP;
struct talitos_ptr *civ_ptr = >desc.ptr[is_ipsec_esp ? 2 : 3];
-   struct talitos_ptr *ckey_ptr = >desc.ptr[is_ipsec_esp ? 3 : 2];
 
if (is_ipsec_esp)
unmap_single_talitos_ptr(dev, >desc.ptr[6],
 DMA_FROM_DEVICE);
-   unmap_single_talitos_ptr(dev, ckey_ptr, DMA_TO_DEVICE);
unmap_single_talitos_ptr(dev, civ_ptr, DMA_TO_DEVICE);
-   unmap_single_talitos_ptr(dev, >desc.ptr[0], DMA_TO_DEVICE);
 
talitos_sg_unmap(dev, edesc, areq->src, areq->dst, areq->cryptlen,
 areq->assoclen);
@@ -976,6 +980,7 @@ static void ipsec_esp_encrypt_done(struct device *dev,
struct aead_request *areq = context;
struct crypto_aead *authenc = crypto_aead_reqtfm(areq);
unsigned int authsize = crypto_aead_authsize(authenc);
+   unsigned int ivsize = crypto_aead_ivsize(authenc);
struct talitos_edesc *edesc;
struct scatterlist *sg;
void *icvdata;
@@ -996,6 +1001,8 @@ static void ipsec_esp_encrypt_done(struct device *dev,
   icvdata, authsize);
}
 
+   dma_unmap_single(dev, edesc->iv_dma, ivsize, DMA_TO_DEVICE);
+
kfree(edesc);
 
aead_request_complete(areq, err);
@@ -1164,8 +1171,7 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct 
aead_request *areq,
struct talitos_ptr *ckey_ptr = >ptr[is_ipsec_esp ? 3 : 2];
 
/* hmac key */
-   map_single_talitos_ptr(dev, >ptr[0], ctx->authkeylen, >key,
-  DMA_TO_DEVICE);
+   to_talitos_ptr(>ptr[0], ctx->dma_key, ctx->authkeylen, is_sec1);
 
sg_count = edesc->src_nents ?: 1;
if (is_sec1 && sg_count > 1)
@@ -1189,9 +1195,8 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct 
aead_request *areq,
to_talitos_ptr(civ_ptr, edesc->iv_dma, ivsize, is_sec1);
 
/* cipher key */
-   map_single_talitos_ptr(dev, ckey_ptr, ctx->enckeylen,
-  (char *)>key + ctx->authkeylen,
-  DMA_TO_DEVICE);
+   to_talitos_ptr(ckey_ptr, ctx->dma_key  + ctx->authkeylen,
+  ctx->enckeylen, is_sec1);
 
/*
 * cipher in
@@ -1481,6 +1486,7 @@ static int ablkcipher_setkey(struct crypto_ablkcipher 
*cipher,
 const u8 *key, unsigned int keylen)
 {
struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher);
+   struct device *dev = ctx->dev;
u32 tmp[DES_EXPKEY_WORDS];
 
if (keylen > TALITOS_MAX_KEY_SIZE) {
@@ -1495,9 +1501,14 @@ static int ablkcipher_setkey(struct crypto_ablkcipher 
*cipher,
return -EINVAL;
}
 
+   if (ctx->keylen)
+   dma_unmap_single(dev, ctx->dma_key, ctx->keylen, DMA_TO_DEVICE);
+
memcpy(>key, key, keylen);
ctx->keylen = keylen;
 
+   ctx->dma_key = dma_map_single(dev, 

[PATCH 14/18] crypto: talitos - simplify tests in ipsec_esp()

2017-10-06 Thread Christophe Leroy
Do (desc->hdr & DESC_HDR_TYPE_IPSEC_ESP) only once.
Limit number of if/else paths

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 42 --
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 7e96db75347a..307d534a0f2f 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -938,12 +938,15 @@ static void ipsec_esp_unmap(struct device *dev,
struct crypto_aead *aead = crypto_aead_reqtfm(areq);
struct talitos_ctx *ctx = crypto_aead_ctx(aead);
unsigned int ivsize = crypto_aead_ivsize(aead);
+   bool is_ipsec_esp = edesc->desc.hdr & DESC_HDR_TYPE_IPSEC_ESP;
+   struct talitos_ptr *civ_ptr = >desc.ptr[is_ipsec_esp ? 2 : 3];
+   struct talitos_ptr *ckey_ptr = >desc.ptr[is_ipsec_esp ? 3 : 2];
 
-   if (edesc->desc.hdr & DESC_HDR_TYPE_IPSEC_ESP)
+   if (is_ipsec_esp)
unmap_single_talitos_ptr(dev, >desc.ptr[6],
 DMA_FROM_DEVICE);
-   unmap_single_talitos_ptr(dev, >desc.ptr[3], DMA_TO_DEVICE);
-   unmap_single_talitos_ptr(dev, >desc.ptr[2], DMA_TO_DEVICE);
+   unmap_single_talitos_ptr(dev, ckey_ptr, DMA_TO_DEVICE);
+   unmap_single_talitos_ptr(dev, civ_ptr, DMA_TO_DEVICE);
unmap_single_talitos_ptr(dev, >desc.ptr[0], DMA_TO_DEVICE);
 
talitos_sg_unmap(dev, edesc, areq->src, areq->dst, areq->cryptlen,
@@ -953,7 +956,7 @@ static void ipsec_esp_unmap(struct device *dev,
dma_unmap_single(dev, edesc->dma_link_tbl, edesc->dma_len,
 DMA_BIDIRECTIONAL);
 
-   if (!(edesc->desc.hdr & DESC_HDR_TYPE_IPSEC_ESP)) {
+   if (!is_ipsec_esp) {
unsigned int dst_nents = edesc->dst_nents ? : 1;
 
sg_pcopy_to_buffer(areq->dst, dst_nents, ctx->iv, ivsize,
@@ -1156,6 +1159,9 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct 
aead_request *areq,
bool sync_needed = false;
struct talitos_private *priv = dev_get_drvdata(dev);
bool is_sec1 = has_ftr_sec1(priv);
+   bool is_ipsec_esp = desc->hdr & DESC_HDR_TYPE_IPSEC_ESP;
+   struct talitos_ptr *civ_ptr = >ptr[is_ipsec_esp ? 2 : 3];
+   struct talitos_ptr *ckey_ptr = >ptr[is_ipsec_esp ? 3 : 2];
 
/* hmac key */
map_single_talitos_ptr(dev, >ptr[0], ctx->authkeylen, >key,
@@ -1180,20 +1186,12 @@ static int ipsec_esp(struct talitos_edesc *edesc, 
struct aead_request *areq,
}
 
/* cipher iv */
-   if (desc->hdr & DESC_HDR_TYPE_IPSEC_ESP)
-   to_talitos_ptr(>ptr[2], edesc->iv_dma, ivsize, is_sec1);
-   else
-   to_talitos_ptr(>ptr[3], edesc->iv_dma, ivsize, is_sec1);
+   to_talitos_ptr(civ_ptr, edesc->iv_dma, ivsize, is_sec1);
 
/* cipher key */
-   if (desc->hdr & DESC_HDR_TYPE_IPSEC_ESP)
-   map_single_talitos_ptr(dev, >ptr[3], ctx->enckeylen,
-  (char *)>key + ctx->authkeylen,
-  DMA_TO_DEVICE);
-   else
-   map_single_talitos_ptr(dev, >ptr[2], ctx->enckeylen,
-  (char *)>key + ctx->authkeylen,
-  DMA_TO_DEVICE);
+   map_single_talitos_ptr(dev, ckey_ptr, ctx->enckeylen,
+  (char *)>key + ctx->authkeylen,
+  DMA_TO_DEVICE);
 
/*
 * cipher in
@@ -1203,10 +1201,10 @@ static int ipsec_esp(struct talitos_edesc *edesc, 
struct aead_request *areq,
 */
sg_link_tbl_len = cryptlen;
 
-   if (desc->hdr & DESC_HDR_TYPE_IPSEC_ESP) {
+   if (is_ipsec_esp) {
to_talitos_ptr_ext_set(>ptr[4], authsize, is_sec1);
 
-   if (edesc->desc.hdr & DESC_HDR_MODE1_MDEU_CICV)
+   if (desc->hdr & DESC_HDR_MODE1_MDEU_CICV)
sg_link_tbl_len += authsize;
}
 
@@ -1228,7 +1226,7 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct 
aead_request *areq,
ret = talitos_sg_map(dev, areq->dst, cryptlen, edesc, >ptr[5],
 sg_count, areq->assoclen, tbl_off);
 
-   if (desc->hdr & DESC_HDR_TYPE_IPSEC_ESP)
+   if (is_ipsec_esp)
to_talitos_ptr_ext_or(>ptr[5], authsize, is_sec1);
 
/* ICV data */
@@ -1237,7 +1235,7 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct 
aead_request *areq,
edesc->icv_ool = true;
sync_needed = true;
 
-   if (desc->hdr & DESC_HDR_TYPE_IPSEC_ESP) {
+   if (is_ipsec_esp) {
struct talitos_ptr *tbl_ptr = >link_tbl[tbl_off];
int offset = (edesc->src_nents + edesc->dst_nents + 2) *
 sizeof(struct talitos_ptr) + authsize;
@@ -1260,7 +1258,7 @@ static int 

[PATCH 13/18] crypto: talitos - remove to_talitos_ptr_len()

2017-10-06 Thread Christophe Leroy
to_talitos_ptr() and to_talitos_ptr_len() are always called together
in order to fully set a ptr, so let's merge them into a single
helper.

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 56 ++--
 1 file changed, 21 insertions(+), 35 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index e7e1bada03df..7e96db75347a 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -56,28 +56,26 @@
 #include "talitos.h"
 
 static void to_talitos_ptr(struct talitos_ptr *ptr, dma_addr_t dma_addr,
-  bool is_sec1)
+  unsigned int len, bool is_sec1)
 {
ptr->ptr = cpu_to_be32(lower_32_bits(dma_addr));
-   if (!is_sec1)
+   if (is_sec1) {
+   ptr->len1 = cpu_to_be16(len);
+   } else {
+   ptr->len = cpu_to_be16(len);
ptr->eptr = upper_32_bits(dma_addr);
+   }
 }
 
 static void copy_talitos_ptr(struct talitos_ptr *dst_ptr,
 struct talitos_ptr *src_ptr, bool is_sec1)
 {
dst_ptr->ptr = src_ptr->ptr;
-   if (!is_sec1)
-   dst_ptr->eptr = src_ptr->eptr;
-}
-
-static void to_talitos_ptr_len(struct talitos_ptr *ptr, unsigned int len,
-  bool is_sec1)
-{
if (is_sec1) {
-   ptr->len1 = cpu_to_be16(len);
+   dst_ptr->len1 = src_ptr->len1;
} else {
-   ptr->len = cpu_to_be16(len);
+   dst_ptr->len = src_ptr->len;
+   dst_ptr->eptr = src_ptr->eptr;
}
 }
 
@@ -115,8 +113,7 @@ static void map_single_talitos_ptr(struct device *dev,
struct talitos_private *priv = dev_get_drvdata(dev);
bool is_sec1 = has_ftr_sec1(priv);
 
-   to_talitos_ptr_len(ptr, len, is_sec1);
-   to_talitos_ptr(ptr, dma_addr, is_sec1);
+   to_talitos_ptr(ptr, dma_addr, len, is_sec1);
 }
 
 /*
@@ -1090,8 +1087,7 @@ static int sg_to_link_tbl_offset(struct scatterlist *sg, 
int sg_count,
len = cryptlen;
 
to_talitos_ptr(link_tbl_ptr + count,
-  sg_dma_address(sg) + offset, 0);
-   to_talitos_ptr_len(link_tbl_ptr + count, len, 0);
+  sg_dma_address(sg) + offset, len, 0);
to_talitos_ptr_ext_set(link_tbl_ptr + count, 0, 0);
count++;
cryptlen -= len;
@@ -1117,14 +1113,12 @@ static int talitos_sg_map(struct device *dev, struct 
scatterlist *src,
struct talitos_private *priv = dev_get_drvdata(dev);
bool is_sec1 = has_ftr_sec1(priv);
 
-   to_talitos_ptr_len(ptr, len, is_sec1);
-
if (sg_count == 1) {
-   to_talitos_ptr(ptr, sg_dma_address(src) + offset, is_sec1);
+   to_talitos_ptr(ptr, sg_dma_address(src) + offset, len, is_sec1);
return sg_count;
}
if (is_sec1) {
-   to_talitos_ptr(ptr, edesc->dma_link_tbl + offset, is_sec1);
+   to_talitos_ptr(ptr, edesc->dma_link_tbl + offset, len, is_sec1);
return sg_count;
}
sg_count = sg_to_link_tbl_offset(src, sg_count, offset, len,
@@ -1135,7 +1129,7 @@ static int talitos_sg_map(struct device *dev, struct 
scatterlist *src,
return sg_count;
}
to_talitos_ptr(ptr, edesc->dma_link_tbl +
-   tbl_off * sizeof(struct talitos_ptr), is_sec1);
+   tbl_off * sizeof(struct talitos_ptr), len, is_sec1);
to_talitos_ptr_ext_or(ptr, DESC_PTR_LNKTBL_JUMP, is_sec1);
 
return sg_count;
@@ -1186,13 +1180,10 @@ static int ipsec_esp(struct talitos_edesc *edesc, 
struct aead_request *areq,
}
 
/* cipher iv */
-   if (desc->hdr & DESC_HDR_TYPE_IPSEC_ESP) {
-   to_talitos_ptr(>ptr[2], edesc->iv_dma, is_sec1);
-   to_talitos_ptr_len(>ptr[2], ivsize, is_sec1);
-   } else {
-   to_talitos_ptr(>ptr[3], edesc->iv_dma, is_sec1);
-   to_talitos_ptr_len(>ptr[3], ivsize, is_sec1);
-   }
+   if (desc->hdr & DESC_HDR_TYPE_IPSEC_ESP)
+   to_talitos_ptr(>ptr[2], edesc->iv_dma, ivsize, is_sec1);
+   else
+   to_talitos_ptr(>ptr[3], edesc->iv_dma, ivsize, is_sec1);
 
/* cipher key */
if (desc->hdr & DESC_HDR_TYPE_IPSEC_ESP)
@@ -1210,8 +1201,6 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct 
aead_request *areq,
 * extent is bytes of HMAC postpended to ciphertext,
 * typically 12 for ipsec
 */
-   to_talitos_ptr_len(>ptr[4], cryptlen, is_sec1);
-
sg_link_tbl_len = cryptlen;
 
if (desc->hdr & DESC_HDR_TYPE_IPSEC_ESP) {
@@ -1257,11 +1246,10 @@ static int ipsec_esp(struct talitos_edesc *edesc, 
struct aead_request *areq,
to_talitos_ptr_ext_set(tbl_ptr - 1, 0, is_sec1);
 

[PATCH 12/18] crypto: talitos - don't check the number of channels at each interrupt

2017-10-06 Thread Christophe Leroy
The number of channels is known from the beginning, no need to
test it every time.
This patch defines two additional done functions handling only channel 0.
Then the probe registers the correct one based on the number of channels.

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 27 +++
 drivers/crypto/talitos.h |  4 
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 83b2a70a1ba7..e7e1bada03df 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -390,8 +390,6 @@ static void talitos1_done_##name(unsigned long data)
\
\
if (ch_done_mask & 0x1000)  \
flush_channel(dev, 0, 0, 0);\
-   if (priv->num_channels == 1)\
-   goto out;   \
if (ch_done_mask & 0x4000)  \
flush_channel(dev, 1, 0, 0);\
if (ch_done_mask & 0x0001)  \
@@ -399,7 +397,6 @@ static void talitos1_done_##name(unsigned long data)
\
if (ch_done_mask & 0x0004)  \
flush_channel(dev, 3, 0, 0);\
\
-out:   \
/* At this point, all completed channels have been processed */ \
/* Unmask done interrupts for channels completed later on. */   \
spin_lock_irqsave(>reg_lock, flags);  \
@@ -409,6 +406,7 @@ out:
\
 }
 
 DEF_TALITOS1_DONE(4ch, TALITOS1_ISR_4CHDONE)
+DEF_TALITOS1_DONE(ch0, TALITOS1_ISR_CH_0_DONE)
 
 #define DEF_TALITOS2_DONE(name, ch_done_mask)  \
 static void talitos2_done_##name(unsigned long data)   \
@@ -419,8 +417,6 @@ static void talitos2_done_##name(unsigned long data)
\
\
if (ch_done_mask & 1)   \
flush_channel(dev, 0, 0, 0);\
-   if (priv->num_channels == 1)\
-   goto out;   \
if (ch_done_mask & (1 << 2))\
flush_channel(dev, 1, 0, 0);\
if (ch_done_mask & (1 << 4))\
@@ -428,7 +424,6 @@ static void talitos2_done_##name(unsigned long data)
\
if (ch_done_mask & (1 << 6))\
flush_channel(dev, 3, 0, 0);\
\
-out:   \
/* At this point, all completed channels have been processed */ \
/* Unmask done interrupts for channels completed later on. */   \
spin_lock_irqsave(>reg_lock, flags);  \
@@ -438,6 +433,7 @@ out:
\
 }
 
 DEF_TALITOS2_DONE(4ch, TALITOS2_ISR_4CHDONE)
+DEF_TALITOS2_DONE(ch0, TALITOS2_ISR_CH_0_DONE)
 DEF_TALITOS2_DONE(ch0_2, TALITOS2_ISR_CH_0_2_DONE)
 DEF_TALITOS2_DONE(ch1_3, TALITOS2_ISR_CH_1_3_DONE)
 
@@ -3237,17 +3233,24 @@ static int talitos_probe(struct platform_device *ofdev)
goto err_out;
 
if (of_device_is_compatible(np, "fsl,sec1.0")) {
-   tasklet_init(>done_task[0], talitos1_done_4ch,
-(unsigned long)dev);
-   } else {
-   if (!priv->irq[1]) {
-   tasklet_init(>done_task[0], talitos2_done_4ch,
+   if (priv->num_channels == 1)
+   tasklet_init(>done_task[0], talitos1_done_ch0,
 (unsigned long)dev);
-   } else {
+   else
+   tasklet_init(>done_task[0], talitos1_done_4ch,
+(unsigned long)dev);
+   } else {
+   if (priv->irq[1]) {
tasklet_init(>done_task[0], talitos2_done_ch0_2,
 (unsigned long)dev);
tasklet_init(>done_task[1], talitos2_done_ch1_3,
 (unsigned long)dev);
+   } else if (priv->num_channels == 1) {
+   

[PATCH 11/18] crypto: talitos - use devm_ioremap()

2017-10-06 Thread Christophe Leroy
Use devm_ioremap()

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index f139a0cef2e2..83b2a70a1ba7 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -3008,8 +3008,6 @@ static int talitos_remove(struct platform_device *ofdev)
if (priv->irq[1])
tasklet_kill(>done_task[1]);
 
-   iounmap(priv->reg);
-
return 0;
 }
 
@@ -3160,6 +3158,7 @@ static int talitos_probe(struct platform_device *ofdev)
struct talitos_private *priv;
int i, err;
int stride;
+   struct resource *res;
 
priv = devm_kzalloc(dev, sizeof(struct talitos_private), GFP_KERNEL);
if (!priv)
@@ -3173,7 +3172,10 @@ static int talitos_probe(struct platform_device *ofdev)
 
spin_lock_init(>reg_lock);
 
-   priv->reg = of_iomap(np, 0);
+   res = platform_get_resource(ofdev, IORESOURCE_MEM, 0);
+   if (!res)
+   return -ENXIO;
+   priv->reg = devm_ioremap(dev, res->start, resource_size(res));
if (!priv->reg) {
dev_err(dev, "failed to of_iomap\n");
err = -ENOMEM;
-- 
2.13.3



[PATCH 10/18] crypto: talitos - use of_property_read_u32()

2017-10-06 Thread Christophe Leroy
Use of_property_read_u32() to simplify DT read

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 21 +
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 2a53d0f2a869..f139a0cef2e2 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -3158,7 +3158,6 @@ static int talitos_probe(struct platform_device *ofdev)
struct device *dev = >dev;
struct device_node *np = ofdev->dev.of_node;
struct talitos_private *priv;
-   const unsigned int *prop;
int i, err;
int stride;
 
@@ -3182,21 +3181,11 @@ static int talitos_probe(struct platform_device *ofdev)
}
 
/* get SEC version capabilities from device tree */
-   prop = of_get_property(np, "fsl,num-channels", NULL);
-   if (prop)
-   priv->num_channels = *prop;
-
-   prop = of_get_property(np, "fsl,channel-fifo-len", NULL);
-   if (prop)
-   priv->chfifo_len = *prop;
-
-   prop = of_get_property(np, "fsl,exec-units-mask", NULL);
-   if (prop)
-   priv->exec_units = *prop;
-
-   prop = of_get_property(np, "fsl,descriptor-types-mask", NULL);
-   if (prop)
-   priv->desc_types = *prop;
+   of_property_read_u32(np, "fsl,num-channels", >num_channels);
+   of_property_read_u32(np, "fsl,channel-fifo-len", >chfifo_len);
+   of_property_read_u32(np, "fsl,exec-units-mask", >exec_units);
+   of_property_read_u32(np, "fsl,descriptor-types-mask",
+>desc_types);
 
if (!is_power_of_2(priv->num_channels) || !priv->chfifo_len ||
!priv->exec_units || !priv->desc_types) {
-- 
2.13.3



[PATCH 09/18] crypto: talitos - use devm_kmalloc()

2017-10-06 Thread Christophe Leroy
Replace kmalloc() by devm_kmalloc()

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 30 --
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index dd6b1fc90020..2a53d0f2a869 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -2993,17 +2993,11 @@ static int talitos_remove(struct platform_device *ofdev)
break;
}
list_del(_alg->entry);
-   kfree(t_alg);
}
 
if (hw_supports(dev, DESC_HDR_SEL0_RNG))
talitos_unregister_rng(dev);
 
-   for (i = 0; priv->chan && i < priv->num_channels; i++)
-   kfree(priv->chan[i].fifo);
-
-   kfree(priv->chan);
-
for (i = 0; i < 2; i++)
if (priv->irq[i]) {
free_irq(priv->irq[i], dev);
@@ -3016,8 +3010,6 @@ static int talitos_remove(struct platform_device *ofdev)
 
iounmap(priv->reg);
 
-   kfree(priv);
-
return 0;
 }
 
@@ -3029,7 +3021,8 @@ static struct talitos_crypto_alg 
*talitos_alg_alloc(struct device *dev,
struct talitos_crypto_alg *t_alg;
struct crypto_alg *alg;
 
-   t_alg = kzalloc(sizeof(struct talitos_crypto_alg), GFP_KERNEL);
+   t_alg = devm_kzalloc(dev, sizeof(struct talitos_crypto_alg),
+GFP_KERNEL);
if (!t_alg)
return ERR_PTR(-ENOMEM);
 
@@ -3053,7 +3046,7 @@ static struct talitos_crypto_alg 
*talitos_alg_alloc(struct device *dev,
t_alg->algt.alg.aead.decrypt = aead_decrypt;
if (!(priv->features & TALITOS_FTR_SHA224_HWINIT) &&
!strncmp(alg->cra_name, "authenc(hmac(sha224)", 20)) {
-   kfree(t_alg);
+   devm_kfree(dev, t_alg);
return ERR_PTR(-ENOTSUPP);
}
break;
@@ -3073,7 +3066,7 @@ static struct talitos_crypto_alg 
*talitos_alg_alloc(struct device *dev,
 
if (!(priv->features & TALITOS_FTR_HMAC_OK) &&
!strncmp(alg->cra_name, "hmac", 4)) {
-   kfree(t_alg);
+   devm_kfree(dev, t_alg);
return ERR_PTR(-ENOTSUPP);
}
if (!(priv->features & TALITOS_FTR_SHA224_HWINIT) &&
@@ -3088,7 +3081,7 @@ static struct talitos_crypto_alg 
*talitos_alg_alloc(struct device *dev,
break;
default:
dev_err(dev, "unknown algorithm type %d\n", t_alg->algt.type);
-   kfree(t_alg);
+   devm_kfree(dev, t_alg);
return ERR_PTR(-EINVAL);
}
 
@@ -3169,7 +3162,7 @@ static int talitos_probe(struct platform_device *ofdev)
int i, err;
int stride;
 
-   priv = kzalloc(sizeof(struct talitos_private), GFP_KERNEL);
+   priv = devm_kzalloc(dev, sizeof(struct talitos_private), GFP_KERNEL);
if (!priv)
return -ENOMEM;
 
@@ -3267,8 +3260,8 @@ static int talitos_probe(struct platform_device *ofdev)
}
}
 
-   priv->chan = kzalloc(sizeof(struct talitos_channel) *
-priv->num_channels, GFP_KERNEL);
+   priv->chan = devm_kzalloc(dev, sizeof(struct talitos_channel) *
+  priv->num_channels, GFP_KERNEL);
if (!priv->chan) {
dev_err(dev, "failed to allocate channel management space\n");
err = -ENOMEM;
@@ -3285,8 +3278,9 @@ static int talitos_probe(struct platform_device *ofdev)
spin_lock_init(>chan[i].head_lock);
spin_lock_init(>chan[i].tail_lock);
 
-   priv->chan[i].fifo = kzalloc(sizeof(struct talitos_request) *
-priv->fifo_len, GFP_KERNEL);
+   priv->chan[i].fifo = devm_kzalloc(dev,
+   sizeof(struct talitos_request) *
+   priv->fifo_len, GFP_KERNEL);
if (!priv->chan[i].fifo) {
dev_err(dev, "failed to allocate request fifo %d\n", i);
err = -ENOMEM;
@@ -3352,7 +3346,7 @@ static int talitos_probe(struct platform_device *ofdev)
if (err) {
dev_err(dev, "%s alg registration failed\n",
alg->cra_driver_name);
-   kfree(t_alg);
+   devm_kfree(dev, t_alg);
} else
list_add_tail(_alg->entry, >alg_list);
}
-- 
2.13.3



[PATCH 08/18] crypto: talitos - declare local functions static

2017-10-06 Thread Christophe Leroy
talitos_handle_buggy_hash() and talitos_sg_map() are only used
locally, make them static

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 266e7e626e12..dd6b1fc90020 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -1113,7 +1113,7 @@ static int sg_to_link_tbl_offset(struct scatterlist *sg, 
int sg_count,
return count;
 }
 
-int talitos_sg_map(struct device *dev, struct scatterlist *src,
+static int talitos_sg_map(struct device *dev, struct scatterlist *src,
   unsigned int len, struct talitos_edesc *edesc,
   struct talitos_ptr *ptr,
   int sg_count, unsigned int offset, int tbl_off)
@@ -1721,7 +1721,7 @@ static void ahash_done(struct device *dev,
  * SEC1 doesn't like hashing of 0 sized message, so we do the padding
  * ourself and submit a padded block
  */
-void talitos_handle_buggy_hash(struct talitos_ctx *ctx,
+static void talitos_handle_buggy_hash(struct talitos_ctx *ctx,
   struct talitos_edesc *edesc,
   struct talitos_ptr *ptr)
 {
-- 
2.13.3



[PATCH 07/18] crypto: talitos - zeroize the descriptor with memset()

2017-10-06 Thread Christophe Leroy
This patch zeroizes the descriptor at allocation using memset().
This has two advantages:
- It reduces the number of places where data has to be set to 0
- It avoids reading memory and loading the cache with data that
will be entirely replaced.

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 19 +--
 drivers/crypto/talitos.h |  2 --
 2 files changed, 1 insertion(+), 20 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index a19b5d0300a9..266e7e626e12 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -75,7 +75,6 @@ static void to_talitos_ptr_len(struct talitos_ptr *ptr, 
unsigned int len,
   bool is_sec1)
 {
if (is_sec1) {
-   ptr->res = 0;
ptr->len1 = cpu_to_be16(len);
} else {
ptr->len = cpu_to_be16(len);
@@ -118,7 +117,6 @@ static void map_single_talitos_ptr(struct device *dev,
 
to_talitos_ptr_len(ptr, len, is_sec1);
to_talitos_ptr(ptr, dma_addr, is_sec1);
-   to_talitos_ptr_ext_set(ptr, 0, is_sec1);
 }
 
 /*
@@ -287,7 +285,6 @@ int talitos_submit(struct device *dev, int ch, struct 
talitos_desc *desc,
/* map descriptor and save caller data */
if (is_sec1) {
desc->hdr1 = desc->hdr;
-   desc->next_desc = 0;
request->dma_desc = dma_map_single(dev, >hdr1,
   TALITOS_DESC_SIZE,
   DMA_BIDIRECTIONAL);
@@ -1125,7 +1122,6 @@ int talitos_sg_map(struct device *dev, struct scatterlist 
*src,
bool is_sec1 = has_ftr_sec1(priv);
 
to_talitos_ptr_len(ptr, len, is_sec1);
-   to_talitos_ptr_ext_set(ptr, 0, is_sec1);
 
if (sg_count == 1) {
to_talitos_ptr(ptr, sg_dma_address(src) + offset, is_sec1);
@@ -1197,11 +1193,9 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct 
aead_request *areq,
if (desc->hdr & DESC_HDR_TYPE_IPSEC_ESP) {
to_talitos_ptr(>ptr[2], edesc->iv_dma, is_sec1);
to_talitos_ptr_len(>ptr[2], ivsize, is_sec1);
-   to_talitos_ptr_ext_set(>ptr[2], 0, is_sec1);
} else {
to_talitos_ptr(>ptr[3], edesc->iv_dma, is_sec1);
to_talitos_ptr_len(>ptr[3], ivsize, is_sec1);
-   to_talitos_ptr_ext_set(>ptr[3], 0, is_sec1);
}
 
/* cipher key */
@@ -1221,7 +1215,6 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct 
aead_request *areq,
 * typically 12 for ipsec
 */
to_talitos_ptr_len(>ptr[4], cryptlen, is_sec1);
-   to_talitos_ptr_ext_set(>ptr[4], 0, is_sec1);
 
sg_link_tbl_len = cryptlen;
 
@@ -1406,6 +1399,7 @@ static struct talitos_edesc *talitos_edesc_alloc(struct 
device *dev,
err = ERR_PTR(-ENOMEM);
goto error_sg;
}
+   memset(>desc, 0, sizeof(edesc->desc));
 
edesc->src_nents = src_nents;
edesc->dst_nents = dst_nents;
@@ -1481,7 +1475,6 @@ static int aead_decrypt(struct aead_request *req)
  DESC_HDR_MODE1_MDEU_CICV;
 
/* reset integrity check result bits */
-   edesc->desc.hdr_lo = 0;
 
return ipsec_esp(edesc, req, ipsec_esp_decrypt_hwauth_done);
}
@@ -1576,12 +1569,10 @@ static int common_nonsnoop(struct talitos_edesc *edesc,
bool is_sec1 = has_ftr_sec1(priv);
 
/* first DWORD empty */
-   desc->ptr[0] = zero_entry;
 
/* cipher iv */
to_talitos_ptr(>ptr[1], edesc->iv_dma, is_sec1);
to_talitos_ptr_len(>ptr[1], ivsize, is_sec1);
-   to_talitos_ptr_ext_set(>ptr[1], 0, is_sec1);
 
/* cipher key */
map_single_talitos_ptr(dev, >ptr[2], ctx->keylen,
@@ -1620,7 +1611,6 @@ static int common_nonsnoop(struct talitos_edesc *edesc,
   DMA_FROM_DEVICE);
 
/* last DWORD empty */
-   desc->ptr[6] = zero_entry;
 
if (sync_needed)
dma_sync_single_for_device(dev, edesc->dma_link_tbl,
@@ -1766,7 +1756,6 @@ static int common_nonsnoop_hash(struct talitos_edesc 
*edesc,
int sg_count;
 
/* first DWORD empty */
-   desc->ptr[0] = zero_entry;
 
/* hash context in */
if (!req_ctx->first || req_ctx->swinit) {
@@ -1775,8 +1764,6 @@ static int common_nonsnoop_hash(struct talitos_edesc 
*edesc,
   (char *)req_ctx->hw_context,
   DMA_TO_DEVICE);
req_ctx->swinit = 0;
-   } else {
-   desc->ptr[1] = zero_entry;
}
/* Indicate next op is not the first. */
req_ctx->first = 0;
@@ -1785,8 +1772,6 @@ static int common_nonsnoop_hash(struct talitos_edesc 
*edesc,
if (ctx->keylen)
map_single_talitos_ptr(dev, >ptr[2], 

[PATCH 06/18] crypto: talitos - fix ctr-aes-talitos

2017-10-06 Thread Christophe Leroy
ctr-aes-talitos test fails as follows on SEC2

[0.837427] alg: skcipher: Test 1 failed (invalid result) on encryption for 
ctr-aes-talitos
[0.845763] : 16 36 d5 ee 34 f8 06 25 d7 7f 8e 56 ca 88 43 45
[0.852345] 0010: f9 3f f7 17 2a b2 12 23 30 43 09 15 82 dd e1 97
[0.858940] 0020: a7 f7 32 b5 eb 25 06 13 9a ec f5 29 25 f8 4d 66
[0.865366] 0030: b0 03 5b 8e aa 9a 42 b6 19 33 8a e2 9d 65 96 95

This patch fixes the descriptor type which is special for CTR AES

Fixes: 5e75ae1b3cef6 ("crypto: talitos - add new crypto modes")
Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index cf5c9701b898..a19b5d0300a9 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -2635,7 +2635,7 @@ static struct talitos_alg_template driver_algs[] = {
.ivsize = AES_BLOCK_SIZE,
}
},
-   .desc_hdr_template = DESC_HDR_TYPE_COMMON_NONSNOOP_NO_AFEU |
+   .desc_hdr_template = DESC_HDR_TYPE_AESU_CTR_NONSNOOP |
 DESC_HDR_SEL0_AESU |
 DESC_HDR_MODE0_AESU_CTR,
},
-- 
2.13.3



[PATCH 05/18] crypto: talitos - fix use of sg_link_tbl_len

2017-10-06 Thread Christophe Leroy
sg_link_tbl_len shall be used instead of cryptlen, otherwise
SECs which perform HW CICV verification will fail.

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index b7184f305867..cf5c9701b898 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -1232,8 +1232,8 @@ static int ipsec_esp(struct talitos_edesc *edesc, struct 
aead_request *areq,
sg_link_tbl_len += authsize;
}
 
-   ret = talitos_sg_map(dev, areq->src, cryptlen, edesc, >ptr[4],
-sg_count, areq->assoclen, tbl_off);
+   ret = talitos_sg_map(dev, areq->src, sg_link_tbl_len, edesc,
+>ptr[4], sg_count, areq->assoclen, tbl_off);
 
if (ret > 1) {
tbl_off += ret;
-- 
2.13.3



[PATCH 04/18] crypto: talitos - fix AEAD for sha224 on non sha224 capable chips

2017-10-06 Thread Christophe Leroy
sha224 AEAD test fails with:

[2.803125] talitos ff02.crypto: DEUISR 0x_
[2.808743] talitos ff02.crypto: MDEUISR 0x8010_
[2.814678] talitos ff02.crypto: DESCBUF 0x20731f21_0018
[2.820616] talitos ff02.crypto: DESCBUF 0x0628d64c_0010
[2.826554] talitos ff02.crypto: DESCBUF 0x0631005c_0018
[2.832492] talitos ff02.crypto: DESCBUF 0x0628d664_0008
[2.838430] talitos ff02.crypto: DESCBUF 0x061b13a0_0080
[2.844369] talitos ff02.crypto: DESCBUF 0x0631006c_0080
[2.850307] talitos ff02.crypto: DESCBUF 0x0631006c_0018
[2.856245] talitos ff02.crypto: DESCBUF 0x063100ec_
[2.884972] talitos ff02.crypto: failed to reset channel 0
[2.890503] talitos ff02.crypto: done overflow, internal time out, or 
rngu error: ISR 0x2000_0002
[2.900652] alg: aead: encryption failed on test 1 for 
authenc-hmac-sha224-cbc-3des-talitos: ret=22

This is due to SHA224 not being supported by the HW. Although for
hash we are able to init the hash context by SW, it is not
possible for AEAD. Therefore SHA224 AEAD has to be deactivated.

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 8aa1212086f4..b7184f305867 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -3068,6 +3068,11 @@ static struct talitos_crypto_alg 
*talitos_alg_alloc(struct device *dev,
t_alg->algt.alg.aead.setkey = aead_setkey;
t_alg->algt.alg.aead.encrypt = aead_encrypt;
t_alg->algt.alg.aead.decrypt = aead_decrypt;
+   if (!(priv->features & TALITOS_FTR_SHA224_HWINIT) &&
+   !strncmp(alg->cra_name, "authenc(hmac(sha224)", 20)) {
+   kfree(t_alg);
+   return ERR_PTR(-ENOTSUPP);
+   }
break;
case CRYPTO_ALG_TYPE_AHASH:
alg = _alg->algt.alg.hash.halg.base;
-- 
2.13.3



[PATCH 03/18] crypto: talitos - fix setkey to check key weakness

2017-10-06 Thread Christophe Leroy
Crypto manager tests report the following failures:
[3.061081] alg: skcipher: setkey failed on test 5 for ecb-des-talitos: 
flags=100
[3.069342] alg: skcipher-ddst: setkey failed on test 5 for ecb-des-talitos: 
flags=100
[3.077754] alg: skcipher-ddst: setkey failed on test 5 for ecb-des-talitos: 
flags=100

This is due to setkey being expected to detect weak keys.

Signed-off-by: Christophe Leroy 
---
 drivers/crypto/talitos.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 1e799886c57d..8aa1212086f4 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -1507,12 +1507,20 @@ static int ablkcipher_setkey(struct crypto_ablkcipher 
*cipher,
 const u8 *key, unsigned int keylen)
 {
struct talitos_ctx *ctx = crypto_ablkcipher_ctx(cipher);
+   u32 tmp[DES_EXPKEY_WORDS];
 
if (keylen > TALITOS_MAX_KEY_SIZE) {
crypto_ablkcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
 
+   if (unlikely(crypto_ablkcipher_get_flags(cipher) &
+CRYPTO_TFM_REQ_WEAK_KEY) &&
+   !des_ekey(tmp, key)) {
+   crypto_ablkcipher_set_flags(cipher, CRYPTO_TFM_RES_WEAK_KEY);
+   return -EINVAL;
+   }
+
memcpy(>key, key, keylen);
ctx->keylen = keylen;
 
-- 
2.13.3



[PATCH 02/18] crypto: talitos - fix memory corruption on SEC2

2017-10-06 Thread Christophe Leroy
On SEC2, when using the old descriptors type (hmac snoop no afeu)
for doing IPsec, the CICV out pointer points out of the allocated
memory.

[2.502554] 
=
[2.510740] BUG dma-kmalloc-256 (Not tainted): Redzone overwritten
[2.516907] 
-
[2.516907]
[2.526535] Disabling lock debugging due to kernel taint
[2.531845] INFO: 0xde858108-0xde85810b. First byte 0xf8 instead of 0xcc
[2.538549] INFO: Allocated in 0x806181a9 age=0 cpu=0 pid=58
[2.544229]  __kmalloc+0x374/0x564
[2.547649]  talitos_edesc_alloc+0x17c/0x48c
[2.551929]  aead_edesc_alloc+0x80/0x154
[2.555863]  aead_encrypt+0x30/0xe0
[2.559368]  __test_aead+0x5a0/0x1f3c
[2.563042]  test_aead+0x2c/0x110
[2.566371]  alg_test_aead+0x5c/0xf4
[2.569958]  alg_test+0x1dc/0x5a0
[2.573305]  cryptomgr_test+0x50/0x70
[2.576984]  kthread+0xd8/0x134
[2.580155]  ret_from_kernel_thread+0x5c/0x64
[2.584534] INFO: Freed in ipsec_esp_encrypt_done+0x130/0x240 age=6 cpu=0 
pid=0
[2.591839]  ipsec_esp_encrypt_done+0x130/0x240
[2.596395]  flush_channel+0x1dc/0x488
[2.600161]  talitos2_done_4ch+0x30/0x200
[2.604185]  tasklet_action+0xa0/0x13c
[2.607948]  __do_softirq+0x148/0x6cc
[2.611623]  irq_exit+0xc0/0x124
[2.614869]  call_do_irq+0x24/0x3c
[2.618292]  do_IRQ+0x78/0x108
[2.621369]  ret_from_except+0x0/0x14
[2.625055]  finish_task_switch+0x58/0x350
[2.629165]  schedule+0x80/0x134
[2.632409]  schedule_preempt_disabled+0x38/0xc8
[2.637042]  cpu_startup_entry+0xe4/0x190
[2.641074]  start_kernel+0x3f4/0x408
[2.644741]  0x3438
[2.646857] INFO: Slab 0xdffbdb00 objects=9 used=1 fp=0xde8581c0 flags=0x0080
[2.653978] INFO: Object 0xde858008 @offset=8 fp=0xca4395df
[2.653978]
[2.661032] Redzone de858000: cc cc cc cc cc cc cc cc
  
[2.669029] Object de858008: 00 00 00 02 00 00 00 02 00 6b 6b 6b 1e 83 ea 28 
 .kkk...(
[2.677628] Object de858018: 00 00 00 70 1e 85 80 64 ff 73 1d 21 6b 6b 6b 6b 
 ...p...d.s.!
[2.686228] Object de858028: 00 20 00 00 1e 84 17 24 00 10 00 00 1e 85 70 00 
 . .$..p.
[2.694829] Object de858038: 00 18 00 00 1e 84 17 44 00 08 00 00 1e 83 ea 28 
 ...D...(
[2.703430] Object de858048: 00 80 00 00 1e 84 f0 00 00 80 00 00 1e 85 70 10 
 ..p.
[2.712030] Object de858058: 00 20 6b 00 1e 85 80 f4 6b 6b 6b 6b 00 80 02 00 
 . k.
[2.720629] Object de858068: 1e 84 f0 00 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 
 
[2.729230] Object de858078: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 
 
[2.737830] Object de858088: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 
 
[2.746429] Object de858098: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 
 
[2.755029] Object de8580a8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 
 
[2.763628] Object de8580b8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 
 
[2.772229] Object de8580c8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 
 
[2.780829] Object de8580d8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 
 
[2.789430] Object de8580e8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 73 b0 ea 9f 
 s...
[2.798030] Object de8580f8: e8 18 80 d6 56 38 44 c0 db e3 4f 71 f7 ce d1 d3 
 V8D...Oq
[2.806629] Redzone de858108: f8 bd 3e 4f
  ..>O
[2.814279] Padding de8581b0: 5a 5a 5a 5a 5a 5a 5a 5a
  
[2.822283] CPU: 0 PID: 0 Comm: swapper Tainted: GB   
4.9.50-g995be12679 #179
[2.831819] Call Trace:
[2.834301] [dffefd20] [c01aa9a8] check_bytes_and_report+0x100/0x194 
(unreliable)
[2.841801] [dffefd50] [c01aac3c] check_object+0x200/0x530
[2.847306] [dffefd80] [c01ae584] free_debug_processing+0x290/0x690
[2.853585] [dffefde0] [c01aec8c] __slab_free+0x308/0x628
[2.859000] [dffefe80] [c05057f4] ipsec_esp_encrypt_done+0x130/0x240
[2.865378] [dffefeb0] [c05002c4] flush_channel+0x1dc/0x488
[2.870968] [dffeff10] [c05007a8] talitos2_done_4ch+0x30/0x200
[2.876814] [dffeff30] [c002fe38] tasklet_action+0xa0/0x13c
[2.882399] [dffeff60] [c002f118] __do_softirq+0x148/0x6cc
[2.887896] [dffeffd0] [c002f954] irq_exit+0xc0/0x124
[2.892968] [dffefff0] [c0013adc] call_do_irq+0x24/0x3c
[2.898213] [c0d4be00] [c000757c] do_IRQ+0x78/0x108
[2.903113] [c0d4be30] [c0015c08] ret_from_except+0x0/0x14
[2.908634] --- interrupt: 501 at finish_task_switch+0x70/0x350
[2.908634] LR = finish_task_switch+0x58/0x350
[2.919327] [c0d4bf20] [c085e1d4] schedule+0x80/0x134
[2.924398] [c0d4bf50] [c085e2c0] schedule_preempt_disabled+0x38/0xc8
[

[PATCH 01/18] crypto: talitos - fix AEAD test failures

2017-10-06 Thread Christophe Leroy
AEAD tests fail when destination SG list has more than 1 element.

[2.058752] alg: aead: Test 1 failed on encryption for 
authenc-hmac-sha1-cbc-aes-talitos
[2.066965] : 53 69 6e 67 6c 65 20 62 6c 6f 63 6b 20 6d 73 67
0010: c0 43 ff 74 c0 43 ff e0 de 83 d1 20 de 84 8e 54
0020: de 83 d7 c4
[2.082138] alg: aead: Test 1 failed on encryption for 
authenc-hmac-sha1-cbc-aes-talitos
[2.090435] : 53 69 6e 67 6c 65 20 62 6c 6f 63 6b 20 6d 73 67
0010: de 84 ea 58 c0 93 1a 24 de 84 e8 59 de 84 f1 20
0020: 00 00 00 00
[2.105721] alg: aead: Test 1 failed on encryption for 
authenc-hmac-sha1-cbc-3des-talitos
[2.114259] : 6f 54 20 6f 61 4d 79 6e 53 20 63 65 65 72 73 74
0010: 54 20 6f 6f 4d 20 6e 61 20 79 65 53 72 63 74 65
0020: 20 73 6f 54 20 6f 61 4d 79 6e 53 20 63 65 65 72
0030: 73 74 54 20 6f 6f 4d 20 6e 61 20 79 65 53 72 63
0040: 74 65 20 73 6f 54 20 6f 61 4d 79 6e 53 20 63 65
0050: 65 72 73 74 54 20 6f 6f 4d 20 6e 61 20 79 65 53
0060: 72 63 74 65 20 73 6f 54 20 6f 61 4d 79 6e 53 20
0070: 63 65 65 72 73 74 54 20 6f 6f 4d 20 6e 61 0a 79
0080: c0 50 f1 ac c0 50 f3 38 c0 50 f3 94 c0 50 f5 30
0090: c0 99 74 3c
[2.166410] alg: aead: Test 1 failed on encryption for 
authenc-hmac-sha1-cbc-3des-talitos
[2.174794] : 6f 54 20 6f 61 4d 79 6e 53 20 63 65 65 72 73 74
0010: 54 20 6f 6f 4d 20 6e 61 20 79 65 53 72 63 74 65
0020: 20 73 6f 54 20 6f 61 4d 79 6e 53 20 63 65 65 72
0030: 73 74 54 20 6f 6f 4d 20 6e 61 20 79 65 53 72 63
0040: 74 65 20 73 6f 54 20 6f 61 4d 79 6e 53 20 63 65
0050: 65 72 73 74 54 20 6f 6f 4d 20 6e 61 20 79 65 53
0060: 72 63 74 65 20 73 6f 54 20 6f 61 4d 79 6e 53 20
0070: 63 65 65 72 73 74 54 20 6f 6f 4d 20 6e 61 0a 79
0080: c0 50 f1 ac c0 50 f3 38 c0 50 f3 94 c0 50 f5 30
0090: c0 99 74 3c
[2.226486] alg: No test for authenc(hmac(sha224),cbc(aes)) 
(authenc-hmac-sha224-cbc-aes-talitos)
[2.236459] alg: No test for authenc(hmac(sha224),cbc(aes)) 
(authenc-hmac-sha224-cbc-aes-talitos)
[2.247196] alg: aead: Test 1 failed on encryption for 
authenc-hmac-sha224-cbc-3des-talitos
[2.25] : 6f 54 20 6f 61 4d 79 6e 53 20 63 65 65 72 73 74
0010: 54 20 6f 6f 4d 20 6e 61 20 79 65 53 72 63 74 65
0020: 20 73 6f 54 20 6f 61 4d 79 6e 53 20 63 65 65 72
0030: 73 74 54 20 6f 6f 4d 20 6e 61 20 79 65 53 72 63
0040: 74 65 20 73 6f 54 20 6f 61 4d 79 6e 53 20 63 65
0050: 65 72 73 74 54 20 6f 6f 4d 20 6e 61 20 79 65 53
0060: 72 63 74 65 20 73 6f 54 20 6f 61 4d 79 6e 53 20
0070: 63 65 65 72 73 74 54 20 6f 6f 4d 20 6e 61 0a 79
0080: c0 50 f1 ac c0 50 f3 38 c0 50 f3 94 c0 50 f5 30
0090: c0 99 74 3c c0 96 e5 b8
[2.309004] alg: aead: Test 1 failed on encryption for 
authenc-hmac-sha224-cbc-3des-talitos
[2.317562] : 6f 54 20 6f 61 4d 79 6e 53 20 63 65 65 72 73 74
0010: 54 20 6f 6f 4d 20 6e 61 20 79 65 53 72 63 74 65
0020: 20 73 6f 54 20 6f 61 4d 79 6e 53 20 63 65 65 72
0030: 73 74 54 20 6f 6f 4d 20 6e 61 20 79 65 53 72 63
0040: 74 65 20 73 6f 54 20 6f 61 4d 79 6e 53 20 63 65
0050: 65 72 73 74 54 20 6f 6f 4d 20 6e 61 20 79 65 53
0060: 72 63 74 65 20 73 6f 54 20 6f 61 4d 79 6e 53 20
0070: 63 65 65 72 73 74 54 20 6f 6f 4d 20 6e 61 0a 79
0080: c0 50 f1 ac c0 50 f3 38 c0 50 f3 94 c0 50 f5 30
0090: c0 99 74 3c c0 96 e5 b8
[2.370710] alg: aead: Test 1 failed on encryption for 
authenc-hmac-sha256-cbc-aes-talitos
[2.379177] : 53 69 6e 67 6c 65 20 62 6c 6f 63 6b 20 6d 73 67
0010: 54 20 6f 6f 4d 20 6e 61 20 79 65 53 72 63 74 65
0020: 20 73 6f 54 20 6f 61 4d 79 6e 53 20 63 65 65 72
[2.397863] alg: aead: Test 1 failed on encryption for 
authenc-hmac-sha256-cbc-aes-talitos
[2.406134] : 53 69 6e 67 6c 65 20 62 6c 6f 63 6b 20 6d 73 67
0010: 54 20 6f 6f 4d 20 6e 61 20 79 65 53 72 63 74 65
0020: 20 73 6f 54 20 6f 61 4d 79 6e 53 20 63 65 65 72
[2.424789] alg: aead: Test 1 failed on encryption for 
authenc-hmac-sha256-cbc-3des-talitos
[2.433491] : 6f 54 20 6f 61 4d 79 6e 53 20 63 65 65 72 73 74
0010: 54 20 6f 6f 4d 20 6e 61 20 79 65 53 72 63 74 65
0020: 20 73 6f 54 20 6f 61 4d 79 6e 53 20 63 65 65 72
0030: 73 74 54 20 6f 6f 4d 20 6e 61 20 79 65 53 72 63
0040: 74 65 20 73 6f 54 20 6f 61 4d 79 6e 53 20 63 65
0050: 65 72 73 74 54 20 6f 6f 4d 20 6e 61 20 79 65 53
0060: 72 63 74 65 20 73 6f 54 20 6f 61 4d 79 6e 53 20
0070: 63 65 65 72 73 74 54 20 6f 6f 4d 20 6e 61 0a 79
0080: c0 50 f1 ac c0 50 f3 38 c0 50 f3 94 c0 50 f5 30
0090: c0 99 74 3c c0 96 e5 b8 c0 96 e9 20 c0 00 3d dc
[2.488832] alg: aead: Test 1 failed on encryption for 
authenc-hmac-sha256-cbc-3des-talitos
[2.497387] : 6f 54 20 6f 61 4d 79 6e 53 20 63 65 65 72 73 74
0010: 54 20 6f 6f 4d 20 6e 61 20 79 65 53 72 63 74 65
0020: 20 73 6f 54 20 6f 61 4d 79 6e 53 20 63 65 65 72
0030: 73 74 54 20 6f 6f 4d 20 6e 61 20 79 65 53 

[PATCH 00/18] crypto: talitos - fixes and performance improvement

2017-10-06 Thread Christophe Leroy
This series fixes and improves the talitos crypto driver.

The first 6 patches are fixes for failures reported by the new tests in the
kernel crypto test manager.

The 8 following patches are cleanups and simplifications.

The last 4 ones are performance improvement. The main improvement is
in the one before the last, it divides by 2 the time needed for a md5
hash on the SEC1.

Christophe Leroy (18):
  crypto: talitos - fix AEAD test failures
  crypto: talitos - fix memory corruption on SEC2
  crypto: talitos - fix setkey to check key weakness
  crypto: talitos - fix AEAD for sha224 on non sha224 capable chips
  crypto: talitos - fix use of sg_link_tbl_len
  crypto: talitos - fix ctr-aes-talitos
  crypto: talitos - zeroize the descriptor with memset()
  crypto: talitos - declare local functions static
  crypto: talitos - use devm_kmalloc()
  crypto: talitos - use of_property_read_u32()
  crypto: talitos - use devm_ioremap()
  crypto: talitos - don't check the number of channels at each interrupt
  crypto: talitos - remove to_talitos_ptr_len()
  crypto: talitos - simplify tests in ipsec_esp()
  crypto: talitos - DMA map key in setkey()
  crypto: talitos - do hw_context DMA mapping outside the requests
  crypto: talitos - chain in buffered data for ahash on SEC1
  crypto: talitos - avoid useless copy

 drivers/crypto/talitos.c | 544 ++-
 drivers/crypto/talitos.h |   7 +-
 2 files changed, 356 insertions(+), 195 deletions(-)

-- 
2.13.3



Re: [PATCH v10 05/10] mm: zero reserved and unavailable struct pages

2017-10-06 Thread Michal Hocko
On Thu 05-10-17 17:11:19, Pavel Tatashin wrote:
> Some memory is reserved but unavailable: not present in memblock.memory
> (because not backed by physical pages), but present in memblock.reserved.
> Such memory has backing struct pages, but they are not initialized by going
> through __init_single_page().
> 
> In some cases these struct pages are accessed even if they do not contain
> any data. One example is page_to_pfn() might access page->flags if this is
> where section information is stored (CONFIG_SPARSEMEM,
> SECTION_IN_PAGE_FLAGS).
> 
> One example of such memory: trim_low_memory_range() unconditionally
> reserves from pfn 0, but e820__memblock_setup() might provide the exiting
> memory from pfn 1 (i.e. KVM).
> 
> Since, struct pages are zeroed in __init_single_page(), and not during
> allocation time, we must zero such struct pages explicitly.
> 
> The patch involves adding a new memblock iterator:
>   for_each_resv_unavail_range(i, p_start, p_end)
> 
> Which iterates through reserved && !memory lists, and we zero struct pages
> explicitly by calling mm_zero_struct_page().

As I've said in other reply this should go in only if the scenario you
describe is real. I am somehow suspicious to be honest. I simply do not
see how those weird struct pages would be in a valid pfn range of any
zone.

> Signed-off-by: Pavel Tatashin 
> Reviewed-by: Steven Sistare 
> Reviewed-by: Daniel Jordan 
> Reviewed-by: Bob Picco 
> ---
>  include/linux/memblock.h | 16 
>  include/linux/mm.h   | 15 +++
>  mm/page_alloc.c  | 38 ++
>  3 files changed, 69 insertions(+)
> 
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index bae11c7e7bf3..ce8bfa5f3e9b 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -237,6 +237,22 @@ unsigned long memblock_next_valid_pfn(unsigned long pfn, 
> unsigned long max_pfn);
>   for_each_mem_range_rev(i, , , \
>  nid, flags, p_start, p_end, p_nid)
>  
> +/**
> + * for_each_resv_unavail_range - iterate through reserved and unavailable 
> memory
> + * @i: u64 used as loop variable
> + * @flags: pick from blocks based on memory attributes
> + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
> + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
> + *
> + * Walks over unavailable but reserved (reserved && !memory) areas of 
> memblock.
> + * Available as soon as memblock is initialized.
> + * Note: because this memory does not belong to any physical node, flags and
> + * nid arguments do not make sense and thus not exported as arguments.
> + */
> +#define for_each_resv_unavail_range(i, p_start, p_end)   
> \
> + for_each_mem_range(i, , , \
> +NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL)
> +
>  static inline void memblock_set_region_flags(struct memblock_region *r,
>unsigned long flags)
>  {
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 065d99deb847..04c8b2e5aff4 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -94,6 +94,15 @@ extern int mmap_rnd_compat_bits __read_mostly;
>  #define mm_forbids_zeropage(X)   (0)
>  #endif
>  
> +/*
> + * On some architectures it is expensive to call memset() for small sizes.
> + * Those architectures should provide their own implementation of "struct 
> page"
> + * zeroing by defining this macro in .
> + */
> +#ifndef mm_zero_struct_page
> +#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
> +#endif
> +
>  /*
>   * Default maximum number of active map areas, this limits the number of vmas
>   * per mm struct. Users can overwrite this number by sysctl but there is a
> @@ -2001,6 +2010,12 @@ extern int __meminit __early_pfn_to_nid(unsigned long 
> pfn,
>   struct mminit_pfnnid_cache *state);
>  #endif
>  
> +#ifdef CONFIG_HAVE_MEMBLOCK
> +void zero_resv_unavail(void);
> +#else
> +static inline void zero_resv_unavail(void) {}
> +#endif
> +
>  extern void set_dma_reserve(unsigned long new_dma_reserve);
>  extern void memmap_init_zone(unsigned long, int, unsigned long,
>   unsigned long, enum memmap_context);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 20b0bace2235..5f0013bbbe9d 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -6209,6 +6209,42 @@ void __paginginit free_area_init_node(int nid, 
> unsigned long *zones_size,
>   free_area_init_core(pgdat);
>  }
>  
> +#ifdef CONFIG_HAVE_MEMBLOCK
> +/*
> + * Only struct pages that are backed by physical memory are zeroed and
> + * initialized by going through __init_single_page(). But, there are some
> + * struct pages which are reserved in memblock allocator and 

Re: [PATCH v10 09/10] mm: stop zeroing memory during allocation in vmemmap

2017-10-06 Thread Michal Hocko
On Fri 06-10-17 12:11:42, David Laight wrote:
> From: Michal Hocko
> > Sent: 06 October 2017 12:47
> > On Fri 06-10-17 11:10:14, David Laight wrote:
> > > From: Pavel Tatashin
> > > > Sent: 05 October 2017 22:11
> > > > vmemmap_alloc_block() will no longer zero the block, so zero memory
> > > > at its call sites for everything except struct pages.  Struct page 
> > > > memory
> > > > is zero'd by struct page initialization.
> > >
> > > It seems dangerous to change an allocator to stop zeroing memory.
> > > It is probably safer to add a new function that doesn't zero
> > > the memory and use that in the places where you don't want it
> > > to be zeroed.
> > 
> > Not sure what you mean. memblock_virt_alloc_try_nid_raw is a new
> > function which doesn't zero out...
> 
> You should probably leave vmemmap_alloc_block() zeroing the memory
> so that existing calls don't have to be changed - apart from the
> ones you are explicitly optimising.

But the whole point of vmemmap_alloc_block is to allocate memmaps and
the point of this change is to cover those. This is not a generic API
that other users would depend on. 
-- 
Michal Hocko
SUSE Labs


RE: [PATCH v10 09/10] mm: stop zeroing memory during allocation in vmemmap

2017-10-06 Thread David Laight
From: Michal Hocko
> Sent: 06 October 2017 12:47
> On Fri 06-10-17 11:10:14, David Laight wrote:
> > From: Pavel Tatashin
> > > Sent: 05 October 2017 22:11
> > > vmemmap_alloc_block() will no longer zero the block, so zero memory
> > > at its call sites for everything except struct pages.  Struct page memory
> > > is zero'd by struct page initialization.
> >
> > It seems dangerous to change an allocator to stop zeroing memory.
> > It is probably safer to add a new function that doesn't zero
> > the memory and use that in the places where you don't want it
> > to be zeroed.
> 
> Not sure what you mean. memblock_virt_alloc_try_nid_raw is a new
> function which doesn't zero out...

You should probably leave vmemmap_alloc_block() zeroing the memory
so that existing calls don't have to be changed - apart from the
ones you are explicitly optimising.

David


Re: [PATCH v10 09/10] mm: stop zeroing memory during allocation in vmemmap

2017-10-06 Thread Michal Hocko
On Fri 06-10-17 11:10:14, David Laight wrote:
> From: Pavel Tatashin
> > Sent: 05 October 2017 22:11
> > vmemmap_alloc_block() will no longer zero the block, so zero memory
> > at its call sites for everything except struct pages.  Struct page memory
> > is zero'd by struct page initialization.
> 
> It seems dangerous to change an allocator to stop zeroing memory.
> It is probably safer to add a new function that doesn't zero
> the memory and use that in the places where you don't want it
> to be zeroed.

Not sure what you mean. memblock_virt_alloc_try_nid_raw is a new
function which doesn't zero out...
-- 
Michal Hocko
SUSE Labs


Re: [PATCH 2/2] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-10-06 Thread Naveen N. Rao
On 2017/09/18 09:23AM, Santosh Sivaraj wrote:
> Current vDSO64 implementation does not have support for coarse clocks
> (CLOCK_MONOTONIC_COARSE, CLOCK_REALTIME_COARSE), for which it falls back
> to a system call, increasing the response time; the vDSO implementation
> reduces the cycle time. Below is a benchmark of the difference in execution time
> with and without vDSO support.
> 
> (Non-coarse clocks are also included just for completion)
> 
> Without vDSO support:
> 
> clock-gettime-realtime: syscall: 172 nsec/call
> clock-gettime-realtime:libc: 26 nsec/call
> clock-gettime-realtime:vdso: 21 nsec/call
> clock-gettime-monotonic: syscall: 170 nsec/call
> clock-gettime-monotonic:libc: 30 nsec/call
> clock-gettime-monotonic:vdso: 24 nsec/call
> clock-gettime-realtime-coarse: syscall: 153 nsec/call
> clock-gettime-realtime-coarse:libc: 15 nsec/call
> clock-gettime-realtime-coarse:vdso: 9 nsec/call
> clock-gettime-monotonic-coarse: syscall: 167 nsec/call
> clock-gettime-monotonic-coarse:libc: 15 nsec/call
> clock-gettime-monotonic-coarse:vdso: 11 nsec/call
> 
> CC: Benjamin Herrenschmidt 
> Signed-off-by: Santosh Sivaraj 
> ---
>  arch/powerpc/kernel/asm-offsets.c |  2 ++
>  arch/powerpc/kernel/vdso64/gettimeofday.S | 56 
> +++
>  2 files changed, 58 insertions(+)
> 
> diff --git a/arch/powerpc/kernel/asm-offsets.c 
> b/arch/powerpc/kernel/asm-offsets.c
> index 8cfb20e38cfe..b55c68c54dc1 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -396,6 +396,8 @@ int main(void)
>   /* Other bits used by the vdso */
>   DEFINE(CLOCK_REALTIME, CLOCK_REALTIME);
>   DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC);
> + DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE);
> + DEFINE(CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE);
>   DEFINE(NSEC_PER_SEC, NSEC_PER_SEC);
>   DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC);
> 
> diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S 
> b/arch/powerpc/kernel/vdso64/gettimeofday.S
> index a0b4943811db..bae197a81add 100644
> --- a/arch/powerpc/kernel/vdso64/gettimeofday.S
> +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
> @@ -71,6 +71,11 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
>   crorcr0*4+eq,cr0*4+eq,cr1*4+eq
>   beq cr0,49f
> 
> + cmpwi   cr0,r3,CLOCK_REALTIME_COARSE
> + cmpwi   cr1,r3,CLOCK_MONOTONIC_COARSE
> + crorcr0*4+eq,cr0*4+eq,cr1*4+eq
> + beq cr0,65f
> +
>   b   99f /* Fallback to syscall */
>.cfi_register lr,r12
>  49:  bl  V_LOCAL_FUNC(__get_datapage)/* get data page */
> @@ -112,6 +117,57 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
>  1:   bge cr1,80f
>   addir4,r4,-1
>   add r5,r5,r7
> + b   80f
> +
> + /*
> +  * For coarse clocks we get data directly from the vdso data page, so
> +  * we don't need to call __do_get_tspec, but we still need to do the
> +  * counter trick.
> +  */
> +65:  bl  V_LOCAL_FUNC(__get_datapage)/* get data page */
> +70:  ld  r8,CFG_TB_UPDATE_COUNT(r3)
> + andi.   r0,r8,1 /* pending update ? loop */
> + bne-70b
> + xor r0,r8,r8/* create dependency */
> + add r3,r3,r0
> +
> + /*
> +  * CLOCK_REALTIME_COARSE, below values are needed for MONOTONIC_COARSE
> +  * too
> +  */
> + ld  r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
> + ld  r5,STAMP_XTIME+TSPC64_TV_NSEC(r3)
> + bne cr1,78f
> +
> + /* CLOCK_MONOTONIC_COARSE */
> + lwa r6,WTOM_CLOCK_SEC(r3)
> + lwa r9,WTOM_CLOCK_NSEC(r3)
> +
> + /* check if counter has updated */
> +78:  or  r0,r6,r9
> + xor r0,r0,r0
> + add r3,r3,r0
> + ld  r0,CFG_TB_UPDATE_COUNT(r3)
> + cmpld   cr0,r0,r8   /* check if updated */
> + bne-70b

Don't you need a dependency on r4/r5 here for REALTIME_COARSE?
Something like:

/* check if counter has updated */
or  r0,r6,r9
78: or  r0,r4,r5
xor r0,r0,r0

> +
> + /* Counter has not updated, so continue calculating proper values for
> +  * sec and nsec if monotonic coarse, or just return with the proper
> +  * values for realtime.
> +  */
> + bne cr1,80f
> +

I think the below hunk can surely be shared across the _COARSE and 
regular clocks, if not more.

- Naveen

> + /* Add wall->monotonic offset and check for overflow or underflow */
> + add r4,r4,r6
> + add r5,r5,r9
> + cmpdcr0,r5,r7
> + cmpdi   cr1,r5,0
> + blt 79f
> + subfr5,r7,r5
> + addir4,r4,1
> +79:  bge cr1,80f
> + addir4,r4,-1
> + add r5,r5,r7
> 
>  80:  std r4,TSPC64_TV_SEC(r11)
>   std r5,TSPC64_TV_NSEC(r11)
> -- 
> 2.13.5
> 



Re: [PATCH 2/3] powerpc/tm: P9 disabled suspend mode workaround

2017-10-06 Thread Gustavo Romero
Hi Cyril,

On 06-10-2017 04:46, Cyril Bur wrote:
> [added by Cyril Bur]
> As the no-suspend firmware change is novel and untested using it should
> be opt in by users. Furthumore, currently the kernel has no method to

I forgot to mention on my last reply, but should s/Furthumore/Furthermore/ ?

Regards,
Gustavo



Re: [PATCH] cpufreq: powernv: Return the actual CPU frequency in /proc/cpuinfo

2017-10-06 Thread shriyak

On 2017-10-06 16:00, Michael Ellerman wrote:

Shriya  writes:


Make /proc/cpuinfo read the frequency of the CPU it is running at
instead of reading the cached value of the last requested frequency.
In conditions like WOF/throttle CPU can be running at a different
frequency than the requested frequency.


Sounds like a bug fix to me ?

cheers

diff --git a/arch/powerpc/platforms/powernv/setup.c 
b/arch/powerpc/platforms/powernv/setup.c

index 897aa14..55ea4bf 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -311,7 +311,7 @@ static unsigned long pnv_get_proc_freq(unsigned 
int cpu)

 {
unsigned long ret_freq;

-   ret_freq = cpufreq_quick_get(cpu) * 1000ul;
+   ret_freq = cpufreq_get(cpu) * 1000ul;

/*
 * If the backend cpufreq driver does not exist,
--
1.9.1


Yes, its a bug fix.

Thanks.



Re: [PATCH 3/3] powerpc/tm: P9 disable transactionally suspended sigcontexts

2017-10-06 Thread Gustavo Romero
Hi Cyril,

On 06-10-2017 04:46, Cyril Bur wrote:
> From: Michael Neuling 
> 
> Unfortunately userspace can construct a sigcontext which enables
> suspend. Thus userspace can force Linux into a path where trechkpt is
> executed.
> 
> This patch blocks this from happening on POWER9 but sanity checking
> sigcontexts passed in.

I think "but" should say "by" as pointed out by Joel and acked by Mikey
previously.

Regards,
Gustavo



RE: [PATCH v10 09/10] mm: stop zeroing memory during allocation in vmemmap

2017-10-06 Thread David Laight
From: Pavel Tatashin
> Sent: 05 October 2017 22:11
> vmemmap_alloc_block() will no longer zero the block, so zero memory
> at its call sites for everything except struct pages.  Struct page memory
> is zero'd by struct page initialization.

It seems dangerous to change an allocator to stop zeroing memory.
It is probably safer to add a new function that doesn't zero
the memory and use that in the places where you don't want it
to be zeroed.

David



Re: powerpc/powernv: Increase memory block size to 1GB on radix

2017-10-06 Thread Michael Ellerman
On Thu, 2017-09-07 at 05:05:51 UTC, Anton Blanchard wrote:
> From: Anton Blanchard 
> 
> Memory hot unplug on PowerNV radix hosts is broken. Our memory block
> size is 256MB but since we map the linear region with very large pages,
> each pte we tear down maps 1GB.
> 
> A hot unplug of one 256MB memory block results in 768MB of memory
> getting unintentionally unmapped. At this point we are likely to oops.
> 
> Fix this by increasing our memory block size to 1GB on PowerNV radix
> hosts.
> 
> Signed-off-by: Anton Blanchard 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/53ecde0b9126ff140abe3aefd7f0ec

cheers


[GIT PULL] Please pull powerpc/linux.git powerpc-4.14-4 tag

2017-10-06 Thread Michael Ellerman
Hi Linus,

Please pull some more powerpc fixes for 4.14.

This is two weeks worth of fixes, and the diffstat is reasonably small,
so I think we're on track.

The following changes since commit e19b205be43d11bff638cad4487008c48d21c103:

  Linux 4.14-rc2 (2017-09-24 16:38:56 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
tags/powerpc-4.14-4

for you to fetch changes up to 53ecde0b9126ff140abe3aefd7f0ec64d6fa36b0:

  powerpc/powernv: Increase memory block size to 1GB on radix (2017-10-06 
15:50:45 +1100)


powerpc fixes for 4.14 #4

Nine small fixes, really nothing that stands out.

A work around for a spurious MCE on Power9. A CXL fault handling fix, some fixes
to the new XIVE code, and a fix to the new 32-bit STRICT_KERNEL_RWX code.

Fixes for old code/stable: an fix to an incorrect TLB flush on boot but not on
any current machines, a compile error on 4xx and a fix to memory hotplug when
using radix (Power9).

Thanks to:
  Anton Blanchard, Cédric Le Goater, Christian Lamparter, Christophe Leroy,
  Christophe Lombard, Guenter Roeck, Jeremy Kerr, Michael Neuling, Nicholas
  Piggin.


Anton Blanchard (1):
  powerpc/powernv: Increase memory block size to 1GB on radix

Christian Lamparter (1):
  powerpc/4xx: Fix compile error with 64K pages on 40x, 44x

Christophe Lombard (1):
  cxl: Fix memory page not handled

Cédric Le Goater (2):
  powerpc/xive: Fix IPI reset
  powerpc/xive: Clear XIVE internal structures when a CPU is removed

Guenter Roeck (1):
  powerpc/mm: Call flush_tlb_kernel_range with interrupts enabled

Jeremy Kerr (1):
  powerpc: Fix action argument for cpufeatures-based TLB flush

Michael Neuling (2):
  powerpc: Handle MCE on POWER9 with only DSISR bit 30 set
  powerpc: Fix workaround for spurious MCE on POWER9

 arch/powerpc/kernel/dt_cpu_ftrs.c  |  4 ++--
 arch/powerpc/kernel/mce_power.c| 13 +
 arch/powerpc/kernel/setup-common.c |  3 ---
 arch/powerpc/mm/pgtable_32.c   |  2 +-
 arch/powerpc/platforms/powernv/setup.c | 10 +-
 arch/powerpc/sysdev/xive/common.c  |  8 
 arch/powerpc/sysdev/xive/spapr.c   |  4 
 drivers/misc/cxl/cxllib.c  | 13 +++--
 8 files changed, 48 insertions(+), 9 deletions(-)


signature.asc
Description: PGP signature


Re: Possible LMB hot unplug bug in 4.13+ kernels

2017-10-06 Thread Daniel Henrique Barboza



Unless you (Daniel) think there's some reason lmb_is_removable() is
incorrectly returning false. But most likely it's correct and there's
just an unmovable allocation in that range.


I am not educated enough to say that the current behavior is wrong. What I
can say is that in 4.11 and older kernels that support LMB hot 
plug/unplug I

didn't see this kernel "refusal" to remove a LMB that was just hotplugged.

Assuming that the kernel is behaving as intended, a QEMU guest started with
4Gb of RAM that receives an extra 1Gb of RAM will not unplug this same 1Gb.
It seems off from the user perspective that a recently added memory is being
considered not removable, thus QEMU will need to keep this limitation in 
mind when

dealing with future LMB bugs in 4.13+ kernels.


Thanks,


Daniel




Re: [PATCH] cpufreq: powernv: Return the actual CPU frequency in /proc/cpuinfo

2017-10-06 Thread Michael Ellerman
Shriya  writes:

> Make /proc/cpuinfo read the frequency of the CPU it is running at
> instead of reading the cached value of the last requested frequency.
> In conditions like WOF/throttle CPU can be running at a different
> frequency than the requested frequency.

Sounds like a bug fix to me ?

cheers

> diff --git a/arch/powerpc/platforms/powernv/setup.c 
> b/arch/powerpc/platforms/powernv/setup.c
> index 897aa14..55ea4bf 100644
> --- a/arch/powerpc/platforms/powernv/setup.c
> +++ b/arch/powerpc/platforms/powernv/setup.c
> @@ -311,7 +311,7 @@ static unsigned long pnv_get_proc_freq(unsigned int cpu)
>  {
>   unsigned long ret_freq;
>  
> - ret_freq = cpufreq_quick_get(cpu) * 1000ul;
> + ret_freq = cpufreq_get(cpu) * 1000ul;
>  
>   /*
>* If the backend cpufreq driver does not exist,
> -- 
> 1.9.1


Re: [PATCH 2/3] powerpc/tm: P9 disabled suspend mode workaround

2017-10-06 Thread Michael Ellerman
Benjamin Herrenschmidt  writes:
> On Fri, 2017-10-06 at 18:46 +1100, Cyril Bur wrote:
...
>> diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
>> index 760872916013..2cb01b48123a 100644
>> --- a/arch/powerpc/kernel/cputable.c
>> +++ b/arch/powerpc/kernel/cputable.c
>> @@ -2301,6 +2302,17 @@ void __init identify_cpu_name(unsigned int pvr)
>>  }
>>  }
>>  
>> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>> +bool tm_suspend_supported(void)
>> +{
>> +if (cpu_has_feature(CPU_FTR_TM)) {
>> +if (pvr_version_is(PVR_POWER9) && ppc_tm_state != 
>> TM_STATE_NO_SUSPEND)
>> +return false;
>> +return true;
>> +}
>
> Hrm... so if state is "NO SUSPEND" you return "true" ? Isn't this
> backward ? Or I don't understand what this is about...

Yeah it's a bit confuzzled.

I literally wrote it for you on a post-it Cyril! >:D

tm_suspend_supported() should be called tm_no_suspend_mode(). Where "no
suspend mode" is the new "mode" we're adding where suspend is not supported.

Then tm_no_suspend_mode() should be:

+   if (pvr_version_is(PVR_POWER9) && ppc_tm_state == TM_STATE_NO_SUSPEND)
+   return true;
+   return false;
  
And then all the extra checks and warnings in patch 3 just use it like:

WARN_ON(tm_no_suspend_mode());

Because they're in paths where we shouldn't get to if suspend is
disabled.

I don't think we need to check CPU_FTR_TM because it's only called from
TM paths anyway. But we could add that to be paranoid. Or probably
better, when TM is forced off (below) we set ppc_tm_state to off.

>> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
>> index e37c26d2e54b..227ac600a1b7 100644
>> --- a/arch/powerpc/kernel/setup_64.c
>> +++ b/arch/powerpc/kernel/setup_64.c
>> @@ -265,11 +267,13 @@ early_param("ppc_tm", parse_ppc_tm);
>>  
>>  static void check_disable_tm(void)
>>  {
>> -if (cpu_has_feature(CPU_FTR_TM) && ppc_tm_state == TM_STATE_OFF) {
>> -printk(KERN_NOTICE "Disabling hardware transactional memory 
>> (HTM)\n");
>> -cur_cpu_spec->cpu_user_features2 &=
>> -~(PPC_FEATURE2_HTM_NOSC | PPC_FEATURE2_HTM);
>> -cur_cpu_spec->cpu_features &= ~CPU_FTR_TM;
>> +if (cpu_has_feature(CPU_FTR_TM)) {
>> +if (ppc_tm_state == TM_STATE_OFF || (!tm_suspend_supported())) {
>> +printk(KERN_NOTICE "Disabling hardware transactional 
>> memory (HTM)\n");
>> +cur_cpu_spec->cpu_user_features2 &=
>> +~(PPC_FEATURE2_HTM_NOSC | PPC_FEATURE2_HTM);
>> +cur_cpu_spec->cpu_features &= ~CPU_FTR_TM;
>> +}
>
> So that code translates to if TM is off or doesn't support suspend,
> disable TM. Are we sure that's really what we meant here ?

It should be:

+   if (!cpu_has_feature(CPU_FTR_TM))
+   return;
+
+   if (ppc_tm_state == TM_STATE_OFF || \
+   (pvr_version_is(PVR_POWER9) && ppc_tm_state != 
TM_STATE_NO_SUSPEND)) {
+   printk(KERN_NOTICE "Disabling hardware transactional memory 
(HTM)\n");
+   cur_cpu_spec->cpu_user_features2 &= ~(PPC_FEATURE2_HTM_NOSC | 
PPC_FEATURE2_HTM);
+   cur_cpu_spec->cpu_features &= ~CPU_FTR_TM;
+   }

And as I mentioned above perhaps we should also do:
+   ppc_tm_state = TM_STATE_OFF;

cheers


Re: [PATCH 1/2] powerpc/vdso64: Coarse timer support preparatory patch

2017-10-06 Thread Michael Ellerman
Thanks for reviewing Naveen.

"Naveen N. Rao"  writes:
> On 2017/09/18 09:23AM, Santosh Sivaraj wrote:
>> diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S 
>> b/arch/powerpc/kernel/vdso64/gettimeofday.S
>> index 382021324883..a0b4943811db 100644
>> --- a/arch/powerpc/kernel/vdso64/gettimeofday.S
>> +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
>> @@ -60,18 +60,20 @@ V_FUNCTION_END(__kernel_gettimeofday)
>>   */
>>  V_FUNCTION_BEGIN(__kernel_clock_gettime)
>>.cfi_startproc
>> +mr  r11,r4  /* r11 saves tp */
>> +mflrr12 /* r12 saves lr */
>> +lis r7,NSEC_PER_SEC@h   /* want nanoseconds */
>> +ori r7,r7,NSEC_PER_SEC@l
>> +
>>  /* Check for supported clock IDs */
>>  cmpwi   cr0,r3,CLOCK_REALTIME
>>  cmpwi   cr1,r3,CLOCK_MONOTONIC
>>  crorcr0*4+eq,cr0*4+eq,cr1*4+eq
>> -bne cr0,99f
>> +beq cr0,49f
>> 
>> -mflrr12 /* r12 saves lr */
>> +b   99f /* Fallback to syscall */
>
> 'beq', followed by a 'b' looks weird without considering the next patch.  
> I think this can be organized better to not have to update r7/r11/r12 if 
> using the system call. See next patch for my comments.
>
>>.cfi_register lr,r12
>
> If you move the mflr, you should move the above line along with it.

s/should/must/.

It literally says "lr is saved in r12".

cheers


Re: [PATCH 2/2] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-10-06 Thread Naveen N. Rao
On 2017/09/18 09:23AM, Santosh Sivaraj wrote:
> Current vDSO64 implementation does not have support for coarse clocks
> (CLOCK_MONOTONIC_COARSE, CLOCK_REALTIME_COARSE), for which it falls back
> to system call, increasing the response time, vDSO implementation reduces
> the cycle time. Below is a benchmark of the difference in execution time
> with and without vDSO support.
> 
> (Non-coarse clocks are also included just for completion)
> 
> Without vDSO support:
> 
> clock-gettime-realtime: syscall: 172 nsec/call
> clock-gettime-realtime:libc: 26 nsec/call
> clock-gettime-realtime:vdso: 21 nsec/call
> clock-gettime-monotonic: syscall: 170 nsec/call
> clock-gettime-monotonic:libc: 30 nsec/call
> clock-gettime-monotonic:vdso: 24 nsec/call
> clock-gettime-realtime-coarse: syscall: 153 nsec/call
> clock-gettime-realtime-coarse:libc: 15 nsec/call
> clock-gettime-realtime-coarse:vdso: 9 nsec/call
> clock-gettime-monotonic-coarse: syscall: 167 nsec/call
> clock-gettime-monotonic-coarse:libc: 15 nsec/call
> clock-gettime-monotonic-coarse:vdso: 11 nsec/call
> 
> CC: Benjamin Herrenschmidt 
> Signed-off-by: Santosh Sivaraj 
> ---
>  arch/powerpc/kernel/asm-offsets.c |  2 ++
>  arch/powerpc/kernel/vdso64/gettimeofday.S | 56 
> +++
>  2 files changed, 58 insertions(+)
> 
> diff --git a/arch/powerpc/kernel/asm-offsets.c 
> b/arch/powerpc/kernel/asm-offsets.c
> index 8cfb20e38cfe..b55c68c54dc1 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -396,6 +396,8 @@ int main(void)
>   /* Other bits used by the vdso */
>   DEFINE(CLOCK_REALTIME, CLOCK_REALTIME);
>   DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC);
> + DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE);
> + DEFINE(CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE);
>   DEFINE(NSEC_PER_SEC, NSEC_PER_SEC);
>   DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC);
> 
> diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S 
> b/arch/powerpc/kernel/vdso64/gettimeofday.S
> index a0b4943811db..bae197a81add 100644
> --- a/arch/powerpc/kernel/vdso64/gettimeofday.S
> +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
> @@ -71,6 +71,11 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
>   crorcr0*4+eq,cr0*4+eq,cr1*4+eq
>   beq cr0,49f
> 
> + cmpwi   cr0,r3,CLOCK_REALTIME_COARSE
> + cmpwi   cr1,r3,CLOCK_MONOTONIC_COARSE
> + crorcr0*4+eq,cr0*4+eq,cr1*4+eq
> + beq cr0,65f

If you use cr5-7 here, you should be able to re-organize this to not 
have to update r4/r11/r12 if we're taking the syscall path. Not 
necessarily a huge win by itself, but can also help reuse some of the 
other code between the _COARSE and the regular variants.

- Naveen

> +
>   b   99f /* Fallback to syscall */
>.cfi_register lr,r12
>  49:  bl  V_LOCAL_FUNC(__get_datapage)/* get data page */
> @@ -112,6 +117,57 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
>  1:   bge cr1,80f
>   addir4,r4,-1
>   add r5,r5,r7
> + b   80f
> +
> + /*
> +  * For coarse clocks we get data directly from the vdso data page, so
> +  * we don't need to call __do_get_tspec, but we still need to do the
> +  * counter trick.
> +  */
> +65:  bl  V_LOCAL_FUNC(__get_datapage)/* get data page */
> +70:  ld  r8,CFG_TB_UPDATE_COUNT(r3)
> + andi.   r0,r8,1 /* pending update ? loop */
> + bne-70b
> + xor r0,r8,r8/* create dependency */
> + add r3,r3,r0
> +
> + /*
> +  * CLOCK_REALTIME_COARSE, below values are needed for MONOTONIC_COARSE
> +  * too
> +  */
> + ld  r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
> + ld  r5,STAMP_XTIME+TSPC64_TV_NSEC(r3)
> + bne cr1,78f
> +
> + /* CLOCK_MONOTONIC_COARSE */
> + lwa r6,WTOM_CLOCK_SEC(r3)
> + lwa r9,WTOM_CLOCK_NSEC(r3)
> +
> + /* check if counter has updated */
> +78:  or  r0,r6,r9
> + xor r0,r0,r0
> + add r3,r3,r0
> + ld  r0,CFG_TB_UPDATE_COUNT(r3)
> + cmpld   cr0,r0,r8   /* check if updated */
> + bne-70b
> +
> + /* Counter has not updated, so continue calculating proper values for
> +  * sec and nsec if monotonic coarse, or just return with the proper
> +  * values for realtime.
> +  */
> + bne cr1,80f
> +
> + /* Add wall->monotonic offset and check for overflow or underflow */
> + add r4,r4,r6
> + add r5,r5,r9
> + cmpdcr0,r5,r7
> + cmpdi   cr1,r5,0
> + blt 79f
> + subfr5,r7,r5
> + addir4,r4,1
> +79:  bge cr1,80f
> + addir4,r4,-1
> + add r5,r5,r7
> 
>  80:  std r4,TSPC64_TV_SEC(r11)
>   std r5,TSPC64_TV_NSEC(r11)
> -- 
> 2.13.5
> 



Re: [PATCH 1/2] powerpc/vdso64: Coarse timer support preparatory patch

2017-10-06 Thread Naveen N. Rao
Hi Santosh,

On 2017/09/18 09:23AM, Santosh Sivaraj wrote:
> Reorganize code to make it easy to introduce CLOCK_REALTIME_COARSE and
> CLOCK_MONOTONIC_COARSE timer support.
> 
> Signed-off-by: Santosh Sivaraj 
> ---
>  arch/powerpc/kernel/vdso64/gettimeofday.S | 14 --
>  1 file changed, 8 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S 
> b/arch/powerpc/kernel/vdso64/gettimeofday.S
> index 382021324883..a0b4943811db 100644
> --- a/arch/powerpc/kernel/vdso64/gettimeofday.S
> +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
> @@ -60,18 +60,20 @@ V_FUNCTION_END(__kernel_gettimeofday)
>   */
>  V_FUNCTION_BEGIN(__kernel_clock_gettime)
>.cfi_startproc
> + mr  r11,r4  /* r11 saves tp */
> + mflrr12 /* r12 saves lr */
> + lis r7,NSEC_PER_SEC@h   /* want nanoseconds */
> + ori r7,r7,NSEC_PER_SEC@l
> +
>   /* Check for supported clock IDs */
>   cmpwi   cr0,r3,CLOCK_REALTIME
>   cmpwi   cr1,r3,CLOCK_MONOTONIC
>   crorcr0*4+eq,cr0*4+eq,cr1*4+eq
> - bne cr0,99f
> + beq cr0,49f
> 
> - mflrr12 /* r12 saves lr */
> + b   99f /* Fallback to syscall */

'beq', followed by a 'b' looks weird without considering the next patch.  
I think this can be organized better to not have to update r7/r11/r12 if 
using the system call. See next patch for my comments.

>.cfi_register lr,r12

If you move the mflr, you should move the above line along with it.

- Naveen

> - mr  r11,r4  /* r11 saves tp */
> - bl  V_LOCAL_FUNC(__get_datapage)/* get data page */
> - lis r7,NSEC_PER_SEC@h   /* want nanoseconds */
> - ori r7,r7,NSEC_PER_SEC@l
> +49:  bl  V_LOCAL_FUNC(__get_datapage)/* get data page */
>  50:  bl  V_LOCAL_FUNC(__do_get_tspec)/* get time from tb & kernel */
>   bne cr1,80f /* if not monotonic, all done */
> 
> -- 
> 2.13.5
> 



Re: [PATCH tip/core/rcu 1/3] membarrier: Provide register expedited private command

2017-10-06 Thread Peter Zijlstra
> AFAIU the scheduler rq->lock is held while preemption is disabled.
> synchronize_sched() is used here to ensure that all pre-existing
> preempt-off critical sections have completed.
> 
> So saying that we use synchronize_sched() to synchronize with rq->lock
> would be stretching the truth a bit. It's actually only true because the
> scheduler holding the rq->lock is surrounded by a preempt-off
> critical section.

No, rq->lock is sufficient, note that rq->lock is a raw_spinlock_t which
implies !preempt. Yes, we also surround the rq->lock usage with a
slightly larger preempt_disable() section but that's not in fact
required for this.


Re: [PATCH 3/3] powerpc/tm: P9 disable transactionally suspended sigcontexts

2017-10-06 Thread Benjamin Herrenschmidt
On Fri, 2017-10-06 at 18:46 +1100, Cyril Bur wrote:
> From: Michael Neuling 
> 
> Unfortunately userspace can construct a sigcontext which enables
> suspend. Thus userspace can force Linux into a path where trechkpt is
> executed.
> 
> This patch blocks this from happening on POWER9 by sanity checking
> sigcontexts passed in.
> 
> ptrace doesn't have this problem as only MSR SE and BE can be changed
> via ptrace.
> 
> This patch also adds a number of WARN_ON() in case we ever enter
> suspend when we shouldn't. This should catch systems that don't have
> the firmware change and are running TM.
> 
> A future firmware change will allow suspend mode on POWER9 but that is
> going to require additional Linux changes to support. In the interim,
> this allows TM to continue to (partially) work while stopping
> userspace from crashing Linux.
> 
> Signed-off-by: Michael Neuling 
> Signed-off-by: Cyril Bur 
> ---
>  arch/powerpc/kernel/process.c   | 2 ++
>  arch/powerpc/kernel/signal_32.c | 4 
>  arch/powerpc/kernel/signal_64.c | 5 +
>  3 files changed, 11 insertions(+)
> 
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index a0c74bbf3454..5b81673c5026 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -903,6 +903,8 @@ static inline void tm_reclaim_task(struct task_struct 
> *tsk)
>   if (!MSR_TM_ACTIVE(thr->regs->msr))
>   goto out_and_saveregs;
>  
> + WARN_ON(!tm_suspend_supported());
> +

What does this function really says ? That TM is supported or that TM
supports suspend ? Because the implementation in the previous patch
seems to indicate that what it actually indicates is that TM is
supported, period.

>   TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, "
>"ccr=%lx, msr=%lx, trap=%lx)\n",
>tsk->pid, thr->regs->nip,
> diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
> index 92fb1c8dbbd8..9eac0131c080 100644
> --- a/arch/powerpc/kernel/signal_32.c
> +++ b/arch/powerpc/kernel/signal_32.c
> @@ -519,6 +519,8 @@ static int save_tm_user_regs(struct pt_regs *regs,
>  {
>   unsigned long msr = regs->msr;
>  
> + WARN_ON(!tm_suspend_supported());
> +
>   /* Remove TM bits from thread's MSR.  The MSR in the sigcontext
>* just indicates to userland that we were doing a transaction, but we
>* don't want to return in transactional state.  This also ensures
> @@ -769,6 +771,8 @@ static long restore_tm_user_regs(struct pt_regs *regs,
>   int i;
>  #endif
>  
> + if (!tm_suspend_supported())
> + return 1;
>   /*
>* restore general registers but not including MSR or SOFTE. Also
>* take care of keeping r2 (TLS) intact if not a signal.
> diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
> index c83c115858c1..6d28caf8496f 100644
> --- a/arch/powerpc/kernel/signal_64.c
> +++ b/arch/powerpc/kernel/signal_64.c
> @@ -214,6 +214,8 @@ static long setup_tm_sigcontexts(struct sigcontext __user 
> *sc,
>  
>   BUG_ON(!MSR_TM_ACTIVE(regs->msr));
>  
> + WARN_ON(!tm_suspend_supported());
> +
>   /* Remove TM bits from thread's MSR.  The MSR in the sigcontext
>* just indicates to userland that we were doing a transaction, but we
>* don't want to return in transactional state.  This also ensures
> @@ -430,6 +432,9 @@ static long restore_tm_sigcontexts(struct task_struct 
> *tsk,
>  
>   BUG_ON(tsk != current);
>  
> + if (!tm_suspend_supported())
> + return -EINVAL;
> +
>   /* copy the GPRs */
>   err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr));
>   err |= __copy_from_user(>thread.ckpt_regs, sc->gp_regs,



Re: [PATCH 2/3] powerpc/tm: P9 disabled suspend mode workaround

2017-10-06 Thread Benjamin Herrenschmidt
On Fri, 2017-10-06 at 18:46 +1100, Cyril Bur wrote:
> [from Michael Neulings original patch]
> Each POWER9 core is made of two super slices. Each super slice can
> only have one thread at a time in TM suspend mode. The super slice
> restricts ever entering a state where both threads are in suspend by
> aborting transactions on tsuspend or exceptions into the kernel.
> 
> Unfortunately for context switch we need trechkpt which forces suspend
> mode. If a thread is already in suspend and a second thread needs to
> be restored that was suspended, the trechkpt must be executed.
> Currently the trechkpt will hang in this case until the other thread
> exits suspend. This causes problems for Linux resulting in hang and
> RCU stall detectors going off.
> 
> To workaround this, we disable suspend in the core. This is done via a
> firmware change which stops the hardware ever getting into suspend.
> The hardware will always rollback a transaction on any tsuspend or
> entry into the kernel.
> 
> [added by Cyril Bur]
> As the no-suspend firmware change is novel and untested, using it should
> be opt-in by users. Furthermore, currently the kernel has no method to
> know if the firmware has applied the no-suspend workaround. This patch
> extends the ppc_tm commandline option to allow users to opt-in if they
> are sure that their firmware has been updated and they understand the
> risks involved.

Is this what the patch actually does ? ...

> Signed-off-by: Cyril Bur 
> ---
>  Documentation/admin-guide/kernel-parameters.txt |  7 +--
>  arch/powerpc/include/asm/cputable.h |  6 ++
>  arch/powerpc/include/asm/tm.h   |  6 --
>  arch/powerpc/kernel/cputable.c  | 12 
>  arch/powerpc/kernel/setup_64.c  | 16 ++--
>  5 files changed, 37 insertions(+), 10 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> b/Documentation/admin-guide/kernel-parameters.txt
> index 4e2b5d9078a0..a0f757f749cf 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -805,8 +805,11 @@
>   Disable RADIX MMU mode on POWER9
>  
>   ppc_tm= [PPC]
> - Format: {"off"}
> - Disable Hardware Transactional Memory
> + Format: {"off" | "no-suspend"}
> + "Off" Will disable Hardware Transactional Memory.
> + "no-suspend" Informs the kernel that the
> + hardware will not transition into the kernel
> + with a suspended transaction.
>  
>   disable_cpu_apicid= [X86,APIC,SMP]
>   Format: 
> diff --git a/arch/powerpc/include/asm/cputable.h 
> b/arch/powerpc/include/asm/cputable.h
> index a9bf921f4efc..e66101830af2 100644
> --- a/arch/powerpc/include/asm/cputable.h
> +++ b/arch/powerpc/include/asm/cputable.h
> @@ -124,6 +124,12 @@ extern void identify_cpu_name(unsigned int pvr);
>  extern void do_feature_fixups(unsigned long value, void *fixup_start,
> void *fixup_end);
>  
> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> +extern bool tm_suspend_supported(void);
> +#else
> +static inline bool tm_suspend_supported(void) { return false; }
> +#endif
> +
>  extern const char *powerpc_base_platform;
>  
>  #ifdef CONFIG_JUMP_LABEL_FEATURE_CHECKS
> diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h
> index eca1c866ca97..1fd0b5f72861 100644
> --- a/arch/powerpc/include/asm/tm.h
> +++ b/arch/powerpc/include/asm/tm.h
> @@ -9,9 +9,11 @@
>  
>  #ifndef __ASSEMBLY__
>  
> -#define TM_STATE_ON  0
> -#define TM_STATE_OFF 1
> +#define TM_STATE_ON  0
> +#define TM_STATE_OFF 1
> +#define TM_STATE_NO_SUSPEND  2
>  
> +extern int ppc_tm_state;
>  extern void tm_enable(void);
>  extern void tm_reclaim(struct thread_struct *thread,
>  unsigned long orig_msr, uint8_t cause);
> diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
> index 760872916013..2cb01b48123a 100644
> --- a/arch/powerpc/kernel/cputable.c
> +++ b/arch/powerpc/kernel/cputable.c
> @@ -22,6 +22,7 @@
>  #include /* for PTRRELOC on ARCH=ppc */
>  #include 
>  #include 
> +#include 
>  
>  static struct cpu_spec the_cpu_spec __read_mostly;
>  
> @@ -2301,6 +2302,17 @@ void __init identify_cpu_name(unsigned int pvr)
>   }
>  }
>  
> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> +bool tm_suspend_supported(void)
> +{
> + if (cpu_has_feature(CPU_FTR_TM)) {
> + if (pvr_version_is(PVR_POWER9) && ppc_tm_state != 
> TM_STATE_NO_SUSPEND)
> + return false;
> + return true;
> + }

Hrm... so if state is "NO SUSPEND" you return "true" ? Isn't this
backward ? Or I don't understand what this is about...

> + return false;
> +}
> +#endif
>  
>  #ifdef 

[PATCH 3/3] powerpc/tm: P9 disable transactionally suspended sigcontexts

2017-10-06 Thread Cyril Bur
From: Michael Neuling 

Unfortunately userspace can construct a sigcontext which enables
suspend. Thus userspace can force Linux into a path where trechkpt is
executed.

This patch blocks this from happening on POWER9 by sanity checking
sigcontexts passed in.

ptrace doesn't have this problem as only MSR SE and BE can be changed
via ptrace.

This patch also adds a number of WARN_ON() in case we ever enter
suspend when we shouldn't. This should catch systems that don't have
the firmware change and are running TM.

A future firmware change will allow suspend mode on POWER9 but that is
going to require additional Linux changes to support. In the interim,
this allows TM to continue to (partially) work while stopping
userspace from crashing Linux.

Signed-off-by: Michael Neuling 
Signed-off-by: Cyril Bur 
---
 arch/powerpc/kernel/process.c   | 2 ++
 arch/powerpc/kernel/signal_32.c | 4 
 arch/powerpc/kernel/signal_64.c | 5 +
 3 files changed, 11 insertions(+)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index a0c74bbf3454..5b81673c5026 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -903,6 +903,8 @@ static inline void tm_reclaim_task(struct task_struct *tsk)
if (!MSR_TM_ACTIVE(thr->regs->msr))
goto out_and_saveregs;
 
+   WARN_ON(!tm_suspend_supported());
+
TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, "
 "ccr=%lx, msr=%lx, trap=%lx)\n",
 tsk->pid, thr->regs->nip,
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 92fb1c8dbbd8..9eac0131c080 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -519,6 +519,8 @@ static int save_tm_user_regs(struct pt_regs *regs,
 {
unsigned long msr = regs->msr;
 
+   WARN_ON(!tm_suspend_supported());
+
/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
 * just indicates to userland that we were doing a transaction, but we
 * don't want to return in transactional state.  This also ensures
@@ -769,6 +771,8 @@ static long restore_tm_user_regs(struct pt_regs *regs,
int i;
 #endif
 
+   if (!tm_suspend_supported())
+   return 1;
/*
 * restore general registers but not including MSR or SOFTE. Also
 * take care of keeping r2 (TLS) intact if not a signal.
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index c83c115858c1..6d28caf8496f 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -214,6 +214,8 @@ static long setup_tm_sigcontexts(struct sigcontext __user 
*sc,
 
BUG_ON(!MSR_TM_ACTIVE(regs->msr));
 
+   WARN_ON(!tm_suspend_supported());
+
/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
 * just indicates to userland that we were doing a transaction, but we
 * don't want to return in transactional state.  This also ensures
@@ -430,6 +432,9 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
 
BUG_ON(tsk != current);
 
+   if (!tm_suspend_supported())
+   return -EINVAL;
+
/* copy the GPRs */
err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr));
err |= __copy_from_user(>thread.ckpt_regs, sc->gp_regs,
-- 
2.14.2



[PATCH 2/3] powerpc/tm: P9 disabled suspend mode workaround

2017-10-06 Thread Cyril Bur
[from Michael Neulings original patch]
Each POWER9 core is made of two super slices. Each super slice can
only have one thread at a time in TM suspend mode. The super slice
restricts ever entering a state where both threads are in suspend by
aborting transactions on tsuspend or exceptions into the kernel.

Unfortunately for context switch we need trechkpt which forces suspend
mode. If a thread is already in suspend and a second thread needs to
be restored that was suspended, the trechkpt must be executed.
Currently the trechkpt will hang in this case until the other thread
exits suspend. This causes problems for Linux resulting in hang and
RCU stall detectors going off.

To workaround this, we disable suspend in the core. This is done via a
firmware change which stops the hardware ever getting into suspend.
The hardware will always rollback a transaction on any tsuspend or
entry into the kernel.

[added by Cyril Bur]
As the no-suspend firmware change is novel and untested, using it should
be opt-in by users. Furthermore, currently the kernel has no method to
know if the firmware has applied the no-suspend workaround. This patch
extends the ppc_tm commandline option to allow users to opt-in if they
are sure that their firmware has been updated and they understand the
risks involved.

Signed-off-by: Cyril Bur 
---
 Documentation/admin-guide/kernel-parameters.txt |  7 +--
 arch/powerpc/include/asm/cputable.h |  6 ++
 arch/powerpc/include/asm/tm.h   |  6 --
 arch/powerpc/kernel/cputable.c  | 12 
 arch/powerpc/kernel/setup_64.c  | 16 ++--
 5 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 4e2b5d9078a0..a0f757f749cf 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -805,8 +805,11 @@
Disable RADIX MMU mode on POWER9
 
ppc_tm= [PPC]
-   Format: {"off"}
-   Disable Hardware Transactional Memory
+   Format: {"off" | "no-suspend"}
+   "Off" Will disable Hardware Transactional Memory.
+   "no-suspend" Informs the kernel that the
+   hardware will not transition into the kernel
+   with a suspended transaction.
 
disable_cpu_apicid= [X86,APIC,SMP]
Format: 
diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index a9bf921f4efc..e66101830af2 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -124,6 +124,12 @@ extern void identify_cpu_name(unsigned int pvr);
 extern void do_feature_fixups(unsigned long value, void *fixup_start,
  void *fixup_end);
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+extern bool tm_suspend_supported(void);
+#else
+static inline bool tm_suspend_supported(void) { return false; }
+#endif
+
 extern const char *powerpc_base_platform;
 
 #ifdef CONFIG_JUMP_LABEL_FEATURE_CHECKS
diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h
index eca1c866ca97..1fd0b5f72861 100644
--- a/arch/powerpc/include/asm/tm.h
+++ b/arch/powerpc/include/asm/tm.h
@@ -9,9 +9,11 @@
 
 #ifndef __ASSEMBLY__
 
-#define TM_STATE_ON0
-#define TM_STATE_OFF   1
+#define TM_STATE_ON0
+#define TM_STATE_OFF   1
+#define TM_STATE_NO_SUSPEND2
 
+extern int ppc_tm_state;
 extern void tm_enable(void);
 extern void tm_reclaim(struct thread_struct *thread,
   unsigned long orig_msr, uint8_t cause);
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 760872916013..2cb01b48123a 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -22,6 +22,7 @@
 #include   /* for PTRRELOC on ARCH=ppc */
 #include 
 #include 
+#include 
 
 static struct cpu_spec the_cpu_spec __read_mostly;
 
@@ -2301,6 +2302,17 @@ void __init identify_cpu_name(unsigned int pvr)
}
 }
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+bool tm_suspend_supported(void)
+{
+   if (cpu_has_feature(CPU_FTR_TM)) {
+   if (pvr_version_is(PVR_POWER9) && ppc_tm_state != 
TM_STATE_NO_SUSPEND)
+   return false;
+   return true;
+   }
+   return false;
+}
+#endif
 
 #ifdef CONFIG_JUMP_LABEL_FEATURE_CHECKS
 struct static_key_true cpu_feature_keys[NUM_CPU_FTR_KEYS] = {
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index e37c26d2e54b..227ac600a1b7 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -251,12 +251,14 @@ static void cpu_ready_for_interrupts(void)
get_paca()->kernel_msr = MSR_KERNEL;
 }
 
+int ppc_tm_state;
 #ifdef 

[PATCH 1/3] powerpc/tm: Add commandline option to disable hardware transactional memory

2017-10-06 Thread Cyril Bur
Currently the kernel relies on firmware to inform it whether or not the
CPU supports HTM and as long as the kernel was built with
CONFIG_PPC_TRANSACTIONAL_MEM=y then it will allow userspace to make use
of the facility.

There may be situations where it would be advantageous for the kernel
to not allow userspace to use HTM, currently the only way to achieve
this is to recompile the kernel with CONFIG_PPC_TRANSACTIONAL_MEM=n.

This patch adds a simple commandline option so that HTM can be disabled
at boot time.

Signed-off-by: Cyril Bur 
---
 Documentation/admin-guide/kernel-parameters.txt |  4 
 arch/powerpc/include/asm/tm.h   |  3 +++
 arch/powerpc/kernel/setup_64.c  | 28 +
 3 files changed, 35 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 05496622b4ef..4e2b5d9078a0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -804,6 +804,10 @@
disable_radix   [PPC]
Disable RADIX MMU mode on POWER9
 
+   ppc_tm= [PPC]
+   Format: {"off"}
+   Disable Hardware Transactional Memory
+
disable_cpu_apicid= [X86,APIC,SMP]
Format: 
The number of initial APIC ID for the
diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h
index 82e06ca3a49b..eca1c866ca97 100644
--- a/arch/powerpc/include/asm/tm.h
+++ b/arch/powerpc/include/asm/tm.h
@@ -9,6 +9,9 @@
 
 #ifndef __ASSEMBLY__
 
+#define TM_STATE_ON0
+#define TM_STATE_OFF   1
+
 extern void tm_enable(void);
 extern void tm_reclaim(struct thread_struct *thread,
   unsigned long orig_msr, uint8_t cause);
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index b89c6aac48c9..e37c26d2e54b 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -68,6 +68,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
@@ -250,6 +251,31 @@ static void cpu_ready_for_interrupts(void)
get_paca()->kernel_msr = MSR_KERNEL;
 }
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static int ppc_tm_state;
+static int __init parse_ppc_tm(char *p)
+{
+   if (strcmp(p, "off") == 0)
+   ppc_tm_state = TM_STATE_OFF;
+   else
+   printk(KERN_NOTICE "Unknown value to cmdline ppc_tm '%s'\n", p);
+   return 0;
+}
+early_param("ppc_tm", parse_ppc_tm);
+
+static void check_disable_tm(void)
+{
+   if (cpu_has_feature(CPU_FTR_TM) && ppc_tm_state == TM_STATE_OFF) {
+   printk(KERN_NOTICE "Disabling hardware transactional memory (HTM)\n");
+   cur_cpu_spec->cpu_user_features2 &=
+   ~(PPC_FEATURE2_HTM_NOSC | PPC_FEATURE2_HTM);
+   cur_cpu_spec->cpu_features &= ~CPU_FTR_TM;
+   }
+}
+#else
+static void check_disable_tm(void) { }
+#endif
+
 /*
  * Early initialization entry point. This is called by head.S
  * with MMU translation disabled. We rely on the "feature" of
@@ -299,6 +325,8 @@ void __init early_setup(unsigned long dt_ptr)
 */
early_init_devtree(__va(dt_ptr));
 
+   check_disable_tm();
+
/* Now we know the logical id of our boot cpu, setup the paca. */
setup_paca([boot_cpuid]);
fixup_boot_paca();
-- 
2.14.2



Re: [PATCH 1/4] PCI: Remove redundant pcibios_set_master() declarations

2017-10-06 Thread Jesper Nilsson
On Thu, Oct 05, 2017 at 03:38:42PM -0500, Bjorn Helgaas wrote:
> From: Bjorn Helgaas 
> 
> All users of pcibios_set_master() include <linux/pci.h>, which already has
> a declaration.  Remove the unnecessary declarations from the <asm/pci.h>
> files.
> 
> Signed-off-by: Bjorn Helgaas 
> ---
>  arch/alpha/include/asm/pci.h   |2 --
>  arch/cris/include/asm/pci.h|1 -

For what it's worth, for the cris changes:

Acked-by: Jesper Nilsson 

>  arch/frv/include/asm/pci.h |2 --
>  arch/mips/include/asm/pci.h|2 --
>  arch/mn10300/include/asm/pci.h |2 --
>  arch/parisc/include/asm/pci.h  |1 -
>  arch/sh/include/asm/pci.h  |2 --
>  arch/x86/include/asm/pci.h |1 -
>  8 files changed, 13 deletions(-)

/^JN - Jesper Nilsson
-- 
   Jesper Nilsson -- jesper.nils...@axis.com


Re: [PATCH 2/4] PCI: Remove redundant pci_dev, pci_bus, resource declarations

2017-10-06 Thread Jesper Nilsson
On Thu, Oct 05, 2017 at 03:38:49PM -0500, Bjorn Helgaas wrote:
> From: Bjorn Helgaas 
> 
> <linux/pci.h> defines struct pci_bus and struct pci_dev and includes the
> struct resource definition before including <asm/pci.h>.  Nobody includes
> <asm/pci.h> directly, so they don't need their own declarations.
> 
> Remove the redundant struct pci_dev, pci_bus, resource declarations.
> 
> Signed-off-by: Bjorn Helgaas 
> ---
>  arch/alpha/include/asm/pci.h|3 ---
>  arch/cris/include/asm/pci.h |2 --

For what it's worth, for the cris changes:

Acked-by: Jesper Nilsson 

>  arch/frv/include/asm/pci.h  |2 --
>  arch/ia64/include/asm/pci.h |2 --
>  arch/mips/include/asm/pci.h |2 --
>  arch/mn10300/include/asm/pci.h  |2 --
>  arch/parisc/include/asm/pci.h   |7 ---
>  arch/powerpc/include/asm/pci.h  |2 --
>  arch/sh/include/asm/pci.h   |2 --
>  arch/sparc/include/asm/pci_32.h |2 --
>  arch/xtensa/include/asm/pci.h   |2 --
>  11 files changed, 28 deletions(-)

/^JN - Jesper Nilsson
-- 
   Jesper Nilsson -- jesper.nils...@axis.com


[PATCH] cpufreq: powernv: Return the actual CPU frequency in /proc/cpuinfo

2017-10-06 Thread Shriya
Make /proc/cpuinfo read the frequency of the CPU it is running at
instead of reading the cached value of the last requested frequency.
In conditions like WOF/throttle CPU can be running at a different
frequency than the requested frequency.

Signed-off-by: Shriya 
---
 arch/powerpc/platforms/powernv/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 897aa14..55ea4bf 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -311,7 +311,7 @@ static unsigned long pnv_get_proc_freq(unsigned int cpu)
 {
unsigned long ret_freq;
 
-   ret_freq = cpufreq_quick_get(cpu) * 1000ul;
+   ret_freq = cpufreq_get(cpu) * 1000ul;
 
/*
 * If the backend cpufreq driver does not exist,
-- 
1.9.1



Re: [PATCH] mm: deferred_init_memmap improvements

2017-10-06 Thread Anshuman Khandual
On 10/04/2017 08:59 PM, Pavel Tatashin wrote:
> This patch fixes another existing issue on systems that have holes in
> zones i.e CONFIG_HOLES_IN_ZONE is defined.
> 
> In for_each_mem_pfn_range() we have code like this:
> 
> if (!pfn_valid_within(pfn)
>   goto free_range;
> 
> Note: 'page' is not set to NULL and is not incremented but 'pfn' advances.

page is initialized to NULL at the beginning of the function.
PFN advances but we dont proceed unless pfn_valid_within(pfn)
holds true which basically should have checked with arch call
back if the PFN is valid in presence of memory holes as well.
Is not this correct ?

> This means if deferred struct pages are enabled on systems with these kinds
> of holes, linux would get memory corruptions. I have fixed this issue by
> defining a new macro that performs all the necessary operations when we
> free the current set of pages.

If we bail out in case PFN is not valid, then how corruption
can happen ?



[PATCH 3/3] powerpc: use NMI IPI for smp_send_stop

2017-10-06 Thread Nicholas Piggin
Use the NMI IPI rather than smp_call_function for smp_send_stop.
Have stopped CPUs hard disable interrupts rather than just soft
disable.

This function is used in crash/panic/shutdown paths to bring other
CPUs down as quickly and reliably as possible, and minimizing their
potential to cause trouble.

Avoiding the Linux smp_call_function infrastructure and (if supported)
using true NMI IPIs makes this more robust.

Also use spin loop primitives in the stop callback, mainly to help
the processing speed of the active thread in the simulator.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/smp.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index e0a4c1f82e25..ce891030d925 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -547,19 +547,20 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
 }
 #endif
 
-static void stop_this_cpu(void *dummy)
+static void __noreturn stop_this_cpu(struct pt_regs *regs)
 {
/* Remove this CPU */
set_cpu_online(smp_processor_id(), false);
 
-   local_irq_disable();
+   hard_irq_disable();
+   spin_begin();
while (1)
-   ;
+   spin_cpu_relax();
 }
 
 void smp_send_stop(void)
 {
-   smp_call_function(stop_this_cpu, NULL, 0);
+   smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, stop_this_cpu, 100);
 }
 
 struct thread_info *current_set[NR_CPUS];
-- 
2.13.3



[PATCH 2/3] powerpc/powernv: Always stop secondaries before reboot/shutdown

2017-10-06 Thread Nicholas Piggin
Currently powernv reboot and shutdown requests just leave secondaries
to do their own things. This is undesirable because they can trigger
any number of watchdogs while waiting for reboot, but also we don't
know what else they might be doing, or they might be stuck somewhere
causing trouble.

The opal scheduled flash update code already ran into watchdog problems
due to flashing taking a long time, but it's possible for regular
reboots to trigger problems too (this is with watchdog_thresh set to 1,
but I have seen it with watchdog_thresh at the default value once too):

  reboot: Restarting system
  [  360.038896709,5] OPAL: Reboot request...
  Watchdog CPU:0 Hard LOCKUP
  Watchdog CPU:44 detected Hard LOCKUP other CPUS:16
  Watchdog CPU:16 Hard LOCKUP
  watchdog: BUG: soft lockup - CPU#16 stuck for 3s! [swapper/16:0]

So remove the special case for flash update, and unconditionally do
smp_send_stop before rebooting.

Return the CPUs to Linux stop loops rather than OPAL. The reason for
this is that in firmware, CPUs will check for jobs, whereas smp_send_stop
puts them into a simple infinite loop. If there is some corruption, it
is better to do the latter, to maximize the chance of a successful
reboot.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/opal.h |  2 +-
 arch/powerpc/platforms/powernv/opal-flash.c | 28 +---
 arch/powerpc/platforms/powernv/setup.c  | 15 +--
 3 files changed, 7 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 04c32b08ffa1..ce58f4139ff5 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -317,7 +317,7 @@ struct rtc_time;
 extern unsigned long opal_get_boot_time(void);
 extern void opal_nvram_init(void);
 extern void opal_flash_update_init(void);
-extern void opal_flash_term_callback(void);
+extern void opal_flash_update_print_message(void);
 extern int opal_elog_init(void);
 extern void opal_platform_dump_init(void);
 extern void opal_sys_param_init(void);
diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c
index 2fa3ac80cb4e..632871d78576 100644
--- a/arch/powerpc/platforms/powernv/opal-flash.c
+++ b/arch/powerpc/platforms/powernv/opal-flash.c
@@ -303,26 +303,9 @@ static int opal_flash_update(int op)
return rc;
 }
 
-/* Return CPUs to OPAL before starting FW update */
-static void flash_return_cpu(void *info)
-{
-   int cpu = smp_processor_id();
-
-   if (!cpu_online(cpu))
-   return;
-
-   /* Disable IRQ */
-   hard_irq_disable();
-
-   /* Return the CPU to OPAL */
-   opal_return_cpu();
-}
-
 /* This gets called just before system reboots */
-void opal_flash_term_callback(void)
+void opal_flash_update_print_message(void)
 {
-   struct cpumask mask;
-
if (update_flash_data.status != FLASH_IMG_READY)
return;
 
@@ -333,15 +316,6 @@ void opal_flash_term_callback(void)
 
/* Small delay to help getting the above message out */
msleep(500);
-
-   /* Return secondary CPUs to firmware */
-   cpumask_copy(, cpu_online_mask);
-   cpumask_clear_cpu(smp_processor_id(), );
-   if (!cpumask_empty())
-   smp_call_function_many(,
-  flash_return_cpu, NULL, false);
-   /* Hard disable interrupts */
-   hard_irq_disable();
 }
 
 /*
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index cf52d53da460..0d2f70d24747 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -112,17 +112,12 @@ static void pnv_prepare_going_down(void)
 */
opal_event_shutdown();
 
-   /* Soft disable interrupts */
-   local_irq_disable();
+   /* Print flash update message if one is scheduled. */
+   opal_flash_update_print_message();
 
-   /*
-* Return secondary CPUs to firwmare if a flash update
-* is pending otherwise we will get all sort of error
-* messages about CPU being stuck etc.. This will also
-* have the side effect of hard disabling interrupts so
-* past this point, the kernel is effectively dead.
-*/
-   opal_flash_term_callback();
+   smp_send_stop();
+
+   hard_irq_disable();
 }
 
 static void  __noreturn pnv_restart(char *cmd)
-- 
2.13.3



[PATCH 1/3] powerpc/powernv: Avoid the secondary hold spinloop for OPAL boot

2017-10-06 Thread Nicholas Piggin
OPAL boot does not insert secondaries at 0x60 to wait at the secondary
hold spinloop. Instead it keeps them held in firmware until the
opal_start_cpu call is made, which directs them where the caller
specifies. Linux inserts them into generic_secondary_smp_init(), which
is after the secondary hold spinloop (they go on to spin at the per-CPU
paca loops, but that is another step).

So avoid waiting on this spinloop when booting with OPAL firmware.
It always just times out.

This saves 100ms boot time on bare metal, and 10s of seconds when
booting the simulator in SMP.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/head_64.S  |  4 +++-
 arch/powerpc/kernel/setup_64.c | 14 --
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index c9e760ec7530..1ebfb3f2cbbb 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -60,7 +60,9 @@
  *   1. The MMU is off, processor in HV mode, primary CPU enters at 0
  *  with device-tree in gpr3. We also get OPAL base in r8 and
  * entry in r9 for debugging purposes
- *   2. Secondary processors enter at 0x60 with PIR in gpr3
+ *   2. Secondary processors enter as directed by opal_start_cpu(), which
+ *  is generic_secondary_smp_init, with PIR in gpr3. The secondary spin
+ *  code is not used.
  *
  *  For Book3E processors:
  *   1. The MMU is on running in AS0 in a state defined in ePAPR
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 3f2453858f60..eada0a7b73f8 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -363,8 +363,18 @@ void early_setup_secondary(void)
 #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE)
 static bool use_spinloop(void)
 {
-   if (!IS_ENABLED(CONFIG_PPC_BOOK3E))
-   return true;
+   if (IS_ENABLED(CONFIG_PPC_BOOK3S)) {
+   /*
+* With OPAL, secondaries do not use the secondary hold
+* spinloop, rather they are held in firmware until
+* opal_start_cpu() sends them to generic_secondary_smp_init
+* directly.
+*/
+   if (firmware_has_feature(FW_FEATURE_OPAL))
+   return false;
+   else
+   return true;
+   }
 
/*
 * When book3e boots from kexec, the ePAPR spin table does
-- 
2.13.3



[PATCH 0/3] some boot/shutdown improvements

2017-10-06 Thread Nicholas Piggin
Hi,

These are a couple of improvements to powernv/opal boot and
shutdown paths. Also a patch to move smp_send_stop over to
use NMI IPIs, which gives us a significantly better chance to
stop secondaries on platforms which support it (pSeries and
PowerNV POWER9 so far).

Patch 1 in particular it would be good if people could take
a look at. I *think* it's okay wrt kexec, but I could miss
something or some old firmware might do the wrong thing.

Thanks,
Nick

Nicholas Piggin (3):
  powerpc/powernv: Avoid the secondary hold spinloop for OPAL boot
  powerpc/powernv: Always stop secondaries before reboot/shutdown
  powerpc: use NMI IPI for smp_send_stop

 arch/powerpc/include/asm/opal.h |  2 +-
 arch/powerpc/kernel/head_64.S   |  4 +++-
 arch/powerpc/kernel/setup_64.c  | 14 --
 arch/powerpc/kernel/smp.c   |  9 +
 arch/powerpc/platforms/powernv/opal-flash.c | 28 +---
 arch/powerpc/platforms/powernv/setup.c  | 15 +--
 6 files changed, 27 insertions(+), 45 deletions(-)

-- 
2.13.3