[PATCH] powerpc/iommu: Avoid undefined right shift in iommu_range_alloc()

2017-08-08 Thread Michael Ellerman
In iommu_range_alloc() we generate a mask by right shifting ~0,
however if the specified alignment is 0 then we right shift by 64,
which is undefined. UBSAN tells us so:

  UBSAN: Undefined behaviour in ../arch/powerpc/kernel/iommu.c:193:35
  shift exponent 64 is too large for 64-bit type 'long unsigned int'

We can avoid it by instead generating the mask with:

  align_mask = (1ull << align_order) - 1;

That will also generate an undefined shift if align_order is 64 or
greater, but that shouldn't be a problem for a while.
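
For illustration, a minimal user-space sketch (not part of the patch) of
the two mask expressions; the old form trips UBSAN exactly as described
when align_order is 0:

#include <stdio.h>

int main(void)
{
	unsigned long align_order = 0;

	/*
	 * Old form: for align_order == 0 this shifts a 64-bit value by
	 * 64, which is undefined behaviour:
	 *
	 *   unsigned long bad = 0xffffffffffffffffl >> (64 - align_order);
	 */

	/* New form: well defined for align_order 0..63. */
	unsigned long align_mask = (1ull << align_order) - 1;

	printf("align_order=%lu align_mask=0x%lx\n", align_order, align_mask);
	return 0;
}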

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/kernel/iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0e49a4560cff..e0af6cd7ba4f 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -190,7 +190,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
unsigned int pool_nr;
struct iommu_pool *pool;
 
-   align_mask = 0xffffffffffffffffl >> (64 - align_order);
+   align_mask = (1ull << align_order) - 1;
 
/* This allocator was derived from x86_64's bit string search */
 
-- 
2.7.4



[PATCH v5] powerpc/mm: Only read faulting instruction when necessary in do_page_fault()

2017-08-08 Thread Christophe Leroy
Commit a7a9dcd882a67 ("powerpc: Avoid taking a data miss on every
userspace instruction miss") has shown that limiting the read of the
faulting instruction to likely cases improves performance.

This patch goes further in this direction by limiting the read of
the faulting instruction to only the cases where it is definitely
needed.

On an MPC885, with the same benchmark app as in the commit referred
above, we see a reduction of 4000 dTLB misses (approx 3%):

Before the patch:
 Performance counter stats for './fault 500' (10 runs):

 720495838  cpu-cycles  ( +-  0.04% )
141769  dTLB-load-misses( +-  0.02% )
 52722  iTLB-load-misses( +-  0.01% )
 19611  faults  ( +-  0.02% )

   5.750535176 seconds time elapsed ( +-  0.16% )

With the patch:
 Performance counter stats for './fault 500' (10 runs):

 717669123  cpu-cycles  ( +-  0.02% )
137344  dTLB-load-misses( +-  0.03% )
 52731  iTLB-load-misses( +-  0.01% )
 19614  faults  ( +-  0.03% )

   5.728423115 seconds time elapsed ( +-  0.14% )
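
(For reference, the dTLB numbers above work out to 141769 - 137344 = 4425
fewer misses, i.e. roughly 4425 / 141769 ~= 3.1%.)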

Correct operation of the huge stack expansion was tested with the
following app:

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
char buf[1024 * 1025];

sprintf(buf, "Hello world !\n");
printf(buf);

exit(0);
}

Signed-off-by: Christophe Leroy 
---
 I'm wondering if it is really worth it to do something so complex. Is
 there really a chance that the get_user() faults? It would mean that an
 instruction that has just been executed has in the meantime been swapped
 out. Is that really a possibility? I'd expect not, which would mean that
 we could limit it to __get_user_inatomic() and then not implement this
 complex unlocking and retry stuff.

 v5: Reworked to fit after Benh's do_fault improvement and rebased on top
     of powerpc/merge (65152902e43fef)

 v4: Rebased on top of powerpc/next (f718d426d7e42e) and doing access_ok()
     verification before __get_user_xxx()

 v3: Do a first try with pagefault disabled before releasing the semaphore

 v2: Replaces 'if (cond1) if (cond2)' by 'if (cond1 && cond2)'

 arch/powerpc/mm/fault.c | 90 +++--
 1 file changed, 65 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index f88fac3d281b..7a218f69f956 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -68,26 +68,58 @@ static inline bool notify_page_fault(struct pt_regs *regs)
 /*
  * Check whether the instruction at regs->nip is a store using
  * an update addressing form which will update r1.
+ * If no, returns STACK_EXPANSION_BAD
+ * If yes, returns STACK_EXPANSION_GOOD
+ * In addition, the result is ored with STACK_EXPANSION_UNLOCKED if the
+ * semaphore has been released
  */
-static bool store_updates_sp(struct pt_regs *regs)
+
+#define STACK_EXPANSION_BAD0
+#define STACK_EXPANSION_GOOD   1
+#define STACK_EXPANSION_LOCKED 0
+#define STACK_EXPANSION_UNLOCKED   2
+
+int store_updates_sp(struct pt_regs *regs)
 {
unsigned int inst;
+   unsigned int __user *nip = (unsigned int __user *)regs->nip;
+   int ret;
+   int sema = STACK_EXPANSION_LOCKED;
+
+   /*
+* We want to do this outside mmap_sem, because reading code around nip
+* can result in fault, which will cause a deadlock when called with
+* mmap_sem held. However, we do a first try with pagefault disabled as
+* a fault here is very unlikely.
+*/
+   if (!access_ok(VERIFY_READ, nip, sizeof(inst)))
+   return STACK_EXPANSION_BAD | STACK_EXPANSION_LOCKED;
+
+   pagefault_disable();
+   ret = __get_user_inatomic(inst, nip);
+   pagefault_enable();
+   if (ret) {
+   up_read(&current->mm->mmap_sem);
+   sema = STACK_EXPANSION_UNLOCKED;
+   if (__get_user(inst, nip))
+   return STACK_EXPANSION_BAD | STACK_EXPANSION_UNLOCKED;
+   }
 
-   if (get_user(inst, (unsigned int __user *)regs->nip))
-   return false;
/* check for 1 in the rA field */
if (((inst >> 16) & 0x1f) != 1)
-   return false;
+   return STACK_EXPANSION_BAD | sema;
+
/* check major opcode */
switch (inst >> 26) {
+   case 62:/* std or stdu */
+   if ((inst & 3) == 0)
+   break;
case 37:/* stwu */
case 39:/* stbu */
case 45:/* sthu */
case 53:/* stfsu */
case 55:/* stfdu */
-   return true;
-   case 62:/* std or stdu */
-   return (inst & 3) == 1;
+   return STACK_EXPANSION_GOOD | sema;
case 31:
/* check minor opcode */

Re: block/ps3vram: Delete an error message for a failed memory allocation in ps3vram_cache_init()

2017-08-08 Thread SF Markus Elfring
>> https://patchwork.ozlabs.org/patch/798575/
> 
> I submitted your patch

Thanks for your constructive feedback.
https://patchwork.ozlabs.org/patch/798850/


> and a fix to ps3vram_probe() with the other patches in my queue.

I find it nice that you picked this change opportunity up after a bit
of discussion (before another developer would eventually have tackled
it).

“Check return of ps3vram_cache_init”
https://patchwork.ozlabs.org/patch/798853/

1. Unfortunately, I find that this specific update suggestion does not fit
   the Linux coding style convention.

   “…
   Do not unnecessarily use braces where a single statement will do.
   …”

2. What do you think about using the check “if (error)” instead?
   (See the sketch below.)

3. Will an additional commit description be useful?
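
For illustration only, a self-contained sketch of the error-handling
style meant in point 2 (cache_init_stub() is a stand-in, not the real
ps3vram_cache_init() signature):

/* Stand-in for ps3vram_cache_init(); returns 0 or a negative errno. */
static int cache_init_stub(void)
{
	return 0;
}

int main(void)
{
	int error;

	error = cache_init_stub();
	if (error)	/* test the value directly, no extra braces needed */
		return 1;

	/* ... rest of the probe would go here ... */
	return 0;
}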

Regards,
Markus


[v2 PATCH] powerpc/powernv/idle: Disable LOSE_FULL_CONTEXT states when stop-api fails

2017-08-08 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Currently, we use the opal call opal_slw_set_reg() to inform the
Sleep-Winkle Engine (SLW) to restore the contents of some of the
Hypervisor state on wakeup from deep idle states that lose full
hypervisor context (characterized by the flag
OPAL_PM_LOSE_FULL_CONTEXT).

However, the current code has a bug in that if opal_slw_set_reg()
fails, we don't disable the use of these deep states (winkle on
POWER8, stop4 onwards on POWER9).

This patch fixes this bug by ensuring that if programming the
sleep-winkle engine to restore the hypervisor states in
pnv_save_sprs_for_deep_states() fails, then we exclude such states by
clearing the OPAL_PM_LOSE_FULL_CONTEXT flag from
supported_cpuidle_states. As a result, POWER8 will be prevented from
using winkle for CPU-Hotplug, and POWER9 will put offlined CPUs into
the default stop state, when available.

Further, we ensure in the initialization of the cpuidle-powernv driver
to only include those states whose flags are present in
supported_cpuidle_states, thereby skipping OPAL_PM_LOSE_FULL_CONTEXT
states when they have been disabled due to stop-api failure.

Fixes: 1e1601b38e6 ("powerpc/powernv/idle: Restore SPRs for deep idle states via stop API.")

Signed-off-by: Gautham R. Shenoy 
---
v2 --> v1:
  Do the bare minimum required to exclude the LOSE_FULL_CONTEXT
  states.

 arch/powerpc/platforms/powernv/idle.c | 41 ---
 drivers/cpuidle/cpuidle-powernv.c | 10 +
 2 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 2abee07..a553aee 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -56,6 +56,7 @@
  */
 static u64 pnv_deepest_stop_psscr_val;
 static u64 pnv_deepest_stop_psscr_mask;
+static u64 pnv_deepest_stop_flag;
 static bool deepest_stop_found;
 
 static int pnv_save_sprs_for_deep_states(void)
@@ -185,8 +186,40 @@ static void pnv_alloc_idle_core_states(void)
 
update_subcore_sibling_mask();
 
-   if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT)
-   pnv_save_sprs_for_deep_states();
+   if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) {
+   int rc = pnv_save_sprs_for_deep_states();
+
+   if (likely(!rc))
+   return;
+
+   /*
+* The stop-api is unable to restore hypervisor
+* resources on wakeup from platform idle states which
+* lose full context. So disable such states.
+*/
+   supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
+   pr_warn("cpuidle-powernv: Disabling idle states that lose full 
context\n");
+   pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug 
affected\n");
+
+   if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+   (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
+   /*
+* Use the default stop state for CPU-Hotplug
+* if available.
+*/
+   if (default_stop_found) {
+   pnv_deepest_stop_psscr_val =
+   pnv_default_stop_val;
+   pnv_deepest_stop_psscr_mask =
+   pnv_default_stop_mask;
+   pr_warn("cpuidle-powernv: Offlined CPUs will 
stop with psscr = 0x%016llx\n",
+   pnv_deepest_stop_psscr_val);
+   } else { /* Fallback to snooze loop for CPU-Hotplug */
+   deepest_stop_found = false;
+   pr_warn("cpuidle-powernv: Offlined CPUs will 
busy wait\n");
+   }
+   }
+   }
 }
 
 u32 pnv_get_supported_cpuidle_states(void)
@@ -375,7 +408,8 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
pnv_deepest_stop_psscr_val;
srr1 = power9_idle_stop(psscr);
 
-   } else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
+   } else if ((idle_states & OPAL_PM_WINKLE_ENABLED) &&
+  (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) {
srr1 = power7_idle_insn(PNV_THREAD_WINKLE);
} else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
   (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
@@ -553,6 +587,7 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
max_residency_ns = residency_ns[i];
pnv_deepest_stop_psscr_val = psscr_val[i];
pnv_deepest_stop_psscr_mask = psscr_mask[i];
+   pnv_deepest_stop_flag = flags[i];
deepest_stop_found = true;
}
 
diff --git a/dri

[PATCH 06/10] powerpc/xive: introduce H_INT_ESB hcall

2017-08-08 Thread Cédric Le Goater
The H_INT_ESB hcall() is used to issue a load or store to the ESB page
instead of using the MMIO pages. This can be used as a workaround on
some HW issues. The OS knows that this hcall should be used on an
interrupt source when the ESB hcall flag is set to 1 in the hcall
H_INT_GET_SOURCE_INFO.

To maintain the frontier between the xive frontend and backend, we
introduce a new xive operation 'esb_rw' to be used in the routines
doing memory accesses on the ESBs.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/xive.h  |  1 +
 arch/powerpc/sysdev/xive/common.c| 10 ++--
 arch/powerpc/sysdev/xive/spapr.c | 44 +++-
 arch/powerpc/sysdev/xive/xive-internal.h |  1 +
 4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index 6d097a18d3ae..c6a4eede7733 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -56,6 +56,7 @@ struct xive_irq_data {
 #define XIVE_IRQ_FLAG_SHIFT_BUG0x04
 #define XIVE_IRQ_FLAG_MASK_FW  0x08
 #define XIVE_IRQ_FLAG_EOI_FW   0x10
+#define XIVE_IRQ_FLAG_H_INT_ESB0x20
 
 #define XIVE_INVALID_CHIP_ID   -1
 
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 22b6f8954083..891d24c82e03 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -198,7 +198,10 @@ static u8 xive_peek_esb(struct xive_irq_data *xd, u32 offset)
if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
offset |= offset << 4;
 
-   val = in_be64(xd->eoi_mmio + offset);
+   if ((xd->flags & XIVE_IRQ_FLAG_H_INT_ESB) && xive_ops->esb_rw)
+   val = xive_ops->esb_rw(xd->hw_irq, offset, 0, 0);
+   else
+   val = in_be64(xd->eoi_mmio + offset);
 
return (u8)val;
 }
@@ -209,7 +212,10 @@ static void xive_esb_write(struct xive_irq_data *xd, u32 offset, u64 data)
if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
offset |= offset << 4;
 
-   out_be64(xd->eoi_mmio + offset, data);
+   if ((xd->flags & XIVE_IRQ_FLAG_H_INT_ESB) && xive_ops->esb_rw)
+   xive_ops->esb_rw(xd->hw_irq, offset, data, 1);
+   else
+   out_be64(xd->eoi_mmio + offset, data);
 }
 
 #ifdef CONFIG_XMON
diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c
index 9a0fd9f7e38a..7fc40047c23d 100644
--- a/arch/powerpc/sysdev/xive/spapr.c
+++ b/arch/powerpc/sysdev/xive/spapr.c
@@ -224,7 +224,46 @@ static long plpar_int_sync(unsigned long flags)
return 0;
 }
 
-#define XIVE_SRC_H_INT_ESB (1ull << (63 - 60)) /* TODO */
+#define XIVE_ESB_FLAG_STORE (1ull << (63 - 63))
+
+static long plpar_int_esb(unsigned long flags,
+ unsigned long lisn,
+ unsigned long offset,
+ unsigned long in_data,
+ unsigned long *out_data)
+{
+   unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+   long rc;
+
+   pr_devel("H_INT_ESB flags=%lx lisn=%lx offset=%lx in=%lx\n",
+   flags,  lisn, offset, in_data);
+
+   rc = plpar_hcall(H_INT_ESB, retbuf, flags, lisn, offset, in_data);
+   if (rc) {
+   pr_err("H_INT_ESB lisn=%ld offset=%ld returned %ld\n",
+  lisn, offset, rc);
+   return  rc;
+   }
+
+   *out_data = retbuf[0];
+
+   return 0;
+}
+
+static u64 xive_spapr_esb_rw(u32 lisn, u32 offset, u64 data, bool write)
+{
+   unsigned long read_data;
+   long rc;
+
+   rc = plpar_int_esb(write ? XIVE_ESB_FLAG_STORE : 0,
+  lisn, offset, data, &read_data);
+   if (rc)
+   return -1;
+
+   return write ? 0 : read_data;
+}
+
+#define XIVE_SRC_H_INT_ESB (1ull << (63 - 60))
 #define XIVE_SRC_LSI   (1ull << (63 - 61))
 #define XIVE_SRC_TRIGGER   (1ull << (63 - 62))
 #define XIVE_SRC_STORE_EOI (1ull << (63 - 63))
@@ -244,6 +283,8 @@ static int xive_spapr_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
if (rc)
return  -EINVAL;
 
+   if (flags & XIVE_SRC_H_INT_ESB)
+   data->flags  |= XIVE_IRQ_FLAG_H_INT_ESB;
if (flags & XIVE_SRC_STORE_EOI)
data->flags  |= XIVE_IRQ_FLAG_STORE_EOI;
if (flags & XIVE_SRC_LSI)
@@ -483,6 +524,7 @@ static const struct xive_ops xive_spapr_ops = {
.setup_cpu  = xive_spapr_setup_cpu,
.teardown_cpu   = xive_spapr_teardown_cpu,
.sync_source= xive_spapr_sync_source,
+   .esb_rw = xive_spapr_esb_rw,
 #ifdef CONFIG_SMP
.get_ipi= xive_spapr_get_ipi,
.put_ipi= xive_spapr_put_ipi,
diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
index d07ef2d29caf..c99a100abf02 100644
--- a/arch/powerpc/sysdev/xi

[PATCH 10/10] powerpc/xive: fix the size of the cpumask used in xive_find_target_in_mask()

2017-08-08 Thread Cédric Le Goater
When called from xive_irq_startup(), the size of the cpumask can be
larger than nr_cpu_ids. Most of the time, its value is NR_CPUS (2048).
This can result in such WARNINGs in xive_find_target_in_mask():

   [0.094480] WARNING: CPU: 10 PID: 1 at ../arch/powerpc/sysdev/xive/common.c:476 xive_find_target_in_mask+0x110/0x2f0
   [0.094486] Modules linked in:
   [0.094491] CPU: 10 PID: 1 Comm: swapper/0 Not tainted 4.12.0+ #3
   [0.094496] task: c003fae4f200 task.stack: c003fe108000
   [0.094501] NIP: c008a310 LR: c008a2e4 CTR: 0072ca34
   [0.094506] REGS: c003fe10b360 TRAP: 0700   Not tainted  (4.12.0+)
   [0.094510] MSR: 80029033 
   [0.094515]   CR: 88000222  XER: 20040008
   [0.094521] CFAR: c008a2cc SOFTE: 0
   [0.094521] GPR00: c008a274 c003fe10b5e0 c1428f00 0010
   [0.094521] GPR04: 0010 0010 0010 0099
   [0.094521] GPR08: 0010 0001
   [0.094521] GPR12:  cfff2d00 c000d4d8
   [0.094521] GPR16:
   [0.094521] GPR20:    c0b451e8
   [0.094521] GPR24:  c1462354 0800 07ff
   [0.094521] GPR28: c1462354 0010 c003f857e418 0010
   [0.094580] NIP [c008a310] xive_find_target_in_mask+0x110/0x2f0
   [0.094585] LR [c008a2e4] xive_find_target_in_mask+0xe4/0x2f0
   [0.094589] Call Trace:
   [0.094593] [c003fe10b5e0] [c008a274] xive_find_target_in_mask+0x74/0x2f0 (unreliable)
   [0.094601] [c003fe10b690] [c008abf0] xive_pick_irq_target.isra.1+0x200/0x230
   [0.094608] [c003fe10b830] [c008b250] xive_irq_startup+0x60/0x180
   [0.094614] [c003fe10b8b0] [c01608f0] irq_startup+0x70/0xd0
   [0.094620] [c003fe10b8f0] [c015df7c] __setup_irq+0x7bc/0x880
   [0.094626] [c003fe10ba90] [c015e30c] request_threaded_irq+0x14c/0x2c0
   [0.094632] [c003fe10baf0] [c00aeb00] request_event_sources_irqs+0x100/0x180
   [0.094639] [c003fe10bc10] [c0e7d2f8] __machine_initcall_pseries_init_ras_IRQ+0x104/0x134
   [0.094646] [c003fe10bc40] [c000cc88] do_one_initcall+0x68/0x1d0
   [0.094652] [c003fe10bd00] [c0e643c8] kernel_init_freeable+0x290/0x374
   [0.094658] [c003fe10bdc0] [c000d4f4] kernel_init+0x24/0x170
   [0.094664] [c003fe10be30] [c000b268] ret_from_kernel_thread+0x5c/0x74
   [0.094669] Instruction dump:
   [0.094673] 48586529 6000 e8dc0002 393f0001 7f9b4800 7c7d07b4 7d3f07b4 409effcc
   [0.094682] 7f9d3000 7d26e850 79290fe0 69290001 <0b09> 409c0194 3f620004 3b7b8ec8

Fix this problem by clamping the cpumask weight to nr_cpu_ids.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 536ee15f61fb..4dac7d560a42 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -463,7 +463,7 @@ static int xive_find_target_in_mask(const struct cpumask *mask,
int cpu, first, num, i;
 
/* Pick up a starting point CPU in the mask based on  fuzz */
-   num = cpumask_weight(mask);
+   num = min_t(int, cpumask_weight(mask), nr_cpu_ids);
first = fuzz % num;
 
/* Locate it */
-- 
2.7.5



[PATCH 00/10] guest exploitation of the XIVE interrupt controller

2017-08-08 Thread Cédric Le Goater
Hello,

On a POWER9 sPAPR machine, the Client Architecture Support (CAS)
negotiation process determines whether the guest operates with an
interrupt controller using the legacy model, as found on POWER8, or in
XIVE exploitation mode, the newer POWER9 interrupt model. This
patchset is a first proposal to add XIVE support in the sPAPR machine.

Tested with a QEMU XIVE model for sPAPR machine and with the Power
hypervisor.

Code is here:

  https://github.com/legoater/linux/commits/xive
  https://github.com/legoater/qemu/commits/xive   

Thanks,

C.

Changes since RFC :

 - renamed backend to 'spapr'
 - fixed hotplug support
 - fixed kexec support
 - fixed src_chip value (XIVE_INVALID_CHIP_ID)
 - added doorbell support
 - added some debug logs
 - added  H_INT_ESB hcall
 - took into account '/ibm,plat-res-int-priorities'
 - fixed WARNING in xive_find_target_in_mask()

Cédric Le Goater (10):
  powerpc/xive: fix OV5_XIVE_EXPLOIT bits
  powerpc/xive: guest exploitation of the XIVE interrupt controller
  powerpc/xive: rename xive_poke_esb in xive_esb_read
  powerpc/xive: introduce xive_esb_write
  powerpc/xive: add the HW IRQ number under xive_irq_data
  powerpc/xive: introduce H_INT_ESB hcall
  powerpc/xive: add XIVE exploitation mode to CAS
  powerpc/xive: take into account '/ibm,plat-res-int-priorities'
  powerpc/xive: improve debugging macros
  powerpc/xive: fix the size of the cpumask used in
xive_find_target_in_mask()

 arch/powerpc/include/asm/hvcall.h|  13 +-
 arch/powerpc/include/asm/prom.h  |   3 +-
 arch/powerpc/include/asm/xive.h  |   4 +
 arch/powerpc/kernel/prom_init.c  |  15 +-
 arch/powerpc/platforms/pseries/Kconfig   |   1 +
 arch/powerpc/platforms/pseries/hotplug-cpu.c |  10 +-
 arch/powerpc/platforms/pseries/kexec.c   |   6 +-
 arch/powerpc/platforms/pseries/setup.c   |   8 +-
 arch/powerpc/platforms/pseries/smp.c |  32 +-
 arch/powerpc/sysdev/xive/Kconfig |   5 +
 arch/powerpc/sysdev/xive/Makefile|   1 +
 arch/powerpc/sysdev/xive/common.c|  49 +-
 arch/powerpc/sysdev/xive/native.c|   2 +
 arch/powerpc/sysdev/xive/spapr.c | 658 +++
 arch/powerpc/sysdev/xive/xive-internal.h |   1 +
 15 files changed, 778 insertions(+), 30 deletions(-)
 create mode 100644 arch/powerpc/sysdev/xive/spapr.c

-- 
2.7.5



[PATCH 08/10] powerpc/xive: take into account '/ibm,plat-res-int-priorities'

2017-08-08 Thread Cédric Le Goater
'/ibm,plat-res-int-priorities' contains a list of priorities that the
hypervisor has reserved for its own use. Scan these ranges to choose
the lowest unused priority for the xive spapr backend.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/spapr.c | 62 +++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c
index 7fc40047c23d..220331986bd8 100644
--- a/arch/powerpc/sysdev/xive/spapr.c
+++ b/arch/powerpc/sysdev/xive/spapr.c
@@ -532,13 +532,70 @@ static const struct xive_ops xive_spapr_ops = {
.name   = "spapr",
 };
 
+/*
+ * get max priority from "/ibm,plat-res-int-priorities"
+ */
+static bool xive_get_max_prio(u8 *max_prio)
+{
+   struct device_node *rootdn;
+   const __be32 *reg;
+   u32 len;
+   int prio, found;
+
+   rootdn = of_find_node_by_path("/");
+   if (!rootdn) {
+   pr_err("not root node found !\n");
+   return false;
+   }
+
+   reg = of_get_property(rootdn, "ibm,plat-res-int-priorities", &len);
+   if (!reg) {
+   pr_err("Failed to read 'ibm,plat-res-int-priorities' 
property\n");
+   return false;
+   }
+
+   if (len % (2 * sizeof(u32)) != 0) {
+   pr_err("invalid 'ibm,plat-res-int-priorities' property\n");
+   return false;
+   }
+
+   /* HW supports priorities in the range [0-7] and 0xFF is a
+* wildcard priority used to mask. We scan the ranges reserved
+* by the hypervisor to find the lowest priority we can use.
+*/
+   found = 0xFF;
+   for (prio = 0; prio < 8; prio++) {
+   int reserved = 0;
+   int i;
+
+   for (i = 0; i < len / (2 * sizeof(u32)); i++) {
+   int base  = be32_to_cpu(reg[2 * i]);
+   int range = be32_to_cpu(reg[2 * i + 1]);
+
+   if (prio >= base && prio < base + range)
+   reserved++;
+   }
+
+   if (!reserved)
+   found = prio;
+   }
+
+   if (found == 0xFF) {
+   pr_err("no valid priority found in 
'ibm,plat-res-int-priorities'\n");
+   return false;
+   }
+
+   *max_prio = found;
+   return true;
+}
+
 bool xive_spapr_init(void)
 {
struct device_node *np;
struct resource r;
void __iomem *tima;
struct property *prop;
-   u8 max_prio = 7;
+   u8 max_prio;
u32 val;
u32 len;
const __be32 *reg;
@@ -566,6 +623,9 @@ bool xive_spapr_init(void)
return false;
}
 
+   if (!xive_get_max_prio(&max_prio))
+   return false;
+
/* Feed the IRQ number allocator with the ranges given in the DT */
reg = of_get_property(np, "ibm,xive-lisn-ranges", &len);
if (!reg) {
-- 
2.7.5



[PATCH 09/10] powerpc/xive: improve debugging macros

2017-08-08 Thread Cédric Le Goater
Having the CPU identifier in the debug logs is helpful when tracking
issues. Also add some more logging and fix a compile issue in
xive_do_source_eoi().

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 891d24c82e03..536ee15f61fb 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -40,7 +40,8 @@
 #undef DEBUG_ALL
 
 #ifdef DEBUG_ALL
-#define DBG_VERBOSE(fmt...)pr_devel(fmt)
+#define DBG_VERBOSE(fmt, ...)  pr_devel("cpu %d - " fmt, \
+smp_processor_id(), ## __VA_ARGS__)
 #else
 #define DBG_VERBOSE(fmt...)do { } while(0)
 #endif
@@ -344,7 +345,7 @@ void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
xive_peek_esb(xd, XIVE_ESB_LOAD_EOI);
else {
eoi_val = xive_peek_esb(xd, XIVE_ESB_SET_PQ_00);
-   DBG_VERBOSE("eoi_val=%x\n", offset, eoi_val);
+   DBG_VERBOSE("eoi_val=%x\n", eoi_val);
 
/* Re-trigger if needed */
if ((eoi_val & XIVE_ESB_VAL_Q) && xd->trig_mmio)
@@ -1004,6 +1005,9 @@ static void xive_ipi_eoi(struct irq_data *d)
 {
struct xive_cpu *xc = __this_cpu_read(xive_cpu);
 
+   DBG_VERBOSE("IPI eoi: irq=%d [0x%lx] (HW IRQ 0x%x) pending=%02x\n",
+   d->irq, irqd_to_hwirq(d), xc->hw_ipi, xc->pending_prio);
+
/* Handle possible race with unplug and drop stale IPIs */
if (!xc)
return;
-- 
2.7.5



[PATCH 04/10] powerpc/xive: introduce xive_esb_write

2017-08-08 Thread Cédric Le Goater
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index e6b245bb9602..22b6f8954083 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -203,6 +203,15 @@ static u8 xive_peek_esb(struct xive_irq_data *xd, u32 offset)
return (u8)val;
 }
 
+static void xive_esb_write(struct xive_irq_data *xd, u32 offset, u64 data)
+{
+   /* Handle HW errata */
+   if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
+   offset |= offset << 4;
+
+   out_be64(xd->eoi_mmio + offset, data);
+}
+
 #ifdef CONFIG_XMON
 static void xive_dump_eq(const char *name, struct xive_q *q)
 {
@@ -297,7 +306,7 @@ void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
 {
/* If the XIVE supports the new "store EOI facility, use it */
if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
-   out_be64(xd->eoi_mmio + XIVE_ESB_STORE_EOI, 0);
+   xive_esb_write(xd, XIVE_ESB_STORE_EOI, 0);
else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) {
/*
 * The FW told us to call it. This happens for some
-- 
2.7.5



[PATCH 01/10] powerpc/xive: fix OV5_XIVE_EXPLOIT bits

2017-08-08 Thread Cédric Le Goater
Platform Exploitation Mode support is indicated by the property
"ibm,arch-vec-5-platform-support-vec-5" : byte 23 bits 0-1 set to 0b01
or 0b10.

OS Selection for Exploitation Mode is indicated by the property
"ibm,architecture-vec-5" : byte 23 bits 0-1 set to 0b01. A value of
0b00 indicates use of legacy compatibility mode.
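
For reference, a small stand-alone sketch of how these option-vector
constants decode via OV5_INDX()/OV5_FEAT() (used later in prom_init.c;
the macro bodies below are my assumption, shown for illustration only):

#include <stdio.h>

/* Assumed to mirror the kernel's OV5_INDX()/OV5_FEAT() helpers. */
#define OV5_INDX(x)	((x) >> 8)	/* byte index in option vector 5 */
#define OV5_FEAT(x)	((x) & 0xff)	/* bit mask within that byte */

#define OV5_XIVE_SUPPORT	0x17C0	/* byte 23, bits 0-1 (mask 0xC0) */
#define OV5_XIVE_EXPLOIT	0x1740	/* byte 23, bits 0-1 = 0b01 (0x40) */

int main(void)
{
	printf("XIVE support mask: byte %d, 0x%02x\n",
	       OV5_INDX(OV5_XIVE_SUPPORT), OV5_FEAT(OV5_XIVE_SUPPORT));
	printf("XIVE exploit mode: byte %d, 0x%02x\n",
	       OV5_INDX(OV5_XIVE_EXPLOIT), OV5_FEAT(OV5_XIVE_EXPLOIT));
	return 0;
}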

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/prom.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 35c00d7a0cf8..b6edaa0ed833 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -159,7 +159,8 @@ struct of_drconf_cell {
 #define OV5_PFO_HW_842 0x1140  /* PFO Compression Accelerator */
 #define OV5_PFO_HW_ENCR0x1120  /* PFO Encryption Accelerator */
 #define OV5_SUB_PROCESSORS 0x1501  /* 1,2,or 4 Sub-Processors supported */
-#define OV5_XIVE_EXPLOIT   0x1701  /* XIVE exploitation supported */
+#define OV5_XIVE_SUPPORT   0x17C0  /* XIVE Exploitation Support Mask */
+#define OV5_XIVE_EXPLOIT   0x1740  /* XIVE exploitation mode */
 /* MMU Base Architecture */
 #define OV5_MMU_SUPPORT0x18C0  /* MMU Mode Support Mask */
 #define OV5_MMU_HASH   0x1800  /* Hash MMU Only */
-- 
2.7.5



[PATCH 07/10] powerpc/xive: add XIVE exploitation mode to CAS

2017-08-08 Thread Cédric Le Goater
On POWER9, the Client Architecture Support (CAS) negotiation process
determines whether the guest operates in XIVE Legacy compatibility or
in XIVE exploitation mode.

Now that we have initial guest support for the XIVE interrupt
controller, let's inform the hypervisor what we can do.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/kernel/prom_init.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 613f79f03877..25c14f543bd7 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -177,6 +177,7 @@ struct platform_support {
bool hash_mmu;
bool radix_mmu;
bool radix_gtse;
+   bool xive;
 };
 
 /* Platforms codes are now obsolete in the kernel. Now only used within this
@@ -1054,6 +1055,12 @@ static void __init prom_parse_platform_support(u8 index, u8 val,
support->radix_gtse = true;
}
break;
+   case OV5_INDX(OV5_XIVE_SUPPORT): /* XIVE Exploitation mode */
+   if (val & OV5_FEAT(OV5_XIVE_SUPPORT)) {
+   prom_debug("XIVE - exploitation mode\n");
+   support->xive = true;
+   }
+   break;
}
 }
 
@@ -1062,7 +1069,8 @@ static void __init prom_check_platform_support(void)
struct platform_support supported = {
.hash_mmu = false,
.radix_mmu = false,
-   .radix_gtse = false
+   .radix_gtse = false,
+   .xive = false
};
int prop_len = prom_getproplen(prom.chosen,
   "ibm,arch-vec-5-platform-support");
@@ -1095,6 +1103,11 @@ static void __init prom_check_platform_support(void)
/* We're probably on a legacy hypervisor */
prom_debug("Assuming legacy hash support\n");
}
+
+   if (supported.xive) {
+   prom_debug("Asking for XIVE\n");
+   ibm_architecture_vec.vec5.intarch = OV5_FEAT(OV5_XIVE_EXPLOIT);
+   }
 }
 
 static void __init prom_send_capabilities(void)
-- 
2.7.5



Re: [v6 11/15] arm64/kasan: explicitly zero kasan shadow memory

2017-08-08 Thread Will Deacon
On Mon, Aug 07, 2017 at 04:38:45PM -0400, Pavel Tatashin wrote:
> To optimize the performance of struct page initialization,
> vmemmap_populate() will no longer zero memory.
> 
> We must explicitly zero the memory that is allocated by vmemmap_populate()
> for kasan, as this memory does not go through struct page initialization
> path.
> 
> Signed-off-by: Pavel Tatashin 
> Reviewed-by: Steven Sistare 
> Reviewed-by: Daniel Jordan 
> Reviewed-by: Bob Picco 
> ---
>  arch/arm64/mm/kasan_init.c | 42 ++
>  1 file changed, 42 insertions(+)
> 
> diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
> index 81f03959a4ab..e78a9ecbb687 100644
> --- a/arch/arm64/mm/kasan_init.c
> +++ b/arch/arm64/mm/kasan_init.c
> @@ -135,6 +135,41 @@ static void __init clear_pgds(unsigned long start,
>   set_pgd(pgd_offset_k(start), __pgd(0));
>  }
>  
> +/*
> + * Memory that was allocated by vmemmap_populate is not zeroed, so we must
> + * zero it here explicitly.
> + */
> +static void
> +zero_vmemmap_populated_memory(void)
> +{
> + struct memblock_region *reg;
> + u64 start, end;
> +
> + for_each_memblock(memory, reg) {
> + start = __phys_to_virt(reg->base);
> + end = __phys_to_virt(reg->base + reg->size);
> +
> + if (start >= end)
> + break;
> +
> + start = (u64)kasan_mem_to_shadow((void *)start);
> + end = (u64)kasan_mem_to_shadow((void *)end);
> +
> + /* Round to the start end of the mapped pages */
> + start = round_down(start, SWAPPER_BLOCK_SIZE);
> + end = round_up(end, SWAPPER_BLOCK_SIZE);
> + memset((void *)start, 0, end - start);
> + }
> +
> + start = (u64)kasan_mem_to_shadow(_text);
> + end = (u64)kasan_mem_to_shadow(_end);
> +
> + /* Round to the start end of the mapped pages */
> + start = round_down(start, SWAPPER_BLOCK_SIZE);
> + end = round_up(end, SWAPPER_BLOCK_SIZE);
> + memset((void *)start, 0, end - start);
> +}

I can't help but think this would be an awful lot nicer if you made
vmemmap_alloc_block take extra GFP flags as a parameter. That way, we could
implement a version of vmemmap_populate that does the zeroing when we need
it, without having to duplicate a bunch of the code like this. I think it
would also be less error-prone, because you wouldn't have to do the
allocation and the zeroing in two separate steps.
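
A user-space model of that direction, purely for illustration (the real
change would be to vmemmap_alloc_block()'s kernel signature, which is
not shown here):

#include <stdlib.h>
#include <string.h>

/* Let the allocator take a "need zeroed memory" flag so callers do not
 * have to follow up with a separate memset(). */
static void *alloc_block(size_t size, int want_zero)
{
	void *p = malloc(size);

	if (p && want_zero)
		memset(p, 0, size);
	return p;
}

int main(void)
{
	void *shadow = alloc_block(4096, 1);	/* kasan shadow: zeroed */
	void *pages  = alloc_block(4096, 0);	/* struct pages: left as-is */

	free(shadow);
	free(pages);
	return 0;
}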

Will


[PATCH 02/10] powerpc/xive: guest exploitation of the XIVE interrupt controller

2017-08-08 Thread Cédric Le Goater
This is the framework for using XIVE in a PowerVM guest. The support
is very similar to the native one in a much simpler form.

Instead of OPAL calls, a set of hypervisor calls is used to configure
the interrupt sources and the event/notification queues of the guest:

 - H_INT_GET_SOURCE_INFO

   used to obtain the address of the MMIO page of the Event State
   Buffer (PQ bits) entry associated with the source.

 - H_INT_SET_SOURCE_CONFIG

   assigns a source to a "target".

 - H_INT_GET_SOURCE_CONFIG

   determines to which "target" and "priority" is assigned to a source

 - H_INT_GET_QUEUE_INFO

   returns the address of the notification management page associated
   with the specified "target" and "priority".

 - H_INT_SET_QUEUE_CONFIG

   sets or resets the event queue for a given "target" and "priority".
   It is also used to set the notification config associated with the
   queue, only unconditional notification for the moment.  Reset is
   performed with a queue size of 0 and queueing is disabled in that
   case.

 - H_INT_GET_QUEUE_CONFIG

   returns the queue settings for a given "target" and "priority".

 - H_INT_RESET

   resets all of the partition's interrupt exploitation structures to
   their initial state, losing all configuration set via the hcalls
   H_INT_SET_SOURCE_CONFIG and H_INT_SET_QUEUE_CONFIG.

 - H_INT_SYNC

   issues a synchronisation on a source to make sure all
   notifications have reached their queue.

As for XICS, the XIVE interface for the guest is described in the
device tree under the interrupt controller node. A couple of new
properties are specific to XIVE :

 - "reg"

   contains the base address and size of the thread interrupt
   management areas (TIMA) for the user level and for the OS level. Only
   the OS level is taken into account.

 - "ibm,xive-eq-sizes"

   the size of the event queues.

 - "ibm,xive-lisn-ranges"

   the interrupt numbers ranges assigned to the guest. These are
   allocated using a simple bitmap.

Tested with a QEMU XIVE model for pseries and with the Power
hypervisor.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/hvcall.h|  13 +-
 arch/powerpc/include/asm/xive.h  |   2 +
 arch/powerpc/platforms/pseries/Kconfig   |   1 +
 arch/powerpc/platforms/pseries/hotplug-cpu.c |  10 +-
 arch/powerpc/platforms/pseries/kexec.c   |   6 +-
 arch/powerpc/platforms/pseries/setup.c   |   8 +-
 arch/powerpc/platforms/pseries/smp.c |  32 +-
 arch/powerpc/sysdev/xive/Kconfig |   5 +
 arch/powerpc/sysdev/xive/Makefile|   1 +
 arch/powerpc/sysdev/xive/spapr.c | 554 +++
 10 files changed, 619 insertions(+), 13 deletions(-)
 create mode 100644 arch/powerpc/sysdev/xive/spapr.c

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 57d38b504ff7..3d34dc0869f6 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -280,7 +280,18 @@
 #define H_RESIZE_HPT_COMMIT0x370
 #define H_REGISTER_PROC_TBL0x37C
 #define H_SIGNAL_SYS_RESET 0x380
-#define MAX_HCALL_OPCODE   H_SIGNAL_SYS_RESET
+#define H_INT_GET_SOURCE_INFO   0x3A8
+#define H_INT_SET_SOURCE_CONFIG 0x3AC
+#define H_INT_GET_SOURCE_CONFIG 0x3B0
+#define H_INT_GET_QUEUE_INFO0x3B4
+#define H_INT_SET_QUEUE_CONFIG  0x3B8
+#define H_INT_GET_QUEUE_CONFIG  0x3BC
+#define H_INT_SET_OS_REPORTING_LINE 0x3C0
+#define H_INT_GET_OS_REPORTING_LINE 0x3C4
+#define H_INT_ESB   0x3C8
+#define H_INT_SYNC  0x3CC
+#define H_INT_RESET 0x3D0
+#define MAX_HCALL_OPCODE   H_INT_RESET
 
 /* H_VIOCTL functions */
 #define H_GET_VIOA_DUMP_SIZE   0x01
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index c23ff4389ca2..1deb10032d61 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -110,6 +110,7 @@ extern bool __xive_enabled;
 
 static inline bool xive_enabled(void) { return __xive_enabled; }
 
+extern bool xive_spapr_init(void);
 extern bool xive_native_init(void);
 extern void xive_smp_probe(void);
 extern int  xive_smp_prepare_cpu(unsigned int cpu);
@@ -147,6 +148,7 @@ extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id)
 
 static inline bool xive_enabled(void) { return false; }
 
+static inline bool xive_spapr_init(void) { return false; }
 static inline bool xive_native_init(void) { return false; }
 static inline void xive_smp_probe(void) { }
 extern inline int  xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; }
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 3a6dfd14f64b..71dd69d9ec64 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -7,6 +7,7 @@ config PPC_PSERIES
select PCI
select PCI_MSI
select PPC_XICS
+   select PPC_XIVE_SPAPR
select PPC_ICP_NATIVE

[PATCH 03/10] powerpc/xive: rename xive_poke_esb in xive_esb_read

2017-08-08 Thread Cédric Le Goater
xive_poke_esb() is performing a load/read so it is better named as
xive_esb_read(). Also introduce a XIVE_ESB_LOAD_EOI read when EOI'ing
LSI interrupts.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 6595462b1fc8..e6b245bb9602 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -190,7 +190,7 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
  * This is used to perform the magic loads from an ESB
  * described in xive.h
  */
-static u8 xive_poke_esb(struct xive_irq_data *xd, u32 offset)
+static u8 xive_peek_esb(struct xive_irq_data *xd, u32 offset)
 {
u64 val;
 
@@ -227,7 +227,7 @@ void xmon_xive_do_dump(int cpu)
xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]);
 #ifdef CONFIG_SMP
{
-   u64 val = xive_poke_esb(&xc->ipi_data, XIVE_ESB_GET);
+   u64 val = xive_peek_esb(&xc->ipi_data, XIVE_ESB_GET);
xmon_printf("  IPI state: %x:%c%c\n", xc->hw_ipi,
val & XIVE_ESB_VAL_P ? 'P' : 'p',
val & XIVE_ESB_VAL_P ? 'Q' : 'q');
@@ -326,9 +326,9 @@ void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
 * properly.
 */
if (xd->flags & XIVE_IRQ_FLAG_LSI)
-   in_be64(xd->eoi_mmio);
+   xive_peek_esb(xd, XIVE_ESB_LOAD_EOI);
else {
-   eoi_val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_00);
+   eoi_val = xive_peek_esb(xd, XIVE_ESB_SET_PQ_00);
DBG_VERBOSE("eoi_val=%x\n", offset, eoi_val);
 
/* Re-trigger if needed */
@@ -383,12 +383,12 @@ static void xive_do_source_set_mask(struct xive_irq_data *xd,
 * ESB accordingly on unmask.
 */
if (mask) {
-   val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_01);
+   val = xive_peek_esb(xd, XIVE_ESB_SET_PQ_01);
xd->saved_p = !!(val & XIVE_ESB_VAL_P);
} else if (xd->saved_p)
-   xive_poke_esb(xd, XIVE_ESB_SET_PQ_10);
+   xive_peek_esb(xd, XIVE_ESB_SET_PQ_10);
else
-   xive_poke_esb(xd, XIVE_ESB_SET_PQ_00);
+   xive_peek_esb(xd, XIVE_ESB_SET_PQ_00);
 }
 
 /*
@@ -768,7 +768,7 @@ static int xive_irq_retrigger(struct irq_data *d)
 * To perform a retrigger, we first set the PQ bits to
 * 11, then perform an EOI.
 */
-   xive_poke_esb(xd, XIVE_ESB_SET_PQ_11);
+   xive_peek_esb(xd, XIVE_ESB_SET_PQ_11);
 
/*
 * Note: We pass "0" to the hw_irq argument in order to
@@ -803,7 +803,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
irqd_set_forwarded_to_vcpu(d);
 
/* Set it to PQ=10 state to prevent further sends */
-   pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_10);
+   pq = xive_peek_esb(xd, XIVE_ESB_SET_PQ_10);
 
/* No target ? nothing to do */
if (xd->target == XIVE_INVALID_TARGET) {
@@ -832,7 +832,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
 * for sure the queue slot is no longer in use.
 */
if (pq & 2) {
-   pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_11);
+   pq = xive_peek_esb(xd, XIVE_ESB_SET_PQ_11);
xd->saved_p = true;
 
/*
-- 
2.7.5



[PATCH 05/10] powerpc/xive: add the HW IRQ number under xive_irq_data

2017-08-08 Thread Cédric Le Goater
It will be required later by the H_INT_ESB hcall.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/xive.h   | 1 +
 arch/powerpc/sysdev/xive/native.c | 2 ++
 arch/powerpc/sysdev/xive/spapr.c  | 2 ++
 3 files changed, 5 insertions(+)

diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index 1deb10032d61..6d097a18d3ae 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -45,6 +45,7 @@ struct xive_irq_data {
void __iomem *trig_mmio;
u32 esb_shift;
int src_chip;
+   u32 hw_irq;
 
/* Setup/used by frontend */
int target;
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
index 0f95476b01f6..3417fb0ce1ff 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -82,6 +82,8 @@ int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
return -ENOMEM;
}
 
+   data->hw_irq = hw_irq;
+
if (!data->trig_page)
return 0;
if (data->trig_page == data->eoi_page) {
diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c
index a3668815d5c1..9a0fd9f7e38a 100644
--- a/arch/powerpc/sysdev/xive/spapr.c
+++ b/arch/powerpc/sysdev/xive/spapr.c
@@ -264,6 +264,8 @@ static int xive_spapr_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
return -ENOMEM;
}
 
+   data->hw_irq = hw_irq;
+
/* Full function page supports trigger */
if (flags & XIVE_SRC_TRIGGER) {
data->trig_mmio = data->eoi_mmio;
-- 
2.7.5



Re: [01/24] powerpc/mm: Move exception_enter/exit to a do_page_fault wrapper

2017-08-08 Thread Michael Ellerman
Christophe LEROY  writes:

> Le 08/08/2017 à 04:16, Michael Ellerman a écrit :
>> Christophe LEROY  writes:
>> 
>>> Le 07/08/2017 à 12:41, Michael Ellerman a écrit :
 On Wed, 2017-07-19 at 04:49:23 UTC, Benjamin Herrenschmidt wrote:
> This will allow simplifying the returns from do_page_fault
>
> Signed-off-by: Benjamin Herrenschmidt 

 Series applied to powerpc next, thanks.

 https://git.kernel.org/powerpc/c/7afad422ac61067473d5f3d20bbd54
>>>
>>> Boot failure on the 8xx:
>>>
>>> [6.029556] Failed to execute /init (error -14)
>>> [6.034623] Starting init: /bin/sh exists but couldn't execute it
>>> (error -14)
>>> [6.041489] Kernel panic - not syncing: No working init found.  Try
>>> passing init= option to kernel. See Linux
>>> Documentation/admin-guide/init.rst for guidance.
>>> [6.055518] CPU: 0 PID: 1 Comm: init Not tainted
>>> 4.13.0-rc3-s3k-dev-00143-g7aa62e972a56 #56
>>> [6.063745] Call Trace:
>>> [6.066224] [c60f1ed0] [c001a624] panic+0x108/0x250 (unreliable)
>>> [6.072140] [c60f1f30] [c0002640] rootfs_mount+0x0/0x58
>>> [6.077311] [c60f1f40] [c000cb80] ret_from_kernel_thread+0x5c/0x64
>>> [6.083405] Rebooting in 180 seconds..
>>>
>>>
>>> Bisected to c433ec0455f921eaf8dd0262a718ce6f8ad62ea2 ("powerpc/mm:
>>> Pre-filter SRR1 bits before do_page_fault()")
>> 
>> Sorry about that. I don't have a way to test 8xx.
>
> Looks like I concluded too quickly yesterday night, indeed the above 
> commit is the last good one. The faulty one is 
> d300627c6a53693fb01479b59b0cdd293761b1fa("powerpc/6xx: Handle DABR match 
> before calling do_page_fault")
>
>> 
>>> Will see tomorrow what's wrong with that.
>> 
>> Thanks.
>
> This is because the 'bl do_page_fault' has been enclosed inside the 
> #ifdef CONFIG_6xx
>
> See patch at https://patchwork.ozlabs.org/patch/799039/

Oops :}

Thanks for the fix.

cheers


RE: [PATCH v10 4/4] irqchip/qeic: remove PPCisms for QEIC

2017-08-08 Thread Michael Ellerman
Qiang Zhao  writes:

> On Mon 8/7/2017 3:02 PM, Michael Ellerman  wrote:
>
>> -Original Message-
>> From: Michael Ellerman [mailto:m...@ellerman.id.au]
>> Sent: Monday, August 07, 2017 3:02 PM
>> To: Qiang Zhao ; t...@linutronix.de
>> Cc: o...@buserror.net; Qiang Zhao ; linuxppc-
>> d...@lists.ozlabs.org; Xiaobo Xie ; linux-
>> ker...@vger.kernel.org
>> Subject: Re: [PATCH v10 4/4] irqchip/qeic: remove PPCisms for QEIC
>> 
>> Zhao Qiang  writes:
>> 
>> > QEIC was supported on PowerPC, and dependent on PPC, Now it is
>> > supported on other platforms, so remove PPCisms.
>> >
>> > Signed-off-by: Zhao Qiang 
>> > ---
>> >  arch/powerpc/platforms/83xx/km83xx.c  |   1 -
>> >  arch/powerpc/platforms/83xx/misc.c|   1 -
>> >  arch/powerpc/platforms/83xx/mpc832x_mds.c |   1 -
>> >  arch/powerpc/platforms/83xx/mpc832x_rdb.c |   1 -
>> >  arch/powerpc/platforms/83xx/mpc836x_mds.c |   1 -
>> >  arch/powerpc/platforms/83xx/mpc836x_rdk.c |   1 -
>> >  arch/powerpc/platforms/85xx/corenet_generic.c |   1 -
>> >  arch/powerpc/platforms/85xx/mpc85xx_mds.c |   1 -
>> >  arch/powerpc/platforms/85xx/mpc85xx_rdb.c |   1 -
>> >  arch/powerpc/platforms/85xx/twr_p102x.c   |   1 -
>> >  drivers/irqchip/irq-qeic.c| 188 
>> > +++---
>> >  include/soc/fsl/qe/qe_ic.h| 132 --
>> >  12 files changed, 80 insertions(+), 250 deletions(-)  delete mode
>> > 100644 include/soc/fsl/qe/qe_ic.h
>> >
>> > diff --git a/arch/powerpc/platforms/83xx/km83xx.c
>> > b/arch/powerpc/platforms/83xx/km83xx.c
>> > index d8642a4..b1cef0a 100644
>> > --- a/arch/powerpc/platforms/83xx/km83xx.c
>> > +++ b/arch/powerpc/platforms/83xx/km83xx.c
>> > @@ -38,7 +38,6 @@
>> >  #include 
>> >  #include 
>> >  #include 
>> > -#include 
>> 
>> You deleted that file in patch 2. So didn't you just break the build for the 
>> last two
>> commits?
>
> Sorry, I am not sure what you said. Could you explain?

Don't worry about it. I was confused by the fact that we have both:

  drivers/soc/fsl/qe/qe_ic.h

and:

  include/soc/fsl/qe/qe_ic.h

cheers


Re: [PATCH 3/3] powerpc: replace vga_fixup() with generic code

2017-08-08 Thread Michael Ellerman
Daniel Axtens  writes:

> Michael Ellerman  writes:
>
>> Daniel Axtens  writes:
>>
>>> Currently, we do a PCI fixup to mark a default card so that Xorg
>>> autoconfiguration works.
>>>
>>> There is a new generic method to do this sort of vga fixup, and
>>> it occurs by default.
>>>
>>> Drop our old method.
>>>
>>> This method is different:
>>>  - it will only mark a card as default if a driver is bound
>>>  - the marking will happen at late_initcall time, or even later
>>>if a card is enabled later on (via an ENABLE hook). Currently
>>>things are enabled in a FINAL hook.
>>>
>>> This *does* change behaviour under some circumstances.
>>>
>>> For example, pseries_le_defconfig doesn't have DRM drivers for
>>> many of the qemu GPU models, including the 'standard' vga.
>>
>> Should we enable them/it?
>
> Hard to say.
>
> The 'standard' vga module (bochs_drm) was blacklisted by Ubuntu -
> apparently at IBM's request [0] - some years back. Even if you
> un-blacklist it, I had trouble with getting it and the openfirmware
> framebuffer driver to play nicely together. It may not be worth the
> trouble for bochs_drm.
>
> There's a better case for including some of the more modern drivers -
> maybe QXL and virtio - but I wasn't able to test them: my particular
> build of qemu/TCG refused to start with them and I didn't feel like
> rebuilding/debugging qemu.

Yeah OK. Sounds like a bit of mess :)

I'll leave it unless someone who knows Qemu/Gfx etc. tells me otherwise.

> It would also be legitimate to say that you're focussing on headless use
> with pseries_*defconfig and not include them: you need to bring in the
> DRM core if you want these drivers.

True. There's a bit of a tension there between making them useful
configs for developers and also turning on as much code as possible so
it gets tested.

Arguably we should have DRM enabled because the distros will.

cheers


Re: [PATCH 0/3] Minor updates for PS3

2017-08-08 Thread Michael Ellerman
Geoff Levand  writes:

> Hi Michael,
>
> A few very minor updates for PS3.  Please apply.

Jens do you want to take the block ones, or should I just take the lot?

cheers


Re: [PATCH 10/13] powerpc/64s: idle simplify KVM idle on POWER9

2017-08-08 Thread Gautham R Shenoy
Hi Nicholas,

On Sun, Aug 06, 2017 at 03:02:38AM +1000, Nicholas Piggin wrote:
> POWER9 CPUs have independent MMU contexts per thread so KVM
> does not have to bring sibling threads into real-mode when
> switching MMU mode to guest. This can simplify POWER9 sleep/wake
> paths and avoids hwsyncs.
> 
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/include/asm/kvm_book3s_asm.h |  4 
>  arch/powerpc/kernel/idle_book3s.S |  8 ++-
>  arch/powerpc/kvm/book3s_hv.c  | 37 
> ++-
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  8 +++
>  4 files changed, 46 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h 
> b/arch/powerpc/include/asm/kvm_book3s_asm.h
> index 7cea76f11c26..83596f32f50b 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> @@ -104,6 +104,10 @@ struct kvmppc_host_state {
>   u8 napping;
> 
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> + /*
> +  * hwthread_req/hwthread_state pair is used to pull sibling threads
> +  * out of guest on pre-ISAv3.0B CPUs where threads share MMU.
> +  */
>   u8 hwthread_req;
>   u8 hwthread_state;
>   u8 host_ipi;
> diff --git a/arch/powerpc/kernel/idle_book3s.S 
> b/arch/powerpc/kernel/idle_book3s.S
> index e6252c5a57a4..3ab73f9223e4 100644
> --- a/arch/powerpc/kernel/idle_book3s.S
> +++ b/arch/powerpc/kernel/idle_book3s.S
> @@ -243,12 +243,6 @@ enter_winkle:
>   * r3 - PSSCR value corresponding to the requested stop state.
>   */
>  power_enter_stop:
> -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> - /* Tell KVM we're entering idle */
> - li  r4,KVM_HWTHREAD_IN_IDLE
> - /* DO THIS IN REAL MODE!  See comment above. */
> - stb r4,HSTATE_HWTHREAD_STATE(r13)
> -#endif
>  /*
>   * Check if we are executing the lite variant with ESL=EC=0
>   */
> @@ -435,6 +429,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
>   mr  r3,r12
> 
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +BEGIN_FTR_SECTION
>   li  r0,KVM_HWTHREAD_IN_KERNEL
>   stb r0,HSTATE_HWTHREAD_STATE(r13)
>   /* Order setting hwthread_state vs. testing hwthread_req */
> @@ -444,6 +439,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
>   beq 1f
>   b   kvm_start_guest
>  1:
> +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)

This would be 7 nops on power9. Should we move this to a different
function and do a bl to that?


>  #endif
> 
>   /* Return SRR1 from power7_nap() */
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 359c79cdf0cc..bb1ab14f963a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -2111,6 +2111,16 @@ static int kvmppc_grab_hwthread(int cpu)
>   struct paca_struct *tpaca;
>   long timeout = 1;
> 
> + /*
> +  * ISA v3.0 idle routines do not set hwthread_state or test
> +  * hwthread_req, so they can not grab idle threads.
> +  */
> + if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> + WARN_ON(1);
> + pr_err("KVM: can not control sibling threads\n");
> + return -EBUSY;
> + }
> +
>   tpaca = &paca[cpu];
> 
>   /* Ensure the thread won't go into the kernel if it wakes */
> @@ -2145,12 +2155,26 @@ static void kvmppc_release_hwthread(int cpu)
>   struct paca_struct *tpaca;
> 
>   tpaca = &paca[cpu];
> - tpaca->kvm_hstate.hwthread_req = 0;
>   tpaca->kvm_hstate.kvm_vcpu = NULL;
>   tpaca->kvm_hstate.kvm_vcore = NULL;
>   tpaca->kvm_hstate.kvm_split_mode = NULL;
>  }
> 
> +static void kvmppc_release_hwthread_secondary(int cpu)
> +{
> + struct paca_struct *tpaca;
> +
> + if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> + WARN_ON(1);
> + return;
> + }
> +
> + tpaca = &paca[cpu];
> + tpaca->kvm_hstate.hwthread_req = 0;
> + kvmppc_release_hwthread(cpu);
> +}
> +
> +

Extra blank line not needed.

>  static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
>  {
>   int i;
> @@ -2274,7 +2298,7 @@ static int on_primary_thread(void)
>   if (kvmppc_grab_hwthread(cpu + thr)) {
>   /* Couldn't grab one; let the others go */
>   do {
> - kvmppc_release_hwthread(cpu + thr);
> + kvmppc_release_hwthread_secondary(cpu + thr);
>   } while (--thr > 0);
>   return 0;
>   }
> @@ -2702,8 +2726,9 @@ static noinline void kvmppc_run_core(struct 
> kvmppc_vcore *vc)
>   kvmppc_vcore_preempt(pvc);
>   spin_unlock(&pvc->lock);
>   }
> - for (i = 0; i < controlled_threads; ++i)
> - kvmppc_release_hwthread(pcpu + i);
> + for (i = 1; i < controlled_threads; ++i)
> + kvmppc_release_hwthread_secondary(pcpu +

Re: [PATCH 11/13] powerpc/64s: idle POWER9 can execute stop without ptesync

2017-08-08 Thread Gautham R Shenoy
On Sun, Aug 06, 2017 at 03:02:39AM +1000, Nicholas Piggin wrote:
> Signed-off-by: Nicholas Piggin 

Reviewed-by: Gautham R. Shenoy 

--
Thanks and Regards
gautham.



Re: [PATCH] powerpc: xive: ensure active irqd when setting affinity

2017-08-08 Thread Michael Ellerman
Sukadev Bhattiprolu  writes:

> From fd0abf5c61b6041fdb75296e8580b86dc91d08d6 Mon Sep 17 00:00:00 2001
> From: Benjamin Herrenschmidt 
> Date: Tue, 1 Aug 2017 20:54:41 -0500
> Subject: [PATCH] powerpc: xive: ensure active irqd when setting affinity
>
> Ensure irqd is active before attempting to set affinity. This should
> make the set affinity code more robust. For instance, this prevents
> these messages seen on a 4.12 based kernel when taking cpus offline:
>
>[  123.053037264,3] XIVE[ IC 00  ] ISN 2 lead to invalid IVE !
>[   77.885859] xive: Error -6 reconfiguring irq 17
>[   77.885862] IRQ17: set affinity failed(-6).
>
> The underlying problem with taking cpus offline was fixed in 4.13-rc1 by:
>
>commit 91f26cb4cd3c ("genirq/cpuhotplug: Do not migrated shutdown irqs")

So do we still need this? Or is the above only a partial fix?

I'm a bit confused.

cheers


Re: [PATCH 12/13] powerpc/64s: idle POWER9 can execute stop in virtual mode

2017-08-08 Thread Gautham R Shenoy
On Sun, Aug 06, 2017 at 03:02:40AM +1000, Nicholas Piggin wrote:
> The hardware can execute stop in any context, and KVM does not
> require real mode. This saves a switch to real-mode when going
> idle.
> 
> Signed-off-by: Nicholas Piggin 

Acked-by: Gautham R. Shenoy 

> ---
>  arch/powerpc/kernel/idle_book3s.S | 9 +
>  1 file changed, 9 insertions(+)
> 
> diff --git a/arch/powerpc/kernel/idle_book3s.S 
> b/arch/powerpc/kernel/idle_book3s.S
> index 75746111e2c4..8ac366a51bb5 100644
> --- a/arch/powerpc/kernel/idle_book3s.S
> +++ b/arch/powerpc/kernel/idle_book3s.S
> @@ -141,7 +141,16 @@ pnv_powersave_common:
>   std r5,_CCR(r1)
>   std r1,PACAR1(r13)
> 
> +BEGIN_FTR_SECTION
> + /*
> +  * POWER9 does not require real mode to stop, and does not set
> +  * hwthread_state for KVM (threads don't share MMU context), so
> +  * we can remain in virtual mode for this.
> +  */
> + bctr
> +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
>   /*
> +  * POWER8
>* Go to real mode to do the nap, as required by the architecture.
>* Also, we need to be in real mode before setting hwthread_state,
>* because as soon as we do that, another thread can switch
> -- 
> 2.11.0
> 



Re: [PATCH] powerpc/xmon: Exclude all of xmon/ from ftrace

2017-08-08 Thread Michael Ellerman
"Naveen N. Rao"  writes:

> diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile
> index 0b2f771593eb..5f95af64cb8f 100644
> --- a/arch/powerpc/xmon/Makefile
> +++ b/arch/powerpc/xmon/Makefile
> @@ -7,6 +7,19 @@ UBSAN_SANITIZE := n
>  
>  ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
>  
> +ifdef CONFIG_FUNCTION_TRACER
> +CFLAGS_REMOVE_xmon.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
> +CFLAGS_REMOVE_nonstdio.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
> +ifdef CONFIG_XMON_DISASSEMBLY
> +CFLAGS_REMOVE_ppc-dis.o  = -mno-sched-epilog $(CC_FLAGS_FTRACE)
> +CFLAGS_REMOVE_ppc-opc.o  = -mno-sched-epilog $(CC_FLAGS_FTRACE)
> +ifdef CONFIG_SPU_BASE
> +CFLAGS_REMOVE_spu-dis.o  = -mno-sched-epilog $(CC_FLAGS_FTRACE)
> +CFLAGS_REMOVE_spu-opc.o  = -mno-sched-epilog $(CC_FLAGS_FTRACE)
> +endif
> +endif
> +endif

Urk.

We want to disable it for everything in the directory, so can you do
something like:

  ORIG_CFLAGS := $(KBUILD_CFLAGS)
  KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))

cheers


Re: [PATCH 13/13] powerpc/64s: idle ESL=0 stop can avoid all save/restore overhead

2017-08-08 Thread Gautham R Shenoy
On Sun, Aug 06, 2017 at 03:02:41AM +1000, Nicholas Piggin wrote:
> When stop is executed with EC=ESL=0, it appears to execute like a
> normal instruction (resuming from NIP when woken by interrupt).
> So all the save/restore handling can be avoided completely. In
> particular NV GPRs do not have to be saved, and MSR does not have
> to be switched back to kernel MSR.
> 
> So move the test for "lite" sleep states out to power9_idle_stop.

Nice optimization!

Reviewed-by: Gautham R. Shenoy 

> 
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/kernel/idle_book3s.S | 38 +-
>  1 file changed, 13 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/idle_book3s.S 
> b/arch/powerpc/kernel/idle_book3s.S
> index 8ac366a51bb5..9eb47c99dc39 100644
> --- a/arch/powerpc/kernel/idle_book3s.S
> +++ b/arch/powerpc/kernel/idle_book3s.S
> @@ -251,31 +251,8 @@ enter_winkle:
>  /*
>   * r3 - PSSCR value corresponding to the requested stop state.
>   */
> -power_enter_stop:
> -/*
> - * Check if we are executing the lite variant with ESL=EC=0
> - */
> - andis.   r4,r3,PSSCR_EC_ESL_MASK_SHIFTED
> +power_enter_stop_esl:
>   clrldi   r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */
> - bne  .Lhandle_esl_ec_set
> - PPC_STOP
> - li  r3,0  /* Since we didn't lose state, return 0 */
> -
> - /*
> -  * pnv_wakeup_noloss() expects r12 to contain the SRR1 value so
> -  * it can determine if the wakeup reason is an HMI in
> -  * CHECK_HMI_INTERRUPT.
> -  *
> -  * However, when we wakeup with ESL=0, SRR1 will not contain the wakeup
> -  * reason, so there is no point setting r12 to SRR1.
> -  *
> -  * Further, we clear r12 here, so that we don't accidentally enter the
> -  * HMI in pnv_wakeup_noloss() if the value of r12[42:45] == WAKE_HMI.
> -  */
> - li  r12, 0
> - b   pnv_wakeup_noloss
> -
> -.Lhandle_esl_ec_set:
>   /*
>* POWER9 DD2 can incorrectly set PMAO when waking up after a
>* state-loss idle. Saving and restoring MMCR0 over idle is a
> @@ -348,9 +325,20 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); 
> \
>   * r3 contains desired PSSCR register value.
>   */
>  _GLOBAL(power9_idle_stop)
> + /*
> +  * Check if we are executing the lite variant with ESL=EC=0
> +  * This case resumes execution after the stop instruction without
> +  * losing any state, so nothing has to be saved.
> +  */
> + andis.  r4,r3,PSSCR_EC_ESL_MASK_SHIFTED
> + bne 1f
> + PPC_STOP
> + li  r3,0  /* Since we didn't lose state, return 0 */
> + blr
> +1:   /* state-loss idle */
>   std r3, PACA_REQ_PSSCR(r13)
>   mtspr   SPRN_PSSCR,r3
> - LOAD_REG_ADDR(r4,power_enter_stop)
> + LOAD_REG_ADDR(r4,power_enter_stop_esl)
>   b   pnv_powersave_common
>   /* No return */
> 
> -- 
> 2.11.0
> 



Re: [1/3] powerpc/mm/book3s64: Make KERN_IO_START a variable

2017-08-08 Thread Michael Ellerman
On Tue, 2017-08-01 at 10:29:22 UTC, Michael Ellerman wrote:
> Currently KERN_IO_START is defined as:
> 
>  #define KERN_IO_START  (KERN_VIRT_START + (KERN_VIRT_SIZE >> 1))
> 
> Although it looks like a constant, both the components are actually
> variables, to allow us to have a different value between Radix and
> Hash with a single kernel.
> 
> However that still requires both Radix and Hash to place the kernel IO
> region at the same location relative to the start and end of the
> kernel virtual region (namely 1/2 way through it), and we'd like to
> change that.
> 
> So split KERN_IO_START out into its own variable, and initialise it
> for Radix and Hash. In the medium term we should be able to
> reconsolidate this, by doing a more involved rearrangement of the
> location of the regions.
> 
> Signed-off-by: Michael Ellerman 
> Reviewed-by: Aneesh Kumar K.V 
> Acked-by: Balbir Singh 

Series applied to powerpc next.

https://git.kernel.org/powerpc/c/63ee9b2ff9d306efaa61b04b8710fa

cheers
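
For reference, a minimal sketch of the shape described above (not the
applied patch verbatim; the variable and helper names are assumptions):

  /* KERN_IO_START becomes a variable that each MMU flavour initialises */
  unsigned long kern_io_start __read_mostly;     /* assumed symbol name */
  #define KERN_IO_START kern_io_start

  /* hash could keep the old placement, half way through the region */
  static void __init hash__init_io_start(void)   /* hypothetical helper */
  {
          kern_io_start = KERN_VIRT_START + (KERN_VIRT_SIZE >> 1);
  }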


Re: [v3] powerpc/powernv: Use darn instr for random_seed on p9

2017-08-08 Thread Michael Ellerman
On Fri, 2017-08-04 at 01:12:18 UTC, Matt Brown wrote:
> This adds the powernv_get_random_darn function which utilises the darn
> instruction, introduced in POWER9. The powernv_get_random_darn function
> is used as the ppc_md.get_random_seed on P9.
> 
> The DARN instruction can potentially throw an error, so we attempt to
> register the powernv_get_random_darn function up to 10 times before
> failing.
> 
> Signed-off-by: Matt Brown 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/e66ca3db5917f4bcad039d3a3df9f1

cheers
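
For illustration, a rough sketch of the retry-before-registering idea
described in the commit message above; darn_raw() is a hypothetical
helper standing in for the inline-assembly read, and DARN_ERR is the
all-ones value the instruction returns on failure:

  #define DARN_ERR (~0UL)

  static __init int pnv_rng_register_darn(void)  /* hypothetical name */
  {
          int i;

          /* try the instruction a few times before trusting it */
          for (i = 0; i < 10; i++) {
                  if (darn_raw() != DARN_ERR) {
                          ppc_md.get_random_seed = powernv_get_random_darn;
                          return 0;
                  }
          }
          return -EIO;
  }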


Re: [v4] powerpc/powernv: Enable PCI peer-to-peer

2017-08-08 Thread Michael Ellerman
On Fri, 2017-08-04 at 09:55:14 UTC, Frederic Barrat wrote:
> P9 has support for PCI peer-to-peer, enabling a device to write in the
> mmio space of another device directly, without interrupting the CPU.
> 
> This patch adds support for it on powernv, by adding a new API to be
> called by drivers. The pnv_pci_set_p2p(...) call configures an
> 'initiator', i.e. the device which will issue the mmio operation, and a
> 'target', i.e. the device on the receiving side.
> 
> P9 really only supports mmio stores for the time being, but that's
> expected to change in the future, so the API allows both load and
> store operations to be defined.
> 
> /* PCI p2p descriptor */
> #define OPAL_PCI_P2P_ENABLE   0x1
> #define OPAL_PCI_P2P_LOAD 0x2
> #define OPAL_PCI_P2P_STORE0x4
> 
> int pnv_pci_set_p2p(struct pci_dev *initiator, struct pci_dev *target,
>  uint64_t desc)
> 
> It uses a new OPAL call, as the configuration magic is done on the
> PHBs by skiboot.
> 
> Signed-off-by: Frederic Barrat 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/2552910084a5e12e280caf082ab014

cheers
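
As a usage illustration of the API quoted above, a driver wanting "gpu"
to issue MMIO stores into "nic" (both device pointers and the function
name are hypothetical; only stores are supported on P9 for now, per the
commit message) could do something like:

  static int example_enable_p2p_stores(struct pci_dev *gpu,
                                       struct pci_dev *nic)
  {
          return pnv_pci_set_p2p(gpu, nic,
                                 OPAL_PCI_P2P_ENABLE | OPAL_PCI_P2P_STORE);
  }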


Re: powerpc/32: Fix boot failure on non 6xx platforms

2017-08-08 Thread Michael Ellerman
On Tue, 2017-08-08 at 06:37:24 UTC, Christophe Leroy wrote:
> commit d300627c6a536 ("powerpc/6xx: Handle DABR match before
> calling do_page_fault") breaks non 6xx platforms.
> 
> [6.029556] Failed to execute /init (error -14)
> [6.034623] Starting init: /bin/sh exists but couldn't execute it
> (error -14)
> [6.041489] Kernel panic - not syncing: No working init found.  Try
> passing init= option to kernel. See Linux
> Documentation/admin-guide/init.rst for guidance.
> [6.055518] CPU: 0 PID: 1 Comm: init Not tainted
> 4.13.0-rc3-s3k-dev-00143-g7aa62e972a56 #56
> [6.063745] Call Trace:
> [6.066224] [c60f1ed0] [c001a624] panic+0x108/0x250 (unreliable)
> [6.072140] [c60f1f30] [c0002640] rootfs_mount+0x0/0x58
> [6.077311] [c60f1f40] [c000cb80] ret_from_kernel_thread+0x5c/0x64
> [6.083405] Rebooting in 180 seconds..
> 
> This is because in handle_page_fault(), the call to do_page_fault()
> has been mistakenly enclosed inside an #ifdef CONFIG_6xx
> 
> Fixes: d300627c6a536 ("powerpc/6xx: Handle DABR match before
> calling do_page_fault")
> 
> Signed-off-by: Christophe Leroy 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/64d0a506fbdd64906f168539bee32a

cheers


[PATCH] Revert "powerpc/64: Avoid restore_math call if possible in syscall exit"

2017-08-08 Thread Michael Ellerman
This reverts commit bc4f65e4cf9d6cc43e0e9ba0b8648cf9201cd55f.

As reported by Andreas, this commit is causing unrecoverable SLB misses in the
system call exit path:

  Unrecoverable exception 4100 at c000a1ec
  Oops: Unrecoverable exception, sig: 6 [#1]
  SMP NR_CPUS=2 PowerMac
  ...
  CPU: 0 PID: 18626 Comm: rm Not tainted 4.13.0-rc3 #1
  task: c0018335e080 task.stack: c00139e5
  NIP: c000a1ec LR: c000a118 CTR: 
  REGS: c00139e53bb0 TRAP: 4100   Not tainted  (4.13.0-rc3)
  MSR: 90001030  CR: 2444  XER: 2000 SOFTE: 1
  GPR00:  c00139e53e30 c0abb500 fffe
  GPR04: c001eb866298   c0018335e080
  GPR08: 9000d032  0002 f001
  GPR12: c00139e5 c000 3fffa8c0dca0 3fffa8c0dc88
  GPR16: 1000 0001 3fffa8c0eaa0 
  GPR20: 3fffa8c27528 3fffa8c27b00  
  GPR24: 3fffa8c0d918 31b3efa0 3fffa8c26d68 
  GPR28: 3fffa8c249e8 3fffa8c263d0 3fffa8c27550 31b3ef10
  NIP [c000a1ec] system_call_exit+0xc0/0x21c
  LR [c000a118] system_call+0x58/0x6c
  Call Trace:
  [c00139e53e30] [c000a118] system_call+0x58/0x6c (unreliable)
  Instruction dump:
  64a51000 7c6300d0 f8a101a0 4b9c 3c00 6006 780007c6 6400
  6000 7c004039 4082001c e8ed0170 <88070b78> 88c70b79 7c003214 2c20

This is caused by us trying to load THREAD_LOAD_FP with MSR_RI=0, and taking an
SLB miss on the thread struct.

Reported-by: Andreas Schwab 
Diagnosed-by: Nicholas Piggin 
Signed-off-by: Michael Ellerman 
---
 arch/powerpc/kernel/entry_64.S | 60 +-
 arch/powerpc/kernel/process.c  |  4 ---
 2 files changed, 18 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 49d8422767b4..e925c1c99c71 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -223,17 +223,27 @@ system_call_exit:
andi.   
r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
bne-.Lsyscall_exit_work
 
-   /* If MSR_FP and MSR_VEC are set in user msr, then no need to restore */
-   li  r7,MSR_FP
+   andi.   r0,r8,MSR_FP
+   beq 2f
 #ifdef CONFIG_ALTIVEC
-   orisr7,r7,MSR_VEC@h
+   andis.  r0,r8,MSR_VEC@h
+   bne 3f
 #endif
-   and r0,r8,r7
-   cmpdr0,r7
-   bne .Lsyscall_restore_math
-.Lsyscall_restore_math_cont:
+2: addir3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_PPC_BOOK3S
+   li  r10,MSR_RI
+   mtmsrd  r10,1   /* Restore RI */
+#endif
+   bl  restore_math
+#ifdef CONFIG_PPC_BOOK3S
+   li  r11,0
+   mtmsrd  r11,1
+#endif
+   ld  r8,_MSR(r1)
+   ld  r3,RESULT(r1)
+   li  r11,-MAX_ERRNO
 
-   cmpld   r3,r11
+3: cmpld   r3,r11
ld  r5,_CCR(r1)
bge-.Lsyscall_error
 .Lsyscall_error_cont:
@@ -267,40 +277,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
std r5,_CCR(r1)
b   .Lsyscall_error_cont
 
-.Lsyscall_restore_math:
-   /*
-* Some initial tests from restore_math to avoid the heavyweight
-* C code entry and MSR manipulations.
-*/
-   LOAD_REG_IMMEDIATE(r0, MSR_TS_MASK)
-   and.r0,r0,r8
-   bne 1f
-
-   ld  r7,PACACURRENT(r13)
-   lbz r0,THREAD+THREAD_LOAD_FP(r7)
-#ifdef CONFIG_ALTIVEC
-   lbz r6,THREAD+THREAD_LOAD_VEC(r7)
-   add r0,r0,r6
-#endif
-   cmpdi   r0,0
-   beq .Lsyscall_restore_math_cont
-
-1: addir3,r1,STACK_FRAME_OVERHEAD
-#ifdef CONFIG_PPC_BOOK3S
-   li  r10,MSR_RI
-   mtmsrd  r10,1   /* Restore RI */
-#endif
-   bl  restore_math
-#ifdef CONFIG_PPC_BOOK3S
-   li  r11,0
-   mtmsrd  r11,1
-#endif
-   /* Restore volatiles, reload MSR from updated one */
-   ld  r8,_MSR(r1)
-   ld  r3,RESULT(r1)
-   li  r11,-MAX_ERRNO
-   b   .Lsyscall_restore_math_cont
-
 /* Traced system call support */
 .Lsyscall_dotrace:
bl  save_nvgprs
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 9f3e2c932dcc..ec480966f9bf 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -511,10 +511,6 @@ void restore_math(struct pt_regs *regs)
 {
unsigned long msr;
 
-   /*
-* Syscall exit makes a similar initial check before branching
-* to restore_math. Keep them in synch.
-*/
if (!msr_tm_active(regs->msr) &&
!current->thread.load_fp && !loadvec(current->thread))
return;
-- 
2.7.4



ksmd circular locking warning, cpu_hotplug_lock vs ksm_thread_mutex

2017-08-08 Thread Michael Ellerman
Hi all,

Apologies for the large Cc list, but I wasn't really sure who to send this to.

I've seen this once on a Power8 box, with next-20170807.

I think it happened while I was running the memory hotplug selftests.

cheers

  [ 3532.474435] ==
  [ 3532.474440] WARNING: possible circular locking dependency detected
  [ 3532.474446] 4.13.0-rc3-gcc6-next-20170807-g4751f76 #1 Not tainted
  [ 3532.474450] --
  [ 3532.474454] ksmd/1459 is trying to acquire lock:
  [ 3532.474460]  (cpu_hotplug_lock.rw_sem){++}, at: [] 
lru_add_drain_all+0x20/0x40
  [ 3532.474476] 
 but task is already holding lock:
  [ 3532.474480]  (ksm_thread_mutex){+.+...}, at: [] 
ksm_scan_thread+0xd8/0x1a30
  [ 3532.474493] 
 which lock already depends on the new lock.
  
  [ 3532.474499] 
 the existing dependency chain (in reverse order) is:
  [ 3532.474504] 
 -> #3 (ksm_thread_mutex){+.+...}:
  [ 3532.474517]__mutex_lock+0x8c/0xa80
  [ 3532.474522]ksm_memory_callback+0xa4/0x390
  [ 3532.474529]notifier_call_chain+0xa4/0x110
  [ 3532.474533]__blocking_notifier_call_chain+0x74/0xb0
  [ 3532.474540]memory_notify+0x30/0x50
  [ 3532.474544]__offline_pages.constprop.6+0x1c0/0xa50
  [ 3532.474549]memory_subsys_offline+0x68/0xf0
  [ 3532.474555]device_offline+0x104/0x140
  [ 3532.474560]store_mem_state+0x178/0x190
  [ 3532.474566]dev_attr_store+0x3c/0x60
  [ 3532.474572]sysfs_kf_write+0x9c/0xc0
  [ 3532.474576]kernfs_fop_write+0x190/0x260
  [ 3532.474582]__vfs_write+0x44/0x1a0
  [ 3532.474586]vfs_write+0xd4/0x240
  [ 3532.474591]SyS_write+0x68/0x110
  [ 3532.474597]system_call+0x58/0x6c
  [ 3532.474600] 
 -> #2 ((memory_chain).rwsem){..}:
  [ 3532.474609]down_read+0x44/0xa0
  [ 3532.474613]__blocking_notifier_call_chain+0x58/0xb0
  [ 3532.474618]memory_notify+0x30/0x50
  [ 3532.474622]__offline_pages.constprop.6+0x1c0/0xa50
  [ 3532.474627]memory_subsys_offline+0x68/0xf0
  [ 3532.474631]device_offline+0x104/0x140
  [ 3532.474636]store_mem_state+0x178/0x190
  [ 3532.474641]dev_attr_store+0x3c/0x60
  [ 3532.474645]sysfs_kf_write+0x9c/0xc0
  [ 3532.474649]kernfs_fop_write+0x190/0x260
  [ 3532.474654]__vfs_write+0x44/0x1a0
  [ 3532.474659]vfs_write+0xd4/0x240
  [ 3532.474663]SyS_write+0x68/0x110
  [ 3532.474668]system_call+0x58/0x6c
  [ 3532.474671] 
 -> #1 (mem_hotplug_lock.rw_sem){++}:
  [ 3532.474680]get_online_mems+0x4c/0xd0
  [ 3532.474685]kmem_cache_create+0x6c/0x2a0
  [ 3532.474691]ptlock_cache_init+0x38/0x54
  [ 3532.474696]start_kernel+0x2ac/0x558
  [ 3532.474700]start_here_common+0x1c/0x4ac
  [ 3532.474704] 
 -> #0 (cpu_hotplug_lock.rw_sem){++}:
  [ 3532.474713]lock_acquire+0xec/0x2e0
  [ 3532.474718]cpus_read_lock+0x4c/0xd0
  [ 3532.474723]lru_add_drain_all+0x20/0x40
  [ 3532.474728]ksm_scan_thread+0xba4/0x1a30
  [ 3532.474734]kthread+0x164/0x1b0
  [ 3532.474739]ret_from_kernel_thread+0x5c/0x74
  [ 3532.474742] 
 other info that might help us debug this:
  
  [ 3532.474748] Chain exists of:
   cpu_hotplug_lock.rw_sem --> (memory_chain).rwsem --> 
ksm_thread_mutex
  
  [ 3532.474760]  Possible unsafe locking scenario:
  
  [ 3532.474764]CPU0CPU1
  [ 3532.474768]
  [ 3532.474771]   lock(ksm_thread_mutex);
  [ 3532.474775]lock((memory_chain).rwsem);
  [ 3532.474781]lock(ksm_thread_mutex);
  [ 3532.474786]   lock(cpu_hotplug_lock.rw_sem);
  [ 3532.474791] 
  *** DEADLOCK ***
  
  [ 3532.474797] 1 lock held by ksmd/1459:
  [ 3532.474800]  #0:  (ksm_thread_mutex){+.+...}, at: [] 
ksm_scan_thread+0xd8/0x1a30
  [ 3532.474810] 
 stack backtrace:
  [ 3532.474816] CPU: 0 PID: 1459 Comm: ksmd Not tainted 
4.13.0-rc3-gcc6-next-20170807-g4751f76 #1
  [ 3532.474822] Call Trace:
  [ 3532.474827] [c01e54d13930] [c0b57c38] dump_stack+0xe8/0x160 
(unreliable)
  [ 3532.474835] [c01e54d13970] [c0157968] 
print_circular_bug+0x288/0x3d0
  [ 3532.474842] [c01e54d13a10] [c015b9c8] 
__lock_acquire+0x1858/0x1a20
  [ 3532.474849] [c01e54d13b80] [c015c6fc] lock_acquire+0xec/0x2e0
  [ 3532.474855] [c01e54d13c50] [c00d85cc] cpus_read_lock+0x4c/0xd0
  [ 3532.474862] [c01e54d13c80] [c02b06b0] 
lru_add_drain_all+0x20/0x40
  [ 3532.474869] [c01e54d13ca0] [c0331244] 
ksm_scan_thread+0xba4/0x1a30
  [ 3532.474876] [c01e54d13dc0] [c010b614] kthrea

Applied "ASoC: fsl: Convert to using %pOF instead of full_name" to the asoc tree

2017-08-08 Thread Mark Brown
The patch

   ASoC: fsl: Convert to using %pOF instead of full_name

has been applied to the asoc tree at

   git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git 

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.  

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark

From 06d15a2ede999a77f0d05aa600ed473d90d9a909 Mon Sep 17 00:00:00 2001
From: Rob Herring 
Date: Mon, 7 Aug 2017 18:29:18 -0500
Subject: [PATCH] ASoC: fsl: Convert to using %pOF instead of full_name

Now that we have a custom printf format specifier, convert users of
full_name to use %pOF instead. This is preparation to remove storing
of the full path string for each node.

Signed-off-by: Rob Herring 
Acked-by: Nicolin Chen 
Signed-off-by: Mark Brown 
---
 sound/soc/fsl/fsl_dma.c|  4 ++--
 sound/soc/fsl/imx-audmux.c | 16 
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/sound/soc/fsl/fsl_dma.c b/sound/soc/fsl/fsl_dma.c
index ccadefceeff2..ed8ea002902d 100644
--- a/sound/soc/fsl/fsl_dma.c
+++ b/sound/soc/fsl/fsl_dma.c
@@ -897,8 +897,8 @@ static int fsl_soc_dma_probe(struct platform_device *pdev)
 
ret = of_address_to_resource(ssi_np, 0, &res);
if (ret) {
-   dev_err(&pdev->dev, "could not determine resources for %s\n",
-   ssi_np->full_name);
+   dev_err(&pdev->dev, "could not determine resources for %pOF\n",
+   ssi_np);
of_node_put(ssi_np);
return ret;
}
diff --git a/sound/soc/fsl/imx-audmux.c b/sound/soc/fsl/imx-audmux.c
index fc57da341d61..392d5eef356d 100644
--- a/sound/soc/fsl/imx-audmux.c
+++ b/sound/soc/fsl/imx-audmux.c
@@ -268,13 +268,13 @@ static int imx_audmux_parse_dt_defaults(struct 
platform_device *pdev,
 
ret = of_property_read_u32(child, "fsl,audmux-port", &port);
if (ret) {
-   dev_warn(&pdev->dev, "Failed to get fsl,audmux-port of 
child node \"%s\"\n",
-   child->full_name);
+   dev_warn(&pdev->dev, "Failed to get fsl,audmux-port of 
child node \"%pOF\"\n",
+   child);
continue;
}
if (!of_property_read_bool(child, "fsl,port-config")) {
-   dev_warn(&pdev->dev, "child node \"%s\" does not have 
property fsl,port-config\n",
-   child->full_name);
+   dev_warn(&pdev->dev, "child node \"%pOF\" does not have 
property fsl,port-config\n",
+   child);
continue;
}
 
@@ -292,15 +292,15 @@ static int imx_audmux_parse_dt_defaults(struct 
platform_device *pdev,
}
 
if (ret != -EOVERFLOW) {
-   dev_err(&pdev->dev, "Failed to read u32 at index %d of 
child %s\n",
-   i, child->full_name);
+   dev_err(&pdev->dev, "Failed to read u32 at index %d of 
child %pOF\n",
+   i, child);
continue;
}
 
if (audmux_type == IMX31_AUDMUX) {
if (i % 2) {
-   dev_err(&pdev->dev, "One pdcr value is missing 
in child node %s\n",
-   child->full_name);
+   dev_err(&pdev->dev, "One pdcr value is missing 
in child node %pOF\n",
+   child);
continue;
}
imx_audmux_v2_configure_port(port, ptcr, pdcr);
-- 
2.13.2
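
As a small illustration of the conversion (not part of the patch above;
the helper name is hypothetical), printing a device_node with %pOF
replaces the ->full_name dereference:

  /* before: dev_info(dev, "probing %s\n", np->full_name); */
  static void report_node(struct device *dev, struct device_node *np)
  {
          dev_info(dev, "probing %pOF\n", np);
  }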



Re: Revert "powerpc/64: Avoid restore_math call if possible in syscall exit"

2017-08-08 Thread Michael Ellerman
On Tue, 2017-08-08 at 10:55:57 UTC, Michael Ellerman wrote:
> This reverts commit bc4f65e4cf9d6cc43e0e9ba0b8648cf9201cd55f.
> 
> As reported by Andreas, this commit is causing unrecoverable SLB misses in the
> system call exit path:
> 
>   Unrecoverable exception 4100 at c000a1ec
>   Oops: Unrecoverable exception, sig: 6 [#1]
>   SMP NR_CPUS=2 PowerMac
>   ...
>   CPU: 0 PID: 18626 Comm: rm Not tainted 4.13.0-rc3 #1
>   task: c0018335e080 task.stack: c00139e5
>   NIP: c000a1ec LR: c000a118 CTR: 
>   REGS: c00139e53bb0 TRAP: 4100   Not tainted  (4.13.0-rc3)
>   MSR: 90001030  CR: 2444  XER: 2000 SOFTE: 1
>   GPR00:  c00139e53e30 c0abb500 fffe
>   GPR04: c001eb866298   c0018335e080
>   GPR08: 9000d032  0002 f001
>   GPR12: c00139e5 c000 3fffa8c0dca0 3fffa8c0dc88
>   GPR16: 1000 0001 3fffa8c0eaa0 
>   GPR20: 3fffa8c27528 3fffa8c27b00  
>   GPR24: 3fffa8c0d918 31b3efa0 3fffa8c26d68 
>   GPR28: 3fffa8c249e8 3fffa8c263d0 3fffa8c27550 31b3ef10
>   NIP [c000a1ec] system_call_exit+0xc0/0x21c
>   LR [c000a118] system_call+0x58/0x6c
>   Call Trace:
>   [c00139e53e30] [c000a118] system_call+0x58/0x6c (unreliable)
>   Instruction dump:
>   64a51000 7c6300d0 f8a101a0 4b9c 3c00 6006 780007c6 6400
>   6000 7c004039 4082001c e8ed0170 <88070b78> 88c70b79 7c003214 2c20
> 
> This is caused by us trying to load THREAD_LOAD_FP with MSR_RI=0, and taking 
> an
> SLB miss on the thread struct.
> 
> Reported-by: Andreas Schwab 
> Diagnosed-by: Nicholas Piggin 
> Signed-off-by: Michael Ellerman 

Applied to powerpc fixes.

https://git.kernel.org/powerpc/c/44a12806d010944a5727f1dc991231

cheers


Re: [PATCH] powerpc: fix invalid use of register expressions

2017-08-08 Thread Michael Ellerman
Andreas Schwab  writes:

> binutils >= 2.26 now warns about misuse of register expressions in
> assembler operands that are actually literals, for example:
>
> arch/powerpc/kernel/entry_64.S:535: Warning: invalid register expression
>
> Signed-off-by: Andreas Schwab 
> ---
>  arch/powerpc/include/asm/ppc_asm.h |  2 +-
>  arch/powerpc/kernel/swsusp_asm64.S |  2 +-
>  arch/powerpc/kvm/book3s_64_slb.S   |  2 +-
>  arch/powerpc/lib/copypage_power7.S | 14 
>  arch/powerpc/lib/copyuser_power7.S | 66 
> +++---
>  arch/powerpc/lib/memcpy_power7.S   | 66 
> +++---
>  arch/powerpc/lib/string_64.S   |  2 +-
>  7 files changed, 77 insertions(+), 77 deletions(-)

Thanks. I updated the change log to mention that it's almost always
s/r0/0/.

And I folded in:

diff --git a/arch/powerpc/purgatory/trampoline.S 
b/arch/powerpc/purgatory/trampoline.S
index 3696ea6c4826..30277446892c 100644
--- a/arch/powerpc/purgatory/trampoline.S
+++ b/arch/powerpc/purgatory/trampoline.S
@@ -67,7 +67,7 @@ master:
mr  %r16,%r3/* save dt address in reg16 */
li  %r4,20
LWZX_BE %r6,%r3,%r4 /* fetch __be32 version number at byte 20 */
-   cmpwi   %r0,%r6,2   /* v2 or later? */
+   cmpwi   %cr0,%r6,2  /* v2 or later? */
blt 1f
li  %r4,28
STWX_BE %r17,%r3,%r4/* Store my cpu as __be32 at byte 28 */

cheers


[PATCH] powerpc/mm: Fix section mismatch warning in early_check_vec5()

2017-08-08 Thread Michael Ellerman
early_check_vec5() is called from and calls __init routines, so should
also be __init.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/mm/init_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index b1c83a6bfd54..588a521966ec 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -381,7 +381,7 @@ early_param("disable_radix", parse_disable_radix);
  * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do
  * radix.  If not, we clear the radix feature bit so we fall back to hash.
  */
-static void early_check_vec5(void)
+static void __init early_check_vec5(void)
 {
unsigned long root, chosen;
int size;
-- 
2.7.4



[PATCH] powerpc/xive: Fix section mismatch warnings

2017-08-08 Thread Michael Ellerman
Both xive_core_init() and xive_native_init() are called from and call
__init routines, so they should also be __init.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/sysdev/xive/common.c | 4 ++--
 arch/powerpc/sysdev/xive/native.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 6595462b1fc8..739ce590fa9d 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1395,8 +1395,8 @@ void xive_shutdown(void)
xive_ops->shutdown();
 }
 
-bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset,
-   u8 max_prio)
+bool __init xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 
offset,
+  u8 max_prio)
 {
xive_tima = area;
xive_tima_offset = offset;
diff --git a/arch/powerpc/sysdev/xive/native.c 
b/arch/powerpc/sysdev/xive/native.c
index 0f95476b01f6..1dbf782c9239 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -531,7 +531,7 @@ u32 xive_native_default_eq_shift(void)
 }
 EXPORT_SYMBOL_GPL(xive_native_default_eq_shift);
 
-bool xive_native_init(void)
+bool __init xive_native_init(void)
 {
struct device_node *np;
struct resource r;
-- 
2.7.4



Re: [v6 11/15] arm64/kasan: explicitly zero kasan shadow memory

2017-08-08 Thread Pasha Tatashin

Hi Will,

Thank you for looking at this change. What you described was in my 
previous iterations of this project.


See for example here: https://lkml.org/lkml/2017/5/5/369

I was asked to remove that flag, and only zero memory in place when 
needed. Overall the current approach is better everywhere else in the 
kernel, but it adds a little extra code to kasan initialization.


Pasha

On 08/08/2017 05:07 AM, Will Deacon wrote:

On Mon, Aug 07, 2017 at 04:38:45PM -0400, Pavel Tatashin wrote:

To optimize the performance of struct page initialization,
vmemmap_populate() will no longer zero memory.

We must explicitly zero the memory that is allocated by vmemmap_populate()
for kasan, as this memory does not go through struct page initialization
path.

Signed-off-by: Pavel Tatashin 
Reviewed-by: Steven Sistare 
Reviewed-by: Daniel Jordan 
Reviewed-by: Bob Picco 
---
  arch/arm64/mm/kasan_init.c | 42 ++
  1 file changed, 42 insertions(+)

diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 81f03959a4ab..e78a9ecbb687 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -135,6 +135,41 @@ static void __init clear_pgds(unsigned long start,
set_pgd(pgd_offset_k(start), __pgd(0));
  }
  
+/*
+ * Memory that was allocated by vmemmap_populate is not zeroed, so we must
+ * zero it here explicitly.
+ */
+static void
+zero_vmemmap_populated_memory(void)
+{
+   struct memblock_region *reg;
+   u64 start, end;
+
+   for_each_memblock(memory, reg) {
+   start = __phys_to_virt(reg->base);
+   end = __phys_to_virt(reg->base + reg->size);
+
+   if (start >= end)
+   break;
+
+   start = (u64)kasan_mem_to_shadow((void *)start);
+   end = (u64)kasan_mem_to_shadow((void *)end);
+
+   /* Round to the start end of the mapped pages */
+   start = round_down(start, SWAPPER_BLOCK_SIZE);
+   end = round_up(end, SWAPPER_BLOCK_SIZE);
+   memset((void *)start, 0, end - start);
+   }
+
+   start = (u64)kasan_mem_to_shadow(_text);
+   end = (u64)kasan_mem_to_shadow(_end);
+
+   /* Round to the start end of the mapped pages */
+   start = round_down(start, SWAPPER_BLOCK_SIZE);
+   end = round_up(end, SWAPPER_BLOCK_SIZE);
+   memset((void *)start, 0, end - start);
+}


I can't help but think this would be an awful lot nicer if you made
vmemmap_alloc_block take extra GFP flags as a parameter. That way, we could
implement a version of vmemmap_populate that does the zeroing when we need
it, without having to duplicate a bunch of the code like this. I think it
would also be less error-prone, because you wouldn't have to do the
allocation and the zeroing in two separate steps.

Will



[PATCH 00/12] powerpc/8xx: Some cleanup

2017-08-08 Thread Christophe Leroy
This series does some cleanup in the area of the 8xx.

In the same spirit as what Michael did for 4xx, move 8xx
specific stuff into platforms/8xx

Then try to reduce the number of #ifdefs specific to the 8xx

Remove CONFIG_8xx, which is redundant with CONFIG_PPC_8xx

Plus some misc cleanups

Christophe Leroy (12):
  powerpc/8xx: Simplify CONFIG_8xx checks in Makefile
  powerpc/8xx: Move 8xx machine check handlers into platforms/8xx
  powerpc/8xx: Remove SoftwareEmulation()
  powerpc/cpm1: link to CONFIG_CPM1 instead of CONFIG_8xx
  powerpc/8xx: Move mpc8xx_pic.c from sysdev to platform/8xx
  powerpc/time: refactor MFTB() to limit number of ifdefs
  powerpc/kconfig: Simplify PCI_QSPAN selection
  powerpc/8xx: Getting rid of remaining use of CONFIG_8xx
  powerpc/8xx: remove CONFIG_8xx
  powerpc/8xx: Use symbolic PVR value
  powerpc/8xx: Use symbolic names for DSISR bits in DSI
  powerpc/8xx: Remove cpu dependent macro instructions from head_8xx

 arch/powerpc/Kconfig   |  8 ++--
 arch/powerpc/Makefile  |  2 +-
 arch/powerpc/boot/Makefile |  4 +-
 arch/powerpc/boot/ppc_asm.h|  8 
 arch/powerpc/boot/util.S   | 24 +++-
 arch/powerpc/include/asm/cache.h   |  2 +-
 arch/powerpc/include/asm/cputable.h|  4 +-
 arch/powerpc/include/asm/fs_pd.h   |  2 +-
 arch/powerpc/include/asm/nohash/32/pgtable.h   |  2 +-
 arch/powerpc/include/asm/ppc_asm.h | 14 +--
 arch/powerpc/include/asm/reg.h | 16 
 arch/powerpc/include/asm/timex.h   |  6 +--
 arch/powerpc/kernel/Makefile   |  2 +-
 arch/powerpc/kernel/cputable.c |  6 +--
 arch/powerpc/kernel/head_8xx.S |  9 ++---
 arch/powerpc/kernel/irq.c  |  2 +-
 arch/powerpc/kernel/kgdb.c |  4 +-
 arch/powerpc/kernel/traps.c| 43 --
 arch/powerpc/kernel/vdso32/gettimeofday.S  | 12 ++
 arch/powerpc/mm/fault.c|  4 +-
 arch/powerpc/mm/mem.c  |  2 +-
 arch/powerpc/mm/mmu_decl.h | 10 ++---
 arch/powerpc/mm/tlb_nohash_low.S   |  2 +-
 arch/powerpc/platforms/8xx/Kconfig |  1 -
 arch/powerpc/platforms/8xx/Makefile|  2 +-
 arch/powerpc/platforms/8xx/m8xx_setup.c|  2 +-
 arch/powerpc/platforms/8xx/machine_check.c | 37 +++
 .../{sysdev/mpc8xx_pic.c => platforms/8xx/pic.c}   |  2 +-
 .../{sysdev/mpc8xx_pic.h => platforms/8xx/pic.h}   |  0
 arch/powerpc/platforms/Kconfig.cputype |  7 +---
 arch/powerpc/sysdev/Makefile   |  2 +-
 arch/powerpc/sysdev/fsl_soc.c  |  2 +-
 arch/powerpc/sysdev/fsl_soc.h  |  2 +-
 33 files changed, 109 insertions(+), 136 deletions(-)
 create mode 100644 arch/powerpc/platforms/8xx/machine_check.c
 rename arch/powerpc/{sysdev/mpc8xx_pic.c => platforms/8xx/pic.c} (99%)
 rename arch/powerpc/{sysdev/mpc8xx_pic.h => platforms/8xx/pic.h} (100%)

-- 
2.13.3



[PATCH 01/12] powerpc/8xx: Simplify CONFIG_8xx checks in Makefile

2017-08-08 Thread Christophe Leroy
The entire 8xx directory is omitted if CONFIG_PPC_8xx is not enabled, so
within the 8xx/Makefile CONFIG_PPC_8xx is always y. So convert
obj-$(CONFIG_PPC_8xx) to the more obvious obj-y.

Signed-off-by: Christophe Leroy 
---
 This series applies on top of Michael's series beginning with '[1/9] powerpc/47x:
  Guard 47x cputable entries with CONFIG_PPC_47x' added to bundle mce

 arch/powerpc/platforms/8xx/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/8xx/Makefile 
b/arch/powerpc/platforms/8xx/Makefile
index 76a81c3350a8..161f367ebf1e 100644
--- a/arch/powerpc/platforms/8xx/Makefile
+++ b/arch/powerpc/platforms/8xx/Makefile
@@ -1,7 +1,7 @@
 #
 # Makefile for the PowerPC 8xx linux kernel.
 #
-obj-$(CONFIG_PPC_8xx)+= m8xx_setup.o
+obj-y  += m8xx_setup.o
 obj-$(CONFIG_MPC885ADS)   += mpc885ads_setup.o
 obj-$(CONFIG_MPC86XADS)   += mpc86xads_setup.o
 obj-$(CONFIG_PPC_EP88XC)  += ep88xc.o
-- 
2.13.3



[PATCH 02/12] powerpc/8xx: Move 8xx machine check handlers into platforms/8xx

2017-08-08 Thread Christophe Leroy
In the same spirit as what was done for 4xx and 44x, move
the 8xx machine check into platforms/8xx

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/traps.c| 25 
 arch/powerpc/platforms/8xx/Makefile|  2 +-
 arch/powerpc/platforms/8xx/machine_check.c | 37 ++
 3 files changed, 38 insertions(+), 26 deletions(-)
 create mode 100644 arch/powerpc/platforms/8xx/machine_check.c

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 9107f7f86058..b328ca2aef9c 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -558,31 +558,6 @@ int machine_check_e200(struct pt_regs *regs)
 
return 0;
 }
-#elif defined(CONFIG_PPC_8xx)
-int machine_check_8xx(struct pt_regs *regs)
-{
-   unsigned long reason = regs->msr;
-
-   pr_err("Machine check in kernel mode.\n");
-   pr_err("Caused by (from SRR1=%lx): ", reason);
-   if (reason & 0x4000)
-   pr_err("Fetch error at address %lx\n", regs->nip);
-   else
-   pr_err("Data access error at address %lx\n", regs->dar);
-
-#ifdef CONFIG_PCI
-   /* the qspan pci read routines can cause machine checks -- Cort
-*
-* yuck !!! that totally needs to go away ! There are better ways
-* to deal with that than having a wart in the mcheck handler.
-* -- BenH
-*/
-   bad_page_fault(regs, regs->dar, SIGBUS);
-   return 1;
-#else
-   return 0;
-#endif
-}
 #elif defined(CONFIG_PPC32)
 int machine_check_generic(struct pt_regs *regs)
 {
diff --git a/arch/powerpc/platforms/8xx/Makefile 
b/arch/powerpc/platforms/8xx/Makefile
index 161f367ebf1e..756be8345868 100644
--- a/arch/powerpc/platforms/8xx/Makefile
+++ b/arch/powerpc/platforms/8xx/Makefile
@@ -1,7 +1,7 @@
 #
 # Makefile for the PowerPC 8xx linux kernel.
 #
-obj-y  += m8xx_setup.o
+obj-y  += m8xx_setup.o machine_check.o
 obj-$(CONFIG_MPC885ADS)   += mpc885ads_setup.o
 obj-$(CONFIG_MPC86XADS)   += mpc86xads_setup.o
 obj-$(CONFIG_PPC_EP88XC)  += ep88xc.o
diff --git a/arch/powerpc/platforms/8xx/machine_check.c 
b/arch/powerpc/platforms/8xx/machine_check.c
new file mode 100644
index ..402016705a39
--- /dev/null
+++ b/arch/powerpc/platforms/8xx/machine_check.c
@@ -0,0 +1,37 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include 
+#include 
+#include 
+
+#include 
+
+int machine_check_8xx(struct pt_regs *regs)
+{
+   unsigned long reason = regs->msr;
+
+   pr_err("Machine check in kernel mode.\n");
+   pr_err("Caused by (from SRR1=%lx): ", reason);
+   if (reason & 0x4000)
+   pr_err("Fetch error at address %lx\n", regs->nip);
+   else
+   pr_err("Data access error at address %lx\n", regs->dar);
+
+#ifdef CONFIG_PCI
+   /* the qspan pci read routines can cause machine checks -- Cort
+*
+* yuck !!! that totally needs to go away ! There are better ways
+* to deal with that than having a wart in the mcheck handler.
+* -- BenH
+*/
+   bad_page_fault(regs, regs->dar, SIGBUS);
+   return 1;
+#else
+   return 0;
+#endif
+}
-- 
2.13.3



[PATCH 03/12] powerpc/8xx: Remove SoftwareEmulation()

2017-08-08 Thread Christophe Leroy
Since commit aa42c69c67f82 ("[POWERPC] Add support for FP emulation
for the e300c2 core"), program_check_exception() can be called for
math emulation. In that case, 'reason' is 0.

On the 8xx, there is a Software Emulation interrupt which is
called for all unimplemented and illegal instructions. This
interrupt calls SoftwareEmulation() which does almost the
same as program_check_exception() called with reason = 0.

The Software Emulation interrupt sets all reason bits to 0,
it is therefore possible to call program_check_exception()
directly from the interrupt handler.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_8xx.S |  2 +-
 arch/powerpc/kernel/traps.c| 18 --
 2 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 07ddced6bab3..778a0e11d0e6 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -300,7 +300,7 @@ SystemCall:
 /* On the MPC8xx, this is a software emulation interrupt.  It occurs
  * for all unimplemented and illegal instructions.
  */
-   EXCEPTION(0x1000, SoftEmu, SoftwareEmulation, EXC_XFER_STD)
+   EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD)
 
. = 0x1100
 /*
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index b328ca2aef9c..9d1f600bda08 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1518,24 +1518,6 @@ void performance_monitor_exception(struct pt_regs *regs)
perf_irq(regs);
 }
 
-#ifdef CONFIG_8xx
-void SoftwareEmulation(struct pt_regs *regs)
-{
-   CHECK_FULL_REGS(regs);
-
-   if (!user_mode(regs)) {
-   debugger(regs);
-   die("Kernel Mode Unimplemented Instruction or SW FPU Emulation",
-   regs, SIGFPE);
-   }
-
-   if (!emulate_math(regs))
-   return;
-
-   _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
-}
-#endif /* CONFIG_8xx */
-
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 static void handle_debug(struct pt_regs *regs, unsigned long debug_status)
 {
-- 
2.13.3



[PATCH 04/12] powerpc/cpm1: link to CONFIG_CPM1 instead of CONFIG_8xx

2017-08-08 Thread Christophe Leroy
To remain consistent with what is done with CPM2, let's link
CPM1 related parts to CONFIG_CPM1 instead of CONFIG_8xx.

When something depends on both CPM1 and CPM2, we associate it
with CONFIG_CPM.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/sysdev/Makefile  | 3 ++-
 arch/powerpc/sysdev/fsl_soc.c | 2 +-
 arch/powerpc/sysdev/fsl_soc.h | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index 9e70421ad323..ff80780a2568 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -41,10 +41,11 @@ obj-$(CONFIG_XILINX_PCI)+= xilinx_pci.o
 obj-$(CONFIG_OF_RTC)   += of_rtc.o
 
 obj-$(CONFIG_CPM)  += cpm_common.o
+obj-$(CONFIG_CPM1) += cpm1.o
 obj-$(CONFIG_CPM2) += cpm2.o cpm2_pic.o
 obj-$(CONFIG_QUICC_ENGINE) += cpm_common.o
 obj-$(CONFIG_PPC_DCR)  += dcr.o
-obj-$(CONFIG_8xx)  += mpc8xx_pic.o cpm1.o
+obj-$(CONFIG_PPC_8xx)  += mpc8xx_pic.o
 obj-$(CONFIG_UCODE_PATCH)  += micropatch.o
 
 obj-$(CONFIG_PPC_MPC512x)  += mpc5xxx_clocks.o
diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c
index 19101f9cfcfc..1f614fb2be56 100644
--- a/arch/powerpc/sysdev/fsl_soc.c
+++ b/arch/powerpc/sysdev/fsl_soc.c
@@ -98,7 +98,7 @@ u32 fsl_get_sys_freq(void)
 }
 EXPORT_SYMBOL(fsl_get_sys_freq);
 
-#if defined(CONFIG_CPM2) || defined(CONFIG_QUICC_ENGINE) || defined(CONFIG_8xx)
+#if defined(CONFIG_CPM) || defined(CONFIG_QUICC_ENGINE)
 
 u32 get_brgfreq(void)
 {
diff --git a/arch/powerpc/sysdev/fsl_soc.h b/arch/powerpc/sysdev/fsl_soc.h
index d73daa4f0ccf..2640446f8bc4 100644
--- a/arch/powerpc/sysdev/fsl_soc.h
+++ b/arch/powerpc/sysdev/fsl_soc.h
@@ -7,7 +7,7 @@
 struct spi_device;
 
 extern phys_addr_t get_immrbase(void);
-#if defined(CONFIG_CPM2) || defined(CONFIG_QUICC_ENGINE) || defined(CONFIG_8xx)
+#if defined(CONFIG_CPM) || defined(CONFIG_QUICC_ENGINE)
 extern u32 get_brgfreq(void);
 extern u32 get_baudrate(void);
 #else
-- 
2.13.3



[PATCH 05/12] powerpc/8xx: Move mpc8xx_pic.c from sysdev to platform/8xx

2017-08-08 Thread Christophe Leroy
mpc8xx_pic.c is dedicated to the 8xx, so move it to platforms/8xx.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/platforms/8xx/Makefile   | 2 +-
 arch/powerpc/platforms/8xx/m8xx_setup.c   | 2 +-
 arch/powerpc/{sysdev/mpc8xx_pic.c => platforms/8xx/pic.c} | 2 +-
 arch/powerpc/{sysdev/mpc8xx_pic.h => platforms/8xx/pic.h} | 0
 arch/powerpc/sysdev/Makefile  | 1 -
 5 files changed, 3 insertions(+), 4 deletions(-)
 rename arch/powerpc/{sysdev/mpc8xx_pic.c => platforms/8xx/pic.c} (99%)
 rename arch/powerpc/{sysdev/mpc8xx_pic.h => platforms/8xx/pic.h} (100%)

diff --git a/arch/powerpc/platforms/8xx/Makefile 
b/arch/powerpc/platforms/8xx/Makefile
index 756be8345868..f9af3218bd9c 100644
--- a/arch/powerpc/platforms/8xx/Makefile
+++ b/arch/powerpc/platforms/8xx/Makefile
@@ -1,7 +1,7 @@
 #
 # Makefile for the PowerPC 8xx linux kernel.
 #
-obj-y  += m8xx_setup.o machine_check.o
+obj-y  += m8xx_setup.o machine_check.o pic.o
 obj-$(CONFIG_MPC885ADS)   += mpc885ads_setup.o
 obj-$(CONFIG_MPC86XADS)   += mpc86xads_setup.o
 obj-$(CONFIG_PPC_EP88XC)  += ep88xc.o
diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c 
b/arch/powerpc/platforms/8xx/m8xx_setup.c
index f81069f79a94..1917d69f84df 100644
--- a/arch/powerpc/platforms/8xx/m8xx_setup.c
+++ b/arch/powerpc/platforms/8xx/m8xx_setup.c
@@ -23,7 +23,7 @@
 #include 
 #include 
 
-#include 
+#include "pic.h"
 
 #include "mpc8xx.h"
 
diff --git a/arch/powerpc/sysdev/mpc8xx_pic.c b/arch/powerpc/platforms/8xx/pic.c
similarity index 99%
rename from arch/powerpc/sysdev/mpc8xx_pic.c
rename to arch/powerpc/platforms/8xx/pic.c
index 2842f9d63d21..8d5a25d43ef3 100644
--- a/arch/powerpc/sysdev/mpc8xx_pic.c
+++ b/arch/powerpc/platforms/8xx/pic.c
@@ -9,7 +9,7 @@
 #include 
 #include 
 
-#include "mpc8xx_pic.h"
+#include "pic.h"
 
 
 #define PIC_VEC_SPURRIOUS  15
diff --git a/arch/powerpc/sysdev/mpc8xx_pic.h b/arch/powerpc/platforms/8xx/pic.h
similarity index 100%
rename from arch/powerpc/sysdev/mpc8xx_pic.h
rename to arch/powerpc/platforms/8xx/pic.h
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index ff80780a2568..79416fa2e3ba 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -45,7 +45,6 @@ obj-$(CONFIG_CPM1)+= cpm1.o
 obj-$(CONFIG_CPM2) += cpm2.o cpm2_pic.o
 obj-$(CONFIG_QUICC_ENGINE) += cpm_common.o
 obj-$(CONFIG_PPC_DCR)  += dcr.o
-obj-$(CONFIG_PPC_8xx)  += mpc8xx_pic.o
 obj-$(CONFIG_UCODE_PATCH)  += micropatch.o
 
 obj-$(CONFIG_PPC_MPC512x)  += mpc5xxx_clocks.o
-- 
2.13.3



[PATCH 06/12] powerpc/time: refactor MFTB() to limit number of ifdefs

2017-08-08 Thread Christophe Leroy
The 8xx cannot access the TBL and TBU registers using mfspr/mtspr;
they must be accessed using mftb/mftbu.

Due to this, there are a number of places with #ifdef CONFIG_8xx.

This patch defines new macros MFTBL(x) and MFTBU(x) on the same model
as MFTB(x) and tries to make use of them as much as possible.

In arch/powerpc/include/asm/timex.h, we also remove the ifdef
for the asm() operands, as the compiler doesn't mind unused operands.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/boot/ppc_asm.h   |  8 
 arch/powerpc/boot/util.S  | 24 +---
 arch/powerpc/include/asm/ppc_asm.h| 12 +---
 arch/powerpc/include/asm/timex.h  |  4 
 arch/powerpc/kernel/vdso32/gettimeofday.S | 12 +++-
 5 files changed, 25 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/boot/ppc_asm.h b/arch/powerpc/boot/ppc_asm.h
index 68e388ee94fe..c63299f9fdd9 100644
--- a/arch/powerpc/boot/ppc_asm.h
+++ b/arch/powerpc/boot/ppc_asm.h
@@ -80,4 +80,12 @@
.long 0xa6037b7d; /* mtsrr1 r11 */ \
.long 0x244c  /* rfid   */
 
+#ifdef CONFIG_PPC_8xx
+#define MFTBL(dest)mftb dest
+#define MFTBU(dest)mftbu dest
+#else
+#define MFTBL(dest)mfspr dest, SPRN_TBRL
+#define MFTBU(dest)mfspr dest, SPRN_TBRU
+#endif
+
 #endif /* _PPC64_PPC_ASM_H */
diff --git a/arch/powerpc/boot/util.S b/arch/powerpc/boot/util.S
index 243b8497d58b..ec069177d942 100644
--- a/arch/powerpc/boot/util.S
+++ b/arch/powerpc/boot/util.S
@@ -71,32 +71,18 @@ udelay:
add r4,r4,r5
addir4,r4,-1
divwr4,r4,r5/* BUS ticks */
-#ifdef CONFIG_8xx
-1: mftbu   r5
-   mftbr6
-   mftbu   r7
-#else
-1: mfspr   r5, SPRN_TBRU
-   mfspr   r6, SPRN_TBRL
-   mfspr   r7, SPRN_TBRU
-#endif
+1: MFTBU(r5)
+   MFTBL(r6)
+   MFTBU(r7)
cmpw0,r5,r7
bne 1b  /* Get [synced] base time */
addcr9,r6,r4/* Compute end time */
addze   r8,r5
-#ifdef CONFIG_8xx
-2: mftbu   r5
-#else
-2: mfspr   r5, SPRN_TBRU
-#endif
+2: MFTBU(r5)
cmpw0,r5,r8
blt 2b
bgt 3f
-#ifdef CONFIG_8xx
-   mftbr6
-#else
-   mfspr   r6, SPRN_TBRL
-#endif
+   MFTBL(r6)
cmpw0,r6,r9
blt 2b
 3: blr
diff --git a/arch/powerpc/include/asm/ppc_asm.h 
b/arch/powerpc/include/asm/ppc_asm.h
index 6baeeb9acd0d..d0e4f909ee36 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -378,10 +378,16 @@ BEGIN_FTR_SECTION_NESTED(96); \
cmpwi dest,0;   \
beq-  90b;  \
 END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96)
-#elif defined(CONFIG_8xx)
-#define MFTB(dest) mftb dest
 #else
-#define MFTB(dest) mfspr dest, SPRN_TBRL
+#define MFTB(dest) MFTBL(dest)
+#endif
+
+#ifdef CONFIG_PPC_8xx
+#define MFTBL(dest)mftb dest
+#define MFTBU(dest)mftbu dest
+#else
+#define MFTBL(dest)mfspr dest, SPRN_TBRL
+#define MFTBU(dest)mfspr dest, SPRN_TBRU
 #endif
 
 #ifndef CONFIG_SMP
diff --git a/arch/powerpc/include/asm/timex.h b/arch/powerpc/include/asm/timex.h
index 2cf846edb3fc..b467dbcb0fb7 100644
--- a/arch/powerpc/include/asm/timex.h
+++ b/arch/powerpc/include/asm/timex.h
@@ -45,11 +45,7 @@ static inline cycles_t get_cycles(void)
"   .long 0\n"
"   .long 0\n"
".previous"
-#ifdef CONFIG_8xx
-   : "=r" (ret) : "i" (CPU_FTR_601));
-#else
: "=r" (ret) : "i" (CPU_FTR_601), "i" (SPRN_TBRL));
-#endif
return ret;
 #endif
 }
diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S 
b/arch/powerpc/kernel/vdso32/gettimeofday.S
index 6b2b69616e77..769c2624e0a6 100644
--- a/arch/powerpc/kernel/vdso32/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso32/gettimeofday.S
@@ -232,15 +232,9 @@ __do_get_tspec:
lwz r6,(CFG_TB_ORIG_STAMP+4)(r9)
 
/* Get a stable TB value */
-#ifdef CONFIG_8xx
-2: mftbu   r3
-   mftbl   r4
-   mftbu   r0
-#else
-2: mfspr   r3, SPRN_TBRU
-   mfspr   r4, SPRN_TBRL
-   mfspr   r0, SPRN_TBRU
-#endif
+2: MFTBU(r3)
+   MFTBL(r4)
+   MFTBU(r0)
cmplw   cr0,r3,r0
bne-2b
 
-- 
2.13.3



[PATCH 08/12] powerpc/8xx: Getting rid of remaining use of CONFIG_8xx

2017-08-08 Thread Christophe Leroy
Two config options exist to define powerpc MPC8xx:
* CONFIG_PPC_8xx
* CONFIG_8xx

arch/powerpc/platforms/Kconfig.cputype has contained the following
comment about CONFIG_8xx item for some years:
"# this is temp to handle compat with arch=ppc"

arch/powerpc is now the only place with remaining uses of
CONFIG_8xx: get rid of them.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig |  4 ++--
 arch/powerpc/Makefile|  2 +-
 arch/powerpc/boot/Makefile   |  4 ++--
 arch/powerpc/include/asm/cache.h |  2 +-
 arch/powerpc/include/asm/cputable.h  |  4 ++--
 arch/powerpc/include/asm/fs_pd.h |  2 +-
 arch/powerpc/include/asm/nohash/32/pgtable.h |  2 +-
 arch/powerpc/include/asm/ppc_asm.h   |  2 +-
 arch/powerpc/include/asm/reg.h   | 10 +-
 arch/powerpc/include/asm/timex.h |  2 +-
 arch/powerpc/kernel/Makefile |  2 +-
 arch/powerpc/kernel/cputable.c   |  4 ++--
 arch/powerpc/kernel/irq.c|  2 +-
 arch/powerpc/kernel/kgdb.c   |  4 ++--
 arch/powerpc/mm/fault.c  |  2 +-
 arch/powerpc/mm/mem.c|  2 +-
 arch/powerpc/mm/mmu_decl.h   | 10 +-
 arch/powerpc/mm/tlb_nohash_low.S |  2 +-
 arch/powerpc/platforms/8xx/Kconfig   |  1 -
 arch/powerpc/platforms/Kconfig.cputype   |  2 +-
 20 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f2e2a07dd422..634871cee587 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -394,7 +394,7 @@ config HUGETLB_PAGE_SIZE_VARIABLE
 
 config MATH_EMULATION
bool "Math emulation"
-   depends on 4xx || 8xx || PPC_MPC832x || BOOKE
+   depends on 4xx || PPC_8xx || PPC_MPC832x || BOOKE
---help---
  Some PowerPC chips designed for embedded applications do not have
  a floating-point unit and therefore do not implement the
@@ -956,7 +956,7 @@ config PPC_PCI_CHOICE
 
 config PCI
bool "PCI support" if PPC_PCI_CHOICE
-   default y if !40x && !CPM2 && !8xx && !PPC_83xx \
+   default y if !40x && !CPM2 && !PPC_8xx && !PPC_83xx \
&& !PPC_85xx && !PPC_86xx && !GAMECUBE_COMMON
default PCI_QSPAN if PPC_8xx
select GENERIC_PCI_IOMAP
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 0c4b282ec936..399765ccff2f 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -250,7 +250,7 @@ KBUILD_AFLAGS += $(aflags-y)
 KBUILD_CFLAGS += $(cflags-y)
 
 head-y := arch/powerpc/kernel/head_$(BITS).o
-head-$(CONFIG_8xx) := arch/powerpc/kernel/head_8xx.o
+head-$(CONFIG_PPC_8xx) := arch/powerpc/kernel/head_8xx.o
 head-$(CONFIG_40x) := arch/powerpc/kernel/head_40x.o
 head-$(CONFIG_44x) := arch/powerpc/kernel/head_44x.o
 head-$(CONFIG_FSL_BOOKE)   := arch/powerpc/kernel/head_fsl_booke.o
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 6f952fe1f084..8a437c5f6b01 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -115,7 +115,7 @@ src-wlib-y += crtsavres.S
 endif
 src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
 src-wlib-$(CONFIG_44x) += 4xx.c ebony.c bamboo.c
-src-wlib-$(CONFIG_8xx) += mpc8xx.c planetcore.c fsl-soc.c
+src-wlib-$(CONFIG_PPC_8xx) += mpc8xx.c planetcore.c fsl-soc.c
 src-wlib-$(CONFIG_PPC_82xx) += pq2.c fsl-soc.c planetcore.c
 src-wlib-$(CONFIG_EMBEDDED6xx) += mv64x60.c mv64x60_i2c.c ugecon.c fsl-soc.c
 
@@ -132,7 +132,7 @@ src-plat-$(CONFIG_44x) += treeboot-ebony.c cuboot-ebony.c 
treeboot-bamboo.c \
treeboot-iss4xx.c treeboot-currituck.c \
treeboot-akebono.c \
simpleboot.c fixed-head.S virtex.c
-src-plat-$(CONFIG_8xx) += cuboot-8xx.c fixed-head.S ep88xc.c redboot-8xx.c
+src-plat-$(CONFIG_PPC_8xx) += cuboot-8xx.c fixed-head.S ep88xc.c redboot-8xx.c
 src-plat-$(CONFIG_PPC_MPC52xx) += cuboot-52xx.c
 src-plat-$(CONFIG_PPC_82xx) += cuboot-pq2.c fixed-head.S ep8248e.c 
cuboot-824x.c
 src-plat-$(CONFIG_PPC_83xx) += cuboot-83xx.c fixed-head.S redboot-83xx.c
diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index 5a90292afbad..d122f7f957ce 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -5,7 +5,7 @@
 
 
 /* bytes per L1 cache line */
-#if defined(CONFIG_8xx) || defined(CONFIG_403GCX)
+#if defined(CONFIG_PPC_8xx) || defined(CONFIG_403GCX)
 #define L1_CACHE_SHIFT 4
 #define MAX_COPY_PREFETCH  1
 #elif defined(CONFIG_PPC_E500MC)
diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index d02ad93bf708..a9bf921f4efc 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -513,7 +513,7 @@ enum {
 #els

[PATCH 07/12] powerpc/kconfig: Simplify PCI_QSPAN selection

2017-08-08 Thread Christophe Leroy
4xx, CPM2 and 8xx cannot be selected at the same time, so
no need to test 8xx && !4xx && !CPM2. Testing 8xx is enough.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5e9de178b557..f2e2a07dd422 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -958,7 +958,7 @@ config PCI
bool "PCI support" if PPC_PCI_CHOICE
default y if !40x && !CPM2 && !8xx && !PPC_83xx \
&& !PPC_85xx && !PPC_86xx && !GAMECUBE_COMMON
-   default PCI_QSPAN if !4xx && !CPM2 && 8xx
+   default PCI_QSPAN if PPC_8xx
select GENERIC_PCI_IOMAP
help
  Find out whether your system includes a PCI bus. PCI is the name of
@@ -974,7 +974,7 @@ config PCI_SYSCALL
 
 config PCI_QSPAN
bool "QSpan PCI"
-   depends on !4xx && !CPM2 && 8xx
+   depends on PPC_8xx
select PPC_I8259
help
  Say Y here if you have a system based on a Motorola 8xx-series
-- 
2.13.3



[PATCH 09/12] powerpc/8xx: remove CONFIG_8xx

2017-08-08 Thread Christophe Leroy
Two config options exist to define powerpc MPC8xx:
* CONFIG_PPC_8xx
* CONFIG_8xx

arch/powerpc/platforms/Kconfig.cputype has contained the following
comment about CONFIG_8xx item for some years:
"# this is temp to handle compat with arch=ppc"

There are no more users of CONFIG_8xx, so remove it.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/platforms/Kconfig.cputype | 5 -
 1 file changed, 5 deletions(-)

diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index 395593ef580c..13663efc1d31 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -32,7 +32,6 @@ config PPC_85xx
 config PPC_8xx
bool "Freescale 8xx"
select FSL_SOC
-   select 8xx
select PPC_LIB_RHEAP
select SYS_SUPPORTS_HUGETLBFS
 
@@ -149,10 +148,6 @@ config 6xx
depends on PPC32 && PPC_BOOK3S
select PPC_HAVE_PMU_SUPPORT
 
-# this is temp to handle compat with arch=ppc
-config 8xx
-   bool
-
 config E500
select FSL_EMB_PERFMON
select PPC_FSL_BOOK3E
-- 
2.13.3



[PATCH 10/12] powerpc/8xx: Use symbolic PVR value

2017-08-08 Thread Christophe Leroy
For the 8xx, the PVR values defined in arch/powerpc/include/asm/reg.h
are not used anywhere.

Remove all of them and add PVR_8xx.

Use it in arch/powerpc/kernel/cputable.c.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/reg.h | 6 ++
 arch/powerpc/kernel/cputable.c | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 70722e5b93e7..c36823d64ec9 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1247,10 +1247,8 @@
  * differentiated by the version number in the Communication Processor
  * Module (CPM).
  */
-#define PVR_8210x0050
-#define PVR_823PVR_821
-#define PVR_850PVR_821
-#define PVR_860PVR_821
+#define PVR_8xx0x0050
+
 #define PVR_8240   0x00810100
 #define PVR_8245   0x80811014
 #define PVR_8260   PVR_8240
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index e9ba5b84ac9b..760872916013 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -1262,7 +1262,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 #ifdef CONFIG_PPC_8xx
{   /* 8xx */
.pvr_mask   = 0x,
-   .pvr_value  = 0x0050,
+   .pvr_value  = PVR_8xx,
.cpu_name   = "8xx",
/* CPU_FTR_MAYBE_CAN_DOZE is possible,
 * if the 8xx code is there */
-- 
2.13.3



[PATCH 11/12] powerpc/8xx: Use symbolic names for DSISR bits in DSI

2017-08-08 Thread Christophe Leroy
Use symbolic names for DSISR bits in DSI

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_8xx.S | 2 +-
 arch/powerpc/mm/fault.c| 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 778a0e11d0e6..a1c2a2cfec7e 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -595,7 +595,7 @@ DARFixed:/* Return from dcbx instruction bug workaround */
mfspr   r5,SPRN_DSISR
stw r5,_DSISR(r11)
mfspr   r4,SPRN_DAR
-   andis.  r10,r5,0x4000
+   andis.  r10,r5,DSISR_NOHPTE@h
beq+1f
tlbie   r4
 dtlbie:
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 7a218f69f956..6948d0628cde 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -392,7 +392,7 @@ static void sanity_check_fault(bool is_write, unsigned long 
error_code) { }
 #else
 #define page_fault_is_write(__err) ((__err) & DSISR_ISSTORE)
 #if defined(CONFIG_PPC_8xx)
-#define page_fault_is_bad(__err)   ((__err) & 0x1000)
+#define page_fault_is_bad(__err)   ((__err) & DSISR_NOEXEC_OR_G)
 #elif defined(CONFIG_PPC64)
 #define page_fault_is_bad(__err)   ((__err) & DSISR_BAD_FAULT_64S)
 #else
-- 
2.13.3



[PATCH 12/12] powerpc/8xx: Remove cpu dependent macro instructions from head_8xx

2017-08-08 Thread Christophe Leroy
head_8xx is dedicated to the 8xx, so there is no need to use macros
that depend on the CPU.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_8xx.S | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index a1c2a2cfec7e..471936704261 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -123,7 +123,6 @@ turn_on_mmu:
lis r0,start_here@h
ori r0,r0,start_here@l
mtspr   SPRN_SRR0,r0
-   SYNC
rfi /* enables MMU */
 
 /*
@@ -170,7 +169,7 @@ turn_on_mmu:
stw r1,0(r11);  \
tovirt(r1,r11); /* set new kernel sp */ \
li  r10,MSR_KERNEL & ~(MSR_IR|MSR_DR); /* can take exceptions */ \
-   MTMSRD(r10);/* (except for mach check in rtas) */ \
+   mtmsr   r10;\
stw r0,GPR0(r11);   \
SAVE_4GPRS(3, r11); \
SAVE_2GPRS(7, r11)
@@ -915,10 +914,8 @@ start_here:
rfi
 /* Load up the kernel context */
 2:
-   SYNC/* Force all PTE updates to finish */
tlbia   /* Clear all TLB entries */
sync/* wait for tlbia/tlbie to finish */
-   TLBSYNC /* ... on all CPUs */
 
/* set up the PTE pointers for the Abatron bdiGDB.
*/
-- 
2.13.3



Re: [v6 11/15] arm64/kasan: explicitly zero kasan shadow memory

2017-08-08 Thread Will Deacon
On Tue, Aug 08, 2017 at 07:49:22AM -0400, Pasha Tatashin wrote:
> Hi Will,
> 
> Thank you for looking at this change. What you described was in my previous
> iterations of this project.
> 
> See for example here: https://lkml.org/lkml/2017/5/5/369
> 
> I was asked to remove that flag, and only zero memory in place when needed.
> Overall the current approach is better everywhere else in the kernel, but it
> adds a little extra code to kasan initialization.

Damn, I actually prefer the flag :)

But actually, if you look at our implementation of vmemmap_populate, then we
have our own version of vmemmap_populate_basepages that terminates at the
pmd level anyway if ARM64_SWAPPER_USES_SECTION_MAPS. If there's resistance
to do this in the core code, then I'd be inclined to replace our
vmemmap_populate implementation in the arm64 code with a single version that
can terminate at either the PMD or the PTE level, and do zeroing if
required. We're already special-casing it, so we don't really lose anything
imo.

Will


Re: [PATCH 10/13] powerpc/64s: idle simplify KVM idle on POWER9

2017-08-08 Thread Nicholas Piggin
On Tue, 8 Aug 2017 16:06:43 +0530
Gautham R Shenoy  wrote:

> Hi Nicholas,
> 
> On Sun, Aug 06, 2017 at 03:02:38AM +1000, Nicholas Piggin wrote:
> > POWER9 CPUs have independent MMU contexts per thread so KVM
> > does not have to bring sibling threads into real-mode when
> > switching MMU mode to guest. This can simplify POWER9 sleep/wake
> > paths and avoids hwsyncs.
> > 


> > @@ -444,6 +439,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
> > beq 1f
> > b   kvm_start_guest
> >  1:
> > +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)  
> 
> This would be 7 nops on power9. Should we move this to a different
> function and do a bl to that?

Yes that's a good idea.

> > +static void kvmppc_release_hwthread_secondary(int cpu)
> > +{
> > +   struct paca_struct *tpaca;
> > +
> > +   if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> > +   WARN_ON(1);
> > +   return;
> > +   }
> > +
> > +   tpaca = &paca[cpu];
> > +   tpaca->kvm_hstate.hwthread_req = 0;
> > +   kvmppc_release_hwthread(cpu);
> > +}
> > +
> > +  
> 
> Extra blank line not needed.

Sure.

> > @@ -2858,11 +2883,13 @@ static noinline void kvmppc_run_core(struct 
> > kvmppc_vcore *vc)
> > 
> > /* Let secondaries go back to the offline loop */
> > for (i = 0; i < controlled_threads; ++i) {
> > -   kvmppc_release_hwthread(pcpu + i);
> > if (sip && sip->napped[i])
> > kvmppc_ipi_thread(pcpu + i);
> > cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
> > }  
> 
> We are sending an IPI to the thread that has exited the guest and is
> currently napping. The IPI wakes it up so that it can execute the
> offline loop. But we haven't released the hwthread yet, which means
> that hwthread_req for this thread is still set.
> 
> The thread wakes up from nap, executes the pnv_powersave_wakeup code
> where it can enter kvm_start_guest. Is this a legitimate race or am I
> missing something?

Oh I think it's just a silly mistake in my patch, good catch.
Would moving this loop to after the one below solve it? I wasn't
completely happy with uglifying these loops by making the primary
release different from the secondary... maybe I will just move the
difference into kvmppc_release_hwthread(), which is less intrusive
to callers.
Thanks,
Nick


Re: [v6 11/15] arm64/kasan: explicitly zero kasan shadow memory

2017-08-08 Thread Pasha Tatashin

Hi Will,

> Damn, I actually prefer the flag :)
>
> But actually, if you look at our implementation of vmemmap_populate, then we
> have our own version of vmemmap_populate_basepages that terminates at the
> pmd level anyway if ARM64_SWAPPER_USES_SECTION_MAPS. If there's resistance
> to do this in the core code, then I'd be inclined to replace our
> vmemmap_populate implementation in the arm64 code with a single version that
> can terminate at either the PMD or the PTE level, and do zeroing if
> required. We're already special-casing it, so we don't really lose anything
> imo.

Another approach is to create a new mapping interface for kasan only, as
Ard Biesheuvel wrote:


> KASAN uses vmemmap_populate as a convenience: kasan has nothing to do
> with vmemmap, but the function already existed and happened to do what
> KASAN requires.
>
> Given that that will no longer be the case, it would be far better to
> stop using vmemmap_populate altogether, and clone it into a KASAN
> specific version (with an appropriate name) with the zeroing folded
> into it.

I agree with this statement, but I think it should not be part of this
project.


Pasha


Re: block/ps3vram: Check return of ps3vram_cache_init

2017-08-08 Thread SF Markus Elfring
> @@ -741,7 +741,11 @@ static int ps3vram_probe(struct ps3_system_bus_device 
> *dev)
>   goto out_unmap_reports;
>   }
>  
> - ps3vram_cache_init(dev);
> + error = ps3vram_cache_init(dev);
> + if (error < 0) {
> + goto out_unmap_reports;
> + }
> +
>   ps3vram_proc_init(dev);
>  
>   queue = blk_alloc_queue(GFP_KERNEL);

I pointed a few development concerns out.
https://patchwork.ozlabs.org/patch/798575/

Will the commit subject “block/ps3vram: Check return value of
ps3vram_cache_init()” be more appropriate?

Regards,
Markus


RE: [v6 11/15] arm64/kasan: explicitly zero kasan shadow memory

2017-08-08 Thread David Laight
From: Pasha Tatashin
> Sent: 08 August 2017 12:49
> Thank you for looking at this change. What you described was in my
> previous iterations of this project.
> 
> See for example here: https://lkml.org/lkml/2017/5/5/369
> 
> I was asked to remove that flag, and only zero memory in place when
> needed. Overall the current approach is better everywhere else in the
> kernel, but it adds a little extra code to kasan initialization.

Perhaps you could #define the function prototype(s?) so that the flags
are not passed unless it is a kasan build?

David



Re: [v6 11/15] arm64/kasan: explicitly zero kasan shadow memory

2017-08-08 Thread Pasha Tatashin

On 2017-08-08 09:15, David Laight wrote:

From: Pasha Tatashin

Sent: 08 August 2017 12:49
Thank you for looking at this change. What you described was in my
previous iterations of this project.

See for example here: https://lkml.org/lkml/2017/5/5/369

I was asked to remove that flag, and only zero memory in place when
needed. Overall the current approach is better everywhere else in the
kernel, but it adds a little extra code to kasan initialization.


Perhaps you could #define the function prototype(s?) so that the flags
are not passed unless it is a kasan build?



Hi David,

Thank you for the suggestion. I think a kasan-specific vmemmap (what I
described in the previous e-mail) would be a better solution than having
different prototypes for different builds. It would be cleaner to have
all kasan-specific code in one place.


Pasha


Re: [PATCH 0/3] Minor updates for PS3

2017-08-08 Thread Jens Axboe
On 08/08/2017 04:16 AM, Michael Ellerman wrote:
> Geoff Levand  writes:
> 
>> Hi Michael,
>>
>> A few very minor updates for PS3.  Please apply.
> 
> Jens do you want to take the block ones, or should I just take the lot?

Up to you, I'm fine either way.

-- 
Jens Axboe



[PATCH 01/16] mm: Dont assume page-table invariance during faults

2017-08-08 Thread Laurent Dufour
From: Peter Zijlstra 

One of the side effects of speculating on faults (without holding
mmap_sem) is that we can race with free_pgtables() and therefore we
cannot assume the page-tables will stick around.

Remove the reliance on the pte pointer.

Signed-off-by: Peter Zijlstra (Intel) 
---
 mm/memory.c | 27 ---
 1 file changed, 27 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index f65beaad319b..d08f494f1b37 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2103,30 +2103,6 @@ int apply_to_page_range(struct mm_struct *mm, unsigned 
long addr,
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
-/*
- * handle_pte_fault chooses page fault handler according to an entry which was
- * read non-atomically.  Before making any commitment, on those architectures
- * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
- * parts, do_swap_page must check under lock before unmapping the pte and
- * proceeding (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page can safely check later on).
- */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-   pte_t *page_table, pte_t orig_pte)
-{
-   int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-   if (sizeof(pte_t) > sizeof(unsigned long)) {
-   spinlock_t *ptl = pte_lockptr(mm, pmd);
-   spin_lock(ptl);
-   same = pte_same(*page_table, orig_pte);
-   spin_unlock(ptl);
-   }
-#endif
-   pte_unmap(page_table);
-   return same;
-}
-
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned 
long va, struct vm_area_struct *vma)
 {
debug_dma_assert_idle(src);
@@ -2683,9 +2659,6 @@ int do_swap_page(struct vm_fault *vmf)
int exclusive = 0;
int ret = 0;
 
-   if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
-   goto out;
-
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
-- 
2.7.4



[PATCH 00/16] Speculative page faults

2017-08-08 Thread Laurent Dufour
This is a port to kernel 4.13 of the work done by Peter Zijlstra to
handle page faults without holding the mm semaphore [1].

The idea is to try to handle user space page faults without holding the
mmap_sem. This should allow better concurrency for massively threaded
processes, since the page fault handler will not wait for other threads'
memory layout changes to complete, assuming those changes are done in
another part of the process's memory space. This type of page fault is
named a speculative page fault. If the speculative page fault fails
because concurrency is detected or because the underlying PMD or PTE
tables are not yet allocated, its processing is aborted and a classic
page fault is then tried.
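
At the architecture level this ends up as a simple fallback pattern. The
sketch below is only schematic; the real code, in the x86 and powerpc
patches later in this series, also filters on user-mode faults and fixes
up the fault flags:

    /* Schematic only: try the lockless path, fall back to the classic one. */
    fault = handle_speculative_fault(mm, address, flags);
    if (!(fault & (VM_FAULT_RETRY | VM_FAULT_ERROR)))
            goto done;                      /* handled without mmap_sem */

    down_read(&mm->mmap_sem);               /* classic, locked path */
    vma = find_vma(mm, address);
    fault = handle_mm_fault(vma, address, flags);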

The speculative page fault (SPF) has to look for the VMA matching the fault
address without holding the mmap_sem, so the VMA list is now managed using
SRCU, allowing lockless walking. The only impact is the deferred file
dereferencing in the case of a file mapping, since the file pointer is
released once the SRCU cleanup is done. This series relies on the change
done recently by Paul McKenney in SRCU which now runs a callback per CPU
instead of per SRCU structure [1].

The VMA's attributes checked during the speculative page fault processing
have to be protected against parallel changes. This is done by using a per
VMA sequence lock. This sequence lock allows the speculative page fault
handler to fast check for parallel changes in progress and to abort the
speculative page fault in that case.

Once the VMA is found, the speculative page fault handler checks the
VMA's attributes to verify whether the page fault can be handled this way
or not. Here too the VMA is protected through a sequence lock which
allows fast detection of concurrent VMA changes. If such a change is
detected, the speculative page fault is aborted and a *classic* page fault
is tried. VMA sequence lock write sections are added wherever the VMA
attributes checked during the page fault are modified.

When the PTE is fetched, the VMA is checked to see whether it has changed;
once the page table is locked the VMA is known to be valid, and any other
change touching this PTE will first need to take the page table lock, so
no parallel change is possible at this point.
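
Schematically, the validation works as follows (a sketch only, using the
helper names introduced by the patches in this series):

    /* Snapshot the VMA sequence count before any speculative work. */
    vmf.sequence = raw_read_seqcount(&vma->vm_sequence);

    /* ... walk the page tables gup_fast() style and prepare the new page ... */

    /*
     * pte_map_lock() re-checks vm_sequence once the PTL is held; if the
     * VMA changed in the meantime, give up and retry the classic way.
     */
    if (!pte_map_lock(&vmf))
            return VM_FAULT_RETRY;
    /* The PTE can now be installed safely under vmf.ptl. */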

Compared to Peter's initial work, this series introduces a spin_trylock
when dealing with the speculative page fault. This is required to avoid a
deadlock when handling a page fault while a TLB invalidate is requested by
another CPU holding the PTE. Another change was needed because of a lock
dependency issue with mapping->i_mmap_rwsem.

This series builds on top of v4.13-rc4 and is functional on x86 and
PowerPC.

Tests have been made using a large commercial in-memory database on a
PowerPC system with 752 CPUs. The results are very encouraging since the
loading of the 2TB database was 14% faster with the speculative page
fault.

Using the ebizzy test [3], which spawns a lot of threads, the results are
good both on a large and on a small system. With kernbench, the results
are quite similar, which is expected since not many multithreaded
processes are involved, but there is no performance degradation either,
which is good.

--
Benchmarks results

Note these tests have been made on top of 4.13-rc3 with the following patch
from Paul McKenney applied:
 "srcu: Provide ordering for CPU not involved in grace period" [5]

Ebizzy:
---
The test counts the number of records per second it can manage; higher is
better. I run it as 'ebizzy -mTRp'. To get consistent results I repeated
the test 100 times and measured the average, the mean deviation and the
max.

- 16 CPUs x86 VM
Records/s        4.13-rc3    4.13-rc3-spf
Average          11455.92    45803.64
Mean deviation   509.34      848.19
Max              13997       49824

- 80 CPUs Power 8 node:
Records/s        4.13-rc3    4.13-rc3-spf
Average          33848.76    63427.62
Mean deviation   684.48      1618.84
Max              36235       70401

Kernbench:
--
This test builds a 4.12 kernel using the platform's default config. The
build has been run 5 times for each kernel.

- 16 CPUs x86 VM
Average Half load -j 7 Run (std deviation)
                 4.13.0-rc3         4.13.0-rc3-spf
Elapsed Time     166.668 (0.462299) 167.55 (0.432724)
User Time        1083.11 (2.89018)  1083.76 (2.17015)
System Time      202.982 (0.984058) 210.364 (0.890382)
Percent CPU      771.2 (0.83666)    771.8 (1.09545)
Context Switches 46789 (519.558)    67602.4 (365.929)
Sleeps           83870.8 (836.392)  84269.4 (457.962)

Average Optimal load -j 16 Run (std deviation)
                 4.13.0-rc3         4.13.0-rc3-spf
Elapsed Time     85.002 (0.298111)  85.406 (0.506784)
User Time        1033.25 (52.6037)  1034.63 (51.8167)
System Time      185.46 (18.4826)   191.75 (19.6379)
Percent CPU      1062.6 (307.181)   1063.9 (307.948)
Context Switches 67423.3 (21762.7)  91316.1

[PATCH 02/16] mm: Prepare for FAULT_FLAG_SPECULATIVE

2017-08-08 Thread Laurent Dufour
From: Peter Zijlstra 

When speculating faults (without holding mmap_sem) we need to validate
that the vma against which we loaded pages is still valid when we're
ready to install the new PTE.

Therefore, replace the pte_offset_map_lock() calls that (re)take the
PTL with pte_map_lock() which can fail in case we find the VMA changed
since we started the fault.

Signed-off-by: Peter Zijlstra (Intel) 

[Port to 4.12 kernel]
[Remove the comment about the fault_env structure which has been
 implemented as the vm_fault structure in the kernel]
Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h |  1 +
 mm/memory.c| 55 ++
 2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 46b9ac5e8569..8763ec96dc78 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -286,6 +286,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_USER0x40/* The fault originated in 
userspace */
 #define FAULT_FLAG_REMOTE  0x80/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100  /* The fault was during an instruction 
fetch */
+#define FAULT_FLAG_SPECULATIVE 0x200   /* Speculative fault, not holding 
mmap_sem */
 
 #define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
diff --git a/mm/memory.c b/mm/memory.c
index d08f494f1b37..b93916c0b086 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2241,6 +2241,12 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
 }
 
+static bool pte_map_lock(struct vm_fault *vmf)
+{
+   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, 
&vmf->ptl);
+   return true;
+}
+
 /*
  * Handle the case of a page which we actually need to copy to a new page.
  *
@@ -2268,6 +2274,7 @@ static int wp_page_copy(struct vm_fault *vmf)
const unsigned long mmun_start = vmf->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
+   int ret = VM_FAULT_OOM;
 
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2295,7 +2302,11 @@ static int wp_page_copy(struct vm_fault *vmf)
/*
 * Re-check the pte - we dropped the lock
 */
-   vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   mem_cgroup_cancel_charge(new_page, memcg, false);
+   ret = VM_FAULT_RETRY;
+   goto oom_free_new;
+   }
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
@@ -2383,7 +2394,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 oom:
if (old_page)
put_page(old_page);
-   return VM_FAULT_OOM;
+   return ret;
 }
 
 /**
@@ -2404,8 +2415,8 @@ static int wp_page_copy(struct vm_fault *vmf)
 int finish_mkwrite_fault(struct vm_fault *vmf)
 {
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
-   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
-  &vmf->ptl);
+   if (!pte_map_lock(vmf))
+   return VM_FAULT_RETRY;
/*
 * We might have raced with another page fault while we released the
 * pte_offset_map_lock.
@@ -2523,8 +2534,11 @@ static int do_wp_page(struct vm_fault *vmf)
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
lock_page(vmf->page);
-   vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-   vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   unlock_page(vmf->page);
+   put_page(vmf->page);
+   return VM_FAULT_RETRY;
+   }
if (!pte_same(*vmf->pte, vmf->orig_pte)) {
unlock_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2682,8 +2696,10 @@ int do_swap_page(struct vm_fault *vmf)
 * Back out if somebody else faulted in this pte
 * while we released the pte lock.
 */
-   vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-   vmf->address, &vmf->ptl);
+   if (!pte_map_lock(vmf)) {
+   delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+   return VM_FAULT_RETRY;
+   }
if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2739

[PATCH 03/16] mm: Introduce pte_spinlock for FAULT_FLAG_SPECULATIVE

2017-08-08 Thread Laurent Dufour
When handling a page fault without holding the mmap_sem, the fetch of the
pte lock pointer and the locking will have to be done while ensuring
that the VMA is not modified behind our back.

So move the fetch and locking operations into a dedicated function.

Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index b93916c0b086..11c5fe5f62bb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2241,6 +2241,13 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
 }
 
+static bool pte_spinlock(struct vm_fault *vmf)
+{
+   vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+   spin_lock(vmf->ptl);
+   return true;
+}
+
 static bool pte_map_lock(struct vm_fault *vmf)
 {
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, 
&vmf->ptl);
@@ -3515,8 +3522,8 @@ static int do_numa_page(struct vm_fault *vmf)
 * validation through pte_unmap_same(). It's of NUMA type but
 * the pfn may be screwed if the read is non atomic.
 */
-   vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
-   spin_lock(vmf->ptl);
+   if (!pte_spinlock(vmf))
+   return VM_FAULT_RETRY;
if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
@@ -3708,8 +3715,8 @@ static int handle_pte_fault(struct vm_fault *vmf)
if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
return do_numa_page(vmf);
 
-   vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-   spin_lock(vmf->ptl);
+   if (!pte_spinlock(vmf))
+   return VM_FAULT_RETRY;
entry = vmf->orig_pte;
if (unlikely(!pte_same(*vmf->pte, entry)))
goto unlock;
-- 
2.7.4



[PATCH 04/16] mm: VMA sequence count

2017-08-08 Thread Laurent Dufour
From: Peter Zijlstra 

Wrap the VMA modifications (vma_adjust/unmap_page_range) with sequence
counts such that we can easily test if a VMA is changed.

The unmap_page_range() one allows us to make assumptions about
page-tables; when we find the seqcount hasn't changed we can assume
page-tables are still valid.

The flip side is that we cannot distinguish between a vma_adjust() and
the unmap_page_range() -- where with the former we could have
re-checked the vma bounds against the address.
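
The pattern is the usual seqcount one, sketched below: the write side is
what this patch adds, while the read side only appears with the later
speculative fault patches (which use a raw sequence read plus
vma_has_changed(), but the idea is the same):

    /* Writer: every VMA modification is bracketed by the sequence count. */
    write_seqcount_begin(&vma->vm_sequence);
    /* ... adjust the VMA or unmap its page range ... */
    write_seqcount_end(&vma->vm_sequence);

    /* Reader: snapshot, do the speculative work, then re-check. */
    seq = read_seqcount_begin(&vma->vm_sequence);
    /* ... use the VMA without holding mmap_sem ... */
    if (read_seqcount_retry(&vma->vm_sequence, seq))
            return VM_FAULT_RETRY;  /* a concurrent modification raced with us */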

Signed-off-by: Peter Zijlstra (Intel) 

[Port to 4.12 kernel]
[Fix lock dependency between mapping->i_mmap_rwsem and vma->vm_sequence]
Signed-off-by: Laurent Dufour 
---
 include/linux/mm_types.h |  1 +
 mm/memory.c  |  2 ++
 mm/mmap.c| 21 ++---
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f384bb62d8e..d7d6dae4c009 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -342,6 +342,7 @@ struct vm_area_struct {
struct mempolicy *vm_policy;/* NUMA policy for the VMA */
 #endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+   seqcount_t vm_sequence;
 } __randomize_layout;
 
 struct core_thread {
diff --git a/mm/memory.c b/mm/memory.c
index 11c5fe5f62bb..7d61f64916a2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1380,6 +1380,7 @@ void unmap_page_range(struct mmu_gather *tlb,
unsigned long next;
 
BUG_ON(addr >= end);
+   write_seqcount_begin(&vma->vm_sequence);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
do {
@@ -1389,6 +1390,7 @@ void unmap_page_range(struct mmu_gather *tlb,
next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
+   write_seqcount_end(&vma->vm_sequence);
 }
 
 
diff --git a/mm/mmap.c b/mm/mmap.c
index f19efcf75418..140b22136cb7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -557,6 +557,8 @@ void __vma_link_rb(struct mm_struct *mm, struct 
vm_area_struct *vma,
else
mm->highest_vm_end = vm_end_gap(vma);
 
+   seqcount_init(&vma->vm_sequence);
+
/*
 * vma->vm_prev wasn't known when we followed the rbtree to find the
 * correct insertion point for that vma. As a result, we could not
@@ -798,6 +800,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
}
}
 
+   write_seqcount_begin(&vma->vm_sequence);
+   if (next && next != vma)
+   write_seqcount_begin_nested(&next->vm_sequence,
+   SINGLE_DEPTH_NESTING);
+
anon_vma = vma->anon_vma;
if (!anon_vma && adjust_next)
anon_vma = next->anon_vma;
@@ -902,6 +909,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
mm->map_count--;
mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
+   write_seqcount_end(&next->vm_sequence);
/*
 * In mprotect's case 6 (see comments on vma_merge),
 * we must remove another next too. It would clutter
@@ -931,11 +939,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned 
long start,
if (remove_next == 2) {
remove_next = 1;
end = next->vm_end;
+   write_seqcount_end(&vma->vm_sequence);
goto again;
-   }
-   else if (next)
+   } else if (next) {
+   if (next != vma)
+   write_seqcount_begin_nested(&next->vm_sequence,
+   
SINGLE_DEPTH_NESTING);
vma_gap_update(next);
-   else {
+   } else {
/*
 * If remove_next == 2 we obviously can't
 * reach this path.
@@ -961,6 +972,10 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
if (insert && file)
uprobe_mmap(insert);
 
+   if (next && next != vma)
+   write_seqcount_end(&next->vm_sequence);
+   write_seqcount_end(&vma->vm_sequence);
+
validate_mm(mm);
 
return 0;
-- 
2.7.4



[PATCH 06/16] mm: RCU free VMAs

2017-08-08 Thread Laurent Dufour
From: Peter Zijlstra 

Manage the VMAs with SRCU such that we can do a lockless VMA lookup.

We put the fput(vma->vm_file) in the SRCU callback, this keeps files
valid during speculative faults, this is possible due to the delayed
fput work by Al Viro -- do we need srcu_barrier() in unmount
someplace?

We guard the mm_rb tree with a seqlock (this could be a seqcount but
we'd have to disable preemption around the write side in order to make
the retry loop in __read_seqcount_begin() work) such that we can know
if the rb tree walk was correct. We cannot trust the result of a
lockless tree walk in the face of concurrent tree rotations; although
we can rely on the termination of such walks -- tree rotations
guarantee the end result is a tree again after all.

Furthermore, we rely on the WMB implied by the
write_seqlock/count_begin() to separate the VMA initialization and the
publishing stores, analogous to the RELEASE in rcu_assign_pointer().
We also rely on the RMB from read_seqretry() to separate the vma load
from further loads like the smp_read_barrier_depends() in regular
RCU.

We must not touch the vmacache while doing SRCU lookups as that is not
properly serialized against changes. We update gap information after
publishing the VMA, but A) we don't use that and B) the seqlock
read side would fix that anyhow.

We clear vma->vm_rb for nodes removed from the vma tree such that we
can easily detect such 'dead' nodes, we rely on the WMB from
write_sequnlock() to separate the tree removal and clearing the node.

Provide find_vma_srcu() which wraps the required magic.
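
A sketch of how a speculative reader is expected to combine these pieces;
the real consumer is the speculative fault path added later in the series:

    /* The SRCU read section keeps the VMA from being freed under us. */
    idx = srcu_read_lock(&vma_srcu);
    vma = find_vma_srcu(mm, address);
    if (vma) {
            vmf.sequence = raw_read_seqcount(&vma->vm_sequence);
            /* ... speculative work, later re-validated via vma_has_changed() ... */
    }
    srcu_read_unlock(&vma_srcu, idx);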

Signed-off-by: Peter Zijlstra (Intel) 

[Remove the warnings in description about the SRCU global lock which
 has been removed now]
[Rename vma_is_dead() to vma_has_changed()]
[Pass vm_fault structure pointer instead of 2 arguments  to
 vmf_has_changed() ]
Signed-off-by: Laurent Dufour 
---
 include/linux/mm_types.h |   2 +
 kernel/fork.c|   1 +
 mm/init-mm.c |   1 +
 mm/internal.h|  19 +
 mm/mmap.c| 100 +++
 5 files changed, 97 insertions(+), 26 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d7d6dae4c009..30c127b8a4d8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -343,6 +343,7 @@ struct vm_area_struct {
 #endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
seqcount_t vm_sequence;
+   struct rcu_head vm_rcu_head;
 } __randomize_layout;
 
 struct core_thread {
@@ -360,6 +361,7 @@ struct kioctx_table;
 struct mm_struct {
struct vm_area_struct *mmap;/* list of VMAs */
struct rb_root mm_rb;
+   seqlock_t mm_seq;
u32 vmacache_seqnum;   /* per-thread vmacache */
 #ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
diff --git a/kernel/fork.c b/kernel/fork.c
index 17921b0390b4..8d1223270fea 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -791,6 +791,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, 
struct task_struct *p,
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
mm->vmacache_seqnum = 0;
+   seqlock_init(&mm->mm_seq);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 975e49f00f34..2b1fa061684f 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -16,6 +16,7 @@
 
 struct mm_struct init_mm = {
.mm_rb  = RB_ROOT,
+   .mm_seq = __SEQLOCK_UNLOCKED(init_mm.mm_seq),
.pgd= swapper_pg_dir,
.mm_users   = ATOMIC_INIT(2),
.mm_count   = ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index 4ef49fc55e58..9d6347e35747 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -40,6 +40,25 @@ void page_writeback_init(void);
 
 int do_swap_page(struct vm_fault *vmf);
 
+extern struct srcu_struct vma_srcu;
+
+extern struct vm_area_struct *find_vma_srcu(struct mm_struct *mm,
+   unsigned long addr);
+
+static inline bool vma_has_changed(struct vm_fault *vmf)
+{
+   int ret = RB_EMPTY_NODE(&vmf->vma->vm_rb);
+   unsigned seq = ACCESS_ONCE(vmf->vma->vm_sequence.sequence);
+
+   /*
+* Matches both the wmb in write_seqlock_{begin,end}() and
+* the wmb in vma_rb_erase().
+*/
+   smp_rmb();
+
+   return ret || seq != vmf->sequence;
+}
+
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 221b1f3e966a..73f5ffc03155 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -159,6 +159,23 @@ void unlink_file_vma(struct vm_area_struct *vma)
}
 }
 
+DEFINE_SRCU(vma_srcu);
+
+static void __free_vma(struct rcu_head *head)
+{
+   struct vm_area_struct *vma =
+   container_of(head, struct vm_

[PATCH 05/16] mm: Protect VMA modifications using VMA sequence count

2017-08-08 Thread Laurent Dufour
The VMA sequence count has been introduced to allow fast detection of
VMA modification when running a page fault handler without holding
the mmap_sem.

This patch provides protection against the VMA modifications done in:
- madvise()
- mremap()
- mpol_rebind_policy()
- vma_replace_policy()
- change_prot_numa()
- mlock(), munlock()
- mprotect()
- mmap_region()
- collapse_huge_page()

Signed-off-by: Laurent Dufour 
---
 fs/proc/task_mmu.c |  2 ++
 mm/khugepaged.c|  3 +++
 mm/madvise.c   |  4 
 mm/mempolicy.c | 10 +-
 mm/mlock.c |  9 ++---
 mm/mmap.c  |  2 ++
 mm/mprotect.c  |  2 ++
 mm/mremap.c|  7 +++
 8 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b836fd61ed87..5c0c3ab10f3c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1064,8 +1064,10 @@ static ssize_t clear_refs_write(struct file *file, const 
char __user *buf,
goto out_mm;
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
+   write_seqcount_begin(&vma->vm_sequence);
vma->vm_flags &= ~VM_SOFTDIRTY;
vma_set_page_prot(vma);
+   write_seqcount_end(&vma->vm_sequence);
}
downgrade_write(&mm->mmap_sem);
break;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c01f177a1120..56dd994c05d0 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1005,6 +1005,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (mm_find_pmd(mm, address) != pmd)
goto out;
 
+   write_seqcount_begin(&vma->vm_sequence);
anon_vma_lock_write(vma->anon_vma);
 
pte = pte_offset_map(pmd, address);
@@ -1040,6 +1041,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
+   write_seqcount_end(&vma->vm_sequence);
result = SCAN_FAIL;
goto out;
}
@@ -1074,6 +1076,7 @@ static void collapse_huge_page(struct mm_struct *mm,
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
spin_unlock(pmd_ptl);
+   write_seqcount_end(&vma->vm_sequence);
 
*hpage = NULL;
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 47d8d8a25eae..4f73ecaa0961 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -172,7 +172,9 @@ static long madvise_behavior(struct vm_area_struct *vma,
/*
 * vm_flags is protected by the mmap_sem held in write mode.
 */
+   write_seqcount_begin(&vma->vm_sequence);
vma->vm_flags = new_flags;
+   write_seqcount_end(&vma->vm_sequence);
 out:
return error;
 }
@@ -440,9 +442,11 @@ static void madvise_free_page_range(struct mmu_gather *tlb,
.private = tlb,
};
 
+   write_seqcount_begin(&vma->vm_sequence);
tlb_start_vma(tlb, vma);
walk_page_range(addr, end, &free_walk);
tlb_end_vma(tlb, vma);
+   write_seqcount_end(&vma->vm_sequence);
 }
 
 static int madvise_free_single_vma(struct vm_area_struct *vma,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d911fa5cb2a7..32ed50c0d4b2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -378,8 +378,11 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
struct vm_area_struct *vma;
 
down_write(&mm->mmap_sem);
-   for (vma = mm->mmap; vma; vma = vma->vm_next)
+   for (vma = mm->mmap; vma; vma = vma->vm_next) {
+   write_seqcount_begin(&vma->vm_sequence);
mpol_rebind_policy(vma->vm_policy, new);
+   write_seqcount_end(&vma->vm_sequence);
+   }
up_write(&mm->mmap_sem);
 }
 
@@ -537,9 +540,11 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
 {
int nr_updated;
 
+   write_seqcount_begin(&vma->vm_sequence);
nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+   write_seqcount_end(&vma->vm_sequence);
 
return nr_updated;
 }
@@ -640,6 +645,7 @@ static int vma_replace_policy(struct vm_area_struct *vma,
if (IS_ERR(new))
return PTR_ERR(new);
 
+   write_seqcount_begin(&vma->vm_sequence);
if (vma->vm_ops && vma->vm_ops->set_policy) {
err = vma->vm_ops->set_policy(vma, new);
if (err)
@@ -648,10 +654,12 @@ static int vma_replace_policy(struct vm_area_struct *vma,
 
old = vma->vm_policy;
vma->vm_policy 

[PATCH 07/16] mm: Provide speculative fault infrastructure

2017-08-08 Thread Laurent Dufour
From: Peter Zijlstra 

Provide infrastructure to do a speculative fault (not holding
mmap_sem).

The not holding of mmap_sem means we can race against VMA
change/removal and page-table destruction. We use the SRCU VMA freeing
to keep the VMA around. We use the VMA seqcount to detect change
(including unmapping / page-table deletion) and we use gup_fast() style
page-table walking to deal with page-table races.

Once we've obtained the page and are ready to update the PTE, we
validate whether the state we started the fault with is still valid; if
not, we fail the fault with VM_FAULT_RETRY, otherwise we update the
PTE and we're done.

Signed-off-by: Peter Zijlstra (Intel) 

[Manage the newly introduced pte_spinlock() for speculative page
 fault to fail if the VMA is touched in our back]
[Rename vma_is_dead() to vma_has_changed()]
[Call p4d_alloc() as it is safe since pgd is valid]
[Call pud_alloc() as it is safe since p4d is valid]
[Set fe.sequence in __handle_mm_fault()]
[Abort speculative path when handle_userfault() has to be called]
[Add additional VMA's flags checks in handle_speculative_fault()]
[Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]
[Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]
[Remove warning comment about waiting for !seq&1 since we don't want
 to wait]
[Remove warning about no huge page support, mention it explicitly]
[Don't call do_fault() in the speculative path as __do_fault() calls
 vma->vm_ops->fault() which may want to release mmap_sem]
[Only vm_fault pointer argument for vma_has_changed()]
Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h |   3 +
 mm/memory.c| 183 -
 2 files changed, 183 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8763ec96dc78..863a13af680a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -315,6 +315,7 @@ struct vm_fault {
gfp_t gfp_mask; /* gfp mask to be used for allocations 
*/
pgoff_t pgoff;  /* Logical page offset based on vma */
unsigned long address;  /* Faulting virtual address */
+   unsigned int sequence;
pmd_t *pmd; /* Pointer to pmd entry matching
 * the 'address' */
pud_t *pud; /* Pointer to pud entry matching
@@ -1286,6 +1287,8 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags);
+extern int handle_speculative_fault(struct mm_struct *mm,
+   unsigned long address, unsigned int flags);
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
diff --git a/mm/memory.c b/mm/memory.c
index 7d61f64916a2..14236d98a5c5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2245,15 +2245,69 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
 
 static bool pte_spinlock(struct vm_fault *vmf)
 {
+   bool ret = false;
+
+   /* Check if vma is still valid */
+   if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+   vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+   spin_lock(vmf->ptl);
+   return true;
+   }
+
+   local_irq_disable();
+   if (vma_has_changed(vmf))
+   goto out;
+
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
-   return true;
+
+   if (vma_has_changed(vmf)) {
+   spin_unlock(vmf->ptl);
+   goto out;
+   }
+
+   ret = true;
+out:
+   local_irq_enable();
+   return ret;
 }
 
 static bool pte_map_lock(struct vm_fault *vmf)
 {
-   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, 
&vmf->ptl);
-   return true;
+   bool ret = false;
+   pte_t *pte;
+   spinlock_t *ptl;
+
+   if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+  vmf->address, &vmf->ptl);
+   return true;
+   }
+
+   /*
+* The first vma_has_changed() guarantees the page-tables are still
+* valid, having IRQs disabled ensures they stay around, hence the
+* second vma_has_changed() to make sure they are still valid once
+* we've got the lock. After that a concurrent zap_pte_range() will
+* block on the PTL and thus we're safe.
+*/
+   local_irq_disable();
+   if (vma_has_changed(vmf))
+   goto out;
+
+   pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &ptl);
+   if (vma_has_changed(vmf)) {
+   pte_unmap_unlock(pte, ptl);
+

[PATCH 09/16] x86/mm: Add speculative pagefault handling

2017-08-08 Thread Laurent Dufour
From: Peter Zijlstra 

Try a speculative fault before acquiring mmap_sem, if it returns with
VM_FAULT_RETRY continue with the mmap_sem acquisition and do the
traditional fault.

Signed-off-by: Peter Zijlstra (Intel) 

[Clearing of FAULT_FLAG_ALLOW_RETRY is now done in
 handle_speculative_fault()]
[Retry with usual fault path in the case VM_ERROR is returned by
 handle_speculative_fault(). This allows signal to be delivered]
Signed-off-by: Laurent Dufour 
---
 arch/x86/mm/fault.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2a1fa10c6a98..46fb9c2a832d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1365,6 +1365,19 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
if (error_code & PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
 
+   if (error_code & PF_USER) {
+   fault = handle_speculative_fault(mm, address, flags);
+
+   /*
+* We also check against VM_FAULT_ERROR because we have to
+* raise a signal by calling later mm_fault_error() which
+* requires the vma pointer to be set. So in that case,
+* we fall through the normal path.
+*/
+   if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR))
+   goto done;
+   }
+
/*
 * When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in
@@ -1474,6 +1487,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
return;
}
 
+done:
/*
 * Major/minor page fault accounting. If any of the events
 * returned VM_FAULT_MAJOR, we account it as a major fault.
-- 
2.7.4



[PATCH 10/16] powerpc/mm: Add speculative page fault

2017-08-08 Thread Laurent Dufour
This patch enables the speculative page fault on the PowerPC
architecture.

This will try a speculative page fault without holding the mmap_sem;
if it returns with VM_FAULT_RETRY, the mmap_sem is acquired and the
traditional page fault processing is done.

Signed-off-by: Laurent Dufour 
---
 arch/powerpc/mm/fault.c | 25 -
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 4c422632047b..c6cd40901dd0 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -291,9 +291,31 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
if (is_write && is_user)
store_update_sp = store_updates_sp(regs);
 
-   if (is_user)
+   if (is_user) {
flags |= FAULT_FLAG_USER;
 
+   /* let's try a speculative page fault without grabbing the
+* mmap_sem.
+*/
+
+   /*
+* flags is set later based on the VMA's flags, for the common
+* speculative service, we need some flags to be set.
+*/
+   if (is_write)
+   flags |= FAULT_FLAG_WRITE;
+
+   fault = handle_speculative_fault(mm, address, flags);
+   if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR))
+   goto done;
+
+   /*
+* Resetting flags since the following code assumes
+* FAULT_FLAG_WRITE is not set.
+*/
+   flags &= ~FAULT_FLAG_WRITE;
+   }
+
/* When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in the
 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -479,6 +501,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
rc = 0;
}
 
+done:
/*
 * Major/minor page fault accounting.
 */
-- 
2.7.4



[PATCH 12/16] mm: Protect SPF handler against anon_vma changes

2017-08-08 Thread Laurent Dufour
The speculative page fault handler must be protected against anon_vma
changes. This is because page_add_new_anon_rmap() is called during the
speculative path.

In addition, don't try a speculative page fault if the VMA doesn't have
an anon_vma structure allocated, because its allocation should be
protected by the mmap_sem.

In __vma_adjust(), when importer->anon_vma is set, there is no need to
protect against speculative page faults since the speculative page fault
is aborted if vma->anon_vma is not set.

When calling page_add_new_anon_rmap(), vma->anon_vma is necessarily
valid since we checked for it when locking the pte, and the anon_vma is
only removed once the pte is unlocked. This holds even if the speculative
page fault handler is running concurrently with do_munmap(): the pte is
locked in unmap_region() - through unmap_vmas() - and the anon_vma is
unlinked later, and we check the vma sequence counter, which is updated
in unmap_page_range() before locking the pte and again in
free_pgtables(), so when locking the pte any such change will be
detected.

Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 519c28507a93..cb6906435ff5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -587,7 +587,9 @@ void free_pgtables(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
 * Hide vma from rmap and truncate_pagecache before freeing
 * pgtables
 */
+   write_seqcount_begin(&vma->vm_sequence);
unlink_anon_vmas(vma);
+   write_seqcount_end(&vma->vm_sequence);
unlink_file_vma(vma);
 
if (is_vm_hugetlb_page(vma)) {
@@ -601,7 +603,9 @@ void free_pgtables(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
   && !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
+   write_seqcount_begin(&vma->vm_sequence);
unlink_anon_vmas(vma);
+   write_seqcount_end(&vma->vm_sequence);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
@@ -2403,7 +2407,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 * thread doing COW.
 */
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
-   page_add_new_anon_rmap(new_page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
@@ -2873,7 +2877,7 @@ int do_swap_page(struct vm_fault *vmf)
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
} else { /* ksm created a completely new copy */
-   page_add_new_anon_rmap(page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
@@ -3015,7 +3019,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
}
 
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-   page_add_new_anon_rmap(page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
 setpte:
@@ -3940,6 +3944,9 @@ int handle_speculative_fault(struct mm_struct *mm, 
unsigned long address,
if (address < vma->vm_start || vma->vm_end <= address)
goto unlock;
 
+   if (unlikely(!vma->anon_vma))
+   goto unlock;
+
/*
 * Huge pages are not yet supported.
 */
-- 
2.7.4



[PATCH 11/16] mm: Introduce __page_add_new_anon_rmap()

2017-08-08 Thread Laurent Dufour
When dealing with the speculative page fault handler, we may race with a
VMA being split or merged. In this case the vma->vm_start and vma->vm_end
fields may not match the address at which the page fault is occurring.

This can only happen when the VMA is split, but in that case the
anon_vma pointer of the new VMA will be the same as the original one,
because in __split_vma() the new->anon_vma is set to src->anon_vma when
*new = *vma.

So even if the VMA boundaries are not correct, the anon_vma pointer is
still valid.

If the VMA has been merged, then the VMA in which it has been merged
must have the same anon_vma pointer otherwise the merge can't be done.

So in all cases we know that the anon_vma is valid: we have checked,
before starting the speculative page fault, that the anon_vma pointer is
valid for this VMA. Since there is an anon_vma, at least one page has
been backed at some point, and before the VMA is cleaned up the page
table lock would have to be grabbed to clear the PTE, and the anon_vma
field is checked once the PTE is locked.

This patch introduces a new __page_add_new_anon_rmap() service which
doesn't check the VMA boundaries, and creates a new inline one which
does the check. Currently __page_add_new_anon_rmap() is only
called from the speculative page fault path.

Signed-off-by: Laurent Dufour 
---
 include/linux/rmap.h | 12 ++--
 mm/rmap.c|  5 ++---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 43ef2c30cb0f..f5cd4dbc78b0 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -170,8 +170,16 @@ void page_add_anon_rmap(struct page *, struct 
vm_area_struct *,
unsigned long, bool);
 void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
   unsigned long, int);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
-   unsigned long, bool);
+void __page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, bool);
+static inline void page_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address, bool compound)
+{
+   VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+   __page_add_new_anon_rmap(page, vma, address, compound);
+}
+
 void page_add_file_rmap(struct page *, bool);
 void page_remove_rmap(struct page *, bool);
 
diff --git a/mm/rmap.c b/mm/rmap.c
index c8993c63eb25..e99f9cd7b399 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1117,7 +1117,7 @@ void do_page_add_anon_rmap(struct page *page,
 }
 
 /**
- * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * __page_add_new_anon_rmap - add pte mapping to a new anonymous page
  * @page:  the page to add the mapping to
  * @vma:   the vm area in which the mapping is added
  * @address:   the user virtual address mapped
@@ -1127,12 +1127,11 @@ void do_page_add_anon_rmap(struct page *page,
  * This means the inc-and-test can be bypassed.
  * Page does not have to be locked.
  */
-void page_add_new_anon_rmap(struct page *page,
+void __page_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, bool compound)
 {
int nr = compound ? hpage_nr_pages(page) : 1;
 
-   VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
__SetPageSwapBacked(page);
if (compound) {
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-- 
2.7.4



[PATCH 08/16] mm: Try spin lock in speculative path

2017-08-08 Thread Laurent Dufour
There is a deadlock when a CPU is doing a speculative page fault while
another one is calling do_munmap().

The deadlock occurs because the speculative path tries to spinlock the
pte while interrupts are disabled. The other CPU, in the unmap path, has
locked the pte and is waiting for all the CPUs to invalidate the TLB. As
the CPU doing the speculative fault has interrupts disabled, it can't
invalidate the TLB, and can't get the lock.

Since we are in a speculative path, we can race with other mm actions.
So let's assume that the lock may not get acquired and fail the
speculative page fault.

Here are the stacks captured during the deadlock:

CPU 0
native_flush_tlb_others+0x7c/0x260
flush_tlb_mm_range+0x6a/0x220
tlb_flush_mmu_tlbonly+0x63/0xc0
unmap_page_range+0x897/0x9d0
? unmap_single_vma+0x7d/0xe0
? release_pages+0x2b3/0x360
unmap_single_vma+0x7d/0xe0
unmap_vmas+0x51/0xa0
unmap_region+0xbd/0x130
do_munmap+0x279/0x460
SyS_munmap+0x53/0x70

CPU 1
do_raw_spin_lock+0x14e/0x160
_raw_spin_lock+0x5d/0x80
? pte_map_lock+0x169/0x1b0
pte_map_lock+0x169/0x1b0
handle_pte_fault+0xbf2/0xd80
? trace_hardirqs_on+0xd/0x10
handle_speculative_fault+0x272/0x280
handle_speculative_fault+0x5/0x280
__do_page_fault+0x187/0x580
trace_do_page_fault+0x52/0x260
do_async_page_fault+0x19/0x70
async_page_fault+0x28/0x30

Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 14236d98a5c5..519c28507a93 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2259,7 +2259,8 @@ static bool pte_spinlock(struct vm_fault *vmf)
goto out;
 
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-   spin_lock(vmf->ptl);
+   if (unlikely(!spin_trylock(vmf->ptl)))
+   goto out;
 
if (vma_has_changed(vmf)) {
spin_unlock(vmf->ptl);
@@ -2295,8 +2296,20 @@ static bool pte_map_lock(struct vm_fault *vmf)
if (vma_has_changed(vmf))
goto out;
 
-   pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
- vmf->address, &ptl);
+   /*
+* Same as pte_offset_map_lock() except that we call
+* spin_trylock() in place of spin_lock() to avoid race with
+* unmap path which may have the lock and wait for this CPU
+* to invalidate TLB but this CPU has irq disabled.
+* Since we are in a speculative path, accept it could fail
+*/
+   ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+   pte = pte_offset_map(vmf->pmd, vmf->address);
+   if (unlikely(!spin_trylock(ptl))) {
+   pte_unmap(pte);
+   goto out;
+   }
+
if (vma_has_changed(vmf)) {
pte_unmap_unlock(pte, ptl);
goto out;
-- 
2.7.4



[PATCH 13/16] perf: Add a speculative page fault sw events

2017-08-08 Thread Laurent Dufour
Add new software events to count succeeded and failed speculative page
faults.
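
For completeness, here is a user space sketch of counting one of these
events with perf_event_open(). This is not part of the patch; it assumes
the updated uapi header is installed, otherwise the raw config values 11
and 12 defined below can be used instead:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <linux/perf_event.h>

    int main(void)
    {
            struct perf_event_attr attr;
            long long count = 0;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_SPF_DONE;   /* value 11, added by this patch */
            attr.disabled = 1;

            /* Count for this task, on any CPU. */
            fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
            if (fd < 0)
                    return 1;

            ioctl(fd, PERF_EVENT_IOC_RESET, 0);
            ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
            /* ... run a fault-heavy workload here ... */
            ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

            read(fd, &count, sizeof(count));
            printf("speculative faults done: %lld\n", count);
            close(fd);
            return 0;
    }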

Signed-off-by: Laurent Dufour 
---
 include/uapi/linux/perf_event.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b1c0b187acfe..fbfb03dff334 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -111,6 +111,8 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS  = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT= 10,
+   PERF_COUNT_SW_SPF_DONE  = 11,
+   PERF_COUNT_SW_SPF_FAILED= 12,
 
PERF_COUNT_SW_MAX,  /* non-ABI */
 };
-- 
2.7.4



[PATCH 14/16] x86/mm: Add support for SPF events

2017-08-08 Thread Laurent Dufour
Add support for the new speculative page faults software events.

Signed-off-by: Laurent Dufour 
---
 arch/x86/mm/fault.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 46fb9c2a832d..17985f11b9da 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1374,8 +1374,12 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
 * requires the vma pointer to be set. So in that case,
 * we fall through the normal path.
 */
-   if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR))
+   if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR)) {
+   perf_sw_event(PERF_COUNT_SW_SPF_DONE, 1,
+ regs, address);
goto done;
+   }
+   perf_sw_event(PERF_COUNT_SW_SPF_FAILED, 1, regs, address);
}
 
/*
-- 
2.7.4



[PATCH 15/16] powerpc/mm: Add support for SPF events

2017-08-08 Thread Laurent Dufour
Add support for the new speculative page faults software events.

Signed-off-by: Laurent Dufour 
---
 arch/powerpc/mm/fault.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index c6cd40901dd0..112c4bc9da70 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -306,8 +306,13 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
flags |= FAULT_FLAG_WRITE;
 
fault = handle_speculative_fault(mm, address, flags);
-   if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR))
+   if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR)) {
+   perf_sw_event(PERF_COUNT_SW_SPF_DONE, 1,
+ regs, address);
goto done;
+   }
+
+   perf_sw_event(PERF_COUNT_SW_SPF_FAILED, 1, regs, address);
 
/*
 * Resetting flags since the following code assumes
-- 
2.7.4



[PATCH 16/16] perf tools: Add support for SPF events

2017-08-08 Thread Laurent Dufour
Add support for the new speculative faults events.

Signed-off-by: Laurent Dufour 
---
 tools/include/uapi/linux/perf_event.h | 2 ++
 tools/perf/util/evsel.c   | 2 ++
 tools/perf/util/parse-events.c| 8 
 tools/perf/util/parse-events.l| 2 ++
 tools/perf/util/python.c  | 2 ++
 5 files changed, 16 insertions(+)

diff --git a/tools/include/uapi/linux/perf_event.h 
b/tools/include/uapi/linux/perf_event.h
index b1c0b187acfe..fbfb03dff334 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -111,6 +111,8 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS  = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT= 10,
+   PERF_COUNT_SW_SPF_DONE  = 11,
+   PERF_COUNT_SW_SPF_FAILED= 12,
 
PERF_COUNT_SW_MAX,  /* non-ABI */
 };
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 413f74df08de..37d55ffd98b1 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -426,6 +426,8 @@ const char *perf_evsel__sw_names[PERF_COUNT_SW_MAX] = {
"alignment-faults",
"emulation-faults",
"dummy",
+   "speculative-faults",
+   "speculative-faults-failed",
 };
 
 static const char *__perf_evsel__sw_name(u64 config)
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 01e779b91c8e..da1f87859366 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -135,6 +135,14 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = {
.symbol = "bpf-output",
.alias  = "",
},
+   [PERF_COUNT_SW_SPF_DONE] = {
+   .symbol = "speculative-faults",
+   .alias  = "spf",
+   },
+   [PERF_COUNT_SW_SPF_FAILED] = {
+   .symbol = "speculative-faults-failed",
+   .alias  = "spf-failed",
+   },
 };
 
 #define __PERF_EVENT_FIELD(config, name) \
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 660fca05bc93..ca0adbc97683 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -274,6 +274,8 @@ alignment-faults{ return 
sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_AL
 emulation-faults   { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS); }
 dummy  { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); }
 bpf-output { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); }
+speculative-faults|spf { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_SPF_DONE); }
+speculative-faults-failed|spf-failed   { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_SPF_FAILED); }
 
/*
 * We have to handle the kernel PMU event 
cycles-ct/cycles-t/mem-loads/mem-stores separately.
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index c129e99114ae..b85e70e0da06 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -1141,6 +1141,8 @@ static struct {
PERF_CONST(COUNT_SW_ALIGNMENT_FAULTS),
PERF_CONST(COUNT_SW_EMULATION_FAULTS),
PERF_CONST(COUNT_SW_DUMMY),
+   PERF_CONST(COUNT_SW_SPF_DONE),
+   PERF_CONST(COUNT_SW_SPF_FAILED),
 
PERF_CONST(SAMPLE_IP),
PERF_CONST(SAMPLE_TID),
-- 
2.7.4



Re: [PATCH 0/4] Removing full paths from DT full_name

2017-08-08 Thread Rob Herring
On Mon, Aug 7, 2017 at 9:21 PM, Michael Ellerman  wrote:
> Rob Herring  writes:
>
>> On Tue, Jul 25, 2017 at 4:44 PM, Rob Herring  wrote:
>>> This series is the last steps to remove storing the full path for every
>>> DT node. Instead, we can create full path strings dynamically as needed
>>> with printf %pOF specifiers (commit ce4fecf1fe15). There are a number of
>>> remaining direct users of full_name after this series. I don't believe
>>> there should be any functional impact for those users with the change to
>>> only the node name (+unit-address). The majority are for struct
>>> resource.name. This should only affect /proc/iomem display.
>>>
>>> Patches 1 and 2 can be applied now for 4.14. For patches 3 and 4, my
>>> target is 4.15 after all the dependencies have been merged.
>>>
>>> PPC folks, Please test! The PPC parts are untested. A git branch with
>>> all the dependencies is here[1].
>>
>> PPC folks, any chance to test this?
>
> I got stuck on your %pOF conversion, which broke the vio.c code, because of:
>
> -   if (!strcmp(parent_node->full_name, 
> "/ibm,platform-facilities"))
> +   if (!strcmp(parent_node->full_name, 
> "ibm,platform-facilities"))
>
> But full_name hasn't been changed yet.

Ah, those lines need to be dropped from the %pOF conversion. I'll send
a new version.

What I was originally considering here was just:

strcmp(kbasename(parent_node->full_name), "ibm,platform-facilities")

That would work for "/foo/ibm,platform-facilities" too, but IMO
validation of the DT is not the kernel's job (if it is, we're doing a
horrible job). But in the end here, I decided to keep the existing
full path matching as patch 1 does.

> But patch 1 here should fix that, so I'll pull it all together and try
> and get it tested.

Thanks.

Rob


[PATCH v6 00/17] powerpc/vas: Enable VAS

2017-08-08 Thread Sukadev Bhattiprolu
POWER9 introduces a hardware subsystem referred to as the Virtual
Accelerator Switchboard (VAS). VAS allows kernel subsystems and user
space processes to directly access the Nest Accelerator (NX) engines
which implement compression and encryption algorithms in the hardware.

NX has been in Power processors since Power7+, but access to the NX
engines was through the 'icswx' instruction which is only available
to the kernel/hypervisor. Starting with POWER9, access to the NX
engines is provided to both kernel and user space processes through
VAS.

The switchboard (i.e. VAS) multiplexes accesses between "receivers" and
"senders", where the "receivers" are typically the NX engines and the
"senders" are the kernel subsystems and user processes that wish to
access the receivers (NX engines).  Once a sender is "connected" to
a receiver through the switchboard, the senders can submit compression/
encryption requests to the hardware using the new (PowerISA 3.0)
"copy" and "paste" instructions.

Senders can also send "empty" messages to the receiver. If the receiver
is executing a WAIT instruction, this empty message causes the receiver
to resume from the next instruction (i.e. it acts as a "wake up" message).
This usage of VAS is referred to as "Fast thread-wakeup".

Provides:

This patch set:
- configures the VAS subsystems in the hardware

- provides kernel interfaces to drivers like NX-842 and
  NX-FTW (new) to open receive and send/receive windows
  and to submit copy/paste (i.e compression) requests to
  the NX engines.

- implements an NX-FTW driver for the fast thread-wake up
  mechanism.  It provides the /dev/crypto/nx-ftw device node,
  and ioctls to allow users to use the FTW mechanism in VAS.

Follow-on patch set(s) will allow user space processes to submit
requests to the NX-GZIP engine (and possibly other engines).

Requires:

This patch set needs corresponding VAS/NX skiboot patches which
were merged into the skiboot tree, i.e. skiboot must include:

commit 3b3c596 (NX: Add P9 NX support for 842 compression engine)

Testing:

In-kernel compression requests were tested on DD1 POWER9 hardware
using the following NX-842 patch set from Haren Myneni:

https://lists.ozlabs.org/pipermail/linuxppc-dev/2017-July/160620.html

The ability to setup user space send/receive windows for FTW was
tested on DD1 hardware. The actual copy/paste of the empty messages
is not yet supported in hardware and that functionality was tested
on DD2 simics software.

Git Tree:

https://github.com/sukadev/linux/   

Branch: vas-kern-v6

Thanks to input from Ben Herrenschmidt, Michael Neuling, Michael Ellerman,
Robert Blackmore and Haren Myneni.

Changelog[v6]
- Add support for user space send/receive FTW windows
- Add a new, NX-FTW driver which provides the FTW user interface

Changelog[v5]
- [Ben Herrenschmidt] Make VAS a platform device in the device tree
  and use the core platform functions to parse the VAS properties.
  Map the VAS MMIO regions as non-cachable and paste regions as
  cachable. Use CONFIG_PPC_VAS rather than CONFIG_VAS; Don't assume
  VAS ids are sequential.
- Copy the FIFO address as is into LFIFO_BAR (don't shift it).

Changelog[v4]
Comments from Michael Neuling:
- Move VAS code from drivers/misc/vas to arch/powerpc/platforms/powernv
  since VAS only provides interfaces to other drivers like NX-842.
- Drop vas-internal.h and use vas.h in separate dirs for VAS
  internal, kernel API and user API
- Rather than create 6 separate device tree properties windows
  and window context, combine them into 6 "reg" properties.
- Drop vas_window_reset() since windows are reset/cleared before
  being assigned to kernel/users.
- Use ilog2() and radix_enabled() helpers

Changelog[v3]
- Rebase to v4.11-rc1
- Add interfaces to initialize send/receive window attributes to
  defaults that drivers can use (see arch/powerpc/include/asm/vas.h)
- Modify interface vas_paste() to return 0 or error code
- Fix a bug in setting Translation Control Mode (0b11 not 0x11)
- Enable send-window-credit checking 
- Reorg code  in vas_win_close()
- Minor reorgs and tweaks to register field settings to make it
  easier to add support for user space windows.
- Skip writing to read-only registers
- Start window indexing from 0 rather than 1

Changelog[v2]
- Use vas-id, HVWC, UWC and paste address, entries from device tree
  rather than defining/computing them in kernel and reorg code.

Sukadev Bhattiprolu (17):
  powerpc/vas: Define macros, register f

[PATCH v6 01/17] powerpc/vas: Define macros, register fields and structures

2017-08-08 Thread Sukadev Bhattiprolu
Define macros for the VAS hardware registers and bit-fields as well
as couple of data structures needed by the VAS driver.

Signed-off-by: Sukadev Bhattiprolu 
---
Changelog[v6]
- Add some fields for FTW windows

Changelog[v4]
- [Michael Neuling] Move VAS code to arch/powerpc; Reorg vas.h and
  vas-internal.h to kernel and uapi versions; rather than creating
  separate properties for window context/address entries in device
  tree, combine them into "reg" properties; drop ->hwirq and irq_port
  fields from vas_window as they are only needed with user space
  windows.
- Drop the error check for CONFIG_PPC_4K_PAGES. Instead in a
  follow-on patch add a "depends on CONFIG_PPC_64K_PAGES".

Changelog[v3]
- Rename winctx->pid to winctx->pidr to reflect that it's a value
  from the PID register (SPRN_PID), not the linux process id.
- Make it easier to split header into kernel/user parts
- To keep user interface simple, use macros rather than enum for
  the threshold-control modes.
- Add a pid field to struct vas_window - needed for user space
  send windows.

Changelog[v2]
- Add an overview of VAS in vas-internal.h
- Get window context parameters from device tree and drop
  unnecessary macros.
---
 arch/powerpc/include/asm/vas.h   |  35 
 arch/powerpc/include/uapi/asm/vas.h  |  25 +++
 arch/powerpc/platforms/powernv/vas.h | 382 +++
 3 files changed, 442 insertions(+)
 create mode 100644 arch/powerpc/include/asm/vas.h
 create mode 100644 arch/powerpc/include/uapi/asm/vas.h
 create mode 100644 arch/powerpc/platforms/powernv/vas.h

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
new file mode 100644
index 000..2c8558a
--- /dev/null
+++ b/arch/powerpc/include/asm/vas.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2016 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _MISC_VAS_H
+#define _MISC_VAS_H
+
+#include 
+
+/*
+ * Min and max FIFO sizes are based on Version 1.05 Section 3.1.4.25
+ * (Local FIFO Size Register) of the VAS workbook.
+ */
+#define VAS_RX_FIFO_SIZE_MIN   (1 << 10)   /* 1KB */
+#define VAS_RX_FIFO_SIZE_MAX   (8 << 20)   /* 8MB */
+
+/*
+ * Co-processor Engine type.
+ */
+enum vas_cop_type {
+   VAS_COP_TYPE_FAULT,
+   VAS_COP_TYPE_842,
+   VAS_COP_TYPE_842_HIPRI,
+   VAS_COP_TYPE_GZIP,
+   VAS_COP_TYPE_GZIP_HIPRI,
+   VAS_COP_TYPE_FTW,
+   VAS_COP_TYPE_MAX,
+};
+
+#endif /* _MISC_VAS_H */
diff --git a/arch/powerpc/include/uapi/asm/vas.h 
b/arch/powerpc/include/uapi/asm/vas.h
new file mode 100644
index 000..ddfe046
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/vas.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2016 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_MISC_VAS_H
+#define _UAPI_MISC_VAS_H
+
+/*
+ * Threshold Control Mode: Have paste operation fail if the number of
+ * requests in receive FIFO exceeds a threshold.
+ *
+ * NOTE: No special error code yet if paste is rejected because of these
+ *  limits. So users can't distinguish between this and other errors.
+ */
+#define VAS_THRESH_DISABLED		0
+#define VAS_THRESH_FIFO_GT_HALF_FULL	1
+#define VAS_THRESH_FIFO_GT_QTR_FULL	2
+#define VAS_THRESH_FIFO_GT_EIGHTH_FULL	3
+
+#endif /* _UAPI_MISC_VAS_H */
diff --git a/arch/powerpc/platforms/powernv/vas.h 
b/arch/powerpc/platforms/powernv/vas.h
new file mode 100644
index 000..312a378
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -0,0 +1,382 @@
+/*
+ * Copyright 2016 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _VAS_H
+#define _VAS_H
+#include 
+#include 
+#include 
+
+/*
+ * Overview of Virtual Accelerator Switchboard (VAS).
+ *
+ * VAS is a hardware "switchboard" that allows senders and receivers to
+ * exchange messages with _minimal_ kernel involvement. The receivers are
+ * typically NX coprocessor engines that perform compression or encryption
+ * in hardware, but receivers can also be other software threads.
+ *
+ * Senders are user/kernel threads that submit compression/encryption or
+ * other requests to the receivers. Senders must format their messages as
+ * Coprocessor Request Blocks (CRB)s and submit t

[PATCH v6 02/17] powerpc/vas: Move GET_FIELD/SET_FIELD to vas.h

2017-08-08 Thread Sukadev Bhattiprolu
Move the GET_FIELD and SET_FIELD macros to vas.h as VAS and other
users of VAS, including NX-842 can use those macros.

There is a lot of related code between the VAS/NX kernel drivers
and skiboot. For consistency switch the order of parameters in
SET_FIELD to match the order in skiboot.
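To illustrate the new argument order (mask first, then the word being
updated), here is a small example; EXAMPLE_MASK and the values are made up
for illustration and are not part of this patch:

	/* Hypothetical 4-bit field occupying bits 8-11 of a control word */
	#define EXAMPLE_MASK	0x0000000000000f00UL

	u64 ccw = 0, fc;

	ccw = SET_FIELD(EXAMPLE_MASK, ccw, 0x5);	/* ccw is now 0x500 */
	fc  = GET_FIELD(EXAMPLE_MASK, ccw);		/* fc is 0x5 */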

Signed-off-by: Sukadev Bhattiprolu 
Reviewed-by: Dan Streetman 
---

Changelog[v3]
- Fix order of parameters in nx-842 driver.
---
 arch/powerpc/include/uapi/asm/vas.h | 8 
 drivers/crypto/nx/nx-842-powernv.c  | 7 ---
 drivers/crypto/nx/nx-842.h  | 5 -
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/uapi/asm/vas.h 
b/arch/powerpc/include/uapi/asm/vas.h
index ddfe046..21249f5 100644
--- a/arch/powerpc/include/uapi/asm/vas.h
+++ b/arch/powerpc/include/uapi/asm/vas.h
@@ -22,4 +22,12 @@
 #define VAS_THRESH_FIFO_GT_QTR_FULL	2
 #define VAS_THRESH_FIFO_GT_EIGHTH_FULL 3
 
+/*
+ * Get/Set bit fields
+ */
+#define GET_FIELD(m, v)		(((v) & (m)) >> MASK_LSH(m))
+#define MASK_LSH(m)		(__builtin_ffsl(m) - 1)
+#define SET_FIELD(m, v, val)	\
+		(((v) & ~(m)) | ((((typeof(v))(val)) << MASK_LSH(m)) & (m)))
+
 #endif /* _UAPI_MISC_VAS_H */
diff --git a/drivers/crypto/nx/nx-842-powernv.c 
b/drivers/crypto/nx/nx-842-powernv.c
index 1710f80..3abb045 100644
--- a/drivers/crypto/nx/nx-842-powernv.c
+++ b/drivers/crypto/nx/nx-842-powernv.c
@@ -22,6 +22,7 @@
 
 #include 
 #include 
+#include 
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Dan Streetman ");
@@ -424,9 +425,9 @@ static int nx842_powernv_function(const unsigned char *in, 
unsigned int inlen,
 
/* set up CCW */
ccw = 0;
-   ccw = SET_FIELD(ccw, CCW_CT, nx842_ct);
-   ccw = SET_FIELD(ccw, CCW_CI_842, 0); /* use 0 for hw auto-selection */
-   ccw = SET_FIELD(ccw, CCW_FC_842, fc);
+   ccw = SET_FIELD(CCW_CT, ccw, nx842_ct);
+   ccw = SET_FIELD(CCW_CI_842, ccw, 0); /* use 0 for hw auto-selection */
+   ccw = SET_FIELD(CCW_FC_842, ccw, fc);
 
/* set up CRB's CSB addr */
csb_addr = nx842_get_pa(csb) & CRB_CSB_ADDRESS;
diff --git a/drivers/crypto/nx/nx-842.h b/drivers/crypto/nx/nx-842.h
index a4eee3b..30929bd 100644
--- a/drivers/crypto/nx/nx-842.h
+++ b/drivers/crypto/nx/nx-842.h
@@ -100,11 +100,6 @@ static inline unsigned long nx842_get_pa(void *addr)
return page_to_phys(vmalloc_to_page(addr)) + offset_in_page(addr);
 }
 
-/* Get/Set bit fields */
-#define MASK_LSH(m)	(__builtin_ffsl(m) - 1)
-#define GET_FIELD(v, m)	(((v) & (m)) >> MASK_LSH(m))
-#define SET_FIELD(v, m, val)   (((v) & ~(m)) | (((val) << MASK_LSH(m)) & (m)))
-
 /**
  * This provides the driver's constraints.  Different nx842 implementations
  * may have varying requirements.  The constraints are:
-- 
2.7.4



[PATCH v6 03/17] powerpc/vas: Define vas_init() and vas_exit()

2017-08-08 Thread Sukadev Bhattiprolu
Implement vas_init() and vas_exit() functions for a new VAS module.
This VAS module is essentially a library for other device drivers
and kernel users of the NX coprocessors like NX-842 and NX-GZIP.
In the future this will be extended to add support for user space
to access the NX coprocessors.

VAS is currently only supported with 64K page size.

Signed-off-by: Sukadev Bhattiprolu 
---
Changelog[v5]:
- [Ben Herrenschmidt]: Create and use platform device tree nodes,
  fix up the "reg" properties for the VAS DT node and use the
  platform device helpers to parse the reg properties; Use linked
  list of VAS instances (don't assume vasids are sequential);
  Use CONFIG_PPC_VAS instead of CONFIG_VAS.

Changelog[v4]:
- [Michael Neuling] Fix some accidental deletions; fix help text
  in Kconfig; change vas_initialized to a function; move from
  drivers/misc to arch/powerpc/kernel
- Drop the vas_window_reset() interface. It is not needed as
  window will be initialized before each use.
- Add a "depends on PPC_64K_PAGES"

Changelog[v3]:
- Zero vas_instances memory on allocation
- [Haren Myneni] Fix description in Kconfig
Changelog[v2]:
- Get HVWC, UWC and window address parameters from device tree.
---
 .../devicetree/bindings/powerpc/ibm,vas.txt|  24 +++
 MAINTAINERS|  18 ++
 arch/powerpc/platforms/powernv/Kconfig |  14 ++
 arch/powerpc/platforms/powernv/Makefile|   1 +
 arch/powerpc/platforms/powernv/vas-window.c|  19 +++
 arch/powerpc/platforms/powernv/vas.c   | 183 +
 arch/powerpc/platforms/powernv/vas.h   |  10 +-
 7 files changed, 267 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/powerpc/ibm,vas.txt
 create mode 100644 arch/powerpc/platforms/powernv/vas-window.c
 create mode 100644 arch/powerpc/platforms/powernv/vas.c

diff --git a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt 
b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt
new file mode 100644
index 000..8468a3a
--- /dev/null
+++ b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt
@@ -0,0 +1,24 @@
+* IBM Powerpc Virtual Accelerator Switchboard (VAS)
+
+VAS is a hardware mechanism that allows kernel subsystems and user processes
+to directly submit compression and other requests to Nest accelerators (NX)
+or other coprocessor functions.
+
+Required properties:
+- compatible : should be "ibm,vas" or "ibm,power9-vas"
+- ibm,vas-id : A unique identifier for each instance of VAS in the system
+- reg : Should contain 4 pairs of 64-bit fields specifying the Hypervisor
+  window context start and length, OS/User window context start and length,
+  "Paste address" start and length, "Paste window id" start bit and number
+  of bits
+- name : "vas"
+
+Example:
+
+   vas@60191 {
+   compatible = "ibm,vas", "ibm,power9-vas";
+   reg = <0x60191 0x200 0x60190 0x1 
0x8 0x1 0x20 0x10>;
+   name = "vas";
+   ibm,vas-id = <0x1>;
+   };
+
diff --git a/MAINTAINERS b/MAINTAINERS
index 3c41902..edc58c9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6425,6 +6425,24 @@ F:   drivers/crypto/nx/nx.*
 F: drivers/crypto/nx/nx_csbcpb.h
 F: drivers/crypto/nx/nx_debugfs.h
 
+IBM Power Virtual Accelerator Switchboard
+M: Sukadev Bhattiprolu
+L: linuxppc-dev@lists.ozlabs.org
+S: Supported
+F: arch/powerpc/platforms/powernv/vas*
+F: arch/powerpc/include/asm/vas.h
+F: arch/powerpc/include/uapi/asm/vas.h
+
+IBM Power 842 compression accelerator
+M: Haren Myneni 
+S: Supported
+F: drivers/crypto/nx/Makefile
+F: drivers/crypto/nx/Kconfig
+F: drivers/crypto/nx/nx-842*
+F: include/linux/sw842.h
+F: crypto/842.c
+F: lib/842/
+
 IBM Power Linux RAID adapter
 M: Brian King 
 S: Supported
diff --git a/arch/powerpc/platforms/powernv/Kconfig 
b/arch/powerpc/platforms/powernv/Kconfig
index 6a6f4ef..f565454 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -30,3 +30,17 @@ config OPAL_PRD
help
  This enables the opal-prd driver, a facility to run processor
  recovery diagnostics on OpenPower machines
+
+config PPC_VAS
+   bool "IBM Virtual Accelerator Switchboard (VAS)"
+   depends on PPC_POWERNV && PPC_64K_PAGES
+   default n
+   help
+ This enables support for IBM Virtual Accelerator Switchboard (VAS).
+
+ VAS allows accelerators in co-processors like NX-GZIP and NX-842
+ to be accessible to kernel subsystems and user processes.
+
+ VAS adapters are found in POWER9 based systems.
+
+ If unsure, say N.
diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
ind

[PATCH v6 04/17] powerpc/vas: Define helpers to access MMIO regions

2017-08-08 Thread Sukadev Bhattiprolu
Define some helper functions to access the MMIO regions. We use these
in follow-on patches to read/write VAS hardware registers. They are
also used to later issue 'paste' instructions to submit requests to
the NX hardware engines.
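As a rough sketch of how a later patch in this series is expected to use
these helpers (the register offset and error handling are illustrative;
the ->hvwc_map field is introduced by a follow-on patch):

	uint64_t start, val;
	int len;

	/* Map the window's Hypervisor Window Context (HVWC) MMIO region */
	get_hvwc_mmio_bar(window, &start, &len);
	window->hvwc_map = map_mmio_region("hvwc", start, len);
	if (!window->hvwc_map)
		return -ENOMEM;

	/* Read a 64-bit window register at an (assumed) offset reg_offset */
	val = in_be64(window->hvwc_map + reg_offset);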

Signed-off-by: Sukadev Bhattiprolu 
---
Changelog [v6]:
- Minor reorg to make setup/cleanup functions more symmetric

Changelog [v5]:
- [Ben Herrenschmidt]: Need cachable mapping for paste regions
  and non-cachable mapping for the MMIO regions. So, just use
  ioremap() for mapping the MMIO regions; use "winctx" instead
  of "wc" to avoid collision with "write combine".

Changelog [v3]:
- Minor reorg/cleanup of map/unmap functions

Changelog [v2]:
- Get HVWC, UWC and paste addresses from window->vinst (i.e DT)
  rather than kernel macros.
---
 arch/powerpc/platforms/powernv/vas-window.c | 173 
 1 file changed, 173 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 6156fbe..a3a705a 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -9,9 +9,182 @@
 
 #include 
 #include 
+#include 
+#include 
 
 #include "vas.h"
 
+/*
+ * Compute the paste address region for the window @window using the
+ * ->paste_base_addr and ->paste_win_id_shift we got from device tree.
+ */
+void compute_paste_address(struct vas_window *window, uint64_t *addr, int *len)
+{
+   uint64_t base, shift;
+   int winid;
+
+   base = window->vinst->paste_base_addr;
+   shift = window->vinst->paste_win_id_shift;
+   winid = window->winid;
+
+   *addr  = base + (winid << shift);
+   if (len)
+   *len = PAGE_SIZE;
+
+   pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr);
+}
+
+static inline void get_hvwc_mmio_bar(struct vas_window *window,
+   uint64_t *start, int *len)
+{
+   uint64_t pbaddr;
+
+   pbaddr = window->vinst->hvwc_bar_start;
+   *start = pbaddr + window->winid * VAS_HVWC_SIZE;
+   *len = VAS_HVWC_SIZE;
+}
+
+static inline void get_uwc_mmio_bar(struct vas_window *window,
+   uint64_t *start, int *len)
+{
+   uint64_t pbaddr;
+
+   pbaddr = window->vinst->uwc_bar_start;
+   *start = pbaddr + window->winid * VAS_UWC_SIZE;
+   *len = VAS_UWC_SIZE;
+}
+
+/*
+ * Map the paste bus address of the given send window into kernel address
+ * space. Unlike MMIO regions (map_mmio_region() below), paste region must
+ * be mapped cache-able and is only applicable to send windows.
+ */
+void *map_paste_region(struct vas_window *txwin)
+{
+   int rc, len;
+   void *map;
+   char *name;
+   uint64_t start;
+
+   rc = -ENOMEM;
+   name = kasprintf(GFP_KERNEL, "window-v%d-w%d", txwin->vinst->vas_id,
+   txwin->winid);
+   if (!name)
+   return ERR_PTR(rc);
+
+   txwin->paste_addr_name = name;
+   compute_paste_address(txwin, &start, &len);
+
+   if (!request_mem_region(start, len, name)) {
+   pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n",
+   __func__, start, len);
+   goto free_name;
+   }
+
+   map = ioremap_cache(start, len);
+   if (!map) {
+   pr_devel("%s(): ioremap_cache(0x%llx, %d) failed\n", __func__,
+   start, len);
+   goto free_name;
+   }
+
+   pr_devel("VAS: mapped paste addr 0x%llx to kaddr 0x%p\n", start, map);
+   return map;
+
+free_name:
+   kfree(name);
+   return ERR_PTR(rc);
+}
+
+
+static void *map_mmio_region(char *name, uint64_t start, int len)
+{
+   void *map;
+
+   if (!request_mem_region(start, len, name)) {
+   pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n",
+   __func__, start, len);
+   return NULL;
+   }
+
+   map = ioremap(start, len);
+   if (!map) {
+   pr_devel("%s(): ioremap(0x%llx, %d) failed\n", __func__, start,
+   len);
+   return NULL;
+   }
+
+   return map;
+}
+
+static void unmap_region(void *addr, uint64_t start, int len)
+{
+   iounmap(addr);
+   release_mem_region((phys_addr_t)start, len);
+}
+
+/*
+ * Unmap the paste address region for a window.
+ */
+void unmap_paste_region(struct vas_window *window)
+{
+   int len;
+   uint64_t busaddr_start;
+
+   if (window->paste_kaddr) {
+   compute_paste_address(window, &busaddr_start, &len);
+   unmap_region(window->paste_kaddr, busaddr_start, len);
+   window->paste_kaddr = NULL;
+   kfree(window->paste_addr_name);
+   window->paste_addr_name = NULL;
+   }
+}
+
+/*
+ * Unmap the MMIO regions for a window.
+ */
+static void unmap_winctx_mmio_bars(str

[PATCH v6 05/17] powerpc/vas: Define helpers to init window context

2017-08-08 Thread Sukadev Bhattiprolu
Define helpers to initialize window context registers of the VAS
hardware. These will be used in follow-on patches when opening/closing
VAS windows.

Signed-off-by: Sukadev Bhattiprolu 
---
Changelog[v6]
- Add support for FTW windows and drop the fault window id
  code since it is not needed for FTW/kernel windows.
Changelog[v5]
- Fix: Copy the FIFO address into LFIFO_BAR register as is (don't
  shift address into bits 8:53).

Changelog[v4]
- [Michael Neuling] Use ilog2(), radix_enabled() helpers;
  drop warning when 32-bit app uses VAS (a follow-on patch
  will check and return error). Set MSR_PR state to 0 for
  kernel (rather than reading from MSR).

Changelog[v3]
- Have caller, rather than init_xlate_regs() reset window regs
  so we don't reset any settings caller may already have set.
- Translation mode should be 0x3 (0b11) not 0x11.
- Skip initializing read-only registers NX_UTIL and NX_UTIL_SE
- Skip initializing adder registers from UWC - they are already
  initialized from the HVWC.
- Check winctx->user_win when setting translation registers
---
 arch/powerpc/platforms/powernv/vas-window.c | 305 
 arch/powerpc/platforms/powernv/vas.h|  55 +
 2 files changed, 360 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index a3a705a..3a50d6a 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "vas.h"
 
@@ -185,6 +186,310 @@ int map_winctx_mmio_bars(struct vas_window *window)
return 0;
 }
 
+/*
+ * Reset all valid registers in the HV and OS/User Window Contexts for
+ * the window identified by @window.
+ *
+ * NOTE: We cannot really use a for loop to reset window context. Not all
+ *  offsets in a window context are valid registers and the valid
+ *  registers are not sequential. And, we can only write to offsets
+ *  with valid registers (or is that only in Simics?).
+ */
+void reset_window_regs(struct vas_window *window)
+{
+   write_hvwc_reg(window, VREG(LPID), 0ULL);
+   write_hvwc_reg(window, VREG(PID), 0ULL);
+   write_hvwc_reg(window, VREG(XLATE_MSR), 0ULL);
+   write_hvwc_reg(window, VREG(XLATE_LPCR), 0ULL);
+   write_hvwc_reg(window, VREG(XLATE_CTL), 0ULL);
+   write_hvwc_reg(window, VREG(AMR), 0ULL);
+   write_hvwc_reg(window, VREG(SEIDR), 0ULL);
+   write_hvwc_reg(window, VREG(FAULT_TX_WIN), 0ULL);
+   write_hvwc_reg(window, VREG(OSU_INTR_SRC_RA), 0ULL);
+   write_hvwc_reg(window, VREG(HV_INTR_SRC_RA), 0ULL);
+   write_hvwc_reg(window, VREG(PSWID), 0ULL);
+   write_hvwc_reg(window, VREG(SPARE1), 0ULL);
+   write_hvwc_reg(window, VREG(SPARE2), 0ULL);
+   write_hvwc_reg(window, VREG(SPARE3), 0ULL);
+   write_hvwc_reg(window, VREG(SPARE4), 0ULL);
+   write_hvwc_reg(window, VREG(SPARE5), 0ULL);
+   write_hvwc_reg(window, VREG(SPARE6), 0ULL);
+   write_hvwc_reg(window, VREG(LFIFO_BAR), 0ULL);
+   write_hvwc_reg(window, VREG(LDATA_STAMP_CTL), 0ULL);
+   write_hvwc_reg(window, VREG(LDMA_CACHE_CTL), 0ULL);
+   write_hvwc_reg(window, VREG(LRFIFO_PUSH), 0ULL);
+   write_hvwc_reg(window, VREG(CURR_MSG_COUNT), 0ULL);
+   write_hvwc_reg(window, VREG(LNOTIFY_AFTER_COUNT), 0ULL);
+   write_hvwc_reg(window, VREG(LRX_WCRED), 0ULL);
+   write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), 0ULL);
+   write_hvwc_reg(window, VREG(TX_WCRED), 0ULL);
+   write_hvwc_reg(window, VREG(TX_WCRED_ADDER), 0ULL);
+   write_hvwc_reg(window, VREG(LFIFO_SIZE), 0ULL);
+   write_hvwc_reg(window, VREG(WINCTL), 0ULL);
+   write_hvwc_reg(window, VREG(WIN_STATUS), 0ULL);
+   write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), 0ULL);
+   write_hvwc_reg(window, VREG(TX_RSVD_BUF_COUNT), 0ULL);
+   write_hvwc_reg(window, VREG(LRFIFO_WIN_PTR), 0ULL);
+   write_hvwc_reg(window, VREG(LNOTIFY_CTL), 0ULL);
+   write_hvwc_reg(window, VREG(LNOTIFY_PID), 0ULL);
+   write_hvwc_reg(window, VREG(LNOTIFY_LPID), 0ULL);
+   write_hvwc_reg(window, VREG(LNOTIFY_TID), 0ULL);
+   write_hvwc_reg(window, VREG(LNOTIFY_SCOPE), 0ULL);
+   write_hvwc_reg(window, VREG(NX_UTIL_ADDER), 0ULL);
+
+   /* Skip read-only registers: NX_UTIL and NX_UTIL_SE */
+
+   /*
+* The send and receive window credit adder registers are also
+* accessible from HVWC and have been initialized above. We don't
+* need to initialize from the OS/User Window Context, so skip
+* following calls:
+*
+*  write_uwc_reg(window, VREG(TX_WCRED_ADDER), 0ULL);
+*  write_uwc_reg(window, VREG(LRX_WCRED_ADDER), 0ULL);
+*/
+}
+
+/*
+ * Initialize window context registers related to Address Translation.
+ * These

[PATCH v6 06/17] powerpc/vas: Define helpers to alloc/free windows

2017-08-08 Thread Sukadev Bhattiprolu
Define helpers to allocate/free VAS window objects. These will
be used in follow-on patches when opening/closing windows.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/platforms/powernv/vas-window.c | 70 +
 1 file changed, 70 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 3a50d6a..9c12919 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -490,6 +490,76 @@ int init_winctx_regs(struct vas_window *window, struct 
vas_winctx *winctx)
return 0;
 }
 
+DEFINE_SPINLOCK(vas_ida_lock);
+
+void vas_release_window_id(struct ida *ida, int winid)
+{
+   spin_lock(&vas_ida_lock);
+   ida_remove(ida, winid);
+   spin_unlock(&vas_ida_lock);
+}
+
+int vas_assign_window_id(struct ida *ida)
+{
+   int rc, winid;
+
+   rc = ida_pre_get(ida, GFP_KERNEL);
+   if (!rc)
+   return -EAGAIN;
+
+   spin_lock(&vas_ida_lock);
+   rc = ida_get_new_above(ida, 0, &winid);
+   spin_unlock(&vas_ida_lock);
+
+   if (rc)
+   return rc;
+
+   if (winid > VAS_WINDOWS_PER_CHIP) {
+   pr_err("VAS: Too many (%d) open windows\n", winid);
+   vas_release_window_id(ida, winid);
+   return -EAGAIN;
+   }
+
+   return winid;
+}
+
+void vas_window_free(struct vas_window *window)
+{
+   int winid = window->winid;
+   struct vas_instance *vinst = window->vinst;
+
+   unmap_winctx_mmio_bars(window);
+   kfree(window);
+
+   vas_release_window_id(&vinst->ida, winid);
+}
+
+struct vas_window *vas_window_alloc(struct vas_instance *vinst)
+{
+   int winid;
+   struct vas_window *window;
+
+   winid = vas_assign_window_id(&vinst->ida);
+   if (winid < 0)
+   return ERR_PTR(winid);
+
+   window = kzalloc(sizeof(*window), GFP_KERNEL);
+   if (!window)
+   return ERR_PTR(-ENOMEM);
+
+   window->vinst = vinst;
+   window->winid = winid;
+
+   if (map_winctx_mmio_bars(window))
+   goto out_free;
+
+   return window;
+
+out_free:
+   kfree(window);
+   return ERR_PTR(-ENOMEM);
+}
+
 /* stub for now */
 int vas_win_close(struct vas_window *window)
 {
-- 
2.7.4



[PATCH v6 07/17] powerpc/vas: Define vas_win_paste_addr()

2017-08-08 Thread Sukadev Bhattiprolu
Define an interface that the NX drivers can use to find the physical
paste address of a send window. This interface is expected to be used
with the mmap() operation of the NX driver's device, i.e. a user space
process can use the driver's mmap() operation to map the send window's paste
address into its address space and then use copy and paste instructions
to submit CRBs to the NX engine.

Note that kernel drivers will use vas_paste_crb() directly and don't need
this interface.
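A hedged sketch of what an NX driver's mmap() handler might look like with
this interface (the handler, the per-file "txwin" pointer and the use of
remap_pfn_range() are illustrative assumptions, not part of this patch):

	static int nx_mmap_sketch(struct file *fp, struct vm_area_struct *vma)
	{
		struct vas_window *txwin = fp->private_data; /* assumed: set at open() */
		uint64_t paste_addr = vas_win_paste_addr(txwin);

		/* Paste regions are mapped cache-able (see map_paste_region()) */
		return remap_pfn_range(vma, vma->vm_start, paste_addr >> PAGE_SHIFT,
				       PAGE_SIZE, vma->vm_page_prot);
	}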

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/include/asm/vas.h  |  7 +++
 arch/powerpc/platforms/powernv/vas-window.c | 10 ++
 2 files changed, 17 insertions(+)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index 2c8558a..2b35b95 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -12,6 +12,8 @@
 
 #include 
 
+struct vas_window;
+
 /*
  * Min and max FIFO sizes are based on Version 1.05 Section 3.1.4.25
  * (Local FIFO Size Register) of the VAS workbook.
@@ -32,4 +34,9 @@ enum vas_cop_type {
VAS_COP_TYPE_MAX,
 };
 
+/*
+ * Return the power bus paste address associated with @win so the caller
+ * can map that address into their address space.
+ */
+extern uint64_t vas_win_paste_addr(struct vas_window *win);
 #endif /* _MISC_VAS_H */
diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 9c12919..3a4599f 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -35,6 +35,16 @@ void compute_paste_address(struct vas_window *window, 
uint64_t *addr, int *len)
pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr);
 }
 
+uint64_t vas_win_paste_addr(struct vas_window *win)
+{
+   uint64_t addr;
+
+   compute_paste_address(win, &addr, NULL);
+
+   return addr;
+}
+EXPORT_SYMBOL(vas_win_paste_addr);
+
 static inline void get_hvwc_mmio_bar(struct vas_window *window,
uint64_t *start, int *len)
 {
-- 
2.7.4



[PATCH v6 08/17] powerpc/vas: Define vas_win_id()

2017-08-08 Thread Sukadev Bhattiprolu
Define an interface to return a system-wide unique id for a given VAS
window.

The vas_win_id() will be used in a follow-on patch to generate a unique
handle for a user space receive window. Applications can use this handle
to pair send and receive windows for fast thread-wakeup.

The hardware refers to this system-wide unique id as a Partition Send
Window ID which is expected to be used during fault handling. Hence the
"pswid" in the function names.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/include/asm/vas.h  |  5 +
 arch/powerpc/platforms/powernv/vas-window.c |  9 +
 arch/powerpc/platforms/powernv/vas.h| 28 
 3 files changed, 42 insertions(+)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index 2b35b95..30667db 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -35,6 +35,11 @@ enum vas_cop_type {
 };
 
 /*
+ * Return a system-wide unique id for the VAS window @win.
+ */
+extern uint32_t vas_win_id(struct vas_window *win);
+
+/*
  * Return the power bus paste address associated with @win so the caller
  * can map that address into their address space.
  */
diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 3a4599f..42c1d4f 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -575,3 +575,12 @@ int vas_win_close(struct vas_window *window)
 {
return -1;
 }
+
+/*
+ * Return a system-wide unique window id for the window @win.
+ */
+uint32_t vas_win_id(struct vas_window *win)
+{
+   return encode_pswid(win->vinst->vas_id, win->winid);
+}
+EXPORT_SYMBOL_GPL(vas_win_id);
diff --git a/arch/powerpc/platforms/powernv/vas.h 
b/arch/powerpc/platforms/powernv/vas.h
index 7b2bcd0..3eadf90 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -440,4 +440,32 @@ static inline uint64_t read_hvwc_reg(struct vas_window 
*win,
return in_be64(win->hvwc_map+reg);
 }
 
+/*
+ * Encode/decode the Partition Send Window ID (PSWID) for a window in
+ * a way that we can uniquely identify any window in the system. i.e.
+ * we should be able to locate the 'struct vas_window' given the PSWID.
+ *
+ * Bits		Usage
+ * 0:7		VAS id (8 bits)
+ * 8:15	Unused, 0 (8 bits)
+ * 16:31	Window id (16 bits)
+ */
+static inline u32 encode_pswid(int vasid, int winid)
+{
+   u32 pswid = 0;
+
+   pswid |= vasid << (31 - 7);
+   pswid |= winid;
+
+   return pswid;
+}
+
+static inline void decode_pswid(u32 pswid, int *vasid, int *winid)
+{
+   if (vasid)
+   *vasid = pswid >> (31 - 7) & 0xFF;
+
+   if (winid)
+   *winid = pswid & 0xFFFF;
+}
 #endif /* _VAS_H */
-- 
2.7.4



[PATCH v6 09/17] powerpc/vas: Define vas_rx_win_open() interface

2017-08-08 Thread Sukadev Bhattiprolu
Define the vas_rx_win_open() interface. This interface is intended to be
used by the Nest Accelerator (NX) driver(s) to setup receive windows for
one or more NX engines (which implement compression/encryption algorithms
in the hardware).

Follow-on patches will provide an interface to close the window and to open
a send window that kernel subsystems can use to access the NX engines.

The interface to open a receive window is expected to be invoked for each
instance of VAS in the system.
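A minimal sketch of the expected in-kernel usage (the vasid, FIFO buffer
and sizes are placeholders supplied by the NX driver):

	struct vas_rx_win_attr rxattr;
	struct vas_window *rxwin;

	vas_init_rx_win_attr(&rxattr, VAS_COP_TYPE_842);
	rxattr.rx_fifo = rx_fifo;		/* driver-allocated receive FIFO */
	rxattr.rx_fifo_size = rx_fifo_size;	/* within VAS_RX_FIFO_SIZE_MIN/MAX */

	rxwin = vas_rx_win_open(vasid, VAS_COP_TYPE_842, &rxattr);
	if (IS_ERR(rxwin))
		return PTR_ERR(rxwin);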

Signed-off-by: Sukadev Bhattiprolu 
---

Changelog[v6]:
- Add support for FTW windows

Changelog[v4]:
- Export the symbols

Changelog[v3]:
- Fault receive windows must enable interrupts and disable
  notifications. NX Windows are opposite.
- Use macros rather than enum for threshold-control mode
- Ignore irq_ports for in-kernel windows. They are needed for
  user space windows and will be added later
---
 arch/powerpc/include/asm/vas.h  |  47 
 arch/powerpc/platforms/powernv/vas-window.c | 357 +++-
 arch/powerpc/platforms/powernv/vas.h|  14 ++
 3 files changed, 417 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index 30667db..a3778d7 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -35,6 +35,36 @@ enum vas_cop_type {
 };
 
 /*
+ * Receive window attributes specified by the (in-kernel) owner of window.
+ */
+struct vas_rx_win_attr {
+   void *rx_fifo;
+   int rx_fifo_size;
+   int wcreds_max;
+
+   bool pin_win;
+   bool rej_no_credit;
+   bool tx_wcred_mode;
+   bool rx_wcred_mode;
+   bool tx_win_ord_mode;
+   bool rx_win_ord_mode;
+   bool data_stamp;
+   bool nx_win;
+   bool fault_win;
+   bool user_win;
+   bool notify_disable;
+   bool intr_disable;
+   bool notify_early;
+
+   int lnotify_lpid;
+   int lnotify_pid;
+   int lnotify_tid;
+   uint32_t pswid;
+
+   int tc_mode;
+};
+
+/*
  * Return a system-wide unique id for the VAS window @win.
  */
 extern uint32_t vas_win_id(struct vas_window *win);
@@ -44,4 +74,21 @@ extern uint32_t vas_win_id(struct vas_window *win);
  * can map that address into their address space.
  */
 extern uint64_t vas_win_paste_addr(struct vas_window *win);
+
+/*
+ * Helper to initialize receive window attributes to defaults for an
+ * NX window.
+ */
+extern void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr,
+   enum vas_cop_type cop);
+
+/*
+ * Open a VAS receive window for the instance of VAS identified by @vasid.
+ * Use @attr to initialize the attributes of the window.
+ *
+ * Return a handle to the window or ERR_PTR() on error.
+ */
+extern struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop,
+   struct vas_rx_win_attr *attr);
+
 #endif /* _MISC_VAS_H */
diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 42c1d4f..ff64022 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -12,6 +12,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "vas.h"
 
@@ -544,7 +546,7 @@ void vas_window_free(struct vas_window *window)
vas_release_window_id(&vinst->ida, winid);
 }
 
-struct vas_window *vas_window_alloc(struct vas_instance *vinst)
+static struct vas_window *vas_window_alloc(struct vas_instance *vinst)
 {
int winid;
struct vas_window *window;
@@ -570,6 +572,359 @@ struct vas_window *vas_window_alloc(struct vas_instance 
*vinst)
return ERR_PTR(-ENOMEM);
 }
 
+/*
+ * Check if current task has permissions to pair with the process that
+ * opened the receive window @rxwin. For now the check is based on
+ * kill_ok_by_cred() - i.e equivalent to current task being able to
+ * send a signal to owner of @rxwin.
+ */
+static bool valid_permissions(struct vas_window *rxwin)
+{
+   bool rc;
+   struct task_struct *wtask;
+   const struct cred *txcred, *rxcred;
+
+   rcu_read_lock();
+   wtask = find_task_by_vpid(rxwin->pid);
+
+   /*
+* CHECK: Don't need to get_task_struct(wtask) since we hold
+*RCU till we complete the uid checks? Since rxwin is
+*open, the task has not exited.
+*/
+
+   txcred = current_cred();
+   rxcred = __task_cred(wtask);
+
+   rc = false;
+   if (uid_eq(txcred->euid, rxcred->suid) ||
+   uid_eq(txcred->euid, rxcred->uid) ||
+   uid_eq(txcred->uid, rxcred->suid) ||
+   uid_eq(txcred->uid, rxcred->uid) ||
+   capable(CAP_KILL))
+   rc = true;
+
+   rcu_read_unlock();
+
+   return rc;
+}
+
+/*
+ * Find the user space receive window given the @pswid.
+ *
+ * The pswid, aka rx_win_handle, c

[PATCH v6 10/17] powerpc/vas: Define vas_rx_win_open() interface

2017-08-08 Thread Sukadev Bhattiprolu
Define the vas_rx_win_open() interface. This interface is intended to be
used by the Nest Accelerator (NX) driver(s) to setup receive windows for
one or more NX engines (which implement compression/encryption algorithms
in the hardware).

Follow-on patches will provide an interface to close the window and to open
a send window that kernel subsystems can use to access the NX engines.

The interface to open a receive window is expected to be invoked for each
instance of VAS in the system.

Signed-off-by: Sukadev Bhattiprolu 
---

Changelog[v6]:
- Add support for FTW windows

Changelog[v4]:
- Export the symbols

Changelog[v3]:
- Fault receive windows must enable interrupts and disable
  notifications. NX Windows are opposite.
- Use macros rather than enum for threshold-control mode
- Ignore irq_ports for in-kernel windows. They are needed for
  user space windows and will be added later
---
 arch/powerpc/platforms/powernv/vas-window.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index ff64022..dfa7e67 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -717,7 +717,7 @@ void clear_vinst_win(struct vas_window *window)
 
mutex_lock(&vinst->mutex);
 
-   if (!window->tx_win) {
+   if (!window->user_win && !window->tx_win) {
WARN_ON_ONCE(!vinst->rxwin[window->cop]);
vinst->rxwin[window->cop] = NULL;
}
-- 
2.7.4



[PATCH v6 11/17] powerpc/vas: Define vas_win_close() interface

2017-08-08 Thread Sukadev Bhattiprolu
Define the vas_win_close() interface which should be used to close a
send or receive windows.

While the hardware configurations required to open send and receive windows
differ, the configuration to close a window is the same for both. So we use
a single interface to close the window.
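A short sketch of closing a receive window that may still have send windows
attached ("rxwin" is illustrative):

	int rc;

	rc = vas_win_close(rxwin);
	if (rc == -EAGAIN)
		pr_devel("VAS: Rx window still has active send windows\n");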

Signed-off-by: Sukadev Bhattiprolu 
---
Changelog[v4]:
- Drop the poll for credits return (we can set the required credit,
  but cannot really find the available credit at a point in time)
- Export the symbol

Changelog[v3]:
- Fix order of parameters in GET_FIELD().
- Update references and sequence for closing/quiescing a window.
---
 arch/powerpc/include/asm/vas.h  |  7 ++
 arch/powerpc/platforms/powernv/vas-window.c | 99 +++--
 2 files changed, 101 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index a3778d7..e1c5376 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -91,4 +91,11 @@ extern void vas_init_rx_win_attr(struct vas_rx_win_attr 
*rxattr,
 extern struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop,
struct vas_rx_win_attr *attr);
 
+/*
+ * Close the send or receive window identified by @win. For receive windows
+ * return -EAGAIN if there are active send windows attached to this receive
+ * window.
+ */
+int vas_win_close(struct vas_window *win);
+
 #endif /* _MISC_VAS_H */
diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index dfa7e67..9704a3b 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -139,7 +139,7 @@ static void unmap_region(void *addr, uint64_t start, int 
len)
 /*
  * Unmap the paste address region for a window.
  */
-void unmap_paste_region(struct vas_window *window)
+static void unmap_paste_region(struct vas_window *window)
 {
int len;
uint64_t busaddr_start;
@@ -535,7 +535,7 @@ int vas_assign_window_id(struct ida *ida)
return winid;
 }
 
-void vas_window_free(struct vas_window *window)
+static void vas_window_free(struct vas_window *window)
 {
int winid = window->winid;
struct vas_instance *vinst = window->vinst;
@@ -609,6 +609,14 @@ static bool valid_permissions(struct vas_window *rxwin)
return rc;
 }
 
+static void put_rx_win(struct vas_window *rxwin)
+{
+   /* Better not be a send window! */
+   WARN_ON_ONCE(rxwin->tx_win);
+
+   atomic_dec(&rxwin->num_txwins);
+}
+
 /*
  * Find the user space receive window given the @pswid.
  *
@@ -710,7 +718,7 @@ static void set_vinst_win(struct vas_instance *vinst,
  * Clear this window from the table(s) of windows for this VAS instance.
  * See also function header of set_vinst_win().
  */
-void clear_vinst_win(struct vas_window *window)
+static void clear_vinst_win(struct vas_window *window)
 {
int id = window->winid;
struct vas_instance *vinst = window->vinst;
@@ -925,11 +933,92 @@ struct vas_window *vas_rx_win_open(int vasid, enum 
vas_cop_type cop,
 }
 EXPORT_SYMBOL_GPL(vas_rx_win_open);
 
-/* stub for now */
+static void poll_window_busy_state(struct vas_window *window)
+{
+   int busy;
+   uint64_t val;
+
+retry:
+   /*
+* Poll Window Busy flag
+*/
+   val = read_hvwc_reg(window, VREG(WIN_STATUS));
+   busy = GET_FIELD(VAS_WIN_BUSY, val);
+   if (busy) {
+   val = 0;
+   schedule_timeout(2000);
+   goto retry;
+   }
+}
+
+static void poll_window_castout(struct vas_window *window)
+{
+   int cached;
+   uint64_t val;
+
+   /* Cast window context out of the cache */
+retry:
+   val = read_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL));
+   cached = GET_FIELD(VAS_WIN_CACHE_STATUS, val);
+   if (cached) {
+   val = 0ULL;
+   val = SET_FIELD(VAS_CASTOUT_REQ, val, 1);
+   val = SET_FIELD(VAS_PUSH_TO_MEM, val, 0);
+   write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val);
+
+   schedule_timeout(2000);
+   goto retry;
+   }
+}
+
+/*
+ * Close a window.
+ *
+ * See Section 1.12.1 of VAS workbook v1.05 for details on closing window:
+ * - Disable new paste operations (unmap paste address)
+ * - Poll for the "Window Busy" bit to be cleared
+ * - Clear the Open/Enable bit for the Window.
+ * - Poll for return of window Credits (implies FIFO empty for Rx win?)
+ * - Unpin and cast window context out of cache
+ *
+ * Besides the hardware, kernel has some bookkeeping of course.
+ */
 int vas_win_close(struct vas_window *window)
 {
-   return -1;
+   uint64_t val;
+
+   if (!window)
+   return 0;
+
+   if (!window->tx_win && atomic_read(&window->num_txwins) != 0) {
+   pr_devel("VAS: Attempting to close an active Rx window!\n");
+   

[PATCH v6 12/17] powerpc/vas: Define vas_tx_win_open()

2017-08-08 Thread Sukadev Bhattiprolu
Define an interface to open a VAS send window. This interface is
intended to be used by the Nest Accelerator (NX) driver(s) to open
a send window and use it to submit compression/encryption requests
to a VAS receive window.

The receive window, identified by the [vasid, cop] parameters, must
already be open in VAS (i.e connected to an NX engine).
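A rough sketch of the expected kernel-side usage, pairing a send window with
the receive window opened earlier (vasid is a placeholder):

	struct vas_tx_win_attr txattr;
	struct vas_window *txwin;

	vas_init_tx_win_attr(&txattr, VAS_COP_TYPE_842);
	txwin = vas_tx_win_open(vasid, VAS_COP_TYPE_842, &txattr);
	if (IS_ERR(txwin))
		return PTR_ERR(txwin);

	/* ... submit requests via copy/paste (see the next patch) ... */

	vas_win_close(txwin);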

Signed-off-by: Sukadev Bhattiprolu 

---
Changelog[v6]:
- Add support for FTW windows

Changelog[v4]:
- [Ben Herrenschmidt] MMIO regions must be mapped non-cached and
  paste regions must be mapped cached. Define/use map_paste_region().

Changelog [v3]:
- Distinguish between hardware PID (SPRN_PID) and Linux pid.
- Use macros rather than enum for threshold-control mode
- Set the pid of send window from attr (needed for user space
  send windows).
- Ignore irq port setting for now. They are needed for user space
  windows and will be added later
---
 arch/powerpc/include/asm/vas.h  |  42 
 arch/powerpc/platforms/powernv/vas-window.c | 157 +++-
 2 files changed, 196 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index e1c5376..3fc6435 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -65,6 +65,29 @@ struct vas_rx_win_attr {
 };
 
 /*
+ * Window attributes specified by the in-kernel owner of a send window.
+ */
+struct vas_tx_win_attr {
+   enum vas_cop_type cop;
+   int wcreds_max;
+   int lpid;
+   int pidr;   /* hardware PID (from SPRN_PID) */
+   int pid;/* linux process id */
+   int pswid;
+   int rsvd_txbuf_count;
+   int tc_mode;
+
+   bool user_win;
+   bool pin_win;
+   bool rej_no_credit;
+   bool rsvd_txbuf_enable;
+   bool tx_wcred_mode;
+   bool rx_wcred_mode;
+   bool tx_win_ord_mode;
+   bool rx_win_ord_mode;
+};
+
+/*
  * Return a system-wide unique id for the VAS window @win.
  */
 extern uint32_t vas_win_id(struct vas_window *win);
@@ -92,6 +115,25 @@ extern struct vas_window *vas_rx_win_open(int vasid, enum 
vas_cop_type cop,
struct vas_rx_win_attr *attr);
 
 /*
+ * Helper to initialize send window attributes to defaults for an NX window.
+ */
+extern void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr,
+   enum vas_cop_type cop);
+
+/*
+ * Open a VAS send window for the instance of VAS identified by @vasid
+ * and the co-processor type @cop. Use @attr to initialize attributes
+ * of the window.
+ *
+ * Note: The instance of VAS must already have an open receive window for
+ * the coprocessor type @cop.
+ *
+ * Return a handle to the send window or ERR_PTR() on error.
+ */
+struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
+   struct vas_tx_win_attr *attr);
+
+/*
  * Close the send or receive window identified by @win. For receive windows
  * return -EAGAIN if there are active send windows attached to this receive
  * window.
diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 9704a3b..3e2655c 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -72,7 +72,7 @@ static inline void get_uwc_mmio_bar(struct vas_window *window,
  * space. Unlike MMIO regions (map_mmio_region() below), paste region must
  * be mapped cache-able and is only applicable to send windows.
  */
-void *map_paste_region(struct vas_window *txwin)
+static void *map_paste_region(struct vas_window *txwin)
 {
int rc, len;
void *map;
@@ -109,7 +109,6 @@ void *map_paste_region(struct vas_window *txwin)
return ERR_PTR(rc);
 }
 
-
 static void *map_mmio_region(char *name, uint64_t start, int len)
 {
void *map;
@@ -657,7 +656,7 @@ struct vas_window *get_user_rxwin(struct vas_instance 
*vinst, uint32_t pswid)
  *
  * See also function header of set_vinst_win().
  */
-struct vas_window *get_vinst_rxwin(struct vas_instance *vinst,
+static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst,
enum vas_cop_type cop, uint32_t pswid)
 {
struct vas_window *rxwin;
@@ -933,6 +932,158 @@ struct vas_window *vas_rx_win_open(int vasid, enum 
vas_cop_type cop,
 }
 EXPORT_SYMBOL_GPL(vas_rx_win_open);
 
+void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, enum vas_cop_type 
cop)
+{
+   memset(txattr, 0, sizeof(*txattr));
+
+   if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) {
+   txattr->rej_no_credit = false;
+   txattr->rx_wcred_mode = true;
+   txattr->tx_wcred_mode = true;
+   txattr->rx_win_ord_mode = true;
+   txattr->tx_win_ord_mode = true;
+   }
+}
+EXPORT_SYMBOL_GPL(vas_init_tx_win_attr);
+
+static void init_winctx_for_txwin(struct vas_wind

[PATCH v6 13/17] powerpc/vas: Define copy/paste interfaces

2017-08-08 Thread Sukadev Bhattiprolu
Define interfaces (wrappers) to the 'copy' and 'paste' instructions
(which are new in PowerISA 3.0). These are intended to be used
by NX driver(s) to submit Coprocessor Request Blocks (CRBs) to the
NX hardware engines.
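A hedged sketch of submitting one request with these wrappers ("txwin" is
assumed to be an open send window and "crb" a CRB formatted for the target
engine):

	int rc;

	rc = vas_copy_crb(crb, 0, true);		/* CRB into copy-buffer */
	if (!rc)
		rc = vas_paste_crb(txwin, 0, true, true);	/* paste to the window */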

Signed-off-by: Sukadev Bhattiprolu 

---
Changelog[v4]
- Export symbols
Changelog[v3]
- Map raw CR value from paste instruction into an error code.
---
 MAINTAINERS |  1 +
 arch/powerpc/include/asm/vas.h  | 13 +
 arch/powerpc/platforms/powernv/copy-paste.h | 74 +
 arch/powerpc/platforms/powernv/vas-window.c | 52 
 arch/powerpc/platforms/powernv/vas.h| 15 ++
 5 files changed, 155 insertions(+)
 create mode 100644 arch/powerpc/platforms/powernv/copy-paste.h

diff --git a/MAINTAINERS b/MAINTAINERS
index edc58c9..c3f156c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6430,6 +6430,7 @@ M:Sukadev Bhattiprolu
 L: linuxppc-dev@lists.ozlabs.org
 S: Supported
 F: arch/powerpc/platforms/powernv/vas*
+F: arch/powerpc/platforms/powernv/copy-paste.h
 F: arch/powerpc/include/asm/vas.h
 F: arch/powerpc/include/uapi/asm/vas.h
 
diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index 3fc6435..f9779c4 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -140,4 +140,17 @@ struct vas_window *vas_tx_win_open(int vasid, enum 
vas_cop_type cop,
  */
 int vas_win_close(struct vas_window *win);
 
+/*
+ * Copy the co-processor request block (CRB) @crb into the local L2 cache.
+ * For now, @offset must be 0 and @first must be true.
+ */
+extern int vas_copy_crb(void *crb, int offset, bool first);
+
+/*
+ * Paste a previously copied CRB (see vas_copy_crb()) from the L2 cache to
+ * the hardware address associated with the window @win. For now, @off must
+ * be 0 and @last must be true. @re is expected/assumed to be true for NX windows.
+ */
+extern int vas_paste_crb(struct vas_window *win, int off, bool last, bool re);
+
 #endif /* _MISC_VAS_H */
diff --git a/arch/powerpc/platforms/powernv/copy-paste.h 
b/arch/powerpc/platforms/powernv/copy-paste.h
new file mode 100644
index 000..7783bb8
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/copy-paste.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2016 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Macros taken from tools/testing/selftests/powerpc/context_switch/cp_abort.c
+ */
+#define PASTE(RA, RB, L, RC) \
+   .long (0x7c00070c | (RA) << (31-15) | (RB) << (31-20) \
+ | (L) << (31-10) | (RC) << (31-31))
+
+#define COPY(RA, RB, L) \
+   .long (0x7c00060c | (RA) << (31-15) | (RB) << (31-20) \
+ | (L) << (31-10))
+
+#define CR0_FXM"0x80"
+#define CR0_SHIFT  28
+#define CR0_MASK   0xF
+/*
+ * Copy/paste instructions:
+ *
+ * copy RA,RB,L
+ * Copy contents of address (RA) + effective_address(RB)
+ * to internal copy-buffer.
+ *
+ * L == 1 indicates this is the first copy.
+ *
+ * L == 0 indicates its a continuation of a prior first copy.
+ *
+ * paste RA,RB,L
+ * Paste contents of internal copy-buffer to the address
+ * (RA) + effective_address(RB)
+ *
+ * L == 0 indicates its a continuation of a prior paste. i.e.
+ * don't wait for the completion or update status.
+ *
+ * L == 1 indicates this is the last paste in the group (i.e.
+ * wait for the group to complete and update status in CR0).
+ *
+ * For Power9, the L bit must be 'true' in both copy and paste.
+ */
+
+static inline int vas_copy(void *crb, int offset, int first)
+{
+   WARN_ON_ONCE(!first);
+
+   __asm__ __volatile(stringify_in_c(COPY(%0, %1, %2))";"
+   :
+   : "b" (offset), "b" (crb), "i" (1)
+   : "memory");
+
+   return 0;
+}
+
+static inline int vas_paste(void *paste_address, int offset, int last)
+{
+   unsigned long long cr;
+
+   WARN_ON_ONCE(!last);
+
+   cr = 0;
+   __asm__ __volatile(stringify_in_c(PASTE(%1, %2, 1, 1))";"
+   "mfocrf %0," CR0_FXM ";"
+   : "=r" (cr)
+   : "b" (paste_address), "b" (offset)
+   : "memory");
+
+   return cr;
+}
diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 3e2655c..63367c7 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -16,6 +16,7 @@
 #include 
 
 #include "vas.h"
+#include "copy-paste.h"
 
 /*
  * Compute the paste address region for the window @window using the
@@ -1084,6 +1085,57 @@

[PATCH v6 14/17] powerpc: Add support for setting SPRN_TIDR

2017-08-08 Thread Sukadev Bhattiprolu
We need the SPRN_TIDR to be set for use with fast thread-wakeup
(core-to-core wakeup).  Each thread in a process needs to have a
unique id within the process but as explained below, for now, we
assign globally unique thread ids to all threads in the system.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/include/asm/processor.h |  4 ++
 arch/powerpc/kernel/process.c| 74 
 2 files changed, 78 insertions(+)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index fab7ff8..bf6ba63 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -232,6 +232,10 @@ struct debug_reg {
 struct thread_struct {
unsigned long   ksp;/* Kernel stack pointer */
 
+#ifdef CONFIG_PPC_VAS
+   unsigned long   tidr;
+#endif
+
 #ifdef CONFIG_PPC64
unsigned long   ksp_vsid;
 #endif
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 9f3e2c9..6123859 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1213,6 +1213,16 @@ struct task_struct *__switch_to(struct task_struct *prev,
hard_irq_disable();
}
 
+#ifdef CONFIG_PPC_VAS
+   mtspr(SPRN_TIDR, new->thread.tidr);
+#endif
+   /*
+* We can't take a PMU exception inside _switch() since there is a
+* window where the kernel stack SLB and the kernel stack are out
+* of sync. Hard disable here.
+*/
+   hard_irq_disable();
+
/*
 * Call restore_sprs() before calling _switch(). If we move it after
 * _switch() then we miss out on calling it for new tasks. The reason
@@ -1449,9 +1459,70 @@ void flush_thread(void)
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
 }
 
+#ifdef CONFIG_PPC_VAS
+static DEFINE_SPINLOCK(vas_thread_id_lock);
+static DEFINE_IDA(vas_thread_ida);
+
+/*
+ * We need to assign a unique thread id to each thread in a process. This
+ * thread id is intended to be used with the Fast Thread-wakeup (aka Core-
+ * to-core wakeup) mechanism being implemented on top of Virtual Accelerator
+ * Switchboard (VAS).
+ *
+ * To get a unique thread-id per process we could simply use task_pid_nr()
+ * but the problem is that task_pid_nr() is not yet available for the thread
+ * when copy_thread() is called. Fixing that would require more intrusive
+ * changes to arch-neutral code in the copy_process() code path.
+ *
+ * Further, to assign unique thread ids within each process, we need an
+ * atomic field (or an IDR) in task_struct, which again intrudes into the
+ * arch-neutral code.
+ *
+ * So try to assign globally unique thread ids for now.
+ */
+static int assign_thread_id(void)
+{
+   int index;
+   int err;
+
+again:
+   if (!ida_pre_get(&vas_thread_ida, GFP_KERNEL))
+   return -ENOMEM;
+
+   spin_lock(&vas_thread_id_lock);
+   err = ida_get_new_above(&vas_thread_ida, 1, &index);
+   spin_unlock(&vas_thread_id_lock);
+
+   if (err == -EAGAIN)
+   goto again;
+   else if (err)
+   return err;
+
+   if (index > MAX_USER_CONTEXT) {
+   spin_lock(&vas_thread_id_lock);
+   ida_remove(&vas_thread_ida, index);
+   spin_unlock(&vas_thread_id_lock);
+   return -ENOMEM;
+   }
+
+   return index;
+}
+
+static void free_thread_id(int id)
+{
+   spin_lock(&vas_thread_id_lock);
+   ida_remove(&vas_thread_ida, id);
+   spin_unlock(&vas_thread_id_lock);
+}
+#endif /* CONFIG_PPC_VAS */
+
+
 void
 release_thread(struct task_struct *t)
 {
+#ifdef CONFIG_PPC_VAS
+   free_thread_id(t->thread.tidr);
+#endif
 }
 
 /*
@@ -1587,6 +1658,9 @@ int copy_thread(unsigned long clone_flags, unsigned long 
usp,
 #endif
 
setup_ksp_vsid(p, sp);
+#ifdef CONFIG_PPC_VAS
+   p->thread.tidr = assign_thread_id();
+#endif
 
 #ifdef CONFIG_PPC64 
if (cpu_has_feature(CPU_FTR_DSCR)) {
-- 
2.7.4



[PATCH v6 15/17] powerpc/vas: Define window open ioctls API

2017-08-08 Thread Sukadev Bhattiprolu
Define the VAS_TX_WIN_OPEN and VAS_RX_WIN_OPEN ioctl interface. Each user
of VAS, like the NX-FTW driver in a follow-on patch, should implement
these ioctls.
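A sketch of the intended user space flow, based on the NX-FTW driver added
later in this series (the device path comes from that driver; the version
value and error handling are illustrative assumptions):

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <asm/vas.h>

	struct vas_rx_win_open_attr rxattr = { .version = 1, .vas_id = 0 };
	struct vas_tx_win_open_attr txattr = { .version = 1, .vas_id = 0 };
	int fd;

	fd = open("/dev/crypto/nx-ftw", O_RDWR);

	if (fd >= 0 && ioctl(fd, VAS_RX_WIN_OPEN, &rxattr) == 0) {
		/* Pair the send window with the receive window just opened */
		txattr.rx_win_handle = rxattr.rx_win_handle;
		ioctl(fd, VAS_TX_WIN_OPEN, &txattr);
		/* mmap() of fd then provides the paste address (see nx-ftw) */
	}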

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/include/uapi/asm/vas.h | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/arch/powerpc/include/uapi/asm/vas.h 
b/arch/powerpc/include/uapi/asm/vas.h
index 21249f5..e9730fb 100644
--- a/arch/powerpc/include/uapi/asm/vas.h
+++ b/arch/powerpc/include/uapi/asm/vas.h
@@ -10,6 +10,8 @@
 #ifndef _UAPI_MISC_VAS_H
 #define _UAPI_MISC_VAS_H
 
+#include 
+
 /*
  * Threshold Control Mode: Have paste operation fail if the number of
  * requests in receive FIFO exceeds a threshold.
@@ -22,6 +24,34 @@
 #define VAS_THRESH_FIFO_GT_QTR_FULL	2
 #define VAS_THRESH_FIFO_GT_EIGHTH_FULL 3
 
+#define VAS_FLAGS_PIN_WINDOW   0x1
+#define VAS_FLAGS_HIGH_PRI 0x2
+
+#define VAS_TX_WIN_OPEN		_IOW('v', 1, struct vas_tx_win_open_attr)
+#define VAS_RX_WIN_OPEN		_IOW('v', 2, struct vas_rx_win_open_attr)
+
+struct vas_tx_win_open_attr {
+   int16_t version;
+   int16_t vas_id;
+   uint32_t        rx_win_handle;
+
+   int64_t reserved1;
+
+   int64_t flags;
+   int64_t reserved2;
+
+   int32_t tc_mode;
+   int32_t rsvd_txbuf;
+   int64_t reserved3[6];
+};
+
+struct vas_rx_win_open_attr {
+   int16_t version;
+   int16_t vas_id;
+   uint32_t        rx_win_handle;  /* output field */
+   int64_t reserved[8];
+};
+
 /*
  * Get/Set bit fields
  */
-- 
2.7.4



[PATCH v6 16/17] powerpc/vas: Implement a simple FTW driver

2017-08-08 Thread Sukadev Bhattiprolu
The Fast Thread Wake-up (FTW) driver provides user space applications an
interface to the Core-to-Core functionality in POWER9. The driver provides
the device node/ioctl API to applications and uses the external interfaces
to the VAS driver to interact with the VAS hardware.

A follow-on patch provides detailed description of the API for the driver.

Signed-off-by: Sukadev Bhattiprolu 
---
 MAINTAINERS |   1 +
 arch/powerpc/platforms/powernv/Kconfig  |  16 ++
 arch/powerpc/platforms/powernv/Makefile |   1 +
 arch/powerpc/platforms/powernv/nx-ftw.c | 486 
 4 files changed, 504 insertions(+)
 create mode 100644 arch/powerpc/platforms/powernv/nx-ftw.c

diff --git a/MAINTAINERS b/MAINTAINERS
index c3f156c..a45c0c4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6431,6 +6431,7 @@ L:linuxppc-dev@lists.ozlabs.org
 S: Supported
 F: arch/powerpc/platforms/powernv/vas*
 F: arch/powerpc/platforms/powernv/copy-paste.h
+F: arch/powerpc/platforms/powernv/nx-ftw*
 F: arch/powerpc/include/asm/vas.h
 F: arch/powerpc/include/uapi/asm/vas.h
 
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index f565454..67ea0ff 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -44,3 +44,19 @@ config PPC_VAS
  VAS adapters are found in POWER9 based systems.
 
  If unsure, say N.
+
+config PPC_FTW
+   bool "IBM Fast Thread-Wakeup (FTW)"
+   depends on PPC_VAS
+   default n
+   help
+ This enables support for the IBM Fast Thread-Wakeup driver.
+
+ The FTW driver allows applications to utilize a low overhead
+ core-to-core wake-up mechanism in the IBM Virtual Accelerator
+ Switchboard (VAS) to improve performance.
+
+ VAS adapters are found in POWER9 based systems and are required
+ for the FTW driver to be operational.
+
+ If unsure, say N.
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index e4db292..dc60046 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -13,3 +13,4 @@ obj-$(CONFIG_MEMORY_FAILURE)  += opal-memory-errors.o
 obj-$(CONFIG_TRACEPOINTS)  += opal-tracepoints.o
 obj-$(CONFIG_OPAL_PRD) += opal-prd.o
 obj-$(CONFIG_PPC_VAS)  += vas.o vas-window.o
+obj-$(CONFIG_PPC_FTW)  += nx-ftw.o
diff --git a/arch/powerpc/platforms/powernv/nx-ftw.c b/arch/powerpc/platforms/powernv/nx-ftw.c
new file mode 100644
index 000..a0b6388
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/nx-ftw.c
@@ -0,0 +1,486 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * NX-FTW is a device driver used to provide user space access to the
+ * Core-to-Core aka Fast Thread Wakeup (FTW) functionality provided by
+ * the Virtual Accelerator Switchboard (VAS) in POWER9 systems. See also
+ * arch/powerpc/platforms/powernv/vas*.
+ *
+ * The driver creates the device node /dev/crypto/nx-ftw that can be
+ * used as follows:
+ *
+ * fd = open("/dev/crypto/nx-ftw", O_RDWR);
+ * rc = ioctl(fd, VAS_RX_WIN_OPEN, &rxattr);
+ * rc = ioctl(fd, VAS_TX_WIN_OPEN, &txattr);
+ * paste_addr = mmap(NULL, PAGE_SIZE, prot, MAP_SHARED, fd, 0ULL).
+ * vas_copy(&crb, 0, 1);
+ * vas_paste(paste_addr, 0, 1);
+ *
+ * where "vas_copy" and "vas_paste" are defined in copy-paste.h.
+ */
+
+static char *nxftw_dev_name = "nx-ftw";
+static atomic_t nxftw_instid = ATOMIC_INIT(0);
+static dev_t nxftw_devt;
+static struct dentry *nxftw_debugfs;
+static struct class *nxftw_dbgfs_class;
+
+/*
+ * Wrapper object for the nx-ftw device node - there is just one
+ * instance of this node for the whole system.
+ */
+struct nxftw_dev {
+   struct cdev cdev;
+   struct device *device;
+   char *name;
+   atomic_t refcount;
+} nxftw_device;
+
+/*
+ * One instance per open of a nx-ftw device. Each nxftw_instance is
+ * associated with a VAS window, after the caller issues VAS_RX_WIN_OPEN
+ * or VAS_TX_WIN_OPEN ioctl.
+ */
+struct nxftw_instance {
+   int instance;
+   bool tx_win;
+   struct vas_window *window;
+};
+
+#define VAS_DEFAULT_VAS_ID 0
+#define POWERNV_LPID   0   /* TODO: For VM/KVM guests? */
+
+static char *nxftw_devnode(struct device *dev, umode_t *mode)
+{
+   return kasprintf(GFP_KERNEL, "crypto/%s", dev_name(dev));
+}
+
+static int nxftw_open(struct inode *inode, struct file *fp)
+{
+   int minor;
+   struct nxftw_instance *nxti;
+
+   minor = MINOR(inode->i_rdev);
+
+   nxti = kzalloc(sizeof(*nxti), GFP_KERNEL);
+   if (!nxti)
+   return -ENOMEM;
+
+   nxti->instance = atomic_inc_return(&nxftw_instid);
+   nxti->w

[PATCH v6 17/17] powerpc/vas: Document FTW API/usage

2017-08-08 Thread Sukadev Bhattiprolu
Document the usage of the VAS Fast thread-wakeup API.

Thanks for input/comments from Benjamin Herrenschmidt, Michael Neuling,
Michael Ellerman, Robert Blackmore, Ian Munsie, Haren Myneni, Paul Mackerras.

Cc: Ian Munsie 
Cc: Paul Mackerras 
Signed-off-by: Sukadev Bhattiprolu 
---
 Documentation/powerpc/ftw-api.txt | 373 ++
 1 file changed, 373 insertions(+)
 create mode 100644 Documentation/powerpc/ftw-api.txt

diff --git a/Documentation/powerpc/ftw-api.txt b/Documentation/powerpc/ftw-api.txt
new file mode 100644
index 000..0b3f16f
--- /dev/null
+++ b/Documentation/powerpc/ftw-api.txt
@@ -0,0 +1,373 @@
+Virtual Accelerator Switchboard and Fast Thread-Wakeup API
+
+The Power9 processor supports a hardware subsystem known as the Virtual
+Accelerator Switchboard (VAS), which allows two entities in the Power9
+system to efficiently exchange messages. Messages must be formatted as
+Coprocessor Request Blocks (CRBs) and be submitted using the COPY/PASTE
+instructions (new in Power9).
+
+Usage of VAS depends on the entities exchanging the messages and
+currently two usages have been identified.
+
+The first usage of VAS, referred to as VAS/NX, involves a software thread
+submitting data compression requests to a co-processor (hardware/nest
+accelerator), aka the NX engine. The API for this usage is described in
+the VAS/NX API document.
+
+Alternatively, VAS can be used by two software threads to efficiently
+exchange messages. Initially, this mechanism is intended to wake up a
+waiting thread quickly, i.e. "fast thread wake-up" (FTW). This document
+describes the user API for this VAS/FTW mechanism.
+
+Application access to the FTW mechanism is provided through the NX-FTW
+device node (/dev/crypto/nx-ftw) implemented by the VAS/FTW device
+driver.
+
+A software thread T1 that intends to wait for an event must first set up
+a receive window by opening the NX-FTW device and using the
+VAS_RX_WIN_OPEN ioctl. Upon successful return from the VAS_RX_WIN_OPEN
+ioctl, an rx_win_handle is returned.
+
+A software thread T2 that intends to wake up T1 at some point, must first
+set up a "send window" using the VAS_TX_WIN_OPEN ioctl and specify the
+rx_win_handle obtained by T1. After a successful VAS_TX_WIN_OPEN ioctl the
+send window of T2 is considered paired with the receive window of T1. The
+thread T2 must then use mmap() to obtain a "paste address" for the send
+window.
+
+With this set up, thread T1 can wait for an event using the WAIT
+instruction.
+
+Thread T2 can wake up T1 by using the "COPY/PASTE" instructions and
+submitting an empty/NULL CRB to the send window's paste address. The
+wait/wake up process can be repeated as long as the threads have the
+send/receive windows open.
+
+1. NX-FTW Device Node
+
+There is one /dev/crypto/nx-ftw node in the system and it provides
+access to the VAS/FTW functionality.
+
+The only valid operations on the NX-FTW node are:
+
+- open() the device for read and write.
+
+- issue either the VAS_RX_WIN_OPEN or the VAS_TX_WIN_OPEN ioctl to set up
+  a receive or a send window (only one of them per open).
+
+- if the open is associated with a send window (i.e. the VAS_TX_WIN_OPEN
+  ioctl was issued), mmap() the send window into the application's
+  virtual address space (i.e. get a 'paste_address' for the send
+  window).
+
+- close the device node.
+
+Other file operations on the NX-FTW node are undefined.
+
+Note that the COPY and PASTE operations go directly to the hardware
+and do not go through the NX-FTW device.
+
+Although a system may have several instances of VAS (typically, one per
+P9 chip), there is just one NX-FTW device node in the system.
+
+When the NX-FTW device node is opened, the kernel assigns a suitable
+instance of VAS to the process. The kernel will make a best-effort attempt
+to assign an optimal instance of VAS for the process. In the initial
+release, the kernel does not support migrating the VAS instance if the
+process migrates from a processor on one chip to a processor on another
+chip.
+
+Applications may choose a specific instance of the VAS using the 'vas_id'
+field in the VAS_TX_WIN_OPEN and VAS_RX_WIN_OPEN ioctls as detailed below.
+
+2. Open NX-FTW node
+
+The device should be opened for read and write. No special privileges
+are needed to open the device. The device may be opened multiple times.
+
+Each open() of the NX-FTW device may be associated with either a send
+window or receive window but not both.
+
+See open(2) system call man pages for other details such as return
+values, error codes and restrictions.
+
+3. Setup Receive window (VAS_RX_WIN_OPEN ioctl)
+
+A thread that expects to wait for events and be woken up using COPY/PAST
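
Putting the pieces described above together, a minimal user-space sketch of
the pairing flow might look as follows. This is not part of the patch set:
field values such as version are assumptions, both threads' steps are shown
in one function only for brevity, and the POWER9 WAIT and COPY/PASTE steps
are left as comments because their instruction sequences are outside the
scope of this document.

/* Hypothetical user-space sketch of the VAS/FTW pairing flow. */
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <asm/vas.h>	/* assumes the uapi header installs as <asm/vas.h> */

int main(void)
{
	struct vas_rx_win_open_attr rxattr;
	struct vas_tx_win_open_attr txattr;
	void *paste_addr;
	int rx_fd, tx_fd;

	/* T1: open the device and set up a receive window. */
	rx_fd = open("/dev/crypto/nx-ftw", O_RDWR);
	memset(&rxattr, 0, sizeof(rxattr));
	rxattr.version = 1;		/* assumed ABI version */
	if (ioctl(rx_fd, VAS_RX_WIN_OPEN, &rxattr) < 0)
		return 1;
	/* rxattr.rx_win_handle now identifies T1's receive window. */
	/* ... T1 would later execute the POWER9 WAIT instruction to sleep ... */

	/* T2: open the device and pair a send window with T1's window. */
	tx_fd = open("/dev/crypto/nx-ftw", O_RDWR);
	memset(&txattr, 0, sizeof(txattr));
	txattr.version = 1;		/* assumed ABI version */
	txattr.rx_win_handle = rxattr.rx_win_handle;
	if (ioctl(tx_fd, VAS_TX_WIN_OPEN, &txattr) < 0)
		return 1;

	/* T2: map the send window to obtain its paste address. */
	paste_addr = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
			  MAP_SHARED, tx_fd, 0);
	if (paste_addr == MAP_FAILED)
		return 1;

	/* ... T2 wakes T1 by COPYing an empty CRB and PASTEing it to
	 * paste_addr; the instruction sequences are not shown here ...
	 */
	return 0;
}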

Re: [PATCH] powerpc: xive: ensure active irqd when setting affinity

2017-08-08 Thread Sukadev Bhattiprolu
Michael Ellerman [m...@ellerman.id.au] wrote:
> Sukadev Bhattiprolu  writes:
> 
> > From fd0abf5c61b6041fdb75296e8580b86dc91d08d6 Mon Sep 17 00:00:00 2001
> > From: Benjamin Herrenschmidt 
> > Date: Tue, 1 Aug 2017 20:54:41 -0500
> > Subject: [PATCH] powerpc: xive: ensure active irqd when setting affinity
> >
> > Ensure irqd is active before attempting to set affinity. This should
> > make the set affinity code more robust. For instance, this prevents
> > these messages seen on a 4.12 based kernel when taking cpus offline:
> >
> >[  123.053037264,3] XIVE[ IC 00  ] ISN 2 lead to invalid IVE !
> >[   77.885859] xive: Error -6 reconfiguring irq 17
> >[   77.885862] IRQ17: set affinity failed(-6).
> >
> > The underlying problem with taking cpus offline was fixed in 4.13-rc1 by:
> >
> >commit 91f26cb4cd3c ("genirq/cpuhotplug: Do not migrated shutdown irqs")
> 
> So do we still need this? Or is the above only a partial fix?

It would be good to have this fix.

Commit 91f26cb4cd3c fixes the problem, so we won't see the errors with
that commit applied. But if such a problem were to show up again, xive
would handle it earlier, before hitting those errors.

Sukadev

> 
> I'm a bit confused.
> 
> cheers



Re: [PATCH 16/16] perf tools: Add support for SPF events

2017-08-08 Thread Anshuman Khandual
On 08/08/2017 08:05 PM, Laurent Dufour wrote:
> Add support for the new speculative faults events.
> 
> Signed-off-by: Laurent Dufour 
> ---
>  tools/include/uapi/linux/perf_event.h | 2 ++
>  tools/perf/util/evsel.c   | 2 ++
>  tools/perf/util/parse-events.c| 8 
>  tools/perf/util/parse-events.l| 2 ++
>  tools/perf/util/python.c  | 2 ++
>  5 files changed, 16 insertions(+)
> 
> diff --git a/tools/include/uapi/linux/perf_event.h 
> b/tools/include/uapi/linux/perf_event.h
> index b1c0b187acfe..fbfb03dff334 100644
> --- a/tools/include/uapi/linux/perf_event.h
> +++ b/tools/include/uapi/linux/perf_event.h
> @@ -111,6 +111,8 @@ enum perf_sw_ids {
>   PERF_COUNT_SW_EMULATION_FAULTS  = 8,
>   PERF_COUNT_SW_DUMMY = 9,
>   PERF_COUNT_SW_BPF_OUTPUT= 10,
> + PERF_COUNT_SW_SPF_DONE  = 11,
> + PERF_COUNT_SW_SPF_FAILED= 12,
>  

PERF_COUNT_SW_SPF_FAULTS makes sense but not the FAILED one. IIRC,
there is no error-path counting in perf SW events at the moment.
SPF_FAULTS and SPF_FAILS are VM-internal events like THP collapse
etc. IMHO they should be added as VM statistics counters or as
tracepoint events instead.
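
For context on the alternative suggested here: a VM statistics counter is
normally added by extending enum vm_event_item and calling count_vm_event()
at the site of interest. The sketch below is rough and hypothetical; the
SPECULATIVE_PGFAULT and SPECULATIVE_PGFAULT_FAILED names do not exist
upstream, and a matching entry in mm/vmstat.c's vmstat_text would also be
needed.

/* Hypothetical sketch: counting SPF outcomes as VM statistics rather
 * than perf software events.
 */

/* include/linux/vm_event_item.h */
enum vm_event_item {
	/* ... existing items ... */
	SPECULATIVE_PGFAULT,		/* hypothetical */
	SPECULATIVE_PGFAULT_FAILED,	/* hypothetical */
	NR_VM_EVENT_ITEMS
};

/* In the speculative fault path; count_vm_event() comes from <linux/vmstat.h>. */
static inline void count_spf_event(bool failed)
{
	count_vm_event(failed ? SPECULATIVE_PGFAULT_FAILED : SPECULATIVE_PGFAULT);
}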



Re: [PATCH] powerpc/include/asm: Remove unused 64bit cacheflush function

2017-08-08 Thread Andrew Donnellan

On 20/07/17 16:25, Matt Brown wrote:

The flush_dcache_phys_range function is no longer used in the kernel.
This patch removes and cleans up the function.

Signed-off-by: Matt Brown 


That does indeed look unused.

Reviewed-by: Andrew Donnellan 

--
Andrew Donnellan  OzLabs, ADL Canberra
andrew.donnel...@au1.ibm.com  IBM Australia Limited



Re: [PATCH 3/3] powerpc/mm: Mark __init memory no-execute when STRICT_KERNEL_RWX=y

2017-08-08 Thread Michael Ellerman
Christophe LEROY  writes:
> Le 14/07/2017 à 08:51, Michael Ellerman a écrit :
>> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
>> b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> index c0737c86a362..3d562b210c65 100644
>> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
>> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> @@ -1192,5 +1192,12 @@ static inline const int pud_pfn(pud_t pud)
>>  BUILD_BUG();
>>  return 0;
>>   }
>> +
>> +#ifdef CONFIG_STRICT_KERNEL_RWX
>> +void mark_initmem_nx(void);
>> +#else
>> +static inline void mark_initmem_nx(void) { }
>> +#endif
>> +
>
> Why do we want to limit that to CONFIG_STRICT_KERNEL_RWX ?
> Only the kernel text is marked X, even without CONFIG_STRICT_KERNEL_RWX 
> (at least on PPC32), so I believe we should clear X on init text in any 
> case, shouldn't we ?

You're right, but ..

On 64-bit when STRICT_KERNEL_RWX=n we make no effort to ensure the
start/end of the init text is on a page boundary.

eg. on 64-bit hash we will typically use a 16M page to map the whole
kernel, text/data/init_text/etc.

So yes we *should* always mark it no-execute but in practice we can't
because it's not page aligned.

But if that's different on (some?) 32-bit then we could introduce a new
CONFIG symbol that is enabled in the right cases.

cheers


RE: [PATCH v10 4/4] irqchip/qeic: remove PPCisms for QEIC

2017-08-08 Thread Qiang Zhao
On Tue 8/8/2017 6:05 PM, Michael Ellerman  wrote:

> -Original Message-
> From: Michael Ellerman [mailto:m...@ellerman.id.au]
> Sent: Tuesday, August 08, 2017 6:05 PM
> To: Qiang Zhao ; t...@linutronix.de
> Cc: o...@buserror.net; linuxppc-dev@lists.ozlabs.org; Xiaobo Xie
> ; linux-ker...@vger.kernel.org
> Subject: RE: [PATCH v10 4/4] irqchip/qeic: remove PPCisms for QEIC
> 
> Qiang Zhao  writes:
> 
> > On Mon 8/7/2017 3:02 PM, Michael Ellerman  wrote:
> >
> >> -Original Message-
> >> From: Michael Ellerman [mailto:m...@ellerman.id.au]
> >> Sent: Monday, August 07, 2017 3:02 PM
> >> To: Qiang Zhao ; t...@linutronix.de
> >> Cc: o...@buserror.net; Qiang Zhao ; linuxppc-
> >> d...@lists.ozlabs.org; Xiaobo Xie ; linux-
> >> ker...@vger.kernel.org
> >> Subject: Re: [PATCH v10 4/4] irqchip/qeic: remove PPCisms for QEIC
> >>
> >> Zhao Qiang  writes:
> >>
> >> > QEIC was supported on PowerPC, and dependent on PPC, Now it is
> >> > supported on other platforms, so remove PPCisms.
> >> >
> >> > Signed-off-by: Zhao Qiang 
> >> > ---
> >> >  arch/powerpc/platforms/83xx/km83xx.c  |   1 -
> >> >  arch/powerpc/platforms/83xx/misc.c|   1 -
> >> >  arch/powerpc/platforms/83xx/mpc832x_mds.c |   1 -
> >> >  arch/powerpc/platforms/83xx/mpc832x_rdb.c |   1 -
> >> >  arch/powerpc/platforms/83xx/mpc836x_mds.c |   1 -
> >> >  arch/powerpc/platforms/83xx/mpc836x_rdk.c |   1 -
> >> >  arch/powerpc/platforms/85xx/corenet_generic.c |   1 -
> >> >  arch/powerpc/platforms/85xx/mpc85xx_mds.c |   1 -
> >> >  arch/powerpc/platforms/85xx/mpc85xx_rdb.c |   1 -
> >> >  arch/powerpc/platforms/85xx/twr_p102x.c   |   1 -
> >> >  drivers/irqchip/irq-qeic.c| 188 
> >> > +++---
> >> >  include/soc/fsl/qe/qe_ic.h| 132 --
> >> >  12 files changed, 80 insertions(+), 250 deletions(-)  delete mode
> >> > 100644 include/soc/fsl/qe/qe_ic.h
> >> >
> >> > diff --git a/arch/powerpc/platforms/83xx/km83xx.c
> >> > b/arch/powerpc/platforms/83xx/km83xx.c
> >> > index d8642a4..b1cef0a 100644
> >> > --- a/arch/powerpc/platforms/83xx/km83xx.c
> >> > +++ b/arch/powerpc/platforms/83xx/km83xx.c
> >> > @@ -38,7 +38,6 @@
> >> >  #include 
> >> >  #include 
> >> >  #include 
> >> > -#include 
> >>
> >> You deleted that file in patch 2. So didn't you just break the build
> >> for the last two commits?
> >
> > Sorry, I am not sure what you said. Could you explain?
> 
> Don't worry about it. I was confused by the fact that we have both:
> 
>   drivers/soc/fsl/qe/qe_ic.h
> 
> and:
> 
>   include/soc/fsl/qe/qe_ic.h
> 
> cheers

I think this is an issue left over from history.
In the patch with commit id 7aa1aa6ecec2af19d9aa85430ce3e56119e21626, I just
moved them out of arch/powerpc.
We may need to ask the original author why there are two qe_ic.h files.

Best Regards
Qiang Zhao


Re: [PATCH 08/10] powerpc/xive: take into account '/ibm,plat-res-int-priorities'

2017-08-08 Thread David Gibson
On Tue, Aug 08, 2017 at 10:56:18AM +0200, Cédric Le Goater wrote:
> '/ibm,plat-res-int-priorities' contains a list of priorities that the
> hypervisor has reserved for its own use. Scan these ranges to choose
> the lowest unused priority for the xive spapr backend.
> 
> Signed-off-by: Cédric Le Goater 
> ---
>  arch/powerpc/sysdev/xive/spapr.c | 62 
> +++-
>  1 file changed, 61 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/sysdev/xive/spapr.c 
> b/arch/powerpc/sysdev/xive/spapr.c
> index 7fc40047c23d..220331986bd8 100644
> --- a/arch/powerpc/sysdev/xive/spapr.c
> +++ b/arch/powerpc/sysdev/xive/spapr.c
> @@ -532,13 +532,70 @@ static const struct xive_ops xive_spapr_ops = {
>   .name   = "spapr",
>  };
>  
> +/*
> + * get max priority from "/ibm,plat-res-int-priorities"
> + */
> +static bool xive_get_max_prio(u8 *max_prio)
> +{
> + struct device_node *rootdn;
> + const __be32 *reg;
> + u32 len;
> + int prio, found;
> +
> + rootdn = of_find_node_by_path("/");
> + if (!rootdn) {
> + pr_err("not root node found !\n");
> + return false;
> + }
> +
> + reg = of_get_property(rootdn, "ibm,plat-res-int-priorities", &len);
> + if (!reg) {
> + pr_err("Failed to read 'ibm,plat-res-int-priorities' 
> property\n");
> + return false;
> + }
> +
> + if (len % (2 * sizeof(u32)) != 0) {
> + pr_err("invalid 'ibm,plat-res-int-priorities' property\n");
> + return false;
> + }
> +
> + /* HW supports priorities in the range [0-7] and 0xFF is a
> +  * wildcard priority used to mask. We scan the ranges reserved
> +  * by the hypervisor to find the lowest priority we can use.
> +  */
> + found = 0xFF;
> + for (prio = 0; prio < 8; prio++) {
> + int reserved = 0;
> + int i;
> +
> + for (i = 0; i < len / (2 * sizeof(u32)); i++) {
> + int base  = be32_to_cpu(reg[2 * i]);
> + int range = be32_to_cpu(reg[2 * i + 1]);
> +
> + if (prio >= base && prio < base + range)
> + reserved++;
> + }
> +
> + if (!reserved)
> + found = prio;

So you continue the loop here, rather than using break.  Which means
found will be the highest valued priority that's not reserved.  Is
that what you intended?  The commit message says you find the lowest
unused, but do lower numbers mean higher priorities or the other way around?

> + }
> +
> + if (found == 0xFF) {
> + pr_err("no valid priority found in 
> 'ibm,plat-res-int-priorities'\n");
> + return false;
> + }
> +
> + *max_prio = found;
> + return true;
> +}
> +
>  bool xive_spapr_init(void)
>  {
>   struct device_node *np;
>   struct resource r;
>   void __iomem *tima;
>   struct property *prop;
> - u8 max_prio = 7;
> + u8 max_prio;
>   u32 val;
>   u32 len;
>   const __be32 *reg;
> @@ -566,6 +623,9 @@ bool xive_spapr_init(void)
>   return false;
>   }
>  
> + if (!xive_get_max_prio(&max_prio))
> + return false;
> +
>   /* Feed the IRQ number allocator with the ranges given in the DT */
>   reg = of_get_property(np, "ibm,xive-lisn-ranges", &len);
>   if (!reg) {

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature
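
To make the review question above concrete: as written, the outer loop keeps
overwriting 'found', so it ends up holding the highest-numbered non-reserved
priority. If the intent were to stop at the first (lowest-numbered) unused
value, the scan would break on the first hit, roughly as in the sketch below,
which is derived from the quoted hunk (the helper name is made up). Which of
the two behaviours is correct depends on whether lower XIVE priority numbers
mean higher or lower priority, which is exactly the point being raised.

/* Sketch of the break-on-first-free variant of the scan quoted above;
 * assumes the same reg/len pair read from 'ibm,plat-res-int-priorities'.
 */
static bool xive_get_min_unreserved_prio(const __be32 *reg, u32 len, u8 *out)
{
	int prio, i;

	for (prio = 0; prio < 8; prio++) {
		bool reserved = false;

		for (i = 0; i < len / (2 * sizeof(u32)); i++) {
			int base  = be32_to_cpu(reg[2 * i]);
			int range = be32_to_cpu(reg[2 * i + 1]);

			if (prio >= base && prio < base + range)
				reserved = true;
		}

		if (!reserved) {
			*out = prio;	/* first, i.e. lowest-numbered, free priority */
			return true;
		}
	}

	return false;
}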


  1   2   >