date:20181213

Re: [PATCH 0/2] of: phandle_cache, fix refcounts, remove stale entry

2018-12-13 Thread Frank Rowand

Hi Michael Bringmann,

On 12/13/18 10:42 PM, frowand.l...@gmail.com wrote:
> From: Frank Rowand 
> 
> Non-overlay dynamic devicetree node removal may leave the node in
> the phandle cache.  Subsequent calls to of_find_node_by_phandle()
> will incorrectly find the stale entry.  This bug exposed the foloowing
> phandle cache refcount bug.
> 
> The refcount of phandle_cache entries is not incremented while in
> the cache, allowing use after free error after kfree() of the
> cached entry.
> 
> Frank Rowand (2):
>   of: of_node_get()/of_node_put() nodes held in phandle cache
>   of: __of_detach_node() - remove node from phandle cache
> 
>  drivers/of/base.c   | 99 
> -
>  drivers/of/dynamic.c|  3 ++
>  drivers/of/of_private.h |  4 ++
>  3 files changed, 81 insertions(+), 25 deletions(-)
> 

Can you please test that these patches fix the problem that you
reported in:

[PATCH v03] powerpc/mobility: Fix node detach/rename problem

Thanks,

Frank

[PATCH 2/2] of: __of_detach_node() - remove node from phandle cache

2018-12-13 Thread frowand . list

From: Frank Rowand 

Non-overlay dynamic devicetree node removal may leave the node in
the phandle cache.  Subsequent calls to of_find_node_by_phandle()
will incorrectly find the stale entry.  Remove the node from the
cache.

Add paranoia checks in of_find_node_by_phandle() as a second level
of defense (do not return cached node if detached, do not add node
to cache if detached).

Reported-by: Michael Bringmann 
Signed-off-by: Frank Rowand 
---
 drivers/of/base.c   | 29 -
 drivers/of/dynamic.c|  3 +++
 drivers/of/of_private.h |  4 
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/of/base.c b/drivers/of/base.c
index d599367cb92a..34a5125713c8 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -162,6 +162,27 @@ int of_free_phandle_cache(void)
 late_initcall_sync(of_free_phandle_cache);
 #endif
 
+/*
+ * Caller must hold devtree_lock.
+ */
+void __of_free_phandle_cache_entry(phandle handle)
+{
+   phandle masked_handle;
+
+   if (!handle)
+   return;
+
+   masked_handle = handle & phandle_cache_mask;
+
+   if (phandle_cache) {
+   if (phandle_cache[masked_handle] &&
+   handle == phandle_cache[masked_handle]->phandle) {
+   of_node_put(phandle_cache[masked_handle]);
+   phandle_cache[masked_handle] = NULL;
+   }
+   }
+}
+
 void of_populate_phandle_cache(void)
 {
unsigned long flags;
@@ -1209,11 +1230,17 @@ struct device_node *of_find_node_by_phandle(phandle 
handle)
if (phandle_cache[masked_handle] &&
handle == phandle_cache[masked_handle]->phandle)
np = phandle_cache[masked_handle];
+   if (np && of_node_check_flag(np, OF_DETACHED)) {
+   of_node_put(np);
+   phandle_cache[masked_handle] = NULL;
+   np = NULL;
+   }
}
 
if (!np) {
for_each_of_allnodes(np)
-   if (np->phandle == handle) {
+   if (np->phandle == handle &&
+   !of_node_check_flag(np, OF_DETACHED)) {
if (phandle_cache) {
/* will put when removed from cache */
of_node_get(np);
diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index f4f8ed9b5454..ecea92f68c87 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -268,6 +268,9 @@ void __of_detach_node(struct device_node *np)
}
 
of_node_set_flag(np, OF_DETACHED);
+
+   /* race with of_find_node_by_phandle() prevented by devtree_lock */
+   __of_free_phandle_cache_entry(np->phandle);
 }
 
 /**
diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
index 5d1567025358..24786818e32e 100644
--- a/drivers/of/of_private.h
+++ b/drivers/of/of_private.h
@@ -84,6 +84,10 @@ static inline void __of_detach_node_sysfs(struct device_node 
*np) {}
 int of_resolve_phandles(struct device_node *tree);
 #endif
 
+#if defined(CONFIG_OF_DYNAMIC)
+void __of_free_phandle_cache_entry(phandle handle);
+#endif
+
 #if defined(CONFIG_OF_OVERLAY)
 void of_overlay_mutex_lock(void);
 void of_overlay_mutex_unlock(void);
-- 
Frank Rowand

[PATCH 1/2] of: of_node_get()/of_node_put() nodes held in phandle cache

2018-12-13 Thread frowand . list

From: Frank Rowand 

The phandle cache contains struct device_node pointers.  The refcount
of the pointers was not incremented while in the cache, allowing use
after free error after kfree() of the node.  Add the proper increment
and decrement of the use count.

Fixes: 0b3ce78e90fc ("of: cache phandle nodes to reduce cost of 
of_find_node_by_phandle()")

Signed-off-by: Frank Rowand 
---

do not "cc: stable", unless the following commits are also in stable:

  commit e54192b48da7 ("of: fix phandle cache creation for DTs with no 
phandles")
  commit b9952b5218ad ("of: overlay: update phandle cache on overlay apply and 
remove")
  commit 0b3ce78e90fc ("of: cache phandle nodes to reduce cost of 
of_find_node_by_phandle()")

 drivers/of/base.c | 70 ---
 1 file changed, 46 insertions(+), 24 deletions(-)

diff --git a/drivers/of/base.c b/drivers/of/base.c
index 09692c9b32a7..d599367cb92a 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -116,9 +116,6 @@ int __weak of_node_to_nid(struct device_node *np)
 }
 #endif
 
-static struct device_node **phandle_cache;
-static u32 phandle_cache_mask;
-
 /*
  * Assumptions behind phandle_cache implementation:
  *   - phandle property values are in a contiguous range of 1..n
@@ -127,6 +124,44 @@ int __weak of_node_to_nid(struct device_node *np)
  *   - the phandle lookup overhead reduction provided by the cache
  * will likely be less
  */
+
+static struct device_node **phandle_cache;
+static u32 phandle_cache_mask;
+
+/*
+ * Caller must hold devtree_lock.
+ */
+void __of_free_phandle_cache(void)
+{
+   u32 cache_entries = phandle_cache_mask + 1;
+   u32 k;
+
+   if (!phandle_cache)
+   return;
+
+   for (k = 0; k < cache_entries; k++)
+   of_node_put(phandle_cache[k]);
+
+   kfree(phandle_cache);
+   phandle_cache = NULL;
+}
+
+int of_free_phandle_cache(void)
+{
+   unsigned long flags;
+
+   raw_spin_lock_irqsave(&devtree_lock, flags);
+
+   __of_free_phandle_cache();
+
+   raw_spin_unlock_irqrestore(&devtree_lock, flags);
+
+   return 0;
+}
+#if !defined(CONFIG_MODULES)
+late_initcall_sync(of_free_phandle_cache);
+#endif
+
 void of_populate_phandle_cache(void)
 {
unsigned long flags;
@@ -136,8 +171,7 @@ void of_populate_phandle_cache(void)
 
raw_spin_lock_irqsave(&devtree_lock, flags);
 
-   kfree(phandle_cache);
-   phandle_cache = NULL;
+   __of_free_phandle_cache();
 
for_each_of_allnodes(np)
if (np->phandle && np->phandle != OF_PHANDLE_ILLEGAL)
@@ -155,30 +189,15 @@ void of_populate_phandle_cache(void)
goto out;
 
for_each_of_allnodes(np)
-   if (np->phandle && np->phandle != OF_PHANDLE_ILLEGAL)
+   if (np->phandle && np->phandle != OF_PHANDLE_ILLEGAL) {
+   of_node_get(np);
phandle_cache[np->phandle & phandle_cache_mask] = np;
+   }
 
 out:
raw_spin_unlock_irqrestore(&devtree_lock, flags);
 }
 
-int of_free_phandle_cache(void)
-{
-   unsigned long flags;
-
-   raw_spin_lock_irqsave(&devtree_lock, flags);
-
-   kfree(phandle_cache);
-   phandle_cache = NULL;
-
-   raw_spin_unlock_irqrestore(&devtree_lock, flags);
-
-   return 0;
-}
-#if !defined(CONFIG_MODULES)
-late_initcall_sync(of_free_phandle_cache);
-#endif
-
 void __init of_core_init(void)
 {
struct device_node *np;
@@ -1195,8 +1214,11 @@ struct device_node *of_find_node_by_phandle(phandle 
handle)
if (!np) {
for_each_of_allnodes(np)
if (np->phandle == handle) {
-   if (phandle_cache)
+   if (phandle_cache) {
+   /* will put when removed from cache */
+   of_node_get(np);
phandle_cache[masked_handle] = np;
+   }
break;
}
}
-- 
Frank Rowand

[PATCH 0/2] of: phandle_cache, fix refcounts, remove stale entry

2018-12-13 Thread frowand . list

From: Frank Rowand 

Non-overlay dynamic devicetree node removal may leave the node in
the phandle cache.  Subsequent calls to of_find_node_by_phandle()
will incorrectly find the stale entry.  This bug exposed the foloowing
phandle cache refcount bug.

The refcount of phandle_cache entries is not incremented while in
the cache, allowing use after free error after kfree() of the
cached entry.

Frank Rowand (2):
  of: of_node_get()/of_node_put() nodes held in phandle cache
  of: __of_detach_node() - remove node from phandle cache

 drivers/of/base.c   | 99 -
 drivers/of/dynamic.c|  3 ++
 drivers/of/of_private.h |  4 ++
 3 files changed, 81 insertions(+), 25 deletions(-)

-- 
Frank Rowand

Re: [PATCH] powerpc/prom: fix early DEBUG messages

2018-12-13 Thread Michael Ellerman

Christophe Leroy  writes:

> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
> index fe758cedb93f..d8e56e03c9c6 100644
> --- a/arch/powerpc/kernel/prom.c
> +++ b/arch/powerpc/kernel/prom.c
> @@ -749,7 +749,11 @@ void __init early_init_devtree(void *params)
>   memblock_allow_resize();
>   memblock_dump_all();
>  
> +#ifdef CONFIG_PHYS_64BIT
>   DBG("Phys. mem: %llx\n", memblock_phys_mem_size());
> +#else
> + DBG("Phys. mem: %x\n", memblock_phys_mem_size());
> +#endif

Can we just do:

DBG("Phys. mem: %llx\n", (unsigned long long)memblock_phys_mem_size());

?

cheers

lockdep WARN_ON in ppc440 with -next

2018-12-13 Thread Joel Stanley

Hello,

I was booting next-20181213 in qemu with lockdep enabled and saw this:

spin_lock-torture:--- Start of test [debug]: nwriters_stress=2
nreaders_stress=0 stat_interval=60 verbose=1 shuffle_interval=3
stutter=5 shutdown_secs=0 onoff_interval=0 onoff_holdoff=0
spin_lock-torture: Creating torture_shuffle task
spin_lock-torture: Creating torture_stutter task
spin_lock-torture: torture_shuffle task started
spin_lock-torture: Creating lock_torture_writer task
spin_lock-torture: torture_stutter task started
spin_lock-torture: Creating lock_torture_writer task
spin_lock-torture: lock_torture_writer task started
spin_lock-torture: Creating lock_torture_stats task
spin_lock-torture: lock_torture_writer task started
spin_lock-torture: lock_torture_stats task started
torture_init_begin: Refusing rcu init: spin_lock running.
torture_init_begin: One torture test at a time!
workingset: timestamp_bits=30 max_order=15 bucket_order=0
jffs2: version 2.2. (NAND) © 2001-2006 Red Hat, Inc.
[ cut here ]
DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)
WARNING: CPU: 0 PID: 379 at kernel/locking/lockdep.c:3765
check_flags+0x230/0x248
Modules linked in:
CPU: 0 PID: 379 Comm: modprobe Not tainted 4.20.0-rc6-next-20181213-dirty #1
NIP:  c007c7a4 LR: c007c7a4 CTR: 
REGS: c7b95de0 TRAP: 0700   Not tainted  (4.20.0-rc6-next-20181213-dirty)
MSR:  00021000   CR: 22000c22  XER: 2000

GPR00: c007c7a4 c7b95e90 c7b9a580 002f  0001 c0fed2d6 c0fed2d7
GPR08:  21637572 c0f741df 0004 22000c22 100d803e  
GPR16:        
GPR24: c7b94000    c0860f98  00029000 10003d38
NIP [c007c7a4] check_flags+0x230/0x248
LR [c007c7a4] check_flags+0x230/0x248
Call Trace:
[c7b95e90] [c007c7a4] check_flags+0x230/0x248 (unreliable)
[c7b95ea0] [c007f430] lock_is_held_type+0x4c/0x114
[c7b95ed0] [c066801c] __schedule+0x84/0x934
[c7b95f30] [c066893c] schedule+0x70/0x9c
[c7b95f40] [c000f928] recheck+0x0/0x24
--- interrupt: 901 at 0xb7bdbe18
LR = 0xb7b6da7c
Instruction dump:
3c80c08d 2c03 8084e69c 2c84 4e823342 41940020 3c60c076 3c80c076
4cc63182 3863c724 38840cf8 4bfbdd1d <0fe0> 3c60c076 38630d13 4cc63182
irq event stamp: 428
hardirqs last  enabled at (426): [] _raw_spin_unlock_irq+0x30/0x58
hardirqs last disabled at (427): [] __do_softirq+0x394/0x52c
softirqs last  enabled at (428): [] irq_exit+0xdc/0x100
softirqs last disabled at (419): [] irq_exit+0xdc/0x100
---[ end trace 3f1ce6c3406f3dee ]---
possible reason: unannotated irqs-on.
irq event stamp: 428
hardirqs last  enabled at (426): [] _raw_spin_unlock_irq+0x30/0x58
hardirqs last disabled at (427): [] __do_softirq+0x394/0x52c
softirqs last  enabled at (428): [] irq_exit+0xdc/0x100
softirqs last disabled at (419): [] irq_exit+0xdc/0x100
Block layer SCSI generic (bsg) driver version 0.4 loaded (major 253)
io scheduler mq-deadline registered
io scheduler kyber registered

I am running ppc44x_defconfig under qemu with the bamboo machine:

qemu-system-ppc -m 128m -machine bamboo -dtb
arch/powerpc/boot/dts/bamboo.dtb -append console=ttyS0 -initrd
images/ppc32/rootfs.cpio -nographic -kernel
linux-next/arch/powerpc/boot/zImage

[PATCH V4 8/8] KVM: PPC: Book3S HV: Allow passthrough of an emulated device to an L3 guest

2018-12-13 Thread Suraj Jitindar Singh

Previously when a device was being emulated by an L1 guest for an L2
guest, that device couldn't then be passed through to an L3 guest. This
was because the L1 guest had no method for accessing L3 memory.

The hcall H_COPY_TOFROM_GUEST provides this access. Thus this setup for
passthrough can now be allowed.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 9 -
 arch/powerpc/kvm/book3s_hv_nested.c| 5 -
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index da89d10e5886..8522b034a4b2 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -37,11 +37,10 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int 
pid,
int old_pid, old_lpid;
bool is_load = !!to;
 
-   /* Can't access quadrants 1 or 2 in non-HV mode */
-   if (kvmhv_on_pseries()) {
-   /* TODO h-call */
-   return -EPERM;
-   }
+   /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
+   if (kvmhv_on_pseries())
+   return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
+ __pa(to), __pa(from), n);
 
quadrant = 1;
if (!pid)
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index 5903175751b4..a9db12cbc0fa 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -1284,11 +1284,6 @@ static long int __kvmhv_nested_page_fault(struct kvm_run 
*run,
}
 
/* passthrough of emulated MMIO case */
-   if (kvmhv_on_pseries()) {
-   pr_err("emulated MMIO passthrough?\n");
-   return -EINVAL;
-   }
-
return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
}
if (memslot->flags & KVM_MEM_READONLY) {
-- 
2.13.6

[PATCH V4 7/8] KVM: PPC: Introduce new hcall H_COPY_TOFROM_GUEST to access quadrants 1 & 2

2018-12-13 Thread Suraj Jitindar Singh

A guest cannot access quadrants 1 or 2 as this would result in an
exception. Thus introduce the hcall H_COPY_TOFROM_GUEST to be used by a
guest when it wants to perform an access to quadrants 1 or 2, for
example when it wants to access memory for one of its nested guests.

Also provide an implementation for the kvm-hv module.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/include/asm/hvcall.h  |  1 +
 arch/powerpc/include/asm/kvm_book3s.h  |  4 ++
 arch/powerpc/kvm/book3s_64_mmu_radix.c |  7 ++--
 arch/powerpc/kvm/book3s_hv.c   |  6 ++-
 arch/powerpc/kvm/book3s_hv_nested.c| 75 ++
 5 files changed, 89 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/hvcall.h 
b/arch/powerpc/include/asm/hvcall.h
index 33a4fc891947..463c63a9fcf1 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -335,6 +335,7 @@
 #define H_SET_PARTITION_TABLE  0xF800
 #define H_ENTER_NESTED 0xF804
 #define H_TLB_INVALIDATE   0xF808
+#define H_COPY_TOFROM_GUEST0xF80C
 
 /* Values for 2nd argument to H_SET_MODE */
 #define H_SET_MODE_RESOURCE_SET_CIABR  1
diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index ea94110bfde4..720483733bb2 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -188,6 +188,9 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, 
unsigned long hc);
 extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
struct kvm_vcpu *vcpu,
unsigned long ea, unsigned long dsisr);
+extern unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
+   gva_t eaddr, void *to, void *from,
+   unsigned long n);
 extern long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
void *to, unsigned long n);
 extern long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
@@ -302,6 +305,7 @@ long kvmhv_nested_init(void);
 void kvmhv_nested_exit(void);
 void kvmhv_vm_nested_init(struct kvm *kvm);
 long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
+long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu);
 void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
 void kvmhv_release_all_nested(struct kvm *kvm);
 long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index e1e3ef710bd0..da89d10e5886 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -29,9 +29,9 @@
  */
 static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
 
-static unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
-   gva_t eaddr, void *to, void *from,
-   unsigned long n)
+unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
+ gva_t eaddr, void *to, void *from,
+ unsigned long n)
 {
unsigned long quadrant, ret = n;
int old_pid, old_lpid;
@@ -82,6 +82,7 @@ static unsigned long __kvmhv_copy_tofrom_guest_radix(int 
lpid, int pid,
 
return ret;
 }
+EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);
 
 static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
  void *to, void *from, unsigned long n)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2280bc4778f5..bd07f9b7c5e8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -996,7 +996,11 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
if (nesting_enabled(vcpu->kvm))
ret = kvmhv_do_nested_tlbie(vcpu);
break;
-
+   case H_COPY_TOFROM_GUEST:
+   ret = H_FUNCTION;
+   if (nesting_enabled(vcpu->kvm))
+   ret = kvmhv_copy_tofrom_guest_nested(vcpu);
+   break;
default:
return RESUME_HOST;
}
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index 991f40ce4eea..5903175751b4 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -462,6 +462,81 @@ long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
 }
 
 /*
+ * Handle the H_COPY_TOFROM_GUEST hcall.
+ * r4 = L1 lpid of nested guest
+ * r5 = pid
+ * r6 = eaddr to access
+ * r7 = to buffer (L1 gpa)
+ * r8 = from buffer (L1 gpa)
+ * r9 = n bytes to copy
+ */
+long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu)
+{
+   struct kvm_nested_guest *gp;
+   int l1_lpid = kvmppc_get_gpr(vcpu, 4);
+   int pid = kvmppc_get_gpr(vcpu, 5);
+   gva_t eaddr =

[PATCH V4 6/8] KVM: PPC: Book3S HV: Allow passthrough of an emulated device to an L2 guest

2018-12-13 Thread Suraj Jitindar Singh

Allow for a device which is being emulated at L0 (the host) for an L1
guest to be passed through to a nested (L2) guest.

The existing kvmppc_hv_emulate_mmio function can be used here. The main
challenge is that for a load the result must be stored into the L2 gpr,
not an L1 gpr as would normally be the case after going out to qemu to
complete the operation. This presents a challenge as at this point the
L2 gpr state has been written back into L1 memory.

To work around this we store the address in L1 memory of the L2 gpr
where the result of the load is to be stored and use the new io_gpr
value KVM_MMIO_REG_NESTED_GPR to indicate that this is a nested load for
which completion must be done when returning back into the kernel. Then
in kvmppc_complete_mmio_load() the resultant value is written into L1
memory at the location of the indicated L2 gpr.

Note that we don't currently let an L1 guest emulate a device for an L2
guest which is then passed through to an L3 guest.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/include/asm/kvm_book3s.h |  2 +-
 arch/powerpc/include/asm/kvm_host.h   |  3 +++
 arch/powerpc/kvm/book3s_hv.c  | 12 ++
 arch/powerpc/kvm/book3s_hv_nested.c   | 43 ++-
 arch/powerpc/kvm/powerpc.c|  8 +++
 5 files changed, 57 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 5883fcce7009..ea94110bfde4 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -311,7 +311,7 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct 
kvm_vcpu *vcpu,
 void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
 void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
   struct hv_guest_state *hr);
-long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu);
+long int kvmhv_nested_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu);
 
 void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
 
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index fac6f631ed29..7a2483a139cf 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -793,6 +793,7 @@ struct kvm_vcpu_arch {
/* For support of nested guests */
struct kvm_nested_guest *nested;
u32 nested_vcpu_id;
+   gpa_t nested_io_gpr;
 #endif
 
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
@@ -827,6 +828,8 @@ struct kvm_vcpu_arch {
 #define KVM_MMIO_REG_FQPR  0x00c0
 #define KVM_MMIO_REG_VSX   0x0100
 #define KVM_MMIO_REG_VMX   0x0180
+#define KVM_MMIO_REG_NESTED_GPR0xffc0
+
 
 #define __KVM_HAVE_ARCH_WQP
 #define __KVM_HAVE_CREATE_DEVICE
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8a0921176a60..2280bc4778f5 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -985,6 +985,10 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
kvmppc_set_gpr(vcpu, 3, 0);
vcpu->arch.hcall_needed = 0;
return -EINTR;
+   } else if (ret == H_TOO_HARD) {
+   kvmppc_set_gpr(vcpu, 3, 0);
+   vcpu->arch.hcall_needed = 0;
+   return RESUME_HOST;
}
break;
case H_TLB_INVALIDATE:
@@ -1336,7 +1340,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
return r;
 }
 
-static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
+static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu 
*vcpu)
 {
int r;
int srcu_idx;
@@ -1394,7 +1398,7 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu 
*vcpu)
 */
case BOOK3S_INTERRUPT_H_DATA_STORAGE:
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-   r = kvmhv_nested_page_fault(vcpu);
+   r = kvmhv_nested_page_fault(run, vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
break;
case BOOK3S_INTERRUPT_H_INST_STORAGE:
@@ -1404,7 +1408,7 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu 
*vcpu)
if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-   r = kvmhv_nested_page_fault(vcpu);
+   r = kvmhv_nested_page_fault(run, vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
break;
 
@@ -4059,7 +4063,7 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
if (!nested)
r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
else
-   r = kvmppc_handle_nested_exit(vcpu);
+   r = kvmppc_handle_nested_exit(kvm_run, vcp

[PATCH V4 5/8] KVM: PPC: Update kvmppc_st and kvmppc_ld to use quadrants

2018-12-13 Thread Suraj Jitindar Singh

The functions kvmppc_st and kvmppc_ld are used to access guest memory
from the host using a guest effective address. They do so by translating
through the process table to obtain a guest real address and then using
kvm_read_guest or kvm_write_guest to make the access with the guest real
address.

This method of access however only works for L1 guests and will give the
incorrect results for a nested guest.

We can however use the store_to_eaddr and load_from_eaddr kvmppc_ops to
perform the access for a nested guesti (and a L1 guest). So attempt this
method first and fall back to the old method if this fails and we aren't
running a nested guest.

At this stage there is no fall back method to perform the access for a
nested guest and this is left as a future improvement. For now we will
return to the nested guest and rely on the fact that a translation
should be faulted in before retrying the access.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/kvm/powerpc.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 95859c53a5cd..cb029fcab404 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -331,10 +331,17 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int 
size, void *ptr,
 {
ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK;
struct kvmppc_pte pte;
-   int r;
+   int r = -EINVAL;
 
vcpu->stat.st++;
 
+   if (vcpu->kvm->arch.kvm_ops && vcpu->kvm->arch.kvm_ops->store_to_eaddr)
+   r = vcpu->kvm->arch.kvm_ops->store_to_eaddr(vcpu, eaddr, ptr,
+   size);
+
+   if ((!r) || (r == -EAGAIN))
+   return r;
+
r = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST,
 XLATE_WRITE, &pte);
if (r < 0)
@@ -367,10 +374,17 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int 
size, void *ptr,
 {
ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK;
struct kvmppc_pte pte;
-   int rc;
+   int rc = -EINVAL;
 
vcpu->stat.ld++;
 
+   if (vcpu->kvm->arch.kvm_ops && vcpu->kvm->arch.kvm_ops->load_from_eaddr)
+   rc = vcpu->kvm->arch.kvm_ops->load_from_eaddr(vcpu, eaddr, ptr,
+ size);
+
+   if ((!rc) || (rc == -EAGAIN))
+   return rc;
+
rc = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST,
  XLATE_READ, &pte);
if (rc)
-- 
2.13.6

[PATCH V4 4/8] KVM: PPC: Add load_from_eaddr and store_to_eaddr to the kvmppc_ops struct

2018-12-13 Thread Suraj Jitindar Singh

The kvmppc_ops struct is used to store function pointers to kvm
implementation specific functions.

Introduce two new functions load_from_eaddr and store_to_eaddr to be
used to load from and store to a guest effective address respectively.

Also implement these for the kvm-hv module. If we are using the radix
mmu then we can call the functions to access quadrant 1 and 2.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/include/asm/kvm_ppc.h |  4 
 arch/powerpc/kvm/book3s_hv.c   | 40 ++
 2 files changed, 44 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 9b89b1918dfc..159dd76700cb 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -326,6 +326,10 @@ struct kvmppc_ops {
unsigned long flags);
void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr);
int (*enable_nested)(struct kvm *kvm);
+   int (*load_from_eaddr)(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
+  int size);
+   int (*store_to_eaddr)(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
+ int size);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index a56f8413758a..8a0921176a60 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5214,6 +5214,44 @@ static int kvmhv_enable_nested(struct kvm *kvm)
return 0;
 }
 
+static int kvmhv_load_from_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void 
*ptr,
+int size)
+{
+   int rc = -EINVAL;
+
+   if (kvmhv_vcpu_is_radix(vcpu)) {
+   rc = kvmhv_copy_from_guest_radix(vcpu, *eaddr, ptr, size);
+
+   if (rc > 0)
+   rc = -EINVAL;
+   }
+
+   /* For now quadrants are the only way to access nested guest memory */
+   if (rc && vcpu->arch.nested)
+   rc = -EAGAIN;
+
+   return rc;
+}
+
+static int kvmhv_store_to_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
+   int size)
+{
+   int rc = -EINVAL;
+
+   if (kvmhv_vcpu_is_radix(vcpu)) {
+   rc = kvmhv_copy_to_guest_radix(vcpu, *eaddr, ptr, size);
+
+   if (rc > 0)
+   rc = -EINVAL;
+   }
+
+   /* For now quadrants are the only way to access nested guest memory */
+   if (rc && vcpu->arch.nested)
+   rc = -EAGAIN;
+
+   return rc;
+}
+
 static struct kvmppc_ops kvm_ops_hv = {
.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -5254,6 +5292,8 @@ static struct kvmppc_ops kvm_ops_hv = {
.get_rmmu_info = kvmhv_get_rmmu_info,
.set_smt_mode = kvmhv_set_smt_mode,
.enable_nested = kvmhv_enable_nested,
+   .load_from_eaddr = kvmhv_load_from_eaddr,
+   .store_to_eaddr = kvmhv_store_to_eaddr,
 };
 
 static int kvm_init_subcore_bitmap(void)
-- 
2.13.6

[PATCH V4 3/8] KVM: PPC: Book3S HV: Implement functions to access quadrants 1 & 2

2018-12-13 Thread Suraj Jitindar Singh

The POWER9 radix mmu has the concept of quadrants. The quadrant number
is the two high bits of the effective address and determines the fully
qualified address to be used for the translation. The fully qualified
address consists of the effective lpid, the effective pid and the
effective address. This gives then 4 possible quadrants 0, 1, 2, and 3.

When accessing these quadrants the fully qualified address is obtained
as follows:

Quadrant| Hypervisor| Guest
--
| EA[0:1] = 0b00| EA[0:1] = 0b00
0   | effLPID = 0   | effLPID = LPIDR
| effPID  = PIDR| effPID  = PIDR
--
| EA[0:1] = 0b01|
1   | effLPID = LPIDR   | Invalid Access
| effPID  = PIDR|
--
| EA[0:1] = 0b10|
2   | effLPID = LPIDR   | Invalid Access
| effPID  = 0   |
--
| EA[0:1] = 0b11| EA[0:1] = 0b11
3   | effLPID = 0   | effLPID = LPIDR
| effPID  = 0   | effPID  = 0
--

In the Guest;
Quadrant 3 is normally used to address the operating system since this
uses effPID=0 and effLPID=LPIDR, meaning the PID register doesn't need to
be switched.
Quadrant 0 is normally used to address user space since the effLPID and
effPID are taken from the corresponding registers.

In the Host;
Quadrant 0 and 3 are used as above, however the effLPID is always 0 to
address the host.

Quadrants 1 and 2 can be used by the host to address guest memory using
a guest effective address. Since the effLPID comes from the LPID register,
the host loads the LPID of the guest it would like to access (and the
PID of the process) and can perform accesses to a guest effective
address.

This means quadrant 1 can be used to address the guest user space and
quadrant 2 can be used to address the guest operating system from the
hypervisor, using a guest effective address.

Access to the quadrants can cause a Hypervisor Data Storage Interrupt
(HDSI) due to being unable to perform partition scoped translation.
Previously this could only be generated from a guest and so the code
path expects us to take the KVM trampoline in the interrupt handler.
This is no longer the case so we modify the handler to call
bad_page_fault() to check if we were expecting this fault so we can
handle it gracefully and just return with an error code. In the hash mmu
case we still raise an unknown exception since quadrants aren't defined
for the hash mmu.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/include/asm/kvm_book3s.h  |  4 ++
 arch/powerpc/kernel/exceptions-64s.S   |  9 
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 97 ++
 arch/powerpc/mm/fault.c|  1 +
 4 files changed, 111 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 09f8e9ba69bc..5883fcce7009 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -188,6 +188,10 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm 
*kvm, unsigned long hc);
 extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
struct kvm_vcpu *vcpu,
unsigned long ea, unsigned long dsisr);
+extern long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
+   void *to, unsigned long n);
+extern long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
+ void *from, unsigned long n);
 extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
  struct kvmppc_pte *gpte, u64 root,
  u64 *pte_ret_p);
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 89d32bb79d5e..db2691ff4c0b 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -995,7 +995,16 @@ EXC_COMMON_BEGIN(h_data_storage_common)
bl  save_nvgprs
RECONCILE_IRQ_STATE(r10, r11)
addir3,r1,STACK_FRAME_OVERHEAD
+BEGIN_MMU_FTR_SECTION
+   ld  r4,PACA_EXGEN+EX_DAR(r13)
+   lwz r5,PACA_EXGEN+EX_DSISR(r13)
+   std r4,_DAR(r1)
+   std r5,_DSISR(r1)
+   li  r5,SIGSEGV
+   bl  bad_page_fault
+MMU_FTR_SECTION_ELSE
bl  unknown_exception
+ALT_MMU_F

[PATCH V4 2/8] KVM: PPC: Book3S HV: Add function kvmhv_vcpu_is_radix()

2018-12-13 Thread Suraj Jitindar Singh

There exists a function kvm_is_radix() which is used to determine if a
kvm instance is using the radix mmu. However this only applies to the
first level (L1) guest. Add a function kvmhv_vcpu_is_radix() which can
be used to determine if the current execution context of the vcpu is
radix, accounting for if the vcpu is running a nested guest.

Currently all nested guests must be radix but this may change in the
future.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 13 +
 arch/powerpc/kvm/book3s_hv_nested.c  |  1 +
 2 files changed, 14 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 6d298145d564..7a9e472f2872 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -55,6 +55,7 @@ struct kvm_nested_guest {
cpumask_t need_tlb_flush;
cpumask_t cpu_in_guest;
short prev_cpu[NR_CPUS];
+   u8 radix;   /* is this nested guest radix */
 };
 
 /*
@@ -150,6 +151,18 @@ static inline bool kvm_is_radix(struct kvm *kvm)
return kvm->arch.radix;
 }
 
+static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu *vcpu)
+{
+   bool radix;
+
+   if (vcpu->arch.nested)
+   radix = vcpu->arch.nested->radix;
+   else
+   radix = kvm_is_radix(vcpu->kvm);
+
+   return radix;
+}
+
 #define KVM_DEFAULT_HPT_ORDER  24  /* 16MB HPT by default */
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index 401d2ecbebc5..4fca462e54c4 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -480,6 +480,7 @@ struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm 
*kvm, unsigned int lpid)
if (shadow_lpid < 0)
goto out_free2;
gp->shadow_lpid = shadow_lpid;
+   gp->radix = 1;
 
memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
 
-- 
2.13.6

[PATCH V4 1/8] KVM: PPC: Only report KVM_CAP_SPAPR_TCE_VFIO on powernv machines

2018-12-13 Thread Suraj Jitindar Singh

The kvm capability KVM_CAP_SPAPR_TCE_VFIO is used to indicate the
availability of in kernel tce acceleration for vfio. However it is
currently the case that this is only available on a powernv machine,
not for a pseries machine.

Thus make this capability dependent on having the cpu feature
CPU_FTR_HVMODE.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/kvm/powerpc.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 2869a299c4ed..95859c53a5cd 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -496,6 +496,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
int r;
/* Assume we're using HV mode when the HV module is loaded */
int hv_enabled = kvmppc_hv_ops ? 1 : 0;
+   int kvm_on_pseries = !cpu_has_feature(CPU_FTR_HVMODE);
 
if (kvm) {
/*
@@ -543,8 +544,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 #ifdef CONFIG_PPC_BOOK3S_64
case KVM_CAP_SPAPR_TCE:
case KVM_CAP_SPAPR_TCE_64:
-   /* fallthrough */
+   r = 1;
+   break;
case KVM_CAP_SPAPR_TCE_VFIO:
+   r = !kvm_on_pseries;
+   break;
case KVM_CAP_PPC_RTAS:
case KVM_CAP_PPC_FIXUP_HCALL:
case KVM_CAP_PPC_ENABLE_HCALL:
-- 
2.13.6

[PATCH V4 0/8] KVM: PPC: Implement passthrough of emulated devices for nested guests

2018-12-13 Thread Suraj Jitindar Singh

This patch series allows for emulated devices to be passed through to nested
guests, irrespective of at which level the device is being emulated.

Note that the emulated device must be using dma, not virtio.

For example, passing through an emulated e1000:

1. Emulate the device at L(n) for L(n+1)

qemu-system-ppc64 -netdev type=user,id=net0 -device e1000,netdev=net0

2. Assign the VFIO-PCI driver at L(n+1)

echo vfio-pci > /sys/bus/pci/devices/:00:00.0/driver_override
echo :00:00.0 > /sys/bus/pci/drivers/e1000/unbind
echo :00:00.0 > /sys/bus/pci/drivers/vfio-pci/bind
chmod 666 /dev/vfio/0

3. Pass the device through from L(n+1) to L(n+2)

qemu-system-ppc64 -device vfio-pci,host=:00:00.0

4. L(n+2) can now access the device which will be emulated at L(n)

V2 -> V3:
1/8: None
2/8: None
3/8: None
4/8: None
5/8: None
6/8: Add if def to fix compilation for some platforms
7/8: None
8/8: None

Suraj Jitindar Singh (8):
  KVM: PPC: Only report KVM_CAP_SPAPR_TCE_VFIO on powernv machines
  KVM: PPC: Book3S HV: Add function kvmhv_vcpu_is_radix()
  KVM: PPC: Book3S HV: Implement functions to access quadrants 1 & 2
  KVM: PPC: Add load_from_eaddr and store_to_eaddr to the kvmppc_ops
struct
  KVM: PPC: Update kvmppc_st and kvmppc_ld to use quadrants
  KVM: PPC: Book3S HV: Allow passthrough of an emulated device to an L2
guest
  KVM: PPC: Introduce new hcall H_COPY_TOFROM_GUEST to access quadrants
1 & 2
  KVM: PPC: Book3S HV: Allow passthrough of an emulated device to an L3
guest

 arch/powerpc/include/asm/hvcall.h|   1 +
 arch/powerpc/include/asm/kvm_book3s.h|  10 ++-
 arch/powerpc/include/asm/kvm_book3s_64.h |  13 
 arch/powerpc/include/asm/kvm_host.h  |   3 +
 arch/powerpc/include/asm/kvm_ppc.h   |   4 ++
 arch/powerpc/kernel/exceptions-64s.S |   9 +++
 arch/powerpc/kvm/book3s_64_mmu_radix.c   |  97 ++
 arch/powerpc/kvm/book3s_hv.c |  58 ++--
 arch/powerpc/kvm/book3s_hv_nested.c  | 114 +--
 arch/powerpc/kvm/powerpc.c   |  32 -
 arch/powerpc/mm/fault.c  |   1 +
 11 files changed, 327 insertions(+), 15 deletions(-)

-- 
2.13.6

Re: [PATCH kernel v5 03/20] powerpc/vfio/iommu/kvm: Do not pin device memory

2018-12-13 Thread Paul Mackerras

On Thu, Dec 13, 2018 at 05:17:17PM +1100, Alexey Kardashevskiy wrote:
> This new memory does not have page structs as it is not plugged to
> the host so gup() will fail anyway.
> 
> This adds 2 helpers:
> - mm_iommu_newdev() to preregister the "memory device" memory so
> the rest of API can still be used;
> - mm_iommu_is_devmem() to know if the physical address is one of thise
> new regions which we must avoid unpinning of.
> 
> This adds @mm to tce_page_is_contained() and iommu_tce_xchg() to test
> if the memory is device memory to avoid pfn_to_page().
> 
> This adds a check for device memory in mm_iommu_ua_mark_dirty_rm() which
> does delayed pages dirtying.
> 
> Signed-off-by: Alexey Kardashevskiy 

Reviewed-by: Paul Mackerras

[PATCH V3 8/8] KVM: PPC: Book3S HV: Allow passthrough of an emulated device to an L3 guest

2018-12-13 Thread Suraj Jitindar Singh

Previously when a device was being emulated by an L1 guest for an L2
guest, that device couldn't then be passed through to an L3 guest. This
was because the L1 guest had no method for accessing L3 memory.

The hcall H_COPY_TOFROM_GUEST provides this access. Thus this setup for
passthrough can now be allowed.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 9 -
 arch/powerpc/kvm/book3s_hv_nested.c| 5 -
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index da89d10e5886..8522b034a4b2 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -37,11 +37,10 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int 
pid,
int old_pid, old_lpid;
bool is_load = !!to;
 
-   /* Can't access quadrants 1 or 2 in non-HV mode */
-   if (kvmhv_on_pseries()) {
-   /* TODO h-call */
-   return -EPERM;
-   }
+   /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
+   if (kvmhv_on_pseries())
+   return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
+ __pa(to), __pa(from), n);
 
quadrant = 1;
if (!pid)
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index 5903175751b4..a9db12cbc0fa 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -1284,11 +1284,6 @@ static long int __kvmhv_nested_page_fault(struct kvm_run 
*run,
}
 
/* passthrough of emulated MMIO case */
-   if (kvmhv_on_pseries()) {
-   pr_err("emulated MMIO passthrough?\n");
-   return -EINVAL;
-   }
-
return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
}
if (memslot->flags & KVM_MEM_READONLY) {
-- 
2.13.6

[PATCH V3 7/8] KVM: PPC: Introduce new hcall H_COPY_TOFROM_GUEST to access quadrants 1 & 2

2018-12-13 Thread Suraj Jitindar Singh

A guest cannot access quadrants 1 or 2 as this would result in an
exception. Thus introduce the hcall H_COPY_TOFROM_GUEST to be used by a
guest when it wants to perform an access to quadrants 1 or 2, for
example when it wants to access memory for one of its nested guests.

Also provide an implementation for the kvm-hv module.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/include/asm/hvcall.h  |  1 +
 arch/powerpc/include/asm/kvm_book3s.h  |  4 ++
 arch/powerpc/kvm/book3s_64_mmu_radix.c |  7 ++--
 arch/powerpc/kvm/book3s_hv.c   |  6 ++-
 arch/powerpc/kvm/book3s_hv_nested.c| 75 ++
 5 files changed, 89 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/hvcall.h 
b/arch/powerpc/include/asm/hvcall.h
index 33a4fc891947..463c63a9fcf1 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -335,6 +335,7 @@
 #define H_SET_PARTITION_TABLE  0xF800
 #define H_ENTER_NESTED 0xF804
 #define H_TLB_INVALIDATE   0xF808
+#define H_COPY_TOFROM_GUEST0xF80C
 
 /* Values for 2nd argument to H_SET_MODE */
 #define H_SET_MODE_RESOURCE_SET_CIABR  1
diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index ea94110bfde4..720483733bb2 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -188,6 +188,9 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, 
unsigned long hc);
 extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
struct kvm_vcpu *vcpu,
unsigned long ea, unsigned long dsisr);
+extern unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
+   gva_t eaddr, void *to, void *from,
+   unsigned long n);
 extern long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
void *to, unsigned long n);
 extern long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
@@ -302,6 +305,7 @@ long kvmhv_nested_init(void);
 void kvmhv_nested_exit(void);
 void kvmhv_vm_nested_init(struct kvm *kvm);
 long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
+long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu);
 void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
 void kvmhv_release_all_nested(struct kvm *kvm);
 long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index e1e3ef710bd0..da89d10e5886 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -29,9 +29,9 @@
  */
 static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
 
-static unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
-   gva_t eaddr, void *to, void *from,
-   unsigned long n)
+unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
+ gva_t eaddr, void *to, void *from,
+ unsigned long n)
 {
unsigned long quadrant, ret = n;
int old_pid, old_lpid;
@@ -82,6 +82,7 @@ static unsigned long __kvmhv_copy_tofrom_guest_radix(int 
lpid, int pid,
 
return ret;
 }
+EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);
 
 static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
  void *to, void *from, unsigned long n)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2280bc4778f5..bd07f9b7c5e8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -996,7 +996,11 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
if (nesting_enabled(vcpu->kvm))
ret = kvmhv_do_nested_tlbie(vcpu);
break;
-
+   case H_COPY_TOFROM_GUEST:
+   ret = H_FUNCTION;
+   if (nesting_enabled(vcpu->kvm))
+   ret = kvmhv_copy_tofrom_guest_nested(vcpu);
+   break;
default:
return RESUME_HOST;
}
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index 991f40ce4eea..5903175751b4 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -462,6 +462,81 @@ long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
 }
 
 /*
+ * Handle the H_COPY_TOFROM_GUEST hcall.
+ * r4 = L1 lpid of nested guest
+ * r5 = pid
+ * r6 = eaddr to access
+ * r7 = to buffer (L1 gpa)
+ * r8 = from buffer (L1 gpa)
+ * r9 = n bytes to copy
+ */
+long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu)
+{
+   struct kvm_nested_guest *gp;
+   int l1_lpid = kvmppc_get_gpr(vcpu, 4);
+   int pid = kvmppc_get_gpr(vcpu, 5);
+   gva_t eaddr =

[PATCH V3 6/8] KVM: PPC: Book3S HV: Allow passthrough of an emulated device to an L2 guest

2018-12-13 Thread Suraj Jitindar Singh

Allow for a device which is being emulated at L0 (the host) for an L1
guest to be passed through to a nested (L2) guest.

The existing kvmppc_hv_emulate_mmio function can be used here. The main
challenge is that for a load the result must be stored into the L2 gpr,
not an L1 gpr as would normally be the case after going out to qemu to
complete the operation. This presents a challenge as at this point the
L2 gpr state has been written back into L1 memory.

To work around this we store the address in L1 memory of the L2 gpr
where the result of the load is to be stored and use the new io_gpr
value KVM_MMIO_REG_NESTED_GPR to indicate that this is a nested load for
which completion must be done when returning back into the kernel. Then
in kvmppc_complete_mmio_load() the resultant value is written into L1
memory at the location of the indicated L2 gpr.

Note that we don't currently let an L1 guest emulate a device for an L2
guest which is then passed through to an L3 guest.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/include/asm/kvm_book3s.h |  2 +-
 arch/powerpc/include/asm/kvm_host.h   |  3 +++
 arch/powerpc/kvm/book3s_hv.c  | 12 ++
 arch/powerpc/kvm/book3s_hv_nested.c   | 43 ++-
 arch/powerpc/kvm/powerpc.c|  6 +
 5 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 5883fcce7009..ea94110bfde4 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -311,7 +311,7 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct 
kvm_vcpu *vcpu,
 void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
 void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
   struct hv_guest_state *hr);
-long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu);
+long int kvmhv_nested_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu);
 
 void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
 
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index fac6f631ed29..7a2483a139cf 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -793,6 +793,7 @@ struct kvm_vcpu_arch {
/* For support of nested guests */
struct kvm_nested_guest *nested;
u32 nested_vcpu_id;
+   gpa_t nested_io_gpr;
 #endif
 
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
@@ -827,6 +828,8 @@ struct kvm_vcpu_arch {
 #define KVM_MMIO_REG_FQPR  0x00c0
 #define KVM_MMIO_REG_VSX   0x0100
 #define KVM_MMIO_REG_VMX   0x0180
+#define KVM_MMIO_REG_NESTED_GPR0xffc0
+
 
 #define __KVM_HAVE_ARCH_WQP
 #define __KVM_HAVE_CREATE_DEVICE
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8a0921176a60..2280bc4778f5 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -985,6 +985,10 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
kvmppc_set_gpr(vcpu, 3, 0);
vcpu->arch.hcall_needed = 0;
return -EINTR;
+   } else if (ret == H_TOO_HARD) {
+   kvmppc_set_gpr(vcpu, 3, 0);
+   vcpu->arch.hcall_needed = 0;
+   return RESUME_HOST;
}
break;
case H_TLB_INVALIDATE:
@@ -1336,7 +1340,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
return r;
 }
 
-static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
+static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu 
*vcpu)
 {
int r;
int srcu_idx;
@@ -1394,7 +1398,7 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu 
*vcpu)
 */
case BOOK3S_INTERRUPT_H_DATA_STORAGE:
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-   r = kvmhv_nested_page_fault(vcpu);
+   r = kvmhv_nested_page_fault(run, vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
break;
case BOOK3S_INTERRUPT_H_INST_STORAGE:
@@ -1404,7 +1408,7 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu 
*vcpu)
if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-   r = kvmhv_nested_page_fault(vcpu);
+   r = kvmhv_nested_page_fault(run, vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
break;
 
@@ -4059,7 +4063,7 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
if (!nested)
r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
else
-   r = kvmppc_handle_nested_exit(vcpu);
+   r = kvmppc_handle_nested_exit(kvm_run, vcpu)

[PATCH V3 5/8] KVM: PPC: Update kvmppc_st and kvmppc_ld to use quadrants

2018-12-13 Thread Suraj Jitindar Singh

The functions kvmppc_st and kvmppc_ld are used to access guest memory
from the host using a guest effective address. They do so by translating
through the process table to obtain a guest real address and then using
kvm_read_guest or kvm_write_guest to make the access with the guest real
address.

This method of access however only works for L1 guests and will give the
incorrect results for a nested guest.

We can however use the store_to_eaddr and load_from_eaddr kvmppc_ops to
perform the access for a nested guesti (and a L1 guest). So attempt this
method first and fall back to the old method if this fails and we aren't
running a nested guest.

At this stage there is no fall back method to perform the access for a
nested guest and this is left as a future improvement. For now we will
return to the nested guest and rely on the fact that a translation
should be faulted in before retrying the access.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/kvm/powerpc.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 95859c53a5cd..cb029fcab404 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -331,10 +331,17 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int 
size, void *ptr,
 {
ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK;
struct kvmppc_pte pte;
-   int r;
+   int r = -EINVAL;
 
vcpu->stat.st++;
 
+   if (vcpu->kvm->arch.kvm_ops && vcpu->kvm->arch.kvm_ops->store_to_eaddr)
+   r = vcpu->kvm->arch.kvm_ops->store_to_eaddr(vcpu, eaddr, ptr,
+   size);
+
+   if ((!r) || (r == -EAGAIN))
+   return r;
+
r = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST,
 XLATE_WRITE, &pte);
if (r < 0)
@@ -367,10 +374,17 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int 
size, void *ptr,
 {
ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK;
struct kvmppc_pte pte;
-   int rc;
+   int rc = -EINVAL;
 
vcpu->stat.ld++;
 
+   if (vcpu->kvm->arch.kvm_ops && vcpu->kvm->arch.kvm_ops->load_from_eaddr)
+   rc = vcpu->kvm->arch.kvm_ops->load_from_eaddr(vcpu, eaddr, ptr,
+ size);
+
+   if ((!rc) || (rc == -EAGAIN))
+   return rc;
+
rc = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST,
  XLATE_READ, &pte);
if (rc)
-- 
2.13.6

[PATCH V3 4/8] KVM: PPC: Add load_from_eaddr and store_to_eaddr to the kvmppc_ops struct

2018-12-13 Thread Suraj Jitindar Singh

The kvmppc_ops struct is used to store function pointers to kvm
implementation specific functions.

Introduce two new functions load_from_eaddr and store_to_eaddr to be
used to load from and store to a guest effective address respectively.

Also implement these for the kvm-hv module. If we are using the radix
mmu then we can call the functions to access quadrant 1 and 2.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/include/asm/kvm_ppc.h |  4 
 arch/powerpc/kvm/book3s_hv.c   | 40 ++
 2 files changed, 44 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 9b89b1918dfc..159dd76700cb 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -326,6 +326,10 @@ struct kvmppc_ops {
unsigned long flags);
void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr);
int (*enable_nested)(struct kvm *kvm);
+   int (*load_from_eaddr)(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
+  int size);
+   int (*store_to_eaddr)(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
+ int size);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index a56f8413758a..8a0921176a60 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5214,6 +5214,44 @@ static int kvmhv_enable_nested(struct kvm *kvm)
return 0;
 }
 
+static int kvmhv_load_from_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void 
*ptr,
+int size)
+{
+   int rc = -EINVAL;
+
+   if (kvmhv_vcpu_is_radix(vcpu)) {
+   rc = kvmhv_copy_from_guest_radix(vcpu, *eaddr, ptr, size);
+
+   if (rc > 0)
+   rc = -EINVAL;
+   }
+
+   /* For now quadrants are the only way to access nested guest memory */
+   if (rc && vcpu->arch.nested)
+   rc = -EAGAIN;
+
+   return rc;
+}
+
+static int kvmhv_store_to_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
+   int size)
+{
+   int rc = -EINVAL;
+
+   if (kvmhv_vcpu_is_radix(vcpu)) {
+   rc = kvmhv_copy_to_guest_radix(vcpu, *eaddr, ptr, size);
+
+   if (rc > 0)
+   rc = -EINVAL;
+   }
+
+   /* For now quadrants are the only way to access nested guest memory */
+   if (rc && vcpu->arch.nested)
+   rc = -EAGAIN;
+
+   return rc;
+}
+
 static struct kvmppc_ops kvm_ops_hv = {
.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -5254,6 +5292,8 @@ static struct kvmppc_ops kvm_ops_hv = {
.get_rmmu_info = kvmhv_get_rmmu_info,
.set_smt_mode = kvmhv_set_smt_mode,
.enable_nested = kvmhv_enable_nested,
+   .load_from_eaddr = kvmhv_load_from_eaddr,
+   .store_to_eaddr = kvmhv_store_to_eaddr,
 };
 
 static int kvm_init_subcore_bitmap(void)
-- 
2.13.6

[PATCH V3 3/8] KVM: PPC: Book3S HV: Implement functions to access quadrants 1 & 2

2018-12-13 Thread Suraj Jitindar Singh

The POWER9 radix mmu has the concept of quadrants. The quadrant number
is the two high bits of the effective address and determines the fully
qualified address to be used for the translation. The fully qualified
address consists of the effective lpid, the effective pid and the
effective address. This gives then 4 possible quadrants 0, 1, 2, and 3.

When accessing these quadrants the fully qualified address is obtained
as follows:

Quadrant| Hypervisor| Guest
--
| EA[0:1] = 0b00| EA[0:1] = 0b00
0   | effLPID = 0   | effLPID = LPIDR
| effPID  = PIDR| effPID  = PIDR
--
| EA[0:1] = 0b01|
1   | effLPID = LPIDR   | Invalid Access
| effPID  = PIDR|
--
| EA[0:1] = 0b10|
2   | effLPID = LPIDR   | Invalid Access
| effPID  = 0   |
--
| EA[0:1] = 0b11| EA[0:1] = 0b11
3   | effLPID = 0   | effLPID = LPIDR
| effPID  = 0   | effPID  = 0
--

In the Guest;
Quadrant 3 is normally used to address the operating system since this
uses effPID=0 and effLPID=LPIDR, meaning the PID register doesn't need to
be switched.
Quadrant 0 is normally used to address user space since the effLPID and
effPID are taken from the corresponding registers.

In the Host;
Quadrant 0 and 3 are used as above, however the effLPID is always 0 to
address the host.

Quadrants 1 and 2 can be used by the host to address guest memory using
a guest effective address. Since the effLPID comes from the LPID register,
the host loads the LPID of the guest it would like to access (and the
PID of the process) and can perform accesses to a guest effective
address.

This means quadrant 1 can be used to address the guest user space and
quadrant 2 can be used to address the guest operating system from the
hypervisor, using a guest effective address.

Access to the quadrants can cause a Hypervisor Data Storage Interrupt
(HDSI) due to being unable to perform partition scoped translation.
Previously this could only be generated from a guest and so the code
path expects us to take the KVM trampoline in the interrupt handler.
This is no longer the case so we modify the handler to call
bad_page_fault() to check if we were expecting this fault so we can
handle it gracefully and just return with an error code. In the hash mmu
case we still raise an unknown exception since quadrants aren't defined
for the hash mmu.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/include/asm/kvm_book3s.h  |  4 ++
 arch/powerpc/kernel/exceptions-64s.S   |  9 
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 97 ++
 arch/powerpc/mm/fault.c|  1 +
 4 files changed, 111 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 09f8e9ba69bc..5883fcce7009 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -188,6 +188,10 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm 
*kvm, unsigned long hc);
 extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
struct kvm_vcpu *vcpu,
unsigned long ea, unsigned long dsisr);
+extern long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
+   void *to, unsigned long n);
+extern long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
+ void *from, unsigned long n);
 extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
  struct kvmppc_pte *gpte, u64 root,
  u64 *pte_ret_p);
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 89d32bb79d5e..db2691ff4c0b 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -995,7 +995,16 @@ EXC_COMMON_BEGIN(h_data_storage_common)
bl  save_nvgprs
RECONCILE_IRQ_STATE(r10, r11)
addir3,r1,STACK_FRAME_OVERHEAD
+BEGIN_MMU_FTR_SECTION
+   ld  r4,PACA_EXGEN+EX_DAR(r13)
+   lwz r5,PACA_EXGEN+EX_DSISR(r13)
+   std r4,_DAR(r1)
+   std r5,_DSISR(r1)
+   li  r5,SIGSEGV
+   bl  bad_page_fault
+MMU_FTR_SECTION_ELSE
bl  unknown_exception
+ALT_MMU_F

[PATCH V3 2/8] KVM: PPC: Book3S HV: Add function kvmhv_vcpu_is_radix()

2018-12-13 Thread Suraj Jitindar Singh

There exists a function kvm_is_radix() which is used to determine if a
kvm instance is using the radix mmu. However this only applies to the
first level (L1) guest. Add a function kvmhv_vcpu_is_radix() which can
be used to determine if the current execution context of the vcpu is
radix, accounting for if the vcpu is running a nested guest.

Currently all nested guests must be radix but this may change in the
future.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 13 +
 arch/powerpc/kvm/book3s_hv_nested.c  |  1 +
 2 files changed, 14 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 6d298145d564..7a9e472f2872 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -55,6 +55,7 @@ struct kvm_nested_guest {
cpumask_t need_tlb_flush;
cpumask_t cpu_in_guest;
short prev_cpu[NR_CPUS];
+   u8 radix;   /* is this nested guest radix */
 };
 
 /*
@@ -150,6 +151,18 @@ static inline bool kvm_is_radix(struct kvm *kvm)
return kvm->arch.radix;
 }
 
+static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu *vcpu)
+{
+   bool radix;
+
+   if (vcpu->arch.nested)
+   radix = vcpu->arch.nested->radix;
+   else
+   radix = kvm_is_radix(vcpu->kvm);
+
+   return radix;
+}
+
 #define KVM_DEFAULT_HPT_ORDER  24  /* 16MB HPT by default */
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index 401d2ecbebc5..4fca462e54c4 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -480,6 +480,7 @@ struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm 
*kvm, unsigned int lpid)
if (shadow_lpid < 0)
goto out_free2;
gp->shadow_lpid = shadow_lpid;
+   gp->radix = 1;
 
memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
 
-- 
2.13.6

[PATCH V3 1/8] KVM: PPC: Only report KVM_CAP_SPAPR_TCE_VFIO on powernv machines

2018-12-13 Thread Suraj Jitindar Singh

The kvm capability KVM_CAP_SPAPR_TCE_VFIO is used to indicate the
availability of in kernel tce acceleration for vfio. However it is
currently the case that this is only available on a powernv machine,
not for a pseries machine.

Thus make this capability dependent on having the cpu feature
CPU_FTR_HVMODE.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/kvm/powerpc.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 2869a299c4ed..95859c53a5cd 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -496,6 +496,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
int r;
/* Assume we're using HV mode when the HV module is loaded */
int hv_enabled = kvmppc_hv_ops ? 1 : 0;
+   int kvm_on_pseries = !cpu_has_feature(CPU_FTR_HVMODE);
 
if (kvm) {
/*
@@ -543,8 +544,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 #ifdef CONFIG_PPC_BOOK3S_64
case KVM_CAP_SPAPR_TCE:
case KVM_CAP_SPAPR_TCE_64:
-   /* fallthrough */
+   r = 1;
+   break;
case KVM_CAP_SPAPR_TCE_VFIO:
+   r = !kvm_on_pseries;
+   break;
case KVM_CAP_PPC_RTAS:
case KVM_CAP_PPC_FIXUP_HCALL:
case KVM_CAP_PPC_ENABLE_HCALL:
-- 
2.13.6

[PATCH V3 0/8] KVM: PPC: Implement passthrough of emulated devices for nested guests

2018-12-13 Thread Suraj Jitindar Singh

This patch series allows for emulated devices to be passed through to nested
guests, irrespective of at which level the device is being emulated.

Note that the emulated device must be using dma, not virtio.

For example, passing through an emulated e1000:

1. Emulate the device at L(n) for L(n+1)

qemu-system-ppc64 -netdev type=user,id=net0 -device e1000,netdev=net0

2. Assign the VFIO-PCI driver at L(n+1)

echo vfio-pci > /sys/bus/pci/devices/:00:00.0/driver_override
echo :00:00.0 > /sys/bus/pci/drivers/e1000/unbind
echo :00:00.0 > /sys/bus/pci/drivers/vfio-pci/bind
chmod 666 /dev/vfio/0

3. Pass the device through from L(n+1) to L(n+2)

qemu-system-ppc64 -device vfio-pci,host=:00:00.0

4. L(n+2) can now access the device which will be emulated at L(n)

V2 -> V3:
1/8: None
2/8: None
3/8: None
4/8: None
5/8: None
6/8: None
7/8: Use guest physical address for the args in H_COPY_TOFROM_GUEST to
 match the comment.
8/8: None

Suraj Jitindar Singh (8):
  KVM: PPC: Only report KVM_CAP_SPAPR_TCE_VFIO on powernv machines
  KVM: PPC: Book3S HV: Add function kvmhv_vcpu_is_radix()
  KVM: PPC: Book3S HV: Implement functions to access quadrants 1 & 2
  KVM: PPC: Add load_from_eaddr and store_to_eaddr to the kvmppc_ops
struct
  KVM: PPC: Update kvmppc_st and kvmppc_ld to use quadrants
  KVM: PPC: Book3S HV: Allow passthrough of an emulated device to an L2
guest
  KVM: PPC: Introduce new hcall H_COPY_TOFROM_GUEST to access quadrants
1 & 2
  KVM: PPC: Book3S HV: Allow passthrough of an emulated device to an L3
guest

 arch/powerpc/include/asm/hvcall.h|   1 +
 arch/powerpc/include/asm/kvm_book3s.h|  10 ++-
 arch/powerpc/include/asm/kvm_book3s_64.h |  13 
 arch/powerpc/include/asm/kvm_host.h  |   3 +
 arch/powerpc/include/asm/kvm_ppc.h   |   4 ++
 arch/powerpc/kernel/exceptions-64s.S |   9 +++
 arch/powerpc/kvm/book3s_64_mmu_radix.c   |  97 ++
 arch/powerpc/kvm/book3s_hv.c |  58 ++--
 arch/powerpc/kvm/book3s_hv_nested.c  | 114 +--
 arch/powerpc/kvm/powerpc.c   |  30 +++-
 arch/powerpc/mm/fault.c  |   1 +
 11 files changed, 325 insertions(+), 15 deletions(-)

-- 
2.13.6

Re: [PATCH V2 7/8] KVM: PPC: Introduce new hcall H_COPY_TOFROM_GUEST to access quadrants 1 & 2

2018-12-13 Thread Suraj Jitindar Singh

On Thu, 2018-12-13 at 16:24 +1100, Paul Mackerras wrote:
> On Mon, Dec 10, 2018 at 02:58:24PM +1100, Suraj Jitindar Singh wrote:
> > A guest cannot access quadrants 1 or 2 as this would result in an
> > exception. Thus introduce the hcall H_COPY_TOFROM_GUEST to be used
> > by a
> > guest when it wants to perform an access to quadrants 1 or 2, for
> > example when it wants to access memory for one of its nested
> > guests.
> > 
> > Also provide an implementation for the kvm-hv module.
> > 
> > Signed-off-by: Suraj Jitindar Singh 
> 
> [snip]
> 
> >  /*
> > + * Handle the H_COPY_TOFROM_GUEST hcall.
> > + * r4 = L1 lpid of nested guest
> > + * r5 = pid
> > + * r6 = eaddr to access
> > + * r7 = to buffer (L1 gpa)
> > + * r8 = from buffer (L1 gpa)
> 
> Comment says these are GPAs...
> 
> > + * r9 = n bytes to copy
> > + */
> > +long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu)
> > +{
> > +   struct kvm_nested_guest *gp;
> > +   int l1_lpid = kvmppc_get_gpr(vcpu, 4);
> > +   int pid = kvmppc_get_gpr(vcpu, 5);
> > +   gva_t eaddr = kvmppc_get_gpr(vcpu, 6);
> > +   void *gp_to = (void *) kvmppc_get_gpr(vcpu, 7);
> > +   void *gp_from = (void *) kvmppc_get_gpr(vcpu, 8);
> > +   void *buf;
> > +   unsigned long n = kvmppc_get_gpr(vcpu, 9);
> > +   bool is_load = !!gp_to;
> > +   long rc;
> > +
> > +   if (gp_to && gp_from) /* One must be NULL to determine the
> > direction */
> > +   return H_PARAMETER;
> > +
> > +   if (eaddr & (0xFFFUL << 52))
> > +   return H_PARAMETER;
> > +
> > +   buf = kzalloc(n, GFP_KERNEL);
> > +   if (!buf)
> > +   return H_NO_MEM;
> > +
> > +   gp = kvmhv_get_nested(vcpu->kvm, l1_lpid, false);
> > +   if (!gp) {
> > +   rc = H_PARAMETER;
> > +   goto out_free;
> > +   }
> > +
> > +   mutex_lock(&gp->tlb_lock);
> > +
> > +   if (is_load) {
> > +   /* Load from the nested guest into our buffer */
> > +   rc = __kvmhv_copy_tofrom_guest_radix(gp-
> > >shadow_lpid, pid,
> > +eaddr, buf,
> > NULL, n);
> > +   if (rc)
> > +   goto not_found;
> > +
> > +   /* Write what was loaded into our buffer back to
> > the L1 guest */
> > +   rc = kvmppc_st(vcpu, (ulong *) &gp_to, n, buf,
> > true);
> 
> but using kvmppc_st implies that it is an EA (and in fact when you
> call it in the next patch you pass an EA).
> 
> It would be more like other hcalls to pass a GPA, meaning that you
> would use kvm_write_guest() here.  On the other hand, with the
> quadrant access, kvmppc_st() might well be faster than
> kvm_write_guest.
> 
> So you need to decide which it is and either fix the comment or
> change
> the code.

Lets stick with gpa for now then for consistency, with room for
optimisation.

> 
> Paul.

Re: [PATCH] powerpc/mm: make NULL pointer deferences explicit on bad page faults.

2018-12-13 Thread Michael Ellerman

Hi Christophe,

You know it's the trivial patches that are going to get lots of review
comments :)

Christophe Leroy  writes:
> As several other arches including x86, this patch makes it explicit
> that a bad page fault is a NULL pointer dereference when the fault
> address is lower than PAGE_SIZE

I'm being pedantic, but it's not necessarily a NULL pointer dereference.
It might just be a direct access to a low address, eg:

 char *p = 0x100;
 *p = 0;

That's not a NULL pointer dereference.

But other arches do print this so I guess it's OK to add, and in most
cases it will be an actual NULL pointer dereference.

I wonder though if we should use 4096 rather than PAGE_SIZE, given
that's the actual value other arches are using. We support 256K pages on
some systems, which is getting quite large.

> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index d51cf5f4e45e..501a1eadb3e9 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -631,13 +631,16 @@ void bad_page_fault(struct pt_regs *regs, unsigned long 
> address, int sig)
>   switch (TRAP(regs)) {
>   case 0x300:
>   case 0x380:
> - printk(KERN_ALERT "Unable to handle kernel paging request for "
> - "data at address 0x%08lx\n", regs->dar);
> + pr_alert("Unable to handle kernel %s for data at address 
> 0x%08lx\n",
> +  regs->dar < PAGE_SIZE ? "NULL pointer dereference" :
> +  "paging request",
> +  regs->dar);

This is now too long I think, with printk time you get:

[ 1096.450711] Unable to handle kernel NULL pointer dereference for data at 
address 0x

Which is 93 columns. It's true on many systems it doesn't really matter
any more, but it would still be good if it was shorter.

I like that on x86 they prefix it with "BUG:", just to avoid any confusion.

What if we had for the NULL pointer case:

  BUG: Kernel NULL pointer dereference at 0x

And for the normal case:

  BUG: Unable to handle kernel data access at 0x

Note on the very next line we print:
  Faulting instruction address: 0xc0795cc8

So there should be no confusion about whether "at" refers to the data
address or the instruction address.

>   case 0x400:
>   case 0x480:
> - printk(KERN_ALERT "Unable to handle kernel paging request for "
> - "instruction fetch\n");
> + pr_alert("Unable to handle kernel %s for instruction fetch\n",
> +  regs->nip < PAGE_SIZE ? "NULL pointer dereference" :
> +  "paging request");

I don't really like using "NULL pointer dereference" here, that
terminology makes me think of a load/store, I think it confuses things
rather than making it clearer.

What about:

  BUG: Unable to handle kernel instruction fetch at 0x

>   break;
>   case 0x600:
>   printk(KERN_ALERT "Unable to handle kernel paging request for "

It would be good to clean up these other cases as well. They seem to be
trying to use the "page request for" terminology which leads to them
being very wordy. I assume that was done to help people grepping kernel
logs for errors, but I think we should not worry about that if we have
the "BUG:" prefix.

So we have:
printk(KERN_ALERT "Unable to handle kernel paging request for "
"unaligned access at address 0x%08lx\n", regs->dar);

What about:

  BUG: Unable to handle kernel unaligned access at 0x

And:
printk(KERN_ALERT "Unable to handle kernel paging request for "
"unknown fault\n");

What about:

  BUG: Unable to handle unknown paging fault at 0x

Thoughts?

cheers

Re: [PATCH v1 2/2] powerpc/pseries: Add debugfs interface to retrieve VPHN info

2018-12-13 Thread Michael Ellerman

Hi Naveen,

"Naveen N. Rao"  writes:
> Hi Michael,
>
> Naveen N. Rao wrote:
>> Add debugfs interface to retrieve associativity information for lpar
>> vcpus (debugfs/vphn/lpar) and the hypervisor cpus (debugfs/vphn/hyp).
>> This information is useful to derive various metrics, including the vcpu
>> dispatch statistics in a SPLPAR environment.
>
> Any thoughts on this approach vs. adding a tracepoint?

Sorry I've been unresponsive on this stuff, I don't know this area that
well.

I guess I'm not opposed to adding some stuff to debugfs, but only if
it's for debugging.

We don't want customers running tools or scripts that rely on this stuff
in debugfs.

If we need to expose more information to be used by production tools
then I think we need to look at sysfs or something like taskstats as a
proper API.

cheers

[PATCH] kernel/dma/direct: Do not include SME mask in the DMA supported check

2018-12-13 Thread Lendacky, Thomas

The dma_direct_supported() function intends to check the DMA mask against
specific values. However, the phys_to_dma() function includes the SME
encryption mask, which defeats the intended purpose of the check. This
results in drivers that support less than 48-bit DMA (SME encryption mask
is bit 47) from being able to set the DMA mask successfully when SME is
active, which results in the driver failing to initialize.

Change the function used to check the mask from phys_to_dma() to
__phys_to_dma() so that the SME encryption mask is not part of the check.

Fixes: c1d0af1a1d5d ("kernel/dma/direct: take DMA offset into account in 
dma_direct_supported")
Signed-off-by: Tom Lendacky 
---
 kernel/dma/direct.c |7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 22a12ab..375c77e 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -309,7 +309,12 @@ int dma_direct_supported(struct device *dev, u64 mask)
 
min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT);
 
-   return mask >= phys_to_dma(dev, min_mask);
+   /*
+* This check needs to be against the actual bit mask value, so
+* use __phys_to_dma() here so that the SME encryption mask isn't
+* part of the check.
+*/
+   return mask >= __phys_to_dma(dev, min_mask);
 }
 
 int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)

Re: use generic DMA mapping code in powerpc V4

2018-12-13 Thread Christian Zigotzky


On 13 December 2018 at 6:48PM, Christian Zigotzky wrote:

On 13 December 2018 at 2:34PM, Christian Zigotzky wrote:

On 13 December 2018 at 12:25PM, Christoph Hellwig wrote:

On Thu, Dec 13, 2018 at 12:19:26PM +0100, Christian Zigotzky wrote:

I tried it again but I get the following error message:

MODPOST vmlinux.o
arch/powerpc/kernel/dma-iommu.o: In function 
`.dma_iommu_get_required_mask':

(.text+0x274): undefined reference to `.dma_direct_get_required_mask'
make: *** [vmlinux] Error 1

Sorry, you need this one liner before all the patches posted last time:

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d8819e3a1eb1..7e78c2798f2f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -154,6 +154,7 @@ config PPC
  select CLONE_BACKWARDS
  select DCACHE_WORD_ACCESS    if PPC64 && CPU_LITTLE_ENDIAN
  select DYNAMIC_FTRACE    if FUNCTION_TRACER
+    select DMA_DIRECT_OPS
  select EDAC_ATOMIC_SCRUB
  select EDAC_SUPPORT
  select GENERIC_ATOMIC64    if PPC32

Thanks. Result: PASEMI onboard ethernet works and the X5000 (P5020 
board) boots with the patch '0001-get_required_mask.patch'.


-- Christian


Next patch: '0002-swiotlb-dma_supported.patch' for the last good 
commit (977706f9755d2d697aa6f45b4f9f0e07516efeda).


The PASEMI onboard ethernet works and the X5000 (P5020 board) boots.

-- Christian



Next patch: '0003-nommu-dma_supported.patch'

No problems with the PASEMI onboard ethernet and the P5020 board boots.

-- Christian

[PATCH v6 24/27] syscall_get_arch: add "struct task_struct *" argument

2018-12-13 Thread Dmitry V. Levin

This argument is required to extend the generic ptrace API with
PTRACE_GET_SYSCALL_INFO request: syscall_get_arch() is going
to be called from ptrace_request() along with syscall_get_nr(),
syscall_get_arguments(), syscall_get_error(), and
syscall_get_return_value() functions with a tracee as their argument.

Reverts: 5e937a9ae913 ("syscall_get_arch: remove useless function arguments")
Reverts: 1002d94d3076 ("syscall.h: fix doc text for syscall_get_arch()")
Reviewed-by: Andy Lutomirski  # for x86
Reviewed-by: Palmer Dabbelt 
Acked-by: Paul Burton  # MIPS parts
Acked-by: Michael Ellerman  (powerpc)
Acked-by: Kees Cook  # seccomp parts
Acked-by: Mark Salter  # for the c6x bit
Cc: Eric Paris 
Cc: Paul Moore 
Cc: Richard Henderson 
Cc: Ivan Kokshaysky 
Cc: Matt Turner 
Cc: Vineet Gupta 
Cc: Russell King 
Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Aurelien Jacquiot 
Cc: Yoshinori Sato 
Cc: Richard Kuo 
Cc: Tony Luck 
Cc: Fenghua Yu 
Cc: Geert Uytterhoeven 
Cc: Michal Simek 
Cc: Greentime Hu 
Cc: Vincent Chen 
Cc: Ley Foon Tan 
Cc: Jonas Bonn 
Cc: Stefan Kristiansson 
Cc: Stafford Horne 
Cc: James E.J. Bottomley 
Cc: Helge Deller 
Cc: Albert Ou 
Cc: Martin Schwidefsky 
Cc: Heiko Carstens 
Cc: Rich Felker 
Cc: David S. Miller 
Cc: Guan Xuetao 
Cc: Jeff Dike 
Cc: Richard Weinberger 
Cc: Chris Zankel 
Cc: Max Filippov 
Cc: Arnd Bergmann 
Cc: Will Drewry 
Cc: Oleg Nesterov 
Cc: Elvira Khabirova 
Cc: Eugene Syromyatnikov 
Cc: Ralf Baechle 
Cc: James Hogan 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: H. Peter Anvin 
Cc: x...@kernel.org
Cc: linux-al...@vger.kernel.org
Cc: linux-snps-...@lists.infradead.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-c6x-...@linux-c6x.org
Cc: uclinux-h8-de...@lists.sourceforge.jp
Cc: linux-hexa...@vger.kernel.org
Cc: linux-i...@vger.kernel.org
Cc: linux-m...@lists.linux-m68k.org
Cc: linux-m...@vger.kernel.org
Cc: nios2-...@lists.rocketboards.org
Cc: openr...@lists.librecores.org
Cc: linux-par...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-ri...@lists.infradead.org
Cc: linux-s...@vger.kernel.org
Cc: linux...@vger.kernel.org
Cc: sparcli...@vger.kernel.org
Cc: linux...@lists.infradead.org
Cc: linux-xte...@linux-xtensa.org
Cc: linux-a...@vger.kernel.org
Cc: linux-au...@redhat.com
Signed-off-by: Dmitry V. Levin 
---

Notes:
v6: added more Acked-by
v5: added Cc
v2: cleaned up mips part, added Reviewed-by

 arch/alpha/include/asm/syscall.h  |  2 +-
 arch/arc/include/asm/syscall.h|  2 +-
 arch/arm/include/asm/syscall.h|  2 +-
 arch/arm64/include/asm/syscall.h  |  4 ++--
 arch/c6x/include/asm/syscall.h|  2 +-
 arch/csky/include/asm/syscall.h   |  2 +-
 arch/h8300/include/asm/syscall.h  |  2 +-
 arch/hexagon/include/asm/syscall.h|  2 +-
 arch/ia64/include/asm/syscall.h   |  2 +-
 arch/m68k/include/asm/syscall.h   |  2 +-
 arch/microblaze/include/asm/syscall.h |  2 +-
 arch/mips/include/asm/syscall.h   |  6 +++---
 arch/mips/kernel/ptrace.c |  2 +-
 arch/nds32/include/asm/syscall.h  |  2 +-
 arch/nios2/include/asm/syscall.h  |  2 +-
 arch/openrisc/include/asm/syscall.h   |  2 +-
 arch/parisc/include/asm/syscall.h |  4 ++--
 arch/powerpc/include/asm/syscall.h| 10 --
 arch/riscv/include/asm/syscall.h  |  2 +-
 arch/s390/include/asm/syscall.h   |  4 ++--
 arch/sh/include/asm/syscall_32.h  |  2 +-
 arch/sh/include/asm/syscall_64.h  |  2 +-
 arch/sparc/include/asm/syscall.h  |  5 +++--
 arch/unicore32/include/asm/syscall.h  |  2 +-
 arch/x86/include/asm/syscall.h|  8 +---
 arch/x86/um/asm/syscall.h |  2 +-
 arch/xtensa/include/asm/syscall.h |  2 +-
 include/asm-generic/syscall.h |  5 +++--
 kernel/auditsc.c  |  4 ++--
 kernel/seccomp.c  |  4 ++--
 30 files changed, 52 insertions(+), 42 deletions(-)

diff --git a/arch/alpha/include/asm/syscall.h b/arch/alpha/include/asm/syscall.h
index c67d6a69d7c8..20078aef0922 100644
--- a/arch/alpha/include/asm/syscall.h
+++ b/arch/alpha/include/asm/syscall.h
@@ -33,7 +33,7 @@ syscall_get_return_value(struct task_struct *task, struct 
pt_regs *regs)
 }
 
 static inline int
-syscall_get_arch(void)
+syscall_get_arch(struct task_struct *task)
 {
return AUDIT_ARCH_ALPHA;
 }
diff --git a/arch/arc/include/asm/syscall.h b/arch/arc/include/asm/syscall.h
index c7fc4c0c3bcb..caf2697ef5b7 100644
--- a/arch/arc/include/asm/syscall.h
+++ b/arch/arc/include/asm/syscall.h
@@ -70,7 +70,7 @@ syscall_get_arguments(struct task_struct *task, struct 
pt_regs *regs,
 }
 
 static inline int
-syscall_get_arch(void)
+syscall_get_arch(struct task_struct *task)
 {
return IS_ENABLED(CONFIG_ISA_ARCOMPACT)
? (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)
diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h
index 06dea6bce293..3940ceac0bdc 100644
--- a/arch/arm/include/as

[PATCH v6 00/27] ptrace: add PTRACE_GET_SYSCALL_INFO request

2018-12-13 Thread Dmitry V. Levin

PTRACE_GET_SYSCALL_INFO is a generic ptrace API that lets ptracer obtain
details of the syscall the tracee is blocked in.

There are two reasons for a special syscall-related ptrace request.

Firstly, with the current ptrace API there are cases when ptracer cannot
retrieve necessary information about syscalls.  Some examples include:
* The notorious int-0x80-from-64-bit-task issue.  See [1] for details.
In short, if a 64-bit task performs a syscall through int 0x80, its tracer
has no reliable means to find out that the syscall was, in fact,
a compat syscall, and misidentifies it.
* Syscall-enter-stop and syscall-exit-stop look the same for the tracer.
Common practice is to keep track of the sequence of ptrace-stops in order
not to mix the two syscall-stops up.  But it is not as simple as it looks;
for example, strace had a (just recently fixed) long-standing bug where
attaching strace to a tracee that is performing the execve system call
led to the tracer identifying the following syscall-exit-stop as
syscall-enter-stop, which messed up all the state tracking.
* Since the introduction of commit 84d77d3f06e7e8dea057d10e8ec77ad71f721be3
("ptrace: Don't allow accessing an undumpable mm"), both PTRACE_PEEKDATA
and process_vm_readv become unavailable when the process dumpable flag
is cleared.  On such architectures as ia64 this results in all syscall
arguments being unavailable for the tracer.

Secondly, ptracers also have to support a lot of arch-specific code for
obtaining information about the tracee.  For some architectures, this
requires a ptrace(PTRACE_PEEKUSER, ...) invocation for every syscall
argument and return value.

PTRACE_GET_SYSCALL_INFO returns the following structure:

struct ptrace_syscall_info {
__u8 op;/* PTRACE_SYSCALL_INFO_* */
__u32 arch __attribute__((__aligned__(sizeof(__u32;
__u64 instruction_pointer;
__u64 stack_pointer;
union {
struct {
__u64 nr;
__u64 args[6];
} entry;
struct {
__s64 rval;
__u8 is_error;
} exit;
struct {
__u64 nr;
__u64 args[6];
__u32 ret_data;
} seccomp;
};
};

The structure was chosen according to [2], except for the following
changes:
* seccomp substructure was added as a superset of entry substructure;
* the type of nr field was changed from int to __u64 because syscall
numbers are, as a practical matter, 64 bits;
* stack_pointer field was added along with instruction_pointer field
since it is readily available and can save the tracer from extra
PTRACE_GETREGS/PTRACE_GETREGSET calls;
* arch is always initialized to aid with tracing system calls
* such as execve();
* instruction_pointer and stack_pointer are always initialized
so they could be easily obtained for non-syscall stops;
* a boolean is_error field was added along with rval field, this way
the tracer can more reliably distinguish a return value
from an error value.

strace has been ported to PTRACE_GET_SYSCALL_INFO, you can find it
in [3] and [4].

[1] 
https://lore.kernel.org/lkml/ca+55afzcsvmddj9lh_gdbz1ozhyem6zrgpbdajnywm2lf_e...@mail.gmail.com/
[2] 
https://lore.kernel.org/lkml/caobl_7gm0n80n7j_dfw_eqyflyzq+sf4y2avsccv88tb3aw...@mail.gmail.com/
[3] https://github.com/strace/strace/commits/ldv/PTRACE_GET_SYSCALL_INFO
[4] https://gitlab.com/strace/strace/commits/ldv/PTRACE_GET_SYSCALL_INFO

---

Notes:
v6:
* Add syscall_get_arguments and syscall_set_arguments wrappers
  to asm-generic/syscall.h, requested by Geert.
* Change PTRACE_GET_SYSCALL_INFO return code: do not take trailing paddings
  into account, use the end of the last field of the structure being 
written.
* Change struct ptrace_syscall_info:
  * remove .frame_pointer field, is is not needed and not portable;
  * make .arch field explicitly aligned, remove no longer needed
padding before .arch field;
  * remove trailing pads, they are no longer needed.

v5:
* Merge separate series and patches into the single series.
* Change PTRACE_EVENTMSG_SYSCALL_{ENTRY,EXIT} values as requested by Oleg.
* Change struct ptrace_syscall_info: generalize instruction_pointer,
  stack_pointer, and frame_pointer fields by moving them from
  ptrace_syscall_info.{entry,seccomp} substructures to ptrace_syscall_info
  and initializing them for all stops.
* Add PTRACE_SYSCALL_INFO_NONE, set it when not in a syscall stop,
  so e.g. "strace -i" could use PTRACE_SYSCALL_INFO_SECCOMP to obtain
  instruction_pointer when the tracee is in a signal stop.
* Patch all remaining architectures to provide all necessary
  syscall_get_* functions.
* Make available for all architectures: do not conditionalize on
  CONFIG_HAVE_ARCH_TRACEHOOK since all syscall_get_*

Re: [PATCH] dma-direct: Fix return value of dma_direct_supported

2018-12-13 Thread Christoph Hellwig

On Thu, Dec 13, 2018 at 07:45:57PM +, Lendacky, Thomas wrote:
> So I think this needs to be __phys_to_dma() here. I only recently got a
> system that had a device where the driver only supported 32-bit DMA and
> found that when SME is active this returns 0 and causes the driver to fail
> to initialize. This is because the SME encryption bit (bit 47) is part of
> the check when using phys_to_dma(). During actual DMA when SME is active,
> bounce buffers will be used for anything that can't meet the 48-bit
> requirement. But for this test, using __phys_to_dma() should give the
> desired results, right?
> 
> If you agree with this, I'll submit a patch to make the change. I missed
> this in 4.19, so I'll need to submit something to stable, too. The only
> issue there is the 4.20 fix won't apply cleanly to 4.19.

Yes, please send a patch.  Please make sure it includes a code comment
that explains why the __-prefixed version is used.

Re: [PATCH] dma-direct: Fix return value of dma_direct_supported

2018-12-13 Thread Lendacky, Thomas

On 10/04/2018 10:13 AM, Alexander Duyck wrote:
> On Thu, Oct 4, 2018 at 4:25 AM Robin Murphy  wrote:
>>
>> On 04/10/18 00:48, Alexander Duyck wrote:
>>> It appears that in commit 9d7a224b463e ("dma-direct: always allow dma mask
>>> <= physiscal memory size") the logic of the test was changed from a "<" to
>>> a ">=" however I don't see any reason for that change. I am assuming that
>>> there was some additional change planned, specifically I suspect the logic
>>> was intended to be reversed and possibly used for a return. Since that is
>>> the case I have gone ahead and done that.
>>
>> Bah, seems I got hung up on the min_mask code above it and totally
>> overlooked that the condition itself got flipped. It probably also can't
>> help that it's an int return type, but treated as a bool by callers
>> rather than "0 for success" as int tends to imply in isolation.
>>
>> Anyway, paying a bit more attention this time, I think this looks like
>> the right fix - cheers Alex.
>>
>> Robin.
> 
> Thanks for the review.
> 
> - Alex
> 
> P.S. It looks like I forgot to add Christoph to the original mail
> since I had just copied the To and Cc from the original submission, so
> I added him to the Cc for this.
> 
>>> This addresses issues I had on my system that prevented me from booting
>>> with the above mentioned commit applied on an x86_64 system w/ Intel IOMMU.
>>>
>>> Fixes: 9d7a224b463e ("dma-direct: always allow dma mask <= physiscal memory 
>>> size")
>>> Signed-off-by: Alexander Duyck 
>>> ---
>>>   kernel/dma/direct.c |4 +---
>>>   1 file changed, 1 insertion(+), 3 deletions(-)
>>>
>>> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
>>> index 5a0806b5351b..65872f6c2e93 100644
>>> --- a/kernel/dma/direct.c
>>> +++ b/kernel/dma/direct.c
>>> @@ -301,9 +301,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
>>>
>>>   min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT);
>>>
>>> - if (mask >= phys_to_dma(dev, min_mask))
>>> - return 0;
>>> - return 1;
>>> + return mask >= phys_to_dma(dev, min_mask);

So I think this needs to be __phys_to_dma() here. I only recently got a
system that had a device where the driver only supported 32-bit DMA and
found that when SME is active this returns 0 and causes the driver to fail
to initialize. This is because the SME encryption bit (bit 47) is part of
the check when using phys_to_dma(). During actual DMA when SME is active,
bounce buffers will be used for anything that can't meet the 48-bit
requirement. But for this test, using __phys_to_dma() should give the
desired results, right?

If you agree with this, I'll submit a patch to make the change. I missed
this in 4.19, so I'll need to submit something to stable, too. The only
issue there is the 4.20 fix won't apply cleanly to 4.19.

Thanks,
Tom

>>>   }
>>>
>>>   int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
>>>
>>> ___
>>> iommu mailing list
>>> io...@lists.linux-foundation.org
>>> https://lists.linuxfoundation.org/mailman/listinfo/iommu
>>>
>> ___
>> iommu mailing list
>> io...@lists.linux-foundation.org
>> https://lists.linuxfoundation.org/mailman/listinfo/iommu

[PATCH 2/2] s390/pci: handle function enumeration after sriov enablement

2018-12-13 Thread Sebastian Ott

Implement pcibios_sriov_{add|del}_vfs as empty functions. VF
creation will be triggered by the hotplug code.

Signed-off-by: Sebastian Ott 
---
 arch/s390/pci/pci.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 9f6f392a4461..b5f8db652bf5 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -731,6 +731,17 @@ struct dev_pm_ops pcibios_pm_ops = {
 };
 #endif /* CONFIG_HIBERNATE_CALLBACKS */
 
+#ifdef CONFIG_PCI_IOV
+int pcibios_sriov_add_vfs(struct pci_dev *pdev, u16 num_vfs)
+{
+   return 0;
+}
+
+void pcibios_sriov_del_vfs(struct pci_dev *pdev)
+{
+}
+#endif
+
 static int zpci_alloc_domain(struct zpci_dev *zdev)
 {
if (zpci_unique_uid) {
-- 
2.16.4

[PATCH 1/2] PCI: provide pcibios_sriov_add_vfs

2018-12-13 Thread Sebastian Ott

Move VF detection and device creation code to weak functions
such that architectures can provide a different implementation.

Signed-off-by: Sebastian Ott 
---
 drivers/pci/iov.c   | 43 +++
 include/linux/pci.h |  2 ++
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 9616eca3182f..1bfdb4deafd7 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -252,6 +252,33 @@ int __weak pcibios_sriov_disable(struct pci_dev *pdev)
return 0;
 }
 
+int __weak pcibios_sriov_add_vfs(struct pci_dev *dev, u16 num_vfs)
+{
+   unsigned int i;
+   int rc;
+
+   for (i = 0; i < num_vfs; i++) {
+   rc = pci_iov_add_virtfn(dev, i);
+   if (rc)
+   goto failed;
+   }
+   return 0;
+failed:
+   while (i--)
+   pci_iov_remove_virtfn(dev, i);
+
+   return rc;
+}
+
+void __weak pcibios_sriov_del_vfs(struct pci_dev *dev)
+{
+   struct pci_sriov *iov = dev->sriov;
+   int i;
+
+   for (i = 0; i < iov->num_VFs; i++)
+   pci_iov_remove_virtfn(dev, i);
+}
+
 static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
 {
int rc;
@@ -337,21 +364,15 @@ static int sriov_enable(struct pci_dev *dev, int 
nr_virtfn)
msleep(100);
pci_cfg_access_unlock(dev);
 
-   for (i = 0; i < initial; i++) {
-   rc = pci_iov_add_virtfn(dev, i);
-   if (rc)
-   goto failed;
-   }
+   rc = pcibios_sriov_add_vfs(dev, initial);
+   if (rc)
+   goto err_pcibios;
 
kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
iov->num_VFs = nr_virtfn;
 
return 0;
 
-failed:
-   while (i--)
-   pci_iov_remove_virtfn(dev, i);
-
 err_pcibios:
iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
pci_cfg_access_lock(dev);
@@ -370,14 +391,12 @@ static int sriov_enable(struct pci_dev *dev, int 
nr_virtfn)
 
 static void sriov_disable(struct pci_dev *dev)
 {
-   int i;
struct pci_sriov *iov = dev->sriov;
 
if (!iov->num_VFs)
return;
 
-   for (i = 0; i < iov->num_VFs; i++)
-   pci_iov_remove_virtfn(dev, i);
+   pcibios_sriov_del_vfs(dev);
 
iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
pci_cfg_access_lock(dev);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 11c71c4ecf75..84ca3bcdac76 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2001,6 +2001,8 @@ void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool 
probe);
 /* Arch may override these (weak) */
 int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs);
 int pcibios_sriov_disable(struct pci_dev *pdev);
+int pcibios_sriov_add_vfs(struct pci_dev *dev, u16 num_vfs);
+void pcibios_sriov_del_vfs(struct pci_dev *dev);
 resource_size_t pcibios_iov_resource_alignment(struct pci_dev *dev, int resno);
 #else
 static inline int pci_iov_virtfn_bus(struct pci_dev *dev, int id)
-- 
2.16.4

Re: use generic DMA mapping code in powerpc V4

2018-12-13 Thread Christian Zigotzky


On 13 December 2018 at 2:34PM, Christian Zigotzky wrote:

On 13 December 2018 at 12:25PM, Christoph Hellwig wrote:

On Thu, Dec 13, 2018 at 12:19:26PM +0100, Christian Zigotzky wrote:

I tried it again but I get the following error message:

MODPOST vmlinux.o
arch/powerpc/kernel/dma-iommu.o: In function 
`.dma_iommu_get_required_mask':

(.text+0x274): undefined reference to `.dma_direct_get_required_mask'
make: *** [vmlinux] Error 1

Sorry, you need this one liner before all the patches posted last time:

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d8819e3a1eb1..7e78c2798f2f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -154,6 +154,7 @@ config PPC
  select CLONE_BACKWARDS
  select DCACHE_WORD_ACCESS    if PPC64 && CPU_LITTLE_ENDIAN
  select DYNAMIC_FTRACE    if FUNCTION_TRACER
+    select DMA_DIRECT_OPS
  select EDAC_ATOMIC_SCRUB
  select EDAC_SUPPORT
  select GENERIC_ATOMIC64    if PPC32

Thanks. Result: PASEMI onboard ethernet works and the X5000 (P5020 
board) boots with the patch '0001-get_required_mask.patch'.


-- Christian


Next patch: '0002-swiotlb-dma_supported.patch' for the last good commit 
(977706f9755d2d697aa6f45b4f9f0e07516efeda).


The PASEMI onboard ethernet works and the X5000 (P5020 board) boots.

-- Christian

[PATCH v6 25/27] powerpc/ptrace: replace ptrace_report_syscall() with a tracehook call

2018-12-13 Thread Dmitry V. Levin

From: Elvira Khabirova 

Arch code should use tracehook_*() helpers, as documented in
include/linux/tracehook.h, ptrace_report_syscall() is not expected to
be used outside that file.

The patch does not look very nice, but at least it is correct
and opens the way for PTRACE_GET_SYSCALL_INFO API.

Co-authored-by: Dmitry V. Levin 
Fixes: 5521eb4bca2d ("powerpc/ptrace: Add support for PTRACE_SYSEMU")
Signed-off-by: Elvira Khabirova 
Signed-off-by: Dmitry V. Levin 
[mpe: Take this as a minimal fix for 4.20, we'll rework it later]
Signed-off-by: Michael Ellerman 
---

Notes:
v6: this is the fix that was taken into powerpc tree
v5: reverted to a simple approach, compile- and run-tested
v4: rewritten to call tracehook_report_syscall_entry() once, compile-tested
v3: add a descriptive comment
v2: explicitly ignore tracehook_report_syscall_entry() return code

 arch/powerpc/kernel/ptrace.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index afb819f4ca68..714c3480c52d 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -3266,12 +3266,17 @@ long do_syscall_trace_enter(struct pt_regs *regs)
user_exit();
 
if (test_thread_flag(TIF_SYSCALL_EMU)) {
-   ptrace_report_syscall(regs);
/*
+* A nonzero return code from tracehook_report_syscall_entry()
+* tells us to prevent the syscall execution, but we are not
+* going to execute it anyway.
+*
 * Returning -1 will skip the syscall execution. We want to
 * avoid clobbering any register also, thus, not 'gotoing'
 * skip label.
 */
+   if (tracehook_report_syscall_entry(regs))
+   ;
return -1;
}
 
-- 
ldv

[PATCH v6 18/27] powerpc: define syscall_get_error()

2018-12-13 Thread Dmitry V. Levin

syscall_get_error() is required to be implemented on this
architecture in addition to already implemented syscall_get_nr(),
syscall_get_arguments(), syscall_get_return_value(), and
syscall_get_arch() functions in order to extend the generic
ptrace API with PTRACE_GET_SYSCALL_INFO request.

Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Oleg Nesterov 
Cc: Andy Lutomirski 
Cc: Elvira Khabirova 
Cc: Eugene Syromyatnikov 
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Dmitry V. Levin 
---

Notes:
v6: unchanged

v5:
This change has been tested with
tools/testing/selftests/ptrace/get_syscall_info.c and strace,
so it's correct from PTRACE_GET_SYSCALL_INFO point of view.

This cast doubts on commit v4.3-rc1~86^2~81 that changed
syscall_set_return_value() in a way that doesn't quite match
syscall_get_error(), but syscall_set_return_value() is out
of scope of this series, so I'll just let you know my concerns.

 arch/powerpc/include/asm/syscall.h | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/include/asm/syscall.h 
b/arch/powerpc/include/asm/syscall.h
index ab9f3f0a8637..1d03e753391d 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -39,6 +39,16 @@ static inline void syscall_rollback(struct task_struct *task,
regs->gpr[3] = regs->orig_gpr3;
 }

+static inline long syscall_get_error(struct task_struct *task,
+struct pt_regs *regs)
+{
+   /*
+* If the system call failed,
+* regs->gpr[3] contains a positive ERRORCODE.
+*/
+   return (regs->ccr & 0x1000UL) ? -regs->gpr[3] : 0;
+}
+
 static inline long syscall_get_return_value(struct task_struct *task,
struct pt_regs *regs)
 {
-- 
ldv

Re: [PATCH v1 03/13] powerpc/mm/32s: rework mmu_mapin_ram()

2018-12-13 Thread Christophe Leroy


Hi Again,

Le 13/12/2018 à 13:16, Christophe Leroy a écrit :

Hi,

On 12/03/2018 09:55 PM, Jonathan Neuschäfer wrote:

Hi,

On Thu, Nov 29, 2018 at 07:00:16PM +, Christophe Leroy wrote:

This patch reworks mmu_mapin_ram() to be more generic and map as much
blocks as possible. It now supports blocks not starting at address 0.

It scans DBATs array to find free ones instead of forcing the use of
BAT2 and BAT3.

Signed-off-by: Christophe Leroy 
---


I've just tested this series on my Wii, and starting from this patch
(03/13), it hangs at the following lines of output:

[    0.00] printk: bootconsole [udbg0] enabled
[    0.00] Total memory = 319MB; using 1024kB for hash table (at 
(ptrval))


Before this patch it looks like this and boots to userspace:

[    0.00] printk: bootconsole [udbg0] enabled
[    0.00] Total memory = 319MB; using 1024kB for hash table (at 
(ptrval))
[    0.00] Linux version 4.20.0-rc5-wii-00022-gfbb911b84755 
(jn@longitude) (gcc version 8.2.0 (Debian 8.2.0-9)) #1337 PREEMPT Mon 
Dec 3 21:49:02 CET 2018

ug_udbg_init: early -> final
usbgecko_udbg: ready
[    0.00] Using wii machine description


Can you tell/provide the .config and dts used ?

You seem to have 319MB RAM wherease arch/powerpc/boot/dts/wii.dts only 
has 88MB Memory:


 memory {
     device_type = "memory";
     reg = <0x 0x0180    /* MEM1 24MB 1T-SRAM */
    0x1000 0x0400>;    /* MEM2 64MB GDDR3 */
 };


Putting the same description in my mpc832x board DTS and doing a few 
hacks to get the WII functions called, I get the following:


[0.00] Top of RAM: 0x1400, Total RAM: 0x580
[0.00] Memory hole size: 232MB
[0.00] Zone ranges:
[0.00]   DMA  [mem 0x-0x13ff]
[0.00]   Normal   empty
[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x-0x017f]
[0.00]   node   0: [mem 0x1000-0x13ff]
[0.00] Initmem setup node 0 [mem 
0x-0x13ff]

[0.00] On node 0 totalpages: 22528
[0.00]   DMA zone: 640 pages used for memmap
[0.00]   DMA zone: 0 pages reserved
[0.00]   DMA zone: 22528 pages, LIFO batch:3
[0.00] pcpu-alloc: s0 r0 d32768 u32768 alloc=1*32768
[0.00] pcpu-alloc: [0] 0
[0.00] Built 1 zonelists, mobility grouping on.  Total pages: 21888
[0.00] Kernel command line: loglevel=7 
ip=192.168.2.5:192.168.2.2::255.0
[0.00] Dentry cache hash table entries: 16384 (order: 4, 65536 
bytes)

[0.00] Inode-cache hash table entries: 8192 (order: 3, 32768 bytes)
[0.00] Memory: 77060K/90112K available (6548K kernel code, 1156K 
rwdata,

[0.00] Kernel virtual memory layout:
[0.00]   * 0xfffdf000..0xf000  : fixmap
[0.00]   * 0xfdffd000..0xfe00  : early ioremap
[0.00]   * 0xd500..0xfdffd000  : vmalloc & ioremap




root@vgoippro:~# cat /sys/kernel/debug/powerpc/block_address_translation
---[ Instruction Block Address Translation ]---
0: 0xc000-0xc0ff 0x Kernel EXEC coherent
1: -
2: 0xc100-0xc17f 0x0100 Kernel EXEC coherent
3: -
4: 0xd000-0xd3ff 0x1000 Kernel EXEC coherent
5: -
6: -
7: -

---[ Data Block Address Translation ]---
0: 0xc000-0xc0ff 0x Kernel RW coherent
1: 0xfffe-0x 0x0d00 Kernel RW no cache guarded
2: 0xc100-0xc17f 0x0100 Kernel RW coherent
3: -
4: 0xd000-0xd3ff 0x1000 Kernel RW coherent
5: -
6: -
7: -


Could you please provide the dmesg and 
/sys/kernel/debug/powerpc/block_address_translation from before this 
patch, so that we can compare and identify the differences if any ?


Thanks
Christophe





Christophe


...

I've tested at patch 1, 2, 3, 4, and 13, so I don't know if it works
somewhere in the middle, but probably not.

(And in case you're wondering about the 22 in the version string: Those
are mostly patches that give me a serial console.)

I'm not sure what is going on, because I haven't looked closely at the
patches or tried to debug the problem. If you have some debugging tips,
I can try them.


Jonathan Neuschäfer

[PATCH] powerpc/prom: fix early DEBUG messages

2018-12-13 Thread Christophe Leroy

This patch fixes early DEBUG messages in prom.c:
- Use %px instead of %p to see the addresses
- Use %x instead of %llx when phys_addr_t is not 64 bits.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/prom.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index fe758cedb93f..d8e56e03c9c6 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -129,7 +129,7 @@ static void __init move_device_tree(void)
p = __va(memblock_phys_alloc(size, PAGE_SIZE));
memcpy(p, initial_boot_params, size);
initial_boot_params = p;
-   DBG("Moved device tree to 0x%p\n", p);
+   DBG("Moved device tree to 0x%px\n", p);
}
 
DBG("<- move_device_tree\n");
@@ -689,7 +689,7 @@ void __init early_init_devtree(void *params)
 {
phys_addr_t limit;
 
-   DBG(" -> early_init_devtree(%p)\n", params);
+   DBG(" -> early_init_devtree(%px)\n", params);
 
/* Too early to BUG_ON(), do it by hand */
if (!early_init_dt_verify(params))
@@ -749,7 +749,11 @@ void __init early_init_devtree(void *params)
memblock_allow_resize();
memblock_dump_all();
 
+#ifdef CONFIG_PHYS_64BIT
DBG("Phys. mem: %llx\n", memblock_phys_mem_size());
+#else
+   DBG("Phys. mem: %x\n", memblock_phys_mem_size());
+#endif
 
/* We may need to relocate the flat tree, do it now.
 * FIXME .. and the initrd too? */
-- 
2.13.3

Re: use generic DMA mapping code in powerpc V4

2018-12-13 Thread Christian Zigotzky


On 13 December 2018 at 12:25PM, Christoph Hellwig wrote:

On Thu, Dec 13, 2018 at 12:19:26PM +0100, Christian Zigotzky wrote:

I tried it again but I get the following error message:

MODPOST vmlinux.o
arch/powerpc/kernel/dma-iommu.o: In function `.dma_iommu_get_required_mask':
(.text+0x274): undefined reference to `.dma_direct_get_required_mask'
make: *** [vmlinux] Error 1

Sorry, you need this one liner before all the patches posted last time:

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d8819e3a1eb1..7e78c2798f2f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -154,6 +154,7 @@ config PPC
select CLONE_BACKWARDS
select DCACHE_WORD_ACCESS   if PPC64 && CPU_LITTLE_ENDIAN
select DYNAMIC_FTRACE   if FUNCTION_TRACER
+   select DMA_DIRECT_OPS
select EDAC_ATOMIC_SCRUB
select EDAC_SUPPORT
select GENERIC_ATOMIC64 if PPC32

Thanks. Result: PASEMI onboard ethernet works and the X5000 (P5020 
board) boots with the patch '0001-get_required_mask.patch'.


-- Christian

Re: [PATCH v1 03/13] powerpc/mm/32s: rework mmu_mapin_ram()

2018-12-13 Thread Christophe Leroy


Hi,

On 12/03/2018 09:55 PM, Jonathan Neuschäfer wrote:

Hi,

On Thu, Nov 29, 2018 at 07:00:16PM +, Christophe Leroy wrote:

This patch reworks mmu_mapin_ram() to be more generic and map as much
blocks as possible. It now supports blocks not starting at address 0.

It scans DBATs array to find free ones instead of forcing the use of
BAT2 and BAT3.

Signed-off-by: Christophe Leroy 
---


I've just tested this series on my Wii, and starting from this patch
(03/13), it hangs at the following lines of output:

[0.00] printk: bootconsole [udbg0] enabled
[0.00] Total memory = 319MB; using 1024kB for hash table (at (ptrval))

Before this patch it looks like this and boots to userspace:

[0.00] printk: bootconsole [udbg0] enabled
[0.00] Total memory = 319MB; using 1024kB for hash table (at (ptrval))
[0.00] Linux version 4.20.0-rc5-wii-00022-gfbb911b84755 (jn@longitude) 
(gcc version 8.2.0 (Debian 8.2.0-9)) #1337 PREEMPT Mon Dec 3 21:49:02 CET 2018
ug_udbg_init: early -> final
usbgecko_udbg: ready
[0.00] Using wii machine description


Can you tell/provide the .config and dts used ?

You seem to have 319MB RAM wherease arch/powerpc/boot/dts/wii.dts only 
has 88MB Memory:


memory {
device_type = "memory";
reg = <0x 0x0180 /* MEM1 24MB 1T-SRAM */
   0x1000 0x0400>;   /* MEM2 64MB GDDR3 */
};

Christophe


...

I've tested at patch 1, 2, 3, 4, and 13, so I don't know if it works
somewhere in the middle, but probably not.

(And in case you're wondering about the 22 in the version string: Those
are mostly patches that give me a serial console.)

I'm not sure what is going on, because I haven't looked closely at the
patches or tried to debug the problem. If you have some debugging tips,
I can try them.


Jonathan Neuschäfer

Re: use generic DMA mapping code in powerpc V4

2018-12-13 Thread Christoph Hellwig

On Thu, Dec 13, 2018 at 12:19:26PM +0100, Christian Zigotzky wrote:
> I tried it again but I get the following error message:
>
> MODPOST vmlinux.o
> arch/powerpc/kernel/dma-iommu.o: In function `.dma_iommu_get_required_mask':
> (.text+0x274): undefined reference to `.dma_direct_get_required_mask'
> make: *** [vmlinux] Error 1

Sorry, you need this one liner before all the patches posted last time:

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d8819e3a1eb1..7e78c2798f2f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -154,6 +154,7 @@ config PPC
select CLONE_BACKWARDS
select DCACHE_WORD_ACCESS   if PPC64 && CPU_LITTLE_ENDIAN
select DYNAMIC_FTRACE   if FUNCTION_TRACER
+   select DMA_DIRECT_OPS
select EDAC_ATOMIC_SCRUB
select EDAC_SUPPORT
select GENERIC_ATOMIC64 if PPC32

Re: use generic DMA mapping code in powerpc V4

2018-12-13 Thread Christian Zigotzky


On 13 December 2018 at 10:47AM, Christian Zigotzky wrote:

On 13 December 2018 at 10:10AM, Christoph Hellwig wrote:

On Thu, Dec 13, 2018 at 09:41:50AM +0100, Christian Zigotzky wrote:
Today I tried the first patch (0001-get_required_mask.patch) with 
the last
good commit (977706f9755d2d697aa6f45b4f9f0e07516efeda). 
Unfortunately this

patch is already included in the last good commit
(977706f9755d2d697aa6f45b4f9f0e07516efeda). I will try the next patch.

Hmm, I don't think this is the case.  This is my local git log output:

commit 83a4b87de6bc6a75b500c9959de88e2157fbcd7c
Author: Christoph Hellwig 
Date:   Wed Dec 12 15:07:49 2018 +0100

 get_required_mask

commit 977706f9755d2d697aa6f45b4f9f0e07516efeda
Author: Christoph Hellwig 
Date:   Sat Nov 10 22:34:27 2018 +0100

 powerpc/dma: remove dma_nommu_mmap_coherent

I've also pushed a git branch with these out to:

 git://git.infradead.org/users/hch/misc.git powerpc-dma.5-debug

Sorry Christioph. I was wrong. The first patch isn't included in the 
last good commit. I will try it again. I can only test beside my main 
work. That means it takes longer.


-- Christian



I tried it again but I get the following error message:

MODPOST vmlinux.o
arch/powerpc/kernel/dma-iommu.o: In function `.dma_iommu_get_required_mask':
(.text+0x274): undefined reference to `.dma_direct_get_required_mask'
make: *** [vmlinux] Error 1

Re: [PATCH v1 2/2] powerpc/pseries: Add debugfs interface to retrieve VPHN info

2018-12-13 Thread Naveen N. Rao


Hi Michael,

Naveen N. Rao wrote:

Add debugfs interface to retrieve associativity information for lpar
vcpus (debugfs/vphn/lpar) and the hypervisor cpus (debugfs/vphn/hyp).
This information is useful to derive various metrics, including the vcpu
dispatch statistics in a SPLPAR environment.


Any thoughts on this approach vs. adding a tracepoint?


Thanks,
Naveen



Signed-off-by: Naveen N. Rao 
---
 arch/powerpc/mm/numa.c | 105 +
 1 file changed, 105 insertions(+)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 6677a578f18d..f0b0e87016e6 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static int numa_enabled = 1;
 
@@ -1089,6 +1090,107 @@ static long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity)

return rc;
 }
 
+#ifdef CONFIG_DEBUG_FS

+static ssize_t vphn_lpar_cpu_file_read(struct file *filp, char __user *buf,
+   size_t len, loff_t *pos)
+{
+   int cpu = (long)filp->private_data;
+   __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+   int hwcpu = get_hard_smp_processor_id(cpu);
+   long int rc;
+
+   if (len != sizeof(associativity))
+   return -EINVAL;
+
+   rc = hcall_vphn(hwcpu, 1, associativity);
+   if (rc)
+   return -EFAULT;
+
+   rc = copy_to_user(buf, &associativity, sizeof(associativity));
+   if (rc)
+   return -EFAULT;
+
+   return sizeof(associativity);
+}
+
+static ssize_t vphn_hyp_cpu_file_read(struct file *filp, char __user *buf,
+   size_t len, loff_t *pos)
+{
+   int cpu = (long)filp->private_data;
+   __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+   long int rc;
+
+   if (len != sizeof(associativity))
+   return -EINVAL;
+
+   rc = hcall_vphn(cpu, 2, associativity);
+   if (rc)
+   return -EFAULT;
+
+   rc = copy_to_user(buf, &associativity, sizeof(associativity));
+   if (rc)
+   return -EFAULT;
+
+   return sizeof(associativity);
+}
+
+static const struct file_operations vphn_lpar_cpu_fops = {
+   .open   = simple_open,
+   .read   = vphn_lpar_cpu_file_read,
+   .llseek = no_llseek,
+};
+
+static const struct file_operations vphn_hyp_cpu_fops = {
+   .open   = simple_open,
+   .read   = vphn_hyp_cpu_file_read,
+   .llseek = no_llseek,
+};
+
+static int debug_init_vphn_entries(void)
+{
+   struct dentry *vphn_dir, *vphn_lpar_dir, *vphn_hyp_dir;
+   struct dentry *vphn_lpar_cpu_file, *vphn_hyp_cpu_file;
+   long cpu;
+   char name[10];
+
+   if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+   return 0;
+
+   vphn_dir = debugfs_create_dir("vphn", powerpc_debugfs_root);
+   if (!vphn_dir) {
+   pr_warn("%s: can't create vphn debugfs root dir\n", __func__);
+   return -ENOMEM;
+   }
+
+   vphn_lpar_dir = debugfs_create_dir("lpar", vphn_dir);
+   vphn_hyp_dir = debugfs_create_dir("hyp", vphn_dir);
+   if (!vphn_lpar_dir || !vphn_hyp_dir) {
+   pr_warn("%s: can't create vphn dir\n", __func__);
+   goto err_remove_dir;
+   }
+
+   for_each_possible_cpu(cpu) {
+   sprintf(name, "cpu-%ld", cpu);
+   vphn_lpar_cpu_file = debugfs_create_file(name, 0400,
+   vphn_lpar_dir, (void *)cpu, 
&vphn_lpar_cpu_fops);
+   vphn_hyp_cpu_file = debugfs_create_file(name, 0400,
+   vphn_hyp_dir, (void *)cpu, &vphn_hyp_cpu_fops);
+   if (!vphn_lpar_cpu_file || !vphn_hyp_cpu_file) {
+   pr_warn("%s: can't create vphn cpu file\n", __func__);
+   goto err_remove_dir;
+   }
+   }
+
+   return 0;
+
+err_remove_dir:
+   debugfs_remove_recursive(vphn_dir);
+   return -ENOMEM;
+}
+#else
+static int debug_init_vphn_entries(void) { return 0; }
+#endif /* CONFIG_DEBUG_FS */
+
 /*
  * Change polling interval for associativity changes.
  */
@@ -1619,6 +1721,9 @@ static int topology_update_init(void)
if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
return -ENOMEM;
 
+	if (!debug_init_vphn_entries())

+   return -ENOMEM;
+
topology_inited = 1;
return 0;
 }
--
2.19.2

Re: use generic DMA mapping code in powerpc V4

2018-12-13 Thread Christian Zigotzky


On 13 December 2018 at 10:10AM, Christoph Hellwig wrote:

On Thu, Dec 13, 2018 at 09:41:50AM +0100, Christian Zigotzky wrote:

Today I tried the first patch (0001-get_required_mask.patch) with the last
good commit (977706f9755d2d697aa6f45b4f9f0e07516efeda). Unfortunately this
patch is already included in the last good commit
(977706f9755d2d697aa6f45b4f9f0e07516efeda). I will try the next patch.

Hmm, I don't think this is the case.  This is my local git log output:

commit 83a4b87de6bc6a75b500c9959de88e2157fbcd7c
Author: Christoph Hellwig 
Date:   Wed Dec 12 15:07:49 2018 +0100

 get_required_mask

commit 977706f9755d2d697aa6f45b4f9f0e07516efeda
Author: Christoph Hellwig 
Date:   Sat Nov 10 22:34:27 2018 +0100

 powerpc/dma: remove dma_nommu_mmap_coherent

I've also pushed a git branch with these out to:

 git://git.infradead.org/users/hch/misc.git powerpc-dma.5-debug

Sorry Christioph. I was wrong. The first patch isn't included in the 
last good commit. I will try it again. I can only test beside my main 
work. That means it takes longer.


-- Christian

Re: use generic DMA mapping code in powerpc V4

2018-12-13 Thread Christoph Hellwig

On Thu, Dec 13, 2018 at 09:41:50AM +0100, Christian Zigotzky wrote:
> Today I tried the first patch (0001-get_required_mask.patch) with the last 
> good commit (977706f9755d2d697aa6f45b4f9f0e07516efeda). Unfortunately this 
> patch is already included in the last good commit 
> (977706f9755d2d697aa6f45b4f9f0e07516efeda). I will try the next patch.

Hmm, I don't think this is the case.  This is my local git log output:

commit 83a4b87de6bc6a75b500c9959de88e2157fbcd7c
Author: Christoph Hellwig 
Date:   Wed Dec 12 15:07:49 2018 +0100

get_required_mask

commit 977706f9755d2d697aa6f45b4f9f0e07516efeda
Author: Christoph Hellwig 
Date:   Sat Nov 10 22:34:27 2018 +0100

powerpc/dma: remove dma_nommu_mmap_coherent

I've also pushed a git branch with these out to:

git://git.infradead.org/users/hch/misc.git powerpc-dma.5-debug

[PATCH v5 5/5] powerpc: generate uapi header and system call table files

2018-12-13 Thread Firoz Khan

System call table generation script must be run to gener-
ate unistd_32/64.h and syscall_table_32/64/c32/spu.h files.
This patch will have changes which will invokes the script.

This patch will generate unistd_32/64.h and syscall_table-
_32/64/c32/spu.h files by the syscall table generation
script invoked by parisc/Makefile and the generated files
against the removed files must be identical.

The generated uapi header file will be included in uapi/-
asm/unistd.h and generated system call table header file
will be included by kernel/systbl.S file.

Signed-off-by: Firoz Khan 
---
 arch/powerpc/Makefile   |   3 +
 arch/powerpc/include/asm/Kbuild |   4 +
 arch/powerpc/include/asm/systbl.h   | 395 
 arch/powerpc/include/uapi/asm/Kbuild|   2 +
 arch/powerpc/include/uapi/asm/unistd.h  | 392 +--
 arch/powerpc/kernel/Makefile|  10 -
 arch/powerpc/kernel/systbl.S|  52 +---
 arch/powerpc/kernel/systbl_chk.c|  60 -
 arch/powerpc/platforms/cell/spu_callbacks.c |  17 +-
 9 files changed, 26 insertions(+), 909 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/systbl.h
 delete mode 100644 arch/powerpc/kernel/systbl_chk.c

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 8a2ce14..34897191 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -402,6 +402,9 @@ archclean:
 
 archprepare: checkbin
 
+archheaders:
+   $(Q)$(MAKE) $(build)=arch/powerpc/kernel/syscalls all
+
 ifdef CONFIG_STACKPROTECTOR
 prepare: stack_protector_prepare
 
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 3196d22..77ff7fb 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -1,3 +1,7 @@
+generated-y += syscall_table_32.h
+generated-y += syscall_table_64.h
+generated-y += syscall_table_c32.h
+generated-y += syscall_table_spu.h
 generic-y += div64.h
 generic-y += export.h
 generic-y += irq_regs.h
diff --git a/arch/powerpc/include/asm/systbl.h 
b/arch/powerpc/include/asm/systbl.h
deleted file mode 100644
index c4321b9..000
--- a/arch/powerpc/include/asm/systbl.h
+++ /dev/null
@@ -1,395 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * List of powerpc syscalls. For the meaning of the _SPU suffix see
- * arch/powerpc/platforms/cell/spu_callbacks.c
- */
-
-SYSCALL(restart_syscall)
-SYSCALL(exit)
-PPC_SYS(fork)
-SYSCALL_SPU(read)
-SYSCALL_SPU(write)
-COMPAT_SYS_SPU(open)
-SYSCALL_SPU(close)
-SYSCALL_SPU(waitpid)
-SYSCALL_SPU(creat)
-SYSCALL_SPU(link)
-SYSCALL_SPU(unlink)
-COMPAT_SYS(execve)
-SYSCALL_SPU(chdir)
-COMPAT_SYS_SPU(time)
-SYSCALL_SPU(mknod)
-SYSCALL_SPU(chmod)
-SYSCALL_SPU(lchown)
-SYSCALL(ni_syscall)
-OLDSYS(stat)
-COMPAT_SYS_SPU(lseek)
-SYSCALL_SPU(getpid)
-COMPAT_SYS(mount)
-SYSX(sys_ni_syscall,sys_oldumount,sys_oldumount)
-SYSCALL_SPU(setuid)
-SYSCALL_SPU(getuid)
-COMPAT_SYS_SPU(stime)
-COMPAT_SYS(ptrace)
-SYSCALL_SPU(alarm)
-OLDSYS(fstat)
-SYSCALL(pause)
-COMPAT_SYS(utime)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(access)
-SYSCALL_SPU(nice)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(sync)
-SYSCALL_SPU(kill)
-SYSCALL_SPU(rename)
-SYSCALL_SPU(mkdir)
-SYSCALL_SPU(rmdir)
-SYSCALL_SPU(dup)
-SYSCALL_SPU(pipe)
-COMPAT_SYS_SPU(times)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(brk)
-SYSCALL_SPU(setgid)
-SYSCALL_SPU(getgid)
-SYSCALL(signal)
-SYSCALL_SPU(geteuid)
-SYSCALL_SPU(getegid)
-SYSCALL(acct)
-SYSCALL(umount)
-SYSCALL(ni_syscall)
-COMPAT_SYS_SPU(ioctl)
-COMPAT_SYS_SPU(fcntl)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(setpgid)
-SYSCALL(ni_syscall)
-SYSX(sys_ni_syscall,sys_olduname,sys_olduname)
-SYSCALL_SPU(umask)
-SYSCALL_SPU(chroot)
-COMPAT_SYS(ustat)
-SYSCALL_SPU(dup2)
-SYSCALL_SPU(getppid)
-SYSCALL_SPU(getpgrp)
-SYSCALL_SPU(setsid)
-SYS32ONLY(sigaction)
-SYSCALL_SPU(sgetmask)
-SYSCALL_SPU(ssetmask)
-SYSCALL_SPU(setreuid)
-SYSCALL_SPU(setregid)
-SYS32ONLY(sigsuspend)
-SYSX(sys_ni_syscall,compat_sys_sigpending,sys_sigpending)
-SYSCALL_SPU(sethostname)
-COMPAT_SYS_SPU(setrlimit)
-SYSX(sys_ni_syscall,compat_sys_old_getrlimit,sys_old_getrlimit)
-COMPAT_SYS_SPU(getrusage)
-COMPAT_SYS_SPU(gettimeofday)
-COMPAT_SYS_SPU(settimeofday)
-SYSCALL_SPU(getgroups)
-SYSCALL_SPU(setgroups)
-SYSX(sys_ni_syscall,sys_ni_syscall,ppc_select)
-SYSCALL_SPU(symlink)
-OLDSYS(lstat)
-SYSCALL_SPU(readlink)
-SYSCALL(uselib)
-SYSCALL(swapon)
-SYSCALL(reboot)
-SYSX(sys_ni_syscall,compat_sys_old_readdir,sys_old_readdir)
-SYSCALL_SPU(mmap)
-SYSCALL_SPU(munmap)
-COMPAT_SYS_SPU(truncate)
-COMPAT_SYS_SPU(ftruncate)
-SYSCALL_SPU(fchmod)
-SYSCALL_SPU(fchown)
-SYSCALL_SPU(getpriority)
-SYSCALL_SPU(setpriority)
-SYSCALL(ni_syscall)
-COMPAT_SYS(statfs)
-COMPAT_SYS(fstatfs)
-SYSCALL(ni_syscall)
-COMPAT_SYS_SPU(socketcall)
-SYSCALL_SPU(syslog)
-COMPAT_SYS_SPU(setitimer)
-COMPAT_SYS_SPU(getitimer)
-COMPAT_SYS_SPU(newstat)
-COMPAT_SYS_SPU(newlstat)
-COMPAT_SYS_SPU(newfstat)
-SYSX(sys_ni_syscall,sys_uname,sys_uname)
-SYSCALL(ni_sysca

[PATCH v5 4/5] powerpc: split compat syscall table out from native table

2018-12-13 Thread Firoz Khan

PowerPC uses a syscall table with native and compat calls
interleaved, which is a slightly simpler way to define two
matching tables.

As we move to having the tables generated, that advantage
is no longer important, but the interleaved table gets in
the way of using the same scripts as on the other archit-
ectures.

Split out a new compat_sys_call_table symbol that contains
all the compat calls, and leave the main table for the nat-
ive calls, to more closely match the method we use every-
where else.

Suggested-by: Arnd Bergmann 
Signed-off-by: Firoz Khan 
---
 arch/powerpc/include/asm/syscall.h |  3 +--
 arch/powerpc/kernel/entry_64.S |  7 +--
 arch/powerpc/kernel/systbl.S   | 35 ---
 arch/powerpc/kernel/vdso.c |  7 +--
 4 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/syscall.h 
b/arch/powerpc/include/asm/syscall.h
index ab9f3f0..1a0e7a8 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -18,9 +18,8 @@
 #include 
 
 /* ftrace syscalls requires exporting the sys_call_table */
-#ifdef CONFIG_FTRACE_SYSCALLS
 extern const unsigned long sys_call_table[];
-#endif /* CONFIG_FTRACE_SYSCALLS */
+extern const unsigned long compat_sys_call_table[];
 
 static inline int syscall_get_nr(struct task_struct *task, struct pt_regs 
*regs)
 {
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 7b1693a..5574d92 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -54,6 +54,9 @@
 SYS_CALL_TABLE:
.tc sys_call_table[TC],sys_call_table
 
+COMPAT_SYS_CALL_TABLE:
+   .tc compat_sys_call_table[TC],compat_sys_call_table
+
 /* This value is used to mark exception frames on the stack. */
 exception_marker:
.tc ID_EXC_MARKER[TC],STACK_FRAME_REGS_MARKER
@@ -173,7 +176,7 @@ system_call:/* label this so stack 
traces look sane */
ld  r11,SYS_CALL_TABLE@toc(2)
andis.  r10,r10,_TIF_32BIT@h
beq 15f
-   addir11,r11,8   /* use 32-bit syscall entries */
+   ld  r11,COMPAT_SYS_CALL_TABLE@toc(2)
clrldi  r3,r3,32
clrldi  r4,r4,32
clrldi  r5,r5,32
@@ -181,7 +184,7 @@ system_call:/* label this so stack 
traces look sane */
clrldi  r7,r7,32
clrldi  r8,r8,32
 15:
-   slwir0,r0,4
+   slwir0,r0,3
 
barrier_nospec_asm
/*
diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
index 9ff1913..0fa84e1 100644
--- a/arch/powerpc/kernel/systbl.S
+++ b/arch/powerpc/kernel/systbl.S
@@ -17,13 +17,13 @@
 #include 
 
 #ifdef CONFIG_PPC64
-#define SYSCALL(func)  .8byte  DOTSYM(sys_##func),DOTSYM(sys_##func)
-#define COMPAT_SYS(func)   .8byte  
DOTSYM(sys_##func),DOTSYM(compat_sys_##func)
-#define PPC_SYS(func)  .8byte  DOTSYM(ppc_##func),DOTSYM(ppc_##func)
-#define OLDSYS(func)   .8byte  
DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall)
-#define SYS32ONLY(func).8byte  
DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func)
-#define PPC64ONLY(func).8byte  
DOTSYM(ppc_##func),DOTSYM(sys_ni_syscall)
-#define SYSX(f, f3264, f32).8byte  DOTSYM(f),DOTSYM(f3264)
+#define SYSCALL(func)  .8byte  DOTSYM(sys_##func)
+#define COMPAT_SYS(func)   .8byte  DOTSYM(sys_##func)
+#define PPC_SYS(func)  .8byte  DOTSYM(ppc_##func)
+#define OLDSYS(func)   .8byte  DOTSYM(sys_ni_syscall)
+#define SYS32ONLY(func).8byte  DOTSYM(sys_ni_syscall)
+#define PPC64ONLY(func).8byte  DOTSYM(ppc_##func)
+#define SYSX(f, f3264, f32).8byte  DOTSYM(f)
 #else
 #define SYSCALL(func)  .long   sys_##func
 #define COMPAT_SYS(func)   .long   sys_##func
@@ -46,6 +46,27 @@
 
 .globl sys_call_table
 sys_call_table:
+#include 
+
+#undef SYSCALL
+#undef COMPAT_SYS
+#undef PPC_SYS
+#undef OLDSYS
+#undef SYS32ONLY
+#undef PPC64ONLY
+#undef SYSX
 
+#ifdef CONFIG_COMPAT
+#define SYSCALL(func)  .8byte  DOTSYM(sys_##func)
+#define COMPAT_SYS(func)   .8byte  DOTSYM(compat_sys_##func)
+#define PPC_SYS(func)  .8byte  DOTSYM(ppc_##func)
+#define OLDSYS(func)   .8byte  DOTSYM(sys_ni_syscall)
+#define SYS32ONLY(func).8byte  DOTSYM(compat_sys_##func)
+#define PPC64ONLY(func).8byte  DOTSYM(sys_ni_syscall)
+#define SYSX(f, f3264, f32).8byte  DOTSYM(f3264)
+
+.globl compat_sys_call_table
+compat_sys_call_table:
 #define compat_sys_sigsuspend  sys_sigsuspend
 #include 
+#endif
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 65b3bdb..7725a97 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -671,15 +671,18 @@ static void __init vdso_setup_syscall_map(void)
 {
unsigned int i;
extern unsigned long *sys_call_table;
+#ifdef CONFIG_PPC64
+   extern unsigned long *com

[PATCH v5 3/5] powerpc: add system call table generation support

2018-12-13 Thread Firoz Khan

The system call tables are in different format in all
architecture and it will be difficult to manually add or
modify the system calls in the respective files. To make
it easy by keeping a script and which will generate the
uapi header and syscall table file. This change will also
help to unify the implementation across all architectures.

The system call table generation script is added in
syscalls directory which contain the script to generate
both uapi header file and system call table files.
The syscall.tbl file will be the input for the scripts.

syscall.tbl contains the list of available system calls
along with system call number and corresponding entry point.
Add a new system call in this architecture will be possible
by adding new entry in the syscall.tbl file.

Adding a new table entry consisting of:
- System call number.
- ABI.
- System call name.
- Entry point name.
- Compat entry name, if required.

syscallhdr.sh and syscalltbl.sh will generate uapi header-
unistd_32/64.h and syscall_table_32/64/c32/spu.h files
respectively. File syscall_table_32/64/c32/spu.h is incl-
uded by syscall.S - the real system call table. Both *.sh
files will parse the content syscall.tbl to generate the
header and table files.

ARM, s390 and x86 architecuture does have similar support.
I leverage their implementation to come up with a generic
solution.

Signed-off-by: Firoz Khan 
---
 arch/powerpc/kernel/syscalls/Makefile  |  63 +
 arch/powerpc/kernel/syscalls/syscall.tbl   | 427 +
 arch/powerpc/kernel/syscalls/syscallhdr.sh |  37 +++
 arch/powerpc/kernel/syscalls/syscalltbl.sh |  36 +++
 4 files changed, 563 insertions(+)
 create mode 100644 arch/powerpc/kernel/syscalls/Makefile
 create mode 100644 arch/powerpc/kernel/syscalls/syscall.tbl
 create mode 100644 arch/powerpc/kernel/syscalls/syscallhdr.sh
 create mode 100644 arch/powerpc/kernel/syscalls/syscalltbl.sh

diff --git a/arch/powerpc/kernel/syscalls/Makefile 
b/arch/powerpc/kernel/syscalls/Makefile
new file mode 100644
index 000..27b4895
--- /dev/null
+++ b/arch/powerpc/kernel/syscalls/Makefile
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: GPL-2.0
+kapi := arch/$(SRCARCH)/include/generated/asm
+uapi := arch/$(SRCARCH)/include/generated/uapi/asm
+
+_dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)')  \
+ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)')
+
+syscall := $(srctree)/$(src)/syscall.tbl
+syshdr := $(srctree)/$(src)/syscallhdr.sh
+systbl := $(srctree)/$(src)/syscalltbl.sh
+
+quiet_cmd_syshdr = SYSHDR  $@
+  cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@'   \
+  '$(syshdr_abis_$(basetarget))'   \
+  '$(syshdr_pfx_$(basetarget))'\
+  '$(syshdr_offset_$(basetarget))'
+
+quiet_cmd_systbl = SYSTBL  $@
+  cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@'   \
+  '$(systbl_abis_$(basetarget))'   \
+  '$(systbl_abi_$(basetarget))'\
+  '$(systbl_offset_$(basetarget))'
+
+syshdr_abis_unistd_32 := common,nospu,32
+$(uapi)/unistd_32.h: $(syscall) $(syshdr)
+   $(call if_changed,syshdr)
+
+syshdr_abis_unistd_64 := common,nospu,64
+$(uapi)/unistd_64.h: $(syscall) $(syshdr)
+   $(call if_changed,syshdr)
+
+systbl_abis_syscall_table_32 := common,nospu,32
+systbl_abi_syscall_table_32 := 32
+$(kapi)/syscall_table_32.h: $(syscall) $(systbl)
+   $(call if_changed,systbl)
+
+systbl_abis_syscall_table_64 := common,nospu,64
+systbl_abi_syscall_table_64 := 64
+$(kapi)/syscall_table_64.h: $(syscall) $(systbl)
+   $(call if_changed,systbl)
+
+systbl_abis_syscall_table_c32 := common,nospu,32
+systbl_abi_syscall_table_c32 := c32
+$(kapi)/syscall_table_c32.h: $(syscall) $(systbl)
+   $(call if_changed,systbl)
+
+systbl_abis_syscall_table_spu := common,spu
+systbl_abi_syscall_table_spu := spu
+$(kapi)/syscall_table_spu.h: $(syscall) $(systbl)
+   $(call if_changed,systbl)
+
+uapisyshdr-y   += unistd_32.h unistd_64.h
+kapisyshdr-y   += syscall_table_32.h   \
+  syscall_table_64.h   \
+  syscall_table_c32.h  \
+  syscall_table_spu.h
+
+targets+= $(uapisyshdr-y) $(kapisyshdr-y)
+
+PHONY += all
+all: $(addprefix $(uapi)/,$(uapisyshdr-y))
+all: $(addprefix $(kapi)/,$(kapisyshdr-y))
+   @:
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl 
b/arch/powerpc/kernel/syscalls/syscall.tbl
new file mode 100644
index 000..db3bbb8
--- /dev/null
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -0,0 +1,427 @@
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+#
+# system call numbers and entry vectors for powerpc
+#
+# The format is:
+# 
+#
+# The  can be common, spu, nospu, 64, or 32 for this file.
+#
+0  nospu   restart_syscall sys_restart_syscall
+1

[PATCH v5 2/5] powerpc: move macro definition from asm/systbl.h

2018-12-13 Thread Firoz Khan

Move the macro definition for compat_sys_sigsuspend from
asm/systbl.h to the file which it is getting included.

One of the patch in this patch series is generating uapi
header and syscall table files. In order to come up with
a common implimentation across all architecture, we need
to do this change.

This change will simplify the implementation of system
call table generation script and help to come up a common
implementation across all architecture.

Signed-off-by: Firoz Khan 
---
 arch/powerpc/include/asm/systbl.h | 1 -
 arch/powerpc/kernel/systbl.S  | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/systbl.h 
b/arch/powerpc/include/asm/systbl.h
index 01b5171..c4321b9 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -76,7 +76,6 @@
 SYSCALL_SPU(ssetmask)
 SYSCALL_SPU(setreuid)
 SYSCALL_SPU(setregid)
-#define compat_sys_sigsuspend sys_sigsuspend
 SYS32ONLY(sigsuspend)
 SYSX(sys_ni_syscall,compat_sys_sigpending,sys_sigpending)
 SYSCALL_SPU(sethostname)
diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
index 919a327..9ff1913 100644
--- a/arch/powerpc/kernel/systbl.S
+++ b/arch/powerpc/kernel/systbl.S
@@ -47,4 +47,5 @@
 .globl sys_call_table
 sys_call_table:
 
+#define compat_sys_sigsuspend  sys_sigsuspend
 #include 
-- 
1.9.1

[PATCH v5 1/5] powerpc: add __NR_syscalls along with NR_syscalls

2018-12-13 Thread Firoz Khan

NR_syscalls macro holds the number of system call exist
in powerpc architecture. We have to change the value of
NR_syscalls, if we add or delete a system call.

One of the patch in this patch series has a script which
will generate a uapi header based on syscall.tbl file.
The syscall.tbl file contains the number of system call
information. So we have two option to update NR_syscalls
value.

1. Update NR_syscalls in asm/unistd.h manually by count-
   ing the no.of system calls. No need to update NR_sys-
   calls until we either add a new system call or delete
   existing system call.

2. We can keep this feature in above mentioned script,
   that will count the number of syscalls and keep it in
   a generated file. In this case we don't need to expli-
   citly update NR_syscalls in asm/unistd.h file.

The 2nd option will be the recommended one. For that, I
added the __NR_syscalls macro in uapi/asm/unistd.h along
with NR_syscalls asm/unistd.h. The macro __NR_syscalls
also added for making the name convention same across all
architecture. While __NR_syscalls isn't strictly part of
the uapi, having it as part of the generated header to
simplifies the implementation. We also need to enclose
this macro with #ifdef __KERNEL__ to avoid side effects.

Signed-off-by: Firoz Khan 
---
 arch/powerpc/include/asm/unistd.h  | 3 +--
 arch/powerpc/include/uapi/asm/unistd.h | 5 -
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/unistd.h 
b/arch/powerpc/include/asm/unistd.h
index b0de85b..a3c35e6 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -11,8 +11,7 @@
 
 #include 
 
-
-#define NR_syscalls389
+#define NR_syscalls__NR_syscalls
 
 #define __NR__exit __NR_exit
 
diff --git a/arch/powerpc/include/uapi/asm/unistd.h 
b/arch/powerpc/include/uapi/asm/unistd.h
index 985534d..7195868 100644
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -10,7 +10,6 @@
 #ifndef _UAPI_ASM_POWERPC_UNISTD_H_
 #define _UAPI_ASM_POWERPC_UNISTD_H_
 
-
 #define __NR_restart_syscall 0
 #define __NR_exit1
 #define __NR_fork2
@@ -401,4 +400,8 @@
 #define __NR_rseq  387
 #define __NR_io_pgetevents 388
 
+#ifdef __KERNEL__
+#define __NR_syscalls  389
+#endif
+
 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
-- 
1.9.1

[PATCH v5 0/5] powerpc: system call table generation support

2018-12-13 Thread Firoz Khan

The purpose of this patch series is, we can easily
add/modify/delete system call table support by cha-
nging entry in syscall.tbl file instead of manually
changing many files. The other goal is to unify the 
system call table generation support implementation 
across all the architectures. 

The system call tables are in different format in 
all architecture. It will be difficult to manually
add, modify or delete the system calls in the resp-
ective files manually. To make it easy by keeping a 
script and which'll generate uapi header file and 
syscall table file.

syscall.tbl contains the list of available system 
calls along with system call number and correspond-
ing entry point. Add a new system call in this arch-
itecture will be possible by adding new entry in 
the syscall.tbl file.

Adding a new table entry consisting of:
- System call number.
- ABI.
- System call name.
- Entry point name.
- Compat entry name, if required.
- spu entry name, if required.

ARM, s390 and x86 architecuture does exist the sim-
ilar support. I leverage their implementation to 
come up with a generic solution.

I have done the same support for work for alpha, 
ia64, m68k, microblaze, mips, parisc, sh, sparc, 
and xtensa. Below mentioned git repository contains
more details about the workflow.

https://github.com/frzkhn/system_call_table_generator/

Finally, this is the ground work to solve the Y2038
issue. We need to add two dozen of system calls to 
solve Y2038 issue. So this patch series will help to
add new system calls easily by adding new entry in the
syscall.tbl.

Changes since v4:
 - DOTSYM macro removed for ppc32, which was causing
   the compilation error.

Changes since v3:
 - split compat syscall table out from native table.
 - modified the script to add new line in the generated
   file.

Changes since v2:
 - modified/optimized the syscall.tbl to avoid duplicate
   for the spu entries.
 - updated the syscalltbl.sh to meet the above point.

Changes since v1:
 - optimized/updated the syscall table generation 
   scripts.
 - fixed all mixed indentation issues in syscall.tbl.
 - added "comments" in syscall_*.tbl.
 - changed from generic-y to generated-y in Kbuild.

Firoz Khan (5):
  powerpc: add __NR_syscalls along with NR_syscalls
  powerpc: move macro definition from asm/systbl.h
  powerpc: add system call table generation support
  powerpc: split compat syscall table out from native table
  powerpc: generate uapi header and system call table files

 arch/powerpc/Makefile   |   3 +
 arch/powerpc/include/asm/Kbuild |   4 +
 arch/powerpc/include/asm/syscall.h  |   3 +-
 arch/powerpc/include/asm/systbl.h   | 396 --
 arch/powerpc/include/asm/unistd.h   |   3 +-
 arch/powerpc/include/uapi/asm/Kbuild|   2 +
 arch/powerpc/include/uapi/asm/unistd.h  | 389 +
 arch/powerpc/kernel/Makefile|  10 -
 arch/powerpc/kernel/entry_64.S  |   7 +-
 arch/powerpc/kernel/syscalls/Makefile   |  63 
 arch/powerpc/kernel/syscalls/syscall.tbl| 427 
 arch/powerpc/kernel/syscalls/syscallhdr.sh  |  37 +++
 arch/powerpc/kernel/syscalls/syscalltbl.sh  |  36 +++
 arch/powerpc/kernel/systbl.S|  40 ++-
 arch/powerpc/kernel/systbl_chk.c|  60 
 arch/powerpc/kernel/vdso.c  |   7 +-
 arch/powerpc/platforms/cell/spu_callbacks.c |  17 +-
 17 files changed, 606 insertions(+), 898 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/systbl.h
 create mode 100644 arch/powerpc/kernel/syscalls/Makefile
 create mode 100644 arch/powerpc/kernel/syscalls/syscall.tbl
 create mode 100644 arch/powerpc/kernel/syscalls/syscallhdr.sh
 create mode 100644 arch/powerpc/kernel/syscalls/syscalltbl.sh
 delete mode 100644 arch/powerpc/kernel/systbl_chk.c

-- 
1.9.1

Re: use generic DMA mapping code in powerpc V4

2018-12-13 Thread Christian Zigotzky


On 12 December 2018 at 3:39PM, Christian Zigotzky wrote:

Hi Christoph,

Thanks a lot for your reply. I will test your patches tomorrow.

Cheers,
Christian

Sent from my iPhone


On 12. Dec 2018, at 15:15, Christoph Hellwig  wrote:

Thanks for bisecting.  I've spent some time going over the conversion
but can't really pinpoint it.  I have three little patches that switch
parts of the code to the generic version.  This is on top of the
last good commmit (977706f9755d2d697aa6f45b4f9f0e07516efeda).

Can you check with whіch one things stop working?


<0001-get_required_mask.patch>
<0002-swiotlb-dma_supported.patch>
<0003-nommu-dma_supported.patch>
<0004-alloc-free.patch>


Today I tried the first patch (0001-get_required_mask.patch) with the 
last good commit (977706f9755d2d697aa6f45b4f9f0e07516efeda). 
Unfortunately this patch is already included in the last good commit 
(977706f9755d2d697aa6f45b4f9f0e07516efeda). I will try the next patch.


-- Christian

Re: [PATCH] powerpc/8xx: hide itlbie and dtlbie symbols

2018-12-13 Thread Christophe Leroy





Le 12/12/2018 à 14:05, Michael Ellerman a écrit :

Christophe Leroy  writes:


When disassembling InstructionTLBError we get the following messy code:

c000138c:   7d 84 63 78 mr  r4,r12
c0001390:   75 25 58 00 andis.  r5,r9,22528
c0001394:   75 2a 40 00 andis.  r10,r9,16384
c0001398:   41 a2 00 08 beq c00013a0 
c000139c:   7c 00 22 64 tlbie   r4,r0

c00013a0 :
c00013a0:   39 40 04 01 li  r10,1025
c00013a4:   91 4b 00 b0 stw r10,176(r11)
c00013a8:   39 40 10 32 li  r10,4146
c00013ac:   48 00 cc 59 bl  c000e004 

For a cleaner code dump, this patch replaces itlbie and dtlbie
symbols by numeric symbols.

c000138c:   7d 84 63 78 mr  r4,r12
c0001390:   75 25 58 00 andis.  r5,r9,22528
c0001394:   75 2a 40 00 andis.  r10,r9,16384
c0001398:   41 a2 00 08 beq c00013a0 
c000139c:   7c 00 22 64 tlbie   r4,r0
c00013a0:   39 40 04 01 li  r10,1025
c00013a4:   91 4b 00 b0 stw r10,176(r11)
c00013a8:   39 40 10 32 li  r10,4146
c00013ac:   48 00 cc 59 bl  c000e004 
Signed-off-by: Christophe Leroy 
---
  arch/powerpc/kernel/head_8xx.S | 14 ++
  1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 3b67b9533c82..8c848acfe249 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -552,11 +552,10 @@ InstructionTLBError:
mr  r4,r12
andis.  r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */
andis.  r10,r9,SRR1_ISI_NOPT@h
-   beq+1f
+   beq+1301f
tlbie   r4
-itlbie:
/* 0x400 is InstructionAccess exception, needed by bad_page_fault() */
-1: EXC_XFER_LITE(0x400, handle_page_fault)
+1301:  EXC_XFER_LITE(0x400, handle_page_fault)


You could use a local symbol, something like:


Thanks for the tip, I sent v2

Christophe



beq+1f
tlbie   r4
.Litlbie:
/* 0x400 is InstructionAccess exception, needed by bad_page_fault() */
1:  EXC_XFER_LITE(0x400, handle_page_fault)


cheers

[PATCH v2] powerpc/8xx: hide itlbie and dtlbie symbols

2018-12-13 Thread Christophe Leroy

When disassembling InstructionTLBError we get the following messy code:

c000138c:   7d 84 63 78 mr  r4,r12
c0001390:   75 25 58 00 andis.  r5,r9,22528
c0001394:   75 2a 40 00 andis.  r10,r9,16384
c0001398:   41 a2 00 08 beq c00013a0 
c000139c:   7c 00 22 64 tlbie   r4,r0

c00013a0 :
c00013a0:   39 40 04 01 li  r10,1025
c00013a4:   91 4b 00 b0 stw r10,176(r11)
c00013a8:   39 40 10 32 li  r10,4146
c00013ac:   48 00 cc 59 bl  c000e004 

For a cleaner code dump, this patch replaces itlbie and dtlbie
symbols by local symbols.

c000138c:   7d 84 63 78 mr  r4,r12
c0001390:   75 25 58 00 andis.  r5,r9,22528
c0001394:   75 2a 40 00 andis.  r10,r9,16384
c0001398:   41 a2 00 08 beq c00013a0 
c000139c:   7c 00 22 64 tlbie   r4,r0
c00013a0:   39 40 04 01 li  r10,1025
c00013a4:   91 4b 00 b0 stw r10,176(r11)
c00013a8:   39 40 10 32 li  r10,4146
c00013ac:   48 00 cc 59 bl  c000e004 
Signed-off-by: Christophe Leroy 
---
 v2: Using local named symbol instead of num symbol

 arch/powerpc/kernel/head_8xx.S | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index b171b7c0a0e7..34cd6c260da6 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -548,11 +548,11 @@ InstructionTLBError:
mr  r4,r12
andis.  r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */
andis.  r10,r9,SRR1_ISI_NOPT@h
-   beq+1f
+   beq+.Litlbie
tlbie   r4
-itlbie:
/* 0x400 is InstructionAccess exception, needed by bad_page_fault() */
-1: EXC_XFER_LITE(0x400, handle_page_fault)
+.Litlbie:
+   EXC_XFER_LITE(0x400, handle_page_fault)
 
 /* This is the data TLB error on the MPC8xx.  This could be due to
  * many reasons, including a dirty update to a pte.  We bail out to
@@ -574,10 +574,10 @@ DARFixed:/* Return from dcbx instruction bug workaround */
stw r5,_DSISR(r11)
mfspr   r4,SPRN_DAR
andis.  r10,r5,DSISR_NOHPTE@h
-   beq+1f
+   beq+.Ldtlbie
tlbie   r4
-dtlbie:
-1: li  r10,RPN_PATTERN
+.Ldtlbie:
+   li  r10,RPN_PATTERN
mtspr   SPRN_DAR,r10/* Tag DAR, to be used in DTLB Error */
/* 0x300 is DataAccess exception, needed by bad_page_fault() */
EXC_XFER_LITE(0x300, handle_page_fault)
@@ -600,8 +600,8 @@ DataBreakpoint:
mtspr   SPRN_SPRG_SCRATCH1, r11
mfcrr10
mfspr   r11, SPRN_SRR0
-   cmplwi  cr0, r11, (dtlbie - PAGE_OFFSET)@l
-   cmplwi  cr7, r11, (itlbie - PAGE_OFFSET)@l
+   cmplwi  cr0, r11, (.Ldtlbie - PAGE_OFFSET)@l
+   cmplwi  cr7, r11, (.Litlbie - PAGE_OFFSET)@l
beq-cr0, 11f
beq-cr7, 11f
EXCEPTION_PROLOG_1
-- 
2.13.3

57 matches

Mail list logo