date:20190520

[Bug 203647] Locking API testsuite fails "mixed read-lock/lock-write ABBA" rlock on kernels >=4.14.x

2019-05-20 Thread bugzilla-daemon

https://bugzilla.kernel.org/show_bug.cgi?id=203647

Michael Ellerman (mich...@ellerman.id.au) changed:

   What|Removed |Added

 Status|NEW |RESOLVED
 CC||mich...@ellerman.id.au
 Resolution|--- |DOCUMENTED

--- Comment #6 from Michael Ellerman (mich...@ellerman.id.au) ---
This appears to be working as expected, which I admit is a little confusing.

The key thing is that at the end you see, eg:

  [0.179788] Good, all 261 testcases passed! |

See the commit that added the test:

https://git.kernel.org/torvalds/c/e91498589746

  locking/lockdep/selftests: Add mixed read-write ABBA tests

  Currently lockdep has limited support for recursive readers, add a few
  mixed read-write ABBA selftests to show the extend of these
  limitations.

And in the code:

print_testname("mixed read-lock/lock-write ABBA");
pr_cont(" |");
dotest(rlock_ABBA1, FAILURE, LOCKTYPE_RWLOCK);
#ifdef CONFIG_PROVE_LOCKING
/*
 * Lockdep does indeed fail here, but there's nothing we can do about
 * that now.  Don't kill lockdep for it.
 */
unexpected_testcase_failures--;
#endif

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

Re: [RFC PATCH v2 07/10] KVM: PPC: Ultravisor: Restrict LDBAR access

2019-05-20 Thread Madhavan Srinivasan




On 18/05/19 7:55 PM, Claudio Carvalho wrote:
From: Ram Pai  When the ultravisor firmware is 
available, it takes control over the LDBAR register. In this case, 
thread-imc updates and save/restore operations on the LDBAR register 
are handled by ultravisor.

we should remove the core and thread imc nodes in the skiboot
if the ultravisor is enabled. We dont need imc-pmu.c file changes, imc-pmu.c
inits only if we have the corresponding nodes. I will post a skiboot patch.

Maddy


Signed-off-by: Ram Pai 
[Restrict LDBAR access in assembly code and some in C, update the commit
  message]
Signed-off-by: Claudio Carvalho 
---
  arch/powerpc/kvm/book3s_hv.c |  4 +-
  arch/powerpc/kvm/book3s_hv_rmhandlers.S  |  2 +
  arch/powerpc/perf/imc-pmu.c  | 64 
  arch/powerpc/platforms/powernv/idle.c|  6 +-
  arch/powerpc/platforms/powernv/subcore-asm.S |  4 ++
  5 files changed, 52 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 0fab0a201027..81f35f955d16 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -75,6 +75,7 @@
  #include 
  #include 
  #include 
+#include 

  #include "book3s.h"

@@ -3117,7 +3118,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore 
*vc)
subcore_size = MAX_SMT_THREADS / split;
split_info.rpr = mfspr(SPRN_RPR);
split_info.pmmar = mfspr(SPRN_PMMAR);
-   split_info.ldbar = mfspr(SPRN_LDBAR);
+   if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+   split_info.ldbar = mfspr(SPRN_LDBAR);
split_info.subcore_size = subcore_size;
} else {
split_info.subcore_size = 1;
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index dd014308f065..938cfa5dceed 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -375,8 +375,10 @@ BEGIN_FTR_SECTION
mtspr   SPRN_RPR, r0
ld  r0, KVM_SPLIT_PMMAR(r6)
mtspr   SPRN_PMMAR, r0
+BEGIN_FW_FTR_SECTION_NESTED(70)
ld  r0, KVM_SPLIT_LDBAR(r6)
mtspr   SPRN_LDBAR, r0
+END_FW_FTR_SECTION_NESTED(FW_FEATURE_ULTRAVISOR, 0, 70)
isync
  FTR_SECTION_ELSE
/* On P9 we use the split_info for coordinating LPCR changes */
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 31fa753e2eb2..39c84de74da9 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -17,6 +17,7 @@
  #include 
  #include 
  #include 
+#include 

  /* Nest IMC data structures and variables */

@@ -816,6 +817,17 @@ static int core_imc_event_init(struct perf_event *event)
return 0;
  }

+static void thread_imc_ldbar_disable(void *dummy)
+{
+   /*
+* By Zeroing LDBAR, we disable thread-imc updates. When the ultravisor
+* firmware is available, it is responsible for handling thread-imc
+* updates, though
+*/
+   if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+   mtspr(SPRN_LDBAR, 0);
+}
+
  /*
   * Allocates a page of memory for each of the online cpus, and load
   * LDBAR with 0.
@@ -856,7 +868,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size)
per_cpu(thread_imc_mem, cpu_id) = local_mem;
}

-   mtspr(SPRN_LDBAR, 0);
+   thread_imc_ldbar_disable(NULL);
return 0;
  }

@@ -867,7 +879,7 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu)

  static int ppc_thread_imc_cpu_offline(unsigned int cpu)
  {
-   mtspr(SPRN_LDBAR, 0);
+   thread_imc_ldbar_disable(NULL);
return 0;
  }

@@ -1010,7 +1022,6 @@ static int thread_imc_event_add(struct perf_event *event, 
int flags)
  {
int core_id;
struct imc_pmu_ref *ref;
-   u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, 
smp_processor_id());

if (flags & PERF_EF_START)
imc_event_start(event, flags);
@@ -1019,8 +1030,14 @@ static int thread_imc_event_add(struct perf_event 
*event, int flags)
return -EINVAL;

core_id = smp_processor_id() / threads_per_core;
-   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | 
THREAD_IMC_ENABLE;
-   mtspr(SPRN_LDBAR, ldbar_value);
+   if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) {
+   u64 ldbar_value, *local_mem;
+
+   local_mem = per_cpu(thread_imc_mem, smp_processor_id());
+   ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) |
+   THREAD_IMC_ENABLE;
+   mtspr(SPRN_LDBAR, ldbar_value);
+   }

/*
 * imc pmus are enabled only when it is used.
@@ -1053,7 +1070,7 @@ static void thread_imc_event_del(struct perf_event 
*event, int flags)
int core_id;

Re: [PATCH 11/12] powerpc/pseries/svm: Force SWIOTLB for secure guests

2019-05-20 Thread Christoph Hellwig

> diff --git a/arch/powerpc/include/asm/mem_encrypt.h 
> b/arch/powerpc/include/asm/mem_encrypt.h
> new file mode 100644
> index ..45d5e4d0e6e0
> --- /dev/null
> +++ b/arch/powerpc/include/asm/mem_encrypt.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0+ */
> +/*
> + * SVM helper functions
> + *
> + * Copyright 2019 IBM Corporation
> + */
> +
> +#ifndef _ASM_POWERPC_MEM_ENCRYPT_H
> +#define _ASM_POWERPC_MEM_ENCRYPT_H
> +
> +#define sme_me_mask  0ULL
> +
> +static inline bool sme_active(void) { return false; }
> +static inline bool sev_active(void) { return false; }
> +
> +int set_memory_encrypted(unsigned long addr, int numpages);
> +int set_memory_decrypted(unsigned long addr, int numpages);
> +
> +#endif /* _ASM_POWERPC_MEM_ENCRYPT_H */

S/390 seems to be adding a stub header just like this.  Can you please
clean up the Kconfig and generic headers bits for memory encryption so
that we don't need all this boilerplate code?

>  config PPC_SVM
>   bool "Secure virtual machine (SVM) support for POWER"
>   depends on PPC_PSERIES
> + select SWIOTLB
> + select ARCH_HAS_MEM_ENCRYPT
>   default n

n is the default default, no need to explictly specify it.

Re: [PATCH] powerpc/perf: Use cpumask_last() to determine the

2019-05-20 Thread Madhavan Srinivasan




On 20/05/19 2:35 PM, Anju T Sudhakar wrote:

Nest and core imc(In-memory Collection counters) assigns a particular
cpu as the designated target for counter data collection.
During system boot, the first online cpu in a chip gets assigned as
the designated cpu for that chip(for nest-imc) and the first online cpu
in a core gets assigned as the designated cpu for that core(for core-imc).

If the designated cpu goes offline, the next online cpu from the same
chip(for nest-imc)/core(for core-imc) is assigned as the next target,
and the event context is migrated to the target cpu.
Currently, cpumask_any_but() function is used to find the target cpu.
Though this function is expected to return a `random` cpu, this always
returns the next online cpu.

If all cpus in a chip/core is offlined in a sequential manner, starting
from the first cpu, the event migration has to happen for all the cpus
which goes offline. Since the migration process involves a grace period,
the total time taken to offline all the cpus will be significantly high.

Example:
In a system which has 2 sockets, with
NUMA node0 CPU(s): 0-87
NUMA node8 CPU(s): 88-175

Time taken to offline cpu 88-175:
real2m56.099s
user0m0.191s
sys 0m0.000s

Use cpumask_last() to choose the target cpu, when the designated cpu
goes online, so the migration will happen only when the last_cpu in the
mask goes offline. This way the time taken to offline all cpus in a
chip/core can be reduced.

With the patch,

Time taken  to offline cpu 88-175:
real0m12.207s
user0m0.171s
sys 0m0.000s

cpumask_last() is a better way to find the target cpu, since in most of the
cases cpuhotplug is performed in an increasing order(even in ppc64_cpu).

cpumask_any_but() can still be used to check the possibility of other
online cpus from the same chip/core if the last cpu in the mask goes
offline.


Reviewed-by: Madhavan Srinivasan 

Also add "Fixes:" tag and this should go to stable right?


Signed-off-by: Anju T Sudhakar 
---
  arch/powerpc/perf/imc-pmu.c | 14 --
  1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 31fa753..fbfd6e7 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -366,7 +366,14 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu)
 */
nid = cpu_to_node(cpu);
l_cpumask = cpumask_of_node(nid);
-   target = cpumask_any_but(l_cpumask, cpu);
+   target = cpumask_last(l_cpumask);
+
+   /*
+* If this(target) is the last cpu in the cpumask for this chip,
+* check for any possible online cpu in the chip.
+*/
+   if (unlikely(target == cpu))
+   target = cpumask_any_but(l_cpumask, cpu);

/*
 * Update the cpumask with the target cpu and
@@ -671,7 +678,10 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
return 0;

/* Find any online cpu in that core except the current "cpu" */
-   ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
+   ncpu = cpumask_last(cpu_sibling_mask(cpu));
+
+   if (unlikely(ncpu == cpu))
+   ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);

if (ncpu >= 0 && ncpu < nr_cpu_ids) {
cpumask_set_cpu(ncpu, _imc_cpumask);

Re: [RFC PATCH 02/12] powerpc: Add support for adding an ESM blob to the zImage wrapper

2019-05-20 Thread Christoph Hellwig

On Tue, May 21, 2019 at 01:49:02AM -0300, Thiago Jung Bauermann wrote:
> From: Benjamin Herrenschmidt 
> 
> For secure VMs, the signing tool will create a ticket called the "ESM blob"
> for the Enter Secure Mode ultravisor call with the signatures of the kernel
> and initrd among other things.
> 
> This adds support to the wrapper script for adding that blob via the "-e"
> option to the zImage.pseries.
> 
> It also adds code to the zImage wrapper itself to retrieve and if necessary
> relocate the blob, and pass its address to Linux via the device-tree, to be
> later consumed by prom_init.

Where does the "BLOB" come from?  How is it licensed and how can we
satisfy the GPL with it?

[PATCH 12/12] powerpc/configs: Enable secure guest support in pseries and ppc64 defconfigs

2019-05-20 Thread Thiago Jung Bauermann

From: Ryan Grimm 

Enables running as a secure guest in platforms with an Ultravisor.

Signed-off-by: Ryan Grimm 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/configs/ppc64_defconfig   | 1 +
 arch/powerpc/configs/pseries_defconfig | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/powerpc/configs/ppc64_defconfig 
b/arch/powerpc/configs/ppc64_defconfig
index d7c381009636..725297438320 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -31,6 +31,7 @@ CONFIG_DTL=y
 CONFIG_SCANLOG=m
 CONFIG_PPC_SMLPAR=y
 CONFIG_IBMEBUS=y
+CONFIG_PPC_SVM=y
 CONFIG_PPC_MAPLE=y
 CONFIG_PPC_PASEMI=y
 CONFIG_PPC_PASEMI_IOMMU=y
diff --git a/arch/powerpc/configs/pseries_defconfig 
b/arch/powerpc/configs/pseries_defconfig
index 62e12f61a3b2..724a574fe4b2 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -42,6 +42,7 @@ CONFIG_DTL=y
 CONFIG_SCANLOG=m
 CONFIG_PPC_SMLPAR=y
 CONFIG_IBMEBUS=y
+CONFIG_PPC_SVM=y
 # CONFIG_PPC_PMAC is not set
 CONFIG_RTAS_FLASH=m
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y

Re: [PATCH] powerpc/powernv: Return for invalid IMC domain

2019-05-20 Thread Madhavan Srinivasan




On 20/05/19 2:27 PM, Anju T Sudhakar wrote:

Currently init_imc_pmu() can be failed either because
an IMC unit with invalid domain(i.e an IMC node not
supported by the kernel) is attempted a pmu-registration
or something went wrong while registering a valid IMC unit.
In both the cases kernel provides a 'Registration failed'
error message.

Example:
Log message, when trace-imc node is not supported by the kernel, and the
skiboot supports trace-imc node.

So for kernel, trace-imc node is now an unknown domain.

[1.731870] nest_phb5_imc performance monitor hardware support registered
[1.731944] nest_powerbus0_imc performance monitor hardware support 
registered
[1.734458] thread_imc performance monitor hardware support registered
[1.734460] IMC Unknown Device type
[1.734462] IMC PMU (null) Register failed
[1.734558] nest_xlink0_imc performance monitor hardware support registered
[1.734614] nest_xlink1_imc performance monitor hardware support registered
[1.734670] nest_xlink2_imc performance monitor hardware support registered
[1.747043] Initialise system trusted keyrings
[1.747054] Key type blacklist registered


To avoid ambiguity on the error message, return for invalid domain
before attempting a pmu registration.


Reviewed-by: Madhavan Srinivasan 



Fixes: 8f95faaac56c1 (`powerpc/powernv: Detect and create IMC device`)
Reported-by: Pavaman Subramaniyam 
Signed-off-by: Anju T Sudhakar 
---
  arch/powerpc/platforms/powernv/opal-imc.c | 4 
  1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 58a0794..4e8b0e1 100644



--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -161,6 +161,10 @@ static int imc_pmu_create(struct device_node *parent, int 
pmu_index, int domain)
struct imc_pmu *pmu_ptr;
u32 offset;

+   /* Return for unknown domain */
+   if (domain < 0)
+   return -EINVAL;
+
/* memory for pmu */
pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL);
if (!pmu_ptr)

[PATCH 10/12] powerpc/pseries/iommu: Don't use dma_iommu_ops on secure guests

2019-05-20 Thread Thiago Jung Bauermann

Secure guest memory is inacessible to devices so regular DMA isn't
possible.

In that case set devices' dma_map_ops to NULL so that the generic
DMA code path will use SWIOTLB and DMA to bounce buffers.

Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/platforms/pseries/iommu.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 03bbb299320e..7d9550edb700 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -50,6 +50,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "pseries.h"
 
@@ -1332,7 +1333,10 @@ void iommu_init_early_pSeries(void)
of_reconfig_notifier_register(_reconfig_nb);
register_memory_notifier(_mem_nb);
 
-   set_pci_dma_ops(_iommu_ops);
+   if (is_secure_guest())
+   set_pci_dma_ops(NULL);
+   else
+   set_pci_dma_ops(_iommu_ops);
 }
 
 static int __init disable_multitce(char *str)

Re: [PATCH] crypto: vmx - convert to skcipher API

2019-05-20 Thread Michael Ellerman

Eric Biggers  writes:
> From: Eric Biggers 
>
> Convert the VMX implementations of AES-CBC, AES-CTR, and AES-XTS from
> the deprecated "blkcipher" API to the "skcipher" API.
>
> As part of this, I moved the skcipher_request for the fallback algorithm
> off the stack and into the request context of the parent algorithm.
>
> I tested this in a PowerPC VM with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y.

I booted it a few times on a Power9 bare metal machine with
panic_on_fail=1 and fuzz_iterations=400, no issues.

Tested-by: Michael Ellerman 

cheers

> Signed-off-by: Eric Biggers 
> ---
>  drivers/crypto/vmx/aes_cbc.c   | 183 -
>  drivers/crypto/vmx/aes_ctr.c   | 165 +
>  drivers/crypto/vmx/aes_xts.c   | 175 ++-
>  drivers/crypto/vmx/aesp8-ppc.h |   2 -
>  drivers/crypto/vmx/vmx.c   |  72 +++--
>  5 files changed, 252 insertions(+), 345 deletions(-)
>
> diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c
> index dae8af3c46dce..92e75a05d6a9e 100644
> --- a/drivers/crypto/vmx/aes_cbc.c
> +++ b/drivers/crypto/vmx/aes_cbc.c
> @@ -7,64 +7,52 @@
>   * Author: Marcelo Henrique Cerri 
>   */
>  
> -#include 
> -#include 
> -#include 
> -#include 
>  #include 
>  #include 
>  #include 
>  #include 
> -#include 
> -#include 
> +#include 
>  
>  #include "aesp8-ppc.h"
>  
>  struct p8_aes_cbc_ctx {
> - struct crypto_sync_skcipher *fallback;
> + struct crypto_skcipher *fallback;
>   struct aes_key enc_key;
>   struct aes_key dec_key;
>  };
>  
> -static int p8_aes_cbc_init(struct crypto_tfm *tfm)
> +static int p8_aes_cbc_init(struct crypto_skcipher *tfm)
>  {
> - const char *alg = crypto_tfm_alg_name(tfm);
> - struct crypto_sync_skcipher *fallback;
> - struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
> -
> - fallback = crypto_alloc_sync_skcipher(alg, 0,
> -   CRYPTO_ALG_NEED_FALLBACK);
> + struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
> + struct crypto_skcipher *fallback;
>  
> + fallback = crypto_alloc_skcipher("cbc(aes)", 0,
> +  CRYPTO_ALG_NEED_FALLBACK |
> +  CRYPTO_ALG_ASYNC);
>   if (IS_ERR(fallback)) {
> - printk(KERN_ERR
> -"Failed to allocate transformation for '%s': %ld\n",
> -alg, PTR_ERR(fallback));
> + pr_err("Failed to allocate cbc(aes) fallback: %ld\n",
> +PTR_ERR(fallback));
>   return PTR_ERR(fallback);
>   }
>  
> - crypto_sync_skcipher_set_flags(
> - fallback,
> - crypto_skcipher_get_flags((struct crypto_skcipher *)tfm));
> + crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) +
> + crypto_skcipher_reqsize(fallback));
>   ctx->fallback = fallback;
> -
>   return 0;
>  }
>  
> -static void p8_aes_cbc_exit(struct crypto_tfm *tfm)
> +static void p8_aes_cbc_exit(struct crypto_skcipher *tfm)
>  {
> - struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
> + struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
>  
> - if (ctx->fallback) {
> - crypto_free_sync_skcipher(ctx->fallback);
> - ctx->fallback = NULL;
> - }
> + crypto_free_skcipher(ctx->fallback);
>  }
>  
> -static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const u8 *key,
> +static int p8_aes_cbc_setkey(struct crypto_skcipher *tfm, const u8 *key,
>unsigned int keylen)
>  {
> + struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
>   int ret;
> - struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
>  
>   preempt_disable();
>   pagefault_disable();
> @@ -75,108 +63,71 @@ static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, 
> const u8 *key,
>   pagefault_enable();
>   preempt_enable();
>  
> - ret |= crypto_sync_skcipher_setkey(ctx->fallback, key, keylen);
> + ret |= crypto_skcipher_setkey(ctx->fallback, key, keylen);
>  
>   return ret ? -EINVAL : 0;
>  }
>  
> -static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
> -   struct scatterlist *dst,
> -   struct scatterlist *src, unsigned int nbytes)
> +static int p8_aes_cbc_crypt(struct skcipher_request *req, int enc)
>  {
> + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
> + const struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
> + struct skcipher_walk walk;
> + unsigned int nbytes;
>   int ret;
> - struct blkcipher_walk walk;
> - struct p8_aes_cbc_ctx *ctx =
> - crypto_tfm_ctx(crypto_blkcipher_tfm(desc->tfm));
>  
>   if (!crypto_simd_usable()) {
> - SYNC_SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback);
> - skcipher_request_set_sync_tfm(req, ctx->fallback);
> -

[PATCH 07/12] powerpc/pseries/svm: Use shared memory for Debug Trace Log (DTL)

2019-05-20 Thread Thiago Jung Bauermann

From: Anshuman Khandual 

Secure guests need to share the DTL buffers with the hypervisor. To that
end, use a kmem_cache constructor which converts the underlying buddy
allocated SLUB cache pages into shared memory.

Signed-off-by: Anshuman Khandual 
Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/include/asm/svm.h  |  5 
 arch/powerpc/platforms/pseries/Makefile |  1 +
 arch/powerpc/platforms/pseries/setup.c  |  5 +++-
 arch/powerpc/platforms/pseries/svm.c| 40 +
 4 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h
index fef3740f46a6..f253116c31fc 100644
--- a/arch/powerpc/include/asm/svm.h
+++ b/arch/powerpc/include/asm/svm.h
@@ -15,6 +15,9 @@ static inline bool is_secure_guest(void)
return mfmsr() & MSR_S;
 }
 
+void dtl_cache_ctor(void *addr);
+#define get_dtl_cache_ctor()   (is_secure_guest() ? dtl_cache_ctor : NULL)
+
 #else /* CONFIG_PPC_SVM */
 
 static inline bool is_secure_guest(void)
@@ -22,5 +25,7 @@ static inline bool is_secure_guest(void)
return false;
 }
 
+#define get_dtl_cache_ctor() NULL
+
 #endif /* CONFIG_PPC_SVM */
 #endif /* _ASM_POWERPC_SVM_H */
diff --git a/arch/powerpc/platforms/pseries/Makefile 
b/arch/powerpc/platforms/pseries/Makefile
index a43ec843c8e2..b7b6e6f52bd0 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_LPARCFG) += lparcfg.o
 obj-$(CONFIG_IBMVIO)   += vio.o
 obj-$(CONFIG_IBMEBUS)  += ibmebus.o
 obj-$(CONFIG_PAPR_SCM) += papr_scm.o
+obj-$(CONFIG_PPC_SVM)  += svm.o
 
 ifdef CONFIG_PPC_PSERIES
 obj-$(CONFIG_SUSPEND)  += suspend.o
diff --git a/arch/powerpc/platforms/pseries/setup.c 
b/arch/powerpc/platforms/pseries/setup.c
index e4f0dfd4ae33..c928e6e8a279 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -71,6 +71,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "pseries.h"
 #include "../../../../drivers/pci/pci.h"
@@ -329,8 +330,10 @@ static inline int alloc_dispatch_logs(void)
 
 static int alloc_dispatch_log_kmem_cache(void)
 {
+   void (*ctor)(void *) = get_dtl_cache_ctor();
+
dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES,
-   DISPATCH_LOG_BYTES, 0, NULL);
+   DISPATCH_LOG_BYTES, 0, ctor);
if (!dtl_cache) {
pr_warn("Failed to create dispatch trace log buffer cache\n");
pr_warn("Stolen time statistics will be unreliable\n");
diff --git a/arch/powerpc/platforms/pseries/svm.c 
b/arch/powerpc/platforms/pseries/svm.c
new file mode 100644
index ..c508196f7c83
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Secure VM platform
+ *
+ * Copyright 2019 IBM Corporation
+ * Author: Anshuman Khandual 
+ */
+
+#include 
+#include 
+
+/* There's one dispatch log per CPU. */
+#define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE)
+
+static struct page *dtl_page_store[NR_DTL_PAGE];
+static long dtl_nr_pages;
+
+static bool is_dtl_page_shared(struct page *page)
+{
+   long i;
+
+   for (i = 0; i < dtl_nr_pages; i++)
+   if (dtl_page_store[i] == page)
+   return true;
+
+   return false;
+}
+
+void dtl_cache_ctor(void *addr)
+{
+   unsigned long pfn = PHYS_PFN(__pa(addr));
+   struct page *page = pfn_to_page(pfn);
+
+   if (!is_dtl_page_shared(page)) {
+   dtl_page_store[dtl_nr_pages] = page;
+   dtl_nr_pages++;
+   WARN_ON(dtl_nr_pages >= NR_DTL_PAGE);
+   uv_share_page(pfn, 1);
+   }
+}

[PATCH 11/12] powerpc/pseries/svm: Force SWIOTLB for secure guests

2019-05-20 Thread Thiago Jung Bauermann

From: Anshuman Khandual 

SWIOTLB checks range of incoming CPU addresses to be bounced and sees if
the device can access it through its DMA window without requiring bouncing.
In such cases it just chooses to skip bouncing. But for cases like secure
guests on powerpc platform all addresses need to be bounced into the shared
pool of memory because the host cannot access it otherwise. Hence the need
to do the bouncing is not related to device's DMA window and use of bounce
buffers is forced by setting swiotlb_force.

Also, connect the shared memory conversion functions into the
ARCH_HAS_MEM_ENCRYPT hooks and call swiotlb_update_mem_attributes() to
convert SWIOTLB's memory pool to shared memory.

Signed-off-by: Anshuman Khandual 
[ Use ARCH_HAS_MEM_ENCRYPT hooks to share swiotlb memory pool. ]
Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/include/asm/mem_encrypt.h | 19 +++
 arch/powerpc/platforms/pseries/Kconfig |  5 +++
 arch/powerpc/platforms/pseries/svm.c   | 45 ++
 3 files changed, 69 insertions(+)

diff --git a/arch/powerpc/include/asm/mem_encrypt.h 
b/arch/powerpc/include/asm/mem_encrypt.h
new file mode 100644
index ..45d5e4d0e6e0
--- /dev/null
+++ b/arch/powerpc/include/asm/mem_encrypt.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * SVM helper functions
+ *
+ * Copyright 2019 IBM Corporation
+ */
+
+#ifndef _ASM_POWERPC_MEM_ENCRYPT_H
+#define _ASM_POWERPC_MEM_ENCRYPT_H
+
+#define sme_me_mask0ULL
+
+static inline bool sme_active(void) { return false; }
+static inline bool sev_active(void) { return false; }
+
+int set_memory_encrypted(unsigned long addr, int numpages);
+int set_memory_decrypted(unsigned long addr, int numpages);
+
+#endif /* _ASM_POWERPC_MEM_ENCRYPT_H */
diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index 82c16aa4f1ce..41b10f3bc729 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -145,9 +145,14 @@ config PAPR_SCM
help
  Enable access to hypervisor provided storage class memory.
 
+config ARCH_HAS_MEM_ENCRYPT
+   def_bool n
+
 config PPC_SVM
bool "Secure virtual machine (SVM) support for POWER"
depends on PPC_PSERIES
+   select SWIOTLB
+   select ARCH_HAS_MEM_ENCRYPT
default n
help
 Support secure guests on POWER. There are certain POWER platforms which
diff --git a/arch/powerpc/platforms/pseries/svm.c 
b/arch/powerpc/platforms/pseries/svm.c
index c508196f7c83..618622d636d5 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -7,8 +7,53 @@
  */
 
 #include 
+#include 
+#include 
+#include 
 #include 
 
+static int __init init_svm(void)
+{
+   if (!is_secure_guest())
+   return 0;
+
+   /* Don't release the SWIOTLB buffer. */
+   ppc_swiotlb_enable = 1;
+
+   /*
+* Since the guest memory is inaccessible to the host, devices always
+* need to use the SWIOTLB buffer for DMA even if dma_capable() says
+* otherwise.
+*/
+   swiotlb_force = SWIOTLB_FORCE;
+
+   /* Share the SWIOTLB buffer with the host. */
+   swiotlb_update_mem_attributes();
+
+   return 0;
+}
+machine_early_initcall(pseries, init_svm);
+
+int set_memory_encrypted(unsigned long addr, int numpages)
+{
+   if (!PAGE_ALIGNED(addr))
+   return -EINVAL;
+
+   uv_unshare_page(PHYS_PFN(__pa(addr)), numpages);
+
+   return 0;
+}
+
+int set_memory_decrypted(unsigned long addr, int numpages)
+{
+   if (!PAGE_ALIGNED(addr))
+   return -EINVAL;
+
+   uv_share_page(PHYS_PFN(__pa(addr)), numpages);
+
+   return 0;
+}
+
 /* There's one dispatch log per CPU. */
 #define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE)

[PATCH 09/12] powerpc/pseries/svm: Disable doorbells in SVM guests

2019-05-20 Thread Thiago Jung Bauermann

From: Sukadev Bhattiprolu 

Normally, the HV emulates some instructions like MSGSNDP, MSGCLRP
from a KVM guest. To emulate the instructions, it must first read
the instruction from the guest's memory and decode its parameters.

However for a secure guest (aka SVM), the page containing the
instruction is in secure memory and the HV cannot access directly.
It would need the Ultravisor (UV) to facilitate accessing the
instruction and parameters but the UV currently does not have
the support for such accesses.

Until the UV has such support, disable doorbells in SVMs. This might
incur a performance hit but that is yet to be quantified.

With this patch applied (needed only in SVMs not needed for HV) we
are able to launch SVM guests with multi-core support. Eg:

qemu -smp sockets=2,cores=2,threads=2.

Fix suggested by Benjamin Herrenschmidt. Thanks to input from
Paul Mackerras, Ram Pai and Michael Anderson.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/platforms/pseries/smp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/smp.c 
b/arch/powerpc/platforms/pseries/smp.c
index 3df46123cce3..95a5c24a1544 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -45,6 +45,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "pseries.h"
 #include "offline_states.h"
@@ -225,7 +226,7 @@ static __init void pSeries_smp_probe_xics(void)
 {
xics_smp_probe();
 
-   if (cpu_has_feature(CPU_FTR_DBELL))
+   if (cpu_has_feature(CPU_FTR_DBELL) && !is_secure_guest())
smp_ops->cause_ipi = smp_pseries_cause_ipi;
else
smp_ops->cause_ipi = icp_ops->cause_ipi;

[PATCH 08/12] powerpc/pseries/svm: Export guest SVM status to user space via sysfs

2019-05-20 Thread Thiago Jung Bauermann

From: Ryan Grimm 

User space might want to know it's running in a secure VM.  It can't do
a mfmsr because mfmsr is a privileged instruction.

The solution here is to create a cpu attribute:

/sys/devices/system/cpu/svm

which will read 0 or 1 based on the S bit of the guest's CPU 0.

Signed-off-by: Ryan Grimm 
Reviewed-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/kernel/sysfs.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index e8e93c2c7d03..8fdab134e9ae 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "cacheinfo.h"
 #include "setup.h"
@@ -714,6 +715,32 @@ static struct device_attribute pa6t_attrs[] = {
 #endif /* HAS_PPC_PMC_PA6T */
 #endif /* HAS_PPC_PMC_CLASSIC */
 
+#ifdef CONFIG_PPC_SVM
+static void get_svm(void *val)
+{
+   u32 *value = val;
+
+   *value = is_secure_guest();
+}
+
+static ssize_t show_svm(struct device *dev, struct device_attribute *attr, 
char *buf)
+{
+   u32 val;
+   smp_call_function_single(0, get_svm, , 1);
+   return sprintf(buf, "%u\n", val);
+}
+static DEVICE_ATTR(svm, 0444, show_svm, NULL);
+
+static void create_svm_file(void)
+{
+   device_create_file(cpu_subsys.dev_root, _attr_svm);
+}
+#else
+static void create_svm_file(void)
+{
+}
+#endif /* CONFIG_PPC_SVM */
+
 static int register_cpu_online(unsigned int cpu)
 {
struct cpu *c = _cpu(cpu_devices, cpu);
@@ -1057,6 +1084,8 @@ static int __init topology_init(void)
sysfs_create_dscr_default();
 #endif /* CONFIG_PPC64 */
 
+   create_svm_file();
+
return 0;
 }
 subsys_initcall(topology_init);

[PATCH 06/12] powerpc/pseries/svm: Use shared memory for LPPACA structures

2019-05-20 Thread Thiago Jung Bauermann

From: Anshuman Khandual 

LPPACA structures need to be shared with the host. Hence they need to be in
shared memory. Instead of allocating individual chunks of memory for a
given structure from memblock, a contiguous chunk of memory is allocated
and then converted into shared memory. Subsequent allocation requests will
come from the contiguous chunk which will be always shared memory for all
structures.

While we are able to use a kmem_cache constructor for the Debug Trace Log,
LPPACAs are allocated very early in the boot process (before SLUB is
available) so we need to use a simpler scheme here.

Introduce helper is_svm_platform() which uses the S bit of the MSR to tell
whether we're running as a secure guest.

Signed-off-by: Anshuman Khandual 
Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/include/asm/svm.h | 26 
 arch/powerpc/kernel/paca.c | 43 +-
 2 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h
new file mode 100644
index ..fef3740f46a6
--- /dev/null
+++ b/arch/powerpc/include/asm/svm.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * SVM helper functions
+ *
+ * Copyright 2019 Anshuman Khandual, IBM Corporation.
+ */
+
+#ifndef _ASM_POWERPC_SVM_H
+#define _ASM_POWERPC_SVM_H
+
+#ifdef CONFIG_PPC_SVM
+
+static inline bool is_secure_guest(void)
+{
+   return mfmsr() & MSR_S;
+}
+
+#else /* CONFIG_PPC_SVM */
+
+static inline bool is_secure_guest(void)
+{
+   return false;
+}
+
+#endif /* CONFIG_PPC_SVM */
+#endif /* _ASM_POWERPC_SVM_H */
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 854105db5cff..a9622f4b45bb 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -18,6 +18,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "setup.h"
 
@@ -58,6 +60,41 @@ static void *__init alloc_paca_data(unsigned long size, 
unsigned long align,
 
 #define LPPACA_SIZE 0x400
 
+static void *__init alloc_shared_lppaca(unsigned long size, unsigned long 
align,
+   unsigned long limit, int cpu)
+{
+   size_t shared_lppaca_total_size = PAGE_ALIGN(nr_cpu_ids * LPPACA_SIZE);
+   static unsigned long shared_lppaca_size;
+   static void *shared_lppaca;
+   void *ptr;
+
+   if (!shared_lppaca) {
+   memblock_set_bottom_up(true);
+
+   shared_lppaca =
+   memblock_alloc_try_nid(shared_lppaca_total_size,
+  PAGE_SIZE, MEMBLOCK_LOW_LIMIT,
+  limit, NUMA_NO_NODE);
+   if (!shared_lppaca)
+   panic("cannot allocate shared data");
+
+   memblock_set_bottom_up(false);
+   uv_share_page(PHYS_PFN(__pa(shared_lppaca)),
+ shared_lppaca_total_size >> PAGE_SHIFT);
+   }
+
+   ptr = shared_lppaca + shared_lppaca_size;
+   shared_lppaca_size += size;
+
+   /*
+* This is very early in boot, so no harm done if the kernel crashes at
+* this point.
+*/
+   BUG_ON(shared_lppaca_size >= shared_lppaca_total_size);
+
+   return ptr;
+}
+
 /*
  * See asm/lppaca.h for more detail.
  *
@@ -87,7 +124,11 @@ static struct lppaca * __init new_lppaca(int cpu, unsigned 
long limit)
if (early_cpu_has_feature(CPU_FTR_HVMODE))
return NULL;
 
-   lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu);
+   if (is_secure_guest())
+   lp = alloc_shared_lppaca(LPPACA_SIZE, 0x400, limit, cpu);
+   else
+   lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu);
+
init_lppaca(lp);
 
return lp;

[PATCH 05/12] powerpc/pseries: Add and use LPPACA_SIZE constant

2019-05-20 Thread Thiago Jung Bauermann

Helps document what the hard-coded number means.

Also take the opportunity to fix an #endif comment.

Suggested-by: Alexey Kardashevskiy 
Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/kernel/paca.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 9cc91d03ab62..854105db5cff 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -56,6 +56,8 @@ static void *__init alloc_paca_data(unsigned long size, 
unsigned long align,
 
 #ifdef CONFIG_PPC_PSERIES
 
+#define LPPACA_SIZE 0x400
+
 /*
  * See asm/lppaca.h for more detail.
  *
@@ -69,7 +71,7 @@ static inline void init_lppaca(struct lppaca *lppaca)
 
*lppaca = (struct lppaca) {
.desc = cpu_to_be32(0xd397d781),/* "LpPa" */
-   .size = cpu_to_be16(0x400),
+   .size = cpu_to_be16(LPPACA_SIZE),
.fpregs_in_use = 1,
.slb_count = cpu_to_be16(64),
.vmxregs_in_use = 0,
@@ -79,19 +81,18 @@ static inline void init_lppaca(struct lppaca *lppaca)
 static struct lppaca * __init new_lppaca(int cpu, unsigned long limit)
 {
struct lppaca *lp;
-   size_t size = 0x400;
 
-   BUILD_BUG_ON(size < sizeof(struct lppaca));
+   BUILD_BUG_ON(sizeof(struct lppaca) > LPPACA_SIZE);
 
if (early_cpu_has_feature(CPU_FTR_HVMODE))
return NULL;
 
-   lp = alloc_paca_data(size, 0x400, limit, cpu);
+   lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu);
init_lppaca(lp);
 
return lp;
 }
-#endif /* CONFIG_PPC_BOOK3S */
+#endif /* CONFIG_PPC_PSERIES */
 
 #ifdef CONFIG_PPC_BOOK3S_64

[PATCH 00/12] Secure Virtual Machine Enablement

2019-05-20 Thread Thiago Jung Bauermann

This series enables Secure Virtual Machines (SVMs) on powerpc. SVMs use the
Protected Execution Facility (PEF) and request to be migrated to secure
memory during prom_init() so by default all of their memory is inaccessible
to the hypervisor. There is an Ultravisor call that the VM can use to
request certain pages to be made accessible to (or shared with) the
hypervisor.

The objective of these patches is to have the guest perform this request
for buffers that need to be accessed by the hypervisor such as the LPPACAs,
the SWIOTLB memory and the Debug Trace Log.

The patch set applies on top of Claudio Carvalho's "kvmppc: Paravirtualize KVM
to support ultravisor" series:

https://lore.kernel.org/linuxppc-dev/20190518142524.28528-1-cclau...@linux.ibm.com/

I only need the following two patches from his series:

[RFC PATCH v2 02/10] KVM: PPC: Ultravisor: Introduce the MSR_S bit
[RFC PATCH v2 04/10] KVM: PPC: Ultravisor: Add generic ultravisor call handler

Patches 2 and 3 are posted as RFC because we are still finalizing the
details on how the ESM blob will be passed to the kernel. All other patches
are (hopefully) in upstreamable shape.

Unfortunately this series still doesn't enable the use of virtio devices in
the secure guest. This support depends on a discussion that is currently
ongoing with the virtio community:

https://lore.kernel.org/linuxppc-dev/87womn8inf.fsf@morokweng.localdomain/

This was the last time I posted this patch set:

https://lore.kernel.org/linuxppc-dev/20180824162535.22798-1-bauer...@linux.ibm.com/

At that time, it wasn't possible to launch a real secure guest because the
Ultravisor was still in very early development. Now there is a relatively
mature Ultravisor and I was able to test it using Claudio's patches in the
host kernel, booting normally using an initramfs for the root filesystem.

This is the command used to start up the guest with QEMU 4.0:

qemu-system-ppc64   \
-nodefaults \
-cpu host   \
-machine 
pseries,accel=kvm,kvm-type=HV,cap-htm=off,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken
 \
-display none   \
-serial mon:stdio   \
-smp 1  \
-m 4G   \
-kernel /home/bauermann/vmlinux \
-initrd /home/bauermann/fs_small.cpio   \
-append 'debug'

Changelog since the RFC from August:

- Patch "powerpc/pseries: Introduce option to build secure virtual machines"
  - New patch.

- Patch "powerpc: Add support for adding an ESM blob to the zImage wrapper"
  - Patch from Benjamin Herrenschmidt, first posted here:

https://lore.kernel.org/linuxppc-dev/20180531043417.25073-1-b...@kernel.crashing.org/
  - Made minor adjustments to some comments. Code is unchanged.

- Patch "powerpc/prom_init: Add the ESM call to prom_init"
  - New patch from Ram Pai and Michael Anderson.

- Patch "powerpc/pseries/svm: Add helpers for UV_SHARE_PAGE and UV_UNSHARE_PAGE"
  - New patch from Ram Pai.

- Patch "powerpc/pseries: Add and use LPPACA_SIZE constant"
  - Moved LPPACA_SIZE macro inside the CONFIG_PPC_PSERIES #ifdef.
  - Put sizeof() operand left of comparison operator in BUILD_BUG_ON()
macro to appease a checkpatch warning.

- Patch "powerpc/pseries/svm: Use shared memory for LPPACA structures"
  - Moved definition of is_secure_guest() helper to this patch.
  - Changed shared_lppaca and shared_lppaca_size from globals to static
variables inside alloc_shared_lppaca().
  - Changed shared_lppaca to hold virtual address instead of physical
address.

- Patch "powerpc/pseries/svm: Use shared memory for Debug Trace Log (DTL)"
  - Add get_dtl_cache_ctor() macro. Suggested by Ram Pai.

- Patch "powerpc/pseries/svm: Export guest SVM status to user space via sysfs"
  - New patch from Ryan Grimm.

- Patch "powerpc/pseries/svm: Disable doorbells in SVM guests"
  - New patch from Sukadev Bhattiprolu.

- Patch "powerpc/pseries/iommu: Don't use dma_iommu_ops on secure guests"
  - New patch.

- Patch "powerpc/pseries/svm: Force SWIOTLB for secure guests"
  - New patch with code that was previously in other patches.

- Patch "powerpc/configs: Enable secure guest support in pseries and ppc64 
defconfigs"
  - New patch from Ryan Grimm.

- Patch "powerpc/pseries/svm: Detect Secure Virtual Machine (SVM) platform"
  - Dropped this patch by moving its code to other patches.

- Patch "powerpc/svm: Select CONFIG_DMA_DIRECT_OPS and CONFIG_SWIOTLB"
  - No need to select CONFIG_DMA_DIRECT_OPS anymore. The CONFIG_SWIOTLB
change was moved to another patch and this patch was dropped.

- Patch "powerpc/pseries/svm: Add memory conversion (shared/secure) helper 
functions"
  - Dropped patch since the helper functions were unnecessary wrappers
around uv_share_page() and uv_unshare_page().

- Patch "powerpc/svm: Convert SWIOTLB buffers

[PATCH 04/12] powerpc/pseries/svm: Add helpers for UV_SHARE_PAGE and UV_UNSHARE_PAGE

2019-05-20 Thread Thiago Jung Bauermann

From: Ram Pai 

These functions are used when the guest wants to grant the hypervisor
access to certain pages.

Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/include/asm/ultravisor-api.h |  2 ++
 arch/powerpc/include/asm/ultravisor.h | 14 ++
 2 files changed, 16 insertions(+)

diff --git a/arch/powerpc/include/asm/ultravisor-api.h 
b/arch/powerpc/include/asm/ultravisor-api.h
index 0e8b72081718..ed68b02869fd 100644
--- a/arch/powerpc/include/asm/ultravisor-api.h
+++ b/arch/powerpc/include/asm/ultravisor-api.h
@@ -20,6 +20,8 @@
 /* opcodes */
 #define UV_WRITE_PATE  0xF104
 #define UV_ESM 0xF110
+#define UV_SHARE_PAGE  0xF130
+#define UV_UNSHARE_PAGE0xF134
 #define UV_RETURN  0xF11C
 
 #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */
diff --git a/arch/powerpc/include/asm/ultravisor.h 
b/arch/powerpc/include/asm/ultravisor.h
index 09e0a615d96f..537f7717d21a 100644
--- a/arch/powerpc/include/asm/ultravisor.h
+++ b/arch/powerpc/include/asm/ultravisor.h
@@ -44,6 +44,20 @@ static inline int uv_register_pate(u64 lpid, u64 dw0, u64 
dw1)
return ucall(UV_WRITE_PATE, retbuf, lpid, dw0, dw1);
 }
 
+static inline int uv_share_page(u64 pfn, u64 npages)
+{
+   unsigned long retbuf[UCALL_BUFSIZE];
+
+   return ucall(UV_SHARE_PAGE, retbuf, pfn, npages);
+}
+
+static inline int uv_unshare_page(u64 pfn, u64 npages)
+{
+   unsigned long retbuf[UCALL_BUFSIZE];
+
+   return ucall(UV_UNSHARE_PAGE, retbuf, pfn, npages);
+}
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_ULTRAVISOR_H */

[RFC PATCH 03/12] powerpc/prom_init: Add the ESM call to prom_init

2019-05-20 Thread Thiago Jung Bauermann

From: Ram Pai 

Make the Enter-Secure-Mode (ESM) ultravisor call to switch the VM to secure
mode. Add "svm=" command line option to turn off switching to secure mode.
Introduce CONFIG_PPC_SVM to control support for secure guests.

Signed-off-by: Ram Pai 
[ Generate an RTAS os-term hcall when the ESM ucall fails. ]
Signed-off-by: Michael Anderson 
[ Cleaned up the code a bit. ]
Signed-off-by: Thiago Jung Bauermann 
---
 .../admin-guide/kernel-parameters.txt |   5 +
 arch/powerpc/include/asm/ultravisor-api.h |   1 +
 arch/powerpc/kernel/prom_init.c   | 124 ++
 3 files changed, 130 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index c45a19d654f3..7237d86b25c6 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4501,6 +4501,11 @@
/sys/power/pm_test). Only available when CONFIG_PM_DEBUG
is set. Default value is 5.
 
+   svm=[PPC]
+   Format: { on | off | y | n | 1 | 0 }
+   This parameter controls use of the Protected
+   Execution Facility on pSeries.
+
swapaccount=[0|1]
[KNL] Enable accounting of swap in memory resource
controller if no parameter or 1 is given or disable
diff --git a/arch/powerpc/include/asm/ultravisor-api.h 
b/arch/powerpc/include/asm/ultravisor-api.h
index 15e6ce77a131..0e8b72081718 100644
--- a/arch/powerpc/include/asm/ultravisor-api.h
+++ b/arch/powerpc/include/asm/ultravisor-api.h
@@ -19,6 +19,7 @@
 
 /* opcodes */
 #define UV_WRITE_PATE  0xF104
+#define UV_ESM 0xF110
 #define UV_RETURN  0xF11C
 
 #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 523bb99d7676..5d8a3efb54f2 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -174,6 +175,10 @@ static unsigned long __prombss prom_tce_alloc_end;
 static bool __prombss prom_radix_disable;
 #endif
 
+#ifdef CONFIG_PPC_SVM
+static bool __prombss prom_svm_disable;
+#endif
+
 struct platform_support {
bool hash_mmu;
bool radix_mmu;
@@ -809,6 +814,17 @@ static void __init early_cmdline_parse(void)
if (prom_radix_disable)
prom_debug("Radix disabled from cmdline\n");
 #endif /* CONFIG_PPC_PSERIES */
+
+#ifdef CONFIG_PPC_SVM
+   opt = prom_strstr(prom_cmd_line, "svm=");
+   if (opt) {
+   bool val;
+
+   opt += sizeof("svm=") - 1;
+   if (!prom_strtobool(opt, ))
+   prom_svm_disable = !val;
+   }
+#endif /* CONFIG_PPC_SVM */
 }
 
 #ifdef CONFIG_PPC_PSERIES
@@ -1707,6 +1723,43 @@ static void __init prom_close_stdin(void)
}
 }
 
+#ifdef CONFIG_PPC_SVM
+static int prom_rtas_os_term_hcall(uint64_t args)
+{
+   register uint64_t arg1 asm("r3") = 0xf000;
+   register uint64_t arg2 asm("r4") = args;
+
+   asm volatile("sc 1\n" : "=r" (arg1) :
+   "r" (arg1),
+   "r" (arg2) :);
+   return arg1;
+}
+
+static struct rtas_args __prombss os_term_args;
+
+static void __init prom_rtas_os_term(char *str)
+{
+   phandle rtas_node;
+   __be32 val;
+   u32 token;
+
+   prom_printf("%s: start...\n", __func__);
+   rtas_node = call_prom("finddevice", 1, 1, ADDR("/rtas"));
+   prom_printf("rtas_node: %x\n", rtas_node);
+   if (!PHANDLE_VALID(rtas_node))
+   return;
+
+   val = 0;
+   prom_getprop(rtas_node, "ibm,os-term", , sizeof(val));
+   token = be32_to_cpu(val);
+   prom_printf("ibm,os-term: %x\n", token);
+   if (token == 0)
+   prom_panic("Could not get token for ibm,os-term\n");
+   os_term_args.token = cpu_to_be32(token);
+   prom_rtas_os_term_hcall((uint64_t)_term_args);
+}
+#endif /* CONFIG_PPC_SVM */
+
 /*
  * Allocate room for and instantiate RTAS
  */
@@ -3162,6 +3215,74 @@ static void unreloc_toc(void)
 #endif
 #endif
 
+#ifdef CONFIG_PPC_SVM
+/*
+ * The ESM blob is a data structure with information needed by the Ultravisor 
to
+ * validate the integrity of the secure guest.
+ */
+static void *get_esm_blob(void)
+{
+   /*
+* FIXME: We are still finalizing the details on how prom_init will grab
+* the ESM blob. When that is done, this function will be updated.
+*/
+   return (void *)0xdeadbeef;
+}
+
+/*
+ * Perform the Enter Secure Mode ultracall.
+ */
+static int enter_secure_mode(void *esm_blob, void *retaddr, void *fdt)
+{
+   register uint64_t func asm("r0") = UV_ESM;
+   register uint64_t arg1 asm("r3") = (uint64_t)esm_blob;
+   register uint64_t arg2

[RFC PATCH 02/12] powerpc: Add support for adding an ESM blob to the zImage wrapper

2019-05-20 Thread Thiago Jung Bauermann

From: Benjamin Herrenschmidt 

For secure VMs, the signing tool will create a ticket called the "ESM blob"
for the Enter Secure Mode ultravisor call with the signatures of the kernel
and initrd among other things.

This adds support to the wrapper script for adding that blob via the "-e"
option to the zImage.pseries.

It also adds code to the zImage wrapper itself to retrieve and if necessary
relocate the blob, and pass its address to Linux via the device-tree, to be
later consumed by prom_init.

Signed-off-by: Benjamin Herrenschmidt 
[ Minor adjustments to some comments. ]
Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/boot/main.c   | 41 ++
 arch/powerpc/boot/ops.h|  2 ++
 arch/powerpc/boot/wrapper  | 24 +---
 arch/powerpc/boot/zImage.lds.S |  8 +++
 4 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/boot/main.c b/arch/powerpc/boot/main.c
index 78aaf4ffd7ab..ca612efd3e81 100644
--- a/arch/powerpc/boot/main.c
+++ b/arch/powerpc/boot/main.c
@@ -150,6 +150,46 @@ static struct addr_range prep_initrd(struct addr_range 
vmlinux, void *chosen,
return (struct addr_range){(void *)initrd_addr, initrd_size};
 }
 
+#ifdef __powerpc64__
+static void prep_esm_blob(struct addr_range vmlinux, void *chosen)
+{
+   unsigned long esm_blob_addr, esm_blob_size;
+
+   /* Do we have an ESM (Enter Secure Mode) blob? */
+   if (_esm_blob_end <= _esm_blob_start)
+   return;
+
+   printf("Attached ESM blob at 0x%p-0x%p\n\r",
+  _esm_blob_start, _esm_blob_end);
+   esm_blob_addr = (unsigned long)_esm_blob_start;
+   esm_blob_size = _esm_blob_end - _esm_blob_start;
+
+   /*
+* If the ESM blob is too low it will be clobbered when the
+* kernel relocates to its final location.  In this case,
+* allocate a safer place and move it.
+*/
+   if (esm_blob_addr < vmlinux.size) {
+   void *old_addr = (void *)esm_blob_addr;
+
+   printf("Allocating 0x%lx bytes for esm_blob ...\n\r",
+  esm_blob_size);
+   esm_blob_addr = (unsigned long)malloc(esm_blob_size);
+   if (!esm_blob_addr)
+   fatal("Can't allocate memory for ESM blob !\n\r");
+   printf("Relocating ESM blob 0x%lx <- 0x%p (0x%lx bytes)\n\r",
+  esm_blob_addr, old_addr, esm_blob_size);
+   memmove((void *)esm_blob_addr, old_addr, esm_blob_size);
+   }
+
+   /* Tell the kernel ESM blob address via device tree. */
+   setprop_val(chosen, "linux,esm-blob-start", (u32)(esm_blob_addr));
+   setprop_val(chosen, "linux,esm-blob-end", (u32)(esm_blob_addr + 
esm_blob_size));
+}
+#else
+static inline void prep_esm_blob(struct addr_range vmlinux, void *chosen) { }
+#endif
+
 /* A buffer that may be edited by tools operating on a zImage binary so as to
  * edit the command line passed to vmlinux (by setting /chosen/bootargs).
  * The buffer is put in it's own section so that tools may locate it easier.
@@ -218,6 +258,7 @@ void start(void)
vmlinux = prep_kernel();
initrd = prep_initrd(vmlinux, chosen,
 loader_info.initrd_addr, loader_info.initrd_size);
+   prep_esm_blob(vmlinux, chosen);
prep_cmdline(chosen);
 
printf("Finalizing device tree...");
diff --git a/arch/powerpc/boot/ops.h b/arch/powerpc/boot/ops.h
index cd043726ed88..e0606766480f 100644
--- a/arch/powerpc/boot/ops.h
+++ b/arch/powerpc/boot/ops.h
@@ -251,6 +251,8 @@ extern char _initrd_start[];
 extern char _initrd_end[];
 extern char _dtb_start[];
 extern char _dtb_end[];
+extern char _esm_blob_start[];
+extern char _esm_blob_end[];
 
 static inline __attribute__((const))
 int __ilog2_u32(u32 n)
diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper
index f9141eaec6ff..36b2ad6cd5b7 100755
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@@ -14,6 +14,7 @@
 # -i initrdspecify initrd file
 # -d devtree   specify device-tree blob
 # -s tree.dts  specify device-tree source file (needs dtc installed)
+# -e esm_blob   specify ESM blob for secure images
 # -c   cache $kernel.strip.gz (use if present & newer, else make)
 # -C prefixspecify command prefix for cross-building tools
 #  (strip, objcopy, ld)
@@ -38,6 +39,7 @@ platform=of
 initrd=
 dtb=
 dts=
+esm_blob=
 cacheit=
 binary=
 compression=.gz
@@ -60,9 +62,9 @@ tmpdir=.
 
 usage() {
 echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2
-echo '   [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2
-echo '   [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2
-echo '   [--no-compression] [vmlinux]' >&2
+echo '   [-d devtree] [-s tree.dts] [-e esm_blob]' >&2
+echo '   [-c] [-C cross-prefix] [-D datadir] [-W workingdir]' >&2
+echo '   [-Z (gz|xz|none)] [--no-compression]

[PATCH 01/12] powerpc/pseries: Introduce option to build secure virtual machines

2019-05-20 Thread Thiago Jung Bauermann

Introduce CONFIG_PPC_SVM to control support for secure guests and include
Ultravisor-related helpers when it is selected

Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/include/asm/ultravisor.h  |  2 +-
 arch/powerpc/kernel/Makefile   |  4 +++-
 arch/powerpc/platforms/pseries/Kconfig | 12 
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/ultravisor.h 
b/arch/powerpc/include/asm/ultravisor.h
index 4ffec7a36acd..09e0a615d96f 100644
--- a/arch/powerpc/include/asm/ultravisor.h
+++ b/arch/powerpc/include/asm/ultravisor.h
@@ -28,7 +28,7 @@ extern int early_init_dt_scan_ultravisor(unsigned long node, 
const char *uname,
  * This call supports up to 6 arguments and 4 return arguments. Use
  * UCALL_BUFSIZE to size the return argument buffer.
  */
-#if defined(CONFIG_PPC_UV)
+#if defined(CONFIG_PPC_UV) || defined(CONFIG_PPC_SVM)
 long ucall(unsigned long opcode, unsigned long *retbuf, ...);
 #else
 static long ucall(unsigned long opcode, unsigned long *retbuf, ...)
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 43ff4546e469..1e9b721634c8 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -154,7 +154,9 @@ endif
 
 obj-$(CONFIG_EPAPR_PARAVIRT)   += epapr_paravirt.o epapr_hcalls.o
 obj-$(CONFIG_KVM_GUEST)+= kvm.o kvm_emul.o
-obj-$(CONFIG_PPC_UV)   += ultravisor.o ucall.o
+ifneq ($(CONFIG_PPC_UV)$(CONFIG_PPC_SVM),)
+obj-y  += ultravisor.o ucall.o
+endif
 
 # Disable GCOV, KCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index 9c6b3d860518..82c16aa4f1ce 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -144,3 +144,15 @@ config PAPR_SCM
tristate "Support for the PAPR Storage Class Memory interface"
help
  Enable access to hypervisor provided storage class memory.
+
+config PPC_SVM
+   bool "Secure virtual machine (SVM) support for POWER"
+   depends on PPC_PSERIES
+   default n
+   help
+Support secure guests on POWER. There are certain POWER platforms which
+support secure guests using the Protected Execution Facility, with the
+help of an Ultravisor executing below the hypervisor layer. This
+enables the support for those guests.
+
+If unsure, say "N".

Re: [RFC PATCH] powerpc/mm: Implement STRICT_MODULE_RWX

2019-05-20 Thread Russell Currey

On Wed, 2019-05-15 at 06:20 +, Christophe Leroy wrote:

Confirming this works on hash and radix book3s64.

> +
> + // only operate on VM areas for now
> + area = find_vm_area((void *)addr);
> + if (!area || end > (unsigned long)area->addr + area->size ||
> + !(area->flags & VM_ALLOC))
> + return -EINVAL;

https://lore.kernel.org/patchwork/project/lkml/list/?series=391470

With this patch, the above series causes crashes on (at least) Hash,
since it adds another user of change_page_rw() and change_page_nx()
that for reasons I don't understand yet, we can't handle.  I can work
around this with:

if (area->flags & VM_FLUSH_RESET_PERMS)
return 0;

so this is broken on at least one platform as of 5.2-rc1.  We're going
to look into this more to see if there's anything else we have to do as
a result of this series before the next merge window, or if just
working around it like this is good enough.

- Russell

Re: [PATCH] crypto: vmx - convert to SPDX license identifiers

2019-05-20 Thread Michael Ellerman

Eric Biggers  writes:
> From: Eric Biggers 
>
> Remove the boilerplate license text and replace it with the equivalent
> SPDX license identifier.
>
> Signed-off-by: Eric Biggers 
> ---
>  drivers/crypto/vmx/aes.c | 14 +-
>  drivers/crypto/vmx/aes_cbc.c | 14 +-
>  drivers/crypto/vmx/aes_ctr.c | 14 +-
>  drivers/crypto/vmx/aes_xts.c | 14 +-
>  drivers/crypto/vmx/vmx.c | 14 +-
>  5 files changed, 5 insertions(+), 65 deletions(-)

Looks good to me.

Reviewed-by: Michael Ellerman 

cheers

Re: [PATCH 10/10] docs: fix broken documentation links

2019-05-20 Thread Wolfram Sang

On Mon, May 20, 2019 at 11:47:39AM -0300, Mauro Carvalho Chehab wrote:
> Mostly due to x86 and acpi conversion, several documentation
> links are still pointing to the old file. Fix them.
> 
> Signed-off-by: Mauro Carvalho Chehab 

Thanks, didn't notice that.

>  Documentation/i2c/instantiating-devices  |  2 +-

...

> diff --git a/Documentation/i2c/instantiating-devices 
> b/Documentation/i2c/instantiating-devices
> index 0d85ac1935b7..5a3e2f331e8c 100644
> --- a/Documentation/i2c/instantiating-devices
> +++ b/Documentation/i2c/instantiating-devices
> @@ -85,7 +85,7 @@ Method 1c: Declare the I2C devices via ACPI
>  ---
>  
>  ACPI can also describe I2C devices. There is special documentation for this
> -which is currently located at Documentation/acpi/enumeration.txt.
> +which is currently located at 
> Documentation/firmware-guide/acpi/enumeration.rst.
>  
>  
>  Method 2: Instantiate the devices explicitly

For this I2C part:

Reviewed-by: Wolfram Sang 



signature.asc
Description: PGP signature

Re: [PATCH] misc: remove redundant 'default n' from Kconfig-s

2019-05-20 Thread Michał Mirosław

On Mon, May 20, 2019 at 04:10:46PM +0200, Bartlomiej Zolnierkiewicz wrote:
> 'default n' is the default value for any bool or tristate Kconfig
> setting so there is no need to write it explicitly.
> 
> Also since commit f467c5640c29 ("kconfig: only write '# CONFIG_FOO
> is not set' for visible symbols") the Kconfig behavior is the same
> regardless of 'default n' being present or not:
> 
> ...
> One side effect of (and the main motivation for) this change is making
> the following two definitions behave exactly the same:
> 
> config FOO
> bool
> 
> config FOO
> bool
> default n
> 
> With this change, neither of these will generate a
> '# CONFIG_FOO is not set' line (assuming FOO isn't selected/implied).
> That might make it clearer to people that a bare 'default n' is
> redundant.
> ...
> 
> Signed-off-by: Bartlomiej Zolnierkiewicz 
[...]
>  drivers/misc/cb710/Kconfig|1 -

Acked-by: Michał Mirosław

[PATCH] crypto: testmgr - fix length truncation with large page size

2019-05-20 Thread Eric Biggers

From: Eric Biggers 

On PowerPC with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y, there is sometimes
a crash in generate_random_aead_testvec().  The problem is that the
generated test vectors use data lengths of up to about 2 * PAGE_SIZE,
which is 128 KiB on PowerPC; however, the data length fields in the test
vectors are 'unsigned short', so the lengths get truncated.  Fix this by
changing the relevant fields to 'unsigned int'.

Fixes: 40153b10d91c ("crypto: testmgr - fuzz AEADs against their generic 
implementation")
Signed-off-by: Eric Biggers 
---
 crypto/testmgr.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 9a13c634b2077..fb2afdd544e00 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -43,7 +43,7 @@ struct hash_testvec {
const char *key;
const char *plaintext;
const char *digest;
-   unsigned short psize;
+   unsigned int psize;
unsigned short ksize;
int setkey_error;
int digest_error;
@@ -74,7 +74,7 @@ struct cipher_testvec {
const char *ctext;
unsigned char wk; /* weak key flag */
unsigned short klen;
-   unsigned short len;
+   unsigned int len;
bool fips_skip;
bool generates_iv;
int setkey_error;
@@ -110,9 +110,9 @@ struct aead_testvec {
unsigned char novrfy;
unsigned char wk;
unsigned char klen;
-   unsigned short plen;
-   unsigned short clen;
-   unsigned short alen;
+   unsigned int plen;
+   unsigned int clen;
+   unsigned int alen;
int setkey_error;
int setauthsize_error;
int crypt_error;
-- 
2.21.0.1020.gf2820cf01a-goog

[PATCH] crypto: vmx - convert to skcipher API

2019-05-20 Thread Eric Biggers

From: Eric Biggers 

Convert the VMX implementations of AES-CBC, AES-CTR, and AES-XTS from
the deprecated "blkcipher" API to the "skcipher" API.

As part of this, I moved the skcipher_request for the fallback algorithm
off the stack and into the request context of the parent algorithm.

I tested this in a PowerPC VM with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y.

Signed-off-by: Eric Biggers 
---
 drivers/crypto/vmx/aes_cbc.c   | 183 -
 drivers/crypto/vmx/aes_ctr.c   | 165 +
 drivers/crypto/vmx/aes_xts.c   | 175 ++-
 drivers/crypto/vmx/aesp8-ppc.h |   2 -
 drivers/crypto/vmx/vmx.c   |  72 +++--
 5 files changed, 252 insertions(+), 345 deletions(-)

diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c
index dae8af3c46dce..92e75a05d6a9e 100644
--- a/drivers/crypto/vmx/aes_cbc.c
+++ b/drivers/crypto/vmx/aes_cbc.c
@@ -7,64 +7,52 @@
  * Author: Marcelo Henrique Cerri 
  */
 
-#include 
-#include 
-#include 
-#include 
 #include 
 #include 
 #include 
 #include 
-#include 
-#include 
+#include 
 
 #include "aesp8-ppc.h"
 
 struct p8_aes_cbc_ctx {
-   struct crypto_sync_skcipher *fallback;
+   struct crypto_skcipher *fallback;
struct aes_key enc_key;
struct aes_key dec_key;
 };
 
-static int p8_aes_cbc_init(struct crypto_tfm *tfm)
+static int p8_aes_cbc_init(struct crypto_skcipher *tfm)
 {
-   const char *alg = crypto_tfm_alg_name(tfm);
-   struct crypto_sync_skcipher *fallback;
-   struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
-
-   fallback = crypto_alloc_sync_skcipher(alg, 0,
- CRYPTO_ALG_NEED_FALLBACK);
+   struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct crypto_skcipher *fallback;
 
+   fallback = crypto_alloc_skcipher("cbc(aes)", 0,
+CRYPTO_ALG_NEED_FALLBACK |
+CRYPTO_ALG_ASYNC);
if (IS_ERR(fallback)) {
-   printk(KERN_ERR
-  "Failed to allocate transformation for '%s': %ld\n",
-  alg, PTR_ERR(fallback));
+   pr_err("Failed to allocate cbc(aes) fallback: %ld\n",
+  PTR_ERR(fallback));
return PTR_ERR(fallback);
}
 
-   crypto_sync_skcipher_set_flags(
-   fallback,
-   crypto_skcipher_get_flags((struct crypto_skcipher *)tfm));
+   crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) +
+   crypto_skcipher_reqsize(fallback));
ctx->fallback = fallback;
-
return 0;
 }
 
-static void p8_aes_cbc_exit(struct crypto_tfm *tfm)
+static void p8_aes_cbc_exit(struct crypto_skcipher *tfm)
 {
-   struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+   struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-   if (ctx->fallback) {
-   crypto_free_sync_skcipher(ctx->fallback);
-   ctx->fallback = NULL;
-   }
+   crypto_free_skcipher(ctx->fallback);
 }
 
-static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const u8 *key,
+static int p8_aes_cbc_setkey(struct crypto_skcipher *tfm, const u8 *key,
 unsigned int keylen)
 {
+   struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
int ret;
-   struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
 
preempt_disable();
pagefault_disable();
@@ -75,108 +63,71 @@ static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const 
u8 *key,
pagefault_enable();
preempt_enable();
 
-   ret |= crypto_sync_skcipher_setkey(ctx->fallback, key, keylen);
+   ret |= crypto_skcipher_setkey(ctx->fallback, key, keylen);
 
return ret ? -EINVAL : 0;
 }
 
-static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
- struct scatterlist *dst,
- struct scatterlist *src, unsigned int nbytes)
+static int p8_aes_cbc_crypt(struct skcipher_request *req, int enc)
 {
+   struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+   const struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+   struct skcipher_walk walk;
+   unsigned int nbytes;
int ret;
-   struct blkcipher_walk walk;
-   struct p8_aes_cbc_ctx *ctx =
-   crypto_tfm_ctx(crypto_blkcipher_tfm(desc->tfm));
 
if (!crypto_simd_usable()) {
-   SYNC_SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback);
-   skcipher_request_set_sync_tfm(req, ctx->fallback);
-   skcipher_request_set_callback(req, desc->flags, NULL, NULL);
-   skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
-   ret = crypto_skcipher_encrypt(req);
-   skcipher_request_zero(req);
-   } else {
-   blkcipher_walk_init(, dst, src, nbytes);
-   ret

[PATCH] crypto: vmx - convert to SPDX license identifiers

2019-05-20 Thread Eric Biggers

From: Eric Biggers 

Remove the boilerplate license text and replace it with the equivalent
SPDX license identifier.

Signed-off-by: Eric Biggers 
---
 drivers/crypto/vmx/aes.c | 14 +-
 drivers/crypto/vmx/aes_cbc.c | 14 +-
 drivers/crypto/vmx/aes_ctr.c | 14 +-
 drivers/crypto/vmx/aes_xts.c | 14 +-
 drivers/crypto/vmx/vmx.c | 14 +-
 5 files changed, 5 insertions(+), 65 deletions(-)

diff --git a/drivers/crypto/vmx/aes.c b/drivers/crypto/vmx/aes.c
index 603a620819941..2e9476158df49 100644
--- a/drivers/crypto/vmx/aes.c
+++ b/drivers/crypto/vmx/aes.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /**
  * AES routines supporting VMX instructions on the Power 8
  *
  * Copyright (C) 2015 International Business Machines Inc.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 only.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
  * Author: Marcelo Henrique Cerri 
  */
 
diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c
index a1a9a6f0d42cf..dae8af3c46dce 100644
--- a/drivers/crypto/vmx/aes_cbc.c
+++ b/drivers/crypto/vmx/aes_cbc.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /**
  * AES CBC routines supporting VMX instructions on the Power 8
  *
  * Copyright (C) 2015 International Business Machines Inc.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 only.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
  * Author: Marcelo Henrique Cerri 
  */
 
diff --git a/drivers/crypto/vmx/aes_ctr.c b/drivers/crypto/vmx/aes_ctr.c
index 192a53512f5e8..dc31101178446 100644
--- a/drivers/crypto/vmx/aes_ctr.c
+++ b/drivers/crypto/vmx/aes_ctr.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /**
  * AES CTR routines supporting VMX instructions on the Power 8
  *
  * Copyright (C) 2015 International Business Machines Inc.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 only.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
  * Author: Marcelo Henrique Cerri 
  */
 
diff --git a/drivers/crypto/vmx/aes_xts.c b/drivers/crypto/vmx/aes_xts.c
index 00d412d811ae6..aee1339f134ec 100644
--- a/drivers/crypto/vmx/aes_xts.c
+++ b/drivers/crypto/vmx/aes_xts.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /**
  * AES XTS routines supporting VMX In-core instructions on Power 8
  *
  * Copyright (C) 2015 International Business Machines Inc.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundations; version 2 only.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY of FITNESS FOR A PARTICUPAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
  * Author: Leonidas S. Barbosa 
  */
 
diff --git a/drivers/crypto/vmx/vmx.c b/drivers/crypto/vmx/vmx.c
index a9f5198306155..abd89c2bcec4d 100644
--- a/drivers/crypto/vmx/vmx.c
+++ b/drivers/crypto/vmx/vmx.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /**
  * Routines supporting VMX instructions on

Re: [PATCH] crypto: vmx - CTR: always increment IV as quadword

2019-05-20 Thread Eric Biggers

On Mon, May 20, 2019 at 11:59:05AM +1000, Daniel Axtens wrote:
> Daniel Axtens  writes:
> 
> > The kernel self-tests picked up an issue with CTR mode:
> > alg: skcipher: p8_aes_ctr encryption test failed (wrong result) on test 
> > vector 3, cfg="uneven misaligned splits, may sleep"
> >
> > Test vector 3 has an IV of FFFD, so
> > after 3 increments it should wrap around to 0.
> >
> > In the aesp8-ppc code from OpenSSL, there are two paths that
> > increment IVs: the bulk (8 at a time) path, and the individual
> > path which is used when there are fewer than 8 AES blocks to
> > process.
> >
> > In the bulk path, the IV is incremented with vadduqm: "Vector
> > Add Unsigned Quadword Modulo", which does 128-bit addition.
> >
> > In the individual path, however, the IV is incremented with
> > vadduwm: "Vector Add Unsigned Word Modulo", which instead
> > does 4 32-bit additions. Thus the IV would instead become
> > , throwing off the result.
> >
> > Use vadduqm.
> >
> > This was probably a typo originally, what with q and w being
> > adjacent. It is a pretty narrow edge case: I am really
> > impressed by the quality of the kernel self-tests!
> >
> > Fixes: 5c380d623ed3 ("crypto: vmx - Add support for VMS instructions by 
> > ASM")
> > Cc: sta...@vger.kernel.org
> > Signed-off-by: Daniel Axtens 
> >
> > ---
> >
> > I'll pass this along internally to get it into OpenSSL as well.
> 
> I passed this along to OpenSSL and got pretty comprehensively schooled:
> https://github.com/openssl/openssl/pull/8942
> 
> It seems we tweak the openssl code to use a 128-bit counter, whereas
> the original code was in fact designed for a 32-bit counter. We must
> have changed the vaddu instruction in the bulk path but not in the
> individual path, as they're both vadduwm (4x32-bit) upstream.
> 
> I think this change is still correct with regards to the kernel,
> but I guess it's probably something where I should have done a more
> thorough read of the documentation before diving in to the code, and
> perhaps we should note it in the code somewhere too. Ah well.
> 
> Regards,
> Daniel
> 

Ah, I didn't realize there are multiple conventions for CTR.  Yes, all CTR
implementations in the kernel follow the 128-bit convention, and evidently the
test vectors test for that.  Apparently the VMX OpenSSL code got incompletely
changed from the 32-bit convention by this commit, so that's what you're fixing:

commit 1d4aa0b4c1816e8ca92a6aadb0d8f6b43c56c0d0
Author: Leonidas Da Silva Barbosa 
Date:   Fri Aug 14 10:12:22 2015 -0300

crypto: vmx - Fixing AES-CTR counter bug

AES-CTR is using a counter 8bytes-8bytes what miss match with
kernel specs.

In the previous code a vadduwm was done to increment counter.
Replacing this for a vadduqm now considering both cases counter
8-8 bytes and full 16bytes.

A comment in the code would indeed be helpful.

Note that the kernel doesn't currently need a 32-bit CTR implementation for GCM
like OpenSSL does, because the kernel currently only supports 12-byte IVs with
GCM.  So the low 32 bits of the counter start at 1 and don't overflow,
regardless of whether the counter is incremented mod 2^32 or mod 2^128.

- Eric

Re: [PATCH 5/5] asm-generic: remove ptrace.h

2019-05-20 Thread Oleg Nesterov

On 05/20, Christoph Hellwig wrote:
>
> No one is using this header anymore.
> 
> Signed-off-by: Christoph Hellwig 
> Acked-by: Arnd Bergmann 
> ---
>  MAINTAINERS|  1 -
>  arch/mips/include/asm/ptrace.h |  5 ---
>  include/asm-generic/ptrace.h   | 74 --
>  3 files changed, 80 deletions(-)
>  delete mode 100644 include/asm-generic/ptrace.h

Acked-by: Oleg Nesterov

Re: [PATCH 4/5] x86: don't use asm-generic/ptrace.h

2019-05-20 Thread Oleg Nesterov

On 05/20, Christoph Hellwig wrote:
>
> Doing the indirection through macros for the regs accessors just
> makes them harder to read, so implement the helpers directly.

Acked-by: Oleg Nesterov

[PATCH v3 1/2] pid: add pidfd_open()

2019-05-20 Thread Christian Brauner

This adds the pidfd_open() syscall. It allows a caller to retrieve pollable
pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a
process that is created via traditional fork()/clone() calls that is only
referenced by a PID:

int pidfd = pidfd_open(1234, 0);
ret = pidfd_send_signal(pidfd, SIGSTOP, NULL, 0);

With the introduction of pidfds through CLONE_PIDFD it is possible to
created pidfds at process creation time.
However, a lot of processes get created with traditional PID-based calls
such as fork() or clone() (without CLONE_PIDFD). For these processes a
caller can currently not create a pollable pidfd. This is a problem for
Android's low memory killer (LMK) and service managers such as systemd.
Both are examples of tools that want to make use of pidfds to get reliable
notification of process exit for non-parents (pidfd polling) and race-free
signal sending (pidfd_send_signal()). They intend to switch to this API for
process supervision/management as soon as possible. Having no way to get
pollable pidfds from PID-only processes is one of the biggest blockers for
them in adopting this api. With pidfd_open() making it possible to retrieve
pidfds for PID-based processes we enable them to adopt this api.

In line with Arnd's recent changes to consolidate syscall numbers across
architectures, I have added the pidfd_open() syscall to all architectures
at the same time.

Signed-off-by: Christian Brauner 
Reviewed-by: Oleg Nesterov 
Acked-by: Arnd Bergmann 
Cc: "Eric W. Biederman" 
Cc: Kees Cook 
Cc: Joel Fernandes (Google) 
Cc: Thomas Gleixner 
Cc: Jann Horn 
Cc: David Howells 
Cc: Andy Lutomirsky 
Cc: Andrew Morton 
Cc: Aleksa Sarai 
Cc: Linus Torvalds 
Cc: Al Viro 
Cc: linux-...@vger.kernel.org
---
v1:
- kbuild test robot :
  - add missing entry for pidfd_open to arch/arm/tools/syscall.tbl
- Oleg Nesterov :
  - use simpler thread-group leader check
v2:
- Oleg Nesterov :
  - avoid using additional variable
  - remove unneeded comment
- Arnd Bergmann :
  - switch from 428 to 434 since the new mount api has taken it
  - bump syscall numbers in arch/arm64/include/asm/unistd.h
- Joel Fernandes (Google) :
  - switch from ESRCH to EINVAL when the passed-in pid does not refer to a
thread-group leader
- Christian Brauner :
  - rebase on v5.2-rc1
  - adapt syscall number to account for new mount api syscalls
v3:
- Arnd Bergmann :
  - add missing syscall entries for mips-o32 and mips-n64
---
 arch/alpha/kernel/syscalls/syscall.tbl  |  1 +
 arch/arm/tools/syscall.tbl  |  1 +
 arch/arm64/include/asm/unistd.h |  2 +-
 arch/arm64/include/asm/unistd32.h   |  2 +
 arch/ia64/kernel/syscalls/syscall.tbl   |  1 +
 arch/m68k/kernel/syscalls/syscall.tbl   |  1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |  1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |  1 +
 arch/parisc/kernel/syscalls/syscall.tbl |  1 +
 arch/powerpc/kernel/syscalls/syscall.tbl|  1 +
 arch/s390/kernel/syscalls/syscall.tbl   |  1 +
 arch/sh/kernel/syscalls/syscall.tbl |  1 +
 arch/sparc/kernel/syscalls/syscall.tbl  |  1 +
 arch/x86/entry/syscalls/syscall_32.tbl  |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl  |  1 +
 arch/xtensa/kernel/syscalls/syscall.tbl |  1 +
 include/linux/pid.h |  1 +
 include/linux/syscalls.h|  1 +
 include/uapi/asm-generic/unistd.h   |  4 +-
 kernel/fork.c   |  2 +-
 kernel/pid.c| 43 +
 23 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl 
b/arch/alpha/kernel/syscalls/syscall.tbl
index 9e7704e44f6d..1db9bbcfb84e 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -473,3 +473,4 @@
 541common  fsconfigsys_fsconfig
 542common  fsmount sys_fsmount
 543common  fspick  sys_fspick
+544common  pidfd_open  sys_pidfd_open
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index aaf479a9e92d..81e6e1817c45 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -447,3 +447,4 @@
 431common  fsconfigsys_fsconfig
 432common  fsmount sys_fsmount
 433common  fspick  sys_fspick
+434common  pidfd_open  sys_pidfd_open
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 70e6882853c0..e8f7d95a1481 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -44,7 +44,7 @@
 #define __ARM_NR_compat_set_tls(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END(__ARM_NR_COMPAT_BASE +

[PATCH v3 2/2] tests: add pidfd_open() tests

2019-05-20 Thread Christian Brauner

This adds testing for the new pidfd_open() syscalls. Specifically, we test:
- that no invalid flags can be passed to pidfd_open()
- that no invalid pid can be passed to pidfd_open()
- that a pidfd can be retrieved with pidfd_open()
- that the retrieved pidfd references the correct pid

Signed-off-by: Christian Brauner 
Cc: Arnd Bergmann 
Cc: "Eric W. Biederman" 
Cc: Kees Cook 
Cc: Joel Fernandes (Google) 
Cc: Thomas Gleixner 
Cc: Jann Horn 
Cc: David Howells 
Cc: "Michael Kerrisk (man-pages)" 
Cc: Andy Lutomirsky 
Cc: Andrew Morton 
Cc: Oleg Nesterov 
Cc: Aleksa Sarai 
Cc: Linus Torvalds 
Cc: Al Viro 
Cc: linux-...@vger.kernel.org
---
v1: unchanged
v2:
- Christian Brauner :
  - set number of planned tests via ksft_set_plan()
v3: unchanged
---
 tools/testing/selftests/pidfd/.gitignore  |   1 +
 tools/testing/selftests/pidfd/Makefile|   2 +-
 tools/testing/selftests/pidfd/pidfd.h |  57 ++
 .../testing/selftests/pidfd/pidfd_open_test.c | 169 ++
 tools/testing/selftests/pidfd/pidfd_test.c|  41 +
 5 files changed, 229 insertions(+), 41 deletions(-)
 create mode 100644 tools/testing/selftests/pidfd/pidfd.h
 create mode 100644 tools/testing/selftests/pidfd/pidfd_open_test.c

diff --git a/tools/testing/selftests/pidfd/.gitignore 
b/tools/testing/selftests/pidfd/.gitignore
index 822a1e63d045..16d84d117bc0 100644
--- a/tools/testing/selftests/pidfd/.gitignore
+++ b/tools/testing/selftests/pidfd/.gitignore
@@ -1 +1,2 @@
+pidfd_open_test
 pidfd_test
diff --git a/tools/testing/selftests/pidfd/Makefile 
b/tools/testing/selftests/pidfd/Makefile
index deaf8073bc06..b36c0be70848 100644
--- a/tools/testing/selftests/pidfd/Makefile
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -1,6 +1,6 @@
 CFLAGS += -g -I../../../../usr/include/
 
-TEST_GEN_PROGS := pidfd_test
+TEST_GEN_PROGS := pidfd_test pidfd_open_test
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/pidfd/pidfd.h 
b/tools/testing/selftests/pidfd/pidfd.h
new file mode 100644
index ..8452e910463f
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __PIDFD_H
+#define __PIDFD_H
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../kselftest.h"
+
+/*
+ * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
+ * That means, when it wraps around any pid < 300 will be skipped.
+ * So we need to use a pid > 300 in order to test recycling.
+ */
+#define PID_RECYCLE 1000
+
+/*
+ * Define a few custom error codes for the child process to clearly indicate
+ * what is happening. This way we can tell the difference between a system
+ * error, a test error, etc.
+ */
+#define PIDFD_PASS 0
+#define PIDFD_FAIL 1
+#define PIDFD_ERROR 2
+#define PIDFD_SKIP 3
+#define PIDFD_XFAIL 4
+
+int wait_for_pid(pid_t pid)
+{
+   int status, ret;
+
+again:
+   ret = waitpid(pid, , 0);
+   if (ret == -1) {
+   if (errno == EINTR)
+   goto again;
+
+   return -1;
+   }
+
+   if (!WIFEXITED(status))
+   return -1;
+
+   return WEXITSTATUS(status);
+}
+
+
+#endif /* __PIDFD_H */
diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c 
b/tools/testing/selftests/pidfd/pidfd_open_test.c
new file mode 100644
index ..0377133dd6dc
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_open_test.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "pidfd.h"
+#include "../kselftest.h"
+
+static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
+{
+   return syscall(__NR_pidfd_open, pid, flags);
+}
+
+static int safe_int(const char *numstr, int *converted)
+{
+   char *err = NULL;
+   long sli;
+
+   errno = 0;
+   sli = strtol(numstr, , 0);
+   if (errno == ERANGE && (sli == LONG_MAX || sli == LONG_MIN))
+   return -ERANGE;
+
+   if (errno != 0 && sli == 0)
+   return -EINVAL;
+
+   if (err == numstr || *err != '\0')
+   return -EINVAL;
+
+   if (sli > INT_MAX || sli < INT_MIN)
+   return -ERANGE;
+
+   *converted = (int)sli;
+   return 0;
+}
+
+static int char_left_gc(const char *buffer, size_t len)
+{
+   size_t i;
+
+   for (i = 0; i < len; i++) {
+   if (buffer[i] == ' ' ||
+   buffer[i] == '\t')
+   continue;
+
+   return i;
+   }
+
+   return 0;
+}
+
+static int char_right_gc(const char *buffer, size_t len)
+{
+   int i;
+
+   for (i = len - 1; i >= 0; i--) {
+   if (buffer[i] == ' '  ||
+   buffer[i] == '\t' ||
+   buffer[i] ==

Re: [PATCH] mm: add account_locked_vm utility function

2019-05-20 Thread Daniel Jordan

On Mon, May 20, 2019 at 04:19:34PM +1000, Alexey Kardashevskiy wrote:
> On 04/05/2019 06:16, Daniel Jordan wrote:
> > locked_vm accounting is done roughly the same way in five places, so
> > unify them in a helper.  Standardize the debug prints, which vary
> > slightly.
> 
> And I rather liked that prints were different and tell precisely which
> one of three each printk is.

I'm not following.  One of three...callsites?  But there were five callsites.

Anyway, I added a _RET_IP_ to the debug print so you can differentiate.

> I commented below but in general this seems working.
> 
> Tested-by: Alexey Kardashevskiy 

Thanks!  And for the review as well.

> > diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
> > b/drivers/vfio/vfio_iommu_spapr_tce.c
> > index 6b64e45a5269..d39a1b830d82 100644
> > --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> > +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> > @@ -34,49 +35,13 @@
> >  static void tce_iommu_detach_group(void *iommu_data,
> > struct iommu_group *iommu_group);
> >  
> > -static long try_increment_locked_vm(struct mm_struct *mm, long npages)
> > +static int tce_account_locked_vm(struct mm_struct *mm, unsigned long 
> > npages,
> > +bool inc)
> >  {
> > -   long ret = 0, locked, lock_limit;
> > -
> > if (WARN_ON_ONCE(!mm))
> > return -EPERM;
> 
> 
> If this WARN_ON is the only reason for having tce_account_locked_vm()
> instead of calling account_locked_vm() directly, you can then ditch the
> check as I have never ever seen this triggered.

Great, will do.

> > diff --git a/drivers/vfio/vfio_iommu_type1.c 
> > b/drivers/vfio/vfio_iommu_type1.c
> > index d0f731c9920a..15ac76171ccd 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -273,25 +273,14 @@ static int vfio_lock_acct(struct vfio_dma *dma, long 
> > npage, bool async)
> > return -ESRCH; /* process exited */
> >  
> > ret = down_write_killable(>mmap_sem);
> > -   if (!ret) {
> > -   if (npage > 0) {
> > -   if (!dma->lock_cap) {
> > -   unsigned long limit;
> > -
> > -   limit = task_rlimit(dma->task,
> > -   RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > -
> > -   if (mm->locked_vm + npage > limit)
> > -   ret = -ENOMEM;
> > -   }
> > -   }
> > +   if (ret)
> > +   goto out;
> 
> 
> A single "goto" to jump just 3 lines below seems unnecessary.

No strong preference here, I'll take out the goto.

> > +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool 
> > inc,
> > +   struct task_struct *task, bool bypass_rlim)
> > +{
> > +   unsigned long locked_vm, limit;
> > +   int ret = 0;
> > +
> > +   locked_vm = mm->locked_vm;
> > +   if (inc) {
> > +   if (!bypass_rlim) {
> > +   limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > +   if (locked_vm + pages > limit) {
> > +   ret = -ENOMEM;
> > +   goto out;
> > +   }
> > +   }
> 
> Nit:
> 
> if (!ret)
> 
> and then you don't need "goto out".

Ok, sure.

> > +   mm->locked_vm = locked_vm + pages;
> > +   } else {
> > +   WARN_ON_ONCE(pages > locked_vm);
> > +   mm->locked_vm = locked_vm - pages;
> 
> 
> Can go negative here. Not a huge deal but inaccurate imo.

I hear you, but setting a negative value to zero, as we had done previously,
doesn't make much sense to me.

Re: PROBLEM: Power9: kernel oops on memory hotunplug from ppc64le guest

2019-05-20 Thread Bharata B Rao

On Tue, May 21, 2019 at 12:55:49AM +1000, Nicholas Piggin wrote:
> Bharata B Rao's on May 21, 2019 12:29 am:
> > On Mon, May 20, 2019 at 01:50:35PM +0530, Bharata B Rao wrote:
> >> On Mon, May 20, 2019 at 05:00:21PM +1000, Nicholas Piggin wrote:
> >> > Bharata B Rao's on May 20, 2019 3:56 pm:
> >> > > On Mon, May 20, 2019 at 02:48:35PM +1000, Nicholas Piggin wrote:
> >> > >> >> > git bisect points to
> >> > >> >> >
> >> > >> >> > commit 4231aba000f5a4583dd9f67057aadb68c3eca99d
> >> > >> >> > Author: Nicholas Piggin 
> >> > >> >> > Date:   Fri Jul 27 21:48:17 2018 +1000
> >> > >> >> >
> >> > >> >> > powerpc/64s: Fix page table fragment refcount race vs 
> >> > >> >> > speculative references
> >> > >> >> >
> >> > >> >> > The page table fragment allocator uses the main page 
> >> > >> >> > refcount racily
> >> > >> >> > with respect to speculative references. A customer observed 
> >> > >> >> > a BUG due
> >> > >> >> > to page table page refcount underflow in the fragment 
> >> > >> >> > allocator. This
> >> > >> >> > can be caused by the fragment allocator set_page_count 
> >> > >> >> > stomping on a
> >> > >> >> > speculative reference, and then the speculative failure 
> >> > >> >> > handler
> >> > >> >> > decrements the new reference, and the underflow eventually 
> >> > >> >> > pops when
> >> > >> >> > the page tables are freed.
> >> > >> >> >
> >> > >> >> > Fix this by using a dedicated field in the struct page for 
> >> > >> >> > the page
> >> > >> >> > table fragment allocator.
> >> > >> >> >
> >> > >> >> > Fixes: 5c1f6ee9a31c ("powerpc: Reduce PTE table memory 
> >> > >> >> > wastage")
> >> > >> >> > Cc: sta...@vger.kernel.org # v3.10+
> >> > >> >> 
> >> > >> >> That's the commit that added the BUG_ON(), so prior to that you 
> >> > >> >> won't
> >> > >> >> see the crash.
> >> > >> > 
> >> > >> > Right, but the commit says it fixes page table page refcount 
> >> > >> > underflow by
> >> > >> > introducing a new field >pt_frag_refcount. Now we are hitting 
> >> > >> > the underflow
> >> > >> > for this pt_frag_refcount.
> >> > >> 
> >> > >> The fixed underflow is caused by a bug (race on page count) that got 
> >> > >> fixed by that patch. You are hitting a different underflow here. It's
> >> > >> not certain my patch caused it, I'm just trying to reproduce now.
> >> > > 
> >> > > Ok.
> >> > 
> >> > Can't reproduce I'm afraid, tried adding and removing 8GB memory from a
> >> > 4GB guest (via host adding / removing memory device), and it just works.
> >> 
> >> Boot, add 8G, reboot, remove 8G is the sequence to reproduce.
> >> 
> >> > 
> >> > It's likely to be an edge case like an off by one or rounding error
> >> > that just happens to trigger in your config. Might be easiest if you
> >> > could test with a debug patch.
> >> 
> >> Sure, I will continue debugging.
> > 
> > When the guest is rebooted after hotplug, the entire memory (which includes
> > the hotplugged memory) gets remapped again freshly. However at this time
> > since no slab is available yet, pt_frag_refcount never gets initialized as 
> > we
> > never do pte_fragment_alloc() for these mappings. So we right away hit the
> > underflow during the first unplug itself, it looks like.
> 
> Nice catch, good debugging work.

Thanks, with help from Aneesh.

> 
> > I will check how this can be fixed.
> 
> Tricky problem. What do you think? You might be able to make the early 
> page table allocations in the same pattern as the frag allocations, and 
> then fill in the struct page metadata when you have those.

Will explore.

> 
> Other option may be create a new set of page tables after mm comes up
> to replace the early page tables with. That's a bigger hammer though.

Will also check if similar scenario exists on x86 and if so, how and when
pte frag data is fixed there.

Regards,
Bharata.

Re: PROBLEM: Power9: kernel oops on memory hotunplug from ppc64le guest

2019-05-20 Thread Aneesh Kumar K.V


On 5/20/19 8:25 PM, Nicholas Piggin wrote:

Bharata B Rao's on May 21, 2019 12:29 am:

On Mon, May 20, 2019 at 01:50:35PM +0530, Bharata B Rao wrote:

On Mon, May 20, 2019 at 05:00:21PM +1000, Nicholas Piggin wrote:

Bharata B Rao's on May 20, 2019 3:56 pm:

On Mon, May 20, 2019 at 02:48:35PM +1000, Nicholas Piggin wrote:

git bisect points to

commit 4231aba000f5a4583dd9f67057aadb68c3eca99d
Author: Nicholas Piggin 
Date:   Fri Jul 27 21:48:17 2018 +1000

 powerpc/64s: Fix page table fragment refcount race vs speculative 
references

 The page table fragment allocator uses the main page refcount racily
 with respect to speculative references. A customer observed a BUG due
 to page table page refcount underflow in the fragment allocator. This
 can be caused by the fragment allocator set_page_count stomping on a
 speculative reference, and then the speculative failure handler
 decrements the new reference, and the underflow eventually pops when
 the page tables are freed.

 Fix this by using a dedicated field in the struct page for the page
 table fragment allocator.

 Fixes: 5c1f6ee9a31c ("powerpc: Reduce PTE table memory wastage")
 Cc: sta...@vger.kernel.org # v3.10+


That's the commit that added the BUG_ON(), so prior to that you won't
see the crash.


Right, but the commit says it fixes page table page refcount underflow by
introducing a new field >pt_frag_refcount. Now we are hitting the 
underflow
for this pt_frag_refcount.


The fixed underflow is caused by a bug (race on page count) that got
fixed by that patch. You are hitting a different underflow here. It's
not certain my patch caused it, I'm just trying to reproduce now.


Ok.


Can't reproduce I'm afraid, tried adding and removing 8GB memory from a
4GB guest (via host adding / removing memory device), and it just works.


Boot, add 8G, reboot, remove 8G is the sequence to reproduce.



It's likely to be an edge case like an off by one or rounding error
that just happens to trigger in your config. Might be easiest if you
could test with a debug patch.


Sure, I will continue debugging.


When the guest is rebooted after hotplug, the entire memory (which includes
the hotplugged memory) gets remapped again freshly. However at this time
since no slab is available yet, pt_frag_refcount never gets initialized as we
never do pte_fragment_alloc() for these mappings. So we right away hit the
underflow during the first unplug itself, it looks like.


Nice catch, good debugging work.


I will check how this can be fixed.


Tricky problem. What do you think? You might be able to make the early
page table allocations in the same pattern as the frag allocations, and
then fill in the struct page metadata when you have those.



I guess we need to do something similar to what x86 does. We need to 
walk the init_mm page table again and re-init struct page and other data 
structures backing the tables?


-aneesh

Re: [PATCH 1/3] powerpc/pseries: Simplify cpu readd to use drc_index

2019-05-20 Thread Nathan Lynch

Tyrel Datwyler  writes:

> On 05/16/2019 12:17 PM, Nathan Lynch wrote:
>> Tyrel Datwyler  writes:
>>> The current dlpar_cpu_readd() takes in a cpu_id and uses that to look up
>>> the cpus device_node so that we can get at the ibm,my-drc-index
>>> property. The only user of cpu readd is an OF notifier call back. This
>>> call back already has a reference to the device_node and therefore can
>>> retrieve the drc_index from the device_node.
>> 
>> dlpar_cpu_readd is a hack to try to change the CPU-node relationship at
>> runtime without destabilizing the system. It doesn't accomplish that and
>> it should just be removed (and I'm working on that).
>> 
>
> I will politely disagree. We've done exactly this from userspace for
> years. My experience still suggests that memory affinity is the
> problem area, and that the work to push this all into the kernel
> originally was poorly tested.

Kernel implementation details aside, how do you change the cpu-node
relationship at runtime without breaking NUMA-aware applications? Is
this not a fundamental issue to address before adding code like this?

Re: [PATCH v2 1/2] pid: add pidfd_open()

2019-05-20 Thread Arnd Bergmann

On Mon, May 20, 2019 at 4:48 PM Christian Brauner  wrote:
>
> On Mon, May 20, 2019 at 04:37:03PM +0200, Arnd Bergmann wrote:
> > On Mon, May 20, 2019 at 3:46 PM Christian Brauner  
> > wrote:
> > >
> > > In line with Arnd's recent changes to consolidate syscall numbers across
> > > architectures, I have added the pidfd_open() syscall to all architectures
> > > at the same time.
> >
> > Thanks! I've checked that the ones you have added are all
> > done correctly. However, double-checking that you got all of them,
> > I noticed that you missed mips-o32 and mips-n64. With those added:
> >
> > Acked-by: Arnd Bergmann 
>
> Perfect, will plumb mips-o32 and mips-n64 and resend once more with your
> ack added.
> Sidenote: You plan on merging the common syscall tables or will there be
> a script to do this work per-arch in the future?

David Howells also asked about this. I think having a common table will
be best in the long run, patches welcome.

As you noticed, there are still a few minor differences between the files
on mips, arm, x86, alpha and s390, and we are missing the .tbl files
for arm-compat and asm-generic, as well as an architecture independent
script.

Once that is all taken care of, we can move the entries for syscall
403 and higher into a common file, and change the script to pick
up the contents from there in addition to the architecture specific
file.

  Arnd

Re: PROBLEM: Power9: kernel oops on memory hotunplug from ppc64le guest

2019-05-20 Thread Nicholas Piggin

Bharata B Rao's on May 21, 2019 12:29 am:
> On Mon, May 20, 2019 at 01:50:35PM +0530, Bharata B Rao wrote:
>> On Mon, May 20, 2019 at 05:00:21PM +1000, Nicholas Piggin wrote:
>> > Bharata B Rao's on May 20, 2019 3:56 pm:
>> > > On Mon, May 20, 2019 at 02:48:35PM +1000, Nicholas Piggin wrote:
>> > >> >> > git bisect points to
>> > >> >> >
>> > >> >> > commit 4231aba000f5a4583dd9f67057aadb68c3eca99d
>> > >> >> > Author: Nicholas Piggin 
>> > >> >> > Date:   Fri Jul 27 21:48:17 2018 +1000
>> > >> >> >
>> > >> >> > powerpc/64s: Fix page table fragment refcount race vs 
>> > >> >> > speculative references
>> > >> >> >
>> > >> >> > The page table fragment allocator uses the main page refcount 
>> > >> >> > racily
>> > >> >> > with respect to speculative references. A customer observed a 
>> > >> >> > BUG due
>> > >> >> > to page table page refcount underflow in the fragment 
>> > >> >> > allocator. This
>> > >> >> > can be caused by the fragment allocator set_page_count 
>> > >> >> > stomping on a
>> > >> >> > speculative reference, and then the speculative failure handler
>> > >> >> > decrements the new reference, and the underflow eventually 
>> > >> >> > pops when
>> > >> >> > the page tables are freed.
>> > >> >> >
>> > >> >> > Fix this by using a dedicated field in the struct page for the 
>> > >> >> > page
>> > >> >> > table fragment allocator.
>> > >> >> >
>> > >> >> > Fixes: 5c1f6ee9a31c ("powerpc: Reduce PTE table memory 
>> > >> >> > wastage")
>> > >> >> > Cc: sta...@vger.kernel.org # v3.10+
>> > >> >> 
>> > >> >> That's the commit that added the BUG_ON(), so prior to that you won't
>> > >> >> see the crash.
>> > >> > 
>> > >> > Right, but the commit says it fixes page table page refcount 
>> > >> > underflow by
>> > >> > introducing a new field >pt_frag_refcount. Now we are hitting 
>> > >> > the underflow
>> > >> > for this pt_frag_refcount.
>> > >> 
>> > >> The fixed underflow is caused by a bug (race on page count) that got 
>> > >> fixed by that patch. You are hitting a different underflow here. It's
>> > >> not certain my patch caused it, I'm just trying to reproduce now.
>> > > 
>> > > Ok.
>> > 
>> > Can't reproduce I'm afraid, tried adding and removing 8GB memory from a
>> > 4GB guest (via host adding / removing memory device), and it just works.
>> 
>> Boot, add 8G, reboot, remove 8G is the sequence to reproduce.
>> 
>> > 
>> > It's likely to be an edge case like an off by one or rounding error
>> > that just happens to trigger in your config. Might be easiest if you
>> > could test with a debug patch.
>> 
>> Sure, I will continue debugging.
> 
> When the guest is rebooted after hotplug, the entire memory (which includes
> the hotplugged memory) gets remapped again freshly. However at this time
> since no slab is available yet, pt_frag_refcount never gets initialized as we
> never do pte_fragment_alloc() for these mappings. So we right away hit the
> underflow during the first unplug itself, it looks like.

Nice catch, good debugging work.

> I will check how this can be fixed.

Tricky problem. What do you think? You might be able to make the early 
page table allocations in the same pattern as the frag allocations, and 
then fill in the struct page metadata when you have those.

Other option may be create a new set of page tables after mm comes up
to replace the early page tables with. That's a bigger hammer though.

Thanks,
Nick

Re: [RFC PATCH 2/4] x86/ftrace: Fix use of flags in ftrace_replace_code()

2019-05-20 Thread Steven Rostedt

On Mon, 20 May 2019 20:12:48 +0530
"Naveen N. Rao"  wrote:

> Thanks, that definitely helps make things clearer. A very small nit from 
> your first patch -- it would be good to also convert the calls to 
> ftrace_check_record() to use 'true' or 'false' for the 'update' field.

Heh, I was so focused on the "enable" part, I did the "update" as a
second thought, and forgot to update the callers. Thanks for pointing
that out.

> 
> I will test my series in more detail and post a v1.

Great.

-- Steve

Re: [PATCH v2 1/2] pid: add pidfd_open()

2019-05-20 Thread Christian Brauner

On Mon, May 20, 2019 at 04:37:03PM +0200, Arnd Bergmann wrote:
> On Mon, May 20, 2019 at 3:46 PM Christian Brauner  
> wrote:
> >
> > In line with Arnd's recent changes to consolidate syscall numbers across
> > architectures, I have added the pidfd_open() syscall to all architectures
> > at the same time.
> 
> Thanks! I've checked that the ones you have added are all
> done correctly. However, double-checking that you got all of them,
> I noticed that you missed mips-o32 and mips-n64. With those added:
> 
> Acked-by: Arnd Bergmann 

Perfect, will plumb mips-o32 and mips-n64 and resend once more with your
ack added.
Sidenote: You plan on merging the common syscall tables or will there be
a script to do this work per-arch in the future?

[PATCH 10/10] docs: fix broken documentation links

2019-05-20 Thread Mauro Carvalho Chehab

Mostly due to x86 and acpi conversion, several documentation
links are still pointing to the old file. Fix them.

Signed-off-by: Mauro Carvalho Chehab 
---
 Documentation/acpi/dsd/leds.txt  |  2 +-
 Documentation/admin-guide/kernel-parameters.rst  |  6 +++---
 Documentation/admin-guide/kernel-parameters.txt  | 16 
 Documentation/admin-guide/ras.rst|  2 +-
 .../devicetree/bindings/net/fsl-enetc.txt|  7 +++
 .../bindings/pci/amlogic,meson-pcie.txt  |  2 +-
 .../bindings/regulator/qcom,rpmh-regulator.txt   |  2 +-
 Documentation/devicetree/booting-without-of.txt  |  2 +-
 Documentation/driver-api/gpio/board.rst  |  2 +-
 Documentation/driver-api/gpio/consumer.rst   |  2 +-
 .../firmware-guide/acpi/enumeration.rst  |  2 +-
 .../firmware-guide/acpi/method-tracing.rst   |  2 +-
 Documentation/i2c/instantiating-devices  |  2 +-
 Documentation/sysctl/kernel.txt  |  4 ++--
 .../translations/it_IT/process/4.Coding.rst  |  2 +-
 .../translations/it_IT/process/howto.rst |  2 +-
 .../it_IT/process/stable-kernel-rules.rst|  4 ++--
 .../translations/zh_CN/process/4.Coding.rst  |  2 +-
 Documentation/x86/x86_64/5level-paging.rst   |  2 +-
 Documentation/x86/x86_64/boot-options.rst|  4 ++--
 .../x86/x86_64/fake-numa-for-cpusets.rst |  2 +-
 MAINTAINERS  |  6 +++---
 arch/arm/Kconfig |  2 +-
 arch/arm64/kernel/kexec_image.c  |  2 +-
 arch/powerpc/Kconfig |  2 +-
 arch/x86/Kconfig | 16 
 arch/x86/Kconfig.debug   |  2 +-
 arch/x86/boot/header.S   |  2 +-
 arch/x86/entry/entry_64.S|  2 +-
 arch/x86/include/asm/bootparam_utils.h   |  2 +-
 arch/x86/include/asm/page_64_types.h |  2 +-
 arch/x86/include/asm/pgtable_64_types.h  |  2 +-
 arch/x86/kernel/cpu/microcode/amd.c  |  2 +-
 arch/x86/kernel/kexec-bzimage64.c|  2 +-
 arch/x86/kernel/pci-dma.c|  2 +-
 arch/x86/mm/tlb.c|  2 +-
 arch/x86/platform/pvh/enlighten.c|  2 +-
 drivers/acpi/Kconfig | 10 +-
 drivers/net/ethernet/faraday/ftgmac100.c |  2 +-
 .../fieldbus/Documentation/fieldbus_dev.txt  |  4 ++--
 drivers/vhost/vhost.c|  2 +-
 include/acpi/acpi_drivers.h  |  2 +-
 include/linux/fs_context.h   |  2 +-
 include/linux/lsm_hooks.h|  2 +-
 mm/Kconfig   |  2 +-
 security/Kconfig |  2 +-
 tools/include/linux/err.h|  2 +-
 tools/objtool/Documentation/stack-validation.txt |  4 ++--
 tools/testing/selftests/x86/protection_keys.c|  2 +-
 49 files changed, 78 insertions(+), 79 deletions(-)

diff --git a/Documentation/acpi/dsd/leds.txt b/Documentation/acpi/dsd/leds.txt
index 81a63af42ed2..cc58b1a574c5 100644
--- a/Documentation/acpi/dsd/leds.txt
+++ b/Documentation/acpi/dsd/leds.txt
@@ -96,4 +96,4 @@ where
 
http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf>,
 referenced 2019-02-21.
 
-[7] Documentation/acpi/dsd/data-node-reference.txt
+[7] Documentation/firmware-guide/acpi/dsd/data-node-references.rst
diff --git a/Documentation/admin-guide/kernel-parameters.rst 
b/Documentation/admin-guide/kernel-parameters.rst
index 0124980dca2d..8d3273e32eb1 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -167,7 +167,7 @@ parameter is applicable::
X86-32  X86-32, aka i386 architecture is enabled.
X86-64  X86-64 architecture is enabled.
More X86-64 boot options can be found in
-   Documentation/x86/x86_64/boot-options.txt .
+   Documentation/x86/x86_64/boot-options.rst.
X86 Either 32-bit or 64-bit x86 (same as X86-32+X86-64)
X86_UV  SGI UV support is enabled.
XEN Xen support is enabled
@@ -181,10 +181,10 @@ In addition, the following text indicates that the 
option::
 Parameters denoted with BOOT are actually interpreted by the boot
 loader, and have no meaning to the kernel directly.
 Do not modify the syntax of boot loader parameters without extreme
-need or coordination with .
+need or coordination with .
 
 There are also arch-specific kernel-parameters not documented here.
-See for example .
+See for example .
 
 Note that ALL kernel parameters listed below are CASE SENSITIVE, and that
 a trailing = on the name of any parameter states that that parameter will
diff --git a/Documentation/admin-guide/kernel-parameters.txt

Re: [RFC PATCH 2/4] x86/ftrace: Fix use of flags in ftrace_replace_code()

2019-05-20 Thread Naveen N. Rao

Hi Steven,

Steven Rostedt wrote:

On Mon, 20 May 2019 09:13:20 -0400
Steven Rostedt  wrote:

> I haven't yet tested this patch on x86, but this looked wrong so sending 
> this as a RFC.  

This code has been through a bit of updates, and I need to go through
and clean it up. I'll have to take a look and convert "int" to "bool"
so that "enable" is not confusing.

Thanks, I think I'll try to do a clean up first, and then this patch
shouldn't "look wrong" after that.

I'm going to apply the attached two patches. There may be some
conflicts between yours here and these, but nothing that Linus can't
figure out. Do you feel more comfortable with this code, if these
patches are applied?

Thanks, that definitely helps make things clearer. A very small nit from 
your first patch -- it would be good to also convert the calls to 
ftrace_check_record() to use 'true' or 'false' for the 'update' field.

I will test my series in more detail and post a v1.

- Naveen

Re: [PATCH v2 1/2] pid: add pidfd_open()

2019-05-20 Thread Arnd Bergmann

On Mon, May 20, 2019 at 3:46 PM Christian Brauner  wrote:
>
> In line with Arnd's recent changes to consolidate syscall numbers across
> architectures, I have added the pidfd_open() syscall to all architectures
> at the same time.

Thanks! I've checked that the ones you have added are all
done correctly. However, double-checking that you got all of them,
I noticed that you missed mips-o32 and mips-n64. With those added:

Acked-by: Arnd Bergmann

Re: PROBLEM: Power9: kernel oops on memory hotunplug from ppc64le guest

2019-05-20 Thread Bharata B Rao

On Mon, May 20, 2019 at 01:50:35PM +0530, Bharata B Rao wrote:
> On Mon, May 20, 2019 at 05:00:21PM +1000, Nicholas Piggin wrote:
> > Bharata B Rao's on May 20, 2019 3:56 pm:
> > > On Mon, May 20, 2019 at 02:48:35PM +1000, Nicholas Piggin wrote:
> > >> >> > git bisect points to
> > >> >> >
> > >> >> > commit 4231aba000f5a4583dd9f67057aadb68c3eca99d
> > >> >> > Author: Nicholas Piggin 
> > >> >> > Date:   Fri Jul 27 21:48:17 2018 +1000
> > >> >> >
> > >> >> > powerpc/64s: Fix page table fragment refcount race vs 
> > >> >> > speculative references
> > >> >> >
> > >> >> > The page table fragment allocator uses the main page refcount 
> > >> >> > racily
> > >> >> > with respect to speculative references. A customer observed a 
> > >> >> > BUG due
> > >> >> > to page table page refcount underflow in the fragment 
> > >> >> > allocator. This
> > >> >> > can be caused by the fragment allocator set_page_count stomping 
> > >> >> > on a
> > >> >> > speculative reference, and then the speculative failure handler
> > >> >> > decrements the new reference, and the underflow eventually pops 
> > >> >> > when
> > >> >> > the page tables are freed.
> > >> >> >
> > >> >> > Fix this by using a dedicated field in the struct page for the 
> > >> >> > page
> > >> >> > table fragment allocator.
> > >> >> >
> > >> >> > Fixes: 5c1f6ee9a31c ("powerpc: Reduce PTE table memory wastage")
> > >> >> > Cc: sta...@vger.kernel.org # v3.10+
> > >> >> 
> > >> >> That's the commit that added the BUG_ON(), so prior to that you won't
> > >> >> see the crash.
> > >> > 
> > >> > Right, but the commit says it fixes page table page refcount underflow 
> > >> > by
> > >> > introducing a new field >pt_frag_refcount. Now we are hitting 
> > >> > the underflow
> > >> > for this pt_frag_refcount.
> > >> 
> > >> The fixed underflow is caused by a bug (race on page count) that got 
> > >> fixed by that patch. You are hitting a different underflow here. It's
> > >> not certain my patch caused it, I'm just trying to reproduce now.
> > > 
> > > Ok.
> > 
> > Can't reproduce I'm afraid, tried adding and removing 8GB memory from a
> > 4GB guest (via host adding / removing memory device), and it just works.
> 
> Boot, add 8G, reboot, remove 8G is the sequence to reproduce.
> 
> > 
> > It's likely to be an edge case like an off by one or rounding error
> > that just happens to trigger in your config. Might be easiest if you
> > could test with a debug patch.
> 
> Sure, I will continue debugging.

When the guest is rebooted after hotplug, the entire memory (which includes
the hotplugged memory) gets remapped again freshly. However at this time
since no slab is available yet, pt_frag_refcount never gets initialized as we
never do pte_fragment_alloc() for these mappings. So we right away hit the
underflow during the first unplug itself, it looks like.

I will check how this can be fixed.

> 
> Regards,
> Bharata.

[PATCH] misc: remove redundant 'default n' from Kconfig-s

2019-05-20 Thread Bartlomiej Zolnierkiewicz

'default n' is the default value for any bool or tristate Kconfig
setting so there is no need to write it explicitly.

Also since commit f467c5640c29 ("kconfig: only write '# CONFIG_FOO
is not set' for visible symbols") the Kconfig behavior is the same
regardless of 'default n' being present or not:

...
One side effect of (and the main motivation for) this change is making
the following two definitions behave exactly the same:

config FOO
bool

config FOO
bool
default n

With this change, neither of these will generate a
'# CONFIG_FOO is not set' line (assuming FOO isn't selected/implied).
That might make it clearer to people that a bare 'default n' is
redundant.
...

Signed-off-by: Bartlomiej Zolnierkiewicz 
---
 drivers/misc/Kconfig  |   10 --
 drivers/misc/altera-stapl/Kconfig |1 -
 drivers/misc/c2port/Kconfig   |2 --
 drivers/misc/cb710/Kconfig|1 -
 drivers/misc/cxl/Kconfig  |3 ---
 drivers/misc/echo/Kconfig |1 -
 drivers/misc/genwqe/Kconfig   |1 -
 drivers/misc/lis3lv02d/Kconfig|2 --
 drivers/misc/ocxl/Kconfig |1 -
 9 files changed, 22 deletions(-)

Index: b/drivers/misc/Kconfig
===
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -8,7 +8,6 @@ config SENSORS_LIS3LV02D
tristate
depends on INPUT
select INPUT_POLLDEV
-   default n
 
 config AD525X_DPOT
tristate "Analog Devices Digital Potentiometers"
@@ -61,7 +60,6 @@ config ATMEL_TCLIB
 
 config DUMMY_IRQ
tristate "Dummy IRQ handler"
-   default n
---help---
  This module accepts a single 'irq' parameter, which it should 
register for.
  The sole purpose of this module is to help with debugging of systems 
on
@@ -117,7 +115,6 @@ config PHANTOM
 config INTEL_MID_PTI
tristate "Parallel Trace Interface for MIPI P1149.7 cJTAG standard"
depends on PCI && TTY && (X86_INTEL_MID || COMPILE_TEST)
-   default n
help
  The PTI (Parallel Trace Interface) driver directs
  trace data routed from various parts in the system out
@@ -193,7 +190,6 @@ config ATMEL_SSC
 
 config ENCLOSURE_SERVICES
tristate "Enclosure Services"
-   default n
help
  Provides support for intelligent enclosures (bays which
  contain storage devices).  You also need either a host
@@ -217,7 +213,6 @@ config SGI_XP
 config CS5535_MFGPT
tristate "CS5535/CS5536 Geode Multi-Function General Purpose Timer 
(MFGPT) support"
depends on MFD_CS5535
-   default n
help
  This driver provides access to MFGPT functionality for other
  drivers that need timers.  MFGPTs are available in the CS5535 and
@@ -250,7 +245,6 @@ config CS5535_CLOCK_EVENT_SRC
 config HP_ILO
tristate "Channel interface driver for the HP iLO processor"
depends on PCI
-   default n
help
  The channel interface driver allows applications to communicate
  with iLO management processors present on HP ProLiant servers.
@@ -285,7 +279,6 @@ config QCOM_FASTRPC
 config SGI_GRU
tristate "SGI GRU driver"
depends on X86_UV && SMP
-   default n
select MMU_NOTIFIER
---help---
The GRU is a hardware resource located in the system chipset. The GRU
@@ -300,7 +293,6 @@ config SGI_GRU
 config SGI_GRU_DEBUG
bool  "SGI GRU driver debug"
depends on SGI_GRU
-   default n
---help---
This option enables additional debugging code for the SGI GRU driver.
If you are unsure, say N.
@@ -358,7 +350,6 @@ config SENSORS_BH1770
 config SENSORS_APDS990X
 tristate "APDS990X combined als and proximity sensors"
 depends on I2C
-default n
 ---help---
   Say Y here if you want to build a driver for Avago APDS990x
   combined ambient light and proximity sensor chip.
@@ -386,7 +377,6 @@ config DS1682
 config SPEAR13XX_PCIE_GADGET
bool "PCIe gadget support for SPEAr13XX platform"
depends on ARCH_SPEAR13XX && BROKEN
-   default n
help
 This option enables gadget support for PCIe controller. If
 board file defines any controller as PCIe endpoint then a sysfs
Index: b/drivers/misc/altera-stapl/Kconfig
===
--- a/drivers/misc/altera-stapl/Kconfig
+++ b/drivers/misc/altera-stapl/Kconfig
@@ -4,6 +4,5 @@ comment "Altera FPGA firmware download m
 config ALTERA_STAPL
tristate "Altera FPGA firmware download module"
depends on I2C
-   default n
help
  An Altera FPGA module. Say Y when you want to support this tool.
Index: b/drivers/misc/c2port/Kconfig

Re: [PATCH v2 1/2] pid: add pidfd_open()

2019-05-20 Thread Geert Uytterhoeven

On Mon, May 20, 2019 at 3:46 PM Christian Brauner  wrote:
> This adds the pidfd_open() syscall. It allows a caller to retrieve pollable
> pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a
> process that is created via traditional fork()/clone() calls that is only
> referenced by a PID:
>
> int pidfd = pidfd_open(1234, 0);
> ret = pidfd_send_signal(pidfd, SIGSTOP, NULL, 0);
>
> With the introduction of pidfds through CLONE_PIDFD it is possible to
> created pidfds at process creation time.
> However, a lot of processes get created with traditional PID-based calls
> such as fork() or clone() (without CLONE_PIDFD). For these processes a
> caller can currently not create a pollable pidfd. This is a problem for
> Android's low memory killer (LMK) and service managers such as systemd.
> Both are examples of tools that want to make use of pidfds to get reliable
> notification of process exit for non-parents (pidfd polling) and race-free
> signal sending (pidfd_send_signal()). They intend to switch to this API for
> process supervision/management as soon as possible. Having no way to get
> pollable pidfds from PID-only processes is one of the biggest blockers for
> them in adopting this api. With pidfd_open() making it possible to retrieve
> pidfds for PID-based processes we enable them to adopt this api.
>
> In line with Arnd's recent changes to consolidate syscall numbers across
> architectures, I have added the pidfd_open() syscall to all architectures
> at the same time.
>
> Signed-off-by: Christian Brauner 
> Reviewed-by: Oleg Nesterov 

>  arch/m68k/kernel/syscalls/syscall.tbl   |  1 +

Acked-by: Geert Uytterhoeven 

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds

[PATCH v7 1/1] iommu: enhance IOMMU dma mode build options

2019-05-20 Thread Zhen Lei

First, add build option IOMMU_DEFAULT_{LAZY|STRICT}, so that we have the
opportunity to set {lazy|strict} mode as default at build time. Then put
the three config options in an choice, make people can only choose one of
the three at a time.

The default IOMMU dma modes on each ARCHs have no change.

Signed-off-by: Zhen Lei 
---
 arch/ia64/kernel/pci-dma.c|  2 +-
 arch/powerpc/platforms/powernv/pci-ioda.c |  3 ++-
 arch/s390/pci/pci_dma.c   |  2 +-
 arch/x86/kernel/pci-dma.c |  7 ++---
 drivers/iommu/Kconfig | 44 ++-
 drivers/iommu/amd_iommu_init.c|  3 ++-
 drivers/iommu/intel-iommu.c   |  2 +-
 drivers/iommu/iommu.c |  3 ++-
 8 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c
index fe988c49f01ce6a..655511dbf3c3b34 100644
--- a/arch/ia64/kernel/pci-dma.c
+++ b/arch/ia64/kernel/pci-dma.c
@@ -22,7 +22,7 @@
 int force_iommu __read_mostly;
 #endif

-int iommu_pass_through;
+int iommu_pass_through = IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH);

 static int __init pci_iommu_init(void)
 {
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3ead4c237ed0ec9..383e082a9bb985c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -85,7 +85,8 @@ void pe_level_printk(const struct pnv_ioda_pe *pe, const char 
*level,
va_end(args);
 }

-static bool pnv_iommu_bypass_disabled __read_mostly;
+static bool pnv_iommu_bypass_disabled __read_mostly =
+   !IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH);
 static bool pci_reset_phbs __read_mostly;

 static int __init iommu_setup(char *str)
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 9e52d1527f71495..784ad1e0acecfb1 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -17,7 +17,7 @@

 static struct kmem_cache *dma_region_table_cache;
 static struct kmem_cache *dma_page_table_cache;
-static int s390_iommu_strict;
+static int s390_iommu_strict = IS_ENABLED(CONFIG_IOMMU_DEFAULT_STRICT);

 static int zpci_refresh_global(struct zpci_dev *zdev)
 {
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d460998ae828514..fb2bab42a0a3173 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -43,11 +43,8 @@
  * It is also possible to disable by default in kernel config, and enable with
  * iommu=nopt at boot time.
  */
-#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH
-int iommu_pass_through __read_mostly = 1;
-#else
-int iommu_pass_through __read_mostly;
-#endif
+int iommu_pass_through __read_mostly =
+   IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH);

 extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 6f07f3b21816c64..8a1f1793cde76b4 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -74,17 +74,47 @@ config IOMMU_DEBUGFS
  debug/iommu directory, and then populate a subdirectory with
  entries as required.

-config IOMMU_DEFAULT_PASSTHROUGH
-   bool "IOMMU passthrough by default"
+choice
+   prompt "IOMMU default DMA mode"
depends on IOMMU_API
-help
- Enable passthrough by default, removing the need to pass in
- iommu.passthrough=on or iommu=pt through command line. If this
- is enabled, you can still disable with iommu.passthrough=off
- or iommu=nopt depending on the architecture.
+   default IOMMU_DEFAULT_PASSTHROUGH if (PPC_POWERNV && PCI)
+   default IOMMU_DEFAULT_LAZY if (AMD_IOMMU || INTEL_IOMMU || S390_IOMMU)
+   default IOMMU_DEFAULT_STRICT
+   help
+ This option allows IOMMU DMA mode to be chose at build time, to
+ override the default DMA mode of each ARCHs, removing the need to
+ pass in kernel parameters through command line. You can still use
+ ARCHs specific boot options to override this option again.
+
+config IOMMU_DEFAULT_PASSTHROUGH
+   bool "passthrough"
+   help
+ In this mode, the DMA access through IOMMU without any addresses
+ translation. That means, the wrong or illegal DMA access can not
+ be caught, no error information will be reported.

  If unsure, say N here.

+config IOMMU_DEFAULT_LAZY
+   bool "lazy"
+   help
+ Support lazy mode, where for every IOMMU DMA unmap operation, the
+ flush operation of IOTLB and the free operation of IOVA are deferred.
+ They are only guaranteed to be done before the related IOVA will be
+ reused.
+
+config IOMMU_DEFAULT_STRICT
+   bool "strict"
+   help
+ For every IOMMU DMA unmap operation, the flush operation of IOTLB and
+ the free operation of IOVA are guaranteed to be done in the

[PATCH v7 0/1] iommu: enhance IOMMU dma mode build options

2019-05-20 Thread Zhen Lei

v6 --> v7:
1. Fix some text editing errors

v5 --> v6:
1. give up adding boot option iommu.dma_mode

v4 --> v5:
As Hanjun and Thomas Gleixner's suggestion:
1. Keep the old ARCH specific boot options no change.
2. Keep build option CONFIG_IOMMU_DEFAULT_PASSTHROUGH no change.

v4:
As Robin Murphy's suggestion:
"It's also not necessarily obvious to the user how this interacts with
IOMMU_DEFAULT_PASSTHROUGH, so if we really do go down this route, maybe it
would be better to refactor the whole lot into a single selection of something
like IOMMU_DEFAULT_MODE anyway."

In this version, I tried to normalize the IOMMU dma mode boot options for all
ARCHs. When IOMMU is enabled, there are 3 dma modes: paasthrough(bypass),
lazy(mapping but defer the IOTLB invalidation), strict. But currently each
ARCHs defined their private boot options, different with each other. For
example, to enable/disable "passthrough", ARM64 use iommu.passthrough=1/0,
X86 use iommu=pt/nopt, PPC/POWERNV use iommu=nobypass.

Zhen Lei (1):
  iommu: enhance IOMMU dma mode build options

 arch/ia64/kernel/pci-dma.c|  2 +-
 arch/powerpc/platforms/powernv/pci-ioda.c |  3 ++-
 arch/s390/pci/pci_dma.c   |  2 +-
 arch/x86/kernel/pci-dma.c |  7 ++---
 drivers/iommu/Kconfig | 44 ++-
 drivers/iommu/amd_iommu_init.c|  3 ++-
 drivers/iommu/intel-iommu.c   |  2 +-
 drivers/iommu/iommu.c |  3 ++-
 8 files changed, 48 insertions(+), 18 deletions(-)

-- 
1.8.3

[PATCH v2 2/2] tests: add pidfd_open() tests

2019-05-20 Thread Christian Brauner

This adds testing for the new pidfd_open() syscalls. Specifically, we test:
- that no invalid flags can be passed to pidfd_open()
- that no invalid pid can be passed to pidfd_open()
- that a pidfd can be retrieved with pidfd_open()
- that the retrieved pidfd references the correct pid

Signed-off-by: Christian Brauner 
Cc: Arnd Bergmann 
Cc: "Eric W. Biederman" 
Cc: Kees Cook 
Cc: Joel Fernandes (Google) 
Cc: Thomas Gleixner 
Cc: Jann Horn 
Cc: David Howells 
Cc: "Michael Kerrisk (man-pages)" 
Cc: Andy Lutomirsky 
Cc: Andrew Morton 
Cc: Oleg Nesterov 
Cc: Aleksa Sarai 
Cc: Linus Torvalds 
Cc: Al Viro 
Cc: linux-...@vger.kernel.org
---
v1: unchanged
v2:
- Christian Brauner :
  - set number of planned tests via ksft_set_plan()
---
 tools/testing/selftests/pidfd/Makefile|   2 +-
 tools/testing/selftests/pidfd/pidfd.h |  57 ++
 .../testing/selftests/pidfd/pidfd_open_test.c | 169 ++
 tools/testing/selftests/pidfd/pidfd_test.c|  41 +
 4 files changed, 228 insertions(+), 41 deletions(-)
 create mode 100644 tools/testing/selftests/pidfd/pidfd.h
 create mode 100644 tools/testing/selftests/pidfd/pidfd_open_test.c

diff --git a/tools/testing/selftests/pidfd/Makefile 
b/tools/testing/selftests/pidfd/Makefile
index deaf8073bc06..b36c0be70848 100644
--- a/tools/testing/selftests/pidfd/Makefile
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -1,6 +1,6 @@
 CFLAGS += -g -I../../../../usr/include/
 
-TEST_GEN_PROGS := pidfd_test
+TEST_GEN_PROGS := pidfd_test pidfd_open_test
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/pidfd/pidfd.h 
b/tools/testing/selftests/pidfd/pidfd.h
new file mode 100644
index ..8452e910463f
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __PIDFD_H
+#define __PIDFD_H
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../kselftest.h"
+
+/*
+ * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
+ * That means, when it wraps around any pid < 300 will be skipped.
+ * So we need to use a pid > 300 in order to test recycling.
+ */
+#define PID_RECYCLE 1000
+
+/*
+ * Define a few custom error codes for the child process to clearly indicate
+ * what is happening. This way we can tell the difference between a system
+ * error, a test error, etc.
+ */
+#define PIDFD_PASS 0
+#define PIDFD_FAIL 1
+#define PIDFD_ERROR 2
+#define PIDFD_SKIP 3
+#define PIDFD_XFAIL 4
+
+int wait_for_pid(pid_t pid)
+{
+   int status, ret;
+
+again:
+   ret = waitpid(pid, , 0);
+   if (ret == -1) {
+   if (errno == EINTR)
+   goto again;
+
+   return -1;
+   }
+
+   if (!WIFEXITED(status))
+   return -1;
+
+   return WEXITSTATUS(status);
+}
+
+
+#endif /* __PIDFD_H */
diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c 
b/tools/testing/selftests/pidfd/pidfd_open_test.c
new file mode 100644
index ..0377133dd6dc
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_open_test.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "pidfd.h"
+#include "../kselftest.h"
+
+static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
+{
+   return syscall(__NR_pidfd_open, pid, flags);
+}
+
+static int safe_int(const char *numstr, int *converted)
+{
+   char *err = NULL;
+   long sli;
+
+   errno = 0;
+   sli = strtol(numstr, , 0);
+   if (errno == ERANGE && (sli == LONG_MAX || sli == LONG_MIN))
+   return -ERANGE;
+
+   if (errno != 0 && sli == 0)
+   return -EINVAL;
+
+   if (err == numstr || *err != '\0')
+   return -EINVAL;
+
+   if (sli > INT_MAX || sli < INT_MIN)
+   return -ERANGE;
+
+   *converted = (int)sli;
+   return 0;
+}
+
+static int char_left_gc(const char *buffer, size_t len)
+{
+   size_t i;
+
+   for (i = 0; i < len; i++) {
+   if (buffer[i] == ' ' ||
+   buffer[i] == '\t')
+   continue;
+
+   return i;
+   }
+
+   return 0;
+}
+
+static int char_right_gc(const char *buffer, size_t len)
+{
+   int i;
+
+   for (i = len - 1; i >= 0; i--) {
+   if (buffer[i] == ' '  ||
+   buffer[i] == '\t' ||
+   buffer[i] == '\n' ||
+   buffer[i] == '\0')
+   continue;
+
+   return i + 1;
+   }
+
+   return 0;
+}
+
+static char *trim_whitespace_in_place(char *buffer)
+{
+   buffer += char_left_gc(buffer, strlen(buffer));
+   buffer[char_right_gc(buffer, strlen(buffer))] = '\0';
+   return buffer;

[PATCH v2 1/2] pid: add pidfd_open()

2019-05-20 Thread Christian Brauner

This adds the pidfd_open() syscall. It allows a caller to retrieve pollable
pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a
process that is created via traditional fork()/clone() calls that is only
referenced by a PID:

int pidfd = pidfd_open(1234, 0);
ret = pidfd_send_signal(pidfd, SIGSTOP, NULL, 0);

With the introduction of pidfds through CLONE_PIDFD it is possible to
created pidfds at process creation time.
However, a lot of processes get created with traditional PID-based calls
such as fork() or clone() (without CLONE_PIDFD). For these processes a
caller can currently not create a pollable pidfd. This is a problem for
Android's low memory killer (LMK) and service managers such as systemd.
Both are examples of tools that want to make use of pidfds to get reliable
notification of process exit for non-parents (pidfd polling) and race-free
signal sending (pidfd_send_signal()). They intend to switch to this API for
process supervision/management as soon as possible. Having no way to get
pollable pidfds from PID-only processes is one of the biggest blockers for
them in adopting this api. With pidfd_open() making it possible to retrieve
pidfds for PID-based processes we enable them to adopt this api.

In line with Arnd's recent changes to consolidate syscall numbers across
architectures, I have added the pidfd_open() syscall to all architectures
at the same time.

Signed-off-by: Christian Brauner 
Reviewed-by: Oleg Nesterov 
Cc: Arnd Bergmann 
Cc: "Eric W. Biederman" 
Cc: Kees Cook 
Cc: Joel Fernandes (Google) 
Cc: Thomas Gleixner 
Cc: Jann Horn 
Cc: David Howells 
Cc: Andy Lutomirsky 
Cc: Andrew Morton 
Cc: Aleksa Sarai 
Cc: Linus Torvalds 
Cc: Al Viro 
Cc: linux-...@vger.kernel.org
---
v1:
- kbuild test robot :
  - add missing entry for pidfd_open to arch/arm/tools/syscall.tbl
- Oleg Nesterov :
  - use simpler thread-group leader check
v2:
- Oleg Nesterov :
  - avoid using additional variable
  - remove unneeded comment
- Arnd Bergmann :
  - switch from 428 to 434 since the new mount api has taken it
  - bump syscall numbers in arch/arm64/include/asm/unistd.h
- Joel Fernandes (Google) :
  - switch from ESRCH to EINVAL when the passed-in pid does not refer to a
thread-group leader
- Christian Brauner :
  - rebase on v5.2-rc1
  - adapt syscall number to account for new mount api syscalls
---
 arch/alpha/kernel/syscalls/syscall.tbl  |  1 +
 arch/arm/tools/syscall.tbl  |  1 +
 arch/arm64/include/asm/unistd.h |  2 +-
 arch/arm64/include/asm/unistd32.h   |  2 +
 arch/ia64/kernel/syscalls/syscall.tbl   |  1 +
 arch/m68k/kernel/syscalls/syscall.tbl   |  1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |  1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |  1 +
 arch/parisc/kernel/syscalls/syscall.tbl |  1 +
 arch/powerpc/kernel/syscalls/syscall.tbl|  1 +
 arch/s390/kernel/syscalls/syscall.tbl   |  1 +
 arch/sh/kernel/syscalls/syscall.tbl |  1 +
 arch/sparc/kernel/syscalls/syscall.tbl  |  1 +
 arch/x86/entry/syscalls/syscall_32.tbl  |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl  |  1 +
 arch/xtensa/kernel/syscalls/syscall.tbl |  1 +
 include/linux/pid.h |  1 +
 include/linux/syscalls.h|  1 +
 include/uapi/asm-generic/unistd.h   |  4 +-
 kernel/fork.c   |  2 +-
 kernel/pid.c| 43 +
 21 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl 
b/arch/alpha/kernel/syscalls/syscall.tbl
index 9e7704e44f6d..1db9bbcfb84e 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -473,3 +473,4 @@
 541common  fsconfigsys_fsconfig
 542common  fsmount sys_fsmount
 543common  fspick  sys_fspick
+544common  pidfd_open  sys_pidfd_open
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index aaf479a9e92d..81e6e1817c45 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -447,3 +447,4 @@
 431common  fsconfigsys_fsconfig
 432common  fsmount sys_fsmount
 433common  fspick  sys_fspick
+434common  pidfd_open  sys_pidfd_open
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 70e6882853c0..e8f7d95a1481 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -44,7 +44,7 @@
 #define __ARM_NR_compat_set_tls(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls   434
+#define __NR_compat_syscalls   435
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h

Re: [RFC PATCH 2/4] x86/ftrace: Fix use of flags in ftrace_replace_code()

2019-05-20 Thread Steven Rostedt

On Mon, 20 May 2019 09:13:20 -0400
Steven Rostedt  wrote:


> > I haven't yet tested this patch on x86, but this looked wrong so sending 
> > this as a RFC.  
> 
> This code has been through a bit of updates, and I need to go through
> and clean it up. I'll have to take a look and convert "int" to "bool"
> so that "enable" is not confusing.
> 
> Thanks, I think I'll try to do a clean up first, and then this patch
> shouldn't "look wrong" after that.
> 

I'm going to apply the attached two patches. There may be some
conflicts between yours here and these, but nothing that Linus can't
figure out. Do you feel more comfortable with this code, if these
patches are applied?

-- Steve
>From d6b694e350e61259ad18fe2b912911b52ff4767a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" 
Date: Mon, 20 May 2019 09:26:24 -0400
Subject: [PATCH 1/2] ftrace: Make enable and update parameters bool when
 applicable

The code modification functions have "enable" and "update" variables that
are sometimes "int" but used as "bool". Remove the ambiguity and make them
"bool" when they are only used for true or false values.

Link: http://lkml.kernel.org/r/e1429923d9eda92a3cf5ee9e33c7eacce539781d.1558115654.git.naveen.n@linux.vnet.ibm.com

Reported-by: "Naveen N. Rao" 
Signed-off-by: Steven Rostedt (VMware) 
---
 include/linux/ftrace.h |  4 ++--
 kernel/trace/ftrace.c  | 16 
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 25e2995d4a4c..8a8cb3c401b2 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -427,8 +427,8 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter);
 	 iter = ftrace_rec_iter_next(iter))
 
 
-int ftrace_update_record(struct dyn_ftrace *rec, int enable);
-int ftrace_test_record(struct dyn_ftrace *rec, int enable);
+int ftrace_update_record(struct dyn_ftrace *rec, bool enable);
+int ftrace_test_record(struct dyn_ftrace *rec, bool enable);
 void ftrace_run_stop_machine(int command);
 unsigned long ftrace_location(unsigned long ip);
 unsigned long ftrace_location_range(unsigned long start, unsigned long end);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a12aff849c04..47b41502a24c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1768,7 +1768,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
 		count++;
 
 		/* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */
-		update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE;
+		update |= ftrace_test_record(rec, true) != FTRACE_UPDATE_IGNORE;
 
 		/* Shortcut, if we handled all records, we are done. */
 		if (!all && count == hash->count)
@@ -2047,7 +2047,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
 	}
 }
 
-static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
+static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
 {
 	unsigned long flag = 0UL;
 
@@ -2146,12 +2146,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
 /**
  * ftrace_update_record, set a record that now is tracing or not
  * @rec: the record to update
- * @enable: set to 1 if the record is tracing, zero to force disable
+ * @enable: set to true if the record is tracing, false to force disable
  *
  * The records that represent all functions that can be traced need
  * to be updated when tracing has been enabled.
  */
-int ftrace_update_record(struct dyn_ftrace *rec, int enable)
+int ftrace_update_record(struct dyn_ftrace *rec, bool enable)
 {
 	return ftrace_check_record(rec, enable, 1);
 }
@@ -2159,13 +2159,13 @@ int ftrace_update_record(struct dyn_ftrace *rec, int enable)
 /**
  * ftrace_test_record, check if the record has been enabled or not
  * @rec: the record to test
- * @enable: set to 1 to check if enabled, 0 if it is disabled
+ * @enable: set to true to check if enabled, false if it is disabled
  *
  * The arch code may need to test if a record is already set to
  * tracing to determine how to modify the function code that it
  * represents.
  */
-int ftrace_test_record(struct dyn_ftrace *rec, int enable)
+int ftrace_test_record(struct dyn_ftrace *rec, bool enable)
 {
 	return ftrace_check_record(rec, enable, 0);
 }
@@ -2356,7 +2356,7 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
 }
 
 static int
-__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+__ftrace_replace_code(struct dyn_ftrace *rec, bool enable)
 {
 	unsigned long ftrace_old_addr;
 	unsigned long ftrace_addr;
@@ -2395,7 +2395,7 @@ void __weak ftrace_replace_code(int mod_flags)
 {
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
-	int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL;
+	bool enable = mod_flags & FTRACE_MODIFY_ENABLE_FL;
 	int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL;
 	int failed;
 
-- 
2.20.1

>From a15f59dc19e1fdf7eda74e7aec386975740d1515 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt

Re: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted

2019-05-20 Thread Michael S. Tsirkin

On Wed, Apr 17, 2019 at 06:42:00PM -0300, Thiago Jung Bauermann wrote:
> I rephrased it in terms of address translation. What do you think of
> this version? The flag name is slightly different too:
> 
> 
> VIRTIO_F_ACCESS_PLATFORM_NO_TRANSLATION This feature has the same
> meaning as VIRTIO_F_ACCESS_PLATFORM both when set and when not set,
> with the exception that address translation is guaranteed to be
> unnecessary when accessing memory addresses supplied to the device
> by the driver. Which is to say, the device will always use physical
> addresses matching addresses used by the driver (typically meaning
> physical addresses used by the CPU) and not translated further. This
> flag should be set by the guest if offered, but to allow for
> backward-compatibility device implementations allow for it to be
> left unset by the guest. It is an error to set both this flag and
> VIRTIO_F_ACCESS_PLATFORM.


OK so VIRTIO_F_ACCESS_PLATFORM is designed to allow unpriveledged
drivers. This is why devices fail when it's not negotiated.

This confuses me.
If driver is unpriveledged then what happens with this flag?
It can supply any address it wants. Will that corrupt kernel
memory?

-- 
MST

Re: [RFC PATCH 2/4] x86/ftrace: Fix use of flags in ftrace_replace_code()

2019-05-20 Thread Steven Rostedt

On Sat, 18 May 2019 00:32:46 +0530
"Naveen N. Rao"  wrote:

> In commit a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be
> schedulable), the generic ftrace_replace_code() function was modified to
> accept a flags argument in place of a single 'enable' flag. However, the
> x86 version of this function was not updated. Fix the same.
> 
> Fixes: a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable")
> Signed-off-by: Naveen N. Rao 
> ---
> I haven't yet tested this patch on x86, but this looked wrong so sending 
> this as a RFC.

This code has been through a bit of updates, and I need to go through
and clean it up. I'll have to take a look and convert "int" to "bool"
so that "enable" is not confusing.

Thanks, I think I'll try to do a clean up first, and then this patch
shouldn't "look wrong" after that.

-- Steve

> 
> - Naveen
> 
> 
>  arch/x86/kernel/ftrace.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
> index 0caf8122d680..0c01b344ba16 100644
> --- a/arch/x86/kernel/ftrace.c
> +++ b/arch/x86/kernel/ftrace.c
> @@ -554,8 +554,9 @@ static void run_sync(void)
>   local_irq_disable();
>  }
>  
> -void ftrace_replace_code(int enable)
> +void ftrace_replace_code(int mod_flags)
>  {
> + int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL;
>   struct ftrace_rec_iter *iter;
>   struct dyn_ftrace *rec;
>   const char *report = "adding breakpoints";

Re: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted

2019-05-20 Thread Michael S. Tsirkin

On Fri, Apr 26, 2019 at 08:56:43PM -0300, Thiago Jung Bauermann wrote:
> 
> Michael S. Tsirkin  writes:
> 
> > On Wed, Apr 24, 2019 at 10:01:56PM -0300, Thiago Jung Bauermann wrote:
> >>
> >> Michael S. Tsirkin  writes:
> >>
> >> > On Wed, Apr 17, 2019 at 06:42:00PM -0300, Thiago Jung Bauermann wrote:
> >> >>
> >> >> Michael S. Tsirkin  writes:
> >> >>
> >> >> > On Thu, Mar 21, 2019 at 09:05:04PM -0300, Thiago Jung Bauermann wrote:
> >> >> >>
> >> >> >> Michael S. Tsirkin  writes:
> >> >> >>
> >> >> >> > On Wed, Mar 20, 2019 at 01:13:41PM -0300, Thiago Jung Bauermann 
> >> >> >> > wrote:
> >> >> >> >> >From what I understand of the ACCESS_PLATFORM definition, the 
> >> >> >> >> >host will
> >> >> >> >> only ever try to access memory addresses that are supplied to it 
> >> >> >> >> by the
> >> >> >> >> guest, so all of the secure guest memory that the host cares 
> >> >> >> >> about is
> >> >> >> >> accessible:
> >> >> >> >>
> >> >> >> >> If this feature bit is set to 0, then the device has same 
> >> >> >> >> access to
> >> >> >> >> memory addresses supplied to it as the driver has. In 
> >> >> >> >> particular,
> >> >> >> >> the device will always use physical addresses matching 
> >> >> >> >> addresses
> >> >> >> >> used by the driver (typically meaning physical addresses used 
> >> >> >> >> by the
> >> >> >> >> CPU) and not translated further, and can access any address 
> >> >> >> >> supplied
> >> >> >> >> to it by the driver. When clear, this overrides any
> >> >> >> >> platform-specific description of whether device access is 
> >> >> >> >> limited or
> >> >> >> >> translated in any way, e.g. whether an IOMMU may be present.
> >> >> >> >>
> >> >> >> >> All of the above is true for POWER guests, whether they are secure
> >> >> >> >> guests or not.
> >> >> >> >>
> >> >> >> >> Or are you saying that a virtio device may want to access memory
> >> >> >> >> addresses that weren't supplied to it by the driver?
> >> >> >> >
> >> >> >> > Your logic would apply to IOMMUs as well.  For your mode, there are
> >> >> >> > specific encrypted memory regions that driver has access to but 
> >> >> >> > device
> >> >> >> > does not. that seems to violate the constraint.
> >> >> >>
> >> >> >> Right, if there's a pre-configured 1:1 mapping in the IOMMU such that
> >> >> >> the device can ignore the IOMMU for all practical purposes I would
> >> >> >> indeed say that the logic would apply to IOMMUs as well. :-)
> >> >> >>
> >> >> >> I guess I'm still struggling with the purpose of signalling to the
> >> >> >> driver that the host may not have access to memory addresses that it
> >> >> >> will never try to access.
> >> >> >
> >> >> > For example, one of the benefits is to signal to host that driver does
> >> >> > not expect ability to access all memory. If it does, host can
> >> >> > fail initialization gracefully.
> >> >>
> >> >> But why would the ability to access all memory be necessary or even
> >> >> useful? When would the host access memory that the driver didn't tell it
> >> >> to access?
> >> >
> >> > When I say all memory I mean even memory not allowed by the IOMMU.
> >>
> >> Yes, but why? How is that memory relevant?
> >
> > It's relevant when driver is not trusted to only supply correct
> > addresses. The feature was originally designed to support userspace
> > drivers within guests.
> 
> Ah, thanks for clarifying. I don't think that's a problem in our case.
> If the guest provides an incorrect address, the hardware simply won't
> allow the host to access it.
> 
> >> >> >> > Another idea is maybe something like virtio-iommu?
> >> >> >>
> >> >> >> You mean, have legacy guests use virtio-iommu to request an IOMMU
> >> >> >> bypass? If so, it's an interesting idea for new guests but it doesn't
> >> >> >> help with guests that are out today in the field, which don't have A
> >> >> >> virtio-iommu driver.
> >> >> >
> >> >> > I presume legacy guests don't use encrypted memory so why do we
> >> >> > worry about them at all?
> >> >>
> >> >> They don't use encrypted memory, but a host machine will run a mix of
> >> >> secure and legacy guests. And since the hypervisor doesn't know whether
> >> >> a guest will be secure or not at the time it is launched, legacy guests
> >> >> will have to be launched with the same configuration as secure guests.
> >> >
> >> > OK and so I think the issue is that hosts generally fail if they set
> >> > ACCESS_PLATFORM and guests do not negotiate it.
> >> > So you can not just set ACCESS_PLATFORM for everyone.
> >> > Is that the issue here?
> >>
> >> Yes, that is one half of the issue. The other is that even if hosts
> >> didn't fail, existing legacy guests wouldn't "take the initiative" of
> >> not negotiating ACCESS_PLATFORM to get the improved performance. They'd
> >> have to be modified to do that.
> >
> > So there's a non-encrypted guest, hypervisor wants to set
> > ACCESS_PLATFORM to allow encrypted guests but that will slow down legacy
> > guests since

Re: [RFC PATCH 1/4] ftrace: Expose flags used for ftrace_replace_code()

2019-05-20 Thread Steven Rostedt

On Sat, 18 May 2019 00:32:45 +0530
"Naveen N. Rao"  wrote:

> Since ftrace_replace_code() is a __weak function and can be overridden,
> we need to expose the flags that can be set. So, move the flags enum to
> the header file.
> 
> Signed-off-by: Naveen N. Rao 

Reviewed-by: Steven Rostedt (VMware) 

-- Steve

> ---
>  include/linux/ftrace.h | 5 +
>  kernel/trace/ftrace.c  | 5 -
>  2 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index 20899919ead8..835e761f63b0 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -162,6 +162,11 @@ enum {
>   FTRACE_OPS_FL_TRACE_ARRAY   = 1 << 15,
>  };
>  
> +enum {
> + FTRACE_MODIFY_ENABLE_FL = (1 << 0),
> + FTRACE_MODIFY_MAY_SLEEP_FL  = (1 << 1),
> +};
> +
>  #ifdef CONFIG_DYNAMIC_FTRACE
>  /* The hash used to know what functions callbacks trace */
>  struct ftrace_ops_hash {
> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> index b920358dd8f7..38c15cd27fc4 100644
> --- a/kernel/trace/ftrace.c
> +++ b/kernel/trace/ftrace.c
> @@ -78,11 +78,6 @@
>  #define ASSIGN_OPS_HASH(opsname, val)
>  #endif
>  
> -enum {
> - FTRACE_MODIFY_ENABLE_FL = (1 << 0),
> - FTRACE_MODIFY_MAY_SLEEP_FL  = (1 << 1),
> -};
> -
>  struct ftrace_ops ftrace_list_end __read_mostly = {
>   .func   = ftrace_stub,
>   .flags  = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,

Re: Linux 5.1-rc5

2019-05-20 Thread Greg KH

On Thu, May 02, 2019 at 05:10:55PM +0200, Martin Schwidefsky wrote:
> On Thu, 2 May 2019 16:31:10 +0200
> Greg KH  wrote:
> 
> > On Thu, May 02, 2019 at 04:17:58PM +0200, Martin Schwidefsky wrote:
> > > On Thu, 2 May 2019 14:21:28 +0200
> > > Greg KH  wrote:
> > >   
> > > > On Mon, Apr 15, 2019 at 09:17:10AM -0700, Linus Torvalds wrote:  
> > > > > On Sun, Apr 14, 2019 at 10:19 PM Christoph Hellwig 
> > > > >  wrote:
> > > > > >
> > > > > > Can we please have the page refcount overflow fixes out on the list
> > > > > > for review, even if it is after the fact?
> > > > > 
> > > > > They were actually on a list for review long before the fact, but it
> > > > > was the security mailing list. The issue actually got discussed back
> > > > > in January along with early versions of the patches, but then we
> > > > > dropped the ball because it just wasn't on anybody's radar and it got
> > > > > resurrected late March. Willy wrote a rather bigger patch-series, and
> > > > > review of that is what then resulted in those commits. So they may
> > > > > look recent, but that's just because the original patches got
> > > > > seriously edited down and rewritten.
> > > > > 
> > > > > That said, powerpc and s390 should at least look at maybe adding a
> > > > > check for the page ref in their gup paths too. Powerpc has the special
> > > > > gup_hugepte() case, and s390 has its own version of gup entirely. I
> > > > > was actually hoping the s390 guys would look at using the generic gup
> > > > > code.
> > > > > 
> > > > > I ruthlessly also entirely ignored MIPS, SH and sparc, since they seem
> > > > > largely irrelevant, partly since even theoretically this whole issue
> > > > > needs a _lot_ of memory.
> > > > > 
> > > > > Michael, Martin, see commit 6b3a70773630 ("Merge branch 'page-refs'
> > > > > (page ref overflow)"). You may or may not really care.
> > > > 
> > > > I've now queued these patches up for the next round of stable releases,
> > > > as some people seem to care about these.
> > > > 
> > > > I didn't see any follow-on patches for s390 or ppc64 hit the tree for
> > > > these changes, am I just missing them and should also queue up a few
> > > > more to handle this issue on those platforms?  
> > > 
> > > I fixed that with a different approach. The following two patches are
> > > queued for the next merge window:
> > > 
> > > d1874a0c2805 "s390/mm: make the pxd_offset functions more robust"
> > > 1a42010cdc26 "s390/mm: convert to the generic get_user_pages_fast code"
> > > 
> > > With these two s390 now uses the generic gup code in mm/gup.c  
> > 
> > Nice!  Do you want me to queue those up for the stable backports once
> > they hit a public -rc release?
> 
> Yes please!

Now queued up to 5.0 and 5.1, but did not apply to 4.19 or older :(

thanks,

greg k-h

[PATCH] selftests/powerpc: Add a test of spectre_v2 mitigations

2019-05-20 Thread Michael Ellerman

This test uses the PMU to count branch prediction hits/misses for a
known loop, and compare the result to the reported spectre v2
mitigation.

This gives us a way of sanity checking that the reported mitigation is
actually in effect.

Sample output for some cases, eg:

  Power9:
sysfs reports: 'Vulnerable'
 PM_BR_PRED_CCACHE: result368 running/enabled 5792777124
PM_BR_MPRED_CCACHE: result319 running/enabled 5792775546
 PM_BR_PRED_PCACHE: result 2147483281 running/enabled 5792773128
PM_BR_MPRED_PCACHE: result  213604201 running/enabled 5792771640
Miss percent 9 %
OK - Measured branch prediction rates match reported spectre v2 mitigation.

sysfs reports: 'Mitigation: Indirect branch serialisation (kernel only)'
 PM_BR_PRED_CCACHE: result895 running/enabled 5780320920
PM_BR_MPRED_CCACHE: result822 running/enabled 5780312414
 PM_BR_PRED_PCACHE: result 2147482754 running/enabled 5780308836
PM_BR_MPRED_PCACHE: result  213639731 running/enabled 5780307912
Miss percent 9 %
OK - Measured branch prediction rates match reported spectre v2 mitigation.

sysfs reports: 'Mitigation: Indirect branch cache disabled'
 PM_BR_PRED_CCACHE: result 2147483649 running/enabled 20540186160
PM_BR_MPRED_CCACHE: result 2147483649 running/enabled 20540180056
 PM_BR_PRED_PCACHE: result  0 running/enabled 20540176090
PM_BR_MPRED_PCACHE: result  0 running/enabled 20540174182
Miss percent 100 %
OK - Measured branch prediction rates match reported spectre v2 mitigation.

  Power8:
sysfs reports: 'Vulnerable'
 PM_BR_PRED_CCACHE: result 2147483649 running/enabled 3505888142
PM_BR_MPRED_CCACHE: result  9 running/enabled 3505882788
Miss percent 0 %
OK - Measured branch prediction rates match reported spectre v2 mitigation.

sysfs reports: 'Mitigation: Indirect branch cache disabled'
 PM_BR_PRED_CCACHE: result 2147483649 running/enabled 16931421988
PM_BR_MPRED_CCACHE: result 2147483649 running/enabled 16931416478
Miss percent 100 %
OK - Measured branch prediction rates match reported spectre v2 mitigation.
success: spectre_v2

Signed-off-by: Michael Ellerman 
---
 .../testing/selftests/powerpc/include/utils.h |   1 +
 .../selftests/powerpc/security/Makefile   |   3 +-
 .../selftests/powerpc/security/branch_loops.S |  82 +++
 .../selftests/powerpc/security/spectre_v2.c   | 218 ++
 tools/testing/selftests/powerpc/utils.c   |  20 ++
 5 files changed, 323 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/security/branch_loops.S
 create mode 100644 tools/testing/selftests/powerpc/security/spectre_v2.c

diff --git a/tools/testing/selftests/powerpc/include/utils.h 
b/tools/testing/selftests/powerpc/include/utils.h
index 7636bf45d5d5..be493b4823f7 100644
--- a/tools/testing/selftests/powerpc/include/utils.h
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -34,6 +34,7 @@ int pick_online_cpu(void);
 
 int read_debugfs_file(char *debugfs_file, int *result);
 int write_debugfs_file(char *debugfs_file, int result);
+int read_sysfs_file(char *debugfs_file, char *result, size_t result_size);
 void set_dscr(unsigned long val);
 int perf_event_open_counter(unsigned int type,
unsigned long config, int group_fd);
diff --git a/tools/testing/selftests/powerpc/security/Makefile 
b/tools/testing/selftests/powerpc/security/Makefile
index 85861c46b445..d68e6031695c 100644
--- a/tools/testing/selftests/powerpc/security/Makefile
+++ b/tools/testing/selftests/powerpc/security/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0+
 
-TEST_GEN_PROGS := rfi_flush
+TEST_GEN_PROGS := rfi_flush spectre_v2
 top_srcdir = ../../../../..
 
 CFLAGS += -I../../../../../usr/include
@@ -8,3 +8,4 @@ CFLAGS += -I../../../../../usr/include
 include ../../lib.mk
 
 $(TEST_GEN_PROGS): ../harness.c ../utils.c
+$(OUTPUT)/spectre_v2: ../pmu/event.c branch_loops.S
diff --git a/tools/testing/selftests/powerpc/security/branch_loops.S 
b/tools/testing/selftests/powerpc/security/branch_loops.S
new file mode 100644
index ..22e9204e3421
--- /dev/null
+++ b/tools/testing/selftests/powerpc/security/branch_loops.S
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Copyright 2019, Michael Ellerman, IBM Corp.
+ */
+
+#include 
+
+   .data
+
+jump_table:
+   .long   0x0
+   .long   (.Lstate_1 - .Lstate_0)
+   .long   (.Lstate_2 - .Lstate_0)
+   .long   (.Lstate_3 - .Lstate_0)
+   .long   (.Lstate_4 - .Lstate_0)
+   .long   (.Lstate_5 - .Lstate_0)
+   .long   (.Lstate_6 - .Lstate_0)
+   .long   (.Lstate_7 - .Lstate_0)
+
+   .text
+
+#define ITER_SHIFT 31
+
+.macro state number
+   .balign 32
+.Lstate_\number:
+   .if \number==7
+   li  r3, 0
+   .else
+   li  r3, \number+1
+   .endif
+   b   .Lloop
+.endm
+

[PATCH v2] selftests/powerpc: Add a test of bad (out-of-range) accesses

2019-05-20 Thread Michael Ellerman

Userspace isn't allowed to access certain address ranges, make sure we
actually test that to at least some degree.

This would have caught the recent bug where the SLB fault handler was
incorrectly called on an out-of-range access when using the Radix MMU.
It also would have caught the bug we had in get_region_id() where we
were inserting SLB entries for bad addresses.

Signed-off-by: Michael Ellerman 
Signed-off-by: Nicholas Piggin 
---
v2: Incorporate changes from Nick's version. Check the faulted address,
and make sure we get BNDERR for things that are completely out of
bounds.

 tools/testing/selftests/powerpc/mm/.gitignore |   3 +-
 tools/testing/selftests/powerpc/mm/Makefile   |   4 +-
 .../selftests/powerpc/mm/bad_accesses.c   | 167 ++
 3 files changed, 172 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/mm/bad_accesses.c

diff --git a/tools/testing/selftests/powerpc/mm/.gitignore 
b/tools/testing/selftests/powerpc/mm/.gitignore
index ba919308fe30..c189170ba041 100644
--- a/tools/testing/selftests/powerpc/mm/.gitignore
+++ b/tools/testing/selftests/powerpc/mm/.gitignore
@@ -3,4 +3,5 @@ subpage_prot
 tempfile
 prot_sao
 segv_errors
-wild_bctr
\ No newline at end of file
+wild_bctr
+bad_accesses
\ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/mm/Makefile 
b/tools/testing/selftests/powerpc/mm/Makefile
index 43d68420e363..f9667f980241 100644
--- a/tools/testing/selftests/powerpc/mm/Makefile
+++ b/tools/testing/selftests/powerpc/mm/Makefile
@@ -2,7 +2,8 @@
 noarg:
$(MAKE) -C ../
 
-TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors 
wild_bctr
+TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors \
+ wild_bctr bad_accesses
 TEST_GEN_FILES := tempfile
 
 top_srcdir = ../../../../..
@@ -13,6 +14,7 @@ $(TEST_GEN_PROGS): ../harness.c
 $(OUTPUT)/prot_sao: ../utils.c
 
 $(OUTPUT)/wild_bctr: CFLAGS += -m64
+$(OUTPUT)/bad_accesses: CFLAGS += -m64
 
 $(OUTPUT)/tempfile:
dd if=/dev/zero of=$@ bs=64k count=1
diff --git a/tools/testing/selftests/powerpc/mm/bad_accesses.c 
b/tools/testing/selftests/powerpc/mm/bad_accesses.c
new file mode 100644
index ..f18b2b91b263
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/bad_accesses.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Copyright 2019, Michael Ellerman, IBM Corp.
+//
+// Test that out-of-bounds reads/writes behave as expected.
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "utils.h"
+
+
+// 64-bit kernel is always here
+#define PAGE_OFFSET(0xcul << 60)
+
+static unsigned long kernel_virt_end;
+
+static volatile int fault_code;
+static volatile unsigned long fault_addr;
+static jmp_buf setjmp_env;
+
+static void segv_handler(int n, siginfo_t *info, void *ctxt_v)
+{
+   fault_code = info->si_code;
+   fault_addr = (unsigned long)info->si_addr;
+   siglongjmp(setjmp_env, 1);
+}
+
+int bad_access(char *p, bool write)
+{
+   char x;
+
+   fault_code = 0;
+   fault_addr = 0;
+
+   if (sigsetjmp(setjmp_env, 1) == 0) {
+   if (write)
+   *p = 1;
+   else
+   x = *p;
+
+   printf("Bad - no SEGV! (%c)\n", x);
+   return 1;
+   }
+
+   // If we see MAPERR that means we took a page fault rather than an SLB
+   // miss. We only expect to take page faults for addresses within the
+   // valid kernel range.
+   FAIL_IF(fault_code == SEGV_MAPERR && \
+   (fault_addr < PAGE_OFFSET || fault_addr >= kernel_virt_end));
+
+   FAIL_IF(fault_code != SEGV_MAPERR && fault_code != SEGV_BNDERR);
+
+   return 0;
+}
+
+static int using_hash_mmu(bool *using_hash)
+{
+   char line[128];
+   FILE *f;
+   int rc;
+
+   f = fopen("/proc/cpuinfo", "r");
+   FAIL_IF(!f);
+
+   rc = 0;
+   while (fgets(line, sizeof(line), f) != NULL) {
+   if (strcmp(line, "MMU   : Hash\n") == 0) {
+   *using_hash = true;
+   goto out;
+   }
+
+   if (strcmp(line, "MMU   : Radix\n") == 0) {
+   *using_hash = false;
+   goto out;
+   }
+   }
+
+   rc = -1;
+out:
+   fclose(f);
+   return rc;
+}
+
+static int test(void)
+{
+   unsigned long i, j, addr, region_shift, page_shift, page_size;
+   struct sigaction sig;
+   bool hash_mmu;
+
+   sig = (struct sigaction) {
+   .sa_sigaction = segv_handler,
+   .sa_flags = SA_SIGINFO,
+   };
+
+   FAIL_IF(sigaction(SIGSEGV, , NULL) != 0);
+
+   FAIL_IF(using_hash_mmu(_mmu));
+
+   page_size = sysconf(_SC_PAGESIZE);
+   if (page_size == (64 * 1024))
+   page_shift = 16;
+   else
+   page_shift = 12;
+
+

[PATCH v2] selftests/powerpc: Add a test of bad (out-of-range) accesses

2019-05-20 Thread Michael Ellerman

Userspace isn't allowed to access certain address ranges, make sure we
actually test that to at least some degree.

This would have caught the recent bug where the SLB fault handler was
incorrectly called on an out-of-range access when using the Radix MMU.
It also would have caught the bug we had in get_region_id() where we
were inserting SLB entries for bad addresses.

Signed-off-by: Michael Ellerman 
Signed-off-by: Nicholas Piggin 
---
v2: Incorporate changes from Nick's version. Check the faulted address,
and make sure we get BNDERR for things that are completely out of
bounds.

 tools/testing/selftests/powerpc/mm/.gitignore |   3 +-
 tools/testing/selftests/powerpc/mm/Makefile   |   4 +-
 .../selftests/powerpc/mm/bad_accesses.c   | 167 ++
 3 files changed, 172 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/mm/bad_accesses.c

diff --git a/tools/testing/selftests/powerpc/mm/.gitignore 
b/tools/testing/selftests/powerpc/mm/.gitignore
index ba919308fe30..c189170ba041 100644
--- a/tools/testing/selftests/powerpc/mm/.gitignore
+++ b/tools/testing/selftests/powerpc/mm/.gitignore
@@ -3,4 +3,5 @@ subpage_prot
 tempfile
 prot_sao
 segv_errors
-wild_bctr
\ No newline at end of file
+wild_bctr
+bad_accesses
\ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/mm/Makefile 
b/tools/testing/selftests/powerpc/mm/Makefile
index 43d68420e363..f9667f980241 100644
--- a/tools/testing/selftests/powerpc/mm/Makefile
+++ b/tools/testing/selftests/powerpc/mm/Makefile
@@ -2,7 +2,8 @@
 noarg:
$(MAKE) -C ../
 
-TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors 
wild_bctr
+TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors \
+ wild_bctr bad_accesses
 TEST_GEN_FILES := tempfile
 
 top_srcdir = ../../../../..
@@ -13,6 +14,7 @@ $(TEST_GEN_PROGS): ../harness.c
 $(OUTPUT)/prot_sao: ../utils.c
 
 $(OUTPUT)/wild_bctr: CFLAGS += -m64
+$(OUTPUT)/bad_accesses: CFLAGS += -m64
 
 $(OUTPUT)/tempfile:
dd if=/dev/zero of=$@ bs=64k count=1
diff --git a/tools/testing/selftests/powerpc/mm/bad_accesses.c 
b/tools/testing/selftests/powerpc/mm/bad_accesses.c
new file mode 100644
index ..f18b2b91b263
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/bad_accesses.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Copyright 2019, Michael Ellerman, IBM Corp.
+//
+// Test that out-of-bounds reads/writes behave as expected.
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "utils.h"
+
+
+// 64-bit kernel is always here
+#define PAGE_OFFSET(0xcul << 60)
+
+static unsigned long kernel_virt_end;
+
+static volatile int fault_code;
+static volatile unsigned long fault_addr;
+static jmp_buf setjmp_env;
+
+static void segv_handler(int n, siginfo_t *info, void *ctxt_v)
+{
+   fault_code = info->si_code;
+   fault_addr = (unsigned long)info->si_addr;
+   siglongjmp(setjmp_env, 1);
+}
+
+int bad_access(char *p, bool write)
+{
+   char x;
+
+   fault_code = 0;
+   fault_addr = 0;
+
+   if (sigsetjmp(setjmp_env, 1) == 0) {
+   if (write)
+   *p = 1;
+   else
+   x = *p;
+
+   printf("Bad - no SEGV! (%c)\n", x);
+   return 1;
+   }
+
+   // If we see MAPERR that means we took a page fault rather than an SLB
+   // miss. We only expect to take page faults for addresses within the
+   // valid kernel range.
+   FAIL_IF(fault_code == SEGV_MAPERR && \
+   (fault_addr < PAGE_OFFSET || fault_addr >= kernel_virt_end));
+
+   FAIL_IF(fault_code != SEGV_MAPERR && fault_code != SEGV_BNDERR);
+
+   return 0;
+}
+
+static int using_hash_mmu(bool *using_hash)
+{
+   char line[128];
+   FILE *f;
+   int rc;
+
+   f = fopen("/proc/cpuinfo", "r");
+   FAIL_IF(!f);
+
+   rc = 0;
+   while (fgets(line, sizeof(line), f) != NULL) {
+   if (strcmp(line, "MMU   : Hash\n") == 0) {
+   *using_hash = true;
+   goto out;
+   }
+
+   if (strcmp(line, "MMU   : Radix\n") == 0) {
+   *using_hash = false;
+   goto out;
+   }
+   }
+
+   rc = -1;
+out:
+   fclose(f);
+   return rc;
+}
+
+static int test(void)
+{
+   unsigned long i, j, addr, region_shift, page_shift, page_size;
+   struct sigaction sig;
+   bool hash_mmu;
+
+   sig = (struct sigaction) {
+   .sa_sigaction = segv_handler,
+   .sa_flags = SA_SIGINFO,
+   };
+
+   FAIL_IF(sigaction(SIGSEGV, , NULL) != 0);
+
+   FAIL_IF(using_hash_mmu(_mmu));
+
+   page_size = sysconf(_SC_PAGESIZE);
+   if (page_size == (64 * 1024))
+   page_shift = 16;
+   else
+   page_shift = 12;
+
+

[PATCH v4 1/3] PM: wakeup: Add routine to help fetch wakeup source object.

2019-05-20 Thread Ran Wang

Some user might want to go through all registered wakeup sources
and doing things accordingly. For example, SoC PM driver might need to
do HW programming to prevent powering down specific IP which wakeup
source depending on. And is user's responsibility to identify if this
wakeup source he is interested in.

Signed-off-by: Ran Wang 
---
Change in v4:
- None.

Change in v3:
- Adjust indentation of *attached_dev;.

Change in v2:
- None.

 drivers/base/power/wakeup.c |   18 ++
 include/linux/pm_wakeup.h   |3 +++
 2 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 5b2b6a0..6904485 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -226,6 +227,22 @@ void wakeup_source_unregister(struct wakeup_source *ws)
}
 }
 EXPORT_SYMBOL_GPL(wakeup_source_unregister);
+/**
+ * wakeup_source_get_next - Get next wakeup source from the list
+ * @ws: Previous wakeup source object, null means caller want first one.
+ */
+struct wakeup_source *wakeup_source_get_next(struct wakeup_source *ws)
+{
+   struct list_head *ws_head = _sources;
+
+   if (ws)
+   return list_next_or_null_rcu(ws_head, >entry,
+   struct wakeup_source, entry);
+   else
+   return list_entry_rcu(ws_head->next,
+   struct wakeup_source, entry);
+}
+EXPORT_SYMBOL_GPL(wakeup_source_get_next);
 
 /**
  * device_wakeup_attach - Attach a wakeup source object to a device object.
@@ -242,6 +259,7 @@ static int device_wakeup_attach(struct device *dev, struct 
wakeup_source *ws)
return -EEXIST;
}
dev->power.wakeup = ws;
+   ws->attached_dev = dev;
if (dev->power.wakeirq)
device_wakeup_attach_irq(dev, dev->power.wakeirq);
spin_unlock_irq(>power.lock);
diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index 0ff134d..913b2fb 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -50,6 +50,7 @@
  * @wakeup_count: Number of times the wakeup source might abort suspend.
  * @active: Status of the wakeup source.
  * @has_timeout: The wakeup source has been activated with a timeout.
+ * @attached_dev: The device it attached to
  */
 struct wakeup_source {
const char  *name;
@@ -70,6 +71,7 @@ struct wakeup_source {
unsigned long   wakeup_count;
boolactive:1;
boolautosleep_enabled:1;
+   struct device   *attached_dev;
 };
 
 #ifdef CONFIG_PM_SLEEP
@@ -101,6 +103,7 @@ static inline void device_set_wakeup_path(struct device 
*dev)
 extern void wakeup_source_remove(struct wakeup_source *ws);
 extern struct wakeup_source *wakeup_source_register(const char *name);
 extern void wakeup_source_unregister(struct wakeup_source *ws);
+extern struct wakeup_source *wakeup_source_get_next(struct wakeup_source *ws);
 extern int device_wakeup_enable(struct device *dev);
 extern int device_wakeup_disable(struct device *dev);
 extern void device_set_wakeup_capable(struct device *dev, bool capable);
-- 
1.7.1

[PATCH v4 3/3] soc: fsl: add RCPM driver

2019-05-20 Thread Ran Wang

The NXP's QorIQ Processors based on ARM Core have RCPM module
(Run Control and Power Management), which performs all device-level
tasks associated with power management such as wakeup source control.

This driver depends on PM wakeup source framework which help to
collect wake information.

Signed-off-by: Ran Wang 
Acked-by: Pavel Machek 
---
Change in v4:
- Remove extra ',' in author line of rcpm.c
- Update usage of wakeup_source_get_next() to be less confusing to the
  reader, code logic remain the same.

Change in v3:
- Some whitespace ajdustment.

Change in v2:
- Rebase Kconfig and Makefile update to latest mainline.

 drivers/soc/fsl/Kconfig  |8 +++
 drivers/soc/fsl/Makefile |1 +
 drivers/soc/fsl/rcpm.c   |  122 ++
 3 files changed, 131 insertions(+), 0 deletions(-)
 create mode 100644 drivers/soc/fsl/rcpm.c

diff --git a/drivers/soc/fsl/Kconfig b/drivers/soc/fsl/Kconfig
index 61f8e14..8e84e40 100644
--- a/drivers/soc/fsl/Kconfig
+++ b/drivers/soc/fsl/Kconfig
@@ -29,4 +29,12 @@ config FSL_MC_DPIO
  other DPAA2 objects. This driver does not expose the DPIO
  objects individually, but groups them under a service layer
  API.
+
+config FSL_RCPM
+   bool "Freescale RCPM support"
+   depends on PM_SLEEP
+   help
+ The NXP's QorIQ Processors based on ARM Core have RCPM module
+ (Run Control and Power Management), which performs all device-level
+ tasks associated with power management, such as wakeup source control.
 endmenu
diff --git a/drivers/soc/fsl/Makefile b/drivers/soc/fsl/Makefile
index 803ef1b..c1be6ee 100644
--- a/drivers/soc/fsl/Makefile
+++ b/drivers/soc/fsl/Makefile
@@ -7,3 +7,4 @@ obj-$(CONFIG_QUICC_ENGINE)  += qe/
 obj-$(CONFIG_CPM)  += qe/
 obj-$(CONFIG_FSL_GUTS) += guts.o
 obj-$(CONFIG_FSL_MC_DPIO)  += dpio/
+obj-$(CONFIG_FSL_RCPM) += rcpm.o
diff --git a/drivers/soc/fsl/rcpm.c b/drivers/soc/fsl/rcpm.c
new file mode 100644
index 000..ff27adc
--- /dev/null
+++ b/drivers/soc/fsl/rcpm.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// rcpm.c - Freescale QorIQ RCPM driver
+//
+// Copyright 2019 NXP
+//
+// Author: Ran Wang 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define RCPM_WAKEUP_CELL_MAX_SIZE  7
+
+struct rcpm {
+   unsigned intwakeup_cells;
+   void __iomem*ippdexpcr_base;
+   boollittle_endian;
+};
+
+static int rcpm_pm_prepare(struct device *dev)
+{
+   struct device_node  *np = dev->of_node;
+   struct wakeup_source*ws;
+   struct rcpm *rcpm;
+   u32 value[RCPM_WAKEUP_CELL_MAX_SIZE + 1], tmp;
+   int i, ret;
+
+   rcpm = dev_get_drvdata(dev);
+   if (!rcpm)
+   return -EINVAL;
+
+   /* Begin with first registered wakeup source */
+   ws = NULL;
+   while (wakeup_source_get_next(ws)) {
+   ret = device_property_read_u32_array(ws->attached_dev,
+   "fsl,rcpm-wakeup", value, rcpm->wakeup_cells + 
1);
+
+   /*  Wakeup source should refer to current rcpm device */
+   if (ret || (np->phandle != value[0])) {
+   dev_info(dev, "%s doesn't refer to this rcpm\n",
+   ws->name);
+   continue;
+   }
+
+   for (i = 0; i < rcpm->wakeup_cells; i++) {
+   /* We can only OR related bits */
+   if (value[i + 1]) {
+   if (rcpm->little_endian) {
+   tmp = ioread32(rcpm->ippdexpcr_base + i 
* 4);
+   tmp |= value[i + 1];
+   iowrite32(tmp, rcpm->ippdexpcr_base + i 
* 4);
+   } else {
+   tmp = ioread32be(rcpm->ippdexpcr_base + 
i * 4);
+   tmp |= value[i + 1];
+   iowrite32be(tmp, rcpm->ippdexpcr_base + 
i * 4);
+   }
+   }
+   }
+   }
+
+   return 0;
+}
+
+static const struct dev_pm_ops rcpm_pm_ops = {
+   .prepare =  rcpm_pm_prepare,
+};
+
+static int rcpm_probe(struct platform_device *pdev)
+{
+   struct device   *dev = >dev;
+   struct resource *r;
+   struct rcpm *rcpm;
+   int ret;
+
+   rcpm = devm_kzalloc(dev, sizeof(*rcpm), GFP_KERNEL);
+   if (!rcpm)
+   return -ENOMEM;
+
+   r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+   if (!r)
+   return -ENODEV;
+
+   rcpm->ippdexpcr_base = devm_ioremap_resource(>dev, r);
+   if (IS_ERR(rcpm->ippdexpcr_base)) {
+   ret =  PTR_ERR(rcpm->ippdexpcr_base);

[PATCH v4 2/3] Documentation: dt: binding: fsl: Add 'little-endian' and update Chassis define

2019-05-20 Thread Ran Wang

By default, QorIQ SoC's RCPM register block is Big Endian. But
there are some exceptions, such as LS1088A and LS2088A, are Little
Endian. So add this optional property to help identify them.

Actually LS2021A and other Layerscapes won't totally follow Chassis
2.1, so separate them from powerpc SoC.

Signed-off-by: Ran Wang 
---
Change in v4:
- Adjust indectation of 'ls1021a, ls1012a, ls1043a, ls1046a'.

Change in v3:
- None.

Change in v2:
- None.

 Documentation/devicetree/bindings/soc/fsl/rcpm.txt |8 +++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt 
b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt
index e284e4e..058154c 100644
--- a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt
+++ b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt
@@ -20,6 +20,7 @@ Required properites:
* "fsl,qoriq-rcpm-1.0": for chassis 1.0 rcpm
* "fsl,qoriq-rcpm-2.0": for chassis 2.0 rcpm
* "fsl,qoriq-rcpm-2.1": for chassis 2.1 rcpm
+   * "fsl,qoriq-rcpm-2.1+": for chassis 2.1+ rcpm
 
 All references to "1.0" and "2.0" refer to the QorIQ chassis version to
 which the chip complies.
@@ -27,7 +28,12 @@ Chassis Version  Example Chips
 ------
 1.0p4080, p5020, p5040, p2041, p3041
 2.0t4240, b4860, b4420
-2.1t1040, ls1021
+2.1t1040,
+2.1+   ls1021a, ls1012a, ls1043a, ls1046a
+
+Optional properties:
+ - little-endian : RCPM register block is Little Endian. Without it RCPM
+   will be Big Endian (default case).
 
 Example:
 The RCPM node for T4240:
-- 
1.7.1

Re: [PATCH] kbuild: do not check name uniqueness of builtin modules

2019-05-20 Thread Greg KH

On Mon, May 20, 2019 at 11:54:37AM +0900, Masahiro Yamada wrote:
> I just thought it was a good idea to scan builtin.modules in the name
> uniqueness checking, but Stephen reported a false positive.
> 
> ppc64_defconfig produces:
> 
>   warning: same basename if the following are built as modules:
> arch/powerpc/platforms/powermac/nvram.ko
> drivers/char/nvram.ko
> 
> ..., which is a false positive because the former is never built as
> a module as you see in arch/powerpc/platforms/powermac/Makefile:
> 
>   # CONFIG_NVRAM is an arch. independent tristate symbol, for pmac32 we really
>   # need this to be a bool.  Cheat here and pretend CONFIG_NVRAM=m is really
>   # CONFIG_NVRAM=y
>   obj-$(CONFIG_NVRAM:m=y) += nvram.o
> 
> Since we cannot predict how tricky Makefiles are written in wild,
> builtin.modules may potentially contain false positives. I do not
> think it is a big deal as far as kmod is concerned, but false positive
> warnings in the kernel build makes people upset. It is better to not
> do it.
> 
> Even without checking builtin.modules, we have enough (and more solid)
> test coverage with allmodconfig.
> 
> While I touched this part, I replaced the sed code with neater one
> provided by Stephen.
> 
> Link: https://lkml.org/lkml/2019/5/19/120
> Link: https://lkml.org/lkml/2019/5/19/123
> Fixes: 3a48a91901c5 ("kbuild: check uniqueness of module names")
> Reported-by: Stephen Rothwell 
> Signed-off-by: Masahiro Yamada 
> ---

Reviewed-by: Greg Kroah-Hartman

Re: [PATCH] kbuild: do not check name uniqueness of builtin modules

2019-05-20 Thread Arnd Bergmann

On Mon, May 20, 2019 at 4:57 AM Masahiro Yamada
 wrote:
>
> I just thought it was a good idea to scan builtin.modules in the name
> uniqueness checking, but Stephen reported a false positive.
>
> ppc64_defconfig produces:
>
>   warning: same basename if the following are built as modules:
> arch/powerpc/platforms/powermac/nvram.ko
> drivers/char/nvram.ko
>
> ..., which is a false positive because the former is never built as
> a module as you see in arch/powerpc/platforms/powermac/Makefile:
>
>   # CONFIG_NVRAM is an arch. independent tristate symbol, for pmac32 we really
>   # need this to be a bool.  Cheat here and pretend CONFIG_NVRAM=m is really
>   # CONFIG_NVRAM=y
>   obj-$(CONFIG_NVRAM:m=y) += nvram.o
>
> Since we cannot predict how tricky Makefiles are written in wild,
> builtin.modules may potentially contain false positives. I do not
> think it is a big deal as far as kmod is concerned, but false positive
> warnings in the kernel build makes people upset. It is better to not
> do it.
>
> Even without checking builtin.modules, we have enough (and more solid)
> test coverage with allmodconfig.
>
> While I touched this part, I replaced the sed code with neater one
> provided by Stephen.
>
> Link: https://lkml.org/lkml/2019/5/19/120
> Link: https://lkml.org/lkml/2019/5/19/123
> Fixes: 3a48a91901c5 ("kbuild: check uniqueness of module names")
> Reported-by: Stephen Rothwell 
> Signed-off-by: Masahiro Yamada 

Looks good to me

Acked-by: Arnd Bergmann 

> ---
>
>  scripts/modules-check.sh | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/scripts/modules-check.sh b/scripts/modules-check.sh
> index 2f659530e1ec..39e8cb36ba19 100755
> --- a/scripts/modules-check.sh
> +++ b/scripts/modules-check.sh
> @@ -6,10 +6,10 @@ set -e
>  # Check uniqueness of module names
>  check_same_name_modules()
>  {
> -   for m in $(sed 's:.*/::' modules.order modules.builtin | sort | uniq 
> -d)
> +   for m in $(sed 's:.*/::' modules.order | sort | uniq -d)
> do
> -   echo "warning: same basename if the following are built as 
> modules:" >&2
> -   sed "/\/$m/!d;s:^kernel/:  :" modules.order modules.builtin 
> >&2
> +   echo "warning: same module names found:" >&2
> +   sed -n "/\/$m/s:^kernel/:  :p" modules.order >&2
> done
>  }
>
> --
> 2.17.1
>

Re: [PATCH V2 3/3] soc: fsl: add RCPM driver

2019-05-20 Thread Pavel Machek

Hi!

> > > > You are right, but the current code is "interesting". What about
> > > >
> > > > ws = NULL;
> > > > while (ws = wakeup_source_get_next(NULL)) ...
> > > >
> > > > then?
> > >
> > > Did you mean:
> > >  ws = NULL;
> > >  while (ws = wakeup_source_get_next(ws)) ...
> > >
> > >Yes, that will be the same to my original logic, do you recommend
> > > to change to this? :)
> > 
> > Yes please. It will be less confusing to the reader.
> 
> OK, if no other comment, I will work out v4, fix this and extra ','
>  
> > Thanks (and sorry for cross-talk),
> 
> That's OK, thanks for your time.

You can add

Acked-by: Pavel Machek 

to that version.

Best regards,
Pavel
-- 
DENX Software Engineering GmbH,  Managing Director: Wolfgang Denk
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany


signature.asc
Description: Digital signature

Re: [PATCH] powerpc/perf: Use cpumask_last() to determine the

2019-05-20 Thread Anju T Sudhakar


Hi,

Somehow the subject of this patch didn't appear completely here.

The Subject of this patch is as follows,

`Subject [PATCH] powerpc/perf: Use cpumask_last() to determine the 
designated cpu for nest/core units.`


Thanks,

Anju


On 5/20/19 2:35 PM, Anju T Sudhakar wrote:

Nest and core imc(In-memory Collection counters) assigns a particular
cpu as the designated target for counter data collection.
During system boot, the first online cpu in a chip gets assigned as
the designated cpu for that chip(for nest-imc) and the first online cpu
in a core gets assigned as the designated cpu for that core(for core-imc).

If the designated cpu goes offline, the next online cpu from the same
chip(for nest-imc)/core(for core-imc) is assigned as the next target,
and the event context is migrated to the target cpu.
Currently, cpumask_any_but() function is used to find the target cpu.
Though this function is expected to return a `random` cpu, this always
returns the next online cpu.

If all cpus in a chip/core is offlined in a sequential manner, starting
from the first cpu, the event migration has to happen for all the cpus
which goes offline. Since the migration process involves a grace period,
the total time taken to offline all the cpus will be significantly high.

RE: [PATCH V2 3/3] soc: fsl: add RCPM driver

2019-05-20 Thread Ran Wang

Hi Pavel,

On Monday, May 20, 2019 17:08 Pavel Machek wrote:
> > > Hi!
> > >
> > > > > > +static int rcpm_pm_prepare(struct device *dev) {
> > > > > > +   struct device_node *np = dev->of_node;
> > > > > > +   struct wakeup_source *ws;
> > > > > > +   struct rcpm *rcpm;
> > > > > > +   u32 value[RCPM_WAKEUP_CELL_MAX_SIZE + 1], tmp;
> > > > > > +   int i, ret;
> > > > > > +
> > > > > > +   rcpm = dev_get_drvdata(dev);
> > > > > > +   if (!rcpm)
> > > > > > +   return -EINVAL;
> > > > > > +
> > > > > > +   /* Begin with first registered wakeup source */
> > > > > > +   ws = wakeup_source_get_next(NULL);
> > > > > > +   while (ws) {
> > > > >
> > > > > while (ws = wakeup_source_get_next(NULL)) ?
> > > >
> > > > Actually, we only pass NULL to wakeup_source_get_next() at very
> > > > first call to get 1st wakeup source. Then in the while loop, we
> > > > will fetch next source but not 1st, that's different. I am afraid
> > > > your suggestion is not quite correct.
> > >
> > > Sorry, I seen your next version before seeing this explanation.
> > >
> > > You are right, but the current code is "interesting". What about
> > >
> > > ws = NULL;
> > > while (ws = wakeup_source_get_next(NULL)) ...
> > >
> > > then?
> >
> > Did you mean:
> >  ws = NULL;
> >  while (ws = wakeup_source_get_next(ws)) ...
> >
> >Yes, that will be the same to my original logic, do you recommend
> > to change to this? :)
> 
> Yes please. It will be less confusing to the reader.

OK, if no other comment, I will work out v4, fix this and extra ','
 
> Thanks (and sorry for cross-talk),

That's OK, thanks for your time.

Regards,
Ran

Re: [PATCH v2] ocxl: Fix potential memory leak on context creation

2019-05-20 Thread Greg Kurz

On Mon, 20 May 2019 09:16:18 +0200
Frederic Barrat  wrote:

> If we couldn't fully init a context, we were leaking memory.
> 
> Fixes: b9721d275cc2 ("ocxl: Allow external drivers to use OpenCAPI contexts")

Oops... missed that during review :-\

> Signed-off-by: Frederic Barrat 
> ---
> 
> Changelog:
> v2: reset context pointer in case of allocation failure (Andrew)
> 

Alternatively you could change the code to do:

ctx = kzalloc(sizeof(struct ocxl_context), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
.
.
.
if (pasid < 0) {
mutex_unlock(>contexts_lock);
kfree(ctx);
return pasid;
}
.
.
.
*context = ctx;
return 0;
}

This has the advantage of clearing any risk of side-effect with
*context forever, which is a safer practice IMHO.

Patch is correct anyway, so:

Reviewed-by: Greg Kurz 

>  drivers/misc/ocxl/context.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c
> index bab9c9364184..24e4fb010275 100644
> --- a/drivers/misc/ocxl/context.c
> +++ b/drivers/misc/ocxl/context.c
> @@ -22,6 +22,8 @@ int ocxl_context_alloc(struct ocxl_context **context, 
> struct ocxl_afu *afu,
>   afu->pasid_base + afu->pasid_max, GFP_KERNEL);
>   if (pasid < 0) {
>   mutex_unlock(>contexts_lock);
> + kfree(*context);
> + *context = NULL;
>   return pasid;
>   }
>   afu->pasid_count++;

Re: [PATCH V2 3/3] soc: fsl: add RCPM driver

2019-05-20 Thread Pavel Machek

On Mon 2019-05-20 09:03:50, Ran Wang wrote:
> Hi Pavel,
> 
> On Monday, May 20, 2019 16:57, Pavel Machek wrote:
> > 
> > Hi!
> > 
> > > > > +static int rcpm_pm_prepare(struct device *dev) {
> > > > > + struct device_node *np = dev->of_node;
> > > > > + struct wakeup_source *ws;
> > > > > + struct rcpm *rcpm;
> > > > > + u32 value[RCPM_WAKEUP_CELL_MAX_SIZE + 1], tmp;
> > > > > + int i, ret;
> > > > > +
> > > > > + rcpm = dev_get_drvdata(dev);
> > > > > + if (!rcpm)
> > > > > + return -EINVAL;
> > > > > +
> > > > > + /* Begin with first registered wakeup source */
> > > > > + ws = wakeup_source_get_next(NULL);
> > > > > + while (ws) {
> > > >
> > > > while (ws = wakeup_source_get_next(NULL)) ?
> > >
> > > Actually, we only pass NULL to wakeup_source_get_next() at very first
> > > call to get 1st wakeup source. Then in the while loop, we will fetch
> > > next source but not 1st, that's different. I am afraid your suggestion
> > > is not quite correct.
> > 
> > Sorry, I seen your next version before seeing this explanation.
> > 
> > You are right, but the current code is "interesting". What about
> > 
> > ws = NULL;
> > while (ws = wakeup_source_get_next(NULL)) ...
> > 
> > then?
> 
> Did you mean:
>  ws = NULL;
>  while (ws = wakeup_source_get_next(ws)) ...
> 
>Yes, that will be the same to my original logic, do you recommend to change
> to this? :)

Yes please. It will be less confusing to the reader.

Thanks (and sorry for cross-talk),
Pavel
-- 
DENX Software Engineering GmbH,  Managing Director: Wolfgang Denk
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany


signature.asc
Description: Digital signature

[PATCH] powerpc/perf: Use cpumask_last() to determine the

2019-05-20 Thread Anju T Sudhakar

Nest and core imc(In-memory Collection counters) assigns a particular
cpu as the designated target for counter data collection.
During system boot, the first online cpu in a chip gets assigned as
the designated cpu for that chip(for nest-imc) and the first online cpu
in a core gets assigned as the designated cpu for that core(for core-imc).

If the designated cpu goes offline, the next online cpu from the same
chip(for nest-imc)/core(for core-imc) is assigned as the next target,
and the event context is migrated to the target cpu.
Currently, cpumask_any_but() function is used to find the target cpu.
Though this function is expected to return a `random` cpu, this always
returns the next online cpu.

If all cpus in a chip/core is offlined in a sequential manner, starting
from the first cpu, the event migration has to happen for all the cpus
which goes offline. Since the migration process involves a grace period,
the total time taken to offline all the cpus will be significantly high.

Example:
In a system which has 2 sockets, with
NUMA node0 CPU(s): 0-87
NUMA node8 CPU(s): 88-175

Time taken to offline cpu 88-175:
real2m56.099s
user0m0.191s
sys 0m0.000s

Use cpumask_last() to choose the target cpu, when the designated cpu
goes online, so the migration will happen only when the last_cpu in the
mask goes offline. This way the time taken to offline all cpus in a
chip/core can be reduced.

With the patch, 

Time taken  to offline cpu 88-175:
real0m12.207s
user0m0.171s
sys 0m0.000s

cpumask_last() is a better way to find the target cpu, since in most of the
cases cpuhotplug is performed in an increasing order(even in ppc64_cpu).

cpumask_any_but() can still be used to check the possibility of other
online cpus from the same chip/core if the last cpu in the mask goes
offline.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/perf/imc-pmu.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 31fa753..fbfd6e7 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -366,7 +366,14 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu)
 */
nid = cpu_to_node(cpu);
l_cpumask = cpumask_of_node(nid);
-   target = cpumask_any_but(l_cpumask, cpu);
+   target = cpumask_last(l_cpumask);
+
+   /*
+* If this(target) is the last cpu in the cpumask for this chip,
+* check for any possible online cpu in the chip.
+*/
+   if (unlikely(target == cpu))
+   target = cpumask_any_but(l_cpumask, cpu);
 
/*
 * Update the cpumask with the target cpu and
@@ -671,7 +678,10 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
return 0;
 
/* Find any online cpu in that core except the current "cpu" */
-   ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
+   ncpu = cpumask_last(cpu_sibling_mask(cpu));
+
+   if (unlikely(ncpu == cpu))
+   ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
 
if (ncpu >= 0 && ncpu < nr_cpu_ids) {
cpumask_set_cpu(ncpu, _imc_cpumask);
-- 
1.8.3.1

RE: [PATCH V2 3/3] soc: fsl: add RCPM driver

2019-05-20 Thread Ran Wang

Hi Pavel,

On Monday, May 20, 2019 16:57, Pavel Machek wrote:
> 
> Hi!
> 
> > > > +static int rcpm_pm_prepare(struct device *dev) {
> > > > +   struct device_node *np = dev->of_node;
> > > > +   struct wakeup_source *ws;
> > > > +   struct rcpm *rcpm;
> > > > +   u32 value[RCPM_WAKEUP_CELL_MAX_SIZE + 1], tmp;
> > > > +   int i, ret;
> > > > +
> > > > +   rcpm = dev_get_drvdata(dev);
> > > > +   if (!rcpm)
> > > > +   return -EINVAL;
> > > > +
> > > > +   /* Begin with first registered wakeup source */
> > > > +   ws = wakeup_source_get_next(NULL);
> > > > +   while (ws) {
> > >
> > > while (ws = wakeup_source_get_next(NULL)) ?
> >
> > Actually, we only pass NULL to wakeup_source_get_next() at very first
> > call to get 1st wakeup source. Then in the while loop, we will fetch
> > next source but not 1st, that's different. I am afraid your suggestion
> > is not quite correct.
> 
> Sorry, I seen your next version before seeing this explanation.
> 
> You are right, but the current code is "interesting". What about
> 
> ws = NULL;
> while (ws = wakeup_source_get_next(NULL)) ...
> 
> then?

Did you mean:
 ws = NULL;
 while (ws = wakeup_source_get_next(ws)) ...

   Yes, that will be the same to my original logic, do you recommend to change
to this? :)

Regards,
Ran

[PATCH] powerpc/powernv: Return for invalid IMC domain

2019-05-20 Thread Anju T Sudhakar

Currently init_imc_pmu() can be failed either because
an IMC unit with invalid domain(i.e an IMC node not
supported by the kernel) is attempted a pmu-registration
or something went wrong while registering a valid IMC unit.
In both the cases kernel provides a 'Registration failed'
error message.

Example:
Log message, when trace-imc node is not supported by the kernel, and the
skiboot supports trace-imc node.

So for kernel, trace-imc node is now an unknown domain.

[1.731870] nest_phb5_imc performance monitor hardware support registered
[1.731944] nest_powerbus0_imc performance monitor hardware support 
registered
[1.734458] thread_imc performance monitor hardware support registered
[1.734460] IMC Unknown Device type
[1.734462] IMC PMU (null) Register failed
[1.734558] nest_xlink0_imc performance monitor hardware support registered
[1.734614] nest_xlink1_imc performance monitor hardware support registered
[1.734670] nest_xlink2_imc performance monitor hardware support registered
[1.747043] Initialise system trusted keyrings
[1.747054] Key type blacklist registered


To avoid ambiguity on the error message, return for invalid domain
before attempting a pmu registration. 

Fixes: 8f95faaac56c1 (`powerpc/powernv: Detect and create IMC device`)
Reported-by: Pavaman Subramaniyam 
Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 58a0794..4e8b0e1 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -161,6 +161,10 @@ static int imc_pmu_create(struct device_node *parent, int 
pmu_index, int domain)
struct imc_pmu *pmu_ptr;
u32 offset;
 
+   /* Return for unknown domain */
+   if (domain < 0)
+   return -EINVAL;
+
/* memory for pmu */
pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL);
if (!pmu_ptr)
-- 
1.8.3.1

Re: [RFC PATCH 4/4] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel

2019-05-20 Thread Naveen N. Rao


Nicholas Piggin wrote:

Naveen N. Rao's on May 18, 2019 5:02 am:

With -mprofile-kernel, gcc emits 'mflr r0', followed by 'bl _mcount' to
enable function tracing and profiling. So far, with dynamic ftrace, we
used to only patch out the branch to _mcount(). However, Nick Piggin
points out that "mflr is executed by the branch unit that can only
execute one per cycle on POWER9 and shared with branches, so it would be
nice to avoid it where possible."

We cannot simply nop out the mflr either. Michael Ellerman pointed out
that when enabling function tracing, there can be a race if tracing is
enabled when some thread was interrupted after executing a nop'ed out
mflr. In this case, the thread would execute the now-patched-in branch
to _mcount() without having executed the preceding mflr.

To solve this, we now enable function tracing in 2 steps: patch in the
mflr instruction, use synchronize_rcu_tasks() to ensure all existing
threads make progress, and then patch in the branch to _mcount(). We
override ftrace_replace_code() with a powerpc64 variant for this
purpose.

Signed-off-by: Nicholas Piggin 
Signed-off-by: Naveen N. Rao 


Nice! Thanks for doing a real patch. You needn't add my SOB there: my
hack was obviously garbage :) Suggested-by if anything, then for
clarity of changelog you can write the motivation directly rather than
quote me.


Thanks, I meant to call out the fact that I had added your SOB before
sending the patch, but missed doing so. Your patch was perfectly fine ;)



I don't know the ftrace subsystem well, but the powerpc instructions
and patching sequence appears to match what we agreed is the right way
to go.

As a suggestion, I would perhaps add most of information from the
second and third paragraphs of the changelog into comments
(and also explain that the lone mflr r0 is harmless).

But otherwise it looks good

Reviewed-by: Nicholas Piggin 


Thanks, I will incorporate those changes.


- Naveen

Re: [PATCH V2 3/3] soc: fsl: add RCPM driver

2019-05-20 Thread Pavel Machek

Hi!

> > > +static int rcpm_pm_prepare(struct device *dev) {
> > > + struct device_node *np = dev->of_node;
> > > + struct wakeup_source *ws;
> > > + struct rcpm *rcpm;
> > > + u32 value[RCPM_WAKEUP_CELL_MAX_SIZE + 1], tmp;
> > > + int i, ret;
> > > +
> > > + rcpm = dev_get_drvdata(dev);
> > > + if (!rcpm)
> > > + return -EINVAL;
> > > +
> > > + /* Begin with first registered wakeup source */
> > > + ws = wakeup_source_get_next(NULL);
> > > + while (ws) {
> > 
> > while (ws = wakeup_source_get_next(NULL)) ?
> 
> Actually, we only pass NULL to wakeup_source_get_next() at very first
> call to get 1st wakeup source. Then in the while loop, we will fetch
> next source but not 1st, that's different. I am afraid your suggestion
> is not quite correct.

Sorry, I seen your next version before seeing this explanation.

You are right, but the current code is "interesting". What about

ws = NULL;
while (ws = wakeup_source_get_next(NULL)) ...

then?

Best regards,
Pavel
-- 
DENX Software Engineering GmbH,  Managing Director: Wolfgang Denk
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany


signature.asc
Description: Digital signature

Re: PROBLEM: Power9: kernel oops on memory hotunplug from ppc64le guest

2019-05-20 Thread Bharata B Rao

On Mon, May 20, 2019 at 05:00:21PM +1000, Nicholas Piggin wrote:
> Bharata B Rao's on May 20, 2019 3:56 pm:
> > On Mon, May 20, 2019 at 02:48:35PM +1000, Nicholas Piggin wrote:
> >> >> > git bisect points to
> >> >> >
> >> >> > commit 4231aba000f5a4583dd9f67057aadb68c3eca99d
> >> >> > Author: Nicholas Piggin 
> >> >> > Date:   Fri Jul 27 21:48:17 2018 +1000
> >> >> >
> >> >> > powerpc/64s: Fix page table fragment refcount race vs speculative 
> >> >> > references
> >> >> >
> >> >> > The page table fragment allocator uses the main page refcount 
> >> >> > racily
> >> >> > with respect to speculative references. A customer observed a BUG 
> >> >> > due
> >> >> > to page table page refcount underflow in the fragment allocator. 
> >> >> > This
> >> >> > can be caused by the fragment allocator set_page_count stomping 
> >> >> > on a
> >> >> > speculative reference, and then the speculative failure handler
> >> >> > decrements the new reference, and the underflow eventually pops 
> >> >> > when
> >> >> > the page tables are freed.
> >> >> >
> >> >> > Fix this by using a dedicated field in the struct page for the 
> >> >> > page
> >> >> > table fragment allocator.
> >> >> >
> >> >> > Fixes: 5c1f6ee9a31c ("powerpc: Reduce PTE table memory wastage")
> >> >> > Cc: sta...@vger.kernel.org # v3.10+
> >> >> 
> >> >> That's the commit that added the BUG_ON(), so prior to that you won't
> >> >> see the crash.
> >> > 
> >> > Right, but the commit says it fixes page table page refcount underflow by
> >> > introducing a new field >pt_frag_refcount. Now we are hitting the 
> >> > underflow
> >> > for this pt_frag_refcount.
> >> 
> >> The fixed underflow is caused by a bug (race on page count) that got 
> >> fixed by that patch. You are hitting a different underflow here. It's
> >> not certain my patch caused it, I'm just trying to reproduce now.
> > 
> > Ok.
> 
> Can't reproduce I'm afraid, tried adding and removing 8GB memory from a
> 4GB guest (via host adding / removing memory device), and it just works.

Boot, add 8G, reboot, remove 8G is the sequence to reproduce.

> 
> It's likely to be an edge case like an off by one or rounding error
> that just happens to trigger in your config. Might be easiest if you
> could test with a debug patch.

Sure, I will continue debugging.

Regards,
Bharata.

[PATCH v2] mm/nvdimm: Pick the right alignment default when creating dax devices

2019-05-20 Thread Aneesh Kumar K.V

Allow arch to provide the supported alignments and use hugepage alignment only
if we support hugepage. Right now we depend on compile time configs whereas this
patch switch this to runtime discovery.

Architectures like ppc64 can have THP enabled in code, but then can have
hugepage size disabled by the hypervisor. This allows us to create dax devices
with PAGE_SIZE alignment in this case.

Existing dax namespace with alignment larger than PAGE_SIZE will fail to
initialize in this specific case. We still allow fsdax namespace initialization.

With respect to identifying whether to enable hugepage fault for a dax device,
if THP is enabled during compile, we default to taking hugepage fault and in dax
fault handler if we find the fault size > alignment we retry with PAGE_SIZE
fault size.

This also addresses the below failure scenario on ppc64

ndctl create-namespace --mode=devdax  | grep align
 "align":16777216,
 "align":16777216

cat /sys/devices/ndbus0/region0/dax0.0/supported_alignments
 65536 16777216

daxio.static-debug  -z -o /dev/dax0.0
  Bus error (core dumped)

  $ dmesg | tail
   lpar: Failed hash pte insert with error -4
   hash-mmu: mm: Hashing failure ! EA=0x7fff1700 access=0x8006 
current=daxio
   hash-mmu: trap=0x300 vsid=0x22cb7a3 ssize=1 base psize=2 psize 10 
pte=0xc00501002b86
   daxio[3860]: bus error (7) at 7fff1700 nip 7fff973c007c lr 7fff973bff34 
code 2 in libpmem.so.1.0.0[7fff973b+2]
   daxio[3860]: code: 792945e4 7d494b78 e95f0098 7d494b78 f93f00a0 4800012c 
e93f0088 f93f0120
   daxio[3860]: code: e93f00a0 f93f0128 e93f0120 e95f0128  e93f0088 
39290008 f93f0110

The failure was due to guest kernel using wrong page size.

The namespaces created with 16M alignment will appear as below on a config with
16M page size disabled.

$ ndctl list -Ni
[
  {
"dev":"namespace0.1",
"mode":"fsdax",
"map":"dev",
"size":5351931904,
"uuid":"fc6e9667-461a-4718-82b4-69b24570bddb",
"align":16777216,
"blockdev":"pmem0.1",
"supported_alignments":[
  65536
]
  },
  {
"dev":"namespace0.0",
"mode":"fsdax",< devdax 16M alignment marked disabled.
"map":"mem",
"size":5368709120,
"uuid":"a4bdf81a-f2ee-4bc6-91db-7b87eddd0484",
"state":"disabled"
  }
]

Signed-off-by: Aneesh Kumar K.V 
---

Changes from V1:
* Return -EOPNOTSUPP if we find that we can't initialize the namespace
* Compare fix for DAX signature.

 arch/powerpc/include/asm/libnvdimm.h |  9 
 arch/powerpc/mm/Makefile |  1 +
 arch/powerpc/mm/nvdimm.c | 34 
 arch/x86/include/asm/libnvdimm.h | 19 
 drivers/nvdimm/nd.h  |  6 -
 drivers/nvdimm/pfn_devs.c| 32 +-
 drivers/nvdimm/pmem.c| 26 +
 include/linux/huge_mm.h  |  7 +-
 8 files changed, 122 insertions(+), 12 deletions(-)
 create mode 100644 arch/powerpc/include/asm/libnvdimm.h
 create mode 100644 arch/powerpc/mm/nvdimm.c
 create mode 100644 arch/x86/include/asm/libnvdimm.h

diff --git a/arch/powerpc/include/asm/libnvdimm.h 
b/arch/powerpc/include/asm/libnvdimm.h
new file mode 100644
index ..d35fd7f48603
--- /dev/null
+++ b/arch/powerpc/include/asm/libnvdimm.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_LIBNVDIMM_H
+#define _ASM_POWERPC_LIBNVDIMM_H
+
+#define nd_pfn_supported_alignments nd_pfn_supported_alignments
+extern unsigned long *nd_pfn_supported_alignments(void);
+extern unsigned long nd_pfn_default_alignment(void);
+
+#endif
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 0f499db315d6..42e4a399ba5d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_HIGHMEM) += highmem.o
 obj-$(CONFIG_PPC_COPRO_BASE)   += copro_fault.o
 obj-$(CONFIG_PPC_PTDUMP)   += ptdump/
 obj-$(CONFIG_KASAN)+= kasan/
+obj-$(CONFIG_NVDIMM_PFN)   += nvdimm.o
diff --git a/arch/powerpc/mm/nvdimm.c b/arch/powerpc/mm/nvdimm.c
new file mode 100644
index ..a29a4510715e
--- /dev/null
+++ b/arch/powerpc/mm/nvdimm.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+
+#include 
+/*
+ * We support only pte and pmd mappings for now.
+ */
+const unsigned long *nd_pfn_supported_alignments(void)
+{
+   static unsigned long supported_alignments[3];
+
+   supported_alignments[0] = PAGE_SIZE;
+
+   if (has_transparent_hugepage())
+   supported_alignments[1] = HPAGE_PMD_SIZE;
+   else
+   supported_alignments[1] = 0;
+
+   supported_alignments[2] = 0;
+   return supported_alignments;
+}
+
+/*
+ * Use pmd mapping if supported as default alignment
+ */
+unsigned long nd_pfn_default_alignment(void)
+{
+
+   if (has_transparent_hugepage())
+   return HPAGE_PMD_SIZE;
+   return PAGE_SIZE;
+}
diff --git

RE: [EXT] Re: [PATCH 2/3] arm64: dts: ls1028a: Add PCIe controller DT nodes

2019-05-20 Thread Xiaowei Bao

Hi Arndt，

-Original Message-
From: Arnd Bergmann  
Sent: 2019年5月17日 16:59
To: Xiaowei Bao 
Cc: Bjorn Helgaas ; Rob Herring ; Mark 
Rutland ; Shawn Guo ; Leo Li 
; Kishon ; Lorenzo Pieralisi 
; gregkh ; M.h. Lian 
; Mingkai Hu ; Roy Zang 
; Kate Stewart ; Philippe 
Ombredanne ; Shawn Lin ; 
linux-pci ; DTML ; Linux 
Kernel Mailing List ; Linux ARM 
; linuxppc-dev 

Subject: Re: [EXT] Re: [PATCH 2/3] arm64: dts: ls1028a: Add PCIe controller DT 
nodes

Caution: EXT Email

On Fri, May 17, 2019 at 5:21 AM Xiaowei Bao  wrote:
> -Original Message-
> From: Arnd Bergmann 
> On Wed, May 15, 2019 at 9:36 AM Xiaowei Bao  wrote:
> > Signed-off-by: Xiaowei Bao 
> > ---
> >  arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi |   52 
> > 
> >  1 files changed, 52 insertions(+), 0 deletions(-)
> >
> > diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi 
> > b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
> > index b045812..50b579b 100644
> > --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
> > +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
> > @@ -398,6 +398,58 @@
> > status = "disabled";
> > };
> >
> > +   pcie@340 {
> > +   compatible = "fsl,ls1028a-pcie";
> > +   reg = <0x00 0x0340 0x0 0x0010   /* 
> > controller registers */
> > +  0x80 0x 0x0 0x2000>; /* 
> > configuration space */
> > +   reg-names = "regs", "config";
> > +   interrupts = , /* 
> > PME interrupt */
> > +; /* 
> > aer interrupt */
> > +   interrupt-names = "pme", "aer";
> > +   #address-cells = <3>;
> > +   #size-cells = <2>;
> > +   device_type = "pci";
> > +   dma-coherent;
> > +   num-lanes = <4>;
> > +   bus-range = <0x0 0xff>;
> > +   ranges = <0x8100 0x0 0x 0x80 0x0001 
> > 0x0 0x0001   /* downstream I/O */
> > + 0x8200 0x0 0x4000 0x80 
> > + 0x4000 0x0 0x4000>; /* non-prefetchable memory */
>
> Are you sure there is no support for 64-bit BARs or prefetchable memory?
> [Xiaowei Bao] sorry for late reply, Thought that our Layerscape platform has 
> not added prefetchable memory support in DTS, so this platform has not been 
> added, I will submit a separate patch to add prefetchable memory support for 
> all Layerscape platforms.

Ok, thanks.

> Of course, the prefetchable PCIE device can work in our boards, 
> because the RC will assign non-prefetchable memory for this device. We 
> reserve 1G no-prefetchable memory for PCIE device, it is enough for general 
> devices.

Sure, many devices work just fine, this is mostly a question of supporting 
those devices that do require multiple gigabytes, or that need prefetchable 
memory semantics to get the expected performance. GPUs are the obvious example, 
but I think there are others (infiniband?).
[Xiaowei Bao] sorry, I don't know much about infiniband and GPU, as you said, I 
think many devices works fine with this DTS, I will add the prefetchable memory 
entry in DTS future and submit another patch.

  Arnd

RE: [PATCH v3 3/3] soc: fsl: add RCPM driver

2019-05-20 Thread Ran Wang

Hi Pavel,

On Monday, May 20, 2019 15:27: Pavel Machek wrote:
> 
> Hi!
> 
> > The NXP's QorIQ Processors based on ARM Core have RCPM module (Run
> > Control and Power Management), which performs all device-level tasks
> > associated with power management such as wakeup source control.
> >
> > This driver depends on PM wakeup source framework which help to
> > collect wake information.
> >
> > Signed-off-by: Ran Wang 
> 
> > +// Copyright 2019 NXP
> > +//
> > +// Author: Ran Wang ,
> 
> extra ,

OK, will update.

> > +   rcpm = dev_get_drvdata(dev);
> > +   if (!rcpm)
> > +   return -EINVAL;
> > +
> > +   /* Begin with first registered wakeup source */
> > +   ws = wakeup_source_get_next(NULL);
> > +   while (ws) {
> 
> while (ws = wakeup_source_get_next(NULL))
> 
> ?
I just answered this in v2 mail thread: 
"Actually, we only pass NULL to wakeup_source_get_next()
at very first call to get 1st wakeup source. Then in the while
loop, we will fetch next source but not 1st, that's different.
I am afraid your suggestion is not quite correct."

Regards
Ran

Re: [PATCH v3 3/3] soc: fsl: add RCPM driver

2019-05-20 Thread Pavel Machek

Hi!

> The NXP's QorIQ Processors based on ARM Core have RCPM module
> (Run Control and Power Management), which performs all device-level
> tasks associated with power management such as wakeup source control.
> 
> This driver depends on PM wakeup source framework which help to
> collect wake information.
> 
> Signed-off-by: Ran Wang 

> +// Copyright 2019 NXP
> +//
> +// Author: Ran Wang ,

extra ,

> + rcpm = dev_get_drvdata(dev);
> + if (!rcpm)
> + return -EINVAL;
> +
> + /* Begin with first registered wakeup source */
> + ws = wakeup_source_get_next(NULL);
> + while (ws) {

while (ws = wakeup_source_get_next(NULL))

?

Pavel

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


signature.asc
Description: Digital signature

Re: [PATCH v2] ocxl: Fix potential memory leak on context creation

2019-05-20 Thread Andrew Donnellan


Acked-by: Andrew Donnellan 

On 20/5/19 5:16 pm, Frederic Barrat wrote:

If we couldn't fully init a context, we were leaking memory.

Fixes: b9721d275cc2 ("ocxl: Allow external drivers to use OpenCAPI contexts")
Signed-off-by: Frederic Barrat 
---

Changelog:
v2: reset context pointer in case of allocation failure (Andrew)

  drivers/misc/ocxl/context.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c
index bab9c9364184..24e4fb010275 100644
--- a/drivers/misc/ocxl/context.c
+++ b/drivers/misc/ocxl/context.c
@@ -22,6 +22,8 @@ int ocxl_context_alloc(struct ocxl_context **context, struct 
ocxl_afu *afu,
afu->pasid_base + afu->pasid_max, GFP_KERNEL);
if (pasid < 0) {
mutex_unlock(>contexts_lock);
+   kfree(*context);
+   *context = NULL;
return pasid;
}
afu->pasid_count++;



--
Andrew Donnellan  OzLabs, ADL Canberra
a...@linux.ibm.com IBM Australia Limited

[PATCH v2] ocxl: Fix potential memory leak on context creation

2019-05-20 Thread Frederic Barrat

If we couldn't fully init a context, we were leaking memory.

Fixes: b9721d275cc2 ("ocxl: Allow external drivers to use OpenCAPI contexts")
Signed-off-by: Frederic Barrat 
---

Changelog:
v2: reset context pointer in case of allocation failure (Andrew)

 drivers/misc/ocxl/context.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c
index bab9c9364184..24e4fb010275 100644
--- a/drivers/misc/ocxl/context.c
+++ b/drivers/misc/ocxl/context.c
@@ -22,6 +22,8 @@ int ocxl_context_alloc(struct ocxl_context **context, struct 
ocxl_afu *afu,
afu->pasid_base + afu->pasid_max, GFP_KERNEL);
if (pasid < 0) {
mutex_unlock(>contexts_lock);
+   kfree(*context);
+   *context = NULL;
return pasid;
}
afu->pasid_count++;
-- 
2.21.0

Re: PROBLEM: Power9: kernel oops on memory hotunplug from ppc64le guest

2019-05-20 Thread Nicholas Piggin

Bharata B Rao's on May 20, 2019 3:56 pm:
> On Mon, May 20, 2019 at 02:48:35PM +1000, Nicholas Piggin wrote:
>> >> > git bisect points to
>> >> >
>> >> > commit 4231aba000f5a4583dd9f67057aadb68c3eca99d
>> >> > Author: Nicholas Piggin 
>> >> > Date:   Fri Jul 27 21:48:17 2018 +1000
>> >> >
>> >> > powerpc/64s: Fix page table fragment refcount race vs speculative 
>> >> > references
>> >> >
>> >> > The page table fragment allocator uses the main page refcount racily
>> >> > with respect to speculative references. A customer observed a BUG 
>> >> > due
>> >> > to page table page refcount underflow in the fragment allocator. 
>> >> > This
>> >> > can be caused by the fragment allocator set_page_count stomping on a
>> >> > speculative reference, and then the speculative failure handler
>> >> > decrements the new reference, and the underflow eventually pops when
>> >> > the page tables are freed.
>> >> >
>> >> > Fix this by using a dedicated field in the struct page for the page
>> >> > table fragment allocator.
>> >> >
>> >> > Fixes: 5c1f6ee9a31c ("powerpc: Reduce PTE table memory wastage")
>> >> > Cc: sta...@vger.kernel.org # v3.10+
>> >> 
>> >> That's the commit that added the BUG_ON(), so prior to that you won't
>> >> see the crash.
>> > 
>> > Right, but the commit says it fixes page table page refcount underflow by
>> > introducing a new field >pt_frag_refcount. Now we are hitting the 
>> > underflow
>> > for this pt_frag_refcount.
>> 
>> The fixed underflow is caused by a bug (race on page count) that got 
>> fixed by that patch. You are hitting a different underflow here. It's
>> not certain my patch caused it, I'm just trying to reproduce now.
> 
> Ok.

Can't reproduce I'm afraid, tried adding and removing 8GB memory from a
4GB guest (via host adding / removing memory device), and it just works.

It's likely to be an edge case like an off by one or rounding error
that just happens to trigger in your config. Might be easiest if you
could test with a debug patch.

Thanks,
Nick

[PATCH v3 3/3] soc: fsl: add RCPM driver

2019-05-20 Thread Ran Wang

The NXP's QorIQ Processors based on ARM Core have RCPM module
(Run Control and Power Management), which performs all device-level
tasks associated with power management such as wakeup source control.

This driver depends on PM wakeup source framework which help to
collect wake information.

Signed-off-by: Ran Wang 
---
Change in v3:
- Some whitespace ajdustment.

Change in v2:
- Rebase Kconfig and Makefile update to latest mainline.

 drivers/soc/fsl/Kconfig  |8 +++
 drivers/soc/fsl/Makefile |1 +
 drivers/soc/fsl/rcpm.c   |  124 ++
 3 files changed, 133 insertions(+), 0 deletions(-)
 create mode 100644 drivers/soc/fsl/rcpm.c

diff --git a/drivers/soc/fsl/Kconfig b/drivers/soc/fsl/Kconfig
index 61f8e14..8e84e40 100644
--- a/drivers/soc/fsl/Kconfig
+++ b/drivers/soc/fsl/Kconfig
@@ -29,4 +29,12 @@ config FSL_MC_DPIO
  other DPAA2 objects. This driver does not expose the DPIO
  objects individually, but groups them under a service layer
  API.
+
+config FSL_RCPM
+   bool "Freescale RCPM support"
+   depends on PM_SLEEP
+   help
+ The NXP's QorIQ Processors based on ARM Core have RCPM module
+ (Run Control and Power Management), which performs all device-level
+ tasks associated with power management, such as wakeup source control.
 endmenu
diff --git a/drivers/soc/fsl/Makefile b/drivers/soc/fsl/Makefile
index 803ef1b..c1be6ee 100644
--- a/drivers/soc/fsl/Makefile
+++ b/drivers/soc/fsl/Makefile
@@ -7,3 +7,4 @@ obj-$(CONFIG_QUICC_ENGINE)  += qe/
 obj-$(CONFIG_CPM)  += qe/
 obj-$(CONFIG_FSL_GUTS) += guts.o
 obj-$(CONFIG_FSL_MC_DPIO)  += dpio/
+obj-$(CONFIG_FSL_RCPM) += rcpm.o
diff --git a/drivers/soc/fsl/rcpm.c b/drivers/soc/fsl/rcpm.c
new file mode 100644
index 000..678d1cd
--- /dev/null
+++ b/drivers/soc/fsl/rcpm.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// rcpm.c - Freescale QorIQ RCPM driver
+//
+// Copyright 2019 NXP
+//
+// Author: Ran Wang ,
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define RCPM_WAKEUP_CELL_MAX_SIZE  7
+
+struct rcpm {
+   unsigned intwakeup_cells;
+   void __iomem*ippdexpcr_base;
+   boollittle_endian;
+};
+
+static int rcpm_pm_prepare(struct device *dev)
+{
+   struct device_node  *np = dev->of_node;
+   struct wakeup_source*ws;
+   struct rcpm *rcpm;
+   u32 value[RCPM_WAKEUP_CELL_MAX_SIZE + 1], tmp;
+   int i, ret;
+
+   rcpm = dev_get_drvdata(dev);
+   if (!rcpm)
+   return -EINVAL;
+
+   /* Begin with first registered wakeup source */
+   ws = wakeup_source_get_next(NULL);
+   while (ws) {
+   ret = device_property_read_u32_array(ws->attached_dev,
+   "fsl,rcpm-wakeup", value, rcpm->wakeup_cells + 
1);
+
+   /*  Wakeup source should refer to current rcpm device */
+   if (ret || (np->phandle != value[0])) {
+   dev_info(dev, "%s doesn't refer to this rcpm\n",
+   ws->name);
+   ws = wakeup_source_get_next(ws);
+   continue;
+   }
+
+   for (i = 0; i < rcpm->wakeup_cells; i++) {
+   /* We can only OR related bits */
+   if (value[i + 1]) {
+   if (rcpm->little_endian) {
+   tmp = ioread32(rcpm->ippdexpcr_base + i 
* 4);
+   tmp |= value[i + 1];
+   iowrite32(tmp, rcpm->ippdexpcr_base + i 
* 4);
+   } else {
+   tmp = ioread32be(rcpm->ippdexpcr_base + 
i * 4);
+   tmp |= value[i + 1];
+   iowrite32be(tmp, rcpm->ippdexpcr_base + 
i * 4);
+   }
+   }
+   }
+   ws = wakeup_source_get_next(ws);
+   }
+
+   return 0;
+}
+
+static const struct dev_pm_ops rcpm_pm_ops = {
+   .prepare =  rcpm_pm_prepare,
+};
+
+static int rcpm_probe(struct platform_device *pdev)
+{
+   struct device   *dev = >dev;
+   struct resource *r;
+   struct rcpm *rcpm;
+   int ret;
+
+   rcpm = devm_kzalloc(dev, sizeof(*rcpm), GFP_KERNEL);
+   if (!rcpm)
+   return -ENOMEM;
+
+   r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+   if (!r)
+   return -ENODEV;
+
+   rcpm->ippdexpcr_base = devm_ioremap_resource(>dev, r);
+   if (IS_ERR(rcpm->ippdexpcr_base)) {
+   ret =  PTR_ERR(rcpm->ippdexpcr_base);
+   return ret;
+   }
+
+   rcpm->little_endian = device_property_read_bool(
+

[PATCH v3 1/3] PM: wakeup: Add routine to help fetch wakeup source object.

2019-05-20 Thread Ran Wang

Some user might want to go through all registered wakeup sources
and doing things accordingly. For example, SoC PM driver might need to
do HW programming to prevent powering down specific IP which wakeup
source depending on. And is user's responsibility to identify if this
wakeup source he is interested in.

Signed-off-by: Ran Wang 
---
Change in v3:
- Adjust indentation of *attached_dev;.

Change in v2:
- None.

 drivers/base/power/wakeup.c |   18 ++
 include/linux/pm_wakeup.h   |3 +++
 2 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 5b2b6a0..6904485 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -226,6 +227,22 @@ void wakeup_source_unregister(struct wakeup_source *ws)
}
 }
 EXPORT_SYMBOL_GPL(wakeup_source_unregister);
+/**
+ * wakeup_source_get_next - Get next wakeup source from the list
+ * @ws: Previous wakeup source object, null means caller want first one.
+ */
+struct wakeup_source *wakeup_source_get_next(struct wakeup_source *ws)
+{
+   struct list_head *ws_head = _sources;
+
+   if (ws)
+   return list_next_or_null_rcu(ws_head, >entry,
+   struct wakeup_source, entry);
+   else
+   return list_entry_rcu(ws_head->next,
+   struct wakeup_source, entry);
+}
+EXPORT_SYMBOL_GPL(wakeup_source_get_next);
 
 /**
  * device_wakeup_attach - Attach a wakeup source object to a device object.
@@ -242,6 +259,7 @@ static int device_wakeup_attach(struct device *dev, struct 
wakeup_source *ws)
return -EEXIST;
}
dev->power.wakeup = ws;
+   ws->attached_dev = dev;
if (dev->power.wakeirq)
device_wakeup_attach_irq(dev, dev->power.wakeirq);
spin_unlock_irq(>power.lock);
diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index 0ff134d..913b2fb 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -50,6 +50,7 @@
  * @wakeup_count: Number of times the wakeup source might abort suspend.
  * @active: Status of the wakeup source.
  * @has_timeout: The wakeup source has been activated with a timeout.
+ * @attached_dev: The device it attached to
  */
 struct wakeup_source {
const char  *name;
@@ -70,6 +71,7 @@ struct wakeup_source {
unsigned long   wakeup_count;
boolactive:1;
boolautosleep_enabled:1;
+   struct device   *attached_dev;
 };
 
 #ifdef CONFIG_PM_SLEEP
@@ -101,6 +103,7 @@ static inline void device_set_wakeup_path(struct device 
*dev)
 extern void wakeup_source_remove(struct wakeup_source *ws);
 extern struct wakeup_source *wakeup_source_register(const char *name);
 extern void wakeup_source_unregister(struct wakeup_source *ws);
+extern struct wakeup_source *wakeup_source_get_next(struct wakeup_source *ws);
 extern int device_wakeup_enable(struct device *dev);
 extern int device_wakeup_disable(struct device *dev);
 extern void device_set_wakeup_capable(struct device *dev, bool capable);
-- 
1.7.1

[PATCH v3 2/3] Documentation: dt: binding: fsl: Add 'little-endian' and update Chassis define

2019-05-20 Thread Ran Wang

By default, QorIQ SoC's RCPM register block is Big Endian. But
there are some exceptions, such as LS1088A and LS2088A, are Little
Endian. So add this optional property to help identify them.

Actually LS2021A and other Layerscapes won't totally follow Chassis
2.1, so separate them from powerpc SoC.

Signed-off-by: Ran Wang 
---
Change in v3:
- None.

Change in v2:
- None.

 Documentation/devicetree/bindings/soc/fsl/rcpm.txt |8 +++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt 
b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt
index e284e4e..058154c 100644
--- a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt
+++ b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt
@@ -20,6 +20,7 @@ Required properites:
* "fsl,qoriq-rcpm-1.0": for chassis 1.0 rcpm
* "fsl,qoriq-rcpm-2.0": for chassis 2.0 rcpm
* "fsl,qoriq-rcpm-2.1": for chassis 2.1 rcpm
+   * "fsl,qoriq-rcpm-2.1+": for chassis 2.1+ rcpm
 
 All references to "1.0" and "2.0" refer to the QorIQ chassis version to
 which the chip complies.
@@ -27,7 +28,12 @@ Chassis Version  Example Chips
 ------
 1.0p4080, p5020, p5040, p2041, p3041
 2.0t4240, b4860, b4420
-2.1t1040, ls1021
+2.1t1040,
+2.1+   ls1021a, ls1012a, ls1043a, ls1046a
+
+Optional properties:
+ - little-endian : RCPM register block is Little Endian. Without it RCPM
+   will be Big Endian (default case).
 
 Example:
 The RCPM node for T4240:
-- 
1.7.1

Re: [PATCH] ocxl: Fix potential memory leak on context creation

2019-05-20 Thread Frederic Barrat





Le 20/05/2019 à 03:45, Andrew Donnellan a écrit :

On 18/5/19 12:20 am, Frederic Barrat wrote:

If we couldn't fully init a context, we were leaking memory.

Fixes: b9721d275cc2 ("ocxl: Allow external drivers to use OpenCAPI 
contexts")

Signed-off-by: Frederic Barrat 


Acked-by: Andrew Donnellan 


---
  drivers/misc/ocxl/context.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c
index bab9c9364184..ab93156aa83e 100644
--- a/drivers/misc/ocxl/context.c
+++ b/drivers/misc/ocxl/context.c
@@ -22,6 +22,7 @@ int ocxl_context_alloc(struct ocxl_context 
**context, struct ocxl_afu *afu,

  afu->pasid_base + afu->pasid_max, GFP_KERNEL);
  if (pasid < 0) {
  mutex_unlock(>contexts_lock);
+    kfree(*context);


(defensive programming: set *context = NULL so that if the caller 
ignores the return code we get an obvious crash)



Good point. v2 on its way



  return pasid;
  }
  afu->pasid_count++;

RE: [PATCH V2 3/3] soc: fsl: add RCPM driver

2019-05-20 Thread Ran Wang

Hi Pavel,

On Monday, May 20, 2019 05:39, Pavel Machek wrote:
> 
> Hi!
> 
> 
> > +
> > +struct rcpm {
> > +   unsigned int wakeup_cells;
> > +   void __iomem *ippdexpcr_base;
> > +   boollittle_endian;
> > +};
> 
> Inconsistent whitespace

OK, will make them aligned.

> 
> > +static int rcpm_pm_prepare(struct device *dev) {
> > +   struct device_node *np = dev->of_node;
> > +   struct wakeup_source *ws;
> > +   struct rcpm *rcpm;
> > +   u32 value[RCPM_WAKEUP_CELL_MAX_SIZE + 1], tmp;
> > +   int i, ret;
> > +
> > +   rcpm = dev_get_drvdata(dev);
> > +   if (!rcpm)
> > +   return -EINVAL;
> > +
> > +   /* Begin with first registered wakeup source */
> > +   ws = wakeup_source_get_next(NULL);
> > +   while (ws) {
> 
> while (ws = wakeup_source_get_next(NULL)) ?

Actually, we only pass NULL to wakeup_source_get_next() at very first
call to get 1st wakeup source. Then in the while loop, we will fetch
next source but not 1st, that's different. I am afraid your suggestion
is not quite correct.

> 
> > +static int rcpm_probe(struct platform_device *pdev) {
> > +   struct device   *dev = >dev;
> > +   struct resource *r;
> > +   struct rcpm *rcpm;
> > +   int ret;
> 
> Whitespace.

OK, will update, thanks for your review.

Regards,
Ran

Re: [RFC PATCH v2 09/10] KVM: PPC: Book3S HV: Fixed for running secure guests

2019-05-20 Thread Paul Mackerras

On Sat, May 18, 2019 at 11:25:23AM -0300, Claudio Carvalho wrote:
> From: Paul Mackerras 
> 
> - Pass SRR1 in r11 for UV_RETURN because SRR0 and SRR1 get set by
>   the sc 2 instruction. (Note r3 - r10 potentially have hcall return
>   values in them.)
> 
> - Fix kvmppc_msr_interrupt to preserve the MSR_S bit.
> 
> Signed-off-by: Paul Mackerras 
> Signed-off-by: Claudio Carvalho 

This should be folded into the previous patch.

Paul.

Re: [RFC PATCH v2 08/10] KVM: PPC: Ultravisor: Return to UV for hcalls from SVM

2019-05-20 Thread Paul Mackerras

On Sat, May 18, 2019 at 11:25:22AM -0300, Claudio Carvalho wrote:
> From: Sukadev Bhattiprolu 
> 
> All hcalls from a secure VM go to the ultravisor from where they are
> reflected into the HV. When we (HV) complete processing such hcalls,
> we should return to the UV rather than to the guest kernel.

This paragraph in the patch description, and the comment in
book3s_hv_rmhandlers.S, are confusing and possibly misleading in
focussing on returns from hcalls, when the change is needed for any
sort of entry to the guest from the hypervisor, whether it is a return
from an hcall, a return from a hypervisor interrupt, or the first time
that a guest vCPU is run.

This paragraph needs to explain that to enter a secure guest, we have
to go through the ultravisor, therefore we do a ucall when we are
entering a secure guest.

[snip]

> +/*
> + * The hcall we just completed was from Ultravisor. Use UV_RETURN
> + * ultra call to return to the Ultravisor. Results from the hcall
> + * are already in the appropriate registers (r3:12), except for
> + * R6,7 which we used as temporary registers above. Restore them,
> + * and set R0 to the ucall number (UV_RETURN).
> + */

This needs to say something like "We are entering a secure guest, so
we have to invoke the ultravisor to do that.  If we are returning from
a hcall, the results are already ...".

> +ret_to_ultra:
> + lwz r6, VCPU_CR(r4)
> + mtcrr6
> + LOAD_REG_IMMEDIATE(r0, UV_RETURN)
> + ld  r7, VCPU_GPR(R7)(r4)
> + ld  r6, VCPU_GPR(R6)(r4)
> + ld  r4, VCPU_GPR(R4)(r4)
> + sc  2
>  
>  /*
>   * Enter the guest on a P9 or later system where we have exactly
> -- 
> 2.20.1

Paul.

Re: [RFC PATCH v2 07/10] KVM: PPC: Ultravisor: Restrict LDBAR access

2019-05-20 Thread Paul Mackerras

On Sat, May 18, 2019 at 11:25:21AM -0300, Claudio Carvalho wrote:
> From: Ram Pai 
> 
> When the ultravisor firmware is available, it takes control over the
> LDBAR register. In this case, thread-imc updates and save/restore
> operations on the LDBAR register are handled by ultravisor.
> 
> Signed-off-by: Ram Pai 
> [Restrict LDBAR access in assembly code and some in C, update the commit
>  message]
> Signed-off-by: Claudio Carvalho 

Some of the places that you are patching below are explicitly only
executed on POWER8, which can't have an ultravisor, and therefore
the change isn't needed:

> ---
>  arch/powerpc/kvm/book3s_hv.c |  4 +-
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S  |  2 +
>  arch/powerpc/perf/imc-pmu.c  | 64 
>  arch/powerpc/platforms/powernv/idle.c|  6 +-
>  arch/powerpc/platforms/powernv/subcore-asm.S |  4 ++
>  5 files changed, 52 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 0fab0a201027..81f35f955d16 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -75,6 +75,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include "book3s.h"
>  
> @@ -3117,7 +3118,8 @@ static noinline void kvmppc_run_core(struct 
> kvmppc_vcore *vc)
>   subcore_size = MAX_SMT_THREADS / split;
>   split_info.rpr = mfspr(SPRN_RPR);
>   split_info.pmmar = mfspr(SPRN_PMMAR);
> - split_info.ldbar = mfspr(SPRN_LDBAR);
> + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
> + split_info.ldbar = mfspr(SPRN_LDBAR);

This is inside an if (is_power8) statement.

> diff --git a/arch/powerpc/platforms/powernv/subcore-asm.S 
> b/arch/powerpc/platforms/powernv/subcore-asm.S
> index 39bb24aa8f34..e4383fa5e150 100644
> --- a/arch/powerpc/platforms/powernv/subcore-asm.S
> +++ b/arch/powerpc/platforms/powernv/subcore-asm.S
> @@ -44,7 +44,9 @@ _GLOBAL(split_core_secondary_loop)
>  
>  real_mode:
>   /* Grab values from unsplit SPRs */
> +BEGIN_FW_FTR_SECTION
>   mfspr   r6,  SPRN_LDBAR
> +END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ULTRAVISOR)
>   mfspr   r7,  SPRN_PMMAR
>   mfspr   r8,  SPRN_PMCR
>   mfspr   r9,  SPRN_RPR
> @@ -77,7 +79,9 @@ real_mode:
>   mtspr   SPRN_HDEC, r4
>  
>   /* Restore SPR values now we are split */
> +BEGIN_FW_FTR_SECTION
>   mtspr   SPRN_LDBAR, r6
> +END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ULTRAVISOR)

Only POWER8 supports split-core mode, so we can only get here on
POWER8.

Paul.

[WIP RFC PATCH 6/6] fwvarfs: Add opal_secvar backend

2019-05-20 Thread Daniel Axtens

COMPILE TESTED ONLY.

mount -t fwvarfs opal_secvar /fw

Signed-off-by: Daniel Axtens 
---
 Documentation/filesystems/fwvarfs.txt |   3 +
 fs/fwvarfs/Kconfig|  11 ++
 fs/fwvarfs/Makefile   |   1 +
 fs/fwvarfs/fwvarfs.c  |   3 +
 fs/fwvarfs/fwvarfs.h  |   4 +
 fs/fwvarfs/opal_secvar.c  | 218 ++
 6 files changed, 240 insertions(+)
 create mode 100644 fs/fwvarfs/opal_secvar.c

diff --git a/Documentation/filesystems/fwvarfs.txt 
b/Documentation/filesystems/fwvarfs.txt
index 7c1e921e5c50..3ecc4b4428a5 100644
--- a/Documentation/filesystems/fwvarfs.txt
+++ b/Documentation/filesystems/fwvarfs.txt
@@ -36,6 +36,9 @@ Supported backends
backend. Read-only with no creation or deletion, but sufficient that
efivar --print --name  works the same as efivarfs.
 
+ * opal_secvar - COMPILE-TESTED ONLY implementation against PowerNV
+   OPAL Secure Variable storage.
+
 Usage
 -
 
diff --git a/fs/fwvarfs/Kconfig b/fs/fwvarfs/Kconfig
index e4474da11dbc..cb9cbc6f8fb3 100644
--- a/fs/fwvarfs/Kconfig
+++ b/fs/fwvarfs/Kconfig
@@ -34,3 +34,14 @@ config FWVAR_FS_EFI_BACKEND
  in the same way the do with efivarfs.
 
  Say N here unless you're exploring fwvarfs.
+
+config FWVAR_FS_OPAL_SECVAR_BACKEND
+   bool "OPAL Secure Variable backend"
+   depends on FWVAR_FS
+   depends on OPAL_SECVAR
+   help
+ Include a read-only, compile-tested-only, not up-to-date
+ backend for OPAL Secure Variables. This is really just
+ designed to show how the code would work, and you should
+ only select Y here if you are developing OPAL secure
+ variables.
diff --git a/fs/fwvarfs/Makefile b/fs/fwvarfs/Makefile
index 2ab9dfd650ca..8d258acdfef7 100644
--- a/fs/fwvarfs/Makefile
+++ b/fs/fwvarfs/Makefile
@@ -7,3 +7,4 @@ obj-$(CONFIG_FWVAR_FS)  += fwvarfs.o
 
 obj-$(CONFIG_FWVAR_FS_MEM_BACKEND) += mem.o
 obj-$(CONFIG_FWVAR_FS_EFI_BACKEND) += efi.o
+obj-$(CONFIG_FWVAR_FS_OPAL_SECVAR_BACKEND) += opal_secvar.o
diff --git a/fs/fwvarfs/fwvarfs.c b/fs/fwvarfs/fwvarfs.c
index 643ec6585b4d..3c93b6442e95 100644
--- a/fs/fwvarfs/fwvarfs.c
+++ b/fs/fwvarfs/fwvarfs.c
@@ -24,6 +24,9 @@ static struct fwvarfs_backend *fwvarfs_backends[] = {
 #endif
 #ifdef CONFIG_FWVAR_FS_EFI_BACKEND
_efi_backend,
+#endif
+#ifdef CONFIG_FWVAR_FS_OPAL_SECVAR_BACKEND
+   _opal_secvar_backend,
 #endif
NULL,
 };
diff --git a/fs/fwvarfs/fwvarfs.h b/fs/fwvarfs/fwvarfs.h
index 49bde268401f..5780046dafae 100644
--- a/fs/fwvarfs/fwvarfs.h
+++ b/fs/fwvarfs/fwvarfs.h
@@ -117,4 +117,8 @@ extern struct fwvarfs_backend fwvarfs_mem_backend;
 extern struct fwvarfs_backend fwvarfs_efi_backend;
 #endif
 
+#if defined(CONFIG_FWVAR_FS_OPAL_SECVAR_BACKEND)
+extern struct fwvarfs_backend fwvarfs_opal_secvar_backend;
+#endif
+
 #endif /* FWVARFS_H */
diff --git a/fs/fwvarfs/opal_secvar.c b/fs/fwvarfs/opal_secvar.c
new file mode 100644
index ..4a1749317ed9
--- /dev/null
+++ b/fs/fwvarfs/opal_secvar.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 IBM Corporation
+ * Author: Daniel Axtens
+ *
+ * Loosely based on efivarfs:
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2012 Jeremy Kerr 
+ * and drivers/firmware/efi/vars.c:
+ * Copyright (C) 2001,2003,2004 Dell 
+ * Copyright (C) 2004 Intel Corporation 
+ *
+ * We cheat by not allowing for case-insensitivity.
+ */
+
+#include 
+#include 
+#include 
+#include "fwvarfs.h"
+
+#include 
+#include 
+#include 
+#include 
+
+#define pr_fmt(fmt)"fwvarfs: opal_secvar: " fmt
+
+static LIST_HEAD(opal_secvar_file_list);
+
+struct fwvarfs_opal_secvar_file {
+   struct list_head list;
+   u16 *name;
+   guid_t vendor;
+};
+
+// stolen from efi.h
+static inline char *
+guid_to_str(guid_t *guid, char *out)
+{
+   sprintf(out, "%pUl", guid->b);
+   return out;
+}
+
+// need a forward decl to pass down to register
+struct fwvarfs_backend fwvarfs_opal_secvar_backend;
+
+
+static ssize_t fwvarfs_opal_secvar_read(void *variable, char *buf,
+   size_t count, loff_t off)
+{
+   struct fwvarfs_opal_secvar_file *file_data = variable;
+   unsigned long datasize = 0;
+   u32 attributes;
+   void *data;
+   ssize_t size = 0;
+   loff_t ppos = off;
+   int rc;
+
+   // get size
+   rc = opal_get_secure_variable(file_data->name, _data->vendor,
+ NULL, , NULL);
+   if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL)
+   return -EIO;
+
+   data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL);
+
+   if (!data)
+   return -ENOMEM;
+
+   rc = opal_get_secure_variable(file_data->name, _data->vendor,
+   , , data + sizeof(attributes));
+   if (rc != OPAL_SUCCESS) {
+   size = -EIO;
+   goto out_free;
+   }
+
+

[WIP RFC PATCH 5/6] powerpc/powernv: Remove EFI support for OPAL secure variables

2019-05-20 Thread Daniel Axtens

Replace it with a generic API.

Compile tested only.

Signed-off-by: Daniel Axtens 
---
 arch/powerpc/include/asm/opal-secvar.h   |  58 +++
 arch/powerpc/platforms/Kconfig   |   3 -
 arch/powerpc/platforms/powernv/Kconfig   |   5 +-
 arch/powerpc/platforms/powernv/opal-secvar.c | 104 ---
 4 files changed, 83 insertions(+), 87 deletions(-)
 create mode 100644 arch/powerpc/include/asm/opal-secvar.h

diff --git a/arch/powerpc/include/asm/opal-secvar.h 
b/arch/powerpc/include/asm/opal-secvar.h
new file mode 100644
index ..ba9bd52138d9
--- /dev/null
+++ b/arch/powerpc/include/asm/opal-secvar.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PowerNV code for secure variables
+ *
+ * Copyright (C) 2019 IBM Corporation
+ * Author: Claudio Carvalho 
+ * Author: Daniel Axtens 
+ *
+ */
+
+#ifndef __OPAL_SECVAR_H
+#define __OPAL_SECVAR_H
+
+#include 
+#include 
+
+#ifdef CONFIG_OPAL_SECVAR
+int opal_get_secure_variable(u16 *name, guid_t *vendor, u32 *attr,
+unsigned long *data_size, void *data);
+
+int opal_get_next_secure_variable(unsigned long *name_size, u16 *name,
+ guid_t *vendor);
+
+int opal_set_secure_variable(u16 *name, guid_t *vendor, u32 attr,
+unsigned long data_size, void *data);
+
+int opal_query_secure_variable_info(u32 attr, u64 *storage_space,
+   u64 *remaining_space,
+   u64 *max_variable_size);
+#else
+
+static inline int opal_get_secure_variable(u16 *name, guid_t *vendor,
+   u32 *attr, unsigned long *data_size, void *data)
+{
+   return OPAL_UNSUPPORTED;
+}
+
+static inline int opal_get_next_secure_variable(unsigned long *name_size,
+   u16 *name, guid_t *vendor)
+{
+   return OPAL_UNSUPPORTED;
+}
+
+static inline int opal_set_secure_variable(u16 *name, guid_t *vendor,
+   u32 attr, unsigned long data_size, void *data)
+{
+   return OPAL_UNSUPPORTED;
+}
+
+static inline int opal_query_secure_variable_info(u32 attr,
+   u64 *storage_space, u64 *remaining_space,
+   u64 *max_variable_size)
+{
+   return OPAL_UNSUPPORTED;
+}
+
+#endif
+#endif
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 8e30510bc0c1..f3fb79fccc72 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -326,7 +326,4 @@ config XILINX_PCI
bool "Xilinx PCI host bridge support"
depends on PCI && XILINX_VIRTEX
 
-config EFI
-   bool
-
 endmenu
diff --git a/arch/powerpc/platforms/powernv/Kconfig 
b/arch/powerpc/platforms/powernv/Kconfig
index 879f8e766098..a71fc5daa60a 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -52,7 +52,6 @@ config OPAL_SECVAR
bool "OPAL Secure Variables"
depends on PPC_POWERNV && !CPU_BIG_ENDIAN
select UCS2_STRING
-   select EFI
help
- This enables the kernel to access OPAL secure variables via EFI
- runtime variable services.
+ This enables the kernel to access OPAL secure variables via
+ an API.
diff --git a/arch/powerpc/platforms/powernv/opal-secvar.c 
b/arch/powerpc/platforms/powernv/opal-secvar.c
index e333828bd0bc..af753b94cceb 100644
--- a/arch/powerpc/platforms/powernv/opal-secvar.c
+++ b/arch/powerpc/platforms/powernv/opal-secvar.c
@@ -8,62 +8,20 @@
  */
 #define pr_fmt(fmt) "secvar: "fmt
 
-#include 
 #include 
 #include 
+#include 
+#include 
 
 static bool opal_secvar_supported;
 
-static efi_status_t opal_to_efi_status_log(int rc, const char *func_name)
-{
-   efi_status_t status;
-
-   switch (rc) {
-   case OPAL_EMPTY:
-   status = EFI_NOT_FOUND;
-   break;
-   case OPAL_HARDWARE:
-   status = EFI_DEVICE_ERROR;
-   break;
-   case OPAL_NO_MEM:
-   pr_err("%s: No space in the volatile storage\n", func_name);
-   status = EFI_OUT_OF_RESOURCES;
-   break;
-   case OPAL_PARAMETER:
-   status = EFI_INVALID_PARAMETER;
-   break;
-   case OPAL_PARTIAL:
-   status = EFI_BUFFER_TOO_SMALL;
-   break;
-   case OPAL_PERMISSION:
-   status = EFI_WRITE_PROTECTED;
-   break;
-   case OPAL_RESOURCE:
-   pr_err("%s: No space in the non-volatile storage\n", func_name);
-   status = EFI_OUT_OF_RESOURCES;
-   break;
-   case OPAL_SUCCESS:
-   status = EFI_SUCCESS;
-   break;
-   default:
-   pr_err("%s: Unknown OPAL error %d\n", func_name, rc);
-   status = EFI_DEVICE_ERROR;
-   break;
-   }
-
-   return status;
-}
-
-#define opal_to_efi_status(rc) opal_to_efi_status_log(rc, __func__)

[WIP RFC PATCH 4/6] powerpc/powernv: Add support for OPAL secure variables

2019-05-20 Thread Daniel Axtens

From: Claudio Carvalho 

[dja: this is a WIP version - a new version is coming that changes
the interface. I also had to renumber the opal calls to get this
to apply. Basically, this is an illustration of the concept: more
work would be required to get this to actually function.]

The X.509 certificates trusted by the platform and other information
required to secure boot the host OS kernel are wrapped in secure
variables, which are controlled by OPAL.

The OPAL secure variables can be handled through the following OPAL
calls.

OPAL_SECVAR_GET:
Returns the data for a given secure variable name and vendor GUID.

OPAL_SECVAR_GET_NEXT:
For a given secure variable, it returns the name and vendor GUID
of the next variable.

OPAL_SECVAR_ENQUEUE:
Enqueue the supplied secure variable update so that it can be processed
by OPAL in the next boot. Variable updates cannot be be processed right
away because the variable storage is write locked at runtime.

OPAL_SECVAR_INFO:
Returns size information about the variable.

This patch adds support for OPAL secure variables by setting up the EFI
runtime variable services to make OPAL calls.

This patch also introduces CONFIG_OPAL_SECVAR for enabling the OPAL
secure variables support in the kernel. Since CONFIG_OPAL_SECVAR selects
CONFIG_EFI, it also allow us to manage the OPAL secure variables from
userspace via efivarfs.

Signed-off-by: Claudio Carvalho 
Signed-off-by: Daniel Axtens 
---
 arch/powerpc/include/asm/opal-api.h  |   6 +-
 arch/powerpc/include/asm/opal.h  |  10 ++
 arch/powerpc/platforms/Kconfig   |   3 +
 arch/powerpc/platforms/powernv/Kconfig   |   9 +
 arch/powerpc/platforms/powernv/Makefile  |   1 +
 arch/powerpc/platforms/powernv/opal-call.c   |   4 +
 arch/powerpc/platforms/powernv/opal-secvar.c | 179 +++
 7 files changed, 211 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-secvar.c

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index e1577cfa7186..8054e1e983ff 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -212,7 +212,11 @@
 #define OPAL_HANDLE_HMI2   166
 #defineOPAL_NX_COPROC_INIT 167
 #define OPAL_XIVE_GET_VP_STATE 170
-#define OPAL_LAST  170
+#define OPAL_SECVAR_GET171
+#define OPAL_SECVAR_GET_NEXT   172
+#define OPAL_SECVAR_ENQUEUE173
+#define OPAL_SECVAR_INFO   174
+#define OPAL_LAST  174
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 4cc37e708bc7..4b8046caaf4f 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -394,6 +394,16 @@ void opal_powercap_init(void);
 void opal_psr_init(void);
 void opal_sensor_groups_init(void);
 
+extern int opal_secvar_get(uint64_t name, uint64_t vendor, uint64_t attr,
+  uint64_t data_size, uint64_t data);
+extern int opal_secvar_get_next(uint64_t name_size, uint64_t name,
+   uint64_t vendor);
+extern int opal_secvar_enqueue(uint64_t name, uint64_t vendor, uint64_t attr,
+  uint64_t data_size, uint64_t data);
+extern int opal_secvar_info(uint64_t attr, uint64_t storage_space,
+   uint64_t remaining_space,
+   uint64_t max_variable_size);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_OPAL_H */
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index f3fb79fccc72..8e30510bc0c1 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -326,4 +326,7 @@ config XILINX_PCI
bool "Xilinx PCI host bridge support"
depends on PCI && XILINX_VIRTEX
 
+config EFI
+   bool
+
 endmenu
diff --git a/arch/powerpc/platforms/powernv/Kconfig 
b/arch/powerpc/platforms/powernv/Kconfig
index 850eee860cf2..879f8e766098 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -47,3 +47,12 @@ config PPC_VAS
  VAS adapters are found in POWER9 based systems.
 
  If unsure, say N.
+
+config OPAL_SECVAR
+   bool "OPAL Secure Variables"
+   depends on PPC_POWERNV && !CPU_BIG_ENDIAN
+   select UCS2_STRING
+   select EFI
+   help
+ This enables the kernel to access OPAL secure variables via EFI
+ runtime variable services.
diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index da2e99efbd04..1511d836fd19 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@

[WIP RFC PATCH 3/6] fwvarfs: efi backend

2019-05-20 Thread Daniel Axtens

Add a read-only EFI backend. This does not rely on efivarfs at all
(although it does borrow heavily from the code).

It only supports reading the variables, but it supports the same
format as efivarfs so tools like efivar continue to work if you
mount this filesystem in the same place.

Two small quirks:
 - efivarfs (at least as configured on Ubuntu) allows users to
   access the files, here only root can.
 - efivarfs makes GUID comparison case-insensitive, this does not.

Signed-off-by: Daniel Axtens 
---
 Documentation/filesystems/fwvarfs.txt |   5 +
 fs/fwvarfs/Kconfig|  11 ++
 fs/fwvarfs/Makefile   |   1 +
 fs/fwvarfs/efi.c  | 177 ++
 fs/fwvarfs/fwvarfs.c  |   4 +-
 fs/fwvarfs/fwvarfs.h  |   4 +
 6 files changed, 201 insertions(+), 1 deletion(-)
 create mode 100644 fs/fwvarfs/efi.c

diff --git a/Documentation/filesystems/fwvarfs.txt 
b/Documentation/filesystems/fwvarfs.txt
index bf1bccba6ab9..7c1e921e5c50 100644
--- a/Documentation/filesystems/fwvarfs.txt
+++ b/Documentation/filesystems/fwvarfs.txt
@@ -32,6 +32,10 @@ Supported backends
operations. Files created persist across mount/unmount but as no
hardware is involved they do not persist across reboots.
 
+ * efi - a partial reimplementation of efivarfs against the fwvarfs
+   backend. Read-only with no creation or deletion, but sufficient that
+   efivar --print --name  works the same as efivarfs.
+
 Usage
 -
 
@@ -40,6 +44,7 @@ mount -t fwvarfs  
 For example:
 
 mount -t fwvarfs mem /fw/mem/
+mount -t fwvarfs efi /sys/firmware/efi/efivars
 
 API
 ---
diff --git a/fs/fwvarfs/Kconfig b/fs/fwvarfs/Kconfig
index 62a47cddd4b5..e4474da11dbc 100644
--- a/fs/fwvarfs/Kconfig
+++ b/fs/fwvarfs/Kconfig
@@ -23,3 +23,14 @@ config FWVAR_FS_MEM_BACKEND
  demonstration of fwvarfs.
 
  You can safely say N here unless you're exploring fwvarfs.
+
+config FWVAR_FS_EFI_BACKEND
+   bool "EFI backend"
+   depends on FWVAR_FS
+   help
+ Include a read-only EFI backend, largely cribbed from
+ efivarfs. This is handy for demonstrating that the same
+ userspace tools can read from EFI variables over fwvarfs
+ in the same way the do with efivarfs.
+
+ Say N here unless you're exploring fwvarfs.
diff --git a/fs/fwvarfs/Makefile b/fs/fwvarfs/Makefile
index f1585baccabe..2ab9dfd650ca 100644
--- a/fs/fwvarfs/Makefile
+++ b/fs/fwvarfs/Makefile
@@ -6,3 +6,4 @@
 obj-$(CONFIG_FWVAR_FS) += fwvarfs.o
 
 obj-$(CONFIG_FWVAR_FS_MEM_BACKEND) += mem.o
+obj-$(CONFIG_FWVAR_FS_EFI_BACKEND) += efi.o
diff --git a/fs/fwvarfs/efi.c b/fs/fwvarfs/efi.c
new file mode 100644
index ..7aa5c186d0c9
--- /dev/null
+++ b/fs/fwvarfs/efi.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 IBM Corporation
+ * Author: Daniel Axtens
+ *
+ * Based on efivarfs:
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2012 Jeremy Kerr 
+ *
+ * We cheat by not allowing for case-insensitivity.
+ */
+
+#include 
+#include 
+#include 
+#include "fwvarfs.h"
+
+#include 
+#include 
+#include 
+
+static LIST_HEAD(efivar_list);
+static LIST_HEAD(efivar_file_list);
+
+struct fwvarfs_efi_file {
+   bool is_removable;
+   struct list_head list;
+   struct efivar_entry *entry;
+};
+
+// need a forward decl to pass down to register
+struct fwvarfs_backend fwvarfs_efi_backend;
+
+
+static ssize_t fwvarfs_efi_read(void *variable, char *buf,
+   size_t count, loff_t off)
+{
+   struct fwvarfs_efi_file *file_data = variable;
+   struct efivar_entry *var = file_data->entry;
+   unsigned long datasize = 0;
+   u32 attributes;
+   void *data;
+   ssize_t size = 0;
+   loff_t ppos = off;
+   int err;
+
+   err = efivar_entry_size(var, );
+
+   /*
+* efivarfs represents uncommitted variables with
+* zero-length files. Reading them should return EOF.
+*/
+   if (err == -ENOENT)
+   return 0;
+   else if (err)
+   return err;
+
+   data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL);
+
+   if (!data)
+   return -ENOMEM;
+
+   size = efivar_entry_get(var, , ,
+   data + sizeof(attributes));
+   if (size)
+   goto out_free;
+
+   memcpy(data, , sizeof(attributes));
+   size = memory_read_from_buffer(buf, count, ,
+  data, datasize + sizeof(attributes));
+out_free:
+   kfree(data);
+
+   return size;
+}
+
+static int fwvarfs_efi_callback(efi_char16_t *name16, efi_guid_t vendor,
+   unsigned long name_size, void *data)
+{
+   struct efivar_entry *entry;
+   struct fwvarfs_efi_file *file_data;
+   unsigned long size = 0;
+   char *name;
+   int len;
+   int err = -ENOMEM;
+
+

[WIP RFC PATCH 2/6] fwvarfs: a generic firmware variable filesystem

2019-05-20 Thread Daniel Axtens

Sometimes it is helpful to be able to access firmware variables as
file, like efivarfs, but not all firmware is EFI. Create a framework
that allows generic access to firmware variables exposed by a
implementations of a simple backend API.

Add a demonstration memory-based backend.

Signed-off-by: Daniel Axtens 
---
 Documentation/filesystems/fwvarfs.txt | 146 +
 fs/Kconfig|   1 +
 fs/Makefile   |   1 +
 fs/fwvarfs/Kconfig|  25 +++
 fs/fwvarfs/Makefile   |   8 +
 fs/fwvarfs/fwvarfs.c  | 289 ++
 fs/fwvarfs/fwvarfs.h  | 116 +++
 fs/fwvarfs/mem.c  | 113 ++
 fs/kernfs/dir.c   |   1 -
 include/uapi/linux/magic.h|   1 +
 10 files changed, 700 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/filesystems/fwvarfs.txt
 create mode 100644 fs/fwvarfs/Kconfig
 create mode 100644 fs/fwvarfs/Makefile
 create mode 100644 fs/fwvarfs/fwvarfs.c
 create mode 100644 fs/fwvarfs/fwvarfs.h
 create mode 100644 fs/fwvarfs/mem.c

diff --git a/Documentation/filesystems/fwvarfs.txt 
b/Documentation/filesystems/fwvarfs.txt
new file mode 100644
index ..bf1bccba6ab9
--- /dev/null
+++ b/Documentation/filesystems/fwvarfs.txt
@@ -0,0 +1,146 @@
+fwvarfs
+===
+
+fwvarfs is a generic firmware variable file-system. A platform
+provides a backend implementing a few very simple callbacks, and
+fwvarfs handles all the details required to present the variables as a
+filesystem.
+
+The minimum functionality for a backend is the ability to enumerate
+existing variables. Backends can optionally also allow:
+ - reading of variables
+ - writing of variables
+ - creation of variables
+ - deletion of variables
+
+Key assumptions
+---
+
+ * Variables for each backend live in a single, flat directory -
+   there's no concept of subdirectories.
+
+ * Files are created with mode 600 if the backend provides a write
+   hook, and 400 otherwise, and are owned by root:root.
+
+ * The set of variables stored can be determined at boot time, and
+   nothing outside of fwvarfs creates new variables after boot.
+
+Supported backends
+--
+
+ * mem - a memory-backed example filesystem supporting all
+   operations. Files created persist across mount/unmount but as no
+   hardware is involved they do not persist across reboots.
+
+Usage
+-
+
+mount -t fwvarfs  
+
+For example:
+
+mount -t fwvarfs mem /fw/mem/
+
+API
+---
+
+A backend is installed by creating a fwvarfs_backend struct,
+containing the name of the backend and various callbacks. The backend
+must be registered by adding it to the list at the top of
+fs/fwvarfs/fwvarfs.c
+
+The fwvarfs infrastructure provides the following function to backends:
+
+  int fwvarfs_register_var(struct fwvarfs_backend *backend, const char *name, 
void *variable, size_t size);
+
+  Register a variable with fwvarfs to allow it to be seen by users.
+
+  backend: the backend to which to add this variable
+
+  name: the name of file representing the variable. Must be a valid
+filename, so no nulls or slashes.
+
+  variable: data private to the backend representing the variable -
+will be passed back to every callback
+
+  size: the initial size of the variable
+
+
+The backend must then provide the following functions:
+
+int (*enumerate)(void);
+
+   Mandatory and called at init time, a backend must call
+   fwvarfs_register_var for all variables it wants to expose to
+   the user.
+
+void (*destroy)(void *variable);
+
+   Mandatory if you provide a create or unlink hook, and may
+   become mandatory in the future for cleanup.
+
+   Free backend data associated with variable. It will not be
+   referenced after this point by fwvarfs.
+
+
+ssize_t (*read)(void *variable, char *dest, size_t bytes, loff_t off);
+
+   Read from variable into the a kernel buffer. Similar semantics
+   to a usual read operation, except that off is not a pointer
+   (unlike the usual ppos).
+
+   variable: the variable to read
+   dest: kernel buffer to read into
+   bytes: maximum number of bytes to read
+   off: offset to read from
+
+   Returns the number of bytes read or an error.
+   If this hook is not provided, all reads will fail with -EPERM.
+
+ssize_t (*write)(void *variable, const char *src, size_t bytes, loff_t 
off);
+
+   Write into the variable from the given kernel buffer.
+
+   variable: the variable to write
+   src: kernel buffer with contents
+   bytes: write at most this many bytes
+   off: offset into the file to write at.
+
+   Returns the number of bytes written or an error.
+   If this hook is not provided, all writes will fail with -EPERM.
+
+
+void* (*create)(const char *name);
+
+   Create a variable with the

[WIP RFC PATCH 1/6] kernfs: add create() and unlink() hooks

2019-05-20 Thread Daniel Axtens

I'm building a generic firmware variable filesystem on top of kernfs
and I'd like to be able to create and unlink files.

The hooks are fairly straightforward. create() returns a kernfs_node*,
which is safe with regards to cleanup on error paths, because there
is no way that things can fail after that point in the current
implementation. However, currently O_EXCL is not implemented and
that may create failure paths, in which case we may need to revisit
this later.

Signed-off-by: Daniel Axtens 
---
 fs/kernfs/dir.c| 55 ++
 include/linux/kernfs.h |  3 +++
 2 files changed, 58 insertions(+)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 016ba88f7335..74fe51dbd027 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1175,6 +1175,59 @@ static int kernfs_iop_rename(struct inode *old_dir, 
struct dentry *old_dentry,
return ret;
 }
 
+static int kernfs_iop_create(struct inode *dir, struct dentry *dentry,
+umode_t mode, bool excl)
+{
+   struct kernfs_node *parent = dir->i_private;
+   struct kernfs_node *kn;
+   struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
+
+   if (!scops || !scops->create)
+   return -EPERM;
+
+   if (!kernfs_get_active(parent))
+   return -ENODEV;
+
+   // TODO: add some locking to ensure that scops->create
+   // is called only once, and possibly to handle the O_EXCL case
+   WARN_ONCE(excl, "excl unimplemented");
+
+   kn = scops->create(parent, dentry->d_name.name, mode);
+
+   if (!kn)
+   return -EPERM;
+
+   if (IS_ERR(kn))
+   return PTR_ERR(kn);
+
+   d_instantiate(dentry, kernfs_get_inode(dir->i_sb, kn));
+
+   return 0;
+}
+
+static int kernfs_iop_unlink(struct inode *dir, struct dentry *dentry)
+{
+   struct kernfs_node *parent = dir->i_private;
+   struct kernfs_node *kn = d_inode(dentry)->i_private;
+   struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
+   int ret;
+
+
+   if (!scops || !scops->unlink)
+   return -EPERM;
+
+   if (!kernfs_get_active(parent))
+   return -ENODEV;
+
+   ret = scops->unlink(kn);
+   if (ret)
+   return ret;
+
+   drop_nlink(d_inode(dentry));
+   dput(dentry);
+   return 0;
+};
+
 const struct inode_operations kernfs_dir_iops = {
.lookup = kernfs_iop_lookup,
.permission = kernfs_iop_permission,
@@ -1185,6 +1238,8 @@ const struct inode_operations kernfs_dir_iops = {
.mkdir  = kernfs_iop_mkdir,
.rmdir  = kernfs_iop_rmdir,
.rename = kernfs_iop_rename,
+   .create = kernfs_iop_create,
+   .unlink = kernfs_iop_unlink,
 };
 
 static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 2bf477f86eb1..282b96acbd7e 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -179,6 +179,9 @@ struct kernfs_syscall_ops {
  const char *new_name);
int (*show_path)(struct seq_file *sf, struct kernfs_node *kn,
 struct kernfs_root *root);
+   struct kernfs_node* (*create)(struct kernfs_node *parent,
+ const char *name, umode_t mode);
+   int (*unlink)(struct kernfs_node *kn);
 };
 
 struct kernfs_root {
-- 
2.19.1

[WIP RFC PATCH 0/6] Generic Firmware Variable Filesystem

2019-05-20 Thread Daniel Axtens

Hi all,

As PowerNV moves towards secure boot, we need a place to put secure
variables. One option that has been canvassed is to make our secure
variables look like EFI variables. This is an early sketch of another
approach where we create a generic firmware variable file system,
fwvarfs, and an OPAL Secure Variable backend for it.

In short, platforms provide a simple backend that can interface with
the hardware, and fwvarfs deals with translating that into a
filesystem that you can use. Almost all of the hard work is done by
kernfs: fwvarfs provides a pretty thin layer on top of that to make
backends a simple as possible.

Behaviour and the API is documented in Documentation/filesystems/fwvarfs.txt

To demonstrate the concept, a fully functional memory-based backend is
provided, and a read-only but userspace-compatible EFI backend.

For OPAL secure variables, I have taken Claudio's commit, tweaked it
to apply to linux-next, replaced all the EFI support with a generic
API, and then written a backend against that. There's a coming version
from Claudio that moves the opal calls towards a simple key/value
interface rather than (name, vendor) pairs - I haven't waited for
that: this is really just to demonstrate that it could be done rather
than an attempt to get mergable code.  It is also compile tested only
as I haven't yet set myself up with a test machine.

The patches are a bit rough, and there are a number of outstanding
TODOs sprinkled in everywhere. The idea is just to do a proof of
concept to inform our discussions:

 - Is this the sort of approach you'd like (generic vs specific)?
 
 - Does the backend API make sense?
 
 - Is the use of kernfs the correct decision, or is it potentially too
   limiting? (e.g. no ability to do case-insensitivity like efivarfs)

 - Is assuming flat fwvars correct or is there a firmware with a
   hierarchical structure?

Regards,
Daniel

Claudio Carvalho (1):
  powerpc/powernv: Add support for OPAL secure variables

Daniel Axtens (5):
  kernfs: add create() and unlink() hooks
  fwvarfs: a generic firmware variable filesystem
  fwvarfs: efi backend
  powerpc/powernv: Remove EFI support for OPAL secure variables
  fwvarfs: Add opal_secvar backend

 Documentation/filesystems/fwvarfs.txt| 154 ++
 arch/powerpc/include/asm/opal-api.h  |   6 +-
 arch/powerpc/include/asm/opal-secvar.h   |  58 
 arch/powerpc/include/asm/opal.h  |  10 +
 arch/powerpc/platforms/powernv/Kconfig   |   8 +
 arch/powerpc/platforms/powernv/Makefile  |   1 +
 arch/powerpc/platforms/powernv/opal-call.c   |   4 +
 arch/powerpc/platforms/powernv/opal-secvar.c | 121 
 fs/Kconfig   |   1 +
 fs/Makefile  |   1 +
 fs/fwvarfs/Kconfig   |  47 +++
 fs/fwvarfs/Makefile  |  10 +
 fs/fwvarfs/efi.c | 177 +++
 fs/fwvarfs/fwvarfs.c | 294 +++
 fs/fwvarfs/fwvarfs.h | 124 
 fs/fwvarfs/mem.c | 113 +++
 fs/fwvarfs/opal_secvar.c | 218 ++
 fs/kernfs/dir.c  |  54 
 include/linux/kernfs.h   |   3 +
 include/uapi/linux/magic.h   |   1 +
 20 files changed, 1404 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/filesystems/fwvarfs.txt
 create mode 100644 arch/powerpc/include/asm/opal-secvar.h
 create mode 100644 arch/powerpc/platforms/powernv/opal-secvar.c
 create mode 100644 fs/fwvarfs/Kconfig
 create mode 100644 fs/fwvarfs/Makefile
 create mode 100644 fs/fwvarfs/efi.c
 create mode 100644 fs/fwvarfs/fwvarfs.c
 create mode 100644 fs/fwvarfs/fwvarfs.h
 create mode 100644 fs/fwvarfs/mem.c
 create mode 100644 fs/fwvarfs/opal_secvar.c

-- 
2.19.1

Re: [PATCH] mm: add account_locked_vm utility function

2019-05-20 Thread Alexey Kardashevskiy




On 04/05/2019 06:16, Daniel Jordan wrote:
> locked_vm accounting is done roughly the same way in five places, so
> unify them in a helper.  Standardize the debug prints, which vary
> slightly.

And I rather liked that prints were different and tell precisely which
one of three each printk is.

I commented below but in general this seems working.

Tested-by: Alexey Kardashevskiy 




>  Error codes stay the same, so user-visible behavior does too.
> 
> Signed-off-by: Daniel Jordan 
> Cc: Alan Tull 
> Cc: Alexey Kardashevskiy 
> Cc: Alex Williamson 
> Cc: Andrew Morton 
> Cc: Benjamin Herrenschmidt 
> Cc: Christoph Lameter 
> Cc: Christophe Leroy 
> Cc: Davidlohr Bueso 
> Cc: Jason Gunthorpe 
> Cc: Mark Rutland 
> Cc: Michael Ellerman 
> Cc: Moritz Fischer 
> Cc: Paul Mackerras 
> Cc: Steve Sistare 
> Cc: Wu Hao 
> Cc: linux...@kvack.org
> Cc: k...@vger.kernel.org
> Cc: kvm-...@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: linux-f...@vger.kernel.org
> Cc: linux-ker...@vger.kernel.org
> ---
> 
> Based on v5.1-rc7.  Tested with the vfio type1 driver.  Any feedback
> welcome.
> 
> Andrew, this one patch replaces these six from [1]:
> 
> mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t.patch
> vfio-type1-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
> vfio-spapr_tce-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
> fpga-dlf-afu-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
> kvm-book3s-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
> powerpc-mmu-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
> 
> That series converts locked_vm to an atomic, but on closer inspection causes 
> at
> least one accounting race in mremap, and fixing it just for this type
> conversion came with too much ugly in the core mm to justify, especially when
> the right long-term fix is making these drivers use pinned_vm instead.
> 
> Christophe's suggestion of cmpxchg[2] does prevent the races he
> mentioned for pinned_vm, but others would still remain.  In perf_mmap
> and the hfi1 driver, pinned_vm is checked against the rlimit racily and
> then later increased when the pinned_vm originally read may have gone
> stale.  Any fixes for that, that I could think of, seem about as good as
> what's there now, so I left it.  I have a patch that uses cmpxchg with
> pinned_vm if others feel strongly that the aforementioned races should
> be fixed.
> 
> Daniel
> 
> [1] 
> https://lore.kernel.org/linux-mm/20190402204158.27582-1-daniel.m.jor...@oracle.com/
> [2] 
> https://lore.kernel.org/linux-mm/964bd5b0-f1e5-7bf0-5c58-18e75c550...@c-s.fr/
> 
>  arch/powerpc/kvm/book3s_64_vio.c| 44 +++-
>  arch/powerpc/mm/mmu_context_iommu.c | 41 +++---
>  drivers/fpga/dfl-afu-dma-region.c   | 53 +++--
>  drivers/vfio/vfio_iommu_spapr_tce.c | 52 +---
>  drivers/vfio/vfio_iommu_type1.c | 23 -
>  include/linux/mm.h  | 19 +++
>  mm/util.c   | 48 ++
>  7 files changed, 94 insertions(+), 186 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c 
> b/arch/powerpc/kvm/book3s_64_vio.c
> index f02b04973710..f7d37fa6003a 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -30,6 +30,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -56,43 +57,6 @@ static unsigned long kvmppc_stt_pages(unsigned long 
> tce_pages)
>   return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
>  }
>  
> -static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
> -{
> - long ret = 0;
> -
> - if (!current || !current->mm)
> - return ret; /* process exited */
> -
> - down_write(>mm->mmap_sem);
> -
> - if (inc) {
> - unsigned long locked, lock_limit;
> -
> - locked = current->mm->locked_vm + stt_pages;
> - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> - if (locked > lock_limit && !capable(CAP_IPC_LOCK))
> - ret = -ENOMEM;
> - else
> - current->mm->locked_vm += stt_pages;
> - } else {
> - if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
> - stt_pages = current->mm->locked_vm;
> -
> - current->mm->locked_vm -= stt_pages;
> - }
> -
> - pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
> - inc ? '+' : '-',
> - stt_pages << PAGE_SHIFT,
> - current->mm->locked_vm << PAGE_SHIFT,
> - rlimit(RLIMIT_MEMLOCK),
> - ret ? " - exceeded" : "");
> -
> - up_write(>mm->mmap_sem);
> -
> - return ret;
> -}
> -
>  static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
>  {
>   struct kvmppc_spapr_tce_iommu_table *stit = container_of(head,
> @@

[PATCH 5/5] asm-generic: remove ptrace.h

2019-05-20 Thread Christoph Hellwig

No one is using this header anymore.

Signed-off-by: Christoph Hellwig 
Acked-by: Arnd Bergmann 
---
 MAINTAINERS|  1 -
 arch/mips/include/asm/ptrace.h |  5 ---
 include/asm-generic/ptrace.h   | 74 --
 3 files changed, 80 deletions(-)
 delete mode 100644 include/asm-generic/ptrace.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 5cfbea4ce575..f3392488ffaf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12769,7 +12769,6 @@ F:  include/linux/regset.h
 F: include/linux/tracehook.h
 F: include/uapi/linux/ptrace.h
 F: include/uapi/linux/ptrace.h
-F: include/asm-generic/ptrace.h
 F: kernel/ptrace.c
 F: arch/*/ptrace*.c
 F: arch/*/*/ptrace*.c
diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h
index b6578611dddb..1e76774b36dd 100644
--- a/arch/mips/include/asm/ptrace.h
+++ b/arch/mips/include/asm/ptrace.h
@@ -56,11 +56,6 @@ static inline unsigned long kernel_stack_pointer(struct 
pt_regs *regs)
return regs->regs[31];
 }
 
-/*
- * Don't use asm-generic/ptrace.h it defines FP accessors that don't make
- * sense on MIPS.  We rather want an error if they get invoked.
- */
-
 static inline void instruction_pointer_set(struct pt_regs *regs,
unsigned long val)
 {
diff --git a/include/asm-generic/ptrace.h b/include/asm-generic/ptrace.h
deleted file mode 100644
index 82e674f6b337..
--- a/include/asm-generic/ptrace.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Common low level (register) ptrace helpers
- *
- * Copyright 2004-2011 Analog Devices Inc.
- *
- * Licensed under the GPL-2 or later.
- */
-
-#ifndef __ASM_GENERIC_PTRACE_H__
-#define __ASM_GENERIC_PTRACE_H__
-
-#ifndef __ASSEMBLY__
-
-/* Helpers for working with the instruction pointer */
-#ifndef GET_IP
-#define GET_IP(regs) ((regs)->pc)
-#endif
-#ifndef SET_IP
-#define SET_IP(regs, val) (GET_IP(regs) = (val))
-#endif
-
-static inline unsigned long instruction_pointer(struct pt_regs *regs)
-{
-   return GET_IP(regs);
-}
-static inline void instruction_pointer_set(struct pt_regs *regs,
-   unsigned long val)
-{
-   SET_IP(regs, val);
-}
-
-#ifndef profile_pc
-#define profile_pc(regs) instruction_pointer(regs)
-#endif
-
-/* Helpers for working with the user stack pointer */
-#ifndef GET_USP
-#define GET_USP(regs) ((regs)->usp)
-#endif
-#ifndef SET_USP
-#define SET_USP(regs, val) (GET_USP(regs) = (val))
-#endif
-
-static inline unsigned long user_stack_pointer(struct pt_regs *regs)
-{
-   return GET_USP(regs);
-}
-static inline void user_stack_pointer_set(struct pt_regs *regs,
-  unsigned long val)
-{
-   SET_USP(regs, val);
-}
-
-/* Helpers for working with the frame pointer */
-#ifndef GET_FP
-#define GET_FP(regs) ((regs)->fp)
-#endif
-#ifndef SET_FP
-#define SET_FP(regs, val) (GET_FP(regs) = (val))
-#endif
-
-static inline unsigned long frame_pointer(struct pt_regs *regs)
-{
-   return GET_FP(regs);
-}
-static inline void frame_pointer_set(struct pt_regs *regs,
- unsigned long val)
-{
-   SET_FP(regs, val);
-}
-
-#endif /* __ASSEMBLY__ */
-
-#endif
-- 
2.20.1

[PATCH 4/5] x86: don't use asm-generic/ptrace.h

2019-05-20 Thread Christoph Hellwig

Doing the indirection through macros for the regs accessors just
makes them harder to read, so implement the helpers directly.

Note that only the helpers actually used are implemented now.

Signed-off-by: Christoph Hellwig 
Acked-by: Ingo Molnar 
---
 arch/x86/include/asm/ptrace.h | 30 +-
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 8a7fc0cca2d1..e22816e865ca 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -98,7 +98,6 @@ struct cpuinfo_x86;
 struct task_struct;
 
 extern unsigned long profile_pc(struct pt_regs *regs);
-#define profile_pc profile_pc
 
 extern unsigned long
 convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
@@ -175,11 +174,32 @@ static inline unsigned long kernel_stack_pointer(struct 
pt_regs *regs)
 }
 #endif
 
-#define GET_IP(regs) ((regs)->ip)
-#define GET_FP(regs) ((regs)->bp)
-#define GET_USP(regs) ((regs)->sp)
+static inline unsigned long instruction_pointer(struct pt_regs *regs)
+{
+   return regs->ip;
+}
+
+static inline void instruction_pointer_set(struct pt_regs *regs,
+   unsigned long val)
+{
+   regs->ip = val;
+}
+
+static inline unsigned long frame_pointer(struct pt_regs *regs)
+{
+   return regs->bp;
+}
 
-#include 
+static inline unsigned long user_stack_pointer(struct pt_regs *regs)
+{
+   return regs->sp;
+}
+
+static inline void user_stack_pointer_set(struct pt_regs *regs,
+   unsigned long val)
+{
+   regs->sp = val;
+}
 
 /* Query offset/name of register from its name/offset */
 extern int regs_query_register_offset(const char *name);
-- 
2.20.1

1 2 >

1 - 100 of 105 matches

Mail list logo