[PATCH v4 2/2] powerpc/powernv: Re-enable imc trace-mode in kernel

2020-03-12 Thread Anju T Sudhakar
commit 249fad734a25 ("powerpc/perf: Disable trace_imc pmu")
disables IMC (In-Memory Collection) trace-mode in the kernel, since frequent
mode switching between accumulation mode and trace mode via the SPR LDBAR
in the hardware can trigger a checkstop (system crash).

Patch to re-enable imc-trace mode in kernel.

The previous patch (1/2) in this series addresses the mode switching issue
by implementing a global lock, which restricts the system to running either
accumulation-mode or trace-mode events at any given time.

Signed-off-by: Anju T Sudhakar 
---
 arch/powerpc/platforms/powernv/opal-imc.c | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 968b9a4d1cd9..7824cc364bc4 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -268,14 +268,7 @@ static int opal_imc_counters_probe(struct platform_device 
*pdev)
domain = IMC_DOMAIN_THREAD;
break;
case IMC_TYPE_TRACE:
-   /*
-* FIXME. Using trace_imc events to monitor application
-* or KVM thread performance can cause a checkstop
-* (system crash).
-* Disable it for now.
-*/
-   pr_info_once("IMC: disabling trace_imc PMU\n");
-   domain = -1;
+   domain = IMC_DOMAIN_TRACE;
break;
default:
pr_warn("IMC Unknown Device type \n");
-- 
2.20.1



[PATCH v4 1/2] powerpc/perf: Implement a global lock to avoid races between trace, core and thread imc events.

2020-03-12 Thread Anju T Sudhakar
IMC(In-memory Collection Counters) does performance monitoring in
two different modes, i.e accumulation mode(core-imc and thread-imc events),
and trace mode(trace-imc events). A cpu thread can either be in
accumulation-mode or trace-mode at a time and this is done via the LDBAR
register in POWER architecture. The current design does not address the
races between thread-imc and trace-imc events.

Patch implements a global id and lock to avoid the races between
core, trace and thread imc events. With this global id-lock
implementation, the system can run only one of core, thread or trace imc
events at a time. i.e. for any core-imc events to run, no thread/trace imc
events may be enabled/monitored concurrently.

Signed-off-by: Anju T Sudhakar 
---
Changes from v3->v4:

- Added mutex lock for thread, core and trace imc cpu offline path.

Changes from v2->v3:

- Addressed the off-line comments from Michael Ellerman
- Optimized the *_event_init code path for trace, core and thread imc
- Handled the global refc in cpuhotplug scenario
- Re-order the patch series
- Removed the selftest patches and will send as a follow up patch

Changes from v1 -> v2:

- Added self test patches to the series.

---
 arch/powerpc/perf/imc-pmu.c | 173 +++-
 1 file changed, 149 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..eb82dda884e5 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem);
 static struct imc_pmu_ref *trace_imc_refc;
 static int trace_imc_mem_size;
 
+/*
+ * Global data structure used to avoid races between thread,
+ * core and trace-imc
+ */
+static struct imc_pmu_ref imc_global_refc = {
+   .lock = __MUTEX_INITIALIZER(imc_global_refc.lock),
+   .id = 0,
+   .refc = 0,
+};
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
return container_of(event->pmu, struct imc_pmu, pmu);
@@ -698,6 +708,16 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
return -EINVAL;
 
ref->refc = 0;
+   /*
+* Reduce the global reference count, if this is the
+* last cpu in this core and core-imc event running
+* in this cpu.
+*/
+   mutex_lock(_global_refc.lock);
+   if (imc_global_refc.id == IMC_DOMAIN_CORE)
+   imc_global_refc.refc--;
+
+   mutex_unlock(_global_refc.lock);
}
return 0;
 }
@@ -710,6 +730,23 @@ static int core_imc_pmu_cpumask_init(void)
 ppc_core_imc_cpu_offline);
 }
 
+static void reset_global_refc(struct perf_event *event)
+{
+   mutex_lock(_global_refc.lock);
+   imc_global_refc.refc--;
+
+   /*
+* If no other thread is running any
+* event for this domain(thread/core/trace),
+* set the global id to zero.
+*/
+   if (imc_global_refc.refc <= 0) {
+   imc_global_refc.refc = 0;
+   imc_global_refc.id = 0;
+   }
+   mutex_unlock(_global_refc.lock);
+}
+
 static void core_imc_counters_release(struct perf_event *event)
 {
int rc, core_id;
@@ -759,6 +796,8 @@ static void core_imc_counters_release(struct perf_event 
*event)
ref->refc = 0;
}
mutex_unlock(>lock);
+
+   reset_global_refc(event);
 }
 
 static int core_imc_event_init(struct perf_event *event)
@@ -819,6 +858,29 @@ static int core_imc_event_init(struct perf_event *event)
++ref->refc;
mutex_unlock(>lock);
 
+   /*
+* Since the system can run either in accumulation or trace-mode
+* of IMC at a time, core-imc events are allowed only if no other
+* trace/thread imc events are enabled/monitored.
+*
+* Take the global lock, and check the refc.id
+* to know whether any other trace/thread imc
+* events are running.
+*/
+   mutex_lock(_global_refc.lock);
+   if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) {
+   /*
+* No other trace/thread imc events are running in
+* the system, so set the refc.id to core-imc.
+*/
+   imc_global_refc.id = IMC_DOMAIN_CORE;
+   imc_global_refc.refc++;
+   } else {
+   mutex_unlock(_global_refc.lock);
+   return -EBUSY;
+   }
+   mutex_unlock(_global_refc.lock);
+
event->hw.event_base = (u64)pcmi->vbase + (config & 
IMC_EVENT_OFFSET_MASK);
event->destroy = core_imc_counters_release;
return 0;
@@ -877,7 +939,23 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu)
 
 static int ppc_thread_imc_cpu_offline(unsigned int cpu)
 {
-   

Re: [PATCH] powerpc/32: Fix missing NULL pmd check in virt_to_kpte()

2020-03-12 Thread Nathan Chancellor
On Sat, Mar 07, 2020 at 10:09:15AM +, Christophe Leroy wrote:
> Commit 2efc7c085f05 ("powerpc/32: drop get_pteptr()"),
> replaced get_pteptr() by virt_to_kpte(). But virt_to_kpte() lacks a
> NULL pmd check and returns an invalid non NULL pointer when there
> is no page table.
> 
> Reported-by: Nick Desaulniers 
> Fixes: 2efc7c085f05 ("powerpc/32: drop get_pteptr()")
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/include/asm/pgtable.h | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/pgtable.h 
> b/arch/powerpc/include/asm/pgtable.h
> index b80bfd41828d..b1f1d5339735 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -54,7 +54,9 @@ static inline pmd_t *pmd_ptr_k(unsigned long va)
>  
>  static inline pte_t *virt_to_kpte(unsigned long vaddr)
>  {
> - return pte_offset_kernel(pmd_ptr_k(vaddr), vaddr);
> + pmd_t *pmd = pmd_ptr_k(vaddr);
> +
> + return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr);
>  }
>  #endif
>  
> -- 
> 2.25.0
> 

With QEMU 4.2.0, I can confirm this fixes the panic:

Tested-by: Nathan Chancellor 


Re: [PATCHv3 2/2] pseries/scm: buffer pmem's bound addr in dt for kexec kernel

2020-03-12 Thread Oliver O'Halloran
On Wed, Mar 4, 2020 at 7:50 PM Pingfan Liu  wrote:
>
> At present, plpar_hcall(H_SCM_BIND_MEM, ...) takes a very long time, so
> if dumping to fsdax, it will take a very long time.
>
> Take a closer look, during the papr_scm initialization, the only
> configuration is through drc_pmem_bind()-> plpar_hcall(H_SCM_BIND_MEM,
> ...), which helps to set up the bound address.
>
> On pseries, for kexec -l/-p kernel, there is no reset of hardware, and this
> step can be stepped around to save times.  So the pmem bound address can be
> passed to the 2nd kernel through a dynamic added property "bound-addr" in
> dt node 'ibm,pmemory'.
>
> Signed-off-by: Pingfan Liu 
> To: linuxppc-dev@lists.ozlabs.org
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: Hari Bathini 
> Cc: Aneesh Kumar K.V 
> Cc: Oliver O'Halloran 
> Cc: Dan Williams 
> Cc: Andrew Donnellan 
> Cc: Christophe Leroy 
> Cc: Rob Herring 
> Cc: Frank Rowand 
> Cc: ke...@lists.infradead.org
> ---
> note: This patch has not been tested since I can not get such a pseries with 
> pmem.
>   Please kindly to give some suggestion, thanks.

There was some qemu patches to implement the Hcall interface floating
around a while ago. I'm not sure they ever made it into upstream qemu
though.

> ---
>  arch/powerpc/platforms/pseries/of_helpers.c |  1 +
>  arch/powerpc/platforms/pseries/papr_scm.c   | 33 
> -
>  drivers/of/base.c   |  1 +
>  3 files changed, 25 insertions(+), 10 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/of_helpers.c 
> b/arch/powerpc/platforms/pseries/of_helpers.c
> index 1022e0f..2c7bab4 100644
> --- a/arch/powerpc/platforms/pseries/of_helpers.c
> +++ b/arch/powerpc/platforms/pseries/of_helpers.c
> @@ -34,6 +34,7 @@ struct property *new_property(const char *name, const int 
> length,
> kfree(new);
> return NULL;
>  }
> +EXPORT_SYMBOL(new_property);
>
>  /**
>   * pseries_of_derive_parent - basically like dirname(1)
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index 0b4467e..54ae903 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -14,6 +14,7 @@
>  #include 
>
>  #include 
> +#include "of_helpers.h"
>
>  #define BIND_ANY_ADDR (~0ul)
>
> @@ -383,7 +384,7 @@ static int papr_scm_probe(struct platform_device *pdev)
>  {
> struct device_node *dn = pdev->dev.of_node;
> u32 drc_index, metadata_size;
> -   u64 blocks, block_size;
> +   u64 blocks, block_size, bound_addr = 0;
> struct papr_scm_priv *p;
> const char *uuid_str;
> u64 uuid[2];
> @@ -440,17 +441,29 @@ static int papr_scm_probe(struct platform_device *pdev)
> p->metadata_size = metadata_size;
> p->pdev = pdev;
>
> -   /* request the hypervisor to bind this region to somewhere in memory 
> */
> -   rc = drc_pmem_bind(p);
> +   of_property_read_u64(dn, "bound-addr", _addr);
> +   if (bound_addr) {
> +   p->bound_addr = bound_addr;
> +   } else {
> +   struct property *property;
> +   u64 big;
>
> -   /* If phyp says drc memory still bound then force unbound and retry */
> -   if (rc == H_OVERLAP)
> -   rc = drc_pmem_query_n_bind(p);
> +   /* request the hypervisor to bind this region to somewhere in 
> memory */
> +   rc = drc_pmem_bind(p);
>
> -   if (rc != H_SUCCESS) {
> -   dev_err(>pdev->dev, "bind err: %d\n", rc);
> -   rc = -ENXIO;
> -   goto err;
> +   /* If phyp says drc memory still bound then force unbound and 
> retry */
> +   if (rc == H_OVERLAP)
> +   rc = drc_pmem_query_n_bind(p);
> +
> +   if (rc != H_SUCCESS) {
> +   dev_err(>pdev->dev, "bind err: %d\n", rc);
> +   rc = -ENXIO;
> +   goto err;
> +   }
> +   big = cpu_to_be64(p->bound_addr);
> +   property = new_property("bound-addr", sizeof(u64), (const 
> unsigned char *),
> +   NULL);

That should probably be "linux,bound-addr"

The other thing that stands out to me is that you aren't removing the
property when the region is unbound. As a general rule I'd prefer we
didn't hack the DT at runtime, but if we are going to then we should
make sure we're not putting anything wrong in there.

> +   of_add_property(dn, property);
> }
>
> /* setup the resource for the newly bound range */
> diff --git a/drivers/of/base.c b/drivers/of/base.c
> index ae03b12..602d2a9 100644
> --- a/drivers/of/base.c
> +++ b/drivers/of/base.c
> @@ -1817,6 +1817,7 @@ int of_add_property(struct device_node *np, struct 
> property *prop)
>
> return rc;
>  }
> +EXPORT_SYMBOL_GPL(of_add_property);
>
>  int 

Re: [PATCH] powerpc/pseries: fix of_read_drc_info_cell() to point at next record

2020-03-12 Thread Michael Ellerman
Nathan Lynch  writes:
> Michael Ellerman  writes:
>>
>> It would also be *really* nice if we had some unit tests for this
>> parsing code, it's demonstrably very bug prone.
>
> Can you say more about what form unit tests could take? Like some self
> tests that run at boot time, or is there a way to run the code in a user
> space test harness?

It depends.

A userspace selftest is ideal as it's the easiest option for people to
run, ie. they just have to build a user program. There's also CI systems
setup to run the kernel selftests automatically.

See eg. tools/testing/selftests/powerpc/vphn/vphn.c

We also have boot time tests, they are good for code that we want to
test in the kernel proper, or that are hard to extract into a userspace
program. Most of those are configurable, so testing them requires
someone to enable the appropriate CONFIG_FOO and build & boot that
kernel. That reduces coverage obviously.

See eg. arch/powerpc/lib/code-patching.c

These days there's also kunit, which is a "proper" way to do kernel unit
tests. They get built into the kernel but then there's some support for
running those like a user program using UML on x86. On powerpc we'd have
to boot the kunit enabled kernel on hardware or in qemu to exercise the
tests. So they're essentially just boot time tests but with some nicer
infrastructure vs doing it all by hand like we used to.

In this case the actual function we want to test could be trivially
built into a userspace program, but the underlying device tree helpers
probably can't be.

eg. drivers/of/property.c is quite large and contains quite a few
things, we'd need to shim quite a bit to get that building in userspace
I suspect, which then becomes a maintenance burden.

So this is probably a good candidate for a kunit test, though I haven't
personally written/converted any tests to kunit so I can't say exactly
how easy that is.

cheers


Re: [PATCH] powerpc/pseries: fix of_read_drc_info_cell() to point at next record

2020-03-12 Thread Michael Ellerman
Tyrel Datwyler  writes:
> On 3/11/20 10:43 PM, Michael Ellerman wrote:
>> Tyrel Datwyler  writes:
>>> On 3/10/20 10:25 AM, Nathan Lynch wrote:
 Tyrel Datwyler  writes:
> The expectation is that when calling of_read_drc_info_cell()
> repeatedly to parse multiple drc-info records that the in/out curval
> parameter points at the start of the next record on return. However,
> the current behavior has curval still pointing at the final value of
> the record just parsed. The result of which is that if the
> ibm,drc-info property contains multiple properties the parsed value
> of the drc_type for any record after the first has the power_domain
> value of the previous record appended to the type string.
>
> Ex: observed the following 0xffffffff prepended to PHB
>
> [   69.485037] drc-info: type: \xff\xff\xff\xffPHB, prefix: PHB , 
> index_start: 0x2001
> [   69.485038] drc-info: suffix_start: 1, sequential_elems: 3072, 
> sequential_inc: 1
> [   69.485038] drc-info: power-domain: 0x, last_index: 0x2c00
>
> Fix by incrementing curval past the power_domain value to point at
> drc_type string of next record.
>
> Fixes: a29396653b8bf ("pseries/drc-info: Search DRC properties for CPU 
> indexes")

 I have a different commit hash for that:
 e83636ac3334 pseries/drc-info: Search DRC properties for CPU indexes
>>>
>>> Oof, looks like I grabbed the commit hash from the SLES 15 SP1 branch. You 
>>> have
>>> the correct upstream commit.
>>>
>>> Fixes: e83636ac3334 ("pseries/drc-info: Search DRC properties for CPU 
>>> indexes")
>>>
>>> Michael, let me know if you want me to resubmit, or if you will fixup the 
>>> Fixes
>>> tag on your end?
>> 
>> I can update the Fixes tag.
>> 
>> What's the practical effect of this bug? It seems like it should break
>> all the hotplug stuff, but presumably it doesn't in practice for some
>> reason?
>> 
>> It would also be *really* nice if we had some unit tests for this
>> parsing code, it's demonstrably very bug prone.
>
> In practice PHBs are the only type of connector in the ibm,drc-info property
> that has multiple records. So, it breaks PHB hotplug, but by chance not pci,
> cpu, slot, or mem because they happen to only ever be a single record.

Thanks. I folded that into the change log.

cheers


Re: [PATCH v5 1/7] ASoC: dt-bindings: fsl_asrc: Add new property fsl, asrc-format

2020-03-12 Thread Shengjiu Wang
Hi Rob

On Tue, Mar 10, 2020 at 5:20 AM Nicolin Chen  wrote:
>
> On Mon, Mar 09, 2020 at 11:58:28AM +0800, Shengjiu Wang wrote:
> > In order to support new EASRC and simplify the code structure,
> > We decide to share the common structure between them. This bring
> > a problem that EASRC accept format directly from devicetree, but
> > ASRC accept width from devicetree.
> >
> > In order to align with new ESARC, we add new property fsl,asrc-format.
> > The fsl,asrc-format can replace the fsl,asrc-width, then driver
> > can accept format from devicetree, don't need to convert it to
> > format through width.
> >
> > Signed-off-by: Shengjiu Wang 
> > ---
> >  Documentation/devicetree/bindings/sound/fsl,asrc.txt | 5 +
> >  1 file changed, 5 insertions(+)
> >
> > diff --git a/Documentation/devicetree/bindings/sound/fsl,asrc.txt 
> > b/Documentation/devicetree/bindings/sound/fsl,asrc.txt
> > index cb9a25165503..780455cf7f71 100644
> > --- a/Documentation/devicetree/bindings/sound/fsl,asrc.txt
> > +++ b/Documentation/devicetree/bindings/sound/fsl,asrc.txt
> > @@ -51,6 +51,11 @@ Optional properties:
> > will be in use as default. Otherwise, the big endian
> > mode will be in use for all the device registers.
> >
> > +   - fsl,asrc-format : Defines a mutual sample format used by DPCM Back
> > +   Ends, which can replace the fsl,asrc-width.
> > +   The value is SNDRV_PCM_FORMAT_S16_LE, or
> > +   SNDRV_PCM_FORMAT_S24_LE
>
> I am still holding the concern at the DT binding of this format,
> as it uses values from ASoC header file instead of a dt-binding
> header file -- not sure if we can do this. Let's wait for Rob's
> comments.

Could you please share your comments or proposal about
Nicolin's concern?

best regards
wang shengjiu


Re: [PATCH v3 24/27] powerpc/powernv/pmem: Expose SMART data via ndctl

2020-03-12 Thread Alastair D'Silva
On Thu, 2020-03-05 at 14:36 +1100, Andrew Donnellan wrote:
> On 21/2/20 2:27 pm, Alastair D'Silva wrote:
> > +static int ndctl_smart(struct ocxlpmem *ocxlpmem, struct
> > nd_cmd_pkg *pkg)
> > +{
> > +   u32 length, i;
> > +   struct nd_ocxl_smart *out;
> > +   int rc;
> > +
> > +   mutex_lock(>admin_command.lock);
> > +
> > +   rc = admin_command_request(ocxlpmem, ADMIN_COMMAND_SMART);
> > +   if (rc)
> > +   goto out;
> > +
> > +   rc = admin_command_execute(ocxlpmem);
> > +   if (rc)
> > +   goto out;
> > +
> > +   rc = admin_command_complete_timeout(ocxlpmem,
> > ADMIN_COMMAND_SMART);
> > +   if (rc < 0) {
> > +   dev_err(>dev, "SMART timeout\n");
> > +   goto out;
> > +   }
> > +
> > +   rc = admin_response(ocxlpmem);
> > +   if (rc < 0)
> > +   goto out;
> > +   if (rc != STATUS_SUCCESS) {
> > +   warn_status(ocxlpmem, "Unexpected status from SMART",
> > rc);
> > +   goto out;
> > +   }
> > +
> > +   rc = smart_header_parse(ocxlpmem, );
> > +   if (rc)
> > +   goto out;
> > +
> > +   pkg->nd_fw_size = length;
> > +
> > +   length = min(length, pkg->nd_size_out); // bytes
> > +   out = (struct nd_ocxl_smart *)pkg->nd_payload;
> > +   // Each SMART attribute is 2 * 64 bits
> > +   out->count = length / (2 * sizeof(u64)); // attributes
> 
>  From what I can tell - 8 bytes of nd_ocxl_smart are taken up for
> the 
> count + reserved bytes, so this is going to potentially overrun the
> user 
> buffer.

Ok

> 
> > +
> > +   for (i = 0; i < length; i += sizeof(u64)) {
> 
> It might be neater to make i count up by 1 and then multiply by 
> sizeof(u64) later.
> 
I tried, it doesn't make much difference to the complexity, and makes
it less clear that we are stepping in 64bit chunks.

> > +   rc = ocxl_global_mmio_read64(ocxlpmem->ocxl_afu,
> > +ocxlpmem-
> > >admin_command.data_offset + sizeof(u64) + i,
> 
> + 0x08 rather than + sizeof(u64) for consistency.
> 
We use sizeof(u64) in the rest of this function.

> > +OCXL_LITTLE_ENDIAN,
> > +
> > >attribs[i/sizeof(u64)]);
> > +   if (rc)
> > +   goto out;
> > +   }
> > +
> > +   rc = admin_response_handled(ocxlpmem);
> > +   if (rc)
> > +   goto out;
> > +
> > +   rc = 0;
> > +   goto out;
> > +
> > +out:
> > +   mutex_unlock(>admin_command.lock);
> > +   return rc;
> > +}
> > +
> > +static int ndctl_call(struct ocxlpmem *ocxlpmem, void *buf,
> > unsigned int buf_len)
> > +{
> > +   struct nd_cmd_pkg *pkg = buf;
> > +
> > +   if (buf_len < sizeof(struct nd_cmd_pkg)) {
> > +   dev_err(>dev, "Invalid ND_CALL size=%u\n",
> > buf_len);
> > +   return -EINVAL;
> > +   }
> > +
> > +   if (pkg->nd_family != NVDIMM_FAMILY_OCXL) {
> > +   dev_err(>dev, "Invalid ND_CALL
> > family=0x%llx\n", pkg->nd_family);
> > +   return -EINVAL;
> > +   }
> > +
> > +   switch (pkg->nd_command) {
> > +   case ND_CMD_OCXL_SMART:
> > +   ndctl_smart(ocxlpmem, pkg);
> 
> Did you intend to dispose of the return code here?

Whoops.

> 
> > +   break;
> > +
> > +   default:
> > +   dev_err(>dev, "Invalid ND_CALL
> > command=0x%llx\n", pkg->nd_command);
> > +   return -EINVAL;
> > +   }
> > +
> > +
> > +   return 0;
> > +}
> > +
> >   static int ndctl(struct nvdimm_bus_descriptor *nd_desc,
> >  struct nvdimm *nvdimm,
> >  unsigned int cmd, void *buf, unsigned int buf_len, int
> > *cmd_rc)
> > @@ -88,6 +211,10 @@ static int ndctl(struct nvdimm_bus_descriptor
> > *nd_desc,
> > struct ocxlpmem *ocxlpmem = container_of(nd_desc, struct
> > ocxlpmem, bus_desc);
> >   
> > switch (cmd) {
> > +   case ND_CMD_CALL:
> > +   *cmd_rc = ndctl_call(ocxlpmem, buf, buf_len);
> > +   return 0;
> > +
> > case ND_CMD_GET_CONFIG_SIZE:
> > *cmd_rc = ndctl_config_size(buf);
> > return 0;
> > @@ -171,6 +298,7 @@ static int register_lpc_mem(struct ocxlpmem
> > *ocxlpmem)
> > set_bit(ND_CMD_GET_CONFIG_SIZE, _cmd_mask);
> > set_bit(ND_CMD_GET_CONFIG_DATA, _cmd_mask);
> > set_bit(ND_CMD_SET_CONFIG_DATA, _cmd_mask);
> > +   set_bit(ND_CMD_CALL, _cmd_mask);
> >   
> > set_bit(NDD_ALIASING, _flags);
> >   
> > diff --git a/arch/powerpc/platforms/powernv/pmem/ocxl_internal.h
> > b/arch/powerpc/platforms/powernv/pmem/ocxl_internal.h
> > index 927690f4888f..0eb7a35d24ae 100644
> > --- a/arch/powerpc/platforms/powernv/pmem/ocxl_internal.h
> > +++ b/arch/powerpc/platforms/powernv/pmem/ocxl_internal.h
> > @@ -7,6 +7,7 @@
> >   #include 
> >   #include 
> >   #include 
> > +#include 
> >   
> >   #define LABEL_AREA_SIZE   (1UL << PA_SECTION_SHIFT)
> >   #define DEFAULT_TIMEOUT 100
> > @@ -98,6 +99,23 @@ struct ocxlpmem_function0 {
> > struct ocxl_fn *ocxl_fn;
> >   };
> >   
> > +struct nd_ocxl_smart {
> > +   __u8 count;
> > +   __u8 reserved[7];
> > +   

Re: [RFC 00/11] perf: Enhancing perf to export processor hazard information

2020-03-12 Thread Kim Phillips
On 3/11/20 11:00 AM, Ravi Bangoria wrote:
> Hi Kim,

Hi Ravi,

> On 3/6/20 3:36 AM, Kim Phillips wrote:
>>> On 3/3/20 3:55 AM, Kim Phillips wrote:
 On 3/2/20 2:21 PM, Stephane Eranian wrote:
> On Mon, Mar 2, 2020 at 2:13 AM Peter Zijlstra  
> wrote:
>>
>> On Mon, Mar 02, 2020 at 10:53:44AM +0530, Ravi Bangoria wrote:
>>> Modern processors export such hazard data in Performance
>>> Monitoring Unit (PMU) registers. Ex, 'Sampled Instruction Event
>>> Register' on IBM PowerPC[1][2] and 'Instruction-Based Sampling' on
>>> AMD[3] provides similar information.
>>>
>>> Implementation detail:
>>>
>>> A new sample_type called PERF_SAMPLE_PIPELINE_HAZ is introduced.
>>> If it's set, kernel converts arch specific hazard information
>>> into generic format:
>>>
>>>     struct perf_pipeline_haz_data {
>>>    /* Instruction/Opcode type: Load, Store, Branch  */
>>>    __u8    itype;
>>>    /* Instruction Cache source */
>>>    __u8    icache;
>>>    /* Instruction suffered hazard in pipeline stage */
>>>    __u8    hazard_stage;
>>>    /* Hazard reason */
>>>    __u8    hazard_reason;
>>>    /* Instruction suffered stall in pipeline stage */
>>>    __u8    stall_stage;
>>>    /* Stall reason */
>>>    __u8    stall_reason;
>>>    __u16   pad;
>>>     };
>>
>> Kim, does this format indeed work for AMD IBS?

 It's not really 1:1, we don't have these separations of stages
 and reasons, for example: we have missed in L2 cache, for example.
 So IBS output is flatter, with more cycle latency figures than
 IBM's AFAICT.
>>>
>>> AMD IBS captures pipeline latency data incase Fetch sampling like the
>>> Fetch latency, tag to retire latency, completion to retire latency and
>>> so on. Yes, Ops sampling do provide more data on load/store centric
>>> information. But it also captures more detailed data for Branch 
>>> instructions.
>>> And we also looked at ARM SPE, which also captures more details pipeline
>>> data and latency information.
>>>
> Personally, I don't like the term hazard. This is too IBM Power
> specific. We need to find a better term, maybe stall or penalty.

 Right, IBS doesn't have a filter to only count stalled or otherwise
 bad events.  IBS' PPR descriptions has one occurrence of the
 word stall, and no penalty.  The way I read IBS is it's just
 reporting more sample data than just the precise IP: things like
 hits, misses, cycle latencies, addresses, types, etc., so words
 like 'extended', or the 'auxiliary' already used today even
 are more appropriate for IBS, although I'm the last person to
 bikeshed.
>>>
>>> We are thinking of using "pipeline" word instead of Hazard.
>>
>> Hm, the word 'pipeline' occurs 0 times in IBS documentation.
> 
> NP. We thought pipeline is generic hw term so we proposed "pipeline"
> word. We are open to term which can be generic enough.
> 
>>
>> I realize there are a couple of core pipeline-specific pieces
>> of information coming out of it, but the vast majority
>> are addresses, latencies of various components in the memory
>> hierarchy, and various component hit/miss bits.
> 
> Yes. we should capture core pipeline specific details. For example,
> IBS generates Branch unit information(IbsOpData1) and Icahce related
> data(IbsFetchCtl) which is something that shouldn't be extended as
> part of perf-mem, IMO.

Sure, IBS Op-side output is more 'perf mem' friendly, and so it
should populate perf_mem_data_src fields, just like POWER9 can:

union perf_mem_data_src {
...
__u64   mem_rsvd:24,
mem_snoopx:2,   /* snoop mode, ext */
mem_remote:1,   /* remote */
mem_lvl_num:4,  /* memory hierarchy level number */
mem_dtlb:7, /* tlb access */
mem_lock:2, /* lock instr */
mem_snoop:5,/* snoop mode */
mem_lvl:14, /* memory hierarchy level */
mem_op:5;   /* type of opcode */


E.g., SIER[LDST] SIER[A_XLATE_SRC] can be used to populate
mem_lvl[_num], SIER_TYPE can be used to populate 'mem_op',
'mem_lock', and the Reload Bus Source Encoding bits can
be used to populate mem_snoop, right?

For IBS, I see PERF_SAMPLE_ADDR and PERF_SAMPLE_PHYS_ADDR can be
used for the ld/st target addresses, too.

>> What's needed here is a vendor-specific extended
>> sample information that all these technologies gather,
>> of which things like e.g., 'L1 TLB cycle latency' we
>> all should have in common.
> 
> Yes. We will include fields to capture the latency cycles (like Issue
> latency, Instruction completion latency etc..) along with other pipeline
> details in the proposed structure.

Latency 

[PATCH 6/6] soc: fsl: qe: fix sparse warnings for ucc_slow.c

2020-03-12 Thread Li Yang
Fixes the following sparse warnings:

drivers/soc/fsl/qe/ucc_slow.c:78:17: warning: incorrect type in assignment 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:78:17:expected struct ucc_slow *us_regs
drivers/soc/fsl/qe/ucc_slow.c:78:17:got struct ucc_slow [noderef]  
*us_regs
drivers/soc/fsl/qe/ucc_slow.c:81:18: warning: incorrect type in argument 1 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:81:18:expected void const volatile [noderef] 
 *addr
drivers/soc/fsl/qe/ucc_slow.c:81:18:got restricted __be32 *
drivers/soc/fsl/qe/ucc_slow.c:90:9: warning: incorrect type in argument 2 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:90:9:expected void volatile [noderef]  
*addr
drivers/soc/fsl/qe/ucc_slow.c:90:9:got restricted __be32 *
drivers/soc/fsl/qe/ucc_slow.c:99:17: warning: incorrect type in assignment 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:99:17:expected struct ucc_slow *us_regs
drivers/soc/fsl/qe/ucc_slow.c:99:17:got struct ucc_slow [noderef]  
*us_regs
drivers/soc/fsl/qe/ucc_slow.c:102:18: warning: incorrect type in argument 1 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:102:18:expected void const volatile [noderef] 
 *addr
drivers/soc/fsl/qe/ucc_slow.c:102:18:got restricted __be32 *
drivers/soc/fsl/qe/ucc_slow.c:111:9: warning: incorrect type in argument 2 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:111:9:expected void volatile [noderef] 
 *addr
drivers/soc/fsl/qe/ucc_slow.c:111:9:got restricted __be32 *
drivers/soc/fsl/qe/ucc_slow.c:172:28: warning: Using plain integer as NULL 
pointer
drivers/soc/fsl/qe/ucc_slow.c:174:25: warning: cast removes address space 
'__iomem' of expression
drivers/soc/fsl/qe/ucc_slow.c:175:25: warning: cast removes address space 
'__iomem' of expression
drivers/soc/fsl/qe/ucc_slow.c:194:23: warning: incorrect type in assignment 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:194:23:expected struct ucc_slow_pram *us_pram
drivers/soc/fsl/qe/ucc_slow.c:194:23:got void [noderef]  *
drivers/soc/fsl/qe/ucc_slow.c:204:9: warning: incorrect type in argument 2 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:204:9:expected void volatile [noderef] 
 *addr
drivers/soc/fsl/qe/ucc_slow.c:204:9:got restricted __be16 *
drivers/soc/fsl/qe/ucc_slow.c:229:41: warning: incorrect type in assignment 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:229:41:expected struct qe_bd *tx_bd
drivers/soc/fsl/qe/ucc_slow.c:229:41:got void [noderef]  *
drivers/soc/fsl/qe/ucc_slow.c:232:17: warning: incorrect type in argument 2 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:232:17:expected void volatile [noderef] 
 *addr
drivers/soc/fsl/qe/ucc_slow.c:232:17:got restricted __be32 *
drivers/soc/fsl/qe/ucc_slow.c:234:17: warning: incorrect type in argument 2 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:234:17:expected void volatile [noderef] 
 *addr
drivers/soc/fsl/qe/ucc_slow.c:234:17:got unsigned int [usertype] *
drivers/soc/fsl/qe/ucc_slow.c:238:9: warning: incorrect type in argument 2 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:238:9:expected void volatile [noderef] 
 *addr
drivers/soc/fsl/qe/ucc_slow.c:238:9:got restricted __be32 *
drivers/soc/fsl/qe/ucc_slow.c:239:9: warning: cast from restricted __be32
drivers/soc/fsl/qe/ucc_slow.c:239:9: warning: incorrect type in argument 1 
(different base types)
drivers/soc/fsl/qe/ucc_slow.c:239:9:expected unsigned int [usertype] val
drivers/soc/fsl/qe/ucc_slow.c:239:9:got restricted __be32 [usertype]
drivers/soc/fsl/qe/ucc_slow.c:239:9: warning: cast from restricted __be32
drivers/soc/fsl/qe/ucc_slow.c:239:9: warning: incorrect type in argument 2 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:239:9:expected void volatile [noderef] 
 *addr
drivers/soc/fsl/qe/ucc_slow.c:239:9:got unsigned int [usertype] *
drivers/soc/fsl/qe/ucc_slow.c:242:26: warning: incorrect type in assignment 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:242:26:expected struct qe_bd *rx_bd
drivers/soc/fsl/qe/ucc_slow.c:242:26:got void [noderef]  *
drivers/soc/fsl/qe/ucc_slow.c:245:17: warning: incorrect type in argument 2 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:245:17:expected void volatile [noderef] 
 *addr
drivers/soc/fsl/qe/ucc_slow.c:245:17:got unsigned int [usertype] *
drivers/soc/fsl/qe/ucc_slow.c:247:17: warning: incorrect type in argument 2 
(different address spaces)
drivers/soc/fsl/qe/ucc_slow.c:247:17:expected void volatile [noderef] 
 *addr
drivers/soc/fsl/qe/ucc_slow.c:247:17:got restricted __be32 *
drivers/soc/fsl/qe/ucc_slow.c:251:9: warning: cast from restricted __be32
drivers/soc/fsl/qe/ucc_slow.c:251:9: warning: incorrect type in argument 1 
(different base types)
drivers/soc/fsl/qe/ucc_slow.c:251:9:expected unsigned int [usertype] val
drivers/soc/fsl/qe/ucc_slow.c:251:9:   

[PATCH 5/6] soc: fsl: qe: fix sparse warnings for ucc_fast.c

2020-03-12 Thread Li Yang
Fixes the following sparse warnings:

drivers/soc/fsl/qe/ucc_fast.c:218:22: warning: incorrect type in assignment 
(different base types)
drivers/soc/fsl/qe/ucc_fast.c:218:22:expected unsigned int [noderef] 
[usertype]  *p_ucce
drivers/soc/fsl/qe/ucc_fast.c:218:22:got restricted __be32 [noderef] 
 *
drivers/soc/fsl/qe/ucc_fast.c:219:22: warning: incorrect type in assignment 
(different base types)
drivers/soc/fsl/qe/ucc_fast.c:219:22:expected unsigned int [noderef] 
[usertype]  *p_uccm
drivers/soc/fsl/qe/ucc_fast.c:219:22:got restricted __be32 [noderef] 
 *

Signed-off-by: Li Yang 
---
 include/soc/fsl/qe/ucc_fast.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/soc/fsl/qe/ucc_fast.h b/include/soc/fsl/qe/ucc_fast.h
index ba0e838f962a..dc4e79468094 100644
--- a/include/soc/fsl/qe/ucc_fast.h
+++ b/include/soc/fsl/qe/ucc_fast.h
@@ -178,10 +178,10 @@ struct ucc_fast_info {
 struct ucc_fast_private {
struct ucc_fast_info *uf_info;
struct ucc_fast __iomem *uf_regs; /* a pointer to the UCC regs. */
-   u32 __iomem *p_ucce;/* a pointer to the event register in memory. */
-   u32 __iomem *p_uccm;/* a pointer to the mask register in memory. */
+   __be32 __iomem *p_ucce; /* a pointer to the event register in memory. */
+   __be32 __iomem *p_uccm; /* a pointer to the mask register in memory. */
 #ifdef CONFIG_UGETH_TX_ON_DEMAND
-   u16 __iomem *p_utodr;   /* pointer to the transmit on demand register */
+   __be16 __iomem *p_utodr;/* pointer to the transmit on demand register */
 #endif
int enabled_tx; /* Whether channel is enabled for Tx (ENT) */
int enabled_rx; /* Whether channel is enabled for Rx (ENR) */
-- 
2.17.1



[PATCH 1/6] soc: fsl: qe: fix sparse warnings for qe.c

2020-03-12 Thread Li Yang
Fixes the following sparse warnings:
drivers/soc/fsl/qe/qe.c:426:9: warning: cast to restricted __be32
drivers/soc/fsl/qe/qe.c:528:41: warning: incorrect type in assignment 
(different base types)
drivers/soc/fsl/qe/qe.c:528:41:expected unsigned long long static 
[addressable] [toplevel] [usertype] extended_modes
drivers/soc/fsl/qe/qe.c:528:41:got restricted __be64 const [usertype] 
extended_modes

Signed-off-by: Li Yang 
---
 drivers/soc/fsl/qe/qe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/soc/fsl/qe/qe.c b/drivers/soc/fsl/qe/qe.c
index 96c2057d8d8e..447146861c2c 100644
--- a/drivers/soc/fsl/qe/qe.c
+++ b/drivers/soc/fsl/qe/qe.c
@@ -423,7 +423,7 @@ static void qe_upload_microcode(const void *base,
qe_iowrite32be(be32_to_cpu(code[i]), &qe_immr->iram.idata);

/* Set I-RAM Ready Register */
-   qe_iowrite32be(be32_to_cpu(QE_IRAM_READY), &qe_immr->iram.iready);
+   qe_iowrite32be(QE_IRAM_READY, &qe_immr->iram.iready);
 }
 
 /*
@@ -525,7 +525,7 @@ int qe_upload_firmware(const struct qe_firmware *firmware)
 */
memset(_firmware_info, 0, sizeof(qe_firmware_info));
strlcpy(qe_firmware_info.id, firmware->id, sizeof(qe_firmware_info.id));
-   qe_firmware_info.extended_modes = firmware->extended_modes;
+   qe_firmware_info.extended_modes = be64_to_cpu(firmware->extended_modes);
memcpy(qe_firmware_info.vtraps, firmware->vtraps,
sizeof(firmware->vtraps));
 
-- 
2.17.1



[PATCH 3/6] soc: fsl: qe: fix sparse warnings for ucc.c

2020-03-12 Thread Li Yang
Fixes the following sparse warnings:

drivers/soc/fsl/qe/ucc.c:637:20: warning: incorrect type in assignment 
(different address spaces)
drivers/soc/fsl/qe/ucc.c:637:20:expected struct qe_mux *qe_mux_reg
drivers/soc/fsl/qe/ucc.c:637:20:got struct qe_mux [noderef]  *
drivers/soc/fsl/qe/ucc.c:652:9: warning: incorrect type in argument 1 
(different address spaces)
drivers/soc/fsl/qe/ucc.c:652:9:expected void const volatile [noderef] 
 *addr
drivers/soc/fsl/qe/ucc.c:652:9:got restricted __be32 *
drivers/soc/fsl/qe/ucc.c:652:9: warning: incorrect type in argument 2 
(different address spaces)
drivers/soc/fsl/qe/ucc.c:652:9:expected void volatile [noderef]  
*addr
drivers/soc/fsl/qe/ucc.c:652:9:got restricted __be32 *

Signed-off-by: Li Yang 
---
 drivers/soc/fsl/qe/ucc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/soc/fsl/qe/ucc.c b/drivers/soc/fsl/qe/ucc.c
index 90157acc5ba6..d6c93970df4d 100644
--- a/drivers/soc/fsl/qe/ucc.c
+++ b/drivers/soc/fsl/qe/ucc.c
@@ -632,7 +632,7 @@ int ucc_set_tdm_rxtx_sync(u32 tdm_num, enum qe_clock clock,
 {
int source;
u32 shift;
-   struct qe_mux *qe_mux_reg;
+   struct qe_mux __iomem *qe_mux_reg;
 
qe_mux_reg = &qe_immr->qmx;
 
-- 
2.17.1



[PATCH 4/6] soc: fsl: qe: fix sparse warnings for qe_ic.c

2020-03-12 Thread Li Yang
Fixes the following sparse warnings:

drivers/soc/fsl/qe/qe_ic.c:253:32: warning: incorrect type in argument 1 
(different base types)
drivers/soc/fsl/qe/qe_ic.c:253:32:expected restricted __be32 [noderef] 
[usertype]  *base
drivers/soc/fsl/qe/qe_ic.c:253:32:got unsigned int [noderef] [usertype] 
 *regs
drivers/soc/fsl/qe/qe_ic.c:254:26: warning: incorrect type in argument 1 
(different base types)
drivers/soc/fsl/qe/qe_ic.c:254:26:expected restricted __be32 [noderef] 
[usertype]  *base
drivers/soc/fsl/qe/qe_ic.c:254:26:got unsigned int [noderef] [usertype] 
 *regs
drivers/soc/fsl/qe/qe_ic.c:269:32: warning: incorrect type in argument 1 
(different base types)
drivers/soc/fsl/qe/qe_ic.c:269:32:expected restricted __be32 [noderef] 
[usertype]  *base
drivers/soc/fsl/qe/qe_ic.c:269:32:got unsigned int [noderef] [usertype] 
 *regs
drivers/soc/fsl/qe/qe_ic.c:270:26: warning: incorrect type in argument 1 
(different base types)
drivers/soc/fsl/qe/qe_ic.c:270:26:expected restricted __be32 [noderef] 
[usertype]  *base
drivers/soc/fsl/qe/qe_ic.c:270:26:got unsigned int [noderef] [usertype] 
 *regs
drivers/soc/fsl/qe/qe_ic.c:341:31: warning: incorrect type in argument 1 
(different base types)
drivers/soc/fsl/qe/qe_ic.c:341:31:expected restricted __be32 [noderef] 
[usertype]  *base
drivers/soc/fsl/qe/qe_ic.c:341:31:got unsigned int [noderef] [usertype] 
 *regs
drivers/soc/fsl/qe/qe_ic.c:357:31: warning: incorrect type in argument 1 
(different base types)
drivers/soc/fsl/qe/qe_ic.c:357:31:expected restricted __be32 [noderef] 
[usertype]  *base
drivers/soc/fsl/qe/qe_ic.c:357:31:got unsigned int [noderef] [usertype] 
 *regs
drivers/soc/fsl/qe/qe_ic.c:450:26: warning: incorrect type in argument 1 
(different base types)
drivers/soc/fsl/qe/qe_ic.c:450:26:expected restricted __be32 [noderef] 
[usertype]  *base
drivers/soc/fsl/qe/qe_ic.c:450:26:got unsigned int [noderef] [usertype] 
 *regs

Signed-off-by: Li Yang 
---
 drivers/soc/fsl/qe/qe_ic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/soc/fsl/qe/qe_ic.c b/drivers/soc/fsl/qe/qe_ic.c
index 0dd5bdb04a14..0390af00 100644
--- a/drivers/soc/fsl/qe/qe_ic.c
+++ b/drivers/soc/fsl/qe/qe_ic.c
@@ -44,7 +44,7 @@
 
 struct qe_ic {
/* Control registers offset */
-   u32 __iomem *regs;
+   __be32 __iomem *regs;
 
/* The remapper for this QEIC */
struct irq_domain *irqhost;
-- 
2.17.1



[PATCH 2/6] soc: fsl: qe: fix sparse warning for qe_common.c

2020-03-12 Thread Li Yang
Fixes the following sparse warning:

drivers/soc/fsl/qe/qe_common.c:75:48: warning: incorrect type in argument 2 
(different base types)
drivers/soc/fsl/qe/qe_common.c:75:48:expected restricted __be32 const 
[usertype] *addr
drivers/soc/fsl/qe/qe_common.c:75:48:got unsigned int *

Signed-off-by: Li Yang 
---
 drivers/soc/fsl/qe/qe_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/soc/fsl/qe/qe_common.c b/drivers/soc/fsl/qe/qe_common.c
index a81a1a79f1ca..75075591f630 100644
--- a/drivers/soc/fsl/qe/qe_common.c
+++ b/drivers/soc/fsl/qe/qe_common.c
@@ -46,7 +46,7 @@ int cpm_muram_init(void)
 {
struct device_node *np;
struct resource r;
-   u32 zero[OF_MAX_ADDR_CELLS] = {};
+   __be32 zero[OF_MAX_ADDR_CELLS] = {};
resource_size_t max = 0;
int i = 0;
int ret = 0;
-- 
2.17.1



[PATCH 0/6] Fix sparse warnings for common qe library code

2020-03-12 Thread Li Yang
The QE code was previously only supported on big-endian PowerPC systems
that use the same endian as the QE device.  The endian transfer code is
not really exercised.  Recent updates extended the QE drivers to
little-endian ARM/ARM64 systems which makes the endian transfer really
meaningful and hence triggered more sparse warnings for the endian
mismatch.  Some of these endian issues are real issues that need to be
fixed.

While at it, fixed some direct de-references of IO memory space and
suppressed other __iomem address space mismatch issues by adding correct
address space attributes.

Li Yang (6):
  soc: fsl: qe: fix sparse warnings for qe.c
  soc: fsl: qe: fix sparse warning for qe_common.c
  soc: fsl: qe: fix sparse warnings for ucc.c
  soc: fsl: qe: fix sparse warnings for qe_ic.c
  soc: fsl: qe: fix sparse warnings for ucc_fast.c
  soc: fsl: qe: fix sparse warnings for ucc_slow.c

 drivers/soc/fsl/qe/qe.c|  4 ++--
 drivers/soc/fsl/qe/qe_common.c |  2 +-
 drivers/soc/fsl/qe/qe_ic.c |  2 +-
 drivers/soc/fsl/qe/ucc.c   |  2 +-
 drivers/soc/fsl/qe/ucc_slow.c  | 33 +
 include/soc/fsl/qe/ucc_fast.h  |  6 +++---
 include/soc/fsl/qe/ucc_slow.h  | 13 ++---
 7 files changed, 27 insertions(+), 35 deletions(-)

-- 
2.17.1



Re: [PATCH] powerpc/pseries: fix of_read_drc_info_cell() to point at next record

2020-03-12 Thread Tyrel Datwyler
On 3/11/20 10:43 PM, Michael Ellerman wrote:
> Tyrel Datwyler  writes:
>> On 3/10/20 10:25 AM, Nathan Lynch wrote:
>>> Tyrel Datwyler  writes:
 The expectation is that when calling of_read_drc_info_cell()
 repeatedly to parse multiple drc-info records that the in/out curval
 parameter points at the start of the next record on return. However,
 the current behavior has curval still pointing at the final value of
 the record just parsed. The result of which is that if the
 ibm,drc-info property contains multiple properties the parsed value
 of the drc_type for any record after the first has the power_domain
 value of the previous record appended to the type string.

 Ex: observed the following 0xffffffff prepended to PHB

 [   69.485037] drc-info: type: \xff\xff\xff\xffPHB, prefix: PHB , 
 index_start: 0x2001
 [   69.485038] drc-info: suffix_start: 1, sequential_elems: 3072, 
 sequential_inc: 1
 [   69.485038] drc-info: power-domain: 0xffffffff, last_index: 0x2c00

 Fix by incrementing curval past the power_domain value to point at
 drc_type string of next record.

 Fixes: a29396653b8bf ("pseries/drc-info: Search DRC properties for CPU 
 indexes")
>>>
>>> I have a different commit hash for that:
>>> e83636ac3334 pseries/drc-info: Search DRC properties for CPU indexes
>>
>> Oof, looks like I grabbed the commit hash from the SLES 15 SP1 branch. You 
>> have
>> the correct upstream commit.
>>
>> Fixes: e83636ac3334 ("pseries/drc-info: Search DRC properties for CPU 
>> indexes")
>>
>> Michael, let me know if you want me to resubmit, or if you will fixup the 
>> Fixes
>> tag on your end?
> 
> I can update the Fixes tag.
> 
> What's the practical effect of this bug? It seems like it should break
> all the hotplug stuff, but presumably it doesn't in practice for some
> reason?
> 
> It would also be *really* nice if we had some unit tests for this
> parsing code, it's demonstrably very bug prone.
> 
> cheers
> 

In practice PHBs are the only type of connector in the ibm,drc-info property
that has multiple records. So, it breaks PHB hotplug, but by chance not pci,
cpu, slot, or mem because they happen to only ever be a single record.

I have a follow up patch to at least add some pr_debug at the end of this
parsing function to dump each record as its read.

-Tyrel


Re: [PATCH -next] PCI: rpaphp: remove set but not used variable 'value'

2020-03-12 Thread Tyrel Datwyler
On 3/12/20 7:41 AM, Bjorn Helgaas wrote:
> On Thu, Mar 12, 2020 at 09:38:02AM -0500, Bjorn Helgaas wrote:
>> On Thu, Mar 12, 2020 at 10:04:12PM +0800, Chen Zhou wrote:
>>> Fixes gcc '-Wunused-but-set-variable' warning:
>>>
>>> drivers/pci/hotplug/rpaphp_core.c: In function is_php_type:
>>> drivers/pci/hotplug/rpaphp_core.c:291:16: warning:
>>> variable value set but not used [-Wunused-but-set-variable]
>>>
>>> Reported-by: Hulk Robot 
>>> Signed-off-by: Chen Zhou 
>>
>> Michael, if you want this:
>>
>> Acked-by: Bjorn Helgaas 
>>
>> If you don't mind, edit the subject to follow the convention, e.g.,
>>
>>   PCI: rpaphp: Remove unused variable 'value'
>>
>> Apparently simple_strtoul() is deprecated and we're supposed to use
>> kstrtoul() instead.  Looks like kstrtoul() might simplify the code a
>> little, too, e.g.,
>>
>>   if (kstrtoul(drc_type, 0, &value) == 0)
>> return 1;
>>
>>   return 0;
> 
> I guess there are several other uses of simple_strtoul() in this file.
> Not sure if it's worth changing them all, just this one, or just the
> patch below as-is.

If we are going to change one might as well do them all at once. If the original
submitter wants to send the follow up that is fine, or I can send a patch when I
have a minute.

-Tyrel

> 
>>> ---
>>>  drivers/pci/hotplug/rpaphp_core.c | 3 +--
>>>  1 file changed, 1 insertion(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/pci/hotplug/rpaphp_core.c 
>>> b/drivers/pci/hotplug/rpaphp_core.c
>>> index e408e40..5d871ef 100644
>>> --- a/drivers/pci/hotplug/rpaphp_core.c
>>> +++ b/drivers/pci/hotplug/rpaphp_core.c
>>> @@ -288,11 +288,10 @@ EXPORT_SYMBOL_GPL(rpaphp_check_drc_props);
>>>  
>>>  static int is_php_type(char *drc_type)
>>>  {
>>> -   unsigned long value;
>>> char *endptr;
>>>  
>>> /* PCI Hotplug nodes have an integer for drc_type */
>>> -   value = simple_strtoul(drc_type, &endptr, 10);
>>> +   simple_strtoul(drc_type, &endptr, 10);
>>> if (endptr == drc_type)
>>> return 0;
>>>  
>>> -- 
>>> 2.7.4
>>>



Re: [PATCH v5 4/7] ASoC: fsl_asrc: rename asrc_priv to asrc

2020-03-12 Thread Mark Brown
On Mon, Mar 09, 2020 at 02:30:17PM -0700, Nicolin Chen wrote:
> On Mon, Mar 09, 2020 at 11:58:31AM +0800, Shengjiu Wang wrote:
> > In order to move common structure to fsl_asrc_common.h
> > we change the name of asrc_priv to asrc, the asrc_priv
> > will be used by new struct fsl_asrc_priv.

> This actually could be a cleanup patch which comes as the
> first one in this series, so that we could ack it and get
> merged without depending on others. Maybe next version?

Yes, please.  Or even just send it separately.


signature.asc
Description: PGP signature


Re: [5.6.0-rc2-next-20200218/powerpc] Boot failure on POWER9

2020-03-12 Thread Sachin Sant
> The patch below might work. Sachin can you test this? I tried faking up
> a system with a memoryless node zero but couldn't get it to even start
> booting.
> 
The patch did not help. The kernel crashed during
the boot with the same call trace.

BUG_ON() introduced with the patch was not triggered.

Thanks
-Sachin

> cheers
> 
> 
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 9b4f5fb719e0..d1f11437f6c4 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -282,6 +282,9 @@ void __init mem_init(void)
>*/
>   BUILD_BUG_ON(MMU_PAGE_COUNT > 16);
> 
> + BUG_ON(smp_processor_id() != boot_cpuid);
> + set_numa_mem(local_memory_node(numa_cpu_lookup_table[boot_cpuid]));
> +
> #ifdef CONFIG_SWIOTLB
>   /*
>* Some platforms (e.g. 85xx) limit DMA-able memory way below



Re: [PATCH 1/3] powerpc/numa: Set numa_node for all possible cpus

2020-03-12 Thread Vlastimil Babka
On 3/12/20 5:13 PM, Srikar Dronamraju wrote:
> * Vlastimil Babka  [2020-03-12 14:51:38]:
> 
>> > * Vlastimil Babka  [2020-03-12 10:30:50]:
>> > 
>> >> On 3/12/20 9:23 AM, Sachin Sant wrote:
>> >> >> On 12-Mar-2020, at 10:57 AM, Srikar Dronamraju 
>> >> >>  wrote:
>> >> >> * Michal Hocko  [2020-03-11 12:57:35]:
>> >> >>> On Wed 11-03-20 16:32:35, Srikar Dronamraju wrote:
>> >>  To ensure a cpuless, memoryless dummy node is not online, powerpc 
>> >>  need
>> >>  to make sure all possible but not present cpu_to_node are set to a
>> >>  proper node.
>> >> >>> 
>> >> >>> Just curious, is this somehow related to
>> >> >>> http://lkml.kernel.org/r/20200227182650.gg3...@dhcp22.suse.cz?
>> >> >>> 
>> >> >> 
>> >> >> The issue I am trying to fix is a known issue in Powerpc since many 
>> >> >> years.
>> >> >> So this surely not a problem after a75056fc1e7c (mm/memcontrol.c: 
>> >> >> allocate
>> >> >> shrinker_map on appropriate NUMA node"). 
>> >> >> 
>> > 
>> > While I am not an expert in the slub area, I looked at the patch
>> > a75056fc1e7c and had some thoughts on why this could be causing this issue.
>> > 
>> > On the system where the crash happens, the possible number of nodes is much
>> > greater than the number of onlined nodes. The pdgat or the NODE_DATA is 
>> > only
>> > available for onlined nodes.
>> > 
>> > With a75056fc1e7c memcg_alloc_shrinker_maps, we end up calling kzalloc_node
>> > for all possible nodes and in ___slab_alloc we end up looking at the
>> > node_present_pages which is NODE_DATA(nid)->node_present_pages.
>> > i.e for a node whose pdgat struct is not allocated, we are trying to
>> > dereference.
>> 
>> From what we saw, the pgdat does exist, the problem is that slab's per-node 
>> data
>> doesn't exist for a node that doesn't have present pages, as it would be a 
>> waste
>> of memory.
> 
> Just to be clear
> Before my 3 patches to fix dummy node:
> srikar@ltc-zzci-2 /sys/devices/system/node $ cat $PWD/possible
> 0-31
> srikar@ltc-zzci-2 /sys/devices/system/node $ cat $PWD/online
> 0-1

OK

>> 
>> Uh actually you are probably right, the NODE_DATA doesn't exist anymore? In
>> Sachin's first report [1] we have
>> 
>> [0.00] numa:   NODE_DATA [mem 0x8bfedc900-0x8bfee3fff]
>> [0.00] numa: NODE_DATA(0) on node 1
>> [0.00] numa:   NODE_DATA [mem 0x8bfed5200-0x8bfedc8ff]
>> 
> 
> So even if pgdat would exist for nodes 0 and 1, there is no pgdat for the
> rest 30 nodes.

I see. Perhaps node_present_pages(node) is not safe in SLUB then and it should
check online first, as you suggested.

>> But in this thread, with your patches Sachin reports:
> 
> and with my patches
> srikar@ltc-zzci-2 /sys/devices/system/node $ cat $PWD/possible
> 0-31
> srikar@ltc-zzci-2 /sys/devices/system/node $ cat $PWD/online
> 1
> 
>> 
>> [0.00] numa:   NODE_DATA [mem 0x8bfedc900-0x8bfee3fff]
>> 
> 
> so we only see one pgdat.
> 
>> So I assume it's just node 1. In that case, node_present_pages is really 
>> dangerous.
>> 
>> [1]
>> https://lore.kernel.org/linux-next/3381cd91-ab3d-4773-ba04-e7a072a63...@linux.vnet.ibm.com/
>> 
>> > Also for a memoryless/cpuless node or possible but not present nodes,
>> > node_to_mem_node(node) will still end up as node (atleast on powerpc).
>> 
>> I think that's the place where this would be best to fix.
>> 
> 
> Maybe. I thought about it but the current set_numa_mem semantics are apt
> for memoryless cpu node and not for possible nodes.  We could have upto 256
> possible nodes and only 2 nodes (1,2) with cpu and 1 node (1) with memory.
> node_to_mem_node seems to return what is set in set_numa_mem().
> set_numa_mem() seems to say set my numa_mem node for the current memoryless
> node to the param passed.
> 
> But how do we set numa_mem for all the other 253 possible nodes, which
> probably will have 0 as default?
> 
> Should we introduce another API such that we could update for all possible
> nodes?

If we want to rely on node_to_mem_node() to give us something safe for each
possible node, then probably it would have to be like that, yeah.

>> > I tried with this hunk below and it works.
>> > 
>> > But I am not sure if we need to check at other places were
>> > node_present_pages is being called.
>> 
>> I think this seems to defeat the purpose of node_to_mem_node()? Shouldn't it
>> return only nodes that are online with present memory?
>> CCing Joonsoo who seems to have introduced this in ad2c8144418c ("topology: 
>> add
>> support for node_to_mem_node() to determine the fallback node")
>> 
> 
> Agree 
> 
>> I think we do need well defined and documented rules around 
>> node_to_mem_node(),
>> cpu_to_node(), existence of NODE_DATA, various node_states bitmaps etc so
>> everyone handles it the same, safe way.

So let's try to brainstorm how this would look like? What I mean are some rules
like below, even if some details in my current understanding are most likely
incorrect:

with nid present in:
N_POSSIBLE - pgdat might not 

Re: [RFC PATCH v1] pseries/drmem: don't cache node id in drmem_lmb struct

2020-03-12 Thread Scott Cheloha
Hi Michal,

On Thu, Mar 12, 2020 at 06:02:37AM +0100, Michal Suchánek wrote:
> On Wed, Mar 11, 2020 at 06:08:15PM -0500, Scott Cheloha wrote:
> > At memory hot-remove time we can retrieve an LMB's nid from its
> > corresponding memory_block.  There is no need to store the nid
> > in multiple locations.
> > 
> > Signed-off-by: Scott Cheloha 
> > ---
> > The linear search in powerpc's memory_add_physaddr_to_nid() has become a
> > bottleneck at boot on systems with many LMBs.
> > 
> > As described in this patch here:
> > 
> > https://lore.kernel.org/linuxppc-dev/20200221172901.1596249-2-chel...@linux.ibm.com/
> > 
> > the linear search seriously cripples drmem_init().
> > 
> > The obvious solution (shown in that patch) is to just make the search
> > in memory_add_physaddr_to_nid() faster.  An XArray seems well-suited
> > to the task of mapping an address range to an LMB object.
> > 
> > The less obvious approach is to just call memory_add_physaddr_to_nid()
> > in fewer places.
> > 
> > I'm not sure which approach is correct, hence the RFC.
> 
> You basically revert the below which will likely cause the very error
> that was fixed there:
> 
> commit b2d3b5ee66f2a04a918cc043cec0c9ed3de58f40
> Author: Nathan Fontenot 
> Date:   Tue Oct 2 10:35:59 2018 -0500
> 
> powerpc/pseries: Track LMB nid instead of using device tree
> 
> When removing memory we need to remove the memory from the node
> it was added to instead of looking up the node it should be in
> in the device tree.
> 
> During testing we have seen scenarios where the affinity for a
> LMB changes due to a partition migration or PRRN event. In these
> cases the node the LMB exists in may not match the node the device
> tree indicates it belongs in. This can lead to a system crash
> when trying to DLPAR remove the LMB after a migration or PRRN
> event. The current code looks up the node in the device tree to
> remove the LMB from, the crash occurs when we try to offline this
> node and it does not have any data, i.e. node_data[nid] == NULL.

I'm aware of this patch and that this is a near-total revert.

I'm not reintroducing the original behavior, though.  Instead of going
to the device tree to recompute the expected node id I'm retrieving it
from the LMB's corresponding memory_block.

That crucial difference is this chunk:

--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -376,25 +376,29 @@ static int dlpar_add_lmb(struct drmem_lmb *);
 
 static int dlpar_remove_lmb(struct drmem_lmb *lmb)
 {
+   struct memory_block *mem_block;
unsigned long block_sz;
int rc;
 
if (!lmb_is_removable(lmb))
return -EINVAL;
 
+   mem_block = lmb_to_memblock(lmb);
+   if (mem_block == NULL)
+   return -EINVAL;
+
rc = dlpar_offline_lmb(lmb);
if (rc)
return rc;
 
block_sz = pseries_memory_block_size();
 
-   __remove_memory(lmb->nid, lmb->base_addr, block_sz);
+   __remove_memory(mem_block->nid, lmb->base_addr, block_sz);
 
/* Update memory regions for memory remove */
memblock_remove(lmb->base_addr, block_sz);
 
invalidate_lmb_associativity_index(lmb);
-   lmb_clear_nid(lmb);
lmb->flags &= ~DRCONF_MEM_ASSIGNED;
 
return 0;
---

Unless it's possible that the call:

__add_memory(my_nid, my_addr, my_size);

does not yield the following:

memory_block.nid == my_nid

on success, then I think the solution in my patch is equivalent to and
simpler than the one introduced in the patch you quote.

Can you see a way the above would not hold?

Then again, maybe there's a good reason not to retrieve the node id in
this way.  I'm thinking David Hildenbrand and/or Nathan Fontenont may
have some insight on that.


Re: [PATCH] powerpc/pseries: fix of_read_drc_info_cell() to point at next record

2020-03-12 Thread Nathan Lynch
Michael Ellerman  writes:
>
> It would also be *really* nice if we had some unit tests for this
> parsing code, it's demonstrably very bug prone.

Can you say more about what form unit tests could take? Like some self
tests that run at boot time, or is there a way to run the code in a user
space test harness?


Re: [PATCH 1/3] powerpc/numa: Set numa_node for all possible cpus

2020-03-12 Thread Srikar Dronamraju
* Vlastimil Babka  [2020-03-12 14:51:38]:

> > * Vlastimil Babka  [2020-03-12 10:30:50]:
> > 
> >> On 3/12/20 9:23 AM, Sachin Sant wrote:
> >> >> On 12-Mar-2020, at 10:57 AM, Srikar Dronamraju 
> >> >>  wrote:
> >> >> * Michal Hocko  [2020-03-11 12:57:35]:
> >> >>> On Wed 11-03-20 16:32:35, Srikar Dronamraju wrote:
> >>  To ensure a cpuless, memoryless dummy node is not online, powerpc need
> >>  to make sure all possible but not present cpu_to_node are set to a
> >>  proper node.
> >> >>> 
> >> >>> Just curious, is this somehow related to
> >> >>> http://lkml.kernel.org/r/20200227182650.gg3...@dhcp22.suse.cz?
> >> >>> 
> >> >> 
> >> >> The issue I am trying to fix is a known issue in Powerpc since many 
> >> >> years.
> >> >> So this surely not a problem after a75056fc1e7c (mm/memcontrol.c: 
> >> >> allocate
> >> >> shrinker_map on appropriate NUMA node"). 
> >> >> 
> > 
> > While I am not an expert in the slub area, I looked at the patch
> > a75056fc1e7c and had some thoughts on why this could be causing this issue.
> > 
> > On the system where the crash happens, the possible number of nodes is much
> > greater than the number of onlined nodes. The pdgat or the NODE_DATA is only
> > available for onlined nodes.
> > 
> > With a75056fc1e7c memcg_alloc_shrinker_maps, we end up calling kzalloc_node
> > for all possible nodes and in ___slab_alloc we end up looking at the
> > node_present_pages which is NODE_DATA(nid)->node_present_pages.
> > i.e for a node whose pdgat struct is not allocated, we are trying to
> > dereference.
> 
> From what we saw, the pgdat does exist, the problem is that slab's per-node 
> data
> doesn't exist for a node that doesn't have present pages, as it would be a 
> waste
> of memory.

Just to be clear
Before my 3 patches to fix dummy node:
srikar@ltc-zzci-2 /sys/devices/system/node $ cat $PWD/possible
0-31
srikar@ltc-zzci-2 /sys/devices/system/node $ cat $PWD/online
0-1

> 
> Uh actually you are probably right, the NODE_DATA doesn't exist anymore? In
> Sachin's first report [1] we have
> 
> [0.00] numa:   NODE_DATA [mem 0x8bfedc900-0x8bfee3fff]
> [0.00] numa: NODE_DATA(0) on node 1
> [0.00] numa:   NODE_DATA [mem 0x8bfed5200-0x8bfedc8ff]
> 

So even if pgdat would exist for nodes 0 and 1, there is no pgdat for the
rest 30 nodes.

> But in this thread, with your patches Sachin reports:

and with my patches
srikar@ltc-zzci-2 /sys/devices/system/node $ cat $PWD/possible
0-31
srikar@ltc-zzci-2 /sys/devices/system/node $ cat $PWD/online
1

> 
> [0.00] numa:   NODE_DATA [mem 0x8bfedc900-0x8bfee3fff]
> 

so we only see one pgdat.

> So I assume it's just node 1. In that case, node_present_pages is really 
> dangerous.
> 
> [1]
> https://lore.kernel.org/linux-next/3381cd91-ab3d-4773-ba04-e7a072a63...@linux.vnet.ibm.com/
> 
> > Also for a memoryless/cpuless node or possible but not present nodes,
> > node_to_mem_node(node) will still end up as node (atleast on powerpc).
> 
> I think that's the place where this would be best to fix.
> 

Maybe. I thought about it but the current set_numa_mem semantics are apt
for memoryless cpu node and not for possible nodes.  We could have upto 256
possible nodes and only 2 nodes (1,2) with cpu and 1 node (1) with memory.
node_to_mem_node seems to return what is set in set_numa_mem().
set_numa_mem() seems to say set my numa_mem node for the current memoryless
node to the param passed.

But how do we set numa_mem for all the other 253 possible nodes, which
probably will have 0 as default?

Should we introduce another API such that we could update for all possible
nodes?

> > I tried with this hunk below and it works.
> > 
> > But I am not sure if we need to check at other places were
> > node_present_pages is being called.
> 
> I think this seems to defeat the purpose of node_to_mem_node()? Shouldn't it
> return only nodes that are online with present memory?
> CCing Joonsoo who seems to have introduced this in ad2c8144418c ("topology: 
> add
> support for node_to_mem_node() to determine the fallback node")
> 

Agree 

> I think we do need well defined and documented rules around 
> node_to_mem_node(),
> cpu_to_node(), existence of NODE_DATA, various node_states bitmaps etc so
> everyone handles it the same, safe way.
> 

Other option would be to tweak Kirill Tkhai's patch such that we call
kvmalloc_node()/kzalloc_node() if node is online and call kvmalloc/kvzalloc
if the node is offline.

> > diff --git a/mm/slub.c b/mm/slub.c
> > index 626cbcbd977f..bddb93bed55e 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -2571,9 +2571,13 @@ static void *___slab_alloc(struct kmem_cache *s, 
> > gfp_t gfpflags, int node,
> > if (unlikely(!node_match(page, node))) {
> > int searchnode = node;
> >  
> > -   if (node != NUMA_NO_NODE && !node_present_pages(node))
> > -   searchnode = node_to_mem_node(node);
> > -
> > +   if (node != NUMA_NO_NODE) {
> > +   

Re: [PATCH -next] PCI: rpaphp: remove set but not used variable 'value'

2020-03-12 Thread Bjorn Helgaas
On Thu, Mar 12, 2020 at 09:38:02AM -0500, Bjorn Helgaas wrote:
> On Thu, Mar 12, 2020 at 10:04:12PM +0800, Chen Zhou wrote:
> > Fixes gcc '-Wunused-but-set-variable' warning:
> > 
> > drivers/pci/hotplug/rpaphp_core.c: In function is_php_type:
> > drivers/pci/hotplug/rpaphp_core.c:291:16: warning:
> > variable value set but not used [-Wunused-but-set-variable]
> > 
> > Reported-by: Hulk Robot 
> > Signed-off-by: Chen Zhou 
> 
> Michael, if you want this:
> 
> Acked-by: Bjorn Helgaas 
> 
> If you don't mind, edit the subject to follow the convention, e.g.,
> 
>   PCI: rpaphp: Remove unused variable 'value'
> 
> Apparently simple_strtoul() is deprecated and we're supposed to use
> kstrtoul() instead.  Looks like kstrtoul() might simplify the code a
> little, too, e.g.,
> 
>   if (kstrtoul(drc_type, 0, &value) == 0)
> return 1;
> 
>   return 0;

I guess there are several other uses of simple_strtoul() in this file.
Not sure if it's worth changing them all, just this one, or just the
patch below as-is.

> > ---
> >  drivers/pci/hotplug/rpaphp_core.c | 3 +--
> >  1 file changed, 1 insertion(+), 2 deletions(-)
> > 
> > diff --git a/drivers/pci/hotplug/rpaphp_core.c 
> > b/drivers/pci/hotplug/rpaphp_core.c
> > index e408e40..5d871ef 100644
> > --- a/drivers/pci/hotplug/rpaphp_core.c
> > +++ b/drivers/pci/hotplug/rpaphp_core.c
> > @@ -288,11 +288,10 @@ EXPORT_SYMBOL_GPL(rpaphp_check_drc_props);
> >  
> >  static int is_php_type(char *drc_type)
> >  {
> > -   unsigned long value;
> > char *endptr;
> >  
> > /* PCI Hotplug nodes have an integer for drc_type */
> > -   value = simple_strtoul(drc_type, &endptr, 10);
> > +   simple_strtoul(drc_type, &endptr, 10);
> > if (endptr == drc_type)
> > return 0;
> >  
> > -- 
> > 2.7.4
> > 


Re: [PATCH -next] PCI: rpaphp: remove set but not used variable 'value'

2020-03-12 Thread Bjorn Helgaas
On Thu, Mar 12, 2020 at 10:04:12PM +0800, Chen Zhou wrote:
> Fixes gcc '-Wunused-but-set-variable' warning:
> 
> drivers/pci/hotplug/rpaphp_core.c: In function is_php_type:
> drivers/pci/hotplug/rpaphp_core.c:291:16: warning:
>   variable value set but not used [-Wunused-but-set-variable]
> 
> Reported-by: Hulk Robot 
> Signed-off-by: Chen Zhou 

Michael, if you want this:

Acked-by: Bjorn Helgaas 

If you don't mind, edit the subject to follow the convention, e.g.,

  PCI: rpaphp: Remove unused variable 'value'

Apparently simple_strtoul() is deprecated and we're supposed to use
kstrtoul() instead.  Looks like kstrtoul() might simplify the code a
little, too, e.g.,

  if (kstrtoul(drc_type, 0, &value) == 0)
return 1;

  return 0;

> ---
>  drivers/pci/hotplug/rpaphp_core.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/hotplug/rpaphp_core.c 
> b/drivers/pci/hotplug/rpaphp_core.c
> index e408e40..5d871ef 100644
> --- a/drivers/pci/hotplug/rpaphp_core.c
> +++ b/drivers/pci/hotplug/rpaphp_core.c
> @@ -288,11 +288,10 @@ EXPORT_SYMBOL_GPL(rpaphp_check_drc_props);
>  
>  static int is_php_type(char *drc_type)
>  {
> - unsigned long value;
>   char *endptr;
>  
>   /* PCI Hotplug nodes have an integer for drc_type */
> - value = simple_strtoul(drc_type, &endptr, 10);
> + simple_strtoul(drc_type, &endptr, 10);
>   if (endptr == drc_type)
>   return 0;
>  
> -- 
> 2.7.4
> 


[PATCH -next] PCI: rpaphp: remove set but not used variable 'value'

2020-03-12 Thread Chen Zhou
Fixes gcc '-Wunused-but-set-variable' warning:

drivers/pci/hotplug/rpaphp_core.c: In function is_php_type:
drivers/pci/hotplug/rpaphp_core.c:291:16: warning:
variable value set but not used [-Wunused-but-set-variable]

Reported-by: Hulk Robot 
Signed-off-by: Chen Zhou 
---
 drivers/pci/hotplug/rpaphp_core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/pci/hotplug/rpaphp_core.c 
b/drivers/pci/hotplug/rpaphp_core.c
index e408e40..5d871ef 100644
--- a/drivers/pci/hotplug/rpaphp_core.c
+++ b/drivers/pci/hotplug/rpaphp_core.c
@@ -288,11 +288,10 @@ EXPORT_SYMBOL_GPL(rpaphp_check_drc_props);
 
 static int is_php_type(char *drc_type)
 {
-   unsigned long value;
char *endptr;
 
/* PCI Hotplug nodes have an integer for drc_type */
-   value = simple_strtoul(drc_type, &endptr, 10);
+   simple_strtoul(drc_type, &endptr, 10);
if (endptr == drc_type)
return 0;
 
-- 
2.7.4



Re: [PATCH 1/3] powerpc/numa: Set numa_node for all possible cpus

2020-03-12 Thread Vlastimil Babka
On 3/12/20 2:14 PM, Srikar Dronamraju wrote:
> * Vlastimil Babka  [2020-03-12 10:30:50]:
> 
>> On 3/12/20 9:23 AM, Sachin Sant wrote:
>> >> On 12-Mar-2020, at 10:57 AM, Srikar Dronamraju 
>> >>  wrote:
>> >> * Michal Hocko  [2020-03-11 12:57:35]:
>> >>> On Wed 11-03-20 16:32:35, Srikar Dronamraju wrote:
>>  To ensure a cpuless, memoryless dummy node is not online, powerpc need
>>  to make sure all possible but not present cpu_to_node are set to a
>>  proper node.
>> >>> 
>> >>> Just curious, is this somehow related to
>> >>> http://lkml.kernel.org/r/20200227182650.gg3...@dhcp22.suse.cz?
>> >>> 
>> >> 
>> >> The issue I am trying to fix is a known issue in Powerpc since many years.
>> >> So this surely not a problem after a75056fc1e7c (mm/memcontrol.c: allocate
>> >> shrinker_map on appropriate NUMA node"). 
>> >> 
>> >> I tried v5.6-rc4 + a75056fc1e7c but didnt face any issues booting the
>> >> kernel. Will work with Sachin/Abdul (reporters of the issue).
> 
> I had used v1 and not v2. So my mistake.
> 
>> > I applied this 3 patch series on top of March 11 next tree (commit 
>> > d44a64766795 )
>> > The kernel still fails to boot with same call trace.
>> 
> 
> While I am not an expert in the slub area, I looked at the patch
> a75056fc1e7c and had some thoughts on why this could be causing this issue.
> 
> On the system where the crash happens, the possible number of nodes is much
> greater than the number of onlined nodes. The pdgat or the NODE_DATA is only
> available for onlined nodes.
> 
> With a75056fc1e7c memcg_alloc_shrinker_maps, we end up calling kzalloc_node
> for all possible nodes and in ___slab_alloc we end up looking at the
> node_present_pages which is NODE_DATA(nid)->node_present_pages.
> i.e for a node whose pdgat struct is not allocated, we are trying to
> dereference.

From what we saw, the pgdat does exist; the problem is that slab's per-node data
doesn't exist for a node that doesn't have present pages, as it would be a waste
of memory.

Uh actually you are probably right, the NODE_DATA doesn't exist anymore? In
Sachin's first report [1] we have

[0.00] numa:   NODE_DATA [mem 0x8bfedc900-0x8bfee3fff]
[0.00] numa: NODE_DATA(0) on node 1
[0.00] numa:   NODE_DATA [mem 0x8bfed5200-0x8bfedc8ff]

But in this thread, with your patches Sachin reports:

[0.00] numa:   NODE_DATA [mem 0x8bfedc900-0x8bfee3fff]

So I assume it's just node 1. In that case, node_present_pages is really 
dangerous.

[1]
https://lore.kernel.org/linux-next/3381cd91-ab3d-4773-ba04-e7a072a63...@linux.vnet.ibm.com/

> Also for a memoryless/cpuless node or possible but not present nodes,
> node_to_mem_node(node) will still end up as node (atleast on powerpc).

I think that's the place where this would be best to fix.

> I tried with this hunk below and it works.
> 
> But I am not sure if we need to check at other places were
> node_present_pages is being called.

I think this seems to defeat the purpose of node_to_mem_node()? Shouldn't it
return only nodes that are online with present memory?
CCing Joonsoo who seems to have introduced this in ad2c8144418c ("topology: add
support for node_to_mem_node() to determine the fallback node")

I think we do need well defined and documented rules around node_to_mem_node(),
cpu_to_node(), existence of NODE_DATA, various node_states bitmaps etc so
everyone handles it the same, safe way.

> diff --git a/mm/slub.c b/mm/slub.c
> index 626cbcbd977f..bddb93bed55e 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -2571,9 +2571,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t 
> gfpflags, int node,
>   if (unlikely(!node_match(page, node))) {
>   int searchnode = node;
>  
> - if (node != NUMA_NO_NODE && !node_present_pages(node))
> - searchnode = node_to_mem_node(node);
> -
> + if (node != NUMA_NO_NODE) {
> + if (!node_online(node) || !node_present_pages(node)) {
> + searchnode = node_to_mem_node(node);
> + if (!node_online(searchnode))
> + searchnode = first_online_node;
> + }
> + }
>   if (unlikely(!node_match(page, searchnode))) {
>   stat(s, ALLOC_NODE_MISMATCH);
>   deactivate_slab(s, page, c->freelist, c);
> 
>> > 
>> 
> 



[PATCH v3 6/6] asm-generic/tlb: avoid potential double flush

2020-03-12 Thread Santosh Sivaraj
From: Peter Zijlstra 

commit 0758cd8304942292e95a0f750c374533db378b32 upstream.

Aneesh reported that:

tlb_flush_mmu()
  tlb_flush_mmu_tlbonly()
tlb_flush() <-- #1
  tlb_flush_mmu_free()
tlb_table_flush()
  tlb_table_invalidate()
tlb_flush_mmu_tlbonly()
  tlb_flush()   <-- #2

does two TLBIs when tlb->fullmm, because __tlb_reset_range() will not
clear tlb->end in that case.

Observe that any caller to __tlb_adjust_range() also sets at least one of
the tlb->freed_tables || tlb->cleared_p* bits, and those are
unconditionally cleared by __tlb_reset_range().

Change the condition for actually issuing TLBI to having one of those bits
set, as opposed to having tlb->end != 0.

Link: 
http://lkml.kernel.org/r/20200116064531.483522-4-aneesh.ku...@linux.ibm.com
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Aneesh Kumar K.V 
Reported-by: "Aneesh Kumar K.V" 
Cc:   # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: backported to 4.19 stable]
---
 include/asm-generic/tlb.h | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 19934cdd143e..427a70c56ddd 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -179,7 +179,12 @@ static inline void __tlb_reset_range(struct mmu_gather 
*tlb)
 
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
-   if (!tlb->end)
+   /*
+* Anything calling __tlb_adjust_range() also sets at least one of
+* these bits.
+*/
+   if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
+ tlb->cleared_puds || tlb->cleared_p4ds))
return;
 
tlb_flush(tlb);
-- 
2.24.1



[PATCH v3 5/6] mm/mmu_gather: invalidate TLB correctly on batch allocation failure and flush

2020-03-12 Thread Santosh Sivaraj
From: Peter Zijlstra 

commit 0ed1325967ab5f7a4549a2641c6ebe115f76e228 upstream.

Architectures for which we have hardware walkers of Linux page table
should flush TLB on mmu gather batch allocation failures and batch flush.
Some architectures like POWER supports multiple translation modes (hash
and radix) and in the case of POWER only radix translation mode needs the
above TLBI.  This is because for hash translation mode kernel wants to
avoid this extra flush since there are no hardware walkers of linux page
table.  With radix translation, the hardware also walks linux page table
and with that, kernel needs to make sure to TLB invalidate page walk cache
before page table pages are freed.

More details in commit d86564a2f085 ("mm/tlb, x86/mm: Support invalidating
TLB caches for RCU_TABLE_FREE")

The changes to sparc are to make sure we keep the old behavior since we
are now removing HAVE_RCU_TABLE_NO_INVALIDATE.  The default value for
tlb_needs_table_invalidate is to always force an invalidate and sparc can
avoid the table invalidate.  Hence we define tlb_needs_table_invalidate to
false for sparc architecture.

Link: 
http://lkml.kernel.org/r/20200116064531.483522-3-aneesh.ku...@linux.ibm.com
Fixes: a46cc7a90fd8 ("powerpc/mm/radix: Improve TLB/PWC flushes")
Signed-off-by: Peter Zijlstra (Intel) 
Cc:   # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: backported to 4.19 stable]
---
 arch/Kconfig|  3 ---
 arch/powerpc/Kconfig|  1 -
 arch/powerpc/include/asm/tlb.h  | 11 +++
 arch/sparc/Kconfig  |  1 -
 arch/sparc/include/asm/tlb_64.h |  9 +
 include/asm-generic/tlb.h   | 15 +++
 mm/memory.c | 16 
 7 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 061a12b8140e..3abbdb0cea44 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -363,9 +363,6 @@ config HAVE_ARCH_JUMP_LABEL
 config HAVE_RCU_TABLE_FREE
bool
 
-config HAVE_RCU_TABLE_NO_INVALIDATE
-   bool
-
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
bool
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1a00ce4b0040..e5bc0cfea2b1 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -216,7 +216,6 @@ config PPC
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE
-   select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if PPC64 && CPU_LITTLE_ENDIAN
select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index f0e571b2dc7c..63418275f402 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -30,6 +30,17 @@
 #define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
 
 extern void tlb_flush(struct mmu_gather *tlb);
+/*
+ * book3s:
+ * Hash does not use the linux page-tables, so we can avoid
+ * the TLB invalidate for page-table freeing, Radix otoh does use the
+ * page-tables and needs the TLBI.
+ *
+ * nohash:
+ * We still do TLB invalidate in the __pte_free_tlb routine before we
+ * add the page table pages to mmu gather table batch.
+ */
+#define tlb_needs_table_invalidate()   radix_enabled()
 
 /* Get the generic bits... */
 #include 
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index d90d632868aa..e6f2a38d2e61 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -64,7 +64,6 @@ config SPARC64
select HAVE_KRETPROBES
select HAVE_KPROBES
select HAVE_RCU_TABLE_FREE if SMP
-   select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_DYNAMIC_FTRACE
diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h
index a2f3fa61ee36..8cb8f3833239 100644
--- a/arch/sparc/include/asm/tlb_64.h
+++ b/arch/sparc/include/asm/tlb_64.h
@@ -28,6 +28,15 @@ void flush_tlb_pending(void);
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 #define tlb_flush(tlb) flush_tlb_pending()
 
+/*
+ * SPARC64's hardware TLB fill does not use the Linux page-tables
+ * and therefore we don't need a TLBI when freeing page-table pages.
+ */
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+#define tlb_needs_table_invalidate()   (false)
+#endif
+
 #include 
 
 #endif /* _SPARC64_TLB_H */
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index f2b9dc9cbaf8..19934cdd143e 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -61,8 +61,23 @@ struct mmu_table_batch {
 extern void tlb_table_flush(struct mmu_gather *tlb);
 extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 
+/*
+ * This allows an architecture that does not use the linux page-tables for
+ * hardware to skip the TLBI when freeing page tables.
+ */
+#ifndef 

[PATCH v3 4/6] powerpc/mmu_gather: enable RCU_TABLE_FREE even for !SMP case

2020-03-12 Thread Santosh Sivaraj
From: "Aneesh Kumar K.V" 

commit 12e4d53f3f04e81f9e83d6fc10edc7314ab9f6b9 upstream.

Patch series "Fixup page directory freeing", v4.

This is a repost of patch series from Peter with the arch specific changes
except ppc64 dropped.  ppc64 changes are added here because we are redoing
the patch series on top of ppc64 changes.  This makes it easy to backport
these changes.  Only the first 2 patches need to be backported to stable.

The thing is, on anything SMP, freeing page directories should observe the
exact same order as normal page freeing:

 1) unhook page/directory
 2) TLB invalidate
 3) free page/directory

Without this, any concurrent page-table walk could end up with a
Use-after-Free.  This is esp.  trivial for anything that has software
page-table walkers (HAVE_FAST_GUP / software TLB fill) or the hardware
caches partial page-walks (ie.  caches page directories).

Even on UP this might give issues since mmu_gather is preemptible these
days.  An interrupt or preempted task accessing user pages might stumble
into the free page if the hardware caches page directories.

This patch series fixes ppc64 and add generic MMU_GATHER changes to
support the conversion of other architectures.  I haven't added patches
w.r.t other architecture because they are yet to be acked.

This patch (of 9):

A followup patch is going to make sure we correctly invalidate page walk
cache before we free page table pages.  In order to keep things simple
enable RCU_TABLE_FREE even for !SMP so that we don't have to fixup the
!SMP case differently in the followup patch

!SMP case is right now broken for radix translation w.r.t page walk
cache flush.  We can get interrupted in between page table free and
that would imply we have page walk cache entries pointing to tables
which got freed already.  Michael said "both our platforms that run on
Power9 force SMP on in Kconfig, so the !SMP case is unlikely to be a
problem for anyone in practice, unless they've hacked their kernel to
build it !SMP."

Link: 
http://lkml.kernel.org/r/20200116064531.483522-2-aneesh.ku...@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V 
Cc:  # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: backported for 4.19 stable]
---
 arch/powerpc/Kconfig | 2 +-
 arch/powerpc/include/asm/book3s/32/pgalloc.h | 8 
 arch/powerpc/include/asm/book3s/64/pgalloc.h | 2 --
 arch/powerpc/include/asm/nohash/32/pgalloc.h | 8 
 arch/powerpc/include/asm/nohash/64/pgalloc.h | 9 +
 arch/powerpc/mm/pgtable-book3s64.c   | 7 ---
 6 files changed, 2 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e09cfb109b8c..1a00ce4b0040 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -215,7 +215,7 @@ config PPC
select HAVE_HARDLOCKUP_DETECTOR_PERFif PERF_EVENTS && 
HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
-   select HAVE_RCU_TABLE_FREE  if SMP
+   select HAVE_RCU_TABLE_FREE
select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if PPC64 && CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h 
b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index 82e44b1a00ae..79ba3fbb512e 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -110,7 +110,6 @@ static inline void pgtable_free(void *table, unsigned 
index_size)
 #define check_pgt_cache()  do { } while (0)
 #define get_hugepd_cache_index(x)  (x)
 
-#ifdef CONFIG_SMP
 static inline void pgtable_free_tlb(struct mmu_gather *tlb,
void *table, int shift)
 {
@@ -127,13 +126,6 @@ static inline void __tlb_remove_table(void *_table)
 
pgtable_free(table, shift);
 }
-#else
-static inline void pgtable_free_tlb(struct mmu_gather *tlb,
-   void *table, int shift)
-{
-   pgtable_free(table, shift);
-}
-#endif
 
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
  unsigned long address)
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h 
b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index f9019b579903..1013c0214213 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -47,9 +47,7 @@ extern pmd_t *pmd_fragment_alloc(struct mm_struct *, unsigned 
long);
 extern void pte_fragment_free(unsigned long *, int);
 extern void pmd_fragment_free(unsigned long *);
 extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
-#ifdef CONFIG_SMP
 extern void __tlb_remove_table(void *_table);
-#endif
 
 static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm)
 {
diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h 

[PATCH v3 3/6] asm-generic/tlb, arch: Invert CONFIG_HAVE_RCU_TABLE_INVALIDATE

2020-03-12 Thread Santosh Sivaraj
From: Peter Zijlstra 

commit 96bc9567cbe112e9320250f01b9c060c882e8619 upstream.

Make issuing a TLB invalidate for page-table pages the normal case.

The reason is twofold:

 - too many invalidates is safer than too few,
 - most architectures use the linux page-tables natively
   and would thus require this.

Make it an opt-out, instead of an opt-in.

No change in behavior intended.

Signed-off-by: Peter Zijlstra (Intel) 
Cc:  # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: prerequisite for upcoming tlbflush backports]
---
 arch/Kconfig | 2 +-
 arch/powerpc/Kconfig | 1 +
 arch/sparc/Kconfig   | 1 +
 arch/x86/Kconfig | 1 -
 mm/memory.c  | 2 +-
 5 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index a336548487e6..061a12b8140e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -363,7 +363,7 @@ config HAVE_ARCH_JUMP_LABEL
 config HAVE_RCU_TABLE_FREE
bool
 
-config HAVE_RCU_TABLE_INVALIDATE
+config HAVE_RCU_TABLE_NO_INVALIDATE
bool
 
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6f475dc5829b..e09cfb109b8c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -216,6 +216,7 @@ config PPC
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE  if SMP
+   select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if PPC64 && CPU_LITTLE_ENDIAN
select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e6f2a38d2e61..d90d632868aa 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -64,6 +64,7 @@ config SPARC64
select HAVE_KRETPROBES
select HAVE_KPROBES
select HAVE_RCU_TABLE_FREE if SMP
+   select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_DYNAMIC_FTRACE
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index af35f5caadbe..181d0d522977 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -181,7 +181,6 @@ config X86
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE  if PARAVIRT
-   select HAVE_RCU_TABLE_INVALIDATEif HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if X86_64 && 
(UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
select HAVE_STACKPROTECTOR  if CC_HAS_SANE_STACKPROTECTOR
diff --git a/mm/memory.c b/mm/memory.c
index 1832c5ed6ac0..ba5689610c04 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -327,7 +327,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct 
page *page, int page_
  */
 static inline void tlb_table_invalidate(struct mmu_gather *tlb)
 {
-#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
+#ifndef CONFIG_HAVE_RCU_TABLE_NO_INVALIDATE
/*
 * Invalidate page-table caches used by hardware walkers. Then we still
 * need to RCU-sched wait while freeing the pages because software
-- 
2.24.1



[PATCH v3 2/6] asm-generic/tlb: Track which levels of the page tables have been cleared

2020-03-12 Thread Santosh Sivaraj
From: Will Deacon 

commit a6d60245d6d9b1caf66b0d94419988c4836980af upstream

It is common for architectures with hugepage support to require only a
single TLB invalidation operation per hugepage during unmap(), rather than
iterating through the mapping at a PAGE_SIZE increment. Currently,
however, the level in the page table where the unmap() operation occurs
is not stored in the mmu_gather structure, therefore forcing
architectures to issue additional TLB invalidation operations or to give
up and over-invalidate by e.g. invalidating the entire TLB.

Ideally, we could add an interval rbtree to the mmu_gather structure,
which would allow us to associate the correct mapping granule with the
various sub-mappings within the range being invalidated. However, this
is costly in terms of book-keeping and memory management, so instead we
approximate by keeping track of the page table levels that are cleared
and provide a means to query the smallest granule required for invalidation.

Signed-off-by: Will Deacon 
Cc:  # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: prerequisite for upcoming tlbflush backports]
---
 include/asm-generic/tlb.h | 58 +--
 mm/memory.c   |  4 ++-
 2 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 97306b32d8d2..f2b9dc9cbaf8 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -114,6 +114,14 @@ struct mmu_gather {
 */
unsigned intfreed_tables : 1;
 
+   /*
+* at which levels have we cleared entries?
+*/
+   unsigned intcleared_ptes : 1;
+   unsigned intcleared_pmds : 1;
+   unsigned intcleared_puds : 1;
+   unsigned intcleared_p4ds : 1;
+
struct mmu_gather_batch *active;
struct mmu_gather_batch local;
struct page *__pages[MMU_GATHER_BUNDLE];
@@ -148,6 +156,10 @@ static inline void __tlb_reset_range(struct mmu_gather 
*tlb)
tlb->end = 0;
}
tlb->freed_tables = 0;
+   tlb->cleared_ptes = 0;
+   tlb->cleared_pmds = 0;
+   tlb->cleared_puds = 0;
+   tlb->cleared_p4ds = 0;
 }
 
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
@@ -197,6 +209,25 @@ static inline void 
tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 }
 #endif
 
+static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb)
+{
+   if (tlb->cleared_ptes)
+   return PAGE_SHIFT;
+   if (tlb->cleared_pmds)
+   return PMD_SHIFT;
+   if (tlb->cleared_puds)
+   return PUD_SHIFT;
+   if (tlb->cleared_p4ds)
+   return P4D_SHIFT;
+
+   return PAGE_SHIFT;
+}
+
+static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb)
+{
+   return 1UL << tlb_get_unmap_shift(tlb);
+}
+
 /*
  * In the case of tlb vma handling, we can optimise these away in the
  * case where we're doing a full MM flush.  When we're doing a munmap,
@@ -230,13 +261,19 @@ static inline void 
tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 #define tlb_remove_tlb_entry(tlb, ptep, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->cleared_ptes = 1;  \
__tlb_remove_tlb_entry(tlb, ptep, address); \
} while (0)
 
-#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)\
-   do { \
-   __tlb_adjust_range(tlb, address, huge_page_size(h)); \
-   __tlb_remove_tlb_entry(tlb, ptep, address);  \
+#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)   \
+   do {\
+   unsigned long _sz = huge_page_size(h);  \
+   __tlb_adjust_range(tlb, address, _sz);  \
+   if (_sz == PMD_SIZE)\
+   tlb->cleared_pmds = 1;  \
+   else if (_sz == PUD_SIZE)   \
+   tlb->cleared_puds = 1;  \
+   __tlb_remove_tlb_entry(tlb, ptep, address); \
} while (0)
 
 /**
@@ -250,6 +287,7 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address)   \
do {\
__tlb_adjust_range(tlb, address, HPAGE_PMD_SIZE);   \
+   tlb->cleared_pmds = 1;  \
__tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \
} while (0)
 
@@ -264,6 +302,7 @@ static inline void 

[PATCH v3 1/6] asm-generic/tlb: Track freeing of page-table directories in struct mmu_gather

2020-03-12 Thread Santosh Sivaraj
From: Peter Zijlstra 

commit 22a61c3c4f1379ef8b0ce0d5cb78baf3178950e2 upstream

Some architectures require different TLB invalidation instructions
depending on whether it is only the last-level of page table being
changed, or whether there are also changes to the intermediate
(directory) entries higher up the tree.

Add a new bit to the flags bitfield in struct mmu_gather so that the
architecture code can operate accordingly if it's the intermediate
levels being invalidated.

Signed-off-by: Peter Zijlstra 
Signed-off-by: Will Deacon 
Cc:  # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: prerequisite for tlbflush backports]
---
 include/asm-generic/tlb.h | 31 +++
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index b3353e21f3b3..97306b32d8d2 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -97,12 +97,22 @@ struct mmu_gather {
 #endif
unsigned long   start;
unsigned long   end;
-   /* we are in the middle of an operation to clear
-* a full mm and can make some optimizations */
-   unsigned intfullmm : 1,
-   /* we have performed an operation which
-* requires a complete flush of the tlb */
-   need_flush_all : 1;
+   /*
+* we are in the middle of an operation to clear
+* a full mm and can make some optimizations
+*/
+   unsigned intfullmm : 1;
+
+   /*
+* we have performed an operation which
+* requires a complete flush of the tlb
+*/
+   unsigned intneed_flush_all : 1;
+
+   /*
+* we have removed page directories
+*/
+   unsigned intfreed_tables : 1;
 
struct mmu_gather_batch *active;
struct mmu_gather_batch local;
@@ -137,6 +147,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
tlb->start = TASK_SIZE;
tlb->end = 0;
}
+   tlb->freed_tables = 0;
 }
 
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
@@ -278,6 +289,7 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #define pte_free_tlb(tlb, ptep, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->freed_tables = 1;  \
__pte_free_tlb(tlb, ptep, address); \
} while (0)
 #endif
@@ -285,7 +297,8 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #ifndef pmd_free_tlb
 #define pmd_free_tlb(tlb, pmdp, address)   \
do {\
-   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->freed_tables = 1;  \
__pmd_free_tlb(tlb, pmdp, address); \
} while (0)
 #endif
@@ -295,6 +308,7 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #define pud_free_tlb(tlb, pudp, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->freed_tables = 1;  \
__pud_free_tlb(tlb, pudp, address); \
} while (0)
 #endif
@@ -304,7 +318,8 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #ifndef p4d_free_tlb
 #define p4d_free_tlb(tlb, pudp, address)   \
do {\
-   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->freed_tables = 1;  \
__p4d_free_tlb(tlb, pudp, address); \
} while (0)
 #endif
-- 
2.24.1



[PATCH v3 0/6] Memory corruption may occur due to incorrent tlb flush

2020-03-12 Thread Santosh Sivaraj
The TLB flush optimisation (a46cc7a90f: powerpc/mm/radix: Improve TLB/PWC
flushes) may result in random memory corruption. Any concurrent page-table walk
could end up with a Use-after-Free. Even on UP this might give issues, since
mmu_gather is preemptible these days. An interrupt or preempted task accessing
user pages might stumble into the free page if the hardware caches page
directories.

The series is a backport of the fix sent by Peter [1].

The first three patches are dependencies for the last patch (avoid potential
double flush). If the performance impact due to double flush is considered
trivial then the first three patches and last patch may be dropped.

This is only for v4.19 stable.

[1] https://patchwork.kernel.org/cover/11284843/

--
Changelog:
v2: Send the patches with the correct format (commit sha1 upstream) for stable
v3: Fix compilation issue on ppc40x_defconfig and ppc44x_defconfig

--
Aneesh Kumar K.V (1):
  powerpc/mmu_gather: enable RCU_TABLE_FREE even for !SMP case

Peter Zijlstra (4):
  asm-generic/tlb: Track freeing of page-table directories in struct
mmu_gather
  asm-generic/tlb, arch: Invert CONFIG_HAVE_RCU_TABLE_INVALIDATE
  mm/mmu_gather: invalidate TLB correctly on batch allocation failure
and flush
  asm-generic/tlb: avoid potential double flush

Will Deacon (1):
  asm-generic/tlb: Track which levels of the page tables have been
cleared

 arch/Kconfig |   3 -
 arch/powerpc/Kconfig |   2 +-
 arch/powerpc/include/asm/book3s/32/pgalloc.h |   8 --
 arch/powerpc/include/asm/book3s/64/pgalloc.h |   2 -
 arch/powerpc/include/asm/nohash/32/pgalloc.h |   8 --
 arch/powerpc/include/asm/nohash/64/pgalloc.h |   9 +-
 arch/powerpc/include/asm/tlb.h   |  11 ++
 arch/powerpc/mm/pgtable-book3s64.c   |   7 --
 arch/sparc/include/asm/tlb_64.h  |   9 ++
 arch/x86/Kconfig |   1 -
 include/asm-generic/tlb.h| 103 ---
 mm/memory.c  |  20 ++--
 12 files changed, 123 insertions(+), 60 deletions(-)

-- 
2.24.1



Re: [PATCH 1/3] powerpc/numa: Set numa_node for all possible cpus

2020-03-12 Thread Srikar Dronamraju
* Vlastimil Babka  [2020-03-12 10:30:50]:

> On 3/12/20 9:23 AM, Sachin Sant wrote:
> >> On 12-Mar-2020, at 10:57 AM, Srikar Dronamraju  
> >> wrote:
> >> * Michal Hocko  [2020-03-11 12:57:35]:
> >>> On Wed 11-03-20 16:32:35, Srikar Dronamraju wrote:
>  To ensure a cpuless, memoryless dummy node is not online, powerpc need
>  to make sure all possible but not present cpu_to_node are set to a
>  proper node.
> >>> 
> >>> Just curious, is this somehow related to
> >>> http://lkml.kernel.org/r/20200227182650.gg3...@dhcp22.suse.cz?
> >>> 
> >> 
> >> The issue I am trying to fix is a known issue in Powerpc since many years.
> >> So this surely not a problem after a75056fc1e7c (mm/memcontrol.c: allocate
> >> shrinker_map on appropriate NUMA node"). 
> >> 
> >> I tried v5.6-rc4 + a75056fc1e7c but didnt face any issues booting the
> >> kernel. Will work with Sachin/Abdul (reporters of the issue).

I had used v1 and not v2. So my mistake.

> > I applied this 3 patch series on top of March 11 next tree (commit 
> > d44a64766795 )
> > The kernel still fails to boot with same call trace.
> 

While I am not an expert in the slub area, I looked at the patch
a75056fc1e7c and had some thoughts on why this could be causing this issue.

On the system where the crash happens, the possible number of nodes is much
greater than the number of onlined nodes. The pdgat or the NODE_DATA is only
available for onlined nodes.

With a75056fc1e7c memcg_alloc_shrinker_maps, we end up calling kzalloc_node
for all possible nodes and in ___slab_alloc we end up looking at the
node_present_pages which is NODE_DATA(nid)->node_present_pages.
i.e for a node whose pdgat struct is not allocated, we are trying to
dereference.

Also for a memoryless/cpuless node or possible but not present nodes,
node_to_mem_node(node) will still end up as node (atleast on powerpc).

I tried with this hunk below and it works.

But I am not sure if we need to check at other places were
node_present_pages is being called.

diff --git a/mm/slub.c b/mm/slub.c
index 626cbcbd977f..bddb93bed55e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2571,9 +2571,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t 
gfpflags, int node,
if (unlikely(!node_match(page, node))) {
int searchnode = node;
 
-   if (node != NUMA_NO_NODE && !node_present_pages(node))
-   searchnode = node_to_mem_node(node);
-
+   if (node != NUMA_NO_NODE) {
+   if (!node_online(node) || !node_present_pages(node)) {
+   searchnode = node_to_mem_node(node);
+   if (!node_online(searchnode))
+   searchnode = first_online_node;
+   }
+   }
if (unlikely(!node_match(page, searchnode))) {
stat(s, ALLOC_NODE_MISMATCH);
deactivate_slab(s, page, c->freelist, c);

> > 
> 

-- 
Thanks and Regards
Srikar Dronamraju



Re: [5.6.0-rc2-next-20200218/powerpc] Boot failure on POWER9

2020-03-12 Thread Michael Ellerman
Michal Hocko  writes:
> On Thu 27-02-20 19:26:54, Michal Hocko wrote:
>> [Cc ppc maintainers]
> [...]
>> Please have a look at 
>> http://lkml.kernel.org/r/52ef4673-7292-4c4c-b459-af583951b...@linux.vnet.ibm.com
>> for the boot log with the debugging patch which tracks set_numa_mem.
>> This seems to lead to a crash in the slab allocator bebcause
>> node_to_mem_node(0) for memory less node resolves to the memory less
>> node http://lkml.kernel.org/r/dd450314-d428-6776-af07-f92c04c7b...@suse.cz.
>> The original report is 
>> http://lkml.kernel.org/r/3381cd91-ab3d-4773-ba04-e7a072a63...@linux.vnet.ibm.com
>
> ping 

The obvious fix is:

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 37c12e3bab9e..33b1fca0b258 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -892,6 +892,7 @@ void smp_prepare_boot_cpu(void)
paca_ptrs[boot_cpuid]->__current = current;
 #endif
set_numa_node(numa_cpu_lookup_table[boot_cpuid]);
+   set_numa_mem(local_memory_node(numa_cpu_lookup_table[boot_cpuid]));
current_set[boot_cpuid] = current;
 }


But that doesn't work because smp_prepare_boot_cpu() is called too
early:

asmlinkage __visible void __init start_kernel(void)
{
...
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
boot_cpu_hotplug_init();

build_all_zonelists(NULL);


And local_memory_node() uses first_zones_zonelist() which doesn't work
prior to build_all_zonelists() being called.


The patch below might work. Sachin can you test this? I tried faking up
a system with a memoryless node zero but couldn't get it to even start
booting.

cheers


diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 9b4f5fb719e0..d1f11437f6c4 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -282,6 +282,9 @@ void __init mem_init(void)
 */
BUILD_BUG_ON(MMU_PAGE_COUNT > 16);
 
+   BUG_ON(smp_processor_id() != boot_cpuid);
+   set_numa_mem(local_memory_node(numa_cpu_lookup_table[boot_cpuid]));
+
 #ifdef CONFIG_SWIOTLB
/*
 * Some platforms (e.g. 85xx) limit DMA-able memory way below


Re: [PATCH v3] ima: add a new CONFIG for loading arch-specific policies

2020-03-12 Thread Michael Ellerman
Nayna Jain  writes:
> From: Nayna Jain 
>
> Every time a new architecture defines the IMA architecture specific
> functions - arch_ima_get_secureboot() and arch_ima_get_policy(), the IMA
> include file needs to be updated. To avoid this "noise", this patch
> defines a new IMA Kconfig IMA_SECURE_AND_OR_TRUSTED_BOOT option, allowing
> the different architectures to select it.
>
> Suggested-by: Linus Torvalds 
> Signed-off-by: Nayna Jain 
> Acked-by: Ard Biesheuvel 
> Cc: Ard Biesheuvel 
> Cc: Philipp Rudo 
> Cc: Michael Ellerman 
> ---
> v3:
> * Removes CONFIG_IMA dependency. Thanks Ard.
> * Updated the patch with improvements suggested by Michael. It now uses
> "imply" instead of "select". Thanks Michael.

Acked-by: Michael Ellerman  (powerpc)

cheers


Re: [RESEND PATCH v2 1/9] iomap: Constify ioreadX() iomem argument (as in generic implementation)

2020-03-12 Thread Michael Ellerman
Krzysztof Kozlowski  writes:
> diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c
> index 5ac84efc6ede..9fe4fb3b08aa 100644
> --- a/arch/powerpc/kernel/iomap.c
> +++ b/arch/powerpc/kernel/iomap.c
> @@ -15,23 +15,23 @@
>   * Here comes the ppc64 implementation of the IOMAP 
>   * interfaces.
>   */
> -unsigned int ioread8(void __iomem *addr)
> +unsigned int ioread8(const void __iomem *addr)
>  {
>   return readb(addr);
>  }
> -unsigned int ioread16(void __iomem *addr)
> +unsigned int ioread16(const void __iomem *addr)
>  {
>   return readw(addr);
>  }
> -unsigned int ioread16be(void __iomem *addr)
> +unsigned int ioread16be(const void __iomem *addr)
>  {
>   return readw_be(addr);
>  }
> -unsigned int ioread32(void __iomem *addr)
> +unsigned int ioread32(const void __iomem *addr)
>  {
>   return readl(addr);
>  }
> -unsigned int ioread32be(void __iomem *addr)
> +unsigned int ioread32be(const void __iomem *addr)
>  {
>   return readl_be(addr);
>  }
> @@ -41,27 +41,27 @@ EXPORT_SYMBOL(ioread16be);
>  EXPORT_SYMBOL(ioread32);
>  EXPORT_SYMBOL(ioread32be);
>  #ifdef __powerpc64__
> -u64 ioread64(void __iomem *addr)
> +u64 ioread64(const void __iomem *addr)
>  {
>   return readq(addr);
>  }
> -u64 ioread64_lo_hi(void __iomem *addr)
> +u64 ioread64_lo_hi(const void __iomem *addr)
>  {
>   return readq(addr);
>  }
> -u64 ioread64_hi_lo(void __iomem *addr)
> +u64 ioread64_hi_lo(const void __iomem *addr)
>  {
>   return readq(addr);
>  }
> -u64 ioread64be(void __iomem *addr)
> +u64 ioread64be(const void __iomem *addr)
>  {
>   return readq_be(addr);
>  }
> -u64 ioread64be_lo_hi(void __iomem *addr)
> +u64 ioread64be_lo_hi(const void __iomem *addr)
>  {
>   return readq_be(addr);
>  }
> -u64 ioread64be_hi_lo(void __iomem *addr)
> +u64 ioread64be_hi_lo(const void __iomem *addr)
>  {
>   return readq_be(addr);
>  }
> @@ -139,15 +139,15 @@ EXPORT_SYMBOL(iowrite64be_hi_lo);
>   * FIXME! We could make these do EEH handling if we really
>   * wanted. Not clear if we do.
>   */
> -void ioread8_rep(void __iomem *addr, void *dst, unsigned long count)
> +void ioread8_rep(const void __iomem *addr, void *dst, unsigned long count)
>  {
>   readsb(addr, dst, count);
>  }
> -void ioread16_rep(void __iomem *addr, void *dst, unsigned long count)
> +void ioread16_rep(const void __iomem *addr, void *dst, unsigned long count)
>  {
>   readsw(addr, dst, count);
>  }
> -void ioread32_rep(void __iomem *addr, void *dst, unsigned long count)
> +void ioread32_rep(const void __iomem *addr, void *dst, unsigned long count)
>  {
>   readsl(addr, dst, count);
>  }

This looks OK to me.

Acked-by: Michael Ellerman  (powerpc)

cheers


Re: [PATCH v4 6/8] perf/tools: Enhance JSON/metric infrastructure to handle "?"

2020-03-12 Thread Jiri Olsa
On Mon, Mar 09, 2020 at 11:55:50AM +0530, Kajol Jain wrote:

SNIP

> diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
> index 9377538f4097..d17664e628db 100644
> --- a/tools/perf/util/expr.h
> +++ b/tools/perf/util/expr.h
> @@ -15,6 +15,7 @@ struct parse_ctx {
>   struct parse_id ids[MAX_PARSE_ID];
>  };
>  
> +int expr__runtimeparam;
>  void expr__ctx_init(struct parse_ctx *ctx);
>  void expr__add_id(struct parse_ctx *ctx, const char *id, double val);
>  int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr);
> diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l
> index 1928f2a3dddc..ec4b00671f67 100644
> --- a/tools/perf/util/expr.l
> +++ b/tools/perf/util/expr.l
> @@ -45,6 +45,21 @@ static char *normalize(char *str)
>   *dst++ = '/';
>   else if (*str == '\\')
>   *dst++ = *++str;
> +else if (*str == '?') {
> +
> + int size = snprintf(NULL, 0, "%d", expr__runtimeparam);
> + char * paramval = (char *)malloc(size);

can't we agree that any reasonable number in here
wouldn't cross 20 bytes in string or so and use
buffer for that instead of that malloc exercise?

thanks,
jirka

> + int i = 0;
> +
> + if(!paramval)
> + *dst++ = '0';
> + else {
> + sprintf(paramval, "%d", expr__runtimeparam);
> + while(i < size)
> + *dst++ = paramval[i++];
> + free(paramval);
> + }
> + }

SNIP



Re: [PATCH v4 6/8] perf/tools: Enhance JSON/metric infrastructure to handle "?"

2020-03-12 Thread Jiri Olsa
On Mon, Mar 09, 2020 at 11:55:50AM +0530, Kajol Jain wrote:

SNIP

> +static int metricgroup__add_metric_runtime_param(struct strbuf *events,
> + struct list_head *group_list, struct pmu_event *pe)
> +{
> + int i, count;
> + int ret = -EINVAL;
> +
> + count = arch_get_runtimeparam();
> +
> + /* This loop is added to create multiple
> +  * events depend on count value and add
> +  * those events to group_list.
> +  */
> +
> + for (i = 0; i < count; i++) {
> + const char **ids;
> + int idnum;
> + struct egroup *eg;
> + char value[PATH_MAX];
> +
> + expr__runtimeparam = i;
> +
> + if (expr__find_other(pe->metric_expr,
> + NULL, &ids, &idnum) < 0)
> + return ret;
> +
> + if (events->len > 0)
> + strbuf_addf(events, ",");
> +
> + if (metricgroup__has_constraint(pe))
> + metricgroup__add_metric_non_group(events, ids, idnum);
> + else
> + metricgroup__add_metric_weak_group(events, ids, idnum);
> +
> + eg = malloc(sizeof(struct egroup));
> + if (!eg) {
> + ret = -ENOMEM;
> + return ret;
> + }
> + sprintf(value, "%s%c%d", pe->metric_name, '_', i);
> + eg->ids = ids;
> + eg->idnum = idnum;
> + eg->metric_name = strdup(value);
> + eg->metric_expr = pe->metric_expr;
> + eg->metric_unit = pe->unit;
> + list_add_tail(&eg->nd, group_list);
> + ret = 0;
> +
> + if (ret != 0)
> + break;

the inside loop is essentialy what you factor out to
metricgroup__add_metric_param right? please nove
addition of metricgroup__add_metric_param function
into separate patch

jirka


> + }
> + return ret;
> +}
> +static int metricgroup__add_metric_param(struct strbuf *events,
> + struct list_head *group_list, struct pmu_event *pe)
> +{
> +
> + const char **ids;
> + int idnum;
> + struct egroup *eg;
> + int ret = -EINVAL;
> +
> + if (expr__find_other(pe->metric_expr,
> +  NULL, &ids, &idnum) < 0)
> + return ret;
> + if (events->len > 0)
> + strbuf_addf(events, ",");
> +
> + if (metricgroup__has_constraint(pe))
> + metricgroup__add_metric_non_group(events, ids, idnum);
> + else
> + metricgroup__add_metric_weak_group(events, ids, idnum);
> +
> + eg = malloc(sizeof(struct egroup));
> + if (!eg)
> + ret = -ENOMEM;
> +
> + eg->ids = ids;
> + eg->idnum = idnum;
> + eg->metric_name = pe->metric_name;
> + eg->metric_expr = pe->metric_expr;
> + eg->metric_unit = pe->unit;
> + list_add_tail(&eg->nd, group_list);
> + ret = 0;
> +
> + return ret;
> +}

SNIP



Re: [PATCH v4 6/8] perf/tools: Enhance JSON/metric infrastructure to handle "?"

2020-03-12 Thread Jiri Olsa
On Tue, Mar 10, 2020 at 03:34:55PM -0300, Arnaldo Carvalho de Melo wrote:

SNIP

> > diff --git a/tools/perf/arch/powerpc/util/header.c 
> > b/tools/perf/arch/powerpc/util/header.c
> > index 3b4cdfc5efd6..036f6b2ce202 100644
> > --- a/tools/perf/arch/powerpc/util/header.c
> > +++ b/tools/perf/arch/powerpc/util/header.c
> > @@ -7,6 +7,11 @@
> >  #include 
> >  #include 
> >  #include "header.h"
> > +#include "metricgroup.h"
> > +#include "evlist.h"
> > +#include 
> > +#include "pmu.h"
> > +#include 
> >  
> >  #define mfspr(rn)   ({unsigned long rval; \
> >  asm volatile("mfspr %0," __stringify(rn) \
> > @@ -16,6 +21,8 @@
> >  #define PVR_VER(pvr)(((pvr) >>  16) & 0x) /* Version field */
> >  #define PVR_REV(pvr)(((pvr) >>   0) & 0x) /* Revison field */
> >  
> > +#define SOCKETS_INFO_FILE_PATH "/devices/hv_24x7/interface/"
> > +
> >  int
> >  get_cpuid(char *buffer, size_t sz)
> >  {
> > @@ -44,3 +51,18 @@ get_cpuid_str(struct perf_pmu *pmu __maybe_unused)
> >  
> > return bufp;
> >  }
> > +
> > +int arch_get_runtimeparam(void)
> > +{
> > +   int count;
> > +   char path[PATH_MAX];
> > +   char filename[] = "sockets";
> > +
> > +   snprintf(path, PATH_MAX,
> > +SOCKETS_INFO_FILE_PATH "%s", filename);

also, what's the point of using snprintf in here?

jirka



Re: [PATCH v4 6/8] perf/tools: Enhance JSON/metric infrastructure to handle "?"

2020-03-12 Thread Jiri Olsa
On Mon, Mar 09, 2020 at 11:55:50AM +0530, Kajol Jain wrote:

SNIP

> diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l
> index 1928f2a3dddc..ec4b00671f67 100644
> --- a/tools/perf/util/expr.l
> +++ b/tools/perf/util/expr.l
> @@ -45,6 +45,21 @@ static char *normalize(char *str)
>   *dst++ = '/';
>   else if (*str == '\\')
>   *dst++ = *++str;
> +else if (*str == '?') {
> +

extra line ^^^

jirka

> + int size = snprintf(NULL, 0, "%d", expr__runtimeparam);
> + char * paramval = (char *)malloc(size);
> + int i = 0;
> +
> + if(!paramval)

SNIP



Re: [PATCH v4 6/8] perf/tools: Enhance JSON/metric infrastructure to handle "?"

2020-03-12 Thread Jiri Olsa
On Mon, Mar 09, 2020 at 11:55:50AM +0530, Kajol Jain wrote:

SNIP

> diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
> index c3a8c701609a..11eeeb929b91 100644
> --- a/tools/perf/util/metricgroup.c
> +++ b/tools/perf/util/metricgroup.c
> @@ -474,6 +474,98 @@ static bool metricgroup__has_constraint(struct pmu_event 
> *pe)
>   return false;
>  }
>  
> +int __weak arch_get_runtimeparam(void)
> +{
> + return 1;
> +}
> +
> +static int metricgroup__add_metric_runtime_param(struct strbuf *events,
> + struct list_head *group_list, struct pmu_event *pe)
> +{
> + int i, count;
> + int ret = -EINVAL;
> +
> + count = arch_get_runtimeparam();
> +
> + /* This loop is added to create multiple
> +  * events depend on count value and add
> +  * those events to group_list.
> +  */
> +
> + for (i = 0; i < count; i++) {
> + const char **ids;
> + int idnum;
> + struct egroup *eg;
> + char value[PATH_MAX];
> +
> + expr__runtimeparam = i;

so the expr__runtimeparam is always set before we call the
expr parsing function - either expr__find_other or expr__parse,

and it's used inside the normalize flexer function, which has
access to the passed context.. so I don't see a reason why
expr__runtimeparam couldn't be added in struct parse_ctx
and used from there..

while in this, perhaps we should rename parse_ctx to expr_ctx,
to keep the namespace straight (in separate patch)

thanks,
jirka



Re: [RESEND PATCH v2 6/9] drm/mgag200: Constify ioreadX() iomem argument (as in generic implementation)

2020-03-12 Thread Thomas Zimmermann
Hi Krzysztof,

I just received a resend email from 3 weeks ago :/

Do you want me to merge the mgag200 patch into drm-misc-next?

Best regards
Thomas

Am 19.02.20 um 18:50 schrieb Krzysztof Kozlowski:
> The ioreadX() helpers have inconsistent interface.  On some architectures
> void *__iomem address argument is a pointer to const, on some not.
> 
> Implementations of ioreadX() do not modify the memory under the address
> so they can be converted to a "const" version for const-safety and
> consistency among architectures.
> 
> Signed-off-by: Krzysztof Kozlowski 
> Reviewed-by: Thomas Zimmermann 
> 
> ---
> 
> Changes since v1:
> 1. Add Thomas' review.
> ---
>  drivers/gpu/drm/mgag200/mgag200_drv.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/mgag200/mgag200_drv.h 
> b/drivers/gpu/drm/mgag200/mgag200_drv.h
> index aa32aad222c2..6512b3af4fb7 100644
> --- a/drivers/gpu/drm/mgag200/mgag200_drv.h
> +++ b/drivers/gpu/drm/mgag200/mgag200_drv.h
> @@ -34,9 +34,9 @@
>  
>  #define MGAG200FB_CONN_LIMIT 1
>  
> -#define RREG8(reg) ioread8(((void __iomem *)mdev->rmmio) + (reg))
> +#define RREG8(reg) ioread8(((const void __iomem *)mdev->rmmio) + (reg))
>  #define WREG8(reg, v) iowrite8(v, ((void __iomem *)mdev->rmmio) + (reg))
> -#define RREG32(reg) ioread32(((void __iomem *)mdev->rmmio) + (reg))
> +#define RREG32(reg) ioread32(((const void __iomem *)mdev->rmmio) + (reg))
>  #define WREG32(reg, v) iowrite32(v, ((void __iomem *)mdev->rmmio) + (reg))
>  
>  #define ATTR_INDEX 0x1fc0
> 

-- 
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5, 90409 Nürnberg, Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Felix Imendörffer



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 1/3] powerpc/numa: Set numa_node for all possible cpus

2020-03-12 Thread Vlastimil Babka
On 3/12/20 9:23 AM, Sachin Sant wrote:
> 
> 
>> On 12-Mar-2020, at 10:57 AM, Srikar Dronamraju  
>> wrote:
>> 
>> * Michal Hocko  [2020-03-11 12:57:35]:
>> 
>>> On Wed 11-03-20 16:32:35, Srikar Dronamraju wrote:
 A Powerpc system with multiple possible nodes and with CONFIG_NUMA
 enabled always used to have a node 0, even if node 0 does not any cpus
 or memory attached to it. As per PAPR, node affinity of a cpu is only
 available once its present / online. For all cpus that are possible but
 not present, cpu_to_node() would point to node 0.
 
 To ensure a cpuless, memoryless dummy node is not online, powerpc need
 to make sure all possible but not present cpu_to_node are set to a
 proper node.
>>> 
>>> Just curious, is this somehow related to
>>> http://lkml.kernel.org/r/20200227182650.gg3...@dhcp22.suse.cz?
>>> 
>> 
>> The issue I am trying to fix is a known issue in Powerpc since many years.
>> So this surely not a problem after a75056fc1e7c (mm/memcontrol.c: allocate
>> shrinker_map on appropriate NUMA node"). 
>> 
>> I tried v5.6-rc4 + a75056fc1e7c but didnt face any issues booting the
>> kernel. Will work with Sachin/Abdul (reporters of the issue).
>> 
> 
> I applied this 3 patch series on top of March 11 next tree (commit 
> d44a64766795 )
> The kernel still fails to boot with same call trace.

Yeah when I skimmed the patches, I don't think they address the issue where
node_to_mem_node(0) = 0 [1]. You could reapply the debug print patch to verify,
but it seems very likely. So I'm not surprised you get the same trace.

[1] 
https://lore.kernel.org/linux-next/9a86f865-50b5-7483-9257-dbb08fecd...@suse.cz/

> [6.159357] BUG: Kernel NULL pointer dereference on read at 0x73b0
> [6.159363] Faulting instruction address: 0xc03d7174
> [6.159368] Oops: Kernel access of bad area, sig: 11 [#1]
> [6.159372] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
> [6.159378] Modules linked in:
> [6.159382] CPU: 17 PID: 1 Comm: systemd Not tainted 
> 5.6.0-rc5-next-20200311-autotest+ #1
> [6.159388] NIP:  c03d7174 LR: c03d7714 CTR: 
> c0400e70
> [6.159393] REGS: c008b36836d0 TRAP: 0300   Not tainted  
> (5.6.0-rc5-next-20200311-autotest+)
> [6.159398] MSR:  80009033   CR: 24004848  
> XER: 
> [6.159406] CFAR: c000dec4 DAR: 73b0 DSISR: 4000 
> IRQMASK: 1
> [6.159406] GPR00: c03d7714 c008b3683960 c155e300 
> c008b301f500
> [6.159406] GPR04: 0dc0  c03456f8 
> c008bb198620
> [6.159406] GPR08: 0008ba0f 0001  
> 
> [6.159406] GPR12: 24004848 c0001ec55e00  
> 
> [6.159406] GPR16: c008b0a82048 c1595898 c1750ca8 
> 0002
> [6.159406] GPR20: c1750cb8 c1624478 000fffe0 
> 5deadbeef122
> [6.159406] GPR24: 0001 0dc0  
> c03456f8
> [6.159406] GPR28: c008b301f500 c008bb198620  
> c00c02285a40
> [6.159453] NIP [c03d7174] ___slab_alloc+0x1f4/0x760
> [6.159458] LR [c03d7714] __slab_alloc+0x34/0x60
> [6.159462] Call Trace:
> [6.159465] [c008b3683a40] [c008b3683a70] 0xc008b3683a70
> [6.159471] [c008b3683a70] [c03d8b20] 
> __kmalloc_node+0x110/0x490
> [6.159477] [c008b3683af0] [c03456f8] kvmalloc_node+0x58/0x110
> [6.159483] [c008b3683b30] [c0400f78] 
> mem_cgroup_css_online+0x108/0x270
> [6.159489] [c008b3683b90] [c0236ed8] online_css+0x48/0xd0
> [6.159494] [c008b3683bc0] [c023ffac] 
> cgroup_apply_control_enable+0x2ec/0x4d0
> [6.159501] [c008b3683ca0] [c02437c8] cgroup_mkdir+0x228/0x5f0
> [6.159506] [c008b3683d10] [c0521780] 
> kernfs_iop_mkdir+0x90/0xf0
> [6.159512] [c008b3683d50] [c043f670] vfs_mkdir+0x110/0x230
> [6.159517] [c008b3683da0] [c0443150] do_mkdirat+0xb0/0x1a0
> [6.159523] [c008b3683e20] [c000b278] system_call+0x5c/0x68
> [6.159527] Instruction dump:
> [6.159531] 7c421378 e95f 714a0001 4082fff0 4b64 6000 6000 
> faa10088
> [6.159538] 3ea2000c 3ab56178 7b4a1f24 7d55502a  2faa 
> 409e0394 3d02002a
> [6.159545] ---[ end trace 36d65cb66091a5b6 ]—
> 
> Boot log attached.
> 
> Thanks
> -Sachin
> 



Re: [PATCH 1/3] powerpc/numa: Set numa_node for all possible cpus

2020-03-12 Thread Sachin Sant


> On 12-Mar-2020, at 10:57 AM, Srikar Dronamraju  
> wrote:
> 
> * Michal Hocko  [2020-03-11 12:57:35]:
> 
>> On Wed 11-03-20 16:32:35, Srikar Dronamraju wrote:
>>> A Powerpc system with multiple possible nodes and with CONFIG_NUMA
>>> enabled always used to have a node 0, even if node 0 does not any cpus
>>> or memory attached to it. As per PAPR, node affinity of a cpu is only
>>> available once its present / online. For all cpus that are possible but
>>> not present, cpu_to_node() would point to node 0.
>>> 
>>> To ensure a cpuless, memoryless dummy node is not online, powerpc need
>>> to make sure all possible but not present cpu_to_node are set to a
>>> proper node.
>> 
>> Just curious, is this somehow related to
>> http://lkml.kernel.org/r/20200227182650.gg3...@dhcp22.suse.cz?
>> 
> 
> The issue I am trying to fix is a known issue in Powerpc since many years.
> So this surely not a problem after a75056fc1e7c (mm/memcontrol.c: allocate
> shrinker_map on appropriate NUMA node"). 
> 
> I tried v5.6-rc4 + a75056fc1e7c but didnt face any issues booting the
> kernel. Will work with Sachin/Abdul (reporters of the issue).
> 

I applied this 3 patch series on top of March 11 next tree (commit d44a64766795 
)
The kernel still fails to boot with same call trace.

[6.159357] BUG: Kernel NULL pointer dereference on read at 0x73b0
[6.159363] Faulting instruction address: 0xc03d7174
[6.159368] Oops: Kernel access of bad area, sig: 11 [#1]
[6.159372] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
[6.159378] Modules linked in:
[6.159382] CPU: 17 PID: 1 Comm: systemd Not tainted 
5.6.0-rc5-next-20200311-autotest+ #1
[6.159388] NIP:  c03d7174 LR: c03d7714 CTR: c0400e70
[6.159393] REGS: c008b36836d0 TRAP: 0300   Not tainted  
(5.6.0-rc5-next-20200311-autotest+)
[6.159398] MSR:  80009033   CR: 24004848  
XER: 
[6.159406] CFAR: c000dec4 DAR: 73b0 DSISR: 4000 
IRQMASK: 1
[6.159406] GPR00: c03d7714 c008b3683960 c155e300 
c008b301f500
[6.159406] GPR04: 0dc0  c03456f8 
c008bb198620
[6.159406] GPR08: 0008ba0f 0001  

[6.159406] GPR12: 24004848 c0001ec55e00  

[6.159406] GPR16: c008b0a82048 c1595898 c1750ca8 
0002
[6.159406] GPR20: c1750cb8 c1624478 000fffe0 
5deadbeef122
[6.159406] GPR24: 0001 0dc0  
c03456f8
[6.159406] GPR28: c008b301f500 c008bb198620  
c00c02285a40
[6.159453] NIP [c03d7174] ___slab_alloc+0x1f4/0x760
[6.159458] LR [c03d7714] __slab_alloc+0x34/0x60
[6.159462] Call Trace:
[6.159465] [c008b3683a40] [c008b3683a70] 0xc008b3683a70
[6.159471] [c008b3683a70] [c03d8b20] __kmalloc_node+0x110/0x490
[6.159477] [c008b3683af0] [c03456f8] kvmalloc_node+0x58/0x110
[6.159483] [c008b3683b30] [c0400f78] 
mem_cgroup_css_online+0x108/0x270
[6.159489] [c008b3683b90] [c0236ed8] online_css+0x48/0xd0
[6.159494] [c008b3683bc0] [c023ffac] 
cgroup_apply_control_enable+0x2ec/0x4d0
[6.159501] [c008b3683ca0] [c02437c8] cgroup_mkdir+0x228/0x5f0
[6.159506] [c008b3683d10] [c0521780] kernfs_iop_mkdir+0x90/0xf0
[6.159512] [c008b3683d50] [c043f670] vfs_mkdir+0x110/0x230
[6.159517] [c008b3683da0] [c0443150] do_mkdirat+0xb0/0x1a0
[6.159523] [c008b3683e20] [c000b278] system_call+0x5c/0x68
[6.159527] Instruction dump:
[6.159531] 7c421378 e95f 714a0001 4082fff0 4b64 6000 6000 
faa10088
[6.159538] 3ea2000c 3ab56178 7b4a1f24 7d55502a  2faa 409e0394 
3d02002a
[6.159545] ---[ end trace 36d65cb66091a5b6 ]—

Boot log attached.

Thanks
-Sachin

memory-less-node-boot.log
Description: Binary data


Re: [RESEND PATCH v2 8/9] media: fsl-viu: Constify ioreadX() iomem argument (as in generic implementation)

2020-03-12 Thread Hans Verkuil
On 2/19/20 6:50 PM, Krzysztof Kozlowski wrote:
> The ioreadX() helpers have inconsistent interface.  On some architectures
> void *__iomem address argument is a pointer to const, on some not.
> 
> Implementations of ioreadX() do not modify the memory under the address
> so they can be converted to a "const" version for const-safety and
> consistency among architectures.
> 
> Signed-off-by: Krzysztof Kozlowski 

Acked-by: Hans Verkuil 

Regards,

Hans

> ---
>  drivers/media/platform/fsl-viu.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/media/platform/fsl-viu.c 
> b/drivers/media/platform/fsl-viu.c
> index 81a8faedbba6..991d9dc82749 100644
> --- a/drivers/media/platform/fsl-viu.c
> +++ b/drivers/media/platform/fsl-viu.c
> @@ -34,7 +34,7 @@
>  /* Allow building this driver with COMPILE_TEST */
>  #if !defined(CONFIG_PPC) && !defined(CONFIG_MICROBLAZE)
>  #define out_be32(v, a)   iowrite32be(a, (void __iomem *)v)
> -#define in_be32(a)   ioread32be((void __iomem *)a)
> +#define in_be32(a)   ioread32be((const void __iomem *)a)
>  #endif
>  
>  #define BUFFER_TIMEOUT   msecs_to_jiffies(500)  /* 0.5 seconds */
> 



[PATCH kernel] powerpc/prom_init: Pass the "os-term" message to hypervisor

2020-03-12 Thread Alexey Kardashevskiy
The "os-term" RTAS calls has one argument with a message address of
OS termination cause. rtas_os_term() already passes it but the recently
added prom_init's version of that missed it; it also does not fill args
correctly.

This passes the message address and initializes the number of arguments.

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/kernel/prom_init.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 577345382b23..673f13b87db1 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1773,6 +1773,9 @@ static void __init prom_rtas_os_term(char *str)
if (token == 0)
prom_panic("Could not get token for ibm,os-term\n");
os_term_args.token = cpu_to_be32(token);
+   os_term_args.nargs = cpu_to_be32(1);
+   os_term_args.nret = cpu_to_be32(1);
+   os_term_args.args[0] = cpu_to_be32(__pa(str));
prom_rtas_hcall((uint64_t)&os_term_args);
 }
 #endif /* CONFIG_PPC_SVM */
-- 
2.17.1



[PATCH v4] powerpc: Replace setup_irq() by request_irq()

2020-03-12 Thread afzal mohammed
request_irq() is preferred over setup_irq(). Invocations of setup_irq()
occur after memory allocators are ready.

Per tglx[1], setup_irq() existed in olden days when allocators were not
ready by the time early interrupts were initialized.

Hence replace setup_irq() by request_irq().

[1] https://lkml.kernel.org/r/alpine.DEB.2.20.1710191609480.1971@nanos

Signed-off-by: afzal mohammed 
---

v4:
 * pass non-NULL dev_id while requesting shared irq in mpc85xx_cds.c, as
request_irq() can fail due to sanity check, which is not done in
setup_irq()
v3:
 * Split out from tree wide series, as Thomas suggested to get it through
respective maintainers
 * Modify pr_err displayed in case of error
 * Re-arrange code & choose pr_err args as required to improve readability
 * Remove irrelevant parts from commit message & improve
 
v2:
 * Replace pr_err("request_irq() on %s failed" by
   pr_err("%s: request_irq() failed"
 * Commit message massage

 arch/powerpc/platforms/85xx/mpc85xx_cds.c | 11 -
 arch/powerpc/platforms/8xx/cpm1.c |  9 ++-
 arch/powerpc/platforms/8xx/m8xx_setup.c   |  9 ++-
 arch/powerpc/platforms/chrp/setup.c   | 14 ---
 arch/powerpc/platforms/powermac/pic.c | 29 +--
 arch/powerpc/platforms/powermac/smp.c | 12 --
 6 files changed, 29 insertions(+), 55 deletions(-)

diff --git a/arch/powerpc/platforms/85xx/mpc85xx_cds.c 
b/arch/powerpc/platforms/85xx/mpc85xx_cds.c
index 6b1436abe9b1..915ab6710b93 100644
--- a/arch/powerpc/platforms/85xx/mpc85xx_cds.c
+++ b/arch/powerpc/platforms/85xx/mpc85xx_cds.c
@@ -218,12 +218,6 @@ static irqreturn_t mpc85xx_8259_cascade_action(int irq, 
void *dev_id)
 {
return IRQ_HANDLED;
 }
-
-static struct irqaction mpc85xxcds_8259_irqaction = {
-   .handler = mpc85xx_8259_cascade_action,
-   .flags = IRQF_SHARED | IRQF_NO_THREAD,
-   .name = "8259 cascade",
-};
 #endif /* PPC_I8259 */
 #endif /* CONFIG_PCI */
 
@@ -271,7 +265,10 @@ static int mpc85xx_cds_8259_attach(void)
 *  disabled when the last user of the shared IRQ line frees their
 *  interrupt.
 */
-   if ((ret = setup_irq(cascade_irq, _8259_irqaction))) {
+   ret = request_irq(cascade_irq, mpc85xx_8259_cascade_action,
+ IRQF_SHARED | IRQF_NO_THREAD, "8259 cascade",
+ cascade_node);
+   if (ret) {
printk(KERN_ERR "Failed to setup cascade interrupt\n");
return ret;
}
diff --git a/arch/powerpc/platforms/8xx/cpm1.c 
b/arch/powerpc/platforms/8xx/cpm1.c
index a43ee7d1ff85..4db4ca2e1222 100644
--- a/arch/powerpc/platforms/8xx/cpm1.c
+++ b/arch/powerpc/platforms/8xx/cpm1.c
@@ -120,12 +120,6 @@ static irqreturn_t cpm_error_interrupt(int irq, void *dev)
return IRQ_HANDLED;
 }
 
-static struct irqaction cpm_error_irqaction = {
-   .handler = cpm_error_interrupt,
-   .flags = IRQF_NO_THREAD,
-   .name = "error",
-};
-
 static const struct irq_domain_ops cpm_pic_host_ops = {
.map = cpm_pic_host_map,
 };
@@ -187,7 +181,8 @@ unsigned int __init cpm_pic_init(void)
if (!eirq)
goto end;
 
-   if (setup_irq(eirq, _error_irqaction))
+   if (request_irq(eirq, cpm_error_interrupt, IRQF_NO_THREAD, "error",
+   NULL))
printk(KERN_ERR "Could not allocate CPM error IRQ!");
 
setbits32(_reg->cpic_cicr, CICR_IEN);
diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c 
b/arch/powerpc/platforms/8xx/m8xx_setup.c
index f1c805c8adbc..df4d57d07f9a 100644
--- a/arch/powerpc/platforms/8xx/m8xx_setup.c
+++ b/arch/powerpc/platforms/8xx/m8xx_setup.c
@@ -39,12 +39,6 @@ static irqreturn_t timebase_interrupt(int irq, void *dev)
return IRQ_HANDLED;
 }
 
-static struct irqaction tbint_irqaction = {
-   .handler = timebase_interrupt,
-   .flags = IRQF_NO_THREAD,
-   .name = "tbint",
-};
-
 /* per-board overridable init_internal_rtc() function. */
 void __init __attribute__ ((weak))
 init_internal_rtc(void)
@@ -157,7 +151,8 @@ void __init mpc8xx_calibrate_decr(void)
(TBSCR_TBF | TBSCR_TBE));
immr_unmap(sys_tmr2);
 
-   if (setup_irq(virq, _irqaction))
+   if (request_irq(virq, timebase_interrupt, IRQF_NO_THREAD, "tbint",
+   NULL))
panic("Could not allocate timer IRQ!");
 }
 
diff --git a/arch/powerpc/platforms/chrp/setup.c 
b/arch/powerpc/platforms/chrp/setup.c
index fcf6f2342ef4..8328cd5817b0 100644
--- a/arch/powerpc/platforms/chrp/setup.c
+++ b/arch/powerpc/platforms/chrp/setup.c
@@ -451,13 +451,6 @@ static void __init chrp_find_openpic(void)
of_node_put(np);
 }
 
-#if defined(CONFIG_VT) && defined(CONFIG_INPUT_ADBHID) && defined(CONFIG_XMON)
-static struct irqaction xmon_irqaction = {
-   .handler = xmon_irq,
-   .name = "XMON break",
-};
-#endif
-
 static void __init chrp_find_8259(void)
 {
 

Re: [PATCH v4 6/8] perf/tools: Enhance JSON/metric infrastructure to handle "?"

2020-03-12 Thread kajoljain



On 3/11/20 12:04 AM, Arnaldo Carvalho de Melo wrote:
> Em Mon, Mar 09, 2020 at 11:55:50AM +0530, Kajol Jain escreveu:
>> Patch enhances current metric infrastructure to handle "?" in the metric
>> expression. The "?" can be used for parameters whose value is not known
>> while creating metric events, and which can be replaced later at runtime
>> with the proper value. It also adds flexibility to create multiple events
>> out of a single metric event added in the json file.
>>
>> Patch adds a function 'arch_get_runtimeparam', which is an arch-specific
>> function that returns the count of metric events that need to be created.
>> By default it returns 1.
>>
>> One loop is added in the function 'metricgroup__add_metric', which creates
>> multiple events at run time depending on the return value of
>> 'arch_get_runtimeparam', and merges those events into 'group_list'.
>>
>> This infrastructure is needed for hv_24x7 socket/chip level events.
>> "hv_24x7" chip level events need the specific chip-id for which the
>> data is requested. The function 'arch_get_runtimeparam' is implemented
>> in header.c, and extracts the number of sockets from the sysfs file
>> "sockets" under "/sys/devices/hv_24x7/interface/".
>>
>> Signed-off-by: Kajol Jain 
>> ---
>>  tools/perf/arch/powerpc/util/header.c |  22 +
>>  tools/perf/util/expr.h|   1 +
>>  tools/perf/util/expr.l|  19 +++-
>>  tools/perf/util/metricgroup.c | 124 --
>>  tools/perf/util/metricgroup.h |   1 +
>>  tools/perf/util/stat-shadow.c |   8 ++
>>  6 files changed, 148 insertions(+), 27 deletions(-)
>>
>> diff --git a/tools/perf/arch/powerpc/util/header.c 
>> b/tools/perf/arch/powerpc/util/header.c
>> index 3b4cdfc5efd6..036f6b2ce202 100644
>> --- a/tools/perf/arch/powerpc/util/header.c
>> +++ b/tools/perf/arch/powerpc/util/header.c
>> @@ -7,6 +7,11 @@
>>  #include 
>>  #include 
>>  #include "header.h"
>> +#include "metricgroup.h"
>> +#include "evlist.h"
>> +#include 
>> +#include "pmu.h"
>> +#include 
>>  
>>  #define mfspr(rn)   ({unsigned long rval; \
>>   asm volatile("mfspr %0," __stringify(rn) \
>> @@ -16,6 +21,8 @@
>>  #define PVR_VER(pvr)(((pvr) >>  16) & 0x) /* Version field */
>>  #define PVR_REV(pvr)(((pvr) >>   0) & 0x) /* Revison field */
>>  
>> +#define SOCKETS_INFO_FILE_PATH "/devices/hv_24x7/interface/"
>> +
>>  int
>>  get_cpuid(char *buffer, size_t sz)
>>  {
>> @@ -44,3 +51,18 @@ get_cpuid_str(struct perf_pmu *pmu __maybe_unused)
>>  
>>  return bufp;
>>  }
>> +
>> +int arch_get_runtimeparam(void)
>> +{
>> +int count;
>> +char path[PATH_MAX];
>> +char filename[] = "sockets";
>> +
>> +snprintf(path, PATH_MAX,
>> + SOCKETS_INFO_FILE_PATH "%s", filename);
>> +
>> +if (sysfs__read_ull(path, (unsigned long long *)) < 0)
>> +return 1;
>> +else
>> +return count;
> 
> Why this cast dance? We have sysfs__read_int(path, ).
> 
> Also this is more compact:
> 
>   return sysfs__read_int(path, ) < 0 ? 1 : count;

Hi Arnaldo,
 Yes, it will be better to use it like that.
> 
>> +}
>> diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
>> index 9377538f4097..d17664e628db 100644
>> --- a/tools/perf/util/expr.h
>> +++ b/tools/perf/util/expr.h
>> @@ -15,6 +15,7 @@ struct parse_ctx {
>>  struct parse_id ids[MAX_PARSE_ID];
>>  };
>>  
>> +int expr__runtimeparam;
>>  void expr__ctx_init(struct parse_ctx *ctx);
>>  void expr__add_id(struct parse_ctx *ctx, const char *id, double val);
>>  int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr);
>> diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l
>> index 1928f2a3dddc..ec4b00671f67 100644
>> --- a/tools/perf/util/expr.l
>> +++ b/tools/perf/util/expr.l
>> @@ -45,6 +45,21 @@ static char *normalize(char *str)
>>  *dst++ = '/';
>>  else if (*str == '\\')
>>  *dst++ = *++str;
>> +else if (*str == '?') {
>> +
>> +int size = snprintf(NULL, 0, "%d", expr__runtimeparam);
> 
> TIL that C99 allows for a NULL str to format and return the number of
> bytes it would write if the string was large enough... wonders never
> cease :-)
> 
>> +char * paramval = (char *)malloc(size);
> 
> No need for the cast, malloc returns void *, or has that changed? 8-)
> and please no space before the variable name.
> 

OK, will take care of that.

> Humm this is all complicated, why not use asprintf and have something
> like:
> 
>> +int i = 0;
>> +
>> +if(!paramval)
>> +*dst++ = '0';
>> +else {
>> +sprintf(paramval, "%d", expr__runtimeparam);
>> +while(i < size)
>> +*dst++ = paramval[i++];
>> +free(paramval);
>> +}
> 
>   char *paramval;
>