[PATCH] powerpc/mm: Remove redundant initmem information from log

2017-04-06 Thread Anshuman Khandual
The generic core VM already prints this information in the log
buffer, so there is no need for a second print. This just
removes the second print from the arch powerpc NUMA init path.

Before the patch:

$dmesg | grep "Initmem"

numa: Initmem setup node 0 [mem 0x-0x]
numa: Initmem setup node 1 [mem 0x1-0x1]
numa: Initmem setup node 2 [mem 0x2-0x2]
numa: Initmem setup node 3 [mem 0x3-0x3]
numa: Initmem setup node 4 [mem 0x4-0x4]
numa: Initmem setup node 5 [mem 0x5-0x5]
numa: Initmem setup node 6 [mem 0x6-0x6]
numa: Initmem setup node 7 [mem 0x7-0x7]
Initmem setup node 0 [mem 0x-0x]
Initmem setup node 1 [mem 0x0001-0x0001]
Initmem setup node 2 [mem 0x0002-0x0002]
Initmem setup node 3 [mem 0x0003-0x0003]
Initmem setup node 4 [mem 0x0004-0x0004]
Initmem setup node 5 [mem 0x0005-0x0005]
Initmem setup node 6 [mem 0x0006-0x0006]
Initmem setup node 7 [mem 0x0007-0x0007]

After the patch:

$dmesg | grep "Initmem"

Initmem setup node 0 [mem 0x-0x]
Initmem setup node 1 [mem 0x0001-0x0001]
Initmem setup node 2 [mem 0x0002-0x0002]
Initmem setup node 3 [mem 0x0003-0x0003]
Initmem setup node 4 [mem 0x0004-0x0004]
Initmem setup node 5 [mem 0x0005-0x0005]
Initmem setup node 6 [mem 0x0006-0x0006]
Initmem setup node 7 [mem 0x0007-0x0007]

Signed-off-by: Anshuman Khandual 
---
The generic core VM prints this information inside the free_area_init_node()
function, but only when CONFIG_HAVE_MEMBLOCK_NODE_MAP is enabled.
So if there are other PPC platforms which don't enable that config,
we can put the code section inside the applicable platform configs
instead of removing it completely.
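
For illustration only, a minimal sketch of that alternative, assuming we keep
the existing print but guard it for configs without
CONFIG_HAVE_MEMBLOCK_NODE_MAP (the body is simply the code removed below):

#ifndef CONFIG_HAVE_MEMBLOCK_NODE_MAP
	if (spanned_pages)
		pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
			nid, start_pfn << PAGE_SHIFT,
			(end_pfn << PAGE_SHIFT) - 1);
	else
		pr_info("Initmem setup node %d\n", nid);
#endif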

 arch/powerpc/mm/numa.c | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 9befaee..371792e 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -875,13 +875,6 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
void *nd;
int tnid;
 
-   if (spanned_pages)
-   pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
-   nid, start_pfn << PAGE_SHIFT,
-   (end_pfn << PAGE_SHIFT) - 1);
-   else
-   pr_info("Initmem setup node %d\n", nid);
-
nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
nd = __va(nd_pa);
 
-- 
1.8.3.1



Re: [PATCH V3 7/7] cxl: Add psl9 specific code

2017-04-06 Thread christophe lombard

Le 03/04/2017 à 15:05, Frederic Barrat a écrit :



Le 28/03/2017 à 17:14, Christophe Lombard a écrit :

The new Coherent Accelerator Interface Architecture, level 2, for the
IBM POWER9 brings new content and features:
- POWER9 Service Layer
- Registers
- Radix mode
- Process element entry
- Dedicated-Shared Process Programming Model
- Translation Fault Handling
- CAPP
- Memory Context ID
If a valid mm_struct is found, the memory context ID is used for each
transaction associated with the process handle. The PSL uses the
context ID to find the corresponding process element.

Signed-off-by: Christophe Lombard 
---
 drivers/misc/cxl/context.c |  16 ++-
 drivers/misc/cxl/cxl.h | 137 +
 drivers/misc/cxl/debugfs.c |  19 
 drivers/misc/cxl/fault.c   |  78 --
 drivers/misc/cxl/guest.c   |   8 +-
 drivers/misc/cxl/irq.c |  53 ++
 drivers/misc/cxl/native.c  | 247 
-
 drivers/misc/cxl/pci.c | 246 
+---

 drivers/misc/cxl/trace.h   |  43 
 9 files changed, 752 insertions(+), 95 deletions(-)

diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index ac2531e..45363be 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -188,12 +188,24 @@ int cxl_context_iomap(struct cxl_context *ctx, 
struct vm_area_struct *vma)

 if (ctx->afu->current_mode == CXL_MODE_DEDICATED) {
 if (start + len > ctx->afu->adapter->ps_size)
 return -EINVAL;
+
+if (cxl_is_psl9(ctx->afu)) {
+/* make sure there is a valid problem state
+ * area space for this AFU
+ */
+if (ctx->master && !ctx->afu->psa) {
+pr_devel("AFU doesn't support mmio space\n");
+return -EINVAL;
+}
+
+/* Can't mmap until the AFU is enabled */
+if (!ctx->afu->enabled)
+return -EBUSY;
+}
 } else {
 if (start + len > ctx->psn_size)
 return -EINVAL;
-}

-if (ctx->afu->current_mode != CXL_MODE_DEDICATED) {
 /* make sure there is a valid per process space for this AFU */
 if ((ctx->master && !ctx->afu->psa) || (!ctx->afu->pp_psa)) {
 pr_devel("AFU doesn't support mmio space\n");
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 2f2d9e4..aac0504 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -63,7 +63,7 @@ typedef struct {
 /* Memory maps. Ref CXL Appendix A */

 /* PSL Privilege 1 Memory Map */
-/* Configuration and Control area */
+/* Configuration and Control area - CAIA 1&2 */
 static const cxl_p1_reg_t CXL_PSL_CtxTime = {0x};
 static const cxl_p1_reg_t CXL_PSL_ErrIVTE = {0x0008};
 static const cxl_p1_reg_t CXL_PSL_KEY1= {0x0010};
@@ -98,11 +98,29 @@ static const cxl_p1_reg_t CXL_XSL_Timebase = 
{0x0100};

 static const cxl_p1_reg_t CXL_XSL_TB_CTLSTAT = {0x0108};
 static const cxl_p1_reg_t CXL_XSL_FEC   = {0x0158};
 static const cxl_p1_reg_t CXL_XSL_DSNCTL= {0x0168};
+/* PSL registers - CAIA 2 */
+static const cxl_p1_reg_t CXL_PSL9_CONTROL  = {0x0020};
+static const cxl_p1_reg_t CXL_XSL9_DSNCTL   = {0x0168};
+static const cxl_p1_reg_t CXL_PSL9_FIR1 = {0x0300};
+static const cxl_p1_reg_t CXL_PSL9_FIR2 = {0x0308};
+static const cxl_p1_reg_t CXL_PSL9_Timebase = {0x0310};
+static const cxl_p1_reg_t CXL_PSL9_DEBUG= {0x0320};
+static const cxl_p1_reg_t CXL_PSL9_FIR_CNTL = {0x0348};
+static const cxl_p1_reg_t CXL_PSL9_DSNDCTL  = {0x0350};
+static const cxl_p1_reg_t CXL_PSL9_TB_CTLSTAT = {0x0340};
+static const cxl_p1_reg_t CXL_PSL9_TRACECFG = {0x0368};
+static const cxl_p1_reg_t CXL_PSL9_APCDEDALLOC = {0x0378};
+static const cxl_p1_reg_t CXL_PSL9_APCDEDTYPE = {0x0380};
+static const cxl_p1_reg_t CXL_PSL9_TNR_ADDR = {0x0388};
+static const cxl_p1_reg_t CXL_PSL9_GP_CT = {0x0398};
+static const cxl_p1_reg_t CXL_XSL9_IERAT = {0x0588};
+static const cxl_p1_reg_t CXL_XSL9_ILPP  = {0x0590};
+
 /* 0x7F00:7FFF Reserved PCIe MSI-X Pending Bit Array area */
 /* 0x8000: Reserved PCIe MSI-X Table Area */

 /* PSL Slice Privilege 1 Memory Map */
-/* Configuration Area */
+/* Configuration Area - CAIA 1&2 */
 static const cxl_p1n_reg_t CXL_PSL_SR_An  = {0x00};
 static const cxl_p1n_reg_t CXL_PSL_LPID_An= {0x08};
 static const cxl_p1n_reg_t CXL_PSL_AMBAR_An   = {0x10};
@@ -111,17 +129,18 @@ static const cxl_p1n_reg_t 
CXL_PSL_ID_An  = {0x20};

 static const cxl_p1n_reg_t CXL_PSL_SERR_An= {0x28};
 /* Memory Management and Lookaside Buffer Management - CAIA 1*/
 static const cxl_p1n_reg_t CXL_PSL_SDR_An = {0x30};
+/* Memory Management and Lookaside Buffer Management - CAIA 1&2 */
 static const cxl_p1n_reg_t CXL_PSL_AMOR_An= {0x38};
-/* Pointer Area */
+/* Pointer Area - CAIA 1&2 */
 static const cxl_p1n_reg_t CXL_HAURP_An   = {0x80};
 static const cxl

Re: [PATCH 2/3] of/fdt: introduce of_scan_flat_dt_subnodes and of_get_flat_dt_phandle

2017-04-06 Thread Michael Ellerman
Rob Herring  writes:

> On Wed, Apr 5, 2017 at 9:32 AM, Nicholas Piggin  wrote:
>> On Wed, 5 Apr 2017 08:35:06 -0500
>> Rob Herring  wrote:
>>
>>> On Wed, Apr 5, 2017 at 7:37 AM, Nicholas Piggin  wrote:
>>> > Introduce primitives for FDT parsing. These will be used for powerpc
>>> > cpufeatures node scanning, which has quite complex structure but should
>>> > be processed early.
>>>
>>> Have you looked at unflattening the FDT earlier?
>>
>> Hi, thanks for taking a look. Did you mean to trim the cc list?
>
> Ugg, no. I've added everyone back.
>
>> It may be possible but I'd like to avoid it if we can. There might
>> turn out to be some errata or feature that requires early setup. And
>> the current cpu feature parsing code does it with flat dt.
>
> Well, I'd like to avoid expanding usage of flat DT parsing in the
> kernel. But you could just put this function into arch/powerpc and I'd
> never see it, but I like that even less. Mainly, I just wanted to
> raise the point.
>
> Your argument works until you need that setup in assembly code, then
> you are in the situation that you need to either handle the setup in
> bootloader/firmware or have an simple way to determine that condition.

Where Nick does this FDT parsing is literally the first C code that runs
in the kernel. The MMU is off, we don't even know how much memory we
have, or where it is.

So it's basically impossible to do the unflattening earlier. What we
could do is move the CPU feature setup *later*.

As Ben & Nick said that would be a pretty intrusive change, and
definitely not something we want to do now.

What we could probably do is split some of the flat tree parsing, so
that we discover memory, even just some of it, then unflatten, and then
do more setup using the unflattened tree.

But that will require a lot of delicate work. So we'd definitely like to
do that separately from this series.

cheers


Re: [PATCH v8 2/3] powerpc/book3s: EXPORT_SYMBOL machine_check_print_event_info

2017-04-06 Thread Michael Ellerman
Mahesh J Salgaonkar  writes:

> From: Mahesh Salgaonkar 
>
> It will be used in arch/powerpc/kvm/book3s_hv.c KVM module.
>
> Signed-off-by: Mahesh Salgaonkar 
> ---
>  arch/powerpc/kernel/mce.c |1 +
>  1 file changed, 1 insertion(+)

Thanks.

Acked-by: Michael Ellerman 

cheers


Re: [v2 5/5] mm: teach platforms not to zero struct pages memory

2017-04-06 Thread Aneesh Kumar K.V
Heiko Carstens  writes:

> On Fri, Mar 24, 2017 at 03:19:52PM -0400, Pavel Tatashin wrote:
>> If we are using deferred struct page initialization feature, most of
>> "struct page"es are getting initialized after other CPUs are started, and
>> hence we are benefiting from doing this job in parallel. However, we are
>> still zeroing all the memory that is allocated for "struct pages" using the
>> boot CPU.  This patch solves this problem, by deferring zeroing "struct
>> pages" to only when they are initialized.
>> 
>> Signed-off-by: Pavel Tatashin 
>> Reviewed-by: Shannon Nelson 
>> ---
>>  arch/powerpc/mm/init_64.c |2 +-
>>  arch/s390/mm/vmem.c   |2 +-
>>  arch/sparc/mm/init_64.c   |2 +-
>>  arch/x86/mm/init_64.c |2 +-
>>  4 files changed, 4 insertions(+), 4 deletions(-)
>> 
>> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
>> index eb4c270..24faf2d 100644
>> --- a/arch/powerpc/mm/init_64.c
>> +++ b/arch/powerpc/mm/init_64.c
>> @@ -181,7 +181,7 @@ int __meminit vmemmap_populate(unsigned long start, 
>> unsigned long end, int node)
>>  if (vmemmap_populated(start, page_size))
>>  continue;
>> 
>> -p = vmemmap_alloc_block(page_size, node, true);
>> +p = vmemmap_alloc_block(page_size, node, VMEMMAP_ZERO);
>>  if (!p)
>>  return -ENOMEM;
>> 
>> diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
>> index 9c75214..ffe9ba1 100644
>> --- a/arch/s390/mm/vmem.c
>> +++ b/arch/s390/mm/vmem.c
>> @@ -252,7 +252,7 @@ int __meminit vmemmap_populate(unsigned long start, 
>> unsigned long end, int node)
>>  void *new_page;
>> 
>>  new_page = vmemmap_alloc_block(PMD_SIZE, node,
>> -   true);
>> +   VMEMMAP_ZERO);
>>  if (!new_page)
>>  goto out;
>>  pmd_val(*pm_dir) = __pa(new_page) | sgt_prot;
>
> s390 has two call sites that need to be converted, like you did in one of
> your previous patches. The same seems to be true for powerpc, unless there
> is a reason to not convert them?
>

vmemmap_list_alloc is not really a struct page allocation, right? We are
just allocating memory to be used as vmemmap_backing. But considering
we are updating all three elements of the struct, we can avoid that
memset. But instead of VMEMMAP_ZERO we can just pass false in that
case?

-aneesh



Re: WARN @lib/refcount.c:128 during hot unplug of I/O adapter.

2017-04-06 Thread Sachin Sant

> On 07-Apr-2017, at 2:14 AM, Tyrel Datwyler  wrote:
> 
> On 04/06/2017 03:27 AM, Sachin Sant wrote:
>> On a POWER8 LPAR running 4.11.0-rc5, a hot unplug operation on
>> any I/O adapter results in the following warning
> 
> I remember you mentioning this when the issue was brought up for CPUs. I
> assume the case is the same here where the issue is only seen with
> adapters that were hot-added after boot (ie. hot-remove of adapter
> present at boot doesn't trip the warning)?
> 

Correct, can be recreated only with adapters that were hot-added after boot.

> -Tyrel
> 
>> 
>> Thanks
>> -Sachin
>> 
>> 
> 



Re: WARN @lib/refcount.c:128 during hot unplug of I/O adapter.

2017-04-06 Thread Michael Ellerman
Tyrel Datwyler  writes:

> On 04/06/2017 03:27 AM, Sachin Sant wrote:
>> On a POWER8 LPAR running 4.11.0-rc5, a hot unplug operation on
>> any I/O adapter results in the following warning
>> 
>> This problem has been in the code for some time now. I had first seen this in
>> -next tree.
>> 
>> [  269.589441] rpadlpar_io: slot PHB 72 removed
>> [  270.589997] refcount_t: underflow; use-after-free.
>> [  270.590019] [ cut here ]
>> [  270.590025] WARNING: CPU: 5 PID: 3335 at lib/refcount.c:128 
>> refcount_sub_and_test+0xf4/0x110
>> [  270.590028] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE 
>> nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 
>> nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun 
>> bridge stp llc rpadlpar_io rpaphp kvm_pr kvm ebtable_filter ebtables 
>> ip6table_filter ip6_tables iptable_filter dccp_diag dccp tcp_diag udp_diag 
>> inet_diag unix_diag af_packet_diag netlink_diag ghash_generic xts gf128mul 
>> vmx_crypto tpm_ibmvtpm tpm sg pseries_rng nfsd auth_rpcgss nfs_acl lockd 
>> grace sunrpc binfmt_misc ip_tables xfs libcrc32c sr_mod sd_mod cdrom 
>> ibmvscsi ibmveth scsi_transport_srp dm_mirror dm_region_hash dm_log dm_mod
>> [  270.590076] CPU: 5 PID: 3335 Comm: drmgr Not tainted 4.11.0-rc5 #3
>> [  270.590079] task: c005d8df8600 task.stack: c000fb3a8000
>> [  270.590081] NIP: c1aa3ca4 LR: c1aa3ca0 CTR: 
>> 006338e4
>> [  270.590084] REGS: c000fb3ab8a0 TRAP: 0700   Not tainted  (4.11.0-rc5)
>> [  270.590087] MSR: 80029033 
>> [  270.590090]   CR: 22002422  XER: 0007
>> [  270.590093] CFAR: c1edaabc SOFTE: 1 
>> [  270.590093] GPR00: c1aa3ca0 c000fb3abb20 c25ea900 
>> 0026 
>> [  270.590093] GPR04: c0077fc4ada0 c0077fc617b8 000f0c33 
>>  
>> [  270.590093] GPR08:  c227146c 00077d9e 
>> 3ff0 
>> [  270.590093] GPR12: 2200 ce802d00  
>>  
>> [  270.590093] GPR16:    
>>  
>> [  270.590093] GPR20:  1001b5a8 10018338 
>> 10016650 
>> [  270.590093] GPR24: 1001b278 c00776e0fdcc 10016650 
>>  
>> [  270.590093] GPR28: c0077ffea910 c000fbf79180 c00776e0fdc0 
>> c000fbf791d8 
>> [  270.590126] NIP [c1aa3ca4] refcount_sub_and_test+0xf4/0x110
>> [  270.590129] LR [c1aa3ca0] refcount_sub_and_test+0xf0/0x110
>> [  270.590132] Call Trace:
>> [  270.590134] [c000fb3abb20] [c1aa3ca0] 
>> refcount_sub_and_test+0xf0/0x110 (unreliable)
>> [  270.590139] [c000fb3abb80] [c1a8221c] kobject_put+0x3c/0xa0
>> [  270.590143] [c000fb3abbf0] [c1d22d34] of_node_put+0x24/0x40
>> [  270.590147] [c000fb3abc10] [c165c874] ofdt_write+0x204/0x6b0
>> [  270.590151] [c000fb3abcd0] [c197a220] proc_reg_write+0x80/0xd0
>> [  270.590155] [c000fb3abd00] [c18de680] __vfs_write+0x40/0x1c0
>> [  270.590158] [c000fb3abd90] [c18dffd8] vfs_write+0xc8/0x240
>> [  270.590162] [c000fb3abde0] [c18e1c40] SyS_write+0x60/0x110
>> [  270.590165] [c000fb3abe30] [c15cb184] system_call+0x38/0xe0
>> [  270.590168] Instruction dump:
>> [  270.590170] 7863d182 4e800020 7c0802a6 3921 3d42fff8 3c62ffb1 
>> 386371a8 992a0171 
>> [  270.590175] f8010010 f821ffa1 48436de1 6000 <0fe0> 38210060 
>> 3860 e8010010 
>> [  270.590180] ---[ end trace 08c7a2f3c8bead33 ]---
>> 
>> Have attached the dmesg log from the system. Let me know if any additional
>> information is required to help debug this problem.
>
> I remember you mentioning this when the issue was brought up for CPUs. I
> assume the case is the same here where the issue is only seen with
> adapters that were hot-added after boot (ie. hot-remove of adapter
> present at boot doesn't trip the warning)?

So who's fixing this?

cheers


[PATCH V4] powerpc/hugetlb: Add ABI defines for supported HugeTLB page sizes

2017-04-06 Thread Anshuman Khandual
This just adds user space exported ABI definitions for the 512KB, 1MB,
2MB, 8MB, 16MB, 1GB and 16GB non-default huge page sizes to be used
with the mmap() system call.

Signed-off-by: Anshuman Khandual 
---
These defined values will be used along with MAP_HUGETLB while calling
the mmap() system call if the desired HugeTLB page size is not the
default one. This follows similar definitions present on x86.

arch/x86/include/uapi/asm/mman.h:#define MAP_HUGE_2MB(21 << MAP_HUGE_SHIFT)
arch/x86/include/uapi/asm/mman.h:#define MAP_HUGE_1GB(30 << MAP_HUGE_SHIFT)
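
As an illustration only (not part of the patch), a minimal user-space sketch
of how such a define would be combined with MAP_HUGETLB, assuming 16MB huge
pages are configured on the system and the constants are exposed through the
uapi headers:

	#include <sys/mman.h>
	#include <stdio.h>

	int main(void)
	{
		size_t len = 16UL << 20;	/* one 16MB huge page */
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS |
			       MAP_HUGETLB | MAP_HUGE_16MB, -1, 0);

		if (p == MAP_FAILED)
			perror("mmap");	/* e.g. no 16MB huge pages reserved */
		return 0;
	}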

Changes in V4:
- Added defines for 512KB and 8MB HugeTLB page sizes per Christophe

Changes in V3:
- Added comment about how these defines will help create 'flags'
  argument for mmap() system call per Balbir

Changes in V2:
- Added definitions for 2MB and 1GB HugeTLB pages per Aneesh

 arch/powerpc/include/uapi/asm/mman.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/include/uapi/asm/mman.h 
b/arch/powerpc/include/uapi/asm/mman.h
index 03c06ba..ebe99c7 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -29,4 +29,18 @@
 #define MAP_STACK  0x2 /* give out an address that is best 
suited for process/thread stacks */
 #define MAP_HUGETLB0x4 /* create a huge page mapping */
 
+/*
+ * These constant defines should be used for creating the
+ * 'flags' argument (bit positions 26:31) for the mmap()
+ * system call, should the caller decide to use a non-default
+ * HugeTLB page size.
+ */
+#define MAP_HUGE_512KB (19 << MAP_HUGE_SHIFT)  /* 512KB HugeTLB Page */
+#define MAP_HUGE_1MB   (20 << MAP_HUGE_SHIFT)  /* 1MB   HugeTLB Page */
+#define MAP_HUGE_2MB   (21 << MAP_HUGE_SHIFT)  /* 2MB   HugeTLB Page */
+#define MAP_HUGE_8MB   (23 << MAP_HUGE_SHIFT)  /* 8MB   HugeTLB Page */
+#define MAP_HUGE_16MB  (24 << MAP_HUGE_SHIFT)  /* 16MB  HugeTLB Page */
+#define MAP_HUGE_1GB   (30 << MAP_HUGE_SHIFT)  /* 1GB   HugeTLB Page */
+#define MAP_HUGE_16GB  (34 << MAP_HUGE_SHIFT)  /* 16GB  HugeTLB Page */
+
 #endif /* _UAPI_ASM_POWERPC_MMAN_H */
-- 
1.8.5.2



Re: [PATCH V3] powerpc/hugetlb: Add ABI defines for supported HugeTLB page sizes

2017-04-06 Thread Anshuman Khandual
On 04/07/2017 12:33 AM, LEROY Christophe wrote:
> Hi
> 
> Anshuman Khandual  a écrit :
> 
>> This just adds user space exported ABI definitions for 2MB, 16MB, 1GB,
>> 16GB non default huge page sizes to be used with mmap() system call.
> 
> Why not add all possible huge page sizes ?
> For instance the 8xx (only) supports 512k and 8M hugepages

Sure, can do that.



Re: [RFC][PATCH] spin loop arch primitives for busy waiting

2017-04-06 Thread Nicholas Piggin
On Thu, 6 Apr 2017 12:41:52 -0700
Linus Torvalds  wrote:

> On Thu, Apr 6, 2017 at 12:23 PM, Peter Zijlstra  wrote:
> >
> > Something like so then. According to the SDM mwait is a no-op if we do
> > not execute monitor first. So this variant should get the first
> > iteration without expensive instructions.  
> 
> No, the problem is that we *would* have executed a prior monitor that
> could still be pending - from a previous invocation of
> smp_cond_load_acquire().
> 
> Especially with spinlocks, these things can very much happen back-to-back.
> 
> And it would be pending with a different address (the previous
> spinlock) that might not have changed since then (and might not be
> changing), so now we might actually be pausing in mwait waiting for
> that *other* thing to change.
> 
> So it would probably need to do something complicated like
> 
>   #define smp_cond_load_acquire(ptr, cond_expr) \
>   ({\
> typeof(ptr) __PTR = (ptr);  \
> typeof(*ptr) VAL;   \
> do {\
> VAL = READ_ONCE(*__PTR);\
> if (cond_expr)  \
> break;  \
> for (;;) {  \
> ___monitor(__PTR, 0, 0);\
> VAL = READ_ONCE(*__PTR);\
> if (cond_expr) break;   \
> ___mwait(0xf0 /* C0 */, 0); \
> }   \
> } while (0) \
> smp_acquire__after_ctrl_dep();  \
> VAL;\
>   })
> 
> which might just generate nasty enough code to not be worth it.

Okay, that's a bit of an issue I had with the spin_begin/in/end primitives.
The problem is that some of these loops are a fastpath that expects success
on the first iteration when not spinning; seqlock is another example.

powerpc does not want to go to low priority until after the initial test
fails, but cpu_relax-based architectures do not want a pointless extra branch.

I added a spin_until_cond_likely() primitive that catches a number of these.
Not sure how well people would like that though.

Most loops actually are slowpath ones and work fine with begin/in/end though.
Attached is a combined patch for reference, with a powerpc implementation and
some call sites converted.
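
For reference, a minimal sketch (mine, not from the patch) of how a slowpath
busy-wait would use the proposed primitives, with a hypothetical condition
`cond`:

	if (!cond) {
		spin_begin();
		do {
			spin_cpu_relax();
		} while (!cond);
		spin_end();
	}

	/* or, where the first test is expected to succeed: */
	spin_until_cond_likely(cond);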

diff --git a/include/linux/processor.h b/include/linux/processor.h
new file mode 100644
index ..0a058aaa9bab
--- /dev/null
+++ b/include/linux/processor.h
@@ -0,0 +1,62 @@
+/* Misc low level processor primitives */
+#ifndef _LINUX_PROCESSOR_H
+#define _LINUX_PROCESSOR_H
+
+#include 
+
+/*
+ * spin_begin is used before beginning a busy-wait loop, and must be paired
+ * with spin_end when the loop is exited. spin_cpu_relax must be called
+ * within the loop.
+ *
+ * The loop body should be as small and fast as possible, on the order of
+ * tens of instructions/cycles as a guide. It should avoid calling
+ * cpu_relax, or any "spin" or sleep type of primitive, including nested uses
+ * of these primitives. It should not lock or take any other resource.
+ * Violations of these guidelines will not cause a bug, but may cause
+ * sub-optimal performance.
+ *
+ * These loops are optimized to be used where wait times are expected to be
+ * less than the cost of a context switch (and associated overhead).
+ *
+ * Detection of resource owner and decision to spin or sleep or guest-yield
+ * (e.g., spin lock holder vcpu preempted, or mutex owner not on CPU) can be
+ * tested within the loop body.
+ */
+#ifndef spin_begin
+#define spin_begin()
+#endif
+
+#ifndef spin_cpu_relax
+#define spin_cpu_relax() cpu_relax()
+#endif
+
+/*
+ * spin_cpu_yield may be called to yield (undirected) to the hypervisor if
+ * necessary. This should be used if the wait is expected to take longer
+ * than context switch overhead, but we can't sleep or do a directed yield.
+ */
+#ifndef spin_cpu_yield
+#define spin_cpu_yield() cpu_relax_yield()
+#endif
+
+#ifndef spin_end
+#define spin_end()
+#endif
+
+/*
+ * spin_until_cond_likely can be used to wait for a condition to become true.
+ * It may be expected that the first iteration will be true in the common case
+ * (no spinning).
+ */
+#ifndef spin_until_cond_likely
+#define spin_until_cond_likely(cond)   \
+do {   \
+   spin_begin();

Re: [PATCH] powerpc/eeh: Avoid use after free in eeh_handle_special_event()

2017-04-06 Thread Alexey Kardashevskiy
On 06/03/17 12:54, Alexey Kardashevskiy wrote:
> On 06/03/17 10:22, Gavin Shan wrote:
>> On Fri, Mar 03, 2017 at 04:59:11PM +1100, Alexey Kardashevskiy wrote:
>>> On 03/03/17 15:47, Russell Currey wrote:
 eeh_handle_special_event() is called when an EEH event is detected but
 can't be narrowed down to a specific PE.  This function looks through
 every PE to find one in an erroneous state, then calls the regular event
 handler eeh_handle_normal_event() once it knows which PE has an error.

 However, if eeh_handle_normal_event() found that the PE cannot possibly
 be recovered, it will remove the PE and associated devices.  This leads
 to a use after free in eeh_handle_special_event() as it attempts to clear
 the "recovering" state on the PE after eeh_handle_normal_event() returns.

 Thus, make sure the PE is valid when attempting to clear state in
 eeh_handle_special_event().

 Cc:  #3.10+
 Reported-by: Alexey Kardashevskiy 
 Signed-off-by: Russell Currey 
 ---
  arch/powerpc/kernel/eeh_driver.c | 13 +
  1 file changed, 13 insertions(+)

 diff --git a/arch/powerpc/kernel/eeh_driver.c 
 b/arch/powerpc/kernel/eeh_driver.c
 index b94887165a10..492397298a2a 100644
 --- a/arch/powerpc/kernel/eeh_driver.c
 +++ b/arch/powerpc/kernel/eeh_driver.c
 @@ -983,6 +983,19 @@ static void eeh_handle_special_event(void)
if (rc == EEH_NEXT_ERR_FROZEN_PE ||
rc == EEH_NEXT_ERR_FENCED_PHB) {
eeh_handle_normal_event(pe);
 +
 +  /*
 +   * eeh_handle_normal_event() can free the PE if it
 +   * determines that the PE cannot possibly be recovered.
 +   * Make sure the PE still exists before changing its
 +   * state.
 +   */
 +  if (!pe || (pe->type & EEH_PE_INVALID)
 +  || (pe->state & EEH_PE_REMOVED)) {
>>>
>>>
>>> The bug is that pe becomes stale after eeh_handle_normal_event() returned
>>> and dereferencing it afterwards is broken.
>>>
>>
>> Correct, it won't cause a kernel crash as @pe is dereferencing a linear mapped
>> area whose address is always valid.
> 
> Dereferencing pe would not crash but dereferencing any pointer from the
> pnv_ioda_pe struct would (as it would be random stuff or a poison).
> 
> 
>> I think the proper fix would be for
>> eeh_handle_normal_event() to indicate that the @pe has been released and not
>> access it any more.
> 
> Correct. The problem is that the callstack from my other reply is a bit too
> long to make a trivial patch :)



Any update on this?



> 
> 
> 
>>>
>>>
 +  pr_warn("EEH: not clearing state on bad PE\n");
>>
>> A message like this isn't meaningful, no need to have it. The messages that
>> have the "EEH:" prefix are informative messages. We definitely don't need this
>> here. However, the message might not be needed in the next revision.
>>
 +  continue;
 +  }
 +
eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
} else {
pci_lock_rescan_remove();

>>
>> Thanks,
>> Gavin
>>
> 
> 


-- 
Alexey


[PATCH v2 5/5] perf report: Show branch type in callchain entry

2017-04-06 Thread Jin Yao
Show branch type in callchain entry. The branch type is printed
with other LBR information (such as cycles/abort/...).

One example:
perf report --branch-history --stdio --no-children

-23.60%--main div.c:42 (RET cycles:2)
 compute_flag div.c:28 (RET cycles:2)
 compute_flag div.c:27 (RET CROSS_2M cycles:1)
 rand rand.c:28 (RET CROSS_2M cycles:1)
 rand rand.c:28 (RET cycles:1)
 __random random.c:298 (RET cycles:1)
 __random random.c:297 (JCC forward cycles:1)
 __random random.c:295 (JCC forward cycles:1)
 __random random.c:295 (JCC forward cycles:1)
 __random random.c:295 (JCC forward cycles:1)
 __random random.c:295 (RET cycles:9)

Signed-off-by: Jin Yao 
---
 tools/perf/util/callchain.c | 221 
 tools/perf/util/callchain.h |  20 
 2 files changed, 182 insertions(+), 59 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 3cea1fb..ca040a0 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -428,6 +428,89 @@ create_child(struct callchain_node *parent, bool 
inherit_children)
return new;
 }
 
+static const char *br_tag[BR_IDX_MAX] = {
+   "JCC forward",
+   "JCC backward",
+   "JMP",
+   "IND_JMP",
+   "CALL",
+   "IND_CALL",
+   "RET",
+   "SYSCALL",
+   "SYSRET",
+   "IRQ",
+   "INT",
+   "IRET",
+   "FAR_BRANCH",
+   "CROSS_4K",
+   "CROSS_2M",
+};
+
+static void
+branch_type_count(int *counts, struct branch_flags *flags)
+{
+   switch (flags->type) {
+   case PERF_BR_JCC_FWD:
+   counts[BR_IDX_JCC_FWD]++;
+   break;
+
+   case PERF_BR_JCC_BWD:
+   counts[BR_IDX_JCC_BWD]++;
+   break;
+
+   case PERF_BR_JMP:
+   counts[BR_IDX_JMP]++;
+   break;
+
+   case PERF_BR_IND_JMP:
+   counts[BR_IDX_IND_JMP]++;
+   break;
+
+   case PERF_BR_CALL:
+   counts[BR_IDX_CALL]++;
+   break;
+
+   case PERF_BR_IND_CALL:
+   counts[BR_IDX_IND_CALL]++;
+   break;
+
+   case PERF_BR_RET:
+   counts[BR_IDX_RET]++;
+   break;
+
+   case PERF_BR_SYSCALL:
+   counts[BR_IDX_SYSCALL]++;
+   break;
+
+   case PERF_BR_SYSRET:
+   counts[BR_IDX_SYSRET]++;
+   break;
+
+   case PERF_BR_IRQ:
+   counts[BR_IDX_IRQ]++;
+   break;
+
+   case PERF_BR_INT:
+   counts[BR_IDX_INT]++;
+   break;
+
+   case PERF_BR_IRET:
+   counts[BR_IDX_IRET]++;
+   break;
+
+   case PERF_BR_FAR_BRANCH:
+   counts[BR_IDX_FAR_BRANCH]++;
+   break;
+
+   default:
+   break;
+   }
+
+   if (flags->cross == PERF_BR_CROSS_2M)
+   counts[BR_IDX_CROSS_2M]++;
+   else if (flags->cross == PERF_BR_CROSS_4K)
+   counts[BR_IDX_CROSS_4K]++;
+}
 
 /*
  * Fill the node with callchain values
@@ -467,6 +550,9 @@ fill_node(struct callchain_node *node, struct 
callchain_cursor *cursor)
call->cycles_count = cursor_node->branch_flags.cycles;
call->iter_count = cursor_node->nr_loop_iter;
call->samples_count = cursor_node->samples;
+
+   branch_type_count(call->brtype_count,
+ &cursor_node->branch_flags);
}
 
list_add_tail(&call->list, &node->val);
@@ -579,6 +665,9 @@ static enum match_result match_chain(struct 
callchain_cursor_node *node,
cnode->cycles_count += node->branch_flags.cycles;
cnode->iter_count += node->nr_loop_iter;
cnode->samples_count += node->samples;
+
+   branch_type_count(cnode->brtype_count,
+ &node->branch_flags);
}
 
return MATCH_EQ;
@@ -1105,95 +1194,108 @@ int callchain_branch_counts(struct callchain_root 
*root,
  cycles_count);
 }
 
+static int branch_type_str(int *counts, char *bf, int bfsize)
+{
+   int i, printed = 0;
+   bool brace = false;
+
+   for (i = 0; i < BR_IDX_MAX; i++) {
+   if (printed == bfsize - 1)
+   return printed;
+
+   if (counts[i] > 0) {
+   if (!brace) {
+   brace = true;
+   printed += scnprintf(bf + printed,
+   bfsize - printed,
+   " (%s", br_tag[i]);
+   } else
+   printed += scnprintf(bf + printed,
+ 

[PATCH v2 4/5] perf report: Show branch type statistics for stdio mode

2017-04-06 Thread Jin Yao
Show the branch type statistics at the end of perf report --stdio.

For example:
perf report --stdio

 JCC forward:  27.7%
JCC backward:   9.8%
 JMP:   0.0%
 IND_JMP:   6.5%
CALL:  26.6%
IND_CALL:   0.0%
 RET:  29.3%
IRET:   0.0%
CROSS_4K:   0.0%
CROSS_2M:  14.3%

The branch types are:
-
 JCC forward: Conditional forward jump
JCC backward: Conditional backward jump
 JMP: Jump imm
 IND_JMP: Jump reg/mem
CALL: Call imm
IND_CALL: Call reg/mem
 RET: Ret
 SYSCALL: Syscall
  SYSRET: Syscall return
 IRQ: HW interrupt/trap/fault
 INT: SW interrupt
IRET: Return from interrupt
  FAR_BRANCH: Others not generic branch type

CROSS_4K and CROSS_2M:
--
These are metrics checking for branches that cross 4K or 2MB pages.
It's an approximate computation: we don't know whether the area is a
4K or a 2MB page, so both are always computed.

To keep the output simple, if a branch crosses a 2MB area, CROSS_4K
will not be incremented.
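
For reference, a minimal sketch of the approximation (mirroring the
cross_area() helper added in patch 2/5): a branch is counted as crossing an
area when the aligned-down source and target addresses differ.

	static bool crosses_area(u64 from, u64 to, u64 size)
	{
		/* compare the containing size-aligned areas (4K or 2MB) */
		return (from & ~(size - 1)) != (to & ~(size - 1));
	}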

Signed-off-by: Jin Yao 
---
 tools/perf/builtin-report.c | 212 
 tools/perf/util/event.h |   4 +-
 tools/perf/util/hist.c  |   5 +-
 3 files changed, 216 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index c18158b..1dc1058 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -43,6 +43,24 @@
 #include 
 #include 
 
+struct branch_type_stat {
+   u64 jcc_fwd;
+   u64 jcc_bwd;
+   u64 jmp;
+   u64 ind_jmp;
+   u64 call;
+   u64 ind_call;
+   u64 ret;
+   u64 syscall;
+   u64 sysret;
+   u64 irq;
+   u64 intr;
+   u64 iret;
+   u64 far_branch;
+   u64 cross_4k;
+   u64 cross_2m;
+};
+
 struct report {
struct perf_tooltool;
struct perf_session *session;
@@ -66,6 +84,7 @@ struct report {
u64 queue_size;
int socket_filter;
DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
+   struct branch_type_stat brtype_stat;
 };
 
 static int report__config(const char *var, const char *value, void *cb)
@@ -144,6 +163,91 @@ static int hist_iter__report_callback(struct 
hist_entry_iter *iter,
return err;
 }
 
+static void branch_type_count(struct report *rep, struct branch_info *bi)
+{
+   struct branch_type_stat *stat = &rep->brtype_stat;
+   struct branch_flags *flags = &bi->flags;
+
+   switch (flags->type) {
+   case PERF_BR_JCC_FWD:
+   stat->jcc_fwd++;
+   break;
+
+   case PERF_BR_JCC_BWD:
+   stat->jcc_bwd++;
+   break;
+
+   case PERF_BR_JMP:
+   stat->jmp++;
+   break;
+
+   case PERF_BR_IND_JMP:
+   stat->ind_jmp++;
+   break;
+
+   case PERF_BR_CALL:
+   stat->call++;
+   break;
+
+   case PERF_BR_IND_CALL:
+   stat->ind_call++;
+   break;
+
+   case PERF_BR_RET:
+   stat->ret++;
+   break;
+
+   case PERF_BR_SYSCALL:
+   stat->syscall++;
+   break;
+
+   case PERF_BR_SYSRET:
+   stat->sysret++;
+   break;
+
+   case PERF_BR_IRQ:
+   stat->irq++;
+   break;
+
+   case PERF_BR_INT:
+   stat->intr++;
+   break;
+
+   case PERF_BR_IRET:
+   stat->iret++;
+   break;
+
+   case PERF_BR_FAR_BRANCH:
+   stat->far_branch++;
+   break;
+
+   default:
+   break;
+   }
+
+   if (flags->cross == PERF_BR_CROSS_2M)
+   stat->cross_2m++;
+   else if (flags->cross == PERF_BR_CROSS_4K)
+   stat->cross_4k++;
+}
+
+static int hist_iter__branch_callback(struct hist_entry_iter *iter,
+ struct addr_location *al __maybe_unused,
+ bool single __maybe_unused,
+ void *arg)
+{
+   struct hist_entry *he = iter->he;
+   struct report *rep = arg;
+   struct branch_info *bi;
+
+   if (sort__mode == SORT_MODE__BRANCH) {
+   bi = he->branch_info;
+   branch_type_count(rep, bi);
+   }
+
+   return 0;
+}
+
 static int process_sample_event(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
@@ -182,6 +286,8 @@ static int process_sample_event(struct perf_tool *tool,
 */
if (!sample->branch_stack)
goto out_put;
+
+   iter.add_entry_cb = hist_iter__branch_callback;
iter.ops = &hist_iter_branch;
} else if (rep->mem_mode) {
  

[PATCH v2 3/5] perf record: Create a new option save_type in --branch-filter

2017-04-06 Thread Jin Yao
The option indicates the kernel to save branch type during sampling.

One example:
perf record -g --branch-filter any,save_type 

Signed-off-by: Jin Yao 
---
 tools/perf/Documentation/perf-record.txt | 1 +
 tools/perf/util/parse-branch-options.c   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tools/perf/Documentation/perf-record.txt 
b/tools/perf/Documentation/perf-record.txt
index ea3789d..e2f5a4f 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -332,6 +332,7 @@ following filters are defined:
- no_tx: only when the target is not in a hardware transaction
- abort_tx: only when the target is a hardware transaction abort
- cond: conditional branches
+   - save_type: save branch type during sampling in case the binary is not available later
 
 +
 The option requires at least one branch type among any, any_call, any_ret, 
ind_call, cond.
diff --git a/tools/perf/util/parse-branch-options.c 
b/tools/perf/util/parse-branch-options.c
index 38fd115..e71fb5f 100644
--- a/tools/perf/util/parse-branch-options.c
+++ b/tools/perf/util/parse-branch-options.c
@@ -28,6 +28,7 @@ static const struct branch_mode branch_modes[] = {
BRANCH_OPT("cond", PERF_SAMPLE_BRANCH_COND),
BRANCH_OPT("ind_jmp", PERF_SAMPLE_BRANCH_IND_JUMP),
BRANCH_OPT("call", PERF_SAMPLE_BRANCH_CALL),
+   BRANCH_OPT("save_type", PERF_SAMPLE_BRANCH_TYPE_SAVE),
BRANCH_END
 };
 
-- 
2.7.4



[PATCH v2 2/5] perf/x86/intel: Record branch type

2017-04-06 Thread Jin Yao
Perf already has support for disassembling the branch instruction
and using the branch type for filtering. The patch just records
the branch type in perf_branch_entry.

Before recording, the patch converts the x86 branch classification
to the common branch classification and computes whether the branches
cross 4K or 2MB areas. This is an approximate computation of whether
a branch crosses a 4K page or a 2MB page.

Signed-off-by: Jin Yao 
---
 arch/x86/events/intel/lbr.c | 106 +++-
 1 file changed, 105 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 81b321a..635a0fb 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -109,6 +109,9 @@ enum {
X86_BR_ZERO_CALL= 1 << 15,/* zero length call */
X86_BR_CALL_STACK   = 1 << 16,/* call stack */
X86_BR_IND_JMP  = 1 << 17,/* indirect jump */
+
+   X86_BR_TYPE_SAVE= 1 << 18,/* indicate to save branch type */
+
 };
 
 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -139,6 +142,9 @@ enum {
 X86_BR_IRQ |\
 X86_BR_INT)
 
+#define AREA_4K4096
+#define AREA_2M(2 * 1024 * 1024)
+
 static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
 
 /*
@@ -670,6 +676,10 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event 
*event)
 
if (br_type & PERF_SAMPLE_BRANCH_CALL)
mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+
+   if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE)
+   mask |= X86_BR_TYPE_SAVE;
+
/*
 * stash actual user request into reg, it may
 * be used by fixup code for some CPU
@@ -923,6 +933,84 @@ static int branch_type(unsigned long from, unsigned long 
to, int abort)
return ret;
 }
 
+static int
+common_branch_type(int type, u64 from, u64 to)
+{
+   int ret;
+
+   type = type & (~(X86_BR_KERNEL | X86_BR_USER));
+
+   switch (type) {
+   case X86_BR_CALL:
+   case X86_BR_ZERO_CALL:
+   ret = PERF_BR_CALL;
+   break;
+
+   case X86_BR_RET:
+   ret = PERF_BR_RET;
+   break;
+
+   case X86_BR_SYSCALL:
+   ret = PERF_BR_SYSCALL;
+   break;
+
+   case X86_BR_SYSRET:
+   ret = PERF_BR_SYSRET;
+   break;
+
+   case X86_BR_INT:
+   ret = PERF_BR_INT;
+   break;
+
+   case X86_BR_IRET:
+   ret = PERF_BR_IRET;
+   break;
+
+   case X86_BR_IRQ:
+   ret = PERF_BR_IRQ;
+   break;
+
+   case X86_BR_ABORT:
+   ret = PERF_BR_FAR_BRANCH;
+   break;
+
+   case X86_BR_JCC:
+   if (to > from)
+   ret = PERF_BR_JCC_FWD;
+   else
+   ret = PERF_BR_JCC_BWD;
+   break;
+
+   case X86_BR_JMP:
+   ret = PERF_BR_JMP;
+   break;
+
+   case X86_BR_IND_CALL:
+   ret = PERF_BR_IND_CALL;
+   break;
+
+   case X86_BR_IND_JMP:
+   ret = PERF_BR_IND_JMP;
+   break;
+
+   default:
+   ret = PERF_BR_NONE;
+   }
+
+   return ret;
+}
+
+static bool
+cross_area(u64 addr1, u64 addr2, int size)
+{
+   u64 align1, align2;
+
+   align1 = addr1 & ~(size - 1);
+   align2 = addr2 & ~(size - 1);
+
+   return (align1 != align2) ? true : false;
+}
+
 /*
  * implement actual branch filter based on user demand.
  * Hardware may not exactly satisfy that request, thus
@@ -939,7 +1027,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
bool compress = false;
 
/* if sampling all branches, then nothing to filter */
-   if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+   if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
+   ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
return;
 
for (i = 0; i < cpuc->lbr_stack.nr; i++) {
@@ -960,6 +1049,21 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[i].from = 0;
compress = true;
}
+
+   if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) {
+   cpuc->lbr_entries[i].type = common_branch_type(type,
+  from,
+  to);
+   if (cross_area(from, to, AREA_2M))
+   cpuc->lbr_entries[i].cross = PERF_BR_CROSS_2M;
+   else if (cross_area(from, to, AREA_4K))
+   cpuc->lbr_entries[i].cross = PERF_BR_CROSS_4K;
+   else
+   cpuc->lbr_entries[i].cross = PERF_BR_CROSS_NONE;
+   } else {
+   cpuc->lbr_entries[i].t

[PATCH v2 1/5] perf/core: Define the common branch type classification

2017-04-06 Thread Jin Yao
It is often useful to know the branch types while analyzing branch
data. For example, a call is very different from a conditional branch.

Currently we have to look it up in the binary, but the binary may not
be available later, and even when it is available the lookup takes the
user some time. It is very useful for the user to be able to check it
directly in the perf report.

Perf already has support for disassembling the branch instruction
to get the x86 branch type.

To keep the kernel and userspace consistent and make the classification
more generic, the patch adds the common branch type classification
in perf_event.h.

PERF_BR_NONE  : unknown
PERF_BR_JCC_FWD   : conditional forward jump
PERF_BR_JCC_BWD   : conditional backward jump
PERF_BR_JMP   : jump
PERF_BR_IND_JMP   : indirect jump
PERF_BR_CALL  : call
PERF_BR_IND_CALL  : indirect call
PERF_BR_RET   : return
PERF_BR_SYSCALL   : syscall
PERF_BR_SYSRET: syscall return
PERF_BR_IRQ   : hw interrupt/trap/fault
PERF_BR_INT   : sw interrupt
PERF_BR_IRET  : return from interrupt
PERF_BR_FAR_BRANCH: others not generic branch type

The patch adds following metrics checking for branches cross
4K or 2MB areas.

PERF_BR_CROSS_NONE: branch not cross an area
PERF_BR_CROSS_4K  : branch cross 4K area
PERF_BR_CROSS_2M  : branch cross 2MB area

Since disassembling the branch instruction incurs some overhead,
a new PERF_SAMPLE_BRANCH_TYPE_SAVE is introduced to indicate whether
the branch instruction needs to be disassembled and the branch type
recorded.
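
As an illustration only (not part of this patch), a minimal sketch of how a
user-space tool might request the new behaviour when setting up the event,
assuming the usual perf_event_open() flow:

	struct perf_event_attr attr = {};

	attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY |
				  PERF_SAMPLE_BRANCH_TYPE_SAVE;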

Signed-off-by: Jin Yao 
---
 include/uapi/linux/perf_event.h   | 37 ++-
 tools/include/uapi/linux/perf_event.h | 37 ++-
 2 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index d09a9cd..e2fcd53 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -174,6 +174,8 @@ enum perf_branch_sample_type_shift {
PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT   = 14, /* no flags */
PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT  = 15, /* no cycles */
 
+   PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT  = 16, /* save branch type */
+
PERF_SAMPLE_BRANCH_MAX_SHIFT/* non-ABI */
 };
 
@@ -198,9 +200,38 @@ enum perf_branch_sample_type {
PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << 
PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT,
PERF_SAMPLE_BRANCH_NO_CYCLES= 1U << 
PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT,
 
+   PERF_SAMPLE_BRANCH_TYPE_SAVE=
+   1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT,
+
PERF_SAMPLE_BRANCH_MAX  = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
 
+/*
+ * Common flow change classification
+ */
+enum {
+   PERF_BR_NONE= 0,/* unknown */
+   PERF_BR_JCC_FWD = 1,/* conditional forward jump */
+   PERF_BR_JCC_BWD = 2,/* conditional backward jump */
+   PERF_BR_JMP = 3,/* jump */
+   PERF_BR_IND_JMP = 4,/* indirect jump */
+   PERF_BR_CALL= 5,/* call */
+   PERF_BR_IND_CALL= 6,/* indirect call */
+   PERF_BR_RET = 7,/* return */
+   PERF_BR_SYSCALL = 8,/* syscall */
+   PERF_BR_SYSRET  = 9,/* syscall return */
+   PERF_BR_IRQ = 10,   /* hw interrupt/trap/fault */
+   PERF_BR_INT = 11,   /* sw interrupt */
+   PERF_BR_IRET= 12,   /* return from interrupt */
+   PERF_BR_FAR_BRANCH  = 13,   /* others not generic branch type */
+};
+
+enum {
+   PERF_BR_CROSS_NONE  = 0,/* branch not cross an area */
+   PERF_BR_CROSS_4K= 1,/* branch cross 4K */
+   PERF_BR_CROSS_2M= 2,/* branch cross 2MB */
+};
+
 #define PERF_SAMPLE_BRANCH_PLM_ALL \
(PERF_SAMPLE_BRANCH_USER|\
 PERF_SAMPLE_BRANCH_KERNEL|\
@@ -999,6 +1030,8 @@ union perf_mem_data_src {
  * in_tx: running in a hardware transaction
  * abort: aborting a hardware transaction
  *cycles: cycles from last branch (or 0 if not supported)
+ *  type: branch type
+ * cross: branch cross 4K or 2MB area
  */
 struct perf_branch_entry {
__u64   from;
@@ -1008,7 +1041,9 @@ struct perf_branch_entry {
in_tx:1,/* in transaction */
abort:1,/* transaction abort */
cycles:16,  /* cycle count to last branch */
-   reserved:44;
+   type:4, /* branch type */
+   cross:2,/* branch cross 4K or 2MB area */
+   reserved:38;
 };
 
 #endif /* _UAPI_LINUX_PERF_EVENT_H */
diff --git a/tools/include/uapi/linux/perf_event.h 
b/tools/include/uapi/linux/perf_event.h
index d09a9cd..e2fcd53 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -174,6 +174,8 @@ enum perf_branch_sample_type_shift {
PERF_SAMPLE_BRANCH_NO_FLAGS_SHIF

[PATCH v2 0/5] perf report: Show branch type

2017-04-06 Thread Jin Yao
v2:
---
1. Use 4 bits in perf_branch_entry to record branch type.

2. Pull out some common branch types from FAR_BRANCH. Now the branch
   types defined in perf_event.h:

PERF_BR_NONE  : unknown
PERF_BR_JCC_FWD   : conditional forward jump
PERF_BR_JCC_BWD   : conditional backward jump
PERF_BR_JMP   : jump
PERF_BR_IND_JMP   : indirect jump
PERF_BR_CALL  : call
PERF_BR_IND_CALL  : indirect call
PERF_BR_RET   : return
PERF_BR_SYSCALL   : syscall
PERF_BR_SYSRET: syscall return
PERF_BR_IRQ   : hw interrupt/trap/fault
PERF_BR_INT   : sw interrupt
PERF_BR_IRET  : return from interrupt
PERF_BR_FAR_BRANCH: others not generic far branch type

3. Use 2 bits in perf_branch_entry for "cross" metrics checking
   whether a branch crosses a 4K or 2MB area. It's an approximate
   computation of whether the branch crosses a 4K page or a 2MB page.

For example:

perf record -g --branch-filter any,save_type 

perf report --stdio

 JCC forward:  27.7%
JCC backward:   9.8%
 JMP:   0.0%
 IND_JMP:   6.5%
CALL:  26.6%
IND_CALL:   0.0%
 RET:  29.3%
IRET:   0.0%
CROSS_4K:   0.0%
CROSS_2M:  14.3%

perf report --branch-history --stdio --no-children

-23.60%--main div.c:42 (RET cycles:2)
 compute_flag div.c:28 (RET cycles:2)
 compute_flag div.c:27 (RET CROSS_2M cycles:1)
 rand rand.c:28 (RET CROSS_2M cycles:1)
 rand rand.c:28 (RET cycles:1)
 __random random.c:298 (RET cycles:1)
 __random random.c:297 (JCC forward cycles:1)
 __random random.c:295 (JCC forward cycles:1)
 __random random.c:295 (JCC forward cycles:1)
 __random random.c:295 (JCC forward cycles:1)
 __random random.c:295 (RET cycles:9)

Changed:
  perf/core: Define the common branch type classification
  perf/x86/intel: Record branch type
  perf report: Show branch type statistics for stdio mode
  perf report: Show branch type in callchain entry

Not changed:
  perf record: Create a new option save_type in --branch-filter

v1:
---
It is often useful to know the branch types while analyzing branch
data. For example, a call is very different from a conditional branch.

Currently we have to look it up in the binary, but the binary may not
be available later, and even when it is available the lookup takes the
user some time. It is very useful for the user to be able to check it
directly in the perf report.

Perf already has support for disassembling the branch instruction
to get the branch type.

The patch series records the branch type and show the branch type with
other LBR information in callchain entry via perf report. The patch
series also adds the branch type summary at the end of
perf report --stdio.

To keep the kernel and userspace consistent and make the classification
more generic, the patch adds the common branch type classification
in perf_event.h.

The common branch types are:

 JCC forward: Conditional forward jump
JCC backward: Conditional backward jump
 JMP: Jump imm
 IND_JMP: Jump reg/mem
CALL: Call imm
IND_CALL: Call reg/mem
 RET: Ret
  FAR_BRANCH: SYSCALL/SYSRET, IRQ, IRET, TSX Abort

An example:

1. Record branch type (new option "save_type")

perf record -g --branch-filter any,save_type 

2. Show the branch type statistics at the end of perf report --stdio

perf report --stdio

 JCC forward:  34.0%
JCC backward:   3.6%
 JMP:   0.0%
 IND_JMP:   6.5%
CALL:  26.6%
IND_CALL:   0.0%
 RET:  29.3%
  FAR_BRANCH:   0.0%

3. Show branch type in callchain entry

perf report --branch-history --stdio --no-children

--23.91%--main div.c:42 (RET cycles:2)
  compute_flag div.c:28 (RET cycles:2)
  compute_flag div.c:27 (RET cycles:1)
  rand rand.c:28 (RET cycles:1)
  rand rand.c:28 (RET cycles:1)
  __random random.c:298 (RET cycles:1)
  __random random.c:297 (JCC forward cycles:1)
  __random random.c:295 (JCC forward cycles:1)
  __random random.c:295 (JCC forward cycles:1)
  __random random.c:295 (JCC forward cycles:1)
  __random random.c:295 (RET cycles:9)

Jin Yao (5):
  perf/core: Define the common branch type classification
  perf/x86/intel: Record branch type
  perf record: Create a new option save_type in --branch-filter
  perf report: Show branch type statistics for stdio mode
  perf report: Show branch type in callchain entry

 arch/x86/events/intel/lbr.c  | 106 ++-
 include/uapi/linux/perf_event.h  |  37 +-
 tools/include/uapi/linux/perf_event.h|  37 +-
 tools/perf/Documentation/perf-record.txt |   1 +
 tools/perf/builtin-report.c  | 212 +
 tools/perf/util/callchain.c  | 221 ++-
 tools/perf/util/callchain.h  |  20 +

Re: [PATCH v2 1/2] fadump: reduce memory consumption for capture kernel

2017-04-06 Thread Michael Ellerman
Hari Bathini  writes:

> In the case of fadump, the capture (fadump) kernel boots like a normal kernel.
> While this has its advantages, the capture kernel would initialize all
> the components like a normal kernel, which may not necessarily be needed
> for a typical dump capture kernel. So, the fadump capture kernel ends up
> needing more memory than a typical (read kdump) capture kernel to boot.
>
> This can be overcome by introducing parameters like fadump_nr_cpus=1,
> similar to nr_cpus=1 parameter, applicable only when fadump is active.
> But this approach needs introduction of special parameters applicable
> only when fadump is active (capture kernel), for every parameter that
> reduces memory/resource consumption.
>
> A better approach would be to pass extra parameters to fadump capture
> kernel. As firmware leaves the memory contents intact from the time of
> crash till the new kernel is booted up, parameters to append to capture
> kernel can be saved in real memory region and retrieved later when the
> capture kernel is in its early boot process for appending to command
> line parameters.
>
> This patch introduces a new node /sys/kernel/fadump_cmdline_append to
> specify the parameters to pass to fadump capture kernel, saves them in
> real memory region and appends these parameters to capture kernel early
> in its boot process.

As we discussed on IRC I don't really like this.

It's clever, (ab)using the fact that the first kernel's memory is left
intact. But it's also a bit gross :)

It also has a few real problems, like hard coding 128MB as the handover
location. You may not have memory there, or it may be reserved.


My preference would be that the fadump kernel "just works". If it's
using too much memory then the fadump kernel should do whatever it needs
to use less memory, eg. shrinking nr_cpu_ids etc.

Do we actually know *why* the fadump kernel is running out of memory?
Obviously large numbers of CPUs is one of the main drivers (lots of
stacks required). But other than that what is causing the memory
pressure? I would like some data on that before we proceed.


If we *must* have a way to pass command line arguments to the fadump
kernel then I think we should just use a command line argument that
specifies them.

eg: fadump_append=nr_cpus=1,use_less_memory,some_other_obscure_parameter=100


cheers


[PATCH 2/2] powerpc/64s: Add SCV FSCR bit for ISA v3.0

2017-04-06 Thread Nicholas Piggin
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/reg.h |  2 ++
 arch/powerpc/kernel/traps.c| 19 ++-
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index fc879fd6bdae..d9e52a1336a6 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -310,6 +310,7 @@
 #define SPRN_PMCR  0x374   /* Power Management Control Register */
 
 /* HFSCR and FSCR bit numbers are the same */
+#define FSCR_SCV_LG12  /* Enable System Call Vectored */
 #define FSCR_MSGP_LG   10  /* Enable MSGP */
 #define FSCR_TAR_LG8   /* Enable Target Address Register */
 #define FSCR_EBB_LG7   /* Enable Event Based Branching */
@@ -320,6 +321,7 @@
 #define FSCR_VECVSX_LG 1   /* Enable VMX/VSX  */
 #define FSCR_FP_LG 0   /* Enable Floating Point */
 #define SPRN_FSCR  0x099   /* Facility Status & Control Register */
+#define   FSCR_SCV __MASK(FSCR_SCV_LG)
 #define   FSCR_TAR __MASK(FSCR_TAR_LG)
 #define   FSCR_EBB __MASK(FSCR_EBB_LG)
 #define   FSCR_DSCR__MASK(FSCR_DSCR_LG)
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index a34d8bf3dbe4..5b307efed870 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1432,15 +1432,16 @@ static void tm_unavailable(struct pt_regs *regs)
 void facility_unavailable_exception(struct pt_regs *regs)
 {
static char *facility_strings[] = {
-   [FSCR_FP_LG] = "FPU",
-   [FSCR_VECVSX_LG] = "VMX/VSX",
-   [FSCR_DSCR_LG] = "DSCR",
-   [FSCR_PM_LG] = "PMU SPRs",
-   [FSCR_BHRB_LG] = "BHRB",
-   [FSCR_TM_LG] = "TM",
-   [FSCR_EBB_LG] = "EBB",
-   [FSCR_TAR_LG] = "TAR",
-   [FSCR_MSGP_LG] = "MSGP",
+   [FSCR_FP_LG]= "FPU",
+   [FSCR_VECVSX_LG]= "VMX/VSX",
+   [FSCR_DSCR_LG]  = "DSCR",
+   [FSCR_PM_LG]= "PMU SPRs",
+   [FSCR_BHRB_LG]  = "BHRB",
+   [FSCR_TM_LG]= "TM",
+   [FSCR_EBB_LG]   = "EBB",
+   [FSCR_TAR_LG]   = "TAR",
+   [FSCR_MSGP_LG]  = "MSGP",
+   [FSCR_SCV_LG]   = "SCV",
};
char *facility = "unknown";
u64 value;
-- 
2.11.0



[PATCH 1/2] powerpc/64s: Add msgp facility unavailable log string

2017-04-06 Thread Nicholas Piggin
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/traps.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index ff365f9de27a..a34d8bf3dbe4 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1440,6 +1440,7 @@ void facility_unavailable_exception(struct pt_regs *regs)
[FSCR_TM_LG] = "TM",
[FSCR_EBB_LG] = "EBB",
[FSCR_TAR_LG] = "TAR",
+   [FSCR_MSGP_LG] = "MSGP",
};
char *facility = "unknown";
u64 value;
-- 
2.11.0



[PATCH 0/2] more facility bit additions for POWER9

2017-04-06 Thread Nicholas Piggin
No real change, except adding register definitions and facility
unavailable type names.

Nicholas Piggin (2):
  powerpc/64s: Add msgp facility unavailable log string
  powerpc/64s: Add SCV FSCR bit for ISA v3.0

 arch/powerpc/include/asm/reg.h |  2 ++
 arch/powerpc/kernel/traps.c| 18 ++
 2 files changed, 12 insertions(+), 8 deletions(-)

-- 
2.11.0



Re: [PATCH] powerpc/crypto/crc32c-vpmsum: Fix missing preempt_disable()

2017-04-06 Thread Daniel Axtens
Hi Michael,

We'll also need a similar patch for the t10dif code that went in to
Herbert's next.

I will do a patch once I get my development environment up and going
again.

Regards,
Daniel

Michael Ellerman  writes:

> In crc32c_vpmsum() we call enable_kernel_altivec() without first
> disabling preemption, which is not allowed:
>
>   WARNING: CPU: 9 PID: 2949 at ../arch/powerpc/kernel/process.c:277 
> enable_kernel_altivec+0x100/0x120
>   Modules linked in: dm_thin_pool dm_persistent_data dm_bio_prison dm_bufio 
> libcrc32c vmx_crypto ...
>   CPU: 9 PID: 2949 Comm: docker Not tainted 
> 4.11.0-rc5-compiler_gcc-6.3.1-00033-g308ac7563944 #381
>   ...
>   NIP [c001e320] enable_kernel_altivec+0x100/0x120
>   LR [d3df0910] crc32c_vpmsum+0x108/0x150 [crc32c_vpmsum]
>   Call Trace:
> 0xc138fd09 (unreliable)
> crc32c_vpmsum+0x108/0x150 [crc32c_vpmsum]
> crc32c_vpmsum_update+0x3c/0x60 [crc32c_vpmsum]
> crypto_shash_update+0x88/0x1c0
> crc32c+0x64/0x90 [libcrc32c]
> dm_bm_checksum+0x48/0x80 [dm_persistent_data]
> sb_check+0x84/0x120 [dm_thin_pool]
> dm_bm_validate_buffer.isra.0+0xc0/0x1b0 [dm_persistent_data]
> dm_bm_read_lock+0x80/0xf0 [dm_persistent_data]
> __create_persistent_data_objects+0x16c/0x810 [dm_thin_pool]
> dm_pool_metadata_open+0xb0/0x1a0 [dm_thin_pool]
> pool_ctr+0x4cc/0xb60 [dm_thin_pool]
> dm_table_add_target+0x16c/0x3c0
> table_load+0x184/0x400
> ctl_ioctl+0x2f0/0x560
> dm_ctl_ioctl+0x38/0x50
> do_vfs_ioctl+0xd8/0x920
> SyS_ioctl+0x68/0xc0
> system_call+0x38/0xfc
>
> It used to be sufficient just to call pagefault_disable(), because that
> also disabled preemption. But the two were decoupled in commit 8222dbe21e79
> ("sched/preempt, mm/fault: Decouple preemption from the page fault
> logic") in mid 2015.
>
> So add the missing preempt_disable/enable(). We should also call
> disable_kernel_fp(), although it does nothing by default, there is a
> debug switch to make it active and all enables should be paired with
> disables.
>
> Fixes: 6dd7a82cc54e ("crypto: powerpc - Add POWER8 optimised crc32c")
> Cc: sta...@vger.kernel.org # v4.8+
> Signed-off-by: Michael Ellerman 
> ---
>  arch/powerpc/crypto/crc32c-vpmsum_glue.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c 
> b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
> index 411994551afc..f058e0c3e4d4 100644
> --- a/arch/powerpc/crypto/crc32c-vpmsum_glue.c
> +++ b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
> @@ -33,10 +33,13 @@ static u32 crc32c_vpmsum(u32 crc, unsigned char const *p, 
> size_t len)
>   }
>  
>   if (len & ~VMX_ALIGN_MASK) {
> + preempt_disable();
>   pagefault_disable();
>   enable_kernel_altivec();
>   crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
> + disable_kernel_altivec();
>   pagefault_enable();
> + preempt_enable();
>   }
>  
>   tail = len & VMX_ALIGN_MASK;
> -- 
> 2.7.4


Re: WARN @lib/refcount.c:128 during hot unplug of I/O adapter.

2017-04-06 Thread Tyrel Datwyler
On 04/06/2017 03:27 AM, Sachin Sant wrote:
> On a POWER8 LPAR running 4.11.0-rc5, a hot unplug operation on
> any I/O adapter results in the following warning
> 
> This problem has been in the code for some time now. I had first seen this in
> -next tree.
> 
> [  269.589441] rpadlpar_io: slot PHB 72 removed
> [  270.589997] refcount_t: underflow; use-after-free.
> [  270.590019] [ cut here ]
> [  270.590025] WARNING: CPU: 5 PID: 3335 at lib/refcount.c:128 
> refcount_sub_and_test+0xf4/0x110
> [  270.590028] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE 
> nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 
> nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge 
> stp llc rpadlpar_io rpaphp kvm_pr kvm ebtable_filter ebtables ip6table_filter 
> ip6_tables iptable_filter dccp_diag dccp tcp_diag udp_diag inet_diag 
> unix_diag af_packet_diag netlink_diag ghash_generic xts gf128mul vmx_crypto 
> tpm_ibmvtpm tpm sg pseries_rng nfsd auth_rpcgss nfs_acl lockd grace sunrpc 
> binfmt_misc ip_tables xfs libcrc32c sr_mod sd_mod cdrom ibmvscsi ibmveth 
> scsi_transport_srp dm_mirror dm_region_hash dm_log dm_mod
> [  270.590076] CPU: 5 PID: 3335 Comm: drmgr Not tainted 4.11.0-rc5 #3
> [  270.590079] task: c005d8df8600 task.stack: c000fb3a8000
> [  270.590081] NIP: c1aa3ca4 LR: c1aa3ca0 CTR: 
> 006338e4
> [  270.590084] REGS: c000fb3ab8a0 TRAP: 0700   Not tainted  (4.11.0-rc5)
> [  270.590087] MSR: 80029033 
> [  270.590090]   CR: 22002422  XER: 0007
> [  270.590093] CFAR: c1edaabc SOFTE: 1 
> [  270.590093] GPR00: c1aa3ca0 c000fb3abb20 c25ea900 
> 0026 
> [  270.590093] GPR04: c0077fc4ada0 c0077fc617b8 000f0c33 
>  
> [  270.590093] GPR08:  c227146c 00077d9e 
> 3ff0 
> [  270.590093] GPR12: 2200 ce802d00  
>  
> [  270.590093] GPR16:    
>  
> [  270.590093] GPR20:  1001b5a8 10018338 
> 10016650 
> [  270.590093] GPR24: 1001b278 c00776e0fdcc 10016650 
>  
> [  270.590093] GPR28: c0077ffea910 c000fbf79180 c00776e0fdc0 
> c000fbf791d8 
> [  270.590126] NIP [c1aa3ca4] refcount_sub_and_test+0xf4/0x110
> [  270.590129] LR [c1aa3ca0] refcount_sub_and_test+0xf0/0x110
> [  270.590132] Call Trace:
> [  270.590134] [c000fb3abb20] [c1aa3ca0] 
> refcount_sub_and_test+0xf0/0x110 (unreliable)
> [  270.590139] [c000fb3abb80] [c1a8221c] kobject_put+0x3c/0xa0
> [  270.590143] [c000fb3abbf0] [c1d22d34] of_node_put+0x24/0x40
> [  270.590147] [c000fb3abc10] [c165c874] ofdt_write+0x204/0x6b0
> [  270.590151] [c000fb3abcd0] [c197a220] proc_reg_write+0x80/0xd0
> [  270.590155] [c000fb3abd00] [c18de680] __vfs_write+0x40/0x1c0
> [  270.590158] [c000fb3abd90] [c18dffd8] vfs_write+0xc8/0x240
> [  270.590162] [c000fb3abde0] [c18e1c40] SyS_write+0x60/0x110
> [  270.590165] [c000fb3abe30] [c15cb184] system_call+0x38/0xe0
> [  270.590168] Instruction dump:
> [  270.590170] 7863d182 4e800020 7c0802a6 3921 3d42fff8 3c62ffb1 386371a8 
> 992a0171 
> [  270.590175] f8010010 f821ffa1 48436de1 6000 <0fe0> 38210060 
> 3860 e8010010 
> [  270.590180] ---[ end trace 08c7a2f3c8bead33 ]—
> 
> Have attached the dmesg log from the system. Let me know if any additional
> information is required to help debug this problem.

I remember you mentioning this when the issue was brought up for CPUs. I
assume the case is the same here where the issue is only seen with
adapters that were hot-added after boot (ie. hot-remove of adapter
present at boot doesn't trip the warning)?

-Tyrel

> 
> Thanks
> -Sachin
> 
> 



Re: [RFC][PATCH] spin loop arch primitives for busy waiting

2017-04-06 Thread Linus Torvalds
On Thu, Apr 6, 2017 at 12:23 PM, Peter Zijlstra  wrote:
>
> Something like so then. According to the SDM mwait is a no-op if we do
> not execute monitor first. So this variant should get the first
> iteration without expensive instructions.

No, the problem is that we *would* have executed a prior monitor that
could still be pending - from a previous invocation of
smp_cond_load_acquire().

Especially with spinlocks, these things can very much happen back-to-back.

And it would be pending with a different address (the previous
spinlock) that might not have changed since then (and might not be
changing), so now we might actually be pausing in mwait waiting for
that *other* thing to change.

So it would probably need to do something complicated like

  #define smp_cond_load_acquire(ptr, cond_expr) \
  ({\
typeof(ptr) __PTR = (ptr);  \
typeof(*ptr) VAL;   \
do {\
VAL = READ_ONCE(*__PTR);\
if (cond_expr)  \
break;  \
for (;;) {  \
___monitor(__PTR, 0, 0);\
VAL = READ_ONCE(*__PTR);\
if (cond_expr) break;   \
___mwait(0xf0 /* C0 */, 0); \
}   \
} while (0) \
smp_acquire__after_ctrl_dep();  \
VAL;\
  })

which might just generate nasty enough code to not be worth it.

I dunno

 Linus


Re: [RFC][PATCH] spin loop arch primitives for busy waiting

2017-04-06 Thread Peter Zijlstra
On Thu, Apr 06, 2017 at 10:31:46AM -0700, Linus Torvalds wrote:
> And we'd probably want to make it even more strict, in that some mwait
> implementations might simply not be very good for short waits.

Yeah, we need to find something that works; assuming its beneficial at
all on modern chips.

> > But it builds and boots, no clue if its better or worse. Changing mwait
> > eax to 0 would give us C1 and might also be worth a try I suppose.
> 
> Hmm. Also:
> 
> > +   ___mwait(0xf0 /* C0 */, 0x01 /* INT */);\
> 
> Do you actually want that "INT" bit set? It's only meaningful if
> interrupts are _masked_, afaik. Which doesn't necessarily make sense
> for this case.

Correct, in fact its quite broken. Corrected.

> But maybe "monitor" is really cheap. I suspect it's microcoded,
> though, which implies "no".

Something like so then. According to the SDM mwait is a no-op if we do
not execute monitor first. So this variant should get the first
iteration without expensive instructions.

And this variant is actually quite amenable to alternative(), with which
we can pick between PAUSE and this.

---
 arch/x86/include/asm/barrier.h | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index bfb28ca..2b5d5a2 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -80,6 +80,36 @@ do { 
\
 #define __smp_mb__before_atomic()  barrier()
 #define __smp_mb__after_atomic()   barrier()
 
+static inline void ___monitor(const void *eax, unsigned long ecx,
+unsigned long edx)
+{
+   /* "monitor %eax, %ecx, %edx;" */
+   asm volatile(".byte 0x0f, 0x01, 0xc8;"
+:: "a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void ___mwait(unsigned long eax, unsigned long ecx)
+{
+   /* "mwait %eax, %ecx;" */
+   asm volatile(".byte 0x0f, 0x01, 0xc9;"
+:: "a" (eax), "c" (ecx));
+}
+
+#define smp_cond_load_acquire(ptr, cond_expr)  \
+({ \
+   typeof(ptr) __PTR = (ptr);  \
+   typeof(*ptr) VAL;   \
+   for (;;) {  \
+   VAL = READ_ONCE(*__PTR);\
+   if (cond_expr)  \
+   break;  \
+   ___mwait(0xf0 /* C0 */, 0); \
+   ___monitor(__PTR, 0, 0);\
+   }   \
+   smp_acquire__after_ctrl_dep();  \
+   VAL;\
+})
+
 #include 
 
 #endif /* _ASM_X86_BARRIER_H */


Re: [PATCH V3] powerpc/hugetlb: Add ABI defines for supported HugeTLB page sizes

2017-04-06 Thread LEROY Christophe

Hi

Anshuman Khandual  wrote:


This just adds user-space-exported ABI definitions for 2MB, 16MB, 1GB and
16GB non-default huge page sizes to be used with the mmap() system call.


Why not add all possible huge page sizes?
For instance, the 8xx (only) supports 512k and 8M hugepages.

Christophe



Signed-off-by: Anshuman Khandual 
---
These defined values will be used along with MAP_HUGETLB while calling the
mmap() system call if the desired HugeTLB page size is not the default
one (see the example sketch below). This follows similar definitions present on x86.

arch/x86/include/uapi/asm/mman.h:#define MAP_HUGE_2MB(21 <<  
MAP_HUGE_SHIFT)
arch/x86/include/uapi/asm/mman.h:#define MAP_HUGE_1GB(30 <<  
MAP_HUGE_SHIFT)
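
As an illustrative sketch only (not part of this patch): a user space
program asking for a 16MB huge page mapping with the proposed flag. The
16MB size is an assumption here, and the mmap() will fail unless 16MB
huge pages are configured on the system.

#include <sys/mman.h>   /* MAP_HUGE_16MB would come from the updated uapi header */
#include <stdio.h>

int main(void)
{
        size_t len = 16UL << 20;        /* one 16MB huge page */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_16MB,
                       -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");         /* likely: no 16MB huge pages reserved */
                return 1;
        }
        munmap(p, len);
        return 0;
}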


Changes in V3:
- Added comment about how these defines will help create 'flags'
  argument for mmap() system call per Balbir

Changes in V2:
- Added definitions for 2MB and 1GB HugeTLB pages per Aneesh

 arch/powerpc/include/uapi/asm/mman.h | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/include/uapi/asm/mman.h  
b/arch/powerpc/include/uapi/asm/mman.h

index 03c06ba..0c84e14 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -29,4 +29,14 @@
 #define MAP_STACK	0x2		/* give out an address that is best  
suited for process/thread stacks */

 #define MAP_HUGETLB0x4 /* create a huge page mapping */

+/*
+ * These defines should be used for creating 'flags' argument
+ * (26:31 bits) for the mmap() system call should the caller
+ * decide to use a non-default HugeTLB page size.
+ */
+#define MAP_HUGE_2MB   (21 << MAP_HUGE_SHIFT)/* 2MB HugeTLB Page */
+#define MAP_HUGE_16MB  (24 << MAP_HUGE_SHIFT)/* 16MB HugeTLB Page */
+#define MAP_HUGE_1GB   (30 << MAP_HUGE_SHIFT)/* 1GB HugeTLB Page */
+#define MAP_HUGE_16GB  (34 << MAP_HUGE_SHIFT)/* 16GB HugeTLB Page */
+
 #endif /* _UAPI_ASM_POWERPC_MMAN_H */
--
1.8.5.2





[PATCH v8 3/3] KVM: PPC: Exit guest upon MCE when FWNMI capability is enabled

2017-04-06 Thread Mahesh J Salgaonkar
From: Aravinda Prasad 

Enhance KVM to cause a guest exit with KVM_EXIT_NMI
exit reason upon a machine check exception (MCE) in
the guest address space if the KVM_CAP_PPC_FWNMI
capability is enabled (instead of delivering a 0x200
interrupt to the guest). This enables QEMU to build an error
log and deliver the machine check exception to the guest via the
guest-registered machine check handler.

This approach simplifies the delivery of machine
check exception to guest OS compared to the earlier
approach of KVM directly invoking 0x200 guest interrupt
vector.

This design/approach is based on the feedback for the
QEMU patches to handle machine check exception. Details
of earlier approach of handling machine check exception
in QEMU and related discussions can be found at:

https://lists.nongnu.org/archive/html/qemu-devel/2014-11/msg00813.html

Note:

This patch now directly invokes machine_check_print_event_info()
from kvmppc_handle_exit_hv() to print the event to host console
at the time of guest exit before the exception is passed on to the
guest. Hence, the host-side handling which was performed earlier
via machine_check_fwnmi is removed.

The reasons for this approach are: (i) it is not possible
to distinguish whether the exception occurred in the
guest or the host from the pt_regs passed on the
machine_check_exception(). Hence machine_check_exception()
calls panic, instead of passing on the exception to
the guest, if the machine check exception is not
recoverable. (ii) the approach introduced in this
patch gives opportunity to the host kernel to perform
actions in virtual mode before passing on the exception
to the guest. This approach does not require complex
tweaks to machine_check_fwnmi and friends.

Signed-off-by: Aravinda Prasad 
Reviewed-by: David Gibson 
Signed-off-by: Mahesh Salgaonkar 
---
 arch/powerpc/include/asm/kvm_host.h |2 +
 arch/powerpc/include/uapi/asm/kvm.h |6 
 arch/powerpc/kvm/book3s_hv.c|   23 --
 arch/powerpc/kvm/book3s_hv_ras.c|   18 ++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |   52 ++-
 5 files changed, 69 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 4d02cbc..0627a5a 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define KVM_MAX_VCPUS  NR_CPUS
 #define KVM_MAX_VCORES NR_CPUS
@@ -675,6 +676,7 @@ struct kvm_vcpu_arch {
int prev_cpu;
bool timer_running;
wait_queue_head_t cpu_run;
+   struct machine_check_event mce_evt; /* Valid if trap == 0x200 */
 
struct kvm_vcpu_arch_shared *shared;
 #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index 4edbe4b..1fdf64c 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -57,6 +57,12 @@ struct kvm_regs {
 
 #define KVM_SREGS_E_FSL_PIDn   (1 << 0) /* PID1/PID2 */
 
+/* flags for kvm_run.flags */
+#define KVM_RUN_PPC_NMI_DISP_MASK  (3 << 0)
+#define   KVM_RUN_PPC_NMI_DISP_FULLY_RECOV (1 << 0)
+#define   KVM_RUN_PPC_NMI_DISP_LIMITED_RECOV   (2 << 0)
+#define   KVM_RUN_PPC_NMI_DISP_NOT_RECOV   (3 << 0)
+
 /*
  * Feature bits indicate which sections of the sregs struct are valid,
  * both in KVM_GET_SREGS and KVM_SET_SREGS.  On KVM_SET_SREGS, registers
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1ec86d9..9a33689 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -966,15 +966,20 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_MACHINE_CHECK:
-   /*
-* Deliver a machine check interrupt to the guest.
-* We have to do this, even if the host has handled the
-* machine check, because machine checks use SRR0/1 and
-* the interrupt might have trashed guest state in them.
-*/
-   kvmppc_book3s_queue_irqprio(vcpu,
-   BOOK3S_INTERRUPT_MACHINE_CHECK);
-   r = RESUME_GUEST;
+   /* Exit to guest with KVM_EXIT_NMI as exit reason */
+   run->exit_reason = KVM_EXIT_NMI;
+   run->hw.hardware_exit_reason = vcpu->arch.trap;
+   /* Clear out the old NMI status from run->flags */
+   run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK;
+   /* Now set the NMI status */
+   if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
+   run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
+   else
+   run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;

[PATCH v8 2/3] powerpc/book3s: EXPORT_SYMBOL machine_check_print_event_info

2017-04-06 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar 

It will be used in arch/powerpc/kvm/book3s_hv.c KVM module.

Signed-off-by: Mahesh Salgaonkar 
---
 arch/powerpc/kernel/mce.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index a1475e6..e567ff1 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -393,6 +393,7 @@ void machine_check_print_event_info(struct 
machine_check_event *evt)
break;
}
 }
+EXPORT_SYMBOL(machine_check_print_event_info);
 
 uint64_t get_mce_fault_addr(struct machine_check_event *evt)
 {



[PATCH v8 1/3] KVM: PPC: Add new capability to control MCE behaviour

2017-04-06 Thread Mahesh J Salgaonkar
From: Aravinda Prasad 

This patch introduces a new KVM capability to control
how KVM behaves on machine check exception (MCE).
Without this capability, KVM redirects machine check
exceptions to guest's 0x200 vector, if the address in
error belongs to the guest. With this capability KVM
causes a guest exit with NMI exit reason.

The new capability is required to avoid problems if
a new kernel/KVM is used with an old QEMU for guests
that don't issue "ibm,nmi-register". As old QEMU does
not understand the NMI exit type, it treats it as a
fatal error. However, the guest could have handled
the machine check error if the exception was delivered
to guest's 0x200 interrupt vector instead of NMI exit
in case of old QEMU.
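
As a rough illustration only (not part of this patch; 'vcpu_fd' is a
hypothetical per-vcpu file descriptor obtained from KVM_CREATE_VCPU),
user space such as QEMU could enable the capability with the standard
KVM_ENABLE_CAP ioctl once the guest has issued "ibm,nmi-register":

/* requires <linux/kvm.h>, <sys/ioctl.h> and <stdio.h> */
static int enable_fwnmi(int vcpu_fd)
{
        struct kvm_enable_cap cap = { .cap = KVM_CAP_PPC_FWNMI };

        if (ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap) < 0) {
                perror("KVM_ENABLE_CAP(KVM_CAP_PPC_FWNMI)");
                return -1;
        }
        return 0;
}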

Signed-off-by: Aravinda Prasad 
Reviewed-by: David Gibson 
Signed-off-by: Mahesh Salgaonkar 
---
 Documentation/virtual/kvm/api.txt   |   11 +++
 arch/powerpc/include/asm/kvm_host.h |1 +
 arch/powerpc/kernel/asm-offsets.c   |1 +
 arch/powerpc/kvm/powerpc.c  |9 +
 include/uapi/linux/kvm.h|1 +
 5 files changed, 23 insertions(+)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index fd10689..1c9d281 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4164,6 +4164,17 @@ to take care of that.
 This capability can be enabled dynamically even if VCPUs were already
 created and are running.
 
+7.9 KVM_CAP_PPC_FWNMI
+
+Architectures: ppc
+Parameters: none
+
+With this capability a machine check exception in the guest address
+space will cause KVM to exit the guest with NMI exit reason. This
+enables QEMU to build error log and branch to guest kernel registered
+machine check handling routine. Without this capability KVM will
+branch to guests' 0x200 interrupt vector.
+
 8. Other capabilities.
 --
 
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 7bba8f4..4d02cbc 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -279,6 +279,7 @@ struct kvm_arch {
struct dentry *debugfs_dir;
struct dentry *htab_dentry;
struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */
+   u8 fwnmi_enabled;
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
struct mutex hpt_mutex;
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 4367e7d..0daa47b 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -481,6 +481,7 @@ int main(void)
OFFSET(KVM_ENABLED_HCALLS, kvm, arch.enabled_hcalls);
OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v);
OFFSET(KVM_RADIX, kvm, arch.radix);
+   OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled);
OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr);
OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar);
OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 95c91a9..b8e8cd4 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -618,6 +618,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
/* Disable this on POWER9 until code handles new HPTE format */
r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300);
break;
+   case KVM_CAP_PPC_FWNMI:
+   r = 1;
+   break;
 #endif
case KVM_CAP_PPC_HTM:
r = cpu_has_feature(CPU_FTR_TM_COMP) &&
@@ -1226,6 +1229,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu 
*vcpu,
break;
}
 #endif /* CONFIG_KVM_XICS */
+#ifdef CONFIG_PPC_BOOK3S_64
+   case KVM_CAP_PPC_FWNMI:
+   r = 0;
+   vcpu->kvm->arch.fwnmi_enabled = true;
+   break;
+#endif /* CONFIG_PPC_BOOK3S_64 */
default:
r = -EINVAL;
break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index f51d508..d5428a7 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -883,6 +883,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_MMU_RADIX 134
 #define KVM_CAP_PPC_MMU_HASH_V3 135
 #define KVM_CAP_IMMEDIATE_EXIT 136
+#define KVM_CAP_PPC_FWNMI 137
 
 #ifdef KVM_CAP_IRQ_ROUTING
 



[PATCH v8 0/3] KVM: PPC: Add FWNMI support for KVM guests on POWER

2017-04-06 Thread Mahesh J Salgaonkar
From: Aravinda Prasad 

This series of patches add FWNMI support for KVM guests
on POWER.

Memory errors such as bit flips that cannot be corrected
by hardware are passed on to the kernel for handling
by raising a machine check exception (an NMI). Upon such
machine check exceptions, if the address in error belongs
to the guest, the error is passed on to the guest
kernel for handling. However, for guest kernels that
have issued "ibm,nmi-register" call, QEMU should build
an error log and pass on the error log to the guest-
kernel registered machine check handler routine.

This patch series adds the functionality to pass on the
machine check exception to the guest kernel by
giving control to QEMU. QEMU builds the error log
and invokes the guest-kernel registered handler.

QEMU part can be found at:
http://lists.nongnu.org/archive/html/qemu-ppc/2015-12/msg00199.html

Change Log v8:
  - export symbol machine_check_print_event_info() and directly call it
from kvmppc_handle_exit_hv().

Change Log v7:
  - Move the FWNMI capability inside #ifdef CONFIG_PPC_BOOK3S_64 block.
  - Split 2nd patch into two, one that adds mce hook and other that touches
kvm code.

Change Log v6:
  - Deliver all MCE errors (handled/unhandled) for FWNMI capable guest.
  - Use kvm_run->flags to pass NMI disposition status.

Change Log v5:
  - Added capability documentation. No functionality/code change.

Change Log v4:
  - Allow host-side handling of the machine check exception before
passing on the exception to the guest.

Change Log v3:
  - Split the patch into 2. First patch introduces the
new capability while the second one enhances KVM to
redirect MCE.
  - Fix access width bug

Change Log v2:
  - Added KVM capability

---

Aravinda Prasad (2):
  KVM: PPC: Add new capability to control MCE behaviour
  KVM: PPC: Exit guest upon MCE when FWNMI capability is enabled

Mahesh Salgaonkar (1):
  powerpc/book3s: EXPORT_SYMBOL machine_check_print_event_info


 Documentation/virtual/kvm/api.txt   |   11 +++
 arch/powerpc/include/asm/kvm_host.h |3 ++
 arch/powerpc/include/uapi/asm/kvm.h |6 
 arch/powerpc/kernel/asm-offsets.c   |1 +
 arch/powerpc/kernel/mce.c   |1 +
 arch/powerpc/kvm/book3s_hv.c|   23 --
 arch/powerpc/kvm/book3s_hv_ras.c|   18 ++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |   52 ++-
 arch/powerpc/kvm/powerpc.c  |9 +
 include/uapi/linux/kvm.h|1 +
 10 files changed, 93 insertions(+), 32 deletions(-)

--
Signature



Re: [RFC][PATCH] spin loop arch primitives for busy waiting

2017-04-06 Thread Linus Torvalds
On Thu, Apr 6, 2017 at 9:36 AM, Peter Zijlstra  wrote:
>
> Something like the below, which is ugly (because I couldn't be bothered
> to resolve the header recursion and thus duplicates the monitor/mwait
> functions) and broken (because it hard assumes the hardware can do
> monitor/mwait).

Yeah, I think it needs to be conditional not just om mwait support,
but on the "new" mwait support (ie "CPUID.05H:ECX[bit 1] = 1").

And we'd probably want to make it even more strict, in that some mwait
implementations might simply not be very good for short waits.

Because I think the latency was hundreds of cycles at some point (but
that may have been the original version that wouldn't have had the
"new mwait" bit set), and there are also issues with virtualization
(ie we may not want to do this in a guest because it causes a VM
exit).

> But it builds and boots, no clue if its better or worse. Changing mwait
> eax to 0 would give us C1 and might also be worth a try I suppose.

Hmm. Also:

> +   ___mwait(0xf0 /* C0 */, 0x01 /* INT */);\

Do you actually want that "INT" bit set? It's only meaningful if
interrupts are _masked_, afaik. Which doesn't necessarily make sense
for this case.

If interrupts would actually get delivered to software, mwait will
exit regardless.

So I think __mwait(0,0) might be the rigth thing at least in some
cases. Or at least worth timing at some point.

Of course, we really shouldn't have very many places that actually
need this. We currently use it in three places, I think:

 - spinlocks. This is likely the the big case.

 - the per-cpu cross-cpu calling (call_single_data) exclusivity waiting

 - the magical on_cpu waiting in ttwu. I'm not sure how often this
actually triggers, the original argument for this was to avoid an
expensive barrier - the loop itself probably very seldom actually
triggers.

It may be, for example, that just the fact that your implementation
does the "__monitor()" part before doing the load and test, might be
already too expensive for the (common) cases where we don't expect to
loop.

But maybe "monitor" is really cheap. I suspect it's microcoded,
though, which implies "no".

  Linus


Re: [RFC][PATCH] spin loop arch primitives for busy waiting

2017-04-06 Thread Peter Zijlstra
On Thu, Apr 06, 2017 at 08:16:19AM -0700, Linus Torvalds wrote:

> In theory x86 could use monitor/mwait for it too, in practice I think
> it tends to still be too high latency (because it was originally just
> designed for the idle loop). mwait got extended to actually be useful,
> but I'm not sure what the latency is for the modern one.

I've been meaning to test mwait-c0 for this, but never got around to it.

Something like the below, which is ugly (because I couldn't be bothered
to resolve the header recursion and thus duplicates the monitor/mwait
functions) and broken (because it hard assumes the hardware can do
monitor/mwait).

But it builds and boots, no clue if its better or worse. Changing mwait
eax to 0 would give us C1 and might also be worth a try I suppose.

---
 arch/x86/include/asm/barrier.h | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index bfb28ca..faab9cd 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -80,6 +80,36 @@ do { 
\
 #define __smp_mb__before_atomic()  barrier()
 #define __smp_mb__after_atomic()   barrier()
 
+static inline void ___monitor(const void *eax, unsigned long ecx,
+unsigned long edx)
+{
+   /* "monitor %eax, %ecx, %edx;" */
+   asm volatile(".byte 0x0f, 0x01, 0xc8;"
+:: "a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void ___mwait(unsigned long eax, unsigned long ecx)
+{
+   /* "mwait %eax, %ecx;" */
+   asm volatile(".byte 0x0f, 0x01, 0xc9;"
+:: "a" (eax), "c" (ecx));
+}
+
+#define smp_cond_load_acquire(ptr, cond_expr)  \
+({ \
+   typeof(ptr) __PTR = (ptr);  \
+   typeof(*ptr) VAL;   \
+   for (;;) {  \
+   ___monitor(__PTR, 0, 0);\
+   VAL = READ_ONCE(*__PTR);\
+   if (cond_expr)  \
+   break;  \
+   ___mwait(0xf0 /* C0 */, 0x01 /* INT */);\
+   }   \
+   smp_acquire__after_ctrl_dep();  \
+   VAL;\
+})
+
 #include 
 
 #endif /* _ASM_X86_BARRIER_H */


Re: [RFC][PATCH] spin loop arch primitives for busy waiting

2017-04-06 Thread Nicholas Piggin
On Thu, 6 Apr 2017 15:13:53 +0100
Will Deacon  wrote:

> Hi Nick,
> 
> On Thu, Apr 06, 2017 at 10:59:58AM +1000, Nicholas Piggin wrote:
> > On Wed, 05 Apr 2017 07:01:57 -0700 (PDT)
> > David Miller  wrote:
> >   
> > > From: Nicholas Piggin 
> > > Date: Tue, 4 Apr 2017 13:02:33 +1000
> > >   
> > > > On Mon, 3 Apr 2017 17:43:05 -0700
> > > > Linus Torvalds  wrote:
> > > > 
> > > >> But that depends on architectures having some pattern that we *can*
> > > >> abstract. Would some "begin/in-loop/end" pattern like the above be
> > > >> sufficient?
> > > > 
> > > > Yes. begin/in/end would be sufficient for powerpc SMT priority, and
> > > > for x86, and it looks like sparc64 too. So we could do that if you
> > > > prefer.
> > > 
> > > Sparc64 has two cases, on older chips we can induce a cpu thread yield
> > > with a special sequence of instructions, and on newer chips we have
> > > a bonafide pause instruction.
> > > 
> > > So cpu_relax() all by itself pretty much works for us.
> > >   
> > 
> > Thanks for taking a look. The default spin primitives should just
> > continue to do the right thing for you in that case.
> > 
> > Arm has a yield instruction, ia64 has a pause... No unusual
> > requirements that I can see.  
> 
> Yield tends to be implemented as a NOP in practice, since it's in the
> architecture for SMT CPUs and most ARM CPUs are single-threaded. We do have
> the WFE instruction (wait for event) which is used in our implementation of
> smp_cond_load_acquire, but I don't think we'd be able to use it with the
> proposals here.
> 
> WFE can stop the clock for the CPU until an "event" is signalled by
> another CPU. This could be done by an explicit SEV (send event) instruction,
> but that tends to require heavy barriers on the signalling side. Instead,
> the preferred way to generate an event is to clear the exclusive monitor
> reservation for the CPU executing the WFE. That means that the waiter
> does something like:
> 
>   LDXR x0, [some_address] // Load exclusive from some_address
>   CMP  x0, some value // If the value matches what I want
>   B.EQ out// then we're done
>   WFE // otherwise, wait
> 
> at this point, the waiter will stop on the WFE until its monitor is cleared,
> which happens if another CPU writes to some_address.
> 
> We've wrapped this up in the arm64 code as __cmpwait, and we use that
> to build smp_cond_load_acquire. It would be nice to use the same machinery
> for the conditional spinning here, unless you anticipate that we're only
> going to be spinning for a handful of iterations anyway?

So I do want to look at adding spin loop primitives as well as the
begin/in/end primitives to help with powerpc's SMT priorities.

So we'd have:

  spin_begin();
  spin_do {
if (blah) {
spin_end();
return;
}
  } spin_until(!locked);
  spin_end();

So you could implement your monitor with that. There's a handful of core
places. mutex, bit spinlock, seqlock, polling idle, etc. So I think if it
is beneficial for you in smp_cond_load_acquire, it should be useful in
those too.
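
A minimal sketch of how those wrappers might expand (an assumption for
illustration; the actual RFC patch is not quoted in this thread), with
powerpc mapping begin/end to the SMT thread-priority nops:

#define spin_do                 do
#define spin_until(cond)        while (!(cond))

/* powerpc: drop SMT thread priority while busy-waiting, restore after */
#define spin_begin()            asm volatile("or 1,1,1")        /* HMT_low()    */
#define spin_end()              asm volatile("or 2,2,2")        /* HMT_medium() */

Architectures with no such notion could make spin_begin()/spin_end()
empty, leaving a plain cpu_relax()-style loop.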



Re: [RFC][PATCH] spin loop arch primitives for busy waiting

2017-04-06 Thread Linus Torvalds
On Thu, Apr 6, 2017 at 7:13 AM, Will Deacon  wrote:
>
> We've wrapped this up in the arm64 code as __cmpwait, and we use that
> to build smp_cond_load_acquire. It would be nice to use the same machinery
> for the conditional spinning here, unless you anticipate that we're only
> going to be spinning for a handful of iterations anyway?

I suspect most of these loops aren't set up for the WFE kind of
spinning, because they look for more than one variable.

.. and the ones that _are_ set up for this probably should just be
rewritten to use smp_cond_load_acquire() anyway, because the "wait for
value" special case is fairly special.

In theory x86 could use monitor/mwait for it too, in practice I think
it tends to still be too high latency (because it was originally just
designed for the idle loop). mwait got extended to actually be useful,
but I'm not sure what the latency is for the modern one.

   Linus


[PATCH 2/3] powerpc/mm: Add comments to the vmemmap layout

2017-04-06 Thread Anshuman Khandual
Add some explanation of the layout of the vmemmap virtual address
space and how physical page mapping is only used for valid PFNs
present at any point on the system.

Reviewed-by: Aneesh Kumar K.V 
Signed-off-by: Anshuman Khandual 
---
Previous discussions around this http://patchwork.ozlabs.org/patch/584111/

 arch/powerpc/mm/pgtable-hash64.c | 75 
 1 file changed, 75 insertions(+)

diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
index 8b85a14..16877f6 100644
--- a/arch/powerpc/mm/pgtable-hash64.c
+++ b/arch/powerpc/mm/pgtable-hash64.c
@@ -22,6 +22,81 @@
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 /*
+ * vmemmap is the starting address of the virtual address space where
+ * struct pages are allocated for all possible PFNs present on the system
+ * including holes and bad memory (hence sparse). These virtual struct
+ * pages are stored in sequence in this virtual address space irrespective
+ * of whether the corresponding PFN is valid or not. This achieves a
+ * constant relationship between the address of a struct page and its PFN.
+ *
+ * During boot or memory hotplug operation when a new memory section is
+ * added, physical memory allocation (including hash table bolting) will
+ * be performed for the set of struct pages which are part of the memory
+ * section. This saves memory by not allocating struct pages for PFNs
+ * which are not valid.
+ *
+ * --
+ * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES|
+ * --
+ *
+ *f000  c000
+ * vmemmap +--+  +--+
+ *  +  |  page struct | +--> |  page struct |
+ *  |  +--+  +--+
+ *  |  |  page struct | +--> |  page struct |
+ *  |  +--+ |+--+
+ *  |  |  page struct | +   +--> |  page struct |
+ *  |  +--+ |+--+
+ *  |  |  page struct | |   +--> |  page struct |
+ *  |  +--+ |   |+--+
+ *  |  |  page struct | |   |
+ *  |  +--+ |   |
+ *  |  |  page struct | |   |
+ *  |  +--+ |   |
+ *  |  |  page struct | |   |
+ *  |  +--+ |   |
+ *  |  |  page struct | |   |
+ *  |  +--+ |   |
+ *  |  |  page struct | +---+   |
+ *  |  +--+ |
+ *  |  |  page struct | +---+
+ *  |  +--+
+ *  |  |  page struct | No mapping
+ *  |  +--+
+ *  |  |  page struct | No mapping
+ *  v  +--+
+ *
+ * -
+ * | RELATION BETWEEN STRUCT PAGES AND PFNS|
+ * -
+ *
+ * vmemmap +--+ +---+
+ *  +  |  page struct | +-> |  PFN  |
+ *  |  +--+ +---+
+ *  |  |  page struct | +-> |  PFN  |
+ *  |  +--+ +---+
+ *  |  |  page struct | +-> |  PFN  |
+ *  |  +--+ +---+
+ *  |  |  page struct | +-> |  PFN  |
+ *  |  +--+ +---+
+ *  |  |  |
+ *  |  +--+
+ *  |  |  |
+ *  |  +--+
+ *  |  |  |
+ *  |  +--+ +---+
+ *  |  |  page struct | +-> |  PFN  |
+ *  |  +--+ +---+
+ *  |  |  |
+ *  |  +--+
+ *  |  |  |
+ *  |  +--+ +---+
+ *  |  |  page struct | +-> |  PFN  |
+ *  |  +--+ +---+
+ *  |  |  page struct | +-> |  PFN  |
+ *  v  +--+ +---+
+ */
+/*
  * On hash-based CPUs, the vmemmap is bolted in the hash table.
  *
  */
-- 
1.9.3



[PATCH 3/3] powerpc/mm: Add comments on vmemmap physical mapping

2017-04-06 Thread Anshuman Khandual
Adds some explanation of how the vmemmap-based struct page layout's
physical mapping is allocated and tracked through a linked list. It
also notes a possible race condition.

Signed-off-by: Anshuman Khandual 
---
Previous discussions on this http://patchwork.ozlabs.org/patch/584110/
Michael Ellerman had agreed to take the comments alone.
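
For reference, the structure these comments describe is roughly the
following (a from-memory sketch, not part of this patch; see the powerpc
pgalloc headers for the authoritative definition):

struct vmemmap_backing {
        struct vmemmap_backing *list;   /* link to the previous entry       */
        unsigned long phys;             /* physical block backing the range */
        unsigned long virt_addr;        /* vmemmap virtual address covered  */
};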

 arch/powerpc/mm/init_64.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 2a15986..6e5c54d 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -118,8 +118,28 @@ static int __meminit vmemmap_populated(unsigned long 
start, int page_size)
return 0;
 }
 
+/*
+ * vmemmap virtual address space management does not have a traditional page
+ * table to track which virtual struct pages are backed by physical mapping.
+ * The virtual to physical mappings are tracked in a simple linked list
+ * format. 'vmemmap_list' maintains the entire vmemmap physical mapping at
+ * all times whereas the 'next' list maintains the available
+ * vmemmap_backing structures which have been deleted from the
+ * 'vmemmap_global' list during system runtime (memory hotplug remove
+ * operation). The freed 'vmemmap_backing' structures are reused later when
+ * new requests come in without allocating fresh memory. This pointer also
+ * tracks the allocated 'vmemmap_backing' structures as we allocate one
+ * full page of memory at a time when we don't have any.
+ */
 struct vmemmap_backing *vmemmap_list;
 static struct vmemmap_backing *next;
+
+/* The same pointer 'next' tracks individual chunks inside the allocated
+ * full page during boot and again tracks the freed nodes during
+ * runtime. This is racy, but the race does not occur because the two uses
+ * are separated by the boot process. It would be a problem if we somehow
+ * had a memory hotplug operation during boot!
+ */
 static int num_left;
 static int num_freed;
 
-- 
1.9.3



[PATCH 1/3] powerpc/mm: Rename variable to reflect start address of a section

2017-04-06 Thread Anshuman Khandual
Commit 16a05bff1 ("powerpc: start loop at section start of
start in vmemmap_populated()") reused the 'start' variable to compute
the starting address of the memory section to which the given address
belongs. The same variable is then used to iterate over the starting
addresses of all memory sections before reaching the 'end' address.
Renaming it to 'section_start' makes the logic clearer.

Signed-off-by: Anshuman Khandual 
---
Previous discussions on this http://patchwork.ozlabs.org/patch/584103/

 arch/powerpc/mm/init_64.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index c22f207..2a15986 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -104,11 +104,15 @@ static unsigned long __meminit 
vmemmap_section_start(unsigned long page)
  */
 static int __meminit vmemmap_populated(unsigned long start, int page_size)
 {
-   unsigned long end = start + page_size;
-   start = (unsigned long)(pfn_to_page(vmemmap_section_start(start)));
+   unsigned long end, section_start;
 
-   for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page)))
-   if (pfn_valid(page_to_pfn((struct page *)start)))
+   end = start + page_size;
+   section_start = (unsigned long)(pfn_to_page
+   (vmemmap_section_start(start)));
+
+   for (; section_start < end; section_start
+   += (PAGES_PER_SECTION * sizeof(struct page)))
+   if (pfn_valid(page_to_pfn((struct page *)section_start)))
return 1;
 
return 0;
-- 
1.9.3



Re: [RFC][PATCH] spin loop arch primitives for busy waiting

2017-04-06 Thread Will Deacon
Hi Nick,

On Thu, Apr 06, 2017 at 10:59:58AM +1000, Nicholas Piggin wrote:
> On Wed, 05 Apr 2017 07:01:57 -0700 (PDT)
> David Miller  wrote:
> 
> > From: Nicholas Piggin 
> > Date: Tue, 4 Apr 2017 13:02:33 +1000
> > 
> > > On Mon, 3 Apr 2017 17:43:05 -0700
> > > Linus Torvalds  wrote:
> > >   
> > >> But that depends on architectures having some pattern that we *can*
> > >> abstract. Would some "begin/in-loop/end" pattern like the above be
> > >> sufficient?  
> > > 
> > > Yes. begin/in/end would be sufficient for powerpc SMT priority, and
> > > for x86, and it looks like sparc64 too. So we could do that if you
> > > prefer.  
> > 
> > Sparc64 has two cases, on older chips we can induce a cpu thread yield
> > with a special sequence of instructions, and on newer chips we have
> > a bonafide pause instruction.
> > 
> > So cpu_relax() all by itself pretty much works for us.
> > 
> 
> Thanks for taking a look. The default spin primitives should just
> continue to do the right thing for you in that case.
> 
> Arm has a yield instruction, ia64 has a pause... No unusual
> requirements that I can see.

Yield tends to be implemented as a NOP in practice, since it's in the
architecture for SMT CPUs and most ARM CPUs are single-threaded. We do have
the WFE instruction (wait for event) which is used in our implementation of
smp_cond_load_acquire, but I don't think we'd be able to use it with the
proposals here.

WFE can stop the clock for the CPU until an "event" is signalled by
another CPU. This could be done by an explicit SEV (send event) instruction,
but that tends to require heavy barriers on the signalling side. Instead,
the preferred way to generate an event is to clear the exclusive monitor
reservation for the CPU executing the WFE. That means that the waiter
does something like:

LDXR x0, [some_address] // Load exclusive from some_address
CMP  x0, some value // If the value matches what I want
B.EQ out// then we're done
WFE // otherwise, wait

at this point, the waiter will stop on the WFE until its monitor is cleared,
which happens if another CPU writes to some_address.

We've wrapped this up in the arm64 code as __cmpwait, and we use that
to build smp_cond_load_acquire. It would be nice to use the same machinery
for the conditional spinning here, unless you anticipate that we're only
going to be spinning for a handful of iterations anyway?

Cheers,

Will


Re: [PATCH 2/3] of/fdt: introduce of_scan_flat_dt_subnodes and of_get_flat_dt_phandle

2017-04-06 Thread Rob Herring
On Wed, Apr 5, 2017 at 7:38 PM, Nicholas Piggin  wrote:
> On Thu, 06 Apr 2017 06:58:01 +1000
> Benjamin Herrenschmidt  wrote:
>
>> On Wed, 2017-04-05 at 10:58 -0500, Rob Herring wrote:
>> > Well, I'd like to avoid expanding usage of flat DT parsing in the
>> > kernel. But you could just put this function into arch/powerpc and I'd
>> > never see it, but I like that even less. Mainly, I just wanted to
>> > raise the point.
>> >
>> > Your argument works until you need that setup in assembly code, then
>> > you are in the situation that you need to either handle the setup in
>> > bootloader/firmware or have a simple way to determine that condition.
>>
>> The main issue is that changing that is a very very invasive change in
>> an extremely fragile and rather nasty area of code shared by 32 and 64-
>> bit for which we don't even have easy access to all the machines to
>> test with anymore :)
>>
>> It's probably not impossible, but it would delay the new cpu feature
>> stuff that Nick is making by a lot, probably monthes, making it nearly
>> impossible to get back into distros etc...
>>
>> So while it might be something to consider, I would definitely keep
>> that as a separate unit of work to do later.
>
> Yeah, it's no longer a "drop in" replacement for existing features
> testing if we do this, which makes it hard to backport too (we will
> need this for compatibility with future firmware, so it will have to
> go into distro kernels.)
>
> Given that it's quite a small addition to of/fdt code, hopefully
> that gives you a reasonable justification to accept it.
>
> If you prefer not to, that's okay, but I think we would have to carry
> it in arch/powerpc at least for a time, because of the schedule we're
> working to for POWER9 enablement. As a longer term item I agree with you
> and Ben, it would be worth considering unflattening earlier.

As I mentioned, keeping it in arch/powerpc I like even less. So this is fine.

Rob


[PATCH] powerpc/crypto/crc32c-vpmsum: Fix missing preempt_disable()

2017-04-06 Thread Michael Ellerman
In crc32c_vpmsum() we call enable_kernel_altivec() without first
disabling preemption, which is not allowed:

  WARNING: CPU: 9 PID: 2949 at ../arch/powerpc/kernel/process.c:277 
enable_kernel_altivec+0x100/0x120
  Modules linked in: dm_thin_pool dm_persistent_data dm_bio_prison dm_bufio 
libcrc32c vmx_crypto ...
  CPU: 9 PID: 2949 Comm: docker Not tainted 
4.11.0-rc5-compiler_gcc-6.3.1-00033-g308ac7563944 #381
  ...
  NIP [c001e320] enable_kernel_altivec+0x100/0x120
  LR [d3df0910] crc32c_vpmsum+0x108/0x150 [crc32c_vpmsum]
  Call Trace:
0xc138fd09 (unreliable)
crc32c_vpmsum+0x108/0x150 [crc32c_vpmsum]
crc32c_vpmsum_update+0x3c/0x60 [crc32c_vpmsum]
crypto_shash_update+0x88/0x1c0
crc32c+0x64/0x90 [libcrc32c]
dm_bm_checksum+0x48/0x80 [dm_persistent_data]
sb_check+0x84/0x120 [dm_thin_pool]
dm_bm_validate_buffer.isra.0+0xc0/0x1b0 [dm_persistent_data]
dm_bm_read_lock+0x80/0xf0 [dm_persistent_data]
__create_persistent_data_objects+0x16c/0x810 [dm_thin_pool]
dm_pool_metadata_open+0xb0/0x1a0 [dm_thin_pool]
pool_ctr+0x4cc/0xb60 [dm_thin_pool]
dm_table_add_target+0x16c/0x3c0
table_load+0x184/0x400
ctl_ioctl+0x2f0/0x560
dm_ctl_ioctl+0x38/0x50
do_vfs_ioctl+0xd8/0x920
SyS_ioctl+0x68/0xc0
system_call+0x38/0xfc

It used to be sufficient just to call pagefault_disable(), because that
also disabled preemption. But the two were decoupled in commit 8222dbe21e79
("sched/preempt, mm/fault: Decouple preemption from the page fault
logic") in mid 2015.

So add the missing preempt_disable/enable(). We should also call
disable_kernel_fp(), although it does nothing by default, there is a
debug switch to make it active and all enables should be paired with
disables.

Fixes: 6dd7a82cc54e ("crypto: powerpc - Add POWER8 optimised crc32c")
Cc: sta...@vger.kernel.org # v4.8+
Signed-off-by: Michael Ellerman 
---
 arch/powerpc/crypto/crc32c-vpmsum_glue.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c 
b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
index 411994551afc..f058e0c3e4d4 100644
--- a/arch/powerpc/crypto/crc32c-vpmsum_glue.c
+++ b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
@@ -33,10 +33,13 @@ static u32 crc32c_vpmsum(u32 crc, unsigned char const *p, 
size_t len)
}
 
if (len & ~VMX_ALIGN_MASK) {
+   preempt_disable();
pagefault_disable();
enable_kernel_altivec();
crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+   disable_kernel_altivec();
pagefault_enable();
+   preempt_enable();
}
 
tail = len & VMX_ALIGN_MASK;
-- 
2.7.4



Re: powerpc/mm: Add missing global TLBI if cxl is active

2017-04-06 Thread Michael Ellerman
On Wed, 2017-03-29 at 17:19:42 UTC, Frederic Barrat wrote:
> Commit 4c6d9acce1f4 ("powerpc/mm: Add hooks for cxl") converted local
> TLBIs to global if the cxl driver is active. It is necessary because
> the CAPP snoops invalidations to forward them to the PSL on the cxl
> adapter.
> However one path was apparently forgotten. native_flush_hash_range()
> still sends local TLBIs, as we found out the hard way recently.
> 
> This patch fixes it by following the same logic as previously: if the
> cxl driver is active, the local TLBIs are 'upgraded' to global.
> 
> Fixes: 4c6d9acce1f4 ("powerpc/mm: Add hooks for cxl")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Frederic Barrat 
> Reviewed-by: Aneesh Kumar K.V 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/88b1bf7268f56887ca88eb09c6fb0f

cheers


Re: [v2] powerpc/64: Fix flush_(d|i)cache_range() called from modules

2017-04-06 Thread Michael Ellerman
On Wed, 2017-04-05 at 11:49:09 UTC, Michael Ellerman wrote:
> From: Oliver O'Halloran 
> 
> When the kernel is compiled to use 64bit ABIv2 the _GLOBAL() macro does
> not include a global entry point. A function's global entry point is
> used when the function is called from a different TOC context and in the
> kernel this typically means a call from a module into the vmlinux (or
> vice-versa).
> 
> There are a few exported asm functions declared with _GLOBAL() and
> calling them from a module will likely crash the kernel since any TOC
> relative load will yield garbage.
> 
> flush_icache_range() and flush_dcache_range() are both exported to
> modules, and use the TOC, so must use _GLOBAL_TOC().
> 
> Fixes: 721aeaa9fdf3 ("powerpc: Build little endian ppc64 kernel with ABIv2")
> Cc: sta...@vger.kernel.org # v3.16+
> Signed-off-by: Oliver O'Halloran 
> Signed-off-by: Michael Ellerman 

Applied to powerpc fixes.

https://git.kernel.org/powerpc/c/8f5f525d5b83f7d76a6baf9c4e94d4

cheers


Re: powerpc: Don't try to fix up misaligned load-with-reservation instructions

2017-04-06 Thread Michael Ellerman
On Tue, 2017-04-04 at 04:56:05 UTC, Paul Mackerras wrote:
> In the past, there was only one load-with-reservation instruction,
> lwarx, and if a program attempted a lwarx on a misaligned address, it
> would take an alignment interrupt and the kernel handler would emulate
> it as though it was lwzx, which was not really correct, but benign
> since it is loading the right amount of data, and the lwarx should be
> paired with a stwcx. to the same address, which would also cause an
> alignment interrupt which would result in a SIGBUS being delivered to
> the process.
> 
> We now have 5 different sizes of load-with-reservation instruction.
> Of those, lharx and ldarx cause an immediate SIGBUS by luck since
> their entries in aligninfo[] overlap instructions which were not
> fixed up, but lqarx overlaps with lhz and will be emulated as such.
> lbarx can never generate an alignment interrupt since it only
> operates on 1 byte.
> 
> To straighten this out and fix the lqarx case, this adds code to
> detect the l[hwdq]arx instructions and return without fixing them
> up, resulting in a SIGBUS being delivered to the process.
> 
> Signed-off-by: Paul Mackerras 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/48fe9e9488743eec9b7c1addd3c93f

cheers


Re: [v2, 3/3] powerpc/powernv: Introduce address translation services for Nvlink2

2017-04-06 Thread Michael Ellerman
On Mon, 2017-04-03 at 09:51:44 UTC, Alistair Popple wrote:
> Nvlink2 supports address translation services (ATS) allowing devices
> to request address translations from an mmu known as the nest MMU
> which is setup to walk the CPU page tables.
> 
> To access this functionality certain firmware calls are required to
> setup and manage hardware context tables in the nvlink processing unit
> (NPU). The NPU also manages forwarding of TLB invalidates (known as
> address translation shootdowns/ATSDs) to attached devices.
> 
> This patch exports several methods to allow device drivers to register
> a process id (PASID/PID) in the hardware tables and to receive
> notification of when a device should stop issuing address translation
> requests (ATRs). It also adds a fault handler to allow device drivers
> to demand fault pages in.
> 
> Signed-off-by: Alistair Popple 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/1ab66d1fbadad86b1f4a9c7857e193

cheers


Re: [v2, 2/3] powerpc/powernv: Add sanity checks to pnv_pci_get_{gpu|npu}_dev

2017-04-06 Thread Michael Ellerman
On Mon, 2017-04-03 at 09:51:43 UTC, Alistair Popple wrote:
> The pnv_pci_get_{gpu|npu}_dev functions are used to find associations
> between nvlink PCIe devices and standard PCIe devices. However they
> lacked basic sanity checking which results in NULL pointer
> dereferencing if they are incorrect called can be harder to spot than
> an explicit WARN_ON.
> 
> Signed-off-by: Alistair Popple 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/4c3b89effc281704d5395282c800c4

cheers


Re: [v2,1/3] drivers/of/base.c: Add of_property_read_u64_index

2017-04-06 Thread Michael Ellerman
On Mon, 2017-04-03 at 09:51:42 UTC, Alistair Popple wrote:
> There is of_property_read_u32_index but no u64 variant. This patch
> adds one similar to the u32 version for u64.
> 
> Signed-off-by: Alistair Popple 
> Acked-by: Rob Herring 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/2475a2b6c877a0c8d1ca42c3f2b30f

cheers


Re: powerpc/mm: remove stale comment

2017-04-06 Thread Michael Ellerman
On Mon, 2017-04-03 at 08:09:06 UTC, Oliver O'Halloran wrote:
> The code to fix the problem it describes was removed in c40785a and it
> uses the stupid comment style. Away it goes!
> 
> Signed-off-by: Oliver O'Halloran 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/f6f9195ba0b0b1406af384037d9e7e

cheers


Re: powerpc: Avoid taking a data miss on every userspace instruction miss

2017-04-06 Thread Michael Ellerman
On Mon, 2017-04-03 at 06:41:02 UTC, Anton Blanchard wrote:
> From: Anton Blanchard 
> 
> Early on in do_page_fault() we call store_updates_sp(), regardless of
> the type of exception. For an instruction miss this doesn't make
> sense, because we only use this information to detect if a data miss
> is the result of a stack expansion instruction or not.
> 
> Worse still, it results in a data miss within every userspace
> instruction miss handler, because we try and load the very instruction
> we are about to install a pte for!
> 
> A simple exec microbenchmark runs 6% faster on POWER8 with this fix:
> 
>  #include 
>  #include 
>  #include 
> 
> int main(int argc, char *argv[])
> {
>   unsigned long left = atol(argv[1]);
>   char leftstr[16];
> 
>   if (left-- == 0)
>   return 0;
> 
>   sprintf(leftstr, "%ld", left);
>   execlp(argv[0], argv[0], leftstr, NULL);
>   perror("exec failed\n");
> 
>   return 0;
> }
> 
> Pass the number of iterations on the command line (eg 1) and time
> how long it takes to execute.
> 
> Signed-off-by: Anton Blanchard 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/a7a9dcd882a67b68568868b988289f

cheers


Re: powerpc/prom: Increase RMA size to 512MB

2017-04-06 Thread Michael Ellerman
On Thu, 2017-03-30 at 04:03:49 UTC, Sukadev Bhattiprolu wrote:
> >From 3ae8d1ed31b01b92b172fe20e4560cfbfab135ec Mon Sep 17 00:00:00 2001
> From: root 
> Date: Mon, 27 Mar 2017 19:43:14 -0400
> Subject: [PATCH] powerpc/prom: Increase RMA size to 512MB
> 
> When booting very large systems with a large initrd, we run out of
> space for either the RTAS or the flattened device tree (FDT). Boot
> fails with messages like:
> 
>   Could not allocate memory for RTAS
> or
>   No memory for flatten_device_tree (no room)
> 
> Increasing the minimum RMA size to 512MB fixes the problem. This
> should not have an impact on smaller LPARs (with 256MB memory),
> as the firmware will cap the RMA to the memory assigned to the LPAR.
> 
> Fix is based on input/discussions with Michael Ellerman. Thanks to
> Praveen K. Pandey for testing on a large system.
> 
> Reported-by: Praveen K. Pandey 
> Signed-off-by: Sukadev Bhattiprolu 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/687da8fce1682c9f1e87530e731189

cheers


Re: [v8] powerpc/powernv: add 'firmware/exports' attributes to sysfs

2017-04-06 Thread Michael Ellerman
On Wed, 2017-03-29 at 23:28:01 UTC, Matt Brown wrote:
> The HDAT data area is consumed by skiboot and turned into a device-tree. In
> some cases we would like to look directly at the HDAT. This is not possible
> through /dev/mem as it is reserved memory which is stopped by the /dev/mem
> filter. There are also other memory areas which are reserved but could be
> useful to view for debugging purposes.
> 
> This patch adds sysfs nodes to allow specified memory areas to be viewed.
> sysfs nodes are created for each property in the device-tree under
> /ibm,opal/firmware/exports/ and added to /sys/firmware/opal/exports/
> with root read-only permissions.
> 
> Signed-off-by: Matt Brown 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/11fe909d236263f62808dc3c73caf7

cheers


Re: [1/2] powerpc/book3s: Display more info for MCE error console log.

2017-04-06 Thread Michael Ellerman
On Tue, 2017-03-28 at 13:45:04 UTC, Mahesh Salgaonkar wrote:
> From: Mahesh Salgaonkar 
> 
> For D-side errors we print the data load/store address that caused the MC as
> 'Effective address'. In addition to printing the NIP, print the kernel function name as well.
> 
> After this patch the MCE console log would look like:
> 
> [  291.444281] Severe Machine check interrupt [Recovered]
> [  291.77]   NIP [d0001bc70194]: init_module+0x194/0x2b0 [bork_kernel]
> [  291.444707]   Initiator: CPU
> [  291.444761]   Error type: SLB [Parity]
> [  291.444793] Effective address: d00026de
> 
> Signed-off-by: Mahesh Salgaonkar 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/5b1d6fc2d4d927852214f2a7e2a8eb

cheers


WARN @lib/refcount.c:128 during hot unplug of I/O adapter.

2017-04-06 Thread Sachin Sant
On a POWER8 LPAR running 4.11.0-rc5, a hot unplug operation on
any I/O adapter results in the following warning

This problem has been in the code for some time now. I had first seen this in
the -next tree.

[  269.589441] rpadlpar_io: slot PHB 72 removed
[  270.589997] refcount_t: underflow; use-after-free.
[  270.590019] [ cut here ]
[  270.590025] WARNING: CPU: 5 PID: 3335 at lib/refcount.c:128 
refcount_sub_and_test+0xf4/0x110
[  270.590028] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE 
nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 
nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge 
stp llc rpadlpar_io rpaphp kvm_pr kvm ebtable_filter ebtables ip6table_filter 
ip6_tables iptable_filter dccp_diag dccp tcp_diag udp_diag inet_diag unix_diag 
af_packet_diag netlink_diag ghash_generic xts gf128mul vmx_crypto tpm_ibmvtpm 
tpm sg pseries_rng nfsd auth_rpcgss nfs_acl lockd grace sunrpc binfmt_misc 
ip_tables xfs libcrc32c sr_mod sd_mod cdrom ibmvscsi ibmveth scsi_transport_srp 
dm_mirror dm_region_hash dm_log dm_mod
[  270.590076] CPU: 5 PID: 3335 Comm: drmgr Not tainted 4.11.0-rc5 #3
[  270.590079] task: c005d8df8600 task.stack: c000fb3a8000
[  270.590081] NIP: c1aa3ca4 LR: c1aa3ca0 CTR: 006338e4
[  270.590084] REGS: c000fb3ab8a0 TRAP: 0700   Not tainted  (4.11.0-rc5)
[  270.590087] MSR: 80029033 
[  270.590090]   CR: 22002422  XER: 0007
[  270.590093] CFAR: c1edaabc SOFTE: 1 
[  270.590093] GPR00: c1aa3ca0 c000fb3abb20 c25ea900 
0026 
[  270.590093] GPR04: c0077fc4ada0 c0077fc617b8 000f0c33 
 
[  270.590093] GPR08:  c227146c 00077d9e 
3ff0 
[  270.590093] GPR12: 2200 ce802d00  
 
[  270.590093] GPR16:    
 
[  270.590093] GPR20:  1001b5a8 10018338 
10016650 
[  270.590093] GPR24: 1001b278 c00776e0fdcc 10016650 
 
[  270.590093] GPR28: c0077ffea910 c000fbf79180 c00776e0fdc0 
c000fbf791d8 
[  270.590126] NIP [c1aa3ca4] refcount_sub_and_test+0xf4/0x110
[  270.590129] LR [c1aa3ca0] refcount_sub_and_test+0xf0/0x110
[  270.590132] Call Trace:
[  270.590134] [c000fb3abb20] [c1aa3ca0] 
refcount_sub_and_test+0xf0/0x110 (unreliable)
[  270.590139] [c000fb3abb80] [c1a8221c] kobject_put+0x3c/0xa0
[  270.590143] [c000fb3abbf0] [c1d22d34] of_node_put+0x24/0x40
[  270.590147] [c000fb3abc10] [c165c874] ofdt_write+0x204/0x6b0
[  270.590151] [c000fb3abcd0] [c197a220] proc_reg_write+0x80/0xd0
[  270.590155] [c000fb3abd00] [c18de680] __vfs_write+0x40/0x1c0
[  270.590158] [c000fb3abd90] [c18dffd8] vfs_write+0xc8/0x240
[  270.590162] [c000fb3abde0] [c18e1c40] SyS_write+0x60/0x110
[  270.590165] [c000fb3abe30] [c15cb184] system_call+0x38/0xe0
[  270.590168] Instruction dump:
[  270.590170] 7863d182 4e800020 7c0802a6 3921 3d42fff8 3c62ffb1 386371a8 
992a0171 
[  270.590175] f8010010 f821ffa1 48436de1 6000 <0fe0> 38210060 3860 
e8010010 
[  270.590180] ---[ end trace 08c7a2f3c8bead33 ]—

Have attached the dmesg log from the system. Let me know if any additional
information is required to help debug this problem.

Thanks
-Sachin




dmesg-4.11-rc5.log
Description: Binary data


Re: [PATCH v6 03/11] powerpc/powernv: Detect supported IMC units and its events

2017-04-06 Thread Anju T Sudhakar

Hi Stewart,


Thanks for the review.


On Thursday 06 April 2017 02:07 PM, Stewart Smith wrote:

Madhavan Srinivasan  writes:

--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -33,6 +33,388 @@



+static void imc_pmu_setup(struct device_node *parent)
+{
+   struct device_node *child;
+   int pmu_count = 0, rc = 0;
+   const struct property *pp;
+
+   if (!parent)
+   return;
+
+   /* Setup all the IMC pmus */
+   for_each_child_of_node(parent, child) {
+   pp = of_get_property(child, "compatible", NULL);
+   if (pp) {
+   /*
+* If there is a node with a "compatible" field,
+* that's a PMU node
+*/
+   rc = imc_pmu_create(child, pmu_count);
+   if (rc)
+   return;
+   pmu_count++;
+   }
+   }
+}

This doesn't strike me as the right kind of structure, the presence of a
compatible property really just says "hey, there's this device and it's
compatible with these ways of accessing it".

I'm guessing the idea behind having imc-nest-offset/size in a top level
node is because it's common to everything under it and the aim is to not
blow up the device tree to be enormous.

So why not go after each ibm,imc-counters-nest compatible node under the
top level ibm,opal-in-memory-counters node? (i'm not convinced that
having ibm,ibmc-counters-nest versus ibm,imc-counters-core and
ibm,imc-counters-thread as I see in the dts is correct though, as
they're all accessed exactly the same way?)



The idea here is that we have one directory which contains the common event
information for nest (the same applies to core and thread), and one directory
for each nest (/core/thread) PMU.
So while parsing we need to make sure that the node we are parsing is a PMU
node, not the node which contains the common event information. We use the
"compatible" property here for that purpose, because the node which contains
the event information does not have a compatible property.





Regards,
Anju



[PATCH 08/11] EDAC: Remove EDAC_MM_EDAC

2017-04-06 Thread Borislav Petkov
From: Borislav Petkov 

Move all the EDAC core functionality behind CONFIG_EDAC and get rid of
that indirection. Update defconfigs which had it.

While at it, fix dependencies such that EDAC depends on RAS for the
tracepoints.

Signed-off-by: Borislav Petkov 
Cc: linux-arm-ker...@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Chris Metcalf 
Cc: linux-e...@vger.kernel.org
---
 arch/arm/configs/multi_v7_defconfig |   1 -
 arch/arm/configs/pxa_defconfig  |   3 +-
 arch/powerpc/configs/85xx-hw.config |   3 +-
 arch/powerpc/configs/85xx/ge_imp3a_defconfig|   1 -
 arch/powerpc/configs/85xx/xes_mpc85xx_defconfig |   1 -
 arch/powerpc/configs/cell_defconfig |   1 -
 arch/powerpc/configs/pasemi_defconfig   |   1 -
 arch/powerpc/configs/ppc64_defconfig|   1 -
 arch/powerpc/configs/ppc64e_defconfig   |   1 -
 arch/powerpc/configs/ppc6xx_defconfig   |   3 +-
 arch/tile/configs/tilegx_defconfig  |   1 -
 arch/tile/configs/tilepro_defconfig |   1 -
 drivers/acpi/Kconfig|   1 -
 drivers/edac/Kconfig| 101 ++--
 drivers/edac/Makefile   |   3 +-
 drivers/edac/edac_stub.c|   2 +-
 16 files changed, 48 insertions(+), 77 deletions(-)

diff --git a/arch/arm/configs/multi_v7_defconfig 
b/arch/arm/configs/multi_v7_defconfig
index a94126fb02c2..6aa7be191f1a 100644
--- a/arch/arm/configs/multi_v7_defconfig
+++ b/arch/arm/configs/multi_v7_defconfig
@@ -748,7 +748,6 @@ CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
 CONFIG_LEDS_TRIGGER_TRANSIENT=y
 CONFIG_LEDS_TRIGGER_CAMERA=y
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_EDAC_HIGHBANK_MC=y
 CONFIG_EDAC_HIGHBANK_L2=y
 CONFIG_RTC_CLASS=y
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index 2aac99fd1c41..1318f61589dc 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -635,8 +635,7 @@ CONFIG_LEDS_TRIGGER_GPIO=m
 CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
 CONFIG_LEDS_TRIGGER_TRANSIENT=m
 CONFIG_LEDS_TRIGGER_CAMERA=m
-CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=m
+CONFIG_EDAC=m
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DEBUG=y
 CONFIG_RTC_DRV_DS1307=m
diff --git a/arch/powerpc/configs/85xx-hw.config 
b/arch/powerpc/configs/85xx-hw.config
index 528ff0e714e6..c03d0fb16665 100644
--- a/arch/powerpc/configs/85xx-hw.config
+++ b/arch/powerpc/configs/85xx-hw.config
@@ -16,9 +16,8 @@ CONFIG_DAVICOM_PHY=y
 CONFIG_DMADEVICES=y
 CONFIG_E1000E=y
 CONFIG_E1000=y
-CONFIG_EDAC_MM_EDAC=y
-CONFIG_EDAC_MPC85XX=y
 CONFIG_EDAC=y
+CONFIG_EDAC_MPC85XX=y
 CONFIG_EEPROM_AT24=y
 CONFIG_EEPROM_LEGACY=y
 CONFIG_FB_FSL_DIU=y
diff --git a/arch/powerpc/configs/85xx/ge_imp3a_defconfig 
b/arch/powerpc/configs/85xx/ge_imp3a_defconfig
index c79283be5680..a917f7afb4f9 100644
--- a/arch/powerpc/configs/85xx/ge_imp3a_defconfig
+++ b/arch/powerpc/configs/85xx/ge_imp3a_defconfig
@@ -155,7 +155,6 @@ CONFIG_USB_OHCI_HCD_PPC_OF_BE=y
 CONFIG_USB_OHCI_HCD_PPC_OF_LE=y
 CONFIG_USB_STORAGE=y
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_EDAC_MPC85XX=y
 CONFIG_RTC_CLASS=y
 # CONFIG_RTC_INTF_PROC is not set
diff --git a/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig 
b/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig
index dbd961de251e..72900b84d3e0 100644
--- a/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig
+++ b/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig
@@ -116,7 +116,6 @@ CONFIG_LEDS_TRIGGERS=y
 CONFIG_LEDS_TRIGGER_TIMER=y
 CONFIG_LEDS_TRIGGER_HEARTBEAT=y
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_DS1307=y
 CONFIG_RTC_DRV_CMOS=y
diff --git a/arch/powerpc/configs/cell_defconfig 
b/arch/powerpc/configs/cell_defconfig
index 2d7fcbe047ac..aa564599e368 100644
--- a/arch/powerpc/configs/cell_defconfig
+++ b/arch/powerpc/configs/cell_defconfig
@@ -179,7 +179,6 @@ CONFIG_INFINIBAND_MTHCA=m
 CONFIG_INFINIBAND_IPOIB=m
 CONFIG_INFINIBAND_IPOIB_DEBUG_DATA=y
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_EDAC_CELL=y
 CONFIG_UIO=m
 CONFIG_EXT2_FS=y
diff --git a/arch/powerpc/configs/pasemi_defconfig 
b/arch/powerpc/configs/pasemi_defconfig
index 5553c5ce4274..fe43ff47bd2f 100644
--- a/arch/powerpc/configs/pasemi_defconfig
+++ b/arch/powerpc/configs/pasemi_defconfig
@@ -142,7 +142,6 @@ CONFIG_USB_UHCI_HCD=y
 CONFIG_USB_SL811_HCD=y
 CONFIG_USB_STORAGE=y
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_EDAC_PASEMI=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_DS1307=y
diff --git a/arch/powerpc/configs/ppc64_defconfig 
b/arch/powerpc/configs/ppc64_defconfig
index 4f1288b04303..f2e03f032041 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -262,7 +262,6 @@ CONFIG_INFINIBAND_IPOIB_CM=y
 CONFIG_INFINIBAND_SRP=m
 CONFIG_INFINIBAND_ISER=m
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_EDAC_PASEMI=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_DS1307=y
diff --git a/arch/powerpc/configs/ppc64e_defconfig 
b/arch/powerpc/configs/ppc

Re: [PATCH 3/3] powerpc/64s: cpufeatures: add initial implementation for cpufeatures

2017-04-06 Thread kbuild test robot
Hi Nicholas,

[auto build test ERROR on powerpc/next]
[also build test ERROR on v4.11-rc5 next-20170406]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Nicholas-Piggin/latest-cpufeatures-patch-series/20170406-140710
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allnoconfig (attached as .config)
compiler: powerpc-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=powerpc 

All errors (new ones prefixed by >>):

   arch/powerpc/kernel/prom.c: In function 'early_init_dt_scan_cpufeatures':
>> arch/powerpc/kernel/prom.c:508:2: error: 'has_cpufeatures_node' undeclared (first use in this function)
  has_cpufeatures_node = 1;
  ^~~~
   arch/powerpc/kernel/prom.c:508:2: note: each undeclared identifier is reported only once for each function it appears in
   arch/powerpc/kernel/prom.c: In function 'early_init_dt_scan_cpus':
   arch/powerpc/kernel/prom.c:631:7: error: 'has_cpufeatures_node' undeclared (first use in this function)
 if (!has_cpufeatures_node) {
  ^~~~

vim +/has_cpufeatures_node +508 arch/powerpc/kernel/prom.c

   502  printk("cpu-features node has missing property \"isa\"\n");
   503  return 0;
   504  }
   505  
   506  isa = be32_to_cpup(prop);
   507  
 > 508  has_cpufeatures_node = 1;
   509  
   510  /* Count and allocate space for cpu features */
   511  of_scan_flat_dt_subnodes(node, count_cpufeatures_subnodes,

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation




Re: [PATCH v3] cxl: Force context lock during EEH flow

2017-04-06 Thread Vaibhav Jain

Thanks for reviewing the patch, Fred. I have sent a v4 addressing your
comments regarding the cxlflash module.

@Uma,
Can you please take a look at v4 of this patch
(https://patchwork.ozlabs.org/patch/747236/) and see if this change is ok
from the cxlflash module perspective.

Thanks,
-- 
Vaibhav Jain 
Linux Technology Center, IBM India Pvt. Ltd.



Re: [PATCH v6 01/11] powerpc/powernv: Data structure and macros definitions

2017-04-06 Thread Stewart Smith
Madhavan Srinivasan  writes:
> +#define IMC_MAX_CHIPS 32
> +#define IMC_MAX_PMUS 32

The max chips and PMUs we'd be able to work out from the device tree
though, right? We could just allocate the correct amount of memory on
boot.

We may hot plug/unplug CPUs, but we're not doing that from a hardware
level, what CPUs you get in the DT on PowerNV on boot is all you're
getting.
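
i.e. something along these lines (completely untested sketch; the imc_pmus
array and the helper name are made up for illustration, not taken from the
patch), sizing the allocation from the device tree once at boot instead of
using IMC_MAX_* arrays:

	#include <linux/of.h>
	#include <linux/slab.h>

	static struct imc_pmu **imc_pmus;
	static int imc_pmu_count;

	static int __init imc_alloc_pmus(struct device_node *parent)
	{
		/* one child node per PMU, counted from the device tree */
		imc_pmu_count = of_get_child_count(parent);
		if (!imc_pmu_count)
			return -ENODEV;

		imc_pmus = kcalloc(imc_pmu_count, sizeof(*imc_pmus), GFP_KERNEL);
		if (!imc_pmus)
			return -ENOMEM;

		return 0;
	}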

-- 
Stewart Smith
OPAL Architect, IBM.



Re: [PATCH v6 03/11] powerpc/powernv: Detect supported IMC units and its events

2017-04-06 Thread Stewart Smith
Madhavan Srinivasan  writes:
> --- a/arch/powerpc/platforms/powernv/opal-imc.c
> +++ b/arch/powerpc/platforms/powernv/opal-imc.c
> @@ -33,6 +33,388 @@

> +static void imc_pmu_setup(struct device_node *parent)
> +{
> + struct device_node *child;
> + int pmu_count = 0, rc = 0;
> + const struct property *pp;
> +
> + if (!parent)
> + return;
> +
> + /* Setup all the IMC pmus */
> + for_each_child_of_node(parent, child) {
> + pp = of_get_property(child, "compatible", NULL);
> + if (pp) {
> + /*
> +  * If there is a node with a "compatible" field,
> +  * that's a PMU node
> +  */
> + rc = imc_pmu_create(child, pmu_count);
> + if (rc)
> + return;
> + pmu_count++;
> + }
> + }
> +}

This doesn't strike me as the right kind of structure, the presence of a
compatible property really just says "hey, there's this device and it's
compatible with these ways of accessing it".

I'm guessing the idea behind having imc-nest-offset/size in a top level
node is because it's common to everything under it and the aim is to not
blow up the device tree to be enormous.

So why not go after each ibm,imc-counters-nest compatible node under the
top level ibm,opal-in-memory-counters node? (i'm not convinced that
having ibm,ibmc-counters-nest versus ibm,imc-counters-core and
ibm,imc-counters-thread as I see in the dts is correct though, as
they're all accessed exactly the same way?)
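
To make it concrete, something like the below (completely untested; the
compatible strings are the ones mentioned in this thread, and imc_pmu_create
is the helper from the patch) is what I had in mind:

	static void imc_pmu_setup(struct device_node *parent)
	{
		struct device_node *child;
		int pmu_count = 0;

		if (!parent)
			return;

		for_each_child_of_node(parent, child) {
			/* only nodes that declare a counter compatible are PMUs */
			if (!of_device_is_compatible(child, "ibm,imc-counters-nest") &&
			    !of_device_is_compatible(child, "ibm,imc-counters-core") &&
			    !of_device_is_compatible(child, "ibm,imc-counters-thread"))
				continue;

			if (imc_pmu_create(child, pmu_count)) {
				of_node_put(child);
				break;
			}
			pmu_count++;
		}
	}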

-- 
Stewart Smith
OPAL Architect, IBM.



Re: [PATCH v4] cxl: Force context lock during EEH flow

2017-04-06 Thread Frederic Barrat



Le 05/04/2017 à 13:35, Vaibhav Jain a écrit :

During an EEH event, when the cxl card is fenced and the card sysfs attr
perst_reloads_same_image is set, the following warning message is seen in
the kernel logs:

 [   60.622727] Adapter context unlocked with 0 active contexts
 [   60.622762] [ cut here ]
 [   60.622771] WARNING: CPU: 12 PID: 627 at
 ../drivers/misc/cxl/main.c:325 cxl_adapter_context_unlock+0x60/0x80 [cxl]

Even though this warning is harmless, it clutters the kernel log during an
EEH event. The warning is triggered because the EEH callback
cxl_pci_error_detected doesn't obtain the context-lock before forcibly
detaching all active contexts, so when the context-lock is released during
the call to cxl_configure_adapter from cxl_pci_slot_reset, the warning in
cxl_adapter_context_unlock fires.

To fix this warning, we acquire the adapter context-lock via
cxl_adapter_context_lock() in the eeh callback
cxl_pci_error_detected() once all the virtual AFU PHBs are notified
and their contexts detached. The context-lock is released in
cxl_pci_slot_reset() after the adapter is successfully reconfigured
and before we call slot_reset callback on slice attached device-drivers.

Cc: sta...@vger.kernel.org
Fixes: 70b565bbdb91("cxl: Prevent adapter reset if an active context exists")
Reported-by: Andrew Donnellan 
Signed-off-by: Vaibhav Jain 
---


Pending test result from cxl-flash:
Acked-by: Frederic Barrat 




Change-Log:

v3..v4
- Moved the call to context-unlock from cxl_pci_resume to
cxl_pci_slot_reset to let cxlflash module activate its master context
during slot reset. (Fred)

v2..v3
- As discussed with Fred removed function
cxl_adapter_context_force_lock() which may potentially expose the code
to deadlock in the future.
-  Other details of changes in cxl_pci_error_detected() to fix an
earlier issue of eeh callbacks not being passed on to all slices, is
being reworked as a separate patch.

v2..v1
- Moved the call to cxl_adapter_context_force_lock() from
cxl_pci_error_detected() to cxl_remove. (Fred)
---
 drivers/misc/cxl/pci.c | 19 +--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index b27ea98..dd9a128 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1496,8 +1496,6 @@ static int cxl_configure_adapter(struct cxl *adapter, 
struct pci_dev *dev)
if ((rc = cxl_native_register_psl_err_irq(adapter)))
goto err;

-   /* Release the context lock as adapter is configured */
-   cxl_adapter_context_unlock(adapter);
return 0;

 err:
@@ -1596,6 +1594,9 @@ static struct cxl *cxl_pci_init_adapter(struct pci_dev 
*dev)
if ((rc = cxl_sysfs_adapter_add(adapter)))
goto err_put1;

+   /* Release the context lock as adapter is configured */
+   cxl_adapter_context_unlock(adapter);
+
return adapter;

 err_put1:
@@ -1895,6 +1896,13 @@ static pci_ers_result_t cxl_pci_error_detected(struct 
pci_dev *pdev,
cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
pci_deconfigure_afu(afu);
}
+
+   /* should take the context lock here */
+   if (cxl_adapter_context_lock(adapter) != 0)
+   dev_warn(&adapter->dev,
+"Couldn't take context lock with %d active-contexts\n",
+atomic_read(&adapter->contexts_num));
+
cxl_deconfigure_adapter(adapter);

return result;
@@ -1913,6 +1921,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct 
pci_dev *pdev)
if (cxl_configure_adapter(adapter, pdev))
goto err;

+   /*
+* Unlock context activation for the adapter. Ideally this should be
+* done in cxl_pci_resume but cxlflash module tries to activate the
+* master context as part of slot_reset callback.
+*/
+   cxl_adapter_context_unlock(adapter);
+
for (i = 0; i < adapter->slices; i++) {
afu = adapter->afu[i];





Re: [PATCH v6 01/11] powerpc/powernv: Data structure and macros definitions

2017-04-06 Thread Stewart Smith
Madhavan Srinivasan  writes:

> From: Hemant Kumar 
>
> Create new header file "imc-pmu.h" to add the data structures
> and macros needed for IMC pmu support.
>
> Signed-off-by: Anju T Sudhakar 
> Signed-off-by: Hemant Kumar 
> Signed-off-by: Madhavan Srinivasan 
> ---
>  arch/powerpc/include/asm/imc-pmu.h | 68 
> ++
>  1 file changed, 68 insertions(+)
>  create mode 100644 arch/powerpc/include/asm/imc-pmu.h
>
> diff --git a/arch/powerpc/include/asm/imc-pmu.h 
> b/arch/powerpc/include/asm/imc-pmu.h
> new file mode 100644
> index ..a3d4f1bf9492
> --- /dev/null
> +++ b/arch/powerpc/include/asm/imc-pmu.h
> @@ -0,0 +1,68 @@
> +#ifndef PPC_POWERNV_IMC_PMU_DEF_H
> +#define PPC_POWERNV_IMC_PMU_DEF_H
> +
> +/*
> + * IMC Nest Performance Monitor counter support.
> + *
> + * Copyright (C) 2016 Madhavan Srinivasan, IBM Corporation.
> + *   (C) 2016 Hemant K Shaw, IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License version 2 as published
> + * by the Free Software Foundation.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define IMC_MAX_CHIPS 32
> +#define IMC_MAX_PMUS 32
> +#define IMC_MAX_PMU_NAME_LEN 256

Why do we need a max length? We get the actual lengths from the device
tree, so we know at each point in time what the length of any new string
should be, right?

Otherwise you appear to be, in the general case, using 10x the memory
than you could.
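
i.e. something like the below (rough sketch only; the node and variable
names are assumptions), duplicating the string at its actual length:

	const char *dt_name;
	char *pmu_name;

	if (of_property_read_string(node, "name", &dt_name))
		return -ENODEV;

	/* allocated at the real string length, not IMC_MAX_PMU_NAME_LEN */
	pmu_name = kstrdup(dt_name, GFP_KERNEL);
	if (!pmu_name)
		return -ENOMEM;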

-- 
Stewart Smith
OPAL Architect, IBM.



Re: [PATCH v6 06/11] powerpc/perf: IMC pmu cpumask and cpu hotplug support

2017-04-06 Thread Madhavan Srinivasan



On Tuesday 04 April 2017 10:03 AM, Daniel Axtens wrote:

Madhavan Srinivasan  writes:


From: Hemant Kumar 

Adds cpumask attribute to be used by each IMC pmu. Only one cpu (any
online CPU) from each chip for nest PMUs is designated to read counters.

On CPU hotplug, dying CPU is checked to see whether it is one of the
designated cpus, if yes, next online cpu from the same chip (for nest
units) is designated as new cpu to read counters. For this purpose, we
introduce a new state : CPUHP_AP_PERF_POWERPC_NEST_ONLINE.

Signed-off-by: Anju T Sudhakar 
Signed-off-by: Hemant Kumar 
Signed-off-by: Madhavan Srinivasan 
---
  arch/powerpc/include/asm/opal-api.h|  13 ++-
  arch/powerpc/include/asm/opal.h|   3 +
  arch/powerpc/perf/imc-pmu.c| 155 -
  arch/powerpc/platforms/powernv/opal-wrappers.S |   1 +
  include/linux/cpuhotplug.h |   1 +
  5 files changed, 171 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index a0aa285869b5..23fc51e9d71d 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -168,7 +168,8 @@
  #define OPAL_INT_SET_MFRR 125
  #define OPAL_PCI_TCE_KILL 126
  #define OPAL_NMMU_SET_PTCR127
-#define OPAL_LAST  127
+#define OPAL_NEST_IMC_COUNTERS_CONTROL 149

A couple of things:

  - why is this 149 rather than 128?

The current opal-api.h on the skiboot side has more OPAL calls defined. At
this point in time, the OPAL numbers defined in skiboot for the IMC patchset
are 149 and 150.




  - CONTROL seems like it's inviting a very broad and under-specified
API. I notice most of the other opal calls are reasonably narrow:
often defining two calls for get/set rather than a single control
call.


The reason is that, going forward, the microcode could support multiple
modes to aid debugging. Currently the patchset exploits normal accumulation
(production mode) for a subset of events in each unit.

But in future, the microcode could support switching the accumulation mode
to a different mode (like a debug mode) for a unit, in which case a new set
of events could be configured. We do not want multiple calls for the
different modes, and hence we wanted one OPAL_NEST_* call
which can be extended to support these modes when they become available.



I don't have cycles to review the OPAL/skiboot side and I'm very much
open to being convinced here, I just want to avoid the situation where
we're passing around complicated data structures in a way that is
difficult to synchronise if we could avoid it.


+#define OPAL_LAST  149
  
  /* Device tree flags */
  
@@ -928,6 +929,16 @@ enum {

OPAL_PCI_TCE_KILL_ALL,
  };
  
+/* Operation argument to OPAL_NEST_IMC_COUNTERS_CONTROL */

+enum {
+   OPAL_NEST_IMC_PRODUCTION_MODE = 1,
+};

If there's only one mode, why do we need to specify it? As far as I can
tell no extra mode is added later in the series...

Will document the modes intended to be supported in the future.


... looking at the opal-side patches, would it be better to just specify
the modes you intend to support in future here, and rely on opal
returning OPAL_PARAMETER when they're not supported?

+
+enum {
+   OPAL_NEST_IMC_STOP,
+   OPAL_NEST_IMC_START,
+};

Again, would it be better to have a stop/start call rather than a
generic control call?

As explained earlier, we would prefer to have one OPAL call
and take the operation to perform as a parameter to it.




Also, the enum values (STOP=0, START=1) seem to be switched vs OPAL,
where https://patchwork.ozlabs.org/patch/746292/ has STOP=1, START=0. Or
am I missing something?

Ah, this is a mess I created with the latest rebase. Yes, I switched the
values and updated the kernel side accordingly, but I missed updating them
in the commit message when I sent out the OPAL-side patchset.

Will repost with this change in the opal side.






+
  #endif /* __ASSEMBLY__ */
  
  #endif /* __OPAL_API_H */

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 1ff03a6da76e..d93d08204243 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -227,6 +227,9 @@ int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t 
kill_type,
  uint64_t dma_addr, uint32_t npages);
  int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr);
  
+int64_t opal_nest_imc_counters_control(uint64_t mode, uint64_t value1,

+   uint64_t value2, uint64_t value3);
+

This prototype does not make me feel better about the state of the API.

Looking at the opal side, I would be much more comfortable if you could
nail down what you intend to have these support now, even if OPAL bails
with OPAL_PARAMETER if they're specified.

Ok i will add more comments/Documents on for the OPAL A

Re: [v8] powerpc/powernv: add 'firmware/exports' attributes to sysfs

2017-04-06 Thread Oliver O'Halloran
On Thu, Mar 30, 2017 at 10:28 AM, Matt Brown
 wrote:
> The HDAT data area is consumed by skiboot and turned into a device-tree. In
> some cases we would like to look directly at the HDAT. This is not possible
> through /dev/mem as it is reserved memory which is stopped by the /dev/mem
> filter. There are also other memory areas which are reserved but could be
> useful to view for debugging purposes.
>
> This patch adds sysfs nodes to allow specified memory areas to be viewed.
> sysfs nodes are created for each property in the device-tree under
> /ibm,opal/firmware/exports/, and adds them to /sys/firmware/opal/exports/
> with root read-only permissions.
>
> Signed-off-by: Matt Brown 
> ---
> Changelog
> v8
> - fixed error handling
> - added dynamic allocation of attributes
> - using of_property_read_u64_array for reading attr vals
> - reordered vars
> - renaming vars
> ---
>  arch/powerpc/platforms/powernv/opal.c | 81 
> +++
>  1 file changed, 81 insertions(+)
>
> diff --git a/arch/powerpc/platforms/powernv/opal.c 
> b/arch/powerpc/platforms/powernv/opal.c
> index 2822935..232f94e 100644
> --- a/arch/powerpc/platforms/powernv/opal.c
> +++ b/arch/powerpc/platforms/powernv/opal.c
> @@ -604,6 +604,84 @@ static void opal_export_symmap(void)
> pr_warn("Error %d creating OPAL symbols file\n", rc);
>  }
>
> +static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
> +struct bin_attribute *bin_attr, char *buf,
> +loff_t off, size_t count)
> +{
> +   return memory_read_from_buffer(buf, count, &off, bin_attr->private,
> +  bin_attr->size);
> +}
> +
> +/*
> + * opal_export_attrs: creates a sysfs node for each property listed in
> + * the device-tree under /ibm,opal/firmware/exports/
> + * All new sysfs nodes are created under /opal/exports/.
> + * This allows for reserved memory regions (e.g. HDAT) to be read.
> + * The new sysfs nodes are only readable by root.
> + */
> +static void opal_export_attrs(void)
> +{
> +   struct bin_attribute *attr_tmp;
> +   struct device_node *np;
> +   struct property *prop;
> +   struct kobject *kobj;
> +   u64 vals[2];
> +   int rc, n;
> +
> +   /* Create new 'exports' directory - /sys/firmware/opal/exports */
> +   kobj = kobject_create_and_add("exports", opal_kobj);
> +   if (!kobj) {
> +   pr_warn("kobject_create_and_add exports failed\n");
> +   return;
> +   }
> +
> +   np = of_find_node_by_path("/ibm,opal/firmware/exports");
> +   if (!np)
> +   return;
> +
> +   n = 0;
> +   for (prop = np->properties; prop != NULL; prop = prop->next)
> +   n++;
> +
> +   if (n < 2)
> +   goto cleanup;
> +
> +   for_each_property_of_node(np, prop) {
> +   if (!strcmp(prop->name, "name") ||
> +   !strcmp(prop->name, "phandle"))
> +   continue;
> +
> +   if (of_property_read_u64_array(np, prop->name, &vals[0], 2))
> +   continue;
> +
> +   attr_tmp = kmalloc(sizeof(*attr_tmp), GFP_KERNEL);
> +
> +   if (attr_tmp == NULL) {
> +   pr_warn("Failed kmalloc for bin_attribute attr_tmp");
> +   continue;
> +   }
> +
> +   attr_tmp->attr.name = kstrdup(prop->name, GFP_KERNEL);
> +   attr_tmp->attr.mode = 0400;
> +   attr_tmp->read = export_attr_read;
> +   attr_tmp->private = __va(vals[0]);
> +   attr_tmp->size = vals[1];
> +
> +   if (attr_tmp->attr.name == NULL) {
> +   pr_warn("Failed kstrdup for bin_attribute attr.name");
> +   kfree(attr_tmp);
> +   continue;
> +   }
> +   rc = sysfs_create_bin_file(kobj, attr_tmp);
> +   if (rc)
> +   pr_warn("Error %d creating OPAL sysfs exports/%s 
> file\n",
> + rc, prop->name);
> +   }
> +
> +cleanup:
> +   of_node_put(np);
> +}
> +
>  static void __init opal_dump_region_init(void)
>  {
> void *addr;
> @@ -742,6 +820,9 @@ static int __init opal_init(void)
> opal_msglog_sysfs_init();
> }
>
> +   /* Export all properties */
> +   opal_export_attrs();
> +
> /* Initialize platform devices: IPMI backend, PRD & flash interface */
> opal_pdev_init("ibm,opal-ipmi");
> opal_pdev_init("ibm,opal-flash");
> --
> 2.9.3
>

Reviewed-by: Oliver O'Halloran 


Re: [PATCH v7 2/3] powerpc/powernv: Introduce a machine check hook for Guest MCEs.

2017-04-06 Thread Mahesh Jagannath Salgaonkar
On 04/06/2017 10:52 AM, David Gibson wrote:
> On Thu, Apr 06, 2017 at 02:17:22AM +0530, Mahesh J Salgaonkar wrote:
>> From: Mahesh Salgaonkar 
>>
>> This patch introduces a mce hook which is invoked at the time of guest
>> exit to facilitate the host-side handling of machine check exception
>> before the exception is passed on to the guest. This hook will be invoked
>> from host virtual mode from KVM (before exiting the guest with
>> KVM_EXIT_NMI reason) for machine check exception that occurs in the guest.
>>
>> Signed-off-by: Mahesh Salgaonkar 
> 
> Um.. this introduces the hook, and puts in an implementation of it,
> but AFAICT, nothing calls it, either here or in the next patch.  That
> seems a bit pointless.

It gets called in the next patch [3/3] through
ppc_md.machine_check_exception_guest(); see the hunk for
arch/powerpc/kvm/book3s_hv.c in that patch.
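
Roughly, the call site there looks like this (reconstructed from the
description only; patch 3/3 is not quoted here, so the surrounding code and
the 'evt' variable name are assumptions):

	/* on guest exit, after the MCE has been decoded into 'evt' */
	if (ppc_md.machine_check_exception_guest)
		ppc_md.machine_check_exception_guest(&evt);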

> 
>> ---
>>  arch/powerpc/include/asm/machdep.h |7 +++
>>  arch/powerpc/include/asm/opal.h|4 
>>  arch/powerpc/platforms/powernv/opal.c  |   26 ++
>>  arch/powerpc/platforms/powernv/setup.c |3 +++
>>  4 files changed, 40 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/machdep.h 
>> b/arch/powerpc/include/asm/machdep.h
>> index 5011b69..9d74e7a 100644
>> --- a/arch/powerpc/include/asm/machdep.h
>> +++ b/arch/powerpc/include/asm/machdep.h
>> @@ -15,6 +15,7 @@
>>  #include 
>>  
>>  #include 
>> +#include 
>>  
>>  /* We export this macro for external modules like Alsa to know if
>>   * ppc_md.feature_call is implemented or not
>> @@ -112,6 +113,12 @@ struct machdep_calls {
>>  /* Called during machine check exception to retrive fixup address. */
>>  bool(*mce_check_early_recovery)(struct pt_regs *regs);
>>  
>> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>> +/* Called after KVM interrupt handler finishes handling MCE for guest */
>> +int (*machine_check_exception_guest)
>> +(struct machine_check_event *evt);
>> +#endif
>> +
>>  /* Motherboard/chipset features. This is a kind of general purpose
>>   * hook used to control some machine specific features (like reset
>>   * lines, chip power control, etc...).
>> diff --git a/arch/powerpc/include/asm/opal.h 
>> b/arch/powerpc/include/asm/opal.h
>> index 1ff03a6..9b1fcbf 100644
>> --- a/arch/powerpc/include/asm/opal.h
>> +++ b/arch/powerpc/include/asm/opal.h
>> @@ -17,6 +17,7 @@
>>  #ifndef __ASSEMBLY__
>>  
>>  #include 
>> +#include 
>>  
>>  /* We calculate number of sg entries based on PAGE_SIZE */
>>  #define SG_ENTRIES_PER_NODE ((PAGE_SIZE - 16) / sizeof(struct 
>> opal_sg_entry))
>> @@ -273,6 +274,9 @@ extern int opal_hmi_handler_init(void);
>>  extern int opal_event_init(void);
>>  
>>  extern int opal_machine_check(struct pt_regs *regs);
>> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>> +extern int opal_machine_check_guest(struct machine_check_event *evt);
>> +#endif
>>  extern bool opal_mce_check_early_recovery(struct pt_regs *regs);
>>  extern int opal_hmi_exception_early(struct pt_regs *regs);
>>  extern int opal_handle_hmi_exception(struct pt_regs *regs);
>> diff --git a/arch/powerpc/platforms/powernv/opal.c 
>> b/arch/powerpc/platforms/powernv/opal.c
>> index e0f856b..5e633a4 100644
>> --- a/arch/powerpc/platforms/powernv/opal.c
>> +++ b/arch/powerpc/platforms/powernv/opal.c
>> @@ -479,6 +479,32 @@ int opal_machine_check(struct pt_regs *regs)
>>  return 0;
>>  }
>>  
>> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>> +/*
>> + * opal_machine_check_guest() is a hook which is invoked at the time
>> + * of guest exit to facilitate the host-side handling of machine check
>> + * exception before the exception is passed on to the guest. This hook
>> + * is invoked from host virtual mode from KVM (before exiting the guest
>> + * with KVM_EXIT_NMI reason) for machine check exception that occurs in
>> + * the guest.
>> + *
>> + * Currently no action is performed in the host other than printing the
>> + * event information. The machine check exception is passed on to the
>> + * guest kernel and the guest kernel will attempt for recovery.
>> + */
>> +int opal_machine_check_guest(struct machine_check_event *evt)
>> +{
>> +/* Print things out */
>> +if (evt->version != MCE_V1) {
>> +pr_err("Machine Check Exception, Unknown event version %d !\n",
>> +   evt->version);
>> +return 0;
>> +}
>> +machine_check_print_event_info(evt);
>> +return 0;
>> +}
>> +#endif
>> +
>>  /* Early hmi handler called in real mode. */
>>  int opal_hmi_exception_early(struct pt_regs *regs)
>>  {
>> diff --git a/arch/powerpc/platforms/powernv/setup.c 
>> b/arch/powerpc/platforms/powernv/setup.c
>> index d50c7d9..333ee09 100644
>> --- a/arch/powerpc/platforms/powernv/setup.c
>> +++ b/arch/powerpc/platforms/powernv/setup.c
>> @@ -264,6 +264,9 @@ static void __init pnv_setup_machdep_opal(void)
>>  ppc_md.mce_check_early_recovery = o

Re: [PATCH v6 02/11] powerpc/powernv: Autoload IMC device driver module

2017-04-06 Thread Stewart Smith
Madhavan Srinivasan  writes:
> From: Hemant Kumar 
>
> This patch does three things:
>  - Enables "opal.c" to create a platform device for the IMC interface
>    according to the appropriate compatibility string.
>  - Finds the reserved-memory region details from the system device tree
>    and gets the base address of the HOMER (reserved memory) region for
>    each chip.
>  - We also get the Nest PMU counter data offsets (in the HOMER region)
>and their sizes. The offsets for the counters' data are fixed and
>won't change from chip to chip.
>
> The device tree parsing logic is separated from the PMU creation
> functions (which is done in subsequent patches).
>
> Signed-off-by: Anju T Sudhakar 
> Signed-off-by: Hemant Kumar 
> Signed-off-by: Madhavan Srinivasan 
> ---
>  arch/powerpc/platforms/powernv/Makefile   |   2 +-
>  arch/powerpc/platforms/powernv/opal-imc.c | 126 
> ++
>  arch/powerpc/platforms/powernv/opal.c |  14 
>  3 files changed, 141 insertions(+), 1 deletion(-)
>  create mode 100644 arch/powerpc/platforms/powernv/opal-imc.c
>
> diff --git a/arch/powerpc/platforms/powernv/Makefile 
> b/arch/powerpc/platforms/powernv/Makefile
> index b5d98cb3f482..44909fec1121 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -2,7 +2,7 @@ obj-y += setup.o opal-wrappers.o opal.o 
> opal-async.o idle.o
>  obj-y+= opal-rtc.o opal-nvram.o opal-lpc.o 
> opal-flash.o
>  obj-y+= rng.o opal-elog.o opal-dump.o 
> opal-sysparam.o opal-sensor.o
>  obj-y+= opal-msglog.o opal-hmi.o opal-power.o 
> opal-irqchip.o
> -obj-y+= opal-kmsg.o
> +obj-y+= opal-kmsg.o opal-imc.o
>
>  obj-$(CONFIG_SMP)+= smp.o subcore.o subcore-asm.o
>  obj-$(CONFIG_PCI)+= pci.o pci-ioda.o npu-dma.o
> diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
> b/arch/powerpc/platforms/powernv/opal-imc.c
> new file mode 100644
> index ..c476d596c6a8
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/opal-imc.c
> @@ -0,0 +1,126 @@
> +/*
> + * OPAL IMC interface detection driver
> + * Supported on POWERNV platform
> + *
> + * Copyright (C) 2016 Madhavan Srinivasan, IBM Corporation.
> + *   (C) 2016 Hemant K Shaw, IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +struct perchip_nest_info nest_perchip_info[IMC_MAX_CHIPS];
> +
> +static int opal_imc_counters_probe(struct platform_device *pdev)
> +{
> + struct device_node *child, *imc_dev, *rm_node = NULL;
> + struct perchip_nest_info *pcni;
> + u32 pages, nest_offset, nest_size, idx;
> + int i = 0;
> + const char *node_name;
> + const __be32 *addrp;
> + u64 reg_addr, reg_size;
> +
> + if (!pdev || !pdev->dev.of_node)
> + return -ENODEV;
> +
> + /*
> +  * Check whether this kdump kernel. If yes, just return.
> +  */
> + if (is_kdump_kernel())
> + return -ENODEV;
> +
> + imc_dev = pdev->dev.of_node;
> +
> + /*
> +  * nest_offset : where the nest-counters' data start.
> +  * size : size of the entire nest-counters region
> +  */
> + if (of_property_read_u32(imc_dev, "imc-nest-offset", &nest_offset))
> + goto err;
> +
> + if (of_property_read_u32(imc_dev, "imc-nest-size", &nest_size))
> + goto err;
> +
> + /* Find the "homer region" for each chip */
> + rm_node = of_find_node_by_path("/reserved-memory");
> + if (!rm_node)
> + goto err;
> +
> + for_each_child_of_node(rm_node, child) {
> + if (of_property_read_string_index(child, "name", 0,
> +   &node_name))
> + continue;
> + if (strncmp("ibm,homer-image", node_name,
> + strlen("ibm,homer-image")))
> + continue;

A better way to do this would be to reference the memory region, like
what's shown in
Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt:
just reference the phandle of the memory region.

seeing as these are per chip, why not just have something linking
together chip-id and the IMC layout node?
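
i.e. something like the below (untested sketch; the "memory-region" property
name follows the reserved-memory binding and is an assumption here, as is
reusing imc_dev as the node holding the phandle):

	#include <linux/of_address.h>

	struct device_node *homer;
	struct resource res;

	/* follow the phandle instead of string-matching node names */
	homer = of_parse_phandle(imc_dev, "memory-region", 0);
	if (!homer)
		return -ENODEV;

	if (of_address_to_resource(homer, 0, &res)) {
		of_node_put(homer);
		return -ENODEV;
	}
	/* res.start .. res.end is the HOMER image for this chip */
	of_node_put(homer);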

-- 
Stewart Smith
OPAL Architect, IBM.