[PATCH v5 6/7] powerpc/pseries: Display machine check error details.

2018-07-01 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar 

Extract the MCE error details from the RTAS extended log and display them
on the console.

With this patch you should now see mce logs like below:

[  142.371818] Severe Machine check interrupt [Recovered]
[  142.371822]   NIP [dca301b8]: init_module+0x1b8/0x338 [bork_kernel]
[  142.371822]   Initiator: CPU
[  142.371823]   Error type: SLB [Multihit]
[  142.371824] Effective address: dca7

Signed-off-by: Mahesh Salgaonkar 
---
 arch/powerpc/include/asm/rtas.h  |5 +
 arch/powerpc/platforms/pseries/ras.c |  131 ++
 2 files changed, 136 insertions(+)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index ceeed2dd489b..26bc3d5c4992 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -197,6 +197,11 @@ static inline uint8_t rtas_error_extended(const struct 
rtas_error_log *elog)
return (elog->byte1 & 0x04) >> 2;
 }
 
+static inline uint8_t rtas_error_initiator(const struct rtas_error_log *elog)
+{
+   return (elog->byte2 & 0xf0) >> 4;
+}
+
 #define rtas_error_type(x) ((x)->byte3)
 
 static inline
diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 9aa7885e0148..7d4d2b8bc019 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -427,6 +427,135 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
return 0; /* need to perform reset */
 }
 
+#define VAL_TO_STRING(ar, val) ((val < ARRAY_SIZE(ar)) ? ar[val] : "Unknown")
+
+static void pseries_print_mce_info(struct pt_regs *regs,
+   struct rtas_error_log *errp)
+{
+   const char *level, *sevstr;
+   struct pseries_errorlog *pseries_log;
+   struct pseries_mc_errorlog *mce_log;
+   uint8_t error_type, err_sub_type;
+   uint64_t addr;
+   uint8_t initiator = rtas_error_initiator(errp);
+   int disposition = rtas_error_disposition(errp);
+
+   static const char * const initiators[] = {
+   "Unknown",
+   "CPU",
+   "PCI",
+   "ISA",
+   "Memory",
+   "Power Mgmt",
+   };
+   static const char * const mc_err_types[] = {
+   "UE",
+   "SLB",
+   "ERAT",
+   "TLB",
+   "D-Cache",
+   "Unknown",
+   "I-Cache",
+   };
+   static const char * const mc_ue_types[] = {
+   "Indeterminate",
+   "Instruction fetch",
+   "Page table walk ifetch",
+   "Load/Store",
+   "Page table walk Load/Store",
+   };
+
+   /* SLB sub errors valid values are 0x0, 0x1, 0x2 */
+   static const char * const mc_slb_types[] = {
+   "Parity",
+   "Multihit",
+   "Indeterminate",
+   };
+
+   /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */
+   static const char * const mc_soft_types[] = {
+   "Unknown",
+   "Parity",
+   "Multihit",
+   "Indeterminate",
+   };
+
+   if (!rtas_error_extended(errp)) {
+   pr_err("Machine check interrupt: Missing extended error log\n");
+   return;
+   }
+
+   pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+   if (pseries_log == NULL)
+   return;
+
+   mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+
+   error_type = rtas_mc_error_type(mce_log);
+   err_sub_type = rtas_mc_error_sub_type(mce_log);
+
+   switch (rtas_error_severity(errp)) {
+   case RTAS_SEVERITY_NO_ERROR:
+   level = KERN_INFO;
+   sevstr = "Harmless";
+   break;
+   case RTAS_SEVERITY_WARNING:
+   level = KERN_WARNING;
+   sevstr = "";
+   break;
+   case RTAS_SEVERITY_ERROR:
+   case RTAS_SEVERITY_ERROR_SYNC:
+   level = KERN_ERR;
+   sevstr = "Severe";
+   break;
+   case RTAS_SEVERITY_FATAL:
+   default:
+   level = KERN_ERR;
+   sevstr = "Fatal";
+   break;
+   }
+
+   printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
+   disposition == RTAS_DISP_FULLY_RECOVERED ?
+   "Recovered" : "Not recovered");
+   if (user_mode(regs)) {
+   printk("%s  NIP: [%016lx] PID: %d Comm: %s\n", level,
+   regs->nip, current->pid, current->comm);
+   } else {
+   printk("%s  NIP [%016lx]: %pS\n", level, regs->nip,
+   (void *)regs->nip);
+   }
+   printk("%s  Initiator: %s\n", level,
+   VAL_TO_STRING(initiators, initiator));
+
+   switch (error_type) {
+   case PSERIES_MC_ERROR_TYPE_UE:
+   printk("%s  

[PATCH v5 5/7] powerpc/pseries: flush SLB contents on SLB MCE errors.

2018-07-01 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar 

On pseries, as of today the system crashes if we get a machine check
exception due to SLB errors. These are soft errors and can be fixed by
flushing the SLBs so that the kernel can continue to function instead of
crashing. We do this in real mode before turning on the MMU; otherwise
we would run into nested machine checks. This patch now fetches the
rtas error log in real mode and flushes the SLBs on SLB errors.

Signed-off-by: Mahesh Salgaonkar 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |1 
 arch/powerpc/include/asm/machdep.h|1 
 arch/powerpc/kernel/exceptions-64s.S  |   42 +
 arch/powerpc/kernel/mce.c |   16 +++-
 arch/powerpc/mm/slb.c |6 +++
 arch/powerpc/platforms/powernv/opal.c |1 
 arch/powerpc/platforms/pseries/pseries.h  |1 
 arch/powerpc/platforms/pseries/ras.c  |   51 +
 arch/powerpc/platforms/pseries/setup.c|1 
 9 files changed, 116 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 50ed64fba4ae..cc00a7088cf3 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -487,6 +487,7 @@ extern void hpte_init_native(void);
 
 extern void slb_initialize(void);
 extern void slb_flush_and_rebolt(void);
+extern void slb_flush_and_rebolt_realmode(void);
 
 extern void slb_vmalloc_update(void);
 extern void slb_set_size(u16 size);
diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index ffe7c71e1132..fe447e0d4140 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -108,6 +108,7 @@ struct machdep_calls {
 
/* Early exception handlers called in realmode */
int (*hmi_exception_early)(struct pt_regs *regs);
+   int (*machine_check_early)(struct pt_regs *regs);
 
/* Called during machine check exception to retrive fixup address. */
bool(*mce_check_early_recovery)(struct pt_regs *regs);
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index f283958129f2..0038596b7906 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -332,6 +332,9 @@ TRAMP_REAL_BEGIN(machine_check_pSeries)
 machine_check_fwnmi:
SET_SCRATCH0(r13)   /* save r13 */
EXCEPTION_PROLOG_0(PACA_EXMC)
+BEGIN_FTR_SECTION
+   b   machine_check_pSeries_early
+END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
 machine_check_pSeries_0:
EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
/*
@@ -343,6 +346,45 @@ machine_check_pSeries_0:
 
 TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
 
+TRAMP_REAL_BEGIN(machine_check_pSeries_early)
+BEGIN_FTR_SECTION
+   EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
+   mr  r10,r1  /* Save r1 */
+   ld  r1,PACAMCEMERGSP(r13)   /* Use MC emergency stack */
+   subir1,r1,INT_FRAME_SIZE/* alloc stack frame*/
+   mfspr   r11,SPRN_SRR0   /* Save SRR0 */
+   mfspr   r12,SPRN_SRR1   /* Save SRR1 */
+   EXCEPTION_PROLOG_COMMON_1()
+   EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
+   EXCEPTION_PROLOG_COMMON_3(0x200)
+   addir3,r1,STACK_FRAME_OVERHEAD
+   BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI */
+
+   /* Move original SRR0 and SRR1 into the respective regs */
+   ld  r9,_MSR(r1)
+   mtspr   SPRN_SRR1,r9
+   ld  r3,_NIP(r1)
+   mtspr   SPRN_SRR0,r3
+   ld  r9,_CTR(r1)
+   mtctr   r9
+   ld  r9,_XER(r1)
+   mtxer   r9
+   ld  r9,_LINK(r1)
+   mtlrr9
+   REST_GPR(0, r1)
+   REST_8GPRS(2, r1)
+   REST_GPR(10, r1)
+   ld  r11,_CCR(r1)
+   mtcrr11
+   REST_GPR(11, r1)
+   REST_2GPRS(12, r1)
+   /* restore original r1. */
+   ld  r1,GPR1(r1)
+   SET_SCRATCH0(r13)   /* save r13 */
+   EXCEPTION_PROLOG_0(PACA_EXMC)
+   b   machine_check_pSeries_0
+END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
+
 EXC_COMMON_BEGIN(machine_check_common)
/*
 * Machine check is different because we use a different
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index efdd16a79075..221271c96a57 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -488,9 +488,21 @@ long machine_check_early(struct pt_regs *regs)
 {
long handled = 0;
 
-   __this_cpu_inc(irq_stat.mce_exceptions);
+   /*
+* For pSeries we count mce when we go into virtual mode machine
+* check handler. Hence skip it. Also, We can't access per cpu
+* variables in real mode for LPAR.
+*/
+   if (early_cpu_has_feature(CPU_FTR_HVMODE))
+   

[PATCH v5 4/7] powerpc/pseries: Define MCE error event section.

2018-07-01 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar 

On pseries, the machine check error details are part of RTAS extended
event log passed under Machine check exception section. This patch adds
the definition of rtas MCE event section and related helper
functions.

Signed-off-by: Mahesh Salgaonkar 
---
 arch/powerpc/include/asm/rtas.h |  111 +++
 1 file changed, 111 insertions(+)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index ec9dd79398ee..ceeed2dd489b 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -185,6 +185,13 @@ static inline uint8_t rtas_error_disposition(const struct 
rtas_error_log *elog)
return (elog->byte1 & 0x18) >> 3;
 }
 
+static inline
+void rtas_set_disposition_recovered(struct rtas_error_log *elog)
+{
+   elog->byte1 &= ~0x18;
+   elog->byte1 |= (RTAS_DISP_FULLY_RECOVERED << 3);
+}
+
 static inline uint8_t rtas_error_extended(const struct rtas_error_log *elog)
 {
return (elog->byte1 & 0x04) >> 2;
@@ -275,6 +282,7 @@ inline uint32_t rtas_ext_event_company_id(struct 
rtas_ext_event_log_v6 *ext_log)
 #define PSERIES_ELOG_SECT_ID_CALL_HOME (('C' << 8) | 'H')
 #define PSERIES_ELOG_SECT_ID_USER_DEF  (('U' << 8) | 'D')
 #define PSERIES_ELOG_SECT_ID_HOTPLUG   (('H' << 8) | 'P')
+#define PSERIES_ELOG_SECT_ID_MCE   (('M' << 8) | 'C')
 
 /* Vendor specific Platform Event Log Format, Version 6, section header */
 struct pseries_errorlog {
@@ -326,6 +334,109 @@ struct pseries_hp_errorlog {
 #define PSERIES_HP_ELOG_ID_DRC_COUNT   3
 #define PSERIES_HP_ELOG_ID_DRC_IC  4
 
+/* RTAS pseries MCE errorlog section */
+#pragma pack(push, 1)
+struct pseries_mc_errorlog {
+   __be32  fru_id;
+   __be32  proc_id;
+   uint8_t error_type;
+   union {
+   struct {
+   uint8_t ue_err_type;
+   /* 
+* X1: Permanent or Transient UE.
+*  X   1: Effective address provided.
+*   X  1: Logical address provided.
+*XX2: Reserved.
+*  XXX 3: Type of UE error.
+*/
+   uint8_t reserved_1[6];
+   __be64  effective_address;
+   __be64  logical_address;
+   } ue_error;
+   struct {
+   uint8_t soft_err_type;
+   /* 
+* X1: Effective address provided.
+*  X   5: Reserved.
+*   XX 2: Type of SLB/ERAT/TLB error.
+*/
+   uint8_t reserved_1[6];
+   __be64  effective_address;
+   uint8_t reserved_2[8];
+   } soft_error;
+   } u;
+};
+#pragma pack(pop)
+
+/* RTAS pseries MCE error types */
+#define PSERIES_MC_ERROR_TYPE_UE   0x00
+#define PSERIES_MC_ERROR_TYPE_SLB  0x01
+#define PSERIES_MC_ERROR_TYPE_ERAT 0x02
+#define PSERIES_MC_ERROR_TYPE_TLB  0x04
+#define PSERIES_MC_ERROR_TYPE_D_CACHE  0x05
+#define PSERIES_MC_ERROR_TYPE_I_CACHE  0x07
+
+/* RTAS pseries MCE error sub types */
+#define PSERIES_MC_ERROR_UE_INDETERMINATE  0
+#define PSERIES_MC_ERROR_UE_IFETCH 1
+#define PSERIES_MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH 2
+#define PSERIES_MC_ERROR_UE_LOAD_STORE 3
+#define PSERIES_MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4
+
+#define PSERIES_MC_ERROR_SLB_PARITY0
+#define PSERIES_MC_ERROR_SLB_MULTIHIT  1
+#define PSERIES_MC_ERROR_SLB_INDETERMINATE 2
+
+#define PSERIES_MC_ERROR_ERAT_PARITY   1
+#define PSERIES_MC_ERROR_ERAT_MULTIHIT 2
+#define PSERIES_MC_ERROR_ERAT_INDETERMINATE3
+
+#define PSERIES_MC_ERROR_TLB_PARITY1
+#define PSERIES_MC_ERROR_TLB_MULTIHIT  2
+#define PSERIES_MC_ERROR_TLB_INDETERMINATE 3
+
+static inline uint8_t rtas_mc_error_type(const struct pseries_mc_errorlog 
*mlog)
+{
+   return mlog->error_type;
+}
+
+static inline uint8_t rtas_mc_error_sub_type(
+   const struct pseries_mc_errorlog *mlog)
+{
+   switch (mlog->error_type) {
+   casePSERIES_MC_ERROR_TYPE_UE:
+   return (mlog->u.ue_error.ue_err_type & 0x07);
+   casePSERIES_MC_ERROR_TYPE_SLB:
+   casePSERIES_MC_ERROR_TYPE_ERAT:
+   casePSERIES_MC_ERROR_TYPE_TLB:
+   return (mlog->u.soft_error.soft_err_type & 0x03);
+   default:
+   return 0;
+   }
+}
+
+static inline uint64_t rtas_mc_get_effective_addr(
+   const struct pseries_mc_errorlog *mlog)
+{
+   uint64_t addr = 0;
+
+   switch 

[PATCH v5 3/7] powerpc/pseries: Fix endainness while restoring of r3 in MCE handler.

2018-07-01 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar 

During a machine check interrupt on the pseries platform, register r3
points to the RTAS extended event log passed by the hypervisor. Since the
hypervisor uses r3 to pass the pointer to the rtas log, it stores the
original r3 value at the start of the memory (first 8 bytes) pointed to
by r3. Since the hypervisor stores this info and the rtas log is in BE
format, linux should make sure to restore the r3 value in the correct
endian format.

Without this patch, when the MCE handler returns after recovery to the code
that caused the MCE, we may end up with a Data SLB access interrupt for an
invalid address, followed by a kernel panic or hang.

[   62.878965] Severe Machine check interrupt [Recovered]
[   62.878968]   NIP [dca301b8]: init_module+0x1b8/0x338 [bork_kernel]
[   62.878969]   Initiator: CPU
[   62.878970]   Error type: SLB [Multihit]
[   62.878971] Effective address: dca7
cpu 0xa: Vector: 380 (Data SLB Access) at [c000fc7775b0]
pc: c09694c0: vsnprintf+0x80/0x480
lr: c09698e0: vscnprintf+0x20/0x60
sp: c000fc777830
   msr: 82009033
   dar: a803a30c00d0
  current = 0xcbc9ef00
  paca= 0xc0001eca5c00   softe: 3irq_happened: 0x01
pid   = 8860, comm = insmod
[c000fc7778b0] c09698e0 vscnprintf+0x20/0x60
[c000fc7778e0] c016b6c4 vprintk_emit+0xb4/0x4b0
[c000fc777960] c016d40c vprintk_func+0x5c/0xd0
[c000fc777980] c016cbb4 printk+0x38/0x4c
[c000fc7779a0] dca301c0 init_module+0x1c0/0x338 [bork_kernel]
[c000fc777a40] c000d9c4 do_one_initcall+0x54/0x230
[c000fc777b00] c01b3b74 do_init_module+0x8c/0x248
[c000fc777b90] c01b2478 load_module+0x12b8/0x15b0
[c000fc777d30] c01b29e8 sys_finit_module+0xa8/0x110
[c000fc777e30] c000b204 system_call+0x58/0x6c
--- Exception: c00 (System Call) at 7fff8bda0644
SP (7fffdfbfe980) is in userspace

This patch fixes this issue.

Fixes: a08a53ea4c97 ("powerpc/le: Enable RTAS events support")
Cc: sta...@vger.kernel.org
Reviewed-by: Nicholas Piggin 
Signed-off-by: Mahesh Salgaonkar 
---
 arch/powerpc/platforms/pseries/ras.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 14a46b07ab2f..851ce326874a 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -367,7 +367,7 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct 
pt_regs *regs)
}
 
savep = __va(regs->gpr[3]);
-   regs->gpr[3] = savep[0];/* restore original r3 */
+   regs->gpr[3] = be64_to_cpu(savep[0]);   /* restore original r3 */
 
h = (struct rtas_error_log *)[1];
/* Use the per cpu buffer from paca to store rtas error log */



[PATCH v5 2/7] powerpc/pseries: Defer the logging of rtas error to irq work queue.

2018-07-01 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar 

rtas_log_buf is a buffer to hold RTAS event data that are communicated
to kernel by hypervisor. This buffer is then used to pass RTAS event
data to user through proc fs. This buffer is allocated from vmalloc
(non-linear mapping) area.

On a machine check interrupt, register r3 points to the RTAS extended
event log passed by the hypervisor, which contains the MCE event. The
pseries machine check handler then logs this error into rtas_log_buf.
Because rtas_log_buf is a vmalloc-ed (non-linear) buffer, we end up
taking a page fault (vector 0x300) while accessing it. Since the machine
check interrupt handler runs in NMI context, we cannot afford to take any
page fault. Page faults are not honored in NMI context and cause a
kernel panic. Apart from that, as Nick pointed out, pSeries_log_error()
also takes a spin_lock while logging the error, which is not safe in NMI
context. It may end up in a deadlock if we get another MCE before
releasing the lock. Fix this by deferring the logging of the rtas error
to the irq work queue.

The current implementation uses two different buffers to hold the rtas
error log depending on whether an extended log is provided or not. This
makes it a bit difficult to identify which buffer has valid data that
needs to be logged later in irq work. Simplify this by using a single
buffer, one per paca, and copy the rtas log to it irrespective of whether
an extended log is provided or not. Allocate this buffer below the RMA
region so that it can be accessed in the real mode mce handler.

Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable 
interrupt")
Cc: sta...@vger.kernel.org
Signed-off-by: Mahesh Salgaonkar 
---
 arch/powerpc/include/asm/paca.h|3 ++
 arch/powerpc/platforms/pseries/ras.c   |   47 ++--
 arch/powerpc/platforms/pseries/setup.c |   16 +++
 3 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 3f109a3e3edb..b441fef53077 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -251,6 +251,9 @@ struct paca_struct {
void *rfi_flush_fallback_area;
u64 l1d_flush_size;
 #endif
+#ifdef CONFIG_PPC_PSERIES
+   u8 *mce_data_buf;   /* buffer to hold per cpu rtas errlog */
+#endif /* CONFIG_PPC_PSERIES */
 } cacheline_aligned;
 
 extern void copy_mm_to_paca(struct mm_struct *mm);
diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index ef104144d4bc..14a46b07ab2f 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -32,11 +33,13 @@
 static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
 static DEFINE_SPINLOCK(ras_log_buf_lock);
 
-static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
-static DEFINE_PER_CPU(__u64, mce_data_buf);
-
 static int ras_check_exception_token;
 
+static void mce_process_errlog_event(struct irq_work *work);
+static struct irq_work mce_errlog_process_work = {
+   .func = mce_process_errlog_event,
+};
+
 #define EPOW_SENSOR_TOKEN  9
 #define EPOW_SENSOR_INDEX  0
 
@@ -330,16 +333,20 @@ static irqreturn_t ras_error_interrupt(int irq, void 
*dev_id)
A) >= 0x7000) && ((A) < 0x7ff0)) || \
(((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16
 
+static inline struct rtas_error_log *fwnmi_get_errlog(void)
+{
+   return (struct rtas_error_log *)local_paca->mce_data_buf;
+}
+
 /*
  * Get the error information for errors coming through the
  * FWNMI vectors.  The pt_regs' r3 will be updated to reflect
  * the actual r3 if possible, and a ptr to the error log entry
  * will be returned if found.
  *
- * If the RTAS error is not of the extended type, then we put it in a per
- * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf.
+ * Use one buffer mce_data_buf per cpu to store RTAS error.
  *
- * The global_mce_data_buf does not have any locks or protection around it,
+ * The mce_data_buf does not have any locks or protection around it,
  * if a second machine check comes in, or a system reset is done
  * before we have logged the error, then we will get corruption in the
  * error log.  This is preferable over holding off on calling
@@ -349,7 +356,7 @@ static irqreturn_t ras_error_interrupt(int irq, void 
*dev_id)
 static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
 {
unsigned long *savep;
-   struct rtas_error_log *h, *errhdr = NULL;
+   struct rtas_error_log *h;
 
/* Mask top two bits */
regs->gpr[3] &= ~(0x3UL << 62);
@@ -362,22 +369,20 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct 
pt_regs *regs)
savep = __va(regs->gpr[3]);
regs->gpr[3] = savep[0];/* restore original r3 */
 
-   /* If it isn't an extended log we can use the per cpu 64bit buffer */
h = (struct rtas_error_log *)[1];
+  

[PATCH v5 1/7] powerpc/pseries: Avoid using the size greater than

2018-07-01 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar 

The global mce data buffer that is used to copy the rtas error log is 2048
(RTAS_ERROR_LOG_MAX) bytes in size. Before the copy we read
extended_log_length from the rtas error log header, then use the max of
extended_log_length and RTAS_ERROR_LOG_MAX as the size of data to be copied.
Ideally the platform (phyp) will never send an extended error log with
size > 2048. But if that happens, then we have a risk of buffer overrun
and corruption. Fix this by using min_t instead.

Fixes: d368514c3097 ("powerpc: Fix corruption when grabbing FWNMI data")
Reported-by: Michal Suchanek 
Signed-off-by: Mahesh Salgaonkar 
---
 arch/powerpc/platforms/pseries/ras.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 5e1ef9150182..ef104144d4bc 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -371,7 +371,7 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct 
pt_regs *regs)
int len, error_log_length;
 
error_log_length = 8 + rtas_error_extended_log_length(h);
-   len = max_t(int, error_log_length, RTAS_ERROR_LOG_MAX);
+   len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX);
memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
memcpy(global_mce_data_buf, h, len);
errhdr = (struct rtas_error_log *)global_mce_data_buf;



[PATCH v5 0/7] powerpc/pseries: Machine check handler improvements.

2018-07-01 Thread Mahesh J Salgaonkar
This patch series includes some improvements to the machine check handler
for pseries. Patch 1 fixes a buffer overrun issue if the rtas extended error
log size is greater than RTAS_ERROR_LOG_MAX.
Patch 2 fixes an issue where the machine check handler crashes the
kernel while accessing a vmalloc-ed buffer in NMI context.
Patch 3 fixes an endian bug while restoring r3 in the MCE handler.
Patch 4 defines the MCE error event section.
Patch 5 implements a real mode mce handler and flushes the SLBs on SLB error.
Patch 6 displays the MCE error details on the console.
Patch 7 saves and dumps the SLB contents on SLB MCE errors to improve
debuggability.

Change in V5:
- Use min_t instead of max_t.
- Fix an issue reported by kbuild test robot and addressed review comments.

Change in V4:
- Flush the SLBs in real mode mce handler to handle SLB errors for entry 0.
- Allocate buffers per cpu to hold rtas error log and old slb contents.
- Defer the logging of rtas error log to irq work queue.

Change in V3:
- Moved patch 5 to patch 2

Change in V2:
- patch 3: Display additional info (NIP and task info) in MCE error details.
- patch 5: Fix endian bug while restoring of r3 in MCE handler.

---

Mahesh Salgaonkar (7):
  powerpc/pseries: Avoid using the size greater than
  powerpc/pseries: Defer the logging of rtas error to irq work queue.
  powerpc/pseries: Fix endainness while restoring of r3 in MCE handler.
  powerpc/pseries: Define MCE error event section.
  powerpc/pseries: flush SLB contents on SLB MCE errors.
  powerpc/pseries: Display machine check error details.
  powerpc/pseries: Dump the SLB contents on SLB MCE errors.


 arch/powerpc/include/asm/book3s/64/mmu-hash.h |8 +
 arch/powerpc/include/asm/machdep.h|1 
 arch/powerpc/include/asm/paca.h   |4 
 arch/powerpc/include/asm/rtas.h   |  116 
 arch/powerpc/kernel/exceptions-64s.S  |   42 
 arch/powerpc/kernel/mce.c |   16 +-
 arch/powerpc/mm/slb.c |   63 +++
 arch/powerpc/platforms/powernv/opal.c |1 
 arch/powerpc/platforms/pseries/pseries.h  |1 
 arch/powerpc/platforms/pseries/ras.c  |  241 +++--
 arch/powerpc/platforms/pseries/setup.c|   27 +++
 11 files changed, 499 insertions(+), 21 deletions(-)

--
Signature



Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-07-01 Thread David Gibson
On Mon, Jul 02, 2018 at 02:33:30PM +1000, Alexey Kardashevskiy wrote:
> On Mon, 2 Jul 2018 14:08:52 +1000
> David Gibson  wrote:
> 
> > On Fri, Jun 29, 2018 at 05:07:47PM +1000, Alexey Kardashevskiy wrote:
> > > On Fri, 29 Jun 2018 15:18:20 +1000
> > > Alexey Kardashevskiy  wrote:
> > >   
> > > > On Fri, 29 Jun 2018 14:57:02 +1000
> > > > David Gibson  wrote:
> > > >   
> > > > > On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy wrote: 
> > > > >
> > > > > > On Fri, 29 Jun 2018 14:12:41 +1000
> > > > > > David Gibson  wrote:
> > > > > >   
> > > > > > > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy 
> > > > > > > wrote:  
> > > > > > > > We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c 
> > > > > > > > that
> > > > > > > > an IOMMU page is contained in the physical page so the PCI 
> > > > > > > > hardware won't
> > > > > > > > get access to unassigned host memory.
> > > > > > > > 
> > > > > > > > However we do not have this check in KVM fastpath (H_PUT_TCE 
> > > > > > > > accelerated
> > > > > > > > code) so the user space can pin memory backed with 64k pages 
> > > > > > > > and create
> > > > > > > > a hardware TCE table with a bigger page size. We were lucky so 
> > > > > > > > far and
> > > > > > > > did not hit this yet as the very first time the mapping happens
> > > > > > > > we do not have tbl::it_userspace allocated yet and fall back to
> > > > > > > > the userspace which in turn calls VFIO IOMMU driver and that 
> > > > > > > > fails
> > > > > > > > because of the check in vfio_iommu_spapr_tce.c which is really
> > > > > > > > sustainable solution.
> > > > > > > > 
> > > > > > > > This stores the smallest preregistered page size in the 
> > > > > > > > preregistered
> > > > > > > > region descriptor and changes the mm_iommu_xxx API to check 
> > > > > > > > this against
> > > > > > > > the IOMMU page size.
> > > > > > > > 
> > > > > > > > Signed-off-by: Alexey Kardashevskiy 
> > > > > > > > ---
> > > > > > > > Changes:
> > > > > > > > v2:
> > > > > > > > * explicitly check for compound pages before calling 
> > > > > > > > compound_order()
> > > > > > > > 
> > > > > > > > ---
> > > > > > > > The bug is: run QEMU _without_ hugepages (no -mempath) and tell 
> > > > > > > > it to
> > > > > > > > advertise 16MB pages to the guest; a typical pseries guest will 
> > > > > > > > use 16MB
> > > > > > > > for IOMMU pages without checking the mmu pagesize and this will 
> > > > > > > > fail
> > > > > > > > at 
> > > > > > > > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > > > > > > 
> > > > > > > > With the change, mapping will fail in KVM and the guest will 
> > > > > > > > print:
> > > > > > > > 
> > > > > > > > mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 
> > > > > > > > 800 2000 18 1f returned 0 (liobn = 0x8001 starting 
> > > > > > > > addr = 800 0)
> > > > > > > > mlx5_core :00:00.0: created tce table LIOBN 0x8001 for 
> > > > > > > > /pci@8002000/ethernet@0
> > > > > > > > mlx5_core :00:00.0: failed to map direct window for
> > > > > > > > /pci@8002000/ethernet@0: -1
> > > > > > > 
> > > > > > > [snip]  
> > > > > > > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > > unsigned long ua, unsigned long entries,
> > > > > > > > struct mm_iommu_table_group_mem_t **pmem)
> > > > > > > >  {
> > > > > > > > struct mm_iommu_table_group_mem_t *mem;
> > > > > > > > -   long i, j, ret = 0, locked_entries = 0;
> > > > > > > > +   long i, j, ret = 0, locked_entries = 0, pageshift;
> > > > > > > > struct page *page = NULL;
> > > > > > > >  
> > > > > > > > mutex_lock(_list_mutex);
> > > > > > > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > > unsigned long ua, unsigned long entries,
> > > > > > > > goto unlock_exit;
> > > > > > > > }
> > > > > > > >  
> > > > > >  > > +  mem->pageshift = 30; /* start from 1G pages - the 
> > > > > > biggest we have */
> > > > > > > 
> > > > > > > What about 16G pages on an HPT system?  
> > > > > > 
> > > > > > 
> > > > > > Below in the loop mem->pageshift will reduce to the biggest actual 
> > > > > > size
> > > > > > which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
> > > > > > pinned, no loss there.  
> > > > > 
> > > > > Are you saying that 16G IOMMU pages aren't supported?  Or that there's
> > > > > some reason a guest can never use them?
> > > > 
> > > > 
> > > > ah, 16_G_, not _M_. My bad. I just never tried such huge pages, I will
> > > > lift the limit up to 64 then, easier this way.  
> > > 
> > > 
> > > Ah, no, rather this as the upper limit:
> > > 
> > > mem->pageshift = ilog2(entries) + PAGE_SHIFT;  
> > 
> > I can't make sense of this comment in context.  I see how you're
> > computing 

Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-07-01 Thread Alexey Kardashevskiy
On Mon, 2 Jul 2018 14:08:52 +1000
David Gibson  wrote:

> On Fri, Jun 29, 2018 at 05:07:47PM +1000, Alexey Kardashevskiy wrote:
> > On Fri, 29 Jun 2018 15:18:20 +1000
> > Alexey Kardashevskiy  wrote:
> >   
> > > On Fri, 29 Jun 2018 14:57:02 +1000
> > > David Gibson  wrote:
> > >   
> > > > On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy wrote:   
> > > >  
> > > > > On Fri, 29 Jun 2018 14:12:41 +1000
> > > > > David Gibson  wrote:
> > > > >   
> > > > > > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy 
> > > > > > wrote:  
> > > > > > > We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c 
> > > > > > > that
> > > > > > > an IOMMU page is contained in the physical page so the PCI 
> > > > > > > hardware won't
> > > > > > > get access to unassigned host memory.
> > > > > > > 
> > > > > > > However we do not have this check in KVM fastpath (H_PUT_TCE 
> > > > > > > accelerated
> > > > > > > code) so the user space can pin memory backed with 64k pages and 
> > > > > > > create
> > > > > > > a hardware TCE table with a bigger page size. We were lucky so 
> > > > > > > far and
> > > > > > > did not hit this yet as the very first time the mapping happens
> > > > > > > we do not have tbl::it_userspace allocated yet and fall back to
> > > > > > > the userspace which in turn calls VFIO IOMMU driver and that fails
> > > > > > > because of the check in vfio_iommu_spapr_tce.c which is really
> > > > > > > sustainable solution.
> > > > > > > 
> > > > > > > This stores the smallest preregistered page size in the 
> > > > > > > preregistered
> > > > > > > region descriptor and changes the mm_iommu_xxx API to check this 
> > > > > > > against
> > > > > > > the IOMMU page size.
> > > > > > > 
> > > > > > > Signed-off-by: Alexey Kardashevskiy 
> > > > > > > ---
> > > > > > > Changes:
> > > > > > > v2:
> > > > > > > * explicitly check for compound pages before calling 
> > > > > > > compound_order()
> > > > > > > 
> > > > > > > ---
> > > > > > > The bug is: run QEMU _without_ hugepages (no -mempath) and tell 
> > > > > > > it to
> > > > > > > advertise 16MB pages to the guest; a typical pseries guest will 
> > > > > > > use 16MB
> > > > > > > for IOMMU pages without checking the mmu pagesize and this will 
> > > > > > > fail
> > > > > > > at 
> > > > > > > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > > > > > 
> > > > > > > With the change, mapping will fail in KVM and the guest will 
> > > > > > > print:
> > > > > > > 
> > > > > > > mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 800 
> > > > > > > 2000 18 1f returned 0 (liobn = 0x8001 starting addr = 
> > > > > > > 800 0)
> > > > > > > mlx5_core :00:00.0: created tce table LIOBN 0x8001 for 
> > > > > > > /pci@8002000/ethernet@0
> > > > > > > mlx5_core :00:00.0: failed to map direct window for
> > > > > > > /pci@8002000/ethernet@0: -1
> > > > > > 
> > > > > > [snip]  
> > > > > > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > unsigned long ua, unsigned long entries,
> > > > > > >   struct mm_iommu_table_group_mem_t **pmem)
> > > > > > >  {
> > > > > > >   struct mm_iommu_table_group_mem_t *mem;
> > > > > > > - long i, j, ret = 0, locked_entries = 0;
> > > > > > > + long i, j, ret = 0, locked_entries = 0, pageshift;
> > > > > > >   struct page *page = NULL;
> > > > > > >  
> > > > > > >   mutex_lock(_list_mutex);
> > > > > > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > > unsigned long ua, unsigned long entries,
> > > > > > >   goto unlock_exit;
> > > > > > >   }
> > > > > > >  
> > > > >  > > +mem->pageshift = 30; /* start from 1G pages - the 
> > > > > biggest we have */
> > > > > > 
> > > > > > What about 16G pages on an HPT system?  
> > > > > 
> > > > > 
> > > > > Below in the loop mem->pageshift will reduce to the biggest actual 
> > > > > size
> > > > > which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
> > > > > pinned, no loss there.  
> > > > 
> > > > Are you saying that 16G IOMMU pages aren't supported?  Or that there's
> > > > some reason a guest can never use them?
> > > 
> > > 
> > > ah, 16_G_, not _M_. My bad. I just never tried such huge pages, I will
> > > lift the limit up to 64 then, easier this way.  
> > 
> > 
> > Ah, no, rather this as the upper limit:
> > 
> > mem->pageshift = ilog2(entries) + PAGE_SHIFT;  
> 
> I can't make sense of this comment in context.  I see how you're
> computing the minimum page size in the reserved region.
> 
> My question is about what the "maximum minimum" is - the starting
> value from which you calculate.  Currently it's 1G, but I can't
> immediately see a reason that 16G is impossible here.


16GB is impossible if the chunk we are preregistering here is smaller
than that, for example, the entire guest ram 

Re: [Update] Regression in 4.18 - 32-bit PowerPC crashes on boot - bisected to commit 1d40a5ea01d5

2018-07-01 Thread Michael Ellerman
Linus Torvalds  writes:
> On Fri, Jun 29, 2018 at 1:42 PM Larry Finger  
> wrote:
>>
>> I have more information regarding this BUG. Line 700 of page-flags.h is the
>> macro PAGE_TYPE_OPS(Table, table). For further debugging, I manually expanded
>> the macro, and found that the bug line is VM_BUG_ON_PAGE(!PageTable(page), 
>> page)
>> in routine __ClearPageTable(), which is called from pgtable_page_dtor() in
>> include/linux/mm.h. I also added a printk call to PageTable() that logs
>> page->page_type. The routine was called twice. The first had page_type of
>> 0xfbff, which would have been expected for a . The second call had
>> 0x, which led to the BUG.
>
> So it looks to me like the tear-down of the page tables first found a
> page that is indeed a page table, and cleared the page table bit
> (well, it set it - the bits are reversed).
...
>
> That said, can some ppc person who knows the 32-bit ppc code and maybe
> knows what that "interrupt: 700" means talk about that oddity in the
> trace, please?

I think everyone else answered your questions here, and it should be
fixed now in your tree.

Larry let me know if you're still seeing a crash with 4.18-rc3.

cheers


Re: [PATCH kernel v2 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-07-01 Thread David Gibson
On Fri, Jun 29, 2018 at 05:07:47PM +1000, Alexey Kardashevskiy wrote:
> On Fri, 29 Jun 2018 15:18:20 +1000
> Alexey Kardashevskiy  wrote:
> 
> > On Fri, 29 Jun 2018 14:57:02 +1000
> > David Gibson  wrote:
> > 
> > > On Fri, Jun 29, 2018 at 02:51:21PM +1000, Alexey Kardashevskiy wrote:  
> > > > On Fri, 29 Jun 2018 14:12:41 +1000
> > > > David Gibson  wrote:
> > > > 
> > > > > On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy wrote: 
> > > > >
> > > > > > We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
> > > > > > an IOMMU page is contained in the physical page so the PCI hardware 
> > > > > > won't
> > > > > > get access to unassigned host memory.
> > > > > > 
> > > > > > However we do not have this check in KVM fastpath (H_PUT_TCE 
> > > > > > accelerated
> > > > > > code) so the user space can pin memory backed with 64k pages and 
> > > > > > create
> > > > > > a hardware TCE table with a bigger page size. We were lucky so far 
> > > > > > and
> > > > > > did not hit this yet as the very first time the mapping happens
> > > > > > we do not have tbl::it_userspace allocated yet and fall back to
> > > > > > the userspace which in turn calls VFIO IOMMU driver and that fails
> > > > > > because of the check in vfio_iommu_spapr_tce.c which is not really
> > > > > > a sustainable solution.
> > > > > > 
> > > > > > This stores the smallest preregistered page size in the 
> > > > > > preregistered
> > > > > > region descriptor and changes the mm_iommu_xxx API to check this 
> > > > > > against
> > > > > > the IOMMU page size.
> > > > > > 
> > > > > > Signed-off-by: Alexey Kardashevskiy 
> > > > > > ---
> > > > > > Changes:
> > > > > > v2:
> > > > > > * explicitly check for compound pages before calling 
> > > > > > compound_order()
> > > > > > 
> > > > > > ---
> > > > > > The bug is: run QEMU _without_ hugepages (no -mempath) and tell it 
> > > > > > to
> > > > > > advertise 16MB pages to the guest; a typical pseries guest will use 
> > > > > > 16MB
> > > > > > for IOMMU pages without checking the mmu pagesize and this will fail
> > > > > > at 
> > > > > > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> > > > > > 
> > > > > > With the change, mapping will fail in KVM and the guest will print:
> > > > > > 
> > > > > > mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 800 
> > > > > > 2000 18 1f returned 0 (liobn = 0x8001 starting addr = 
> > > > > > 800 0)
> > > > > > mlx5_core :00:00.0: created tce table LIOBN 0x8001 for 
> > > > > > /pci@8002000/ethernet@0
> > > > > > mlx5_core :00:00.0: failed to map direct window for
> > > > > > /pci@8002000/ethernet@0: -1  
> > > > > 
> > > > > [snip]
> > > > > > @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > unsigned long ua, unsigned long entries,
> > > > > > struct mm_iommu_table_group_mem_t **pmem)
> > > > > >  {
> > > > > > struct mm_iommu_table_group_mem_t *mem;
> > > > > > -   long i, j, ret = 0, locked_entries = 0;
> > > > > > +   long i, j, ret = 0, locked_entries = 0, pageshift;
> > > > > > struct page *page = NULL;
> > > > > >  
> > > > > > mutex_lock(_list_mutex);
> > > > > > @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > unsigned long ua, unsigned long entries,
> > > > > > goto unlock_exit;
> > > > > > }
> > > > > >  
> > > >  > > +  mem->pageshift = 30; /* start from 1G pages - the biggest we 
> > > > have */  
> > > > > 
> > > > > What about 16G pages on an HPT system?
> > > > 
> > > > 
> > > > Below in the loop mem->pageshift will reduce to the biggest actual size
> > > > which will be 16mb/64k/4k. Or remain 1GB if no memory is actually
> > > > pinned, no loss there.
> > > 
> > > Are you saying that 16G IOMMU pages aren't supported?  Or that there's
> > > some reason a guest can never use them?  
> > 
> > 
> > ah, 16_G_, not _M_. My bad. I just never tried such huge pages, I will
> > lift the limit up to 64 then, easier this way.
> 
> 
> Ah, no, rather this as the upper limit:
> 
> mem->pageshift = ilog2(entries) + PAGE_SHIFT;

I can't make sense of this comment in context.  I see how you're
computing the minimum page size in the reserved region.

My question is about what the "maximum minimum" is - the starting
value from which you calculate.  Currently it's 1G, but I can't
immediately see a reason that 16G is impossible here.

> @entries here is a number of system pages being pinned in that
> function.
> 
> 
> 
> > 
> > >   
> > > > > > for (i = 0; i < entries; ++i) {
> > > > > > if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
> > > > > > 1/* pages */, 1/* iswrite */, 
> > > > > > )) {
> > > > > > @@ -199,6 +202,11 @@ long mm_iommu_get(struct mm_struct *mm, 
> > > > > > unsigned long ua, unsigned long entries,
> > > > > >  

Re: [PATCH] powerpc: mpc5200: Remove VLA usage

2018-07-01 Thread Michael Ellerman
Kees Cook  writes:

> On Fri, Jun 29, 2018 at 2:02 PM, Arnd Bergmann  wrote:
>> On Fri, Jun 29, 2018 at 8:53 PM, Kees Cook  wrote:
>>> In the quest to remove all stack VLA usage from the kernel[1], this
>>> switches to using a stack size large enough for the saved routine and
>>> adds a sanity check.
>>>
>>> [1] 
>>> https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qpxydaacu1rq...@mail.gmail.com
>>>
>>> Signed-off-by: Kees Cook 
>>
>> This seems particularly nice: not only does it avoid the dynamic stack
>> allocation, it
>> also makes sure the new 0x500 handler doesn't overflow into the 0x600
>> exception handler.
>>
>> It would help to explain how you arrived at that '256 byte' number in
>> the changelog though.
>
> Honestly, I just counted instructions, multiplied by 8 and rounded up
> to the next nearest power of 2, and the result felt right for giving
> some level of flexibility for code growth before tripping the WARN. :P
>
> I'm happy to adjust, of course. :)

What if we write it:

   char saved_0x500[0x600 - 0x500];

Hopefully the compiler is smart enough not to generate a VLA for that :)

cheers


Re: [PATCH v2 1/1] powerpc/pseries: fix EEH recovery of some IOV devices

2018-07-01 Thread Michael Ellerman
Sam Bobroff  writes:

> EEH recovery currently fails on pSeries for some IOV capable PCI
> devices, if CONFIG_PCI_IOV is on and the hypervisor doesn't provide
> certain device tree properties for the device. (Found on an IOV
> capable device using the ipr driver.)
>
> Recovery fails in pci_enable_resources() at the check on r->parent,
> because r->flags is set and r->parent is not.  This state is due to
> sriov_init() setting the start, end and flags members of the IOV BARs
> but the parent not being set later in
> pseries_pci_fixup_iov_resources(), because the
> "ibm,open-sriov-vf-bar-info" property is missing.
>
> Correct this by zeroing the resource flags for IOV BARs when they
> can't be configured.
>
> Signed-off-by: Sam Bobroff 
> ---
> Hi,
>
> This is a fix to allow EEH recovery to succeed in a specific situation,
> which I've tried to explain in the commit message.
>
> As with the RFC version, the IOV BARs are disabled by setting the resource
> flags to 0 but the other fields are now left as-is because that is what is 
> done
> elsewhere (see sriov_init() and __pci_read_base()).
>
> I've also examined the concern raised by Bjorn Helgaas, that VFs could be
> enabled later after the BARs are disabled, and it already seems safe: enabling
> VFs (on pseries) depends on another device tree property,
> "ibm,number-of-configurable-vfs" as well as support for the RTAS function
> "ibm_map_pes". Since these are all part of the hypervisor's support for IOV it
> seems unlikely that we would ever see some of them but not all. (None are
> currently provided by QEMU/KVM.) (Additionally, the ipr driver on which the 
> EEH
> recovery failure was discovered doesn't even seem to have SR-IOV support so it
> certainly can't enable VFs.)

Can you fold/reword the above into the change log, it seems like useful
detail.

cheers


Re: [PATCH v9 0/6] add support for relative references in special sections

2018-07-01 Thread Ard Biesheuvel
On 27 June 2018 at 17:15, Will Deacon  wrote:
> Hi Ard,
>
> On Tue, Jun 26, 2018 at 08:27:55PM +0200, Ard Biesheuvel wrote:
>> This adds support for emitting special sections such as initcall arrays,
>> PCI fixups and tracepoints as relative references rather than absolute
>> references. This reduces the size by 50% on 64-bit architectures, but
>> more importantly, it removes the need for carrying relocation metadata
>> for these sections in relocatable kernels (e.g., for KASLR) that needs
>> to be fixed up at boot time. On arm64, this reduces the vmlinux footprint
>> of such a reference by 8x (8 byte absolute reference + 24 byte RELA entry
>> vs 4 byte relative reference)
>>
>> Patch #3 was sent out before as a single patch. This series supersedes
>> the previous submission. This version makes relative ksymtab entries
>> dependent on the new Kconfig symbol HAVE_ARCH_PREL32_RELOCATIONS rather
>> than trying to infer from kbuild test robot replies for which architectures
>> it should be blacklisted.
>>
>> Patch #1 introduces the new Kconfig symbol HAVE_ARCH_PREL32_RELOCATIONS,
>> and sets it for the main architectures that are expected to benefit the
>> most from this feature, i.e., 64-bit architectures or ones that use
>> runtime relocations.
>>
>> Patch #2 add support for #define'ing __DISABLE_EXPORTS to get rid of
>> ksymtab/kcrctab sections in decompressor and EFI stub objects when
>> rebuilding existing C files to run in a different context.
>
> I had a small question on patch 3, but it's really for my understanding.
> So, for patches 1-3:
>
> Reviewed-by: Will Deacon 
>

Thanks all.

Thomas, Ingo,

Except for the below tweak against patch #3 for powerpc, which may
apparently get confused by an input section called .discard without
any suffixes, this series is good to go, but requires your ack to
proceed, so I would like to ask you to share your comments and/or
objections. Also, any suggestions or recommendations regarding the
route these patches should take are highly appreciated.

Ard.


diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 2d9c63f41031..61c844d4ab48 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -287,7 +287,7 @@ unsigned long read_word_at_a_time(const void *addr)
  * visible to the compiler.
  */
 #define __ADDRESSABLE(sym) \
-   static void * __attribute__((section(".discard"), used))\
+   static void * __attribute__((section(".discard.addressable"), used)) \
__PASTE(__addressable_##sym, __LINE__) = (void *)

 /**


[PATCH 4.17 057/220] powerpc/e500mc: Set assembler machine type to e500mc

2018-07-01 Thread Greg Kroah-Hartman
4.17-stable review patch.  If anyone has any objections, please let me know.

--

From: Michael Jeanson 

commit 69a8405999aa1c489de4b8d349468f0c2b83f093 upstream.

In binutils 2.26 a new opcode for the "wait" instruction was added for the
POWER9 and has precedence over the one specific to the e500mc. Commit
ebf714ff3756 ("powerpc/e500mc: Add support for the wait instruction in
e500_idle") uses this instruction specifically on the e500mc to work around
an erratum.

This results in an invalid instruction in idle_e500 when we build for the
e500mc on binutils >= 2.26 with the default assembler machine type.

Since multiplatform between e500 and non-e500 is not supported, set the
assembler machine type globally when CONFIG_PPC_E500MC=y.

Signed-off-by: Michael Jeanson 
Reviewed-by: Mathieu Desnoyers 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Michael Ellerman 
CC: Kumar Gala 
CC: Vakul Garg 
CC: Scott Wood 
CC: Mathieu Desnoyers 
CC: linuxppc-dev@lists.ozlabs.org
CC: linux-ker...@vger.kernel.org
CC: sta...@vger.kernel.org
Signed-off-by: Michael Ellerman 
Signed-off-by: Greg Kroah-Hartman 

---
 arch/powerpc/Makefile |1 +
 1 file changed, 1 insertion(+)

--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -251,6 +251,7 @@ cpu-as-$(CONFIG_4xx)+= -Wa,-m405
 cpu-as-$(CONFIG_ALTIVEC)   += $(call as-option,-Wa$(comma)-maltivec)
 cpu-as-$(CONFIG_E200)  += -Wa,-me200
 cpu-as-$(CONFIG_PPC_BOOK3S_64) += -Wa,-mpower4
+cpu-as-$(CONFIG_PPC_E500MC)+= $(call as-option,-Wa$(comma)-me500mc)
 
 KBUILD_AFLAGS += $(cpu-as-y)
 KBUILD_CFLAGS += $(cpu-as-y)




Re: [PATCH 1/3] [v2] powerpc: mac: fix rtc read/write functions

2018-07-01 Thread Meelis Roos
A patch for the subject is now upstream. That made me finally take some 
time to test it on my PowerMac G4. The date is OK but I get two warnings 
with backtrace on bootup. Full dmesg below.

[0.00] Total memory = 1024MB; using 2048kB for hash table (at (ptrval))
[0.00] RAM mapped without BATs
[0.00] Linux version 4.18.0-rc2-00223-g1904148a361a (mroos@pohl) (gcc 
version 7.3.0 (Debian 7.3.0-24)) #88 Sun Jul 1 01:39:01 EEST 2018
[0.00] Found UniNorth memory controller & host bridge @ 0xf800 
revision: 0x11
[0.00] Mapped at 0xff7c
[0.00] Found a Keylargo mac-io controller, rev: 3, mapped at 0x(ptrval)
[0.00] Processor NAP mode on idle enabled.
[0.00] PowerMac motherboard: PowerMac G4 Silver
[0.00] Using PowerMac machine description
[0.00] bootconsole [udbg0] enabled
[0.00] -
[0.00] Hash_size = 0x20
[0.00] phys_mem_size = 0x4000
[0.00] dcache_bsize  = 0x20
[0.00] icache_bsize  = 0x20
[0.00] cpu_features  = 0x0401a00a
[0.00]   possible= 0x2f7ff04b
[0.00]   always  = 0x
[0.00] cpu_user_features = 0x9c01 0x
[0.00] mmu_features  = 0x0001
[0.00] Hash  = 0x(ptrval)
[0.00] Hash_mask = 0x7fff
[0.00] -
[0.00] Found UniNorth PCI host bridge at 0xf000. Firmware 
bus number: 0->0
[0.00] PCI host bridge /pci@f000  ranges:
[0.00]  MEM 0xf100..0xf1ff -> 
0xf100 
[0.00]   IO 0xf000..0xf07f -> 0x
[0.00]  MEM 0x9000..0x9fff -> 
0x9000 
[0.00] Found UniNorth PCI host bridge at 0xf200. Firmware 
bus number: 0->0
[0.00] PCI host bridge /pci@f200 (primary) ranges:
[0.00]  MEM 0xf300..0xf3ff -> 
0xf300 
[0.00]   IO 0xf200..0xf27f -> 0x
[0.00]  MEM 0x8000..0x8fff -> 
0x8000 
[0.00] Found UniNorth PCI host bridge at 0xf400. Firmware 
bus number: 0->0
[0.00] PCI host bridge /pci@f400  ranges:
[0.00]  MEM 0xf500..0xf5ff -> 
0xf500 
[0.00]   IO 0xf400..0xf47f -> 0x
[0.00] via-pmu: Server Mode is disabled
[0.00] PMU driver v2 initialized for Core99, firmware: 0c
[0.00] nvram: Checking bank 0...
[0.00] nvram: gen0=134, gen1=135
[0.00] nvram: Active bank is: 1
[0.00] nvram: OF partition at 0x210
[0.00] nvram: XP partition at 0x1220
[0.00] nvram: NR partition at 0x1320
[0.00] Top of RAM: 0x4000, Total RAM: 0x4000
[0.00] Memory hole size: 0MB
[0.00] Zone ranges:
[0.00]   DMA  [mem 0x-0x2fff]
[0.00]   Normal   empty
[0.00]   HighMem  [mem 0x3000-0x3fff]
[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x-0x3fff]
[0.00] Initmem setup node 0 [mem 0x-0x3fff]
[0.00] On node 0 totalpages: 262144
[0.00]   DMA zone: 1536 pages used for memmap
[0.00]   DMA zone: 0 pages reserved
[0.00]   DMA zone: 196608 pages, LIFO batch:31
[0.00]   HighMem zone: 65536 pages, LIFO batch:15
[0.00] pcpu-alloc: s0 r0 d32768 u32768 alloc=1*32768
[0.00] pcpu-alloc: [0] 0 
[0.00] Built 1 zonelists, mobility grouping on.  Total pages: 260608
[0.00] Kernel command line: root=/dev/sda3 ro 
[0.00] Dentry cache hash table entries: 131072 (order: 7, 524288 bytes)
[0.00] Inode-cache hash table entries: 65536 (order: 6, 262144 bytes)
[0.00] Memory: 1029696K/1048576K available (5136K kernel code, 228K 
rwdata, 996K rodata, 208K init, 255K bss, 18880K reserved, 0K cma-reserved, 
262144K highmem)
[0.00] Kernel virtual memory layout:
[0.00]   * 0xfffcf000..0xf000  : fixmap
[0.00]   * 0xff80..0xffc0  : highmem PTEs
[0.00]   * 0xfded8000..0xff80  : early ioremap
[0.00]   * 0xf100..0xfded8000  : vmalloc & ioremap
[0.00] SLUB: HWalign=32, Order=0-3, MinObjects=0, CPUs=1, Nodes=1
[0.00] NR_IRQS: 512, nr_irqs: 512, preallocated irqs: 16
[0.00] mpic: Resetting
[0.00] mpic: Setting up MPIC " MPIC 1   " version 1.2 at 8004, max 
1 CPUs
[0.00] mpic: ISU size: 64, shift: 6, mask: 3f
[0.00] mpic: Initializing for 64 sources
[  

Patch "powerpc/e500mc: Set assembler machine type to e500mc" has been added to the 4.17-stable tree

2018-07-01 Thread gregkh


This is a note to let you know that I've just added the patch titled

powerpc/e500mc: Set assembler machine type to e500mc

to the 4.17-stable tree which can be found at:

http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
 powerpc-e500mc-set-assembler-machine-type-to-e500mc.patch
and it can be found in the queue-4.17 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.


>From 69a8405999aa1c489de4b8d349468f0c2b83f093 Mon Sep 17 00:00:00 2001
From: Michael Jeanson 
Date: Thu, 14 Jun 2018 11:27:42 -0400
Subject: powerpc/e500mc: Set assembler machine type to e500mc

From: Michael Jeanson 

commit 69a8405999aa1c489de4b8d349468f0c2b83f093 upstream.

In binutils 2.26 a new opcode for the "wait" instruction was added for the
POWER9 and has precedence over the one specific to the e500mc. Commit
ebf714ff3756 ("powerpc/e500mc: Add support for the wait instruction in
e500_idle") uses this instruction specifically on the e500mc to work around
an erratum.

This results in an invalid instruction in idle_e500 when we build for the
e500mc on binutils >= 2.26 with the default assembler machine type.

Since multiplatform between e500 and non-e500 is not supported, set the
assembler machine type globally when CONFIG_PPC_E500MC=y.

Signed-off-by: Michael Jeanson 
Reviewed-by: Mathieu Desnoyers 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Michael Ellerman 
CC: Kumar Gala 
CC: Vakul Garg 
CC: Scott Wood 
CC: Mathieu Desnoyers 
CC: linuxppc-dev@lists.ozlabs.org
CC: linux-ker...@vger.kernel.org
CC: sta...@vger.kernel.org
Signed-off-by: Michael Ellerman 
Signed-off-by: Greg Kroah-Hartman 

---
 arch/powerpc/Makefile |1 +
 1 file changed, 1 insertion(+)

--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -251,6 +251,7 @@ cpu-as-$(CONFIG_4xx)+= -Wa,-m405
 cpu-as-$(CONFIG_ALTIVEC)   += $(call as-option,-Wa$(comma)-maltivec)
 cpu-as-$(CONFIG_E200)  += -Wa,-me200
 cpu-as-$(CONFIG_PPC_BOOK3S_64) += -Wa,-mpower4
+cpu-as-$(CONFIG_PPC_E500MC)+= $(call as-option,-Wa$(comma)-me500mc)
 
 KBUILD_AFLAGS += $(cpu-as-y)
 KBUILD_CFLAGS += $(cpu-as-y)


Patches currently in stable-queue which might be from mjean...@efficios.com are

queue-4.17/powerpc-e500mc-set-assembler-machine-type-to-e500mc.patch