For Scalable MCA enabled processors, errors are listed per IP block. Since an IP is not required to map to a particular bank, we need to use the HWID and McaType values from the MCx_IPID register to figure out which IP a given bank represents.
We also have a new bit (TCC) in the MCx_STATUS register to indicate Task context is corrupt. Add logic here to decode errors from all known IP blocks for Fam17h Model 00-0fh and to print TCC errors. Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrish...@amd.com> --- arch/x86/include/asm/mce.h | 50 ++++++ arch/x86/include/asm/msr-index.h | 2 + arch/x86/kernel/cpu/mcheck/mce_amd.c | 11 ++ drivers/edac/mce_amd.c | 327 ++++++++++++++++++++++++++++++++++- 4 files changed, 389 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 2ea4527..2ec67ac 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -42,6 +42,17 @@ /* AMD-specific bits */ #define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ +#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ +/* + * McaX field if set indicates a given bank supports MCA extensions: + * - Deferred error interrupt type is specifiable by bank + * - BlkPtr field indicates presence of extended MISC registers. 
+ * But should not be used to determine MSR numbers + * - TCC bit is present in MCx_STATUS + */ +#define MCI_CONFIG_MCAX 0x1 +#define MCI_IPID_MCATYPE 0xFFFF0000 +#define MCI_IPID_HWID 0xFFF /* * Note that the full MCACOD field of IA32_MCi_STATUS MSR is @@ -287,4 +298,43 @@ struct cper_sec_mem_err; extern void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err); +/* + * Enumerating new IP types and HWID values + * in ScalableMCA enabled AMD processors + */ +#ifdef CONFIG_X86_MCE_AMD +enum ip_types { + F17H_CORE = 0, /* Core errors */ + DF, /* Data Fabric */ + UMC, /* Unified Memory Controller */ + FUSE, /* FUSE subsystem */ + PSP, /* Platform Security Processor */ + SMU, /* System Management Unit */ + N_IP_TYPES +}; + +struct hwid { + const char *ipname; + unsigned int hwid_value; +}; + +extern struct hwid hwid_mappings[N_IP_TYPES]; + +enum core_mcatypes { + LS = 0, /* Load Store */ + IF, /* Instruction Fetch */ + L2_CACHE, /* L2 cache */ + DE, /* Decoder unit */ + RES, /* Reserved */ + EX, /* Execution unit */ + FP, /* Floating Point */ + L3_CACHE /* L3 cache */ +}; + +enum df_mcatypes { + CS = 0, /* Coherent Slave */ + PIE /* Power management, Interrupts, etc */ +}; +#endif + #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 5523465..93bccbc 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -266,7 +266,9 @@ /* 'SMCA': AMD64 Scalable MCA */ #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 +#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 88de27b..8169103 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ 
b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -71,6 +71,17 @@ static const char * const th_names[] = { "execution_unit", }; +/* Defining HWID to IP type mappings for Scalable MCA */ +struct hwid hwid_mappings[] = { + [F17H_CORE] = { "f17h_core", 0xB0 }, + [DF] = { "df", 0x2E }, + [UMC] = { "umc", 0x96 }, + [FUSE] = { "fuse", 0x5 }, + [PSP] = { "psp", 0xFF }, + [SMU] = { "smu", 0x1 }, +}; +EXPORT_SYMBOL_GPL(hwid_mappings); + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index e3a945c..6e6b327 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -147,6 +147,136 @@ static const char * const mc6_mce_desc[] = { "Status Register File", }; +/* Scalable MCA error strings */ + +static const char * const f17h_ls_mce_desc[] = { + "Load queue parity", + "Store queue parity", + "Miss address buffer payload parity", + "L1 TLB parity", + "", /* reserved */ + "DC tag error type 6", + "DC tag error type 1", + "Internal error type 1", + "Internal error type 2", + "Sys Read data error thread 0", + "Sys read data error thread 1", + "DC tag error type 2", + "DC data error type 1 (poison comsumption)", + "DC data error type 2", + "DC data error type 3", + "DC tag error type 4", + "L2 TLB parity", + "PDC parity error", + "DC tag error type 3", + "DC tag error type 5", + "L2 fill data error", +}; + +static const char * const f17h_if_mce_desc[] = { + "microtag probe port parity error", + "IC microtag or full tag multi-hit error", + "IC full tag parity", + "IC data array parity", + "Decoupling queue phys addr parity error", + "L0 ITLB parity error", + "L1 ITLB parity error", + "L2 ITLB parity error", + "BPQ snoop parity on Thread 0", + "BPQ snoop parity on Thread 1", + "L1 BTB multi-match error", + "L2 BTB multi-match error", +}; + +static const char * const f17h_l2_mce_desc[] = { + "L2M tag multi-way-hit error", + "L2M tag 
ECC error", + "L2M data ECC error", + "HW assert", +}; + +static const char * const f17h_de_mce_desc[] = { + "uop cache tag parity error", + "uop cache data parity error", + "Insn buffer parity error", + "Insn dispatch queue parity error", + "Fetch address FIFO parity", + "Patch RAM data parity", + "Patch RAM sequencer parity", + "uop buffer parity" +}; + +static const char * const f17h_ex_mce_desc[] = { + "Watchdog timeout error", + "Phy register file parity", + "Flag register file parity", + "Immediate displacement register file parity", + "Address generator payload parity", + "EX payload parity", + "Checkpoint queue parity", + "Retire dispatch queue parity", +}; + +static const char * const f17h_fp_mce_desc[] = { + "Physical register file parity", + "Freelist parity error", + "Schedule queue parity", + "NSQ parity error", + "Retire queue parity", + "Status register file parity", +}; + +static const char * const f17h_l3_mce_desc[] = { + "Shadow tag macro ECC error", + "Shadow tag macro multi-way-hit error", + "L3M tag ECC error", + "L3M tag multi-way-hit error", + "L3M data ECC error", + "XI parity, L3 fill done channel error", + "L3 victim queue parity", + "L3 HW assert", +}; + +static const char * const f17h_cs_mce_desc[] = { + "Illegal request from transport layer", + "Address violation", + "Security violation", + "Illegal response from transport layer", + "Unexpected response", + "Parity error on incoming request or probe response data", + "Parity error on incoming read response data", + "Atomic request parity", + "ECC error on probe filter access", +}; + +static const char * const f17h_pie_mce_desc[] = { + "HW assert", + "Internal PIE register security violation", + "Error on GMI link", + "Poison data written to internal PIE register", +}; + +static const char * const f17h_umc_mce_desc[] = { + "DRAM ECC error", + "Data poison error on DRAM", + "SDP parity error", + "Advanced peripheral bus error", + "Command/address parity error", + "Write data CRC error", 
+}; + +static const char * const f17h_fuse_mce_desc[] = { + "FUSE RAM ECC error", +}; + +static const char * const f17h_psp_mce_desc[] = { + "PSP RAM ECC or parity error", +}; + +static const char * const f17h_smu_mce_desc[] = { + "SMU RAM ECC or parity error", +}; + static bool f12h_mc0_mce(u16 ec, u8 xec) { bool ret = false; @@ -731,6 +861,178 @@ static bool amd_filter_mce(struct mce *m) return false; } +static void decode_f17hcore_errors(u8 xec, unsigned int mca_type) +{ + switch (mca_type) { + case LS: + if (xec == 0x4 || xec > (ARRAY_SIZE(f17h_ls_mce_desc) - 1)) + goto wrong_f17hcore_error; + + pr_cont("%s.\n", f17h_ls_mce_desc[xec]); + break; + + case IF: + if (xec > (ARRAY_SIZE(f17h_if_mce_desc) - 1)) + goto wrong_f17hcore_error; + + pr_cont("%s.\n", f17h_if_mce_desc[xec]); + break; + + case L2_CACHE: + if (xec > (ARRAY_SIZE(f17h_l2_mce_desc) - 1)) + goto wrong_f17hcore_error; + + pr_cont("%s.\n", f17h_l2_mce_desc[xec]); + break; + + case DE: + if (xec > (ARRAY_SIZE(f17h_de_mce_desc) - 1)) + goto wrong_f17hcore_error; + + pr_cont("%s.\n", f17h_de_mce_desc[xec]); + break; + + case EX: + if (xec > (ARRAY_SIZE(f17h_ex_mce_desc) - 1)) + goto wrong_f17hcore_error; + + pr_cont("%s.\n", f17h_ex_mce_desc[xec]); + break; + + case FP: + if (xec > (ARRAY_SIZE(f17h_fp_mce_desc) - 1)) + goto wrong_f17hcore_error; + + pr_cont("%s.\n", f17h_fp_mce_desc[xec]); + break; + + case L3_CACHE: + if (xec > (ARRAY_SIZE(f17h_l3_mce_desc) - 1)) + goto wrong_f17hcore_error; + + pr_cont("%s.\n", f17h_l3_mce_desc[xec]); + break; + + default: + goto wrong_f17hcore_error; + } + + return; + +wrong_f17hcore_error: + pr_cont("Unrecognized error code from %s MCA bank\n", + (mca_type == L3_CACHE) ? 
"L3 Cache" : "F17h Core"); +} + +static void decode_df_errors(u8 xec, unsigned int mca_type) +{ + switch (mca_type) { + case CS: + if (xec > (ARRAY_SIZE(f17h_cs_mce_desc) - 1)) + goto wrong_df_error; + + pr_cont("%s.\n", f17h_cs_mce_desc[xec]); + break; + + case PIE: + if (xec > (ARRAY_SIZE(f17h_pie_mce_desc) - 1)) + goto wrong_df_error; + + pr_cont("%s.\n", f17h_pie_mce_desc[xec]); + break; + + default: + goto wrong_df_error; + } + + return; + +wrong_df_error: + pr_cont("Unrecognized error code from DF MCA bank\n"); +} + +/* Decode errors according to Scalable MCA specification */ +static void decode_smca_errors(struct mce *m) +{ + u32 low, high; + u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank); + unsigned int hwid, mca_type, i; + u8 xec = XEC(m->status, xec_mask); + + if (rdmsr_safe(addr, &low, &high)) { + pr_emerg("Unable to decode errors from banks\n"); + return; + } + + hwid = high & MCI_IPID_HWID; + mca_type = (high & MCI_IPID_MCATYPE) >> 16; + + pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", + m->bank, high, low); + + /* + * Based on hwid and mca_type values, + * decode errors from respective IPs. + * Note: mca_type values make sense only + * in the context of an hwid + */ + for (i = 0; i < ARRAY_SIZE(hwid_mappings); i++) + if (hwid_mappings[i].hwid_value == hwid) + break; + + switch (i) { + case F17H_CORE: + pr_emerg(HW_ERR "%s Error: ", + (mca_type == L3_CACHE) ? 
"L3 Cache" : "F17h Core"); + decode_f17hcore_errors(xec, mca_type); + break; + + case DF: + pr_emerg(HW_ERR "DF Error: "); + decode_df_errors(xec, mca_type); + break; + + case UMC: + pr_emerg(HW_ERR "UMC Error: "); + if (xec > (ARRAY_SIZE(f17h_umc_mce_desc) - 1)) { + pr_cont("Unrecognized error code from UMC MCA bank\n"); + return; + } + pr_cont("%s.\n", f17h_umc_mce_desc[xec]); + break; + + case FUSE: + pr_emerg(HW_ERR "FUSE Error: "); + if (xec > (ARRAY_SIZE(f17h_fuse_mce_desc) - 1)) { + pr_cont("Unrecognized error code from FUSE MCA bank\n"); + return; + } + pr_cont("%s.\n", f17h_fuse_mce_desc[xec]); + break; + + case PSP: + pr_emerg(HW_ERR "PSP Error: "); + if (xec > (ARRAY_SIZE(f17h_psp_mce_desc) - 1)) { + pr_cont("Unrecognized error code from PSP MCA bank\n"); + return; + } + pr_cont("%s.\n", f17h_psp_mce_desc[xec]); + break; + + case SMU: + pr_emerg(HW_ERR "SMU Error: "); + if (xec > (ARRAY_SIZE(f17h_smu_mce_desc) - 1)) { + pr_cont("Unrecognized error code from SMU MCA bank\n"); + return; + } + pr_cont("%s.\n", f17h_smu_mce_desc[xec]); + break; + + default: + pr_emerg(HW_ERR "HWID:%d does not match any existing IPs\n", hwid); + } +} + static const char *decode_error_status(struct mce *m) { if (m->status & MCI_STATUS_UC) { @@ -769,11 +1071,21 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - if (c->x86 == 0x15 || c->x86 == 0x16) + if (c->x86 >= 0x15) pr_cont("|%s|%s", ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), ((m->status & MCI_STATUS_POISON) ? "Poison" : "-")); + if (mce_flags.smca) { + u32 smca_low, smca_high; + u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); + + if (!rdmsr_safe(smca_addr, &smca_low, &smca_high) && + (smca_low & MCI_CONFIG_MCAX)) + pr_cont("|%s", + ((m->status & MCI_STATUS_TCC) ? 
"TCC" : "-")); + } + /* do the two bits[14:13] together */ ecc = (m->status >> 45) & 0x3; if (ecc) @@ -784,6 +1096,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (m->status & MCI_STATUS_ADDRV) pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr); + if (mce_flags.smca) { + decode_smca_errors(m); + goto err_code; + } + if (!fam_ops) goto err_code; @@ -888,6 +1205,14 @@ static int __init mce_amd_init(void) fam_ops->mc2_mce = f16h_mc2_mce; break; + case 0x17: + xec_mask = 0x3f; + if (!mce_flags.smca) { + printk(KERN_WARNING "Decoding supported only on Scalable MCA enabled processors\n"); + return 0; + } + break; + default: printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); kfree(fam_ops); -- 2.7.0