On Tue, Apr 25, 2017 at 02:16:11PM -0500, Yazen Ghannam wrote: > From: Yazen Ghannam <yazen.ghan...@amd.com> > > We have support for the new SMCA MCA_DE{STAT,ADDR} registers in Linux. So > we've used these registers in place of MCA_{STATUS,ADDR} on SMCA systems. > However, the guidance for current implementations of SMCA is to continue > using MCA_{STATUS,ADDR} and to use MCA_DE{STAT,ADDR} only if a Deferred > error was not found in the former registers. If we logged a Deferred error > in MCA_STATUS then we should also clear MCA_DESTAT. This also means we > shouldn't clear MCA_CONFIG[LogDeferredInMcaStat]. > > Don't clear MCA_CONFIG[LogDeferredInMcaStat] during AMD mcheck init. > > Don't break after finding the first error in either the Deferred or > Thresholding interrupt handlers. > > Rework __log_error() to only log error. Most valid checks are moved into > __log_error_* helper functions. > > Write the Deferred error __log_error_* helper function to follow the > guidance for current SMCA systems. > > Signed-off-by: Yazen Ghannam <yazen.ghan...@amd.com> > ---
... > +static void > +__log_error_deferred(unsigned int bank) > +{ > + u64 status, addr; > + bool logged_deferred = false; > + > + rdmsrl(msr_ops.status(bank), status); > + > + /* Log any valid error we find. */ > + if (status & MCI_STATUS_VAL) { > + rdmsrl(msr_ops.addr(bank), addr); Check ADDRV here and in all cases below before reading MCi_ADDR. > + > + __log_error(bank, status, addr, 0); > + > + wrmsrl(msr_ops.status(bank), 0); > + > + logged_deferred = !!(status & MCI_STATUS_DEFERRED); > + } > + > + if (!mce_flags.smca) > + return; > + > + /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */ > + if (logged_deferred) { if (status & MCI_STATUS_DEFERRED) is fine and you can drop the bool. Oh well, here's an updated version with those suggestions incorporated. Also, I've carved out the common functionality into a _log_error_bank() which is more compact. Now it is even easier to follow what log_error_deferred() does. diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 41439ab41102..3688a7fbb0bb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -472,20 +472,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, smca_high |= BIT(0); /* - * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR} - * registers with the option of additionally logging to - * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set. - * - * This bit is usually set by BIOS to retain the old behavior - * for OSes that don't use the new registers. Linux supports the - * new registers so let's disable that additional logging here. - * - * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high - * portion of the MSR). - */ - smca_high &= ~BIT(2); - - /* * SMCA sets the Deferred Error Interrupt type per bank. * * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us @@ -756,36 +742,19 @@ int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); static void -__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) +__log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { - u32 msr_status = msr_ops.status(bank); - u32 msr_addr = msr_ops.addr(bank); struct mce m; - u64 status; - - WARN_ON_ONCE(deferred_err && threshold_err); - - if (deferred_err && mce_flags.smca) { - msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank); - msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank); - } - - rdmsrl(msr_status, status); - - if (!(status & MCI_STATUS_VAL)) - return; mce_setup(&m); m.status = status; + m.misc = misc; m.bank = bank; m.tsc = rdtsc(); - if (threshold_err) - m.misc = misc; - if (m.status & MCI_STATUS_ADDRV) { - rdmsrl(msr_addr, m.addr); + m.addr = addr; /* * Extract [55:<lsb>] where lsb is the least significant @@ -806,8 +775,6 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) } mce_log(&m); - - wrmsrl(msr_status, 0); } static inline void __smp_deferred_error_interrupt(void) @@ -832,45 +799,86 @@ asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) exiting_ack_irq(); } -/* APIC interrupt handler for deferred errors */ -static void amd_deferred_error_interrupt(void) +/* + * Returns true if the logged error is deferred. False, otherwise. + */ +static inline bool +_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) { - unsigned int bank; - u32 msr_status; - u64 status; + u64 status, addr = 0; - for (bank = 0; bank < mca_cfg.banks; ++bank) { - msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank) - : msr_ops.status(bank); + rdmsrl(msr_stat, status); - rdmsrl(msr_status, status); + if (!(status & MCI_STATUS_VAL)) + return false; - if (!(status & MCI_STATUS_VAL) || - !(status & MCI_STATUS_DEFERRED)) - continue; + if (status & MCI_STATUS_ADDRV) + rdmsrl(msr_addr, addr); - __log_error(bank, true, false, 0); - break; - } + __log_error(bank, status, addr, misc); + + wrmsrl(status, 0); + + return status & MCI_STATUS_DEFERRED; } /* - * APIC Interrupt Handler + * We have three scenarios for checking for Deferred errors: + * + * 1) Non-SMCA systems check MCA_STATUS and log error if found. + * 2) SMCA systems check MCA_STATUS. If error is found then log it and also + * clear MCA_DESTAT. + * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and + * log it. */ +static void log_error_deferred(unsigned int bank) +{ + bool defrd; + + defrd = _log_error_bank(bank, msr_ops.status(bank), + msr_ops.addr(bank), 0); + + if (!mce_flags.smca) + return; + + /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */ + if (defrd) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + return; + } + + /* + * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check + * for a valid error. + */ + _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), + MSR_AMD64_SMCA_MCx_DEADDR(bank), 0); +} + +/* APIC interrupt handler for deferred errors */ +static void amd_deferred_error_interrupt(void) +{ + unsigned int bank; + + for (bank = 0; bank < mca_cfg.banks; ++bank) + log_error_deferred(bank); +} + +static void log_error_thresholding(unsigned int bank, u64 misc) +{ + _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc); +} /* - * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. - * the interrupt goes off when error_count reaches threshold_limit. - * the handler will simply log mcelog w/ software defined bank number. + * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt + * goes off when error_count reaches threshold_limit. */ - static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; unsigned int bank, block, cpu = smp_processor_id(); struct thresh_restart tr; - /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; @@ -893,23 +901,18 @@ static void amd_threshold_interrupt(void) (high & MASK_LOCKED_HI)) continue; - /* - * Log the machine check that caused the threshold - * event. - */ - if (high & MASK_OVERFLOW_HI) - goto log; - } - } - return; + if (!(high & MASK_OVERFLOW_HI)) + continue; -log: - __log_error(bank, false, true, ((u64)high << 32) | low); + /* Log the MCE which caused the threshold event. */ + log_error_thresholding(bank, ((u64)high << 32) | low); - /* Reset threshold block after logging error. */ - memset(&tr, 0, sizeof(tr)); - tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block]; - threshold_restart_bank(&tr); + /* Reset threshold block after logging error. */ + memset(&tr, 0, sizeof(tr)); + tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block]; + threshold_restart_bank(&tr); + } + } } /* -- 2.11.0 -- Regards/Gruss, Boris. Good mailing practices for 400: avoid top-posting and trim the reply.