Linus,

Please pull the latest ras-core-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git ras-core-for-linus

   # HEAD: 179eb850ac57c06edaed67fc744ba9d902172f96 x86/MCE: Make correctable error detection look at the Deferred bit

The main changes:

 - Various AMD SMCA error parsing/reporting improvements (Yazen Ghannam)

 - Extend Intel CMCI error reporting to more cases (Xie XiuQi)

 Thanks,

        Ingo

------------------>
Xie XiuQi (1):
      x86/MCE: Extend table to report action optional errors through CMCI too

Yazen Ghannam (4):
      x86/mce/AMD: Don't set DEF_INT_TYPE in MSR_CU_DEF_ERR on SMCA systems
      x86/MCE/AMD: Define a function to get SMCA bank type
      x86/MCE: Report only DRAM ECC as memory errors on AMD systems
      x86/MCE: Make correctable error detection look at the Deferred bit


 arch/x86/include/asm/mce.h                |  2 ++
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 +++++++++++++++++---------
 arch/x86/kernel/cpu/mcheck/mce.c          | 17 +++++++++++++----
 arch/x86/kernel/cpu/mcheck/mce_amd.c      | 29 ++++++++++++++++++++++++++++-
 4 files changed, 60 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index b1e8d8db921f..96ea4b5ba658 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -376,6 +376,7 @@ struct smca_bank {
 extern struct smca_bank smca_banks[MAX_NR_BANKS];
 
 extern const char *smca_get_long_name(enum smca_bank_types t);
+extern bool amd_mce_is_memory_error(struct mce *m);
 
 extern int mce_threshold_create_device(unsigned int cpu);
 extern int mce_threshold_remove_device(unsigned int cpu);
@@ -384,6 +385,7 @@ extern int mce_threshold_remove_device(unsigned int cpu);
 
 static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
 static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
+static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
 
 #endif
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 4ca632a06e0b..5bbd06f38ff6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -59,6 +59,7 @@ static struct severity {
 #define  MCGMASK(x, y) .mcgmask = x, .mcgres = y
 #define  MASK(x, y)    .mask = x, .result = y
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
 #define        MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
 
@@ -101,6 +102,22 @@ static struct severity {
                NOSER, BITCLR(MCI_STATUS_UC)
                ),
 
+       /*
+        * known AO MCACODs reported via MCE or CMC:
+        *
+        * SRAO could be signaled either via a machine check exception or
+        * CMCI with the corresponding bit S 1 or 0. So we don't need to
+        * check bit S for SRAO.
+        */
+       MCESEV(
+               AO, "Action optional: memory scrubbing error",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, 
MCI_STATUS_UC|MCACOD_SCRUB)
+               ),
+       MCESEV(
+               AO, "Action optional: last level cache writeback error",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, 
MCI_STATUS_UC|MCACOD_L3WB)
+               ),
+
        /* ignore OVER for UCNA */
        MCESEV(
                UCNA, "Uncorrected no action required",
@@ -149,15 +166,6 @@ static struct severity {
                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
                ),
 
-       /* known AO MCACODs: */
-       MCESEV(
-               AO, "Action optional: memory scrubbing error",
-               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, 
MCI_UC_S|MCACOD_SCRUB)
-               ),
-       MCESEV(
-               AO, "Action optional: last level cache writeback error",
-               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, 
MCI_UC_S|MCACOD_L3WB)
-               ),
        MCESEV(
                SOME, "Action optional: unknown MCACOD",
                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index b1d616d08eee..1b2c11473376 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -503,10 +503,8 @@ static int mce_usable_address(struct mce *m)
 bool mce_is_memory_error(struct mce *m)
 {
        if (m->cpuvendor == X86_VENDOR_AMD) {
-               /* ErrCodeExt[20:16] */
-               u8 xec = (m->status >> 16) & 0x1f;
+               return amd_mce_is_memory_error(m);
 
-               return (xec == 0x0 || xec == 0x8);
        } else if (m->cpuvendor == X86_VENDOR_INTEL) {
                /*
                 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
@@ -530,6 +528,17 @@ bool mce_is_memory_error(struct mce *m)
 }
 EXPORT_SYMBOL_GPL(mce_is_memory_error);
 
+static bool mce_is_correctable(struct mce *m)
+{
+       /* The Deferred bit is only consulted for AMD-reported errors. */
+       bool amd_deferred = m->cpuvendor == X86_VENDOR_AMD &&
+                           (m->status & MCI_STATUS_DEFERRED);
+       bool uncorrected = !!(m->status & MCI_STATUS_UC);
+
+       /* Correctable means neither deferred (AMD) nor uncorrected. */
+       return !amd_deferred && !uncorrected;
+}
+
 static bool cec_add_mce(struct mce *m)
 {
        if (!m)
@@ -537,7 +546,7 @@ static bool cec_add_mce(struct mce *m)
 
        /* We eat only correctable DRAM errors with usable addresses. */
        if (mce_is_memory_error(m) &&
-           !(m->status & MCI_STATUS_UC) &&
+           mce_is_correctable(m)  &&
            mce_usable_address(m))
                if (!cec_add_elem(m->addr >> PAGE_SHIFT))
                        return true;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 486f640b02ef..0f32ad242324 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -110,6 +110,20 @@ const char *smca_get_long_name(enum smca_bank_types t)
 }
 EXPORT_SYMBOL_GPL(smca_get_long_name);
 
+/* Bank type of m->bank; N_SMCA_BANK_TYPES if out of range or no hwid. */
+static enum smca_bank_types smca_get_bank_type(struct mce *m)
+{
+       struct smca_bank *b;
+
+       /* smca_banks[] has MAX_NR_BANKS entries and is indexed by bank. */
+       if (m->bank >= MAX_NR_BANKS)
+               return N_SMCA_BANK_TYPES;
+
+       b = &smca_banks[m->bank];
+
+       return b->hwid ? b->hwid->bank_type : N_SMCA_BANK_TYPES;
+}
+
 static struct smca_hwid smca_hwid_mcatypes[] = {
        /* { bank_type, hwid_mcatype, xec_bitmap } */
 
@@ -407,7 +421,9 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
            (deferred_error_int_vector != amd_deferred_error_interrupt))
                deferred_error_int_vector = amd_deferred_error_interrupt;
 
-       low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
+       if (!mce_flags.smca)
+               low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
+
        wrmsr(MSR_CU_DEF_ERR, low, high);
 }
 
@@ -738,6 +754,17 @@ int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
 }
 EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
 
+bool amd_mce_is_memory_error(struct mce *m)
+{
+       /* Extended error code, i.e. status bits [20:16]. */
+       const u8 ext_code = (m->status >> 16) & 0x1f;
+
+       if (!mce_flags.smca)            /* non-SMCA: bank 4, code 0x8 */
+               return m->bank == 4 && ext_code == 0x8;
+
+       return smca_get_bank_type(m) == SMCA_UMC && ext_code == 0x0;
+}
+
 static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
 {
        struct mce m;

Reply via email to