On Fri, Aug 28, 2020 at 01:21:50PM -0700, Luck, Tony wrote:
> +static void adjust_mce_log(struct mce *m)
> +{
> +     struct cpuinfo_x86 *c = &boot_cpu_data;
> +
> +     if (c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 6 &&
> +         c->x86_model == INTEL_FAM6_SKYLAKE_X && c->x86_stepping >= 4) {
> +             /*
> +              * Check the error code to see if this is an uncorrected patrol
> +              * scrub error from one of the memory controller banks. If so,
> +              * then adjust the severity level to MCE_AO_SEVERITY
> +              */
> +             if (((m->status & MCACOD_SCRUBMSK) == MCACOD_SCRUB) &&
> +                 ((m->status & MSCOD_MASK) == MSCOD_UCE_SCRUB) &&
> +                 m->bank >= 13 && m->bank <= 18)
> +                     m->severity = MCE_AO_SEVERITY;
> +     }
> +}
> +
>  DEFINE_PER_CPU(unsigned, mce_poll_count);
>  
>  /*
> @@ -772,6 +801,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
>               if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
>                       goto clear_it;
>  
> +             adjust_mce_log(&m);
>               mce_log(&m);

Coming back to this and looking at it, I can't say that I like it. We're
sticking hooks to look at and massage the logged data everywhere on the
MCE processing path and it is getting really unwieldy.

And after staring at this a bit, it looks like all it wants to do is to
adjust the severity. And we have a severity grading mechanism. So let's
see how ugly it would become if we extended it to check that too.

So how's that below instead?

It builds here, I haven't even thought about testing it and I might've
missed some aspects, but tbh this looks much better to me because
it is not bolted onto the handling path but is an integral part of it.

Thoughts?

---
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index e1da619add19..8c1a41aa5e40 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -9,9 +9,11 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/debugfs.h>
-#include <asm/mce.h>
 #include <linux/uaccess.h>
 
+#include <asm/mce.h>
+#include <asm/intel-family.h>
+
 #include "internal.h"
 
 /*
@@ -40,9 +42,14 @@ static struct severity {
        unsigned char context;
        unsigned char excp;
        unsigned char covered;
+       unsigned char cpu_model;
+       unsigned char cpu_stepping;
+       unsigned char bank_lo, bank_hi;
        char *msg;
 } severities[] = {
 #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
+#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h
+#define MODEL_STEPPING(m,s) .cpu_model = m, .cpu_stepping = s
 #define  KERNEL                .context = IN_KERNEL
 #define  USER          .context = IN_USER
 #define  KERNEL_RECOV  .context = IN_KERNEL_RECOV
@@ -97,7 +104,10 @@ static struct severity {
                KEEP, "Corrected error",
                NOSER, BITCLR(MCI_STATUS_UC)
                ),
-
+       MCESEV(AO, "UnCorrected Patrol Scrub Error",
+               NOSER, MASK(0xffffeff0, 0x001000c0),
+               MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4),BANK_RANGE(13,18)
+       ),
        /*
         * known AO MCACODs reported via MCE or CMC:
         *
@@ -324,6 +334,12 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e
                        continue;
                if (s->excp && excp != s->excp)
                        continue;
+               if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model)
+                       continue;
+               if (s->cpu_stepping && boot_cpu_data.x86_stepping <= s->cpu_stepping)
+                       continue;
+               if (s->bank_lo && (s->bank_lo <= m->bank && m->bank <= s->bank_hi))
+                       continue;
                if (msg)
                        *msg = s->msg;
                s->covered = 1;

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

Reply via email to