arch/x86/kernel/cpu/mce/core.c | 67 ++++++++++++++++++++++------------
1 file changed, 44 insertions(+), 23 deletions(-)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index f43a78bde670..3a842b3773b3 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -282,20 +282,35 @@ static int fake_panic;
static atomic_t mce_fake_panicked;
/* Panic in progress. Enable interrupts and wait for final IPI */
-static void wait_for_panic(void)
+static void wait_for_panic(struct pt_regs *regs)
{
long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
preempt_disable();
local_irq_enable();
- while (timeout-- > 0)
+ while (timeout-- > 0) {
+ /*
+ * We are in an NMI, waiting to be stopped by the
+ * handling processor. For kdump handling, we need to
+ * monitor crash_ipi_issued, since that is what kdump
+ * uses for its NMI stop. But we also need interrupts
+ * enabled part of the time so that RESET_VECTOR can
+ * interrupt us on a normal shutdown.
+ */
+ local_irq_disable();
+ run_crash_ipi_callback(regs);
+ local_irq_enable();
+
udelay(1);
+ }
if (panic_timeout == 0)
panic_timeout = mca_cfg.panic_timeout;
panic("Panicing machine check CPU died");
}
-static void mce_panic(const char *msg, struct mce *final, char *exp)
+static void mce_panic(const char *msg, struct mce *final, char *exp,
+ struct pt_regs *regs)
{
int apei_err = 0;
struct llist_node *pending;
@@ -306,7 +321,7 @@ static int mce_panic(const char *msg, struct mce *final, char *exp)
* Make sure only one CPU runs in machine check panic
*/
if (atomic_inc_return(&mce_panicked) > 1)
- wait_for_panic();
+ wait_for_panic(regs);
barrier();
bust_spinlocks(1);
@@ -817,7 +832,7 @@ static atomic_t mce_callin;
/*
* Check if a timeout waiting for other CPUs happened.
*/
-static int mce_timed_out(u64 *t, const char *msg)
+static int mce_timed_out(u64 *t, const char *msg, struct pt_regs *regs)
{
/*
* The others already did panic for some reason.
@@ -827,12 +842,12 @@ static int mce_timed_out(u64 *t, const char *msg)
*/
rmb();
if (atomic_read(&mce_panicked))
- wait_for_panic();
+ wait_for_panic(regs);
if (!mca_cfg.monarch_timeout)
goto out;
if ((s64)*t < SPINUNIT) {
if (mca_cfg.tolerant <= 1)
- mce_panic(msg, NULL, NULL);
+ mce_panic(msg, NULL, NULL, regs);
cpu_missing = 1;
return 1;
}
@@ -866,7 +881,7 @@ static int mce_timed_out(u64 *t, const char *msg)
* All the spin loops have timeouts; when a timeout happens a CPU
* typically elects itself to be Monarch.
*/
-static void mce_reign(void)
+static void mce_reign(struct pt_regs *regs)
{
int cpu;
struct mce *m = NULL;
@@ -896,7 +911,7 @@ static void mce_reign(void)
* other CPUs.
*/
if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
- mce_panic("Fatal machine check", m, msg);
+ mce_panic("Fatal machine check", m, msg, regs);
/*
* For UC somewhere we let the CPU who detects it handle it.
@@ -909,7 +924,8 @@ static void mce_reign(void)
* source or one CPU is hung. Panic.
*/
if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
- mce_panic("Fatal machine check from unknown source", NULL,
NULL);
+ mce_panic("Fatal machine check from unknown source", NULL, NULL,
+ regs);
/*
* Now clear all the mces_seen so that they don't reappear on
@@ -928,7 +944,7 @@ static atomic_t global_nwo;
* in the entry order.
* TBD double check parallel CPU hotunplug
*/
-static int mce_start(int *no_way_out)
+static int mce_start(int *no_way_out, struct pt_regs *regs)
{
int order;
int cpus = num_online_cpus();
@@ -949,7 +965,8 @@ static int mce_start(int *no_way_out)
*/
while (atomic_read(&mce_callin) != cpus) {
if (mce_timed_out(&timeout,
- "Timeout: Not all CPUs entered broadcast exception
handler")) {
+ "Timeout: Not all CPUs entered broadcast exception
handler",
+ regs)) {
atomic_set(&global_nwo, 0);
return -1;
}
@@ -975,7 +992,8 @@ static int mce_start(int *no_way_out)
*/
while (atomic_read(&mce_executing) < order) {
if (mce_timed_out(&timeout,
- "Timeout: Subject CPUs unable to finish
machine check processing")) {
+ "Timeout: Subject CPUs unable to finish
machine check processing",
+ regs)) {
atomic_set(&global_nwo, 0);
return -1;
}
@@ -995,7 +1013,7 @@ static int mce_start(int *no_way_out)
* Synchronize between CPUs after main scanning loop.
* This invokes the bulk of the Monarch processing.
*/
-static int mce_end(int order)
+static int mce_end(int order, struct pt_regs *regs)
{
int ret = -1;
u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
@@ -1020,12 +1038,13 @@ static int mce_end(int order)
*/
while (atomic_read(&mce_executing) <= cpus) {
if (mce_timed_out(&timeout,
- "Timeout: Monarch CPU unable to finish
machine check processing"))
+ "Timeout: Monarch CPU unable to finish
machine check processing",
+ regs))
goto reset;
ndelay(SPINUNIT);
}
- mce_reign();
+ mce_reign(regs);
barrier();
ret = 0;
} else {
@@ -1034,7 +1053,8 @@ static int mce_end(int order)
*/
while (atomic_read(&mce_executing) != 0) {
if (mce_timed_out(&timeout,
- "Timeout: Monarch CPU did not finish
machine check processing"))
+ "Timeout: Monarch CPU did not finish
machine check processing",
+ regs))
goto reset;
ndelay(SPINUNIT);
}
@@ -1286,9 +1306,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
*/
if (lmce) {
if (no_way_out)
- mce_panic("Fatal local machine check", &m, msg);
+ mce_panic("Fatal local machine check", &m, msg, regs);
} else {
- order = mce_start(&no_way_out);
+ order = mce_start(&no_way_out, regs);
}
__mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
@@ -1301,7 +1321,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
* When there's any problem use only local no_way_out state.
*/
if (!lmce) {
- if (mce_end(order) < 0)
+ if (mce_end(order, regs) < 0)
no_way_out = worst >= MCE_PANIC_SEVERITY;
} else {
/*
@@ -1314,7 +1334,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
*/
if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
mce_severity(&m, cfg->tolerant, &msg, true);
- mce_panic("Local fatal machine check!", &m, msg);
+ mce_panic("Local fatal machine check!", &m, msg, regs);
}
}
@@ -1325,7 +1345,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
if (cfg->tolerant == 3)
kill_it = 0;
else if (no_way_out)
- mce_panic("Fatal machine check on current CPU", &m, msg);
+ mce_panic("Fatal machine check on current CPU", &m, msg, regs);
if (worst > 0)
irq_work_queue(&mce_irq_work);
@@ -1361,7 +1381,8 @@ noinstr void do_machine_check(struct pt_regs *regs)
*/
if (m.kflags & MCE_IN_KERNEL_RECOV) {
if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
- mce_panic("Failed kernel mode recovery", &m,
msg);
+ mce_panic("Failed kernel mode recovery", &m,
+ msg, regs);
}
}
}
--
2.17.1
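
For context: the spin-and-poll pattern that wait_for_panic() adopts here
mirrors what the x86 nmi_panic() self-stop path already does. A simplified
sketch of that pattern (after nmi_panic_self_stop() in
arch/x86/kernel/reboot.c; paraphrased, not a verbatim quote):

    /*
     * A CPU parked during an NMI panic keeps polling for a kdump
     * crash IPI so that a later shutdown request is not lost.
     * run_crash_ipi_callback() checks crash_ipi_issued and, once
     * kdump has issued its NMI stop, saves this CPU's register
     * state and halts it.
     */
    void nmi_panic_self_stop(struct pt_regs *regs)
    {
            while (1) {
                    run_crash_ipi_callback(regs);
                    cpu_relax();
            }
    }

wait_for_panic() differs in that it re-enables interrupts between polls,
so a conventional reboot vector can still reach the CPU, and in that it
eventually times out and panics on its own.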
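
The Monarch rendezvous that mce_start()/mce_end() implement, and into
which mce_timed_out() now threads regs, can be pictured with a small
userspace demo. This is a hypothetical illustration only: the names
callin/executing and the sched_yield() loops are stand-ins for the
kernel's mce_callin/mce_executing spin loops with timeouts.

    /* Build with: cc -pthread demo.c */
    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define NCPUS 4

    static atomic_int callin;    /* CPUs that entered the handler */
    static atomic_int executing; /* how many CPUs have finished */

    static void *machine_check(void *arg)
    {
            /* First CPU in gets order 1 and becomes Monarch. */
            int order = atomic_fetch_add(&callin, 1) + 1;

            /* Wait until every CPU has checked in (mce_start()). */
            while (atomic_load(&callin) != NCPUS)
                    sched_yield();

            /* Each CPU waits for its turn, in entry order. */
            while (atomic_load(&executing) < order - 1)
                    sched_yield();

            printf("cpu order %d scanning banks%s\n", order,
                   order == 1 ? " (Monarch)" : "");

            /* Hand off to the next CPU in entry order. */
            atomic_fetch_add(&executing, 1);
            return NULL;
    }

    int main(void)
    {
            pthread_t t[NCPUS];

            for (int i = 0; i < NCPUS; i++)
                    pthread_create(&t[i], NULL, machine_check, NULL);
            for (int i = 0; i < NCPUS; i++)
                    pthread_join(t[i], NULL);
            return 0;
    }

In the kernel every one of those wait loops is bounded by
mca_cfg.monarch_timeout, and on expiry mce_timed_out() either panics or
marks a CPU as missing; that is exactly where this patch needs regs, so
the panic path can still answer a kdump crash IPI.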