Zhang Liguang reported a bug as below:
1) the system detected a CMCI storm on the current CPU
2) CMCI interrupts were disabled on the banks owned by the current CPU,
   and the CPU switched to poll mode
3) a few minutes later, the system switched back to interrupt mode on
   the current CPU
4) we expect the system to re-enable CMCI interrupts on the banks
   owned by the current CPU

   mce_intel_adjust_timer
   |-> cmci_reenable
       |-> cmci_discover     # but owned banks are skipped here

> static void cmci_discover(int banks)
>       ...
>       for (i = 0; i < banks; i++) {
>               ...
>               if (test_bit(i, owned)) # owned banks are skipped here
>                       continue;

This patch adds a function, cmci_storm_set_cmci(), which enables or
disables CMCI on the banks owned by the current CPU without clearing
the owned flags.
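
For illustration, here is a minimal stand-alone sketch of that behaviour.
NR_BANKS, CMCI_EN, the ctl2[] array and storm_set_cmci() below are made-up
stand-ins for the real MSR_IA32_MCx_CTL2()/MCI_CTL2_CMCI_EN accesses done
with rdmsrl()/wrmsrl() under cmci_discover_lock; the point is that the
helper only toggles the enable bit and never touches the owned bitmap, so
the banks are still known to belong to this CPU when the storm subsides.

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define NR_BANKS 8                 /* made-up bank count for the sketch */
  #define CMCI_EN  (1ULL << 30)      /* stands in for MCI_CTL2_CMCI_EN    */

  static uint64_t ctl2[NR_BANKS];    /* models MSR_IA32_MCx_CTL2(bank)    */
  static unsigned long owned = 0x5;  /* banks 0 and 2 owned by this CPU   */

  /* Toggle the enable bit on owned banks; the owned bitmap stays intact. */
  static void storm_set_cmci(bool on)
  {
          int bank;

          for (bank = 0; bank < NR_BANKS; bank++) {
                  if (!(owned & (1UL << bank)))
                          continue;
                  if (on)
                          ctl2[bank] |= CMCI_EN;
                  else
                          ctl2[bank] &= ~CMCI_EN;
          }
  }

  int main(void)
  {
          storm_set_cmci(false);  /* storm detected: mask CMCI         */
          storm_set_cmci(true);   /* storm over: re-enable, owned kept */
          printf("owned=%#lx ctl2[0]=%#llx\n",
                 owned, (unsigned long long)ctl2[0]);
          return 0;
  }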

Reported-by: Zhang Liguang <zhangligu...@huawei.com>
Cc: sta...@vger.kernel.org  # v3.15+
Signed-off-by: Xie XiuQi <xiexi...@huawei.com>
---
 arch/x86/kernel/cpu/mcheck/mce_intel.c | 41 +++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 844f56c..a20e18b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -146,6 +146,27 @@ void mce_intel_hcpu_update(unsigned long cpu)
        per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
 }
 
+static void cmci_storm_set_cmci(bool on)
+{
+       unsigned long flags, *owned;
+       int bank;
+       u64 val;
+
+       raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+       owned = this_cpu_ptr(mce_banks_owned);
+       for_each_set_bit(bank, owned, MAX_NR_BANKS) {
+               rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+
+               if (on)
+                       val |= MCI_CTL2_CMCI_EN;
+               else
+                       val &= ~MCI_CTL2_CMCI_EN;
+
+               wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+       }
+       raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
 unsigned long cmci_intel_adjust_timer(unsigned long interval)
 {
        if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
@@ -175,7 +196,7 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
                 */
                if (!atomic_read(&cmci_storm_on_cpus)) {
                        __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
-                       cmci_reenable();
+                       cmci_storm_set_cmci(true);
                        cmci_recheck();
                }
                return CMCI_POLL_INTERVAL;
@@ -186,22 +207,6 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
        }
 }
 
-static void cmci_storm_disable_banks(void)
-{
-       unsigned long flags, *owned;
-       int bank;
-       u64 val;
-
-       raw_spin_lock_irqsave(&cmci_discover_lock, flags);
-       owned = this_cpu_ptr(mce_banks_owned);
-       for_each_set_bit(bank, owned, MAX_NR_BANKS) {
-               rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
-               val &= ~MCI_CTL2_CMCI_EN;
-               wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
-       }
-       raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
-
 static bool cmci_storm_detect(void)
 {
        unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
@@ -223,7 +228,7 @@ static bool cmci_storm_detect(void)
        if (cnt <= CMCI_STORM_THRESHOLD)
                return false;
 
-       cmci_storm_disable_banks();
+       cmci_storm_set_cmci(false);
        __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
        r = atomic_add_return(1, &cmci_storm_on_cpus);
        mce_timer_kick(CMCI_STORM_INTERVAL);
-- 
2.0.0
