From: Marek Majtyka <marekx.majt...@intel.com>

Implement CMEM EDAC support for RTE config load/unload: when the memory
controller has not yet been configured at probe time, only the init-complete
interrupt is unmasked (CM_INT_MASK_BASE_PROBE) and the remaining driver
initialization is deferred to the threaded ISR, which completes it once the
RTE has loaded the controller configuration.

Also fixed bugs found in CMEM/SMEM:
 - work queue hang on driver removal (see the teardown sketch below)
 - unnecessary EDAC sysfs entries on DDR3 (the MPR dump attribute applies
   only to DDR4)
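
The removal hang is addressed by giving the alert and event monitors their
own workqueues and having each work loop re-check a finish flag after every
wakeup. The teardown ordering is roughly the sketch below (condensed from
the finish_workqueues()/uninitialize() paths in this patch; the wrapper name
stop_event_worker() is illustrative only, not a function added here):

	/* Stop the event monitor without leaving it blocked on its wait
	 * queue. The same pattern is used for the DDR4 alert monitor with
	 * finish_alerts/dump_wq/wq_alerts, which is torn down first.
	 */
	static void stop_event_worker(struct intel_edac_dev_info *dev_info)
	{
		/* 1. flag the loop to exit after its next wakeup */
		dev_info->finish_events = 1;
		/* 2. satisfy the wait condition so the worker wakes now */
		atomic_set(&dev_info->data->event_ready, 1);
		wake_up(&dev_info->data->event_wq);
		/* 3. let the work item finish, then drop its dedicated queue */
		cancel_work_sync(&dev_info->offload_events);
		destroy_workqueue(dev_info->wq_events);
		dev_info->wq_events = NULL;
	}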

Signed-off-by: Marek Majtyka <marekx.majt...@intel.com>
---
 drivers/edac/axxia_edac-cmc_56xx.c | 796 ++++++++++++++++++++++++++++---------
 drivers/edac/axxia_edac-mc_56xx.c  | 228 +++++++----
 2 files changed, 760 insertions(+), 264 deletions(-)

diff --git a/drivers/edac/axxia_edac-cmc_56xx.c b/drivers/edac/axxia_edac-cmc_56xx.c
index f99c46d..c4bf2d0 100644
--- a/drivers/edac/axxia_edac-cmc_56xx.c
+++ b/drivers/edac/axxia_edac-cmc_56xx.c
@@ -25,6 +25,9 @@
 #include <linux/mfd/syscon.h>
 #include <linux/regmap.h>
 #include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
 #include "edac_core.h"
 #include "edac_module.h"
 
@@ -78,12 +81,17 @@
 #define INT_BIT_5  (0x00000020)
 #define INT_BIT_6  (0x00000040)
 #define INT_BIT_7  (0x00000080)
+#define INT_BIT_8  (0x00000100)
 #define INT_BIT_11 (0x00000800)
 #define INT_BIT_21 (0x00200000)
 #define INT_BIT_25 (0x02000000)
 #define INT_BIT_30 (0x40000000)
 #define INT_BIT_31 (0x80000000)
 
+#define CM_INT_MASK_BASE_PROBE (~(\
+                       INT_BIT_8 |\
+                       INT_BIT_31))
+
 #define CM_INT_MASK_BASE (~(\
                        INT_BIT_1 |\
                        INT_BIT_2 |\
@@ -110,70 +118,70 @@
                        INT_BIT_30 |\
                        INT_BIT_31))
 
-#define CM_INT_MASK_ALL (0xffffffff)
+#define CM_INT_MASK_ALL (0x7fffffff)
 #define ALIVE_NOTIFICATION_PERIOD (90*1000)
 
 static int log = 1;
-module_param(log, int, S_IRUGO|S_IWUSR);
+module_param(log, int, 0644);
 MODULE_PARM_DESC(log, "Log each error to kernel log.");
 
 static int force_restart = 1;
-module_param(force_restart, int, S_IRUGO|S_IWUSR);
+module_param(force_restart, int, 0644);
 MODULE_PARM_DESC(force_restart, "Machine restart on fatal error.");
 
 static atomic64_t mc_counter = ATOMIC_INIT(0);
 /*
- Bit [31] = Logical OR of all lower bits.
- Bit [30] = A CRC error occurred on the write data bus.
- Bit [29] = The user-initiated DLL resync has completed.
- Bit [28] = A state change has been detected on the dfi_init_complete
-       signal after initialization.
- Bit [27] = The assertion of the INHIBIT_DRAM_CMD parameter has successfully
-       inhibited the command queue.
- Bit [26] = The register interface-initiated mode register write has completed
-       and another mode register write may be issued.
- Bit [25] = MPR read command, initiated with a software MPR_READ request, is
-       complete.
- Bit [24] = Error received from the PHY on the DFI bus.
- Bit [23] = RESERVED
- Bit [22] = RESERVED
- Bit [21] = A parity error has been detected on the address/control bus on
-       a registered DIMM.
- Bit [20] = The leveling operation has completed.
- Bit [19] = A read leveling gate training operation has been requested.
- Bit [18] = A read leveling operation has been requested.
- Bit [17] = A write leveling operation has been requested.
- Bit [16] = A DFI update error has occurred.  Error information can be found in
-       the UPDATE_ERROR_STATUS parameter.
- Bit [15] = A write leveling error has occurred. Error information can be found
-       in the WRLVL_ERROR_STATUS parameter.
- Bit [14] = A read leveling gate training error has occurred. Error information
-       can be found in the RDLVL_ERROR_STATUS parameter.
- Bit [13] = A read leveling error has occurred. Error information can be found
-       in the RDLVL_ERROR_STATUS parameter.
- Bit [12] = The user has programmed an invalid setting associated with user
-       words per burst.
-       Examples:
-               Setting param_reduc when burst length = 2.
-               A 1:2 MC:PHY clock ratio with burst length = 2.
- Bit [11] = A wrap cycle crossing a DRAM page has been detected. This is
-       unsupported & may result in memory data corruption.
- Bit [10] = The BIST operation has been completed.
- Bit [9] = The low power operation has been completed.
- Bit [8] = The MC initialization has been completed.
- Bit [7] = An error occurred on the port command channel.
- Bit [6] = Multiple uncorrectable ECC events have been detected.
- Bit [5] = An uncorrectable ECC event has been detected.
- Bit [4] = Multiple correctable ECC events have been detected.
- Bit [3] = A correctable ECC event has been detected.
- Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
-       have occurred.
- Bit [1] = A memory access outside the defined PHYSICAL memory space
-       has occurred.
- Bit [0] = The memory reset is valid on the DFI bus.
-
- Of these 1, 2, 3, 4, 5, 6, 7, 11, 21, 25, and 30 are of interest.
-*/
+ * Bit [31] = Logical OR of all lower bits.
+ * Bit [30] = A CRC error occurred on the write data bus.
+ * Bit [29] = The user-initiated DLL resync has completed.
+ * Bit [28] = A state change has been detected on the dfi_init_complete
+ *        signal after initialization.
+ * Bit [27] = The assertion of the INHIBIT_DRAM_CMD parameter has successfully
+ *        inhibited the command queue.
+ * Bit [26] = The register interface-initiated mode register write has completed
+ *        and another mode register write may be issued.
+ * Bit [25] = MPR read command, initiated with a software MPR_READ request, is
+ *        complete.
+ * Bit [24] = Error received from the PHY on the DFI bus.
+ * Bit [23] = RESERVED
+ * Bit [22] = RESERVED
+ * Bit [21] = A parity error has been detected on the address/control bus on
+ *        a registered DIMM.
+ * Bit [20] = The leveling operation has completed.
+ * Bit [19] = A read leveling gate training operation has been requested.
+ * Bit [18] = A read leveling operation has been requested.
+ * Bit [17] = A write leveling operation has been requested.
+ * Bit [16] = A DFI update error has occurred.  Error information can be found
+ *        in the UPDATE_ERROR_STATUS parameter.
+ * Bit [15] = A write leveling error has occurred. Error information can be
+ *        found in the WRLVL_ERROR_STATUS parameter.
+ * Bit [14] = A read leveling gate training error has occurred. Error
+ *        information can be found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [13] = A read leveling error has occurred. Error information can be
+ *        found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [12] = The user has programmed an invalid setting associated with user
+ *        words per burst.
+ *        Examples:
+ *          Setting param_reduc when burst length = 2.
+ *          A 1:2 MC:PHY clock ratio with burst length = 2.
+ * Bit [11] = A wrap cycle crossing a DRAM page has been detected. This is
+ *        unsupported & may result in memory data corruption.
+ * Bit [10] = The BIST operation has been completed.
+ * Bit [9] = The low power operation has been completed.
+ * Bit [8] = The MC initialization has been completed.
+ * Bit [7] = An error occurred on the port command channel.
+ * Bit [6] = Multiple uncorrectable ECC events have been detected.
+ * Bit [5] = An uncorrectable ECC event has been detected.
+ * Bit [4] = Multiple correctable ECC events have been detected.
+ * Bit [3] = A correctable ECC event has been detected.
+ * Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
+ *        have occurred.
+ * Bit [1] = A memory access outside the defined PHYSICAL memory space
+ *        has occurred.
+ * Bit [0] = The memory reset is valid on the DFI bus.
+ *
+ * Of these 1, 2, 3, 4, 5, 6, 7, 11, 21, 25, and 30 are of interest.
+ */
 
 /*
  *   MPR dump processing - overview.
@@ -182,7 +190,7 @@ static atomic64_t mc_counter = ATOMIC_INIT(0);
  * one need to collect dumps for all available cs. Below given example
  * for two cs0/cs1.
  *
- *   CMEM MC           cmmon_isr           cmmon_wq
+ *   CMEM MC           cmmon_isr_sw         cmmon_wq
  *     |                   |                   |
  *     |                   |                   |
  *     |ALERT_N - int_status bit [30]          |
@@ -337,6 +345,15 @@ struct __packed mpr_dump {
        u8      cs;
 };
 
+enum init_return_codes {ERR_STAGE_8 = -8,
+                       ERR_STAGE_7 = -7,
+                       ERR_STAGE_6 = -6,
+                       ERR_STAGE_5 = -5,
+                       ERR_STAGE_4 = -4,
+                       ERR_STAGE_3 = -3,
+                       ERR_STAGE_2 = -2,
+                       ERR_STAGE_1 = -1
+};
 enum events {
        EV_ILLEGAL = 0,
        EV_MULT_ILLEGAL,
@@ -437,9 +454,19 @@ struct intel_edac_dev_info {
        struct mc_edac_data *data;
        char *ctl_name;
        char *blk_name;
+       char *proc_name;
+#ifdef CONFIG_DEBUG_CMEM
+       struct proc_dir_entry *dir_entry;
+#endif
+       struct mutex state_machine_lock;
        struct work_struct offload_alerts;
        struct work_struct offload_events;
+       int finish_alerts;
+       int finish_events;
+       struct workqueue_struct *wq_alerts;
+       struct workqueue_struct *wq_events;
        int is_ddr4;
+       int is_controller_configured;
        int edac_idx;
        u32 cm_region;
        struct regmap *syscon;
@@ -448,7 +475,7 @@ struct intel_edac_dev_info {
        void (*check)(struct edac_device_ctl_info *edac_dev);
 };
 
-
+#ifdef CONFIG_DEBUG_CMEM
 static ssize_t mpr1_dump_show(struct edac_device_ctl_info
                                 *edac_dev, char *data)
 {
@@ -534,14 +561,17 @@ static ssize_t mpr1_dump_show(struct edac_device_ctl_info
        return 0;
 }
 
+
+
 static struct edac_dev_sysfs_attribute device_block_attr[] = {
        {
                .attr = {
                        .name = "mpr_page1",
-                       .mode = (S_IRUGO | S_IWUSR)
+                       .mode = (0644)
                },
                .show = mpr1_dump_show,
-               .store = NULL},
+               .store = NULL
+       },
        /* End of list */
        {
                .attr = {.name = NULL}
@@ -552,6 +582,7 @@ static void axxia_mc_sysfs_attributes(struct edac_device_ctl_info *edac_dev)
 {
                edac_dev->sysfs_attributes = &device_block_attr[0];
 }
+#endif
 
 static inline void __attribute__((always_inline))
 handle_events(struct intel_edac_dev_info *edac_dev,
@@ -679,11 +710,24 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
 }
 
 static irqreturn_t
-cmmon_isr(int interrupt, void *device)
+cmmon_isr_hw(int interrupt, void *device)
+{
+       return IRQ_WAKE_THREAD;
+}
+
+static int initialize(struct intel_edac_dev_info *dev_info);
+static int enable_workers(struct intel_edac_dev_info *dev_info);
+static void uninitialize(struct intel_edac_dev_info *dev_info,
+                       int ret, int only_disable);
+
+static irqreturn_t
+cmmon_isr_sw(int interrupt, void *device)
 {
        struct intel_edac_dev_info *dev_info = device;
        struct cm_56xx_denali_ctl_84 denali_ctl_84;
        struct cm_56xx_denali_ctl_85 denali_ctl_85 = {0};
+       struct cm_56xx_denali_ctl_86 denali_ctl_86;
+       int ret = 0;
 
        /*
         * NOTE:
@@ -702,12 +746,47 @@ cmmon_isr(int interrupt, void *device)
                4, (u32 *) &denali_ctl_84))
                goto error_read;
 
+       if (denali_ctl_84.int_status & INT_BIT_8) {
+               if (dev_info->is_controller_configured == 0) {
+                       ret = initialize(dev_info);
+                       if (ret)
+                               goto error_init;
+
+                       ret = enable_workers(dev_info);
+                       if (ret)
+                               goto error_init;
+
+                       dev_info->is_controller_configured = 1;
+               }
+
+               if (dev_info->is_ddr4)
+                       denali_ctl_86.int_mask = CM_INT_MASK_FULL;
+               else
+                       denali_ctl_86.int_mask = CM_INT_MASK_BASE;
+
+               if (ncr_write(dev_info->cm_region,
+                                       CM_56XX_DENALI_CTL_86,
+                                       4, (u32 *) &denali_ctl_86)) {
+                       goto error_write;
+               }
+               return IRQ_HANDLED;
+       }
+
+       /*
+        * SAFETY CHECK
+        * one cannot go further if driver is not fully functional!!!
+        */
+       if (dev_info->is_controller_configured == 0)
+               return IRQ_HANDLED;
+
+
        handle_events(dev_info, &denali_ctl_84);
        atomic_set(&dev_info->data->event_ready, 1);
        wake_up(&dev_info->data->event_wq);
 
        denali_ctl_85.int_ack =
-               (denali_ctl_84.int_status & (~(INT_BIT_25 | INT_BIT_31)));
+               (denali_ctl_84.int_status &
+                  (~(INT_BIT_25 | INT_BIT_31 | INT_BIT_8)));
 
        if (dev_info->is_ddr4) {
                if (denali_ctl_84.int_status & INT_BIT_25) {
@@ -737,6 +816,12 @@ cmmon_isr(int interrupt, void *device)
        printk_ratelimited("%s: Error reading interrupt status\n",
                       dev_name(&dev_info->pdev->dev));
        return IRQ_HANDLED;
+error_init:
+       printk_ratelimited("%s: Error during driver initialization\n",
+                      dev_name(&dev_info->pdev->dev));
+       uninitialize(dev_info, ret,
+                       dev_info->is_controller_configured == 0 ? 1 : 0);
+       return IRQ_HANDLED;
 }
 
 
@@ -747,16 +832,19 @@ static void intel_cm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
        struct event_counter (*alerts)[MAX_DQ][MPR_ERRORS] =
                        dev_info->data->alerts;
        struct cm_56xx_denali_ctl_34 denali_ctl_34;
-       int i, j, k, l;
+       int i, j, k, l, ret;
        u32 counter;
 
 start:
        /* keep hung up monitor happy 90 sec's */
-       if (0 == wait_event_timeout(dev_info->data->dump_wq,
+       if (wait_event_timeout(dev_info->data->dump_wq,
                atomic_read(&dev_info->data->dump_in_progress),
-               msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+               msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
                goto start;
 
+       if (dev_info->finish_alerts)
+               goto finish;
+
        for (i = 0; i < dev_info->data->cs_count; ++i) {
 
                /* trigger dump */
@@ -779,9 +867,15 @@ static void intel_cm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
                        CM_56XX_DENALI_CTL_34,
                        4, (u32 *) &denali_ctl_34))
                        goto error_write;
+
                /* wait */
-               wait_event(dev_info->data->dump_wq,
-                          atomic_read(&dev_info->data->dump_ready));
+               ret = wait_event_timeout(dev_info->data->dump_wq,
+                          atomic_read(&dev_info->data->dump_ready),
+                          msecs_to_jiffies(1000));
+               if (dev_info->finish_alerts)
+                       goto finish;
+               if (ret == 0)
+                       goto timeout_error;
 
                atomic_set(&dev_info->data->dump_ready, 0);
                /* collect data */
@@ -811,11 +905,21 @@ static void intel_cm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
        atomic_set(&dev_info->data->dump_in_progress, 0);
        goto start;
 
+timeout_error:
+       printk_ratelimited("Timeout occurred during MPR dump.\n");
+       atomic_set(&dev_info->data->dump_ready, 0);
+       atomic_set(&dev_info->data->dump_in_progress, 0);
+       goto start;
+
 error_read:
 error_write:
        printk_ratelimited("Could not collect MPR dump.\n");
        atomic_set(&dev_info->data->dump_in_progress, 0);
        goto start;
+
+finish:
+       atomic_set(&dev_info->data->dump_ready, 0);
+       atomic_set(&dev_info->data->dump_in_progress, 0);
 }
 
 static void intel_cm_events_error_check(struct edac_device_ctl_info *edac_dev)
@@ -827,13 +931,16 @@ static void intel_cm_events_error_check(struct edac_device_ctl_info *edac_dev)
        u32 counter;
 
        while (1) {
-               if (0 == wait_event_timeout(dev_info->data->event_wq,
+               if (wait_event_timeout(dev_info->data->event_wq,
                        atomic_read(&dev_info->data->event_ready),
-                       msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+                       msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
                        continue;
 
                atomic_set(&dev_info->data->event_ready, 0);
 
+               if (dev_info->finish_events)
+                       break;
+
                mutex_lock(&dev_info->data->edac_sysfs_data_lock);
                for (i = 0; i < NR_EVENTS; ++i) {
                        counter = atomic_xchg(&events[i].counter, 0);
@@ -911,119 +1018,133 @@ static int get_active_dram(struct intel_edac_dev_info *dev_info)
        if (ncr_read(dev_info->cm_region, CM_56XX_DENALI_CTL_74,
                4, (u32 *) &denali_ctl_74)) {
                pr_err("Could not read number of lanes.\n");
+               return dram;
        }
 
-       if (0 == denali_ctl_74.bank_diff)
+       if (denali_ctl_74.bank_diff == 0)
                dram = MAX_DQ/2;
 
-       if (1 == denali_ctl_74.bank_diff)
+       if (denali_ctl_74.bank_diff == 1)
                dram = MAX_DQ;
 
        return dram;
 }
 
-static int intel_edac_mc_probe(struct platform_device *pdev)
+static int get_ddr4(struct intel_edac_dev_info *dev_info)
 {
-       struct edac_device_instance *instance;
-       struct edac_device_block *block;
-       int i, j, k, l;
-       int count;
-       struct intel_edac_dev_info *dev_info = NULL;
-       struct resource *io;
-       struct device_node *np = pdev->dev.of_node;
-       int irq = -1, rc = 0;
        struct cm_56xx_denali_ctl_00 denali_ctl_00;
-       struct cm_56xx_denali_ctl_86 denali_ctl_86;
-       int cs_count = MAX_CS;
-       int dram_count = MAX_DQ;
-
-       count = atomic64_inc_return(&mc_counter);
-       if ((count - 1) == MEMORY_CONTROLLERS)
-               goto err_nodev;
-
-       dev_info = devm_kzalloc(&pdev->dev, sizeof(*dev_info), GFP_KERNEL);
-       if (!dev_info)
-               goto err_nomem;
 
-       dev_info->data =
-               devm_kzalloc(&pdev->dev, sizeof(*dev_info->data), GFP_KERNEL);
-       if (!dev_info->data)
-               goto err_noctlinfo;
-
-       init_waitqueue_head(&dev_info->data->dump_wq);
-       init_waitqueue_head(&dev_info->data->event_wq);
-
-       raw_spin_lock_init(&dev_info->data->mpr_data_lock);
-       mutex_init(&dev_info->data->edac_sysfs_data_lock);
+       if (ncr_read(dev_info->cm_region, CM_56XX_DENALI_CTL_00,
+               4, (u32 *) &denali_ctl_00)) {
+               pr_err("Could not read ddr version.\n");
+               return -1;
+       }
 
-       dev_info->ctl_name = kstrdup(np->name, GFP_KERNEL);
-       dev_info->blk_name = "ECC";
-       edac_op_state = EDAC_OPSTATE_POLL;
+       if (denali_ctl_00.dram_class == 0xa)
+               return 1;
 
-       dev_info->pdev = pdev;
-       dev_info->edac_idx = edac_device_alloc_index();
-       dev_info->data->irq = 0;
+       return 0;
+}
 
-       /* setup all counters */
-       for (i = 0; i < NR_EVENTS; ++i)
-               atomic_set(&dev_info->data->events[i].counter, 0);
+static void uninitialize(struct intel_edac_dev_info *dev_info,
+                       int ret, int only_disable)
+{
+       struct cm_56xx_denali_ctl_86 denali_ctl_86;
 
-       for (j = 0; j < MAX_CS; ++j) {
-               for (l = 0; l < MAX_DQ; ++l) {
-                       for (k = 0; k < MPR_ERRORS; ++k, ++i) {
-                               atomic_set(&dev_info->data->
-                                               alerts[j][l][k].counter, 0);
+       switch (ret) {
+       case ERR_STAGE_8:
+               if (dev_info->data->irq) {
+                       disable_irq(dev_info->data->irq);
+                       devm_free_irq(&dev_info->pdev->dev,
+                                       dev_info->data->irq, dev_info);
+                       dev_info->data->irq = 0;
+               }
+               /* fall-through */
+       case ERR_STAGE_7:
+               denali_ctl_86.int_mask = CM_INT_MASK_ALL;
+               if (ncr_write(dev_info->cm_region,
+                                       CM_56XX_DENALI_CTL_86,
+                                       4, (u32 *) &denali_ctl_86)) {
+                       pr_err("Could not mask interrupts (%s - ctl_86).\n",
+                               dev_info->ctl_name);
+               }
+               if (only_disable)
+                       break;
+               /* fall-through */
+       case ERR_STAGE_6:
+               if (dev_info->is_ddr4) {
+                       dev_info->finish_alerts = 1;
+                       atomic_inc(&dev_info->data->dump_in_progress);
+                       atomic_set(&dev_info->data->dump_ready, 1);
+                       wake_up(&dev_info->data->dump_wq);
+                       cancel_work_sync(&dev_info->offload_alerts);
+               }
+               dev_info->finish_events = 1;
+               atomic_set(&dev_info->data->event_ready, 1);
+               wake_up(&dev_info->data->event_wq);
+               cancel_work_sync(&dev_info->offload_events);
+               /* fall-through */
+       case ERR_STAGE_5:
+               if (dev_info->is_ddr4)
+                       if (dev_info->wq_alerts) {
+                               destroy_workqueue(dev_info->wq_alerts);
+                               dev_info->wq_alerts = NULL;
                        }
+               /* fall-through */
+       case ERR_STAGE_4:
+               if (dev_info->wq_events) {
+                       destroy_workqueue(dev_info->wq_events);
+                       dev_info->wq_events = NULL;
+
+               }
+               /* fall-through */
+       case ERR_STAGE_3:
+               edac_device_del_device(&dev_info->pdev->dev);
+               /* fall-through */
+       case ERR_STAGE_2:
+               if (dev_info->edac_dev) {
+                       edac_device_free_ctl_info(dev_info->edac_dev);
+                       dev_info->edac_dev = NULL;
                }
+               /* fall-through */
+       case ERR_STAGE_1:
+               /* fall-through */
+       default:
+               break;
        }
+}
 
-       /* set up dump in progress flag */
-       atomic_set(&dev_info->data->dump_in_progress, 0);
+static int initialize(struct intel_edac_dev_info *dev_info)
+{
+       struct edac_device_instance *instance;
+       struct edac_device_block *block;
+       int i, j, k, l;
 
-       io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       if (!io) {
-               dev_err(&pdev->dev, "Unable to get mem resource\n");
-               goto err_noctlinfo;
-       }
-       dev_info->cm_region = io->start;
-       dev_info->syscon =
-               syscon_regmap_lookup_by_phandle(np, "syscon");
-       if (IS_ERR(dev_info->syscon)) {
-               pr_info(FMT, np->name);
-               dev_info->axi2ser3_region = ioremap(AXI2_SER3_PHY_ADDR,
-                       AXI2_SER3_PHY_SIZE);
-               if (!dev_info->axi2ser3_region) {
-                       pr_err("ioremap of axi2ser3 region failed\n");
-                       goto err_noctlinfo;
-               }
-       }
+       int cs_count = MAX_CS;
+       int dram_count = MAX_DQ;
 
-       if (ncr_read(dev_info->cm_region, CM_56XX_DENALI_CTL_00,
-               4, (u32 *) &denali_ctl_00)) {
-               pr_err("Could not read ddr version.\n");
-               goto err_noctlinfo;
+       cs_count = get_active_cs(dev_info);
+       if (cs_count == 0) {
+               pr_err("Could not get cs number. Is config loaded?\n");
+               return ERR_STAGE_1;
        }
 
-       if (0xa == denali_ctl_00.dram_class) {
-               pr_info("%s supports mpr dump (DDR4).\n", dev_info->ctl_name);
-               dev_info->is_ddr4 = 1;
-       } else {
-               if (0x6 == denali_ctl_00.dram_class) {
-                       pr_info("%s doesn't support mpr dump (DDR3).\n",
-                               dev_info->ctl_name);
-               } else {
-                       pr_err("CMEM is not configured. Check uboot settings.\n");
-                       goto err_noctlinfo;
-               }
+       dram_count = get_active_dram(dev_info);
+       if (dram_count == 0) {
+               pr_err("Could not get dram number. Is config loaded?\n");
+               return ERR_STAGE_1;
        }
 
-       cs_count = get_active_cs(dev_info);
-       if (cs_count == 0)
-               goto err_noctlinfo;
+       dev_info->is_ddr4 = get_ddr4(dev_info);
 
-       dram_count = get_active_dram(dev_info);
-       if (dram_count == 0)
-               goto err_noctlinfo;
+       if (dev_info->is_ddr4 == -1) {
+               pr_err("Could not get dram version. Is config loaded?\n");
+               return ERR_STAGE_1;
+       }
+       /*dev_info->is_ddr4 = 1;*/
+
+       dev_info->finish_alerts = 0;
+       dev_info->finish_events = 0;
 
        dev_info->data->cs_count = cs_count;
        dev_info->data->dram_count = dram_count;
@@ -1031,13 +1152,15 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
        dev_info->edac_dev =
                edac_device_alloc_ctl_info(0, dev_info->ctl_name,
                                         1, dev_info->blk_name,
-                                        NR_EVENTS +
-                                        cs_count * dram_count * MPR_ERRORS,
+                                        NR_EVENTS + (dev_info->is_ddr4 ?
+                                        cs_count * dram_count * MPR_ERRORS
+                                        :
+                                        0),
                                         0, NULL, 0, dev_info->edac_idx);
 
        if (!dev_info->edac_dev) {
                pr_info("No memory for edac device\n");
-               goto err_noctlinfo;
+               return ERR_STAGE_1;
        }
 
        instance = &dev_info->edac_dev->instances[0];
@@ -1086,51 +1209,85 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
        dev_info->edac_dev->dev_name = dev_name(&dev_info->pdev->dev);
        dev_info->edac_dev->edac_check = NULL;
 
+#ifdef CONFIG_DEBUG_CMEM
        if (dev_info->is_ddr4)
                axxia_mc_sysfs_attributes(dev_info->edac_dev);
+#endif
 
        if (edac_device_add_device(dev_info->edac_dev) != 0) {
                pr_info("Unable to add edac device for %s\n",
                        dev_info->ctl_name);
-               goto err_nosysfs;
+               return ERR_STAGE_2;
        }
 
-       snprintf(&dev_info->data->irq_name[0], IRQ_NAME_LEN,
-                       "%s-mon", dev_info->ctl_name);
+       return 0;
+}
+
+static int enable_workers(struct intel_edac_dev_info *dev_info)
+{
+       atomic_set(&dev_info->data->dump_ready, 0);
+       atomic_set(&dev_info->data->event_ready, 0);
+       atomic_set(&dev_info->data->dump_in_progress, 0);
+
+       dev_info->wq_events = alloc_workqueue("%s-events", WQ_MEM_RECLAIM, 1,
+                                               (dev_info->ctl_name));
+       if (!dev_info->wq_events)
+               return ERR_STAGE_3;
 
+       if (dev_info->is_ddr4) {
+               dev_info->wq_alerts =
+                       alloc_workqueue("%s-alerts", WQ_MEM_RECLAIM, 1,
+                                       (dev_info->ctl_name));
+               if (!dev_info->wq_alerts)
+                       return ERR_STAGE_4;
+       }
        if (dev_info->is_ddr4)
                INIT_WORK(&dev_info->offload_alerts, axxia_alerts_work);
 
        INIT_WORK(&dev_info->offload_events, axxia_events_work);
 
        if (dev_info->is_ddr4)
-               schedule_work(&dev_info->offload_alerts);
-       schedule_work(&dev_info->offload_events);
+               queue_work(dev_info->wq_alerts, &dev_info->offload_alerts);
+       queue_work(dev_info->wq_events, &dev_info->offload_events);
+
+       return 0;
+}
 
-       irq = platform_get_irq(pdev, 0);
+static int enable_driver_irq(struct intel_edac_dev_info *dev_info)
+{
+       int irq = -1, rc = 0;
+       struct cm_56xx_denali_ctl_86 denali_ctl_86;
+
+       snprintf(&dev_info->data->irq_name[0], IRQ_NAME_LEN,
+                       "%s-mon", dev_info->ctl_name);
+
+       irq = platform_get_irq(dev_info->pdev, 0);
        if (irq < 0) {
                pr_err("Could not get irq number.\n");
-               goto err_noirq;
+               return ERR_STAGE_5;
        }
 
        /*
         * Enable memory controller interrupts.
         */
-       if (dev_info->is_ddr4)
-               denali_ctl_86.int_mask = CM_INT_MASK_FULL;
-       else
-               denali_ctl_86.int_mask = CM_INT_MASK_BASE;
+       if (dev_info->is_controller_configured) {
+               if (dev_info->is_ddr4)
+                       denali_ctl_86.int_mask = CM_INT_MASK_FULL;
+               else
+                       denali_ctl_86.int_mask = CM_INT_MASK_BASE;
+       } else
+               denali_ctl_86.int_mask = CM_INT_MASK_BASE_PROBE;
 
        if (ncr_write(dev_info->cm_region, CM_56XX_DENALI_CTL_86,
                4, (u32 *) &denali_ctl_86)) {
                pr_err("Could not write interrupt mask reg (%s - ctl_86).\n",
                        dev_info->ctl_name);
-               goto err_noirq;
+               return ERR_STAGE_6;
        }
 
        dev_info->data->irq = irq;
-       rc = devm_request_irq(&pdev->dev, irq,
-                       cmmon_isr, IRQF_ONESHOT,
+       rc = devm_request_threaded_irq(&dev_info->pdev->dev, irq,
+                       cmmon_isr_hw, cmmon_isr_sw, IRQF_ONESHOT,
                        &dev_info->data->irq_name[0], dev_info);
 
        if (rc) {
@@ -1145,23 +1302,285 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
                                        4, (u32 *) &denali_ctl_86)) {
                        pr_err("Could not mask interrupts (%s - ctl_86).\n",
                                        dev_info->ctl_name);
+                       return ERR_STAGE_7;
                }
 
-               goto err_noirq;
+               return ERR_STAGE_6;
        }
        return 0;
+}
 
-err_noirq:
-       if (dev_info->is_ddr4)
-               cancel_work_sync(&dev_info->offload_alerts);
-       cancel_work_sync(&dev_info->offload_events);
 
-       edac_device_del_device(&dev_info->pdev->dev);
+#ifdef CONFIG_DEBUG_CMEM
+static ssize_t
+axxia_cmem_read(struct file *filp, char __user *buffer, size_t length, loff_t *offset)
+{
+       char *buf = NULL;
+       struct intel_edac_dev_info *dev_info =
+               (struct intel_edac_dev_info *) filp->private_data;
+       ssize_t len;
+
+       if (*offset > 0)
+               return 0;
+
+       buf = kmalloc(PAGE_SIZE, __GFP_WAIT);
+       if (buf == NULL)
+               goto no_mem_buffer;
+
+       mutex_lock(&dev_info->state_machine_lock);
+
+       /*
+        * Do not modify this. Content is used by rte driver.
+        * Once changed modify rte code.
+        */
+       len = snprintf(buf, PAGE_SIZE-1, "Node: 0x%x\n"
+               "Command available:\n"
+               "          dump - triggers mpr_page1 dump.\n",
+               (int) dev_info->cm_region >> 16);
+
+       mutex_unlock(&dev_info->state_machine_lock);
+
+       buf[len] = '\0';
+       if (copy_to_user(buffer, buf, len))
+               len = -EFAULT;
+       else
+               *offset += len;
+       kfree(buf);
+       return len;
 
-err_nosysfs:
-       edac_device_free_ctl_info(dev_info->edac_dev);
-err_noctlinfo:
+no_mem_buffer:
+       pr_err("Could not allocate memory for cmem edac control buffer.\n");
+       return -ENOSPC;
+}
+
+static ssize_t
+axxia_cmem_write(struct file *file, const char __user *buffer,
+                size_t count, loff_t *ppos)
+{
+       char *buf = NULL;
+       struct intel_edac_dev_info *dev_info =
+               (struct intel_edac_dev_info *) file->private_data;
+
+       buf = kmalloc(count + 1, __GFP_WAIT);
+       if (buf == NULL)
+               goto no_mem_buffer;
+
+       memset(buf, 0, count + 1);
+
+       if (copy_from_user(buf, buffer, count)) {
+               pr_err("Could not copy data from user.\n");
+               goto cfu_failed;
+       }
+
+       if (!strncmp(buf, "dump", 4)) {
+               atomic_inc(&dev_info->data->dump_in_progress);
+               wake_up(&dev_info->data->dump_wq);
+       }
+
+       kfree(buf);
+       return count;
+
+cfu_failed:
+       kfree(buf);
+       return -EFAULT;
+
+no_mem_buffer:
+       pr_err("Could not allocate memory for cmem edac control buffer.\n");
+       return -ENOSPC;
+}
+
+static int axxia_cmem_open(struct inode *inode, struct file *filp)
+{
+       try_module_get(THIS_MODULE);
+       filp->private_data = PDE_DATA(inode);
+       return 0;
+}
+
+static int axxia_cmem_close(struct inode *inode, struct file *filp)
+{
+       module_put(THIS_MODULE);
+       filp->private_data = NULL;
+       return 0;
+}
+
+static const struct file_operations axxia_edac_cmem_proc_ops = {
+       .owner      = THIS_MODULE,
+       .open       = axxia_cmem_open,
+       .read       = axxia_cmem_read,
+       .write      = axxia_cmem_write,
+       .release    = axxia_cmem_close,
+       .llseek     = noop_llseek
+};
+
+static void remove_procfs_entry(struct intel_edac_dev_info *dev_info)
+{
+       if (dev_info && dev_info->dir_entry) {
+               proc_remove(dev_info->dir_entry);
+               dev_info->dir_entry = NULL;
+       }
+}
+#endif
+
+static int intel_edac_mc_probe(struct platform_device *pdev)
+{
+       int i, j, k, l;
+       int count;
+       struct intel_edac_dev_info *dev_info = NULL;
+       struct resource *io;
+       struct device_node *np = pdev->dev.of_node;
+       struct cm_56xx_denali_ctl_00 denali_ctl_00;
+       int ret = -1;
+
+       count = atomic64_inc_return(&mc_counter);
+       if ((count - 1) == MEMORY_CONTROLLERS)
+               goto err_nodev;
+
+       dev_info = devm_kzalloc(&pdev->dev, sizeof(*dev_info), GFP_KERNEL);
+       if (!dev_info)
+               goto err_nomem;
+
+       dev_info->ctl_name =
+               devm_kzalloc(&pdev->dev, 32*sizeof(char), GFP_KERNEL);
+       if (!dev_info->ctl_name)
+               goto err_nomem;
+
+       dev_info->blk_name =
+               devm_kzalloc(&pdev->dev, 32*sizeof(char), GFP_KERNEL);
+       if (!dev_info->blk_name)
+               goto err_nomem;
+
+       dev_info->data =
+               devm_kzalloc(&pdev->dev, sizeof(*dev_info->data), GFP_KERNEL);
+       if (!dev_info->data)
+               goto err_nomem;
+
+       init_waitqueue_head(&dev_info->data->dump_wq);
+       init_waitqueue_head(&dev_info->data->event_wq);
+
+       raw_spin_lock_init(&dev_info->data->mpr_data_lock);
+       mutex_init(&dev_info->data->edac_sysfs_data_lock);
+       mutex_init(&dev_info->state_machine_lock);
+
+       strncpy(dev_info->ctl_name, np->name, 32);
+       dev_info->ctl_name[31] = '\0';
+
+       strncpy(dev_info->blk_name, "ECC", 32);
+       dev_info->blk_name[31] = '\0';
+
+       edac_op_state = EDAC_OPSTATE_POLL;
+
+       dev_info->pdev = pdev;
+       dev_info->edac_idx = edac_device_alloc_index();
+       dev_info->data->irq = 0;
+
+       /* setup all counters */
+       for (i = 0; i < NR_EVENTS; ++i)
+               atomic_set(&dev_info->data->events[i].counter, 0);
+
+       for (j = 0; j < MAX_CS; ++j) {
+               for (l = 0; l < MAX_DQ; ++l) {
+                       for (k = 0; k < MPR_ERRORS; ++k, ++i) {
+                               atomic_set(&dev_info->data->
+                                               alerts[j][l][k].counter, 0);
+                       }
+               }
+       }
+
+       io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       if (!io) {
+               dev_err(&pdev->dev, "Unable to get mem resource\n");
+               goto err_init;
+       }
+       dev_info->cm_region = io->start;
+       dev_info->syscon =
+               syscon_regmap_lookup_by_phandle(np, "syscon");
+       if (IS_ERR(dev_info->syscon)) {
+               pr_info(FMT, np->name);
+               dev_info->axi2ser3_region = ioremap(AXI2_SER3_PHY_ADDR,
+                       AXI2_SER3_PHY_SIZE);
+               if (!dev_info->axi2ser3_region) {
+                       pr_err("ioremap of axi2ser3 region failed\n");
+                       goto err_init;
+               }
+       }
+
+       if (ncr_read(dev_info->cm_region, CM_56XX_DENALI_CTL_00,
+               4, (u32 *) &denali_ctl_00)) {
+               pr_err("Could not read ddr version.\n");
+               goto err_init;
+       }
+
+       if (denali_ctl_00.start == 1) {
+               /* uboot has configured CMEM */
+               if (denali_ctl_00.dram_class == 0xa) {
+                       pr_info("%s supports mpr dump (DDR4).\n",
+                                       dev_info->ctl_name);
+                       dev_info->is_ddr4 = 1;
+               }
+               if (denali_ctl_00.dram_class == 0x6) {
+                       pr_info("%s doesn't support mpr dump (DDR3).\n",
+                               dev_info->ctl_name);
+               }
+               dev_info->is_controller_configured = 1;
+
+               ret = initialize(dev_info);
+               if (ret)
+                       goto err_uninit;
+
+               ret = enable_workers(dev_info);
+               if (ret)
+                       goto err_uninit;
+
+               ret = enable_driver_irq(dev_info);
+               if (ret)
+                       goto err_uninit;
+
+       } else {
+               /* CMEM is not configured */
+               dev_info->is_controller_configured = 0;
+
+               ret = enable_driver_irq(dev_info);
+               if (ret)
+                       goto err_uninit;
+
+               pr_info("CMEM base init: controller: %s DEV %s (INTERRUPT).\n",
+                       dev_info->ctl_name,
+                       dev_name(&dev_info->pdev->dev));
+       }
+
+#ifdef CONFIG_DEBUG_CMEM
+       /* in this case create procfs file to be used by rte */
+       dev_info->proc_name =
+               devm_kzalloc(&pdev->dev, 32*sizeof(char),
+                               GFP_KERNEL);
+       if (!dev_info->proc_name)
+               goto err_uninit;
+
+       snprintf(dev_info->proc_name, 31*sizeof(char),
+               "driver/axxia_edac_%s_control",
+               dev_info->ctl_name);
+
+       /* each instance shall know each private data */
+       dev_info->dir_entry =
+               proc_create_data(dev_info->proc_name, 0200,
+                               NULL, &axxia_edac_cmem_proc_ops,
+                               dev_info);
+
+       if (dev_info->dir_entry == NULL) {
+               pr_err("Could not create proc entry for %s.\n",
+                               dev_info->ctl_name);
+               goto err_uninit;
+       }
+#endif
+       return 0;
+
+
+err_uninit:
+       uninitialize(dev_info, ret,
+                       dev_info->is_controller_configured == 0 ? 1 : 0);
+err_init:
        mutex_destroy(&dev_info->data->edac_sysfs_data_lock);
+       mutex_destroy(&dev_info->state_machine_lock);
        atomic64_dec(&mc_counter);
        return 1;
 err_nomem:
@@ -1172,23 +1591,20 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
        return -ENODEV;
 }
 
+
+
 static int intel_edac_mc_remove(struct platform_device *pdev)
 {
        struct intel_edac_dev_info *dev_info =
                (struct intel_edac_dev_info *) &pdev->dev;
 
        if (dev_info) {
-               if (dev_info->data->irq > 0) {
-                       disable_irq(dev_info->data->irq);
-                       devm_free_irq(&pdev->dev,
-                                       dev_info->data->irq, dev_info);
+#ifdef CONFIG_DEBUG_CMEM
+               remove_procfs_entry(dev_info);
+#endif
 
-                       dev_info->data->irq = 0;
-
-                       if (dev_info->is_ddr4)
-                               cancel_work_sync(&dev_info->offload_alerts);
-                       cancel_work_sync(&dev_info->offload_events);
-               }
+               uninitialize(dev_info, ERR_STAGE_8,
+                       dev_info->is_controller_configured == 0 ? 1 : 0);
 
                if (dev_info->edac_dev != NULL) {
                        edac_device_del_device(&dev_info->pdev->dev);
@@ -1196,6 +1612,8 @@ static int intel_edac_mc_remove(struct platform_device *pdev)
                }
 
                mutex_destroy(&dev_info->data->edac_sysfs_data_lock);
+               mutex_destroy(&dev_info->state_machine_lock);
+
                atomic64_dec(&mc_counter);
        }
        platform_device_unregister(pdev);
diff --git a/drivers/edac/axxia_edac-mc_56xx.c b/drivers/edac/axxia_edac-mc_56xx.c
index f850ebe..fb1e742 100644
--- a/drivers/edac/axxia_edac-mc_56xx.c
+++ b/drivers/edac/axxia_edac-mc_56xx.c
@@ -92,68 +92,68 @@
 #define ALIVE_NOTIFICATION_PERIOD (90*1000)
 
 static int log = 1;
-module_param(log, int, S_IRUGO|S_IWUSR);
+module_param(log, int, 0644);
 MODULE_PARM_DESC(log, "Log each error to kernel log.");
 
 static int force_restart = 1;
-module_param(force_restart, int, S_IRUGO|S_IWUSR);
+module_param(force_restart, int, 0644);
 MODULE_PARM_DESC(force_restart, "Machine restart on fatal error.");
 
 static atomic64_t mc_counter = ATOMIC_INIT(0);
 /*
- Bit [34] = Logical OR of all lower bits.
- Bit [33] = A CRC error occurred on the write data bus.
- Bit [32] = The software-initiated control word write has completed.
- Bit [31] = The user-initiated DLL resync has completed.
- Bit [30] = A state change has been detected on the
-       dfi_init_complete signal after initialization.
- Bit [29] = The assertion of the INHIBIT_DRAM_CMD parameter has
-       successfully inhibited the command queue.
- Bit [28] = The register interface-initiated mode register write has
-       completed and another mode register write may be issued.
- Bit [27] = A Low Power Interface (LPI) timeout error has occurred.
- Bit [26] = MPR read command, initiated with a software MPR_READ request,
-        is complete.
- Bit [25] = Error received from the PHY on the DFI bus.
- Bit [24] = RESERVED
- Bit [23] = RESERVED
- Bit [22] = A parity error has been detected on the address/control bus
-       on a registered DIMM.
- Bit [21] = The leveling operation has completed.
- Bit [20] = A read leveling gate training operation has been requested.
- Bit [19] = A read leveling operation has been requested.
- Bit [18] = A write leveling operation has been requested.
- Bit [17] = A DFI update error has occurred. Error information can be
-       found in the UPDATE_ERROR_STATUS parameter.
- Bit [16] = A write leveling error has occurred. Error information can
-       be found in the WRLVL_ERROR_STATUS parameter.
- Bit [15] = A read leveling gate training error has occurred. Error
-       information can be found in the RDLVL_ERROR_STATUS parameter.
- Bit [14] = A read leveling error has occurred. Error information can be
-       found in the RDLVL_ERROR_STATUS parameter.
- Bit [13] = The user has programmed an invalid setting associated with
-       user words per burst.
-       Examples: Setting param_reduc when burst length = 2. A 1:2
-       MC:PHY clock ratio with burst length = 2.
- Bit [12] = A wrap cycle crossing a DRAM page has been detected. This
-       is unsupported & may result in memory data corruption.
- Bit [11] = A write was attempted to a writeprotected region.
- Bit [10] = The BIST operation has been completed.
- Bit [9] = The low power operation has been completed.
- Bit [8] = The MC initialization has been completed.
- Bit [7] = An error occurred on the port command channel.
- Bit [6] = Multiple uncorrectable ECC events have been detected.
- Bit [5] = An uncorrectable ECC event has been detected.
- Bit [4] = Multiple correctable ECC events have been detected.
- Bit [3] = A correctable ECC event has been detected.
- Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
-       have occurred.
- Bit [1] = A memory access outside the defined PHYSICAL memory space
-       has occurred.
- Bit [0] = The memory reset is valid on the DFI bus.
-
- Of these 1, 2, 3, 4, 5, 6, 7, 12, 22 and 26 are of interest.
-*/
+ * Bit [34] = Logical OR of all lower bits.
+ * Bit [33] = A CRC error occurred on the write data bus.
+ * Bit [32] = The software-initiated control word write has completed.
+ * Bit [31] = The user-initiated DLL resync has completed.
+ * Bit [30] = A state change has been detected on the
+ *        dfi_init_complete signal after initialization.
+ * Bit [29] = The assertion of the INHIBIT_DRAM_CMD parameter has
+ *        successfully inhibited the command queue.
+ * Bit [28] = The register interface-initiated mode register write has
+ *        completed and another mode register write may be issued.
+ * Bit [27] = A Low Power Interface (LPI) timeout error has occurred.
+ * Bit [26] = MPR read command, initiated with a software MPR_READ request,
+ *         is complete.
+ * Bit [25] = Error received from the PHY on the DFI bus.
+ * Bit [24] = RESERVED
+ * Bit [23] = RESERVED
+ * Bit [22] = A parity error has been detected on the address/control bus
+ *        on a registered DIMM.
+ * Bit [21] = The leveling operation has completed.
+ * Bit [20] = A read leveling gate training operation has been requested.
+ * Bit [19] = A read leveling operation has been requested.
+ * Bit [18] = A write leveling operation has been requested.
+ * Bit [17] = A DFI update error has occurred. Error information can be
+ *        found in the UPDATE_ERROR_STATUS parameter.
+ * Bit [16] = A write leveling error has occurred. Error information can
+ *        be found in the WRLVL_ERROR_STATUS parameter.
+ * Bit [15] = A read leveling gate training error has occurred. Error
+ *        information can be found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [14] = A read leveling error has occurred. Error information can be
+ *        found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [13] = The user has programmed an invalid setting associated with
+ *        user words per burst.
+ *        Examples: Setting param_reduc when burst length = 2. A 1:2
+ *        MC:PHY clock ratio with burst length = 2.
+ * Bit [12] = A wrap cycle crossing a DRAM page has been detected. This
+ *        is unsupported & may result in memory data corruption.
+ * Bit [11] = A write was attempted to a writeprotected region.
+ * Bit [10] = The BIST operation has been completed.
+ * Bit [9] = The low power operation has been completed.
+ * Bit [8] = The MC initialization has been completed.
+ * Bit [7] = An error occurred on the port command channel.
+ * Bit [6] = Multiple uncorrectable ECC events have been detected.
+ * Bit [5] = An uncorrectable ECC event has been detected.
+ * Bit [4] = Multiple correctable ECC events have been detected.
+ * Bit [3] = A correctable ECC event has been detected.
+ * Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
+ *        have occurred.
+ * Bit [1] = A memory access outside the defined PHYSICAL memory space
+ *        has occurred.
+ * Bit [0] = The memory reset is valid on the DFI bus.
+ *
+ * Of these 1, 2, 3, 4, 5, 6, 7, 12, 22 and 26 are of interest.
+ */
 
 /*
  *   MPR dump processing - overview.
@@ -162,7 +162,7 @@ static atomic64_t mc_counter = ATOMIC_INIT(0);
  * one need to collect dumps for all available cs. Below given example
  * for two cs0/cs1.
  *
- *   SMEM MC           smmon_isr           smmon_wq
+ *   SMEM MC           smmon_isr_sw           smmon_wq
  *     |                   |                   |
  *     |                   |                   |
  *     |ALERT_N - int_status bit [33]          |
@@ -612,6 +612,10 @@ struct intel_edac_dev_info {
        char *blk_name;
        struct work_struct offload_alerts;
        struct work_struct offload_events;
+       struct workqueue_struct *wq_alerts;
+       struct workqueue_struct *wq_events;
+       int finish_alerts;
+       int finish_events;
        int is_ddr4;
        int edac_idx;
        u32 sm_region;
@@ -725,7 +729,7 @@ static struct edac_dev_sysfs_attribute device_block_attr[] = {
        {
                .attr = {
                        .name = "mpr_page1",
-                       .mode = (S_IRUGO | S_IWUSR)
+                       .mode = (0644)
                },
                .show = mpr1_dump_show,
                .store = NULL},
@@ -904,7 +908,13 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
 }
 
 static irqreturn_t
-smmon_isr(int interrupt, void *device)
+smmon_isr_hw(int interrupt, void *device)
+{
+       return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t
+smmon_isr_sw(int interrupt, void *device)
 {
        struct intel_edac_dev_info *dev_info = device;
        struct sm_56xx_denali_ctl_366 denali_ctl_366;
@@ -990,11 +1000,14 @@ static void intel_sm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
 
 start:
        /* keep hung up monitor happy 90 sec's */
-       if (0 == wait_event_timeout(dev_info->data->dump_wq,
+       if (wait_event_timeout(dev_info->data->dump_wq,
                atomic_read(&dev_info->data->dump_in_progress),
-               msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+               msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
                goto start;
 
+       if (dev_info->finish_alerts)
+               goto finish;
+
                /* the only one running workqueue */
        for (i = 0; i < dev_info->data->cs_count; ++i) {
 
@@ -1021,6 +1034,9 @@ static void intel_sm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
                wait_event(dev_info->data->dump_wq,
                           atomic_read(&dev_info->data->dump_ready));
 
+               if (dev_info->finish_alerts)
+                       goto finish;
+
                atomic_set(&dev_info->data->dump_ready, 0);
                /* collect data */
                collect_mpr_dump(dev_info, SM_MPR_PAGE, i);
@@ -1054,6 +1070,10 @@ static void intel_sm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
        printk_ratelimited("Could not collect MPR dump.\n");
        atomic_set(&dev_info->data->dump_in_progress, 0);
        goto start;
+
+finish:
+       atomic_set(&dev_info->data->dump_ready, 0);
+       atomic_set(&dev_info->data->dump_in_progress, 0);
 }
 
 static void intel_sm_events_error_check(struct edac_device_ctl_info *edac_dev)
@@ -1065,13 +1085,16 @@ static void intel_sm_events_error_check(struct edac_device_ctl_info *edac_dev)
        u32 counter;
 
        while (1) {
-               if (0 == wait_event_timeout(dev_info->data->event_wq,
+               if (wait_event_timeout(dev_info->data->event_wq,
                        atomic_read(&dev_info->data->event_ready),
-                       msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+                       msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
                        continue;
 
                atomic_set(&dev_info->data->event_ready, 0);
 
+               if (dev_info->finish_events)
+                       break;
+
                mutex_lock(&dev_info->data->edac_sysfs_data_lock);
                for (i = 0; i < NR_EVENTS; ++i) {
                        counter = atomic_xchg(&events[i].counter, 0);
@@ -1160,6 +1183,22 @@ static int get_active_dram(struct intel_edac_dev_info *dev_info)
        return dram;
 }
 
+static void finish_workqueues(struct intel_edac_dev_info *dev_info)
+{
+       if (dev_info->is_ddr4) {
+               dev_info->finish_alerts = 1;
+               atomic_inc(&dev_info->data->dump_in_progress);
+               atomic_set(&dev_info->data->dump_ready, 1);
+               wake_up(&dev_info->data->dump_wq);
+               cancel_work_sync(&dev_info->offload_alerts);
+       }
+
+       dev_info->finish_events = 1;
+       atomic_set(&dev_info->data->event_ready, 1);
+       wake_up(&dev_info->data->event_wq);
+       cancel_work_sync(&dev_info->offload_events);
+}
+
 static int intel_edac_mc_probe(struct platform_device *pdev)
 {
        struct edac_device_instance *instance;
@@ -1184,10 +1223,20 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
        if (!dev_info)
                goto err_nomem;
 
+       dev_info->ctl_name =
+               devm_kzalloc(&pdev->dev, 32*sizeof(char), GFP_KERNEL);
+       if (!dev_info->ctl_name)
+               goto err_nomem;
+
+       dev_info->blk_name =
+               devm_kzalloc(&pdev->dev, 32*sizeof(char), GFP_KERNEL);
+       if (!dev_info->blk_name)
+               goto err_nomem;
+
        dev_info->data =
                devm_kzalloc(&pdev->dev, sizeof(*dev_info->data), GFP_KERNEL);
        if (!dev_info->data)
-               goto err_noctlinfo;
+               goto err_nomem;
 
        init_waitqueue_head(&dev_info->data->dump_wq);
        init_waitqueue_head(&dev_info->data->event_wq);
@@ -1195,6 +1244,12 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
        raw_spin_lock_init(&dev_info->data->mpr_data_lock);
        mutex_init(&dev_info->data->edac_sysfs_data_lock);
 
+       strncpy(dev_info->ctl_name, np->name, 32);
+       dev_info->ctl_name[31] = '\0';
+
+       strncpy(dev_info->blk_name, "ECC", 32);
+       dev_info->blk_name[31] = '\0';
+
        dev_info->ctl_name = kstrdup(np->name, GFP_KERNEL);
        dev_info->blk_name = "ECC";
        edac_op_state = EDAC_OPSTATE_POLL;
@@ -1268,8 +1323,10 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
        dev_info->edac_dev =
                edac_device_alloc_ctl_info(0, dev_info->ctl_name,
                                         1, dev_info->blk_name,
-                                        NR_EVENTS +
-                                        cs_count * dram_count * MPR_ERRORS,
+                                        NR_EVENTS + (dev_info->is_ddr4 ?
+                                        cs_count * dram_count * MPR_ERRORS
+                                        :
+                                        0),
                                         0, NULL, 0, dev_info->edac_idx);
 
        if (!dev_info->edac_dev) {
@@ -1329,20 +1386,34 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
        if (edac_device_add_device(dev_info->edac_dev) != 0) {
                pr_info("Unable to add edac device for %s\n",
                        dev_info->ctl_name);
-               goto err_nosysfs;
+               goto err_noctlinfo;
        }
 
        snprintf(&dev_info->data->irq_name[0], IRQ_NAME_LEN,
                        "%s-mon", dev_info->ctl_name);
 
+       dev_info->wq_events =
+               alloc_workqueue("%s-events", WQ_MEM_RECLAIM, 1,
+                                  (dev_info->ctl_name));
+       if (!dev_info->wq_events)
+               goto err_nosysfs;
+
+       if (dev_info->is_ddr4) {
+               dev_info->wq_alerts =
+                 alloc_workqueue("%s-alerts", WQ_MEM_RECLAIM, 1,
+                                  (dev_info->ctl_name));
+
+               if (!dev_info->wq_alerts)
+                       goto err_noevents;
+       }
        if (dev_info->is_ddr4)
                INIT_WORK(&dev_info->offload_alerts, axxia_alerts_work);
 
        INIT_WORK(&dev_info->offload_events, axxia_events_work);
 
        if (dev_info->is_ddr4)
-               schedule_work(&dev_info->offload_alerts);
-       schedule_work(&dev_info->offload_events);
+               queue_work(dev_info->wq_alerts, &dev_info->offload_alerts);
+       queue_work(dev_info->wq_events, &dev_info->offload_events);
 
        irq = platform_get_irq(pdev, 0);
        if (irq < 0) {
@@ -1382,8 +1453,8 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
        }
 
        dev_info->data->irq = irq;
-       rc = devm_request_irq(&pdev->dev, irq,
-                       smmon_isr, IRQF_ONESHOT,
+       rc = devm_request_threaded_irq(&pdev->dev, irq,
+                       smmon_isr_hw, smmon_isr_sw, IRQF_ONESHOT,
                        &dev_info->data->irq_name[0], dev_info);
 
        if (rc) {
@@ -1411,18 +1482,23 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
        return 0;
 
 err_noirq:
+       finish_workqueues(dev_info);
+       edac_device_del_device(&dev_info->pdev->dev);
+
        if (dev_info->is_ddr4)
-               cancel_work_sync(&dev_info->offload_alerts);
-       cancel_work_sync(&dev_info->offload_events);
+               destroy_workqueue(dev_info->wq_alerts);
 
-       edac_device_del_device(&dev_info->pdev->dev);
+err_noevents:
+       destroy_workqueue(dev_info->wq_events);
 
 err_nosysfs:
        edac_device_free_ctl_info(dev_info->edac_dev);
+
 err_noctlinfo:
        mutex_destroy(&dev_info->data->edac_sysfs_data_lock);
        atomic64_dec(&mc_counter);
        return 1;
+
 err_nomem:
        atomic64_dec(&mc_counter);
        return -ENOMEM;
@@ -1444,9 +1520,11 @@ static int intel_edac_mc_remove(struct platform_device *pdev)
 
                        dev_info->data->irq = 0;
 
+                       finish_workqueues(dev_info);
+
                        if (dev_info->is_ddr4)
-                               cancel_work_sync(&dev_info->offload_alerts);
-                       cancel_work_sync(&dev_info->offload_events);
+                               destroy_workqueue(dev_info->wq_alerts);
+                       destroy_workqueue(dev_info->wq_events);
                }
 
                if (dev_info->edac_dev != NULL) {
-- 
2.7.4
