From: Marek Majtyka <marekx.majt...@intel.com>

Changes and fixes:
- CMEM/SMEM driver trace capabilities added
- CMEM/SMEM cpu0 affinity used
- SMEM code maintainability improved
- CMEM multiple ca parity error event added
- CMEM mpr dump on signal bit [21]
- CMEM clear mr5 ca parity error flag added
- CMEM irq storm fix
- kernel 4.9 Axxia EDAC code alignment

Signed-off-by: Marek Majtyka <marekx.majt...@intel.com> --- arch/arm64/boot/dts/intel/axc67xx.dtsi | 44 ++-- drivers/edac/Kconfig | 7 + drivers/edac/axxia_edac-cmc_56xx.c | 383 +++++++++++++++++++++++---------- drivers/edac/axxia_edac-l2_cpu_56xx.c | 3 +- drivers/edac/axxia_edac-l3_56xx.c | 46 +++- drivers/edac/axxia_edac-mc_56xx.c | 295 +++++++++++++++++-------- include/trace/events/edac_cmc.h | 101 +++++++++ include/trace/events/edac_mc.h | 104 +++++++++ 8 files changed, 744 insertions(+), 239 deletions(-) create mode 100644 include/trace/events/edac_cmc.h create mode 100644 include/trace/events/edac_mc.h diff --git a/arch/arm64/boot/dts/intel/axc67xx.dtsi b/arch/arm64/boot/dts/intel/axc67xx.dtsi index 7bb4cd8..d4d3171 100644 --- a/arch/arm64/boot/dts/intel/axc67xx.dtsi +++ b/arch/arm64/boot/dts/intel/axc67xx.dtsi @@ -232,33 +232,33 @@ }; gpdma0: gpdma@8005020000 { - compatible = "lsi,dma32"; + compatible = "lsi,dma32"; reg = <0x80 0x05020000 0 0x10000>; interrupts = <GIC_SPI 44 IRQ_TYPE_LEVEL_HIGH>, - <GIC_SPI 45 IRQ_TYPE_LEVEL_HIGH>; - channel0 { - interrupts = <GIC_SPI 46 IRQ_TYPE_LEVEL_HIGH>; - }; - - channel1 { - interrupts = <GIC_SPI 47 IRQ_TYPE_LEVEL_HIGH>; - }; - }; - - gpdma1: gpdma@8005030000 { - compatible = "lsi,dma32"; + <GIC_SPI 45 IRQ_TYPE_LEVEL_HIGH>; + channel0 { + interrupts = <GIC_SPI 46 IRQ_TYPE_LEVEL_HIGH>; + }; + + channel1 { + interrupts = <GIC_SPI 47 IRQ_TYPE_LEVEL_HIGH>; + }; + }; + + gpdma1: gpdma@8005030000 { + compatible = "lsi,dma32"; reg = <0x80 0x05030000 0 0x10000>; interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>, - <GIC_SPI 49 IRQ_TYPE_LEVEL_HIGH>; + <GIC_SPI 49 IRQ_TYPE_LEVEL_HIGH>; status = "disabled"; - channel0 { - interrupts = <GIC_SPI 50 IRQ_TYPE_LEVEL_HIGH>; - }; - - channel1 { - interrupts = <GIC_SPI 51 IRQ_TYPE_LEVEL_HIGH>; - }; - }; + channel0 { + interrupts = <GIC_SPI 50 IRQ_TYPE_LEVEL_HIGH>; + }; + + channel1 { + interrupts = <GIC_SPI 51 IRQ_TYPE_LEVEL_HIGH>; + }; + }; i2c0: i2c@8080600000 { compatible = "lsi,api2c"; 
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 856fd8f..5588930 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -413,6 +413,13 @@ config EDAC_AXXIA_SYSMEM_6700 the System Memory error detection. System Memory error detection is interrupt driven. +config DEBUG_EDAC_AXXIA_SYSMEM + depends on ARCH_AXXIA + bool "AXXIA EDAC SYSMEM error injection interface." + help + Support for configuration of SYSMEM edac tracing functionality. + It works for both 5600 and 6700 board families. + config EDAC_AXXIA_CMEM_5600 depends on ARCH_AXXIA bool "AXXIA EDAC CMem Controller for 5600" diff --git a/drivers/edac/axxia_edac-cmc_56xx.c b/drivers/edac/axxia_edac-cmc_56xx.c index 884f746..f29e6926 100644 --- a/drivers/edac/axxia_edac-cmc_56xx.c +++ b/drivers/edac/axxia_edac-cmc_56xx.c @@ -1,7 +1,7 @@ /* * drivers/edac/axxia_edac-mc.c * - * EDAC Driver for Intel's Axxia 5600 Configuration Memory Controller + * EDAC Driver for Intel's Axxia 5600/6700 Configuration Memory Controller * * Copyright (C) 2016 Intel Inc. 
* @@ -31,6 +31,9 @@ #include "edac_core.h" #include "edac_module.h" +#define CREATE_TRACE_POINTS +#include <trace/events/edac_cmc.h> + #define FMT "%s: syscon lookup failed hence using hardcoded register address\n" #define MPR_FMT2 "\n%3d %#010x %#010x" @@ -53,7 +56,9 @@ #define CM_MPR_PAGE 0x1 #define CM_56XX_DENALI_CTL_00 0x0 +#define CM_56XX_DENALI_CTL_33 0x84 #define CM_56XX_DENALI_CTL_34 0x88 +#define CM_56XX_DENALI_CTL_45 0xb4 #define CM_56XX_DENALI_CTL_74 0x128 #define CM_56XX_DENALI_CTL_80 0x140 @@ -90,6 +95,7 @@ #define INT_BIT_8 (0x00000100) #define INT_BIT_11 (0x00000800) #define INT_BIT_21 (0x00200000) +#define INT_BIT_23 (0x00800000) #define INT_BIT_25 (0x02000000) #define INT_BIT_30 (0x40000000) #define INT_BIT_31 (0x80000000) @@ -108,6 +114,7 @@ INT_BIT_7 |\ INT_BIT_11 |\ INT_BIT_21 |\ + INT_BIT_23 |\ INT_BIT_31)) #define CM_INT_MASK_FULL (~(\ @@ -120,6 +127,7 @@ INT_BIT_7 |\ INT_BIT_11 |\ INT_BIT_21 |\ + INT_BIT_23 |\ INT_BIT_25 |\ INT_BIT_30 |\ INT_BIT_31)) @@ -127,67 +135,66 @@ #define CM_INT_MASK_ALL (0x7fffffff) #define ALIVE_NOTIFICATION_PERIOD (90*1000) -static int log = 1; -module_param(log, int, S_IRUGO|S_IWUSR); -MODULE_PARM_DESC(log, "Log each error to kernel log."); +static cpumask_t only_cpu_0 = { CPU_BITS_CPU0 }; static int force_restart = 1; -module_param(force_restart, int, S_IRUGO|S_IWUSR); +module_param(force_restart, int, 0644); MODULE_PARM_DESC(force_restart, "Machine restart on fatal error."); static atomic64_t mc_counter = ATOMIC_INIT(0); /* - Bit [31] = Logical OR of all lower bits. - Bit [30] = A CRC error occurred on the write data bus. - Bit [29] = The user-initiated DLL resync has completed. - Bit [28] = A state change has been detected on the dfi_init_complete - signal after initialization. - Bit [27] = The assertion of the INHIBIT_DRAM_CMD parameter has successfully - inhibited the command queue. 
- Bit [26] = The register interface-initiated mode register write has completed - and another mode register write may be issued. - Bit [25] = MPR read command, initiated with a software MPR_READ request, is - complete. - Bit [24] = Error received from the PHY on the DFI bus. - Bit [23] = RESERVED - Bit [22] = RESERVED - Bit [21] = A parity error has been detected on the address/control bus on - a registered DIMM. - Bit [20] = The leveling operation has completed. - Bit [19] = A read leveling gate training operation has been requested. - Bit [18] = A read leveling operation has been requested. - Bit [17] = A write leveling operation has been requested. - Bit [16] = A DFI update error has occurred. Error information can be found in - the UPDATE_ERROR_STATUS parameter. - Bit [15] = A write leveling error has occurred. Error information can be found - in the WRLVL_ERROR_STATUS parameter. - Bit [14] = A read leveling gate training error has occurred. Error information - can be found in the RDLVL_ERROR_STATUS parameter. - Bit [13] = A read leveling error has occurred. Error information can be found - in the RDLVL_ERROR_STATUS parameter. - Bit [12] = The user has programmed an invalid setting associated with user - words per burst. - Examples: - Setting param_reduc when burst length = 2. - A 1:2 MC:PHY clock ratio with burst length = 2. - Bit [11] = A wrap cycle crossing a DRAM page has been detected. This is - unsupported & may result in memory data corruption. - Bit [10] = The BIST operation has been completed. - Bit [9] = The low power operation has been completed. - Bit [8] = The MC initialization has been completed. - Bit [7] = An error occurred on the port command channel. - Bit [6] = Multiple uncorrectable ECC events have been detected. - Bit [5] = An uncorrectable ECC event has been detected. - Bit [4] = Multiple correctable ECC events have been detected. - Bit [3] = A correctable ECC event has been detected. 
- Bit [2] = Multiple accesses outside the defined PHYSICAL memory space - have occurred. - Bit [1] = A memory access outside the defined PHYSICAL memory space - has occurred. - Bit [0] = The memory reset is valid on the DFI bus. - - Of these 1, 2, 3, 4, 5, 6, 7, 11, 21, 25, and 30 are of interest. -*/ + * Bit [31] = Logical OR of all lower bits. + * Bit [30] = A CRC error occurred on the write data bus. + * Bit [29] = The user-initiated DLL resync has completed. + * Bit [28] = A state change has been detected on the dfi_init_complete + * signal after initialization. + * Bit [27] = The assertion of the INHIBIT_DRAM_CMD parameter has successfully + * inhibited the command queue. + * Bit [26] = The register interface-initiated mode register write has completed + * and another mode register write may be issued. + * Bit [25] = MPR read command, initiated with a software MPR_READ request, is + * complete. + * Bit [24] = Error received from the PHY on the DFI bus. + * Bit [23] = RESERVED + * Bit [22] = RESERVED + * Bit [21] = A parity error has been detected on the address/control bus on + * a registered DIMM. + * Bit [20] = The leveling operation has completed. + * Bit [19] = A read leveling gate training operation has been requested. + * Bit [18] = A read leveling operation has been requested. + * Bit [17] = A write leveling operation has been requested. + * Bit [16] = A DFI update error has occurred. Error information can be found + * in the UPDATE_ERROR_STATUS parameter. + * Bit [15] = A write leveling error has occurred. Error information can be + * found in the WRLVL_ERROR_STATUS parameter. + * Bit [14] = A read leveling gate training error has occurred. Error + * information can be found in the RDLVL_ERROR_STATUS parameter. + * Bit [13] = A read leveling error has occurred. Error information can be + * found in the RDLVL_ERROR_STATUS parameter. + * Bit [12] = The user has programmed an invalid setting associated with user + * words per burst. 
+ * Examples: + * Setting param_reduc when burst length = 2. + * A 1:2 MC:PHY clock ratio with burst length = 2. + * Bit [11] = A wrap cycle crossing a DRAM page has been detected. This is + * unsupported & may result in memory data corruption. + * Bit [10] = The BIST operation has been completed. + * Bit [9] = The low power operation has been completed. + * Bit [8] = The MC initialization has been completed. + * Bit [7] = An error occurred on the port command channel. + * Bit [6] = Multiple uncorrectable ECC events have been detected. + * Bit [5] = An uncorrectable ECC event has been detected. + * Bit [4] = Multiple correctable ECC events have been detected. + * Bit [3] = A correctable ECC event has been detected. + * Bit [2] = Multiple accesses outside the defined PHYSICAL memory space + * have occurred. + * Bit [1] = A memory access outside the defined PHYSICAL memory space + * has occurred. + * Bit [0] = The memory reset is valid on the DFI bus. + * + * Of these 1, 2, 3, 4, 5, 6, 7, 11, 13, 14, 15, 16, 21, 24, 25, and 30 + * are of our interest. + */ /* * MPR dump processing - overview. @@ -196,10 +203,10 @@ static atomic64_t mc_counter = ATOMIC_INIT(0); * one need to collect dumps for all available cs. Below given example * for two cs0/cs1. 
* - * CMEM MC cmmon_isr_sw cmmon_wq + * CMEM MC cmmon_isr_sw wq_alerts * | | | * | | | - * |ALERT_N - int_status bit [30] | + * |ALERT_N - int_status bit [30] or [21] | * |------------------>| | * | |schedule cmmon_wq | * | |------------------>| @@ -270,6 +277,19 @@ struct __packed cm_56xx_denali_ctl_00 #endif }; +struct __packed cm_56xx_denali_ctl_33 +{ +#ifdef CPU_BIG_ENDIAN + unsigned reserved : 6; + unsigned write : 1; + unsigned write_modereg : 25; +#else /* Little Endian */ + unsigned write_modereg : 25; + unsigned write : 1; + unsigned reserved : 6; +#endif +}; + /* Trigger MPR */ struct __packed cm_56xx_denali_ctl_34 { @@ -292,6 +312,24 @@ struct __packed cm_56xx_denali_ctl_34 #endif }; +/* + * this structure is the same for all registers(one definition used) + * cm_56xx_denali_ctl_45, cm_56xx_denali_ctl_48, + * cm_56xx_denali_ctl_53, cm_56xx_denali_ctl_56 + */ +struct __packed cm_56xx_denali_ctl_45 +{ +#ifdef CPU_BIG_ENDIAN + unsigned absolete1 : 6; + unsigned reserved : 7; + unsigned mrsingle_data_0 : 17; +#else /* Little Endian */ + unsigned mrsingle_data_0 : 17; + unsigned reserved : 7; + unsigned absolete1 : 6; +#endif +}; + #ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM #define CM_56XX_DENALI_CTL_62 0xf8 @@ -395,6 +433,7 @@ enum events { EV_PORT_ERROR, EV_WRAP_ERROR, EV_PARITY_ERROR, + EV_SEC_PARITY_ERROR, NR_EVENTS }; @@ -409,6 +448,7 @@ static char *block_name[] = { "port_error", "wrap_error", "parity_error", + "second_parity_error", "alert_n_cs0_dram0_ca_par_error", "alert_n_cs0_dram0_crc_error", "alert_n_cs0_dram1_ca_par_error", @@ -437,6 +477,7 @@ static const u32 event_mask[NR_EVENTS] = { [EV_PORT_ERROR] = INT_BIT_7, [EV_WRAP_ERROR] = INT_BIT_11, [EV_PARITY_ERROR] = INT_BIT_21, + [EV_SEC_PARITY_ERROR] = INT_BIT_23, }; static const struct event_logging { @@ -453,6 +494,7 @@ static const struct event_logging { [EV_PORT_ERROR] = {0, KERN_CRIT, "Port error"}, [EV_WRAP_ERROR] = {0, KERN_CRIT, "Wrap error"}, [EV_PARITY_ERROR] = {0, KERN_CRIT, "Parity error"}, + 
[EV_SEC_PARITY_ERROR] = {1, KERN_CRIT, "Second parity error"}, }; /* Private structure for common edac device */ @@ -513,28 +555,28 @@ static int setup_fault_injection(struct intel_edac_dev_info *dev_info, struct cm_56xx_denali_ctl_62 denali_ctl_62; if (ncr_read(dev_info->cm_region, - CM_56XX_DENALI_CTL_62, - 4, &denali_ctl_62)) - goto error_read; + CM_56XX_DENALI_CTL_62, + 4, &denali_ctl_62)) + goto error_read; denali_ctl_62.xor_check_bits = fault; if (ncr_write(dev_info->cm_region, - CM_56XX_DENALI_CTL_62, - 4, (u32 *) &denali_ctl_62)) - goto error_write; + CM_56XX_DENALI_CTL_62, + 4, (u32 *) &denali_ctl_62)) + goto error_write; if (ncr_read(dev_info->cm_region, - CM_56XX_DENALI_CTL_62, - 4, &denali_ctl_62)) - goto error_read; + CM_56XX_DENALI_CTL_62, + 4, &denali_ctl_62)) + goto error_read; denali_ctl_62.fwc = (enable > 0 ? 0x1 : 0x0); if (ncr_write(dev_info->cm_region, - CM_56XX_DENALI_CTL_62, - 4, (u32 *) &denali_ctl_62)) - goto error_write; + CM_56XX_DENALI_CTL_62, + 4, (u32 *) &denali_ctl_62)) + goto error_write; return 0; error_read: @@ -639,7 +681,7 @@ static struct edac_dev_sysfs_attribute device_block_attr[] = { { .attr = { .name = "mpr_page1", - .mode = (S_IRUGO | S_IWUSR) + .mode = (0644) }, .show = mpr1_dump_show, .store = NULL @@ -670,7 +712,7 @@ handle_events(struct intel_edac_dev_info *edac_dev, set_val = readl( edac_dev->axi2ser3_region + SYSCON_PERSIST_SCRATCH); - /* set bit 3 in pscratch reg */ + /* set bit 7 in pscratch reg */ set_val = set_val | CMEM_PERSIST_SCRATCH_BIT; writel(set_val, @@ -721,6 +763,48 @@ store_mpr_dump(struct intel_edac_dev_info *edac_dev, int cs) MAX_DQ * MPR_PAGE_BYTES); } +static int clear_ca_parity_error(struct intel_edac_dev_info *dev_info, int cs) +{ + + struct cm_56xx_denali_ctl_45 denali_ctl_45; + struct cm_56xx_denali_ctl_33 denali_ctl_33; + + if (ncr_read(dev_info->cm_region, + CM_56XX_DENALI_CTL_45 + 0xc + 0x20 * cs, + 4, (u32 *) &denali_ctl_45)) + goto error_read; + + /* + * Clear always as we can't get info 
about state change + * from denali_ctl_45, which means this check would be faulty!!! + * if (denali_ctl_45.mrsingle_data_0 & 0x10) + */ + denali_ctl_45.mrsingle_data_0 &= 0x3FFEF; /* clear A4 bit */ + denali_ctl_33.write = 1; /* write */ + denali_ctl_33.write_modereg = 0x800005; /* MR5 write */ + denali_ctl_33.write_modereg |= (cs << 8); /* chip select */ + + if (ncr_write(dev_info->cm_region, + CM_56XX_DENALI_CTL_45 + 0x20 * cs, + 4, (u32 *) &denali_ctl_45)) + goto error_write; + + if (ncr_write(dev_info->cm_region, + CM_56XX_DENALI_CTL_33, + 4, (u32 *) &denali_ctl_33)) + goto error_write; + return 0; + +error_write: + printk_ratelimited("%s: Write error when clearing ca parity in mr5\n", + dev_name(&dev_info->pdev->dev)); + return 1; +error_read: + printk_ratelimited("%s: Read error when clearing ca parity in mr5\n", + dev_name(&dev_info->pdev->dev)); + return 1; +} + static inline void __attribute__((always_inline)) update_alert_counters(struct intel_edac_dev_info *edac_dev, int cs) { @@ -739,8 +823,11 @@ update_alert_counters(struct intel_edac_dev_info *edac_dev, int cs) (u8 (*)[MPR_PAGE_BYTES]) (&edac_dev->data->mpr.dram_0_page[0]); int i; - for (i = 0; i < MAX_DQ; ++i) + for (i = 0; i < edac_dev->data->dram_count; ++i) { inc_alert_counter(edac_dev->data->alerts, cs, i, dram[i][3]); + trace_edac_cmc_dump_processed(edac_dev->cm_region >> 16, + cs, i, (int) dram[i][3]); + } } @@ -751,6 +838,9 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) unsigned long flags; u32 regval; int i; +#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM + u32 node = edac_dev->cm_region >> 16; +#endif mpr->mpr_page_id = page; @@ -761,11 +851,32 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) goto error_read; mpr->dram_0_page[i] = regval & 0xff; + +#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM + trace_edac_cmc_dump_collected(node, cs, i, 0, + (int) mpr->dram_0_page[i]); +#endif + mpr->dram_1_page[i] = ((regval & 0xff00) >> 8); +#ifdef 
CONFIG_DEBUG_EDAC_AXXIA_CMEM + trace_edac_cmc_dump_collected(node, cs, i, 1, + (int) mpr->dram_1_page[i]); +#endif if (edac_dev->data->dram_count == MAX_DQ) { mpr->dram_2_page[i] = ((regval & 0xff0000) >> 16); + +#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM + trace_edac_cmc_dump_collected(node, cs, i, 2, + (int) mpr->dram_2_page[i]); +#endif + mpr->dram_3_page[i] = ((regval & 0xff000000) >> 24); + +#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM + trace_edac_cmc_dump_collected(node, cs, i, 3, + (int) mpr->dram_3_page[i]); +#endif } } raw_spin_lock_irqsave(&edac_dev->data->mpr_data_lock, flags); @@ -773,6 +884,7 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) raw_spin_unlock_irqrestore(&edac_dev->data->mpr_data_lock, flags); update_alert_counters(edac_dev, cs); + clear_ca_parity_error(edac_dev, cs); return 0; error_read: @@ -804,13 +916,16 @@ cmmon_isr_sw(int interrupt, void *device) /* * NOTE: * ISR function is only reading int_status, and write into int_act - * registers. + * and int_mask registers (as well as rte config load) * - * - first handle critical events, which might require restart + * - first handles driver initialization if not configured in uboot, + * once initialized it mask irq bit 8 from raising interrupt + * as this bit must be acknowledged by rte, + * - second handle critical events, which might require restart * (handle_events) and then to the job outside isr - * - second collect MPR dump if any exists and then trigger new if + * - third collect MPR dump if any exists and then trigger new if * needed - all outside isr, - * - third wake up job outside isr to trigger mpr dump procedure when + * - finally wake up job outside isr to trigger mpr dump procedure when * ALERT_N reported (bit [30] is on) */ @@ -818,8 +933,12 @@ cmmon_isr_sw(int interrupt, void *device) 4, (u32 *) &denali_ctl_84)) goto error_read; - if (denali_ctl_84.int_status & INT_BIT_8) { - if (dev_info->is_controller_configured == 0) { + 
trace_edac_cmc_int_status(dev_info->cm_region >> 16, + denali_ctl_84.int_status); + + if (dev_info->is_controller_configured == 0) { + /* first init case */ + if (denali_ctl_84.int_status & INT_BIT_8) { ret = initialize(dev_info); if (ret) goto error_init; @@ -829,28 +948,46 @@ cmmon_isr_sw(int interrupt, void *device) goto error_init; dev_info->is_controller_configured = 1; - } - if (dev_info->is_ddr4) - denali_ctl_86.int_mask = CM_INT_MASK_FULL; - else - denali_ctl_86.int_mask = CM_INT_MASK_BASE; + denali_ctl_85.int_ack = INT_BIT_8; + if (ncr_write(dev_info->cm_region, + CM_56XX_DENALI_CTL_85, + 4, (u32 *) &denali_ctl_85)) + goto error_write; + + denali_ctl_86.int_mask = CM_INT_MASK_ALL; + if (ncr_write(dev_info->cm_region, + CM_56XX_DENALI_CTL_86, + 4, (u32 *) &denali_ctl_86)) + goto error_write; - if (ncr_write(dev_info->cm_region, - CM_56XX_DENALI_CTL_86, - 4, (u32 *) &denali_ctl_86)) { - goto error_write; } + /* + * SAFETY CHECK + * One cannot go further if driver is not fully functional!!! + */ return IRQ_HANDLED; - } - /* - * SAFETY CHECK - * one cannot go further if driver is not fully functional!!! 
- */ - if (dev_info->is_controller_configured == 0) - return IRQ_HANDLED; + } else { + /* reload config case */ + if (denali_ctl_84.int_status & INT_BIT_8) { + denali_ctl_85.int_ack = INT_BIT_8; + if (ncr_write(dev_info->cm_region, + CM_56XX_DENALI_CTL_85, + 4, (u32 *) &denali_ctl_85)) + goto error_write; + + denali_ctl_86.int_mask = CM_INT_MASK_ALL; + if (ncr_write(dev_info->cm_region, + CM_56XX_DENALI_CTL_86, + 4, (u32 *) &denali_ctl_86)) + goto error_write; + + + return IRQ_HANDLED; + } + } handle_events(dev_info, &denali_ctl_84); atomic_set(&dev_info->data->event_ready, 1); @@ -867,10 +1004,9 @@ cmmon_isr_sw(int interrupt, void *device) denali_ctl_85.int_ack |= INT_BIT_25; } - if (denali_ctl_84.int_status & INT_BIT_30) { + if (denali_ctl_84.int_status & (INT_BIT_30 | INT_BIT_21)) { atomic_inc(&dev_info->data->dump_in_progress); wake_up(&dev_info->data->dump_wq); - denali_ctl_85.int_ack |= INT_BIT_30; } } @@ -892,7 +1028,7 @@ error_init: printk_ratelimited("%s: Error during driver initialization\n", dev_name(&dev_info->pdev->dev)); uninitialize(dev_info, ret, - 0 == dev_info->is_controller_configured ? 1 : 0); + dev_info->is_controller_configured == 0 ? 
1 : 0); return IRQ_HANDLED; } @@ -909,9 +1045,9 @@ static void intel_cm_alerts_error_check(struct edac_device_ctl_info *edac_dev) start: /* keep hung up monitor happy 90 sec's */ - if (0 == wait_event_timeout(dev_info->data->dump_wq, + if (wait_event_timeout(dev_info->data->dump_wq, atomic_read(&dev_info->data->dump_in_progress), - msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD))) + msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0) goto start; if (dev_info->finish_alerts) @@ -940,13 +1076,15 @@ start: 4, (u32 *) &denali_ctl_34)) goto error_write; + trace_edac_cmc_dump_triggered(dev_info->cm_region >> 16, i); + /* wait */ ret = wait_event_timeout(dev_info->data->dump_wq, atomic_read(&dev_info->data->dump_ready), msecs_to_jiffies(1000)); if (dev_info->finish_alerts) goto finish; - if (0 == ret) + if (ret == 0) goto timeout_error; atomic_set(&dev_info->data->dump_ready, 0); @@ -1003,9 +1141,9 @@ static void intel_cm_events_error_check(struct edac_device_ctl_info *edac_dev) u32 counter; while (1) { - if (0 == wait_event_timeout(dev_info->data->event_wq, + if (wait_event_timeout(dev_info->data->event_wq, atomic_read(&dev_info->data->event_ready), - msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD))) + msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0) continue; atomic_set(&dev_info->data->event_ready, 0); @@ -1026,6 +1164,7 @@ static void intel_cm_events_error_check(struct edac_device_ctl_info *edac_dev) case EV_MULT_ILLEGAL: case EV_UNCORR_ECC: case EV_MULT_UNCORR_ECC: + case EV_SEC_PARITY_ERROR: edac_device_handle_multi_ue(edac_dev, 0, i, counter, edac_dev->ctl_name); @@ -1094,10 +1233,10 @@ static int get_active_dram(struct intel_edac_dev_info *dev_info) return dram; } - if (0 == denali_ctl_74.bank_diff) + if (denali_ctl_74.bank_diff == 0) dram = MAX_DQ/2; - if (1 == denali_ctl_74.bank_diff) + if (denali_ctl_74.bank_diff == 1) dram = MAX_DQ; return dram; @@ -1214,7 +1353,6 @@ static int initialize(struct intel_edac_dev_info *dev_info) pr_err("Could not get dram version. 
Is config loaded?\n"); return ERR_STAGE_1; } - /*dev_info->is_ddr4 = 1;*/ dev_info->finish_alerts = 0; dev_info->finish_events = 0; @@ -1237,6 +1375,7 @@ static int initialize(struct intel_edac_dev_info *dev_info) } dev_info->edac_dev->log_ce = 0; + instance = &dev_info->edac_dev->instances[0]; /* It just gives more descriptive name. */ @@ -1303,14 +1442,14 @@ static int enable_workers(struct intel_edac_dev_info *dev_info) atomic_set(&dev_info->data->event_ready, 0); atomic_set(&dev_info->data->dump_in_progress, 0); - dev_info->wq_events = alloc_workqueue("%s-events", WQ_MEM_RECLAIM, 1, + dev_info->wq_events = alloc_workqueue("%s-events", 0, 1, (dev_info->ctl_name)); if (!dev_info->wq_events) return ERR_STAGE_3; if (dev_info->is_ddr4) { dev_info->wq_alerts = - alloc_workqueue("%s-alerts", WQ_MEM_RECLAIM, 1, + alloc_workqueue("%s-alerts", 0, 1, (dev_info->ctl_name)); if (!dev_info->wq_alerts) return ERR_STAGE_4; @@ -1321,8 +1460,9 @@ static int enable_workers(struct intel_edac_dev_info *dev_info) INIT_WORK(&dev_info->offload_events, axxia_events_work); if (dev_info->is_ddr4) - queue_work(dev_info->wq_alerts, &dev_info->offload_alerts); - queue_work(dev_info->wq_events, &dev_info->offload_events); + queue_work_on(0, dev_info->wq_alerts, + &dev_info->offload_alerts); + queue_work_on(0, dev_info->wq_events, &dev_info->offload_events); return 0; } @@ -1331,6 +1471,7 @@ static int enable_driver_irq(struct intel_edac_dev_info *dev_info) { int irq = -1, rc = 0; struct cm_56xx_denali_ctl_86 denali_ctl_86; + struct irq_desc *desc; snprintf(&dev_info->data->irq_name[0], IRQ_NAME_LEN, "%s-mon", dev_info->ctl_name); @@ -1381,6 +1522,10 @@ static int enable_driver_irq(struct intel_edac_dev_info *dev_info) return ERR_STAGE_6; } + + desc = irq_to_desc(irq); + sched_setaffinity(desc->action->thread->pid, &only_cpu_0); + return 0; } @@ -1397,8 +1542,8 @@ axxia_cmem_read(struct file *filp, char *buffer, size_t length, loff_t *offset) if (*offset > 0) return 0; - buf = 
kmalloc(PAGE_SIZE, __GFP_WAIT); - if (NULL == buf) + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (buf == NULL) goto no_mem_buffer; mutex_lock(&dev_info->state_machine_lock); @@ -1443,8 +1588,8 @@ axxia_cmem_write(struct file *file, const char __user *buffer, struct intel_edac_dev_info *dev_info = (struct intel_edac_dev_info *) file->private_data; - buf = kmalloc(count + 1, __GFP_WAIT); - if (NULL == buf) + buf = kmalloc(count + 1, GFP_KERNEL); + if (buf == NULL) goto no_mem_buffer; memset(buf, 0, count + 1); @@ -1606,12 +1751,12 @@ static int intel_edac_mc_probe(struct platform_device *pdev) if (denali_ctl_00.start == 1) { /* uboot has configured CMEM */ - if (0xa == denali_ctl_00.dram_class) { + if (denali_ctl_00.dram_class == 0xa) { pr_info("%s supports mpr dump (DDR4).\n", dev_info->ctl_name); dev_info->is_ddr4 = 1; } - if (0x6 == denali_ctl_00.dram_class) { + if (denali_ctl_00.dram_class == 0x6) { pr_info("%s doesn't support mpr dump (DDR3).\n", dev_info->ctl_name); } @@ -1656,7 +1801,7 @@ static int intel_edac_mc_probe(struct platform_device *pdev) /* each instance shall know each private data */ dev_info->dir_entry = - proc_create_data(dev_info->proc_name, S_IWUSR, + proc_create_data(dev_info->proc_name, 0200, NULL, &axxia_edac_cmem_proc_ops, dev_info); @@ -1671,7 +1816,7 @@ static int intel_edac_mc_probe(struct platform_device *pdev) err_uninit: uninitialize(dev_info, ret, - 0 == dev_info->is_controller_configured ? 1 : 0); + dev_info->is_controller_configured == 0 ? 1 : 0); err_init: mutex_destroy(&dev_info->data->edac_sysfs_data_lock); mutex_destroy(&dev_info->state_machine_lock); @@ -1698,7 +1843,7 @@ static int intel_edac_mc_remove(struct platform_device *pdev) #endif uninitialize(dev_info, ERR_STAGE_8, - 0 == dev_info->is_controller_configured ? 1 : 0); + dev_info->is_controller_configured == 0 ? 
1 : 0); if (dev_info->edac_dev != NULL) { edac_device_del_device(&dev_info->pdev->dev); diff --git a/drivers/edac/axxia_edac-l2_cpu_56xx.c b/drivers/edac/axxia_edac-l2_cpu_56xx.c index a9400e8..a183c65 100644 --- a/drivers/edac/axxia_edac-l2_cpu_56xx.c +++ b/drivers/edac/axxia_edac-l2_cpu_56xx.c @@ -1,7 +1,7 @@ /* * drivers/edac/axxia_edac-l2_cpu_56xx.c * - * EDAC Driver for Intel's Axxia 5600 System Memory Controller + * EDAC Driver for Intel's Axxia 5600/6700 System Memory Controller * * Copyright (C) 2016 Intel Inc. * @@ -291,6 +291,7 @@ static const struct of_device_id intel_edac_l2_match[] = { }, #endif + {}, }; diff --git a/drivers/edac/axxia_edac-l3_56xx.c b/drivers/edac/axxia_edac-l3_56xx.c index 32427c6..b630031 100644 --- a/drivers/edac/axxia_edac-l3_56xx.c +++ b/drivers/edac/axxia_edac-l3_56xx.c @@ -21,6 +21,8 @@ #include <linux/of_platform.h> #include <linux/of.h> #include <linux/of_address.h> +#include <linux/of_irq.h> +#include <linux/irq.h> #include <linux/platform_device.h> #include <linux/reboot.h> #include <linux/mfd/syscon.h> @@ -77,6 +79,8 @@ #define CCN_NODE_ERR_SYND_REG1 0x408 #define CCN_NODE_ERR_SYND_CLR 0x480 +static cpumask_t only_cpu_0 = { CPU_BITS_CPU0}; + union dickens_hnf_err_syndrome_reg0 { struct __packed { #ifdef CPU_BIG_ENDIAN @@ -250,17 +254,16 @@ static irqreturn_t ccn_irq_thread(int irq, void *device) return IRQ_HANDLED; } -static irqreturn_t ccn_irq_handler(int irq, void *device) +static irqreturn_t collect_and_clean(struct intel_edac_dev_info *dev_info, + int report_error) { - struct intel_edac_dev_info *dev_info = device; void __iomem *ccn_base = dev_info->dickens_L3; - - irqreturn_t res = IRQ_NONE; u64 err_sig_val[3]; u64 err_type_value[4]; u64 err_or; u64 err_synd_reg0 = 0, err_synd_reg1 = 0; int i; + irqreturn_t res = IRQ_NONE; /* PMU overflow is a special case - for the future */ err_or = err_sig_val[0] = readq(ccn_base + CCN_MN_ERR_SIG_VAL_63_0); @@ -351,11 +354,21 @@ static irqreturn_t ccn_irq_handler(int irq, void 
*device) } } - if (err_or) + if (err_or && report_error) dev_err(&dev_info->pdev->dev, "Error reported in %016llx %016llx %016llx.\n", err_sig_val[2], err_sig_val[1], err_sig_val[0]); + return res; +} + +static irqreturn_t ccn_irq_handler(int irq, void *device) +{ + struct intel_edac_dev_info *dev_info = device; + irqreturn_t res = IRQ_NONE; + + res = collect_and_clean(dev_info, 1); + /* HERE all error data collected, but interrupt not deasserted */ return IRQ_WAKE_THREAD; } @@ -414,8 +427,8 @@ static int intel_edac_l3_probe(struct platform_device *pdev) struct intel_edac_dev_info *dev_info = NULL; struct device_node *np = pdev->dev.of_node; struct resource *r; - struct arm_smccc_res ret; + struct irq_desc *desc; dev_info = devm_kzalloc(&pdev->dev, sizeof(*dev_info), GFP_KERNEL); if (!dev_info) @@ -457,6 +470,7 @@ static int intel_edac_l3_probe(struct platform_device *pdev) } dev_info->edac_dev->log_ce = 0; + r = platform_get_resource(pdev, IORESOURCE_IRQ, 0); if (!r) return -EINVAL; @@ -467,10 +481,21 @@ static int intel_edac_l3_probe(struct platform_device *pdev) * Once -1 return, it means old uboot without ccn service. * Then only polling mechanism is allowed, as it was before. 
*/ - if (ARM_SMCCC_UNKNOWN != __arm_smccc_smc(0xc4000027, - CCN_MN_ERRINT_STATUS__PMU_EVENTS__DISABLE, - 0, 0, &ret)) + __arm_smccc_smc(0xc4000027, CCN_MN_ERRINT_STATUS__PMU_EVENTS__DISABLE, + 0, 0, &ret); + trace_edacl3_smc_results(&ret); + + if (ret.a0 != ARM_SMCCC_UNKNOWN) { + irqreturn_t res; + dev_info->irq_used = 1; + /* clear all error from earlier boot stage */ + res = collect_and_clean(dev_info, 0); + __arm_smccc_smc(0xc4000027, + CCN_MN_ERRINT_STATUS__INTREQ__DESSERT, + 0, 0, &ret); + trace_edacl3_smc_results(&ret); + } dev_info->edac_dev->pvt_info = dev_info; dev_info->edac_dev->dev = &dev_info->pdev->dev; @@ -499,6 +524,9 @@ static int intel_edac_l3_probe(struct platform_device *pdev) goto err2; } + desc = irq_to_desc(r->start); + sched_setaffinity(desc->action->thread->pid, &only_cpu_0); + return 0; err2: edac_device_free_ctl_info(dev_info->edac_dev); diff --git a/drivers/edac/axxia_edac-mc_56xx.c b/drivers/edac/axxia_edac-mc_56xx.c index 00f3462..fba04d1 100644 --- a/drivers/edac/axxia_edac-mc_56xx.c +++ b/drivers/edac/axxia_edac-mc_56xx.c @@ -1,7 +1,7 @@ /* * drivers/edac/axxia_edac-mc.c * - * EDAC Driver for Intel's Axxia 5600 System Memory Controller + * EDAC Driver for Intel's Axxia 5600/6700 System Memory Controller * * Copyright (C) 2016 Intel Inc. 
* @@ -29,6 +29,9 @@ #include "edac_core.h" #include "edac_module.h" +#define CREATE_TRACE_POINTS +#include <trace/events/edac_mc.h> + #define FMT "%s: syscon lookup failed hence using hardcoded register address\n" #define MPR_FMT9 "\n%3d %#010x %#010x %#010x %#010x"\ @@ -91,75 +94,99 @@ #define MPR_PAGE_BYTES 4 #define MPR_ERRORS 2 /* CRC, CA Parity error */ -#define SM_INT_MASK_LOW (0xfbbfef01) +#define INT_BIT_0 (0x00000001) +#define INT_BIT_1 (0x00000002) +#define INT_BIT_2 (0x00000004) +#define INT_BIT_3 (0x00000008) +#define INT_BIT_4 (0x00000010) +#define INT_BIT_5 (0x00000020) +#define INT_BIT_6 (0x00000040) +#define INT_BIT_7 (0x00000080) +#define INT_BIT_12 (0x00001000) +#define INT_BIT_22 (0x00400000) +#define INT_BIT_24 (0x01000000) +#define INT_BIT_26 (0x04000000) + + +#define SM_INT_MASK_LOW (~(\ + INT_BIT_1 |\ + INT_BIT_2 |\ + INT_BIT_3 |\ + INT_BIT_4 |\ + INT_BIT_5 |\ + INT_BIT_6 |\ + INT_BIT_7 |\ + INT_BIT_12 |\ + INT_BIT_22 |\ + INT_BIT_24 |\ + INT_BIT_26)) + #define SM_INT_MASK_ALL_LOW (0xffffffff) -#define SM_INT_MASK_HIGH (0x1) -#define SM_INT_MASK_ALL_HIGH (0x7) +#define SM_INT_MASK_HIGH (INT_BIT_0) +#define SM_INT_MASK_ALL_HIGH (INT_BIT_0|INT_BIT_1|INT_BIT_2) #define ALIVE_NOTIFICATION_PERIOD (90*1000) -static int log = 1; -module_param(log, int, S_IRUGO|S_IWUSR); -MODULE_PARM_DESC(log, "Log each error to kernel log."); +static cpumask_t only_cpu_0 = { CPU_BITS_CPU0}; static int force_restart = 1; -module_param(force_restart, int, S_IRUGO|S_IWUSR); +module_param(force_restart, int, 0644); MODULE_PARM_DESC(force_restart, "Machine restart on fatal error."); static atomic64_t mc_counter = ATOMIC_INIT(0); /* - Bit [34] = Logical OR of all lower bits. - Bit [33] = A CRC error occurred on the write data bus. - Bit [32] = The software-initiated control word write has completed. - Bit [31] = The user-initiated DLL resync has completed. - Bit [30] = A state change has been detected on the - dfi_init_complete signal after initialization. 
- Bit [29] = The assertion of the INHIBIT_DRAM_CMD parameter has - successfully inhibited the command queue. - Bit [28] = The register interface-initiated mode register write has - completed and another mode register write may be issued. - Bit [27] = A Low Power Interface (LPI) timeout error has occurred. - Bit [26] = MPR read command, initiated with a software MPR_READ request, - is complete. - Bit [25] = Error received from the PHY on the DFI bus. - Bit [24] = RESERVED - Bit [23] = RESERVED - Bit [22] = A parity error has been detected on the address/control bus - on a registered DIMM. - Bit [21] = The leveling operation has completed. - Bit [20] = A read leveling gate training operation has been requested. - Bit [19] = A read leveling operation has been requested. - Bit [18] = A write leveling operation has been requested. - Bit [17] = A DFI update error has occurred. Error information can be - found in the UPDATE_ERROR_STATUS parameter. - Bit [16] = A write leveling error has occurred. Error information can - be found in the WRLVL_ERROR_STATUS parameter. - Bit [15] = A read leveling gate training error has occurred. Error - information can be found in the RDLVL_ERROR_STATUS parameter. - Bit [14] = A read leveling error has occurred. Error information can be - found in the RDLVL_ERROR_STATUS parameter. - Bit [13] = The user has programmed an invalid setting associated with - user words per burst. - Examples: Setting param_reduc when burst length = 2. A 1:2 - MC:PHY clock ratio with burst length = 2. - Bit [12] = A wrap cycle crossing a DRAM page has been detected. This - is unsupported & may result in memory data corruption. - Bit [11] = A write was attempted to a writeprotected region. - Bit [10] = The BIST operation has been completed. - Bit [9] = The low power operation has been completed. - Bit [8] = The MC initialization has been completed. - Bit [7] = An error occurred on the port command channel. 
- Bit [6] = Multiple uncorrectable ECC events have been detected. - Bit [5] = An uncorrectable ECC event has been detected. - Bit [4] = Multiple correctable ECC events have been detected. - Bit [3] = A correctable ECC event has been detected. - Bit [2] = Multiple accesses outside the defined PHYSICAL memory space - have occurred. - Bit [1] = A memory access outside the defined PHYSICAL memory space - has occurred. - Bit [0] = The memory reset is valid on the DFI bus. - - Of these 1, 2, 3, 4, 5, 6, 7, 12, 22 and 26 are of interest. -*/ + * Bit [34] = Logical OR of all lower bits. + * Bit [33] = A CRC error occurred on the write data bus. + * Bit [32] = The software-initiated control word write has completed. + * Bit [31] = The user-initiated DLL resync has completed. + * Bit [30] = A state change has been detected on the + * dfi_init_complete signal after initialization. + * Bit [29] = The assertion of the INHIBIT_DRAM_CMD parameter has + * successfully inhibited the command queue. + * Bit [28] = The register interface-initiated mode register write has + * completed and another mode register write may be issued. + * Bit [27] = A Low Power Interface (LPI) timeout error has occurred. + * Bit [26] = MPR read command, initiated with a software MPR_READ request, + * is complete. + * Bit [25] = Error received from the PHY on the DFI bus. + * Bit [24] = RESERVED + * Bit [23] = RESERVED + * Bit [22] = A parity error has been detected on the address/control bus + * on a registered DIMM. + * Bit [21] = The leveling operation has completed. + * Bit [20] = A read leveling gate training operation has been requested. + * Bit [19] = A read leveling operation has been requested. + * Bit [18] = A write leveling operation has been requested. + * Bit [17] = A DFI update error has occurred. Error information can be + * found in the UPDATE_ERROR_STATUS parameter. + * Bit [16] = A write leveling error has occurred. Error information can + * be found in the WRLVL_ERROR_STATUS parameter. 
+ * Bit [15] = A read leveling gate training error has occurred. Error + * information can be found in the RDLVL_ERROR_STATUS parameter. + * Bit [14] = A read leveling error has occurred. Error information can be + * found in the RDLVL_ERROR_STATUS parameter. + * Bit [13] = The user has programmed an invalid setting associated with + * user words per burst. + * Examples: Setting param_reduc when burst length = 2. A 1:2 + * MC:PHY clock ratio with burst length = 2. + * Bit [12] = A wrap cycle crossing a DRAM page has been detected. This + * is unsupported & may result in memory data corruption. + * Bit [11] = A write was attempted to a write-protected region. + * Bit [10] = The BIST operation has been completed. + * Bit [9] = The low power operation has been completed. + * Bit [8] = The MC initialization has been completed. + * Bit [7] = An error occurred on the port command channel. + * Bit [6] = Multiple uncorrectable ECC events have been detected. + * Bit [5] = An uncorrectable ECC event has been detected. + * Bit [4] = Multiple correctable ECC events have been detected. + * Bit [3] = A correctable ECC event has been detected. + * Bit [2] = Multiple accesses outside the defined PHYSICAL memory space + * have occurred. + * Bit [1] = A memory access outside the defined PHYSICAL memory space + * has occurred. + * Bit [0] = The memory reset is valid on the DFI bus. + * + * Of these 1, 2, 3, 4, 5, 6, 7, 12, 22, 24 and 26 are of interest. + */ /* * MPR dump processing - overview. 
@@ -559,15 +586,15 @@ static char *block_name[] = { static const u32 event_mask[NR_EVENTS] = { - [EV_ILLEGAL] = 0x00000002, - [EV_MULT_ILLEGAL] = 0x00000004, - [EV_CORR_ECC] = 0x00000008, - [EV_MULT_CORR_ECC] = 0x00000010, - [EV_UNCORR_ECC] = 0x00000020, - [EV_MULT_UNCORR_ECC] = 0x00000040, - [EV_PORT_ERROR] = 0x00000080, - [EV_WRAP_ERROR] = 0x00001000, - [EV_PARITY_ERROR] = 0x00400000, + [EV_ILLEGAL] = INT_BIT_1, + [EV_MULT_ILLEGAL] = INT_BIT_2, + [EV_CORR_ECC] = INT_BIT_3, + [EV_MULT_CORR_ECC] = INT_BIT_4, + [EV_UNCORR_ECC] = INT_BIT_5, + [EV_MULT_UNCORR_ECC] = INT_BIT_6, + [EV_PORT_ERROR] = INT_BIT_7, + [EV_WRAP_ERROR] = INT_BIT_12, + [EV_PARITY_ERROR] = INT_BIT_22, }; static const struct event_logging { @@ -735,7 +762,7 @@ static struct edac_dev_sysfs_attribute device_block_attr[] = { { .attr = { .name = "mpr_page1", - .mode = (S_IRUGO | S_IWUSR) + .mode = (0644) }, .show = mpr1_dump_show, .store = NULL}, @@ -833,8 +860,11 @@ update_alert_counters(struct intel_edac_dev_info *edac_dev, int cs) (u8 (*)[MPR_PAGE_BYTES]) (&edac_dev->data->mpr.dram_0_page[0]); int i; - for (i = 0; i < MAX_DQ; ++i) + for (i = 0; i < edac_dev->data->dram_count; ++i) { inc_alert_counter(edac_dev->data->alerts, cs, i, dram[i][3]); + trace_edac_mc_dump_processed(edac_dev->sm_region >> 16, + cs, i, (int) dram[i][3]); + } } @@ -845,6 +875,9 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) unsigned long flags; u32 regval; int i; +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + u32 node = edac_dev->sm_region >> 16; +#endif mpr->mpr_page_id = page; @@ -855,9 +888,25 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) goto error_read; mpr->dram_0_page[i] = regval & 0xff; +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 0, + (int) mpr->dram_0_page[i]); +#endif mpr->dram_1_page[i] = ((regval & 0xff00) >> 8); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 1, + (int) mpr->dram_1_page[i]); 
+#endif mpr->dram_2_page[i] = ((regval & 0xff0000) >> 16); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 2, + (int) mpr->dram_2_page[i]); +#endif mpr->dram_3_page[i] = ((regval & 0xff000000) >> 24); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 3, + (int) mpr->dram_3_page[i]); +#endif if (ncr_read(edac_dev->sm_region, (SM_56XX_DENALI_CTL_59 + (0x14 * i)), @@ -865,9 +914,25 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) goto error_read; mpr->dram_4_page[i] = regval & 0xff; +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 4, + (int) mpr->dram_4_page[i]); +#endif mpr->dram_5_page[i] = ((regval & 0xff00) >> 8); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 5, + (int) mpr->dram_5_page[i]); +#endif mpr->dram_6_page[i] = ((regval & 0xff0000) >> 16); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 6, + (int) mpr->dram_6_page[i]); +#endif mpr->dram_7_page[i] = ((regval & 0xff000000) >> 24); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 7, + (int) mpr->dram_7_page[i]); +#endif if (ncr_read(edac_dev->sm_region, (SM_56XX_DENALI_CTL_60 + (0x14 * i)), @@ -875,11 +940,27 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) goto error_read; mpr->dram_8_page[i] = regval & 0xff; +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 8, + (int) mpr->dram_8_page[i]); +#endif if (edac_dev->data->dram_count == MAX_DQ) { mpr->dram_9_page[i] = ((regval & 0xff00) >> 8); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 9, + (int) mpr->dram_9_page[i]); +#endif mpr->dram_10_page[i] = ((regval & 0xff0000) >> 16); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 10, + (int) mpr->dram_10_page[i]); +#endif mpr->dram_11_page[i] = ((regval & 
0xff000000) >> 24); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 11, + (int) mpr->dram_11_page[i]); +#endif if (ncr_read(edac_dev->sm_region, (SM_56XX_DENALI_CTL_60 + @@ -887,9 +968,25 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) goto error_read; mpr->dram_12_page[i] = regval & 0xff; +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 12, + (int) mpr->dram_12_page[i]); +#endif mpr->dram_13_page[i] = ((regval & 0xff00) >> 8); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 13, + (int) mpr->dram_13_page[i]); +#endif mpr->dram_14_page[i] = ((regval & 0xff0000) >> 16); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 14, + (int) mpr->dram_14_page[i]); +#endif mpr->dram_15_page[i] = ((regval & 0xff000000) >> 24); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 15, + (int) mpr->dram_15_page[i]); +#endif if (ncr_read(edac_dev->sm_region, (SM_56XX_DENALI_CTL_61 + @@ -897,7 +994,15 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs) goto error_read; mpr->dram_16_page[i] = regval & 0xff; +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 16, + (int) mpr->dram_16_page[i]); +#endif mpr->dram_17_page[i] = ((regval & 0xff00) >> 8); +#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM + trace_edac_mc_dump_collected(node, cs, i, 17, + (int) mpr->dram_17_page[i]); +#endif } } raw_spin_lock_irqsave(&edac_dev->data->mpr_data_lock, flags); @@ -945,24 +1050,30 @@ smmon_isr_sw(int interrupt, void *device) 4, (u32 *) &denali_ctl_367)) goto error_read; - if (denali_ctl_367.int_status & 0x4) { + trace_edac_mc_int_status(dev_info->sm_region >> 16, 0, + denali_ctl_367.int_status); + + if (denali_ctl_367.int_status & INT_BIT_2) { if (ncr_read(dev_info->sm_region, SM_56XX_DENALI_CTL_366, 4, (u32 *) &denali_ctl_366)) goto error_read; + 
trace_edac_mc_int_status(dev_info->sm_region >> 16, 1, + denali_ctl_366.int_status); + handle_events(dev_info, &denali_ctl_366); atomic_set(&dev_info->data->event_ready, 1); wake_up(&dev_info->data->event_wq); denali_ctl_368.int_ack = - (denali_ctl_366.int_status & 0xf8ffffff); + (denali_ctl_366.int_status & (~(INT_BIT_26))); if (dev_info->is_ddr4) { - if (denali_ctl_366.int_status & 0x4000000) { + if (denali_ctl_366.int_status & INT_BIT_26) { atomic_set(&dev_info->data->dump_ready, 1); wake_up(&dev_info->data->dump_wq); - denali_ctl_368.int_ack |= 0x4000000; + denali_ctl_368.int_ack |= INT_BIT_26; } } if (ncr_write(dev_info->sm_region, SM_56XX_DENALI_CTL_368, @@ -970,12 +1081,12 @@ smmon_isr_sw(int interrupt, void *device) goto error_write; } - if (denali_ctl_367.int_status & 0x2) { + if (denali_ctl_367.int_status & INT_BIT_1) { if (dev_info->is_ddr4) { atomic_inc(&dev_info->data->dump_in_progress); wake_up(&dev_info->data->dump_wq); } - denali_ctl_369.int_ack = 0x2; + denali_ctl_369.int_ack = INT_BIT_1; if (ncr_write(dev_info->sm_region, SM_56XX_DENALI_CTL_369, 4, (u32 *) &denali_ctl_369)) goto error_write; @@ -1006,9 +1117,9 @@ static void intel_sm_alerts_error_check(struct edac_device_ctl_info *edac_dev) start: /* keep hung up monitor happy 90 sec's */ - if (0 == wait_event_timeout(dev_info->data->dump_wq, + if (wait_event_timeout(dev_info->data->dump_wq, atomic_read(&dev_info->data->dump_in_progress), - msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD))) + msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0) goto start; if (dev_info->finish_alerts) @@ -1036,6 +1147,9 @@ start: SM_56XX_DENALI_CTL_57, 4, (u32 *) &denali_ctl_57)) goto error_write; + + trace_edac_mc_dump_triggered(dev_info->sm_region >> 16, i); + /* wait */ wait_event(dev_info->data->dump_wq, atomic_read(&dev_info->data->dump_ready)); @@ -1091,9 +1205,9 @@ static void intel_sm_events_error_check(struct edac_device_ctl_info *edac_dev) u32 counter; while (1) { - if (0 == 
wait_event_timeout(dev_info->data->event_wq, + if (wait_event_timeout(dev_info->data->event_wq, atomic_read(&dev_info->data->event_ready), - msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD))) + msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0) continue; atomic_set(&dev_info->data->event_ready, 0); @@ -1221,6 +1335,7 @@ static int intel_edac_mc_probe(struct platform_device *pdev) struct sm_56xx_denali_ctl_371 denali_ctl_371; int cs_count = MAX_CS; int dram_count = MAX_DQ; + struct irq_desc *desc; count = atomic64_inc_return(&mc_counter); if ((count - 1) == MEMORY_CONTROLLERS) @@ -1342,6 +1457,7 @@ static int intel_edac_mc_probe(struct platform_device *pdev) } dev_info->edac_dev->log_ce = 0; + instance = &dev_info->edac_dev->instances[0]; /* It just gives more descriptive name. */ @@ -1401,14 +1517,14 @@ static int intel_edac_mc_probe(struct platform_device *pdev) "%s-mon", dev_info->ctl_name); dev_info->wq_events = - alloc_workqueue("%s-events", WQ_MEM_RECLAIM, 1, + alloc_workqueue("%s-events", 0, 1, (dev_info->ctl_name)); if (!dev_info->wq_events) goto err_nosysfs; if (dev_info->is_ddr4) { dev_info->wq_alerts = - alloc_workqueue("%s-alerts", WQ_MEM_RECLAIM, 1, + alloc_workqueue("%s-alerts", 0, 1, (dev_info->ctl_name)); if (!dev_info->wq_alerts) @@ -1420,8 +1536,9 @@ static int intel_edac_mc_probe(struct platform_device *pdev) INIT_WORK(&dev_info->offload_events, axxia_events_work); if (dev_info->is_ddr4) - queue_work(dev_info->wq_alerts, &dev_info->offload_alerts); - queue_work(dev_info->wq_events, &dev_info->offload_events); + queue_work_on(0, dev_info->wq_alerts, + &dev_info->offload_alerts); + queue_work_on(0, dev_info->wq_events, &dev_info->offload_events); irq = platform_get_irq(pdev, 0); if (irq < 0) { @@ -1435,8 +1552,7 @@ static int intel_edac_mc_probe(struct platform_device *pdev) if (dev_info->is_ddr4) denali_ctl_370.int_mask = SM_INT_MASK_LOW; else - denali_ctl_370.int_mask = SM_INT_MASK_LOW | - 0x04000000; + denali_ctl_370.int_mask = SM_INT_MASK_LOW | 
INT_BIT_26; if (ncr_write(dev_info->sm_region, SM_56XX_DENALI_CTL_370, 4, (u32 *) &denali_ctl_370)) { @@ -1487,6 +1603,9 @@ static int intel_edac_mc_probe(struct platform_device *pdev) } goto err_noirq; } + desc = irq_to_desc(irq); + sched_setaffinity(desc->action->thread->pid, &only_cpu_0); + return 0; err_noirq: diff --git a/include/trace/events/edac_cmc.h b/include/trace/events/edac_cmc.h new file mode 100644 index 0000000..143d58f --- /dev/null +++ b/include/trace/events/edac_cmc.h @@ -0,0 +1,101 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM edac_cmc + +#if !defined(_TRACE_EDAC_CMC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EDAC_CMC_H + +#include <linux/types.h> +#include <linux/tracepoint.h> + + +TRACE_EVENT(edac_cmc_int_status, + TP_PROTO(u32 node, u32 int_status), + + TP_ARGS(node, int_status), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, int_status) + ), + + TP_fast_assign( + __entry->node = node; + __entry->int_status = int_status; + ), + + TP_printk("CMEM(node=0x%x) int_status=0x%08x", + (u32) __entry->node, (u32) __entry->int_status) +); + +TRACE_EVENT(edac_cmc_dump_processed, + TP_PROTO(u32 node, u32 cs, u32 dram, u32 val), + + TP_ARGS(node, cs, dram, val), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, cs) + __field(u32, dram) + __field(u32, val) + ), + + TP_fast_assign( + __entry->node = node; + __entry->cs = cs; + __entry->dram = dram; + __entry->val = val; + ), + + TP_printk("CMEM(node=0x%x) cs=%d dram=%d value=0x%x", + (u32) __entry->node, (u32) __entry->cs, + (u32) __entry->dram, (u32) __entry->val) +); + +TRACE_EVENT(edac_cmc_dump_collected, + TP_PROTO(u32 node, u32 cs, u32 byte, u32 dram, u32 val), + + TP_ARGS(node, cs, byte, dram, val), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, cs) + __field(u32, byte) + __field(u32, dram) + __field(u32, val) + ), + + TP_fast_assign( + __entry->node = node; + __entry->cs = cs; + __entry->byte = byte; + __entry->dram = dram; + __entry->val = val; + ), + + 
TP_printk("CMEM(node=0x%x) cs=%d dram_%d_page[byte=%d]=0x%x", + (u32) __entry->node, (u32) __entry->cs, (u32) __entry->dram, + (u32) __entry->byte, (u32) __entry->val) +); + +TRACE_EVENT(edac_cmc_dump_triggered, + TP_PROTO(u32 node, u32 cs), + + TP_ARGS(node, cs), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, cs) + ), + + TP_fast_assign( + __entry->node = node; + __entry->cs = cs; + ), + + TP_printk("CMEM(node=0x%x) cs=%d", + (u32) __entry->node, (u32) __entry->cs) +); +#endif /* _TRACE_EDAC_CMC_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/trace/events/edac_mc.h b/include/trace/events/edac_mc.h new file mode 100644 index 0000000..fa09fda --- /dev/null +++ b/include/trace/events/edac_mc.h @@ -0,0 +1,104 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM edac_mc + +#if !defined(_TRACE_EDAC_MC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EDAC_MC_H + +#include <linux/types.h> +#include <linux/tracepoint.h> + + +TRACE_EVENT(edac_mc_int_status, + TP_PROTO(u32 node, u32 idx, u32 int_status), + + TP_ARGS(node, idx, int_status), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, idx) + __field(u32, int_status) + ), + + TP_fast_assign( + __entry->node = node; + __entry->idx = idx; + __entry->int_status = int_status; + ), + + TP_printk("SMEM(node=0x%x) int_status[%d]=0x%08x", + (u32) __entry->node, (u32) __entry->idx, + (u32) __entry->int_status) +); + +TRACE_EVENT(edac_mc_dump_processed, + TP_PROTO(u32 node, u32 cs, u32 dram, u32 val), + + TP_ARGS(node, cs, dram, val), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, cs) + __field(u32, dram) + __field(u32, val) + ), + + TP_fast_assign( + __entry->node = node; + __entry->cs = cs; + __entry->dram = dram; + __entry->val = val; + ), + + TP_printk("SMEM(node=0x%x) cs=%d dram=%d value=0x%x", + (u32) __entry->node, (u32) __entry->cs, + (u32) __entry->dram, (u32) __entry->val) +); + +TRACE_EVENT(edac_mc_dump_collected, + TP_PROTO(u32 node, 
u32 cs, u32 byte, u32 dram, u32 val), + + TP_ARGS(node, cs, byte, dram, val), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, cs) + __field(u32, byte) + __field(u32, dram) + __field(u32, val) + ), + + TP_fast_assign( + __entry->node = node; + __entry->cs = cs; + __entry->byte = byte; + __entry->dram = dram; + __entry->val = val; + ), + + TP_printk("SMEM(node=0x%x) cs=%d dram_%d_page[byte=%d]=0x%x", + (u32) __entry->node, (u32) __entry->cs, (u32) __entry->dram, + (u32) __entry->byte, (u32) __entry->val) +); + +TRACE_EVENT(edac_mc_dump_triggered, + TP_PROTO(u32 node, u32 cs), + + TP_ARGS(node, cs), + + TP_STRUCT__entry( + __field(u32, node) + __field(u32, cs) + ), + + TP_fast_assign( + __entry->node = node; + __entry->cs = cs; + ), + + TP_printk("SMEM(node=0x%x) cs=%d", + (u32) __entry->node, (u32) __entry->cs) +); +#endif /* _TRACE_EDAC_MC_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> -- 2.7.4 -- _______________________________________________ linux-yocto mailing list linux-yocto@yoctoproject.org https://lists.yoctoproject.org/listinfo/linux-yocto