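From: Marek Majtyka <marekx.majt...@intel.com> Changes: - Fatal (uncorrectable) errors are now handled first, before correctable errors are counted. - Counter calculation fixed: the L1/L2 EDAC error-count fields were extracted with shifts of 31/39 instead of 32/40. - Added new functions, edac_device_handle_multi_ce() and edac_device_handle_multi_ue(), which record multiple errors in a single call. - Checkpatch fixes. - Tracepoints added (for L1/L2 debugging only), exposed under /sys/kernel/debug/tracing/events/edac.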
Signed-off-by: Marek Majtyka <marekx.majt...@intel.com> --- drivers/edac/axxia_edac-cmc_56xx.c | 27 +++++---- drivers/edac/axxia_edac-l2_cpu_56xx.c | 66 +++++++++++---------- drivers/edac/axxia_edac-l3_56xx.c | 16 +++--- drivers/edac/axxia_edac-mc_56xx.c | 27 ++++----- drivers/edac/edac_core.h | 17 ++++-- drivers/edac/edac_device.c | 105 ++++++++++++++++++++++++++++++++-- include/trace/events/edac.h | 77 +++++++++++++++++++++++++ 7 files changed, 263 insertions(+), 72 deletions(-) create mode 100644 include/trace/events/edac.h diff --git a/drivers/edac/axxia_edac-cmc_56xx.c b/drivers/edac/axxia_edac-cmc_56xx.c index 9d22279..3fc2af8 100644 --- a/drivers/edac/axxia_edac-cmc_56xx.c +++ b/drivers/edac/axxia_edac-cmc_56xx.c @@ -899,7 +899,7 @@ static void intel_cm_alerts_error_check(struct edac_device_ctl_info *edac_dev) struct event_counter (*alerts)[MAX_DQ][MPR_ERRORS] = dev_info->data->alerts; struct cm_56xx_denali_ctl_34 denali_ctl_34; - int i, j, k, l, ret; + int i, j, k, ret; u32 counter; start: @@ -961,10 +961,10 @@ start: */ counter = atomic_xchg(&alerts[i][j][k].counter, 0); - for (l = 0; l < counter; ++l) - edac_device_handle_ce(edac_dev, 0, + if (counter) + edac_device_handle_multi_ce(edac_dev, 0, alerts[i][j][k].edac_block_idx, - edac_dev->ctl_name); + counter, edac_dev->ctl_name); } } } @@ -994,7 +994,7 @@ static void intel_cm_events_error_check(struct edac_device_ctl_info *edac_dev) struct intel_edac_dev_info *dev_info = (struct intel_edac_dev_info *) edac_dev->pvt_info; struct event_counter *events = dev_info->data->events; - int i, j; + int i; u32 counter; while (1) { @@ -1011,7 +1011,7 @@ static void intel_cm_events_error_check(struct edac_device_ctl_info *edac_dev) mutex_lock(&dev_info->data->edac_sysfs_data_lock); for (i = 0; i < NR_EVENTS; ++i) { counter = atomic_xchg(&events[i].counter, 0); - for (j = 0; j < counter; ++j) { + if (counter) switch (i) { /* * TODO - How can one determine event type? 
@@ -1021,22 +1021,23 @@ static void intel_cm_events_error_check(struct edac_device_ctl_info *edac_dev) case EV_MULT_ILLEGAL: case EV_UNCORR_ECC: case EV_MULT_UNCORR_ECC: - edac_device_handle_ue(edac_dev, 0, i, - edac_dev->ctl_name); + edac_device_handle_multi_ue(edac_dev, + 0, i, counter, + edac_dev->ctl_name); break; case EV_CORR_ECC: case EV_MULT_CORR_ECC: case EV_PORT_ERROR: case EV_WRAP_ERROR: case EV_PARITY_ERROR: - edac_device_handle_ce(edac_dev, 0, i, - edac_dev->ctl_name); + edac_device_handle_multi_ce(edac_dev, + 0, i, counter, + edac_dev->ctl_name); break; default: printk_ratelimited( "ERROR EVENT MISSING.\n"); } - } } mutex_unlock(&dev_info->data->edac_sysfs_data_lock); } @@ -1459,8 +1460,10 @@ axxia_cmem_write(struct file *file, const char __user *buffer, /* 0x3 0x3 */ setup_fault_injection(dev_info, 0x183, 1); } - if (!strncmp(buf, "disable", 7)) + if (!strncmp(buf, "disable", 7)) { + /* disable injection */ setup_fault_injection(dev_info, 0x0, 0); + } kfree(buf); return count; diff --git a/drivers/edac/axxia_edac-l2_cpu_56xx.c b/drivers/edac/axxia_edac-l2_cpu_56xx.c index 4b5f6bf..8ddf018 100644 --- a/drivers/edac/axxia_edac-l2_cpu_56xx.c +++ b/drivers/edac/axxia_edac-l2_cpu_56xx.c @@ -9,6 +9,8 @@ * GNU General Public License. 
*/ +#define CREATE_TRACE_POINTS + #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> @@ -22,6 +24,7 @@ #include <linux/reboot.h> #include <linux/mfd/syscon.h> #include <linux/regmap.h> +#include <trace/events/edac.h> #include "edac_core.h" #include "edac_module.h" #include "axxia_l2_56xx.h" @@ -50,38 +53,41 @@ void log_cpumerrsr(void *edac) struct edac_device_ctl_info *edac_dev = edac; u64 val, clear_val; u32 count0, count1; - int i; struct intel_edac_dev_info *dev_info; dev_info = edac_dev->pvt_info; /* Read S3_1_c15_c2_2 for CPUMERRSR_EL1 counts */ val = read_cpumerrsr(); + trace_edac_l1cache_syndrome(val); + + if (val & 0x8000000000000000) { + regmap_update_bits(dev_info->syscon, + SYSCON_PERSIST_SCRATCH, + CPU_PERSIST_SCRATCH_BIT, + CPU_PERSIST_SCRATCH_BIT); + pr_emerg("CPU uncorrectable error\n"); + machine_restart(NULL); + } + if (val & 0x80000000) { int cpu = get_cpu(); - count0 = ((val) & 0x000000ff00000000) >> 31; - count1 = ((val) & 0x0000ff0000000000) >> 39; + count0 = ((val) & 0x000000ff00000000) >> 32; + count1 = ((val) & 0x0000ff0000000000) >> 40; /* increment correctable error counts */ - for (i = 0; i < count0+count1; i++) { - edac_device_handle_ce(edac_dev, 0, - cpu, edac_dev->ctl_name); - } + trace_edac_l1cache_counter(count0 + count1); + + if (count0 || count1) + edac_device_handle_multi_ce(edac_dev, 0, cpu, + count0 + count1, edac_dev->ctl_name); /* Clear the valid bit */ clear_val = 0x80000000; write_cpumerrsr(clear_val); put_cpu(); } - if (val & 0x8000000000000000) { - regmap_update_bits(dev_info->syscon, - SYSCON_PERSIST_SCRATCH, - CPU_PERSIST_SCRATCH_BIT, - CPU_PERSIST_SCRATCH_BIT); - pr_emerg("CPU uncorrectable error\n"); - machine_restart(NULL); - } } @@ -101,38 +107,38 @@ void log_l2merrsr(void *edac) struct edac_device_ctl_info *edac_dev = edac; u64 val, clear_val; u32 count0, count1; - int i; struct intel_edac_dev_info *dev_info; dev_info = edac_dev->pvt_info; val = read_l2merrsr(); + 
trace_edac_l2cache_syndrome(val); + if (val & 0x8000000000000000) { + regmap_update_bits(dev_info->syscon, + SYSCON_PERSIST_SCRATCH, + L2_PERSIST_SCRATCH_BIT, + L2_PERSIST_SCRATCH_BIT); + pr_emerg("L2 uncorrectable error\n"); + machine_restart(NULL); + } if (val & 0x80000000) { int cpu = get_cpu(); - count0 = ((val) & 0x000000ff00000000) >> 31; - count1 = ((val) & 0x0000ff0000000000) >> 39; + count0 = ((val) & 0x000000ff00000000) >> 32; + count1 = ((val) & 0x0000ff0000000000) >> 40; /* increment correctable error counts */ - for (i = 0; i < count0+count1; i++) { - edac_device_handle_ce(edac_dev, 0, + trace_edac_l2cache_counter(count0 + count1); + if (count0 || count1) + edac_device_handle_multi_ce(edac_dev, 0, cpu/CORES_PER_CLUSTER, - edac_dev->ctl_name); - } + count0 + count1, edac_dev->ctl_name); /* Clear the valid bit */ clear_val = 0x80000000; write_l2merrsr(clear_val); put_cpu(); } - if (val & 0x8000000000000000) { - regmap_update_bits(dev_info->syscon, - SYSCON_PERSIST_SCRATCH, - L2_PERSIST_SCRATCH_BIT, - L2_PERSIST_SCRATCH_BIT); - pr_emerg("L2 uncorrectable error\n"); - machine_restart(NULL); - } } /* Check for L2 Errors */ diff --git a/drivers/edac/axxia_edac-l3_56xx.c b/drivers/edac/axxia_edac-l3_56xx.c index d2d9e5a..4fa9fe6 100644 --- a/drivers/edac/axxia_edac-l3_56xx.c +++ b/drivers/edac/axxia_edac-l3_56xx.c @@ -198,7 +198,7 @@ static irqreturn_t ccn_irq_thread(int irq, void *device) union dickens_hnf_err_syndrome_reg1 err_syndrome_reg1; struct arm_smccc_res r; unsigned count = 0; - int i, j; + int i; /* only HNF nodes are of our interest */ for (i = 0; i < CCN_HNF_NODES; ++i) { @@ -222,10 +222,10 @@ static irqreturn_t ccn_irq_thread(int irq, void *device) machine_restart(NULL); } count = err_syndrome_reg0.reg0.err_count; - for (j = 0; j < count; j++) - edac_device_handle_ce(edac_dev, 0, + if (count) + edac_device_handle_multi_ce(edac_dev, 0, dev_info->data[i].idx, - edac_dev->ctl_name); + count, edac_dev->ctl_name); } } @@ -335,7 +335,7 @@ static void 
intel_l3_error_check(struct edac_device_ctl_info *edac_dev) union dickens_hnf_err_syndrome_reg0 err_syndrome_reg0; union dickens_hnf_err_syndrome_clr err_syndrome_clr; unsigned count = 0; - int i, instance; + int instance; struct intel_edac_dev_info *dev_info; err_syndrome_clr.value = 0x0; @@ -363,9 +363,9 @@ static void intel_l3_error_check(struct edac_device_ctl_info *edac_dev) machine_restart(NULL); } count = err_syndrome_reg0.reg0.err_count; - for (i = 0; i < count; i++) - edac_device_handle_ce(edac_dev, 0, - instance, edac_dev->ctl_name); + if (count) + edac_device_handle_multi_ce(edac_dev, 0, + instance, count, edac_dev->ctl_name); /* clear the valid bit */ clear_node_error(addr + CCN_NODE_ERR_SYND_CLR); diff --git a/drivers/edac/axxia_edac-mc_56xx.c b/drivers/edac/axxia_edac-mc_56xx.c index cfd73c3..947f427 100644 --- a/drivers/edac/axxia_edac-mc_56xx.c +++ b/drivers/edac/axxia_edac-mc_56xx.c @@ -995,7 +995,7 @@ static void intel_sm_alerts_error_check(struct edac_device_ctl_info *edac_dev) struct event_counter (*alerts)[MAX_DQ][MPR_ERRORS] = dev_info->data->alerts; struct sm_56xx_denali_ctl_57 denali_ctl_57; - int i, j, k, l; + int i, j, k; u32 counter; start: @@ -1052,12 +1052,12 @@ start: * TODO - How can one determine event type? 
* recoverable/unrecoverable */ - counter = atomic_xchg(&alerts[i][j][k].counter, - 0); - for (l = 0; l < counter; ++l) - edac_device_handle_ce(edac_dev, 0, + counter = atomic_xchg( + &alerts[i][j][k].counter, 0); + if (counter) + edac_device_handle_multi_ce(edac_dev, 0, alerts[i][j][k].edac_block_idx, - edac_dev->ctl_name); + counter, edac_dev->ctl_name); } } } @@ -1081,7 +1081,7 @@ static void intel_sm_events_error_check(struct edac_device_ctl_info *edac_dev) struct intel_edac_dev_info *dev_info = (struct intel_edac_dev_info *) edac_dev->pvt_info; struct event_counter *events = dev_info->data->events; - int i, j; + int i; u32 counter; while (1) { @@ -1098,7 +1098,7 @@ static void intel_sm_events_error_check(struct edac_device_ctl_info *edac_dev) mutex_lock(&dev_info->data->edac_sysfs_data_lock); for (i = 0; i < NR_EVENTS; ++i) { counter = atomic_xchg(&events[i].counter, 0); - for (j = 0; j < counter; ++j) { + if (counter) switch (i) { /* * TODO - How can one determine event type? @@ -1108,22 +1108,23 @@ static void intel_sm_events_error_check(struct edac_device_ctl_info *edac_dev) case EV_MULT_ILLEGAL: case EV_UNCORR_ECC: case EV_MULT_UNCORR_ECC: - edac_device_handle_ue(edac_dev, 0, i, - edac_dev->ctl_name); + edac_device_handle_multi_ue(edac_dev, + 0, i, counter, + edac_dev->ctl_name); break; case EV_CORR_ECC: case EV_MULT_CORR_ECC: case EV_PORT_ERROR: case EV_WRAP_ERROR: case EV_PARITY_ERROR: - edac_device_handle_ce(edac_dev, 0, i, - edac_dev->ctl_name); + edac_device_handle_multi_ce(edac_dev, + 0, i, counter, + edac_dev->ctl_name); break; default: printk_ratelimited( "ERROR EVENT MISSING.\n"); } - } } mutex_unlock(&dev_info->data->edac_sysfs_data_lock); } diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h index ad42587..38fe952 100644 --- a/drivers/edac/edac_core.h +++ b/drivers/edac/edac_core.h @@ -250,7 +250,7 @@ struct edac_device_ctl_info { * If NULL: Then assumes INTERRUPT operation, where * MC driver will receive events */ - void 
(*edac_check) (struct edac_device_ctl_info * edac_dev); + void (*edac_check)(struct edac_device_ctl_info *edac_dev); struct device *dev; /* pointer to device structure */ @@ -260,7 +260,7 @@ struct edac_device_ctl_info { void *pvt_info; /* pointer to 'private driver' info */ - unsigned long start_time; /* edac_device load start time (jiffies) */ + unsigned long start_time; /* edac_device load start time (jiffies) */ struct completion removal_complete; @@ -293,7 +293,7 @@ struct edac_device_ctl_info { container_of(w, struct mem_ctl_info, work) #define to_edac_device_ctl_work(w) \ - container_of(w,struct edac_device_ctl_info,work) + container_of(w, struct edac_device_ctl_info, work) /* * The alloc() and free() functions for the 'edac_device' control info @@ -348,7 +348,7 @@ struct edac_pci_ctl_info { * If NULL: Then assumes INTERRUPT operation, where * MC driver will receive events */ - void (*edac_check) (struct edac_pci_ctl_info * edac_dev); + void (*edac_check)(struct edac_pci_ctl_info *edac_dev); struct device *dev; /* pointer to device structure */ @@ -382,7 +382,7 @@ struct edac_pci_ctl_info { }; #define to_edac_pci_ctl_work(w) \ - container_of(w, struct edac_pci_ctl_info,work) + container_of(w, struct edac_pci_ctl_info, work) /* write all or some bits in a byte-register*/ static inline void pci_write_bits8(struct pci_dev *pdev, int offset, u8 value, @@ -479,8 +479,15 @@ extern int edac_device_add_device(struct edac_device_ctl_info *edac_dev); extern struct edac_device_ctl_info *edac_device_del_device(struct device *dev); extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev, int inst_nr, int block_nr, const char *msg); +extern void edac_device_handle_multi_ue(struct edac_device_ctl_info *edac_dev, + int inst_nr, int block_nr, int events, + const char *msg); extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev, int inst_nr, int block_nr, const char *msg); +extern void edac_device_handle_multi_ce(struct edac_device_ctl_info 
*edac_dev, + int inst_nr, int block_nr, int events, + const char *msg); + extern int edac_device_alloc_index(void); extern const char *edac_layer_name[]; diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index 5358737..5327254 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -25,7 +25,7 @@ #include <linux/list.h> #include <linux/ctype.h> #include <linux/workqueue.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/page.h> #include "edac_core.h" @@ -154,7 +154,7 @@ struct edac_device_ctl_info *edac_device_alloc_ctl_info( dev_ctl->log_ue = 1; /* Name of this edac device */ - snprintf(dev_ctl->name,sizeof(dev_ctl->name),"%s",edac_device_name); + snprintf(dev_ctl->name, sizeof(dev_ctl->name), "%s", edac_device_name); edac_dbg(4, "edac_dev=%p next after end=%p\n", dev_ctl, pvt + sz_private); @@ -591,8 +591,8 @@ struct edac_device_ctl_info *edac_device_del_device(struct device *dev) edac_printk(KERN_INFO, EDAC_MC, "Removed device %d for %s %s: DEV %s\n", - edac_dev->dev_idx, - edac_dev->mod_name, edac_dev->ctl_name, edac_dev_name(edac_dev)); + edac_dev->dev_idx, edac_dev->mod_name, + edac_dev->ctl_name, edac_dev_name(edac_dev)); return edac_dev; } @@ -661,6 +661,52 @@ void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev, EXPORT_SYMBOL_GPL(edac_device_handle_ce); /* + * edac_device_handle_multi_ce + * perform a common output and handling of an 'edac_dev' CE multiple events + */ +void edac_device_handle_multi_ce(struct edac_device_ctl_info *edac_dev, + int inst_nr, int block_nr, int events, const char *msg) +{ + struct edac_device_instance *instance; + struct edac_device_block *block = NULL; + + if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) { + edac_device_printk(edac_dev, KERN_ERR, + "INTERNAL ERROR: 'instance' out of range " + "(%d >= %d)\n", inst_nr, + edac_dev->nr_instances); + return; + } + + instance = edac_dev->instances + inst_nr; + + if ((block_nr >= instance->nr_blocks) || 
(block_nr < 0)) { + edac_device_printk(edac_dev, KERN_ERR, + "INTERNAL ERROR: instance %d 'block' " + "out of range (%d >= %d)\n", + inst_nr, block_nr, + instance->nr_blocks); + return; + } + + if (instance->nr_blocks > 0) { + block = instance->blocks + block_nr; + block->counters.ce_count += events; + } + + /* Propagate the count up the 'totals' tree */ + instance->counters.ce_count += events; + edac_dev->counters.ce_count += events; + + if (edac_device_get_log_ce(edac_dev)) + edac_device_printk(edac_dev, KERN_WARNING, + "CE: %s instance: %s block: %s events: %d '%s'\n", + edac_dev->ctl_name, instance->name, + block ? block->name : "N/A", events, msg); +} +EXPORT_SYMBOL_GPL(edac_device_handle_multi_ce); + +/* * edac_device_handle_ue * perform a common output and handling of an 'edac_dev' UE event */ @@ -710,3 +756,54 @@ void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev, block ? block->name : "N/A", msg); } EXPORT_SYMBOL_GPL(edac_device_handle_ue); + +/* + * edac_device_handle_multi_ue + * perform a common output and handling of an 'edac_dev' UE event + */ +void edac_device_handle_multi_ue(struct edac_device_ctl_info *edac_dev, + int inst_nr, int block_nr, int events, const char *msg) +{ + struct edac_device_instance *instance; + struct edac_device_block *block = NULL; + + if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) { + edac_device_printk(edac_dev, KERN_ERR, + "INTERNAL ERROR: 'instance' out of range " + "(%d >= %d)\n", inst_nr, + edac_dev->nr_instances); + return; + } + + instance = edac_dev->instances + inst_nr; + + if ((block_nr >= instance->nr_blocks) || (block_nr < 0)) { + edac_device_printk(edac_dev, KERN_ERR, + "INTERNAL ERROR: instance %d 'block' " + "out of range (%d >= %d)\n", + inst_nr, block_nr, + instance->nr_blocks); + return; + } + + if (instance->nr_blocks > 0) { + block = instance->blocks + block_nr; + block->counters.ue_count += events; + } + + /* Propagate the count up the 'totals' tree */ + 
instance->counters.ue_count += events; + edac_dev->counters.ue_count += events; + + if (edac_device_get_log_ue(edac_dev)) + edac_device_printk(edac_dev, KERN_EMERG, + "UE: %s instance: %s block: %s events: %d '%s'\n", + edac_dev->ctl_name, instance->name, + block ? block->name : "N/A", events, msg); + + if (edac_device_get_panic_on_ue(edac_dev)) + panic("EDAC %s: UE instance: %s block %s events: %d '%s'\n", + edac_dev->ctl_name, instance->name, + block ? block->name : "N/A", events, msg); +} +EXPORT_SYMBOL_GPL(edac_device_handle_multi_ue); diff --git a/include/trace/events/edac.h b/include/trace/events/edac.h new file mode 100644 index 0000000..29da0d1 --- /dev/null +++ b/include/trace/events/edac.h @@ -0,0 +1,77 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM edac + +#if !defined(_TRACE_EDAC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EDAC_H + +#include <linux/types.h> +#include <linux/tracepoint.h> + + +TRACE_EVENT(edac_l2cache_syndrome, + TP_PROTO(u64 syndrome), + + TP_ARGS(syndrome), + + TP_STRUCT__entry( + __field(u64, syndrome) + ), + + TP_fast_assign( + __entry->syndrome = syndrome; + ), + + TP_printk("L2MERRSR_EL1=0x%016llx", (u64) __entry->syndrome) +); + +TRACE_EVENT(edac_l2cache_counter, + TP_PROTO(int counter), + + TP_ARGS(counter), + + TP_STRUCT__entry( + __field(int, counter) + ), + + TP_fast_assign( + __entry->counter = counter; + ), + + TP_printk("l2 counter =%d", __entry->counter) +); + +TRACE_EVENT(edac_l1cache_syndrome, + TP_PROTO(u64 syndrome), + + TP_ARGS(syndrome), + + TP_STRUCT__entry( + __field(u64, syndrome) + ), + + TP_fast_assign( + __entry->syndrome = syndrome; + ), + + TP_printk("CPUMERRSR_EL1=0x%016llx", (u64) __entry->syndrome) +); + +TRACE_EVENT(edac_l1cache_counter, + TP_PROTO(int counter), + + TP_ARGS(counter), + + TP_STRUCT__entry( + __field(int, counter) + ), + + TP_fast_assign( + __entry->counter = counter; + ), + + TP_printk("l1 counter =%d", __entry->counter) +); +#endif /* _TRACE_EDAC_H */ + +/* This part must be 
outside protection */ +#include <trace/define_trace.h> -- 2.7.4 -- _______________________________________________ linux-yocto mailing list linux-yocto@yoctoproject.org https://lists.yoctoproject.org/listinfo/linux-yocto