Hi! On 22/06/2019 07:38, Lei Wang wrote: > New driver supports error detection and correction on the devices with ARM > DMC-520 memory controller.
> diff --git a/drivers/edac/dmc520_edac.c b/drivers/edac/dmc520_edac.c > new file mode 100644 > index 000000000000..c23734c13933 > --- /dev/null > +++ b/drivers/edac/dmc520_edac.c > @@ -0,0 +1,604 @@ > +#include <linux/module.h> > +#include <linux/platform_device.h> > +#include <linux/edac.h> > +#include <linux/io.h> > +#include <linux/of.h> > +#include <linux/interrupt.h> > +#include <linux/bitfield.h> #include <linux/spinlock.h> ? It's best to keep this list sorted, it makes it easier for the maintainer to resolve conflicts when header files get split/moved-around. > +#include "edac_mc.h" [...] > +#define REG_OFFSET_FEATURE_CONTROL_NEXT 0x1F0 Nothing uses this, do we need it? [...] > +#define REG_OFFSET_DECODE_CONTROL_NOW 0x1014 Nothing uses this, do we need it? [...] > +/* DMC-520 types, masks and bitfields */ > +#define DRAM_ECC_INT_CE_MASK BIT(2) > +#define DRAM_ECC_INT_UE_MASK BIT(3) (The 'MASK' suffix isn't really needed for a single bit, you can't confuse it with the value.) [...] > +#define SCRUB_CONTROL_MASK GENMASK(1, 0) Isn't this field called TRIGGER0_NEXT? It would be good to use the names from the datasheet[0] as it makes it much easier for someone else to debug. > +#define DMC520_EDAC_ERR_GRAIN 1 > +#define DMC520_BUS_WIDTH 8 /* Data bus width is 64bits/8Bytes */ Can you point me to where this comes from in the datasheet[0]? I see it talk in "1.3 Features" of "either a 32-bit wide data SDRAM interface or a 64-bit wide data SDRAM interface". If this is a choice that was made on your platform it needs to be described in the DT. (I may be confused between SDRAM/DDR/DRAM, as 2.3.3. "PHY interface" seems to describe one connecting to the other.) > +/* memory type */ > +enum dmc520_mem_type { > + mem_type_ddr3 = 1, > + mem_type_ddr4 = 2 > +}; > + > +/* memory device width */ > +enum dmc520_dev_width { > + dev_width_x4 = 0, > + dev_width_x8 = 1, > + dev_width_x16 = 2 > +}; (Nit: the convention for enums members is all-caps. e.g. include/linux/edac.h) [...] > +static irqreturn_t > +dmc520_edac_dram_all_isr(int irq, void *data, u32 interrupt_mask); (You could avoid this by moving the user after definition of this function) [...] > +static bool dmc520_get_dram_ecc_error_info(struct dmc520_edac *edac, > + bool is_ce, > + struct ecc_error_info *info) > +{ > + u32 reg_offset_low, reg_offset_high; > + u32 reg_val_low, reg_val_high; > + bool valid; > + > + reg_offset_low = is_ce ? REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_31_00 : > + REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_31_00; > + reg_offset_high = is_ce ? REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_63_32 : > + REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_63_32; > + > + reg_val_low = dmc520_read_reg(edac, reg_offset_low); > + reg_val_high = dmc520_read_reg(edac, reg_offset_high); > + > + valid = (FIELD_GET(REG_FIELD_ERR_INFO_LOW_VALID, reg_val_low) != 0) && > + (FIELD_GET(REG_FIELD_ERR_INFO_HIGH_VALID, reg_val_high) != 0); > + if (valid) { > + info->col = FIELD_GET(REG_FIELD_ERR_INFO_LOW_COL, reg_val_low); > + info->row = FIELD_GET(REG_FIELD_ERR_INFO_LOW_ROW, reg_val_low); > + info->rank = FIELD_GET(REG_FIELD_ERR_INFO_LOW_RANK, > reg_val_low); > + info->bank = FIELD_GET(REG_FIELD_ERR_INFO_HIGH_BANK, > reg_val_high); > + } else { > + memset(info, 0, sizeof(struct ecc_error_info)); > + } > + return valid; Nothing checks this return value. > +} > +static bool dmc520_get_scrub_type(struct dmc520_edac *edac) This function returns enum scrub_type, not bool. > +{ > + enum scrub_type type = SCRUB_NONE; > + u32 reg_val, scrub_cfg; > + > + reg_val = dmc520_read_reg(edac, REG_OFFSET_SCRUB_CONTROL0_NOW); > + scrub_cfg = FIELD_GET(SCRUB_CONTROL_MASK, reg_val); > + > + if (DMC520_SCRUB_TRIGGER_ERR_DETECT == scrub_cfg || > + DMC520_SCRUB_TRIGGER_IDLE == scrub_cfg) > + type = SCRUB_HW_PROG; > + > + return type; > +} > +static void dmc520_handle_dram_ecc_errors(struct mem_ctl_info *mci, > + bool is_ce) > +{ > + struct ecc_error_info info; > + struct dmc520_edac *edac; > + u32 cnt; > + char message[EDAC_MSG_BUF_SIZE]; > + unsigned long flags; > + > + edac = mci->pvt_info; > + dmc520_get_dram_ecc_error_info(edac, is_ce, &info); > + > + cnt = dmc520_get_dram_ecc_error_count(edac, is_ce); > + > + if (cnt > 0) { > + snprintf(message, ARRAY_SIZE(message), > + "rank:%d bank:%d row:%d col:%d", > + info.rank, info.bank, > + info.row, info.col); > + > + spin_lock_irqsave(&edac->ecc_lock, flags); irqsave/irqrestore is overkill as this function is only called from an interrupt handler. There is no way for this to be called with interrupts unmasked. > + edac_mc_handle_error((is_ce ? HW_EVENT_ERR_CORRECTED : > + HW_EVENT_ERR_UNCORRECTED), > + mci, cnt, 0, 0, 0, info.rank, -1, -1, > + message, ""); > + spin_unlock_irqrestore(&edac->ecc_lock, flags); > + } > +} > + > +static irqreturn_t dmc520_edac_dram_ecc_isr(int irq, void *data, bool is_ce) data here could be struct mem_ctl_info *, as it only has one caller. > +{ > + u32 i_mask; > + struct mem_ctl_info *mci; > + struct dmc520_edac *edac; > + > + mci = data; > + edac = mci->pvt_info; > + > + i_mask = is_ce ? DRAM_ECC_INT_CE_MASK : DRAM_ECC_INT_UE_MASK; (The mask/bit here could be passed in directly, its the value you need most often) > + dmc520_handle_dram_ecc_errors(mci, is_ce); > + > + dmc520_write_reg(edac, i_mask, REG_OFFSET_INTERRUPT_CLR); > + > + return IRQ_HANDLED; > +} [...] > +static int dmc520_edac_probe(struct platform_device *pdev) > +{ [...] > + if (nintr > ARRAY_SIZE(dmc520_isr_array)) { > + edac_printk(KERN_ERR, EDAC_MOD_NAME, > + "Invalid device node configuration: # of interrupt > config " > + "elements (%d) can not exeed %ld.\n", (Nit: exceed) > + nintr, ARRAY_SIZE(dmc520_isr_array)); > + return -EINVAL; > + } [...] > + ret = of_property_read_u32_array(dev->of_node, "interrupt-config", > + edac->interrupt_masks, nintr); > + if (ret) { > + edac_printk(KERN_ERR, EDAC_MOD_NAME, > + "Failed to get interrupt-config arrays.\n"); > + goto err_free_mc; > + } > + for (intr_index = 0; intr_index < nintr; ++intr_index) { > + if (edac->interrupt_mask_all & > edac->interrupt_masks[intr_index]) { > + edac_printk(KERN_ERR, EDAC_MC, > + "interrupt-config error: " > + "element %d's interrupt mask %d has overlap.\n", > + intr_index, edac->interrupt_masks[intr_index]); > + goto err_free_mc; > + } > + > + edac->interrupt_mask_all |= edac->interrupt_masks[intr_index]; > + } Ah, so the driver doesn't support overlapping masks... but wasn't this the reason for describing the interrupts with these masks in the first place? (It looks like the DT-folk want this as named interrupts) lore.kernel.org/r/byapr21mb1319bc4d079b918ab038a4d590...@byapr21mb1319.namprd21.prod.outlook.com Would this driver support the configuration you gave there? > + edac->interrupt_mask_all &= ALL_INT_MASK; This is to removed invalid interrupt fields? Shouldn't we print a warning instead? Either the DT is invalid, or its some future hardware that has an extra interrupt that this driver won't enable. [...] > + /* Clear interrupts */ > + reg_val = dmc520_read_reg(edac, REG_OFFSET_INTERRUPT_CONTROL); > + dmc520_write_reg(edac, reg_val & (~(edac->interrupt_mask_all)), > + REG_OFFSET_INTERRUPT_CONTROL); > + dmc520_write_reg(edac, edac->interrupt_mask_all, > REG_OFFSET_INTERRUPT_CLR); [...] > + /* Enable interrupts */ > + dmc520_write_reg(edac, edac->interrupt_mask_all, > REG_OFFSET_INTERRUPT_CONTROL); Won't this disable any interrupts we weren't told about? You did a read-modify write above. Can we do the same here? > + return 0; > + > +err_free_irq: > + for (intr_index = 0; intr_index < nintr_registered; ++intr_index) { > + int irq_id = platform_get_irq(pdev, intr_index); > + devm_free_irq(&pdev->dev, irq_id, mci); > + } > + edac_mc_del_mc(&pdev->dev); > +err_free_mc: > + edac_mc_free(mci); > + > + return ret; > +} > + [...] > +static const struct of_device_id dmc520_edac_driver_id[] = { > + { .compatible = "brcm,dmc-520", }, > + { .compatible = "arm,dmc-520", }, You should only need the "arm,dmc-520" entry here. The additional compatible values are for quirking the driver when integration issues are discovered. The 'brcm' version should be in the DT from day-one, but the kernel only needs to pick it up when it needs to treat the brcm version differently. > + { /* end of table */ } > +}; With the bool/enum and interrupt-disabling things fixed: Reviewed-by: James Morse <james.mo...@arm.com> Thanks, James [0] https://static.docs.arm.com/100000/0200/corelink_dmc520_trm_100000_0200_01_en.pdf