Hi!

On 22/06/2019 07:38, Lei Wang wrote:
> New driver supports error detection and correction on the devices with ARM
> DMC-520 memory controller.

> diff --git a/drivers/edac/dmc520_edac.c b/drivers/edac/dmc520_edac.c
> new file mode 100644
> index 000000000000..c23734c13933
> --- /dev/null
> +++ b/drivers/edac/dmc520_edac.c
> @@ -0,0 +1,604 @@

> +#include <linux/module.h>
> +#include <linux/platform_device.h>
> +#include <linux/edac.h>
> +#include <linux/io.h>
> +#include <linux/of.h>
> +#include <linux/interrupt.h>
> +#include <linux/bitfield.h>

#include <linux/spinlock.h> ?

It's best to keep this list sorted; it makes it easier for the maintainer to
resolve conflicts when header files get split or moved around.

> +#include "edac_mc.h"

[...]

> +#define REG_OFFSET_FEATURE_CONTROL_NEXT              0x1F0

Nothing uses this, do we need it?

[...]

> +#define REG_OFFSET_DECODE_CONTROL_NOW                0x1014

Nothing uses this, do we need it?
[...]

> +/* DMC-520 types, masks and bitfields */
> +#define DRAM_ECC_INT_CE_MASK                 BIT(2)
> +#define DRAM_ECC_INT_UE_MASK                 BIT(3)

(The 'MASK' suffix isn't really needed for a single bit, you can't confuse it 
with the value.)

[...]

> +#define SCRUB_CONTROL_MASK                   GENMASK(1, 0)

Isn't this field called TRIGGER0_NEXT? It would be good to use the names from
the datasheet[0], as it makes it much easier for someone else to debug.


> +#define DMC520_EDAC_ERR_GRAIN                        1

> +#define DMC520_BUS_WIDTH     8  /* Data bus width is 64bits/8Bytes */

Can you point me to where this comes from in the datasheet[0]?
In "1.3 Features" I see it talks of "either a 32-bit wide data SDRAM interface
or a 64-bit wide data SDRAM interface".

If this is a choice that was made on your platform it needs to be described in 
the DT.

(I may be confused between SDRAM/DDR/DRAM, as 2.3.3. "PHY interface" seems to
describe one connecting to the other.)



> +/* memory type */
> +enum dmc520_mem_type {
> +     mem_type_ddr3 = 1,
> +     mem_type_ddr4 = 2
> +};
> +
> +/* memory device width */
> +enum dmc520_dev_width {
> +     dev_width_x4 = 0,
> +     dev_width_x8 = 1,
> +     dev_width_x16 = 2
> +};

(Nit: the convention for enums members is all-caps. e.g. include/linux/edac.h)

[...]

> +static irqreturn_t
> +dmc520_edac_dram_all_isr(int irq, void *data, u32 interrupt_mask);

(You could avoid this by moving the user after the definition of this function)

[...]

> +static bool dmc520_get_dram_ecc_error_info(struct dmc520_edac *edac,
> +                                        bool is_ce,
> +                                        struct ecc_error_info *info)
> +{
> +     u32 reg_offset_low, reg_offset_high;
> +     u32 reg_val_low, reg_val_high;
> +     bool valid;
> +
> +     reg_offset_low = is_ce ? REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_31_00 :
> +                              REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_31_00;
> +     reg_offset_high = is_ce ? REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_63_32 :
> +                               REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_63_32;
> +
> +     reg_val_low = dmc520_read_reg(edac, reg_offset_low);
> +     reg_val_high = dmc520_read_reg(edac, reg_offset_high);
> +
> +     valid = (FIELD_GET(REG_FIELD_ERR_INFO_LOW_VALID, reg_val_low) != 0) &&
> +             (FIELD_GET(REG_FIELD_ERR_INFO_HIGH_VALID, reg_val_high) != 0);

> +     if (valid) {
> +             info->col = FIELD_GET(REG_FIELD_ERR_INFO_LOW_COL, reg_val_low);
> +             info->row = FIELD_GET(REG_FIELD_ERR_INFO_LOW_ROW, reg_val_low);
> +             info->rank = FIELD_GET(REG_FIELD_ERR_INFO_LOW_RANK, 
> reg_val_low);
> +             info->bank = FIELD_GET(REG_FIELD_ERR_INFO_HIGH_BANK, 
> reg_val_high);
> +     } else {
> +             memset(info, 0, sizeof(struct ecc_error_info));
> +     }

> +     return valid;

Nothing checks this return value.

> +}

> +static bool dmc520_get_scrub_type(struct dmc520_edac *edac)

This function returns enum scrub_type, not bool.

> +{
> +     enum scrub_type type = SCRUB_NONE;
> +     u32 reg_val, scrub_cfg;
> +
> +     reg_val = dmc520_read_reg(edac, REG_OFFSET_SCRUB_CONTROL0_NOW);
> +     scrub_cfg = FIELD_GET(SCRUB_CONTROL_MASK, reg_val);
> +
> +     if (DMC520_SCRUB_TRIGGER_ERR_DETECT == scrub_cfg ||
> +             DMC520_SCRUB_TRIGGER_IDLE == scrub_cfg)
> +             type = SCRUB_HW_PROG;
> +
> +     return type;
> +}


> +static void dmc520_handle_dram_ecc_errors(struct mem_ctl_info *mci,
> +                                       bool is_ce)
> +{
> +     struct ecc_error_info info;
> +     struct dmc520_edac *edac;
> +     u32 cnt;
> +     char message[EDAC_MSG_BUF_SIZE];
> +     unsigned long flags;
> +
> +     edac = mci->pvt_info;
> +     dmc520_get_dram_ecc_error_info(edac, is_ce, &info);
> +
> +     cnt = dmc520_get_dram_ecc_error_count(edac, is_ce);
> +
> +     if (cnt > 0) {
> +             snprintf(message, ARRAY_SIZE(message),
> +                      "rank:%d bank:%d row:%d col:%d",
> +                      info.rank, info.bank,
> +                      info.row, info.col);
> +
> +             spin_lock_irqsave(&edac->ecc_lock, flags);

irqsave/irqrestore is overkill as this function is only called from an
interrupt handler. There is no way for this to be called with interrupts
unmasked.


> +             edac_mc_handle_error((is_ce ? HW_EVENT_ERR_CORRECTED :
> +                                  HW_EVENT_ERR_UNCORRECTED),
> +                                  mci, cnt, 0, 0, 0, info.rank, -1, -1,
> +                                  message, "");
> +             spin_unlock_irqrestore(&edac->ecc_lock, flags);
> +     }
> +}
> +
> +static irqreturn_t dmc520_edac_dram_ecc_isr(int irq, void *data, bool is_ce)

data here could be struct mem_ctl_info *, as it only has one caller.

> +{
> +     u32 i_mask;
> +     struct mem_ctl_info *mci;
> +     struct dmc520_edac *edac;
> +
> +     mci = data;
> +     edac = mci->pvt_info;
> +
> +     i_mask = is_ce ? DRAM_ECC_INT_CE_MASK : DRAM_ECC_INT_UE_MASK;

(The mask/bit here could be passed in directly; it's the value you need most
often)


> +     dmc520_handle_dram_ecc_errors(mci, is_ce);
> +
> +     dmc520_write_reg(edac, i_mask, REG_OFFSET_INTERRUPT_CLR);
> +
> +     return IRQ_HANDLED;
> +}

[...]


> +static int dmc520_edac_probe(struct platform_device *pdev)
> +{

[...]

> +     if (nintr > ARRAY_SIZE(dmc520_isr_array)) {
> +             edac_printk(KERN_ERR, EDAC_MOD_NAME,
> +                     "Invalid device node configuration: # of interrupt 
> config "
> +                     "elements (%d) can not exeed %ld.\n",

(Nit: exceed)

> +                     nintr, ARRAY_SIZE(dmc520_isr_array));
> +             return -EINVAL;
> +     }

[...]

> +     ret = of_property_read_u32_array(dev->of_node, "interrupt-config",
> +                     edac->interrupt_masks, nintr);
> +     if (ret) {
> +             edac_printk(KERN_ERR, EDAC_MOD_NAME,
> +                     "Failed to get interrupt-config arrays.\n");
> +             goto err_free_mc;
> +     }

> +     for (intr_index = 0; intr_index < nintr; ++intr_index) {
> +             if (edac->interrupt_mask_all & 
> edac->interrupt_masks[intr_index]) {
> +                     edac_printk(KERN_ERR, EDAC_MC,
> +                             "interrupt-config error: "
> +                             "element %d's interrupt mask %d has overlap.\n",
> +                             intr_index, edac->interrupt_masks[intr_index]);
> +                     goto err_free_mc;
> +             }
> +
> +             edac->interrupt_mask_all |= edac->interrupt_masks[intr_index];
> +     }

Ah, so the driver doesn't support overlapping masks... but wasn't this the
reason for describing the interrupts with these masks in the first place?
(It looks like the DT-folk want this as named interrupts)

lore.kernel.org/r/byapr21mb1319bc4d079b918ab038a4d590...@byapr21mb1319.namprd21.prod.outlook.com

Would this driver support the configuration you gave there?


> +     edac->interrupt_mask_all &= ALL_INT_MASK;

This is to remove invalid interrupt fields? Shouldn't we print a warning
instead? Either the DT is invalid, or it's some future hardware that has an
extra interrupt that this driver won't enable.


[...]

> +     /* Clear interrupts */
> +     reg_val = dmc520_read_reg(edac, REG_OFFSET_INTERRUPT_CONTROL);
> +     dmc520_write_reg(edac, reg_val & (~(edac->interrupt_mask_all)),
> +                     REG_OFFSET_INTERRUPT_CONTROL);
> +     dmc520_write_reg(edac, edac->interrupt_mask_all, 
> REG_OFFSET_INTERRUPT_CLR);

[...]

> +     /* Enable interrupts */
> +     dmc520_write_reg(edac, edac->interrupt_mask_all, 
> REG_OFFSET_INTERRUPT_CONTROL);

Won't this disable any interrupts we weren't told about? You did a
read-modify-write above. Can we do the same here?


> +     return 0;
> +
> +err_free_irq:
> +     for (intr_index = 0; intr_index < nintr_registered; ++intr_index) {
> +             int irq_id = platform_get_irq(pdev, intr_index);
> +             devm_free_irq(&pdev->dev, irq_id, mci);
> +     }
> +     edac_mc_del_mc(&pdev->dev);
> +err_free_mc:
> +     edac_mc_free(mci);
> +
> +     return ret;
> +}
> +

[...]

> +static const struct of_device_id dmc520_edac_driver_id[] = {
> +     { .compatible = "brcm,dmc-520", },
> +     { .compatible = "arm,dmc-520", },

You should only need the "arm,dmc-520" entry here. The additional compatible
values are for quirking the driver when integration issues are discovered.
The 'brcm' version should be in the DT from day one, but the kernel only needs
to pick it up when it needs to treat the brcm version differently.


> +     { /* end of table */ }
> +};


With the bool/enum and interrupt-disabling things fixed:
Reviewed-by: James Morse <james.mo...@arm.com>



Thanks,

James

[0] 
https://static.docs.arm.com/100000/0200/corelink_dmc520_trm_100000_0200_01_en.pdf

Reply via email to