RE: [PATCH v1 1/1] EDAC, mellanox: Add ECC support for BlueField DDR4

2019-03-14 Thread Junhan Zhou
Hi, any comments on this patch?
I'm trying to add EDAC support for our BlueField platform memory controller.

> -Original Message-
> From: linux-edac-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Junhan Zhou
> Sent: Wednesday, February 27, 2019 3:22 PM
> To: Borislav Petkov ; Mauro Carvalho Chehab
> 
> Cc: Junhan Zhou ; Liming Sun
> ; linux-e...@vger.kernel.org; linux-
> ker...@vger.kernel.org
> Subject: [PATCH v1 1/1] EDAC, mellanox: Add ECC support for BlueField DDR4
> 
> Add ECC support for Mellanox BlueField SoC DDR controller.
> This requires SMC to the running Arm Trusted Firmware to report what is the
> current memory configuration.
> 
> Signed-off-by: Junhan Zhou 
> ---
>  drivers/edac/Kconfig  |   7 +
>  drivers/edac/Makefile |   1 +
>  drivers/edac/bluefield_edac.c | 425
> ++
>  3 files changed, 433 insertions(+)
>  create mode 100644 drivers/edac/bluefield_edac.c
> 
> diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index
> e286b5b..3bcb8fe 100644
> --- a/drivers/edac/Kconfig
> +++ b/drivers/edac/Kconfig
> @@ -475,4 +475,11 @@ config EDAC_QCOM
> For debugging issues having to do with stability and overall system
> health, you should probably say 'Y' here.
> 
> +config EDAC_BLUEFIELD
> + tristate "Mellanox BlueField Memory ECC"
> + depends on (MELLANOX_PLATFORM && ARM64 && ACPI) ||
> COMPILE_TEST
> + help
> +   Support for error detection and correction on the
> +   Mellanox BlueField SoCs.
> +
>  endif # EDAC
> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index
> 716096d..6dd5e67 100644
> --- a/drivers/edac/Makefile
> +++ b/drivers/edac/Makefile
> @@ -78,3 +78,4 @@ obj-$(CONFIG_EDAC_SYNOPSYS) +=
> synopsys_edac.o
>  obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o
>  obj-$(CONFIG_EDAC_TI)+= ti_edac.o
>  obj-$(CONFIG_EDAC_QCOM)  += qcom_edac.o
> +obj-$(CONFIG_EDAC_BLUEFIELD) += bluefield_edac.o
> diff --git a/drivers/edac/bluefield_edac.c b/drivers/edac/bluefield_edac.c
> new file mode 100644 index 000..bdcf0cc
> --- /dev/null
> +++ b/drivers/edac/bluefield_edac.c
> @@ -0,0 +1,425 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Bluefield-specific EDAC driver.
> + *
> + * Copyright (c) 2019 Mellanox Technologies.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "edac_module.h"
> +
> +#define DRIVER_NAME  "bluefield-edac"
> +
> +/*
> + * Mellanox BlueField EMI (External Memory Interface) register definitions.
> + * Registers which have individual bitfields have a union defining the
> + * bitfields following the address define.
> + */
> +
> +#define MLXBF_EMI_DRAM_ECC_COUNT 0x340
> +
> +union MLXBF_EMI_DRAM_ECC_COUNT_u {
> + struct {
> + /* ECC single errors counter */
> + u32 single_error_count : 16;
> + /* ECC double errors counter */
> + u32 double_error_count : 16;
> + };
> +
> + u32 word;
> +};
> +
> +#define MLXBF_EMI_DRAM_ECC_ERROR 0x348
> +
> +union MLXBF_EMI_DRAM_ECC_ERROR_u {
> + struct {
> + /* 1 - ECC single error from DRAM wrapper x. */
> + u32 dram_ecc_single : 1;
> + /* Reserved. */
> + u32 __reserved_0: 15;
> + /* 1 - ECC double error from DRAM wrapper x. */
> + u32 dram_ecc_double : 1;
> + /* Reserved. */
> + u32 __reserved_1: 15;
> + };
> +
> + u32 word;
> +};
> +
> +#define MLXBF_EMI_DRAM_ECC_LATCH_SELECT 0x354
> +
> +union MLXBF_EMI_DRAM_ECC_LATCH_SELECT_u {
> + struct {
> + /*
> +  * For DRAM wrapper x.
> +  *
> +  * 0 - Last ECC error will be stored.
> +  *
> +  * 1 - First ECC error will be stored.
> +  */
> + u32 dram_ecc_first : 1;
> + /* Reserved. */
> + u32 __reserved_0   : 15;
> + /* Selects to Edge (one of 8) to get the ECC info. */
> + u32 edge_sel   : 4;
> + /* Reserved. */
> + u32 __reserved_1   : 4;
> + /* 1 - Start select DRAM_INFO */
> + u32 start  : 1;
> + /* Reserved. */
> + u32 __reserved_2   : 7;
> + };
> +
> + u32 word;
> +};
> +
> +#define MLXBF_EMI_DRAM_ERR_ADDR_0 0x358
> +
> +#define MLXBF_EMI_DRAM_ERR_ADDR_1

[PATCH v1 1/1] EDAC, mellanox: Add ECC support for BlueField DDR4

2019-02-27 Thread Junhan Zhou
Add ECC support for Mellanox BlueField SoC DDR controller.
This requires SMC to the running Arm Trusted Firmware to report
what is the current memory configuration.

Signed-off-by: Junhan Zhou 
---
 drivers/edac/Kconfig  |   7 +
 drivers/edac/Makefile |   1 +
 drivers/edac/bluefield_edac.c | 425 ++
 3 files changed, 433 insertions(+)
 create mode 100644 drivers/edac/bluefield_edac.c

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index e286b5b..3bcb8fe 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -475,4 +475,11 @@ config EDAC_QCOM
  For debugging issues having to do with stability and overall system
  health, you should probably say 'Y' here.
 
+config EDAC_BLUEFIELD
+   tristate "Mellanox BlueField Memory ECC"
+   depends on (MELLANOX_PLATFORM && ARM64 && ACPI) || COMPILE_TEST
+   help
+ Support for error detection and correction on the
+ Mellanox BlueField SoCs.
+
 endif # EDAC
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 716096d..6dd5e67 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -78,3 +78,4 @@ obj-$(CONFIG_EDAC_SYNOPSYS)   += synopsys_edac.o
 obj-$(CONFIG_EDAC_XGENE)   += xgene_edac.o
 obj-$(CONFIG_EDAC_TI)  += ti_edac.o
 obj-$(CONFIG_EDAC_QCOM)+= qcom_edac.o
+obj-$(CONFIG_EDAC_BLUEFIELD)   += bluefield_edac.o
diff --git a/drivers/edac/bluefield_edac.c b/drivers/edac/bluefield_edac.c
new file mode 100644
index 000..bdcf0cc
--- /dev/null
+++ b/drivers/edac/bluefield_edac.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Bluefield-specific EDAC driver.
+ *
+ * Copyright (c) 2019 Mellanox Technologies.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "edac_module.h"
+
+#define DRIVER_NAME"bluefield-edac"
+
+/*
+ * Mellanox BlueField EMI (External Memory Interface) register definitions.
+ * Registers which have individual bitfields have a union defining the
+ * bitfields following the address define.
+ */
+
+#define MLXBF_EMI_DRAM_ECC_COUNT 0x340
+
+union MLXBF_EMI_DRAM_ECC_COUNT_u {
+   struct {
+   /* ECC single errors counter */
+   u32 single_error_count : 16;
+   /* ECC double errors counter */
+   u32 double_error_count : 16;
+   };
+
+   u32 word;
+};
+
+#define MLXBF_EMI_DRAM_ECC_ERROR 0x348
+
+union MLXBF_EMI_DRAM_ECC_ERROR_u {
+   struct {
+   /* 1 - ECC single error from DRAM wrapper x. */
+   u32 dram_ecc_single : 1;
+   /* Reserved. */
+   u32 __reserved_0: 15;
+   /* 1 - ECC double error from DRAM wrapper x. */
+   u32 dram_ecc_double : 1;
+   /* Reserved. */
+   u32 __reserved_1: 15;
+   };
+
+   u32 word;
+};
+
+#define MLXBF_EMI_DRAM_ECC_LATCH_SELECT 0x354
+
+union MLXBF_EMI_DRAM_ECC_LATCH_SELECT_u {
+   struct {
+   /*
+* For DRAM wrapper x.
+*
+* 0 - Last ECC error will be stored.
+*
+* 1 - First ECC error will be stored.
+*/
+   u32 dram_ecc_first : 1;
+   /* Reserved. */
+   u32 __reserved_0   : 15;
+   /* Selects to Edge (one of 8) to get the ECC info. */
+   u32 edge_sel   : 4;
+   /* Reserved. */
+   u32 __reserved_1   : 4;
+   /* 1 - Start select DRAM_INFO */
+   u32 start  : 1;
+   /* Reserved. */
+   u32 __reserved_2   : 7;
+   };
+
+   u32 word;
+};
+
+#define MLXBF_EMI_DRAM_ERR_ADDR_0 0x358
+
+#define MLXBF_EMI_DRAM_ERR_ADDR_1 0x37c
+
+#define MLXBF_EMI_DRAM_SYNDROM 0x35c
+
+union MLXBF_EMI_DRAM_SYNDROM_u {
+   struct {
+   /* 1 - Double error. */
+   u32 derr : 1;
+   /* 1 - Single error. */
+   u32 serr : 1;
+   /* Reserved. */
+   u32 __reserved_0 : 14;
+   /* ECC syndrome (error bit according to the Hamming code). */
+   u32 syndrom  : 10;
+   /* Reserved. */
+   u32 __reserved_1 : 6;
+   };
+
+   u32 word;
+};
+
+#define MLXBF_EMI_DRAM_ADDITIONAL_INFO_0 0x364
+
+union MLXBF_EMI_DRAM_ADDITIONAL_INFO_0_u {
+   struct {
+   /* Physical Error Bank. */
+   u32 err_bank : 4;
+   /* Physical Error Logical Rank. */
+   u32 err_lrank: 2;
+   /* Reserved. */
+   u32 __reserved_0 : 2;
+   /* Physical Error Physical Rank. */
+   u32 err_prank: 2;
+   /* Reserved. */
+   u32 __reserved_1 : 6;
+   /* Er

[PATCH v2] EDAC, mellanox: Add ECC support for BlueField DDR4

2019-03-18 Thread Junhan Zhou
Add ECC support for Mellanox BlueField SoC DDR controller.
This requires SMC to the running Arm Trusted Firmware to report
what is the current memory configuration.

Signed-off-by: Junhan Zhou 
---
 MAINTAINERS   |   5 +
 drivers/edac/Kconfig  |   7 +
 drivers/edac/Makefile |   1 +
 drivers/edac/bluefield_edac.c | 396 ++
 4 files changed, 409 insertions(+)
 create mode 100644 drivers/edac/bluefield_edac.c

diff --git a/MAINTAINERS b/MAINTAINERS
index e17ebf7..394d6c9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5539,6 +5539,11 @@ S:   Supported
 F: drivers/edac/aspeed_edac.c
 F: Documentation/devicetree/bindings/edac/aspeed-sdram-edac.txt
 
+EDAC-BLUEFIELD
+M: Junhan Zhou 
+S: Supported
+F: drivers/edac/bluefield_edac.c
+
 EDAC-CALXEDA
 M: Robert Richter 
 L: linux-e...@vger.kernel.org
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 47eb4d1..dfb36f2 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -504,4 +504,11 @@ config EDAC_ASPEED
  First, ECC must be configured in the bootloader. Then, this driver
  will expose error counters via the EDAC kernel framework.
 
+config EDAC_BLUEFIELD
+   tristate "Mellanox BlueField Memory ECC"
+   depends on (MELLANOX_PLATFORM && ARM64 && ACPI) || COMPILE_TEST
+   help
+ Support for error detection and correction on the
+ Mellanox BlueField SoCs.
+
 endif # EDAC
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 89ad4a84..0294a67 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -84,3 +84,4 @@ obj-$(CONFIG_EDAC_XGENE)  += xgene_edac.o
 obj-$(CONFIG_EDAC_TI)  += ti_edac.o
 obj-$(CONFIG_EDAC_QCOM)+= qcom_edac.o
 obj-$(CONFIG_EDAC_ASPEED)  += aspeed_edac.o
+obj-$(CONFIG_EDAC_BLUEFIELD)   += bluefield_edac.o
diff --git a/drivers/edac/bluefield_edac.c b/drivers/edac/bluefield_edac.c
new file mode 100644
index 000..981f419
--- /dev/null
+++ b/drivers/edac/bluefield_edac.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Bluefield-specific EDAC driver.
+ *
+ * Copyright (c) 2019 Mellanox Technologies.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "edac_module.h"
+
+#define DRIVER_NAME"bluefield-edac"
+
+/*
+ * Mellanox BlueField EMI (External Memory Interface) register definitions.
+ * Registers which have individual bitfields have a union defining the
+ * bitfields following the address define.
+ */
+
+#define MLXBF_EMI_DRAM_ECC_COUNT 0x340
+
+union mlxbf_emi_dram_ecc_count {
+   struct {
+   u32 single_error_count : 16;
+   u32 double_error_count : 16;
+   };
+
+   u32 word;
+};
+
+#define MLXBF_EMI_DRAM_ECC_ERROR 0x348
+
+union mlxbf_emi_dram_ecc_error {
+   struct {
+   u32 dram_ecc_single : 1;
+   u32 __reserved_0: 15;
+   u32 dram_ecc_double : 1;
+   u32 __reserved_1: 15;
+   };
+
+   u32 word;
+};
+
+#define MLXBF_EMI_DRAM_ECC_LATCH_SELECT 0x354
+
+union mlxbf_emi_dram_ecc_latch_select {
+   struct {
+   u32 dram_ecc_first : 1;
+   u32 __reserved_0   : 15;
+   u32 edge_sel   : 4;
+   u32 __reserved_1   : 4;
+   u32 start  : 1;
+   u32 __reserved_2   : 7;
+   };
+
+   u32 word;
+};
+
+#define MLXBF_EMI_DRAM_ERR_ADDR_0 0x358
+
+#define MLXBF_EMI_DRAM_ERR_ADDR_1 0x37c
+
+#define MLXBF_EMI_DRAM_SYNDROM 0x35c
+
+union mlxbf_emi_dram_syndrom {
+   struct {
+   u32 derr : 1;
+   u32 serr : 1;
+   u32 __reserved_0 : 14;
+   /* ECC syndrome (error bit according to the Hamming code). */
+   u32 syndrom  : 10;
+   u32 __reserved_1 : 6;
+   };
+
+   u32 word;
+};
+
+#define MLXBF_EMI_DRAM_ADDITIONAL_INFO_0 0x364
+
+union mlxbf_emi_dram_additional_info_0 {
+   struct {
+   u32 err_bank : 4;
+   u32 err_lrank: 2;
+   u32 __reserved_0 : 2;
+   u32 err_prank: 2;
+   u32 __reserved_1 : 6;
+   u32 err_edge : 8;
+   u32 __reserved_2 : 8;
+   };
+
+   u32 word;
+};
+
+#define MLXBF_EDAC_DIMM_PER_MC 2
+#define MLXBF_EDAC_ERROR_GRAIN 8
+
+/*
+ * Request MLNX_SIP_GET_DIMM_INFO
+ *
+ * Retrieve information about DIMM on a certain slot.
+ *
+ * Call register usage:
+ * a0: MLNX_SIP_GET_DIMM_INFO
+ * a1: (Memory controller index) << 16 | (Dimm index in memory controller)
+ * a2-7: not used.
+ *
+ * Return status:
+ * a0: dimm_info_smc union defined below describing the DIMM.
+ * a1-3: not used.
+ */
+#define MLNX_SIP_GET_DIMM_INFO 0x8208
+
+/* Format for the

RE: [PATCH v1 1/1] EDAC, mellanox: Add ECC support for BlueField DDR4

2019-03-18 Thread Junhan Zhou
> > +config EDAC_BLUEFIELD
> > +   tristate "Mellanox BlueField Memory ECC"
> > +   depends on (MELLANOX_PLATFORM && ARM64 && ACPI) ||
> COMPILE_TEST
> 
> MELLANOX_PLATFORM depends on X86 || ARM but this depends on ARM64.
> How is that even supposed to work?

We are pushing to add driver support in the mainline kernel for our A72 (ARM64) 
based BlueField SoC, and after debates decided to use the MELLANOX_PLATFORM 
option for this.
My colleague is also in the process of upstreaming a driver which makes 
MELLANOX_PLATFORM depends on ARM64, you can see the patch at 
https://lore.kernel.org/lkml/1552056084-12137-1-git-send-email-l...@mellanox.com/
If you want, I can also add the change to this patch, but I would prefer to 
only have the change in one patch.

> 
> > +   help
> > + Support for error detection and correction on the
> > + Mellanox BlueField SoCs.
> > +
> >  endif # EDAC
> 
> This patch needs MAINTAINERS entry so that you can get CCed on bug
> reports.

Done. Added myself.

> > +union MLXBF_EMI_DRAM_ECC_COUNT_u {
> 
> No all caps names pls. And no "_u" suffixing - this is not C++.

Done. Removed the _u suffix and changed all union declarations to lower case.

> > +   struct {
> > +   /* ECC single errors counter */
> > +   u32 single_error_count : 16;
> 
> You have excessive comments for members whose names already describe
> what they are. Remove the comments and descrease the clutter this way.

Sorry, this was autogenerated from the register hardware definitions. Removed 
all except for the ECC syndrome one as it does have added information of saying 
it's error bit according to the Hamming code.

> 
> > +union MLXBF_EMI_DRAM_ECC_ERROR_u {
> > +   struct {
> > +   /* 1 - ECC single error from DRAM wrapper x. */
> 
> What's a "DRAM wrapper x" which is being mentioned here and there?

Sorry about this, removed. It would be more confusing if just left here.
The story is this hardware IP was originally used for the Ezchip NPS memory 
controller where there were a lot of separate DRAM groups or "wrappers", and 
each bit here would corresponding to one of it. But when moving to BlueField it 
was decided to get rid of all these wrappers. But apparently the description 
wasn't updated.

> > +   u32 dram_ecc_single : 1;
> > +   /* Reserved. */
> All those "Reserved" comments are completely useless.
All removed.
> > + * Retrieve information about DIMM on a certaion slot.
>  
> Run your whole patch through a spellchecker.
Good tip. Run it through aspell. Though there are a lot of false positives, 
especially for the git hashes. Any tips on this?
BTW, it did pick up "syndrom" as an error where the correct spelling is 
"syndrome". But the hardware register is named that way so I kept that alone.
> > +   if (edec.single_error_count != 0) {
>   if (edec.single_error_count) {
> is enough. Below too.
Done.
> > +   if (smc_info.size_GB == 0) {
>
>   if (!...
> 
Done

> > +   /* Only POLL mode supported so far. */
> > +   edac_op_state = EDAC_OPSTATE_POLL;
> 
> This needs to happen *last* in this function, when everything else succeeds.
> 
Done
> > +   if (mci == NULL)
> 
>   if (!mci)
> 
Done

Thanks for your comments! Just posted a V2 patch addressing them.


RE: [PATCH v2] EDAC, mellanox: Add ECC support for BlueField DDR4

2019-03-21 Thread Junhan Zhou
> > +config EDAC_BLUEFIELD
> > +   tristate "Mellanox BlueField Memory ECC"
> > +   depends on (MELLANOX_PLATFORM && ARM64 && ACPI) ||
> COMPILE_TEST
> 
> Hi,
> 
> While I am in favor of using COMPILE_TEST whenever possible, I don't see
> how that is possible here if:
> 
> # MELLANOX_PLATFORM is not set
> # ARM64 is not set
> # ACPI is not set
> COMPILE_TEST=y
> 
> Can you please explain?
> 
> Thanks.
> 
> ~Randy

Good point. I tried compiling it against the x86 architecture and it fails 
because it can't find the definition for arm_smccc_smc(). But it would compile 
for the ARM64 architecture with both ACPI and MELLANOX _PLATFORM not set, so 
I'll change it to ARM64 && ((MELLANOX_PLATFORM && ACPI) || COMPILE_TEST) 
instead.
The dependency on ACPI is because our BlueField SoC firmware only uses ACPI to 
pass the device mappings, and MELLANOX_PLATFORM because this device is only 
seen on the Mellanox BlueField SoC so it would be pointless to be used 
elsewhere, but these doesn't prevent it from being compiled.

-Junhan


[PATCH v3] EDAC, mellanox: Add ECC support for BlueField DDR4

2019-03-21 Thread Junhan Zhou
Add ECC support for Mellanox BlueField SoC DDR controller.
This requires SMC to the running Arm Trusted Firmware to report
what is the current memory configuration.

Signed-off-by: Junhan Zhou 
---
 MAINTAINERS   |   5 +
 drivers/edac/Kconfig  |   7 +
 drivers/edac/Makefile |   1 +
 drivers/edac/bluefield_edac.c | 396 ++
 4 files changed, 409 insertions(+)
 create mode 100644 drivers/edac/bluefield_edac.c

diff --git a/MAINTAINERS b/MAINTAINERS
index e17ebf7..394d6c9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5539,6 +5539,11 @@ S:   Supported
 F: drivers/edac/aspeed_edac.c
 F: Documentation/devicetree/bindings/edac/aspeed-sdram-edac.txt
 
+EDAC-BLUEFIELD
+M: Junhan Zhou 
+S: Supported
+F: drivers/edac/bluefield_edac.c
+
 EDAC-CALXEDA
 M: Robert Richter 
 L: linux-e...@vger.kernel.org
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 47eb4d1..404d853 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -504,4 +504,11 @@ config EDAC_ASPEED
  First, ECC must be configured in the bootloader. Then, this driver
  will expose error counters via the EDAC kernel framework.
 
+config EDAC_BLUEFIELD
+   tristate "Mellanox BlueField Memory ECC"
+   depends on ARM64 && ((MELLANOX_PLATFORM && ACPI) || COMPILE_TEST)
+   help
+ Support for error detection and correction on the
+ Mellanox BlueField SoCs.
+
 endif # EDAC
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 89ad4a84..0294a67 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -84,3 +84,4 @@ obj-$(CONFIG_EDAC_XGENE)  += xgene_edac.o
 obj-$(CONFIG_EDAC_TI)  += ti_edac.o
 obj-$(CONFIG_EDAC_QCOM)+= qcom_edac.o
 obj-$(CONFIG_EDAC_ASPEED)  += aspeed_edac.o
+obj-$(CONFIG_EDAC_BLUEFIELD)   += bluefield_edac.o
diff --git a/drivers/edac/bluefield_edac.c b/drivers/edac/bluefield_edac.c
new file mode 100644
index 000..88f51f7
--- /dev/null
+++ b/drivers/edac/bluefield_edac.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Bluefield-specific EDAC driver.
+ *
+ * Copyright (c) 2019 Mellanox Technologies.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "edac_module.h"
+
+#define DRIVER_NAME"bluefield-edac"
+
+/*
+ * Mellanox BlueField EMI (External Memory Interface) register definitions.
+ * Registers which have individual bitfields have a union defining the
+ * bitfields following the address define.
+ */
+
+#define MLXBF_EMI_DRAM_ECC_COUNT 0x340
+
+union mlxbf_emi_dram_ecc_count {
+   struct {
+   u32 single_error_count : 16;
+   u32 double_error_count : 16;
+   };
+
+   u32 word;
+};
+
+#define MLXBF_EMI_DRAM_ECC_ERROR 0x348
+
+union mlxbf_emi_dram_ecc_error {
+   struct {
+   u32 dram_ecc_single : 1;
+   u32 __reserved_0: 15;
+   u32 dram_ecc_double : 1;
+   u32 __reserved_1: 15;
+   };
+
+   u32 word;
+};
+
+#define MLXBF_EMI_DRAM_ECC_LATCH_SELECT 0x354
+
+union mlxbf_emi_dram_ecc_latch_select {
+   struct {
+   u32 dram_ecc_first : 1;
+   u32 __reserved_0   : 15;
+   u32 edge_sel   : 4;
+   u32 __reserved_1   : 4;
+   u32 start  : 1;
+   u32 __reserved_2   : 7;
+   };
+
+   u32 word;
+};
+
+#define MLXBF_EMI_DRAM_ERR_ADDR_0 0x358
+
+#define MLXBF_EMI_DRAM_ERR_ADDR_1 0x37c
+
+#define MLXBF_EMI_DRAM_SYNDROM 0x35c
+
+union mlxbf_emi_dram_syndrom {
+   struct {
+   u32 derr : 1;
+   u32 serr : 1;
+   u32 __reserved_0 : 14;
+   /* ECC syndrome (error bit according to the Hamming code). */
+   u32 syndrom  : 10;
+   u32 __reserved_1 : 6;
+   };
+
+   u32 word;
+};
+
+#define MLXBF_EMI_DRAM_ADDITIONAL_INFO_0 0x364
+
+union mlxbf_emi_dram_additional_info_0 {
+   struct {
+   u32 err_bank : 4;
+   u32 err_lrank: 2;
+   u32 __reserved_0 : 2;
+   u32 err_prank: 2;
+   u32 __reserved_1 : 6;
+   u32 err_edge : 8;
+   u32 __reserved_2 : 8;
+   };
+
+   u32 word;
+};
+
+#define MLXBF_EDAC_DIMM_PER_MC 2
+#define MLXBF_EDAC_ERROR_GRAIN 8
+
+/*
+ * Request MLNX_SIP_GET_DIMM_INFO
+ *
+ * Retrieve information about DIMM on a certain slot.
+ *
+ * Call register usage:
+ * a0: MLNX_SIP_GET_DIMM_INFO
+ * a1: (Memory controller index) << 16 | (Dimm index in memory controller)
+ * a2-7: not used.
+ *
+ * Return status:
+ * a0: dimm_info_smc union defined below describing the DIMM.
+ * a1-3: not used.
+ */
+#define MLNX_SIP_GET_DIMM_INFO 0x8208
+
+/* Format for the

RE: [PATCH v3] EDAC, mellanox: Add ECC support for BlueField DDR4

2019-05-30 Thread Junhan Zhou
> -Original Message-
> From: James Morse 
> Sent: Thursday, May 23, 2019 1:30 PM
> To: Junhan Zhou 
> Cc: Borislav Petkov ; Mauro Carvalho Chehab
> ; Liming Sun ; linux-
> e...@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] EDAC, mellanox: Add ECC support for BlueField DDR4
> 
> Hi Junhan,
> 
> On 21/03/2019 14:31, Junhan Zhou wrote:
> > Add ECC support for Mellanox BlueField SoC DDR controller.
> > This requires SMC to the running Arm Trusted Firmware to report what
> > is the current memory configuration.
> 
> Sorry for the delay on this, it slipped through the cracks. (Please don't 
> reply
> with new patches to the discussion of an old patch/series, this makes it look
> like ongoing discussion on a v1, and v2 never arrives!)
> 
Sorry about this. But at least you caught me before leaving. Shravan (cc'ed) 
would be pushing the V4 and beyond versions of this patch.

> > +config EDAC_BLUEFIELD
> > +   tristate "Mellanox BlueField Memory ECC"
> > +   depends on ARM64 && ((MELLANOX_PLATFORM && ACPI) ||
> COMPILE_TEST)
> 
> What is the MELLANOX_PLATFORM needed for? Is it just to turn off a set of
> drivers in one go? I can't see what other infrastructure you depend on.
> 
Correct. Our memory controller is only present on the BlueField platform, so it 
doesn't make sense to be built on other platforms. There were discussions with 
other maintainers about what config option is used when upstreaming other 
drivers for BlueField, e.g. having a BLUEFIELD_SOC config. But in the end they 
settled down on simply using this flag.

> > +union mlxbf_emi_dram_additional_info_0 {
> > +   struct {
> > +   u32 err_bank : 4;
> > +   u32 err_lrank: 2;
> > +   u32 __reserved_0 : 2;
> > +   u32 err_prank: 2;
> > +   u32 __reserved_1 : 6;
> > +   u32 err_edge : 8;
> > +   u32 __reserved_2 : 8;
> > +   };
> > +
> > +   u32 word;
> > +};
> 
> ... you're expecting the compiler to pack this bitfield in exactly the same 
> way
> your hardware did. I don't think that's guaranteed.
> It evidently works for your current compiler, but another compiler may pack
> this structure differently. Toggling endianness will break this, (arm64
> supports both). If your platform supports aarch32, someone may want to get
> 32bit arm running, which may have different compiler behaviour.
> 
> You are also using bitfields between hardware and firmware, so its currently
> possible the firmware requires the kernel to be built with a compiler that
> means it can't interact with the hardware...
> 
> When this has come up in the past, the advice was not to use bitfields:
> https://lore.kernel.org/lkml/1077080607.1078.109.camel@gaston/
> 
> Please use shifts and masks.
> 
Okay if you insist. I do see this kind of style used elsewhere, e.g. in 
pnd2_edac.h. And this driver is only used on the BlueField SoC which our 
bootloader would configure it to be only running at Aarch64 little endian.

> 
> > +#define MLXBF_EDAC_DIMM_PER_MC 2
> > +#define MLXBF_EDAC_ERROR_GRAIN 8
> 
> If these numbers changed, would it still be a BlueField SoC?
> (if next years made-up:BlueField2 supports more Dimms per MC, it would be
> better to make this a property in the firmware table).
> 
No it won't happen. The memory controller we are using doesn't have the 
strength to drive more than 2 DIMMs. If this were ever fixed it in the future, 
I would anticipate they design a completely new memory controller for which 
this driver code won't be compatible with.
BTW, how did you ever guess that the next gen chip would be called BlueField2? 
Did you search for "BlueField" in the registered PCI IDs and accidentally found 
it? (https://fossies.org/linux/systemd/hwdb/pci.ids)
> 
> > +/*
> > + * Request MLNX_SIP_GET_DIMM_INFO
> > + *
> > + * Retrieve information about DIMM on a certain slot.
> > + *
> > + * Call register usage:
> > + * a0: MLNX_SIP_GET_DIMM_INFO
> > + * a1: (Memory controller index) << 16 | (Dimm index in memory
> > +controller)
> > + * a2-7: not used.
> > + *
> > + * Return status:
> > + * a0: dimm_info_smc union defined below describing the DIMM.
> > + * a1-3: not used.
> > + */
> 
> Have Mellanox published these call numbers/arguments in a document
> somewhere? If they differ with the firmware, it would be good to know
> which side needs fixing.
> 
> It is a little odd that you read the number of memory controllers from the
> ACPI table, but use an SMC call to read the DI