Re: [RFC PATCH 1/6] efi / ras: CCIX Memory error reporting

2019-06-21 Thread Jonathan Cameron
On Thu, 6 Jun 2019 20:36:49 +0800
Jonathan Cameron  wrote:

> CCIX defines a number of different error types
> (See CCIX spec 1.0) and UEFI 2.8 defines a CPER record to allow
> for them to be reported when firmware first handling is in use.
> The last part of that record is a copy of the CCIX protocol
> error record which can provide very detailed information.
> 
> This patch introduces infrastructure and support for one of those
> error types, CCIX Memory Errors.  Later patches will supply
> equivalent support for the other error types.
> 
> The variable length and content of the different messages makes
> a single tracepoint impractical.  As such the current RAS
> tracepoint only covers the memory error. Additional trace points
> will be introduced for other error types along with their
> cper handling in a follow up series.
> 
> RAS daemon support to follow shortly. qemu injection patches
> also available but not currently planning to upstream those.
> 
> Signed-off-by: Jonathan Cameron 
As this is still an RFC I'm not going to spin a new version yet,
but we need some ifdef fun in the event header as it's calling
functions much like extlog_mem_event does.

I'll roll that fix into v2 once people have had time to look at
this version.

Thanks,

Jonathan

> ---
>  drivers/acpi/apei/Kconfig|   8 +
>  drivers/acpi/apei/ghes.c |  39 
>  drivers/firmware/efi/Kconfig |   5 +
>  drivers/firmware/efi/Makefile|   1 +
>  drivers/firmware/efi/cper-ccix.c | 356 +++
>  drivers/firmware/efi/cper.c  |   6 +
>  include/linux/cper.h | 118 ++
>  include/ras/ras_event.h  |  77 +++
>  8 files changed, 610 insertions(+)
> 
> diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
> index 6b18f8bc7be35..e687b18dee344 100644
> --- a/drivers/acpi/apei/Kconfig
> +++ b/drivers/acpi/apei/Kconfig
> @@ -68,3 +68,11 @@ config ACPI_APEI_ERST_DEBUG
> error information to and from a persistent store. Enable this
> if you want to debugging and testing the ERST kernel support
> and firmware implementation.
> +
> +config ACPI_APEI_CCIX
> +   bool "APEI CCIX error recovery support"
> +   depends on ACPI_APEI && MEMORY_FAILURE
> +   help
> +  CCIX has a number of defined error types. This option enables
> +  the handling of CPER records generated by a firmware performing
> +  firmware first error handling of these CCIX errors.
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 993940d582f50..cfc7dc31a9380 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -477,6 +477,42 @@ static void ghes_handle_aer(struct 
> acpi_hest_generic_data *gdata)
>  #endif
>  }
>  
> +static void ghes_handle_ccix_per(struct acpi_hest_generic_data *gdata, int 
> sev)
> +{
> +#ifdef CONFIG_ACPI_APEI_CCIX
> + struct cper_sec_ccix_header *header = acpi_hest_get_payload(gdata);
> + __u32 *dw;
> + enum ccix_per_type per_type;
> + static u32 err_seq;
> + void *payload;
> +
> + /* Check if space for CCIX CPER header and 8 DW of a PER log header */
> + if (gdata->error_data_length <
> + sizeof(*header) + CCIX_PER_LOG_HEADER_DWS * sizeof(__u32))
> + return;
> +
> + if ((header->validation_bits & CPER_CCIX_VALID_PER_LOG) == 0)
> + return;
> +
> + dw = (__u32 *)(header + 1);
> +
> + per_type = FIELD_GET(CCIX_PER_LOG_DW1_PER_TYPE_M, dw[1]);
> + payload = acpi_hest_get_payload(gdata);
> +
> + switch (per_type) {
> + case CCIX_MEMORY_ERROR:
> + trace_ccix_memory_error_event(payload, err_seq, sev,
> +   
> ccix_mem_err_ven_len_get(payload));
> + break;
> + default:
> + /* Unknown error type */
> + pr_info("CCIX error of unknown or vendor defined type\n");
> + break;
> + }
> + err_seq++;
> +#endif
> +}
> +
>  static void ghes_do_proc(struct ghes *ghes,
>const struct acpi_hest_generic_status *estatus)
>  {
> @@ -507,6 +543,9 @@ static void ghes_do_proc(struct ghes *ghes,
>   else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
>   ghes_handle_aer(gdata);
>   }
> + else if (guid_equal(sec_type, &CPER_SEC_CCIX)) {
> + ghes_handle_ccix_per(gdata, estatus->error_severity);
> + }
>   else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
>   struct cper_sec_proc_arm *err = 
> acpi_hest_get_payload(gdata);
>  
> diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
> index d4ea929e8b344..9ea161f68da8d 100644
> --- a/drivers/firmware/efi/Kconfig
> +++ b/drivers/firmware/efi/Kconfig
> @@ -195,6 +195,11 @@ config UEFI_CPER_X86
>   depends on UEFI_CPER && X86
>   default y
>  
> +config UEFI_CPER_CCIX
> +   bool
> +   depends on UEF

[RFC PATCH 1/6] efi / ras: CCIX Memory error reporting

2019-06-06 Thread Jonathan Cameron
CCIX defines a number of different error types
(See CCIX spec 1.0) and UEFI 2.8 defines a CPER record to allow
for them to be reported when firmware first handling is in use.
The last part of that record is a copy of the CCIX protocol
error record which can provide very detailed information.

This patch introduces infrastructure and support for one of those
error types, CCIX Memory Errors.  Later patches will supply
equivalent support for the other error types.

The variable length and content of the different messages makes
a single tracepoint impractical.  As such the current RAS
tracepoint only covers the memory error. Additional trace points
will be introduced for other error types along with their
cper handling in a follow up series.

RAS daemon support to follow shortly. qemu injection patches
also available but not currently planning to upstream those.

Signed-off-by: Jonathan Cameron 
---
 drivers/acpi/apei/Kconfig|   8 +
 drivers/acpi/apei/ghes.c |  39 
 drivers/firmware/efi/Kconfig |   5 +
 drivers/firmware/efi/Makefile|   1 +
 drivers/firmware/efi/cper-ccix.c | 356 +++
 drivers/firmware/efi/cper.c  |   6 +
 include/linux/cper.h | 118 ++
 include/ras/ras_event.h  |  77 +++
 8 files changed, 610 insertions(+)

diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index 6b18f8bc7be35..e687b18dee344 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -68,3 +68,11 @@ config ACPI_APEI_ERST_DEBUG
  error information to and from a persistent store. Enable this
  if you want to debugging and testing the ERST kernel support
  and firmware implementation.
+
+config ACPI_APEI_CCIX
+   bool "APEI CCIX error recovery support"
+   depends on ACPI_APEI && MEMORY_FAILURE
+   help
+CCIX has a number of defined error types. This option enables
+the handling of CPER records generated by a firmware performing
+firmware first error handling of these CCIX errors.
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 993940d582f50..cfc7dc31a9380 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -477,6 +477,42 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
 #endif
 }
 
+static void ghes_handle_ccix_per(struct acpi_hest_generic_data *gdata, int sev)
+{
+#ifdef CONFIG_ACPI_APEI_CCIX
+   struct cper_sec_ccix_header *header = acpi_hest_get_payload(gdata);
+   __u32 *dw;
+   enum ccix_per_type per_type;
+   static u32 err_seq;
+   void *payload;
+
+   /* Check if space for CCIX CPER header and 8 DW of a PER log header */
+   if (gdata->error_data_length <
+   sizeof(*header) + CCIX_PER_LOG_HEADER_DWS * sizeof(__u32))
+   return;
+
+   if ((header->validation_bits & CPER_CCIX_VALID_PER_LOG) == 0)
+   return;
+
+   dw = (__u32 *)(header + 1);
+
+   per_type = FIELD_GET(CCIX_PER_LOG_DW1_PER_TYPE_M, dw[1]);
+   payload = acpi_hest_get_payload(gdata);
+
+   switch (per_type) {
+   case CCIX_MEMORY_ERROR:
+   trace_ccix_memory_error_event(payload, err_seq, sev,
+                                 ccix_mem_err_ven_len_get(payload));
+   break;
+   default:
+   /* Unknown error type */
+   pr_info("CCIX error of unknown or vendor defined type\n");
+   break;
+   }
+   err_seq++;
+#endif
+}
+
 static void ghes_do_proc(struct ghes *ghes,
 const struct acpi_hest_generic_status *estatus)
 {
@@ -507,6 +543,9 @@ static void ghes_do_proc(struct ghes *ghes,
else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
ghes_handle_aer(gdata);
}
+   else if (guid_equal(sec_type, &CPER_SEC_CCIX)) {
+   ghes_handle_ccix_per(gdata, estatus->error_severity);
+   }
else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
 
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index d4ea929e8b344..9ea161f68da8d 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -195,6 +195,11 @@ config UEFI_CPER_X86
depends on UEFI_CPER && X86
default y
 
+config UEFI_CPER_CCIX
+   bool
+   depends on UEFI_CPER
+   default y
+
 config EFI_DEV_PATH_PARSER
bool
depends on ACPI
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index d2d0d20306200..69287da9664b6 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -33,3 +33,4 @@ obj-$(CONFIG_EFI_CAPSULE_LOADER)  += capsule-loader.o
 obj-$(CONFIG_EFI_EARLYCON) += earlycon.o
 obj-$(CONFIG_UEFI_CPER_ARM)+= cper-arm.o
 obj-$(CONFIG_UEFI_CP