Find the CPU caches in the ACPI PPTT and add a CPU EDAC device,
with EDAC device blocks for the caches found.

For firmware-first error handling, add an interface to ghes_edac
that reports the corrected error count for a CPU core to user
space through the CPU EDAC device.

Suggested-by: James Morse <james.mo...@arm.com>
Signed-off-by: Jonathan Cameron <jonathan.came...@huawei.com>
Signed-off-by: Shiju Jose <shiju.j...@huawei.com>
---
 drivers/edac/Kconfig     |  10 +++
 drivers/edac/ghes_edac.c | 135 +++++++++++++++++++++++++++++++++++++++
 include/acpi/ghes.h      |  27 ++++++++
 3 files changed, 172 insertions(+)

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 7a47680d6f07..3a0d8d134dcc 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -74,6 +74,16 @@ config EDAC_GHES
 
          In doubt, say 'Y'.
 
+config EDAC_GHES_CPU_ERROR
+       bool "EDAC device for reporting firmware-first BIOS detected CPU error 
count"
+       depends on EDAC_GHES && ACPI_PPTT
+       help
+         EDAC device for the firmware-first, BIOS-detected CPU error count
+         reported via ACPI APEI/GHES. Enabling this option creates an EDAC
+         device for the CPU hierarchy and EDAC device blocks for the cache
+         hierarchy. The CPU error count is exposed to user space via the
+         CPU EDAC device's sysfs interface.
+
 config EDAC_AMD64
        tristate "AMD64 (Opteron, Athlon64)"
        depends on AMD_NB && EDAC_DECODE_MCE
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index a918ca93e4f7..96619483e5f3 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -12,6 +12,9 @@
 #include <acpi/ghes.h>
 #include <linux/edac.h>
 #include <linux/dmi.h>
+#if defined(CONFIG_EDAC_GHES_CPU_ERROR)
+#include <linux/cacheinfo.h>
+#endif
 #include "edac_module.h"
 #include <ras/ras_event.h>
 
@@ -497,6 +500,130 @@ void ghes_edac_report_mem_error(int sev, struct 
cper_sec_mem_err *mem_err)
        spin_unlock_irqrestore(&ghes_lock, flags);
 }
 
+#if defined(CONFIG_EDAC_GHES_CPU_ERROR)
+#define MAX_NUM_CACHES 20
+static struct ghes_edac_cpu_block {
+       int cpu;
+       u8 level;
+       u8 type;
+       int block_nr;
+} *cpu_edac_block_list;
+
+static struct edac_device_ctl_info *cpu_edac_dev;
+static int max_number_of_caches;
+
+void ghes_edac_report_cpu_error(struct ghes_einfo_cpu *einfo)
+{
+       struct ghes_edac_cpu_block *block;
+       int i;
+
+       if (!einfo || !(einfo->ce_count) || !max_number_of_caches)
+               return;
+
+       for (i = 0; i < max_number_of_caches; i++) {
+               block = cpu_edac_block_list + (einfo->cpu * 
max_number_of_caches) + i;
+               if ((block->level == einfo->cache_level) && (block->type == 
einfo->cache_type)) {
+                       edac_device_handle_ce_count(cpu_edac_dev, 
einfo->ce_count,
+                                                   einfo->cpu, 
block->block_nr, "");
+                       break;
+               }
+       }
+}
+
+static  int ghes_edac_add_cpu_device(struct device *dev)
+{
+       int rc;
+
+       cpu_edac_dev = edac_device_alloc_ctl_info(0, "cpu",  
num_possible_cpus(),
+                                                 "cache", 
max_number_of_caches, 0, NULL,
+                                                 0, edac_device_alloc_index());
+       if (!cpu_edac_dev) {
+               pr_warn("edac_device_alloc_ctl_info for cpu_edac_dev failed\n");
+               return -ENOMEM;
+       }
+
+       cpu_edac_dev->dev = dev;
+       cpu_edac_dev->ctl_name = "cpu_edac_dev";
+       cpu_edac_dev->dev_name = "ghes";
+       cpu_edac_dev->mod_name = "ghes_edac.c";
+       rc = edac_device_add_device(cpu_edac_dev);
+       if (rc) {
+               pr_warn("edac_device_add_device failed\n");
+               edac_device_free_ctl_info(cpu_edac_dev);
+               return rc;
+       }
+
+       return 0;
+}
+
+static  void ghes_edac_delete_cpu_device(void)
+{
+       max_number_of_caches = 0;
+       if (cpu_edac_dev) {
+               edac_device_del_device(cpu_edac_dev->dev);
+               edac_device_free_ctl_info(cpu_edac_dev);
+       }
+       vfree(cpu_edac_block_list);
+}
+
+static void ghes_edac_create_cpu_device(struct device *dev)
+{
+       int cpu, i;
+       struct ghes_edac_cpu_block *block;
+       int number_of_caches;
+       struct acpi_cacheinfo cacheinfo[MAX_NUM_CACHES];
+
+       /* Find the maximum number of caches present in the cpu hierarchy among the CPUs */
+       for_each_possible_cpu(cpu) {
+               number_of_caches = acpi_find_cache_info(cpu, &cacheinfo[0], 
MAX_NUM_CACHES);
+               if (number_of_caches <= 0)
+                       return;
+
+               if (max_number_of_caches < number_of_caches)
+                       max_number_of_caches = number_of_caches;
+       }
+       if (!max_number_of_caches)
+               return;
+
+       /*
+        * EDAC device interface supports creating the CPU hierarchy for all the CPUs
+        * together. Thus cpu_edac_block_list must be allocated for max_number_of_caches,
+        * the largest cache count of any CPU, even though the number of caches per CPU may vary.
+        */
+       cpu_edac_block_list = vzalloc(num_possible_cpus() * 
max_number_of_caches *
+                                     sizeof(*cpu_edac_block_list));
+       if (!cpu_edac_block_list)
+               return;
+
+       if (ghes_edac_add_cpu_device(dev))
+               goto error;
+
+       for_each_possible_cpu(cpu) {
+               memset(cacheinfo, 0, MAX_NUM_CACHES * sizeof(struct 
acpi_cacheinfo));
+               number_of_caches = acpi_find_cache_info(cpu, &cacheinfo[0], 
MAX_NUM_CACHES);
+               if (number_of_caches <= 0)
+                       goto error;
+               /*
+                * The edac cpu cache device blocks entries in the sysfs should 
match with the cpu
+                * cache structure in the sysfs so that the affected cpus for a 
shared cache
+                * can be easily extracted in the userspace.
+                */
+               for (i = 0; i < number_of_caches; i++) {
+                       block = cpu_edac_block_list + (cpu * 
max_number_of_caches) + i;
+                       block->cpu = cpu;
+                       block->level = cacheinfo[i].level;
+                       block->type = cacheinfo[i].type;
+                       block->block_nr = i;
+               }
+       }
+
+       return;
+
+error:
+       ghes_edac_delete_cpu_device();
+}
+#endif
+
 /*
  * Known systems that are safe to enable this module.
  */
@@ -624,6 +751,10 @@ int ghes_edac_register(struct ghes *ghes, struct device 
*dev)
        ghes_pvt = pvt;
        spin_unlock_irqrestore(&ghes_lock, flags);
 
+#if defined(CONFIG_EDAC_GHES_CPU_ERROR)
+       ghes_edac_create_cpu_device(dev);
+#endif
+
        /* only set on success */
        refcount_set(&ghes_refcount, 1);
 
@@ -654,6 +785,10 @@ void ghes_edac_unregister(struct ghes *ghes)
        if (!refcount_dec_and_test(&ghes_refcount))
                goto unlock;
 
+#if defined(CONFIG_EDAC_GHES_CPU_ERROR)
+       ghes_edac_delete_cpu_device();
+#endif
+
        /*
         * Wait for the irq handler being finished.
         */
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 34fb3431a8f3..a9098daf53d4 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -73,6 +73,24 @@ void ghes_unregister_vendor_record_notifier(struct 
notifier_block *nb);
 
 int ghes_estatus_pool_init(int num_ghes);
 
+/**
+ * struct ghes_einfo_cpu - structure to pass cpu error info to the edac
+ * @cpu: CPU index.
+ * @error_type: error type, cache/TLB/bus/ etc.
+ * @cache_level: cache level.
+ * @cache_type: ACPI cache type.
+ * @ue_count: CPU uncorrectable error count.
+ * @ce_count: CPU correctable error count.
+ */
+struct ghes_einfo_cpu {
+       int cpu;
+       u8 error_type;
+       u8 cache_level;
+       u8 cache_type;
+       u16 ue_count;
+       u16 ce_count;
+};
+
 /* From drivers/edac/ghes_edac.c */
 
 #ifdef CONFIG_EDAC_GHES
@@ -98,6 +116,15 @@ static inline void ghes_edac_unregister(struct ghes *ghes)
 }
 #endif
 
+#ifdef CONFIG_EDAC_GHES_CPU_ERROR
+void ghes_edac_report_cpu_error(struct ghes_einfo_cpu *einfo_cpu);
+
+#else
+static inline void ghes_edac_report_cpu_error(struct ghes_einfo_cpu *einfo_cpu)
+{
+}
+#endif
+
 static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata)
 {
        return gdata->revision >> 8;
-- 
2.17.1

Reply via email to