On 08/08/25 1:52 PM, Ganesh Goudar wrote:
Parse the AER uncorrectable and correctable error status
registers to print the error type and severity.
The output looks like:
EEH:AER severity=Uncorrected (Fatal), Error Type: Data Link Protocol Error
Signed-off-by: Ganesh Goudar <[email protected]>
---
v2:
* Remove unnecessary checks.
* Change the error message format.
---
arch/powerpc/kernel/eeh.c | 81 ++++++++++++++++++++++++++++++++++++++-
1 file changed, 80 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 83fe99861eb1..cd083e59d6b3 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -139,6 +139,49 @@ struct eeh_stats {
static struct eeh_stats eeh_stats;
+/*
+ * Names for the bits of the AER Uncorrectable Error Status register,
+ * indexed by bit position. Every bit up to the last named one needs a
+ * slot (bits without an assigned error are "Undefined"), otherwise each
+ * later name is reported for the wrong bit.
+ */
+static const char * const aer_uncor_errors[] = {
+	"Undefined",			/* Bit 0 */
+	"Undefined",			/* Bit 1 */
+	"Undefined",			/* Bit 2 */
+	"Undefined",			/* Bit 3 */
+	"Data Link Protocol",		/* Bit 4 */
+	"Surprise Down",		/* Bit 5 */
+	"Undefined",			/* Bit 6 */
+	"Undefined",			/* Bit 7 */
+	"Undefined",			/* Bit 8 */
+	"Undefined",			/* Bit 9 */
+	"Undefined",			/* Bit 10 */
+	"Undefined",			/* Bit 11 */
+	"Poisoned TLP",			/* Bit 12 */
+	"Flow Control Protocol",	/* Bit 13 */
+	"Completion Timeout",		/* Bit 14 */
+	"Completer Abort",		/* Bit 15 */
+	"Unexpected Completion",	/* Bit 16 */
+	"Receiver Overflow",		/* Bit 17 */
+	"Malformed TLP",		/* Bit 18 */
+	"ECRC Error",			/* Bit 19 */
+	"Unsupported Request",		/* Bit 20 */
+	"ACS Violation",		/* Bit 21 */
+	"Uncorrectable Internal Error",	/* Bit 22 */
+	"MC Blocked TLP",		/* Bit 23 */
+	"AtomicOp Egress Blocked",	/* Bit 24 */
+	"TLPPrefix Blocked",		/* Bit 25 */
+	"Poisoned TLP Egress Blocked",	/* Bit 26 */
+};
+
+/* Names for the AER correctable error status bits, keyed by bit position. */
+static const char * const aer_cor_errors[] = {
+	[0]  = "Receiver Error",
+	[1]  = "Undefined",
+	[2]  = "Undefined",
+	[3]  = "Undefined",
+	[4]  = "Undefined",
+	[5]  = "Undefined",
+	[6]  = "Bad TLP",
+	[7]  = "Bad DLLP",
+	[8]  = "Replay Num Rollover",
+	[9]  = "Undefined",
+	[10] = "Undefined",
+	[11] = "Undefined",
+	[12] = "Replay Timer Timeout",
+	[13] = "Advisory Non-Fatal Error",
+	[14] = "Corrected Internal Error",
+	[15] = "Header Log Overflow",
+};
+
static int __init eeh_setup(char *str)
{
if (!strcmp(str, "off"))
@@ -160,6 +203,40 @@ void eeh_show_enabled(void)
pr_info("EEH: No capable adapters found: recovery disabled.\n");
}
+/**
+ * eeh_parse_aer_registers - Decode and log the pending AER errors
+ * @edev: EEH device whose config space is read
+ * @cap: config-space offset of the device's AER capability
+ *
+ * Reads the AER uncorrectable and correctable status and mask registers
+ * plus the uncorrectable severity register, then logs one line for every
+ * status bit that is set and not masked. Bits that have no entry in the
+ * name tables are reported as "Unknown".
+ */
+static void eeh_parse_aer_registers(struct eeh_dev *edev, int cap)
+{
+	/*
+	 * Zero-init: the read_config() results are not checked, so make
+	 * sure an unwritten output is never decoded as an error status.
+	 */
+	u32 uncor_status = 0, uncor_mask = 0, uncor_severity = 0;
+	u32 cor_status = 0, cor_mask = 0;
+	const char *error_type;
+	unsigned int i;
+
+	eeh_ops->read_config(edev, cap + PCI_ERR_UNCOR_STATUS, 4,
+			     &uncor_status);
+	eeh_ops->read_config(edev, cap + PCI_ERR_UNCOR_MASK, 4,
+			     &uncor_mask);
+	eeh_ops->read_config(edev, cap + PCI_ERR_UNCOR_SEVER, 4,
+			     &uncor_severity);
+	eeh_ops->read_config(edev, cap + PCI_ERR_COR_STATUS, 4, &cor_status);
+	eeh_ops->read_config(edev, cap + PCI_ERR_COR_MASK, 4, &cor_mask);
+
+	/* Masked-off errors would only be false positives in the log. */
+	uncor_status &= ~uncor_mask;
+	cor_status &= ~cor_mask;
+
+	/*
+	 * Walk all 32 status bits rather than just the table length so
+	 * that a set bit without a name is still reported. 1U keeps the
+	 * shift unsigned for bit 31.
+	 */
+	for (i = 0; i < 32; i++) {
+		if (!(uncor_status & (1U << i)))
+			continue;
+
+		error_type = (i < ARRAY_SIZE(aer_uncor_errors)) ?
+			     aer_uncor_errors[i] : "Unknown";
+		pr_err("EEH:AER severity=Uncorrected (%s), Error Type: %s\n",
+		       (uncor_severity & (1U << i)) ? "Fatal" : "Non-Fatal",
+		       error_type);
+	}
+
+	for (i = 0; i < 32; i++) {
+		if (!(cor_status & (1U << i)))
+			continue;
+
+		error_type = (i < ARRAY_SIZE(aer_cor_errors)) ?
+			     aer_cor_errors[i] : "Unknown";
+		pr_err("EEH:AER severity=Correctable, Error Type: %s\n",
+		       error_type);
+	}
+}
+
You missed my review comment on the previous version.
It would be better to also consider the AER mask registers when
interpreting the error status. Otherwise, masked-off bits may still
appear in the logs, leading to false positives. For example, something
like:
eeh_ops->read_config(edev, cap + PCI_ERR_UNCOR_MASK, 4, &uncor_mask);
eeh_ops->read_config(edev, cap + PCI_ERR_COR_MASK, 4, &cor_mask);
if (uncor_status & ~uncor_mask) { ... }
if (cor_status & ~cor_mask) { ... }
This way only unmasked errors are reported.
Regards,
Narayana Murty N
/*
* This routine captures assorted PCI configuration space data
* for the indicated PCI device, and puts them into a buffer
@@ -237,9 +314,11 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char
*buf, size_t len)
pr_warn("%s\n", buffer);
}
- /* If AER capable, dump it */
+ /* If AER capable, parse and dump it */
cap = edev->aer_cap;
if (cap) {
+ eeh_parse_aer_registers(edev, cap);
+
n += scnprintf(buf+n, len-n, "pci-e AER:\n");
pr_warn("EEH: PCI-E AER capability register set follows:\n");