The branch main has been updated by gallatin:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=16e5abf415baf801c6d7c7948a742aeda75e2237

commit 16e5abf415baf801c6d7c7948a742aeda75e2237
Author:     Andrew Gallatin <[email protected]>
AuthorDate: 2026-06-06 00:07:03 +0000
Commit:     Andrew Gallatin <[email protected]>
CommitDate: 2026-06-06 00:12:21 +0000

    APEI: Provide more info on fatal hardware errors
    
    This change refactors fatal error delivery via APEI and prints more info:
    
    - Makes the NMI handler call into the ge handler to establish a common
            code flow, no matter how the error is delivered
    - Adds the FRU text to the panic string, so that it provides more
            information than just "APEI Fatal Hardware Error!", for example
            "APEI Fatal Hardware Error: PcieError"
    - Prints more details about fatal PCIe errors.  Note that we skip acquiring
            Giant on fatal errors
    - Hexdumps the full GED data on fatal errors, so as to facilitate
            offline data analysis
    
    Reviewed by: imp
    Sponsored by: Netflix
    Differential Revision: https://reviews.freebsd.org/D57417
---
 sys/dev/acpica/acpi_apei.c | 53 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 16 deletions(-)

diff --git a/sys/dev/acpica/acpi_apei.c b/sys/dev/acpica/acpi_apei.c
index e85b3910e46d..925558d585bf 100644
--- a/sys/dev/acpica/acpi_apei.c
+++ b/sys/dev/acpica/acpi_apei.c
@@ -237,7 +237,7 @@ apei_mem_handler(ACPI_HEST_GENERIC_DATA *ged)
 }
 
 static int
-apei_pcie_handler(ACPI_HEST_GENERIC_DATA *ged)
+apei_pcie_handler(ACPI_HEST_GENERIC_DATA *ged, bool fatal)
 {
        struct apei_pcie_error *p = (struct apei_pcie_error *)GED_DATA(ged);
        int off;
@@ -246,7 +246,8 @@ apei_pcie_handler(ACPI_HEST_GENERIC_DATA *ged)
        int h = 0, sev;
 
        if ((p->ValidationBits & 0x8) == 0x8) {
-               mtx_lock(&Giant);
+               if (!fatal)
+                       mtx_lock(&Giant);
                dev = pci_find_dbsf((uint32_t)p->DeviceID[10] << 8 |
                    p->DeviceID[9], p->DeviceID[11], p->DeviceID[8],
                    p->DeviceID[7]);
@@ -264,9 +265,11 @@ apei_pcie_handler(ACPI_HEST_GENERIC_DATA *ged)
                        }
                        pcie_apei_error(dev, sev,
                            (p->ValidationBits & 0x80) ? p->AERInfo : NULL);
-                       h = 1;
+                       if (!fatal)
+                               h = 1;
                }
-               mtx_unlock(&Giant);
+               if (!fatal)
+                       mtx_unlock(&Giant);
        }
        if (h)
                return (h);
@@ -322,8 +325,8 @@ apei_pcie_handler(ACPI_HEST_GENERIC_DATA *ged)
        return (0);
 }
 
-static void
-apei_ged_handler(ACPI_HEST_GENERIC_DATA *ged)
+static const char *
+apei_ged_handler(ACPI_HEST_GENERIC_DATA *ged, bool fatal)
 {
        ACPI_HEST_GENERIC_DATA_V300 *ged3 = (ACPI_HEST_GENERIC_DATA_V300 *)ged;
        /* A5BC1114-6F64-4EDE-B863-3E83ED7C83B1 */
@@ -342,12 +345,12 @@ apei_ged_handler(ACPI_HEST_GENERIC_DATA *ged)
        if (memcmp(mem_uuid, ged->SectionType, ACPI_UUID_LENGTH) == 0) {
                h = apei_mem_handler(ged);
        } else if (memcmp(pcie_uuid, ged->SectionType, ACPI_UUID_LENGTH) == 0) {
-               h = apei_pcie_handler(ged);
+               h = apei_pcie_handler(ged, fatal);
        } else {
                if (!log_corrected &&
                    (ged->ErrorSeverity == ACPI_HEST_GEN_ERROR_CORRECTED ||
                    ged->ErrorSeverity == ACPI_HEST_GEN_ERROR_NONE))
-                       return;
+                       return (NULL);
 
                t = ged->SectionType;
                printf("APEI %s Error %02x%02x%02x%02x-%02x%02x-"
@@ -364,7 +367,7 @@ apei_ged_handler(ACPI_HEST_GENERIC_DATA *ged)
                }
        }
        if (h)
-               return;
+               return (NULL);
 
        printf(" Flags: 0x%x\n", ged->Flags);
        if (ged->ValidationBits & ACPI_HEST_GEN_VALID_FRU_ID) {
@@ -379,6 +382,19 @@ apei_ged_handler(ACPI_HEST_GENERIC_DATA *ged)
        if (ged->Revision >= 0x300 &&
            ged->ValidationBits & ACPI_HEST_GEN_VALID_TIMESTAMP)
                printf(" Timestamp: %016jx\n", ged3->TimeStamp);
+       if (fatal) {
+               printf(" Error Data:\n");
+               t = (uint8_t *)GED_DATA(ged);
+               for (off = 0; off < ged->ErrorDataLength; off++) {
+                       printf(" %02x", t[off]);
+                       if ((off % 16) == 15 ||
+                           off + 1 == ged->ErrorDataLength)
+                               printf("\n");
+               }
+       }
+       if (ged->ValidationBits & ACPI_HEST_GEN_VALID_FRU_STRING)
+               return ((const char *)ged->FruText);
+       return (NULL);
 }
 
 static int
@@ -387,23 +403,27 @@ apei_ge_handler(struct apei_ge *ge, bool copy)
        uint8_t *buf = copy ? ge->copybuf : ge->buf;
        ACPI_HEST_GENERIC_STATUS *ges = (ACPI_HEST_GENERIC_STATUS *)buf;
        ACPI_HEST_GENERIC_DATA *ged;
+       const char *fru, *f;
        size_t off, len;
-       uint32_t sev;
        int i, c;
+       bool fatal;
 
        if (ges == NULL || ges->BlockStatus == 0)
                return (0);
 
        c = (ges->BlockStatus >> 4) & 0x3ff;
-       sev = ges->ErrorSeverity;
+       fatal = (ges->ErrorSeverity == ACPI_HEST_GEN_ERROR_FATAL);
 
        /* Process error entries. */
+       fru = NULL;
        len = MIN(ge->v1.ErrorBlockLength - sizeof(*ges), ges->DataLength);
        for (off = i = 0; i < c && off + sizeof(*ged) <= len; i++) {
                ged = (ACPI_HEST_GENERIC_DATA *)&buf[sizeof(*ges) + off];
                if ((uint64_t)GED_SIZE(ged) + ged->ErrorDataLength > len - off)
                        break;
-               apei_ged_handler(ged);
+               f = apei_ged_handler(ged, fatal);
+               if (f != NULL && fru == NULL)
+                       fru = f;
                off += GED_SIZE(ged) + ged->ErrorDataLength;
        }
 
@@ -418,8 +438,9 @@ apei_ge_handler(struct apei_ge *ge, bool copy)
        }
 
        /* If ACPI told the error is fatal -- make it so. */
-       if (sev == ACPI_HEST_GEN_ERROR_FATAL)
-               panic("APEI Fatal Hardware Error!");
+       if (fatal)
+               panic("APEI Fatal Hardware Error: %.20s",
+                   fru != NULL ? fru : "unknown");
 
        return (1);
 }
@@ -450,9 +471,9 @@ apei_nmi_handler(void)
                if (ges == NULL || ges->BlockStatus == 0)
                        continue;
 
-               /* If ACPI told the error is fatal -- make it so. */
+               /* Log and panic via apei_ge_handler(); does not return. */
                if (ges->ErrorSeverity == ACPI_HEST_GEN_ERROR_FATAL)
-                       panic("APEI Fatal Hardware Error!");
+                       apei_ge_handler(ge, false);
 
                /* Copy the buffer for later processing. */
                gesc = (ACPI_HEST_GENERIC_STATUS *)ge->copybuf;

Reply via email to