Add support to handle CSC firmware reported errors. When CSC firmware errors are encoutered, a error interrupt is received by the GFX device as a MSI interrupt.
Device Source control registers indicates the source of the error as CSC The HEC error status register indicates that the error is firmware reported Depending on the type of error, the error cause is written to the HEC Firmware error register. On encountering such CSC firmware errors, the graphics device is non-recoverable from driver context. The only way to recover from these errors is firmware flash. System admin/userspace is notified of the necessity of firmware flash with a combination of vendor-specific drm wedged uevent, dmesg logs and runtime survivability sysfs. It is the responsiblity of the consumer to verify all the actions and then trigger a firmware update using tools like fwupd $ udevadm monitor --property --kernel monitor will print the received events for: KERNEL - the kernel uevent KERNEL[754.709341] change /devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0/drm/card0 (drm) ACTION=change DEVPATH=/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0/drm/card0 SUBSYSTEM=drm WEDGED=vendor-specific DEVNAME=/dev/dri/card0 DEVTYPE=drm_minor SEQNUM=5973 MAJOR=226 MINOR=0 Logs xe 0000:03:00.0: [drm] *ERROR* [Hardware Error]: Tile0 reported NONFATAL error 0x20000 xe 0000:03:00.0: [drm] *ERROR* [Hardware Error]: NONFATAL: HEC Uncorrected FW FD Corruption error reported, bit[2] is set xe 0000:03:00.0: Runtime Survivability mode enabled xe 0000:03:00.0: [drm] *ERROR* CRITICAL: Xe has declared device 0000:03:00.0 as wedged. IOCTLs and executions are blocked. Only a rebind may clear the failure Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new xe 0000:03:00.0: [drm] device wedged, needs recovery xe 0000:03:00.0: Firmware update required, Please refer to the userspace documentation for more details! Runtime survivability Sysfs: /sys/bus/pci/devices/<device>/survivability_mode v2: use vendor recovery method with runtime survivability (Christian, Rodrigo, Raag) v3: move declare wedged to runtime survivability mode (Rodrigo) v4: update commit message Signed-off-by: Riana Tauro <riana.ta...@intel.com> Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.rama...@intel.com> --- drivers/gpu/drm/xe/regs/xe_gsc_regs.h | 2 + drivers/gpu/drm/xe/regs/xe_hw_error_regs.h | 7 ++- drivers/gpu/drm/xe/xe_device_types.h | 3 + drivers/gpu/drm/xe/xe_hw_error.c | 64 ++++++++++++++++++++++ drivers/gpu/drm/xe/xe_survivability_mode.c | 2 +- 5 files changed, 76 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/regs/xe_gsc_regs.h b/drivers/gpu/drm/xe/regs/xe_gsc_regs.h index 9b66cc972a63..180be82672ab 100644 --- a/drivers/gpu/drm/xe/regs/xe_gsc_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_gsc_regs.h @@ -13,6 +13,8 @@ /* Definitions of GSC H/W registers, bits, etc */ +#define BMG_GSC_HECI1_BASE 0x373000 + #define MTL_GSC_HECI1_BASE 0x00116000 #define MTL_GSC_HECI2_BASE 0x00117000 diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h index ed9b81fb28a0..c146b9ef44eb 100644 --- a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h @@ -6,10 +6,15 @@ #ifndef _XE_HW_ERROR_REGS_H_ #define _XE_HW_ERROR_REGS_H_ +#define HEC_UNCORR_ERR_STATUS(base) XE_REG((base) + 0x118) +#define UNCORR_FW_REPORTED_ERR BIT(6) + +#define HEC_UNCORR_FW_ERR_DW0(base) XE_REG((base) + 0x124) + #define DEV_ERR_STAT_NONFATAL 0x100178 #define DEV_ERR_STAT_CORRECTABLE 0x10017c #define DEV_ERR_STAT_REG(x) XE_REG(_PICK_EVEN((x), \ DEV_ERR_STAT_CORRECTABLE, \ DEV_ERR_STAT_NONFATAL)) - +#define XE_CSC_ERROR BIT(17) #endif diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 94e25ff1db00..c21fa9a0ad5f 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -244,6 +244,9 @@ struct xe_tile { /** @memirq: Memory Based Interrupts. */ struct xe_memirq memirq; + /** @csc_hw_error_work: worker to report CSC HW errors */ + struct work_struct csc_hw_error_work; + /** @pcode: tile's PCODE */ struct { /** @pcode.lock: protecting tile's PCODE mailbox data */ diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c index 0b09f9950cb9..c5b3e8c207fa 100644 --- a/drivers/gpu/drm/xe/xe_hw_error.c +++ b/drivers/gpu/drm/xe/xe_hw_error.c @@ -3,12 +3,16 @@ * Copyright © 2025 Intel Corporation */ +#include "regs/xe_gsc_regs.h" #include "regs/xe_hw_error_regs.h" #include "regs/xe_irq_regs.h" #include "xe_device.h" #include "xe_hw_error.h" #include "xe_mmio.h" +#include "xe_survivability_mode.h" + +#define HEC_UNCORR_FW_ERR_BITS 4 /* Error categories reported by hardware */ enum hardware_error { @@ -18,6 +22,13 @@ enum hardware_error { HARDWARE_ERROR_MAX, }; +static const char * const hec_uncorrected_fw_errors[] = { + "Fatal", + "CSE Disabled", + "FD Corruption", + "Data Corruption" +}; + static const char *hw_error_to_str(const enum hardware_error hw_err) { switch (hw_err) { @@ -32,6 +43,52 @@ static const char *hw_error_to_str(const enum hardware_error hw_err) } } +static void csc_hw_error_work(struct work_struct *work) +{ + struct xe_tile *tile = container_of(work, typeof(*tile), csc_hw_error_work); + struct xe_device *xe = tile_to_xe(tile); + int ret; + + ret = xe_survivability_mode_runtime_enable(xe); + if (ret) + drm_err(&xe->drm, "Failed to enable runtime survivability mode\n"); +} + +static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err) +{ + const char *hw_err_str = hw_error_to_str(hw_err); + struct xe_device *xe = tile_to_xe(tile); + struct xe_mmio *mmio = &tile->mmio; + u32 base, err_bit, err_src; + unsigned long fw_err; + + if (xe->info.platform != XE_BATTLEMAGE) + return; + + base = BMG_GSC_HECI1_BASE; + lockdep_assert_held(&xe->irq.lock); + err_src = xe_mmio_read32(mmio, HEC_UNCORR_ERR_STATUS(base)); + if (!err_src) { + drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported HEC_ERR_STATUS_%s blank\n", + tile->id, hw_err_str); + return; + } + + if (err_src & UNCORR_FW_REPORTED_ERR) { + fw_err = xe_mmio_read32(mmio, HEC_UNCORR_FW_ERR_DW0(base)); + for_each_set_bit(err_bit, &fw_err, HEC_UNCORR_FW_ERR_BITS) { + drm_err_ratelimited(&xe->drm, HW_ERR + "%s: HEC Uncorrected FW %s error reported, bit[%d] is set\n", + hw_err_str, hec_uncorrected_fw_errors[err_bit], + err_bit); + + schedule_work(&tile->csc_hw_error_work); + } + } + + xe_mmio_write32(mmio, HEC_UNCORR_ERR_STATUS(base), err_src); +} + static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err) { const char *hw_err_str = hw_error_to_str(hw_err); @@ -50,6 +107,9 @@ static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_er drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported %s error 0x%x\n", tile->id, hw_err_str, err_src); + if (err_src & XE_CSC_ERROR) + csc_hw_error_handler(tile, hw_err); + xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src); unlock: @@ -99,8 +159,12 @@ static void process_hw_errors(struct xe_device *xe) */ void xe_hw_error_init(struct xe_device *xe) { + struct xe_tile *tile = xe_device_get_root_tile(xe); + if (IS_SRIOV_VF(xe)) return; + INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work); + process_hw_errors(xe); } diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c index 9f770db116f4..6f0d123505ab 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.c +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c @@ -346,7 +346,7 @@ int xe_survivability_mode_runtime_enable(struct xe_device *xe) xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_VENDOR); xe_device_declare_wedged(xe); - dev_err(&pdev->dev, "Firmware update required, Refer the userspace documentation for more details!\n"); + dev_err(&pdev->dev, "Firmware update required, Please refer to the userspace documentation for more details!\n"); return 0; } -- 2.47.1