This handler is meant to be called when a detected critical error affects device-level state that persists across warm resets. The cold reset recovery method signals to userspace that only a complete device power cycle can restore normal operation.
Signed-off-by: Mallesh Koujalagi <[email protected]> --- drivers/gpu/drm/xe/xe_hw_error.c | 28 ++++++++++++++++++++++++++++ drivers/gpu/drm/xe/xe_hw_error.h | 1 + 2 files changed, 29 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c index 8c65291f36fc..d7825d6b9703 100644 --- a/drivers/gpu/drm/xe/xe_hw_error.c +++ b/drivers/gpu/drm/xe/xe_hw_error.c @@ -5,6 +5,8 @@ #include <linux/fault-inject.h> +#include <drm/drm_drv.h> + #include "regs/xe_gsc_regs.h" #include "regs/xe_hw_error_regs.h" #include "regs/xe_irq_regs.h" @@ -162,6 +164,32 @@ static void process_hw_errors(struct xe_device *xe) } } +/** + * xe_critical_error_handler - Handle critical errors requiring a cold reset + * @xe: xe device instance + * + * Handles critical errors that affect the device and cannot + * be recovered through driver reload, PCIe reset, etc. + * + * Sets the wedged method to DRM_WEDGE_RECOVERY_COLD_RESET. An already wedged + * device gets the wedged event sent directly; otherwise it is declared wedged. 
+ */ +void xe_critical_error_handler(struct xe_device *xe) +{ + drm_err(&xe->drm, "CRITICAL: error detected\n"); + drm_err(&xe->drm, "Recovery: Device cold reset required\n"); + + /* Set cold reset recovery method */ + xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_COLD_RESET); + + if (xe_device_wedged(xe)) { + drm_dev_wedged_event(&xe->drm, xe->wedged.method, NULL); + } else { + /* Declare device wedged - will trigger uevent with cold reset method */ + xe_device_declare_wedged(xe); + } +} + /** * xe_hw_error_init - Initialize hw errors * @xe: xe device instance diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h index d86e28c5180c..f824e22655a0 100644 --- a/drivers/gpu/drm/xe/xe_hw_error.h +++ b/drivers/gpu/drm/xe/xe_hw_error.h @@ -11,5 +11,6 @@ struct xe_tile; struct xe_device; void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl); +void xe_critical_error_handler(struct xe_device *xe); void xe_hw_error_init(struct xe_device *xe); #endif -- 2.34.1
