When PUNIT (power management unit) errors that persist across warm resets are detected, mark the device as wedged with DRM_WEDGE_RECOVERY_COLD_RESET and notify userspace that a complete device cold reset is required to restore normal operation.
v3: - Use PUNIT instead of PMU. (Riana) - Use consistent wording. - Remove log. (Raag) Signed-off-by: Mallesh Koujalagi <[email protected]> --- drivers/gpu/drm/xe/xe_ras.c | 21 ++++++++++++++++++++- drivers/gpu/drm/xe/xe_ras.h | 1 + 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c index 437811845c01..e2e1ab3fb4ce 100644 --- a/drivers/gpu/drm/xe/xe_ras.c +++ b/drivers/gpu/drm/xe/xe_ras.c @@ -5,6 +5,7 @@ #include "xe_assert.h" #include "xe_device_types.h" +#include "xe_device.h" #include "xe_printk.h" #include "xe_ras.h" #include "xe_ras_types.h" @@ -93,6 +94,24 @@ static enum xe_ras_recovery_action handle_compute_errors(struct xe_device *xe, return XE_RAS_RECOVERY_ACTION_RECOVERED; } +/** + * xe_punit_error_handler - Handler for Punit errors requiring cold reset + * @xe: device instance + * + * Handles Punit errors that affect the device and cannot be recovered + * through driver reload, PCIe reset, etc. + * + * Marks the device as wedged with DRM_WEDGE_RECOVERY_COLD_RESET method + * and notifies userspace that a device cold reset is required. 
+ */ +void xe_punit_error_handler(struct xe_device *xe) +{ + xe_err(xe, "Recovery: Device cold reset required\n"); + + xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_COLD_RESET); + xe_device_declare_wedged(xe); +} + static enum xe_ras_recovery_action handle_soc_internal_errors(struct xe_device *xe, struct xe_ras_error_array *arr) { @@ -132,7 +151,7 @@ static enum xe_ras_recovery_action handle_soc_internal_errors(struct xe_device * xe_err(xe, "[RAS]: PUNIT %s error detected: 0x%x\n", severity_to_str(xe, common.severity), ieh_error->global_error_status); - /** TODO: Add PUNIT error handling */ + xe_punit_error_handler(xe); return XE_RAS_RECOVERY_ACTION_DISCONNECT; } } diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h index e191ab80080c..ab1fde200625 100644 --- a/drivers/gpu/drm/xe/xe_ras.h +++ b/drivers/gpu/drm/xe/xe_ras.h @@ -11,6 +11,7 @@ struct xe_device; void xe_ras_init(struct xe_device *xe); +void xe_punit_error_handler(struct xe_device *xe); enum xe_ras_recovery_action xe_ras_process_errors(struct xe_device *xe); #endif -- 2.34.1
