xe_hw_error: Add support for Core-Compute errors

Riana Tauro Mon, 09 Feb 2026 21:59:02 -0800



On 2/5/2026 9:00 PM, Raag Jadav wrote:

On Mon, Feb 02, 2026 at 12:14:00PM +0530, Riana Tauro wrote:

PVC supports GT error reporting via vector registers along with
error status register. Add support to report these errors and
update respective counters. In case of a Subslice error reported
by vector register, process the error status register
for applicable bits.

The counter is embedded in the xe drm ras structure and is
exposed to the userspace using the drm_ras generic netlink
interface.

$ sudo ynl --family drm_ras --do query-error-counter  --json


We usually add '\' at the end for wrapping commands so that they're easy
to apply directly (and same for all other patches where applicable).

   '{"node-id":0, "error-id":1}'


Ditto.


Will fix this

{'error-id': 1, 'error-name': 'core-compute', 'error-value': 0}

Co-developed-by: Himal Prasad Ghimiray <[email protected]>
Signed-off-by: Himal Prasad Ghimiray <[email protected]>
Signed-off-by: Riana Tauro <[email protected]>
---
v2: Add ID's and names as uAPI (Rodrigo)
     Add documentation
     Modify commit message

v3: remove 'error' from counters
     use drmm_kcalloc
     add a for_each for severity
     differentiate error classes and severity in UAPI (Raag)
     Use correctable and uncorrectable in uapi (Pratik / Aravind)

v4: modify enums in UAPI
     improve comments
     add bounds check in handler
     add error mask macro (Raag)
     use atomic_t
     add null pointer checks
---
  drivers/gpu/drm/xe/regs/xe_hw_error_regs.h |  62 ++++++-
  drivers/gpu/drm/xe/xe_hw_error.c           | 199 +++++++++++++++++++--
  2 files changed, 241 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h 
b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
index c146b9ef44eb..17982a335941 100644
--- a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
@@ -6,15 +6,59 @@
  #ifndef _XE_HW_ERROR_REGS_H_
  #define _XE_HW_ERROR_REGS_H_

-#define HEC_UNCORR_ERR_STATUS(base) XE_REG((base) + 0x118)

-#define    UNCORR_FW_REPORTED_ERR                      BIT(6)
+#define HEC_UNCORR_ERR_STATUS(base)                    XE_REG((base) + 0x118)
+#define   UNCORR_FW_REPORTED_ERR                       REG_BIT(6)

-#define HEC_UNCORR_FW_ERR_DW0(base) XE_REG((base) + 0x124)

+#define HEC_UNCORR_FW_ERR_DW0(base)                    XE_REG((base) + 0x124)
+
+#define ERR_STAT_GT_COR                                        0x100160
+#define   EU_GRF_COR_ERR                               REG_BIT(15)
+#define   EU_IC_COR_ERR                                        REG_BIT(14)
+#define   SLM_COR_ERR                                  REG_BIT(13)
+#define   GUC_COR_ERR                                  REG_BIT(1)
+
+#define ERR_STAT_GT_NONFATAL                           0x100164
+#define ERR_STAT_GT_FATAL                              0x100168
+#define   EU_GRF_FAT_ERR                               REG_BIT(15)
+#define   SLM_FAT_ERR                                  REG_BIT(13)
+#define   GUC_FAT_ERR                                  REG_BIT(6)
+#define   FPU_FAT_ERR                                  REG_BIT(3)
+
+#define ERR_STAT_GT_REG(x)                             XE_REG(_PICK_EVEN((x), \
+                                                                         
ERR_STAT_GT_COR, \
+                                                                         
ERR_STAT_GT_NONFATAL))
+
+#define PVC_COR_ERR_MASK                               (GUC_COR_ERR | 
SLM_COR_ERR | \
+                                                        EU_IC_COR_ERR | 
EU_GRF_COR_ERR)
+
+#define PVC_FAT_ERR_MASK                               (FPU_FAT_ERR | 
GUC_FAT_ERR | \
+                                                       EU_GRF_FAT_ERR | 
SLM_FAT_ERR)


Nit: Whitespace please!


alignment?

+#define DEV_ERR_STAT_NONFATAL                          0x100178
+#define DEV_ERR_STAT_CORRECTABLE                       0x10017c
+#define DEV_ERR_STAT_REG(x)                            XE_REG(_PICK_EVEN((x), \
+                                                                         
DEV_ERR_STAT_CORRECTABLE, \
+                                                                         
DEV_ERR_STAT_NONFATAL))


I know it was already like this but how does this evaluate for FATAL?


#define _PICK_EVEN(__index, __a, __b) ((__a) + (__index) * ((__b) - (__a)))
(index, 0x10017c, 0x100178)  = (0x10017c + index * (0x100178 - 0x10017c));
0 =  0x10017c
1 =  0x100178
2 =  0x100174

+#define   XE_CSC_ERROR                                 17
+#define   XE_GT_ERROR                                  0
+
+#define ERR_STAT_GT_FATAL_VECTOR_0                     0x100260
+#define ERR_STAT_GT_FATAL_VECTOR_1                     0x100264
+
+#define ERR_STAT_GT_FATAL_VECTOR_REG(x)                        
XE_REG(_PICK_EVEN((x), \
+                                                                 
ERR_STAT_GT_FATAL_VECTOR_0, \
+                                                                 
ERR_STAT_GT_FATAL_VECTOR_1))
+
+#define ERR_STAT_GT_COR_VECTOR_0                       0x1002a0
+#define ERR_STAT_GT_COR_VECTOR_1                       0x1002a4
+
+#define ERR_STAT_GT_COR_VECTOR_REG(x)                  XE_REG(_PICK_EVEN((x), \
+                                                                         
ERR_STAT_GT_COR_VECTOR_0, \
+                                                                         
ERR_STAT_GT_COR_VECTOR_1))
+
+#define ERR_STAT_GT_VECTOR_REG(hw_err, x)              (hw_err == 
HARDWARE_ERROR_CORRECTABLE ? \
+                                                       
ERR_STAT_GT_COR_VECTOR_REG(x) : \
+                                                       
ERR_STAT_GT_FATAL_VECTOR_REG(x))


Ditto for whitespace.

-#define DEV_ERR_STAT_NONFATAL                  0x100178
-#define DEV_ERR_STAT_CORRECTABLE               0x10017c
-#define DEV_ERR_STAT_REG(x)                    XE_REG(_PICK_EVEN((x), \
-                                                                 
DEV_ERR_STAT_CORRECTABLE, \
-                                                                 
DEV_ERR_STAT_NONFATAL))
-#define   XE_CSC_ERROR                         BIT(17)
  #endif
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 2019aaaa1ebe..ff31fb322c8a 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -3,6 +3,7 @@
   * Copyright © 2025 Intel Corporation
   */

+#include <linux/bitmap.h>

  #include <linux/fault-inject.h>

#include "regs/xe_gsc_regs.h"

@@ -15,7 +16,13 @@
  #include "xe_mmio.h"
  #include "xe_survivability_mode.h"

-#define HEC_UNCORR_FW_ERR_BITS 4

+#define  GT_HW_ERROR_MAX_ERR_BITS      16
+#define  HEC_UNCORR_FW_ERR_BITS                4
+#define  XE_RAS_REG_SIZE               32
+
+#define  PVC_ERROR_MASK_SET(hw_err, err_bit) \
+       ((hw_err == HARDWARE_ERROR_CORRECTABLE) ? (BIT(err_bit) & 
PVC_COR_ERR_MASK) : \
+       (BIT(err_bit) & PVC_FAT_ERR_MASK))


I'd write this as below and move it to xe_hw_error_regs.h

This is not specific to register selection or defining bits. It's related to the mask, so the .c file should be the right place.


#define PVC_COR_ERR_MASK_SET(err_bit)                   (PVC_COR_ERR_MASK & 
REG_BIT(err_bit))
#define PVC_FAT_ERR_MASK_SET(err_bit)                   (PVC_FAT_ERR_MASK & 
REG_BIT(err_bit))

#define PVC_ERR_MASK_SET(hw_err, err_bit)               ((hw_err == 
HARDWARE_ERROR_CORRECTABLE) ? \
                                                                
PVC_COR_ERR_MASK_SET(err_bit) : \
                                                                
PVC_FAT_ERR_MASK_SET(err_bit)

...

+static void gt_hw_error_handler(struct xe_tile *tile, const enum 
hardware_error hw_err,
+                               u32 error_id)
+{
+       const enum drm_xe_ras_error_severity severity = 
hw_err_to_severity(hw_err);
+       struct xe_device *xe = tile_to_xe(tile);
+       struct xe_drm_ras *ras = &xe->ras;
+       struct xe_drm_ras_counter *info = ras->info[severity];
+       struct xe_mmio *mmio = &tile->mmio;
+       unsigned long err_stat = 0;
+       int i, len;
+
+       if (xe->info.platform != XE_PVC)
+               return;
+
+       if (!info)
+               return;


Since info allocation is not related to hardware, we shouldn't even be
at this point without it. So let's not hide bugs and fail probe instead.

Yes, currently it is supported only on PVC. I can remove this check here as there is a PVC check, but I cannot remove the one suggested below.


Thanks
Riana

+       if (hw_err == HARDWARE_ERROR_NONFATAL) {
+               atomic_inc(&info[error_id].counter);
+               log_hw_error(tile, info[error_id].name, severity);
+               return;
+       }

...

  static void hw_error_source_handler(struct xe_tile *tile, const enum 
hardware_error hw_err)
  {
        const enum drm_xe_ras_error_severity severity = 
hw_err_to_severity(hw_err);
        const char *severity_str = error_severity[severity];
        struct xe_device *xe = tile_to_xe(tile);
-       unsigned long flags;
-       u32 err_src;
+       struct xe_drm_ras *ras = &xe->ras;
+       struct xe_drm_ras_counter *info = ras->info[severity];
+       unsigned long flags, err_src;
+       u32 err_bit;

- if (xe->info.platform != XE_BATTLEMAGE)

+       if (!IS_DGFX(xe))
                return;

spin_lock_irqsave(&xe->irq.lock, flags);

@@ -108,11 +242,53 @@ static void hw_error_source_handler(struct xe_tile *tile, 
const enum hardware_er
                goto unlock;
        }

- if (err_src & XE_CSC_ERROR)

+       /*
+        * On encountering CSC firmware errors, the graphics device becomes 
unrecoverable
+        * so return immediately on error. The only way to recover from these 
errors is
+        * firmware flash. The device will enter Runtime Survivability mode 
when such
+        * errors are detected.
+        */
+       if (err_src & XE_CSC_ERROR) {
                csc_hw_error_handler(tile, hw_err);
+               goto clear_reg;
+       }

- xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src);

+       if (!info)
+               goto clear_reg;


Same as above.

Raag

Re: [PATCH v5 4/5] drm/xe/xe_hw_error: Add support for Core-Compute errors

Reply via email to