Perform gpu reset after gfx finishes processing
ras poison consumption on gfx_v11_0_3.

V2:
 Move gfx poison consumption handler from hw_ops to ip
 function level.

V3:
 Adjust the calling position of amdgpu_gfx_poison_consumption_handler.

V4:
 Since gfx v11_0_3 does not have a .hw_ops instance, the .hw_ops null
 pointer check in amdgpu_ras_interrupt_poison_consumption_handler
 needs to be adjusted.

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
Reviewed-by: Hawking Zhang <hawking.zh...@amd.com>
Reviewed-by: Tao Zhou <tao.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  |  9 +++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  8 +++++---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 13 +++++++++++++
 4 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 09c42c00e43c..caf7fd3adcbd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -731,6 +731,15 @@ int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
        return 0;
 }
 
+int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
+                                               struct amdgpu_iv_entry *entry)
+{
+       if (adev->gfx.ras && adev->gfx.ras->poison_consumption_handler)
+               return adev->gfx.ras->poison_consumption_handler(adev, entry);
+
+       return 0;
+}
+
 int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
                void *err_data,
                struct amdgpu_iv_entry *entry)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 0b39fe3cd624..86ec9d0d12c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -213,6 +213,8 @@ struct amdgpu_gfx_ras {
        int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
                                struct amdgpu_irq_src *source,
                                struct amdgpu_iv_entry *entry);
+       int (*poison_consumption_handler)(struct amdgpu_device *adev,
+                                               struct amdgpu_iv_entry *entry);
 };
 
 struct amdgpu_gfx_funcs {
@@ -437,4 +439,6 @@ int amdgpu_gfx_get_num_kcq(struct amdgpu_device *adev);
 void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev, uint32_t 
ucode_id);
 
 int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev);
+int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
+                                               struct amdgpu_iv_entry *entry);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d06beb884a16..0a95d1c1e7ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1620,14 +1620,14 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
        struct amdgpu_ras_block_object *block_obj =
                amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
 
-       if (!block_obj || !block_obj->hw_ops)
+       if (!block_obj)
                return;
 
        /* both query_poison_status and handle_poison_consumption are optional,
         * but at least one of them should be implemented if we need poison
         * consumption handler
         */
-       if (block_obj->hw_ops->query_poison_status) {
+       if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) {
                poison_stat = block_obj->hw_ops->query_poison_status(adev);
                if (!poison_stat) {
                        /* Not poison consumption interrupt, no need to handle 
it */
@@ -1641,7 +1641,7 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
        if (!adev->gmc.xgmi.connected_to_cpu)
                amdgpu_umc_poison_handler(adev, false);
 
-       if (block_obj->hw_ops->handle_poison_consumption)
+       if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
                poison_stat = 
block_obj->hw_ops->handle_poison_consumption(adev);
 
        /* gpu reset is fallback for failed and default cases */
@@ -1649,6 +1649,8 @@ static void 
amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
                dev_info(adev->dev, "GPU reset for %s RAS poison consumption is 
issued!\n",
                                block_obj->ras_comm.name);
                amdgpu_ras_reset_gpu(adev);
+       } else {
+               amdgpu_gfx_poison_consumption_handler(adev, entry);
        }
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index a18e09de31dd..b07a72ca25d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -70,6 +70,19 @@ static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device 
*adev,
        return 0;
 }
 
+static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
+                                       struct amdgpu_iv_entry *entry)
+{
+       /* Workaround: when vmid and pasid are both zero, trigger gpu reset in 
KGD. */
+       if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) &&
+           (entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) &&
+            !entry->vmid && !entry->pasid)
+               amdgpu_ras_reset_gpu(adev);
+
+       return 0;
+}
+
 struct amdgpu_gfx_ras gfx_v11_0_3_ras = {
        .rlc_gc_fed_irq = gfx_v11_0_3_rlc_gc_fed_irq,
+       .poison_consumption_handler = gfx_v11_0_3_poison_consumption_handler,
 };
-- 
2.25.1

Reply via email to