From: Jesse Zhang <jesse.zh...@amd.com>

A bad opcode in the command stream can cause a CP/ME hang.
The firmware prevents the ME side from hanging by raising a bad opcode
interrupt, and the driver needs to perform a VMID reset when it receives
that interrupt.

v2: update irq naming (drop priv) (Alex)

Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 74 ++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index c74c8a60a23a..63b073fd4dc7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -1349,6 +1349,13 @@ static int gfx_v12_0_sw_init(void *handle)
        if (r)
                return r;
 
+       /* Bad opcode Event */
+       r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GRBM_CP,
+                             GFX_11_0_0__SRCID__CP_BAD_OPCODE_ERROR,
+                             &adev->gfx.bad_op_irq);
+       if (r)
+               return r;
+
        /* Privileged reg */
        r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GRBM_CP,
                              GFX_11_0_0__SRCID__CP_PRIV_REG_FAULT,
@@ -3592,6 +3599,7 @@ static int gfx_v12_0_hw_fini(void *handle)
 
        amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
        amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
+       amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0);
 
        if (!adev->no_hw_access) {
                if (amdgpu_async_gfx_ring) {
@@ -3712,6 +3720,10 @@ static int gfx_v12_0_late_init(void *handle)
        if (r)
                return r;
 
+       r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0);
+       if (r)
+               return r;
+
        return 0;
 }
 
@@ -4831,6 +4843,51 @@ static int gfx_v12_0_set_priv_reg_fault_state(struct 
amdgpu_device *adev,
        return 0;
 }
 
+/*
+ * Enable or disable the CP bad-opcode (OPCODE_ERROR) interrupt.
+ *
+ * For AMDGPU_IRQ_STATE_ENABLE / AMDGPU_IRQ_STATE_DISABLE, sets or clears
+ * OPCODE_ERROR_INT_ENABLE in the CP interrupt control register of every
+ * ME pipe and every MEC pipe for which the lookup helper returns a
+ * non-zero register.  Any other state is ignored.  @source and @type are
+ * not used.
+ *
+ * Always returns 0.
+ */
+static int gfx_v12_0_set_bad_op_fault_state(struct amdgpu_device *adev,
+                                           struct amdgpu_irq_src *source,
+                                           unsigned int type,
+                                           enum amdgpu_interrupt_state state)
+{
+       u32 cp_int_cntl_reg, cp_int_cntl;
+       /* value written to the enable bit; only used for ENABLE/DISABLE */
+       u32 enable = (state == AMDGPU_IRQ_STATE_ENABLE) ? 1 : 0;
+       int i, j;
+
+       switch (state) {
+       case AMDGPU_IRQ_STATE_DISABLE:
+       case AMDGPU_IRQ_STATE_ENABLE:
+               for (i = 0; i < adev->gfx.me.num_me; i++) {
+                       for (j = 0; j < adev->gfx.me.num_pipe_per_me; j++) {
+                               cp_int_cntl_reg = gfx_v12_0_get_cpg_int_cntl(adev, i, j);
+
+                               /* skip pipes for which no register was returned (0) */
+                               if (cp_int_cntl_reg) {
+                                       cp_int_cntl = RREG32_SOC15_IP(GC, cp_int_cntl_reg);
+                                       cp_int_cntl = REG_SET_FIELD(cp_int_cntl, CP_INT_CNTL_RING0,
+                                                                   OPCODE_ERROR_INT_ENABLE,
+                                                                   enable);
+                                       WREG32_SOC15_IP(GC, cp_int_cntl_reg, cp_int_cntl);
+                               }
+                       }
+               }
+               for (i = 0; i < adev->gfx.mec.num_mec; i++) {
+                       for (j = 0; j < adev->gfx.mec.num_pipe_per_mec; j++) {
+                               /* MECs start at 1 */
+                               cp_int_cntl_reg = gfx_v12_0_get_cpc_int_cntl(adev, i + 1, j);
+
+                               /* skip pipes for which no register was returned (0) */
+                               if (cp_int_cntl_reg) {
+                                       cp_int_cntl = RREG32_SOC15_IP(GC, cp_int_cntl_reg);
+                                       cp_int_cntl = REG_SET_FIELD(cp_int_cntl, CP_ME1_PIPE0_INT_CNTL,
+                                                                   OPCODE_ERROR_INT_ENABLE,
+                                                                   enable);
+                                       WREG32_SOC15_IP(GC, cp_int_cntl_reg, cp_int_cntl);
+                               }
+                       }
+               }
+               break;
+       default:
+               break;
+       }
+       return 0;
+}
+
 static int gfx_v12_0_set_priv_inst_fault_state(struct amdgpu_device *adev,
                                               struct amdgpu_irq_src *source,
                                               unsigned int type,
@@ -4907,6 +4964,15 @@ static int gfx_v12_0_priv_reg_irq(struct amdgpu_device 
*adev,
        return 0;
 }
 
+/*
+ * Process callback for the CP bad-opcode interrupt.
+ *
+ * Logs the error and hands the IV entry to gfx_v12_0_handle_priv_fault().
+ * @source is not used.  Always returns 0.
+ */
+static int gfx_v12_0_bad_op_irq(struct amdgpu_device *adev,
+                               struct amdgpu_irq_src *source,
+                               struct amdgpu_iv_entry *entry)
+{
+       DRM_ERROR("Illegal opcode in command stream\n");
+       gfx_v12_0_handle_priv_fault(adev, entry);
+       return 0;
+}
+
 static int gfx_v12_0_priv_inst_irq(struct amdgpu_device *adev,
                                   struct amdgpu_irq_src *source,
                                   struct amdgpu_iv_entry *entry)
@@ -5219,6 +5285,11 @@ static const struct amdgpu_irq_src_funcs 
gfx_v12_0_priv_reg_irq_funcs = {
        .process = gfx_v12_0_priv_reg_irq,
 };
 
+static const struct amdgpu_irq_src_funcs gfx_v12_0_bad_op_irq_funcs = { /* bad-opcode IRQ hooks */
+       .set = gfx_v12_0_set_bad_op_fault_state, /* toggles OPCODE_ERROR_INT_ENABLE */
+       .process = gfx_v12_0_bad_op_irq, /* logs, then calls the priv-fault handler */
+};
+
 static const struct amdgpu_irq_src_funcs gfx_v12_0_priv_inst_irq_funcs = {
        .set = gfx_v12_0_set_priv_inst_fault_state,
        .process = gfx_v12_0_priv_inst_irq,
@@ -5232,6 +5303,9 @@ static void gfx_v12_0_set_irq_funcs(struct amdgpu_device 
*adev)
        adev->gfx.priv_reg_irq.num_types = 1;
        adev->gfx.priv_reg_irq.funcs = &gfx_v12_0_priv_reg_irq_funcs;
 
+       adev->gfx.bad_op_irq.num_types = 1;
+       adev->gfx.bad_op_irq.funcs = &gfx_v12_0_bad_op_irq_funcs;
+
        adev->gfx.priv_inst_irq.num_types = 1;
        adev->gfx.priv_inst_irq.funcs = &gfx_v12_0_priv_inst_irq_funcs;
 }
-- 
2.45.2

Reply via email to