[AMD Official Use Only - General]

amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
                if (r)
                        return r;

-               r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
-               if (r)
-                       goto late_fini;
+               if (!(adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))){
+                       r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 
0);
+                       if (r)
+                               goto late_fini;
+               }

GFX11 doesn't support SRAM RAS, so the GFX_RAS mask should *not* be set. As a 
result, amdgpu_gfx_ras_late_init should *not* take the code path above. The 
expectation is that amdgpu_gfx_ras_late_init only issues an enable command to 
program GB_EDC_MODE.

Can you please further check whether the GFX_RAS mask is set in your case? If 
so, it could be an IFWI issue.


Regards,
Hawking

-----Original Message-----
From: Horatio Zhang <hongkun.zh...@amd.com>
Sent: Wednesday, April 26, 2023 16:41
To: Zhang, Hawking <hawking.zh...@amd.com>; Koenig, Christian 
<christian.koe...@amd.com>; Chen, Guchun <guchun.c...@amd.com>; 
amd-gfx@lists.freedesktop.org
Cc: Xu, Feifei <feifei...@amd.com>; Yao, Longlong <longlong....@amd.com>; 
Zhang, Horatio <hongkun.zh...@amd.com>; Zhang, Hawking <hawking.zh...@amd.com>; 
Chen, Guchun <guchun.c...@amd.com>
Subject: [PATCH v3] drm/amdgpu: drop gfx_v11_0_cp_ecc_error_irq_funcs

The gfx.cp_ecc_error_irq is retired in gfx11. However, gfx_v11_0_hw_fini still 
uses amdgpu_irq_put to disable this interrupt, which causes the call trace 
below in this function.

[  102.873958] Call Trace:
[  102.873959]  <TASK>
[  102.873961]  gfx_v11_0_hw_fini+0x23/0x1e0 [amdgpu]
[  102.874019]  gfx_v11_0_suspend+0xe/0x20 [amdgpu]
[  102.874072]  amdgpu_device_ip_suspend_phase2+0x240/0x460 [amdgpu]
[  102.874122]  amdgpu_device_ip_suspend+0x3d/0x80 [amdgpu]
[  102.874172]  amdgpu_device_pre_asic_reset+0xd9/0x490 [amdgpu]
[  102.874223]  amdgpu_device_gpu_recover.cold+0x548/0xce6 [amdgpu]
[  102.874321]  amdgpu_debugfs_reset_work+0x4c/0x70 [amdgpu]
[  102.874375]  process_one_work+0x21f/0x3f0
[  102.874377]  worker_thread+0x200/0x3e0
[  102.874378]  ? process_one_work+0x3f0/0x3f0
[  102.874379]  kthread+0xfd/0x130
[  102.874380]  ? kthread_complete_and_exit+0x20/0x20
[  102.874381]  ret_from_fork+0x22/0x30

v2:
- Handle umc and gfx ras cases in separate patches
- Retired the gfx_v11_0_cp_ecc_error_irq_funcs in gfx11

v3:
- Improve the subject and code comments
- Add a gfx11 check in amdgpu_gfx_ras_late_init

Signed-off-by: Horatio Zhang <hongkun.zh...@amd.com>
Reviewed-by: Hawking Zhang <hawking.zh...@amd.com>
Acked-by: Christian König <christian.koe...@amd.com>
Reviewed-by: Guchun Chen <guchun.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  8 ++++--
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  | 38 -------------------------
 2 files changed, 5 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 60bb4bba1994..5e69eec4b754 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -719,9 +719,11 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, 
struct ras_common_if *r
                if (r)
                        return r;

-               r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
-               if (r)
-                       goto late_fini;
+               if (!(adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))){
+                       r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 
0);
+                       if (r)
+                               goto late_fini;
+               }
        } else {
                amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
        }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 8a4c4769e607..e9491aec3cae 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1355,13 +1355,6 @@ static int gfx_v11_0_sw_init(void *handle)
        if (r)
                return r;

-       /* ECC error */
-       r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GRBM_CP,
-                                 GFX_11_0_0__SRCID__CP_ECC_ERROR,
-                                 &adev->gfx.cp_ecc_error_irq);
-       if (r)
-               return r;
-
        /* FED error */
        r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GFX,
                                  GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT,
@@ -4483,7 +4476,6 @@ static int gfx_v11_0_hw_fini(void *handle)
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
        int r;

-       amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
        amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
        amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);

@@ -5970,28 +5962,6 @@ static void 
gfx_v11_0_set_compute_eop_interrupt_state(struct amdgpu_device *adev
                WREG32_SOC15_IP(GC, reg_addr, tmp); \
        } while (0)

-static int gfx_v11_0_set_cp_ecc_error_state(struct amdgpu_device *adev,
-                                                       struct amdgpu_irq_src 
*source,
-                                                       unsigned type,
-                                                       enum 
amdgpu_interrupt_state state)
-{
-       uint32_t ecc_irq_state = 0;
-       uint32_t pipe0_int_cntl_addr = 0;
-       int i = 0;
-
-       ecc_irq_state = (state == AMDGPU_IRQ_STATE_ENABLE) ? 1 : 0;
-
-       pipe0_int_cntl_addr = SOC15_REG_OFFSET(GC, 0, regCP_ME1_PIPE0_INT_CNTL);
-
-       WREG32_FIELD15_PREREG(GC, 0, CP_INT_CNTL_RING0, 
CP_ECC_ERROR_INT_ENABLE, ecc_irq_state);
-
-       for (i = 0; i < adev->gfx.mec.num_pipe_per_mec; i++)
-               SET_ECC_ME_PIPE_STATE(pipe0_int_cntl_addr + i * 
CP_ME1_PIPE_INST_ADDR_INTERVAL,
-                                       ecc_irq_state);
-
-       return 0;
-}
-
 static int gfx_v11_0_set_eop_interrupt_state(struct amdgpu_device *adev,
                                            struct amdgpu_irq_src *src,
                                            unsigned type,
@@ -6408,11 +6378,6 @@ static const struct amdgpu_irq_src_funcs 
gfx_v11_0_priv_inst_irq_funcs = {
        .process = gfx_v11_0_priv_inst_irq,
 };

-static const struct amdgpu_irq_src_funcs gfx_v11_0_cp_ecc_error_irq_funcs = {
-       .set = gfx_v11_0_set_cp_ecc_error_state,
-       .process = amdgpu_gfx_cp_ecc_error_irq,
-};
-
 static const struct amdgpu_irq_src_funcs gfx_v11_0_rlc_gc_fed_irq_funcs = {
        .process = gfx_v11_0_rlc_gc_fed_irq,
 };
@@ -6428,9 +6393,6 @@ static void gfx_v11_0_set_irq_funcs(struct amdgpu_device 
*adev)
        adev->gfx.priv_inst_irq.num_types = 1;
        adev->gfx.priv_inst_irq.funcs = &gfx_v11_0_priv_inst_irq_funcs;

-       adev->gfx.cp_ecc_error_irq.num_types = 1; /* CP ECC error */
-       adev->gfx.cp_ecc_error_irq.funcs = &gfx_v11_0_cp_ecc_error_irq_funcs;
-
        adev->gfx.rlc_gc_fed_irq.num_types = 1; /* 0x80 FED error */
        adev->gfx.rlc_gc_fed_irq.funcs = &gfx_v11_0_rlc_gc_fed_irq_funcs;

--
2.34.1

Reply via email to