RE: [PATCH] drm/amdgpu: perform mode2 reset for sdma fed error on gfx v11_0_3

2023-05-17 Thread Zhang, Hawking
[AMD Official Use Only - General]

Thanks for the clarification, Thomas. The patch is

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-----Original Message-----
From: Chai, Thomas  
Sent: Wednesday, May 17, 2023 14:31
To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Li, Candice ; Yang, 
Stanley 
Subject: RE: [PATCH] drm/amdgpu: perform mode2 reset for sdma fed error on gfx 
v11_0_3

[AMD Official Use Only - General]

reset_context is a local variable in amdgpu_ras_do_recovery. If gpu_reset_flag 
is not used, the code that reads the regRLC_RLCS_FED_STATUS_0 register and 
checks the sdma fed error field may have to move into amdgpu_ras_do_recovery, 
which may corrupt the code structure of amdgpu_ras.c.

amdgpu_ras_do_recovery supports various mode resets, but the order of these 
resets is fixed and the driver cannot specify a reset type.

gpu_reset_flag is like the input parameter of amdgpu_ras_do_recovery, which 
allows the driver to specify a special reset type.

-
Best Regards,
Thomas

-----Original Message-----
From: Zhang, Hawking 
Sent: Wednesday, May 17, 2023 11:41 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Li, Candice ; Yang, 
Stanley 
Subject: RE: [PATCH] drm/amdgpu: perform mode2 reset for sdma fed error on gfx 
v11_0_3

[AMD Official Use Only - General]

Shall we just force the mode-2 reset if it is in non-fatal error mode? Is the 
gpu_reset_flag really necessary in such a case?

reset_context.method = AMD_RESET_METHOD_MODE2;

Ideally, the driver decides whether to perform a reset or take another error 
handling approach (i.e. unmap queue for gfx) in the IP specific handler, while 
keeping amdgpu_ras_do_recovery as the unified entry for the various driver mode 
resets used for ras error handling. Is it feasible?

Regards,
Hawking

-----Original Message-----
From: Chai, Thomas 
Sent: Wednesday, May 17, 2023 10:14
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Yang, Stanley 
; Chai, Thomas 
Subject: [PATCH] drm/amdgpu: perform mode2 reset for sdma fed error on gfx 
v11_0_3

perform mode2 reset for sdma fed error on gfx v11_0_3.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  8 +++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |  5 +++++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 14 +++++++++++++-
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6bb438642cc0..f2da69adcd9d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2053,9 +2053,15 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
/* Perform full reset in fatal error mode */
if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-   else
+   else {
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

+   if (ras->gpu_reset_flags & 
AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
+   ras->gpu_reset_flags &= 
~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   reset_context.method = AMD_RESET_METHOD_MODE2;
+   }
+   }
+
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
}
atomic_set(&ras->in_recovery, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index bc43f7db17cc..46bf1889a9d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -339,6 +339,8 @@ enum amdgpu_ras_ret {
 #define AMDGPU_RAS_ERR_STATUS_VALID    (1 << 1)
 #define AMDGPU_RAS_ERR_ADDRESS_VALID   (1 << 2)

+#define AMDGPU_RAS_GPU_RESET_MODE2_RESET  (0x1 << 0)
+
 struct amdgpu_ras_err_status_reg_entry {
uint32_t hwip;
uint32_t ip_inst;
@@ -427,6 +429,9 @@ struct amdgpu_ras {

/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
+
+   /* Record special requirements of gpu reset caller */
+   uint32_t  gpu_reset_flags;
 };

 struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 068b9586a223..26d6286d86c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -84,8 +84,20 @@ static int gfx_v11_0_3_poison_consumption_handler(struct 
amdgpu_device *adev,
/* Workaround: when vmid and pasid are both zero, trigger gpu reset in 
KGD. */
if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) &&
(entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) &&
-!entry->vmid && !entry->pasid)
+!entry->vmid && !entry->pasid) {
+   uint32

RE: [PATCH] drm/amdgpu: perform mode2 reset for sdma fed error on gfx v11_0_3

2023-05-17 Thread Chai, Thomas
[AMD Official Use Only - General]

reset_context is a local variable in amdgpu_ras_do_recovery. If gpu_reset_flag 
is not used, the code that reads the regRLC_RLCS_FED_STATUS_0 register and 
checks the sdma fed error field may have to move into amdgpu_ras_do_recovery, 
which may corrupt the code structure of amdgpu_ras.c.

amdgpu_ras_do_recovery supports various mode resets, but the order of these 
resets is fixed and the driver cannot specify a reset type.

gpu_reset_flag is like the input parameter of amdgpu_ras_do_recovery, which 
allows the driver to specify a special reset type.

-
Best Regards,
Thomas

-----Original Message-----
From: Zhang, Hawking 
Sent: Wednesday, May 17, 2023 11:41 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Li, Candice ; Yang, 
Stanley 
Subject: RE: [PATCH] drm/amdgpu: perform mode2 reset for sdma fed error on gfx 
v11_0_3

[AMD Official Use Only - General]

Shall we just force the mode-2 reset if it is in non-fatal error mode? Is the 
gpu_reset_flag really necessary in such a case?

reset_context.method = AMD_RESET_METHOD_MODE2;

Ideally, the driver decides whether to perform a reset or take another error 
handling approach (i.e. unmap queue for gfx) in the IP specific handler, while 
keeping amdgpu_ras_do_recovery as the unified entry for the various driver mode 
resets used for ras error handling. Is it feasible?

Regards,
Hawking

-----Original Message-----
From: Chai, Thomas 
Sent: Wednesday, May 17, 2023 10:14
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Yang, Stanley 
; Chai, Thomas 
Subject: [PATCH] drm/amdgpu: perform mode2 reset for sdma fed error on gfx 
v11_0_3

perform mode2 reset for sdma fed error on gfx v11_0_3.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  8 +++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |  5 +++++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 14 +++++++++++++-
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6bb438642cc0..f2da69adcd9d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2053,9 +2053,15 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
/* Perform full reset in fatal error mode */
if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-   else
+   else {
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

+   if (ras->gpu_reset_flags & 
AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
+   ras->gpu_reset_flags &= 
~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   reset_context.method = AMD_RESET_METHOD_MODE2;
+   }
+   }
+
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
}
atomic_set(&ras->in_recovery, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index bc43f7db17cc..46bf1889a9d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -339,6 +339,8 @@ enum amdgpu_ras_ret {
 #define AMDGPU_RAS_ERR_STATUS_VALID    (1 << 1)
 #define AMDGPU_RAS_ERR_ADDRESS_VALID   (1 << 2)

+#define AMDGPU_RAS_GPU_RESET_MODE2_RESET  (0x1 << 0)
+
 struct amdgpu_ras_err_status_reg_entry {
uint32_t hwip;
uint32_t ip_inst;
@@ -427,6 +429,9 @@ struct amdgpu_ras {

/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
+
+   /* Record special requirements of gpu reset caller */
+   uint32_t  gpu_reset_flags;
 };

 struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 068b9586a223..26d6286d86c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -84,8 +84,20 @@ static int gfx_v11_0_3_poison_consumption_handler(struct 
amdgpu_device *adev,
/* Workaround: when vmid and pasid are both zero, trigger gpu reset in 
KGD. */
if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) &&
(entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) &&
-!entry->vmid && !entry->pasid)
+!entry->vmid && !entry->pasid) {
+   uint32_t rlc_status0 = 0;
+
+   rlc_status0 = RREG32_SOC15(GC, 0, regRLC_RLCS_FED_STATUS_0);
+
+   if (REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, 
SDMA0_FED_ERR) ||
+   REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, 
SDMA1_FED_ERR)) {
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   ras->gpu_reset_fla

RE: [PATCH] drm/amdgpu: perform mode2 reset for sdma fed error on gfx v11_0_3

2023-05-16 Thread Zhang, Hawking
[AMD Official Use Only - General]

Shall we just force the mode-2 reset if it is in non-fatal error mode? Is the 
gpu_reset_flag really necessary in such a case?

reset_context.method = AMD_RESET_METHOD_MODE2;

Ideally, the driver decides whether to perform a reset or take another error 
handling approach (i.e. unmap queue for gfx) in the IP specific handler, while 
keeping amdgpu_ras_do_recovery as the unified entry for the various driver mode 
resets used for ras error handling. Is it feasible?

Regards,
Hawking

-----Original Message-----
From: Chai, Thomas  
Sent: Wednesday, May 17, 2023 10:14
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Yang, Stanley 
; Chai, Thomas 
Subject: [PATCH] drm/amdgpu: perform mode2 reset for sdma fed error on gfx 
v11_0_3

perform mode2 reset for sdma fed error on gfx v11_0_3.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  8 +++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |  5 +++++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 14 +++++++++++++-
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6bb438642cc0..f2da69adcd9d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2053,9 +2053,15 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
/* Perform full reset in fatal error mode */
if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-   else
+   else {
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 
+   if (ras->gpu_reset_flags & 
AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
+   ras->gpu_reset_flags &= 
~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   reset_context.method = AMD_RESET_METHOD_MODE2;
+   }
+   }
+
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
}
atomic_set(&ras->in_recovery, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index bc43f7db17cc..46bf1889a9d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -339,6 +339,8 @@ enum amdgpu_ras_ret {
 #define AMDGPU_RAS_ERR_STATUS_VALID    (1 << 1)
 #define AMDGPU_RAS_ERR_ADDRESS_VALID   (1 << 2)
 
+#define AMDGPU_RAS_GPU_RESET_MODE2_RESET  (0x1 << 0)
+
 struct amdgpu_ras_err_status_reg_entry {
uint32_t hwip;
uint32_t ip_inst;
@@ -427,6 +429,9 @@ struct amdgpu_ras {
 
/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
+
+   /* Record special requirements of gpu reset caller */
+   uint32_t  gpu_reset_flags;
 };
 
 struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 068b9586a223..26d6286d86c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -84,8 +84,20 @@ static int gfx_v11_0_3_poison_consumption_handler(struct 
amdgpu_device *adev,
/* Workaround: when vmid and pasid are both zero, trigger gpu reset in 
KGD. */
if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) &&
(entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) &&
-!entry->vmid && !entry->pasid)
+!entry->vmid && !entry->pasid) {
+   uint32_t rlc_status0 = 0;
+
+   rlc_status0 = RREG32_SOC15(GC, 0, regRLC_RLCS_FED_STATUS_0);
+
+   if (REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, 
SDMA0_FED_ERR) ||
+   REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, 
SDMA1_FED_ERR)) {
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   ras->gpu_reset_flags |= 
AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   }
+
amdgpu_ras_reset_gpu(adev);
+   }
 
return 0;
 }
--
2.34.1