RE: [PATCH] drm/amdgpu: gpu recovers from fatal error in poison mode

2023-06-26 Thread Zhang, Hawking
[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Chai, Thomas 
Sent: Monday, June 26, 2023 13:39
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Yang, Stanley 
; Chai, Thomas 
Subject: [PATCH] drm/amdgpu: gpu recovers from fatal error in poison mode

Fatal error occurs in ras poison mode, mode1 reset is used to recover gpu.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
 2 files changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 700eb180ea60..c6f8b6b50b86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2065,6 +2065,14 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
ras->gpu_reset_flags &= 
~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
reset_context.method = AMD_RESET_METHOD_MODE2;
}
+
+   /* Fatal error occurs in poison mode, mode1 reset is 
used to
+* recover gpu.
+*/
+   if (ras->gpu_reset_flags & 
AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
+   ras->gpu_reset_flags &= 
~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   set_bit(AMDGPU_NEED_FULL_RESET, 
_context.flags);
+   }
}

amdgpu_device_gpu_recover(ras->adev, NULL, _context); @@ 
-2956,9 +2964,12 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
return;

if (atomic_cmpxchg(_ras_in_intr, 0, 1) == 0) {
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
dev_info(adev->dev, "uncorrectable hardware error"
"(ERREVENT_ATHUB_INTERRUPT) detected!\n");

+   ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
amdgpu_ras_reset_gpu(adev);
}
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 46bf1889a9d7..ffb49b2d533a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -340,6 +340,7 @@ enum amdgpu_ras_ret {
 #define AMDGPU_RAS_ERR_ADDRESS_VALID   (1 << 2)

 #define AMDGPU_RAS_GPU_RESET_MODE2_RESET  (0x1 << 0)
+#define AMDGPU_RAS_GPU_RESET_MODE1_RESET  (0x1 << 1)

 struct amdgpu_ras_err_status_reg_entry {
uint32_t hwip;
--
2.34.1



[PATCH] drm/amdgpu: gpu recovers from fatal error in poison mode

2023-06-25 Thread YiPeng Chai
Fatal error occurs in ras poison mode, mode1 reset
is used to recover gpu.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
 2 files changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 700eb180ea60..c6f8b6b50b86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2065,6 +2065,14 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
ras->gpu_reset_flags &= 
~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
reset_context.method = AMD_RESET_METHOD_MODE2;
}
+
+   /* Fatal error occurs in poison mode, mode1 reset is 
used to
+* recover gpu.
+*/
+   if (ras->gpu_reset_flags & 
AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
+   ras->gpu_reset_flags &= 
~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   set_bit(AMDGPU_NEED_FULL_RESET, 
_context.flags);
+   }
}
 
amdgpu_device_gpu_recover(ras->adev, NULL, _context);
@@ -2956,9 +2964,12 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device 
*adev)
return;
 
if (atomic_cmpxchg(_ras_in_intr, 0, 1) == 0) {
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
dev_info(adev->dev, "uncorrectable hardware error"
"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
 
+   ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
amdgpu_ras_reset_gpu(adev);
}
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 46bf1889a9d7..ffb49b2d533a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -340,6 +340,7 @@ enum amdgpu_ras_ret {
 #define AMDGPU_RAS_ERR_ADDRESS_VALID   (1 << 2)
 
 #define AMDGPU_RAS_GPU_RESET_MODE2_RESET  (0x1 << 0)
+#define AMDGPU_RAS_GPU_RESET_MODE1_RESET  (0x1 << 1)
 
 struct amdgpu_ras_err_status_reg_entry {
uint32_t hwip;
-- 
2.34.1