RE: [PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling

2024-06-18 Thread Chai, Thomas
[AMD Official Use Only - AMD Internal Distribution Only]

-
Best Regards,
Thomas

-Original Message-
From: Wang, Yang(Kevin) 
Sent: Tuesday, June 18, 2024 3:19 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Yang, Stanley 
Subject: RE: [PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling



-Original Message-
From: Chai, Thomas 
Sent: 2024年6月18日 14:34
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley ; Chai, Thomas 
Subject: [PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling

Add gpu reset check and exception handling for page retirement.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +
 1 file changed, 43 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7f8e6ca07957..635dc86dbfd8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1386,10 +1386,15 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev, struct ras_query_i
memset(, 0, sizeof(qctx));
qctx.event_id = amdgpu_ras_acquire_event_id(adev, 
amdgpu_ras_intr_triggered() ?
   RAS_EVENT_TYPE_ISR : 
RAS_EVENT_TYPE_INVALID);
+
+   if (!down_read_trylock(>reset_domain->sem))
+   return -EIO;
+
ret = amdgpu_ras_query_error_status_helper(adev, info,
   _data,
   ,
   error_query_mode);
+   up_read(>reset_domain->sem);
if (ret)
goto out_fini_err_data;

@@ -2884,6 +2889,14 @@ static int amdgpu_ras_poison_creation_handler(struct 
amdgpu_device *adev,
return 0;
 }

+static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev) {
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_poison_msg msg;
+
+   while (kfifo_get(>poison_fifo, )); }
+
 static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
uint32_t msg_count, uint32_t *gpu_reset)  { @@ -2913,6 
+2926,11 @@ static int amdgpu_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
else
reset = reset_flags;

+   /* Check if gpu is in reset state */
+   if (!down_read_trylock(>reset_domain->sem))
+   return -EIO;
+   up_read(>reset_domain->sem);

> [Kevin]:
> I'm confused that why not using 'amdgpu_in_reset()' helper function to check 
> reset state?

>Best Regards,
> Kevin

[Thomas] This function is called in page retirement thread.
 According to Christian König's previous email suggestions  
that "It's illegal to call amdgpu_in_reset() from outside of the hw specific 
backends."

+
flush_delayed_work(>page_retirement_dwork);

reinit_completion(>ras_recovery_completion);
@@ -2977,6 +2995,31 @@ static int amdgpu_ras_page_retirement_thread(void *param)
}
}

+   if ((ret == -EIO) || (gpu_reset == 
AMDGPU_RAS_GPU_RESET_MODE1_RESET)) {
+   /* gpu is in mode-1 reset state */
+   /* Clear poison creation request */
+   while (atomic_read(>poison_creation_count))
+   atomic_dec(>poison_creation_count);
[Kevin]:

Aha! It is better to use atomic_set() to instead of it.

Best Regards,
Kevin
+
+   /* Clear poison consumption fifo */
+   amdgpu_ras_clear_poison_fifo(adev);
+
+   while (atomic_read(>page_retirement_req_cnt))
+   atomic_dec(>page_retirement_req_cnt);
+
+   if (ret == -EIO) {
+   /* Wait for mode-1 reset to complete */
+   down_read(>reset_domain->sem);
+   up_read(>reset_domain->sem);
+   }
+
+   /* Wake up work queue to save bad pages to eeprom */
+   schedule_delayed_work(>page_retirement_dwork, 0);
+   } else if (gpu_reset) {
+   /* gpu is in mode-2 reset or other reset state */
+   /* Wake up work queue to save bad pages to eeprom */
+   schedule_delayed_work(>page_retirement_dwork, 0);
+   }
 #else
 dev_info(adev->dev, "Start processing page retirement. request:%d\n",
 atomic_read(>page_retirement_req_cnt));
--
2.34.1



RE: [PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling

2024-06-18 Thread Wang, Yang(Kevin)


-Original Message-
From: Chai, Thomas  
Sent: 2024年6月18日 14:34
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley ; Chai, Thomas 
Subject: [PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling

Add gpu reset check and exception handling for page retirement.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +
 1 file changed, 43 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7f8e6ca07957..635dc86dbfd8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1386,10 +1386,15 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev, struct ras_query_i
memset(, 0, sizeof(qctx));
qctx.event_id = amdgpu_ras_acquire_event_id(adev, 
amdgpu_ras_intr_triggered() ?
   RAS_EVENT_TYPE_ISR : 
RAS_EVENT_TYPE_INVALID);
+
+   if (!down_read_trylock(>reset_domain->sem))
+   return -EIO;
+
ret = amdgpu_ras_query_error_status_helper(adev, info,
   _data,
   ,
   error_query_mode);
+   up_read(>reset_domain->sem);
if (ret)
goto out_fini_err_data;
 
@@ -2884,6 +2889,14 @@ static int amdgpu_ras_poison_creation_handler(struct 
amdgpu_device *adev,
return 0;
 }
 
+static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev) {
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_poison_msg msg;
+
+   while (kfifo_get(>poison_fifo, )); }
+
 static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
uint32_t msg_count, uint32_t *gpu_reset)  { @@ -2913,6 
+2926,11 @@ static int amdgpu_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
else
reset = reset_flags;
 
+   /* Check if gpu is in reset state */
+   if (!down_read_trylock(>reset_domain->sem))
+   return -EIO;
+   up_read(>reset_domain->sem);

[Kevin]:
I'm confused that why not using 'amdgpu_in_reset()' helper function to check 
reset state?

Best Regards,
Kevin
+
flush_delayed_work(>page_retirement_dwork);
 
reinit_completion(>ras_recovery_completion);
@@ -2977,6 +2995,31 @@ static int amdgpu_ras_page_retirement_thread(void *param)
}
}
 
+   if ((ret == -EIO) || (gpu_reset == 
AMDGPU_RAS_GPU_RESET_MODE1_RESET)) {
+   /* gpu is in mode-1 reset state */
+   /* Clear poison creation request */
+   while (atomic_read(>poison_creation_count))
+   atomic_dec(>poison_creation_count);
[Kevin]:

Aha! It is better to use atomic_set() to instead of it.

Best Regards,
Kevin
+
+   /* Clear poison consumption fifo */
+   amdgpu_ras_clear_poison_fifo(adev);
+
+   while (atomic_read(>page_retirement_req_cnt))
+   atomic_dec(>page_retirement_req_cnt);
+
+   if (ret == -EIO) {
+   /* Wait for mode-1 reset to complete */
+   down_read(>reset_domain->sem);
+   up_read(>reset_domain->sem);
+   }
+
+   /* Wake up work queue to save bad pages to eeprom */
+   schedule_delayed_work(>page_retirement_dwork, 0);
+   } else if (gpu_reset) {
+   /* gpu is in mode-2 reset or other reset state */
+   /* Wake up work queue to save bad pages to eeprom */
+   schedule_delayed_work(>page_retirement_dwork, 0);
+   }
 #else
 dev_info(adev->dev, "Start processing page retirement. request:%d\n",
 atomic_read(>page_retirement_req_cnt));
--
2.34.1