RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning

2024-01-17 Thread Chai, Thomas
[AMD Official Use Only - General]


-
Best Regards,
Thomas


_
From: Zhou1, Tao 
Sent: Thursday, January 18, 2024 11:24 AM
To: Chai, Thomas ; Zhang, Hawking ; 
amd-gfx@lists.freedesktop.org
Cc: Li, Candice ; Wang, Yang(Kevin) 
; Yang, Stanley 
Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


[AMD Official Use Only - General]





  _
  From: Chai, Thomas mailto:yipeng.c...@amd.com>>
  Sent: Thursday, January 18, 2024 11:06 AM
  To: Zhang, Hawking mailto:hawking.zh...@amd.com>>; 
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
  Cc: Zhou1, Tao mailto:tao.zh...@amd.com>>; Li, Candice 
mailto:candice...@amd.com>>; Wang, Yang(Kevin) 
mailto:kevinyang.w...@amd.com>>; Yang, Stanley 
mailto:stanley.y...@amd.com>>
      Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


  [AMD Official Use Only - General]






  -
  Best Regards,
  Thomas


  _
  From: Zhang, Hawking mailto:hawking.zh...@amd.com>>
  Sent: Wednesday, January 17, 2024 7:54 PM
  To: Chai, Thomas mailto:yipeng.c...@amd.com>>; 
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
  Cc: Zhou1, Tao mailto:tao.zh...@amd.com>>; Li, Candice 
mailto:candice...@amd.com>>; Wang, Yang(Kevin) 
mailto:kevinyang.w...@amd.com>>; Yang, Stanley 
mailto:stanley.y...@amd.com>>
  Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


  [AMD Official Use Only - General]



  Please check my comments inline

  Regards,
  Hawking

  -Original Message-
  From: Chai, Thomas mailto:yipeng.c...@amd.com>>
  Sent: Tuesday, January 16, 2024 16:21
  To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
  Cc: Chai, Thomas mailto:yipeng.c...@amd.com>>; 
Zhang, Hawking mailto:hawking.zh...@amd.com>>; Zhou1, 
Tao mailto:tao.zh...@amd.com>>; Li, Candice 
mailto:candice...@amd.com>>; Wang, Yang(Kevin) 
mailto:kevinyang.w...@amd.com>>; Yang, Stanley 
mailto:stanley.y...@amd.com>>; Chai, Thomas 
mailto:yipeng.c...@amd.com>>
  Subject: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning

  Use asynchronous polling to handle umc_v12_0 poisoning.

  Signed-off-by: YiPeng Chai 
mailto:yipeng.c...@amd.com>>
  ---
   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   5 +
   drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++-
   drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |   3 +
   3 files changed, 120 insertions(+), 31 deletions(-)

  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  index 856206e95842..44929281840e 100644
  --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  @@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block)
   /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
   #define RAS_BAD_PAGE_COVER  (100 * 1024 * 1024ULL)

  +#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
  +
   enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
  @@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void 
*param)
atomic_read(&con->page_retirement_req_cnt));

atomic_dec(&con->page_retirement_req_cnt);
  +
  + amdgpu_umc_poison_retire_page_polling_timeout(adev,
  + false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}

return 0;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  index 9d1cf41cf483..2dde29cb807d 100644
  --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  @@ -23,6 +23,7 @@

   #include "amdgpu.h"
   #include "umc_v6_7.h"
  +#define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms

   static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
  @@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
return ret;
   }

  -static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
  - void *ras_error_status,
  - struct amdgpu_iv_entry *entry,
  - bool reset)
  +static void amdgpu_umc_handle_bad_pages(struct amdgpu_devi

RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning

2024-01-17 Thread Zhou1, Tao
[AMD Official Use Only - General]


  _
  From: Chai, Thomas 
  Sent: Thursday, January 18, 2024 11:06 AM
  To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org
  Cc: Zhou1, Tao ; Li, Candice ; 
Wang, Yang(Kevin) ; Yang, Stanley 
  Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


  [AMD Official Use Only - General]






  -
  Best Regards,
  Thomas


  _
  From: Zhang, Hawking mailto:hawking.zh...@amd.com>>
  Sent: Wednesday, January 17, 2024 7:54 PM
  To: Chai, Thomas mailto:yipeng.c...@amd.com>>; 
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
  Cc: Zhou1, Tao mailto:tao.zh...@amd.com>>; Li, Candice 
mailto:candice...@amd.com>>; Wang, Yang(Kevin) 
mailto:kevinyang.w...@amd.com>>; Yang, Stanley 
mailto:stanley.y...@amd.com>>
      Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


  [AMD Official Use Only - General]



  Please check my comments inline

  Regards,
  Hawking

  -Original Message-
  From: Chai, Thomas mailto:yipeng.c...@amd.com>>
  Sent: Tuesday, January 16, 2024 16:21
  To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
  Cc: Chai, Thomas mailto:yipeng.c...@amd.com>>; 
Zhang, Hawking mailto:hawking.zh...@amd.com>>; Zhou1, 
Tao mailto:tao.zh...@amd.com>>; Li, Candice 
mailto:candice...@amd.com>>; Wang, Yang(Kevin) 
mailto:kevinyang.w...@amd.com>>; Yang, Stanley 
mailto:stanley.y...@amd.com>>; Chai, Thomas 
mailto:yipeng.c...@amd.com>>
  Subject: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning

  Use asynchronous polling to handle umc_v12_0 poisoning.

  Signed-off-by: YiPeng Chai 
mailto:yipeng.c...@amd.com>>
  ---
   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   5 +
   drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++-
   drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |   3 +
   3 files changed, 120 insertions(+), 31 deletions(-)

  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  index 856206e95842..44929281840e 100644
  --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  @@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block)
   /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
   #define RAS_BAD_PAGE_COVER  (100 * 1024 * 1024ULL)

  +#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
  +
   enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
  @@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void 
*param)
atomic_read(&con->page_retirement_req_cnt));

atomic_dec(&con->page_retirement_req_cnt);
  +
  + amdgpu_umc_poison_retire_page_polling_timeout(adev,
  + false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}

return 0;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  index 9d1cf41cf483..2dde29cb807d 100644
  --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  @@ -23,6 +23,7 @@

   #include "amdgpu.h"
   #include "umc_v6_7.h"
  +#define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms

   static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
  @@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
return ret;
   }

  -static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
  - void *ras_error_status,
  - struct amdgpu_iv_entry *entry,
  - bool reset)
  +static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
  + void *ras_error_status)
   {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0;
unsigned long err_count;
  -
  - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
  + mutex_lock(&con->page_retirement_lock);
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
if (ret == -EOPNOTSUPP) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && @@ 
-163,19 +161,86 @@ static int amdgpu_umc_do_page_retir

RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning

2024-01-17 Thread Chai, Thomas
[AMD Official Use Only - General]


-
Best Regards,
Thomas


_
From: Zhang, Hawking 
Sent: Wednesday, January 17, 2024 7:54 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


[AMD Official Use Only - General]



Please check my comments inline

Regards,
Hawking

-Original Message-
From: Chai, Thomas mailto:yipeng.c...@amd.com>>
Sent: Tuesday, January 16, 2024 16:21
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Chai, Thomas mailto:yipeng.c...@amd.com>>; Zhang, 
Hawking mailto:hawking.zh...@amd.com>>; Zhou1, Tao 
mailto:tao.zh...@amd.com>>; Li, Candice 
mailto:candice...@amd.com>>; Wang, Yang(Kevin) 
mailto:kevinyang.w...@amd.com>>; Yang, Stanley 
mailto:stanley.y...@amd.com>>; Chai, Thomas 
mailto:yipeng.c...@amd.com>>
Subject: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 
poisoning

Use asynchronous polling to handle umc_v12_0 poisoning.

Signed-off-by: YiPeng Chai mailto:yipeng.c...@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |   3 +
 3 files changed, 120 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 856206e95842..44929281840e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block)
 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
 #define RAS_BAD_PAGE_COVER  (100 * 1024 * 1024ULL)

+#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
+
 enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_read(&con->page_retirement_req_cnt));

atomic_dec(&con->page_retirement_req_cnt);
+
+   amdgpu_umc_poison_retire_page_polling_timeout(adev,
+   false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}

return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 9d1cf41cf483..2dde29cb807d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -23,6 +23,7 @@

 #include "amdgpu.h"
 #include "umc_v6_7.h"
+#define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms

 static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
@@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
return ret;
 }

-static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
-   void *ras_error_status,
-   struct amdgpu_iv_entry *entry,
-   bool reset)
+static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+   void *ras_error_status)
 {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0;
unsigned long err_count;
-
-   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+   mutex_lock(&con->page_retirement_lock);
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
if (ret == -EOPNOTSUPP) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
@@ -163,19 +161,86 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
con->update_channel_flag = false;
}
}
-
-   if (reset) {
-   /* use mode-2 reset for poison consumption */
-   if (!entry)
-   con->gpu_reset_flags |= 
AMDGPU_RAS_GPU_RESET_MODE2_RESET;
-   amdgpu_ras_reset_gpu(adev);
-   }
}

kfree(err_data->err_addr);
+
+   mutex_unlock(&con->page_retirement_lock);
+}
+
+static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
+   void *ras_error_status,
+   struct amdgpu_iv_entry *entry,
+   bool reset)
+{
+   struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+   amdgpu_umc_handle_bad_pages(ade

RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning

2024-01-17 Thread Zhang, Hawking
[AMD Official Use Only - General]


Please check my comments inline

Regards,
Hawking

-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 16:21
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 
poisoning

Use asynchronous polling to handle umc_v12_0 poisoning.

Signed-off-by: YiPeng Chai mailto:yipeng.c...@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |   3 +
 3 files changed, 120 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 856206e95842..44929281840e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block)
 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
 #define RAS_BAD_PAGE_COVER  (100 * 1024 * 1024ULL)

+#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
+
 enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_read(&con->page_retirement_req_cnt));

atomic_dec(&con->page_retirement_req_cnt);
+
+   amdgpu_umc_poison_retire_page_polling_timeout(adev,
+   false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}

return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 9d1cf41cf483..2dde29cb807d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -23,6 +23,7 @@

 #include "amdgpu.h"
 #include "umc_v6_7.h"
+#define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms

 static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
@@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
return ret;
 }

-static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
-   void *ras_error_status,
-   struct amdgpu_iv_entry *entry,
-   bool reset)
+static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+   void *ras_error_status)
 {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0;
unsigned long err_count;
-
-   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+   mutex_lock(&con->page_retirement_lock);
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
if (ret == -EOPNOTSUPP) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
@@ -163,19 +161,86 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
con->update_channel_flag = false;
}
}
-
-   if (reset) {
-   /* use mode-2 reset for poison consumption */
-   if (!entry)
-   con->gpu_reset_flags |= 
AMDGPU_RAS_GPU_RESET_MODE2_RESET;
-   amdgpu_ras_reset_gpu(adev);
-   }
}

kfree(err_data->err_addr);
+
+   mutex_unlock(&con->page_retirement_lock);
+}
+
+static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
+   void *ras_error_status,
+   struct amdgpu_iv_entry *entry,
+   bool reset)
+{
+   struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+   amdgpu_umc_handle_bad_pages(adev, ras_error_status);
+
+   if (err_data->ue_count && reset) {
+   /* use mode-2 reset for poison consumption */
+   if (!entry)
+   con->gpu_reset_flags |= 
AMDGPU_RAS_GPU_RESET_MODE2_RESET;

[Hawking]: Shall we also check the con->poison_supported flag to decide 
whether to issue a mode-2 or a mode-1 reset?

+   amdgpu_ras_reset_gpu(adev);
+   }
+
return AMDGPU_RAS_SUCCESS;
 }

+int amdgpu_umc_poison_retire_page_polling_timeout(struct amdgpu_device *adev,
+   bool reset, uint32_t timeout_ms)
[Hawking] int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, 
bool reset, uint32_t timeout_ms)
+{
+   struct ras_err_data err_data;
+   struct ras_common_if head = {
+   .block = AMDGPU_RAS_BLOCK__UMC,
+   }