RE: [PATCH 04/15] drm/amdgpu: add poison creation handler

2024-04-24 Thread Chai, Thomas
[AMD Official Use Only - General]

OK, I will do this.


-
Best Regards,
Thomas

-----Original Message-----
From: Zhang, Hawking 
Sent: Thursday, April 25, 2024 10:33 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley ; Chai, Thomas 
Subject: RE: [PATCH 04/15] drm/amdgpu: add poison creation handler

[AMD Official Use Only - General]

Is it okay to drop the static function below and just implement the logic in the
poison creation handler, leveraging the RAS query API amdgpu_ras_query_error_status()?

It seems to me that the static function may not be reusable for other IP
blocks.

Regards,
Hawking

+ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
+   enum amdgpu_ras_block ras_block, uint32_t timeout_ms) {
+   int ret = 0;
+   struct ras_ecc_log_info *ecc_log;
+   struct ras_query_if info;
+   uint32_t timeout = timeout_ms;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   memset(&info, 0, sizeof(info));
+   info.head.block = ras_block;
+
+   ecc_log = &ras->umc_ecc_log;
+   ecc_log->de_updated = false;
+   do {
+   ret = amdgpu_ras_query_error_status(adev, &info);
+   if (ret) {
+   dev_err(adev->dev, "Failed to query ras error! 
ret:%d\n", ret);
+   return ret;
+   }
+
+   if (timeout && !ecc_log->de_updated) {
+   msleep(1);
+   timeout--;
+   }
+   } while (timeout && !ecc_log->de_updated);
+
+   if (timeout_ms && !timeout) {
+   dev_warn(adev->dev, "Can't find deferred error\n");
+   return -ETIMEDOUT;
+   }
+
+   return 0;
+}
+
+static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+   uint32_t timeout) {
+   amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC,
+timeout); }
+

-----Original Message-----
From: amd-gfx  On Behalf Of YiPeng Chai
Sent: Thursday, April 18, 2024 10:58
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 04/15] drm/amdgpu: add poison creation handler

Add poison creation handler.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 74 +++--
 1 file changed, 69 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 64e6e20c6de7..126616eaeec1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2080,6 +2080,17 @@ static void 
amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj  {
dev_info(obj->adev->dev,
"Poison is created\n");
+
+   if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
+   struct amdgpu_ras *con =
+ amdgpu_ras_get_context(obj->adev);
+
+   amdgpu_ras_put_poison_req(obj->adev,
+   AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
+
+   atomic_inc(&con->page_retirement_req_cnt);
+
+   wake_up(&con->page_retirement_wq);
+   }
 }

 static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, @@ 
-2754,10 +2765,54 @@ static void amdgpu_ras_ecc_log_fini(struct 
ras_ecc_log_info *ecc_log)
mutex_destroy(&ecc_log->lock);
ecc_log->de_updated = false;
 }
+
+static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
+   enum amdgpu_ras_block ras_block, uint32_t timeout_ms) {
+   int ret = 0;
+   struct ras_ecc_log_info *ecc_log;
+   struct ras_query_if info;
+   uint32_t timeout = timeout_ms;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   memset(&info, 0, sizeof(info));
+   info.head.block = ras_block;
+
+   ecc_log = &ras->umc_ecc_log;
+   ecc_log->de_updated = false;
+   do {
+   ret = amdgpu_ras_query_error_status(adev, &info);
+   if (ret) {
+   dev_err(adev->dev, "Failed to query ras error! 
ret:%d\n", ret);
+   return ret;
+   }
+
+   if (timeout && !ecc_log->de_updated) {
+   msleep(1);
+   timeout--;
+   }
+   } while (timeout && !ecc_log->de_updated);
+
+   if (timeout_ms && !timeout) {
+   dev_warn(adev->dev, "Can't find deferred error\n");
+   return -ETIMEDOUT;
+   }
+
+   return 0;
+}
+
+static void amdg

RE: [PATCH 04/15] drm/amdgpu: add poison creation handler

2024-04-24 Thread Zhang, Hawking
[AMD Official Use Only - General]

Is it okay to drop the static function below and just implement the logic in the
poison creation handler, leveraging the RAS query API amdgpu_ras_query_error_status()?

It seems to me that the static function may not be reusable for other IP
blocks.

Regards,
Hawking

+ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
+   enum amdgpu_ras_block ras_block, uint32_t timeout_ms) {
+   int ret = 0;
+   struct ras_ecc_log_info *ecc_log;
+   struct ras_query_if info;
+   uint32_t timeout = timeout_ms;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   memset(&info, 0, sizeof(info));
+   info.head.block = ras_block;
+
+   ecc_log = &ras->umc_ecc_log;
+   ecc_log->de_updated = false;
+   do {
+   ret = amdgpu_ras_query_error_status(adev, &info);
+   if (ret) {
+   dev_err(adev->dev, "Failed to query ras error! 
ret:%d\n", ret);
+   return ret;
+   }
+
+   if (timeout && !ecc_log->de_updated) {
+   msleep(1);
+   timeout--;
+   }
+   } while (timeout && !ecc_log->de_updated);
+
+   if (timeout_ms && !timeout) {
+   dev_warn(adev->dev, "Can't find deferred error\n");
+   return -ETIMEDOUT;
+   }
+
+   return 0;
+}
+
+static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+   uint32_t timeout)
+{
+   amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout); }
+

-----Original Message-----
From: amd-gfx  On Behalf Of YiPeng Chai
Sent: Thursday, April 18, 2024 10:58
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 04/15] drm/amdgpu: add poison creation handler

Add poison creation handler.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 74 +++--
 1 file changed, 69 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 64e6e20c6de7..126616eaeec1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2080,6 +2080,17 @@ static void 
amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj  {
dev_info(obj->adev->dev,
"Poison is created\n");
+
+   if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
+   struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
+
+   amdgpu_ras_put_poison_req(obj->adev,
+   AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
+
+   atomic_inc(&con->page_retirement_req_cnt);
+
+   wake_up(&con->page_retirement_wq);
+   }
 }

 static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, @@ 
-2754,10 +2765,54 @@ static void amdgpu_ras_ecc_log_fini(struct 
ras_ecc_log_info *ecc_log)
mutex_destroy(&ecc_log->lock);
ecc_log->de_updated = false;
 }
+
+static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
+   enum amdgpu_ras_block ras_block, uint32_t timeout_ms) {
+   int ret = 0;
+   struct ras_ecc_log_info *ecc_log;
+   struct ras_query_if info;
+   uint32_t timeout = timeout_ms;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   memset(&info, 0, sizeof(info));
+   info.head.block = ras_block;
+
+   ecc_log = &ras->umc_ecc_log;
+   ecc_log->de_updated = false;
+   do {
+   ret = amdgpu_ras_query_error_status(adev, &info);
+   if (ret) {
+   dev_err(adev->dev, "Failed to query ras error! 
ret:%d\n", ret);
+   return ret;
+   }
+
+   if (timeout && !ecc_log->de_updated) {
+   msleep(1);
+   timeout--;
+   }
+   } while (timeout && !ecc_log->de_updated);
+
+   if (timeout_ms && !timeout) {
+   dev_warn(adev->dev, "Can't find deferred error\n");
+   return -ETIMEDOUT;
+   }
+
+   return 0;
+}
+
+static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+   uint32_t timeout)
+{
+   amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout); }
+
 static int amdgpu_ras_page_retirement_thread(void *param)  {
struct amdgpu_device *adev = (struct amdgpu_device *)param;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_poison_msg poison_msg;
+   enum amdgpu_ras_block ras_block;

while (!kthread_should_stop()) {

@@ -2768,13 +2823,22 @@ static int amdgpu_ras_page_retirement_thread(void 
*param)
if (kthread_should_stop())