RE: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption

2024-04-24 Thread Chai, Thomas
[AMD Official Use Only - General]

-
Best Regards,
Thomas

-Original Message-
From: Zhang, Hawking 
Sent: Thursday, April 25, 2024 11:01 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison 
consumption

[AMD Official Use Only - General]

+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device 
*adev,
+   enum amdgpu_ras_block block, uint16_t pasid,
+   pasid_notify pasid_fn, void *data, uint32_t
+reset);

> So we ultimately switch to the above poison consumption handler for all the 
> existing v9 adapters, right? If so, we should be able to make this function 
> backwards compatible. I'm wondering if we can just change the existing 
> amdgpu_amdkfd_ras_poison_consumption_handler.

> Pasid_poison_consumption_handler is a little bit confusing.

[Thomas] No. The new path is taken only when the UMC_HWIP version is greater 
than or equal to IP_VERSION(12, 0, 0). The IP check is in the 
amdgpu_umc_pasid_poison_handler function.



Regards,
Hawking

-Original Message-
From: Chai, Thomas 
Sent: Thursday, April 18, 2024 10:59
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption

Prepare to handle pasid poison consumption.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  9 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c   | 20 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h   |  3 +++
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  3 ++-
 5 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 66753940bb4d..287ce431901c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -759,10 +759,17 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
return amdgpu_ras_get_fed_status(adev);  }

+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device 
*adev,
+   enum amdgpu_ras_block block, uint16_t pasid,
+   pasid_notify pasid_fn, void *data, uint32_t 
reset) {
+   amdgpu_umc_pasid_poison_handler(adev, block, pasid, pasid_fn,
+data, reset); }
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint32_t reset)  {
-   amdgpu_umc_poison_handler(adev, block, reset);
+   amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL,
+ reset);
 }

 int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, diff 
--git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index ad50c7bbc326..54e15994d02b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -401,6 +401,11 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device 
*adev,
struct tile_config *config);  void 
amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint32_t reset);
+
+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device 
*adev,
+   enum amdgpu_ras_block block, uint16_t pasid,
+   pasid_notify pasid_fn, void *data, uint32_t
+reset);
+
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);  bool 
amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem 
*mem);  void amdgpu_amdkfd_block_mmu_notifications(void *p); diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index dcda3d24bee3..8ebbca9e2e22 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -252,8 +252,9 @@ int amdgpu_umc_bad_page_polling_timeout(struct 
amdgpu_device *adev,
return 0;
 }

-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-   enum amdgpu_ras_block block, uint32_t reset)
+int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
+   enum amdgpu_ras_block block, uint16_t pasid,
+   pasid_notify pasid_fn, void *data, uint32_t
+reset)
 {
int ret = AMDGPU_RAS_SUCCESS;

@@ -291,16 +292,14 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,

amdgpu_ras_error_data_fini(_data);
} else {
-   if (reset) {
-   amdgpu_umc_bad_page_polling_timeout(adev,
- 

RE: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption

2024-04-24 Thread Zhang, Hawking
[AMD Official Use Only - General]

+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device 
*adev,
+   enum amdgpu_ras_block block, uint16_t pasid,
+   pasid_notify pasid_fn, void *data, uint32_t reset);

So we ultimately switch to the above poison consumption handler for all the 
existing v9 adapters, right? If so, we should be able to make this function 
backwards compatible. I'm wondering if we can just change the existing 
amdgpu_amdkfd_ras_poison_consumption_handler.

Pasid_poison_consumption_handler is a little bit confusing.

Regards,
Hawking

-Original Message-
From: Chai, Thomas 
Sent: Thursday, April 18, 2024 10:59
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption

Prepare to handle pasid poison consumption.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  9 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c   | 20 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h   |  3 +++
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  3 ++-
 5 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 66753940bb4d..287ce431901c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -759,10 +759,17 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
return amdgpu_ras_get_fed_status(adev);  }

+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device 
*adev,
+   enum amdgpu_ras_block block, uint16_t pasid,
+   pasid_notify pasid_fn, void *data, uint32_t 
reset) {
+   amdgpu_umc_pasid_poison_handler(adev, block, pasid, pasid_fn, data,
+reset); }
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint32_t reset)  {
-   amdgpu_umc_poison_handler(adev, block, reset);
+   amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL, reset);
 }

 int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, diff 
--git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index ad50c7bbc326..54e15994d02b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -401,6 +401,11 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device 
*adev,
struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint32_t reset);
+
+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device 
*adev,
+   enum amdgpu_ras_block block, uint16_t pasid,
+   pasid_notify pasid_fn, void *data, uint32_t reset);
+
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);  bool 
amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem 
*mem);  void amdgpu_amdkfd_block_mmu_notifications(void *p); diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index dcda3d24bee3..8ebbca9e2e22 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -252,8 +252,9 @@ int amdgpu_umc_bad_page_polling_timeout(struct 
amdgpu_device *adev,
return 0;
 }

-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-   enum amdgpu_ras_block block, uint32_t reset)
+int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
+   enum amdgpu_ras_block block, uint16_t pasid,
+   pasid_notify pasid_fn, void *data, uint32_t reset)
 {
int ret = AMDGPU_RAS_SUCCESS;

@@ -291,16 +292,14 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,

amdgpu_ras_error_data_fini(_data);
} else {
-   if (reset) {
-   amdgpu_umc_bad_page_polling_timeout(adev,
-   reset, 
MAX_UMC_POISON_POLLING_TIME_SYNC);
-   } else {
struct amdgpu_ras *con = 
amdgpu_ras_get_context(adev);

+   amdgpu_ras_put_poison_req(adev,
+   block, pasid, pasid_fn, data, reset);
+
atomic_inc(>page_retirement_req_cnt);

wake_up(>page_retirement_wq);
-   }
}
} else {
if (adev->virt.ops &&