RE: [PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR

2024-04-28 Thread Deng, Emily
[AMD Official Use Only - General]

Reviewed-by: Emily Deng 

Emily Deng
Best Wishes



>-----Original Message-----
>From: Li, Yunxiang (Teddy) 
>Sent: Saturday, April 27, 2024 2:27 AM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Koenig, Christian
>; Lazar, Lijo ; Kuehling,
>Felix ; Deng, Emily ; Li,
>Yunxiang (Teddy) 
>Subject: [PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR
>
>There are other reset sources that pass NULL as the job pointer, such as
>amdgpu_amdkfd_reset_work. Therefore, using the job pointer to check if the
>FLR comes from the host does not work.
>
>Add a flag in reset_context to explicitly mark host triggered reset, and set
>this flag when we receive host reset notification.
>
>Signed-off-by: Yunxiang Li 
>---
>v2: fix typo
>v3: pass reset_context directly
>v4: clear the flag in case we retry
>
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 ++++++++-----
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  1 +
> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  1 +
> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  1 +
> drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c  |  1 +
> 5 files changed, 12 insertions(+), 5 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>index 8befd10bf007..33c889c027a5 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>@@ -5055,13 +5055,13 @@ static int amdgpu_device_recover_vram(struct
>amdgpu_device *adev)
>  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
>  *
>  * @adev: amdgpu_device pointer
>- * @from_hypervisor: request from hypervisor
>+ * @reset_context: amdgpu reset context pointer
>  *
>  * do VF FLR and reinitialize Asic
>  * return 0 means succeeded otherwise failed
>  */
> static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
>-   bool from_hypervisor)
>+   struct amdgpu_reset_context
>*reset_context)
> {
>   int r;
>   struct amdgpu_hive_info *hive = NULL;
>@@ -5070,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct
>amdgpu_device *adev,
> retry:
>   amdgpu_amdkfd_pre_reset(adev);
>
>-  if (from_hypervisor)
>+  if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
>+  clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
>   r = amdgpu_virt_request_full_gpu(adev, true);
>-  else
>+  } else {
>   r = amdgpu_virt_reset_gpu(adev);
>+  }
>   if (r)
>   return r;
>+
>   amdgpu_ras_set_fed(adev, false);
>   amdgpu_irq_gpu_reset_resume_helper(adev);
>
>@@ -5826,7 +5829,7 @@ int amdgpu_device_gpu_recover(struct
>amdgpu_device *adev,
>   /* Actual ASIC resets if needed.*/
>   /* Host driver will handle XGMI hive reset for SRIOV */
>   if (amdgpu_sriov_vf(adev)) {
>-  r = amdgpu_device_reset_sriov(adev, job ? false : true);
>+  r = amdgpu_device_reset_sriov(adev, reset_context);
>   if (r)
>   adev->asic_reset_res = r;
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>index b11d190ece53..5a9cc043b858 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>@@ -33,6 +33,7 @@ enum AMDGPU_RESET_FLAGS {
>   AMDGPU_NEED_FULL_RESET = 0,
>   AMDGPU_SKIP_HW_RESET = 1,
>   AMDGPU_SKIP_COREDUMP = 2,
>+  AMDGPU_HOST_FLR = 3,
> };
>
> struct amdgpu_reset_context {
>diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>index c5ba9c4757a8..f4c47492e0cd 100644
>--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>@@ -292,6 +292,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>work_struct *work)
>   reset_context.method = AMD_RESET_METHOD_NONE;
>   reset_context.reset_req_dev = adev;
>   clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>+  set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
>
>   amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>   }
>diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>index fa9d1b02f391..14cc7910e5cf 100644
>--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>@@ -328,6 +328,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>work_struct *work)
>   reset_context.method = AMD_RESET_METHOD_NONE;
>   reset_context.reset_req_dev = adev;
>   clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>+  set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
>
>   amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>   }
>diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
>index 14a065516ae4..78cd07744ebe 100644
>--- a/driver

RE: [PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR

2024-04-30 Thread Luo, Zhigang
[AMD Official Use Only - General]

Reviewed-by: Zhigang Luo 

-----Original Message-----
From: amd-gfx  On Behalf Of Yunxiang Li
Sent: Friday, April 26, 2024 2:27 PM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Koenig, Christian 
; Lazar, Lijo ; Kuehling, Felix 
; Deng, Emily ; Li, Yunxiang 
(Teddy) 
Subject: [PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR

There are other reset sources that pass NULL as the job pointer, such as 
amdgpu_amdkfd_reset_work. Therefore, using the job pointer to check if the FLR 
comes from the host does not work.

Add a flag in reset_context to explicitly mark host triggered reset, and set 
this flag when we receive host reset notification.

Signed-off-by: Yunxiang Li 
---
v2: fix typo
v3: pass reset_context directly
v4: clear the flag in case we retry

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 ++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  1 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c  |  1 +
 5 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 8befd10bf007..33c889c027a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5055,13 +5055,13 @@ static int amdgpu_device_recover_vram(struct 
amdgpu_device *adev)
  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
  *
  * @adev: amdgpu_device pointer
- * @from_hypervisor: request from hypervisor
+ * @reset_context: amdgpu reset context pointer
  *
  * do VF FLR and reinitialize Asic
  * return 0 means succeeded otherwise failed
  */
 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
-bool from_hypervisor)
+struct amdgpu_reset_context *reset_context)
 {
int r;
struct amdgpu_hive_info *hive = NULL;
@@ -5070,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct 
amdgpu_device *adev,
 retry:
amdgpu_amdkfd_pre_reset(adev);

-   if (from_hypervisor)
+   if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
+   clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
r = amdgpu_virt_request_full_gpu(adev, true);
-   else
+   } else {
r = amdgpu_virt_reset_gpu(adev);
+   }
if (r)
return r;
+
amdgpu_ras_set_fed(adev, false);
amdgpu_irq_gpu_reset_resume_helper(adev);

@@ -5826,7 +5829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* Actual ASIC resets if needed.*/
/* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) {
-   r = amdgpu_device_reset_sriov(adev, job ? false : true);
+   r = amdgpu_device_reset_sriov(adev, reset_context);
if (r)
adev->asic_reset_res = r;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index b11d190ece53..5a9cc043b858 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -33,6 +33,7 @@ enum AMDGPU_RESET_FLAGS {
AMDGPU_NEED_FULL_RESET = 0,
AMDGPU_SKIP_HW_RESET = 1,
AMDGPU_SKIP_COREDUMP = 2,
+   AMDGPU_HOST_FLR = 3,
 };

 struct amdgpu_reset_context {
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index c5ba9c4757a8..f4c47492e0cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -292,6 +292,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+   set_bit(AMDGPU_HOST_FLR, &reset_context.flags);

amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index fa9d1b02f391..14cc7910e5cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -328,6 +328,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
*work)
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+   set_bit(AMDGPU_HOST_FLR, &reset_context.flags);

amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 14a065516ae4..78cd07744ebe 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -529,6