Re: [RFC v4 04/11] drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.

2022-02-08 Thread Christian König




Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky:

No need to to trigger another work queue inside the work queue.

v3:

Problem:
Extra reset caused by host side FLR notification
following guest side triggered reset.
Fix: Preven qeuing flr_work from mailbox irq if guest
already executing a reset.

Suggested-by: Liu Shaoyun 
Signed-off-by: Andrey Grodzovsky 


Feel free to add an Acked-by: Christian König 
, but an rb from somebody more familiar with 
the code would be better.


Regards,
Christian.


---
  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 9 ++---
  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 9 ++---
  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 9 ++---
  3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 56da5ab82987..5869d51d8bee 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -282,7 +282,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
if (amdgpu_device_should_recover_gpu(adev)
&& (!amdgpu_device_has_job_running(adev) ||
adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
  }
  
  static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,

@@ -307,8 +307,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device 
*adev,
  
  	switch (event) {

case IDH_FLR_NOTIFICATION:
-   if (amdgpu_sriov_runtime(adev))
-   schedule_work(>virt.flr_work);
+   if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ >virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
case IDH_QUERY_ALIVE:
xgpu_ai_mailbox_send_ack(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 477d0dde19c5..5728a6401d73 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -309,7 +309,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
*work)
adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
  }
  
  static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,

@@ -337,8 +337,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device 
*adev,
  
  	switch (event) {

case IDH_FLR_NOTIFICATION:
-   if (amdgpu_sriov_runtime(adev))
-   schedule_work(>virt.flr_work);
+   if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ >virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can 
ignore
 * it byfar since that polling thread will handle it,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index aef9d059ae52..02290febfcf4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct 
*work)
  
  	/* Trigger recovery due to world switch failure */

if (amdgpu_device_should_recover_gpu(adev))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
  }
  
  static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,

@@ -550,8 +550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device 
*adev,
r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
  
  		/* only handle FLR_NOTIFY now */

-   if (!r)
-   schedule_work(>virt.flr_work);
+   if (!r && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ >virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
}
  
  	return 0;




RE: [RFC v4 04/11] drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.

2022-02-08 Thread Liu, Shaoyun
[AMD Official Use Only]

This patch is reviewed by  Shaoyun.liu 

Since  other  patches are suggested by  other engineer and they may already od 
some review on them , so I will leave  them  to continue review  the rest 
patches.  

Regards
Shaoyun.liu

-Original Message-
From: Grodzovsky, Andrey  
Sent: Tuesday, February 8, 2022 7:23 PM
To: dri-de...@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Koenig, Christian ; dan...@ffwll.ch; Liu, Monk 
; Chen, Horace ; Lazar, Lijo 
; Chen, JingWen ; Grodzovsky, Andrey 
; Liu, Shaoyun 
Subject: [RFC v4 04/11] drm/amd/virt: For SRIOV send GPU reset directly to TDR 
queue.

No need to to trigger another work queue inside the work queue.

v3:

Problem:
Extra reset caused by host side FLR notification following guest side triggered 
reset.
Fix: Preven qeuing flr_work from mailbox irq if guest already executing a reset.

Suggested-by: Liu Shaoyun 
Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 9 ++---  
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 9 ++---  
drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 9 ++---
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 56da5ab82987..5869d51d8bee 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -282,7 +282,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
if (amdgpu_device_should_recover_gpu(adev)
&& (!amdgpu_device_has_job_running(adev) ||
adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev, @@ -307,8 
+307,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
 
switch (event) {
case IDH_FLR_NOTIFICATION:
-   if (amdgpu_sriov_runtime(adev))
-   schedule_work(>virt.flr_work);
+   if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ >virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
case IDH_QUERY_ALIVE:
xgpu_ai_mailbox_send_ack(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 477d0dde19c5..5728a6401d73 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -309,7 +309,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
*work)
adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev, @@ -337,8 
+337,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,
 
switch (event) {
case IDH_FLR_NOTIFICATION:
-   if (amdgpu_sriov_runtime(adev))
-   schedule_work(>virt.flr_work);
+   if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ >virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can 
ignore
 * it byfar since that polling thread will handle it, diff 
--git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index aef9d059ae52..02290febfcf4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct 
*work)
 
/* Trigger recovery due to world switch failure */
if (amdgpu_device_should_recover_gpu(adev))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev, @@ -550,8 
+550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
 
/* only handle FLR_NOTIFY now */
-   if (!r)
-   schedule_work(>virt.flr_work);
+   if (!r && !amdgp

[RFC v4 04/11] drm/amd/virt: For SRIOV send GPU reset directly to TDR queue.

2022-02-08 Thread Andrey Grodzovsky
No need to to trigger another work queue inside the work queue.

v3:

Problem:
Extra reset caused by host side FLR notification
following guest side triggered reset.
Fix: Preven qeuing flr_work from mailbox irq if guest
already executing a reset.

Suggested-by: Liu Shaoyun 
Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 9 ++---
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 9 ++---
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 9 ++---
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 56da5ab82987..5869d51d8bee 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -282,7 +282,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
if (amdgpu_device_should_recover_gpu(adev)
&& (!amdgpu_device_has_job_running(adev) ||
adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -307,8 +307,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device 
*adev,
 
switch (event) {
case IDH_FLR_NOTIFICATION:
-   if (amdgpu_sriov_runtime(adev))
-   schedule_work(>virt.flr_work);
+   if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ >virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
case IDH_QUERY_ALIVE:
xgpu_ai_mailbox_send_ack(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 477d0dde19c5..5728a6401d73 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -309,7 +309,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
*work)
adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -337,8 +337,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device 
*adev,
 
switch (event) {
case IDH_FLR_NOTIFICATION:
-   if (amdgpu_sriov_runtime(adev))
-   schedule_work(>virt.flr_work);
+   if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ >virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can 
ignore
 * it byfar since that polling thread will handle it,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index aef9d059ae52..02290febfcf4 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct 
*work)
 
/* Trigger recovery due to world switch failure */
if (amdgpu_device_should_recover_gpu(adev))
-   amdgpu_device_gpu_recover(adev, NULL);
+   amdgpu_device_gpu_recover_imp(adev, NULL);
 }
 
 static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -550,8 +550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device 
*adev,
r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
 
/* only handle FLR_NOTIFY now */
-   if (!r)
-   schedule_work(>virt.flr_work);
+   if (!r && !amdgpu_in_reset(adev))
+   WARN_ONCE(!queue_work(adev->reset_domain.wq,
+ >virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
}
 
return 0;
-- 
2.25.1