Re: [PATCH v6 13/16] drm/amdgpu: Fix hang on device removal.

2021-05-10 Thread Christian König




Am 10.05.21 um 18:36 schrieb Andrey Grodzovsky:

If removing while commands in flight you cannot wait to flush the
HW fences on a ring since the device is gone.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++--
  1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 1ffb36bd0b19..fa03702ecbfb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -36,6 +36,7 @@
  #include 
  #include 
  
+#include 

  #include "amdgpu.h"
  #include "amdgpu_trace.h"
  
@@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev)

   */
  void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev)
  {
-   unsigned i, j;
-   int r;
+   int i, r;


Is j not used here any more?

Christian.

  
  	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {

struct amdgpu_ring *ring = adev->rings[i];
@@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct amdgpu_device 
*adev)
continue;
if (!ring->no_scheduler)
drm_sched_fini(&ring->sched);
-   r = amdgpu_fence_wait_empty(ring);
-   if (r) {
-   /* no need to trigger GPU reset as we are unloading */
+   /* You can't wait for HW to signal if it's gone */
+   if (!drm_dev_is_unplugged(&adev->ddev))
+   r = amdgpu_fence_wait_empty(ring);
+   else
+   r = -ENODEV;
+   /* no need to trigger GPU reset as we are unloading */
+   if (r)
amdgpu_fence_driver_force_completion(ring);
-   }
+
if (ring->fence_drv.irq_src)
amdgpu_irq_put(adev, ring->fence_drv.irq_src,
   ring->fence_drv.irq_type);




[PATCH v6 13/16] drm/amdgpu: Fix hang on device removal.

2021-05-10 Thread Andrey Grodzovsky
If removing while commands in flight you cannot wait to flush the
HW fences on a ring since the device is gone.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 1ffb36bd0b19..fa03702ecbfb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 
+#include 
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 
@@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev)
  */
 void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev)
 {
-   unsigned i, j;
-   int r;
+   int i, r;
 
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
struct amdgpu_ring *ring = adev->rings[i];
@@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct amdgpu_device 
*adev)
continue;
if (!ring->no_scheduler)
drm_sched_fini(&ring->sched);
-   r = amdgpu_fence_wait_empty(ring);
-   if (r) {
-   /* no need to trigger GPU reset as we are unloading */
+   /* You can't wait for HW to signal if it's gone */
+   if (!drm_dev_is_unplugged(&adev->ddev))
+   r = amdgpu_fence_wait_empty(ring);
+   else
+   r = -ENODEV;
+   /* no need to trigger GPU reset as we are unloading */
+   if (r)
amdgpu_fence_driver_force_completion(ring);
-   }
+
if (ring->fence_drv.irq_src)
amdgpu_irq_put(adev, ring->fence_drv.irq_src,
   ring->fence_drv.irq_type);
-- 
2.25.1