Re: [PATCH] drm/radeon: avoid deadlock in pm path when waiting for fence

2012-12-17 Thread Christian König
On 17.12.2012 17:04, j.glisse at gmail.com wrote:
> From: Jerome Glisse 
>
> radeon_fence_wait_empty_locked should not trigger a GPU reset, as no
> place it is called from would benefit from such a thing, and it can
> actually lead to a kernel deadlock if the reset is triggered from the
> pm codepath. Instead, force ring completion in the places where it
> makes sense, or return early in the others.
>
> Signed-off-by: Jerome Glisse 

I wanted to stop losing GPU reset signals by moving the reset into
radeon_fence_wait_empty_locked, but it's true that in most cases it
doesn't make much sense (suspend/finish), and I didn't know that it
could cause a hang in the PM code.

We should probably fix the PM code to properly signal this condition to
its caller and reset the GPU when it is possible to do so, but fixing
the deadlock is of course more important.

It should probably go into the stable kernel as well, but anyway it is:

Reviewed-by: Christian König

> [...]
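
For illustration, a minimal sketch of the follow-up Christian describes above: let the PM path propagate the error and trigger the reset only once its own locks have been dropped. The helper below and its locking are hypothetical and simplified, not actual radeon code.

static void radeon_pm_example_handle_stuck_ring(struct radeon_device *rdev, int ring)
{
        int r;

        /* detect the condition while the ring lock is held ... */
        mutex_lock(&rdev->ring_lock);
        r = radeon_fence_wait_empty_locked(rdev, ring);
        mutex_unlock(&rdev->ring_lock);

        /* ... but only reset once the lock is dropped, so that
         * radeon_gpu_reset() cannot deadlock against this caller
         * (simplified: the real PM path holds more than ring_lock)
         */
        if (r == -EDEADLK)
                radeon_gpu_reset(rdev);
}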

Re: [PATCH] drm/radeon: avoid deadlock in pm path when waiting for fence

2012-12-17 Thread Alex Deucher
Added to fixes and stable.  Thanks!

On Mon, Dec 17, 2012 at 11:41 AM, Christian König wrote:
> On 17.12.2012 17:04, j.glisse at gmail.com wrote:
>>
>> From: Jerome Glisse 
>>
>> radeon_fence_wait_empty_locked should not trigger a GPU reset, as no
>> place it is called from would benefit from such a thing, and it can
>> actually lead to a kernel deadlock if the reset is triggered from the
>> pm codepath. Instead, force ring completion in the places where it
>> makes sense, or return early in the others.
>>
>> Signed-off-by: Jerome Glisse 
>
>
> I wanted to stop losing GPU reset signals by moving the reset into
> radeon_fence_wait_empty_locked, but it's true that in most cases it doesn't
> make much sense (suspend/finish), and I didn't know that it could cause a
> hang in the PM code.
>
> We should probably fix the PM code to properly signal this condition to its
> caller and reset the GPU when it is possible to do so, but fixing the
> deadlock is of course more important.
>
> It should probably go into the stable kernel as well, but anyway it is:
>
> Reviewed-by: Christian König
>
>> [...]

[PATCH] drm/radeon: avoid deadlock in pm path when waiting for fence

2012-12-17 Thread j.gli...@gmail.com
From: Jerome Glisse 

radeon_fence_wait_empty_locked should not trigger a GPU reset, as no
place it is called from would benefit from such a thing, and it can
actually lead to a kernel deadlock if the reset is triggered from the
pm codepath. Instead, force ring completion in the places where it
makes sense, or return early in the others.

Signed-off-by: Jerome Glisse 
---
 drivers/gpu/drm/radeon/radeon.h        |  2 +-
 drivers/gpu/drm/radeon/radeon_device.c | 13 +++--
 drivers/gpu/drm/radeon/radeon_fence.c  | 30 ++
 drivers/gpu/drm/radeon/radeon_pm.c     | 15 ---
 4 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 9c7625c..071b2d7 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -231,7 +231,7 @@ void radeon_fence_process(struct radeon_device *rdev, int ring);
 bool radeon_fence_signaled(struct radeon_fence *fence);
 int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
 int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring);
-void radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring);
+int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring);
 int radeon_fence_wait_any(struct radeon_device *rdev,
  struct radeon_fence **fences,
  bool intr);
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 774fae7..53a9223 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -1163,6 +1163,7 @@ int radeon_suspend_kms(struct drm_device *dev, pm_message_t state)
struct drm_crtc *crtc;
struct drm_connector *connector;
int i, r;
+   bool force_completion = false;

if (dev == NULL || dev->dev_private == NULL) {
return -ENODEV;
@@ -1205,8 +1206,16 @@ int radeon_suspend_kms(struct drm_device *dev, pm_message_t state)

mutex_lock(&rdev->ring_lock);
/* wait for gpu to finish processing current batch */
-   for (i = 0; i < RADEON_NUM_RINGS; i++)
-   radeon_fence_wait_empty_locked(rdev, i);
+   for (i = 0; i < RADEON_NUM_RINGS; i++) {
+   r = radeon_fence_wait_empty_locked(rdev, i);
+   if (r) {
+   /* delay GPU reset to resume */
+   force_completion = true;
+   }
+   }
+   if (force_completion) {
+   radeon_fence_driver_force_completion(rdev);
+   }
mutex_unlock(&rdev->ring_lock);

radeon_save_bios_scratch_regs(rdev);
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index bf7b20e..28c09b6 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -609,26 +609,20 @@ int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring)
  * Returns 0 if the fences have passed, error for all other cases.
  * Caller must hold ring lock.
  */
-void radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring)
+int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring)
 {
uint64_t seq = rdev->fence_drv[ring].sync_seq[ring];
+   int r;

-   while(1) {
-   int r;
-   r = radeon_fence_wait_seq(rdev, seq, ring, false, false);
+   r = radeon_fence_wait_seq(rdev, seq, ring, false, false);
+   if (r) {
if (r == -EDEADLK) {
-   mutex_unlock(&rdev->ring_lock);
-   r = radeon_gpu_reset(rdev);
-   mutex_lock(&rdev->ring_lock);
-   if (!r)
-   continue;
-   }
-   if (r) {
-   dev_err(rdev->dev, "error waiting for ring to become"
-   " idle (%d)\n", r);
+   return -EDEADLK;
}
-   return;
+   dev_err(rdev->dev, "error waiting for ring[%d] to become idle (%d)\n",
+       ring, r);
}
+   return 0;
 }

 /**
@@ -854,13 +848,17 @@ int radeon_fence_driver_init(struct radeon_device *rdev)
  */
 void radeon_fence_driver_fini(struct radeon_device *rdev)
 {
-   int ring;
+   int ring, r;

mutex_lock(&rdev->ring_lock);
for (ring = 0; ring < RADEON_NUM_RINGS; ring++) {
if (!rdev->fence_drv[ring].initialized)
continue;
-   radeon_fence_wait_empty_locked(rdev, ring);
+   r = radeon_fence_wait_empty_locked(rdev, ring);
+   if (r) {
+   /* no need to trigger GPU reset as we are unloading */
+   radeon_fence_driver_force_completion(rdev);
+   }
wake_up_all(&rdev->fence_queue);
radeon_scratch_free(rdev, 
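
The radeon_pm.c hunk is cut off above. Roughly, the "return early in others" part of the commit message refers to the pm codepath: now that radeon_fence_wait_empty_locked() returns an error code, that path can bail out while the GPU is stuck instead of resetting with its own locks still held. A simplified sketch of the pattern (illustrative only, not the literal hunk):

        int i, r;

        mutex_lock(&rdev->ring_lock);
        /* wait for the rings to drain */
        for (i = 0; i < RADEON_NUM_RINGS; i++) {
                r = radeon_fence_wait_empty_locked(rdev, i);
                if (r) {
                        /* GPU is stuck; resetting from here would deadlock
                         * on the locks this path holds, so just give up */
                        mutex_unlock(&rdev->ring_lock);
                        return;
                }
        }
        mutex_unlock(&rdev->ring_lock);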
