Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-18 Thread Alex Deucher
Applied.  Let's see how long this one lasts :)

Alex

On Tue, Aug 17, 2021 at 4:23 AM Michel Dänzer  wrote:
>
> From: Michel Dänzer 
>
> schedule_delayed_work does not push back the work if it was already
> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
> was disabled and re-enabled again during those 100 ms.
>
> This resulted in frame drops / stutter with the upcoming mutter 41
> release on Navi 14, due to constantly enabling GFXOFF in the HW and
> disabling it again (for getting the GPU clock counter).
>
> To fix this, call cancel_delayed_work_sync when the disable count
> transitions from 0 to 1, and only schedule the delayed work on the
> reverse transition, not if the disable count was already 0. This makes
> sure the delayed work doesn't run at unexpected times, and allows it to
> be lock-free.
>
> v2:
> * Use cancel_delayed_work_sync & mutex_trylock instead of
>   mod_delayed_work.
> v3:
> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
> v4:
> * Fix race condition between amdgpu_gfx_off_ctrl incrementing
>   adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
>   checking for it to be 0 (Evan Quan)
>
> Cc: sta...@vger.kernel.org
> Reviewed-by: Lijo Lazar  # v3
> Acked-by: Christian König  # v3
> Signed-off-by: Michel Dänzer 
> ---
>
> Alex, probably best to wait a bit longer before picking this up. :)
>
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 36 +++---
>  2 files changed, 30 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f3fd5ec710b6..f944ed858f3e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct 
> work_struct *work)
> struct amdgpu_device *adev =
> container_of(work, struct amdgpu_device, 
> gfx.gfx_off_delay_work.work);
>
> -   mutex_lock(>gfx.gfx_off_mutex);
> -   if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
> -   if (!amdgpu_dpm_set_powergating_by_smu(adev, 
> AMD_IP_BLOCK_TYPE_GFX, true))
> -   adev->gfx.gfx_off_state = true;
> -   }
> -   mutex_unlock(>gfx.gfx_off_mutex);
> +   WARN_ON_ONCE(adev->gfx.gfx_off_state);
> +   WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
> +
> +   if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
> true))
> +   adev->gfx.gfx_off_state = true;
>  }
>
>  /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a0be0772c8b3..b4ced45301be 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, 
> bool enable)
>
> mutex_lock(>gfx.gfx_off_mutex);
>
> -   if (!enable)
> -   adev->gfx.gfx_off_req_count++;
> -   else if (adev->gfx.gfx_off_req_count > 0)
> +   if (enable) {
> +   /* If the count is already 0, it means there's an imbalance 
> bug somewhere.
> +* Note that the bug may be in a different caller than the 
> one which triggers the
> +* WARN_ON_ONCE.
> +*/
> +   if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
> +   goto unlock;
> +
> adev->gfx.gfx_off_req_count--;
>
> -   if (enable && !adev->gfx.gfx_off_state && 
> !adev->gfx.gfx_off_req_count) {
> -   schedule_delayed_work(>gfx.gfx_off_delay_work, 
> GFX_OFF_DELAY_ENABLE);
> -   } else if (!enable && adev->gfx.gfx_off_state) {
> -   if (!amdgpu_dpm_set_powergating_by_smu(adev, 
> AMD_IP_BLOCK_TYPE_GFX, false)) {
> -   adev->gfx.gfx_off_state = false;
> +   if (adev->gfx.gfx_off_req_count == 0 && 
> !adev->gfx.gfx_off_state)
> +   schedule_delayed_work(>gfx.gfx_off_delay_work, 
> GFX_OFF_DELAY_ENABLE);
> +   } else {
> +   if (adev->gfx.gfx_off_req_count == 0) {
> +   
> cancel_delayed_work_sync(>gfx.gfx_off_delay_work);
> +
> +   if (adev->gfx.gfx_off_state &&
> +   !amdgpu_dpm_set_powergating_by_smu(adev, 
> AMD_IP_BLOCK_TYPE_GFX, false)) {
> +   adev->gfx.gfx_off_state = false;
>
> -   if (adev->gfx.funcs->init_spm_golden) {
> -   dev_dbg(adev->dev, "GFXOFF is disabled, 
> re-init SPM golden settings\n");
> -   amdgpu_gfx_init_spm_golden(adev);
> +   if (adev->gfx.funcs->init_spm_golden) {
> +   

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-17 Thread Lazar, Lijo




On 8/17/2021 5:19 PM, Lazar, Lijo wrote:



On 8/17/2021 4:36 PM, Michel Dänzer wrote:

On 2021-08-17 12:37 p.m., Lazar, Lijo wrote:



On 8/17/2021 3:29 PM, Michel Dänzer wrote:

On 2021-08-17 11:37 a.m., Lazar, Lijo wrote:



On 8/17/2021 2:56 PM, Michel Dänzer wrote:

On 2021-08-17 11:12 a.m., Lazar, Lijo wrote:



On 8/17/2021 1:53 PM, Michel Dänzer wrote:

From: Michel Dänzer 

schedule_delayed_work does not push back the work if it was already
scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
after the first time GFXOFF was disabled and re-enabled, even if 
GFXOFF

was disabled and re-enabled again during those 100 ms.

This resulted in frame drops / stutter with the upcoming mutter 41
release on Navi 14, due to constantly enabling GFXOFF in the HW and
disabling it again (for getting the GPU clock counter).

To fix this, call cancel_delayed_work_sync when the disable count
transitions from 0 to 1, and only schedule the delayed work on the
reverse transition, not if the disable count was already 0. This 
makes
sure the delayed work doesn't run at unexpected times, and 
allows it to

be lock-free.

v2:
* Use cancel_delayed_work_sync & mutex_trylock instead of
  mod_delayed_work.
v3:
* Make amdgpu_device_delay_enable_gfx_off lock-free (Christian 
König)

v4:
* Fix race condition between amdgpu_gfx_off_ctrl incrementing
  adev->gfx.gfx_off_req_count and 
amdgpu_device_delay_enable_gfx_off

  checking for it to be 0 (Evan Quan)

Cc: sta...@vger.kernel.org
Reviewed-by: Lijo Lazar  # v3
Acked-by: Christian König  # v3
Signed-off-by: Michel Dänzer 
---

Alex, probably best to wait a bit longer before picking this up. :)

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 
+++---

 2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index f3fd5ec710b6..f944ed858f3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2777,12 +2777,11 @@ static void 
amdgpu_device_delay_enable_gfx_off(struct work_struct *work)

 struct amdgpu_device *adev =
 container_of(work, struct amdgpu_device, 
gfx.gfx_off_delay_work.work);

 -    mutex_lock(>gfx.gfx_off_mutex);
-    if (!adev->gfx.gfx_off_state && 
!adev->gfx.gfx_off_req_count) {
-    if (!amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, true))

-    adev->gfx.gfx_off_state = true;
-    }
-    mutex_unlock(>gfx.gfx_off_mutex);
+    WARN_ON_ONCE(adev->gfx.gfx_off_state);
+    WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
+
+    if (!amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, true))

+    adev->gfx.gfx_off_state = true;
 }
   /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c

index a0be0772c8b3..b4ced45301be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct 
amdgpu_device *adev, bool enable)

   mutex_lock(>gfx.gfx_off_mutex);
 -    if (!enable)
-    adev->gfx.gfx_off_req_count++;
-    else if (adev->gfx.gfx_off_req_count > 0)
+    if (enable) {
+    /* If the count is already 0, it means there's an 
imbalance bug somewhere.
+ * Note that the bug may be in a different caller than 
the one which triggers the

+ * WARN_ON_ONCE.
+ */
+    if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
+    goto unlock;
+
 adev->gfx.gfx_off_req_count--;
 -    if (enable && !adev->gfx.gfx_off_state && 
!adev->gfx.gfx_off_req_count) {
-    schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);

-    } else if (!enable && adev->gfx.gfx_off_state) {
-    if (!amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, false)) {

-    adev->gfx.gfx_off_state = false;
+    if (adev->gfx.gfx_off_req_count == 0 && 
!adev->gfx.gfx_off_state)
+
schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);

+    } else {
+    if (adev->gfx.gfx_off_req_count == 0) {
+
cancel_delayed_work_sync(>gfx.gfx_off_delay_work);

+
+    if (adev->gfx.gfx_off_state &&


More of a question which I didn't check last time - Is this 
expected to be true when the disable call comes in first?


My assumption is that cancel_delayed_work_sync guarantees 
amdgpu_device_delay_enable_gfx_off's assignment is visible here.




To clarify - when nothing is scheduled. If enable() is called when 
the count is 0, it goes to unlock. Now the expectation is someone 
to call Disable first.


Yes, the very first amdgpu_gfx_off_ctrl call must pass enable=false, 
or it's a bug, which


  if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))

will catch.


Let's say  

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-17 Thread Lazar, Lijo




On 8/17/2021 4:36 PM, Michel Dänzer wrote:

On 2021-08-17 12:37 p.m., Lazar, Lijo wrote:



On 8/17/2021 3:29 PM, Michel Dänzer wrote:

On 2021-08-17 11:37 a.m., Lazar, Lijo wrote:



On 8/17/2021 2:56 PM, Michel Dänzer wrote:

On 2021-08-17 11:12 a.m., Lazar, Lijo wrote:



On 8/17/2021 1:53 PM, Michel Dänzer wrote:

From: Michel Dänzer 

schedule_delayed_work does not push back the work if it was already
scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
was disabled and re-enabled again during those 100 ms.

This resulted in frame drops / stutter with the upcoming mutter 41
release on Navi 14, due to constantly enabling GFXOFF in the HW and
disabling it again (for getting the GPU clock counter).

To fix this, call cancel_delayed_work_sync when the disable count
transitions from 0 to 1, and only schedule the delayed work on the
reverse transition, not if the disable count was already 0. This makes
sure the delayed work doesn't run at unexpected times, and allows it to
be lock-free.

v2:
* Use cancel_delayed_work_sync & mutex_trylock instead of
  mod_delayed_work.
v3:
* Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
v4:
* Fix race condition between amdgpu_gfx_off_ctrl incrementing
  adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
  checking for it to be 0 (Evan Quan)

Cc: sta...@vger.kernel.org
Reviewed-by: Lijo Lazar  # v3
Acked-by: Christian König  # v3
Signed-off-by: Michel Dänzer 
---

Alex, probably best to wait a bit longer before picking this up. :)

     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++
     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++---
     2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f3fd5ec710b6..f944ed858f3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct 
work_struct *work)
     struct amdgpu_device *adev =
     container_of(work, struct amdgpu_device, 
gfx.gfx_off_delay_work.work);
     -    mutex_lock(>gfx.gfx_off_mutex);
-    if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
-    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
true))
-    adev->gfx.gfx_off_state = true;
-    }
-    mutex_unlock(>gfx.gfx_off_mutex);
+    WARN_ON_ONCE(adev->gfx.gfx_off_state);
+    WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
+
+    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
+    adev->gfx.gfx_off_state = true;
     }
       /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index a0be0772c8b3..b4ced45301be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool 
enable)
       mutex_lock(>gfx.gfx_off_mutex);
     -    if (!enable)
-    adev->gfx.gfx_off_req_count++;
-    else if (adev->gfx.gfx_off_req_count > 0)
+    if (enable) {
+    /* If the count is already 0, it means there's an imbalance bug 
somewhere.
+ * Note that the bug may be in a different caller than the one which 
triggers the
+ * WARN_ON_ONCE.
+ */
+    if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
+    goto unlock;
+
     adev->gfx.gfx_off_req_count--;
     -    if (enable && !adev->gfx.gfx_off_state && 
!adev->gfx.gfx_off_req_count) {
-    schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);
-    } else if (!enable && adev->gfx.gfx_off_state) {
-    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
false)) {
-    adev->gfx.gfx_off_state = false;
+    if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state)
+    schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);
+    } else {
+    if (adev->gfx.gfx_off_req_count == 0) {
+    cancel_delayed_work_sync(>gfx.gfx_off_delay_work);
+
+    if (adev->gfx.gfx_off_state &&


More of a question which I didn't check last time - Is this expected to be true 
when the disable call comes in first?


My assumption is that cancel_delayed_work_sync guarantees 
amdgpu_device_delay_enable_gfx_off's assignment is visible here.



To clarify - when nothing is scheduled. If enable() is called when the count is 
0, it goes to unlock. Now the expectation is someone to call Disable first.


Yes, the very first amdgpu_gfx_off_ctrl call must pass enable=false, or it's a 
bug, which

  if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))

will catch.



Let's say  Disable() is called first, then the variable will be false, right?


Ohh, I see 

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-17 Thread Michel Dänzer
On 2021-08-17 12:37 p.m., Lazar, Lijo wrote:
> 
> 
> On 8/17/2021 3:29 PM, Michel Dänzer wrote:
>> On 2021-08-17 11:37 a.m., Lazar, Lijo wrote:
>>>
>>>
>>> On 8/17/2021 2:56 PM, Michel Dänzer wrote:
 On 2021-08-17 11:12 a.m., Lazar, Lijo wrote:
>
>
> On 8/17/2021 1:53 PM, Michel Dänzer wrote:
>> From: Michel Dänzer 
>>
>> schedule_delayed_work does not push back the work if it was already
>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
>> was disabled and re-enabled again during those 100 ms.
>>
>> This resulted in frame drops / stutter with the upcoming mutter 41
>> release on Navi 14, due to constantly enabling GFXOFF in the HW and
>> disabling it again (for getting the GPU clock counter).
>>
>> To fix this, call cancel_delayed_work_sync when the disable count
>> transitions from 0 to 1, and only schedule the delayed work on the
>> reverse transition, not if the disable count was already 0. This makes
>> sure the delayed work doesn't run at unexpected times, and allows it to
>> be lock-free.
>>
>> v2:
>> * Use cancel_delayed_work_sync & mutex_trylock instead of
>>  mod_delayed_work.
>> v3:
>> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
>> v4:
>> * Fix race condition between amdgpu_gfx_off_ctrl incrementing
>>  adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
>>  checking for it to be 0 (Evan Quan)
>>
>> Cc: sta...@vger.kernel.org
>> Reviewed-by: Lijo Lazar  # v3
>> Acked-by: Christian König  # v3
>> Signed-off-by: Michel Dänzer 
>> ---
>>
>> Alex, probably best to wait a bit longer before picking this up. :)
>>
>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++
>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 
>> +++---
>>     2 files changed, 30 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index f3fd5ec710b6..f944ed858f3e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2777,12 +2777,11 @@ static void 
>> amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>>     struct amdgpu_device *adev =
>>     container_of(work, struct amdgpu_device, 
>> gfx.gfx_off_delay_work.work);
>>     -    mutex_lock(>gfx.gfx_off_mutex);
>> -    if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>> -    if (!amdgpu_dpm_set_powergating_by_smu(adev, 
>> AMD_IP_BLOCK_TYPE_GFX, true))
>> -    adev->gfx.gfx_off_state = true;
>> -    }
>> -    mutex_unlock(>gfx.gfx_off_mutex);
>> +    WARN_ON_ONCE(adev->gfx.gfx_off_state);
>> +    WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
>> +
>> +    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
>> true))
>> +    adev->gfx.gfx_off_state = true;
>>     }
>>       /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index a0be0772c8b3..b4ced45301be 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device 
>> *adev, bool enable)
>>       mutex_lock(>gfx.gfx_off_mutex);
>>     -    if (!enable)
>> -    adev->gfx.gfx_off_req_count++;
>> -    else if (adev->gfx.gfx_off_req_count > 0)
>> +    if (enable) {
>> +    /* If the count is already 0, it means there's an imbalance bug 
>> somewhere.
>> + * Note that the bug may be in a different caller than the one 
>> which triggers the
>> + * WARN_ON_ONCE.
>> + */
>> +    if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
>> +    goto unlock;
>> +
>>     adev->gfx.gfx_off_req_count--;
>>     -    if (enable && !adev->gfx.gfx_off_state && 
>> !adev->gfx.gfx_off_req_count) {
>> -    schedule_delayed_work(>gfx.gfx_off_delay_work, 
>> GFX_OFF_DELAY_ENABLE);
>> -    } else if (!enable && adev->gfx.gfx_off_state) {
>> -    if (!amdgpu_dpm_set_powergating_by_smu(adev, 
>> AMD_IP_BLOCK_TYPE_GFX, false)) {
>> -    adev->gfx.gfx_off_state = false;
>> +    if (adev->gfx.gfx_off_req_count == 0 && 
>> !adev->gfx.gfx_off_state)
>> +    schedule_delayed_work(>gfx.gfx_off_delay_work, 
>> GFX_OFF_DELAY_ENABLE);
>> +    } else {
>> +    if (adev->gfx.gfx_off_req_count == 0) {
>> +    cancel_delayed_work_sync(>gfx.gfx_off_delay_work);
>> +
>> +    if 

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-17 Thread Lazar, Lijo




On 8/17/2021 3:29 PM, Michel Dänzer wrote:

On 2021-08-17 11:37 a.m., Lazar, Lijo wrote:



On 8/17/2021 2:56 PM, Michel Dänzer wrote:

On 2021-08-17 11:12 a.m., Lazar, Lijo wrote:



On 8/17/2021 1:53 PM, Michel Dänzer wrote:

From: Michel Dänzer 

schedule_delayed_work does not push back the work if it was already
scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
was disabled and re-enabled again during those 100 ms.

This resulted in frame drops / stutter with the upcoming mutter 41
release on Navi 14, due to constantly enabling GFXOFF in the HW and
disabling it again (for getting the GPU clock counter).

To fix this, call cancel_delayed_work_sync when the disable count
transitions from 0 to 1, and only schedule the delayed work on the
reverse transition, not if the disable count was already 0. This makes
sure the delayed work doesn't run at unexpected times, and allows it to
be lock-free.

v2:
* Use cancel_delayed_work_sync & mutex_trylock instead of
     mod_delayed_work.
v3:
* Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
v4:
* Fix race condition between amdgpu_gfx_off_ctrl incrementing
     adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
     checking for it to be 0 (Evan Quan)

Cc: sta...@vger.kernel.org
Reviewed-by: Lijo Lazar  # v3
Acked-by: Christian König  # v3
Signed-off-by: Michel Dänzer 
---

Alex, probably best to wait a bit longer before picking this up. :)

    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++
    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++---
    2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f3fd5ec710b6..f944ed858f3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct 
work_struct *work)
    struct amdgpu_device *adev =
    container_of(work, struct amdgpu_device, 
gfx.gfx_off_delay_work.work);
    -    mutex_lock(>gfx.gfx_off_mutex);
-    if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
-    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
true))
-    adev->gfx.gfx_off_state = true;
-    }
-    mutex_unlock(>gfx.gfx_off_mutex);
+    WARN_ON_ONCE(adev->gfx.gfx_off_state);
+    WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
+
+    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
+    adev->gfx.gfx_off_state = true;
    }
      /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index a0be0772c8b3..b4ced45301be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool 
enable)
      mutex_lock(>gfx.gfx_off_mutex);
    -    if (!enable)
-    adev->gfx.gfx_off_req_count++;
-    else if (adev->gfx.gfx_off_req_count > 0)
+    if (enable) {
+    /* If the count is already 0, it means there's an imbalance bug 
somewhere.
+ * Note that the bug may be in a different caller than the one which 
triggers the
+ * WARN_ON_ONCE.
+ */
+    if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
+    goto unlock;
+
    adev->gfx.gfx_off_req_count--;
    -    if (enable && !adev->gfx.gfx_off_state && 
!adev->gfx.gfx_off_req_count) {
-    schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);
-    } else if (!enable && adev->gfx.gfx_off_state) {
-    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
false)) {
-    adev->gfx.gfx_off_state = false;
+    if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state)
+    schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);
+    } else {
+    if (adev->gfx.gfx_off_req_count == 0) {
+    cancel_delayed_work_sync(>gfx.gfx_off_delay_work);
+
+    if (adev->gfx.gfx_off_state &&


More of a question which I didn't check last time - Is this expected to be true 
when the disable call comes in first?


My assumption is that cancel_delayed_work_sync guarantees 
amdgpu_device_delay_enable_gfx_off's assignment is visible here.



To clarify - when nothing is scheduled. If enable() is called when the count is 
0, it goes to unlock. Now the expectation is someone to call Disable first.


Yes, the very first amdgpu_gfx_off_ctrl call must pass enable=false, or it's a 
bug, which

 if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))

will catch.



Let's say  Disable() is called first, then the variable will be false, right?


Ohh, I see what you mean. The first time amdgpu_gfx_off_ctrl is called with 
enable=false, adev->gfx.gfx_off_state == 

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-17 Thread Michel Dänzer
On 2021-08-17 11:37 a.m., Lazar, Lijo wrote:
> 
> 
> On 8/17/2021 2:56 PM, Michel Dänzer wrote:
>> On 2021-08-17 11:12 a.m., Lazar, Lijo wrote:
>>>
>>>
>>> On 8/17/2021 1:53 PM, Michel Dänzer wrote:
 From: Michel Dänzer 

 schedule_delayed_work does not push back the work if it was already
 scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
 after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
 was disabled and re-enabled again during those 100 ms.

 This resulted in frame drops / stutter with the upcoming mutter 41
 release on Navi 14, due to constantly enabling GFXOFF in the HW and
 disabling it again (for getting the GPU clock counter).

 To fix this, call cancel_delayed_work_sync when the disable count
 transitions from 0 to 1, and only schedule the delayed work on the
 reverse transition, not if the disable count was already 0. This makes
 sure the delayed work doesn't run at unexpected times, and allows it to
 be lock-free.

 v2:
 * Use cancel_delayed_work_sync & mutex_trylock instead of
     mod_delayed_work.
 v3:
 * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
 v4:
 * Fix race condition between amdgpu_gfx_off_ctrl incrementing
     adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
     checking for it to be 0 (Evan Quan)

 Cc: sta...@vger.kernel.org
 Reviewed-by: Lijo Lazar  # v3
 Acked-by: Christian König  # v3
 Signed-off-by: Michel Dänzer 
 ---

 Alex, probably best to wait a bit longer before picking this up. :)

    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++
    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++---
    2 files changed, 30 insertions(+), 17 deletions(-)

 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
 b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
 index f3fd5ec710b6..f944ed858f3e 100644
 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
 +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
 @@ -2777,12 +2777,11 @@ static void 
 amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
    struct amdgpu_device *adev =
    container_of(work, struct amdgpu_device, 
 gfx.gfx_off_delay_work.work);
    -    mutex_lock(>gfx.gfx_off_mutex);
 -    if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
 -    if (!amdgpu_dpm_set_powergating_by_smu(adev, 
 AMD_IP_BLOCK_TYPE_GFX, true))
 -    adev->gfx.gfx_off_state = true;
 -    }
 -    mutex_unlock(>gfx.gfx_off_mutex);
 +    WARN_ON_ONCE(adev->gfx.gfx_off_state);
 +    WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
 +
 +    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
 true))
 +    adev->gfx.gfx_off_state = true;
    }
      /**
 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
 b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
 index a0be0772c8b3..b4ced45301be 100644
 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
 +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
 @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, 
 bool enable)
      mutex_lock(>gfx.gfx_off_mutex);
    -    if (!enable)
 -    adev->gfx.gfx_off_req_count++;
 -    else if (adev->gfx.gfx_off_req_count > 0)
 +    if (enable) {
 +    /* If the count is already 0, it means there's an imbalance bug 
 somewhere.
 + * Note that the bug may be in a different caller than the one 
 which triggers the
 + * WARN_ON_ONCE.
 + */
 +    if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
 +    goto unlock;
 +
    adev->gfx.gfx_off_req_count--;
    -    if (enable && !adev->gfx.gfx_off_state && 
 !adev->gfx.gfx_off_req_count) {
 -    schedule_delayed_work(>gfx.gfx_off_delay_work, 
 GFX_OFF_DELAY_ENABLE);
 -    } else if (!enable && adev->gfx.gfx_off_state) {
 -    if (!amdgpu_dpm_set_powergating_by_smu(adev, 
 AMD_IP_BLOCK_TYPE_GFX, false)) {
 -    adev->gfx.gfx_off_state = false;
 +    if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state)
 +    schedule_delayed_work(>gfx.gfx_off_delay_work, 
 GFX_OFF_DELAY_ENABLE);
 +    } else {
 +    if (adev->gfx.gfx_off_req_count == 0) {
 +    cancel_delayed_work_sync(>gfx.gfx_off_delay_work);
 +
 +    if (adev->gfx.gfx_off_state &&
>>>
>>> More of a question which I didn't check last time - Is this expected to be 
>>> true when the disable call comes in first?
>>
>> My assumption is that cancel_delayed_work_sync guarantees 
>> amdgpu_device_delay_enable_gfx_off's assignment is visible here.
>>
> 
> To clarify - when nothing is scheduled. If enable() 

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-17 Thread Lazar, Lijo




On 8/17/2021 2:56 PM, Michel Dänzer wrote:

On 2021-08-17 11:12 a.m., Lazar, Lijo wrote:



On 8/17/2021 1:53 PM, Michel Dänzer wrote:

From: Michel Dänzer 

schedule_delayed_work does not push back the work if it was already
scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
was disabled and re-enabled again during those 100 ms.

This resulted in frame drops / stutter with the upcoming mutter 41
release on Navi 14, due to constantly enabling GFXOFF in the HW and
disabling it again (for getting the GPU clock counter).

To fix this, call cancel_delayed_work_sync when the disable count
transitions from 0 to 1, and only schedule the delayed work on the
reverse transition, not if the disable count was already 0. This makes
sure the delayed work doesn't run at unexpected times, and allows it to
be lock-free.

v2:
* Use cancel_delayed_work_sync & mutex_trylock instead of
    mod_delayed_work.
v3:
* Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
v4:
* Fix race condition between amdgpu_gfx_off_ctrl incrementing
    adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
    checking for it to be 0 (Evan Quan)

Cc: sta...@vger.kernel.org
Reviewed-by: Lijo Lazar  # v3
Acked-by: Christian König  # v3
Signed-off-by: Michel Dänzer 
---

Alex, probably best to wait a bit longer before picking this up. :)

   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++
   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++---
   2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f3fd5ec710b6..f944ed858f3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct 
work_struct *work)
   struct amdgpu_device *adev =
   container_of(work, struct amdgpu_device, 
gfx.gfx_off_delay_work.work);
   -    mutex_lock(>gfx.gfx_off_mutex);
-    if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
-    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
true))
-    adev->gfx.gfx_off_state = true;
-    }
-    mutex_unlock(>gfx.gfx_off_mutex);
+    WARN_ON_ONCE(adev->gfx.gfx_off_state);
+    WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
+
+    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
+    adev->gfx.gfx_off_state = true;
   }
     /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index a0be0772c8b3..b4ced45301be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool 
enable)
     mutex_lock(>gfx.gfx_off_mutex);
   -    if (!enable)
-    adev->gfx.gfx_off_req_count++;
-    else if (adev->gfx.gfx_off_req_count > 0)
+    if (enable) {
+    /* If the count is already 0, it means there's an imbalance bug 
somewhere.
+ * Note that the bug may be in a different caller than the one which 
triggers the
+ * WARN_ON_ONCE.
+ */
+    if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
+    goto unlock;
+
   adev->gfx.gfx_off_req_count--;
   -    if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) 
{
-    schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);
-    } else if (!enable && adev->gfx.gfx_off_state) {
-    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
false)) {
-    adev->gfx.gfx_off_state = false;
+    if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state)
+    schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);
+    } else {
+    if (adev->gfx.gfx_off_req_count == 0) {
+    cancel_delayed_work_sync(>gfx.gfx_off_delay_work);
+
+    if (adev->gfx.gfx_off_state &&


More of a question which I didn't check last time - Is this expected to be true 
when the disable call comes in first?


My assumption is that cancel_delayed_work_sync guarantees 
amdgpu_device_delay_enable_gfx_off's assignment is visible here.



To clarify - when nothing is scheduled. If enable() is called when the 
count is 0, it goes to unlock. Now the expectation is someone to call 
Disable first.  Let's say  Disable() is called first, then the variable 
will be false, right?


Thanks,
Lijo


RE: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-17 Thread Quan, Evan
[AMD Official Use Only]

Thanks! This seems fine to me.
Reviewed-by: Evan Quan 

> -Original Message-
> From: amd-gfx  On Behalf Of
> Michel Dänzer
> Sent: Tuesday, August 17, 2021 4:23 PM
> To: Deucher, Alexander ; Koenig, Christian
> 
> Cc: Liu, Leo ; Zhu, James ; amd-
> g...@lists.freedesktop.org; dri-de...@lists.freedesktop.org
> Subject: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is
> disabled
> 
> From: Michel Dänzer 
> 
> schedule_delayed_work does not push back the work if it was already
> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF was
> disabled and re-enabled again during those 100 ms.
> 
> This resulted in frame drops / stutter with the upcoming mutter 41 release
> on Navi 14, due to constantly enabling GFXOFF in the HW and disabling it
> again (for getting the GPU clock counter).
> 
> To fix this, call cancel_delayed_work_sync when the disable count transitions
> from 0 to 1, and only schedule the delayed work on the reverse transition,
> not if the disable count was already 0. This makes sure the delayed work
> doesn't run at unexpected times, and allows it to be lock-free.
> 
> v2:
> * Use cancel_delayed_work_sync & mutex_trylock instead of
>   mod_delayed_work.
> v3:
> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
> v4:
> * Fix race condition between amdgpu_gfx_off_ctrl incrementing
>   adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
>   checking for it to be 0 (Evan Quan)
> 
> Cc: sta...@vger.kernel.org
> Reviewed-by: Lijo Lazar  # v3
> Acked-by: Christian König  # v3
> Signed-off-by: Michel Dänzer 
> ---
> 
> Alex, probably best to wait a bit longer before picking this up. :)
> 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 36 +++
> ---
>  2 files changed, 30 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f3fd5ec710b6..f944ed858f3e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2777,12 +2777,11 @@ static void
> amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>   struct amdgpu_device *adev =
>   container_of(work, struct amdgpu_device,
> gfx.gfx_off_delay_work.work);
> 
> - mutex_lock(>gfx.gfx_off_mutex);
> - if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
> - if (!amdgpu_dpm_set_powergating_by_smu(adev,
> AMD_IP_BLOCK_TYPE_GFX, true))
> - adev->gfx.gfx_off_state = true;
> - }
> - mutex_unlock(>gfx.gfx_off_mutex);
> + WARN_ON_ONCE(adev->gfx.gfx_off_state);
> + WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
> +
> + if (!amdgpu_dpm_set_powergating_by_smu(adev,
> AMD_IP_BLOCK_TYPE_GFX, true))
> + adev->gfx.gfx_off_state = true;
>  }
> 
>  /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a0be0772c8b3..b4ced45301be 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device
> *adev, bool enable)
> 
>   mutex_lock(>gfx.gfx_off_mutex);
> 
> - if (!enable)
> - adev->gfx.gfx_off_req_count++;
> - else if (adev->gfx.gfx_off_req_count > 0)
> + if (enable) {
> + /* If the count is already 0, it means there's an imbalance bug
> somewhere.
> +  * Note that the bug may be in a different caller than the one
> which triggers the
> +  * WARN_ON_ONCE.
> +  */
> + if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
> + goto unlock;
> +
>   adev->gfx.gfx_off_req_count--;
> 
> - if (enable && !adev->gfx.gfx_off_state && !adev-
> >gfx.gfx_off_req_count) {
> - schedule_delayed_work(>gfx.gfx_off_delay_work,
> GFX_OFF_DELAY_ENABLE);
> - } else if (!enable && adev->gfx.gfx_off_state) {
> - if (!amdgpu_dpm_set_powergating_by_smu(adev,
> AMD_IP_BLOCK_TYPE_GFX, false)) {
> - adev->gfx.gfx_off_state = false;
> + if (adev->gfx.gfx_off_req_count == 0 && !adev-
> >gfx.gfx_off_state)
> + schedule_delayed_work(
> >gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
> + } else {
> + i

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-17 Thread Michel Dänzer
On 2021-08-17 11:12 a.m., Lazar, Lijo wrote:
> 
> 
> On 8/17/2021 1:53 PM, Michel Dänzer wrote:
>> From: Michel Dänzer 
>>
>> schedule_delayed_work does not push back the work if it was already
>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
>> was disabled and re-enabled again during those 100 ms.
>>
>> This resulted in frame drops / stutter with the upcoming mutter 41
>> release on Navi 14, due to constantly enabling GFXOFF in the HW and
>> disabling it again (for getting the GPU clock counter).
>>
>> To fix this, call cancel_delayed_work_sync when the disable count
>> transitions from 0 to 1, and only schedule the delayed work on the
>> reverse transition, not if the disable count was already 0. This makes
>> sure the delayed work doesn't run at unexpected times, and allows it to
>> be lock-free.
>>
>> v2:
>> * Use cancel_delayed_work_sync & mutex_trylock instead of
>>    mod_delayed_work.
>> v3:
>> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
>> v4:
>> * Fix race condition between amdgpu_gfx_off_ctrl incrementing
>>    adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
>>    checking for it to be 0 (Evan Quan)
>>
>> Cc: sta...@vger.kernel.org
>> Reviewed-by: Lijo Lazar  # v3
>> Acked-by: Christian König  # v3
>> Signed-off-by: Michel Dänzer 
>> ---
>>
>> Alex, probably best to wait a bit longer before picking this up. :)
>>
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++---
>>   2 files changed, 30 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index f3fd5ec710b6..f944ed858f3e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2777,12 +2777,11 @@ static void 
>> amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>>   struct amdgpu_device *adev =
>>   container_of(work, struct amdgpu_device, 
>> gfx.gfx_off_delay_work.work);
>>   -    mutex_lock(>gfx.gfx_off_mutex);
>> -    if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>> -    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
>> true))
>> -    adev->gfx.gfx_off_state = true;
>> -    }
>> -    mutex_unlock(>gfx.gfx_off_mutex);
>> +    WARN_ON_ONCE(adev->gfx.gfx_off_state);
>> +    WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
>> +
>> +    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
>> true))
>> +    adev->gfx.gfx_off_state = true;
>>   }
>>     /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index a0be0772c8b3..b4ced45301be 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, 
>> bool enable)
>>     mutex_lock(>gfx.gfx_off_mutex);
>>   -    if (!enable)
>> -    adev->gfx.gfx_off_req_count++;
>> -    else if (adev->gfx.gfx_off_req_count > 0)
>> +    if (enable) {
>> +    /* If the count is already 0, it means there's an imbalance bug 
>> somewhere.
>> + * Note that the bug may be in a different caller than the one 
>> which triggers the
>> + * WARN_ON_ONCE.
>> + */
>> +    if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
>> +    goto unlock;
>> +
>>   adev->gfx.gfx_off_req_count--;
>>   -    if (enable && !adev->gfx.gfx_off_state && 
>> !adev->gfx.gfx_off_req_count) {
>> -    schedule_delayed_work(>gfx.gfx_off_delay_work, 
>> GFX_OFF_DELAY_ENABLE);
>> -    } else if (!enable && adev->gfx.gfx_off_state) {
>> -    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
>> false)) {
>> -    adev->gfx.gfx_off_state = false;
>> +    if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state)
>> +    schedule_delayed_work(>gfx.gfx_off_delay_work, 
>> GFX_OFF_DELAY_ENABLE);
>> +    } else {
>> +    if (adev->gfx.gfx_off_req_count == 0) {
>> +    cancel_delayed_work_sync(>gfx.gfx_off_delay_work);
>> +
>> +    if (adev->gfx.gfx_off_state &&
> 
> More of a question which I didn't check last time - Is this expected to be 
> true when the disable call comes in first?

My assumption is that cancel_delayed_work_sync guarantees 
amdgpu_device_delay_enable_gfx_off's assignment is visible here.


-- 
Earthling Michel Dänzer   |   https://redhat.com
Libre software enthusiast | Mesa and X developer


Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-17 Thread Lazar, Lijo




On 8/17/2021 1:53 PM, Michel Dänzer wrote:

From: Michel Dänzer 

schedule_delayed_work does not push back the work if it was already
scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
was disabled and re-enabled again during those 100 ms.

This resulted in frame drops / stutter with the upcoming mutter 41
release on Navi 14, due to constantly enabling GFXOFF in the HW and
disabling it again (for getting the GPU clock counter).

To fix this, call cancel_delayed_work_sync when the disable count
transitions from 0 to 1, and only schedule the delayed work on the
reverse transition, not if the disable count was already 0. This makes
sure the delayed work doesn't run at unexpected times, and allows it to
be lock-free.

v2:
* Use cancel_delayed_work_sync & mutex_trylock instead of
   mod_delayed_work.
v3:
* Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
v4:
* Fix race condition between amdgpu_gfx_off_ctrl incrementing
   adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
   checking for it to be 0 (Evan Quan)

Cc: sta...@vger.kernel.org
Reviewed-by: Lijo Lazar  # v3
Acked-by: Christian König  # v3
Signed-off-by: Michel Dänzer 
---

Alex, probably best to wait a bit longer before picking this up. :)

  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 36 +++---
  2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f3fd5ec710b6..f944ed858f3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct 
work_struct *work)
struct amdgpu_device *adev =
container_of(work, struct amdgpu_device, 
gfx.gfx_off_delay_work.work);
  
-	mutex_lock(>gfx.gfx_off_mutex);

-   if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
-   if (!amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, true))
-   adev->gfx.gfx_off_state = true;
-   }
-   mutex_unlock(>gfx.gfx_off_mutex);
+   WARN_ON_ONCE(adev->gfx.gfx_off_state);
+   WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
+
+   if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
true))
+   adev->gfx.gfx_off_state = true;
  }
  
  /**

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index a0be0772c8b3..b4ced45301be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool 
enable)
  
  	mutex_lock(>gfx.gfx_off_mutex);
  
-	if (!enable)

-   adev->gfx.gfx_off_req_count++;
-   else if (adev->gfx.gfx_off_req_count > 0)
+   if (enable) {
+   /* If the count is already 0, it means there's an imbalance bug 
somewhere.
+* Note that the bug may be in a different caller than the one 
which triggers the
+* WARN_ON_ONCE.
+*/
+   if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
+   goto unlock;
+
adev->gfx.gfx_off_req_count--;
  
-	if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {

-   schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);
-   } else if (!enable && adev->gfx.gfx_off_state) {
-   if (!amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, false)) {
-   adev->gfx.gfx_off_state = false;
+   if (adev->gfx.gfx_off_req_count == 0 && 
!adev->gfx.gfx_off_state)
+   schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);
+   } else {
+   if (adev->gfx.gfx_off_req_count == 0) {
+   cancel_delayed_work_sync(>gfx.gfx_off_delay_work);
+
+   if (adev->gfx.gfx_off_state &&


More of a question which I didn't check last time - Is this expected to 
be true when the disable call comes in first?


Thanks,
Lijo


+   !amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, false)) {
+   adev->gfx.gfx_off_state = false;
  
-			if (adev->gfx.funcs->init_spm_golden) {

-   dev_dbg(adev->dev, "GFXOFF is disabled, re-init SPM 
golden settings\n");
-   amdgpu_gfx_init_spm_golden(adev);
+   if (adev->gfx.funcs->init_spm_golden) {
+   dev_dbg(adev->dev,
+   "GFXOFF is disabled, re-init SPM 
golden settings\n");
+   

[PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-17 Thread Michel Dänzer
From: Michel Dänzer 

schedule_delayed_work does not push back the work if it was already
scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
was disabled and re-enabled again during those 100 ms.

This resulted in frame drops / stutter with the upcoming mutter 41
release on Navi 14, due to constantly enabling GFXOFF in the HW and
disabling it again (for getting the GPU clock counter).

To fix this, call cancel_delayed_work_sync when the disable count
transitions from 0 to 1, and only schedule the delayed work on the
reverse transition, not if the disable count was already 0. This makes
sure the delayed work doesn't run at unexpected times, and allows it to
be lock-free.

v2:
* Use cancel_delayed_work_sync & mutex_trylock instead of
  mod_delayed_work.
v3:
* Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
v4:
* Fix race condition between amdgpu_gfx_off_ctrl incrementing
  adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
  checking for it to be 0 (Evan Quan)

Cc: sta...@vger.kernel.org
Reviewed-by: Lijo Lazar  # v3
Acked-by: Christian König  # v3
Signed-off-by: Michel Dänzer 
---

Alex, probably best to wait a bit longer before picking this up. :)

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 36 +++---
 2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f3fd5ec710b6..f944ed858f3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct 
work_struct *work)
struct amdgpu_device *adev =
container_of(work, struct amdgpu_device, 
gfx.gfx_off_delay_work.work);
 
-   mutex_lock(>gfx.gfx_off_mutex);
-   if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
-   if (!amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, true))
-   adev->gfx.gfx_off_state = true;
-   }
-   mutex_unlock(>gfx.gfx_off_mutex);
+   WARN_ON_ONCE(adev->gfx.gfx_off_state);
+   WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
+
+   if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
true))
+   adev->gfx.gfx_off_state = true;
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index a0be0772c8b3..b4ced45301be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool 
enable)
 
mutex_lock(>gfx.gfx_off_mutex);
 
-   if (!enable)
-   adev->gfx.gfx_off_req_count++;
-   else if (adev->gfx.gfx_off_req_count > 0)
+   if (enable) {
+   /* If the count is already 0, it means there's an imbalance bug 
somewhere.
+* Note that the bug may be in a different caller than the one 
which triggers the
+* WARN_ON_ONCE.
+*/
+   if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
+   goto unlock;
+
adev->gfx.gfx_off_req_count--;
 
-   if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) 
{
-   schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);
-   } else if (!enable && adev->gfx.gfx_off_state) {
-   if (!amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, false)) {
-   adev->gfx.gfx_off_state = false;
+   if (adev->gfx.gfx_off_req_count == 0 && 
!adev->gfx.gfx_off_state)
+   schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);
+   } else {
+   if (adev->gfx.gfx_off_req_count == 0) {
+   cancel_delayed_work_sync(>gfx.gfx_off_delay_work);
+
+   if (adev->gfx.gfx_off_state &&
+   !amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, false)) {
+   adev->gfx.gfx_off_state = false;
 
-   if (adev->gfx.funcs->init_spm_golden) {
-   dev_dbg(adev->dev, "GFXOFF is disabled, re-init 
SPM golden settings\n");
-   amdgpu_gfx_init_spm_golden(adev);
+   if (adev->gfx.funcs->init_spm_golden) {
+   dev_dbg(adev->dev,
+   "GFXOFF is disabled, re-init 
SPM golden settings\n");
+   amdgpu_gfx_init_spm_golden(adev);
+   }
}
}
+
+   adev->gfx.gfx_off_req_count++;
  

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-16 Thread Michel Dänzer
On 2021-08-16 6:13 a.m., Lazar, Lijo wrote:
> On 8/13/2021 9:30 PM, Michel Dänzer wrote:
>> On 2021-08-13 5:07 p.m., Lazar, Lijo wrote:
>>> On 8/13/2021 8:10 PM, Michel Dänzer wrote:
 On 2021-08-13 4:14 p.m., Lazar, Lijo wrote:
> On 8/13/2021 7:04 PM, Michel Dänzer wrote:
>> On 2021-08-13 1:50 p.m., Lazar, Lijo wrote:
>>> On 8/13/2021 3:59 PM, Michel Dänzer wrote:
 From: Michel Dänzer 

 schedule_delayed_work does not push back the work if it was already
 scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
 after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
 was disabled and re-enabled again during those 100 ms.

 This resulted in frame drops / stutter with the upcoming mutter 41
 release on Navi 14, due to constantly enabling GFXOFF in the HW and
 disabling it again (for getting the GPU clock counter).

 To fix this, call cancel_delayed_work_sync when GFXOFF transitions from
 enabled to disabled. This makes sure the delayed work will be scheduled
 as intended in the reverse case.

 In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs
 to use mutex_trylock instead of mutex_lock.

 v2:
 * Use cancel_delayed_work_sync & mutex_trylock instead of
   mod_delayed_work.

 Signed-off-by: Michel Dänzer 
 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 13 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    |  3 +++
  3 files changed, 20 insertions(+), 7 deletions(-)

 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
 b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
 index f3fd5ec710b6..8b025f70706c 100644
 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
 +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
 @@ -2777,7 +2777,16 @@ static void 
 amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
  struct amdgpu_device *adev =
  container_of(work, struct amdgpu_device, 
 gfx.gfx_off_delay_work.work);
  -    mutex_lock(>gfx.gfx_off_mutex);
 +    /* mutex_lock could deadlock with cancel_delayed_work_sync in 
 amdgpu_gfx_off_ctrl. */
 +    if (!mutex_trylock(>gfx.gfx_off_mutex)) {
 +    /* If there's a bug which causes amdgpu_gfx_off_ctrl to be 
 called with enable=true
 + * when adev->gfx.gfx_off_req_count is already 0, we might 
 race with that.
 + * Re-schedule to make sure gfx off will be re-enabled in the 
 HW eventually.
 + */
 +    schedule_delayed_work(>gfx.gfx_off_delay_work, 
 AMDGPU_GFX_OFF_DELAY_ENABLE);
 +    return;
>>>
>>> This is not needed and is just creating another thread to contend for 
>>> mutex.
>>
>> Still not sure what you mean by that. What other thread?
>
> Sorry, I meant it schedules another workitem and delays GFXOFF enablement 
> further. For ex: if it was another function like gfx_off_status holding 
> the lock at the time of check.
>
>>
>>> The checks below take care of enabling gfxoff correctly. If it's 
>>> already in gfx_off state, it doesn't do anything. So I don't see why 
>>> this change is needed.
>>
>> mutex_trylock is needed to prevent the deadlock discussed before and 
>> below.
>>
>> schedule_delayed_work is needed due to this scenario hinted at by the 
>> comment:
>>
>> 1. amdgpu_gfx_off_ctrl locks mutex, calls schedule_delayed_work
>> 2. amdgpu_device_delay_enable_gfx_off runs, calls mutex_trylock, which 
>> fails
>>
>> GFXOFF would never get re-enabled in HW in this case (until 
>> amdgpu_gfx_off_ctrl calls schedule_delayed_work again).
>>
>> (cancel_delayed_work_sync guarantees there's no pending delayed work 
>> when it returns, even if amdgpu_device_delay_enable_gfx_off calls 
>> schedule_delayed_work)
>>
>
> I think we need to explain based on the original code before. There is an 
> asssumption here that the only other contention of this mutex is with the 
> gfx_off_ctrl function.

 Not really.


> As far as I understand if the work has already started running when 
> schedule_delayed_work is called, it will insert another in the work queue 
> after delay. Based on that understanding I didn't find a problem with the 
> original code.

 Original code as in without this patch or the mod_delayed_work patch? If 
 so, the problem is not when the work has already started running. It's 
 that when it hasn't started running yet, 

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-16 Thread Michel Dänzer
On 2021-08-16 12:20 p.m., Quan, Evan wrote:
> [AMD Official Use Only]
> 
> Hi Michel,
> 
> The patch seems reasonable to me(especially the cancel_delayed_work_sync() 
> part).
> However, can you explain more about the code below?
> What's the race issue here exactly?
> 
> + /* mutex_lock could deadlock with cancel_delayed_work_sync in 
> amdgpu_gfx_off_ctrl. */
> + if (!mutex_trylock(>gfx.gfx_off_mutex)) {
> + /* If there's a bug which causes amdgpu_gfx_off_ctrl to be 
> called with enable=true
> +  * when adev->gfx.gfx_off_req_count is already 0, we might race 
> with that.
> +  * Re-schedule to make sure gfx off will be re-enabled in the 
> HW eventually.
> +  */
> + schedule_delayed_work(>gfx.gfx_off_delay_work, 
> AMDGPU_GFX_OFF_DELAY_ENABLE);
> + return;
> + }

If amdgpu_gfx_off_ctrl was called with enable=true when 
adev->gfx.gfx_off_req_count == 0 already, it could have prevented 
amdgpu_device_delay_enable_gfx_off from locking the mutex.

v3 solves this by only scheduling the work when adev->gfx.gfx_off_req_count 
transitions from 1 to 0, which means it no longer needs to lock the mutex.


-- 
Earthling Michel Dänzer   |   https://redhat.com
Libre software enthusiast | Mesa and X developer


Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-16 Thread Michel Dänzer
On 2021-08-16 9:38 a.m., Christian König wrote:
> Am 13.08.21 um 12:29 schrieb Michel Dänzer:
>> From: Michel Dänzer 
>>
>> schedule_delayed_work does not push back the work if it was already
>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
>> was disabled and re-enabled again during those 100 ms.
>>
>> This resulted in frame drops / stutter with the upcoming mutter 41
>> release on Navi 14, due to constantly enabling GFXOFF in the HW and
>> disabling it again (for getting the GPU clock counter).
>>
>> To fix this, call cancel_delayed_work_sync when GFXOFF transitions from
>> enabled to disabled. This makes sure the delayed work will be scheduled
>> as intended in the reverse case.
>>
>> In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs
>> to use mutex_trylock instead of mutex_lock.
>>
>> v2:
>> * Use cancel_delayed_work_sync & mutex_trylock instead of
>>    mod_delayed_work.
> 
> While this may work it still smells a little bit fishy.
> 
> In general you have two common locking orders around work items, either 
> lock->work or work->lock. If you mix this as lock->work->lock like here 
> trouble is usually imminent.
> 
> I think what we should do instead is to double check if taking the lock 
> inside the work item is necessary and instead making sure that the work is 
> sync canceled when we don't want it to run. In other words fully switching to 
> the lock->work approach.

Done in v3, thanks for the suggestion!


-- 
Earthling Michel Dänzer   |   https://redhat.com
Libre software enthusiast | Mesa and X developer


RE: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-16 Thread Quan, Evan
[AMD Official Use Only]

Hi Michel,

The patch seems reasonable to me(especially the cancel_delayed_work_sync() 
part).
However, can you explain more about the code below?
What's the race issue here exactly?

+   /* mutex_lock could deadlock with cancel_delayed_work_sync in 
amdgpu_gfx_off_ctrl. */
+   if (!mutex_trylock(>gfx.gfx_off_mutex)) {
+   /* If there's a bug which causes amdgpu_gfx_off_ctrl to be 
called with enable=true
+* when adev->gfx.gfx_off_req_count is already 0, we might race 
with that.
+* Re-schedule to make sure gfx off will be re-enabled in the 
HW eventually.
+*/
+   schedule_delayed_work(>gfx.gfx_off_delay_work, 
AMDGPU_GFX_OFF_DELAY_ENABLE);
+   return;
+   }

BR
Evan
> -Original Message-
> From: amd-gfx  On Behalf Of
> Michel Dänzer
> Sent: Friday, August 13, 2021 6:29 PM
> To: Deucher, Alexander ; Koenig, Christian
> 
> Cc: Liu, Leo ; Zhu, James ; amd-
> g...@lists.freedesktop.org; dri-de...@lists.freedesktop.org
> Subject: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is
> disabled
> 
> From: Michel Dänzer 
> 
> schedule_delayed_work does not push back the work if it was already
> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
> was disabled and re-enabled again during those 100 ms.
> 
> This resulted in frame drops / stutter with the upcoming mutter 41
> release on Navi 14, due to constantly enabling GFXOFF in the HW and
> disabling it again (for getting the GPU clock counter).
> 
> To fix this, call cancel_delayed_work_sync when GFXOFF transitions from
> enabled to disabled. This makes sure the delayed work will be scheduled
> as intended in the reverse case.
> 
> In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs
> to use mutex_trylock instead of mutex_lock.
> 
> v2:
> * Use cancel_delayed_work_sync & mutex_trylock instead of
>   mod_delayed_work.
> 
> Signed-off-by: Michel Dänzer 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 13 +++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h|  3 +++
>  3 files changed, 20 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f3fd5ec710b6..8b025f70706c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2777,7 +2777,16 @@ static void
> amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>   struct amdgpu_device *adev =
>   container_of(work, struct amdgpu_device,
> gfx.gfx_off_delay_work.work);
> 
> - mutex_lock(>gfx.gfx_off_mutex);
> + /* mutex_lock could deadlock with cancel_delayed_work_sync in
> amdgpu_gfx_off_ctrl. */
> + if (!mutex_trylock(>gfx.gfx_off_mutex)) {
> + /* If there's a bug which causes amdgpu_gfx_off_ctrl to be
> called with enable=true
> +  * when adev->gfx.gfx_off_req_count is already 0, we might
> race with that.
> +  * Re-schedule to make sure gfx off will be re-enabled in the
> HW eventually.
> +  */
> + schedule_delayed_work(>gfx.gfx_off_delay_work,
> AMDGPU_GFX_OFF_DELAY_ENABLE);
> + return;
> + }
> +
>   if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>   if (!amdgpu_dpm_set_powergating_by_smu(adev,
> AMD_IP_BLOCK_TYPE_GFX, true))
>   adev->gfx.gfx_off_state = true;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a0be0772c8b3..da4c46db3093 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -28,9 +28,6 @@
>  #include "amdgpu_rlc.h"
>  #include "amdgpu_ras.h"
> 
> -/* delay 0.1 second to enable gfx off feature */
> -#define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100)
> -
>  /*
>   * GPU GFX IP block helpers function.
>   */
> @@ -569,9 +566,13 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device
> *adev, bool enable)
>   adev->gfx.gfx_off_req_count--;
> 
>   if (enable && !adev->gfx.gfx_off_state && !adev-
> >gfx.gfx_off_req_count) {
> - schedule_delayed_work(>gfx.gfx_off_delay_work,
> GFX_OFF_DELAY_ENABLE);
> - } else if (!enable && adev->gfx.gfx_off_state) {
> - if (!amdgpu_dpm_set_powergating_by_smu(adev,
> AMD_IP_BLOCK_TYPE_GFX, false)) {
> +

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-16 Thread Christian König

Am 13.08.21 um 12:29 schrieb Michel Dänzer:

From: Michel Dänzer 

schedule_delayed_work does not push back the work if it was already
scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
was disabled and re-enabled again during those 100 ms.

This resulted in frame drops / stutter with the upcoming mutter 41
release on Navi 14, due to constantly enabling GFXOFF in the HW and
disabling it again (for getting the GPU clock counter).

To fix this, call cancel_delayed_work_sync when GFXOFF transitions from
enabled to disabled. This makes sure the delayed work will be scheduled
as intended in the reverse case.

In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs
to use mutex_trylock instead of mutex_lock.

v2:
* Use cancel_delayed_work_sync & mutex_trylock instead of
   mod_delayed_work.


While this may work it still smells a little bit fishy.

In general you have two common locking orders around work items, either 
lock->work or work->lock. If you mix this as lock->work->lock like here 
trouble is usually imminent.


I think what we should do instead is to double check if taking the lock 
inside the work item is necessary and instead making sure that the work 
is sync canceled when we don't want it to run. In other words fully 
switching to the lock->work approach.


But please note that this are just high level design thoughts, I don't 
really know the details of the gfx_off code at all. Could even be that 
we need two locks, one outside and one inside of the work item.


Regards,
Christian.



Signed-off-by: Michel Dänzer 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 13 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h|  3 +++
  3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f3fd5ec710b6..8b025f70706c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2777,7 +2777,16 @@ static void amdgpu_device_delay_enable_gfx_off(struct 
work_struct *work)
struct amdgpu_device *adev =
container_of(work, struct amdgpu_device, 
gfx.gfx_off_delay_work.work);
  
-	mutex_lock(>gfx.gfx_off_mutex);

+   /* mutex_lock could deadlock with cancel_delayed_work_sync in 
amdgpu_gfx_off_ctrl. */
+   if (!mutex_trylock(>gfx.gfx_off_mutex)) {
+   /* If there's a bug which causes amdgpu_gfx_off_ctrl to be 
called with enable=true
+* when adev->gfx.gfx_off_req_count is already 0, we might race 
with that.
+* Re-schedule to make sure gfx off will be re-enabled in the 
HW eventually.
+*/
+   schedule_delayed_work(>gfx.gfx_off_delay_work, 
AMDGPU_GFX_OFF_DELAY_ENABLE);
+   return;
+   }
+
if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
if (!amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, true))
adev->gfx.gfx_off_state = true;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index a0be0772c8b3..da4c46db3093 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -28,9 +28,6 @@
  #include "amdgpu_rlc.h"
  #include "amdgpu_ras.h"
  
-/* delay 0.1 second to enable gfx off feature */

-#define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100)
-
  /*
   * GPU GFX IP block helpers function.
   */
@@ -569,9 +566,13 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool 
enable)
adev->gfx.gfx_off_req_count--;
  
  	if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {

-   schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);
-   } else if (!enable && adev->gfx.gfx_off_state) {
-   if (!amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, false)) {
+   schedule_delayed_work(>gfx.gfx_off_delay_work, 
AMDGPU_GFX_OFF_DELAY_ENABLE);
+   } else if (!enable) {
+   if (adev->gfx.gfx_off_req_count == 1 && 
!adev->gfx.gfx_off_state)
+   cancel_delayed_work_sync(>gfx.gfx_off_delay_work);
+
+   if (adev->gfx.gfx_off_state &&
+   !amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, false)) {
adev->gfx.gfx_off_state = false;
  
  			if (adev->gfx.funcs->init_spm_golden) {

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index d43fe2ed8116..dcdb505bb7f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -32,6 +32,9 @@
  #include "amdgpu_rlc.h"
  #include "soc15.h"
  
+/* delay 0.1 second to enable gfx off feature */


Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-15 Thread Lazar, Lijo




On 8/13/2021 9:30 PM, Michel Dänzer wrote:

On 2021-08-13 5:07 p.m., Lazar, Lijo wrote:



On 8/13/2021 8:10 PM, Michel Dänzer wrote:

On 2021-08-13 4:14 p.m., Lazar, Lijo wrote:

On 8/13/2021 7:04 PM, Michel Dänzer wrote:

On 2021-08-13 1:50 p.m., Lazar, Lijo wrote:

On 8/13/2021 3:59 PM, Michel Dänzer wrote:

From: Michel Dänzer 

schedule_delayed_work does not push back the work if it was already
scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
was disabled and re-enabled again during those 100 ms.

This resulted in frame drops / stutter with the upcoming mutter 41
release on Navi 14, due to constantly enabling GFXOFF in the HW and
disabling it again (for getting the GPU clock counter).

To fix this, call cancel_delayed_work_sync when GFXOFF transitions from
enabled to disabled. This makes sure the delayed work will be scheduled
as intended in the reverse case.

In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs
to use mutex_trylock instead of mutex_lock.

v2:
* Use cancel_delayed_work_sync & mutex_trylock instead of
  mod_delayed_work.

Signed-off-by: Michel Dänzer 
---
     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++-
     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 13 +++--
     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    |  3 +++
     3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f3fd5ec710b6..8b025f70706c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2777,7 +2777,16 @@ static void amdgpu_device_delay_enable_gfx_off(struct 
work_struct *work)
     struct amdgpu_device *adev =
     container_of(work, struct amdgpu_device, 
gfx.gfx_off_delay_work.work);
     -    mutex_lock(>gfx.gfx_off_mutex);
+    /* mutex_lock could deadlock with cancel_delayed_work_sync in 
amdgpu_gfx_off_ctrl. */
+    if (!mutex_trylock(>gfx.gfx_off_mutex)) {
+    /* If there's a bug which causes amdgpu_gfx_off_ctrl to be called with 
enable=true
+ * when adev->gfx.gfx_off_req_count is already 0, we might race with 
that.
+ * Re-schedule to make sure gfx off will be re-enabled in the HW 
eventually.
+ */
+    schedule_delayed_work(>gfx.gfx_off_delay_work, 
AMDGPU_GFX_OFF_DELAY_ENABLE);
+    return;


This is not needed and is just creating another thread to contend for mutex.


Still not sure what you mean by that. What other thread?


Sorry, I meant it schedules another workitem and delays GFXOFF enablement 
further. For ex: if it was another function like gfx_off_status holding the 
lock at the time of check.




The checks below take care of enabling gfxoff correctly. If it's already in 
gfx_off state, it doesn't do anything. So I don't see why this change is needed.


mutex_trylock is needed to prevent the deadlock discussed before and below.

schedule_delayed_work is needed due to this scenario hinted at by the comment:

1. amdgpu_gfx_off_ctrl locks mutex, calls schedule_delayed_work
2. amdgpu_device_delay_enable_gfx_off runs, calls mutex_trylock, which fails

GFXOFF would never get re-enabled in HW in this case (until amdgpu_gfx_off_ctrl 
calls schedule_delayed_work again).

(cancel_delayed_work_sync guarantees there's no pending delayed work when it 
returns, even if amdgpu_device_delay_enable_gfx_off calls schedule_delayed_work)



I think we need to explain based on the original code before. There is an 
asssumption here that the only other contention of this mutex is with the 
gfx_off_ctrl function.


Not really.



As far as I understand if the work has already started running when 
schedule_delayed_work is called, it will insert another in the work queue after 
delay. Based on that understanding I didn't find a problem with the original 
code.


Original code as in without this patch or the mod_delayed_work patch? If so, 
the problem is not when the work has already started running. It's that when it 
hasn't started running yet, schedule_delayed_work doesn't change the timeout 
for the already scheduled work, so it ends up enabling GFXOFF earlier than 
intended (and thus at all in scenarios when it's not supposed to).



I meant the original implementation of amdgpu_device_delay_enable_gfx_off().


If you indeed want to use _sync, there is a small problem with this 
implementation also which is roughly equivalent to the original problem you 
faced.

amdgpu_gfx_off_ctrl(disable) locks mutex
calls cancel_delayed_work_sync
amdgpu_device_delay_enable_gfx_off already started running
 mutex_trylock fails and schedules another one
amdgpu_gfx_off_ctrl(enable)
 schedules_delayed_work() - Delay is not extended, it's the same as when 
it's rearmed from work item.



This cannot happen. When cancel_delayed_work_sync returns, it guarantees that 
the delayed work 

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-13 Thread Michel Dänzer
On 2021-08-13 5:07 p.m., Lazar, Lijo wrote:
> 
> 
> On 8/13/2021 8:10 PM, Michel Dänzer wrote:
>> On 2021-08-13 4:14 p.m., Lazar, Lijo wrote:
>>> On 8/13/2021 7:04 PM, Michel Dänzer wrote:
 On 2021-08-13 1:50 p.m., Lazar, Lijo wrote:
> On 8/13/2021 3:59 PM, Michel Dänzer wrote:
>> From: Michel Dänzer 
>>
>> schedule_delayed_work does not push back the work if it was already
>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
>> was disabled and re-enabled again during those 100 ms.
>>
>> This resulted in frame drops / stutter with the upcoming mutter 41
>> release on Navi 14, due to constantly enabling GFXOFF in the HW and
>> disabling it again (for getting the GPU clock counter).
>>
>> To fix this, call cancel_delayed_work_sync when GFXOFF transitions from
>> enabled to disabled. This makes sure the delayed work will be scheduled
>> as intended in the reverse case.
>>
>> In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs
>> to use mutex_trylock instead of mutex_lock.
>>
>> v2:
>> * Use cancel_delayed_work_sync & mutex_trylock instead of
>>  mod_delayed_work.
>>
>> Signed-off-by: Michel Dänzer 
>> ---
>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++-
>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 13 +++--
>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    |  3 +++
>>     3 files changed, 20 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index f3fd5ec710b6..8b025f70706c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2777,7 +2777,16 @@ static void 
>> amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>>     struct amdgpu_device *adev =
>>     container_of(work, struct amdgpu_device, 
>> gfx.gfx_off_delay_work.work);
>>     -    mutex_lock(>gfx.gfx_off_mutex);
>> +    /* mutex_lock could deadlock with cancel_delayed_work_sync in 
>> amdgpu_gfx_off_ctrl. */
>> +    if (!mutex_trylock(>gfx.gfx_off_mutex)) {
>> +    /* If there's a bug which causes amdgpu_gfx_off_ctrl to be 
>> called with enable=true
>> + * when adev->gfx.gfx_off_req_count is already 0, we might race 
>> with that.
>> + * Re-schedule to make sure gfx off will be re-enabled in the 
>> HW eventually.
>> + */
>> +    schedule_delayed_work(>gfx.gfx_off_delay_work, 
>> AMDGPU_GFX_OFF_DELAY_ENABLE);
>> +    return;
>
> This is not needed and is just creating another thread to contend for 
> mutex.

 Still not sure what you mean by that. What other thread?
>>>
>>> Sorry, I meant it schedules another workitem and delays GFXOFF enablement 
>>> further. For ex: if it was another function like gfx_off_status holding the 
>>> lock at the time of check.
>>>

> The checks below take care of enabling gfxoff correctly. If it's already 
> in gfx_off state, it doesn't do anything. So I don't see why this change 
> is needed.

 mutex_trylock is needed to prevent the deadlock discussed before and below.

 schedule_delayed_work is needed due to this scenario hinted at by the 
 comment:

 1. amdgpu_gfx_off_ctrl locks mutex, calls schedule_delayed_work
 2. amdgpu_device_delay_enable_gfx_off runs, calls mutex_trylock, which 
 fails

 GFXOFF would never get re-enabled in HW in this case (until 
 amdgpu_gfx_off_ctrl calls schedule_delayed_work again).

 (cancel_delayed_work_sync guarantees there's no pending delayed work when 
 it returns, even if amdgpu_device_delay_enable_gfx_off calls 
 schedule_delayed_work)

>>>
>>> I think we need to explain based on the original code before. There is an 
>>> asssumption here that the only other contention of this mutex is with the 
>>> gfx_off_ctrl function.
>>
>> Not really.
>>
>>
>>> As far as I understand if the work has already started running when 
>>> schedule_delayed_work is called, it will insert another in the work queue 
>>> after delay. Based on that understanding I didn't find a problem with the 
>>> original code.
>>
>> Original code as in without this patch or the mod_delayed_work patch? If so, 
>> the problem is not when the work has already started running. It's that when 
>> it hasn't started running yet, schedule_delayed_work doesn't change the 
>> timeout for the already scheduled work, so it ends up enabling GFXOFF 
>> earlier than intended (and thus at all in scenarios when it's not supposed 
>> to).
>>
> 
> I meant the original implementation of amdgpu_device_delay_enable_gfx_off().
> 
> 
> If you indeed want to use _sync, 

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-13 Thread Lazar, Lijo




On 8/13/2021 8:10 PM, Michel Dänzer wrote:

On 2021-08-13 4:14 p.m., Lazar, Lijo wrote:

On 8/13/2021 7:04 PM, Michel Dänzer wrote:

On 2021-08-13 1:50 p.m., Lazar, Lijo wrote:

On 8/13/2021 3:59 PM, Michel Dänzer wrote:

From: Michel Dänzer 

schedule_delayed_work does not push back the work if it was already
scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
was disabled and re-enabled again during those 100 ms.

This resulted in frame drops / stutter with the upcoming mutter 41
release on Navi 14, due to constantly enabling GFXOFF in the HW and
disabling it again (for getting the GPU clock counter).

To fix this, call cancel_delayed_work_sync when GFXOFF transitions from
enabled to disabled. This makes sure the delayed work will be scheduled
as intended in the reverse case.

In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs
to use mutex_trylock instead of mutex_lock.

v2:
* Use cancel_delayed_work_sync & mutex_trylock instead of
     mod_delayed_work.

Signed-off-by: Michel Dänzer 
---
    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++-
    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 13 +++--
    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    |  3 +++
    3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f3fd5ec710b6..8b025f70706c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2777,7 +2777,16 @@ static void amdgpu_device_delay_enable_gfx_off(struct 
work_struct *work)
    struct amdgpu_device *adev =
    container_of(work, struct amdgpu_device, 
gfx.gfx_off_delay_work.work);
    -    mutex_lock(>gfx.gfx_off_mutex);
+    /* mutex_lock could deadlock with cancel_delayed_work_sync in 
amdgpu_gfx_off_ctrl. */
+    if (!mutex_trylock(>gfx.gfx_off_mutex)) {
+    /* If there's a bug which causes amdgpu_gfx_off_ctrl to be called with 
enable=true
+ * when adev->gfx.gfx_off_req_count is already 0, we might race with 
that.
+ * Re-schedule to make sure gfx off will be re-enabled in the HW 
eventually.
+ */
+    schedule_delayed_work(>gfx.gfx_off_delay_work, 
AMDGPU_GFX_OFF_DELAY_ENABLE);
+    return;


This is not needed and is just creating another thread to contend for mutex.


Still not sure what you mean by that. What other thread?


Sorry, I meant it schedules another workitem and delays GFXOFF enablement 
further. For ex: if it was another function like gfx_off_status holding the 
lock at the time of check.




The checks below take care of enabling gfxoff correctly. If it's already in 
gfx_off state, it doesn't do anything. So I don't see why this change is needed.


mutex_trylock is needed to prevent the deadlock discussed before and below.

schedule_delayed_work is needed due to this scenario hinted at by the comment:

1. amdgpu_gfx_off_ctrl locks mutex, calls schedule_delayed_work
2. amdgpu_device_delay_enable_gfx_off runs, calls mutex_trylock, which fails

GFXOFF would never get re-enabled in HW in this case (until amdgpu_gfx_off_ctrl 
calls schedule_delayed_work again).

(cancel_delayed_work_sync guarantees there's no pending delayed work when it 
returns, even if amdgpu_device_delay_enable_gfx_off calls schedule_delayed_work)



I think we need to explain based on the original code before. There is an 
asssumption here that the only other contention of this mutex is with the 
gfx_off_ctrl function.


Not really.



As far as I understand if the work has already started running when 
schedule_delayed_work is called, it will insert another in the work queue after 
delay. Based on that understanding I didn't find a problem with the original 
code.


Original code as in without this patch or the mod_delayed_work patch? If so, 
the problem is not when the work has already started running. It's that when it 
hasn't started running yet, schedule_delayed_work doesn't change the timeout 
for the already scheduled work, so it ends up enabling GFXOFF earlier than 
intended (and thus at all in scenarios when it's not supposed to).



I meant the original implementation of 
amdgpu_device_delay_enable_gfx_off().



If you indeed want to use _sync, there is a small problem with this 
implementation also which is roughly equivalent to the original problem 
you faced.


amdgpu_gfx_off_ctrl(disable) locks mutex
calls cancel_delayed_work_sync
amdgpu_device_delay_enable_gfx_off already started running
mutex_trylock fails and schedules another one
amdgpu_gfx_off_ctrl(enable)
	schedules_delayed_work() - Delay is not extended, it's the same as when 
it's rearmed from work item.


Probably, overthinking about the solution. Looking back, mod_ version is 
simpler :). May be just delay it further everytime there is a call with 
enable instead of doing it only for req_cnt==0?

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-13 Thread Michel Dänzer
On 2021-08-13 4:14 p.m., Lazar, Lijo wrote:
> On 8/13/2021 7:04 PM, Michel Dänzer wrote:
>> On 2021-08-13 1:50 p.m., Lazar, Lijo wrote:
>>> On 8/13/2021 3:59 PM, Michel Dänzer wrote:
 From: Michel Dänzer 

 schedule_delayed_work does not push back the work if it was already
 scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
 after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
 was disabled and re-enabled again during those 100 ms.

 This resulted in frame drops / stutter with the upcoming mutter 41
 release on Navi 14, due to constantly enabling GFXOFF in the HW and
 disabling it again (for getting the GPU clock counter).

 To fix this, call cancel_delayed_work_sync when GFXOFF transitions from
 enabled to disabled. This makes sure the delayed work will be scheduled
 as intended in the reverse case.

 In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs
 to use mutex_trylock instead of mutex_lock.

 v2:
 * Use cancel_delayed_work_sync & mutex_trylock instead of
     mod_delayed_work.

 Signed-off-by: Michel Dänzer 
 ---
    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++-
    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 13 +++--
    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    |  3 +++
    3 files changed, 20 insertions(+), 7 deletions(-)

 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
 b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
 index f3fd5ec710b6..8b025f70706c 100644
 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
 +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
 @@ -2777,7 +2777,16 @@ static void 
 amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
    struct amdgpu_device *adev =
    container_of(work, struct amdgpu_device, 
 gfx.gfx_off_delay_work.work);
    -    mutex_lock(>gfx.gfx_off_mutex);
 +    /* mutex_lock could deadlock with cancel_delayed_work_sync in 
 amdgpu_gfx_off_ctrl. */
 +    if (!mutex_trylock(>gfx.gfx_off_mutex)) {
 +    /* If there's a bug which causes amdgpu_gfx_off_ctrl to be called 
 with enable=true
 + * when adev->gfx.gfx_off_req_count is already 0, we might race 
 with that.
 + * Re-schedule to make sure gfx off will be re-enabled in the HW 
 eventually.
 + */
 +    schedule_delayed_work(>gfx.gfx_off_delay_work, 
 AMDGPU_GFX_OFF_DELAY_ENABLE);
 +    return;
>>>
>>> This is not needed and is just creating another thread to contend for mutex.
>>
>> Still not sure what you mean by that. What other thread?
> 
> Sorry, I meant it schedules another workitem and delays GFXOFF enablement 
> further. For ex: if it was another function like gfx_off_status holding the 
> lock at the time of check.
> 
>>
>>> The checks below take care of enabling gfxoff correctly. If it's already in 
>>> gfx_off state, it doesn't do anything. So I don't see why this change is 
>>> needed.
>>
>> mutex_trylock is needed to prevent the deadlock discussed before and below.
>>
>> schedule_delayed_work is needed due to this scenario hinted at by the 
>> comment:
>>
>> 1. amdgpu_gfx_off_ctrl locks mutex, calls schedule_delayed_work
>> 2. amdgpu_device_delay_enable_gfx_off runs, calls mutex_trylock, which fails
>>
>> GFXOFF would never get re-enabled in HW in this case (until 
>> amdgpu_gfx_off_ctrl calls schedule_delayed_work again).
>>
>> (cancel_delayed_work_sync guarantees there's no pending delayed work when it 
>> returns, even if amdgpu_device_delay_enable_gfx_off calls 
>> schedule_delayed_work)
>>
> 
> I think we need to explain based on the original code before. There is an 
> asssumption here that the only other contention of this mutex is with the 
> gfx_off_ctrl function.

Not really.


> As far as I understand if the work has already started running when 
> schedule_delayed_work is called, it will insert another in the work queue 
> after delay. Based on that understanding I didn't find a problem with the 
> original code.

Original code as in without this patch or the mod_delayed_work patch? If so, 
the problem is not when the work has already started running. It's that when it 
hasn't started running yet, schedule_delayed_work doesn't change the timeout 
for the already scheduled work, so it ends up enabling GFXOFF earlier than 
intended (and thus at all in scenarios when it's not supposed to).


> [...], there could be cases where it could have gone to gfxoff right after 
> gfx_off_status releases the lock, but it doesn't delaying it further. That 
> would be the case if some other function is also introduced which takes this 
> mutex.

I really don't think we need to worry about amdgpu_get_gfx_off_status, since 
it's only called from debugfs (and should be very short). If something hits 
that debugfs file and it causes higher energy 

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-13 Thread Lazar, Lijo




On 8/13/2021 7:04 PM, Michel Dänzer wrote:

On 2021-08-13 1:50 p.m., Lazar, Lijo wrote:



On 8/13/2021 3:59 PM, Michel Dänzer wrote:

From: Michel Dänzer 

schedule_delayed_work does not push back the work if it was already
scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
was disabled and re-enabled again during those 100 ms.

This resulted in frame drops / stutter with the upcoming mutter 41
release on Navi 14, due to constantly enabling GFXOFF in the HW and
disabling it again (for getting the GPU clock counter).

To fix this, call cancel_delayed_work_sync when GFXOFF transitions from
enabled to disabled. This makes sure the delayed work will be scheduled
as intended in the reverse case.

In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs
to use mutex_trylock instead of mutex_lock.

v2:
* Use cancel_delayed_work_sync & mutex_trylock instead of
    mod_delayed_work.

Signed-off-by: Michel Dänzer 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++-
   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 13 +++--
   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    |  3 +++
   3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f3fd5ec710b6..8b025f70706c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2777,7 +2777,16 @@ static void amdgpu_device_delay_enable_gfx_off(struct 
work_struct *work)
   struct amdgpu_device *adev =
   container_of(work, struct amdgpu_device, 
gfx.gfx_off_delay_work.work);
   -    mutex_lock(>gfx.gfx_off_mutex);
+    /* mutex_lock could deadlock with cancel_delayed_work_sync in 
amdgpu_gfx_off_ctrl. */
+    if (!mutex_trylock(>gfx.gfx_off_mutex)) {
+    /* If there's a bug which causes amdgpu_gfx_off_ctrl to be called with 
enable=true
+ * when adev->gfx.gfx_off_req_count is already 0, we might race with 
that.
+ * Re-schedule to make sure gfx off will be re-enabled in the HW 
eventually.
+ */
+    schedule_delayed_work(>gfx.gfx_off_delay_work, 
AMDGPU_GFX_OFF_DELAY_ENABLE);
+    return;


This is not needed and is just creating another thread to contend for mutex.


Still not sure what you mean by that. What other thread?


Sorry, I meant it schedules another workitem and delays GFXOFF 
enablement further. For ex: if it was another function like 
gfx_off_status holding the lock at the time of check.





The checks below take care of enabling gfxoff correctly. If it's already in 
gfx_off state, it doesn't do anything. So I don't see why this change is needed.


mutex_trylock is needed to prevent the deadlock discussed before and below.

schedule_delayed_work is needed due to this scenario hinted at by the comment:

1. amdgpu_gfx_off_ctrl locks mutex, calls schedule_delayed_work
2. amdgpu_device_delay_enable_gfx_off runs, calls mutex_trylock, which fails

GFXOFF would never get re-enabled in HW in this case (until amdgpu_gfx_off_ctrl 
calls schedule_delayed_work again).

(cancel_delayed_work_sync guarantees there's no pending delayed work when it 
returns, even if amdgpu_device_delay_enable_gfx_off calls schedule_delayed_work)



I think we need to explain based on the original code before. There is 
an asssumption here that the only other contention of this mutex is with 
the gfx_off_ctrl function. That is not true, so this is not the only 
case where mutex_trylock can fail. It could be because gfx_off_status is 
holding the lock.


As far as I understand if the work has already started running when 
schedule_delayed_work is called, it will insert another in the work 
queue after delay. Based on that understanding I didn't find a problem 
with the original code. Maybe, mutex_trylock is added to call _sync to 
make sure work is cancelled or not running but that breaks other 
assumptions.



The other problem is amdgpu_get_gfx_off_status() also uses the same mutex.


Not sure what for TBH. AFAICT there's only one implementation of this for 
Renoir, which just reads a register. (It's only called from debugfs)



I'm not sure either :) But as long as there are other functions that 
contend for the same lock, it's not good to implement based on 
assumptions only about a particular scenario.



So it won't be knowing which thread it would be contending against and blindly 
creates more work items.


There is only ever at most one instance of the delayed work at any time. 
amdgpu_device_delay_enable_gfx_off doesn't care whether amdgpu_gfx_off_ctrl or 
amdgpu_get_gfx_off_status is holding the mutex, it just keeps re-scheduling 
itself 100 ms later until it succeeds.



Yes, that is the problem, there could be cases where it could have gone 
to gfxoff right after gfx_off_status releases the lock, but it doesn't 
delaying it further. That would be the case 

Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-13 Thread Michel Dänzer
On 2021-08-13 1:50 p.m., Lazar, Lijo wrote:
> 
> 
> On 8/13/2021 3:59 PM, Michel Dänzer wrote:
>> From: Michel Dänzer 
>>
>> schedule_delayed_work does not push back the work if it was already
>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
>> was disabled and re-enabled again during those 100 ms.
>>
>> This resulted in frame drops / stutter with the upcoming mutter 41
>> release on Navi 14, due to constantly enabling GFXOFF in the HW and
>> disabling it again (for getting the GPU clock counter).
>>
>> To fix this, call cancel_delayed_work_sync when GFXOFF transitions from
>> enabled to disabled. This makes sure the delayed work will be scheduled
>> as intended in the reverse case.
>>
>> In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs
>> to use mutex_trylock instead of mutex_lock.
>>
>> v2:
>> * Use cancel_delayed_work_sync & mutex_trylock instead of
>>    mod_delayed_work.
>>
>> Signed-off-by: Michel Dänzer 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 13 +++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    |  3 +++
>>   3 files changed, 20 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index f3fd5ec710b6..8b025f70706c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2777,7 +2777,16 @@ static void amdgpu_device_delay_enable_gfx_off(struct 
>> work_struct *work)
>>   struct amdgpu_device *adev =
>>   container_of(work, struct amdgpu_device, 
>> gfx.gfx_off_delay_work.work);
>>   -    mutex_lock(>gfx.gfx_off_mutex);
>> +    /* mutex_lock could deadlock with cancel_delayed_work_sync in 
>> amdgpu_gfx_off_ctrl. */
>> +    if (!mutex_trylock(>gfx.gfx_off_mutex)) {
>> +    /* If there's a bug which causes amdgpu_gfx_off_ctrl to be called 
>> with enable=true
>> + * when adev->gfx.gfx_off_req_count is already 0, we might race 
>> with that.
>> + * Re-schedule to make sure gfx off will be re-enabled in the HW 
>> eventually.
>> + */
>> +    schedule_delayed_work(>gfx.gfx_off_delay_work, 
>> AMDGPU_GFX_OFF_DELAY_ENABLE);
>> +    return;
> 
> This is not needed and is just creating another thread to contend for mutex.

Still not sure what you mean by that. What other thread?

> The checks below take care of enabling gfxoff correctly. If it's already in 
> gfx_off state, it doesn't do anything. So I don't see why this change is 
> needed.

mutex_trylock is needed to prevent the deadlock discussed before and below.

schedule_delayed_work is needed due to this scenario hinted at by the comment:

1. amdgpu_gfx_off_ctrl locks mutex, calls schedule_delayed_work
2. amdgpu_device_delay_enable_gfx_off runs, calls mutex_trylock, which fails

GFXOFF would never get re-enabled in HW in this case (until amdgpu_gfx_off_ctrl 
calls schedule_delayed_work again).

(cancel_delayed_work_sync guarantees there's no pending delayed work when it 
returns, even if amdgpu_device_delay_enable_gfx_off calls schedule_delayed_work)


> The other problem is amdgpu_get_gfx_off_status() also uses the same mutex.

Not sure what for TBH. AFAICT there's only one implementation of this for 
Renoir, which just reads a register. (It's only called from debugfs)

> So it won't be knowing which thread it would be contending against and 
> blindly creates more work items. 

There is only ever at most one instance of the delayed work at any time. 
amdgpu_device_delay_enable_gfx_off doesn't care whether amdgpu_gfx_off_ctrl or 
amdgpu_get_gfx_off_status is holding the mutex, it just keeps re-scheduling 
itself 100 ms later until it succeeds.


>> @@ -569,9 +566,13 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, 
>> bool enable)
>>   adev->gfx.gfx_off_req_count--;
>>     if (enable && !adev->gfx.gfx_off_state && 
>> !adev->gfx.gfx_off_req_count) {
>> -    schedule_delayed_work(>gfx.gfx_off_delay_work, 
>> GFX_OFF_DELAY_ENABLE);
>> -    } else if (!enable && adev->gfx.gfx_off_state) {
>> -    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, 
>> false)) {
>> +    schedule_delayed_work(>gfx.gfx_off_delay_work, 
>> AMDGPU_GFX_OFF_DELAY_ENABLE);
>> +    } else if (!enable) {
>> +    if (adev->gfx.gfx_off_req_count == 1 && !adev->gfx.gfx_off_state)
>> +    cancel_delayed_work_sync(>gfx.gfx_off_delay_work);
> 
> This has the deadlock problem as discussed in the other thread.

It does not. If amdgpu_device_delay_enable_gfx_off runs while 
amdgpu_gfx_off_ctrl holds the mutex, 
mutex_trylock fails and the former bails.


-- 
Earthling Michel Dänzer   |   https://redhat.com
Libre software enthusiast | Mesa and X developer


Re: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-13 Thread Lazar, Lijo




On 8/13/2021 3:59 PM, Michel Dänzer wrote:

From: Michel Dänzer 

schedule_delayed_work does not push back the work if it was already
scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
was disabled and re-enabled again during those 100 ms.

This resulted in frame drops / stutter with the upcoming mutter 41
release on Navi 14, due to constantly enabling GFXOFF in the HW and
disabling it again (for getting the GPU clock counter).

To fix this, call cancel_delayed_work_sync when GFXOFF transitions from
enabled to disabled. This makes sure the delayed work will be scheduled
as intended in the reverse case.

In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs
to use mutex_trylock instead of mutex_lock.

v2:
* Use cancel_delayed_work_sync & mutex_trylock instead of
   mod_delayed_work.

Signed-off-by: Michel Dänzer 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 13 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h|  3 +++
  3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f3fd5ec710b6..8b025f70706c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2777,7 +2777,16 @@ static void amdgpu_device_delay_enable_gfx_off(struct 
work_struct *work)
struct amdgpu_device *adev =
container_of(work, struct amdgpu_device, 
gfx.gfx_off_delay_work.work);
  
-	mutex_lock(>gfx.gfx_off_mutex);

+   /* mutex_lock could deadlock with cancel_delayed_work_sync in 
amdgpu_gfx_off_ctrl. */
+   if (!mutex_trylock(>gfx.gfx_off_mutex)) {
+   /* If there's a bug which causes amdgpu_gfx_off_ctrl to be 
called with enable=true
+* when adev->gfx.gfx_off_req_count is already 0, we might race 
with that.
+* Re-schedule to make sure gfx off will be re-enabled in the 
HW eventually.
+*/
+   schedule_delayed_work(>gfx.gfx_off_delay_work, 
AMDGPU_GFX_OFF_DELAY_ENABLE);
+   return;


This is not needed and is just creating another thread to contend for 
mutex. The checks below take care of enabling gfxoff correctly. If it's 
already in gfx_off state, it doesn't do anything. So I don't see why 
this change is needed.


The other problem is amdgpu_get_gfx_off_status() also uses the same 
mutex. So it won't be knowing which thread it would be contending 
against and blindly creates more work items.



+   }
+
if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
if (!amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, true))
adev->gfx.gfx_off_state = true;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index a0be0772c8b3..da4c46db3093 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -28,9 +28,6 @@
  #include "amdgpu_rlc.h"
  #include "amdgpu_ras.h"
  
-/* delay 0.1 second to enable gfx off feature */

-#define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100)
-
  /*
   * GPU GFX IP block helpers function.
   */
@@ -569,9 +566,13 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool 
enable)
adev->gfx.gfx_off_req_count--;
  
  	if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {

-   schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);
-   } else if (!enable && adev->gfx.gfx_off_state) {
-   if (!amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, false)) {
+   schedule_delayed_work(>gfx.gfx_off_delay_work, 
AMDGPU_GFX_OFF_DELAY_ENABLE);
+   } else if (!enable) {
+   if (adev->gfx.gfx_off_req_count == 1 && 
!adev->gfx.gfx_off_state)
+   cancel_delayed_work_sync(>gfx.gfx_off_delay_work);


This has the deadlock problem as discussed in the other thread.

Thanks,
Lijo


+   if (adev->gfx.gfx_off_state &&
+   !amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, false)) {
adev->gfx.gfx_off_state = false;
  
  			if (adev->gfx.funcs->init_spm_golden) {

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index d43fe2ed8116..dcdb505bb7f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -32,6 +32,9 @@
  #include "amdgpu_rlc.h"
  #include "soc15.h"
  
+/* delay 0.1 second to enable gfx off feature */

+#define AMDGPU_GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100)
+
  /* GFX current status */
  #define AMDGPU_GFX_NORMAL_MODE0xL
  #define AMDGPU_GFX_SAFE_MODE  0x0001L



[PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is disabled

2021-08-13 Thread Michel Dänzer
From: Michel Dänzer 

schedule_delayed_work does not push back the work if it was already
scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
was disabled and re-enabled again during those 100 ms.

This resulted in frame drops / stutter with the upcoming mutter 41
release on Navi 14, due to constantly enabling GFXOFF in the HW and
disabling it again (for getting the GPU clock counter).

To fix this, call cancel_delayed_work_sync when GFXOFF transitions from
enabled to disabled. This makes sure the delayed work will be scheduled
as intended in the reverse case.

In order to avoid a deadlock, amdgpu_device_delay_enable_gfx_off needs
to use mutex_trylock instead of mutex_lock.

v2:
* Use cancel_delayed_work_sync & mutex_trylock instead of
  mod_delayed_work.

Signed-off-by: Michel Dänzer 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c| 13 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h|  3 +++
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f3fd5ec710b6..8b025f70706c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2777,7 +2777,16 @@ static void amdgpu_device_delay_enable_gfx_off(struct 
work_struct *work)
struct amdgpu_device *adev =
container_of(work, struct amdgpu_device, 
gfx.gfx_off_delay_work.work);
 
-   mutex_lock(>gfx.gfx_off_mutex);
+   /* mutex_lock could deadlock with cancel_delayed_work_sync in 
amdgpu_gfx_off_ctrl. */
+   if (!mutex_trylock(>gfx.gfx_off_mutex)) {
+   /* If there's a bug which causes amdgpu_gfx_off_ctrl to be 
called with enable=true
+* when adev->gfx.gfx_off_req_count is already 0, we might race 
with that.
+* Re-schedule to make sure gfx off will be re-enabled in the 
HW eventually.
+*/
+   schedule_delayed_work(>gfx.gfx_off_delay_work, 
AMDGPU_GFX_OFF_DELAY_ENABLE);
+   return;
+   }
+
if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
if (!amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, true))
adev->gfx.gfx_off_state = true;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index a0be0772c8b3..da4c46db3093 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -28,9 +28,6 @@
 #include "amdgpu_rlc.h"
 #include "amdgpu_ras.h"
 
-/* delay 0.1 second to enable gfx off feature */
-#define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100)
-
 /*
  * GPU GFX IP block helpers function.
  */
@@ -569,9 +566,13 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool 
enable)
adev->gfx.gfx_off_req_count--;
 
if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) 
{
-   schedule_delayed_work(>gfx.gfx_off_delay_work, 
GFX_OFF_DELAY_ENABLE);
-   } else if (!enable && adev->gfx.gfx_off_state) {
-   if (!amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, false)) {
+   schedule_delayed_work(>gfx.gfx_off_delay_work, 
AMDGPU_GFX_OFF_DELAY_ENABLE);
+   } else if (!enable) {
+   if (adev->gfx.gfx_off_req_count == 1 && 
!adev->gfx.gfx_off_state)
+   cancel_delayed_work_sync(>gfx.gfx_off_delay_work);
+
+   if (adev->gfx.gfx_off_state &&
+   !amdgpu_dpm_set_powergating_by_smu(adev, 
AMD_IP_BLOCK_TYPE_GFX, false)) {
adev->gfx.gfx_off_state = false;
 
if (adev->gfx.funcs->init_spm_golden) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index d43fe2ed8116..dcdb505bb7f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -32,6 +32,9 @@
 #include "amdgpu_rlc.h"
 #include "soc15.h"
 
+/* delay 0.1 second to enable gfx off feature */
+#define AMDGPU_GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100)
+
 /* GFX current status */
 #define AMDGPU_GFX_NORMAL_MODE 0xL
 #define AMDGPU_GFX_SAFE_MODE   0x0001L
-- 
2.32.0