Re: [PATCH 2/2] drm/amdgpu: use new reset-affected accessors for userspace interfaces

2024-02-22 Thread Christian König

Am 20.02.24 um 15:37 schrieb Lazar, Lijo:

On 2/20/2024 7:52 PM, Christian König wrote:

Am 20.02.24 um 07:32 schrieb Lazar, Lijo:

On 2/16/2024 8:43 PM, Alex Deucher wrote:

Use the new reset critical section accessors for debugfs, sysfs,
and the INFO IOCTL to provide proper mutual exclusivity
to hardware with respect to GPU resets.


This looks more like a priority inversion. When the device needs reset,
it doesn't make sense for reset handling to wait on these. Instead,
reset should be done at the earliest.

Nope that doesn't make any sense.

We block out operations which are not allowed to run concurrently with
resets here.

So the reset absolutely has to wait for those operations to finish.


Not at all. The first thing is to make sure that in_reset is set at the
earliest so that operations like these don't create further outstanding
transactions on the device. This patch prevents that.

The operations done/queried on the device in a hung state don't make
sense. The more opportunities there are for such outstanding transactions,
the more issues they create during the actual reset process.


No, exactly that is the buggy behavior we try to fix here.

A reset is not something which should interfere with those debugfs, 
sysfs and query IOCTLs, i.e. a reset is something transparent to userspace.


That's also why submitting things is still allowed during a reset; the 
jobs are just stashed up in the scheduler.


I finally start to understand how we ended up with that nonsense. If 
something like this comes up again then discuss it with Alex and me.


Regards,
Christian.



Thanks,
Lijo


Regards,
Christian.


Thanks,
Lijo


Signed-off-by: Alex Deucher 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  20 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  10 +
   drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 215 
   drivers/gpu/drm/amd/pm/amdgpu_pm.c  |  91 -
   4 files changed, 235 insertions(+), 101 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 72eceb7d6667..d0e4a8729703 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1670,7 +1670,7 @@ static int amdgpu_debugfs_test_ib_show(struct
seq_file *m, void *unused)
   }
     /* Avoid accidently unparking the sched thread during GPU
reset */
-    r = down_write_killable(&adev->reset_domain->sem);
+    r = amdgpu_reset_domain_access_write_start(adev);
   if (r)
   return r;
   @@ -1699,7 +1699,7 @@ static int amdgpu_debugfs_test_ib_show(struct
seq_file *m, void *unused)
   kthread_unpark(ring->sched.thread);
   }
   -    up_write(&adev->reset_domain->sem);
+    amdgpu_reset_domain_access_write_end(adev);
     pm_runtime_mark_last_busy(dev->dev);
   pm_runtime_put_autosuspend(dev->dev);
@@ -1929,7 +1929,7 @@ static int amdgpu_debugfs_ib_preempt(void
*data, u64 val)
   return -ENOMEM;
     /* Avoid accidently unparking the sched thread during GPU
reset */
-    r = down_read_killable(&adev->reset_domain->sem);
+    r = amdgpu_reset_domain_access_read_start(adev);
   if (r)
   goto pro_end;
   @@ -1970,7 +1970,7 @@ static int amdgpu_debugfs_ib_preempt(void
*data, u64 val)
   /* restart the scheduler */
   kthread_unpark(ring->sched.thread);
   -    up_read(&adev->reset_domain->sem);
+    amdgpu_reset_domain_access_read_end(adev);
     pro_end:
   kfree(fences);
@@ -2031,23 +2031,23 @@ static ssize_t
amdgpu_reset_dump_register_list_read(struct file *f,
   return 0;
     memset(reg_offset, 0, 12);
-    ret = down_read_killable(&adev->reset_domain->sem);
+    ret = amdgpu_reset_domain_access_read_start(adev);
   if (ret)
   return ret;
     for (i = 0; i < adev->reset_info.num_regs; i++) {
   sprintf(reg_offset, "0x%x\n",
adev->reset_info.reset_dump_reg_list[i]);
-    up_read(&adev->reset_domain->sem);
+    amdgpu_reset_domain_access_read_end(adev);
   if (copy_to_user(buf + len, reg_offset, strlen(reg_offset)))
   return -EFAULT;
     len += strlen(reg_offset);
-    ret = down_read_killable(&adev->reset_domain->sem);
+    ret = amdgpu_reset_domain_access_read_start(adev);
   if (ret)
   return ret;
   }
   -    up_read(&adev->reset_domain->sem);
+    amdgpu_reset_domain_access_read_end(adev);
   *pos += len;
     return len;
@@ -2089,14 +2089,14 @@ static ssize_t
amdgpu_reset_dump_register_list_write(struct file *f,
   ret = -ENOMEM;
   goto error_free;
   }
-    ret = down_write_killable(&adev->reset_domain->sem);
+    ret = amdgpu_reset_domain_access_write_start(adev);
   if (ret)
   goto error_free;
     swap(adev->reset_info.reset_dump_reg_list, tmp);
   swap(adev->reset_info.reset_dump_reg_value, new);
   adev->reset_info.num_regs = i;
-    up_write(&adev->reset_domai

Re: [PATCH 2/2] drm/amdgpu: use new reset-affected accessors for userspace interfaces

2024-02-20 Thread Lazar, Lijo



On 2/20/2024 7:52 PM, Christian König wrote:
> Am 20.02.24 um 07:32 schrieb Lazar, Lijo:
>> On 2/16/2024 8:43 PM, Alex Deucher wrote:
>>> Use the new reset critical section accessors for debugfs, sysfs,
>>> and the INFO IOCTL to provide proper mutual exclusivity
>>> to hardware with respect the GPU resets.
>>>
>> This looks more like a priority inversion. When the device needs reset,
>> it doesn't make sense for reset handling to wait on these. Instead,
>> reset should be done at the earliest.
> 
> Nope that doesn't make any sense.
> 
> We block out operations which are not allowed to run concurrently with
> resets here.
> 
> So the reset absolutely has to wait for those operations to finish.
> 

Not at all. The first thing is to make sure that in_reset is set at the
earliest so that operations like these don't create further outstanding
transactions on the device. This patch prevents that.

The operations done/queried on the device in a hung state don't make
sense. The more opportunities there are for such outstanding transactions,
the more issues they create during the actual reset process.

Thanks,
Lijo

> Regards,
> Christian.
> 
>>
>> Thanks,
>> Lijo
>>
>>> Signed-off-by: Alex Deucher 
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  20 +-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  10 +
>>>   drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 215 
>>>   drivers/gpu/drm/amd/pm/amdgpu_pm.c  |  91 -
>>>   4 files changed, 235 insertions(+), 101 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index 72eceb7d6667..d0e4a8729703 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -1670,7 +1670,7 @@ static int amdgpu_debugfs_test_ib_show(struct
>>> seq_file *m, void *unused)
>>>   }
>>>     /* Avoid accidently unparking the sched thread during GPU
>>> reset */
>>> -    r = down_write_killable(&adev->reset_domain->sem);
>>> +    r = amdgpu_reset_domain_access_write_start(adev);
>>>   if (r)
>>>   return r;
>>>   @@ -1699,7 +1699,7 @@ static int amdgpu_debugfs_test_ib_show(struct
>>> seq_file *m, void *unused)
>>>   kthread_unpark(ring->sched.thread);
>>>   }
>>>   -    up_write(&adev->reset_domain->sem);
>>> +    amdgpu_reset_domain_access_write_end(adev);
>>>     pm_runtime_mark_last_busy(dev->dev);
>>>   pm_runtime_put_autosuspend(dev->dev);
>>> @@ -1929,7 +1929,7 @@ static int amdgpu_debugfs_ib_preempt(void
>>> *data, u64 val)
>>>   return -ENOMEM;
>>>     /* Avoid accidently unparking the sched thread during GPU
>>> reset */
>>> -    r = down_read_killable(&adev->reset_domain->sem);
>>> +    r = amdgpu_reset_domain_access_read_start(adev);
>>>   if (r)
>>>   goto pro_end;
>>>   @@ -1970,7 +1970,7 @@ static int amdgpu_debugfs_ib_preempt(void
>>> *data, u64 val)
>>>   /* restart the scheduler */
>>>   kthread_unpark(ring->sched.thread);
>>>   -    up_read(&adev->reset_domain->sem);
>>> +    amdgpu_reset_domain_access_read_end(adev);
>>>     pro_end:
>>>   kfree(fences);
>>> @@ -2031,23 +2031,23 @@ static ssize_t
>>> amdgpu_reset_dump_register_list_read(struct file *f,
>>>   return 0;
>>>     memset(reg_offset, 0, 12);
>>> -    ret = down_read_killable(&adev->reset_domain->sem);
>>> +    ret = amdgpu_reset_domain_access_read_start(adev);
>>>   if (ret)
>>>   return ret;
>>>     for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>   sprintf(reg_offset, "0x%x\n",
>>> adev->reset_info.reset_dump_reg_list[i]);
>>> -    up_read(&adev->reset_domain->sem);
>>> +    amdgpu_reset_domain_access_read_end(adev);
>>>   if (copy_to_user(buf + len, reg_offset, strlen(reg_offset)))
>>>   return -EFAULT;
>>>     len += strlen(reg_offset);
>>> -    ret = down_read_killable(&adev->reset_domain->sem);
>>> +    ret = amdgpu_reset_domain_access_read_start(adev);
>>>   if (ret)
>>>   return ret;
>>>   }
>>>   -    up_read(&adev->reset_domain->sem);
>>> +    amdgpu_reset_domain_access_read_end(adev);
>>>   *pos += len;
>>>     return len;
>>> @@ -2089,14 +2089,14 @@ static ssize_t
>>> amdgpu_reset_dump_register_list_write(struct file *f,
>>>   ret = -ENOMEM;
>>>   goto error_free;
>>>   }
>>> -    ret = down_write_killable(&adev->reset_domain->sem);
>>> +    ret = amdgpu_reset_domain_access_write_start(adev);
>>>   if (ret)
>>>   goto error_free;
>>>     swap(adev->reset_info.reset_dump_reg_list, tmp);
>>>   swap(adev->reset_info.reset_dump_reg_value, new);
>>>   adev->reset_info.num_regs = i;
>>> -    up_write(&adev->reset_domain->sem);
>>> +    amdgpu_reset_domain_access_write_end(adev);
>>>   ret = size;
>>>     error_free:
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> b/drivers/gp

Re: [PATCH 2/2] drm/amdgpu: use new reset-affected accessors for userspace interfaces

2024-02-20 Thread Christian König

Am 20.02.24 um 07:32 schrieb Lazar, Lijo:

On 2/16/2024 8:43 PM, Alex Deucher wrote:

Use the new reset critical section accessors for debugfs, sysfs,
and the INFO IOCTL to provide proper mutual exclusivity
to hardware with respect the GPU resets.


This looks more like a priority inversion. When the device needs reset,
it doesn't make sense for reset handling to wait on these. Instead,
reset should be done at the earliest.


Nope that doesn't make any sense.

We block out operations which are not allowed to run concurrently with 
resets here.


So the reset absolutely has to wait for those operations to finish.

Regards,
Christian.



Thanks,
Lijo


Signed-off-by: Alex Deucher 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  20 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  10 +
  drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 215 
  drivers/gpu/drm/amd/pm/amdgpu_pm.c  |  91 -
  4 files changed, 235 insertions(+), 101 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 72eceb7d6667..d0e4a8729703 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1670,7 +1670,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
}
  
  	/* Avoid accidently unparking the sched thread during GPU reset */

-   r = down_write_killable(&adev->reset_domain->sem);
+   r = amdgpu_reset_domain_access_write_start(adev);
if (r)
return r;
  
@@ -1699,7 +1699,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)

kthread_unpark(ring->sched.thread);
}
  
-	up_write(&adev->reset_domain->sem);

+   amdgpu_reset_domain_access_write_end(adev);
  
  	pm_runtime_mark_last_busy(dev->dev);

pm_runtime_put_autosuspend(dev->dev);
@@ -1929,7 +1929,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
  
  	/* Avoid accidently unparking the sched thread during GPU reset */

-   r = down_read_killable(&adev->reset_domain->sem);
+   r = amdgpu_reset_domain_access_read_start(adev);
if (r)
goto pro_end;
  
@@ -1970,7 +1970,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)

/* restart the scheduler */
kthread_unpark(ring->sched.thread);
  
-	up_read(&adev->reset_domain->sem);

+   amdgpu_reset_domain_access_read_end(adev);
  
  pro_end:

kfree(fences);
@@ -2031,23 +2031,23 @@ static ssize_t 
amdgpu_reset_dump_register_list_read(struct file *f,
return 0;
  
  	memset(reg_offset, 0, 12);

-   ret = down_read_killable(&adev->reset_domain->sem);
+   ret = amdgpu_reset_domain_access_read_start(adev);
if (ret)
return ret;
  
  	for (i = 0; i < adev->reset_info.num_regs; i++) {

sprintf(reg_offset, "0x%x\n", 
adev->reset_info.reset_dump_reg_list[i]);
-   up_read(&adev->reset_domain->sem);
+   amdgpu_reset_domain_access_read_end(adev);
if (copy_to_user(buf + len, reg_offset, strlen(reg_offset)))
return -EFAULT;
  
  		len += strlen(reg_offset);

-   ret = down_read_killable(&adev->reset_domain->sem);
+   ret = amdgpu_reset_domain_access_read_start(adev);
if (ret)
return ret;
}
  
-	up_read(&adev->reset_domain->sem);

+   amdgpu_reset_domain_access_read_end(adev);
*pos += len;
  
  	return len;

@@ -2089,14 +2089,14 @@ static ssize_t 
amdgpu_reset_dump_register_list_write(struct file *f,
ret = -ENOMEM;
goto error_free;
}
-   ret = down_write_killable(&adev->reset_domain->sem);
+   ret = amdgpu_reset_domain_access_write_start(adev);
if (ret)
goto error_free;
  
  	swap(adev->reset_info.reset_dump_reg_list, tmp);

swap(adev->reset_info.reset_dump_reg_value, new);
adev->reset_info.num_regs = i;
-   up_write(&adev->reset_domain->sem);
+   amdgpu_reset_domain_access_write_end(adev);
ret = size;
  
  error_free:

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index a2df3025a754..4efb44a964ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -43,6 +43,7 @@
  #include "amdgpu_gem.h"
  #include "amdgpu_display.h"
  #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
  #include "amd_pcie.h"
  
  void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev)

@@ -704,7 +705,11 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
return copy_to_user(out, &count, min(size, 4u)) ? -EFAULT : 0;
}
case AMDGPU_INFO_TIMESTAMP:
+   ret = amdgpu_reset_domain_access_read_start(adev);
+ 

Re: [PATCH 2/2] drm/amdgpu: use new reset-affected accessors for userspace interfaces

2024-02-19 Thread Lazar, Lijo



On 2/16/2024 8:43 PM, Alex Deucher wrote:
> Use the new reset critical section accessors for debugfs, sysfs,
> and the INFO IOCTL to provide proper mutual exclusivity
> to hardware with respect the GPU resets.
> 

This looks more like a priority inversion. When the device needs reset,
it doesn't make sense for reset handling to wait on these. Instead,
reset should be done at the earliest.

Thanks,
Lijo

> Signed-off-by: Alex Deucher 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  20 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  10 +
>  drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 215 
>  drivers/gpu/drm/amd/pm/amdgpu_pm.c  |  91 -
>  4 files changed, 235 insertions(+), 101 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 72eceb7d6667..d0e4a8729703 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1670,7 +1670,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
> *m, void *unused)
>   }
>  
>   /* Avoid accidently unparking the sched thread during GPU reset */
> - r = down_write_killable(&adev->reset_domain->sem);
> + r = amdgpu_reset_domain_access_write_start(adev);
>   if (r)
>   return r;
>  
> @@ -1699,7 +1699,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
> *m, void *unused)
>   kthread_unpark(ring->sched.thread);
>   }
>  
> - up_write(&adev->reset_domain->sem);
> + amdgpu_reset_domain_access_write_end(adev);
>  
>   pm_runtime_mark_last_busy(dev->dev);
>   pm_runtime_put_autosuspend(dev->dev);
> @@ -1929,7 +1929,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 
> val)
>   return -ENOMEM;
>  
>   /* Avoid accidently unparking the sched thread during GPU reset */
> - r = down_read_killable(&adev->reset_domain->sem);
> + r = amdgpu_reset_domain_access_read_start(adev);
>   if (r)
>   goto pro_end;
>  
> @@ -1970,7 +1970,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 
> val)
>   /* restart the scheduler */
>   kthread_unpark(ring->sched.thread);
>  
> - up_read(&adev->reset_domain->sem);
> + amdgpu_reset_domain_access_read_end(adev);
>  
>  pro_end:
>   kfree(fences);
> @@ -2031,23 +2031,23 @@ static ssize_t 
> amdgpu_reset_dump_register_list_read(struct file *f,
>   return 0;
>  
>   memset(reg_offset, 0, 12);
> - ret = down_read_killable(&adev->reset_domain->sem);
> + ret = amdgpu_reset_domain_access_read_start(adev);
>   if (ret)
>   return ret;
>  
>   for (i = 0; i < adev->reset_info.num_regs; i++) {
>   sprintf(reg_offset, "0x%x\n", 
> adev->reset_info.reset_dump_reg_list[i]);
> - up_read(&adev->reset_domain->sem);
> + amdgpu_reset_domain_access_read_end(adev);
>   if (copy_to_user(buf + len, reg_offset, strlen(reg_offset)))
>   return -EFAULT;
>  
>   len += strlen(reg_offset);
> - ret = down_read_killable(&adev->reset_domain->sem);
> + ret = amdgpu_reset_domain_access_read_start(adev);
>   if (ret)
>   return ret;
>   }
>  
> - up_read(&adev->reset_domain->sem);
> + amdgpu_reset_domain_access_read_end(adev);
>   *pos += len;
>  
>   return len;
> @@ -2089,14 +2089,14 @@ static ssize_t 
> amdgpu_reset_dump_register_list_write(struct file *f,
>   ret = -ENOMEM;
>   goto error_free;
>   }
> - ret = down_write_killable(&adev->reset_domain->sem);
> + ret = amdgpu_reset_domain_access_write_start(adev);
>   if (ret)
>   goto error_free;
>  
>   swap(adev->reset_info.reset_dump_reg_list, tmp);
>   swap(adev->reset_info.reset_dump_reg_value, new);
>   adev->reset_info.num_regs = i;
> - up_write(&adev->reset_domain->sem);
> + amdgpu_reset_domain_access_write_end(adev);
>   ret = size;
>  
>  error_free:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index a2df3025a754..4efb44a964ef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -43,6 +43,7 @@
>  #include "amdgpu_gem.h"
>  #include "amdgpu_display.h"
>  #include "amdgpu_ras.h"
> +#include "amdgpu_reset.h"
>  #include "amd_pcie.h"
>  
>  void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev)
> @@ -704,7 +705,11 @@ int amdgpu_info_ioctl(struct drm_device *dev, void 
> *data, struct drm_file *filp)
>   return copy_to_user(out, &count, min(size, 4u)) ? -EFAULT : 0;
>   }
>   case AMDGPU_INFO_TIMESTAMP:
> + ret = amdgpu_reset_domain_access_read_start(adev);
> + if (ret)
> + return -EFAULT;
>   ui64 = amdgpu_gfx_get_gpu_clock_cou

Re: [PATCH 2/2] drm/amdgpu: use new reset-affected accessors for userspace interfaces

2024-02-19 Thread Christian König

Am 16.02.24 um 16:13 schrieb Alex Deucher:

Use the new reset critical section accessors for debugfs, sysfs,
and the INFO IOCTL to provide proper mutual exclusivity
to hardware with respect the GPU resets.

Signed-off-by: Alex Deucher 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  20 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  10 +
  drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 215 
  drivers/gpu/drm/amd/pm/amdgpu_pm.c  |  91 -
  4 files changed, 235 insertions(+), 101 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 72eceb7d6667..d0e4a8729703 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1670,7 +1670,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
}
  
  	/* Avoid accidently unparking the sched thread during GPU reset */

-   r = down_write_killable(&adev->reset_domain->sem);
+   r = amdgpu_reset_domain_access_write_start(adev);
if (r)
return r;
  
@@ -1699,7 +1699,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)

kthread_unpark(ring->sched.thread);
}
  
-	up_write(&adev->reset_domain->sem);

+   amdgpu_reset_domain_access_write_end(adev);
  
  	pm_runtime_mark_last_busy(dev->dev);

pm_runtime_put_autosuspend(dev->dev);
@@ -1929,7 +1929,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
  
  	/* Avoid accidently unparking the sched thread during GPU reset */

-   r = down_read_killable(&adev->reset_domain->sem);
+   r = amdgpu_reset_domain_access_read_start(adev);
if (r)
goto pro_end;
  
@@ -1970,7 +1970,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)

/* restart the scheduler */
kthread_unpark(ring->sched.thread);
  
-	up_read(&adev->reset_domain->sem);

+   amdgpu_reset_domain_access_read_end(adev);
  
  pro_end:

kfree(fences);
@@ -2031,23 +2031,23 @@ static ssize_t 
amdgpu_reset_dump_register_list_read(struct file *f,
return 0;
  
  	memset(reg_offset, 0, 12);

-   ret = down_read_killable(&adev->reset_domain->sem);
+   ret = amdgpu_reset_domain_access_read_start(adev);
if (ret)
return ret;
  
  	for (i = 0; i < adev->reset_info.num_regs; i++) {

sprintf(reg_offset, "0x%x\n", 
adev->reset_info.reset_dump_reg_list[i]);
-   up_read(&adev->reset_domain->sem);
+   amdgpu_reset_domain_access_read_end(adev);
if (copy_to_user(buf + len, reg_offset, strlen(reg_offset)))
return -EFAULT;
  
  		len += strlen(reg_offset);

-   ret = down_read_killable(&adev->reset_domain->sem);
+   ret = amdgpu_reset_domain_access_read_start(adev);
if (ret)
return ret;
}
  
-	up_read(&adev->reset_domain->sem);

+   amdgpu_reset_domain_access_read_end(adev);
*pos += len;
  
  	return len;

@@ -2089,14 +2089,14 @@ static ssize_t 
amdgpu_reset_dump_register_list_write(struct file *f,
ret = -ENOMEM;
goto error_free;
}
-   ret = down_write_killable(&adev->reset_domain->sem);
+   ret = amdgpu_reset_domain_access_write_start(adev);
if (ret)
goto error_free;
  
  	swap(adev->reset_info.reset_dump_reg_list, tmp);

swap(adev->reset_info.reset_dump_reg_value, new);
adev->reset_info.num_regs = i;
-   up_write(&adev->reset_domain->sem);
+   amdgpu_reset_domain_access_write_end(adev);
ret = size;
  
  error_free:

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index a2df3025a754..4efb44a964ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -43,6 +43,7 @@
  #include "amdgpu_gem.h"
  #include "amdgpu_display.h"
  #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
  #include "amd_pcie.h"
  
  void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev)

@@ -704,7 +705,11 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
return copy_to_user(out, &count, min(size, 4u)) ? -EFAULT : 0;
}
case AMDGPU_INFO_TIMESTAMP:
+   ret = amdgpu_reset_domain_access_read_start(adev);
+   if (ret)
+   return -EFAULT;


Probably better to return ret here and on other occasions as well.

Shouldn't make a practical different at the moment, but should we ever 
change the killable to interruptible easier to handle.


Apart from that looks like a massive cleanup to me.

Regards,
Christian.


ui64 = amdgpu_gfx_get_gpu_clock_counter(adev);
+   amdgpu_reset_domain_access_read_end(adev

[PATCH 2/2] drm/amdgpu: use new reset-affected accessors for userspace interfaces

2024-02-16 Thread Alex Deucher
Use the new reset critical section accessors for debugfs, sysfs,
and the INFO IOCTL to provide proper mutual exclusivity
to hardware with respect to GPU resets.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  20 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  10 +
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 215 
 drivers/gpu/drm/amd/pm/amdgpu_pm.c  |  91 -
 4 files changed, 235 insertions(+), 101 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 72eceb7d6667..d0e4a8729703 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1670,7 +1670,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
}
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   r = down_write_killable(&adev->reset_domain->sem);
+   r = amdgpu_reset_domain_access_write_start(adev);
if (r)
return r;
 
@@ -1699,7 +1699,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
kthread_unpark(ring->sched.thread);
}
 
-   up_write(&adev->reset_domain->sem);
+   amdgpu_reset_domain_access_write_end(adev);
 
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
@@ -1929,7 +1929,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   r = down_read_killable(&adev->reset_domain->sem);
+   r = amdgpu_reset_domain_access_read_start(adev);
if (r)
goto pro_end;
 
@@ -1970,7 +1970,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
/* restart the scheduler */
kthread_unpark(ring->sched.thread);
 
-   up_read(&adev->reset_domain->sem);
+   amdgpu_reset_domain_access_read_end(adev);
 
 pro_end:
kfree(fences);
@@ -2031,23 +2031,23 @@ static ssize_t 
amdgpu_reset_dump_register_list_read(struct file *f,
return 0;
 
memset(reg_offset, 0, 12);
-   ret = down_read_killable(&adev->reset_domain->sem);
+   ret = amdgpu_reset_domain_access_read_start(adev);
if (ret)
return ret;
 
for (i = 0; i < adev->reset_info.num_regs; i++) {
sprintf(reg_offset, "0x%x\n", 
adev->reset_info.reset_dump_reg_list[i]);
-   up_read(&adev->reset_domain->sem);
+   amdgpu_reset_domain_access_read_end(adev);
if (copy_to_user(buf + len, reg_offset, strlen(reg_offset)))
return -EFAULT;
 
len += strlen(reg_offset);
-   ret = down_read_killable(&adev->reset_domain->sem);
+   ret = amdgpu_reset_domain_access_read_start(adev);
if (ret)
return ret;
}
 
-   up_read(&adev->reset_domain->sem);
+   amdgpu_reset_domain_access_read_end(adev);
*pos += len;
 
return len;
@@ -2089,14 +2089,14 @@ static ssize_t 
amdgpu_reset_dump_register_list_write(struct file *f,
ret = -ENOMEM;
goto error_free;
}
-   ret = down_write_killable(&adev->reset_domain->sem);
+   ret = amdgpu_reset_domain_access_write_start(adev);
if (ret)
goto error_free;
 
swap(adev->reset_info.reset_dump_reg_list, tmp);
swap(adev->reset_info.reset_dump_reg_value, new);
adev->reset_info.num_regs = i;
-   up_write(&adev->reset_domain->sem);
+   amdgpu_reset_domain_access_write_end(adev);
ret = size;
 
 error_free:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index a2df3025a754..4efb44a964ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -43,6 +43,7 @@
 #include "amdgpu_gem.h"
 #include "amdgpu_display.h"
 #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
 #include "amd_pcie.h"
 
 void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev)
@@ -704,7 +705,11 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
return copy_to_user(out, &count, min(size, 4u)) ? -EFAULT : 0;
}
case AMDGPU_INFO_TIMESTAMP:
+   ret = amdgpu_reset_domain_access_read_start(adev);
+   if (ret)
+   return -EFAULT;
ui64 = amdgpu_gfx_get_gpu_clock_counter(adev);
+   amdgpu_reset_domain_access_read_end(adev);
return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_FW_VERSION: {
struct drm_amdgpu_info_firmware fw_info;
@@ -831,6 +836,9 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)